1305 files changed, 118413 insertions, 15846 deletions
diff --git a/Documentation/ABI/testing/sysfs-driver-intel-m10-bmc-sec-update b/Documentation/ABI/testing/sysfs-driver-intel-m10-bmc-sec-update
index 0a41afe0ab4c..9051695d2211 100644
--- a/Documentation/ABI/testing/sysfs-driver-intel-m10-bmc-sec-update
+++ b/Documentation/ABI/testing/sysfs-driver-intel-m10-bmc-sec-update
@@ -1,7 +1,7 @@
 What:		/sys/bus/platform/drivers/intel-m10bmc-sec-update/.../security/sr_root_entry_hash
 Date:		Sep 2022
 KernelVersion:	5.20
-Contact:	Russ Weight <russell.h.weight@intel.com>
+Contact:	Peter Colberg <peter.colberg@intel.com>
 Description:	Read only. Returns the root entry hash for the static
 		region if one is programmed, else it returns the
 		string: "hash not programmed".  This file is only
@@ -11,7 +11,7 @@ Description:	Read only. Returns the root entry hash for the static
 What:		/sys/bus/platform/drivers/intel-m10bmc-sec-update/.../security/pr_root_entry_hash
 Date:		Sep 2022
 KernelVersion:	5.20
-Contact:	Russ Weight <russell.h.weight@intel.com>
+Contact:	Peter Colberg <peter.colberg@intel.com>
 Description:	Read only. Returns the root entry hash for the partial
 		reconfiguration region if one is programmed, else it
 		returns the string: "hash not programmed".  This file
@@ -21,7 +21,7 @@ Description:	Read only. Returns the root entry hash for the partial
 What:		/sys/bus/platform/drivers/intel-m10bmc-sec-update/.../security/bmc_root_entry_hash
 Date:		Sep 2022
 KernelVersion:	5.20
-Contact:	Russ Weight <russell.h.weight@intel.com>
+Contact:	Peter Colberg <peter.colberg@intel.com>
 Description:	Read only. Returns the root entry hash for the BMC image
 		if one is programmed, else it returns the string:
 		"hash not programmed".  This file is only visible if the
@@ -31,7 +31,7 @@ Description:	Read only. Returns the root entry hash for the BMC image
 What:		/sys/bus/platform/drivers/intel-m10bmc-sec-update/.../security/sr_canceled_csks
 Date:		Sep 2022
 KernelVersion:	5.20
-Contact:	Russ Weight <russell.h.weight@intel.com>
+Contact:	Peter Colberg <peter.colberg@intel.com>
 Description:	Read only. Returns a list of indices for canceled code
 		signing keys for the static region. The standard bitmap
 		list format is used (e.g. "1,2-6,9").
@@ -39,7 +39,7 @@ Description:	Read only. Returns a list of indices for canceled code
 What:		/sys/bus/platform/drivers/intel-m10bmc-sec-update/.../security/pr_canceled_csks
 Date:		Sep 2022
 KernelVersion:	5.20
-Contact:	Russ Weight <russell.h.weight@intel.com>
+Contact:	Peter Colberg <peter.colberg@intel.com>
 Description:	Read only. Returns a list of indices for canceled code
 		signing keys for the partial reconfiguration region. The
 		standard bitmap list format is used (e.g. "1,2-6,9").
@@ -47,7 +47,7 @@ Description:	Read only. Returns a list of indices for canceled code
 What:		/sys/bus/platform/drivers/intel-m10bmc-sec-update/.../security/bmc_canceled_csks
 Date:		Sep 2022
 KernelVersion:	5.20
-Contact:	Russ Weight <russell.h.weight@intel.com>
+Contact:	Peter Colberg <peter.colberg@intel.com>
 Description:	Read only. Returns a list of indices for canceled code
 		signing keys for the BMC.  The standard bitmap list format
 		is used (e.g. "1,2-6,9").
@@ -55,7 +55,7 @@ Description:	Read only. Returns a list of indices for canceled code
 What:		/sys/bus/platform/drivers/intel-m10bmc-sec-update/.../security/flash_count
 Date:		Sep 2022
 KernelVersion:	5.20
-Contact:	Russ Weight <russell.h.weight@intel.com>
+Contact:	Peter Colberg <peter.colberg@intel.com>
 Description:	Read only. Returns number of times the secure update
 		staging area has been flashed.
 		Format: "%u".
diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst
index 93d899d53258..414f8a2012d6 100644
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst
@@ -181,7 +181,7 @@ operations is carried out at several levels:
    of this wait (or series of waits, as the case may be) is to permit a
    concurrent CPU-hotplug operation to complete.
 #. In the case of RCU-sched, one of the last acts of an outgoing CPU is
-   to invoke ``rcu_report_dead()``, which reports a quiescent state for
+   to invoke ``rcutree_report_cpu_dead()``, which reports a quiescent state for
    that CPU. However, this is likely paranoia-induced redundancy.
 
 +-----------------------------------------------------------------------+
diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-callback-registry.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-callback-registry.svg
index 7ac6f9269806..63eff867175a 100644
--- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-callback-registry.svg
+++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-callback-registry.svg
@@ -566,15 +566,6 @@
        style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcutree_migrate_callbacks()</text>
     <text
        xml:space="preserve"
-       x="8335.4873"
-       y="5357.1006"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       id="text202-7-9-6-0"
-       style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_migrate_callbacks()</text>
-    <text
-       xml:space="preserve"
        x="8768.4678"
        y="6224.9038"
        font-style="normal"
diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg
index 7ddc094d7f28..d82a77d03d8c 100644
--- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg
+++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg
@@ -1135,7 +1135,7 @@
        font-weight="bold"
        font-size="192"
        id="text202-7-5-3-27-6-5"
-       style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_report_dead()</text>
+       style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcutree_report_cpu_dead()</text>
     <text
        xml:space="preserve"
        x="3745.7725"
@@ -1256,7 +1256,7 @@
        font-style="normal"
        y="3679.27"
        x="-3804.9949"
-       xml:space="preserve">rcu_cpu_starting()</text>
+       xml:space="preserve">rcutree_report_cpu_starting()</text>
     <g
        style="fill:none;stroke-width:0.025in"
        id="g3107-7-5-0"
diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg
index 069f6f8371c2..53e0dc2a2c79 100644
--- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg
+++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg
@@ -1448,15 +1448,6 @@
        style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcutree_migrate_callbacks()</text>
     <text
        xml:space="preserve"
-       x="8335.4873"
-       y="5357.1006"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       id="text202-7-9-6-0"
-       style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_migrate_callbacks()</text>
-    <text
-       xml:space="preserve"
        x="8768.4678"
        y="6224.9038"
        font-style="normal"
@@ -3274,7 +3265,7 @@
          font-weight="bold"
          font-size="192"
          id="text202-7-5-3-27-6-5"
-         style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_report_dead()</text>
+         style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcutree_report_cpu_dead()</text>
       <text
          xml:space="preserve"
          x="3745.7725"
@@ -3395,7 +3386,7 @@
          font-style="normal"
          y="3679.27"
          x="-3804.9949"
-         xml:space="preserve">rcu_cpu_starting()</text>
+         xml:space="preserve">rcutree_report_cpu_starting()</text>
       <g
          style="fill:none;stroke-width:0.025in"
          id="g3107-7-5-0"
diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-hotplug.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-hotplug.svg
index 2c9310ba29ba..4fa7506082bf 100644
--- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-hotplug.svg
+++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-hotplug.svg
@@ -607,7 +607,7 @@
        font-weight="bold"
        font-size="192"
        id="text202-7-5-3-27-6"
-       style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_report_dead()</text>
+       style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcutree_report_cpu_dead()</text>
     <text
        xml:space="preserve"
        x="3745.7725"
@@ -728,7 +728,7 @@
        font-style="normal"
        y="3679.27"
        x="-3804.9949"
-       xml:space="preserve">rcu_cpu_starting()</text>
+       xml:space="preserve">rcutree_report_cpu_starting()</text>
     <g
        style="fill:none;stroke-width:0.025in"
        id="g3107-7-5-0"
diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst
index f3b605285a87..cccafdaa1f84 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -1955,12 +1955,12 @@ if offline CPUs block an RCU grace period for too long.
 
 An offline CPU's quiescent state will be reported either:
 
-1.  As the CPU goes offline using RCU's hotplug notifier (rcu_report_dead()).
+1.  As the CPU goes offline using RCU's hotplug notifier (rcutree_report_cpu_dead()).
 2.  When grace period initialization (rcu_gp_init()) detects a
     race either with CPU offlining or with a task unblocking on a leaf
     ``rcu_node`` structure whose CPUs are all offline.
 
-The CPU-online path (rcu_cpu_starting()) should never need to report
+The CPU-online path (rcutree_report_cpu_starting()) should never need to report
 a quiescent state for an offline CPU.  However, as a debugging measure,
 it does emit a warning if a quiescent state was not already reported
 for that CPU.
diff --git a/Documentation/RCU/listRCU.rst b/Documentation/RCU/listRCU.rst
index bdc4bcc5289f..ed5c9d8c9afe 100644
--- a/Documentation/RCU/listRCU.rst
+++ b/Documentation/RCU/listRCU.rst
@@ -8,6 +8,15 @@ One of the most common uses of RCU is protecting read-mostly linked lists
 that all of the required memory ordering is provided by the list macros.
 This document describes several list-based RCU use cases.
 
+When iterating a list while holding the rcu_read_lock(), writers may
+modify the list.  The reader is guaranteed to see all of the elements
+which were added to the list before they acquired the rcu_read_lock()
+and are still on the list when they call rcu_read_unlock().
+Elements which are added to, or removed from the list may or may not
+be seen.  If the writer calls list_replace_rcu(), the reader may see
+either the old element or the new element; they will not see both,
+nor will they see neither.
+
 
 Example 1: Read-mostly list: Deferred Destruction
 -------------------------------------------------
diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst
index e488c8e557a9..60ce02475142 100644
--- a/Documentation/RCU/whatisRCU.rst
+++ b/Documentation/RCU/whatisRCU.rst
@@ -59,8 +59,8 @@ experiment with should focus on Section 2.  People who prefer to start
 with example uses should focus on Sections 3 and 4.  People who need to
 understand the RCU implementation should focus on Section 5, then dive
 into the kernel source code.  People who reason best by analogy should
-focus on Section 6.  Section 7 serves as an index to the docbook API
-documentation, and Section 8 is the traditional answer key.
+focus on Sections 6 and 7.  Section 8 serves as an index to the docbook
+API documentation, and Section 9 is the traditional answer key.
 
 So, start with the section that makes the most sense to you and your
 preferred method of learning.  If you need to know everything about
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index b26b5274eaaf..e440aee4fe94 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -364,6 +364,13 @@ constraint, a threaded controller must be able to handle competition
 between threads in a non-leaf cgroup and its child cgroups.  Each
 threaded controller defines how such competitions are handled.
 
+Currently, the following controllers are threaded and can be enabled
+in a threaded cgroup:
+
+- cpu
+- cpuset
+- perf_event
+- pids
 
 [Un]populated Notification
 --------------------------
@@ -2226,6 +2233,49 @@ Cpuset Interface Files
 
 	Its value will be affected by memory nodes hotplug events.
 
+  cpuset.cpus.exclusive
+	A read-write multiple values file which exists on non-root
+	cpuset-enabled cgroups.
+
+	It lists all the exclusive CPUs that are allowed to be used
+	to create a new cpuset partition.  Its value is not used
+	unless the cgroup becomes a valid partition root.  See the
+	"cpuset.cpus.partition" section below for a description of what
+	a cpuset partition is.
+
+	When the cgroup becomes a partition root, the actual exclusive
+	CPUs that are allocated to that partition are listed in
+	"cpuset.cpus.exclusive.effective" which may be different
+	from "cpuset.cpus.exclusive".  If "cpuset.cpus.exclusive"
+	has previously been set, "cpuset.cpus.exclusive.effective"
+	is always a subset of it.
+
+	Users can manually set it to a value that is different from
+	"cpuset.cpus".	The only constraint in setting it is that the
+	list of CPUs must be exclusive with respect to its siblings.
+
+	For a parent cgroup, any one of its exclusive CPUs can only
+	be distributed to at most one of its child cgroups.  Having an
+	exclusive CPU appearing in two or more of its child cgroups is
+	not allowed (the exclusivity rule).  A value that violates the
+	exclusivity rule will be rejected with a write error.
+
+	The root cgroup is a partition root and all its available CPUs
+	are in its exclusive CPU set.
+
+  cpuset.cpus.exclusive.effective
+	A read-only multiple values file which exists on all non-root
+	cpuset-enabled cgroups.
+
+	This file shows the effective set of exclusive CPUs that
+	can be used to create a partition root.  The content of this
+	file will always be a subset of "cpuset.cpus" and its parent's
+	"cpuset.cpus.exclusive.effective" if its parent is not the root
+	cgroup.  It will also be a subset of "cpuset.cpus.exclusive"
+	if it is set.  If "cpuset.cpus.exclusive" is not set, it is
+	treated as having an implicit value of "cpuset.cpus" in the
+	formation of a local partition.
+
   cpuset.cpus.partition
 	A read-write single value file which exists on non-root
 	cpuset-enabled cgroups.  This flag is owned by the parent cgroup
@@ -2239,26 +2289,41 @@ Cpuset Interface Files
 	  "isolated"	Partition root without load balancing
 	  ==========	=====================================
 
-	The root cgroup is always a partition root and its state
-	cannot be changed.  All other non-root cgroups start out as
-	"member".
+	A cpuset partition is a collection of cpuset-enabled cgroups with
+	a partition root at the top of the hierarchy and its descendants
+	except those that are separate partition roots themselves and
+	their descendants.  A partition has exclusive access to the
+	set of exclusive CPUs allocated to it.	Other cgroups outside
+	of that partition cannot use any CPUs in that set.
+
+	There are two types of partitions - local and remote.  A local
+	partition is one whose parent cgroup is also a valid partition
+	root.  A remote partition is one whose parent cgroup is not a
+	valid partition root itself.  Writing to "cpuset.cpus.exclusive"
+	is optional for the creation of a local partition as its
+	"cpuset.cpus.exclusive" file will assume an implicit value that
+	is the same as "cpuset.cpus" if it is not set.	Writing the
+	proper "cpuset.cpus.exclusive" values down the cgroup hierarchy
+	before the target partition root is mandatory for the creation
+	of a remote partition.
+
+	Currently, a remote partition cannot be created under a local
+	partition.  None of the ancestors of a remote partition root,
+	except the root cgroup, can be a partition root.
+
+	The root cgroup is always a partition root and its state cannot
+	be changed.  All other non-root cgroups start out as "member".
 
 	When set to "root", the current cgroup is the root of a new
-	partition or scheduling domain that comprises itself and all
-	its descendants except those that are separate partition roots
-	themselves and their descendants.
+	partition or scheduling domain.  The set of exclusive CPUs is
+	determined by the value of its "cpuset.cpus.exclusive.effective".
 
-	When set to "isolated", the CPUs in that partition root will
+	When set to "isolated", the CPUs in that partition will
 	be in an isolated state without any load balancing from the
 	scheduler.  Tasks placed in such a partition with multiple
 	CPUs should be carefully distributed and bound to each of the
 	individual CPUs for optimal performance.
 
-	The value shown in "cpuset.cpus.effective" of a partition root
-	is the CPUs that the partition root can dedicate to a potential
-	new child partition root. The new child subtracts available
-	CPUs from its parent "cpuset.cpus.effective".
-
 	A partition root ("root" or "isolated") can be in one of the
 	two possible states - valid or invalid.  An invalid partition
 	root is in a degraded state where some state information may
@@ -2281,37 +2346,33 @@ Cpuset Interface Files
 	In the case of an invalid partition root, a descriptive string on
 	why the partition is invalid is included within parentheses.
 
-	For a partition root to become valid, the following conditions
+	For a local partition root to be valid, the following conditions
 	must be met.
 
-	1) The "cpuset.cpus" is exclusive with its siblings , i.e. they
-	   are not shared by any of its siblings (exclusivity rule).
-	2) The parent cgroup is a valid partition root.
-	3) The "cpuset.cpus" is not empty and must contain at least
-	   one of the CPUs from parent's "cpuset.cpus", i.e. they overlap.
-	4) The "cpuset.cpus.effective" cannot be empty unless there is
+	1) The parent cgroup is a valid partition root.
+	2) The "cpuset.cpus.exclusive.effective" file cannot be empty,
+	   though it may contain offline CPUs.
+	3) The "cpuset.cpus.effective" cannot be empty unless there is
 	   no task associated with this partition.
 
-	External events like hotplug or changes to "cpuset.cpus" can
-	cause a valid partition root to become invalid and vice versa.
-	Note that a task cannot be moved to a cgroup with empty
-	"cpuset.cpus.effective".
+	For a remote partition root to be valid, all the above conditions
+	except the first one must be met.
 
-	For a valid partition root with the sibling cpu exclusivity
-	rule enabled, changes made to "cpuset.cpus" that violate the
-	exclusivity rule will invalidate the partition as well as its
-	sibling partitions with conflicting cpuset.cpus values. So
-	care must be taking in changing "cpuset.cpus".
+	External events like hotplug or changes to "cpuset.cpus" or
+	"cpuset.cpus.exclusive" can cause a valid partition root to
+	become invalid and vice versa.	Note that a task cannot be
+	moved to a cgroup with empty "cpuset.cpus.effective".
 
 	A valid non-root parent partition may distribute out all its CPUs
-	to its child partitions when there is no task associated with it.
+	to its child local partitions when there is no task associated
+	with it.
 
-	Care must be taken to change a valid partition root to
-	"member" as all its child partitions, if present, will become
+	Care must be taken to change a valid partition root to "member"
+	as all its child local partitions, if present, will become
 	invalid causing disruption to tasks running in those child
 	partitions. These inactivated partitions could be recovered if
 	their parent is switched back to a partition root with a proper
-	set of "cpuset.cpus".
+	value in "cpuset.cpus" or "cpuset.cpus.exclusive".
 
 	Poll and inotify events are triggered whenever the state of
 	"cpuset.cpus.partition" changes.  That includes changes caused
@@ -2321,6 +2382,11 @@ Cpuset Interface Files
 	to "cpuset.cpus.partition" without the need to do continuous
 	polling.
 
+	A user can pre-configure certain CPUs to an isolated state
+	with load balancing disabled at boot time with the "isolcpus"
+	kernel boot command line option.  If those CPUs are to be put
+	into a partition, they have to be used in an isolated partition.
+
 
 Device controller
 -----------------
diff --git a/Documentation/admin-guide/hw-vuln/srso.rst b/Documentation/admin-guide/hw-vuln/srso.rst
index b6cfb51cb0b4..e715bfc09879 100644
--- a/Documentation/admin-guide/hw-vuln/srso.rst
+++ b/Documentation/admin-guide/hw-vuln/srso.rst
@@ -46,12 +46,22 @@ The possible values in this file are:
 
    The processor is not vulnerable
 
- * 'Vulnerable: no microcode':
+ * 'Vulnerable':
+
+   The processor is vulnerable and no mitigations have been applied.
+
+ * 'Vulnerable: No microcode':
 
    The processor is vulnerable, no microcode extending IBPB
    functionality to address the vulnerability has been applied.
 
- * 'Mitigation: microcode':
+ * 'Vulnerable: Safe RET, no microcode':
+
+   The "Safe RET" mitigation (see below) has been applied to protect the
+   kernel, but the IBPB-extending microcode has not been applied.  User
+   space tasks may still be vulnerable.
+
+ * 'Vulnerable: Microcode, no safe RET':
 
    Extended IBPB functionality microcode patch has been applied. It does
    not address User->Kernel and Guest->Host transitions protection but it
@@ -72,11 +82,11 @@ The possible values in this file are:
 
    (spec_rstack_overflow=microcode)
 
- * 'Mitigation: safe RET':
+ * 'Mitigation: Safe RET':
 
-   Software-only mitigation. It complements the extended IBPB microcode
-   patch functionality by addressing User->Kernel and Guest->Host
-   transitions protection.
+   Combined microcode/software mitigation. It complements the
+   extended IBPB microcode patch functionality by addressing
+   User->Kernel and Guest->Host transitions protection.
 
    Selected by default or by spec_rstack_overflow=safe-ret
 
@@ -129,7 +139,7 @@ an indrect branch prediction barrier after having applied the required
 microcode patch for one's system. This mitigation comes also at
 a performance cost.
 
-Mitigation: safe RET
+Mitigation: Safe RET
 --------------------
 
 The mitigation works by ensuring all RET instructions speculate to
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 0a1731a0f0ef..758bb25ea3e6 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -580,6 +580,10 @@
 			named mounts. Specifying both "all" and "named" disables
 			all v1 hierarchies.
 
+	cgroup_favordynmods= [KNL] Enable or disable favordynmods.
+			Format: { "true" | "false" }
+			Defaults to the value of CONFIG_CGROUP_FAVOR_DYNMODS.
+
 	cgroup.memory=	[KNL] Pass options to the cgroup memory controller.
 			Format: <string>
 			nosocket -- Disable socket memory accounting.
@@ -1893,6 +1897,12 @@
 			 0 -- machine default
 			 1 -- force brightness inversion
 
+	ia32_emulation=	[X86-64]
+			Format: <bool>
+			When true, allows loading 32-bit programs and executing 32-bit
+			syscalls, essentially overriding IA32_EMULATION_DEFAULT_DISABLED at
+			boot time. When false, unconditionally disables IA32 emulation.
+
 	icn=		[HW,ISDN]
 			Format: <io>[,<membase>[,<icn_id>[,<icn_id2>]]]
 
@@ -2913,6 +2923,38 @@
 			to extract confidential information from the kernel
 			are also disabled.
 
+	locktorture.acq_writer_lim= [KNL]
+			Set the time limit in jiffies for a lock
+			acquisition.  Acquisitions exceeding this limit
+			will result in a splat once they do complete.
+
+	locktorture.bind_readers= [KNL]
+			Specify the list of CPUs to which the readers are
+			to be bound.
+
+	locktorture.bind_writers= [KNL]
+			Specify the list of CPUs to which the writers are
+			to be bound.
+
+	locktorture.call_rcu_chains= [KNL]
+			Specify the number of self-propagating call_rcu()
+			chains to set up.  These are used to ensure that
+			there is a high probability of an RCU grace period
+			in progress at any given time.	Defaults to 0,
+			which disables these call_rcu() chains.
+
+	locktorture.long_hold= [KNL]
+			Specify the duration in milliseconds for the
+			occasional long-duration lock hold time.  Defaults
+			to 100 milliseconds.  Select 0 to disable.
+
+	locktorture.nested_locks= [KNL]
+			Specify the maximum lock nesting depth that
+			locktorture is to exercise, up to a limit of 8
+			(MAX_NESTED_LOCKS).  Specify zero to disable.
+			Note that this parameter is ineffective on types
+			of locks that do not support nested acquisition.
+
 	locktorture.nreaders_stress= [KNL]
 			Set the number of locking read-acquisition kthreads.
 			Defaults to being automatically set based on the
@@ -2928,6 +2970,25 @@
 			Set time (s) between CPU-hotplug operations, or
 			zero to disable CPU-hotplug testing.
 
+	locktorture.rt_boost= [KNL]
+			Do periodic testing of real-time lock priority
+			boosting.  Select 0 to disable, 1 to boost
+			only rt_mutex, and 2 to boost unconditionally.
+			Defaults to 2, which might seem to be an
+			odd choice, but which should be harmless for
+			non-real-time spinlocks, due to their disabling
+			of preemption.	Note that non-realtime mutexes
+			disable boosting.
+
+	locktorture.rt_boost_factor= [KNL]
+			Number that determines how often and for how
+			long priority boosting is exercised.  This is
+			scaled down by the number of writers, so that the
+			number of boosts per unit time remains roughly
+			constant as the number of writers increases.
+			On the other hand, the duration of each boost
+			increases with the number of writers.
+
 	locktorture.shuffle_interval= [KNL]
 			Set task-shuffle interval (jiffies).  Shuffling
 			tasks allows some CPUs to go into dyntick-idle
@@ -2950,13 +3011,13 @@
 	locktorture.torture_type= [KNL]
 			Specify the locking implementation to test.
 
+	locktorture.verbose= [KNL]
+			Enable additional printk() statements.
+
 	locktorture.writer_fifo= [KNL]
 			Run the write-side locktorture kthreads at
 			sched_set_fifo() real-time priority.
 
-	locktorture.verbose= [KNL]
-			Enable additional printk() statements.
-
 	logibm.irq=	[HW,MOUSE] Logitech Bus Mouse Driver
 			Format: <irq>
 
@@ -4769,6 +4830,13 @@
 			Set maximum number of finished RCU callbacks to
 			process in one batch.
 
+	rcutree.do_rcu_barrier=	[KNL]
+			Request a call to rcu_barrier().  This is
+			throttled so that userspace tests can safely
+			hammer on the sysfs variable if they so choose.
+			If triggered before the RCU grace-period machinery
+			is fully active, this will error out with EAGAIN.
+
 	rcutree.dump_tree=	[KNL]
 			Dump the structure of the rcu_node combining tree
 			out at early boot.  This is used for diagnostic
@@ -5422,6 +5490,12 @@
 			test until boot completes in order to avoid
 			interference.
 
+	refscale.lookup_instances= [KNL]
+			Number of data elements to use for the forms of
+			SLAB_TYPESAFE_BY_RCU testing.  A negative number
+			is negated and multiplied by nr_cpu_ids, while
+			zero specifies nr_cpu_ids.
+
 	refscale.loops= [KNL]
 			Set the number of loops over the synchronization
 			primitive under test.  Increasing this number
@@ -5858,6 +5932,13 @@
 			This feature may be more efficiently disabled
 			using the csdlock_debug- kernel parameter.
 
+	smp.panic_on_ipistall= [KNL]
+			If a csd_lock_timeout extends for more than
+			the specified number of milliseconds, panic the
+			system.  By default, let CSD-lock acquisitions
+			take as long as they take.  Specifying 300,000
+			for this value provides a 5-minute timeout.
+
 	smsc-ircc2.nopnp	[HW] Don't use PNP to discover SMC devices
 	smsc-ircc2.ircc_cfg=	[HW] Device configuration I/O port
 	smsc-ircc2.ircc_sir=	[HW] SIR base I/O port
diff --git a/Documentation/admin-guide/pm/intel_idle.rst b/Documentation/admin-guide/pm/intel_idle.rst
index b799a43da62e..39bd6ecce7de 100644
--- a/Documentation/admin-guide/pm/intel_idle.rst
+++ b/Documentation/admin-guide/pm/intel_idle.rst
@@ -170,7 +170,7 @@ and ``idle=nomwait``.  If any of them is present in the kernel command line, the
 ``MWAIT`` instruction is not allowed to be used, so the initialization of
 ``intel_idle`` will fail.
 
-Apart from that there are four module parameters recognized by ``intel_idle``
+Apart from that there are five module parameters recognized by ``intel_idle``
 itself that can be set via the kernel command line (they cannot be updated via
 sysfs, so that is the only way to change their values).
 
@@ -216,6 +216,21 @@ are ignored).
 The idle states disabled this way can be enabled (on a per-CPU basis) from user
 space via ``sysfs``.
 
+The ``ibrs_off`` module parameter is a boolean flag (defaults to
+false). If set, it is used to control if IBRS (Indirect Branch Restricted
+Speculation) should be turned off when the CPU enters an idle state.
+This flag does not affect CPUs that use Enhanced IBRS which can remain
+on with little performance impact.
+
+For some CPUs, IBRS will be selected as mitigation for Spectre v2 and Retbleed
+security vulnerabilities by default.  Leaving the IBRS mode on while idling may
+have a performance impact on its sibling CPU.  The IBRS mode will be turned off
+by default when the CPU enters into a deep idle state, but not in some
+shallower ones.  Setting the ``ibrs_off`` module parameter will force the IBRS
+mode to off when the CPU is in any one of the available idle states.  This may
+help performance of a sibling CPU at the expense of a slightly higher wakeup
+latency for the idle CPU.
+
 
 .. _intel-idle-core-and-package-idle-states:
 
diff --git a/Documentation/admin-guide/pstore-blk.rst b/Documentation/admin-guide/pstore-blk.rst
index 2d22ead9520e..1bb2a1c292aa 100644
--- a/Documentation/admin-guide/pstore-blk.rst
+++ b/Documentation/admin-guide/pstore-blk.rst
@@ -76,7 +76,7 @@ kmsg_size
 ~~~~~~~~~
 
 The chunk size in KB for oops/panic front-end. It **MUST** be a multiple of 4.
-It's optional if you do not care oops/panic log.
+It's optional if you do not care about the oops/panic log.
 
 There are multiple chunks for oops/panic front-end depending on the remaining
 space except other pstore front-ends.
@@ -88,7 +88,7 @@ pmsg_size
 ~~~~~~~~~
 
 The chunk size in KB for pmsg front-end. It **MUST** be a multiple of 4.
-It's optional if you do not care pmsg log.
+It's optional if you do not care about the pmsg log.
 
 Unlike oops/panic front-end, there is only one chunk for pmsg front-end.
 
@@ -100,7 +100,7 @@ console_size
 ~~~~~~~~~~~~
 
 The chunk size in KB for console front-end.  It **MUST** be a multiple of 4.
-It's optional if you do not care console log.
+It's optional if you do not care about the console log.
 
 Similar to pmsg front-end, there is only one chunk for console front-end.
 
@@ -111,7 +111,7 @@ ftrace_size
 ~~~~~~~~~~~
 
 The chunk size in KB for ftrace front-end. It **MUST** be a multiple of 4.
-It's optional if you do not care console log.
+It's optional if you do not care about the ftrace log.
 
 Similar to oops front-end, there are multiple chunks for ftrace front-end
 depending on the count of cpu processors. Each chunk size is equal to
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index cf33de56da27..d89ac2bd8dc4 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -1182,7 +1182,8 @@ automatically on platforms where it can run (that is,
 platforms with asymmetric CPU topologies and having an Energy
 Model available). If your platform happens to meet the
 requirements for EAS but you do not want to use it, change
-this value to 0.
+this value to 0. On non-EAS platforms, write operations fail and reads
+return nothing.
 
 task_delayacct
 ===============
diff --git a/Documentation/arch/x86/amd-memory-encryption.rst b/Documentation/arch/x86/amd-memory-encryption.rst
index 934310ce7258..07caa8fff852 100644
--- a/Documentation/arch/x86/amd-memory-encryption.rst
+++ b/Documentation/arch/x86/amd-memory-encryption.rst
@@ -130,4 +130,4 @@ SNP feature support.
 
 More details in AMD64 APM[1] Vol 2: 15.34.10 SEV_STATUS MSR
 
-[1] https://www.amd.com/system/files/TechDocs/40332.pdf
+[1] https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/programmer-references/24593.pdf
diff --git a/Documentation/arch/x86/iommu.rst b/Documentation/arch/x86/iommu.rst
index 42c7a6faa39a..41fbadfe2221 100644
--- a/Documentation/arch/x86/iommu.rst
+++ b/Documentation/arch/x86/iommu.rst
@@ -5,7 +5,7 @@ x86 IOMMU Support
 The architecture specs can be obtained from the below locations.
 
 - Intel: http://www.intel.com/content/dam/www/public/us/en/documents/product-specifications/vt-directed-io-spec.pdf
-- AMD: https://www.amd.com/system/files/TechDocs/48882_IOMMU.pdf
+- AMD: https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/specifications/48882_3_07_PUB.pdf
 
 This guide gives a quick cheat sheet for some basic understanding.
 
diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst
index cb05d90111b4..a6279df64a9d 100644
--- a/Documentation/arch/x86/resctrl.rst
+++ b/Documentation/arch/x86/resctrl.rst
@@ -35,7 +35,7 @@ about the feature from resctrl's info directory.
 
 To use the feature mount the file system::
 
- # mount -t resctrl resctrl [-o cdp[,cdpl2][,mba_MBps]] /sys/fs/resctrl
+ # mount -t resctrl resctrl [-o cdp[,cdpl2][,mba_MBps][,debug]] /sys/fs/resctrl
 
 mount options are:
 
@@ -46,6 +46,9 @@ mount options are:
 "mba_MBps":
 	Enable the MBA Software Controller(mba_sc) to specify MBA
 	bandwidth in MBps
+"debug":
+	Make debug files accessible. Available debug files are annotated with
+	"Available only with debug option".
 
 L2 and L3 CDP are controlled separately.
 
@@ -124,6 +127,13 @@ related to allocation:
 			"P":
 			      Corresponding region is pseudo-locked. No
 			      sharing allowed.
+"sparse_masks":
+		Indicates if non-contiguous 1s value in CBM is supported.
+
+			"0":
+			      Only contiguous 1s value in CBM is supported.
+			"1":
+			      Non-contiguous 1s value in CBM is supported.
 
 Memory bandwidth(MB) subdirectory contains the following files
 with respect to allocation:
@@ -299,7 +309,14 @@ All groups contain the following files:
 "tasks":
 	Reading this file shows the list of all tasks that belong to
 	this group. Writing a task id to the file will add a task to the
-	group. If the group is a CTRL_MON group the task is removed from
+	group. Multiple tasks can be added by separating the task ids
+	with commas. Tasks will be assigned sequentially. Multiple
+	failures are not supported. A single failure encountered while
+	attempting to assign a task will cause the operation to abort and
+	already added tasks before the failure will remain in the group.
+	Failures will be logged to /sys/fs/resctrl/info/last_cmd_status.
+
+	If the group is a CTRL_MON group the task is removed from
 	whichever previous CTRL_MON group owned the task and also from
 	any MON group that owned the task. If the group is a MON group,
 	then the task must already belong to the CTRL_MON parent of this
@@ -342,6 +359,10 @@ When control is enabled all CTRL_MON groups will also contain:
 	file. On successful pseudo-locked region creation the mode will
 	automatically change to "pseudo-locked".
 
+"ctrl_hw_id":
+	Available only with debug option. The identifier used by hardware
+	for the control group. On x86 this is the CLOSID.
+
 When monitoring is enabled all MON groups will also contain:
 
 "mon_data":
@@ -355,6 +376,10 @@ When monitoring is enabled all MON groups will also contain:
 	the sum for all tasks in the CTRL_MON group and all tasks in
 	MON groups. Please see example section for more details on usage.
 
+"mon_hw_id":
+	Available only with debug option. The identifier used by hardware
+	for the monitor group. On x86 this is the RMID.
+
 Resource allocation rules
 -------------------------
 
@@ -445,12 +470,13 @@ For cache resources we describe the portion of the cache that is available
 for allocation using a bitmask. The maximum value of the mask is defined
 by each cpu model (and may be different for different cache levels). It
 is found using CPUID, but is also provided in the "info" directory of
-the resctrl file system in "info/{resource}/cbm_mask". Intel hardware
+the resctrl file system in "info/{resource}/cbm_mask". Some Intel hardware
 requires that these masks have all the '1' bits in a contiguous block. So
 0x3, 0x6 and 0xC are legal 4-bit masks with two bits set, but 0x5, 0x9
-and 0xA are not.  On a system with a 20-bit mask each bit represents 5%
-of the capacity of the cache. You could partition the cache into four
-equal parts with masks: 0x1f, 0x3e0, 0x7c00, 0xf8000.
+and 0xA are not. Check /sys/fs/resctrl/info/{resource}/sparse_masks
+if non-contiguous 1s value is supported. On a system with a 20-bit mask
+each bit represents 5% of the capacity of the cache. You could partition
+the cache into four equal parts with masks: 0x1f, 0x3e0, 0x7c00, 0xf8000.
 
 Memory bandwidth Allocation and monitoring
 ==========================================
diff --git a/Documentation/arch/x86/topology.rst b/Documentation/arch/x86/topology.rst
index 7f58010ea86a..08ebf9edbfc1 100644
--- a/Documentation/arch/x86/topology.rst
+++ b/Documentation/arch/x86/topology.rst
@@ -55,19 +55,19 @@ Package-related topology information in the kernel:
 
     The number of dies in a package. This information is retrieved via CPUID.
 
-  - cpuinfo_x86.cpu_die_id:
+  - cpuinfo_x86.topo.die_id:
 
     The physical ID of the die. This information is retrieved via CPUID.
 
-  - cpuinfo_x86.phys_proc_id:
+  - cpuinfo_x86.topo.pkg_id:
 
     The physical ID of the package. This information is retrieved via CPUID
     and deduced from the APIC IDs of the cores in the package.
 
     Modern systems use this value for the socket. There may be multiple
-    packages within a socket. This value may differ from cpu_die_id.
+    packages within a socket. This value may differ from topo.die_id.
 
-  - cpuinfo_x86.logical_proc_id:
+  - cpuinfo_x86.topo.logical_pkg_id:
 
     The logical ID of the package. As we do not trust BIOSes to enumerate the
     packages in a consistent way, we introduced the concept of logical package
@@ -79,9 +79,7 @@ Package-related topology information in the kernel:
     The maximum possible number of packages in the system. Helpful for per
     package facilities to preallocate per package information.
 
-  - cpu_llc_id:
-
-    A per-CPU variable containing:
+  - cpuinfo_x86.topo.llc_id:
 
       - On Intel, the first APIC ID of the list of CPUs sharing the Last Level
         Cache
diff --git a/Documentation/devicetree/bindings/iio/addac/adi,ad74115.yaml b/Documentation/devicetree/bindings/iio/addac/adi,ad74115.yaml
index 2594fa192f93..2a04906531fb 100644
--- a/Documentation/devicetree/bindings/iio/addac/adi,ad74115.yaml
+++ b/Documentation/devicetree/bindings/iio/addac/adi,ad74115.yaml
@@ -32,7 +32,8 @@ properties:
 
   spi-cpol: true
 
-  reset-gpios: true
+  reset-gpios:
+    maxItems: 1
 
   interrupts:
     minItems: 1
diff --git a/Documentation/devicetree/bindings/iio/dac/adi,ad5758.yaml b/Documentation/devicetree/bindings/iio/dac/adi,ad5758.yaml
index 4e508bfcc9d8..5121685337b5 100644
--- a/Documentation/devicetree/bindings/iio/dac/adi,ad5758.yaml
+++ b/Documentation/devicetree/bindings/iio/dac/adi,ad5758.yaml
@@ -78,7 +78,8 @@ properties:
           - const: -1000
           - const: 22000
 
-  reset-gpios: true
+  reset-gpios:
+    maxItems: 1
 
   adi,dc-dc-ilim-microamp:
     enum: [150000, 200000, 250000, 300000, 350000, 400000]
diff --git a/Documentation/devicetree/bindings/iio/health/ti,afe4403.yaml b/Documentation/devicetree/bindings/iio/health/ti,afe4403.yaml
index b9b5beac33b2..5b6cde86b5a5 100644
--- a/Documentation/devicetree/bindings/iio/health/ti,afe4403.yaml
+++ b/Documentation/devicetree/bindings/iio/health/ti,afe4403.yaml
@@ -23,7 +23,8 @@ properties:
     maxItems: 1
     description: Connected to ADC_RDY pin.
 
-  reset-gpios: true
+  reset-gpios:
+    maxItems: 1
 
 required:
   - compatible
diff --git a/Documentation/devicetree/bindings/iio/health/ti,afe4404.yaml b/Documentation/devicetree/bindings/iio/health/ti,afe4404.yaml
index 2958c4ca75b4..167d10bd60af 100644
--- a/Documentation/devicetree/bindings/iio/health/ti,afe4404.yaml
+++ b/Documentation/devicetree/bindings/iio/health/ti,afe4404.yaml
@@ -23,7 +23,8 @@ properties:
     maxItems: 1
     description: Connected to ADC_RDY pin.
 
-  reset-gpios: true
+  reset-gpios:
+    maxItems: 1
 
 additionalProperties: false
 
diff --git a/Documentation/devicetree/bindings/memory-controllers/xlnx,versal-ddrmc-edac.yaml b/Documentation/devicetree/bindings/memory-controllers/xlnx,versal-ddrmc-edac.yaml
new file mode 100644
index 000000000000..12f8e9f350bc
--- /dev/null
+++ b/Documentation/devicetree/bindings/memory-controllers/xlnx,versal-ddrmc-edac.yaml
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/memory-controllers/xlnx,versal-ddrmc-edac.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Xilinx Versal DDRMC (Integrated DDR Memory Controller)
+
+maintainers:
+  - Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
+  - Sai Krishna Potthuri <sai.krishna.potthuri@amd.com>
+
+description:
+  The integrated DDR Memory Controllers (DDRMCs) support both DDR4 and LPDDR4/
+  4X memory interfaces. Versal DDR memory controller has an optional ECC support
+  which corrects single bit ECC errors and detects double bit ECC errors.
+
+properties:
+  compatible:
+    const: xlnx,versal-ddrmc
+
+  reg:
+    items:
+      - description: DDR Memory Controller registers
+      - description: NOC registers corresponding to DDR Memory Controller
+
+  reg-names:
+    items:
+      - const: base
+      - const: noc
+
+  interrupts:
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+  - reg-names
+  - interrupts
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+    bus {
+      #address-cells = <2>;
+      #size-cells = <2>;
+      memory-controller@f6150000 {
+        compatible = "xlnx,versal-ddrmc";
+        reg = <0x0 0xf6150000 0x0 0x2000>, <0x0 0xf6070000 0x0 0x20000>;
+        reg-names = "base", "noc";
+        interrupt-parent = <&gic>;
+        interrupts = <GIC_SPI 147 IRQ_TYPE_LEVEL_HIGH>;
+      };
+    };
diff --git a/Documentation/devicetree/bindings/timer/cirrus,ep9301-timer.yaml b/Documentation/devicetree/bindings/timer/cirrus,ep9301-timer.yaml
new file mode 100644
index 000000000000..e463e11e259d
--- /dev/null
+++ b/Documentation/devicetree/bindings/timer/cirrus,ep9301-timer.yaml
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/timer/cirrus,ep9301-timer.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Cirrus Logic EP93xx timer
+
+maintainers:
+  - Alexander Sverdlin <alexander.sverdlin@gmail.com>
+  - Nikita Shubin <nikita.shubin@maquefel.me>
+
+properties:
+  compatible:
+    oneOf:
+      - const: cirrus,ep9301-timer
+      - items:
+          - enum:
+              - cirrus,ep9302-timer
+              - cirrus,ep9307-timer
+              - cirrus,ep9312-timer
+              - cirrus,ep9315-timer
+          - const: cirrus,ep9301-timer
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  resets:
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+  - interrupts
+
+additionalProperties: false
+
+examples:
+  - |
+    timer@80810000 {
+      compatible = "cirrus,ep9301-timer";
+      reg = <0x80810000 0x100>;
+      interrupt-parent = <&vic1>;
+      interrupts = <19>;
+    };
+...
diff --git a/Documentation/devicetree/bindings/timer/renesas,rz-mtu3.yaml b/Documentation/devicetree/bindings/timer/renesas,rz-mtu3.yaml
index bffdab0b0185..3931054b42fb 100644
--- a/Documentation/devicetree/bindings/timer/renesas,rz-mtu3.yaml
+++ b/Documentation/devicetree/bindings/timer/renesas,rz-mtu3.yaml
@@ -11,8 +11,8 @@ maintainers:
 
 description: |
   This hardware block consists of eight 16-bit timer channels and one
-  32- bit timer channel. It supports the following specifications:
-    - Pulse input/output: 28 lines max.
+  32-bit timer channel. It supports the following specifications:
+    - Pulse input/output: 28 lines max
     - Pulse input 3 lines
     - Count clock 11 clocks for each channel (14 clocks for MTU0, 12 clocks
       for MTU2, and 10 clocks for MTU5, four clocks for MTU1-MTU2 combination
@@ -23,11 +23,11 @@ description: |
         - Input capture function (noise filter setting available)
         - Counter-clearing operation
         - Simultaneous writing to multiple timer counters (TCNT)
-          (excluding MTU8).
+          (excluding MTU8)
         - Simultaneous clearing on compare match or input capture
-          (excluding MTU8).
+          (excluding MTU8)
         - Simultaneous input and output to registers in synchronization with
-          counter operations           (excluding MTU8).
+          counter operations (excluding MTU8)
         - Up to 12-phase PWM output in combination with synchronous operation
           (excluding MTU8)
     - [MTU0 MTU3, MTU4, MTU6, MTU7, and MTU8]
@@ -40,26 +40,26 @@ description: |
     - [MTU3, MTU4, MTU6, and MTU7]
         - Through interlocked operation of MTU3/4 and MTU6/7, the positive and
           negative signals in six phases (12 phases in total) can be output in
-          complementary PWM and reset-synchronized PWM operation.
+          complementary PWM and reset-synchronized PWM operation
         - In complementary PWM mode, values can be transferred from buffer
           registers to temporary registers at crests and troughs of the timer-
           counter values or when the buffer registers (TGRD registers in MTU4
-          and MTU7) are written to.
-        - Double-buffering selectable in complementary PWM mode.
+          and MTU7) are written to
+        - Double-buffering selectable in complementary PWM mode
     - [MTU3 and MTU4]
         - Through interlocking with MTU0, a mode for driving AC synchronous
           motors (brushless DC motors) by using complementary PWM output and
           reset-synchronized PWM output is settable and allows the selection
-          of two types of waveform output (chopping or level).
+          of two types of waveform output (chopping or level)
     - [MTU5]
-        - Capable of operation as a dead-time compensation counter.
+        - Capable of operation as a dead-time compensation counter
     - [MTU0/MTU5, MTU1, MTU2, and MTU8]
         - 32-bit phase counting mode specifiable by combining MTU1 and MTU2 and
-          through interlocked operation with MTU0/MTU5 and MTU8.
+          through interlocked operation with MTU0/MTU5 and MTU8
     - Interrupt-skipping function
         - In complementary PWM mode, interrupts on crests and troughs of counter
           values and triggers to start conversion by the A/D converter can be
-          skipped.
+          skipped
     - Interrupt sources: 43 sources.
     - Buffer operation:
         - Automatic transfer of register data (transfer from the buffer
@@ -68,9 +68,9 @@ description: |
         - A/D converter start triggers can be generated
         - A/D converter start request delaying function enables A/D converter
           to be started with any desired timing and to be synchronized with
-          PWM output.
+          PWM output
     - Low power consumption function
-        - The MTU3a can be placed in the module-stop state.
+        - The MTU3a can be placed in the module-stop state
 
     There are two phase counting modes. 16-bit phase counting mode in which
     MTU1 and MTU2 operate independently, and cascade connection 32-bit phase
@@ -109,6 +109,7 @@ properties:
   compatible:
     items:
       - enum:
+          - renesas,r9a07g043-mtu3  # RZ/{G2UL,Five}
           - renesas,r9a07g044-mtu3  # RZ/G2{L,LC}
           - renesas,r9a07g054-mtu3  # RZ/V2L
       - const: renesas,rz-mtu3
@@ -169,27 +170,27 @@ properties:
       - const: tgib0
       - const: tgic0
       - const: tgid0
-      - const: tgiv0
+      - const: tciv0
       - const: tgie0
       - const: tgif0
       - const: tgia1
       - const: tgib1
-      - const: tgiv1
-      - const: tgiu1
+      - const: tciv1
+      - const: tciu1
       - const: tgia2
       - const: tgib2
-      - const: tgiv2
-      - const: tgiu2
+      - const: tciv2
+      - const: tciu2
       - const: tgia3
       - const: tgib3
       - const: tgic3
       - const: tgid3
-      - const: tgiv3
+      - const: tciv3
       - const: tgia4
       - const: tgib4
       - const: tgic4
       - const: tgid4
-      - const: tgiv4
+      - const: tciv4
       - const: tgiu5
       - const: tgiv5
       - const: tgiw5
@@ -197,18 +198,18 @@ properties:
       - const: tgib6
       - const: tgic6
       - const: tgid6
-      - const: tgiv6
+      - const: tciv6
       - const: tgia7
       - const: tgib7
       - const: tgic7
       - const: tgid7
-      - const: tgiv7
+      - const: tciv7
       - const: tgia8
       - const: tgib8
       - const: tgic8
       - const: tgid8
-      - const: tgiv8
-      - const: tgiu8
+      - const: tciv8
+      - const: tciu8
 
   clocks:
     maxItems: 1
@@ -285,16 +286,16 @@ examples:
                    <GIC_SPI 211 IRQ_TYPE_EDGE_RISING>,
                    <GIC_SPI 212 IRQ_TYPE_EDGE_RISING>,
                    <GIC_SPI 213 IRQ_TYPE_EDGE_RISING>;
-      interrupt-names = "tgia0", "tgib0", "tgic0", "tgid0", "tgiv0", "tgie0",
+      interrupt-names = "tgia0", "tgib0", "tgic0", "tgid0", "tciv0", "tgie0",
                         "tgif0",
-                        "tgia1", "tgib1", "tgiv1", "tgiu1",
-                        "tgia2", "tgib2", "tgiv2", "tgiu2",
-                        "tgia3", "tgib3", "tgic3", "tgid3", "tgiv3",
-                        "tgia4", "tgib4", "tgic4", "tgid4", "tgiv4",
+                        "tgia1", "tgib1", "tciv1", "tciu1",
+                        "tgia2", "tgib2", "tciv2", "tciu2",
+                        "tgia3", "tgib3", "tgic3", "tgid3", "tciv3",
+                        "tgia4", "tgib4", "tgic4", "tgid4", "tciv4",
                         "tgiu5", "tgiv5", "tgiw5",
-                        "tgia6", "tgib6", "tgic6", "tgid6", "tgiv6",
-                        "tgia7", "tgib7", "tgic7", "tgid7", "tgiv7",
-                        "tgia8", "tgib8", "tgic8", "tgid8", "tgiv8", "tgiu8";
+                        "tgia6", "tgib6", "tgic6", "tgid6", "tciv6",
+                        "tgia7", "tgib7", "tgic7", "tgid7", "tciv7",
+                        "tgia8", "tgib8", "tgic8", "tgid8", "tciv8", "tciu8";
       clocks = <&cpg CPG_MOD R9A07G044_MTU_X_MCK_MTU3>;
       power-domains = <&cpg>;
       resets = <&cpg R9A07G044_MTU_X_PRESET_MTU3>;
diff --git a/Documentation/filesystems/files.rst b/Documentation/filesystems/files.rst
index bcf84459917f..9e38e4c221ca 100644
--- a/Documentation/filesystems/files.rst
+++ b/Documentation/filesystems/files.rst
@@ -62,7 +62,7 @@ the fdtable structure -
    be held.
 
 4. To look up the file structure given an fd, a reader
-   must use either lookup_fd_rcu() or files_lookup_fd_rcu() APIs. These
+   must use either lookup_fdget_rcu() or files_lookup_fdget_rcu() APIs. These
    take care of barrier requirements due to lock-free lookup.
 
    An example::
@@ -70,43 +70,22 @@ the fdtable structure -
 	struct file *file;
 
 	rcu_read_lock();
-	file = lookup_fd_rcu(fd);
-	if (file) {
-		...
-	}
-	....
+	file = lookup_fdget_rcu(fd);
 	rcu_read_unlock();
-
-5. Handling of the file structures is special. Since the look-up
-   of the fd (fget()/fget_light()) are lock-free, it is possible
-   that look-up may race with the last put() operation on the
-   file structure. This is avoided using atomic_long_inc_not_zero()
-   on ->f_count::
-
-	rcu_read_lock();
-	file = files_lookup_fd_rcu(files, fd);
 	if (file) {
-		if (atomic_long_inc_not_zero(&file->f_count))
-			*fput_needed = 1;
-		else
-		/* Didn't get the reference, someone's freed */
-			file = NULL;
+		...
+                fput(file);
 	}
-	rcu_read_unlock();
 	....
-	return file;
-
-   atomic_long_inc_not_zero() detects if refcounts is already zero or
-   goes to zero during increment. If it does, we fail
-   fget()/fget_light().
 
-6. Since both fdtable and file structures can be looked up
+5. Since both fdtable and file structures can be looked up
    lock-free, they must be installed using rcu_assign_pointer()
    API. If they are looked up lock-free, rcu_dereference()
    must be used. However it is advisable to use files_fdtable()
-   and lookup_fd_rcu()/files_lookup_fd_rcu() which take care of these issues.
+   and lookup_fdget_rcu()/files_lookup_fdget_rcu() which take care of these
+   issues.
 
-7. While updating, the fdtable pointer must be looked up while
+6. While updating, the fdtable pointer must be looked up while
    holding files->file_lock. If ->file_lock is dropped, then
    another thread expand the files thereby creating a new
    fdtable and making the earlier fdtable pointer stale.
@@ -126,3 +105,19 @@ the fdtable structure -
    Since locate_fd() can drop ->file_lock (and reacquire ->file_lock),
    the fdtable pointer (fdt) must be loaded after locate_fd().
 
+On newer kernels rcu based file lookup has been switched to rely on
+SLAB_TYPESAFE_BY_RCU instead of call_rcu(). It isn't sufficient anymore
+to just acquire a reference to the file in question under rcu using
+atomic_long_inc_not_zero() since the file might have already been
+recycled and someone else might have bumped the reference. In other
+words, callers might see reference count bumps from newer users. For
+this reason it is necessary to verify that the pointer is the same
+before and after the reference count increment. This pattern can be seen
+in get_file_rcu() and __files_get_rcu().
+
+In addition, it isn't possible to access or check fields in struct file
+without first acquiring a reference on it under rcu lookup. Not doing
+that was always very dodgy and it was only usable for non-pointer data
+in struct file. With SLAB_TYPESAFE_BY_RCU it is necessary that callers
+either first acquire a reference or they must hold the files_lock of the
+fdtable.
diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst
index a624e92f2687..1b84f818e574 100644
--- a/Documentation/filesystems/fscrypt.rst
+++ b/Documentation/filesystems/fscrypt.rst
@@ -261,9 +261,9 @@ DIRECT_KEY policies
 
 The Adiantum encryption mode (see `Encryption modes and usage`_) is
 suitable for both contents and filenames encryption, and it accepts
-long IVs --- long enough to hold both an 8-byte logical block number
-and a 16-byte per-file nonce.  Also, the overhead of each Adiantum key
-is greater than that of an AES-256-XTS key.
+long IVs --- long enough to hold both an 8-byte data unit index and a
+16-byte per-file nonce.  Also, the overhead of each Adiantum key is
+greater than that of an AES-256-XTS key.
 
 Therefore, to improve performance and save memory, for Adiantum a
 "direct key" configuration is supported.  When the user has enabled
@@ -300,8 +300,8 @@ IV_INO_LBLK_32 policies
 
 IV_INO_LBLK_32 policies work like IV_INO_LBLK_64, except that for
 IV_INO_LBLK_32, the inode number is hashed with SipHash-2-4 (where the
-SipHash key is derived from the master key) and added to the file
-logical block number mod 2^32 to produce a 32-bit IV.
+SipHash key is derived from the master key) and added to the file data
+unit index mod 2^32 to produce a 32-bit IV.
 
 This format is optimized for use with inline encryption hardware
 compliant with the eMMC v5.2 standard, which supports only 32 IV bits
@@ -451,31 +451,62 @@ acceleration is recommended:
 Contents encryption
 -------------------
 
-For file contents, each filesystem block is encrypted independently.
-Starting from Linux kernel 5.5, encryption of filesystems with block
-size less than system's page size is supported.
-
-Each block's IV is set to the logical block number within the file as
-a little endian number, except that:
-
-- With CBC mode encryption, ESSIV is also used.  Specifically, each IV
-  is encrypted with AES-256 where the AES-256 key is the SHA-256 hash
-  of the file's data encryption key.
-
-- With `DIRECT_KEY policies`_, the file's nonce is appended to the IV.
-  Currently this is only allowed with the Adiantum encryption mode.
-
-- With `IV_INO_LBLK_64 policies`_, the logical block number is limited
-  to 32 bits and is placed in bits 0-31 of the IV.  The inode number
-  (which is also limited to 32 bits) is placed in bits 32-63.
-
-- With `IV_INO_LBLK_32 policies`_, the logical block number is limited
-  to 32 bits and is placed in bits 0-31 of the IV.  The inode number
-  is then hashed and added mod 2^32.
-
-Note that because file logical block numbers are included in the IVs,
-filesystems must enforce that blocks are never shifted around within
-encrypted files, e.g. via "collapse range" or "insert range".
+For contents encryption, each file's contents is divided into "data
+units".  Each data unit is encrypted independently.  The IV for each
+data unit incorporates the zero-based index of the data unit within
+the file.  This ensures that each data unit within a file is encrypted
+differently, which is essential to prevent leaking information.
+
+Note: the encryption depending on the offset into the file means that
+operations like "collapse range" and "insert range" that rearrange the
+extent mapping of files are not supported on encrypted files.
+
+There are two cases for the sizes of the data units:
+
+* Fixed-size data units.  This is how all filesystems other than UBIFS
+  work.  A file's data units are all the same size; the last data unit
+  is zero-padded if needed.  By default, the data unit size is equal
+  to the filesystem block size.  On some filesystems, users can select
+  a sub-block data unit size via the ``log2_data_unit_size`` field of
+  the encryption policy; see `FS_IOC_SET_ENCRYPTION_POLICY`_.
+
+* Variable-size data units.  This is what UBIFS does.  Each "UBIFS
+  data node" is treated as a crypto data unit.  Each contains variable
+  length, possibly compressed data, zero-padded to the next 16-byte
+  boundary.  Users cannot select a sub-block data unit size on UBIFS.
+
+In the case of compression + encryption, the compressed data is
+encrypted.  UBIFS compression works as described above.  f2fs
+compression works a bit differently; it compresses a number of
+filesystem blocks into a smaller number of filesystem blocks.
+Therefore a f2fs-compressed file still uses fixed-size data units, and
+it is encrypted in a similar way to a file containing holes.
+
+As mentioned in `Key hierarchy`_, the default encryption setting uses
+per-file keys.  In this case, the IV for each data unit is simply the
+index of the data unit in the file.  However, users can select an
+encryption setting that does not use per-file keys.  For these, some
+kind of file identifier is incorporated into the IVs as follows:
+
+- With `DIRECT_KEY policies`_, the data unit index is placed in bits
+  0-63 of the IV, and the file's nonce is placed in bits 64-191.
+
+- With `IV_INO_LBLK_64 policies`_, the data unit index is placed in
+  bits 0-31 of the IV, and the file's inode number is placed in bits
+  32-63.  This setting is only allowed when data unit indices and
+  inode numbers fit in 32 bits.
+
+- With `IV_INO_LBLK_32 policies`_, the file's inode number is hashed
+  and added to the data unit index.  The resulting value is truncated
+  to 32 bits and placed in bits 0-31 of the IV.  This setting is only
+  allowed when data unit indices and inode numbers fit in 32 bits.
+
+The byte order of the IV is always little endian.
+
+If the user selects FSCRYPT_MODE_AES_128_CBC for the contents mode, an
+ESSIV layer is automatically included.  In this case, before the IV is
+passed to AES-128-CBC, it is encrypted with AES-256 where the AES-256
+key is the SHA-256 hash of the file's contents encryption key.
 
 Filenames encryption
 --------------------
@@ -544,7 +575,8 @@ follows::
             __u8 contents_encryption_mode;
             __u8 filenames_encryption_mode;
             __u8 flags;
-            __u8 __reserved[4];
+            __u8 log2_data_unit_size;
+            __u8 __reserved[3];
             __u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE];
     };
 
@@ -586,6 +618,29 @@ This structure must be initialized as follows:
   The DIRECT_KEY, IV_INO_LBLK_64, and IV_INO_LBLK_32 flags are
   mutually exclusive.
 
+- ``log2_data_unit_size`` is the log2 of the data unit size in bytes,
+  or 0 to select the default data unit size.  The data unit size is
+  the granularity of file contents encryption.  For example, setting
+  ``log2_data_unit_size`` to 12 causes file contents to be passed to the
+  underlying encryption algorithm (such as AES-256-XTS) in 4096-byte
+  data units, each with its own IV.
+
+  Not all filesystems support setting ``log2_data_unit_size``.  ext4
+  and f2fs support it since Linux v6.7.  On filesystems that support
+  it, the supported nonzero values are 9 through the log2 of the
+  filesystem block size, inclusively.  The default value of 0 selects
+  the filesystem block size.
+
+  The main use case for ``log2_data_unit_size`` is for selecting a
+  data unit size smaller than the filesystem block size for
+  compatibility with inline encryption hardware that only supports
+  smaller data unit sizes.  ``/sys/block/$disk/queue/crypto/`` may be
+  useful for checking which data unit sizes are supported by a
+  particular system's inline encryption hardware.
+
+  Leave this field zeroed unless you are certain you need it.  Using
+  an unnecessarily small data unit size reduces performance.
+
 - For v2 encryption policies, ``__reserved`` must be zeroed.
 
 - For v1 encryption policies, ``master_key_descriptor`` specifies how
@@ -1079,8 +1134,8 @@ The caller must zero all input fields, then fill in ``key_spec``:
 On success, 0 is returned and the kernel fills in the output fields:
 
 - ``status`` indicates whether the key is absent, present, or
-  incompletely removed.  Incompletely removed means that the master
-  secret has been removed, but some files are still in use; i.e.,
+  incompletely removed.  Incompletely removed means that removal has
+  been initiated, but some files are still in use; i.e.,
   `FS_IOC_REMOVE_ENCRYPTION_KEY`_ returned 0 but set the informational
   status flag FSCRYPT_KEY_REMOVAL_STATUS_FLAG_FILES_BUSY.
 
diff --git a/Documentation/filesystems/nfs/exporting.rst b/Documentation/filesystems/nfs/exporting.rst
index 4b30daee399a..198d805d611c 100644
--- a/Documentation/filesystems/nfs/exporting.rst
+++ b/Documentation/filesystems/nfs/exporting.rst
@@ -241,3 +241,10 @@ following flags are defined:
     all of an inode's dirty data on last close. Exports that behave this
     way should set EXPORT_OP_FLUSH_ON_CLOSE so that NFSD knows to skip
     waiting for writeback when closing such files.
+
+  EXPORT_OP_ASYNC_LOCK - Indicates that the filesystem can handle async lock
+    requests from lockd. Only set EXPORT_OP_ASYNC_LOCK if the filesystem has
+    its own ->lock() functionality, as the core posix_lock_file() implementation
+    has no async lock request handling yet. For more information about how to
+    indicate an async lock request from a ->lock() file_operations struct, see
+    fs/locks.c and the comment for the function vfs_lock_file().
diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 4d05b9862451..d69f59700a23 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -1045,3 +1045,10 @@ filesystem type is now moved to a later point when the devices are closed:
 As this is a VFS level change it has no practical consequences for filesystems
 other than that all of them must use one of the provided kill_litter_super(),
 kill_anon_super(), or kill_block_super() helpers.
+
+---
+
+**mandatory**
+
+Lock ordering has been changed so that s_umount ranks above open_mutex again.
+All places where s_umount was taken under open_mutex have been fixed up.
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 06e14efd8662..d414e145f912 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -396,6 +396,10 @@ Memory barriers come in four basic varieties:
 
 
  (2) Address-dependency barriers (historical).
+     [!] This section is marked as HISTORICAL: For more up-to-date
+     information, including how compiler transformations related to pointer
+     comparisons can sometimes cause problems, see
+     Documentation/RCU/rcu_dereference.rst.
 
      An address-dependency barrier is a weaker form of read barrier.  In the
      case where two loads are performed such that the second depends on the
@@ -556,6 +560,9 @@ There are certain things that the Linux kernel memory barriers do not guarantee:
 
 ADDRESS-DEPENDENCY BARRIERS (HISTORICAL)
 ----------------------------------------
+[!] This section is marked as HISTORICAL: For more up-to-date information,
+including how compiler transformations related to pointer comparisons can
+sometimes cause problems, see Documentation/RCU/rcu_dereference.rst.
 
 As of v4.15 of the Linux kernel, an smp_mb() was added to READ_ONCE() for
 DEC Alpha, which means that about the only people who need to pay attention
diff --git a/Documentation/netlink/specs/nfsd.yaml b/Documentation/netlink/specs/nfsd.yaml
new file mode 100644
index 000000000000..05acc73e2e33
--- /dev/null
+++ b/Documentation/netlink/specs/nfsd.yaml
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+
+name: nfsd
+protocol: genetlink
+uapi-header: linux/nfsd_netlink.h
+
+doc: NFSD configuration over generic netlink.
+
+attribute-sets:
+  -
+    name: rpc-status
+    attributes:
+      -
+        name: xid
+        type: u32
+        byte-order: big-endian
+      -
+        name: flags
+        type: u32
+      -
+        name: prog
+        type: u32
+      -
+        name: version
+        type: u8
+      -
+        name: proc
+        type: u32
+      -
+        name: service_time
+        type: s64
+      -
+        name: pad
+        type: pad
+      -
+        name: saddr4
+        type: u32
+        byte-order: big-endian
+        display-hint: ipv4
+      -
+        name: daddr4
+        type: u32
+        byte-order: big-endian
+        display-hint: ipv4
+      -
+        name: saddr6
+        type: binary
+        display-hint: ipv6
+      -
+        name: daddr6
+        type: binary
+        display-hint: ipv6
+      -
+        name: sport
+        type: u16
+        byte-order: big-endian
+      -
+        name: dport
+        type: u16
+        byte-order: big-endian
+      -
+        name: compound-ops
+        type: u32
+        multi-attr: true
+
+operations:
+  list:
+    -
+      name: rpc-status-get
+      doc: dump pending nfsd rpc
+      attribute-set: rpc-status
+      dump:
+        pre: nfsd-nl-rpc-status-get-start
+        post: nfsd-nl-rpc-status-get-done
+        reply:
+          attributes:
+            - xid
+            - flags
+            - prog
+            - version
+            - proc
+            - service_time
+            - saddr4
+            - daddr4
+            - saddr6
+            - daddr6
+            - sport
+            - dport
+            - compound-ops
diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst
index b48da698d6f2..bb96ca0f774b 100644
--- a/Documentation/process/changes.rst
+++ b/Documentation/process/changes.rst
@@ -31,7 +31,7 @@ you probably needn't concern yourself with pcmciautils.
 ====================== ===============  ========================================
 GNU C                  5.1              gcc --version
 Clang/LLVM (optional)  11.0.0           clang --version
-Rust (optional)        1.71.1           rustc --version
+Rust (optional)        1.73.0           rustc --version
 bindgen (optional)     0.65.1           bindgen --version
 GNU make               3.82             make --version
 bash                   4.2              bash --version
diff --git a/Documentation/rust/index.rst b/Documentation/rust/index.rst
index e599be2cec9b..965f2db529e0 100644
--- a/Documentation/rust/index.rst
+++ b/Documentation/rust/index.rst
@@ -6,6 +6,25 @@ Rust
 Documentation related to Rust within the kernel. To start using Rust
 in the kernel, please read the quick-start.rst guide.
 
+
+The Rust experiment
+-------------------
+
+The Rust support was merged in v6.1 into mainline in order to help in
+determining whether Rust as a language was suitable for the kernel, i.e. worth
+the tradeoffs.
+
+Currently, the Rust support is primarily intended for kernel developers and
+maintainers interested in the Rust support, so that they can start working on
+abstractions and drivers, as well as helping the development of infrastructure
+and tools.
+
+If you are an end user, please note that there are currently no in-tree
+drivers/modules suitable or intended for production use, and that the Rust
+support is still in development/experimental, especially for certain kernel
+configurations.
+
+
 .. only:: rustdoc and html
 
 	You can also browse `rustdoc documentation <rustdoc/kernel/index.html>`_.
diff --git a/Documentation/scheduler/sched-capacity.rst b/Documentation/scheduler/sched-capacity.rst
index e2c1cf743158..de414b33dd2a 100644
--- a/Documentation/scheduler/sched-capacity.rst
+++ b/Documentation/scheduler/sched-capacity.rst
@@ -39,14 +39,15 @@ per Hz, leading to::
 -------------------
 
 Two different capacity values are used within the scheduler. A CPU's
-``capacity_orig`` is its maximum attainable capacity, i.e. its maximum
-attainable performance level. A CPU's ``capacity`` is its ``capacity_orig`` to
-which some loss of available performance (e.g. time spent handling IRQs) is
-subtracted.
+``original capacity`` is its maximum attainable capacity, i.e. its maximum
+attainable performance level. This original capacity is returned by
+the function arch_scale_cpu_capacity(). A CPU's ``capacity`` is its ``original
+capacity`` from which some loss of available performance (e.g. time spent
+handling IRQs) is subtracted.
 
 Note that a CPU's ``capacity`` is solely intended to be used by the CFS class,
-while ``capacity_orig`` is class-agnostic. The rest of this document will use
-the term ``capacity`` interchangeably with ``capacity_orig`` for the sake of
+while ``original capacity`` is class-agnostic. The rest of this document will use
+the term ``capacity`` interchangeably with ``original capacity`` for the sake of
 brevity.
 
 1.3 Platform examples
diff --git a/Documentation/scheduler/sched-energy.rst b/Documentation/scheduler/sched-energy.rst
index fc853c8cc346..70e2921ef725 100644
--- a/Documentation/scheduler/sched-energy.rst
+++ b/Documentation/scheduler/sched-energy.rst
@@ -359,32 +359,9 @@ in milli-Watts or in an 'abstract scale'.
 6.3 - Energy Model complexity
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-The task wake-up path is very latency-sensitive. When the EM of a platform is
-too complex (too many CPUs, too many performance domains, too many performance
-states, ...), the cost of using it in the wake-up path can become prohibitive.
-The energy-aware wake-up algorithm has a complexity of:
-
-	C = Nd * (Nc + Ns)
-
-with: Nd the number of performance domains; Nc the number of CPUs; and Ns the
-total number of OPPs (ex: for two perf. domains with 4 OPPs each, Ns = 8).
-
-A complexity check is performed at the root domain level, when scheduling
-domains are built. EAS will not start on a root domain if its C happens to be
-higher than the completely arbitrary EM_MAX_COMPLEXITY threshold (2048 at the
-time of writing).
-
-If you really want to use EAS but the complexity of your platform's Energy
-Model is too high to be used with a single root domain, you're left with only
-two possible options:
-
-    1. split your system into separate, smaller, root domains using exclusive
-       cpusets and enable EAS locally on each of them. This option has the
-       benefit to work out of the box but the drawback of preventing load
-       balance between root domains, which can result in an unbalanced system
-       overall;
-    2. submit patches to reduce the complexity of the EAS wake-up algorithm,
-       hence enabling it to cope with larger EMs in reasonable time.
+EAS does not impose any complexity limit on the number of PDs/OPPs/CPUs but
+restricts the number of CPUs to EM_MAX_NUM_CPUS to prevent overflows during
+the energy estimation.
 
 
 6.4 - Schedutil governor
diff --git a/Documentation/scheduler/sched-rt-group.rst b/Documentation/scheduler/sched-rt-group.rst
index 655a096ec8fb..d685609ed3d7 100644
--- a/Documentation/scheduler/sched-rt-group.rst
+++ b/Documentation/scheduler/sched-rt-group.rst
@@ -39,10 +39,10 @@ Most notable:
 1.1 The problem
 ---------------
 
-Realtime scheduling is all about determinism, a group has to be able to rely on
+Real-time scheduling is all about determinism, a group has to be able to rely on
 the amount of bandwidth (eg. CPU time) being constant. In order to schedule
-multiple groups of realtime tasks, each group must be assigned a fixed portion
-of the CPU time available.  Without a minimum guarantee a realtime group can
+multiple groups of real-time tasks, each group must be assigned a fixed portion
+of the CPU time available.  Without a minimum guarantee a real-time group can
 obviously fall short. A fuzzy upper limit is of no use since it cannot be
 relied upon. Which leaves us with just the single fixed portion.
 
@@ -50,14 +50,14 @@ relied upon. Which leaves us with just the single fixed portion.
 ----------------
 
 CPU time is divided by means of specifying how much time can be spent running
-in a given period. We allocate this "run time" for each realtime group which
-the other realtime groups will not be permitted to use.
+in a given period. We allocate this "run time" for each real-time group which
+the other real-time groups will not be permitted to use.
 
-Any time not allocated to a realtime group will be used to run normal priority
+Any time not allocated to a real-time group will be used to run normal priority
 tasks (SCHED_OTHER). Any allocated run time not used will also be picked up by
 SCHED_OTHER.
 
-Let's consider an example: a frame fixed realtime renderer must deliver 25
+Let's consider an example: a frame fixed real-time renderer must deliver 25
 frames a second, which yields a period of 0.04s per frame. Now say it will also
 have to play some music and respond to input, leaving it with around 80% CPU
 time dedicated for the graphics. We can then give this group a run time of 0.8
@@ -70,7 +70,7 @@ needs only about 3% CPU time to do so, it can do with a 0.03 * 0.005s =
 of 0.00015s.
 
 The remaining CPU time will be used for user input and other tasks. Because
-realtime tasks have explicitly allocated the CPU time they need to perform
+real-time tasks have explicitly allocated the CPU time they need to perform
 their tasks, buffer underruns in the graphics or audio can be eliminated.
 
 NOTE: the above example is not fully implemented yet. We still
@@ -87,18 +87,20 @@ lack an EDF scheduler to make non-uniform periods usable.
 The system wide settings are configured under the /proc virtual file system:
 
 /proc/sys/kernel/sched_rt_period_us:
-  The scheduling period that is equivalent to 100% CPU bandwidth
+  The scheduling period that is equivalent to 100% CPU bandwidth.
 
 /proc/sys/kernel/sched_rt_runtime_us:
-  A global limit on how much time realtime scheduling may use.  Even without
-  CONFIG_RT_GROUP_SCHED enabled, this will limit time reserved to realtime
-  processes. With CONFIG_RT_GROUP_SCHED it signifies the total bandwidth
-  available to all realtime groups.
+  A global limit on how much time real-time scheduling may use. This is always
+  less than or equal to the period_us, as it denotes the time allocated from the
+  period_us for the real-time tasks. Even without CONFIG_RT_GROUP_SCHED enabled,
+  this will limit time reserved to real-time processes. With
+  CONFIG_RT_GROUP_SCHED=y it signifies the total bandwidth available to all
+  real-time groups.
 
   * Time is specified in us because the interface is s32. This gives an
     operating range from 1us to about 35 minutes.
   * sched_rt_period_us takes values from 1 to INT_MAX.
-  * sched_rt_runtime_us takes values from -1 to (INT_MAX - 1).
+  * sched_rt_runtime_us takes values from -1 to sched_rt_period_us.
   * A run time of -1 specifies runtime == period, ie. no limit.
 
 
@@ -108,7 +110,7 @@ The system wide settings are configured under the /proc virtual file system:
 The default values for sched_rt_period_us (1000000 or 1s) and
 sched_rt_runtime_us (950000 or 0.95s).  This gives 0.05s to be used by
 SCHED_OTHER (non-RT tasks). These defaults were chosen so that a run-away
-realtime tasks will not lock up the machine but leave a little time to recover
+real-time task will not lock up the machine but leave a little time to recover
 it.  By setting runtime to -1 you'd get the old behaviour back.
 
 By default all bandwidth is assigned to the root group and new groups get the
@@ -116,10 +118,10 @@ period from /proc/sys/kernel/sched_rt_period_us and a run time of 0. If you
 want to assign bandwidth to another group, reduce the root group's bandwidth
 and assign some or all of the difference to another group.
 
-Realtime group scheduling means you have to assign a portion of total CPU
-bandwidth to the group before it will accept realtime tasks. Therefore you will
-not be able to run realtime tasks as any user other than root until you have
-done that, even if the user has the rights to run processes with realtime
+Real-time group scheduling means you have to assign a portion of total CPU
+bandwidth to the group before it will accept real-time tasks. Therefore you will
+not be able to run real-time tasks as any user other than root until you have
+done that, even if the user has the rights to run processes with real-time
 priority!
 
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 76dea3f2e391..f9a5be31e694 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3481,6 +3481,14 @@ W:	http://bcache.evilpiepirate.org
 C:	irc://irc.oftc.net/bcache
 F:	drivers/md/bcache/
 
+BCACHEFS
+M:	Kent Overstreet <kent.overstreet@linux.dev>
+R:	Brian Foster <bfoster@redhat.com>
+L:	linux-bcachefs@vger.kernel.org
+S:	Supported
+C:	irc://irc.oftc.net/bcache
+F:	fs/bcachefs/
+
 BDISP ST MEDIA DRIVER
 M:	Fabien Dessenne <fabien.dessenne@foss.st.com>
 L:	linux-media@vger.kernel.org
@@ -5076,6 +5084,14 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core
 F:	Documentation/devicetree/bindings/timer/
 F:	drivers/clocksource/
 
+CLOSURES
+M:	Kent Overstreet <kent.overstreet@linux.dev>
+L:	linux-bcachefs@vger.kernel.org
+S:	Supported
+C:	irc://irc.oftc.net/bcache
+F:	include/linux/closure.h
+F:	lib/closure.c
+
 CMPC ACPI DRIVER
 M:	Thadeu Lima de Souza Cascardo <cascardo@holoscopio.com>
 M:	Daniel Oliveira Nascimento <don@syst.com.br>
@@ -8646,6 +8662,8 @@ L:	linux-hardening@vger.kernel.org
 S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening
 F:	Documentation/kbuild/gcc-plugins.rst
+F:	include/linux/stackleak.h
+F:	kernel/stackleak.c
 F:	scripts/Makefile.gcc-plugins
 F:	scripts/gcc-plugins/
 
@@ -8761,6 +8779,13 @@ S:	Supported
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/linux-pm.git
 F:	drivers/pmdomain/
 
+GENERIC RADIX TREE
+M:	Kent Overstreet <kent.overstreet@linux.dev>
+S:	Supported
+C:	irc://irc.oftc.net/bcache
+F:	include/linux/generic-radix-tree.h
+F:	lib/generic-radix-tree.c
+
 GENERIC RESISTIVE TOUCHSCREEN ADC DRIVER
 M:	Eugen Hristev <eugen.hristev@microchip.com>
 L:	linux-input@vger.kernel.org
@@ -10713,7 +10738,7 @@ F:	drivers/mfd/intel-m10-bmc*
 F:	include/linux/mfd/intel-m10-bmc.h
 
 INTEL MAX10 BMC SECURE UPDATES
-M:	Russ Weight <russell.h.weight@intel.com>
+M:	Peter Colberg <peter.colberg@intel.com>
 L:	linux-fpga@vger.kernel.org
 S:	Maintained
 F:	Documentation/ABI/testing/sysfs-driver-intel-m10-bmc-sec-update
@@ -11403,16 +11428,20 @@ F:	usr/
 
 KERNEL HARDENING (not covered by other areas)
 M:	Kees Cook <keescook@chromium.org>
+R:	Gustavo A. R. Silva <gustavoars@kernel.org>
 L:	linux-hardening@vger.kernel.org
 S:	Supported
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening
 F:	Documentation/ABI/testing/sysfs-kernel-oops_count
 F:	Documentation/ABI/testing/sysfs-kernel-warn_count
+F:	arch/*/configs/hardening.config
 F:	include/linux/overflow.h
 F:	include/linux/randomize_kstack.h
+F:	kernel/configs/hardening.config
 F:	mm/usercopy.c
 K:	\b(add|choose)_random_kstack_offset\b
 K:	\b__check_(object_size|heap_object)\b
+K:	\b__counted_by\b
 
 KERNEL JANITORS
 L:	kernel-janitors@vger.kernel.org
@@ -13856,9 +13885,10 @@ F:	Documentation/devicetree/bindings/media/amlogic,gx-vdec.yaml
 F:	drivers/staging/media/meson/vdec/
 
 METHODE UDPU SUPPORT
-M:	Vladimir Vid <vladimir.vid@sartura.hr>
+M:	Robert Marko <robert.marko@sartura.hr>
 S:	Maintained
-F:	arch/arm64/boot/dts/marvell/armada-3720-uDPU.dts
+F:	arch/arm64/boot/dts/marvell/armada-3720-eDPU.dts
+F:	arch/arm64/boot/dts/marvell/armada-3720-uDPU.*
 
 MHI BUS
 M:	Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
@@ -18716,9 +18746,10 @@ R:	Andreas Hindborg <a.hindborg@samsung.com>
 R:	Alice Ryhl <aliceryhl@google.com>
 L:	rust-for-linux@vger.kernel.org
 S:	Supported
-W:	https://github.com/Rust-for-Linux/linux
+W:	https://rust-for-linux.com
 B:	https://github.com/Rust-for-Linux/linux/issues
 C:	zulip://rust-for-linux.zulipchat.com
+P:	https://rust-for-linux.com/contributing
 T:	git https://github.com/Rust-for-Linux/linux.git rust-next
 F:	Documentation/rust/
 F:	rust/
@@ -23731,6 +23762,13 @@ F:	Documentation/devicetree/bindings/media/xilinx/
 F:	drivers/media/platform/xilinx/
 F:	include/uapi/linux/xilinx-v4l2-controls.h
 
+XILINX VERSAL EDAC DRIVER
+M:	Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
+M:	Sai Krishna Potthuri <sai.krishna.potthuri@amd.com>
+S:	Maintained
+F:	Documentation/devicetree/bindings/memory-controllers/xlnx,versal-ddrmc-edac.yaml
+F:	drivers/edac/versal_edac.c
+
 XILINX WATCHDOG DRIVER
 M:	Srinivas Neeli <srinivas.neeli@amd.com>
 R:	Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
diff --git a/Makefile b/Makefile
index 5fc735c7fed1..5c418efbe89b 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 VERSION = 6
 PATCHLEVEL = 6
 SUBLEVEL = 0
-EXTRAVERSION = -rc7
+EXTRAVERSION =
 NAME = Hurr durr I'ma ninja sloth
 
 # *DOCUMENTATION*
diff --git a/arch/alpha/include/asm/local.h b/arch/alpha/include/asm/local.h
index 0fcaad642cc3..88eb398947a5 100644
--- a/arch/alpha/include/asm/local.h
+++ b/arch/alpha/include/asm/local.h
@@ -65,28 +65,27 @@ static __inline__ bool local_try_cmpxchg(local_t *l, long *old, long new)
 #define local_xchg(l, n) (xchg_local(&((l)->a.counter), (n)))
 
 /**
- * local_add_unless - add unless the number is a given value
+ * local_add_unless - add unless the number is already a given value
  * @l: pointer of type local_t
  * @a: the amount to add to l...
  * @u: ...unless l is equal to u.
  *
- * Atomically adds @a to @l, so long as it was not @u.
- * Returns non-zero if @l was not @u, and zero otherwise.
+ * Atomically adds @a to @l, if @l was not already @u.
+ * Returns true if the addition was done.
  */
-#define local_add_unless(l, a, u)				\
-({								\
-	long c, old;						\
-	c = local_read(l);					\
-	for (;;) {						\
-		if (unlikely(c == (u)))				\
-			break;					\
-		old = local_cmpxchg((l), c, c + (a));	\
-		if (likely(old == c))				\
-			break;					\
-		c = old;					\
-	}							\
-	c != (u);						\
-})
+static __inline__ bool
+local_add_unless(local_t *l, long a, long u)
+{
+	long c = local_read(l);
+
+	do {
+		if (unlikely(c == u))
+			return false;
+	} while (!local_try_cmpxchg(l, &c, c + a));
+
+	return true;
+}
+
 #define local_inc_not_zero(l) local_add_unless((l), 1, 0)
 
 #define local_add_negative(a, l) (local_add_return((a), (l)) < 0)
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index ad37569d0507..b68f1f56b836 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -492,3 +492,7 @@
 560	common	set_mempolicy_home_node		sys_ni_syscall
 561	common	cachestat			sys_cachestat
 562	common	fchmodat2			sys_fchmodat2
+# 563 reserved for map_shadow_stack
+564	common	futex_wake			sys_futex_wake
+565	common	futex_wait			sys_futex_wait
+566	common	futex_requeue			sys_futex_requeue
diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c
index d5b3ed2c58f5..c380d8c30704 100644
--- a/arch/arc/kernel/troubleshoot.c
+++ b/arch/arc/kernel/troubleshoot.c
@@ -90,10 +90,12 @@ static void show_faulting_vma(unsigned long address)
 	 */
 	if (vma) {
 		char buf[ARC_PATH_MAX];
-		char *nm = "?";
+		char *nm = "anon";
 
 		if (vma->vm_file) {
-			nm = file_path(vma->vm_file, buf, ARC_PATH_MAX-1);
+			/* XXX: can we use %pD below and get rid of buf? */
+			nm = d_path(file_user_path(vma->vm_file), buf,
+				    ARC_PATH_MAX-1);
 			if (IS_ERR(nm))
 				nm = "?";
 		}
diff --git a/arch/arm/boot/dts/rockchip/rk3128.dtsi b/arch/arm/boot/dts/rockchip/rk3128.dtsi
index b63bd4ad3143..88a4b0d6d928 100644
--- a/arch/arm/boot/dts/rockchip/rk3128.dtsi
+++ b/arch/arm/boot/dts/rockchip/rk3128.dtsi
@@ -64,7 +64,8 @@
 		compatible = "arm,armv7-timer";
 		interrupts = <GIC_PPI 13 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_HIGH)>,
 			     <GIC_PPI 14 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_HIGH)>,
-			     <GIC_PPI 11 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_HIGH)>;
+			     <GIC_PPI 11 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_HIGH)>,
+			     <GIC_PPI 10 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_HIGH)>;
 		arm,cpu-registers-not-fw-configured;
 		clock-frequency = <24000000>;
 	};
@@ -233,7 +234,7 @@
 		compatible = "rockchip,rk3128-timer", "rockchip,rk3288-timer";
 		reg = <0x20044000 0x20>;
 		interrupts = <GIC_SPI 28 IRQ_TYPE_LEVEL_HIGH>;
-		clocks = <&cru PCLK_TIMER>, <&xin24m>;
+		clocks = <&cru PCLK_TIMER>, <&cru SCLK_TIMER0>;
 		clock-names = "pclk", "timer";
 	};
 
@@ -241,7 +242,7 @@
 		compatible = "rockchip,rk3128-timer", "rockchip,rk3288-timer";
 		reg = <0x20044020 0x20>;
 		interrupts = <GIC_SPI 29 IRQ_TYPE_LEVEL_HIGH>;
-		clocks = <&cru PCLK_TIMER>, <&xin24m>;
+		clocks = <&cru PCLK_TIMER>, <&cru SCLK_TIMER1>;
 		clock-names = "pclk", "timer";
 	};
 
@@ -249,7 +250,7 @@
 		compatible = "rockchip,rk3128-timer", "rockchip,rk3288-timer";
 		reg = <0x20044040 0x20>;
 		interrupts = <GIC_SPI 59 IRQ_TYPE_LEVEL_HIGH>;
-		clocks = <&cru PCLK_TIMER>, <&xin24m>;
+		clocks = <&cru PCLK_TIMER>, <&cru SCLK_TIMER2>;
 		clock-names = "pclk", "timer";
 	};
 
@@ -257,7 +258,7 @@
 		compatible = "rockchip,rk3128-timer", "rockchip,rk3288-timer";
 		reg = <0x20044060 0x20>;
 		interrupts = <GIC_SPI 60 IRQ_TYPE_LEVEL_HIGH>;
-		clocks = <&cru PCLK_TIMER>, <&xin24m>;
+		clocks = <&cru PCLK_TIMER>, <&cru SCLK_TIMER3>;
 		clock-names = "pclk", "timer";
 	};
 
@@ -265,7 +266,7 @@
 		compatible = "rockchip,rk3128-timer", "rockchip,rk3288-timer";
 		reg = <0x20044080 0x20>;
 		interrupts = <GIC_SPI 61 IRQ_TYPE_LEVEL_HIGH>;
-		clocks = <&cru PCLK_TIMER>, <&xin24m>;
+		clocks = <&cru PCLK_TIMER>, <&cru SCLK_TIMER4>;
 		clock-names = "pclk", "timer";
 	};
 
@@ -273,7 +274,7 @@
 		compatible = "rockchip,rk3128-timer", "rockchip,rk3288-timer";
 		reg = <0x200440a0 0x20>;
 		interrupts = <GIC_SPI 62 IRQ_TYPE_LEVEL_HIGH>;
-		clocks = <&cru PCLK_TIMER>, <&xin24m>;
+		clocks = <&cru PCLK_TIMER>, <&cru SCLK_TIMER5>;
 		clock-names = "pclk", "timer";
 	};
 
@@ -426,7 +427,7 @@
 
 	i2c0: i2c@20072000 {
 		compatible = "rockchip,rk3128-i2c", "rockchip,rk3288-i2c";
-		reg = <20072000 0x1000>;
+		reg = <0x20072000 0x1000>;
 		interrupts = <GIC_SPI 24 IRQ_TYPE_LEVEL_HIGH>;
 		clock-names = "i2c";
 		clocks = <&cru PCLK_I2C0>;
@@ -458,6 +459,7 @@
 		interrupts = <GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>,
 			     <GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>;
 		arm,pl330-broken-no-flushp;
+		arm,pl330-periph-burst;
 		clocks = <&cru ACLK_DMAC>;
 		clock-names = "apb_pclk";
 		#dma-cells = <1>;
diff --git a/arch/arm/boot/dts/ti/omap/omap4-l4-abe.dtsi b/arch/arm/boot/dts/ti/omap/omap4-l4-abe.dtsi
index 7ae8b620515c..59f546a278f8 100644
--- a/arch/arm/boot/dts/ti/omap/omap4-l4-abe.dtsi
+++ b/arch/arm/boot/dts/ti/omap/omap4-l4-abe.dtsi
@@ -109,6 +109,8 @@
 				reg = <0x0 0xff>, /* MPU private access */
 				      <0x49022000 0xff>; /* L3 Interconnect */
 				reg-names = "mpu", "dma";
+				clocks = <&abe_clkctrl OMAP4_MCBSP1_CLKCTRL 24>;
+				clock-names = "fck";
 				interrupts = <GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>;
 				interrupt-names = "common";
 				ti,buffer-size = <128>;
@@ -142,6 +144,8 @@
 				reg = <0x0 0xff>, /* MPU private access */
 				      <0x49024000 0xff>; /* L3 Interconnect */
 				reg-names = "mpu", "dma";
+				clocks = <&abe_clkctrl OMAP4_MCBSP2_CLKCTRL 24>;
+				clock-names = "fck";
 				interrupts = <GIC_SPI 22 IRQ_TYPE_LEVEL_HIGH>;
 				interrupt-names = "common";
 				ti,buffer-size = <128>;
@@ -175,6 +179,8 @@
 				reg = <0x0 0xff>, /* MPU private access */
 				      <0x49026000 0xff>; /* L3 Interconnect */
 				reg-names = "mpu", "dma";
+				clocks = <&abe_clkctrl OMAP4_MCBSP3_CLKCTRL 24>;
+				clock-names = "fck";
 				interrupts = <GIC_SPI 23 IRQ_TYPE_LEVEL_HIGH>;
 				interrupt-names = "common";
 				ti,buffer-size = <128>;
diff --git a/arch/arm/boot/dts/ti/omap/omap4-l4.dtsi b/arch/arm/boot/dts/ti/omap/omap4-l4.dtsi
index 46b8f9efd413..3fcef3080eae 100644
--- a/arch/arm/boot/dts/ti/omap/omap4-l4.dtsi
+++ b/arch/arm/boot/dts/ti/omap/omap4-l4.dtsi
@@ -2043,6 +2043,8 @@
 				compatible = "ti,omap4-mcbsp";
 				reg = <0x0 0xff>; /* L4 Interconnect */
 				reg-names = "mpu";
+				clocks = <&l4_per_clkctrl OMAP4_MCBSP4_CLKCTRL 24>;
+				clock-names = "fck";
 				interrupts = <GIC_SPI 16 IRQ_TYPE_LEVEL_HIGH>;
 				interrupt-names = "common";
 				ti,buffer-size = <128>;
diff --git a/arch/arm/boot/dts/ti/omap/omap5-l4-abe.dtsi b/arch/arm/boot/dts/ti/omap/omap5-l4-abe.dtsi
index a03bca5a3584..97b0c3b5f573 100644
--- a/arch/arm/boot/dts/ti/omap/omap5-l4-abe.dtsi
+++ b/arch/arm/boot/dts/ti/omap/omap5-l4-abe.dtsi
@@ -109,6 +109,8 @@
 				reg = <0x0 0xff>, /* MPU private access */
 				      <0x49022000 0xff>; /* L3 Interconnect */
 				reg-names = "mpu", "dma";
+				clocks = <&abe_clkctrl OMAP5_MCBSP1_CLKCTRL 24>;
+				clock-names = "fck";
 				interrupts = <GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>;
 				interrupt-names = "common";
 				ti,buffer-size = <128>;
@@ -142,6 +144,8 @@
 				reg = <0x0 0xff>, /* MPU private access */
 				      <0x49024000 0xff>; /* L3 Interconnect */
 				reg-names = "mpu", "dma";
+				clocks = <&abe_clkctrl OMAP5_MCBSP2_CLKCTRL 24>;
+				clock-names = "fck";
 				interrupts = <GIC_SPI 22 IRQ_TYPE_LEVEL_HIGH>;
 				interrupt-names = "common";
 				ti,buffer-size = <128>;
@@ -175,6 +179,8 @@
 				reg = <0x0 0xff>, /* MPU private access */
 				      <0x49026000 0xff>; /* L3 Interconnect */
 				reg-names = "mpu", "dma";
+				clocks = <&abe_clkctrl OMAP5_MCBSP3_CLKCTRL 24>;
+				clock-names = "fck";
 				interrupts = <GIC_SPI 23 IRQ_TYPE_LEVEL_HIGH>;
 				interrupt-names = "common";
 				ti,buffer-size = <128>;
diff --git a/arch/arm/configs/hardening.config b/arch/arm/configs/hardening.config
new file mode 100644
index 000000000000..327349ce6377
--- /dev/null
+++ b/arch/arm/configs/hardening.config
@@ -0,0 +1,7 @@
+# Basic kernel hardening options (specific to arm)
+
+# Make sure PXN/PAN emulation is enabled.
+CONFIG_CPU_SW_DOMAIN_PAN=y
+
+# Dangerous; old interfaces and needless additional attack surface.
+# CONFIG_OABI_COMPAT is not set
diff --git a/arch/arm/mach-omap1/board-ams-delta.c b/arch/arm/mach-omap1/board-ams-delta.c
index 9808cd27e2cf..67de96c7717d 100644
--- a/arch/arm/mach-omap1/board-ams-delta.c
+++ b/arch/arm/mach-omap1/board-ams-delta.c
@@ -550,6 +550,7 @@ static struct platform_device *ams_delta_devices[] __initdata = {
 	&ams_delta_nand_device,
 	&ams_delta_lcd_device,
 	&cx20442_codec_device,
+	&modem_nreset_device,
 };
 
 static struct gpiod_lookup_table *ams_delta_gpio_tables[] __initdata = {
@@ -782,26 +783,28 @@ static struct plat_serial8250_port ams_delta_modem_ports[] = {
 	{ },
 };
 
+static int ams_delta_modem_pm_activate(struct device *dev)
+{
+	modem_priv.regulator = regulator_get(dev, "RESET#");
+	if (IS_ERR(modem_priv.regulator))
+		return -EPROBE_DEFER;
+
+	return 0;
+}
+
+static struct dev_pm_domain ams_delta_modem_pm_domain = {
+	.activate	= ams_delta_modem_pm_activate,
+};
+
 static struct platform_device ams_delta_modem_device = {
 	.name	= "serial8250",
 	.id	= PLAT8250_DEV_PLATFORM1,
 	.dev		= {
 		.platform_data = ams_delta_modem_ports,
+		.pm_domain = &ams_delta_modem_pm_domain,
 	},
 };
 
-static int __init modem_nreset_init(void)
-{
-	int err;
-
-	err = platform_device_register(&modem_nreset_device);
-	if (err)
-		pr_err("Couldn't register the modem regulator device\n");
-
-	return err;
-}
-
-
 /*
  * This function expects MODEM IRQ number already assigned to the port.
  * The MODEM device requires its RESET# pin kept high during probe.
@@ -833,37 +836,6 @@ static int __init ams_delta_modem_init(void)
 }
 arch_initcall_sync(ams_delta_modem_init);
 
-static int __init late_init(void)
-{
-	int err;
-
-	err = modem_nreset_init();
-	if (err)
-		return err;
-
-	/*
-	 * Once the modem device is registered, the modem_nreset
-	 * regulator can be requested on behalf of that device.
-	 */
-	modem_priv.regulator = regulator_get(&ams_delta_modem_device.dev,
-			"RESET#");
-	if (IS_ERR(modem_priv.regulator)) {
-		err = PTR_ERR(modem_priv.regulator);
-		goto unregister;
-	}
-	return 0;
-
-unregister:
-	platform_device_unregister(&ams_delta_modem_device);
-	return err;
-}
-
-static void __init ams_delta_init_late(void)
-{
-	omap1_init_late();
-	late_init();
-}
-
 static void __init ams_delta_map_io(void)
 {
 	omap1_map_io();
@@ -877,7 +849,7 @@ MACHINE_START(AMS_DELTA, "Amstrad E3 (Delta)")
 	.init_early	= omap1_init_early,
 	.init_irq	= omap1_init_irq,
 	.init_machine	= ams_delta_init,
-	.init_late	= ams_delta_init_late,
+	.init_late	= omap1_init_late,
 	.init_time	= omap1_timer_init,
 	.restart	= omap1_restart,
 MACHINE_END
diff --git a/arch/arm/mach-omap1/timer32k.c b/arch/arm/mach-omap1/timer32k.c
index 410d17d1d443..f618a6df2938 100644
--- a/arch/arm/mach-omap1/timer32k.c
+++ b/arch/arm/mach-omap1/timer32k.c
@@ -176,17 +176,18 @@ static u64 notrace omap_32k_read_sched_clock(void)
 	return sync32k_cnt_reg ? readl_relaxed(sync32k_cnt_reg) : 0;
 }
 
+static struct timespec64 persistent_ts;
+static cycles_t cycles;
+static unsigned int persistent_mult, persistent_shift;
+
 /**
  * omap_read_persistent_clock64 -  Return time from a persistent clock.
+ * @ts: &struct timespec64 for the returned time
  *
  * Reads the time from a source which isn't disabled during PM, the
  * 32k sync timer.  Convert the cycles elapsed since last read into
  * nsecs and adds to a monotonically increasing timespec64.
  */
-static struct timespec64 persistent_ts;
-static cycles_t cycles;
-static unsigned int persistent_mult, persistent_shift;
-
 static void omap_read_persistent_clock64(struct timespec64 *ts)
 {
 	unsigned long long nsecs;
@@ -206,10 +207,9 @@ static void omap_read_persistent_clock64(struct timespec64 *ts)
 /**
  * omap_init_clocksource_32k - setup and register counter 32k as a
  * kernel clocksource
- * @pbase: base addr of counter_32k module
- * @size: size of counter_32k to map
+ * @vbase: base addr of counter_32k module
  *
- * Returns 0 upon success or negative error code upon failure.
+ * Returns: %0 upon success or negative error code upon failure.
  *
  */
 static int __init omap_init_clocksource_32k(void __iomem *vbase)
diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c
index 1e17b5f77588..ba71928c0fcb 100644
--- a/arch/arm/mach-omap2/omap_hwmod.c
+++ b/arch/arm/mach-omap2/omap_hwmod.c
@@ -2209,7 +2209,7 @@ int omap_hwmod_parse_module_range(struct omap_hwmod *oh,
 		return err;
 
 	pr_debug("omap_hwmod: %s %pOFn at %pR\n",
-		 oh->name, np, &res);
+		 oh->name, np, res);
 
 	if (oh && oh->mpu_rt_idx) {
 		omap_hwmod_fix_mpu_rt_idx(oh, np, res);
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index c572d6c3dee0..93d0d46cbb15 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -466,3 +466,6 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/arm64/boot/dts/qcom/apq8096-db820c.dts b/arch/arm64/boot/dts/qcom/apq8096-db820c.dts
index 385b178314db..3067a4091a7a 100644
--- a/arch/arm64/boot/dts/qcom/apq8096-db820c.dts
+++ b/arch/arm64/boot/dts/qcom/apq8096-db820c.dts
@@ -62,25 +62,23 @@
 		stdout-path = "serial0:115200n8";
 	};
 
-	clocks {
-		divclk4: divclk4 {
-			compatible = "fixed-clock";
-			#clock-cells = <0>;
-			clock-frequency = <32768>;
-			clock-output-names = "divclk4";
+	div1_mclk: divclk1 {
+		compatible = "gpio-gate-clock";
+		pinctrl-0 = <&audio_mclk>;
+		pinctrl-names = "default";
+		clocks = <&rpmcc RPM_SMD_DIV_CLK1>;
+		#clock-cells = <0>;
+		enable-gpios = <&pm8994_gpios 15 0>;
+	};
 
-			pinctrl-names = "default";
-			pinctrl-0 = <&divclk4_pin_a>;
-		};
+	divclk4: divclk4 {
+		compatible = "fixed-clock";
+		#clock-cells = <0>;
+		clock-frequency = <32768>;
+		clock-output-names = "divclk4";
 
-		div1_mclk: divclk1 {
-			compatible = "gpio-gate-clock";
-			pinctrl-0 = <&audio_mclk>;
-			pinctrl-names = "default";
-			clocks = <&rpmcc RPM_SMD_DIV_CLK1>;
-			#clock-cells = <0>;
-			enable-gpios = <&pm8994_gpios 15 0>;
-		};
+		pinctrl-names = "default";
+		pinctrl-0 = <&divclk4_pin_a>;
 	};
 
 	gpio-keys {
diff --git a/arch/arm64/boot/dts/qcom/msm8996-xiaomi-common.dtsi b/arch/arm64/boot/dts/qcom/msm8996-xiaomi-common.dtsi
index bcd2397eb373..06f8ff624181 100644
--- a/arch/arm64/boot/dts/qcom/msm8996-xiaomi-common.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8996-xiaomi-common.dtsi
@@ -11,26 +11,24 @@
 #include <dt-bindings/pinctrl/qcom,pmic-gpio.h>
 
 / {
-	clocks {
-		divclk1_cdc: divclk1 {
-			compatible = "gpio-gate-clock";
-			clocks = <&rpmcc RPM_SMD_DIV_CLK1>;
-			#clock-cells = <0>;
-			enable-gpios = <&pm8994_gpios 15 GPIO_ACTIVE_HIGH>;
+	divclk1_cdc: divclk1 {
+		compatible = "gpio-gate-clock";
+		clocks = <&rpmcc RPM_SMD_DIV_CLK1>;
+		#clock-cells = <0>;
+		enable-gpios = <&pm8994_gpios 15 GPIO_ACTIVE_HIGH>;
 
-			pinctrl-names = "default";
-			pinctrl-0 = <&divclk1_default>;
-		};
+		pinctrl-names = "default";
+		pinctrl-0 = <&divclk1_default>;
+	};
 
-		divclk4: divclk4 {
-			compatible = "fixed-clock";
-			#clock-cells = <0>;
-			clock-frequency = <32768>;
-			clock-output-names = "divclk4";
+	divclk4: divclk4 {
+		compatible = "fixed-clock";
+		#clock-cells = <0>;
+		clock-frequency = <32768>;
+		clock-output-names = "divclk4";
 
-			pinctrl-names = "default";
-			pinctrl-0 = <&divclk4_pin_a>;
-		};
+		pinctrl-names = "default";
+		pinctrl-0 = <&divclk4_pin_a>;
 	};
 
 	gpio-keys {
diff --git a/arch/arm64/boot/dts/qcom/msm8996-xiaomi-gemini.dts b/arch/arm64/boot/dts/qcom/msm8996-xiaomi-gemini.dts
index d1066edaea47..f8e9d90afab0 100644
--- a/arch/arm64/boot/dts/qcom/msm8996-xiaomi-gemini.dts
+++ b/arch/arm64/boot/dts/qcom/msm8996-xiaomi-gemini.dts
@@ -20,16 +20,14 @@
 	qcom,pmic-id = <0x20009 0x2000a 0x00 0x00>;
 	qcom,board-id = <31 0>;
 
-	clocks {
-		divclk2_haptics: divclk2 {
-			compatible = "fixed-clock";
-			#clock-cells = <0>;
-			clock-frequency = <32768>;
-			clock-output-names = "divclk2";
-
-			pinctrl-names = "default";
-			pinctrl-0 = <&divclk2_pin_a>;
-		};
+	divclk2_haptics: divclk2 {
+		compatible = "fixed-clock";
+		#clock-cells = <0>;
+		clock-frequency = <32768>;
+		clock-output-names = "divclk2";
+
+		pinctrl-names = "default";
+		pinctrl-0 = <&divclk2_pin_a>;
 	};
 };
 
diff --git a/arch/arm64/boot/dts/qcom/sa8775p-pmics.dtsi b/arch/arm64/boot/dts/qcom/sa8775p-pmics.dtsi
index 3c3b6287cd27..eaa43f022a65 100644
--- a/arch/arm64/boot/dts/qcom/sa8775p-pmics.dtsi
+++ b/arch/arm64/boot/dts/qcom/sa8775p-pmics.dtsi
@@ -173,7 +173,7 @@
 			compatible = "qcom,pmm8654au-gpio", "qcom,spmi-gpio";
 			reg = <0x8800>;
 			gpio-controller;
-			gpio-ranges = <&pmm8654au_2_gpios 0 0 12>;
+			gpio-ranges = <&pmm8654au_1_gpios 0 0 12>;
 			#gpio-cells = <2>;
 			interrupt-controller;
 			#interrupt-cells = <2>;
diff --git a/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts b/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts
index 08a3ad3e7ae9..de0a1f2af983 100644
--- a/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts
+++ b/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts
@@ -68,15 +68,17 @@
 		simple-audio-card,format = "i2s";
 		simple-audio-card,name = "Haikou,I2S-codec";
 		simple-audio-card,mclk-fs = <512>;
+		simple-audio-card,frame-master = <&sgtl5000_codec>;
+		simple-audio-card,bitclock-master = <&sgtl5000_codec>;
 
-		simple-audio-card,codec {
-			clocks = <&sgtl5000_clk>;
+		sgtl5000_codec: simple-audio-card,codec {
 			sound-dai = <&sgtl5000>;
+			// Prevent the dai subsystem from overwriting the clock
+			// frequency. We are using a fixed-frequency oscillator.
+			system-clock-fixed;
 		};
 
 		simple-audio-card,cpu {
-			bitclock-master;
-			frame-master;
 			sound-dai = <&i2s0_8ch>;
 		};
 	};
diff --git a/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dtsi
index 7dccbe8a9393..f2279aa6ca9e 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3399-rock-pi-4.dtsi
@@ -492,6 +492,7 @@
 
 &i2s0 {
 	pinctrl-0 = <&i2s0_2ch_bus>;
+	pinctrl-1 = <&i2s0_2ch_bus_bclk_off>;
 	rockchip,capture-channels = <2>;
 	rockchip,playback-channels = <2>;
 	status = "okay";
diff --git a/arch/arm64/boot/dts/rockchip/rk3399.dtsi b/arch/arm64/boot/dts/rockchip/rk3399.dtsi
index 9da0b6d77c8d..5bc2d4faeea6 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3399.dtsi
@@ -2457,6 +2457,16 @@
 					<4 RK_PA0 1 &pcfg_pull_none>;
 			};
 
+			i2s0_2ch_bus_bclk_off: i2s0-2ch-bus-bclk-off {
+				rockchip,pins =
+					<3 RK_PD0 RK_FUNC_GPIO &pcfg_pull_none>,
+					<3 RK_PD1 1 &pcfg_pull_none>,
+					<3 RK_PD2 1 &pcfg_pull_none>,
+					<3 RK_PD3 1 &pcfg_pull_none>,
+					<3 RK_PD7 1 &pcfg_pull_none>,
+					<4 RK_PA0 1 &pcfg_pull_none>;
+			};
+
 			i2s0_8ch_bus: i2s0-8ch-bus {
 				rockchip,pins =
 					<3 RK_PD0 1 &pcfg_pull_none>,
diff --git a/arch/arm64/configs/hardening.config b/arch/arm64/configs/hardening.config
new file mode 100644
index 000000000000..b0e795208998
--- /dev/null
+++ b/arch/arm64/configs/hardening.config
@@ -0,0 +1,22 @@
+# Basic kernel hardening options (specific to arm64)
+
+# Make sure PAN emulation is enabled.
+CONFIG_ARM64_SW_TTBR0_PAN=y
+
+# Software Shadow Stack or PAC
+CONFIG_SHADOW_CALL_STACK=y
+
+# Pointer authentication (ARMv8.3 and later). If hardware actually supports
+# it, one can turn off CONFIG_STACKPROTECTOR_STRONG with this enabled.
+CONFIG_ARM64_PTR_AUTH=y
+CONFIG_ARM64_PTR_AUTH_KERNEL=y
+
+# Available in ARMv8.5 and later.
+CONFIG_ARM64_BTI=y
+CONFIG_ARM64_BTI_KERNEL=y
+CONFIG_ARM64_MTE=y
+CONFIG_KASAN_HW_TAGS=y
+CONFIG_ARM64_E0PD=y
+
+# Available in ARMv8.7 and later.
+CONFIG_ARM64_EPAN=y
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index bd77253b62e0..531effca5f1f 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -39,7 +39,7 @@
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls		453
+#define __NR_compat_syscalls		457
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 78b68311ec81..c453291154fd 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -911,6 +911,12 @@ __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
 __SYSCALL(__NR_cachestat, sys_cachestat)
 #define __NR_fchmodat2 452
 __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
+#define __NR_futex_wake 454
+__SYSCALL(__NR_futex_wake, sys_futex_wake)
+#define __NR_futex_wait 455
+__SYSCALL(__NR_futex_wait, sys_futex_wait)
+#define __NR_futex_requeue 456
+__SYSCALL(__NR_futex_requeue, sys_futex_requeue)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 960b98b43506..196533c362e1 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -215,7 +215,7 @@ asmlinkage notrace void secondary_start_kernel(void)
 	if (system_uses_irq_prio_masking())
 		init_gic_priority_masking();
 
-	rcu_cpu_starting(cpu);
+	rcutree_report_cpu_starting(cpu);
 	trace_hardirqs_off();
 
 	/*
@@ -401,7 +401,7 @@ void __noreturn cpu_die_early(void)
 
 	/* Mark this CPU absent */
 	set_cpu_present(cpu, 0);
-	rcu_report_dead(cpu);
+	rcutree_report_cpu_dead();
 
 	if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
 		update_cpu_boot_status(CPU_KILL_ME);
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index 83d8609aec03..81375ea78288 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -373,3 +373,6 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/loongarch/include/asm/local.h b/arch/loongarch/include/asm/local.h
index c49675852bdc..f53ea653af76 100644
--- a/arch/loongarch/include/asm/local.h
+++ b/arch/loongarch/include/asm/local.h
@@ -70,22 +70,27 @@ static inline bool local_try_cmpxchg(local_t *l, long *old, long new)
 #define local_xchg(l, n) (atomic_long_xchg((&(l)->a), (n)))
 
 /**
- * local_add_unless - add unless the number is a given value
+ * local_add_unless - add unless the number is already a given value
  * @l: pointer of type local_t
  * @a: the amount to add to l...
  * @u: ...unless l is equal to u.
  *
- * Atomically adds @a to @l, so long as it was not @u.
- * Returns non-zero if @l was not @u, and zero otherwise.
+ * Atomically adds @a to @l, if @l was not already @u.
+ * Returns true if the addition was done.
  */
-#define local_add_unless(l, a, u)				\
-({								\
-	long c, old;						\
-	c = local_read(l);					\
-	while (c != (u) && (old = local_cmpxchg((l), c, c + (a))) != c) \
-		c = old;					\
-	c != (u);						\
-})
+static inline bool
+local_add_unless(local_t *l, long a, long u)
+{
+	long c = local_read(l);
+
+	do {
+		if (unlikely(c == u))
+			return false;
+	} while (!local_try_cmpxchg(l, &c, c + a));
+
+	return true;
+}
+
 #define local_inc_not_zero(l) local_add_unless((l), 1, 0)
 
 #define local_dec_return(l) local_sub_return(1, (l))
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 259ceb125367..f7f997a88bab 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -452,3 +452,6 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index a3798c2637fd..2967ec26b978 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -458,3 +458,6 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/mips/include/asm/local.h b/arch/mips/include/asm/local.h
index e6ae3df0349d..86fc24022242 100644
--- a/arch/mips/include/asm/local.h
+++ b/arch/mips/include/asm/local.h
@@ -108,22 +108,27 @@ static __inline__ bool local_try_cmpxchg(local_t *l, long *old, long new)
 #define local_xchg(l, n) (atomic_long_xchg((&(l)->a), (n)))
 
 /**
- * local_add_unless - add unless the number is a given value
+ * local_add_unless - add unless the number is already a given value
  * @l: pointer of type local_t
  * @a: the amount to add to l...
  * @u: ...unless l is equal to u.
  *
- * Atomically adds @a to @l, so long as it was not @u.
- * Returns non-zero if @l was not @u, and zero otherwise.
+ * Atomically adds @a to @l, if @l was not already @u.
+ * Returns true if the addition was done.
  */
-#define local_add_unless(l, a, u)				\
-({								\
-	long c, old;						\
-	c = local_read(l);					\
-	while (c != (u) && (old = local_cmpxchg((l), c, c + (a))) != c) \
-		c = old;					\
-	c != (u);						\
-})
+static __inline__ bool
+local_add_unless(local_t *l, long a, long u)
+{
+	long c = local_read(l);
+
+	do {
+		if (unlikely(c == u))
+			return false;
+	} while (!local_try_cmpxchg(l, &c, c + a));
+
+	return true;
+}
+
 #define local_inc_not_zero(l) local_add_unless((l), 1, 0)
 
 #define local_dec_return(l) local_sub_return(1, (l))
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 152034b8e0a0..383abb1713f4 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -391,3 +391,6 @@
 450	n32	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	n32	cachestat			sys_cachestat
 452	n32	fchmodat2			sys_fchmodat2
+454	n32	futex_wake			sys_futex_wake
+455	n32	futex_wait			sys_futex_wait
+456	n32	futex_requeue			sys_futex_requeue
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index cb5e757f6621..c9bd09ba905f 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -367,3 +367,6 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	n64	cachestat			sys_cachestat
 452	n64	fchmodat2			sys_fchmodat2
+454	n64	futex_wake			sys_futex_wake
+455	n64	futex_wait			sys_futex_wait
+456	n64	futex_requeue			sys_futex_requeue
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 1a646813afdc..ba5ef6cea97a 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -440,3 +440,6 @@
 450	o32	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	o32	cachestat			sys_cachestat
 452	o32	fchmodat2			sys_fchmodat2
+454	o32	futex_wake			sys_futex_wake
+455	o32	futex_wait			sys_futex_wait
+456	o32	futex_requeue			sys_futex_requeue
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index e97c175b56f9..9f0f6df55361 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -451,3 +451,6 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/powerpc/configs/hardening.config b/arch/powerpc/configs/hardening.config
new file mode 100644
index 000000000000..4e9bba327e8f
--- /dev/null
+++ b/arch/powerpc/configs/hardening.config
@@ -0,0 +1,10 @@
+# PowerPC specific hardening options
+
+# Block kernel from unexpectedly reading userspace memory.
+CONFIG_PPC_KUAP=y
+
+# Attack surface reduction.
+# CONFIG_SCOM_DEBUGFS is not set
+
+# Disable internal kernel debugger.
+# CONFIG_XMON is not set
diff --git a/arch/powerpc/include/asm/local.h b/arch/powerpc/include/asm/local.h
index 45492fb5bf22..ec6ced6d7ced 100644
--- a/arch/powerpc/include/asm/local.h
+++ b/arch/powerpc/include/asm/local.h
@@ -115,23 +115,23 @@ static __inline__ long local_xchg(local_t *l, long n)
 }
 
 /**
- * local_add_unless - add unless the number is a given value
+ * local_add_unless - add unless the number is already a given value
  * @l: pointer of type local_t
  * @a: the amount to add to v...
  * @u: ...unless v is equal to u.
  *
- * Atomically adds @a to @l, so long as it was not @u.
- * Returns non-zero if @l was not @u, and zero otherwise.
+ * Atomically adds @a to @l, if @l was not already @u.
+ * Returns true if the addition was done.
  */
-static __inline__ int local_add_unless(local_t *l, long a, long u)
+static __inline__ bool local_add_unless(local_t *l, long a, long u)
 {
 	unsigned long flags;
-	int ret = 0;
+	bool ret = false;
 
 	powerpc_local_irq_pmu_save(flags);
 	if (l->v != u) {
 		l->v += a;
-		ret = 1;
+		ret = true;
 	}
 	powerpc_local_irq_pmu_restore(flags);
 
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 2f1026fba00d..20f72cd1d813 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -948,6 +948,8 @@ void __init setup_arch(char **cmdline_p)
 
 	/* Parse memory topology */
 	mem_topology_setup();
+	/* Set max_mapnr before paging_init() */
+	set_max_mapnr(max_pfn);
 
 	/*
 	 * Release secondary cpus out of their spinloops at 0x60 now that
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 5826f5108a12..ab691c89d787 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1051,7 +1051,7 @@ static struct sched_domain_topology_level powerpc_topology[] = {
 #endif
 	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
 	{ cpu_mc_mask, SD_INIT_NAME(MC) },
-	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
 	{ NULL, },
 };
 
@@ -1595,7 +1595,7 @@ static void add_cpu_to_masks(int cpu)
 	/* Skip all CPUs already part of current CPU core mask */
 	cpumask_andnot(mask, cpu_online_mask, cpu_core_mask(cpu));
 
-	/* If chip_id is -1; limit the cpu_core_mask to within DIE*/
+	/* If chip_id is -1; limit the cpu_core_mask to within PKG */
 	if (chip_id == -1)
 		cpumask_and(mask, mask, cpu_cpu_mask(cpu));
 
@@ -1629,7 +1629,7 @@ void start_secondary(void *unused)
 
 	smp_store_cpu_info(cpu);
 	set_dec(tb_ticks_per_jiffy);
-	rcu_cpu_starting(cpu);
+	rcutree_report_cpu_starting(cpu);
 	cpu_callin_map[cpu] = 1;
 
 	if (smp_ops->setup_cpu)
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 20e50586e8a2..26fc41904266 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -539,3 +539,6 @@
 450 	nospu	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 8b121df7b08f..07e8f4f1e07f 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -288,7 +288,6 @@ void __init mem_init(void)
 #endif
 
 	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
-	set_max_mapnr(max_pfn);
 
 	kasan_late_init();
 
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 3ba9fe411604..4d69bfb9bc11 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -104,6 +104,8 @@ static pte_t set_pte_filter_hash(pte_t pte) { return pte; }
 /* Embedded type MMU with HW exec support. This is a bit more complicated
  * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so
  * instead we "filter out" the exec permission for non clean pages.
+ *
+ * This is also called once for the folio. So only work with folio->flags here.
  */
 static inline pte_t set_pte_filter(pte_t pte)
 {
@@ -190,29 +192,39 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
 void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
 		pte_t pte, unsigned int nr)
 {
-	/*
-	 * Make sure hardware valid bit is not set. We don't do
-	 * tlb flush for this update.
-	 */
-	VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
 
 	/* Note: mm->context.id might not yet have been assigned as
 	 * this context might not have been activated yet when this
-	 * is called.
+	 * is called. Filter the pte value and use the filtered value
+	 * to setup all the ptes in the range.
 	 */
 	pte = set_pte_filter(pte);
 
-	/* Perform the setting of the PTE */
-	arch_enter_lazy_mmu_mode();
+	/*
+	 * We don't need to call arch_enter/leave_lazy_mmu_mode()
+	 * because we expect set_ptes to only be used on not present
+	 * and not hw_valid ptes. Hence there is no translation cache flush
+	 * involved that needs to be batched.
+	 */
 	for (;;) {
+
+		/*
+		 * Make sure hardware valid bit is not set. We don't do
+		 * tlb flush for this update.
+		 */
+		VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
+
+		/* Perform the setting of the PTE */
 		__set_pte_at(mm, addr, ptep, pte, 0);
 		if (--nr == 0)
 			break;
 		ptep++;
-		pte = __pte(pte_val(pte) + (1UL << PTE_RPN_SHIFT));
 		addr += PAGE_SIZE;
+		/*
+		 * increment the pfn.
+		 */
+		pte = pfn_pte(pte_pfn(pte) + 1, pte_pgprot((pte)));
 	}
-	arch_leave_lazy_mmu_mode();
 }
 
 void unmap_kernel_page(unsigned long va)
diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c
index 1a587618015c..18daafbe2e65 100644
--- a/arch/powerpc/platforms/cell/spufs/coredump.c
+++ b/arch/powerpc/platforms/cell/spufs/coredump.c
@@ -66,7 +66,7 @@ static int match_context(const void *v, struct file *file, unsigned fd)
  */
 static struct spu_context *coredump_next_context(int *fd)
 {
-	struct spu_context *ctx;
+	struct spu_context *ctx = NULL;
 	struct file *file;
 	int n = iterate_fd(current->files, *fd, match_context, NULL);
 	if (!n)
@@ -74,10 +74,13 @@ static struct spu_context *coredump_next_context(int *fd)
 	*fd = n - 1;
 
 	rcu_read_lock();
-	file = lookup_fd_rcu(*fd);
-	ctx = SPUFS_I(file_inode(file))->i_ctx;
-	get_spu_context(ctx);
+	file = lookup_fdget_rcu(*fd);
 	rcu_read_unlock();
+	if (file) {
+		ctx = SPUFS_I(file_inode(file))->i_ctx;
+		get_spu_context(ctx);
+		fput(file);
+	}
 
 	return ctx;
 }
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 38c5be34c895..10c1320adfd0 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -86,7 +86,7 @@ spufs_new_inode(struct super_block *sb, umode_t mode)
 	inode->i_mode = mode;
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 out:
 	return inode;
 }
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index d607ab0f7c6d..9c48fecc6719 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -273,11 +273,9 @@ config RISCV_DMA_NONCOHERENT
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
 	select DMA_BOUNCE_UNALIGNED_KMALLOC if SWIOTLB
-	select DMA_DIRECT_REMAP if MMU
 
 config RISCV_NONSTANDARD_CACHE_OPS
 	bool
-	depends on RISCV_DMA_NONCOHERENT
 	help
 	  This enables function pointer support for non-standard noncoherent
 	  systems to handle cache management.
@@ -550,6 +548,7 @@ config RISCV_ISA_ZICBOM
 	depends on RISCV_ALTERNATIVE
 	default y
 	select RISCV_DMA_NONCOHERENT
+	select DMA_DIRECT_REMAP
 	help
 	   Adds support to dynamically detect the presence of the ZICBOM
 	   extension (Cache Block Management Operations) and enable its
diff --git a/arch/riscv/Kconfig.errata b/arch/riscv/Kconfig.errata
index 566bcefeab50..e2c731cfed8c 100644
--- a/arch/riscv/Kconfig.errata
+++ b/arch/riscv/Kconfig.errata
@@ -77,6 +77,7 @@ config ERRATA_THEAD_PBMT
 config ERRATA_THEAD_CMO
 	bool "Apply T-Head cache management errata"
 	depends on ERRATA_THEAD && MMU
+	select DMA_DIRECT_REMAP
 	select RISCV_DMA_NONCOHERENT
 	default y
 	help
diff --git a/arch/riscv/boot/dts/starfive/jh7110-starfive-visionfive-2.dtsi b/arch/riscv/boot/dts/starfive/jh7110-starfive-visionfive-2.dtsi
index 12ebe9792356..2c02358abd71 100644
--- a/arch/riscv/boot/dts/starfive/jh7110-starfive-visionfive-2.dtsi
+++ b/arch/riscv/boot/dts/starfive/jh7110-starfive-visionfive-2.dtsi
@@ -431,7 +431,7 @@
 		};
 
 		ss-pins {
-			pinmux = <GPIOMUX(48, GPOUT_SYS_SPI0_FSS,
+			pinmux = <GPIOMUX(49, GPOUT_SYS_SPI0_FSS,
 					      GPOEN_ENABLE,
 					      GPI_SYS_SPI0_FSS)>;
 			bias-disable;
diff --git a/arch/riscv/boot/dts/thead/th1520.dtsi b/arch/riscv/boot/dts/thead/th1520.dtsi
index ce708183b6f6..ff364709a6df 100644
--- a/arch/riscv/boot/dts/thead/th1520.dtsi
+++ b/arch/riscv/boot/dts/thead/th1520.dtsi
@@ -139,6 +139,7 @@
 		interrupt-parent = <&plic>;
 		#address-cells = <2>;
 		#size-cells = <2>;
+		dma-noncoherent;
 		ranges;
 
 		plic: interrupt-controller@ffd8000000 {
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index ada83149932f..858beaf4a8cb 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -53,7 +53,7 @@ static void hypfs_update_update(struct super_block *sb)
 	struct inode *inode = d_inode(sb_info->update_file);
 
 	sb_info->last_update = ktime_get_seconds();
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 }
 
 /* directory tree removal functions */
@@ -101,7 +101,7 @@ static struct inode *hypfs_make_inode(struct super_block *sb, umode_t mode)
 		ret->i_mode = mode;
 		ret->i_uid = hypfs_info->uid;
 		ret->i_gid = hypfs_info->gid;
-		ret->i_atime = ret->i_mtime = inode_set_ctime_current(ret);
+		simple_inode_init_ts(ret);
 		if (S_ISDIR(mode))
 			set_nlink(ret, 2);
 	}
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index a4edb7ea66ea..214a1b67f80a 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -898,7 +898,7 @@ static void smp_start_secondary(void *cpuvoid)
 	S390_lowcore.restart_flags = 0;
 	restore_access_regs(S390_lowcore.access_regs_save_area);
 	cpu_init();
-	rcu_cpu_starting(cpu);
+	rcutree_report_cpu_starting(cpu);
 	init_cpu_timer();
 	vtime_init();
 	vdso_getcpu_init();
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 0122cc156952..31be90b241f7 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -455,3 +455,6 @@
 450  common	set_mempolicy_home_node	sys_set_mempolicy_home_node	sys_set_mempolicy_home_node
 451  common	cachestat		sys_cachestat			sys_cachestat
 452  common	fchmodat2		sys_fchmodat2			sys_fchmodat2
+454  common	futex_wake		sys_futex_wake			sys_futex_wake
+455  common	futex_wait		sys_futex_wait			sys_futex_wait
+456  common	futex_requeue		sys_futex_requeue			sys_futex_requeue
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 68adf1de8888..66bda6a8f918 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -522,7 +522,7 @@ static struct sched_domain_topology_level s390_topology[] = {
 	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
 	{ cpu_book_mask, SD_INIT_NAME(BOOK) },
 	{ cpu_drawer_mask, SD_INIT_NAME(DRAWER) },
-	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
 	{ NULL, },
 };
 
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index e90d585c4d3e..4bc5d488ab17 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -455,3 +455,6 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/sparc/kernel/cpumap.c b/arch/sparc/kernel/cpumap.c
index f07ea88a83af..8fcf2d8c6bd2 100644
--- a/arch/sparc/kernel/cpumap.c
+++ b/arch/sparc/kernel/cpumap.c
@@ -50,7 +50,7 @@ struct cpuinfo_tree {
 
 	/* Offsets into nodes[] for each level of the tree */
 	struct cpuinfo_level level[CPUINFO_LVL_MAX];
-	struct cpuinfo_node  nodes[];
+	struct cpuinfo_node  nodes[] __counted_by(total_nodes);
 };
 
 
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 4ed06c71c43f..8404c8e50394 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -498,3 +498,6 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/sparc/lib/checksum_32.S b/arch/sparc/lib/checksum_32.S
index 84ad709cbecb..66eda40fce36 100644
--- a/arch/sparc/lib/checksum_32.S
+++ b/arch/sparc/lib/checksum_32.S
@@ -453,5 +453,5 @@ ccslow:	cmp	%g1, 0
  * we only bother with faults on loads... */
 
 cc_fault:
-	ret
+	retl
 	 clr	%o0
diff --git a/arch/um/os-Linux/drivers/ethertap_user.c b/arch/um/os-Linux/drivers/ethertap_user.c
index 9483021d86dd..3363851a4ae8 100644
--- a/arch/um/os-Linux/drivers/ethertap_user.c
+++ b/arch/um/os-Linux/drivers/ethertap_user.c
@@ -105,7 +105,7 @@ static int etap_tramp(char *dev, char *gate, int control_me,
 	sprintf(data_fd_buf, "%d", data_remote);
 	sprintf(version_buf, "%d", UML_NET_VERSION);
 	if (gate != NULL) {
-		strncpy(gate_buf, gate, 15);
+		strscpy(gate_buf, gate, sizeof(gate_buf));
 		args = setup_args;
 	}
 	else args = nosetup_args;
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 66bfabae8814..ad478a2b49e2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -28,7 +28,6 @@ config X86_64
 	select ARCH_HAS_GIGANTIC_PAGE
 	select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
 	select ARCH_SUPPORTS_PER_VMA_LOCK
-	select ARCH_USE_CMPXCHG_LOCKREF
 	select HAVE_ARCH_SOFT_DIRTY
 	select MODULES_USE_ELF_RELA
 	select NEED_DMA_MAP_STATE
@@ -118,6 +117,7 @@ config X86
 	select ARCH_SUPPORTS_LTO_CLANG
 	select ARCH_SUPPORTS_LTO_CLANG_THIN
 	select ARCH_USE_BUILTIN_BSWAP
+	select ARCH_USE_CMPXCHG_LOCKREF		if X86_CMPXCHG64
 	select ARCH_USE_MEMTEST
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
@@ -1534,6 +1534,7 @@ config NUMA
 	depends on X86_64 || (X86_32 && HIGHMEM64G && X86_BIGSMP)
 	default y if X86_BIGSMP
 	select USE_PERCPU_NUMA_NODE_ID
+	select OF_NUMA if OF
 	help
 	  Enable NUMA (Non-Uniform Memory Access) support.
 
@@ -2954,6 +2955,15 @@ config IA32_EMULATION
 	  64-bit kernel. You should likely turn this on, unless you're
 	  100% sure that you don't have any 32-bit programs left.
 
+config IA32_EMULATION_DEFAULT_DISABLED
+	bool "IA32 emulation disabled by default"
+	default n
+	depends on IA32_EMULATION
+	help
+	  Disable IA32 emulation by default. This prevents 32-bit processes
+	  from being loaded and blocks access to 32-bit syscalls. If unsure,
+	  leave this option at its default value.
+
 config X86_X32_ABI
 	bool "x32 ABI for 64-bit mode"
 	depends on X86_64
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 5bfe5caaa444..76da1e8b3eb0 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -43,7 +43,7 @@ endif
 
 # How to compile the 16-bit code.  Note we always compile for -march=i386;
 # that way we can complain to the user if the CPU is insufficient.
-REALMODE_CFLAGS	:= -m16 -g -Os -DDISABLE_BRANCH_PROFILING -D__DISABLE_EXPORTS \
+REALMODE_CFLAGS	:= -std=gnu11 -m16 -g -Os -DDISABLE_BRANCH_PROFILING -D__DISABLE_EXPORTS \
 		   -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \
 		   -fno-strict-aliasing -fomit-frame-pointer -fno-pic \
 		   -mno-mmx -mno-sse $(call cc-option,-fcf-protection=none)
@@ -81,6 +81,7 @@ ifeq ($(CONFIG_X86_KERNEL_IBT),y)
 #   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104816
 #
 KBUILD_CFLAGS += $(call cc-option,-fcf-protection=branch -fno-jump-tables)
+KBUILD_RUSTFLAGS += -Zcf-protection=branch -Zno-jump-tables
 else
 KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
 endif
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index f33e45ed1437..3cece19b7473 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -89,7 +89,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
 
 SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
 
-sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|efi32_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p'
+sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|efi.._stub_entry\|efi\(32\)\?_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|_e\?data\|z_.*\)$$/\#define ZO_\2 0x\1/p'
 
 quiet_cmd_zoffset = ZOFFSET $@
       cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
index 9caf89063e77..55c98fdd67d2 100644
--- a/arch/x86/boot/compressed/acpi.c
+++ b/arch/x86/boot/compressed/acpi.c
@@ -30,13 +30,13 @@ __efi_get_rsdp_addr(unsigned long cfg_tbl_pa, unsigned int cfg_tbl_len)
 	 * Search EFI system tables for RSDP. Preferred is ACPI_20_TABLE_GUID to
 	 * ACPI_TABLE_GUID because it has more features.
 	 */
-	rsdp_addr = efi_find_vendor_table(boot_params, cfg_tbl_pa, cfg_tbl_len,
+	rsdp_addr = efi_find_vendor_table(boot_params_ptr, cfg_tbl_pa, cfg_tbl_len,
 					  ACPI_20_TABLE_GUID);
 	if (rsdp_addr)
 		return (acpi_physical_address)rsdp_addr;
 
 	/* No ACPI_20_TABLE_GUID found, fallback to ACPI_TABLE_GUID. */
-	rsdp_addr = efi_find_vendor_table(boot_params, cfg_tbl_pa, cfg_tbl_len,
+	rsdp_addr = efi_find_vendor_table(boot_params_ptr, cfg_tbl_pa, cfg_tbl_len,
 					  ACPI_TABLE_GUID);
 	if (rsdp_addr)
 		return (acpi_physical_address)rsdp_addr;
@@ -56,15 +56,15 @@ static acpi_physical_address efi_get_rsdp_addr(void)
 	enum efi_type et;
 	int ret;
 
-	et = efi_get_type(boot_params);
+	et = efi_get_type(boot_params_ptr);
 	if (et == EFI_TYPE_NONE)
 		return 0;
 
-	systab_pa = efi_get_system_table(boot_params);
+	systab_pa = efi_get_system_table(boot_params_ptr);
 	if (!systab_pa)
 		error("EFI support advertised, but unable to locate system table.");
 
-	ret = efi_get_conf_table(boot_params, &cfg_tbl_pa, &cfg_tbl_len);
+	ret = efi_get_conf_table(boot_params_ptr, &cfg_tbl_pa, &cfg_tbl_len);
 	if (ret || !cfg_tbl_pa)
 		error("EFI config table not found.");
 
@@ -156,7 +156,7 @@ acpi_physical_address get_rsdp_addr(void)
 {
 	acpi_physical_address pa;
 
-	pa = boot_params->acpi_rsdp_addr;
+	pa = boot_params_ptr->acpi_rsdp_addr;
 
 	if (!pa)
 		pa = efi_get_rsdp_addr();
@@ -210,7 +210,7 @@ static unsigned long get_acpi_srat_table(void)
 	rsdp = (struct acpi_table_rsdp *)get_cmdline_acpi_rsdp();
 	if (!rsdp)
 		rsdp = (struct acpi_table_rsdp *)(long)
-			boot_params->acpi_rsdp_addr;
+			boot_params_ptr->acpi_rsdp_addr;
 
 	if (!rsdp)
 		return 0;
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index f1add5d85da9..c1bb180973ea 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -14,9 +14,9 @@ static inline char rdfs8(addr_t addr)
 #include "../cmdline.c"
 unsigned long get_cmd_line_ptr(void)
 {
-	unsigned long cmd_line_ptr = boot_params->hdr.cmd_line_ptr;
+	unsigned long cmd_line_ptr = boot_params_ptr->hdr.cmd_line_ptr;
 
-	cmd_line_ptr |= (u64)boot_params->ext_cmd_line_ptr << 32;
+	cmd_line_ptr |= (u64)boot_params_ptr->ext_cmd_line_ptr << 32;
 
 	return cmd_line_ptr;
 }
diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c
index 08f93b0401bb..473ba59b82a8 100644
--- a/arch/x86/boot/compressed/ident_map_64.c
+++ b/arch/x86/boot/compressed/ident_map_64.c
@@ -159,8 +159,9 @@ void initialize_identity_maps(void *rmode)
 	 * or does not touch all the pages covering them.
 	 */
 	kernel_add_identity_map((unsigned long)_head, (unsigned long)_end);
-	boot_params = rmode;
-	kernel_add_identity_map((unsigned long)boot_params, (unsigned long)(boot_params + 1));
+	boot_params_ptr = rmode;
+	kernel_add_identity_map((unsigned long)boot_params_ptr,
+				(unsigned long)(boot_params_ptr + 1));
 	cmdline = get_cmd_line_ptr();
 	kernel_add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE);
 
@@ -168,7 +169,7 @@ void initialize_identity_maps(void *rmode)
 	 * Also map the setup_data entries passed via boot_params in case they
 	 * need to be accessed by uncompressed kernel via the identity mapping.
 	 */
-	sd = (struct setup_data *)boot_params->hdr.setup_data;
+	sd = (struct setup_data *)boot_params_ptr->hdr.setup_data;
 	while (sd) {
 		unsigned long sd_addr = (unsigned long)sd;
 
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 9193acf0e9cd..dec961c6d16a 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -63,7 +63,7 @@ static unsigned long get_boot_seed(void)
 	unsigned long hash = 0;
 
 	hash = rotate_xor(hash, build_str, sizeof(build_str));
-	hash = rotate_xor(hash, boot_params, sizeof(*boot_params));
+	hash = rotate_xor(hash, boot_params_ptr, sizeof(*boot_params_ptr));
 
 	return hash;
 }
@@ -383,7 +383,7 @@ static void handle_mem_options(void)
 static void mem_avoid_init(unsigned long input, unsigned long input_size,
 			   unsigned long output)
 {
-	unsigned long init_size = boot_params->hdr.init_size;
+	unsigned long init_size = boot_params_ptr->hdr.init_size;
 	u64 initrd_start, initrd_size;
 	unsigned long cmd_line, cmd_line_size;
 
@@ -395,10 +395,10 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
 	mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input;
 
 	/* Avoid initrd. */
-	initrd_start  = (u64)boot_params->ext_ramdisk_image << 32;
-	initrd_start |= boot_params->hdr.ramdisk_image;
-	initrd_size  = (u64)boot_params->ext_ramdisk_size << 32;
-	initrd_size |= boot_params->hdr.ramdisk_size;
+	initrd_start  = (u64)boot_params_ptr->ext_ramdisk_image << 32;
+	initrd_start |= boot_params_ptr->hdr.ramdisk_image;
+	initrd_size  = (u64)boot_params_ptr->ext_ramdisk_size << 32;
+	initrd_size |= boot_params_ptr->hdr.ramdisk_size;
 	mem_avoid[MEM_AVOID_INITRD].start = initrd_start;
 	mem_avoid[MEM_AVOID_INITRD].size = initrd_size;
 	/* No need to set mapping for initrd, it will be handled in VO. */
@@ -413,8 +413,8 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
 	}
 
 	/* Avoid boot parameters. */
-	mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params;
-	mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params);
+	mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params_ptr;
+	mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params_ptr);
 
 	/* We don't need to set a mapping for setup_data. */
 
@@ -447,7 +447,7 @@ static bool mem_avoid_overlap(struct mem_vector *img,
 	}
 
 	/* Avoid all entries in the setup_data linked list. */
-	ptr = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data;
+	ptr = (struct setup_data *)(unsigned long)boot_params_ptr->hdr.setup_data;
 	while (ptr) {
 		struct mem_vector avoid;
 
@@ -706,7 +706,7 @@ static inline bool memory_type_is_free(efi_memory_desc_t *md)
 static bool
 process_efi_entries(unsigned long minimum, unsigned long image_size)
 {
-	struct efi_info *e = &boot_params->efi_info;
+	struct efi_info *e = &boot_params_ptr->efi_info;
 	bool efi_mirror_found = false;
 	struct mem_vector region;
 	efi_memory_desc_t *md;
@@ -777,8 +777,8 @@ static void process_e820_entries(unsigned long minimum,
 	struct boot_e820_entry *entry;
 
 	/* Verify potential e820 positions, appending to slots list. */
-	for (i = 0; i < boot_params->e820_entries; i++) {
-		entry = &boot_params->e820_table[i];
+	for (i = 0; i < boot_params_ptr->e820_entries; i++) {
+		entry = &boot_params_ptr->e820_table[i];
 		/* Skip non-RAM entries. */
 		if (entry->type != E820_TYPE_RAM)
 			continue;
@@ -852,7 +852,7 @@ void choose_random_location(unsigned long input,
 		return;
 	}
 
-	boot_params->hdr.loadflags |= KASLR_FLAG;
+	boot_params_ptr->hdr.loadflags |= KASLR_FLAG;
 
 	if (IS_ENABLED(CONFIG_X86_32))
 		mem_limit = KERNEL_IMAGE_SIZE;
diff --git a/arch/x86/boot/compressed/mem.c b/arch/x86/boot/compressed/mem.c
index 3c1609245f2a..b3c3a4be7471 100644
--- a/arch/x86/boot/compressed/mem.c
+++ b/arch/x86/boot/compressed/mem.c
@@ -54,17 +54,17 @@ bool init_unaccepted_memory(void)
 	enum efi_type et;
 	int ret;
 
-	et = efi_get_type(boot_params);
+	et = efi_get_type(boot_params_ptr);
 	if (et == EFI_TYPE_NONE)
 		return false;
 
-	ret = efi_get_conf_table(boot_params, &cfg_table_pa, &cfg_table_len);
+	ret = efi_get_conf_table(boot_params_ptr, &cfg_table_pa, &cfg_table_len);
 	if (ret) {
 		warn("EFI config table not found.");
 		return false;
 	}
 
-	table = (void *)efi_find_vendor_table(boot_params, cfg_table_pa,
+	table = (void *)efi_find_vendor_table(boot_params_ptr, cfg_table_pa,
 					      cfg_table_len, guid);
 	if (!table)
 		return false;
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index f711f2a85862..b99e08e6815b 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -46,7 +46,7 @@ void *memmove(void *dest, const void *src, size_t n);
 /*
  * This is set up by the setup-routine at boot-time
  */
-struct boot_params *boot_params;
+struct boot_params *boot_params_ptr;
 
 struct port_io_ops pio_ops;
 
@@ -132,8 +132,8 @@ void __putstr(const char *s)
 	if (lines == 0 || cols == 0)
 		return;
 
-	x = boot_params->screen_info.orig_x;
-	y = boot_params->screen_info.orig_y;
+	x = boot_params_ptr->screen_info.orig_x;
+	y = boot_params_ptr->screen_info.orig_y;
 
 	while ((c = *s++) != '\0') {
 		if (c == '\n') {
@@ -154,8 +154,8 @@ void __putstr(const char *s)
 		}
 	}
 
-	boot_params->screen_info.orig_x = x;
-	boot_params->screen_info.orig_y = y;
+	boot_params_ptr->screen_info.orig_x = x;
+	boot_params_ptr->screen_info.orig_y = y;
 
 	pos = (x + cols * y) * 2;	/* Update cursor position */
 	outb(14, vidport);
@@ -382,14 +382,14 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output)
 	size_t entry_offset;
 
 	/* Retain x86 boot parameters pointer passed from startup_32/64. */
-	boot_params = rmode;
+	boot_params_ptr = rmode;
 
 	/* Clear flags intended for solely in-kernel use. */
-	boot_params->hdr.loadflags &= ~KASLR_FLAG;
+	boot_params_ptr->hdr.loadflags &= ~KASLR_FLAG;
 
-	sanitize_boot_params(boot_params);
+	sanitize_boot_params(boot_params_ptr);
 
-	if (boot_params->screen_info.orig_video_mode == 7) {
+	if (boot_params_ptr->screen_info.orig_video_mode == 7) {
 		vidmem = (char *) 0xb0000;
 		vidport = 0x3b4;
 	} else {
@@ -397,8 +397,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output)
 		vidport = 0x3d4;
 	}
 
-	lines = boot_params->screen_info.orig_video_lines;
-	cols = boot_params->screen_info.orig_video_cols;
+	lines = boot_params_ptr->screen_info.orig_video_lines;
+	cols = boot_params_ptr->screen_info.orig_video_cols;
 
 	init_default_io_ops();
 
@@ -417,7 +417,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output)
 	 * so that early debugging output from the RSDP parsing code can be
 	 * collected.
 	 */
-	boot_params->acpi_rsdp_addr = get_rsdp_addr();
+	boot_params_ptr->acpi_rsdp_addr = get_rsdp_addr();
 
 	debug_putstr("early console in extract_kernel\n");
 
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index cc70d3fb9049..c0d502bd8716 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -61,7 +61,6 @@ extern memptr free_mem_ptr;
 extern memptr free_mem_end_ptr;
 void *malloc(int size);
 void free(void *where);
-extern struct boot_params *boot_params;
 void __putstr(const char *s);
 void __puthex(unsigned long value);
 #define error_putstr(__x)  __putstr(__x)
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index 7939eb6e6ce9..51f957b24ba7 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -28,7 +28,6 @@ static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
  */
 unsigned long *trampoline_32bit __section(".data");
 
-extern struct boot_params *boot_params;
 int cmdline_find_option_bool(const char *option);
 
 static unsigned long find_trampoline_placement(void)
@@ -49,7 +48,7 @@ static unsigned long find_trampoline_placement(void)
 	 *
 	 * Only look for values in the legacy ROM for non-EFI system.
 	 */
-	signature = (char *)&boot_params->efi_info.efi_loader_signature;
+	signature = (char *)&boot_params_ptr->efi_info.efi_loader_signature;
 	if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) &&
 	    strncmp(signature, EFI64_LOADER_SIGNATURE, 4)) {
 		ebda_start = *(unsigned short *)0x40e << 4;
@@ -65,10 +64,10 @@ static unsigned long find_trampoline_placement(void)
 	bios_start = round_down(bios_start, PAGE_SIZE);
 
 	/* Find the first usable memory region under bios_start. */
-	for (i = boot_params->e820_entries - 1; i >= 0; i--) {
+	for (i = boot_params_ptr->e820_entries - 1; i >= 0; i--) {
 		unsigned long new = bios_start;
 
-		entry = &boot_params->e820_table[i];
+		entry = &boot_params_ptr->e820_table[i];
 
 		/* Skip all entries above bios_start. */
 		if (bios_start <= entry->addr)
@@ -107,7 +106,7 @@ asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable)
 	bool l5_required = false;
 
 	/* Initialize boot_params. Required for cmdline_find_option_bool(). */
-	boot_params = bp;
+	boot_params_ptr = bp;
 
 	/*
 	 * Check if LA57 is desired and supported.
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 80d76aea1f7b..454acd7a2daf 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -25,7 +25,7 @@
 #include "error.h"
 #include "../msr.h"
 
-struct ghcb boot_ghcb_page __aligned(PAGE_SIZE);
+static struct ghcb boot_ghcb_page __aligned(PAGE_SIZE);
 struct ghcb *boot_ghcb;
 
 /*
@@ -615,7 +615,7 @@ void sev_prep_identity_maps(unsigned long top_level_pgt)
 	 * accessed after switchover.
 	 */
 	if (sev_snp_enabled()) {
-		unsigned long cc_info_pa = boot_params->cc_blob_address;
+		unsigned long cc_info_pa = boot_params_ptr->cc_blob_address;
 		struct cc_blob_sev_info *cc_info;
 
 		kernel_add_identity_map(cc_info_pa, cc_info_pa + sizeof(*cc_info));
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index b22f34b8684a..083ec6d7722a 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -43,11 +43,13 @@ SECTIONS
 		*(.rodata.*)
 		_erodata = . ;
 	}
-	.data :	{
+	.data :	ALIGN(0x1000) {
 		_data = . ;
 		*(.data)
 		*(.data.*)
-		*(.bss.efistub)
+
+		/* Add 4 bytes of extra space for a CRC-32 checksum */
+		. = ALIGN(. + 4, 0x200);
 		_edata = . ;
 	}
 	. = ALIGN(L1_CACHE_BYTES);
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index b04ca8e2b213..b2771710ed98 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -36,66 +36,20 @@ SYSSEG		= 0x1000		/* historical load address >> 4 */
 #define ROOT_RDONLY 1
 #endif
 
+	.set	salign, 0x1000
+	.set	falign, 0x200
+
 	.code16
 	.section ".bstext", "ax"
-
-	.global bootsect_start
-bootsect_start:
 #ifdef CONFIG_EFI_STUB
 	# "MZ", MS-DOS header
 	.word	MZ_MAGIC
-#endif
-
-	# Normalize the start address
-	ljmp	$BOOTSEG, $start2
-
-start2:
-	movw	%cs, %ax
-	movw	%ax, %ds
-	movw	%ax, %es
-	movw	%ax, %ss
-	xorw	%sp, %sp
-	sti
-	cld
-
-	movw	$bugger_off_msg, %si
-
-msg_loop:
-	lodsb
-	andb	%al, %al
-	jz	bs_die
-	movb	$0xe, %ah
-	movw	$7, %bx
-	int	$0x10
-	jmp	msg_loop
-
-bs_die:
-	# Allow the user to press a key, then reboot
-	xorw	%ax, %ax
-	int	$0x16
-	int	$0x19
-
-	# int 0x19 should never return.  In case it does anyway,
-	# invoke the BIOS reset code...
-	ljmp	$0xf000,$0xfff0
-
-#ifdef CONFIG_EFI_STUB
 	.org	0x38
 	#
 	# Offset to the PE header.
 	#
 	.long	LINUX_PE_MAGIC
 	.long	pe_header
-#endif /* CONFIG_EFI_STUB */
-
-	.section ".bsdata", "a"
-bugger_off_msg:
-	.ascii	"Use a boot loader.\r\n"
-	.ascii	"\n"
-	.ascii	"Remove disk and press any key to reboot...\r\n"
-	.byte	0
-
-#ifdef CONFIG_EFI_STUB
 pe_header:
 	.long	PE_MAGIC
 
@@ -124,30 +78,26 @@ optional_header:
 	.byte	0x02				# MajorLinkerVersion
 	.byte	0x14				# MinorLinkerVersion
 
-	# Filled in by build.c
-	.long	0				# SizeOfCode
+	.long	ZO__data			# SizeOfCode
 
-	.long	0				# SizeOfInitializedData
+	.long	ZO__end - ZO__data		# SizeOfInitializedData
 	.long	0				# SizeOfUninitializedData
 
-	# Filled in by build.c
-	.long	0x0000				# AddressOfEntryPoint
+	.long	setup_size + ZO_efi_pe_entry	# AddressOfEntryPoint
 
-	.long	0x0200				# BaseOfCode
+	.long	setup_size			# BaseOfCode
 #ifdef CONFIG_X86_32
 	.long	0				# data
 #endif
 
 extra_header_fields:
-	# PE specification requires ImageBase to be 64k aligned
-	.set	image_base, (LOAD_PHYSICAL_ADDR + 0xffff) & ~0xffff
 #ifdef CONFIG_X86_32
-	.long	image_base			# ImageBase
+	.long	0				# ImageBase
 #else
-	.quad	image_base			# ImageBase
+	.quad	0				# ImageBase
 #endif
-	.long	0x20				# SectionAlignment
-	.long	0x20				# FileAlignment
+	.long	salign				# SectionAlignment
+	.long	falign				# FileAlignment
 	.word	0				# MajorOperatingSystemVersion
 	.word	0				# MinorOperatingSystemVersion
 	.word	LINUX_EFISTUB_MAJOR_VERSION	# MajorImageVersion
@@ -156,12 +106,10 @@ extra_header_fields:
 	.word	0				# MinorSubsystemVersion
 	.long	0				# Win32VersionValue
 
-	#
-	# The size of the bzImage is written in tools/build.c
-	#
-	.long	0				# SizeOfImage
+	.long	setup_size + ZO__end + pecompat_vsize
+						# SizeOfImage
 
-	.long	0x200				# SizeOfHeaders
+	.long	salign				# SizeOfHeaders
 	.long	0				# CheckSum
 	.word	IMAGE_SUBSYSTEM_EFI_APPLICATION	# Subsystem (EFI application)
 #ifdef CONFIG_EFI_DXE_MEM_ATTRIBUTES
@@ -192,87 +140,78 @@ extra_header_fields:
 
 	# Section table
 section_table:
-	#
-	# The offset & size fields are filled in by build.c.
-	#
 	.ascii	".setup"
 	.byte	0
 	.byte	0
-	.long	0
-	.long	0x0				# startup_{32,64}
-	.long	0				# Size of initialized data
-						# on disk
-	.long	0x0				# startup_{32,64}
-	.long	0				# PointerToRelocations
-	.long	0				# PointerToLineNumbers
-	.word	0				# NumberOfRelocations
-	.word	0				# NumberOfLineNumbers
-	.long	IMAGE_SCN_CNT_CODE		| \
-		IMAGE_SCN_MEM_READ		| \
-		IMAGE_SCN_MEM_EXECUTE		| \
-		IMAGE_SCN_ALIGN_16BYTES		# Characteristics
+	.long	setup_size - salign 		# VirtualSize
+	.long	salign				# VirtualAddress
+	.long	pecompat_fstart - salign	# SizeOfRawData
+	.long	salign				# PointerToRawData
 
-	#
-	# The EFI application loader requires a relocation section
-	# because EFI applications must be relocatable. The .reloc
-	# offset & size fields are filled in by build.c.
-	#
-	.ascii	".reloc"
-	.byte	0
-	.byte	0
-	.long	0
-	.long	0
-	.long	0				# SizeOfRawData
-	.long	0				# PointerToRawData
-	.long	0				# PointerToRelocations
-	.long	0				# PointerToLineNumbers
-	.word	0				# NumberOfRelocations
-	.word	0				# NumberOfLineNumbers
+	.long	0, 0, 0
 	.long	IMAGE_SCN_CNT_INITIALIZED_DATA	| \
 		IMAGE_SCN_MEM_READ		| \
-		IMAGE_SCN_MEM_DISCARDABLE	| \
-		IMAGE_SCN_ALIGN_1BYTES		# Characteristics
+		IMAGE_SCN_MEM_DISCARDABLE	# Characteristics
 
 #ifdef CONFIG_EFI_MIXED
-	#
-	# The offset & size fields are filled in by build.c.
-	#
 	.asciz	".compat"
-	.long	0
-	.long	0x0
-	.long	0				# Size of initialized data
-						# on disk
-	.long	0x0
-	.long	0				# PointerToRelocations
-	.long	0				# PointerToLineNumbers
-	.word	0				# NumberOfRelocations
-	.word	0				# NumberOfLineNumbers
+
+	.long	8				# VirtualSize
+	.long	setup_size + ZO__end		# VirtualAddress
+	.long	pecompat_fsize			# SizeOfRawData
+	.long	pecompat_fstart			# PointerToRawData
+
+	.long	0, 0, 0
 	.long	IMAGE_SCN_CNT_INITIALIZED_DATA	| \
 		IMAGE_SCN_MEM_READ		| \
-		IMAGE_SCN_MEM_DISCARDABLE	| \
-		IMAGE_SCN_ALIGN_1BYTES		# Characteristics
+		IMAGE_SCN_MEM_DISCARDABLE	# Characteristics
+
+	/*
+	 * Put the IA-32 machine type and the associated entry point address in
+	 * the .compat section, so loaders can figure out which other execution
+	 * modes this image supports.
+	 */
+	.pushsection ".pecompat", "a", @progbits
+	.balign	falign
+	.set	pecompat_vsize, salign
+	.globl	pecompat_fstart
+pecompat_fstart:
+	.byte	0x1				# Version
+	.byte	8				# Size
+	.word	IMAGE_FILE_MACHINE_I386		# PE machine type
+	.long	setup_size + ZO_efi32_pe_entry	# Entrypoint
+	.popsection
+#else
+	.set	pecompat_vsize, 0
+	.set	pecompat_fstart, setup_size
 #endif
-
-	#
-	# The offset & size fields are filled in by build.c.
-	#
 	.ascii	".text"
 	.byte	0
 	.byte	0
 	.byte	0
-	.long	0
-	.long	0x0				# startup_{32,64}
-	.long	0				# Size of initialized data
+	.long	ZO__data			# VirtualSize
+	.long	setup_size			# VirtualAddress
+	.long	ZO__data			# Size of initialized data
 						# on disk
-	.long	0x0				# startup_{32,64}
+	.long	setup_size			# PointerToRawData
 	.long	0				# PointerToRelocations
 	.long	0				# PointerToLineNumbers
 	.word	0				# NumberOfRelocations
 	.word	0				# NumberOfLineNumbers
 	.long	IMAGE_SCN_CNT_CODE		| \
 		IMAGE_SCN_MEM_READ		| \
-		IMAGE_SCN_MEM_EXECUTE		| \
-		IMAGE_SCN_ALIGN_16BYTES		# Characteristics
+		IMAGE_SCN_MEM_EXECUTE		# Characteristics
+
+	.ascii	".data\0\0\0"
+	.long	ZO__end - ZO__data		# VirtualSize
+	.long	setup_size + ZO__data		# VirtualAddress
+	.long	ZO__edata - ZO__data		# SizeOfRawData
+	.long	setup_size + ZO__data		# PointerToRawData
+
+	.long	0, 0, 0
+	.long	IMAGE_SCN_CNT_INITIALIZED_DATA	| \
+		IMAGE_SCN_MEM_READ		| \
+		IMAGE_SCN_MEM_WRITE		# Characteristics
 
 	.set	section_count, (. - section_table) / 40
 #endif /* CONFIG_EFI_STUB */
@@ -286,12 +225,12 @@ sentinel:	.byte 0xff, 0xff        /* Used to detect broken loaders */
 
 	.globl	hdr
 hdr:
-setup_sects:	.byte 0			/* Filled in by build.c */
+		.byte setup_sects - 1
 root_flags:	.word ROOT_RDONLY
-syssize:	.long 0			/* Filled in by build.c */
+syssize:	.long ZO__edata / 16
 ram_size:	.word 0			/* Obsolete */
 vid_mode:	.word SVGA_MODE
-root_dev:	.word 0			/* Filled in by build.c */
+root_dev:	.word 0			/* Default to major/minor 0/0 */
 boot_flag:	.word 0xAA55
 
 	# offset 512, entry point
@@ -579,9 +518,25 @@ pref_address:		.quad LOAD_PHYSICAL_ADDR	# preferred load addr
 # define INIT_SIZE VO_INIT_SIZE
 #endif
 
+	.macro		__handover_offset
+#ifndef CONFIG_EFI_HANDOVER_PROTOCOL
+	.long		0
+#elif !defined(CONFIG_X86_64)
+	.long		ZO_efi32_stub_entry
+#else
+	/* Yes, this is really how we defined it :( */
+	.long		ZO_efi64_stub_entry - 0x200
+#ifdef CONFIG_EFI_MIXED
+	.if		ZO_efi32_stub_entry != ZO_efi64_stub_entry - 0x200
+	.error		"32-bit and 64-bit EFI entry points do not match"
+	.endif
+#endif
+#endif
+	.endm
+
 init_size:		.long INIT_SIZE		# kernel initialization size
-handover_offset:	.long 0			# Filled in by build.c
-kernel_info_offset:	.long 0			# Filled in by build.c
+handover_offset:	__handover_offset
+kernel_info_offset:	.long ZO_kernel_info
 
 # End of setup header #####################################################
 
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index 49546c247ae2..83bb7efad8ae 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -10,10 +10,11 @@ ENTRY(_start)
 SECTIONS
 {
 	. = 0;
-	.bstext		: { *(.bstext) }
-	.bsdata		: { *(.bsdata) }
+	.bstext	: {
+		*(.bstext)
+		. = 495;
+	} =0xffffffff
 
-	. = 495;
 	.header		: { *(.header) }
 	.entrytext	: { *(.entrytext) }
 	.inittext	: { *(.inittext) }
@@ -35,11 +36,16 @@ SECTIONS
 	. = ALIGN(16);
 	.data		: { *(.data*) }
 
+	.pecompat	: { *(.pecompat) }
+	PROVIDE(pecompat_fsize = setup_size - pecompat_fstart);
+
 	.signature	: {
 		setup_sig = .;
 		LONG(0x5a5aaa55)
-	}
 
+		setup_size = ALIGN(ABSOLUTE(.), 4096);
+		setup_sects = ABSOLUTE(setup_size / 512);
+	}
 
 	. = ALIGN(16);
 	.bss		:
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index bd247692b701..10311d77c67f 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -40,10 +40,6 @@ typedef unsigned char  u8;
 typedef unsigned short u16;
 typedef unsigned int   u32;
 
-#define DEFAULT_MAJOR_ROOT 0
-#define DEFAULT_MINOR_ROOT 0
-#define DEFAULT_ROOT_DEV (DEFAULT_MAJOR_ROOT << 8 | DEFAULT_MINOR_ROOT)
-
 /* Minimal number of setup sectors */
 #define SETUP_SECT_MIN 5
 #define SETUP_SECT_MAX 64
@@ -51,22 +47,7 @@ typedef unsigned int   u32;
 /* This must be large enough to hold the entire setup */
 u8 buf[SETUP_SECT_MAX*512];
 
-#define PECOFF_RELOC_RESERVE 0x20
-
-#ifdef CONFIG_EFI_MIXED
-#define PECOFF_COMPAT_RESERVE 0x20
-#else
-#define PECOFF_COMPAT_RESERVE 0x0
-#endif
-
-static unsigned long efi32_stub_entry;
-static unsigned long efi64_stub_entry;
-static unsigned long efi_pe_entry;
-static unsigned long efi32_pe_entry;
-static unsigned long kernel_info;
-static unsigned long startup_64;
-static unsigned long _ehead;
-static unsigned long _end;
+static unsigned long _edata;
 
 /*----------------------------------------------------------------------*/
 
@@ -152,180 +133,6 @@ static void usage(void)
 	die("Usage: build setup system zoffset.h image");
 }
 
-#ifdef CONFIG_EFI_STUB
-
-static void update_pecoff_section_header_fields(char *section_name, u32 vma, u32 size, u32 datasz, u32 offset)
-{
-	unsigned int pe_header;
-	unsigned short num_sections;
-	u8 *section;
-
-	pe_header = get_unaligned_le32(&buf[0x3c]);
-	num_sections = get_unaligned_le16(&buf[pe_header + 6]);
-
-#ifdef CONFIG_X86_32
-	section = &buf[pe_header + 0xa8];
-#else
-	section = &buf[pe_header + 0xb8];
-#endif
-
-	while (num_sections > 0) {
-		if (strncmp((char*)section, section_name, 8) == 0) {
-			/* section header size field */
-			put_unaligned_le32(size, section + 0x8);
-
-			/* section header vma field */
-			put_unaligned_le32(vma, section + 0xc);
-
-			/* section header 'size of initialised data' field */
-			put_unaligned_le32(datasz, section + 0x10);
-
-			/* section header 'file offset' field */
-			put_unaligned_le32(offset, section + 0x14);
-
-			break;
-		}
-		section += 0x28;
-		num_sections--;
-	}
-}
-
-static void update_pecoff_section_header(char *section_name, u32 offset, u32 size)
-{
-	update_pecoff_section_header_fields(section_name, offset, size, size, offset);
-}
-
-static void update_pecoff_setup_and_reloc(unsigned int size)
-{
-	u32 setup_offset = 0x200;
-	u32 reloc_offset = size - PECOFF_RELOC_RESERVE - PECOFF_COMPAT_RESERVE;
-#ifdef CONFIG_EFI_MIXED
-	u32 compat_offset = reloc_offset + PECOFF_RELOC_RESERVE;
-#endif
-	u32 setup_size = reloc_offset - setup_offset;
-
-	update_pecoff_section_header(".setup", setup_offset, setup_size);
-	update_pecoff_section_header(".reloc", reloc_offset, PECOFF_RELOC_RESERVE);
-
-	/*
-	 * Modify .reloc section contents with a single entry. The
-	 * relocation is applied to offset 10 of the relocation section.
-	 */
-	put_unaligned_le32(reloc_offset + 10, &buf[reloc_offset]);
-	put_unaligned_le32(10, &buf[reloc_offset + 4]);
-
-#ifdef CONFIG_EFI_MIXED
-	update_pecoff_section_header(".compat", compat_offset, PECOFF_COMPAT_RESERVE);
-
-	/*
-	 * Put the IA-32 machine type (0x14c) and the associated entry point
-	 * address in the .compat section, so loaders can figure out which other
-	 * execution modes this image supports.
-	 */
-	buf[compat_offset] = 0x1;
-	buf[compat_offset + 1] = 0x8;
-	put_unaligned_le16(0x14c, &buf[compat_offset + 2]);
-	put_unaligned_le32(efi32_pe_entry + size, &buf[compat_offset + 4]);
-#endif
-}
-
-static void update_pecoff_text(unsigned int text_start, unsigned int file_sz,
-			       unsigned int init_sz)
-{
-	unsigned int pe_header;
-	unsigned int text_sz = file_sz - text_start;
-	unsigned int bss_sz = init_sz - file_sz;
-
-	pe_header = get_unaligned_le32(&buf[0x3c]);
-
-	/*
-	 * The PE/COFF loader may load the image at an address which is
-	 * misaligned with respect to the kernel_alignment field in the setup
-	 * header.
-	 *
-	 * In order to avoid relocating the kernel to correct the misalignment,
-	 * add slack to allow the buffer to be aligned within the declared size
-	 * of the image.
-	 */
-	bss_sz	+= CONFIG_PHYSICAL_ALIGN;
-	init_sz	+= CONFIG_PHYSICAL_ALIGN;
-
-	/*
-	 * Size of code: Subtract the size of the first sector (512 bytes)
-	 * which includes the header.
-	 */
-	put_unaligned_le32(file_sz - 512 + bss_sz, &buf[pe_header + 0x1c]);
-
-	/* Size of image */
-	put_unaligned_le32(init_sz, &buf[pe_header + 0x50]);
-
-	/*
-	 * Address of entry point for PE/COFF executable
-	 */
-	put_unaligned_le32(text_start + efi_pe_entry, &buf[pe_header + 0x28]);
-
-	update_pecoff_section_header_fields(".text", text_start, text_sz + bss_sz,
-					    text_sz, text_start);
-}
-
-static int reserve_pecoff_reloc_section(int c)
-{
-	/* Reserve 0x20 bytes for .reloc section */
-	memset(buf+c, 0, PECOFF_RELOC_RESERVE);
-	return PECOFF_RELOC_RESERVE;
-}
-
-static void efi_stub_defaults(void)
-{
-	/* Defaults for old kernel */
-#ifdef CONFIG_X86_32
-	efi_pe_entry = 0x10;
-#else
-	efi_pe_entry = 0x210;
-	startup_64 = 0x200;
-#endif
-}
-
-static void efi_stub_entry_update(void)
-{
-	unsigned long addr = efi32_stub_entry;
-
-#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
-#ifdef CONFIG_X86_64
-	/* Yes, this is really how we defined it :( */
-	addr = efi64_stub_entry - 0x200;
-#endif
-
-#ifdef CONFIG_EFI_MIXED
-	if (efi32_stub_entry != addr)
-		die("32-bit and 64-bit EFI entry points do not match\n");
-#endif
-#endif
-	put_unaligned_le32(addr, &buf[0x264]);
-}
-
-#else
-
-static inline void update_pecoff_setup_and_reloc(unsigned int size) {}
-static inline void update_pecoff_text(unsigned int text_start,
-				      unsigned int file_sz,
-				      unsigned int init_sz) {}
-static inline void efi_stub_defaults(void) {}
-static inline void efi_stub_entry_update(void) {}
-
-static inline int reserve_pecoff_reloc_section(int c)
-{
-	return 0;
-}
-#endif /* CONFIG_EFI_STUB */
-
-static int reserve_pecoff_compat_section(int c)
-{
-	/* Reserve 0x20 bytes for .compat section */
-	memset(buf+c, 0, PECOFF_COMPAT_RESERVE);
-	return PECOFF_COMPAT_RESERVE;
-}
-
 /*
  * Parse zoffset.h and find the entry points. We could just #include zoffset.h
  * but that would mean tools/build would have to be rebuilt every time. It's
@@ -354,14 +161,7 @@ static void parse_zoffset(char *fname)
 	p = (char *)buf;
 
 	while (p && *p) {
-		PARSE_ZOFS(p, efi32_stub_entry);
-		PARSE_ZOFS(p, efi64_stub_entry);
-		PARSE_ZOFS(p, efi_pe_entry);
-		PARSE_ZOFS(p, efi32_pe_entry);
-		PARSE_ZOFS(p, kernel_info);
-		PARSE_ZOFS(p, startup_64);
-		PARSE_ZOFS(p, _ehead);
-		PARSE_ZOFS(p, _end);
+		PARSE_ZOFS(p, _edata);
 
 		p = strchr(p, '\n');
 		while (p && (*p == '\r' || *p == '\n'))
@@ -371,17 +171,14 @@ static void parse_zoffset(char *fname)
 
 int main(int argc, char ** argv)
 {
-	unsigned int i, sz, setup_sectors, init_sz;
+	unsigned int i, sz, setup_sectors;
 	int c;
-	u32 sys_size;
 	struct stat sb;
 	FILE *file, *dest;
 	int fd;
 	void *kernel;
 	u32 crc = 0xffffffffUL;
 
-	efi_stub_defaults();
-
 	if (argc != 5)
 		usage();
 	parse_zoffset(argv[3]);
@@ -403,72 +200,27 @@ int main(int argc, char ** argv)
 		die("Boot block hasn't got boot flag (0xAA55)");
 	fclose(file);
 
-	c += reserve_pecoff_compat_section(c);
-	c += reserve_pecoff_reloc_section(c);
-
 	/* Pad unused space with zeros */
-	setup_sectors = (c + 511) / 512;
+	setup_sectors = (c + 4095) / 4096;
+	setup_sectors *= 8;
 	if (setup_sectors < SETUP_SECT_MIN)
 		setup_sectors = SETUP_SECT_MIN;
 	i = setup_sectors*512;
 	memset(buf+c, 0, i-c);
 
-	update_pecoff_setup_and_reloc(i);
-
-	/* Set the default root device */
-	put_unaligned_le16(DEFAULT_ROOT_DEV, &buf[508]);
-
 	/* Open and stat the kernel file */
 	fd = open(argv[2], O_RDONLY);
 	if (fd < 0)
 		die("Unable to open `%s': %m", argv[2]);
 	if (fstat(fd, &sb))
 		die("Unable to stat `%s': %m", argv[2]);
-	sz = sb.st_size;
+	if (_edata != sb.st_size)	/* image must end exactly at _edata */
+		die("Unexpected file size `%s': %lu != %lld", argv[2],
+		    (unsigned long)_edata, (long long)sb.st_size);
+	sz = _edata - 4;
 	kernel = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0);
 	if (kernel == MAP_FAILED)
 		die("Unable to mmap '%s': %m", argv[2]);
-	/* Number of 16-byte paragraphs, including space for a 4-byte CRC */
-	sys_size = (sz + 15 + 4) / 16;
-#ifdef CONFIG_EFI_STUB
-	/*
-	 * COFF requires minimum 32-byte alignment of sections, and
-	 * adding a signature is problematic without that alignment.
-	 */
-	sys_size = (sys_size + 1) & ~1;
-#endif
-
-	/* Patch the setup code with the appropriate size parameters */
-	buf[0x1f1] = setup_sectors-1;
-	put_unaligned_le32(sys_size, &buf[0x1f4]);
-
-	init_sz = get_unaligned_le32(&buf[0x260]);
-#ifdef CONFIG_EFI_STUB
-	/*
-	 * The decompression buffer will start at ImageBase. When relocating
-	 * the compressed kernel to its end, we must ensure that the head
-	 * section does not get overwritten.  The head section occupies
-	 * [i, i + _ehead), and the destination is [init_sz - _end, init_sz).
-	 *
-	 * At present these should never overlap, because 'i' is at most 32k
-	 * because of SETUP_SECT_MAX, '_ehead' is less than 1k, and the
-	 * calculation of INIT_SIZE in boot/header.S ensures that
-	 * 'init_sz - _end' is at least 64k.
-	 *
-	 * For future-proofing, increase init_sz if necessary.
-	 */
-
-	if (init_sz - _end < i + _ehead) {
-		init_sz = (i + _ehead + _end + 4095) & ~4095;
-		put_unaligned_le32(init_sz, &buf[0x260]);
-	}
-#endif
-	update_pecoff_text(setup_sectors * 512, i + (sys_size * 16), init_sz);
-
-	efi_stub_entry_update();
-
-	/* Update kernel_info offset. */
-	put_unaligned_le32(kernel_info, &buf[0x268]);
 
 	crc = partial_crc32(buf, i, crc);
 	if (fwrite(buf, 1, i, dest) != i)
@@ -479,13 +231,6 @@ int main(int argc, char ** argv)
 	if (fwrite(kernel, 1, sz, dest) != sz)
 		die("Writing kernel failed");
 
-	/* Add padding leaving 4 bytes for the checksum */
-	while (sz++ < (sys_size*16) - 4) {
-		crc = partial_crc32_one('\0', crc);
-		if (fwrite("\0", 1, 1, dest) != 1)
-			die("Writing padding failed");
-	}
-
 	/* Write the CRC */
 	put_unaligned_le32(crc, buf);
 	if (fwrite(buf, 1, 4, dest) != 4)
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 1d6b863c42b0..2e1be592c220 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -119,7 +119,7 @@ static void __noreturn tdx_panic(const char *msg)
 	} message;
 
 	/* VMM assumes '\0' in byte 65, if the message took all 64 bytes */
-	strncpy(message.str, msg, 64);
+	strtomem_pad(message.str, msg, '\0');
 
 	args.r8  = message.r8;
 	args.r9  = message.r9;
diff --git a/arch/x86/configs/hardening.config b/arch/x86/configs/hardening.config
new file mode 100644
index 000000000000..7b497f3b7bc3
--- /dev/null
+++ b/arch/x86/configs/hardening.config
@@ -0,0 +1,14 @@
+# Basic kernel hardening options (specific to x86)
+
+# Modern libc no longer needs the fixed-position vsyscall mapping in
+# userspace; remove it as a possible target.
+CONFIG_LEGACY_VSYSCALL_NONE=y
+
+# Enable chip-specific IOMMU support.
+CONFIG_INTEL_IOMMU=y
+CONFIG_INTEL_IOMMU_DEFAULT_ON=y
+CONFIG_INTEL_IOMMU_SVM=y
+CONFIG_AMD_IOMMU=y
+
+# Enable CET Shadow Stack for userspace.
+CONFIG_X86_USER_SHADOW_STACK=y
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 1b411bbf3cb0..73abbbdd26f8 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -281,4 +281,5 @@ CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
 CONFIG_EARLY_PRINTK_DBGP=y
 CONFIG_DEBUG_BOOT_PARAMS=y
 CONFIG_UNWINDER_FRAME_POINTER=y
+CONFIG_DEBUG_ENTRY=y
 # CONFIG_64BIT is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 409e9182bd29..61e25f6209ed 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -276,3 +276,4 @@ CONFIG_BLK_DEV_IO_TRACE=y
 CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
 CONFIG_EARLY_PRINTK_DBGP=y
 CONFIG_DEBUG_BOOT_PARAMS=y
+CONFIG_DEBUG_ENTRY=y
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 93c60c0c9d4a..d813160b14d8 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -19,6 +19,7 @@
 #include <linux/nospec.h>
 #include <linux/syscalls.h>
 #include <linux/uaccess.h>
+#include <linux/init.h>
 
 #ifdef CONFIG_XEN_PV
 #include <xen/xen-ops.h>
@@ -70,7 +71,8 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
 	return false;
 }
 
-__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
+/* Returns true to return using SYSRET, or false to use IRET */
+__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
 {
 	add_random_kstack_offset();
 	nr = syscall_enter_from_user_mode(regs, nr);
@@ -84,6 +86,46 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
 
 	instrumentation_end();
 	syscall_exit_to_user_mode(regs);
+
+	/*
+	 * Check that the register state is valid for using SYSRET to exit
+	 * to userspace.  Otherwise use the slower but fully capable IRET
+	 * exit path.
+	 */
+
+	/* XEN PV guests always use the IRET path */
+	if (cpu_feature_enabled(X86_FEATURE_XENPV))
+		return false;
+
+	/* SYSRET requires RCX == RIP and R11 == EFLAGS */
+	if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
+		return false;
+
+	/* CS and SS must match the values set in MSR_STAR */
+	if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
+		return false;
+
+	/*
+	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+	 * in kernel space.  This essentially lets the user take over
+	 * the kernel, since userspace controls RSP.
+	 *
+	 * TASK_SIZE_MAX covers all user-accessible addresses other than
+	 * the deprecated vsyscall page.
+	 */
+	if (unlikely(regs->ip >= TASK_SIZE_MAX))
+		return false;
+
+	/*
+	 * SYSRET cannot restore RF.  It can restore TF, but unlike IRET,
+	 * restoring TF results in a trap from userspace immediately after
+	 * SYSRET.
+	 */
+	if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
+		return false;
+
+	/* Use SYSRET to exit to userspace */
+	return true;
 }
 #endif
 
@@ -96,6 +138,16 @@ static __always_inline int syscall_32_enter(struct pt_regs *regs)
 	return (int)regs->orig_ax;
 }
 
+#ifdef CONFIG_IA32_EMULATION
+bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED);
+
+static int ia32_emulation_override_cmdline(char *arg)
+{
+	return kstrtobool(arg, &__ia32_enabled);
+}
+early_param("ia32_emulation", ia32_emulation_override_cmdline);
+#endif
+
 /*
  * Invoke a 32-bit syscall.  Called with IRQs on in CONTEXT_KERNEL.
  */
@@ -182,8 +234,8 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
 	return true;
 }
 
-/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
-__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
+/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
+__visible noinstr bool do_fast_syscall_32(struct pt_regs *regs)
 {
 	/*
 	 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
@@ -201,41 +253,36 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
 
 	/* Invoke the syscall. If it failed, keep it simple: use IRET. */
 	if (!__do_fast_syscall_32(regs))
-		return 0;
+		return false;
 
-#ifdef CONFIG_X86_64
 	/*
-	 * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
-	 * SYSRETL is available on all 64-bit CPUs, so we don't need to
-	 * bother with SYSEXIT.
-	 *
-	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
-	 * because the ECX fixup above will ensure that this is essentially
-	 * never the case.
-	 */
-	return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
-		regs->ip == landing_pad &&
-		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
-#else
-	/*
-	 * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
-	 *
-	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
-	 * because the ECX fixup above will ensure that this is essentially
-	 * never the case.
-	 *
-	 * We don't allow syscalls at all from VM86 mode, but we still
-	 * need to check VM, because we might be returning from sys_vm86.
+	 * Check that the register state is valid for using SYSRETL/SYSEXIT
+	 * to exit to userspace.  Otherwise use the slower but fully capable
+	 * IRET exit path.
 	 */
-	return static_cpu_has(X86_FEATURE_SEP) &&
-		regs->cs == __USER_CS && regs->ss == __USER_DS &&
-		regs->ip == landing_pad &&
-		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
-#endif
+
+	/* XEN PV guests always use the IRET path */
+	if (cpu_feature_enabled(X86_FEATURE_XENPV))
+		return false;
+
+	/* EIP must point to the VDSO landing pad */
+	if (unlikely(regs->ip != landing_pad))
+		return false;
+
+	/* CS and SS must match the values set in MSR_STAR */
+	if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS))
+		return false;
+
+	/* If the TF, RF, or VM flags are set, use IRET */
+	if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)))
+		return false;
+
+	/* Use SYSRETL/SYSEXIT to exit to userspace */
+	return true;
 }
 
-/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
-__visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
+/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
+__visible noinstr bool do_SYSENTER_32(struct pt_regs *regs)
 {
 	/* SYSENTER loses RSP, but the vDSO saved it in RBP. */
 	regs->sp = regs->bp;
diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S
index bfb7bcb362bc..8c8d38f0cb1d 100644
--- a/arch/x86/entry/entry.S
+++ b/arch/x86/entry/entry.S
@@ -3,8 +3,8 @@
  * Common place for both 32- and 64-bit entry routines.
  */
 
+#include <linux/export.h>
 #include <linux/linkage.h>
-#include <asm/export.h>
 #include <asm/msr-index.h>
 
 .pushsection .noinstr.text, "ax"
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 6e6af42e044a..c73047bf9f4b 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -837,7 +837,7 @@ SYM_FUNC_START(entry_SYSENTER_32)
 
 	movl	%esp, %eax
 	call	do_SYSENTER_32
-	testl	%eax, %eax
+	testb	%al, %al
 	jz	.Lsyscall_32_done
 
 	STACKLEAK_ERASE
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 43606de22511..de6469dffe3a 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -18,6 +18,7 @@
  * - SYM_FUNC_START/END:Define functions in the symbol table.
  * - idtentry:		Define exception entry points.
  */
+#include <linux/export.h>
 #include <linux/linkage.h>
 #include <asm/segment.h>
 #include <asm/cache.h>
@@ -34,7 +35,6 @@
 #include <asm/asm.h>
 #include <asm/smap.h>
 #include <asm/pgtable_types.h>
-#include <asm/export.h>
 #include <asm/frame.h>
 #include <asm/trapnr.h>
 #include <asm/nospec-branch.h>
@@ -126,70 +126,8 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
 	 * In the Xen PV case we must use iret anyway.
 	 */
 
-	ALTERNATIVE "", "jmp	swapgs_restore_regs_and_return_to_usermode", \
-		X86_FEATURE_XENPV
-
-	movq	RCX(%rsp), %rcx
-	movq	RIP(%rsp), %r11
-
-	cmpq	%rcx, %r11	/* SYSRET requires RCX == RIP */
-	jne	swapgs_restore_regs_and_return_to_usermode
-
-	/*
-	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
-	 * in kernel space.  This essentially lets the user take over
-	 * the kernel, since userspace controls RSP.
-	 *
-	 * If width of "canonical tail" ever becomes variable, this will need
-	 * to be updated to remain correct on both old and new CPUs.
-	 *
-	 * Change top bits to match most significant bit (47th or 56th bit
-	 * depending on paging mode) in the address.
-	 */
-#ifdef CONFIG_X86_5LEVEL
-	ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
-		"shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
-#else
-	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
-	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
-#endif
-
-	/* If this changed %rcx, it was not canonical */
-	cmpq	%rcx, %r11
-	jne	swapgs_restore_regs_and_return_to_usermode
-
-	cmpq	$__USER_CS, CS(%rsp)		/* CS must match SYSRET */
-	jne	swapgs_restore_regs_and_return_to_usermode
-
-	movq	R11(%rsp), %r11
-	cmpq	%r11, EFLAGS(%rsp)		/* R11 == RFLAGS */
-	jne	swapgs_restore_regs_and_return_to_usermode
-
-	/*
-	 * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
-	 * restore RF properly. If the slowpath sets it for whatever reason, we
-	 * need to restore it correctly.
-	 *
-	 * SYSRET can restore TF, but unlike IRET, restoring TF results in a
-	 * trap from userspace immediately after SYSRET.  This would cause an
-	 * infinite loop whenever #DB happens with register state that satisfies
-	 * the opportunistic SYSRET conditions.  For example, single-stepping
-	 * this user code:
-	 *
-	 *           movq	$stuck_here, %rcx
-	 *           pushfq
-	 *           popq %r11
-	 *   stuck_here:
-	 *
-	 * would never get past 'stuck_here'.
-	 */
-	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
-	jnz	swapgs_restore_regs_and_return_to_usermode
-
-	/* nothing to check for RSP */
-
-	cmpq	$__USER_DS, SS(%rsp)		/* SS must match SYSRET */
-	jne	swapgs_restore_regs_and_return_to_usermode
+	ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
+		"jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
 
 	/*
 	 * We win! This label is here just for ease of understanding
@@ -1163,8 +1101,8 @@ SYM_CODE_START(asm_exc_nmi)
 	 * anyway.
 	 *
 	 * To handle this case we do the following:
-	 *  Check the a special location on the stack that contains
-	 *  a variable that is set when NMIs are executing.
+	 *  Check a special location on the stack that contains a
+	 *  variable that is set when NMIs are executing.
 	 *  The interrupted task's stack is also checked to see if it
 	 *  is an NMI stack.
 	 *  If the variable is not set and the stack is not the NMI
@@ -1237,7 +1175,6 @@ SYM_CODE_START(asm_exc_nmi)
 	 */
 
 	movq	%rsp, %rdi
-	movq	$-1, %rsi
 	call	exc_nmi
 
 	/*
@@ -1295,8 +1232,8 @@ SYM_CODE_START(asm_exc_nmi)
 	 * end_repeat_nmi, then we are a nested NMI.  We must not
 	 * modify the "iret" frame because it's being written by
 	 * the outer NMI.  That's okay; the outer NMI handler is
-	 * about to about to call exc_nmi() anyway, so we can just
-	 * resume the outer NMI.
+	 * about to call exc_nmi() anyway, so we can just resume
+	 * the outer NMI.
 	 */
 
 	movq	$repeat_nmi, %rdx
@@ -1451,7 +1388,6 @@ end_repeat_nmi:
 	UNWIND_HINT_REGS
 
 	movq	%rsp, %rdi
-	movq	$-1, %rsi
 	call	exc_nmi
 
 	/* Always restore stashed SPEC_CTRL value (see paranoid_entry) */
@@ -1511,18 +1447,16 @@ nmi_restore:
 	iretq
 SYM_CODE_END(asm_exc_nmi)
 
-#ifndef CONFIG_IA32_EMULATION
 /*
  * This handles SYSCALL from 32-bit code.  There is no way to program
  * MSRs to fully disable 32-bit SYSCALL.
  */
-SYM_CODE_START(ignore_sysret)
+SYM_CODE_START(entry_SYSCALL32_ignore)
 	UNWIND_HINT_END_OF_STACK
 	ENDBR
 	mov	$-ENOSYS, %eax
 	sysretl
-SYM_CODE_END(ignore_sysret)
-#endif
+SYM_CODE_END(entry_SYSCALL32_ignore)
 
 .pushsection .text, "ax"
 	__FUNC_ALIGN
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 70150298f8bd..27c05d08558a 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -118,9 +118,6 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)
 
 	movq	%rsp, %rdi
 	call	do_SYSENTER_32
-	/* XEN PV guests always use IRET path */
-	ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
-		    "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
 	jmp	sysret32_from_system_call
 
 .Lsysenter_fix_flags:
@@ -212,13 +209,15 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL)
 
 	movq	%rsp, %rdi
 	call	do_fast_syscall_32
+
+sysret32_from_system_call:
 	/* XEN PV guests always use IRET path */
-	ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
+	ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
 		    "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
 
-	/* Opportunistic SYSRET */
-sysret32_from_system_call:
 	/*
+	 * Opportunistic SYSRET
+	 *
 	 * We are not going to return to userspace from the trampoline
 	 * stack. So let's erase the thread stack right now.
 	 */
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 2d0b1bd866ea..31c48bc2c3d8 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -457,3 +457,6 @@
 450	i386	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	i386	cachestat		sys_cachestat
 452	i386	fchmodat2		sys_fchmodat2
+454	i386	futex_wake		sys_futex_wake
+455	i386	futex_wait		sys_futex_wait
+456	i386	futex_requeue		sys_futex_requeue
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 1d6eee30eceb..a577bb27c16d 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -375,6 +375,9 @@
 451	common	cachestat		sys_cachestat
 452	common	fchmodat2		sys_fchmodat2
 453	64	map_shadow_stack	sys_map_shadow_stack
+454	common	futex_wake		sys_futex_wake
+455	common	futex_wait		sys_futex_wait
+456	common	futex_requeue		sys_futex_requeue
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S
index ff6e7003da97..0103e103a657 100644
--- a/arch/x86/entry/thunk_32.S
+++ b/arch/x86/entry/thunk_32.S
@@ -4,9 +4,9 @@
  * Copyright 2008 by Steven Rostedt, Red Hat, Inc
  *  (inspired by Andi Kleen's thunk_64.S)
  */
+	#include <linux/export.h>
 	#include <linux/linkage.h>
 	#include <asm/asm.h>
-	#include <asm/export.h>
 
 	/* put return address in eax (arg1) */
 	.macro THUNK name, func, put_ret_addr_in_eax=0
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
index 27b5da2111ac..416b400f39db 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -4,10 +4,10 @@
  * disturbance of register allocation in some inline assembly constructs.
  * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
  */
+#include <linux/export.h>
 #include <linux/linkage.h>
 #include "calling.h"
 #include <asm/asm.h>
-#include <asm/export.h>
 
 	/* rdi:	arg1 ... normal C conventions. rax is saved/restored. */
 	.macro THUNK name, func
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 6a1821bd7d5e..83c0afb7c741 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -42,7 +42,8 @@ vdso_img-$(VDSO64-y)		+= 64
 vdso_img-$(VDSOX32-y)		+= x32
 vdso_img-$(VDSO32-y)		+= 32
 
-obj-$(VDSO32-y)			+= vdso32-setup.o
+obj-$(VDSO32-y)				 += vdso32-setup.o
+OBJECT_FILES_NON_STANDARD_vdso32-setup.o := n
 
 vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
 vobjs32 := $(foreach F,$(vobjs32-y),$(obj)/$F)
diff --git a/arch/x86/entry/vdso/vsgx.S b/arch/x86/entry/vdso/vsgx.S
index d77d278ee9dd..37a3d4c02366 100644
--- a/arch/x86/entry/vdso/vsgx.S
+++ b/arch/x86/entry/vdso/vsgx.S
@@ -1,7 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 #include <linux/linkage.h>
-#include <asm/export.h>
 #include <asm/errno.h>
 #include <asm/enclu.h>
 
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index 83f15fe411b3..5bf03c575812 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -26,57 +26,66 @@
 #define RDPMC_BASE_LLC		10
 
 #define COUNTER_SHIFT		16
+#define UNCORE_NAME_LEN		16
+#define UNCORE_GROUP_MAX	256
 
 #undef pr_fmt
 #define pr_fmt(fmt)	"amd_uncore: " fmt
 
 static int pmu_version;
-static int num_counters_llc;
-static int num_counters_nb;
-static bool l3_mask;
 
-static HLIST_HEAD(uncore_unused_list);
-
-struct amd_uncore {
-	int id;
+struct amd_uncore_ctx {
 	int refcnt;
 	int cpu;
+	struct perf_event **events;
+	struct hlist_node node;
+};
+
+struct amd_uncore_pmu {
+	char name[UNCORE_NAME_LEN];
 	int num_counters;
 	int rdpmc_base;
 	u32 msr_base;
-	cpumask_t *active_mask;
-	struct pmu *pmu;
-	struct perf_event **events;
-	struct hlist_node node;
+	int group;
+	cpumask_t active_mask;
+	struct pmu pmu;
+	struct amd_uncore_ctx * __percpu *ctx;
 };
 
-static struct amd_uncore * __percpu *amd_uncore_nb;
-static struct amd_uncore * __percpu *amd_uncore_llc;
+enum {
+	UNCORE_TYPE_DF,
+	UNCORE_TYPE_L3,
+	UNCORE_TYPE_UMC,
 
-static struct pmu amd_nb_pmu;
-static struct pmu amd_llc_pmu;
+	UNCORE_TYPE_MAX
+};
 
-static cpumask_t amd_nb_active_mask;
-static cpumask_t amd_llc_active_mask;
+union amd_uncore_info {
+	struct {
+		u64	aux_data:32;	/* auxiliary data */
+		u64	num_pmcs:8;	/* number of counters */
+		u64	gid:8;		/* group id */
+		u64	cid:8;		/* context id */
+	} split;
+	u64		full;
+};
 
-static bool is_nb_event(struct perf_event *event)
-{
-	return event->pmu->type == amd_nb_pmu.type;
-}
+struct amd_uncore {
+	union amd_uncore_info * __percpu info;
+	struct amd_uncore_pmu *pmus;
+	unsigned int num_pmus;
+	bool init_done;
+	void (*scan)(struct amd_uncore *uncore, unsigned int cpu);
+	int  (*init)(struct amd_uncore *uncore, unsigned int cpu);
+	void (*move)(struct amd_uncore *uncore, unsigned int cpu);
+	void (*free)(struct amd_uncore *uncore, unsigned int cpu);
+};
 
-static bool is_llc_event(struct perf_event *event)
-{
-	return event->pmu->type == amd_llc_pmu.type;
-}
+static struct amd_uncore uncores[UNCORE_TYPE_MAX];
 
-static struct amd_uncore *event_to_amd_uncore(struct perf_event *event)
+static struct amd_uncore_pmu *event_to_amd_uncore_pmu(struct perf_event *event)
 {
-	if (is_nb_event(event) && amd_uncore_nb)
-		return *per_cpu_ptr(amd_uncore_nb, event->cpu);
-	else if (is_llc_event(event) && amd_uncore_llc)
-		return *per_cpu_ptr(amd_uncore_llc, event->cpu);
-
-	return NULL;
+	return container_of(event->pmu, struct amd_uncore_pmu, pmu);
 }
 
 static void amd_uncore_read(struct perf_event *event)
@@ -91,7 +100,16 @@ static void amd_uncore_read(struct perf_event *event)
 	 */
 
 	prev = local64_read(&hwc->prev_count);
-	rdpmcl(hwc->event_base_rdpmc, new);
+
+	/*
+	 * Some uncore PMUs do not have RDPMC assignments. In such cases,
+	 * read counts directly from the corresponding PERF_CTR.
+	 */
+	if (hwc->event_base_rdpmc < 0)
+		rdmsrl(hwc->event_base, new);
+	else
+		rdpmcl(hwc->event_base_rdpmc, new);
+
 	local64_set(&hwc->prev_count, new);
 	delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);
 	delta >>= COUNTER_SHIFT;
@@ -118,7 +136,7 @@ static void amd_uncore_stop(struct perf_event *event, int flags)
 	hwc->state |= PERF_HES_STOPPED;
 
 	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
-		amd_uncore_read(event);
+		event->pmu->read(event);
 		hwc->state |= PERF_HES_UPTODATE;
 	}
 }
@@ -126,15 +144,16 @@ static void amd_uncore_stop(struct perf_event *event, int flags)
 static int amd_uncore_add(struct perf_event *event, int flags)
 {
 	int i;
-	struct amd_uncore *uncore = event_to_amd_uncore(event);
+	struct amd_uncore_pmu *pmu = event_to_amd_uncore_pmu(event);
+	struct amd_uncore_ctx *ctx = *per_cpu_ptr(pmu->ctx, event->cpu);
 	struct hw_perf_event *hwc = &event->hw;
 
 	/* are we already assigned? */
-	if (hwc->idx != -1 && uncore->events[hwc->idx] == event)
+	if (hwc->idx != -1 && ctx->events[hwc->idx] == event)
 		goto out;
 
-	for (i = 0; i < uncore->num_counters; i++) {
-		if (uncore->events[i] == event) {
+	for (i = 0; i < pmu->num_counters; i++) {
+		if (ctx->events[i] == event) {
 			hwc->idx = i;
 			goto out;
 		}
@@ -142,8 +161,8 @@ static int amd_uncore_add(struct perf_event *event, int flags)
 
 	/* if not, take the first available counter */
 	hwc->idx = -1;
-	for (i = 0; i < uncore->num_counters; i++) {
-		if (cmpxchg(&uncore->events[i], NULL, event) == NULL) {
+	for (i = 0; i < pmu->num_counters; i++) {
+		if (cmpxchg(&ctx->events[i], NULL, event) == NULL) {
 			hwc->idx = i;
 			break;
 		}
@@ -153,23 +172,16 @@ out:
 	if (hwc->idx == -1)
 		return -EBUSY;
 
-	hwc->config_base = uncore->msr_base + (2 * hwc->idx);
-	hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx);
-	hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx;
+	hwc->config_base = pmu->msr_base + (2 * hwc->idx);
+	hwc->event_base = pmu->msr_base + 1 + (2 * hwc->idx);
+	hwc->event_base_rdpmc = pmu->rdpmc_base + hwc->idx;
 	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
 
-	/*
-	 * The first four DF counters are accessible via RDPMC index 6 to 9
-	 * followed by the L3 counters from index 10 to 15. For processors
-	 * with more than four DF counters, the DF RDPMC assignments become
-	 * discontiguous as the additional counters are accessible starting
-	 * from index 16.
-	 */
-	if (is_nb_event(event) && hwc->idx >= NUM_COUNTERS_NB)
-		hwc->event_base_rdpmc += NUM_COUNTERS_L3;
+	if (pmu->rdpmc_base < 0)
+		hwc->event_base_rdpmc = -1;
 
 	if (flags & PERF_EF_START)
-		amd_uncore_start(event, PERF_EF_RELOAD);
+		event->pmu->start(event, PERF_EF_RELOAD);
 
 	return 0;
 }
@@ -177,55 +189,36 @@ out:
 static void amd_uncore_del(struct perf_event *event, int flags)
 {
 	int i;
-	struct amd_uncore *uncore = event_to_amd_uncore(event);
+	struct amd_uncore_pmu *pmu = event_to_amd_uncore_pmu(event);
+	struct amd_uncore_ctx *ctx = *per_cpu_ptr(pmu->ctx, event->cpu);
 	struct hw_perf_event *hwc = &event->hw;
 
-	amd_uncore_stop(event, PERF_EF_UPDATE);
+	event->pmu->stop(event, PERF_EF_UPDATE);
 
-	for (i = 0; i < uncore->num_counters; i++) {
-		if (cmpxchg(&uncore->events[i], event, NULL) == event)
+	for (i = 0; i < pmu->num_counters; i++) {
+		if (cmpxchg(&ctx->events[i], event, NULL) == event)
 			break;
 	}
 
 	hwc->idx = -1;
 }
 
-/*
- * Return a full thread and slice mask unless user
- * has provided them
- */
-static u64 l3_thread_slice_mask(u64 config)
-{
-	if (boot_cpu_data.x86 <= 0x18)
-		return ((config & AMD64_L3_SLICE_MASK) ? : AMD64_L3_SLICE_MASK) |
-		       ((config & AMD64_L3_THREAD_MASK) ? : AMD64_L3_THREAD_MASK);
-
-	/*
-	 * If the user doesn't specify a threadmask, they're not trying to
-	 * count core 0, so we enable all cores & threads.
-	 * We'll also assume that they want to count slice 0 if they specify
-	 * a threadmask and leave sliceid and enallslices unpopulated.
-	 */
-	if (!(config & AMD64_L3_F19H_THREAD_MASK))
-		return AMD64_L3_F19H_THREAD_MASK | AMD64_L3_EN_ALL_SLICES |
-		       AMD64_L3_EN_ALL_CORES;
-
-	return config & (AMD64_L3_F19H_THREAD_MASK | AMD64_L3_SLICEID_MASK |
-			 AMD64_L3_EN_ALL_CORES | AMD64_L3_EN_ALL_SLICES |
-			 AMD64_L3_COREID_MASK);
-}
-
 static int amd_uncore_event_init(struct perf_event *event)
 {
-	struct amd_uncore *uncore;
+	struct amd_uncore_pmu *pmu;
+	struct amd_uncore_ctx *ctx;
 	struct hw_perf_event *hwc = &event->hw;
-	u64 event_mask = AMD64_RAW_EVENT_MASK_NB;
 
 	if (event->attr.type != event->pmu->type)
 		return -ENOENT;
 
-	if (pmu_version >= 2 && is_nb_event(event))
-		event_mask = AMD64_PERFMON_V2_RAW_EVENT_MASK_NB;
+	if (event->cpu < 0)
+		return -EINVAL;
+
+	pmu = event_to_amd_uncore_pmu(event);
+	ctx = *per_cpu_ptr(pmu->ctx, event->cpu);
+	if (!ctx)
+		return -ENODEV;
 
 	/*
 	 * NB and Last level cache counters (MSRs) are shared across all cores
@@ -235,28 +228,14 @@ static int amd_uncore_event_init(struct perf_event *event)
 	 * out. So we do not support sampling and per-thread events via
 	 * CAP_NO_INTERRUPT, and we do not enable counter overflow interrupts:
 	 */
-	hwc->config = event->attr.config & event_mask;
+	hwc->config = event->attr.config;
 	hwc->idx = -1;
 
-	if (event->cpu < 0)
-		return -EINVAL;
-
-	/*
-	 * SliceMask and ThreadMask need to be set for certain L3 events.
-	 * For other events, the two fields do not affect the count.
-	 */
-	if (l3_mask && is_llc_event(event))
-		hwc->config |= l3_thread_slice_mask(event->attr.config);
-
-	uncore = event_to_amd_uncore(event);
-	if (!uncore)
-		return -ENODEV;
-
 	/*
 	 * since request can come in to any of the shared cores, we will remap
 	 * to a single common cpu.
 	 */
-	event->cpu = uncore->cpu;
+	event->cpu = ctx->cpu;
 
 	return 0;
 }
@@ -278,17 +257,10 @@ static ssize_t amd_uncore_attr_show_cpumask(struct device *dev,
 					    struct device_attribute *attr,
 					    char *buf)
 {
-	cpumask_t *active_mask;
-	struct pmu *pmu = dev_get_drvdata(dev);
+	struct pmu *ptr = dev_get_drvdata(dev);
+	struct amd_uncore_pmu *pmu = container_of(ptr, struct amd_uncore_pmu, pmu);
 
-	if (pmu->type == amd_nb_pmu.type)
-		active_mask = &amd_nb_active_mask;
-	else if (pmu->type == amd_llc_pmu.type)
-		active_mask = &amd_llc_active_mask;
-	else
-		return 0;
-
-	return cpumap_print_to_pagebuf(true, buf, active_mask);
+	return cpumap_print_to_pagebuf(true, buf, &pmu->active_mask);
 }
 static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL);
 
@@ -315,7 +287,7 @@ static struct device_attribute format_attr_##_var =			\
 DEFINE_UNCORE_FORMAT_ATTR(event12,	event,		"config:0-7,32-35");
 DEFINE_UNCORE_FORMAT_ATTR(event14,	event,		"config:0-7,32-35,59-60"); /* F17h+ DF */
 DEFINE_UNCORE_FORMAT_ATTR(event14v2,	event,		"config:0-7,32-37");	   /* PerfMonV2 DF */
-DEFINE_UNCORE_FORMAT_ATTR(event8,	event,		"config:0-7");		   /* F17h+ L3 */
+DEFINE_UNCORE_FORMAT_ATTR(event8,	event,		"config:0-7");		   /* F17h+ L3, PerfMonV2 UMC */
 DEFINE_UNCORE_FORMAT_ATTR(umask8,	umask,		"config:8-15");
 DEFINE_UNCORE_FORMAT_ATTR(umask12,	umask,		"config:8-15,24-27");	   /* PerfMonV2 DF */
 DEFINE_UNCORE_FORMAT_ATTR(coreid,	coreid,		"config:42-44");	   /* F19h L3 */
@@ -325,6 +297,7 @@ DEFINE_UNCORE_FORMAT_ATTR(threadmask2,	threadmask,	"config:56-57");	   /* F19h L
 DEFINE_UNCORE_FORMAT_ATTR(enallslices,	enallslices,	"config:46");		   /* F19h L3 */
 DEFINE_UNCORE_FORMAT_ATTR(enallcores,	enallcores,	"config:47");		   /* F19h L3 */
 DEFINE_UNCORE_FORMAT_ATTR(sliceid,	sliceid,	"config:48-50");	   /* F19h L3 */
+DEFINE_UNCORE_FORMAT_ATTR(rdwrmask,	rdwrmask,	"config:8-9");		   /* PerfMonV2 UMC */
 
 /* Common DF and NB attributes */
 static struct attribute *amd_uncore_df_format_attr[] = {
@@ -341,6 +314,13 @@ static struct attribute *amd_uncore_l3_format_attr[] = {
 	NULL,
 };
 
+/* Common UMC attributes */
+static struct attribute *amd_uncore_umc_format_attr[] = {
+	&format_attr_event8.attr,       /* event */
+	&format_attr_rdwrmask.attr,     /* rdwrmask */
+	NULL,
+};
+
 /* F17h unique L3 attributes */
 static struct attribute *amd_f17h_uncore_l3_format_attr[] = {
 	&format_attr_slicemask.attr,	/* slicemask */
@@ -378,6 +358,11 @@ static struct attribute_group amd_f19h_uncore_l3_format_group = {
 	.is_visible = amd_f19h_uncore_is_visible,
 };
 
+static struct attribute_group amd_uncore_umc_format_group = {
+	.name = "format",
+	.attrs = amd_uncore_umc_format_attr,
+};
+
 static const struct attribute_group *amd_uncore_df_attr_groups[] = {
 	&amd_uncore_attr_group,
 	&amd_uncore_df_format_group,
@@ -396,259 +381,636 @@ static const struct attribute_group *amd_uncore_l3_attr_update[] = {
 	NULL,
 };
 
-static struct pmu amd_nb_pmu = {
-	.task_ctx_nr	= perf_invalid_context,
-	.attr_groups	= amd_uncore_df_attr_groups,
-	.name		= "amd_nb",
-	.event_init	= amd_uncore_event_init,
-	.add		= amd_uncore_add,
-	.del		= amd_uncore_del,
-	.start		= amd_uncore_start,
-	.stop		= amd_uncore_stop,
-	.read		= amd_uncore_read,
-	.capabilities	= PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
-	.module		= THIS_MODULE,
+static const struct attribute_group *amd_uncore_umc_attr_groups[] = {
+	&amd_uncore_attr_group,
+	&amd_uncore_umc_format_group,
+	NULL,
 };
 
-static struct pmu amd_llc_pmu = {
-	.task_ctx_nr	= perf_invalid_context,
-	.attr_groups	= amd_uncore_l3_attr_groups,
-	.attr_update	= amd_uncore_l3_attr_update,
-	.name		= "amd_l2",
-	.event_init	= amd_uncore_event_init,
-	.add		= amd_uncore_add,
-	.del		= amd_uncore_del,
-	.start		= amd_uncore_start,
-	.stop		= amd_uncore_stop,
-	.read		= amd_uncore_read,
-	.capabilities	= PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
-	.module		= THIS_MODULE,
-};
+static __always_inline
+int amd_uncore_ctx_cid(struct amd_uncore *uncore, unsigned int cpu)
+{
+	/* Context ID that the uncore's scan callback recorded for @cpu */
+	return per_cpu_ptr(uncore->info, cpu)->split.cid;
+}
 
-static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
+static __always_inline
+int amd_uncore_ctx_gid(struct amd_uncore *uncore, unsigned int cpu)
 {
-	return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL,
-			cpu_to_node(cpu));
+	/* Group ID that the uncore's scan callback recorded for @cpu */
+	return per_cpu_ptr(uncore->info, cpu)->split.gid;
 }
 
-static inline struct perf_event **
-amd_uncore_events_alloc(unsigned int num, unsigned int cpu)
+static __always_inline
+int amd_uncore_ctx_num_pmcs(struct amd_uncore *uncore, unsigned int cpu)
 {
-	return kzalloc_node(sizeof(struct perf_event *) * num, GFP_KERNEL,
-			    cpu_to_node(cpu));
+	/* Counter count that the uncore's scan callback recorded for @cpu */
+	return per_cpu_ptr(uncore->info, cpu)->split.num_pmcs;
 }
 
-static int amd_uncore_cpu_up_prepare(unsigned int cpu)
+static void amd_uncore_ctx_free(struct amd_uncore *uncore, unsigned int cpu)
 {
-	struct amd_uncore *uncore_nb = NULL, *uncore_llc = NULL;
+	struct amd_uncore_pmu *pmu;
+	struct amd_uncore_ctx *ctx;
+	int i;
 
-	if (amd_uncore_nb) {
-		*per_cpu_ptr(amd_uncore_nb, cpu) = NULL;
-		uncore_nb = amd_uncore_alloc(cpu);
-		if (!uncore_nb)
-			goto fail;
-		uncore_nb->cpu = cpu;
-		uncore_nb->num_counters = num_counters_nb;
-		uncore_nb->rdpmc_base = RDPMC_BASE_NB;
-		uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL;
-		uncore_nb->active_mask = &amd_nb_active_mask;
-		uncore_nb->pmu = &amd_nb_pmu;
-		uncore_nb->events = amd_uncore_events_alloc(num_counters_nb, cpu);
-		if (!uncore_nb->events)
-			goto fail;
-		uncore_nb->id = -1;
-		*per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb;
+	if (!uncore->init_done)
+		return;
+
+	for (i = 0; i < uncore->num_pmus; i++) {
+		pmu = &uncore->pmus[i];
+		ctx = *per_cpu_ptr(pmu->ctx, cpu);
+		if (!ctx)
+			continue;
+
+		if (cpu == ctx->cpu)
+			cpumask_clear_cpu(cpu, &pmu->active_mask);
+
+		if (!--ctx->refcnt) {
+			kfree(ctx->events);
+			kfree(ctx);
+		}
+
+		*per_cpu_ptr(pmu->ctx, cpu) = NULL;
 	}
+}
 
-	if (amd_uncore_llc) {
-		*per_cpu_ptr(amd_uncore_llc, cpu) = NULL;
-		uncore_llc = amd_uncore_alloc(cpu);
-		if (!uncore_llc)
-			goto fail;
-		uncore_llc->cpu = cpu;
-		uncore_llc->num_counters = num_counters_llc;
-		uncore_llc->rdpmc_base = RDPMC_BASE_LLC;
-		uncore_llc->msr_base = MSR_F16H_L2I_PERF_CTL;
-		uncore_llc->active_mask = &amd_llc_active_mask;
-		uncore_llc->pmu = &amd_llc_pmu;
-		uncore_llc->events = amd_uncore_events_alloc(num_counters_llc, cpu);
-		if (!uncore_llc->events)
-			goto fail;
-		uncore_llc->id = -1;
-		*per_cpu_ptr(amd_uncore_llc, cpu) = uncore_llc;
+static int amd_uncore_ctx_init(struct amd_uncore *uncore, unsigned int cpu)
+{
+	struct amd_uncore_ctx *curr, *prev;
+	struct amd_uncore_pmu *pmu;
+	int node, cid, gid, i, j;
+
+	if (!uncore->init_done || !uncore->num_pmus)
+		return 0;	/* no PMUs registered for this uncore type */
+
+	cid = amd_uncore_ctx_cid(uncore, cpu);
+	gid = amd_uncore_ctx_gid(uncore, cpu);
+
+	for (i = 0; i < uncore->num_pmus; i++) {
+		pmu = &uncore->pmus[i];
+		*per_cpu_ptr(pmu->ctx, cpu) = NULL;
+		curr = NULL;
+
+		/* Check for group exclusivity: skip PMUs that belong to another group */
+		if (gid != pmu->group)
+			continue;
+
+		/* Find a sibling context: an online CPU with the same context ID */
+		for_each_online_cpu(j) {
+			if (cpu == j)
+				continue;
+
+			prev = *per_cpu_ptr(pmu->ctx, j);
+			if (!prev)
+				continue;
+
+			if (cid == amd_uncore_ctx_cid(uncore, j)) {
+				curr = prev;
+				break;
+			}
+		}
+
+		/* Allocate context if sibling does not exist */
+		if (!curr) {
+			node = cpu_to_node(cpu);
+			curr = kzalloc_node(sizeof(*curr), GFP_KERNEL, node);
+			if (!curr)
+				goto fail;
+
+			curr->cpu = cpu;
+			curr->events = kcalloc_node(pmu->num_counters,
+						    sizeof(*curr->events),
+						    GFP_KERNEL, node);
+			if (!curr->events) {
+				kfree(curr);
+				goto fail;
+			}
+
+			cpumask_set_cpu(cpu, &pmu->active_mask);
+		}
+
+		curr->refcnt++;	/* one reference per CPU sharing this context */
+		*per_cpu_ptr(pmu->ctx, cpu) = curr;
 	}
 
 	return 0;
 
 fail:
-	if (uncore_nb) {
-		kfree(uncore_nb->events);
-		kfree(uncore_nb);
-	}
-
-	if (uncore_llc) {
-		kfree(uncore_llc->events);
-		kfree(uncore_llc);
-	}
+	amd_uncore_ctx_free(uncore, cpu);	/* drop contexts already attached for @cpu */
 
 	return -ENOMEM;
 }
 
-static struct amd_uncore *
-amd_uncore_find_online_sibling(struct amd_uncore *this,
-			       struct amd_uncore * __percpu *uncores)
+static void amd_uncore_ctx_move(struct amd_uncore *uncore, unsigned int cpu)
 {
-	unsigned int cpu;
-	struct amd_uncore *that;
-
-	for_each_online_cpu(cpu) {
-		that = *per_cpu_ptr(uncores, cpu);
+	struct amd_uncore_ctx *curr, *next;
+	struct amd_uncore_pmu *pmu;
+	int i, j;
 
-		if (!that)
-			continue;
+	if (!uncore->init_done)
+		return;
 
-		if (this == that)
+	for (i = 0; i < uncore->num_pmus; i++) {
+		pmu = &uncore->pmus[i];
+		curr = *per_cpu_ptr(pmu->ctx, cpu);
+		if (!curr)
 			continue;
 
-		if (this->id == that->id) {
-			hlist_add_head(&this->node, &uncore_unused_list);
-			this = that;
-			break;
+		/* Migrate to a shared sibling if possible */
+		for_each_online_cpu(j) {
+			next = *per_cpu_ptr(pmu->ctx, j);
+			if (!next || cpu == j)
+				continue;
+
+			if (curr == next) {
+				perf_pmu_migrate_context(&pmu->pmu, cpu, j);
+				cpumask_clear_cpu(cpu, &pmu->active_mask);
+				cpumask_set_cpu(j, &pmu->active_mask);
+				next->cpu = j;
+				break;
+			}
 		}
 	}
-
-	this->refcnt++;
-	return this;
 }
 
 static int amd_uncore_cpu_starting(unsigned int cpu)
 {
-	unsigned int eax, ebx, ecx, edx;
 	struct amd_uncore *uncore;
+	int i;
+
+	for (i = 0; i < UNCORE_TYPE_MAX; i++) {
+		uncore = &uncores[i];
+		uncore->scan(uncore, cpu);
+	}
+
+	return 0;
+}
 
-	if (amd_uncore_nb) {
-		uncore = *per_cpu_ptr(amd_uncore_nb, cpu);
-		cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
-		uncore->id = ecx & 0xff;
+static int amd_uncore_cpu_online(unsigned int cpu)
+{
+	struct amd_uncore *uncore;
+	int i;
 
-		uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_nb);
-		*per_cpu_ptr(amd_uncore_nb, cpu) = uncore;
+	for (i = 0; i < UNCORE_TYPE_MAX; i++) {
+		uncore = &uncores[i];
+		if (uncore->init(uncore, cpu))
+			break;
 	}
 
-	if (amd_uncore_llc) {
-		uncore = *per_cpu_ptr(amd_uncore_llc, cpu);
-		uncore->id = get_llc_id(cpu);
+	return 0;
+}
 
-		uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc);
-		*per_cpu_ptr(amd_uncore_llc, cpu) = uncore;
+static int amd_uncore_cpu_down_prepare(unsigned int cpu)
+{
+	struct amd_uncore *uncore;
+	int i;
+
+	for (i = 0; i < UNCORE_TYPE_MAX; i++) {
+		uncore = &uncores[i];
+		uncore->move(uncore, cpu);
 	}
 
 	return 0;
 }
 
-static void uncore_clean_online(void)
+static int amd_uncore_cpu_dead(unsigned int cpu)
 {
 	struct amd_uncore *uncore;
-	struct hlist_node *n;
+	int i;
 
-	hlist_for_each_entry_safe(uncore, n, &uncore_unused_list, node) {
-		hlist_del(&uncore->node);
-		kfree(uncore->events);
-		kfree(uncore);
+	for (i = 0; i < UNCORE_TYPE_MAX; i++) {
+		uncore = &uncores[i];
+		uncore->free(uncore, cpu);
 	}
+
+	return 0;
 }
 
-static void uncore_online(unsigned int cpu,
-			  struct amd_uncore * __percpu *uncores)
+static int amd_uncore_df_event_init(struct perf_event *event)
 {
-	struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
+	struct hw_perf_event *hwc = &event->hw;
+	int ret = amd_uncore_event_init(event);
 
-	uncore_clean_online();
+	if (ret || pmu_version < 2)
+		return ret;
 
-	if (cpu == uncore->cpu)
-		cpumask_set_cpu(cpu, uncore->active_mask);
+	hwc->config = event->attr.config &
+		      (pmu_version >= 2 ? AMD64_PERFMON_V2_RAW_EVENT_MASK_NB :
+					  AMD64_RAW_EVENT_MASK_NB);
+
+	return 0;
 }
 
-static int amd_uncore_cpu_online(unsigned int cpu)
+static int amd_uncore_df_add(struct perf_event *event, int flags)
 {
-	if (amd_uncore_nb)
-		uncore_online(cpu, amd_uncore_nb);
+	int ret = amd_uncore_add(event, flags & ~PERF_EF_START);
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (ret)
+		return ret;
+
+	/*
+	 * The first four DF counters are accessible via RDPMC index 6 to 9
+	 * followed by the L3 counters from index 10 to 15. For processors
+	 * with more than four DF counters, the DF RDPMC assignments become
+	 * discontiguous as the additional counters are accessible starting
+	 * from index 16.
+	 */
+	if (hwc->idx >= NUM_COUNTERS_NB)
+		hwc->event_base_rdpmc += NUM_COUNTERS_L3;
 
-	if (amd_uncore_llc)
-		uncore_online(cpu, amd_uncore_llc);
+	/* Delayed start after rdpmc base update */
+	if (flags & PERF_EF_START)
+		amd_uncore_start(event, PERF_EF_RELOAD);
 
 	return 0;
 }
 
-static void uncore_down_prepare(unsigned int cpu,
-				struct amd_uncore * __percpu *uncores)
+static
+void amd_uncore_df_ctx_scan(struct amd_uncore *uncore, unsigned int cpu)
 {
-	unsigned int i;
-	struct amd_uncore *this = *per_cpu_ptr(uncores, cpu);
+	union cpuid_0x80000022_ebx ebx;
+	union amd_uncore_info info;
 
-	if (this->cpu != cpu)
+	if (!boot_cpu_has(X86_FEATURE_PERFCTR_NB))
 		return;
 
-	/* this cpu is going down, migrate to a shared sibling if possible */
-	for_each_online_cpu(i) {
-		struct amd_uncore *that = *per_cpu_ptr(uncores, i);
+	info.split.aux_data = 0;
+	info.split.num_pmcs = NUM_COUNTERS_NB;
+	info.split.gid = 0;
+	info.split.cid = topology_die_id(cpu);
 
-		if (cpu == i)
-			continue;
+	if (pmu_version >= 2) {
+		ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES);
+		info.split.num_pmcs = ebx.split.num_df_pmc;
+	}
 
-		if (this == that) {
-			perf_pmu_migrate_context(this->pmu, cpu, i);
-			cpumask_clear_cpu(cpu, that->active_mask);
-			cpumask_set_cpu(i, that->active_mask);
-			that->cpu = i;
-			break;
-		}
+	*per_cpu_ptr(uncore->info, cpu) = info;
+}
+
+static
+int amd_uncore_df_ctx_init(struct amd_uncore *uncore, unsigned int cpu)
+{
+	struct attribute **df_attr = amd_uncore_df_format_attr;
+	struct amd_uncore_pmu *pmu;
+
+	/* Run just once */
+	if (uncore->init_done)
+		return amd_uncore_ctx_init(uncore, cpu);
+
+	/* No grouping, single instance for a system */
+	uncore->pmus = kzalloc(sizeof(*uncore->pmus), GFP_KERNEL);
+	if (!uncore->pmus) {
+		uncore->num_pmus = 0;
+		goto done;
 	}
+
+	/*
+	 * For Family 17h and above, the Northbridge counters are repurposed
+	 * as Data Fabric counters. The PMUs are exported based on family as
+	 * either NB or DF.
+	 */
+	pmu = &uncore->pmus[0];
+	strscpy(pmu->name, boot_cpu_data.x86 >= 0x17 ? "amd_df" : "amd_nb",
+		sizeof(pmu->name));
+	pmu->num_counters = amd_uncore_ctx_num_pmcs(uncore, cpu);
+	pmu->msr_base = MSR_F15H_NB_PERF_CTL;
+	pmu->rdpmc_base = RDPMC_BASE_NB;
+	pmu->group = amd_uncore_ctx_gid(uncore, cpu);
+
+	if (pmu_version >= 2) {
+		*df_attr++ = &format_attr_event14v2.attr;
+		*df_attr++ = &format_attr_umask12.attr;
+	} else if (boot_cpu_data.x86 >= 0x17) {
+		*df_attr = &format_attr_event14.attr;
+	}
+
+	pmu->ctx = alloc_percpu(struct amd_uncore_ctx *);
+	if (!pmu->ctx)
+		goto done;
+
+	pmu->pmu = (struct pmu) {
+		.task_ctx_nr	= perf_invalid_context,
+		.attr_groups	= amd_uncore_df_attr_groups,
+		.name		= pmu->name,
+		.event_init	= amd_uncore_df_event_init,
+		.add		= amd_uncore_df_add,
+		.del		= amd_uncore_del,
+		.start		= amd_uncore_start,
+		.stop		= amd_uncore_stop,
+		.read		= amd_uncore_read,
+		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
+		.module		= THIS_MODULE,
+	};
+
+	if (perf_pmu_register(&pmu->pmu, pmu->pmu.name, -1)) {
+		free_percpu(pmu->ctx);
+		pmu->ctx = NULL;
+		goto done;
+	}
+
+	pr_info("%d %s%s counters detected\n", pmu->num_counters,
+		boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ?  "HYGON " : "",
+		pmu->pmu.name);
+
+	uncore->num_pmus = 1;
+
+done:
+	uncore->init_done = true;
+
+	return amd_uncore_ctx_init(uncore, cpu);
 }
 
-static int amd_uncore_cpu_down_prepare(unsigned int cpu)
+static int amd_uncore_l3_event_init(struct perf_event *event)
 {
-	if (amd_uncore_nb)
-		uncore_down_prepare(cpu, amd_uncore_nb);
+	int ret = amd_uncore_event_init(event);
+	struct hw_perf_event *hwc = &event->hw;
+	u64 config = event->attr.config;
+	u64 mask;
 
-	if (amd_uncore_llc)
-		uncore_down_prepare(cpu, amd_uncore_llc);
+	hwc->config = config & AMD64_RAW_EVENT_MASK_NB;
+
+	/*
+	 * SliceMask and ThreadMask need to be set for certain L3 events.
+	 * For other events, the two fields do not affect the count.
+	 */
+	if (ret || boot_cpu_data.x86 < 0x17)
+		return ret;
+
+	mask = config & (AMD64_L3_F19H_THREAD_MASK | AMD64_L3_SLICEID_MASK |
+			 AMD64_L3_EN_ALL_CORES | AMD64_L3_EN_ALL_SLICES |
+			 AMD64_L3_COREID_MASK);
+
+	if (boot_cpu_data.x86 <= 0x18)
+		mask = ((config & AMD64_L3_SLICE_MASK) ? : AMD64_L3_SLICE_MASK) |
+		       ((config & AMD64_L3_THREAD_MASK) ? : AMD64_L3_THREAD_MASK);
+
+	/*
+	 * If the user doesn't specify a ThreadMask, they're not trying to
+	 * count core 0, so we enable all cores & threads.
+	 * We'll also assume that they want to count slice 0 if they specify
+	 * a ThreadMask and leave SliceId and EnAllSlices unpopulated.
+	 */
+	else if (!(config & AMD64_L3_F19H_THREAD_MASK))
+		mask = AMD64_L3_F19H_THREAD_MASK | AMD64_L3_EN_ALL_SLICES |
+		       AMD64_L3_EN_ALL_CORES;
+
+	hwc->config |= mask;
 
 	return 0;
 }
 
-static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores)
+static
+void amd_uncore_l3_ctx_scan(struct amd_uncore *uncore, unsigned int cpu)
 {
-	struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
+	union amd_uncore_info info;
 
-	if (cpu == uncore->cpu)
-		cpumask_clear_cpu(cpu, uncore->active_mask);
+	if (!boot_cpu_has(X86_FEATURE_PERFCTR_LLC))
+		return;
+
+	info.split.aux_data = 0;
+	info.split.num_pmcs = NUM_COUNTERS_L2;
+	info.split.gid = 0;
+	info.split.cid = per_cpu_llc_id(cpu);
 
-	if (!--uncore->refcnt) {
-		kfree(uncore->events);
-		kfree(uncore);
+	if (boot_cpu_data.x86 >= 0x17)
+		info.split.num_pmcs = NUM_COUNTERS_L3;
+
+	*per_cpu_ptr(uncore->info, cpu) = info;
+}
+
+static
+int amd_uncore_l3_ctx_init(struct amd_uncore *uncore, unsigned int cpu)
+{
+	struct attribute **l3_attr = amd_uncore_l3_format_attr;
+	struct amd_uncore_pmu *pmu;
+
+	/* Run just once */
+	if (uncore->init_done)
+		return amd_uncore_ctx_init(uncore, cpu);
+
+	/* No grouping, single instance for a system */
+	uncore->pmus = kzalloc(sizeof(*uncore->pmus), GFP_KERNEL);
+	if (!uncore->pmus) {
+		uncore->num_pmus = 0;
+		goto done;
 	}
 
-	*per_cpu_ptr(uncores, cpu) = NULL;
+	/*
+	 * For Family 17h and above, L3 cache counters are available instead
+	 * of L2 cache counters. The PMUs are exported based on family as
+	 * either L2 or L3.
+	 */
+	pmu = &uncore->pmus[0];
+	strscpy(pmu->name, boot_cpu_data.x86 >= 0x17 ? "amd_l3" : "amd_l2",
+		sizeof(pmu->name));
+	pmu->num_counters = amd_uncore_ctx_num_pmcs(uncore, cpu);
+	pmu->msr_base = MSR_F16H_L2I_PERF_CTL;
+	pmu->rdpmc_base = RDPMC_BASE_LLC;
+	pmu->group = amd_uncore_ctx_gid(uncore, cpu);
+
+	if (boot_cpu_data.x86 >= 0x17) {
+		*l3_attr++ = &format_attr_event8.attr;
+		*l3_attr++ = &format_attr_umask8.attr;
+		*l3_attr++ = boot_cpu_data.x86 >= 0x19 ?
+			     &format_attr_threadmask2.attr :
+			     &format_attr_threadmask8.attr;
+	}
+
+	pmu->ctx = alloc_percpu(struct amd_uncore_ctx *);
+	if (!pmu->ctx)
+		goto done;
+
+	pmu->pmu = (struct pmu) {
+		.task_ctx_nr	= perf_invalid_context,
+		.attr_groups	= amd_uncore_l3_attr_groups,
+		.attr_update	= amd_uncore_l3_attr_update,
+		.name		= pmu->name,
+		.event_init	= amd_uncore_l3_event_init,
+		.add		= amd_uncore_add,
+		.del		= amd_uncore_del,
+		.start		= amd_uncore_start,
+		.stop		= amd_uncore_stop,
+		.read		= amd_uncore_read,
+		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
+		.module		= THIS_MODULE,
+	};
+
+	if (perf_pmu_register(&pmu->pmu, pmu->pmu.name, -1)) {
+		free_percpu(pmu->ctx);
+		pmu->ctx = NULL;
+		goto done;
+	}
+
+	pr_info("%d %s%s counters detected\n", pmu->num_counters,
+		boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ?  "HYGON " : "",
+		pmu->pmu.name);
+
+	uncore->num_pmus = 1;
+
+done:
+	uncore->init_done = true;
+
+	return amd_uncore_ctx_init(uncore, cpu);
 }
 
-static int amd_uncore_cpu_dead(unsigned int cpu)
+static int amd_uncore_umc_event_init(struct perf_event *event)
 {
-	if (amd_uncore_nb)
-		uncore_dead(cpu, amd_uncore_nb);
+	struct hw_perf_event *hwc = &event->hw;
+	int ret = amd_uncore_event_init(event);
+
+	if (ret)
+		return ret;
 
-	if (amd_uncore_llc)
-		uncore_dead(cpu, amd_uncore_llc);
+	hwc->config = event->attr.config & AMD64_PERFMON_V2_RAW_EVENT_MASK_UMC;
 
 	return 0;
 }
 
-static int __init amd_uncore_init(void)
+static void amd_uncore_umc_start(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (flags & PERF_EF_RELOAD)	/* write the saved prev_count back to the counter first */
+		wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count));
+
+	hwc->state = 0;
+	wrmsrl(hwc->config_base, (hwc->config | AMD64_PERFMON_V2_ENABLE_UMC));	/* config plus UMC enable */
+	perf_event_update_userpage(event);
+}
+
+static
+void amd_uncore_umc_ctx_scan(struct amd_uncore *uncore, unsigned int cpu)
 {
-	struct attribute **df_attr = amd_uncore_df_format_attr;
-	struct attribute **l3_attr = amd_uncore_l3_format_attr;
 	union cpuid_0x80000022_ebx ebx;
+	union amd_uncore_info info;
+	unsigned int eax, ecx, edx;
+
+	if (pmu_version < 2)
+		return;
+
+	cpuid(EXT_PERFMON_DEBUG_FEATURES, &eax, &ebx.full, &ecx, &edx);
+	info.split.aux_data = ecx;	/* stash active mask */
+	info.split.num_pmcs = ebx.split.num_umc_pmc;
+	info.split.gid = topology_die_id(cpu);
+	info.split.cid = topology_die_id(cpu);
+	*per_cpu_ptr(uncore->info, cpu) = info;
+}
+
+static
+int amd_uncore_umc_ctx_init(struct amd_uncore *uncore, unsigned int cpu)
+{
+	DECLARE_BITMAP(gmask, UNCORE_GROUP_MAX) = { 0 };
+	u8 group_num_pmus[UNCORE_GROUP_MAX] = { 0 };
+	u8 group_num_pmcs[UNCORE_GROUP_MAX] = { 0 };
+	union amd_uncore_info info;
+	struct amd_uncore_pmu *pmu;
+	int index = 0, gid, i;
+
+	if (pmu_version < 2)
+		return 0;
+
+	/* PMU discovery and registration run just once; later CPUs only attach */
+	if (uncore->init_done)
+		return amd_uncore_ctx_init(uncore, cpu);
+
+	/* Find unique groups; each set bit in aux_data counts as one PMU */
+	for_each_online_cpu(i) {
+		info = *per_cpu_ptr(uncore->info, i);
+		gid = info.split.gid;
+		if (test_bit(gid, gmask))
+			continue;
+
+		__set_bit(gid, gmask);
+		group_num_pmus[gid] = hweight32(info.split.aux_data);
+		group_num_pmcs[gid] = info.split.num_pmcs;
+		uncore->num_pmus += group_num_pmus[gid];
+	}
+
+	uncore->pmus = kcalloc(uncore->num_pmus, sizeof(*uncore->pmus),
+			       GFP_KERNEL);
+	if (!uncore->pmus) {
+		uncore->num_pmus = 0;
+		goto done;
+	}
+
+	for_each_set_bit(gid, gmask, UNCORE_GROUP_MAX) {
+		for (i = 0; i < group_num_pmus[gid]; i++) {
+			pmu = &uncore->pmus[index];
+			snprintf(pmu->name, sizeof(pmu->name), "amd_umc_%d", index);
+			pmu->num_counters = group_num_pmcs[gid] / group_num_pmus[gid];
+			pmu->msr_base = MSR_F19H_UMC_PERF_CTL + i * pmu->num_counters * 2;
+			pmu->rdpmc_base = -1;
+			pmu->group = gid;
+
+			pmu->ctx = alloc_percpu(struct amd_uncore_ctx *);
+			if (!pmu->ctx)
+				goto done;
+
+			pmu->pmu = (struct pmu) {
+				.task_ctx_nr	= perf_invalid_context,
+				.attr_groups	= amd_uncore_umc_attr_groups,
+				.name		= pmu->name,
+				.event_init	= amd_uncore_umc_event_init,
+				.add		= amd_uncore_add,
+				.del		= amd_uncore_del,
+				.start		= amd_uncore_umc_start,
+				.stop		= amd_uncore_stop,
+				.read		= amd_uncore_read,
+				.capabilities	= PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
+				.module		= THIS_MODULE,
+			};
+
+			if (perf_pmu_register(&pmu->pmu, pmu->pmu.name, -1)) {
+				free_percpu(pmu->ctx);
+				pmu->ctx = NULL;
+				goto done;
+			}
+
+			pr_info("%d %s counters detected\n", pmu->num_counters,
+				pmu->pmu.name);
+
+			index++;	/* advance only past PMUs that registered */
+		}
+	}
+
+done:
+	uncore->num_pmus = index;	/* keep only the PMUs that registered */
+	uncore->init_done = true;
+
+	return amd_uncore_ctx_init(uncore, cpu);
+}
+
+static struct amd_uncore uncores[UNCORE_TYPE_MAX] = {
+	/* UNCORE_TYPE_DF: exported as "amd_df" (Family 17h and above) or "amd_nb" */
+	{
+		.scan = amd_uncore_df_ctx_scan,
+		.init = amd_uncore_df_ctx_init,
+		.move = amd_uncore_ctx_move,
+		.free = amd_uncore_ctx_free,
+	},
+	/* UNCORE_TYPE_L3: exported as "amd_l3" (Family 17h and above) or "amd_l2" */
+	{
+		.scan = amd_uncore_l3_ctx_scan,
+		.init = amd_uncore_l3_ctx_init,
+		.move = amd_uncore_ctx_move,
+		.free = amd_uncore_ctx_free,
+	},
+	/* UNCORE_TYPE_UMC: PerfMonV2 only, exported as "amd_umc_<index>" */
+	{
+		.scan = amd_uncore_umc_ctx_scan,
+		.init = amd_uncore_umc_ctx_init,
+		.move = amd_uncore_ctx_move,
+		.free = amd_uncore_ctx_free,
+	},
+};
+
+static int __init amd_uncore_init(void)
+{
+	struct amd_uncore *uncore;
 	int ret = -ENODEV;
+	int i;
 
 	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
 	    boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
@@ -660,125 +1022,91 @@ static int __init amd_uncore_init(void)
 	if (boot_cpu_has(X86_FEATURE_PERFMON_V2))
 		pmu_version = 2;
 
-	num_counters_nb	= NUM_COUNTERS_NB;
-	num_counters_llc = NUM_COUNTERS_L2;
-	if (boot_cpu_data.x86 >= 0x17) {
-		/*
-		 * For F17h and above, the Northbridge counters are
-		 * repurposed as Data Fabric counters. Also, L3
-		 * counters are supported too. The PMUs are exported
-		 * based on family as either L2 or L3 and NB or DF.
-		 */
-		num_counters_llc	  = NUM_COUNTERS_L3;
-		amd_nb_pmu.name		  = "amd_df";
-		amd_llc_pmu.name	  = "amd_l3";
-		l3_mask			  = true;
-	}
+	for (i = 0; i < UNCORE_TYPE_MAX; i++) {
+		uncore = &uncores[i];
 
-	if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) {
-		if (pmu_version >= 2) {
-			*df_attr++ = &format_attr_event14v2.attr;
-			*df_attr++ = &format_attr_umask12.attr;
-		} else if (boot_cpu_data.x86 >= 0x17) {
-			*df_attr = &format_attr_event14.attr;
-		}
+		BUG_ON(!uncore->scan);
+		BUG_ON(!uncore->init);
+		BUG_ON(!uncore->move);
+		BUG_ON(!uncore->free);
 
-		amd_uncore_nb = alloc_percpu(struct amd_uncore *);
-		if (!amd_uncore_nb) {
+		uncore->info = alloc_percpu(union amd_uncore_info);
+		if (!uncore->info) {
 			ret = -ENOMEM;
-			goto fail_nb;
-		}
-		ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1);
-		if (ret)
-			goto fail_nb;
-
-		if (pmu_version >= 2) {
-			ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES);
-			num_counters_nb = ebx.split.num_df_pmc;
-		}
-
-		pr_info("%d %s %s counters detected\n", num_counters_nb,
-			boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ?  "HYGON" : "",
-			amd_nb_pmu.name);
-
-		ret = 0;
-	}
-
-	if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) {
-		if (boot_cpu_data.x86 >= 0x19) {
-			*l3_attr++ = &format_attr_event8.attr;
-			*l3_attr++ = &format_attr_umask8.attr;
-			*l3_attr++ = &format_attr_threadmask2.attr;
-		} else if (boot_cpu_data.x86 >= 0x17) {
-			*l3_attr++ = &format_attr_event8.attr;
-			*l3_attr++ = &format_attr_umask8.attr;
-			*l3_attr++ = &format_attr_threadmask8.attr;
-		}
-
-		amd_uncore_llc = alloc_percpu(struct amd_uncore *);
-		if (!amd_uncore_llc) {
-			ret = -ENOMEM;
-			goto fail_llc;
+			goto fail;
 		}
-		ret = perf_pmu_register(&amd_llc_pmu, amd_llc_pmu.name, -1);
-		if (ret)
-			goto fail_llc;
-
-		pr_info("%d %s %s counters detected\n", num_counters_llc,
-			boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ?  "HYGON" : "",
-			amd_llc_pmu.name);
-		ret = 0;
-	}
+	};
 
 	/*
 	 * Install callbacks. Core will call them for each online cpu.
 	 */
-	if (cpuhp_setup_state(CPUHP_PERF_X86_AMD_UNCORE_PREP,
-			      "perf/x86/amd/uncore:prepare",
-			      amd_uncore_cpu_up_prepare, amd_uncore_cpu_dead))
-		goto fail_llc;
-
-	if (cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING,
-			      "perf/x86/amd/uncore:starting",
-			      amd_uncore_cpu_starting, NULL))
+	ret = cpuhp_setup_state(CPUHP_PERF_X86_AMD_UNCORE_PREP,
+				"perf/x86/amd/uncore:prepare",
+				NULL, amd_uncore_cpu_dead);
+	if (ret)
+		goto fail;
+
+	ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING,
+				"perf/x86/amd/uncore:starting",
+				amd_uncore_cpu_starting, NULL);
+	if (ret)
 		goto fail_prep;
-	if (cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE,
-			      "perf/x86/amd/uncore:online",
-			      amd_uncore_cpu_online,
-			      amd_uncore_cpu_down_prepare))
+
+	ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE,
+				"perf/x86/amd/uncore:online",
+				amd_uncore_cpu_online,
+				amd_uncore_cpu_down_prepare);
+	if (ret)
 		goto fail_start;
+
 	return 0;
 
 fail_start:
 	cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING);
 fail_prep:
 	cpuhp_remove_state(CPUHP_PERF_X86_AMD_UNCORE_PREP);
-fail_llc:
-	if (boot_cpu_has(X86_FEATURE_PERFCTR_NB))
-		perf_pmu_unregister(&amd_nb_pmu);
-	free_percpu(amd_uncore_llc);
-fail_nb:
-	free_percpu(amd_uncore_nb);
+fail:
+	for (i = 0; i < UNCORE_TYPE_MAX; i++) {
+		uncore = &uncores[i];
+		if (uncore->info) {
+			free_percpu(uncore->info);
+			uncore->info = NULL;
+		}
+	}
 
 	return ret;
 }
 
 static void __exit amd_uncore_exit(void)
 {
+	struct amd_uncore *uncore;
+	struct amd_uncore_pmu *pmu;
+	int i, j;
+
 	cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE);
 	cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING);
 	cpuhp_remove_state(CPUHP_PERF_X86_AMD_UNCORE_PREP);
 
-	if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) {
-		perf_pmu_unregister(&amd_llc_pmu);
-		free_percpu(amd_uncore_llc);
-		amd_uncore_llc = NULL;
-	}
+	for (i = 0; i < UNCORE_TYPE_MAX; i++) {
+		uncore = &uncores[i];
+		if (!uncore->info)
+			continue;
+
+		free_percpu(uncore->info);
+		uncore->info = NULL;
+
+		for (j = 0; j < uncore->num_pmus; j++) {
+			pmu = &uncore->pmus[j];
+			if (!pmu->ctx)
+				continue;
+
+			perf_pmu_unregister(&pmu->pmu);
+			free_percpu(pmu->ctx);
+			pmu->ctx = NULL;
+		}
 
-	if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) {
-		perf_pmu_unregister(&amd_nb_pmu);
-		free_percpu(amd_uncore_nb);
-		amd_uncore_nb = NULL;
+		kfree(uncore->pmus);
+		uncore->pmus = NULL;
 	}
 }
 
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 185f902e5f28..40ad1425ffa2 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1887,9 +1887,9 @@ ssize_t events_hybrid_sysfs_show(struct device *dev,
 
 	str = pmu_attr->event_str;
 	for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
-		if (!(x86_pmu.hybrid_pmu[i].cpu_type & pmu_attr->pmu_type))
+		if (!(x86_pmu.hybrid_pmu[i].pmu_type & pmu_attr->pmu_type))
 			continue;
-		if (x86_pmu.hybrid_pmu[i].cpu_type & pmu->cpu_type) {
+		if (x86_pmu.hybrid_pmu[i].pmu_type & pmu->pmu_type) {
 			next_str = strchr(str, ';');
 			if (next_str)
 				return snprintf(page, next_str - str + 1, "%s", str);
@@ -2169,7 +2169,7 @@ static int __init init_hw_perf_events(void)
 			hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE;
 
 			err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name,
-						(hybrid_pmu->cpu_type == hybrid_big) ? PERF_TYPE_RAW : -1);
+						(hybrid_pmu->pmu_type == hybrid_big) ? PERF_TYPE_RAW : -1);
 			if (err)
 				break;
 		}
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index fa355d3658a6..a08f794a0e79 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -211,6 +211,14 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly =
 	EVENT_CONSTRAINT_END
 };
 
+static struct event_constraint intel_grt_event_constraints[] __read_mostly = {
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+	FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */
+	FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF_TSC_P */
+	EVENT_CONSTRAINT_END
+};
+
 static struct event_constraint intel_skl_event_constraints[] = {
 	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY */
 	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE */
@@ -299,7 +307,7 @@ static struct extra_reg intel_icl_extra_regs[] __read_mostly = {
 	EVENT_EXTRA_END
 };
 
-static struct extra_reg intel_spr_extra_regs[] __read_mostly = {
+static struct extra_reg intel_glc_extra_regs[] __read_mostly = {
 	INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
 	INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
 	INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
@@ -309,11 +317,12 @@ static struct extra_reg intel_spr_extra_regs[] __read_mostly = {
 	EVENT_EXTRA_END
 };
 
-static struct event_constraint intel_spr_event_constraints[] = {
+static struct event_constraint intel_glc_event_constraints[] = {
 	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY */
 	FIXED_EVENT_CONSTRAINT(0x0100, 0),	/* INST_RETIRED.PREC_DIST */
 	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE */
 	FIXED_EVENT_CONSTRAINT(0x0300, 2),	/* CPU_CLK_UNHALTED.REF */
+	FIXED_EVENT_CONSTRAINT(0x013c, 2),	/* CPU_CLK_UNHALTED.REF_TSC_P */
 	FIXED_EVENT_CONSTRAINT(0x0400, 3),	/* SLOTS */
 	METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0),
 	METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1),
@@ -349,7 +358,7 @@ static struct event_constraint intel_spr_event_constraints[] = {
 	EVENT_CONSTRAINT_END
 };
 
-static struct extra_reg intel_gnr_extra_regs[] __read_mostly = {
+static struct extra_reg intel_rwc_extra_regs[] __read_mostly = {
 	INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
 	INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
 	INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
@@ -473,7 +482,7 @@ static u64 intel_pmu_event_map(int hw_event)
 	return intel_perfmon_event_map[hw_event];
 }
 
-static __initconst const u64 spr_hw_cache_event_ids
+static __initconst const u64 glc_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -552,7 +561,7 @@ static __initconst const u64 spr_hw_cache_event_ids
  },
 };
 
-static __initconst const u64 spr_hw_cache_extra_regs
+static __initconst const u64 glc_hw_cache_extra_regs
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -2556,16 +2565,6 @@ static int icl_set_topdown_event_period(struct perf_event *event)
 	return 0;
 }
 
-static int adl_set_topdown_event_period(struct perf_event *event)
-{
-	struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
-
-	if (pmu->cpu_type != hybrid_big)
-		return 0;
-
-	return icl_set_topdown_event_period(event);
-}
-
 DEFINE_STATIC_CALL(intel_pmu_set_topdown_event_period, x86_perf_event_set_period);
 
 static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx)
@@ -2708,16 +2707,6 @@ static u64 icl_update_topdown_event(struct perf_event *event)
 						 x86_pmu.num_topdown_events - 1);
 }
 
-static u64 adl_update_topdown_event(struct perf_event *event)
-{
-	struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
-
-	if (pmu->cpu_type != hybrid_big)
-		return 0;
-
-	return icl_update_topdown_event(event);
-}
-
 DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update);
 
 static void intel_pmu_read_topdown_event(struct perf_event *event)
@@ -3869,7 +3858,7 @@ static inline bool require_mem_loads_aux_event(struct perf_event *event)
 		return false;
 
 	if (is_hybrid())
-		return hybrid_pmu(event->pmu)->cpu_type == hybrid_big;
+		return hybrid_pmu(event->pmu)->pmu_type == hybrid_big;
 
 	return true;
 }
@@ -4273,7 +4262,7 @@ icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 }
 
 static struct event_constraint *
-spr_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+glc_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 			  struct perf_event *event)
 {
 	struct event_constraint *c;
@@ -4361,9 +4350,9 @@ adl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 {
 	struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
 
-	if (pmu->cpu_type == hybrid_big)
-		return spr_get_event_constraints(cpuc, idx, event);
-	else if (pmu->cpu_type == hybrid_small)
+	if (pmu->pmu_type == hybrid_big)
+		return glc_get_event_constraints(cpuc, idx, event);
+	else if (pmu->pmu_type == hybrid_small)
 		return tnt_get_event_constraints(cpuc, idx, event);
 
 	WARN_ON(1);
@@ -4409,7 +4398,7 @@ rwc_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 {
 	struct event_constraint *c;
 
-	c = spr_get_event_constraints(cpuc, idx, event);
+	c = glc_get_event_constraints(cpuc, idx, event);
 
 	/* The Retire Latency is not supported by the fixed counter 0. */
 	if (event->attr.precise_ip &&
@@ -4433,9 +4422,9 @@ mtl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 {
 	struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
 
-	if (pmu->cpu_type == hybrid_big)
+	if (pmu->pmu_type == hybrid_big)
 		return rwc_get_event_constraints(cpuc, idx, event);
-	if (pmu->cpu_type == hybrid_small)
+	if (pmu->pmu_type == hybrid_small)
 		return cmt_get_event_constraints(cpuc, idx, event);
 
 	WARN_ON(1);
@@ -4446,18 +4435,18 @@ static int adl_hw_config(struct perf_event *event)
 {
 	struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
 
-	if (pmu->cpu_type == hybrid_big)
+	if (pmu->pmu_type == hybrid_big)
 		return hsw_hw_config(event);
-	else if (pmu->cpu_type == hybrid_small)
+	else if (pmu->pmu_type == hybrid_small)
 		return intel_pmu_hw_config(event);
 
 	WARN_ON(1);
 	return -EOPNOTSUPP;
 }
 
-static u8 adl_get_hybrid_cpu_type(void)
+static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void)
 {
-	return hybrid_big;
+	return HYBRID_INTEL_CORE;
 }
 
 /*
@@ -4490,7 +4479,7 @@ static void nhm_limit_period(struct perf_event *event, s64 *left)
 	*left = max(*left, 32LL);
 }
 
-static void spr_limit_period(struct perf_event *event, s64 *left)
+static void glc_limit_period(struct perf_event *event, s64 *left)
 {
 	if (event->attr.precise_ip == 3)
 		*left = max(*left, 128LL);
@@ -4618,6 +4607,23 @@ static void intel_pmu_check_num_counters(int *num_counters,
 					 int *num_counters_fixed,
 					 u64 *intel_ctrl, u64 fixed_mask);
 
+static void intel_pmu_check_event_constraints(struct event_constraint *event_constraints,
+					      int num_counters,
+					      int num_counters_fixed,
+					      u64 intel_ctrl);
+
+static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs);
+
+static inline bool intel_pmu_broken_perf_cap(void)
+{
+	/*
+	 * Meteor Lake (both models): the Perf Metric (Bit 15) is always
+	 * cleared, so the reported perf capabilities are flagged as broken.
+	 */
+	return boot_cpu_data.x86_model == INTEL_FAM6_METEORLAKE ||
+	       boot_cpu_data.x86_model == INTEL_FAM6_METEORLAKE_L;
+}
+
 static void update_pmu_cap(struct x86_hybrid_pmu *pmu)
 {
 	unsigned int sub_bitmaps = cpuid_eax(ARCH_PERFMON_EXT_LEAF);
@@ -4628,27 +4634,83 @@ static void update_pmu_cap(struct x86_hybrid_pmu *pmu)
 			    &eax, &ebx, &ecx, &edx);
 		pmu->num_counters = fls(eax);
 		pmu->num_counters_fixed = fls(ebx);
-		intel_pmu_check_num_counters(&pmu->num_counters, &pmu->num_counters_fixed,
-					     &pmu->intel_ctrl, ebx);
+	}
+
+
+	if (!intel_pmu_broken_perf_cap()) {
+		/* Perf Metric (Bit 15) and PEBS via PT (Bit 16) are hybrid enumeration */
+		rdmsrl(MSR_IA32_PERF_CAPABILITIES, pmu->intel_cap.capabilities);
 	}
 }
 
-static bool init_hybrid_pmu(int cpu)
+/* Check a hybrid PMU's counters/constraints and sync its derived limits and capability bits. */
+static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu)
+{
+	intel_pmu_check_num_counters(&pmu->num_counters, &pmu->num_counters_fixed,
+				     &pmu->intel_ctrl, (1ULL << pmu->num_counters_fixed) - 1);
+	pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters);
+	pmu->unconstrained = (struct event_constraint)
+			     __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1,
+						0, pmu->num_counters, 0, 0);
+
+	if (pmu->intel_cap.perf_metrics)
+		pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
+	else
+		pmu->intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS);
+
+	/* Mirror intel_cap.pebs_output_pt_available into PERF_PMU_CAP_AUX_OUTPUT (set or clear). */
+	if (pmu->intel_cap.pebs_output_pt_available)
+		pmu->pmu.capabilities |= PERF_PMU_CAP_AUX_OUTPUT;
+	else
+		pmu->pmu.capabilities &= ~PERF_PMU_CAP_AUX_OUTPUT;
+
+	intel_pmu_check_event_constraints(pmu->event_constraints, pmu->num_counters,
+					  pmu->num_counters_fixed, pmu->intel_ctrl);
+
+	intel_pmu_check_extra_regs(pmu->extra_regs);
+}
+
+static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void)
 {
-	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
 	u8 cpu_type = get_this_hybrid_cpu_type();
-	struct x86_hybrid_pmu *pmu = NULL;
 	int i;
 
-	if (!cpu_type && x86_pmu.get_hybrid_cpu_type)
-		cpu_type = x86_pmu.get_hybrid_cpu_type();
+	/*
+	 * This is running on a CPU model that is known to have hybrid
+	 * configurations. But the CPU told us it is not hybrid, shame
+	 * on it. There should be a fixup function provided for these
+	 * troublesome CPUs (->get_hybrid_cpu_type).
+	 */
+	if (cpu_type == HYBRID_INTEL_NONE) {
+		if (x86_pmu.get_hybrid_cpu_type)
+			cpu_type = x86_pmu.get_hybrid_cpu_type();
+		else
+			return NULL;
+	}
 
+	/*
+	 * This essentially just maps between the 'hybrid_cpu_type'
+	 * and 'hybrid_pmu_type' enums:
+	 */
 	for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
-		if (x86_pmu.hybrid_pmu[i].cpu_type == cpu_type) {
-			pmu = &x86_pmu.hybrid_pmu[i];
-			break;
-		}
+		enum hybrid_pmu_type pmu_type = x86_pmu.hybrid_pmu[i].pmu_type;
+
+		if (cpu_type == HYBRID_INTEL_CORE &&
+		    pmu_type == hybrid_big)
+			return &x86_pmu.hybrid_pmu[i];
+		if (cpu_type == HYBRID_INTEL_ATOM &&
+		    pmu_type == hybrid_small)
+			return &x86_pmu.hybrid_pmu[i];
 	}
+
+	return NULL;
+}
+
+static bool init_hybrid_pmu(int cpu)
+{
+	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+	struct x86_hybrid_pmu *pmu = find_hybrid_pmu_for_cpu();
+
 	if (WARN_ON_ONCE(!pmu || (pmu->pmu.type == -1))) {
 		cpuc->pmu = NULL;
 		return false;
@@ -4661,6 +4723,8 @@ static bool init_hybrid_pmu(int cpu)
 	if (this_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT))
 		update_pmu_cap(pmu);
 
+	intel_pmu_check_hybrid_pmus(pmu);
+
 	if (!check_hw_exists(&pmu->pmu, pmu->num_counters, pmu->num_counters_fixed))
 		return false;
 
@@ -5337,14 +5401,14 @@ static struct attribute *icl_tsx_events_attrs[] = {
 EVENT_ATTR_STR(mem-stores,	mem_st_spr,	"event=0xcd,umask=0x2");
 EVENT_ATTR_STR(mem-loads-aux,	mem_ld_aux,	"event=0x03,umask=0x82");
 
-static struct attribute *spr_events_attrs[] = {
+static struct attribute *glc_events_attrs[] = {
 	EVENT_PTR(mem_ld_hsw),
 	EVENT_PTR(mem_st_spr),
 	EVENT_PTR(mem_ld_aux),
 	NULL,
 };
 
-static struct attribute *spr_td_events_attrs[] = {
+static struct attribute *glc_td_events_attrs[] = {
 	EVENT_PTR(slots),
 	EVENT_PTR(td_retiring),
 	EVENT_PTR(td_bad_spec),
@@ -5357,7 +5421,7 @@ static struct attribute *spr_td_events_attrs[] = {
 	NULL,
 };
 
-static struct attribute *spr_tsx_events_attrs[] = {
+static struct attribute *glc_tsx_events_attrs[] = {
 	EVENT_PTR(tx_start),
 	EVENT_PTR(tx_abort),
 	EVENT_PTR(tx_commit),
@@ -5699,7 +5763,7 @@ static bool is_attr_for_this_pmu(struct kobject *kobj, struct attribute *attr)
 	struct perf_pmu_events_hybrid_attr *pmu_attr =
 		container_of(attr, struct perf_pmu_events_hybrid_attr, attr.attr);
 
-	return pmu->cpu_type & pmu_attr->pmu_type;
+	return pmu->pmu_type & pmu_attr->pmu_type;
 }
 
 static umode_t hybrid_events_is_visible(struct kobject *kobj,
@@ -5736,7 +5800,7 @@ static umode_t hybrid_format_is_visible(struct kobject *kobj,
 		container_of(attr, struct perf_pmu_format_hybrid_attr, attr.attr);
 	int cpu = hybrid_find_supported_cpu(pmu);
 
-	return (cpu >= 0) && (pmu->cpu_type & pmu_attr->pmu_type) ? attr->mode : 0;
+	return (cpu >= 0) && (pmu->pmu_type & pmu_attr->pmu_type) ? attr->mode : 0;
 }
 
 static struct attribute_group hybrid_group_events_td  = {
@@ -5880,40 +5944,105 @@ static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs)
 	}
 }
 
-static void intel_pmu_check_hybrid_pmus(u64 fixed_mask)
+static const struct { enum hybrid_pmu_type id; char *name; } intel_hybrid_pmu_type_map[] __initconst = {
+	{ hybrid_small, "cpu_atom" },
+	{ hybrid_big, "cpu_core" },
+};
+
+static __always_inline int intel_pmu_init_hybrid(enum hybrid_pmu_type pmus)
 {
+	unsigned long pmus_mask = pmus;
 	struct x86_hybrid_pmu *pmu;
-	int i;
+	int idx = 0, bit;
 
-	for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
-		pmu = &x86_pmu.hybrid_pmu[i];
+	x86_pmu.num_hybrid_pmus = hweight_long(pmus_mask);
+	x86_pmu.hybrid_pmu = kcalloc(x86_pmu.num_hybrid_pmus,
+				     sizeof(struct x86_hybrid_pmu),
+				     GFP_KERNEL);
+	if (!x86_pmu.hybrid_pmu)
+		return -ENOMEM;
 
-		intel_pmu_check_num_counters(&pmu->num_counters,
-					     &pmu->num_counters_fixed,
-					     &pmu->intel_ctrl,
-					     fixed_mask);
+	static_branch_enable(&perf_is_hybrid);
+	x86_pmu.filter = intel_pmu_filter;
 
-		if (pmu->intel_cap.perf_metrics) {
-			pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
-			pmu->intel_ctrl |= INTEL_PMC_MSK_FIXED_SLOTS;
+	for_each_set_bit(bit, &pmus_mask, ARRAY_SIZE(intel_hybrid_pmu_type_map)) {
+		pmu = &x86_pmu.hybrid_pmu[idx++];
+		pmu->pmu_type = intel_hybrid_pmu_type_map[bit].id;
+		pmu->name = intel_hybrid_pmu_type_map[bit].name;
+
+		pmu->num_counters = x86_pmu.num_counters;
+		pmu->num_counters_fixed = x86_pmu.num_counters_fixed;
+		pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters);
+		pmu->unconstrained = (struct event_constraint)
+				     __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1,
+							0, pmu->num_counters, 0, 0);
+
+		pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities;
+		if (pmu->pmu_type & hybrid_small) {
+			pmu->intel_cap.perf_metrics = 0;
+			pmu->intel_cap.pebs_output_pt_available = 1;
+			pmu->mid_ack = true;
+		} else if (pmu->pmu_type & hybrid_big) {
+			pmu->intel_cap.perf_metrics = 1;
+			pmu->intel_cap.pebs_output_pt_available = 0;
+			pmu->late_ack = true;
 		}
+	}
 
-		if (pmu->intel_cap.pebs_output_pt_available)
-			pmu->pmu.capabilities |= PERF_PMU_CAP_AUX_OUTPUT;
+	return 0;
+}
 
-		intel_pmu_check_event_constraints(pmu->event_constraints,
-						  pmu->num_counters,
-						  pmu->num_counters_fixed,
-						  pmu->intel_ctrl);
+static __always_inline void intel_pmu_ref_cycles_ext(void)
+{
+	if (!(x86_pmu.events_maskl & (INTEL_PMC_MSK_FIXED_REF_CYCLES >> INTEL_PMC_IDX_FIXED)))
+		intel_perfmon_event_map[PERF_COUNT_HW_REF_CPU_CYCLES] = 0x013c;
+}
 
-		intel_pmu_check_extra_regs(pmu->extra_regs);
-	}
+static __always_inline void intel_pmu_init_glc(struct pmu *pmu)
+{
+	x86_pmu.late_ack = true;
+	x86_pmu.limit_period = glc_limit_period;
+	x86_pmu.pebs_aliases = NULL;
+	x86_pmu.pebs_prec_dist = true;
+	x86_pmu.pebs_block = true;
+	x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+	x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+	x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
+	x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04);
+	x86_pmu.lbr_pt_coexist = true;
+	x86_pmu.num_topdown_events = 8;
+	static_call_update(intel_pmu_update_topdown_event,
+			   &icl_update_topdown_event);
+	static_call_update(intel_pmu_set_topdown_event_period,
+			   &icl_set_topdown_event_period);
+
+	memcpy(hybrid_var(pmu, hw_cache_event_ids), glc_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+	memcpy(hybrid_var(pmu, hw_cache_extra_regs), glc_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+	hybrid(pmu, event_constraints) = intel_glc_event_constraints;
+	hybrid(pmu, pebs_constraints) = intel_glc_pebs_event_constraints;
+
+	intel_pmu_ref_cycles_ext();
 }
 
-static __always_inline bool is_mtl(u8 x86_model)
+static __always_inline void intel_pmu_init_grt(struct pmu *pmu)
 {
-	return (x86_model == INTEL_FAM6_METEORLAKE) ||
-	       (x86_model == INTEL_FAM6_METEORLAKE_L);
+	x86_pmu.mid_ack = true;
+	x86_pmu.limit_period = glc_limit_period;
+	x86_pmu.pebs_aliases = NULL;
+	x86_pmu.pebs_prec_dist = true;
+	x86_pmu.pebs_block = true;
+	x86_pmu.lbr_pt_coexist = true;
+	x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+	x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
+
+	memcpy(hybrid_var(pmu, hw_cache_event_ids), glp_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+	memcpy(hybrid_var(pmu, hw_cache_extra_regs), tnt_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+	hybrid_var(pmu, hw_cache_event_ids)[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
+	hybrid(pmu, event_constraints) = intel_grt_event_constraints;
+	hybrid(pmu, pebs_constraints) = intel_grt_pebs_event_constraints;
+	hybrid(pmu, extra_regs) = intel_grt_extra_regs;
+
+	intel_pmu_ref_cycles_ext();
 }
 
 __init int intel_pmu_init(void)
@@ -6194,28 +6323,10 @@ __init int intel_pmu_init(void)
 		break;
 
 	case INTEL_FAM6_ATOM_GRACEMONT:
-		x86_pmu.mid_ack = true;
-		memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
-		       sizeof(hw_cache_event_ids));
-		memcpy(hw_cache_extra_regs, tnt_hw_cache_extra_regs,
-		       sizeof(hw_cache_extra_regs));
-		hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
-
-		x86_pmu.event_constraints = intel_slm_event_constraints;
-		x86_pmu.pebs_constraints = intel_grt_pebs_event_constraints;
-		x86_pmu.extra_regs = intel_grt_extra_regs;
-
-		x86_pmu.pebs_aliases = NULL;
-		x86_pmu.pebs_prec_dist = true;
-		x86_pmu.pebs_block = true;
-		x86_pmu.lbr_pt_coexist = true;
-		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-		x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
-
+		intel_pmu_init_grt(NULL);
 		intel_pmu_pebs_data_source_grt();
 		x86_pmu.pebs_latency_data = adl_latency_data_small;
 		x86_pmu.get_event_constraints = tnt_get_event_constraints;
-		x86_pmu.limit_period = spr_limit_period;
 		td_attr = tnt_events_attrs;
 		mem_attr = grt_mem_attrs;
 		extra_attr = nhm_format_attr;
@@ -6225,28 +6336,11 @@ __init int intel_pmu_init(void)
 
 	case INTEL_FAM6_ATOM_CRESTMONT:
 	case INTEL_FAM6_ATOM_CRESTMONT_X:
-		x86_pmu.mid_ack = true;
-		memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
-		       sizeof(hw_cache_event_ids));
-		memcpy(hw_cache_extra_regs, tnt_hw_cache_extra_regs,
-		       sizeof(hw_cache_extra_regs));
-		hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
-
-		x86_pmu.event_constraints = intel_slm_event_constraints;
-		x86_pmu.pebs_constraints = intel_grt_pebs_event_constraints;
+		intel_pmu_init_grt(NULL);
 		x86_pmu.extra_regs = intel_cmt_extra_regs;
-
-		x86_pmu.pebs_aliases = NULL;
-		x86_pmu.pebs_prec_dist = true;
-		x86_pmu.lbr_pt_coexist = true;
-		x86_pmu.pebs_block = true;
-		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-		x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
-
 		intel_pmu_pebs_data_source_cmt();
 		x86_pmu.pebs_latency_data = mtl_latency_data_small;
 		x86_pmu.get_event_constraints = cmt_get_event_constraints;
-		x86_pmu.limit_period = spr_limit_period;
 		td_attr = cmt_events_attrs;
 		mem_attr = grt_mem_attrs;
 		extra_attr = cmt_format_attr;
@@ -6563,44 +6657,23 @@ __init int intel_pmu_init(void)
 	case INTEL_FAM6_SAPPHIRERAPIDS_X:
 	case INTEL_FAM6_EMERALDRAPIDS_X:
 		x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
-		x86_pmu.extra_regs = intel_spr_extra_regs;
+		x86_pmu.extra_regs = intel_glc_extra_regs;
 		fallthrough;
 	case INTEL_FAM6_GRANITERAPIDS_X:
 	case INTEL_FAM6_GRANITERAPIDS_D:
-		pmem = true;
-		x86_pmu.late_ack = true;
-		memcpy(hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(hw_cache_event_ids));
-		memcpy(hw_cache_extra_regs, spr_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
-
-		x86_pmu.event_constraints = intel_spr_event_constraints;
-		x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints;
+		intel_pmu_init_glc(NULL);
 		if (!x86_pmu.extra_regs)
-			x86_pmu.extra_regs = intel_gnr_extra_regs;
-		x86_pmu.limit_period = spr_limit_period;
+			x86_pmu.extra_regs = intel_rwc_extra_regs;
 		x86_pmu.pebs_ept = 1;
-		x86_pmu.pebs_aliases = NULL;
-		x86_pmu.pebs_prec_dist = true;
-		x86_pmu.pebs_block = true;
-		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-		x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
-
 		x86_pmu.hw_config = hsw_hw_config;
-		x86_pmu.get_event_constraints = spr_get_event_constraints;
+		x86_pmu.get_event_constraints = glc_get_event_constraints;
 		extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
 			hsw_format_attr : nhm_format_attr;
 		extra_skl_attr = skl_format_attr;
-		mem_attr = spr_events_attrs;
-		td_attr = spr_td_events_attrs;
-		tsx_attr = spr_tsx_events_attrs;
-		x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04);
-		x86_pmu.lbr_pt_coexist = true;
-		intel_pmu_pebs_data_source_skl(pmem);
-		x86_pmu.num_topdown_events = 8;
-		static_call_update(intel_pmu_update_topdown_event,
-				   &icl_update_topdown_event);
-		static_call_update(intel_pmu_set_topdown_event_period,
-				   &icl_set_topdown_event_period);
+		mem_attr = glc_events_attrs;
+		td_attr = glc_td_events_attrs;
+		tsx_attr = glc_tsx_events_attrs;
+		intel_pmu_pebs_data_source_skl(true);
 		pr_cont("Sapphire Rapids events, ");
 		name = "sapphire_rapids";
 		break;
@@ -6610,47 +6683,17 @@ __init int intel_pmu_init(void)
 	case INTEL_FAM6_RAPTORLAKE:
 	case INTEL_FAM6_RAPTORLAKE_P:
 	case INTEL_FAM6_RAPTORLAKE_S:
-	case INTEL_FAM6_METEORLAKE:
-	case INTEL_FAM6_METEORLAKE_L:
 		/*
 		 * Alder Lake has 2 types of CPU, core and atom.
 		 *
 		 * Initialize the common PerfMon capabilities here.
 		 */
-		x86_pmu.hybrid_pmu = kcalloc(X86_HYBRID_NUM_PMUS,
-					     sizeof(struct x86_hybrid_pmu),
-					     GFP_KERNEL);
-		if (!x86_pmu.hybrid_pmu)
-			return -ENOMEM;
-		static_branch_enable(&perf_is_hybrid);
-		x86_pmu.num_hybrid_pmus = X86_HYBRID_NUM_PMUS;
+		intel_pmu_init_hybrid(hybrid_big_small);
 
-		x86_pmu.pebs_aliases = NULL;
-		x86_pmu.pebs_prec_dist = true;
-		x86_pmu.pebs_block = true;
-		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
-		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-		x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
-		x86_pmu.lbr_pt_coexist = true;
 		x86_pmu.pebs_latency_data = adl_latency_data_small;
-		x86_pmu.num_topdown_events = 8;
-		static_call_update(intel_pmu_update_topdown_event,
-				   &adl_update_topdown_event);
-		static_call_update(intel_pmu_set_topdown_event_period,
-				   &adl_set_topdown_event_period);
-
-		x86_pmu.filter = intel_pmu_filter;
 		x86_pmu.get_event_constraints = adl_get_event_constraints;
 		x86_pmu.hw_config = adl_hw_config;
-		x86_pmu.limit_period = spr_limit_period;
 		x86_pmu.get_hybrid_cpu_type = adl_get_hybrid_cpu_type;
-		/*
-		 * The rtm_abort_event is used to check whether to enable GPRs
-		 * for the RTM abort event. Atom doesn't have the RTM abort
-		 * event. There is no harmful to set it in the common
-		 * x86_pmu.rtm_abort_event.
-		 */
-		x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04);
 
 		td_attr = adl_hybrid_events_attrs;
 		mem_attr = adl_hybrid_mem_attrs;
@@ -6660,9 +6703,7 @@ __init int intel_pmu_init(void)
 
 		/* Initialize big core specific PerfMon capabilities.*/
 		pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX];
-		pmu->name = "cpu_core";
-		pmu->cpu_type = hybrid_big;
-		pmu->late_ack = true;
+		intel_pmu_init_glc(&pmu->pmu);
 		if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) {
 			pmu->num_counters = x86_pmu.num_counters + 2;
 			pmu->num_counters_fixed = x86_pmu.num_counters_fixed + 1;
@@ -6687,54 +6728,45 @@ __init int intel_pmu_init(void)
 		pmu->unconstrained = (struct event_constraint)
 					__EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1,
 							   0, pmu->num_counters, 0, 0);
-		pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities;
-		pmu->intel_cap.perf_metrics = 1;
-		pmu->intel_cap.pebs_output_pt_available = 0;
+		pmu->extra_regs = intel_glc_extra_regs;
+
+		/* Initialize Atom core specific PerfMon capabilities.*/
+		pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX];
+		intel_pmu_init_grt(&pmu->pmu);
+
+		x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
+		intel_pmu_pebs_data_source_adl();
+		pr_cont("Alderlake Hybrid events, ");
+		name = "alderlake_hybrid";
+		break;
+
+	case INTEL_FAM6_METEORLAKE:
+	case INTEL_FAM6_METEORLAKE_L:
+		intel_pmu_init_hybrid(hybrid_big_small);
 
-		memcpy(pmu->hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(pmu->hw_cache_event_ids));
-		memcpy(pmu->hw_cache_extra_regs, spr_hw_cache_extra_regs, sizeof(pmu->hw_cache_extra_regs));
-		pmu->event_constraints = intel_spr_event_constraints;
-		pmu->pebs_constraints = intel_spr_pebs_event_constraints;
-		pmu->extra_regs = intel_spr_extra_regs;
+		x86_pmu.pebs_latency_data = mtl_latency_data_small;
+		x86_pmu.get_event_constraints = mtl_get_event_constraints;
+		x86_pmu.hw_config = adl_hw_config;
+
+		td_attr = adl_hybrid_events_attrs;
+		mem_attr = mtl_hybrid_mem_attrs;
+		tsx_attr = adl_hybrid_tsx_attrs;
+		extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+			mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr;
+
+		/* Initialize big core specific PerfMon capabilities.*/
+		pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX];
+		intel_pmu_init_glc(&pmu->pmu);
+		pmu->extra_regs = intel_rwc_extra_regs;
 
 		/* Initialize Atom core specific PerfMon capabilities.*/
 		pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX];
-		pmu->name = "cpu_atom";
-		pmu->cpu_type = hybrid_small;
-		pmu->mid_ack = true;
-		pmu->num_counters = x86_pmu.num_counters;
-		pmu->num_counters_fixed = x86_pmu.num_counters_fixed;
-		pmu->max_pebs_events = x86_pmu.max_pebs_events;
-		pmu->unconstrained = (struct event_constraint)
-					__EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1,
-							   0, pmu->num_counters, 0, 0);
-		pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities;
-		pmu->intel_cap.perf_metrics = 0;
-		pmu->intel_cap.pebs_output_pt_available = 1;
-
-		memcpy(pmu->hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(pmu->hw_cache_event_ids));
-		memcpy(pmu->hw_cache_extra_regs, tnt_hw_cache_extra_regs, sizeof(pmu->hw_cache_extra_regs));
-		pmu->hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
-		pmu->event_constraints = intel_slm_event_constraints;
-		pmu->pebs_constraints = intel_grt_pebs_event_constraints;
-		pmu->extra_regs = intel_grt_extra_regs;
-		if (is_mtl(boot_cpu_data.x86_model)) {
-			x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].extra_regs = intel_gnr_extra_regs;
-			x86_pmu.pebs_latency_data = mtl_latency_data_small;
-			extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
-				mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr;
-			mem_attr = mtl_hybrid_mem_attrs;
-			intel_pmu_pebs_data_source_mtl();
-			x86_pmu.get_event_constraints = mtl_get_event_constraints;
-			pmu->extra_regs = intel_cmt_extra_regs;
-			pr_cont("Meteorlake Hybrid events, ");
-			name = "meteorlake_hybrid";
-		} else {
-			x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
-			intel_pmu_pebs_data_source_adl();
-			pr_cont("Alderlake Hybrid events, ");
-			name = "alderlake_hybrid";
-		}
+		intel_pmu_init_grt(&pmu->pmu);
+		pmu->extra_regs = intel_cmt_extra_regs;
+
+		intel_pmu_pebs_data_source_mtl();
+		pr_cont("Meteorlake Hybrid events, ");
+		name = "meteorlake_hybrid";
 		break;
 
 	default:
@@ -6846,9 +6878,6 @@ __init int intel_pmu_init(void)
 	if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics)
 		x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
 
-	if (is_hybrid())
-		intel_pmu_check_hybrid_pmus((u64)fixed_mask);
-
 	if (x86_pmu.intel_cap.pebs_timing_info)
 		x86_pmu.flags |= PMU_FL_RETIRE_LATENCY;
 
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 96fffb2d521d..cbeb6d2bf5b4 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -336,6 +336,9 @@ static int cstate_pmu_event_init(struct perf_event *event)
 		cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_PKG_EVENT_MAX);
 		if (!(pkg_msr_mask & (1 << cfg)))
 			return -EINVAL;
+
+		event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+
 		event->hw.event_base = pkg_msr[cfg].msr;
 		cpu = cpumask_any_and(&cstate_pkg_cpu_mask,
 				      topology_die_cpumask(event->cpu));
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index eb8dd8b8a1e8..bf97ab904d40 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -261,7 +261,7 @@ static u64 __adl_latency_data_small(struct perf_event *event, u64 status,
 {
 	u64 val;
 
-	WARN_ON_ONCE(hybrid_pmu(event->pmu)->cpu_type == hybrid_big);
+	WARN_ON_ONCE(hybrid_pmu(event->pmu)->pmu_type == hybrid_big);
 
 	dse &= PERF_PEBS_DATA_SOURCE_MASK;
 	val = hybrid_var(event->pmu, pebs_data_source)[dse];
@@ -1058,7 +1058,7 @@ struct event_constraint intel_icl_pebs_event_constraints[] = {
 	EVENT_CONSTRAINT_END
 };
 
-struct event_constraint intel_spr_pebs_event_constraints[] = {
+struct event_constraint intel_glc_pebs_event_constraints[] = {
 	INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL),	/* INST_RETIRED.PREC_DIST */
 	INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL),
 
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 42a55794004a..8e2a12235e62 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -736,6 +736,7 @@ static bool topa_table_full(struct topa *topa)
 /**
  * topa_insert_pages() - create a list of ToPA tables
  * @buf:	PT buffer being initialized.
+ * @cpu:	CPU on which to allocate.
  * @gfp:	Allocation flags.
  *
  * This initializes a list of ToPA tables with entries from
@@ -1207,8 +1208,11 @@ static void pt_buffer_fini_topa(struct pt_buffer *buf)
 /**
  * pt_buffer_init_topa() - initialize ToPA table for pt buffer
  * @buf:	PT buffer.
- * @size:	Total size of all regions within this ToPA.
+ * @cpu:	CPU on which to allocate.
+ * @nr_pages:	No. of pages to allocate.
  * @gfp:	Allocation flags.
+ *
+ * Return:	0 on success or error code.
  */
 static int pt_buffer_init_topa(struct pt_buffer *buf, int cpu,
 			       unsigned long nr_pages, gfp_t gfp)
@@ -1281,7 +1285,7 @@ out:
 
 /**
  * pt_buffer_setup_aux() - set up topa tables for a PT buffer
- * @cpu:	Cpu on which to allocate, -1 means current.
+ * @event:	Performance event
  * @pages:	Array of pointers to buffer pages passed from perf core.
  * @nr_pages:	Number of pages in the buffer.
  * @snapshot:	If this is a snapshot/overwrite counter.
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 69043e02e8a7..01023aa5125b 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -74,7 +74,7 @@ int uncore_device_to_die(struct pci_dev *dev)
 		struct cpuinfo_x86 *c = &cpu_data(cpu);
 
 		if (c->initialized && cpu_to_node(cpu) == node)
-			return c->logical_die_id;
+			return c->topo.logical_die_id;
 	}
 
 	return -1;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index c8ba2be7585d..53dd5d495ba6 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -652,10 +652,29 @@ enum {
 #define PERF_PEBS_DATA_SOURCE_MAX	0x10
 #define PERF_PEBS_DATA_SOURCE_MASK	(PERF_PEBS_DATA_SOURCE_MAX - 1)
 
+enum hybrid_cpu_type {
+	HYBRID_INTEL_NONE,
+	HYBRID_INTEL_ATOM	= 0x20,
+	HYBRID_INTEL_CORE	= 0x40,
+};
+
+enum hybrid_pmu_type {
+	not_hybrid,
+	hybrid_small		= BIT(0),
+	hybrid_big		= BIT(1),
+
+	hybrid_big_small	= hybrid_big | hybrid_small, /* only used for matching */
+};
+
+#define X86_HYBRID_PMU_ATOM_IDX		0
+#define X86_HYBRID_PMU_CORE_IDX		1
+
+#define X86_HYBRID_NUM_PMUS		2
+
 struct x86_hybrid_pmu {
 	struct pmu			pmu;
 	const char			*name;
-	u8				cpu_type;
+	enum hybrid_pmu_type		pmu_type;
 	cpumask_t			supported_cpus;
 	union perf_capabilities		intel_cap;
 	u64				intel_ctrl;
@@ -721,18 +740,6 @@ extern struct static_key_false perf_is_hybrid;
 	__Fp;						\
 })
 
-enum hybrid_pmu_type {
-	hybrid_big		= 0x40,
-	hybrid_small		= 0x20,
-
-	hybrid_big_small	= hybrid_big | hybrid_small,
-};
-
-#define X86_HYBRID_PMU_ATOM_IDX		0
-#define X86_HYBRID_PMU_CORE_IDX		1
-
-#define X86_HYBRID_NUM_PMUS		2
-
 /*
  * struct x86_pmu - generic x86 pmu
  */
@@ -940,7 +947,7 @@ struct x86_pmu {
 	 */
 	int				num_hybrid_pmus;
 	struct x86_hybrid_pmu		*hybrid_pmu;
-	u8 (*get_hybrid_cpu_type)	(void);
+	enum hybrid_cpu_type (*get_hybrid_cpu_type)	(void);
 };
 
 struct x86_perf_task_context_opt {
@@ -1521,7 +1528,7 @@ extern struct event_constraint intel_skl_pebs_event_constraints[];
 
 extern struct event_constraint intel_icl_pebs_event_constraints[];
 
-extern struct event_constraint intel_spr_pebs_event_constraints[];
+extern struct event_constraint intel_glc_pebs_event_constraints[];
 
 struct event_constraint *intel_pebs_constraints(struct perf_event *event);
 
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index 1579429846cc..8d98d468b976 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -115,7 +115,7 @@ struct rapl_pmu {
 struct rapl_pmus {
 	struct pmu		pmu;
 	unsigned int		maxdie;
-	struct rapl_pmu		*pmus[];
+	struct rapl_pmu		*pmus[] __counted_by(maxdie);
 };
 
 enum rapl_unit_quirk {
@@ -179,15 +179,11 @@ static u64 rapl_event_update(struct perf_event *event)
 	s64 delta, sdelta;
 	int shift = RAPL_CNTR_WIDTH;
 
-again:
 	prev_raw_count = local64_read(&hwc->prev_count);
-	rdmsrl(event->hw.event_base, new_raw_count);
-
-	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
-			    new_raw_count) != prev_raw_count) {
-		cpu_relax();
-		goto again;
-	}
+	do {
+		rdmsrl(event->hw.event_base, new_raw_count);
+	} while (!local64_try_cmpxchg(&hwc->prev_count,
+				      &prev_raw_count, new_raw_count));
 
 	/*
 	 * Now we have the new raw value and have updated the prev
@@ -537,11 +533,11 @@ static struct perf_msr intel_rapl_spr_msrs[] = {
  * - want to use same event codes across both architectures
  */
 static struct perf_msr amd_rapl_msrs[] = {
-	[PERF_RAPL_PP0]  = { 0, &rapl_events_cores_group, 0, false, 0 },
+	[PERF_RAPL_PP0]  = { 0, &rapl_events_cores_group, NULL, false, 0 },
 	[PERF_RAPL_PKG]  = { MSR_AMD_PKG_ENERGY_STATUS,  &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
-	[PERF_RAPL_RAM]  = { 0, &rapl_events_ram_group,   0, false, 0 },
-	[PERF_RAPL_PP1]  = { 0, &rapl_events_gpu_group,   0, false, 0 },
-	[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group,  0, false, 0 },
+	[PERF_RAPL_RAM]  = { 0, &rapl_events_ram_group,   NULL, false, 0 },
+	[PERF_RAPL_PP1]  = { 0, &rapl_events_gpu_group,   NULL, false, 0 },
+	[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group,  NULL, false, 0 },
 };
 
 static int rapl_cpu_offline(unsigned int cpu)
diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c
index 999f5ac82fe9..96e6c51515f5 100644
--- a/arch/x86/hyperv/hv_vtl.c
+++ b/arch/x86/hyperv/hv_vtl.c
@@ -196,7 +196,7 @@ static int hv_vtl_apicid_to_vp_id(u32 apic_id)
 	return ret;
 }
 
-static int hv_vtl_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
+static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip)
 {
 	int vp_id;
 
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index 8c6bf07f7d2b..c6edde1a1dec 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -288,7 +288,7 @@ static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa)
 		free_page((unsigned long)vmsa);
 }
 
-int hv_snp_boot_ap(int cpu, unsigned long start_ip)
+int hv_snp_boot_ap(u32 cpu, unsigned long start_ip)
 {
 	struct sev_es_save_area *vmsa = (struct sev_es_save_area *)
 		__get_free_page(GFP_KERNEL | __GFP_ZERO);
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 4f1ce5fc4e19..a192bdea69e2 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -10,5 +10,4 @@ generated-y += unistd_64_x32.h
 generated-y += xen-hypercalls.h
 
 generic-y += early_ioremap.h
-generic-y += export.h
 generic-y += mcs_spinlock.h
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 9c4da699e11a..65f79092c9d9 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -58,7 +58,7 @@
 #define ANNOTATE_IGNORE_ALTERNATIVE				\
 	"999:\n\t"						\
 	".pushsection .discard.ignore_alts\n\t"			\
-	".long 999b - .\n\t"					\
+	".long 999b\n\t"					\
 	".popsection\n\t"
 
 /*
@@ -352,7 +352,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
 .macro ANNOTATE_IGNORE_ALTERNATIVE
 	.Lannotate_\@:
 	.pushsection .discard.ignore_alts
-	.long .Lannotate_\@ - .
+	.long .Lannotate_\@
 	.popsection
 .endm
 
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 5af4ec1a0f71..b0d192f613b7 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -54,7 +54,7 @@ extern int local_apic_timer_c2_ok;
 extern bool apic_is_disabled;
 extern unsigned int lapic_timer_period;
 
-extern int cpuid_to_apicid[];
+extern u32 cpuid_to_apicid[];
 
 extern enum apic_intr_mode_id apic_intr_mode;
 enum apic_intr_mode_id {
@@ -292,19 +292,19 @@ struct apic {
 	int	(*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
 	bool	(*apic_id_registered)(void);
 
-	bool	(*check_apicid_used)(physid_mask_t *map, int apicid);
+	bool	(*check_apicid_used)(physid_mask_t *map, u32 apicid);
 	void	(*init_apic_ldr)(void);
 	void	(*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
-	int	(*cpu_present_to_apicid)(int mps_cpu);
-	int	(*phys_pkg_id)(int cpuid_apic, int index_msb);
+	u32	(*cpu_present_to_apicid)(int mps_cpu);
+	u32	(*phys_pkg_id)(u32 cpuid_apic, int index_msb);
 
-	u32	(*get_apic_id)(unsigned long x);
-	u32	(*set_apic_id)(unsigned int id);
+	u32	(*get_apic_id)(u32 id);
+	u32	(*set_apic_id)(u32 apicid);
 
 	/* wakeup_secondary_cpu */
-	int	(*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);
+	int	(*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip);
 	/* wakeup secondary CPU using 64-bit wakeup point */
-	int	(*wakeup_secondary_cpu_64)(int apicid, unsigned long start_eip);
+	int	(*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip);
 
 	char	*name;
 };
@@ -322,8 +322,8 @@ struct apic_override {
 	void	(*send_IPI_self)(int vector);
 	u64	(*icr_read)(void);
 	void	(*icr_write)(u32 low, u32 high);
-	int	(*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);
-	int	(*wakeup_secondary_cpu_64)(int apicid, unsigned long start_eip);
+	int	(*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip);
+	int	(*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip);
 };
 
 /*
@@ -493,16 +493,6 @@ static inline bool lapic_vector_set_in_irr(unsigned int vector)
 	return !!(irr & (1U << (vector % 32)));
 }
 
-static inline unsigned default_get_apic_id(unsigned long x)
-{
-	unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
-
-	if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID))
-		return (x >> 24) & 0xFF;
-	else
-		return (x >> 24) & 0x0F;
-}
-
 /*
  * Warm reset vector position:
  */
@@ -517,9 +507,9 @@ extern void generic_bigsmp_probe(void);
 
 extern struct apic apic_noop;
 
-static inline unsigned int read_apic_id(void)
+static inline u32 read_apic_id(void)
 {
-	unsigned int reg = apic_read(APIC_ID);
+	u32 reg = apic_read(APIC_ID);
 
 	return apic->get_apic_id(reg);
 }
@@ -538,13 +528,12 @@ extern int default_apic_id_valid(u32 apicid);
 extern u32 apic_default_calc_apicid(unsigned int cpu);
 extern u32 apic_flat_calc_apicid(unsigned int cpu);
 
-extern bool default_check_apicid_used(physid_mask_t *map, int apicid);
 extern void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap);
-extern int default_cpu_present_to_apicid(int mps_cpu);
+extern u32 default_cpu_present_to_apicid(int mps_cpu);
 
 #else /* CONFIG_X86_LOCAL_APIC */
 
-static inline unsigned int read_apic_id(void) { return 0; }
+static inline u32 read_apic_id(void) { return 0; }
 
 #endif /* !CONFIG_X86_LOCAL_APIC */
 
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 2edf68475fec..50e5ebf9d0a0 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -293,6 +293,9 @@ static __always_inline unsigned long variable_ffz(unsigned long word)
  */
 static __always_inline unsigned long __fls(unsigned long word)
 {
+	if (__builtin_constant_p(word))
+		return BITS_PER_LONG - 1 - __builtin_clzl(word);
+
 	asm("bsr %1,%0"
 	    : "=r" (word)
 	    : "rm" (word));
@@ -360,6 +363,9 @@ static __always_inline int fls(unsigned int x)
 {
 	int r;
 
+	if (__builtin_constant_p(x))
+		return x ? 32 - __builtin_clz(x) : 0;
+
 #ifdef CONFIG_X86_64
 	/*
 	 * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
@@ -401,6 +407,9 @@ static __always_inline int fls(unsigned int x)
 static __always_inline int fls64(__u64 x)
 {
 	int bitpos = -1;
+
+	if (__builtin_constant_p(x))
+		return x ? 64 - __builtin_clzll(x) : 0;
 	/*
 	 * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
 	 * dest reg is undefined if x==0, but their CPU architect says its
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index b3a7cfb0d99e..a38cc0afc90a 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -85,6 +85,8 @@ extern const unsigned long kernel_total_size;
 
 unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr,
 				void (*error)(char *x));
+
+extern struct boot_params *boot_params_ptr;
 #endif
 
 #endif /* _ASM_X86_BOOT_H */
diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h
index ce9685fc78d8..5aa061199866 100644
--- a/arch/x86/include/asm/cacheinfo.h
+++ b/arch/x86/include/asm/cacheinfo.h
@@ -7,9 +7,6 @@ extern unsigned int memory_caching_control;
 #define CACHE_MTRR 0x01
 #define CACHE_PAT  0x02
 
-void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu);
-void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu);
-
 void cache_disable(void);
 void cache_enable(void);
 void set_cache_aps_delayed_init(bool val);
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index d53636506134..5612648b0202 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -221,12 +221,18 @@ extern void __add_wrong_size(void)
 #define __try_cmpxchg(ptr, pold, new, size)				\
 	__raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX)
 
+#define __sync_try_cmpxchg(ptr, pold, new, size)			\
+	__raw_try_cmpxchg((ptr), (pold), (new), (size), "lock; ")
+
 #define __try_cmpxchg_local(ptr, pold, new, size)			\
 	__raw_try_cmpxchg((ptr), (pold), (new), (size), "")
 
 #define arch_try_cmpxchg(ptr, pold, new) 				\
 	__try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr)))
 
+#define arch_sync_try_cmpxchg(ptr, pold, new) 				\
+	__sync_try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr)))
+
 #define arch_try_cmpxchg_local(ptr, pold, new)				\
 	__try_cmpxchg_local((ptr), (pold), (new), sizeof(*(ptr)))
 
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 18fd06f7936a..a0234dfd1031 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -7,6 +7,7 @@
  */
 #include <linux/thread_info.h>
 
+#include <asm/ia32.h>
 #include <asm/ptrace.h>
 #include <asm/user.h>
 #include <asm/auxvec.h>
@@ -149,7 +150,7 @@ do {						\
 	((x)->e_machine == EM_X86_64)
 
 #define compat_elf_check_arch(x)					\
-	(elf_check_arch_ia32(x) ||					\
+	((elf_check_arch_ia32(x) && ia32_enabled()) ||			\
 	 (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64))
 
 static inline void elf_common_init(struct thread_struct *t,
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 551829884734..b02c3cd3c0f6 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -28,7 +28,7 @@
 #include <asm/irq.h>
 #include <asm/sections.h>
 
-#ifdef	CONFIG_X86_LOCAL_APIC
+#ifdef	CONFIG_IRQ_DOMAIN_HIERARCHY
 struct irq_data;
 struct pci_dev;
 struct msi_desc;
@@ -105,10 +105,10 @@ static inline void irq_complete_move(struct irq_cfg *c) { }
 #endif
 
 extern void apic_ack_edge(struct irq_data *data);
-#else	/*  CONFIG_X86_LOCAL_APIC */
+#else	/*  CONFIG_IRQ_DOMAIN_HIERARCHY */
 static inline void lock_vector_lock(void) {}
 static inline void unlock_vector_lock(void) {}
-#endif	/* CONFIG_X86_LOCAL_APIC */
+#endif	/* CONFIG_IRQ_DOMAIN_HIERARCHY */
 
 /* Statistics */
 extern atomic_t irq_err_count;
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index 637fa1df3512..c715097e92fd 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -69,6 +69,8 @@ struct legacy_pic {
 	void (*make_irq)(unsigned int irq);
 };
 
+void legacy_pic_pcat_compat(void);
+
 extern struct legacy_pic *legacy_pic;
 extern struct legacy_pic null_legacy_pic;
 
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index fada857f0a1e..5a2ae24b1204 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -68,6 +68,20 @@ extern void ia32_pick_mmap_layout(struct mm_struct *mm);
 
 #endif
 
-#endif /* CONFIG_IA32_EMULATION */
+extern bool __ia32_enabled;
+
+static inline bool ia32_enabled(void)
+{
+	return __ia32_enabled;
+}
+
+#else /* !CONFIG_IA32_EMULATION */
+
+static inline bool ia32_enabled(void)
+{
+	return IS_ENABLED(CONFIG_X86_32);
+}
+
+#endif
 
 #endif /* _ASM_X86_IA32_H */
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 5f1d3c421f68..cc9ccf61b6bd 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -2,6 +2,8 @@
 #ifndef _ASM_X86_INIT_H
 #define _ASM_X86_INIT_H
 
+#define __head	__section(".head.text")
+
 struct x86_mapping_info {
 	void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
 	void *context;			 /* context for alloc_pgt_page */
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 5fcd85fd64fd..197316121f04 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -27,6 +27,7 @@
  *		_X	- regular server parts
  *		_D	- micro server parts
  *		_N,_P	- other mobile parts
+ *		_H	- premium mobile parts
  *		_S	- other client parts
  *
  *		Historical OPTDIFFs:
@@ -124,6 +125,7 @@
 #define INTEL_FAM6_METEORLAKE		0xAC
 #define INTEL_FAM6_METEORLAKE_L		0xAA
 
+#define INTEL_FAM6_ARROWLAKE_H		0xC5
 #define INTEL_FAM6_ARROWLAKE		0xC6
 
 #define INTEL_FAM6_LUNARLAKE_M		0xBD
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
index 635132a12778..73dba8b94443 100644
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -135,28 +135,27 @@ static inline bool local_try_cmpxchg(local_t *l, long *old, long new)
 #define local_xchg(l, n) (xchg(&((l)->a.counter), (n)))
 
 /**
- * local_add_unless - add unless the number is a given value
+ * local_add_unless - add unless the number is already a given value
  * @l: pointer of type local_t
  * @a: the amount to add to l...
  * @u: ...unless l is equal to u.
  *
- * Atomically adds @a to @l, so long as it was not @u.
- * Returns non-zero if @l was not @u, and zero otherwise.
+ * Atomically adds @a to @l, if @l was not already @u.
+ * Returns true if the addition was done.
  */
-#define local_add_unless(l, a, u)				\
-({								\
-	long c, old;						\
-	c = local_read((l));					\
-	for (;;) {						\
-		if (unlikely(c == (u)))				\
-			break;					\
-		old = local_cmpxchg((l), c, c + (a));		\
-		if (likely(old == c))				\
-			break;					\
-		c = old;					\
-	}							\
-	c != (u);						\
-})
+static __always_inline bool
+local_add_unless(local_t *l, long a, long u)
+{
+	long c = local_read(l);
+
+	do {
+		if (unlikely(c == u))
+			return false;
+	} while (!local_try_cmpxchg(l, &c, c + a));
+
+	return true;
+}
+
 #define local_inc_not_zero(l) local_add_unless((l), 1, 0)
 
 /* On x86_32, these are no better than the atomic variants.
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 180b1cbfcc4e..6de6e1d95952 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -245,7 +245,7 @@ static inline void cmci_recheck(void) {}
 int mce_available(struct cpuinfo_x86 *c);
 bool mce_is_memory_error(struct mce *m);
 bool mce_is_correctable(struct mce *m);
-int mce_usable_address(struct mce *m);
+bool mce_usable_address(struct mce *m);
 
 DECLARE_PER_CPU(unsigned, mce_exception_count);
 DECLARE_PER_CPU(unsigned, mce_poll_count);
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 473b16d73b47..359ada486fa9 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -19,8 +19,10 @@
 
 #ifdef CONFIG_X86_MEM_ENCRYPT
 void __init mem_encrypt_init(void);
+void __init mem_encrypt_setup_arch(void);
 #else
 static inline void mem_encrypt_init(void) { }
+static inline void __init mem_encrypt_setup_arch(void) { }
 #endif
 
 #ifdef CONFIG_AMD_MEM_ENCRYPT
@@ -43,7 +45,6 @@ void __init sme_map_bootdata(char *real_mode_data);
 void __init sme_unmap_bootdata(char *real_mode_data);
 
 void __init sme_early_init(void);
-void __init sev_setup_arch(void);
 
 void __init sme_encrypt_kernel(struct boot_params *bp);
 void __init sme_enable(struct boot_params *bp);
@@ -73,7 +74,6 @@ static inline void __init sme_map_bootdata(char *real_mode_data) { }
 static inline void __init sme_unmap_bootdata(char *real_mode_data) { }
 
 static inline void __init sme_early_init(void) { }
-static inline void __init sev_setup_arch(void) { }
 
 static inline void __init sme_encrypt_kernel(struct boot_params *bp) { }
 static inline void __init sme_enable(struct boot_params *bp) { }
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index f46df8349e86..4b0f98a8d338 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -37,7 +37,7 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES];
 
 extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
 
-extern unsigned int boot_cpu_physical_apicid;
+extern u32 boot_cpu_physical_apicid;
 extern u8 boot_cpu_apic_version;
 
 #ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 896445edc6a8..ce4ce8720d55 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -276,11 +276,11 @@ int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
 #ifdef CONFIG_AMD_MEM_ENCRYPT
 bool hv_ghcb_negotiate_protocol(void);
 void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason);
-int hv_snp_boot_ap(int cpu, unsigned long start_ip);
+int hv_snp_boot_ap(u32 cpu, unsigned long start_ip);
 #else
 static inline bool hv_ghcb_negotiate_protocol(void) { return false; }
 static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {}
-static inline int hv_snp_boot_ap(int cpu, unsigned long start_ip) { return 0; }
+static inline int hv_snp_boot_ap(u32 cpu, unsigned long start_ip) { return 0; }
 #endif
 
 #if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index b37abb55e948..e3fa9cecd599 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -638,12 +638,16 @@
 #define MSR_AMD64_LBR_SELECT			0xc000010e
 
 /* Zen4 */
-#define MSR_ZEN4_BP_CFG			0xc001102e
+#define MSR_ZEN4_BP_CFG                 0xc001102e
 #define MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT 5
 
+/* Fam 19h MSRs */
+#define MSR_F19H_UMC_PERF_CTL           0xc0010800
+#define MSR_F19H_UMC_PERF_CTR           0xc0010801
+
 /* Zen 2 */
-#define MSR_ZEN2_SPECTRAL_CHICKEN	0xc00110e3
-#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT	BIT_ULL(1)
+#define MSR_ZEN2_SPECTRAL_CHICKEN       0xc00110e3
+#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT   BIT_ULL(1)
 
 /* Fam 17h MSRs */
 #define MSR_F17H_IRPERF			0xc00000e9
@@ -1117,12 +1121,16 @@
 #define MSR_IA32_VMX_MISC_INTEL_PT                 (1ULL << 14)
 #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
 #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
-/* AMD-V MSRs */
 
+/* AMD-V MSRs */
 #define MSR_VM_CR                       0xc0010114
 #define MSR_VM_IGNNE                    0xc0010115
 #define MSR_VM_HSAVE_PA                 0xc0010117
 
+#define SVM_VM_CR_VALID_MASK		0x001fULL
+#define SVM_VM_CR_SVM_LOCK_MASK		0x0008ULL
+#define SVM_VM_CR_SVM_DIS_MASK		0x0010ULL
+
 /* Hardware Feedback Interface */
 #define MSR_IA32_HW_FEEDBACK_PTR        0x17d0
 #define MSR_IA32_HW_FEEDBACK_CONFIG     0x17d1
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index c55cc243592e..f93e9b96927a 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -196,7 +196,7 @@
 .macro ANNOTATE_RETPOLINE_SAFE
 .Lhere_\@:
 	.pushsection .discard.retpoline_safe
-	.long .Lhere_\@ - .
+	.long .Lhere_\@
 	.popsection
 .endm
 
@@ -271,7 +271,7 @@
 .Lskip_rsb_\@:
 .endm
 
-#ifdef CONFIG_CPU_UNRET_ENTRY
+#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)
 #define CALL_UNTRAIN_RET	"call entry_untrain_ret"
 #else
 #define CALL_UNTRAIN_RET	""
@@ -288,38 +288,24 @@
  * As such, this must be placed after every *SWITCH_TO_KERNEL_CR3 at a point
  * where we have a stack but before any RET instruction.
  */
-.macro UNTRAIN_RET
-#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
-	defined(CONFIG_CALL_DEPTH_TRACKING) || defined(CONFIG_CPU_SRSO)
+.macro __UNTRAIN_RET ibpb_feature, call_depth_insns
+#if defined(CONFIG_RETHUNK) || defined(CONFIG_CPU_IBPB_ENTRY)
 	VALIDATE_UNRET_END
 	ALTERNATIVE_3 "",						\
 		      CALL_UNTRAIN_RET, X86_FEATURE_UNRET,		\
-		      "call entry_ibpb", X86_FEATURE_ENTRY_IBPB,	\
-		      __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
+		      "call entry_ibpb", \ibpb_feature,			\
+		     __stringify(\call_depth_insns), X86_FEATURE_CALL_DEPTH
 #endif
 .endm
 
-.macro UNTRAIN_RET_VM
-#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
-	defined(CONFIG_CALL_DEPTH_TRACKING) || defined(CONFIG_CPU_SRSO)
-	VALIDATE_UNRET_END
-	ALTERNATIVE_3 "",						\
-		      CALL_UNTRAIN_RET, X86_FEATURE_UNRET,		\
-		      "call entry_ibpb", X86_FEATURE_IBPB_ON_VMEXIT,	\
-		      __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
-#endif
-.endm
+#define UNTRAIN_RET \
+	__UNTRAIN_RET X86_FEATURE_ENTRY_IBPB, __stringify(RESET_CALL_DEPTH)
 
-.macro UNTRAIN_RET_FROM_CALL
-#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
-	defined(CONFIG_CALL_DEPTH_TRACKING)
-	VALIDATE_UNRET_END
-	ALTERNATIVE_3 "",						\
-		      CALL_UNTRAIN_RET, X86_FEATURE_UNRET,		\
-		      "call entry_ibpb", X86_FEATURE_ENTRY_IBPB,	\
-		      __stringify(RESET_CALL_DEPTH_FROM_CALL), X86_FEATURE_CALL_DEPTH
-#endif
-.endm
+#define UNTRAIN_RET_VM \
+	__UNTRAIN_RET X86_FEATURE_IBPB_ON_VMEXIT, __stringify(RESET_CALL_DEPTH)
+
+#define UNTRAIN_RET_FROM_CALL \
+	__UNTRAIN_RET X86_FEATURE_ENTRY_IBPB, __stringify(RESET_CALL_DEPTH_FROM_CALL)
 
 
 .macro CALL_DEPTH_ACCOUNT
@@ -334,7 +320,7 @@
 #define ANNOTATE_RETPOLINE_SAFE					\
 	"999:\n\t"						\
 	".pushsection .discard.retpoline_safe\n\t"		\
-	".long 999b - .\n\t"					\
+	".long 999b\n\t"					\
 	".popsection\n\t"
 
 typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE];
@@ -348,13 +334,23 @@ extern void __x86_return_thunk(void);
 static inline void __x86_return_thunk(void) {}
 #endif
 
+#ifdef CONFIG_CPU_UNRET_ENTRY
 extern void retbleed_return_thunk(void);
+#else
+static inline void retbleed_return_thunk(void) {}
+#endif
+
+#ifdef CONFIG_CPU_SRSO
 extern void srso_return_thunk(void);
 extern void srso_alias_return_thunk(void);
+#else
+static inline void srso_return_thunk(void) {}
+static inline void srso_alias_return_thunk(void) {}
+#endif
 
-extern void retbleed_untrain_ret(void);
-extern void srso_untrain_ret(void);
-extern void srso_alias_untrain_ret(void);
+extern void retbleed_return_thunk(void);
+extern void srso_return_thunk(void);
+extern void srso_alias_return_thunk(void);
 
 extern void entry_untrain_ret(void);
 extern void entry_ibpb(void);
@@ -362,12 +358,7 @@ extern void entry_ibpb(void);
 extern void (*x86_return_thunk)(void);
 
 #ifdef CONFIG_CALL_DEPTH_TRACKING
-extern void __x86_return_skl(void);
-
-static inline void x86_set_skl_return_thunk(void)
-{
-	x86_return_thunk = &__x86_return_skl;
-}
+extern void call_depth_return_thunk(void);
 
 #define CALL_DEPTH_ACCOUNT					\
 	ALTERNATIVE("",						\
@@ -380,12 +371,12 @@ DECLARE_PER_CPU(u64, __x86_ret_count);
 DECLARE_PER_CPU(u64, __x86_stuffs_count);
 DECLARE_PER_CPU(u64, __x86_ctxsw_count);
 #endif
-#else
-static inline void x86_set_skl_return_thunk(void) {}
+#else /* !CONFIG_CALL_DEPTH_TRACKING */
 
+static inline void call_depth_return_thunk(void) {}
 #define CALL_DEPTH_ACCOUNT ""
 
-#endif
+#endif /* CONFIG_CALL_DEPTH_TRACKING */
 
 #ifdef CONFIG_RETPOLINE
 
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index e3bae2b60a0d..ef2844d69173 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -12,13 +12,6 @@
 
 #define NR_NODE_MEMBLKS		(MAX_NUMNODES*2)
 
-/*
- * Too small node sizes may confuse the VM badly. Usually they
- * result from BIOS bugs. So dont recognize nodes as standalone
- * NUMA entities that have less than this amount of RAM listed:
- */
-#define NODE_MIN_SIZE (4*1024*1024)
-
 extern int numa_off;
 
 /*
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 34734d730463..20624b80f890 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -210,6 +210,25 @@ do {									\
 	(typeof(_var))(unsigned long) pco_old__;			\
 })
 
+#define percpu_try_cmpxchg_op(size, qual, _var, _ovalp, _nval)		\
+({									\
+	bool success;							\
+	__pcpu_type_##size *pco_oval__ = (__pcpu_type_##size *)(_ovalp); \
+	__pcpu_type_##size pco_old__ = *pco_oval__;			\
+	__pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval);	\
+	asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]",		\
+				    __percpu_arg([var]))		\
+		  CC_SET(z)						\
+		  : CC_OUT(z) (success),				\
+		    [oval] "+a" (pco_old__),				\
+		    [var] "+m" (_var)					\
+		  : [nval] __pcpu_reg_##size(, pco_new__)		\
+		  : "memory");						\
+	if (unlikely(!success))						\
+		*pco_oval__ = pco_old__;				\
+	likely(success);						\
+})
+
 #if defined(CONFIG_X86_32) && !defined(CONFIG_UML)
 #define percpu_cmpxchg64_op(size, qual, _var, _oval, _nval)		\
 ({									\
@@ -223,26 +242,63 @@ do {									\
 	old__.var = _oval;						\
 	new__.var = _nval;						\
 									\
-	asm qual (ALTERNATIVE("leal %P[var], %%esi; call this_cpu_cmpxchg8b_emu", \
+	asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu",		\
 			      "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
 		  : [var] "+m" (_var),					\
 		    "+a" (old__.low),					\
 		    "+d" (old__.high)					\
 		  : "b" (new__.low),					\
-		    "c" (new__.high)					\
-		  : "memory", "esi");					\
+		    "c" (new__.high),					\
+		    "S" (&(_var))					\
+		  : "memory");						\
 									\
 	old__.var;							\
 })
 
 #define raw_cpu_cmpxchg64(pcp, oval, nval)	percpu_cmpxchg64_op(8,         , pcp, oval, nval)
 #define this_cpu_cmpxchg64(pcp, oval, nval)	percpu_cmpxchg64_op(8, volatile, pcp, oval, nval)
+
+#define percpu_try_cmpxchg64_op(size, qual, _var, _ovalp, _nval)	\
+({									\
+	bool success;							\
+	u64 *_oval = (u64 *)(_ovalp);					\
+	union {								\
+		u64 var;						\
+		struct {						\
+			u32 low, high;					\
+		};							\
+	} old__, new__;							\
+									\
+	old__.var = *_oval;						\
+	new__.var = _nval;						\
+									\
+	asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu",		\
+			      "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
+		  CC_SET(z)						\
+		  : CC_OUT(z) (success),				\
+		    [var] "+m" (_var),					\
+		    "+a" (old__.low),					\
+		    "+d" (old__.high)					\
+		  : "b" (new__.low),					\
+		    "c" (new__.high),					\
+		    "S" (&(_var))					\
+		  : "memory");						\
+	if (unlikely(!success))						\
+		*_oval = old__.var;					\
+	likely(success);						\
+})
+
+#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval)		percpu_try_cmpxchg64_op(8,         , pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg64(pcp, ovalp, nval)	percpu_try_cmpxchg64_op(8, volatile, pcp, ovalp, nval)
 #endif
 
 #ifdef CONFIG_X86_64
 #define raw_cpu_cmpxchg64(pcp, oval, nval)	percpu_cmpxchg_op(8,         , pcp, oval, nval);
 #define this_cpu_cmpxchg64(pcp, oval, nval)	percpu_cmpxchg_op(8, volatile, pcp, oval, nval);
 
+#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval)		percpu_try_cmpxchg_op(8,         , pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg64(pcp, ovalp, nval)	percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
+
 #define percpu_cmpxchg128_op(size, qual, _var, _oval, _nval)		\
 ({									\
 	union {								\
@@ -255,20 +311,54 @@ do {									\
 	old__.var = _oval;						\
 	new__.var = _nval;						\
 									\
-	asm qual (ALTERNATIVE("leaq %P[var], %%rsi; call this_cpu_cmpxchg16b_emu", \
+	asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu",		\
 			      "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
 		  : [var] "+m" (_var),					\
 		    "+a" (old__.low),					\
 		    "+d" (old__.high)					\
 		  : "b" (new__.low),					\
-		    "c" (new__.high)					\
-		  : "memory", "rsi");					\
+		    "c" (new__.high),					\
+		    "S" (&(_var))					\
+		  : "memory");						\
 									\
 	old__.var;							\
 })
 
 #define raw_cpu_cmpxchg128(pcp, oval, nval)	percpu_cmpxchg128_op(16,         , pcp, oval, nval)
 #define this_cpu_cmpxchg128(pcp, oval, nval)	percpu_cmpxchg128_op(16, volatile, pcp, oval, nval)
+
+#define percpu_try_cmpxchg128_op(size, qual, _var, _ovalp, _nval)	\
+({									\
+	bool success;							\
+	u128 *_oval = (u128 *)(_ovalp);					\
+	union {								\
+		u128 var;						\
+		struct {						\
+			u64 low, high;					\
+		};							\
+	} old__, new__;							\
+									\
+	old__.var = *_oval;						\
+	new__.var = _nval;						\
+									\
+	asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu",		\
+			      "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
+		  CC_SET(z)						\
+		  : CC_OUT(z) (success),				\
+		    [var] "+m" (_var),					\
+		    "+a" (old__.low),					\
+		    "+d" (old__.high)					\
+		  : "b" (new__.low),					\
+		    "c" (new__.high),					\
+		    "S" (&(_var))					\
+		  : "memory");						\
+	if (unlikely(!success))						\
+		*_oval = old__.var;					\
+	likely(success);						\
+})
+
+#define raw_cpu_try_cmpxchg128(pcp, ovalp, nval)	percpu_try_cmpxchg128_op(16,         , pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg128(pcp, ovalp, nval)	percpu_try_cmpxchg128_op(16, volatile, pcp, ovalp, nval)
 #endif
 
 /*
@@ -343,6 +433,9 @@ do {									\
 #define raw_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(1, , pcp, oval, nval)
 #define raw_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(2, , pcp, oval, nval)
 #define raw_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(4, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_1(pcp, ovalp, nval)	percpu_try_cmpxchg_op(1, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_2(pcp, ovalp, nval)	percpu_try_cmpxchg_op(2, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_4(pcp, ovalp, nval)	percpu_try_cmpxchg_op(4, , pcp, ovalp, nval)
 
 #define this_cpu_add_return_1(pcp, val)		percpu_add_return_op(1, volatile, pcp, val)
 #define this_cpu_add_return_2(pcp, val)		percpu_add_return_op(2, volatile, pcp, val)
@@ -350,6 +443,9 @@ do {									\
 #define this_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(1, volatile, pcp, oval, nval)
 #define this_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(2, volatile, pcp, oval, nval)
 #define this_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(4, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_1(pcp, ovalp, nval)	percpu_try_cmpxchg_op(1, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_2(pcp, ovalp, nval)	percpu_try_cmpxchg_op(2, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_4(pcp, ovalp, nval)	percpu_try_cmpxchg_op(4, volatile, pcp, ovalp, nval)
 
 /*
  * Per cpu atomic 64 bit operations are only available under 64 bit.
@@ -364,6 +460,7 @@ do {									\
 #define raw_cpu_add_return_8(pcp, val)		percpu_add_return_op(8, , pcp, val)
 #define raw_cpu_xchg_8(pcp, nval)		raw_percpu_xchg_op(pcp, nval)
 #define raw_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(8, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval)	percpu_try_cmpxchg_op(8, , pcp, ovalp, nval)
 
 #define this_cpu_read_8(pcp)			percpu_from_op(8, volatile, "mov", pcp)
 #define this_cpu_write_8(pcp, val)		percpu_to_op(8, volatile, "mov", (pcp), val)
@@ -373,6 +470,7 @@ do {									\
 #define this_cpu_add_return_8(pcp, val)		percpu_add_return_op(8, volatile, pcp, val)
 #define this_cpu_xchg_8(pcp, nval)		percpu_xchg_op(8, volatile, pcp, nval)
 #define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval)	percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
 #endif
 
 static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 85a9fd5a3ec3..2618ec7c3d1d 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -112,6 +112,13 @@
 	(AMD64_PERFMON_V2_EVENTSEL_EVENT_NB	|	\
 	 AMD64_PERFMON_V2_EVENTSEL_UMASK_NB)
 
+#define AMD64_PERFMON_V2_ENABLE_UMC			BIT_ULL(31)
+#define AMD64_PERFMON_V2_EVENTSEL_EVENT_UMC		GENMASK_ULL(7, 0)
+#define AMD64_PERFMON_V2_EVENTSEL_RDWRMASK_UMC		GENMASK_ULL(9, 8)
+#define AMD64_PERFMON_V2_RAW_EVENT_MASK_UMC		\
+	(AMD64_PERFMON_V2_EVENTSEL_EVENT_UMC	|	\
+	 AMD64_PERFMON_V2_EVENTSEL_RDWRMASK_UMC)
+
 #define AMD64_NUM_COUNTERS				4
 #define AMD64_NUM_COUNTERS_CORE				6
 #define AMD64_NUM_COUNTERS_NB				4
@@ -232,6 +239,8 @@ union cpuid_0x80000022_ebx {
 		unsigned int	lbr_v2_stack_sz:6;
 		/* Number of Data Fabric Counters */
 		unsigned int	num_df_pmc:6;
+		/* Number of Unified Memory Controller Counters */
+		unsigned int	num_umc_pmc:6;
 	} split;
 	unsigned int		full;
 };
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index e02b179ec659..57bab91bbf50 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1716,6 +1716,14 @@ static inline bool pud_user_accessible_page(pud_t pud)
 }
 #endif
 
+#ifdef CONFIG_X86_SGX
+int arch_memory_failure(unsigned long pfn, int flags);
+#define arch_memory_failure arch_memory_failure
+
+bool arch_is_platform_page(u64 paddr);
+#define arch_is_platform_page arch_is_platform_page
+#endif
+
 #endif	/* __ASSEMBLY__ */
 
 #endif /* _ASM_X86_PGTABLE_H */
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 2d13f25b1bd8..4527e1430c6d 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
 {
 	int old, new;
 
+	old = raw_cpu_read_4(pcpu_hot.preempt_count);
 	do {
-		old = raw_cpu_read_4(pcpu_hot.preempt_count);
 		new = (old & PREEMPT_NEED_RESCHED) |
 			(pc & ~PREEMPT_NEED_RESCHED);
-	} while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
+	} while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
 }
 
 /*
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a3669a7774ed..ae81a7191c1c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -75,11 +75,36 @@ extern u16 __read_mostly tlb_lld_4m[NR_INFO];
 extern u16 __read_mostly tlb_lld_1g[NR_INFO];
 
 /*
- *  CPU type and hardware bug flags. Kept separately for each CPU.
- *  Members of this structure are referenced in head_32.S, so think twice
- *  before touching them. [mj]
+ * CPU type and hardware bug flags. Kept separately for each CPU.
  */
 
+struct cpuinfo_topology {
+	// Real APIC ID read from the local APIC
+	u32			apicid;
+	// The initial APIC ID provided by CPUID
+	u32			initial_apicid;
+
+	// Physical package ID
+	u32			pkg_id;
+
+	// Physical die ID on AMD, Relative on Intel
+	u32			die_id;
+
+	// Compute unit ID - AMD specific
+	u32			cu_id;
+
+	// Core ID relative to the package
+	u32			core_id;
+
+	// Logical ID mappings
+	u32			logical_pkg_id;
+	u32			logical_die_id;
+
+	// Cache level topology IDs
+	u32			llc_id;
+	u32			l2c_id;
+};
+
 struct cpuinfo_x86 {
 	__u8			x86;		/* CPU family */
 	__u8			x86_vendor;	/* CPU vendor */
@@ -96,7 +121,6 @@ struct cpuinfo_x86 {
 	__u8			x86_phys_bits;
 	/* CPUID returned core id bits: */
 	__u8			x86_coreid_bits;
-	__u8			cu_id;
 	/* Max extended CPUID function supported: */
 	__u32			extended_cpuid_level;
 	/* Maximum supported CPUID level, -1=no CPUID: */
@@ -112,6 +136,7 @@ struct cpuinfo_x86 {
 	};
 	char			x86_vendor_id[16];
 	char			x86_model_id[64];
+	struct cpuinfo_topology	topo;
 	/* in KB - valid for CPUS which support this call: */
 	unsigned int		x86_cache_size;
 	int			x86_cache_alignment;	/* In bytes */
@@ -125,19 +150,9 @@ struct cpuinfo_x86 {
 	u64			ppin;
 	/* cpuid returned max cores value: */
 	u16			x86_max_cores;
-	u16			apicid;
-	u16			initial_apicid;
 	u16			x86_clflush_size;
 	/* number of cores as seen by the OS: */
 	u16			booted_cores;
-	/* Physical processor id: */
-	u16			phys_proc_id;
-	/* Logical processor id: */
-	u16			logical_proc_id;
-	/* Core id: */
-	u16			cpu_core_id;
-	u16			cpu_die_id;
-	u16			logical_die_id;
 	/* Index into per_cpu list: */
 	u16			cpu_index;
 	/*  Is SMT active on this core? */
@@ -399,7 +414,7 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu)
 	return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu);
 }
 
-extern asmlinkage void ignore_sysret(void);
+extern asmlinkage void entry_SYSCALL32_ignore(void);
 
 /* Save actual FS/GS selectors and bases to current->thread */
 void current_save_fsgs(void);
@@ -678,7 +693,15 @@ extern int set_tsc_mode(unsigned int val);
 
 DECLARE_PER_CPU(u64, msr_misc_features_shadow);
 
-extern u16 get_llc_id(unsigned int cpu);
+static inline u32 per_cpu_llc_id(unsigned int cpu)
+{
+	return per_cpu(cpu_info.topo.llc_id, cpu);
+}
+
+static inline u32 per_cpu_l2c_id(unsigned int cpu)
+{
+	return per_cpu(cpu_info.topo.l2c_id, cpu);
+}
 
 #ifdef CONFIG_CPU_SUP_AMD
 extern u32 amd_get_nodes_per_socket(void);
@@ -724,14 +747,6 @@ enum mds_mitigations {
 	MDS_MITIGATION_VMWERV,
 };
 
-#ifdef CONFIG_X86_SGX
-int arch_memory_failure(unsigned long pfn, int flags);
-#define arch_memory_failure arch_memory_failure
-
-bool arch_is_platform_page(u64 paddr);
-#define arch_is_platform_page arch_is_platform_page
-#endif
-
 extern bool gds_ucode_mitigated(void);
 
 #endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index b716d291d0d4..65dee2420624 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -31,6 +31,11 @@ static inline void x86_dtb_init(void) { }
 #define of_ioapic 0
 #endif
 
+#ifdef CONFIG_OF_EARLY_FLATTREE
+void x86_flattree_get_config(void);
+#else
+static inline void x86_flattree_get_config(void) { }
+#endif
 extern char cmd_line[COMMAND_LINE_SIZE];
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 12ef86b19910..4d84122bd643 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -36,6 +36,9 @@ void entry_INT80_compat(void);
 #ifdef CONFIG_XEN_PV
 void xen_entry_INT80_compat(void);
 #endif
+#else /* !CONFIG_IA32_EMULATION */
+#define entry_SYSCALL_compat NULL
+#define entry_SYSENTER_compat NULL
 #endif
 
 void x86_configure_nx(void);
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index c31c633419fe..4fab2ed454f3 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -17,10 +17,8 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
 /* cpus sharing the last level cache: */
 DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
 DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
-DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
-DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id);
 
-DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
+DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid);
 DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid);
 
 struct task_struct;
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index 64df897c0ee3..1be13b2dfe8b 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -37,6 +37,8 @@ extern int phys_to_target_node(phys_addr_t start);
 #define phys_to_target_node phys_to_target_node
 extern int memory_add_physaddr_to_nid(u64 start);
 #define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
+extern int numa_fill_memblks(u64 start, u64 end);
+#define numa_fill_memblks numa_fill_memblks
 #endif
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h
index cb0386fc4dc3..c648502e4535 100644
--- a/arch/x86/include/asm/spec-ctrl.h
+++ b/arch/x86/include/asm/spec-ctrl.h
@@ -4,6 +4,7 @@
 
 #include <linux/thread_info.h>
 #include <asm/nospec-branch.h>
+#include <asm/msr.h>
 
 /*
  * On VMENTER we must preserve whatever view of the SPEC_CTRL MSR
@@ -76,6 +77,16 @@ static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn)
 	return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL;
 }
 
+/*
+ * This can be used in noinstr functions & should only be called in bare
+ * metal context.
+ */
+static __always_inline void __update_spec_ctrl(u64 val)
+{
+	__this_cpu_write(x86_spec_ctrl_current, val);
+	native_wrmsrl(MSR_IA32_SPEC_CTRL, val);
+}
+
 #ifdef CONFIG_SMP
 extern void speculative_store_bypass_ht_init(void);
 #else
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 3ac0ffc4f3e2..87a7b917d30e 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -229,10 +229,6 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 #define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
 #define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
 
-#define SVM_VM_CR_VALID_MASK	0x001fULL
-#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL
-#define SVM_VM_CR_SVM_DIS_MASK  0x0010ULL
-
 #define SVM_NESTED_CTL_NP_ENABLE	BIT(0)
 #define SVM_NESTED_CTL_SEV_ENABLE	BIT(1)
 #define SVM_NESTED_CTL_SEV_ES_ENABLE	BIT(2)
@@ -572,8 +568,6 @@ struct vmcb {
 
 #define SVM_CPUID_FUNC 0x8000000a
 
-#define SVM_VM_CR_SVM_DISABLE 4
-
 #define SVM_SELECTOR_S_SHIFT 4
 #define SVM_SELECTOR_DPL_SHIFT 5
 #define SVM_SELECTOR_P_SHIFT 7
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 4fb36fba4b5a..f44e2f9ab65d 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -126,12 +126,12 @@ static inline int syscall_get_arch(struct task_struct *task)
 		? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
 }
 
-void do_syscall_64(struct pt_regs *regs, int nr);
+bool do_syscall_64(struct pt_regs *regs, int nr);
 
 #endif	/* CONFIG_X86_32 */
 
 void do_int80_syscall_32(struct pt_regs *regs);
-long do_fast_syscall_32(struct pt_regs *regs);
-long do_SYSENTER_32(struct pt_regs *regs);
+bool do_fast_syscall_32(struct pt_regs *regs);
+bool do_SYSENTER_32(struct pt_regs *regs);
 
 #endif	/* _ASM_X86_SYSCALL_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 3235ba1e5b06..5f87f6b9b09e 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -105,17 +105,17 @@ static inline void setup_node_to_cpumask_map(void) { }
 extern const struct cpumask *cpu_coregroup_mask(int cpu);
 extern const struct cpumask *cpu_clustergroup_mask(int cpu);
 
-#define topology_logical_package_id(cpu)	(cpu_data(cpu).logical_proc_id)
-#define topology_physical_package_id(cpu)	(cpu_data(cpu).phys_proc_id)
-#define topology_logical_die_id(cpu)		(cpu_data(cpu).logical_die_id)
-#define topology_die_id(cpu)			(cpu_data(cpu).cpu_die_id)
-#define topology_core_id(cpu)			(cpu_data(cpu).cpu_core_id)
+#define topology_logical_package_id(cpu)	(cpu_data(cpu).topo.logical_pkg_id)
+#define topology_physical_package_id(cpu)	(cpu_data(cpu).topo.pkg_id)
+#define topology_logical_die_id(cpu)		(cpu_data(cpu).topo.logical_die_id)
+#define topology_die_id(cpu)			(cpu_data(cpu).topo.die_id)
+#define topology_core_id(cpu)			(cpu_data(cpu).topo.core_id)
 #define topology_ppin(cpu)			(cpu_data(cpu).ppin)
 
 extern unsigned int __max_die_per_package;
 
 #ifdef CONFIG_SMP
-#define topology_cluster_id(cpu)		(per_cpu(cpu_l2c_id, cpu))
+#define topology_cluster_id(cpu)		(cpu_data(cpu).topo.l2c_id)
 #define topology_die_cpumask(cpu)		(per_cpu(cpu_die_map, cpu))
 #define topology_cluster_cpumask(cpu)		(cpu_clustergroup_mask(cpu))
 #define topology_core_cpumask(cpu)		(per_cpu(cpu_core_map, cpu))
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 8bae40a66282..5c367c1290c3 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -496,7 +496,7 @@ copy_mc_to_kernel(void *to, const void *from, unsigned len);
 #define copy_mc_to_kernel copy_mc_to_kernel
 
 unsigned long __must_check
-copy_mc_to_user(void *to, const void *from, unsigned len);
+copy_mc_to_user(void __user *to, const void *from, unsigned len);
 #endif
 
 /*
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 5240d88db52a..c878616a18b8 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -177,7 +177,7 @@ struct x86_init_ops {
  * struct x86_cpuinit_ops - platform specific cpu hotplug setups
  * @setup_percpu_clockev:	set up the per cpu clock event device
  * @early_percpu_clock_init:	early init of the per cpu clock event device
- * @fixup_cpu_id:		fixup function for cpuinfo_x86::phys_proc_id
+ * @fixup_cpu_id:		fixup function for cpuinfo_x86::topo.pkg_id
  * @parallel_bringup:		Parallel bringup control
  */
 struct x86_cpuinit_ops {
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 2a0ea38955df..d0918a75cb00 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -148,6 +148,9 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
 		pr_debug("Local APIC address 0x%08x\n", madt->address);
 	}
 
+	if (madt->flags & ACPI_MADT_PCAT_COMPAT)
+		legacy_pic_pcat_compat();
+
 	/* ACPI 6.3 and newer support the online capable bit. */
 	if (acpi_gbl_FADT.header.revision > 6 ||
 	    (acpi_gbl_FADT.header.revision == 6 &&
@@ -359,7 +362,7 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e
 }
 
 #ifdef CONFIG_X86_64
-static int acpi_wakeup_cpu(int apicid, unsigned long start_ip)
+static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
 {
 	/*
 	 * Remap mailbox memory only for the first call to acpi_wakeup_cpu().
@@ -856,7 +859,7 @@ int acpi_unmap_cpu(int cpu)
 	set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
 #endif
 
-	per_cpu(x86_cpu_to_apicid, cpu) = -1;
+	per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
 	set_cpu_present(cpu, false);
 	num_processors--;
 
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 356de955e78d..053f6dcc6b2c 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -27,6 +27,7 @@
 #define PCI_DEVICE_ID_AMD_1AH_M00H_ROOT		0x153a
 #define PCI_DEVICE_ID_AMD_1AH_M20H_ROOT		0x1507
 #define PCI_DEVICE_ID_AMD_MI200_ROOT		0x14bb
+#define PCI_DEVICE_ID_AMD_MI300_ROOT		0x14f8
 
 #define PCI_DEVICE_ID_AMD_17H_DF_F4		0x1464
 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4	0x15ec
@@ -43,6 +44,7 @@
 #define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4	0x12fc
 #define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4	0x12c4
 #define PCI_DEVICE_ID_AMD_MI200_DF_F4		0x14d4
+#define PCI_DEVICE_ID_AMD_MI300_DF_F4		0x152c
 
 /* Protect the PCI config register pairs used for SMN. */
 static DEFINE_MUTEX(smn_mutex);
@@ -62,6 +64,7 @@ static const struct pci_device_id amd_root_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_ROOT) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_ROOT) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_ROOT) },
 	{}
 };
 
@@ -93,6 +96,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F3) },
 	{}
 };
 
@@ -112,9 +116,13 @@ static const struct pci_device_id amd_nb_link_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F4) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F4) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F4) },
 	{}
 };
 
@@ -386,7 +394,7 @@ int amd_get_subcaches(int cpu)
 
 	pci_read_config_dword(link, 0x1d4, &mask);
 
-	return (mask >> (4 * cpu_data(cpu).cpu_core_id)) & 0xf;
+	return (mask >> (4 * cpu_data(cpu).topo.core_id)) & 0xf;
 }
 
 int amd_set_subcaches(int cpu, unsigned long mask)
@@ -412,7 +420,7 @@ int amd_set_subcaches(int cpu, unsigned long mask)
 		pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
 	}
 
-	cuid = cpu_data(cpu).cpu_core_id;
+	cuid = cpu_data(cpu).topo.core_id;
 	mask <<= 4 * cuid;
 	mask |= (0xf ^ (1 << cuid)) << 26;
 
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 760adac3d1a8..41093cf20acd 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -36,6 +36,8 @@
 #include <linux/smp.h>
 #include <linux/mm.h>
 
+#include <xen/xen.h>
+
 #include <asm/trace/irq_vectors.h>
 #include <asm/irq_remapping.h>
 #include <asm/pc-conf-reg.h>
@@ -70,7 +72,7 @@ unsigned int num_processors;
 unsigned disabled_cpus;
 
 /* Processor that is doing the boot up */
-unsigned int boot_cpu_physical_apicid __ro_after_init = -1U;
+u32 boot_cpu_physical_apicid __ro_after_init = BAD_APICID;
 EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
 
 u8 boot_cpu_apic_version __ro_after_init;
@@ -85,7 +87,7 @@ physid_mask_t phys_cpu_present_map;
  * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to
  * avoid undefined behaviour caused by sending INIT from AP to BSP.
  */
-static unsigned int disabled_cpu_apicid __ro_after_init = BAD_APICID;
+static u32 disabled_cpu_apicid __ro_after_init = BAD_APICID;
 
 /*
  * This variable controls which CPUs receive external NMIs.  By default,
@@ -109,7 +111,7 @@ static inline bool apic_accessible(void)
 /*
  * Map cpu index to physical APIC ID
  */
-DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID);
 DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, U32_MAX);
 EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
 EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid);
@@ -1763,7 +1765,7 @@ static void __x2apic_enable(void)
 static int __init setup_nox2apic(char *str)
 {
 	if (x2apic_enabled()) {
-		int apicid = native_apic_msr_read(APIC_ID);
+		u32 apicid = native_apic_msr_read(APIC_ID);
 
 		if (apicid >= 255) {
 			pr_warn("Apicid: %08x, cannot enforce nox2apic\n",
@@ -2316,13 +2318,11 @@ static int nr_logical_cpuids = 1;
 /*
  * Used to store mapping between logical CPU IDs and APIC IDs.
  */
-int cpuid_to_apicid[] = {
-	[0 ... NR_CPUS - 1] = -1,
-};
+u32 cpuid_to_apicid[] = { [0 ... NR_CPUS - 1] = BAD_APICID, };
 
 bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
 {
-	return phys_id == cpuid_to_apicid[cpu];
+	return phys_id == (u64)cpuid_to_apicid[cpu];
 }
 
 #ifdef CONFIG_SMP
@@ -2344,6 +2344,15 @@ static int __init smp_init_primary_thread_mask(void)
 {
 	unsigned int cpu;
 
+	/*
+	 * XEN/PV provides either none or useless topology information.
+	 * Pretend that all vCPUs are primary threads.
+	 */
+	if (xen_pv_domain()) {
+		cpumask_copy(&__cpu_primary_thread_mask, cpu_possible_mask);
+		return 0;
+	}
+
 	for (cpu = 0; cpu < nr_logical_cpuids; cpu++)
 		cpu_mark_primary_thread(cpu, cpuid_to_apicid[cpu]);
 	return 0;
@@ -2382,7 +2391,7 @@ static int allocate_logical_cpuid(int apicid)
 	return nr_logical_cpuids++;
 }
 
-static void cpu_update_apic(int cpu, int apicid)
+static void cpu_update_apic(int cpu, u32 apicid)
 {
 #if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
 	early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
@@ -2535,7 +2544,7 @@ static struct {
 	 */
 	int active;
 	/* r/w apic fields */
-	unsigned int apic_id;
+	u32 apic_id;
 	unsigned int apic_taskpri;
 	unsigned int apic_ldr;
 	unsigned int apic_dfr;
diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c
index 7bc5d9bf59cd..8a00141073ea 100644
--- a/arch/x86/kernel/apic/apic_common.c
+++ b/arch/x86/kernel/apic/apic_common.c
@@ -18,7 +18,7 @@ u32 apic_flat_calc_apicid(unsigned int cpu)
 	return 1U << cpu;
 }
 
-bool default_check_apicid_used(physid_mask_t *map, int apicid)
+bool default_check_apicid_used(physid_mask_t *map, u32 apicid)
 {
 	return physid_isset(apicid, *map);
 }
@@ -28,7 +28,7 @@ void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
 	*retmap = *phys_map;
 }
 
-int default_cpu_present_to_apicid(int mps_cpu)
+u32 default_cpu_present_to_apicid(int mps_cpu)
 {
 	if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
 		return (int)per_cpu(x86_cpu_to_apicid, mps_cpu);
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 032a84e2c3cc..37daa3fd6819 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -56,17 +56,17 @@ flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
 	_flat_send_IPI_mask(mask, vector);
 }
 
-static unsigned int flat_get_apic_id(unsigned long x)
+static u32 flat_get_apic_id(u32 x)
 {
 	return (x >> 24) & 0xFF;
 }
 
-static u32 set_apic_id(unsigned int id)
+static u32 set_apic_id(u32 id)
 {
 	return (id & 0xFF) << 24;
 }
 
-static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
+static u32 flat_phys_pkg_id(u32 initial_apic_id, int index_msb)
 {
 	return initial_apic_id >> index_msb;
 }
@@ -158,8 +158,6 @@ static struct apic apic_physflat __ro_after_init = {
 
 	.disable_esr			= 0,
 
-	.check_apicid_used		= NULL,
-	.ioapic_phys_id_map		= NULL,
 	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
 	.phys_pkg_id			= flat_phys_pkg_id,
 
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 966d7cf10b95..b00d52ae84fa 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -18,6 +18,8 @@
 
 #include <asm/apic.h>
 
+#include "local.h"
+
 static void noop_send_IPI(int cpu, int vector) { }
 static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
 static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
@@ -25,10 +27,10 @@ static void noop_send_IPI_allbutself(int vector) { }
 static void noop_send_IPI_all(int vector) { }
 static void noop_send_IPI_self(int vector) { }
 static void noop_apic_icr_write(u32 low, u32 id) { }
-static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip) { return -1; }
+static int noop_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip) { return -1; }
 static u64 noop_apic_icr_read(void) { return 0; }
-static int noop_phys_pkg_id(int cpuid_apic, int index_msb) { return 0; }
-static unsigned int noop_get_apic_id(unsigned long x) { return 0; }
+static u32 noop_phys_pkg_id(u32 cpuid_apic, int index_msb) { return 0; }
+static u32 noop_get_apic_id(u32 apicid) { return 0; }
 static void noop_apic_eoi(void) { }
 
 static u32 noop_apic_read(u32 reg)
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 63f3d7be9dc7..456a14c44f67 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -25,7 +25,7 @@ static const struct apic apic_numachip1;
 static const struct apic apic_numachip2;
 static void (*numachip_apic_icr_write)(int apicid, unsigned int val) __read_mostly;
 
-static unsigned int numachip1_get_apic_id(unsigned long x)
+static u32 numachip1_get_apic_id(u32 x)
 {
 	unsigned long value;
 	unsigned int id = (x >> 24) & 0xff;
@@ -38,12 +38,12 @@ static unsigned int numachip1_get_apic_id(unsigned long x)
 	return id;
 }
 
-static u32 numachip1_set_apic_id(unsigned int id)
+static u32 numachip1_set_apic_id(u32 id)
 {
 	return (id & 0xff) << 24;
 }
 
-static unsigned int numachip2_get_apic_id(unsigned long x)
+static u32 numachip2_get_apic_id(u32 x)
 {
 	u64 mcfg;
 
@@ -51,12 +51,12 @@ static unsigned int numachip2_get_apic_id(unsigned long x)
 	return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24);
 }
 
-static u32 numachip2_set_apic_id(unsigned int id)
+static u32 numachip2_set_apic_id(u32 id)
 {
 	return id << 24;
 }
 
-static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)
+static u32 numachip_phys_pkg_id(u32 initial_apic_id, int index_msb)
 {
 	return initial_apic_id >> index_msb;
 }
@@ -71,7 +71,7 @@ static void numachip2_apic_icr_write(int apicid, unsigned int val)
 	numachip2_write32_lcsr(NUMACHIP2_APIC_ICR, (apicid << 12) | val);
 }
 
-static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+static int numachip_wakeup_secondary(u32 phys_apicid, unsigned long start_rip)
 {
 	numachip_apic_icr_write(phys_apicid, APIC_DM_INIT);
 	numachip_apic_icr_write(phys_apicid, APIC_DM_STARTUP |
@@ -161,7 +161,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
 	u64 val;
 	u32 nodes = 1;
 
-	this_cpu_write(cpu_llc_id, node);
+	c->topo.llc_id = node;
 
 	/* Account for nodes per socket in multi-core-module processors */
 	if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) {
@@ -169,7 +169,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
 		nodes = ((val >> 3) & 7) + 1;
 	}
 
-	c->phys_proc_id = node / nodes;
+	c->topo.pkg_id = node / nodes;
 }
 
 static int __init numachip_system_init(void)
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 0e5535add4b5..7ee3c486cb33 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -13,12 +13,12 @@
 
 #include "local.h"
 
-static unsigned bigsmp_get_apic_id(unsigned long x)
+static u32 bigsmp_get_apic_id(u32 x)
 {
 	return (x >> 24) & 0xFF;
 }
 
-static bool bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
+static bool bigsmp_check_apicid_used(physid_mask_t *map, u32 apicid)
 {
 	return false;
 }
@@ -29,7 +29,7 @@ static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *re
 	physids_promote(0xFFL, retmap);
 }
 
-static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
+static u32 bigsmp_phys_pkg_id(u32 cpuid_apic, int index_msb)
 {
 	return cpuid_apic >> index_msb;
 }
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index a44ba7209ef3..0078730a512e 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -281,7 +281,7 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
 }
 
 #ifdef CONFIG_SMP
-static int convert_apicid_to_cpu(int apic_id)
+static int convert_apicid_to_cpu(u32 apic_id)
 {
 	int i;
 
@@ -294,7 +294,8 @@ static int convert_apicid_to_cpu(int apic_id)
 
 int safe_smp_processor_id(void)
 {
-	int apicid, cpuid;
+	u32 apicid;
+	int cpuid;
 
 	if (!boot_cpu_has(X86_FEATURE_APIC))
 		return 0;
diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h
index ec219c659c7d..9ea6186ea88c 100644
--- a/arch/x86/kernel/apic/local.h
+++ b/arch/x86/kernel/apic/local.h
@@ -15,9 +15,9 @@
 
 /* X2APIC */
 void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest);
-unsigned int x2apic_get_apic_id(unsigned long id);
-u32 x2apic_set_apic_id(unsigned int id);
-int x2apic_phys_pkg_id(int initial_apicid, int index_msb);
+u32 x2apic_get_apic_id(u32 id);
+u32 x2apic_set_apic_id(u32 id);
+u32 x2apic_phys_pkg_id(u32 initial_apicid, int index_msb);
 
 void x2apic_send_IPI_all(int vector);
 void x2apic_send_IPI_allbutself(int vector);
@@ -64,6 +64,7 @@ void default_send_IPI_all(int vector);
 void default_send_IPI_self(int vector);
 
 bool default_apic_id_registered(void);
+bool default_check_apicid_used(physid_mask_t *map, u32 apicid);
 
 #ifdef CONFIG_X86_32
 void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector);
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 6b6b711678fe..d9651f15ae4f 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -55,14 +55,14 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
 	 * caused by the non-atomic update of the address/data pair.
 	 *
 	 * Direct update is possible when:
-	 * - The MSI is maskable (remapped MSI does not use this code path)).
-	 *   The quirk bit is not set in this case.
+	 * - The MSI is maskable (remapped MSI does not use this code path).
+	 *   The reservation mode bit is set in this case.
 	 * - The new vector is the same as the old vector
 	 * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up)
 	 * - The interrupt is not yet started up
 	 * - The new destination CPU is the same as the old destination CPU
 	 */
-	if (!irqd_msi_nomask_quirk(irqd) ||
+	if (!irqd_can_reserve(irqd) ||
 	    cfg->vector == old_cfg.vector ||
 	    old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR ||
 	    !irqd_is_started(irqd) ||
@@ -215,8 +215,6 @@ static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
 		if (WARN_ON_ONCE(domain != real_parent))
 			return false;
 		info->chip->irq_set_affinity = msi_set_affinity;
-		/* See msi_set_affinity() for the gory details */
-		info->flags |= MSI_FLAG_NOMASK_QUIRK;
 		break;
 	case DOMAIN_BUS_DMAR:
 	case DOMAIN_BUS_AMDVI:
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 9a06df6cdd68..5eb3fbe472da 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -18,11 +18,21 @@
 
 #include "local.h"
 
-static int default_phys_pkg_id(int cpuid_apic, int index_msb)
+static u32 default_phys_pkg_id(u32 cpuid_apic, int index_msb)
 {
 	return cpuid_apic >> index_msb;
 }
 
+static u32 default_get_apic_id(u32 x)
+{
+	unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
+
+	if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID))
+		return (x >> 24) & 0xFF;
+	else
+		return (x >> 24) & 0x0F;
+}
+
 /* should be called last. */
 static int probe_default(void)
 {
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 788cdb4ee394..7c9fe28f742f 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -124,17 +124,17 @@ static int x2apic_phys_probe(void)
 	return apic == &apic_x2apic_phys;
 }
 
-unsigned int x2apic_get_apic_id(unsigned long id)
+u32 x2apic_get_apic_id(u32 id)
 {
 	return id;
 }
 
-u32 x2apic_set_apic_id(unsigned int id)
+u32 x2apic_set_apic_id(u32 id)
 {
 	return id;
 }
 
-int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
+u32 x2apic_phys_pkg_id(u32 initial_apicid, int index_msb)
 {
 	return initial_apicid >> index_msb;
 }
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 205cee567629..1b0d7336a28f 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -110,7 +110,7 @@ static void __init early_get_pnodeid(void)
 	} else if (UVH_RH_GAM_ADDR_MAP_CONFIG) {
 		union uvh_rh_gam_addr_map_config_u  m_n_config;
 
-	m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_ADDR_MAP_CONFIG);
+		m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_ADDR_MAP_CONFIG);
 		uv_cpuid.n_skt = m_n_config.s.n_skt;
 		if (is_uv(UV3))
 			uv_cpuid.m_skt = m_n_config.s3.m_skt;
@@ -701,7 +701,7 @@ static __init void build_uv_gr_table(void)
 	}
 }
 
-static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+static int uv_wakeup_secondary(u32 phys_apicid, unsigned long start_rip)
 {
 	unsigned long val;
 	int pnode;
@@ -779,7 +779,7 @@ static void uv_send_IPI_all(int vector)
 	uv_send_IPI_mask(cpu_online_mask, vector);
 }
 
-static u32 set_apic_id(unsigned int id)
+static u32 set_apic_id(u32 id)
 {
 	return id;
 }
@@ -789,7 +789,7 @@ static unsigned int uv_read_apic_id(void)
 	return x2apic_get_apic_id(apic_read(APIC_ID));
 }
 
-static int uv_phys_pkg_id(int initial_apicid, int index_msb)
+static u32 uv_phys_pkg_id(u32 initial_apicid, int index_msb)
 {
 	return uv_read_apic_id() >> index_msb;
 }
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
index faa9f2299848..e9ad518a5003 100644
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -48,11 +48,6 @@ EXPORT_SYMBOL_GPL(__x86_call_count);
 
 extern s32 __call_sites[], __call_sites_end[];
 
-struct thunk_desc {
-	void		*template;
-	unsigned int	template_size;
-};
-
 struct core_text {
 	unsigned long	base;
 	unsigned long	end;
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4350f6bfc064..93eabf544031 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -54,6 +54,8 @@ obj-$(CONFIG_X86_LOCAL_APIC)		+= perfctr-watchdog.o
 obj-$(CONFIG_HYPERVISOR_GUEST)		+= vmware.o hypervisor.o mshyperv.o
 obj-$(CONFIG_ACRN_GUEST)		+= acrn.o
 
+obj-$(CONFIG_DEBUG_FS)			+= debugfs.o
+
 quiet_cmd_mkcapflags = MKCAP   $@
       cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^
 
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index ece2b5b7b0fe..a7eab05e5f29 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -382,7 +382,7 @@ static int nearby_node(int apicid)
 #endif
 
 /*
- * Fix up cpu_core_id for pre-F17h systems to be in the
+ * Fix up topo::core_id for pre-F17h systems to be in the
  * [0 .. cores_per_node - 1] range. Not really needed but
  * kept so as not to break existing setups.
  */
@@ -394,7 +394,7 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
 		return;
 
 	cus_per_node = c->x86_max_cores / nodes_per_socket;
-	c->cpu_core_id %= cus_per_node;
+	c->topo.core_id %= cus_per_node;
 }
 
 /*
@@ -405,8 +405,6 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
  */
 static void amd_get_topology(struct cpuinfo_x86 *c)
 {
-	int cpu = smp_processor_id();
-
 	/* get information required for multi-node processors */
 	if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
 		int err;
@@ -414,13 +412,13 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
 
 		cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
 
-		c->cpu_die_id  = ecx & 0xff;
+		c->topo.die_id  = ecx & 0xff;
 
 		if (c->x86 == 0x15)
-			c->cu_id = ebx & 0xff;
+			c->topo.cu_id = ebx & 0xff;
 
 		if (c->x86 >= 0x17) {
-			c->cpu_core_id = ebx & 0xff;
+			c->topo.core_id = ebx & 0xff;
 
 			if (smp_num_siblings > 1)
 				c->x86_max_cores /= smp_num_siblings;
@@ -434,15 +432,14 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
 		if (!err)
 			c->x86_coreid_bits = get_count_order(c->x86_max_cores);
 
-		cacheinfo_amd_init_llc_id(c, cpu);
+		cacheinfo_amd_init_llc_id(c);
 
 	} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
 		u64 value;
 
 		rdmsrl(MSR_FAM10H_NODE_ID, value);
-		c->cpu_die_id = value & 7;
-
-		per_cpu(cpu_llc_id, cpu) = c->cpu_die_id;
+		c->topo.die_id = value & 7;
+		c->topo.llc_id = c->topo.die_id;
 	} else
 		return;
 
@@ -459,15 +456,14 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
 static void amd_detect_cmp(struct cpuinfo_x86 *c)
 {
 	unsigned bits;
-	int cpu = smp_processor_id();
 
 	bits = c->x86_coreid_bits;
 	/* Low order bits define the core id (index of core in socket) */
-	c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
+	c->topo.core_id = c->topo.initial_apicid & ((1 << bits)-1);
 	/* Convert the initial APIC ID into the socket ID */
-	c->phys_proc_id = c->initial_apicid >> bits;
+	c->topo.pkg_id = c->topo.initial_apicid >> bits;
 	/* use socket ID also for last level cache */
-	per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id;
+	c->topo.llc_id = c->topo.die_id = c->topo.pkg_id;
 }
 
 u32 amd_get_nodes_per_socket(void)
@@ -481,11 +477,11 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
 #ifdef CONFIG_NUMA
 	int cpu = smp_processor_id();
 	int node;
-	unsigned apicid = c->apicid;
+	unsigned apicid = c->topo.apicid;
 
 	node = numa_cpu_node(cpu);
 	if (node == NUMA_NO_NODE)
-		node = get_llc_id(cpu);
+		node = per_cpu_llc_id(cpu);
 
 	/*
 	 * On multi-fabric platform (e.g. Numascale NumaChip) a
@@ -515,7 +511,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
 		 * through CPU mapping may alter the outcome, directly
 		 * access __apicid_to_node[].
 		 */
-		int ht_nodeid = c->initial_apicid;
+		int ht_nodeid = c->topo.initial_apicid;
 
 		if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
 			node = __apicid_to_node[ht_nodeid];
@@ -1014,7 +1010,6 @@ static bool cpu_has_zenbleed_microcode(void)
 
 	default:
 		return false;
-		break;
 	}
 
 	if (boot_cpu_data.microcode < good_rev)
@@ -1044,6 +1039,8 @@ static void zenbleed_check(struct cpuinfo_x86 *c)
 
 static void init_amd(struct cpuinfo_x86 *c)
 {
+	u64 vm_cr;
+
 	early_init_amd(c);
 
 	/*
@@ -1060,7 +1057,7 @@ static void init_amd(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_FSRS);
 
 	/* get apicid instead of initial apic id from cpuid */
-	c->apicid = read_apic_id();
+	c->topo.apicid = read_apic_id();
 
 	/* K6s reports MCEs but don't actually have all the MSRs */
 	if (c->x86 < 6)
@@ -1095,6 +1092,14 @@ static void init_amd(struct cpuinfo_x86 *c)
 
 	init_amd_cacheinfo(c);
 
+	if (cpu_has(c, X86_FEATURE_SVM)) {
+		rdmsrl(MSR_VM_CR, vm_cr);
+		if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) {
+			pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n");
+			clear_cpu_cap(c, X86_FEATURE_SVM);
+		}
+	}
+
 	if (!cpu_has(c, X86_FEATURE_LFENCE_RDTSC) && cpu_has(c, X86_FEATURE_XMM2)) {
 		/*
 		 * Use LFENCE for execution serialization.  On families which
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 10499bcd4e39..bb0ab8466b91 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -63,7 +63,7 @@ EXPORT_SYMBOL_GPL(x86_pred_cmd);
 
 static DEFINE_MUTEX(spec_ctrl_mutex);
 
-void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk;
+void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk;
 
 /* Update SPEC_CTRL MSR and its cached copy unconditionally */
 static void update_spec_ctrl(u64 val)
@@ -717,7 +717,7 @@ void update_gds_msr(void)
 	case GDS_MITIGATION_UCODE_NEEDED:
 	case GDS_MITIGATION_HYPERVISOR:
 		return;
-	};
+	}
 
 	wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
 
@@ -1019,7 +1019,6 @@ static void __init retbleed_select_mitigation(void)
 
 do_cmd_auto:
 	case RETBLEED_CMD_AUTO:
-	default:
 		if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
 		    boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
 			if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY))
@@ -1042,8 +1041,7 @@ do_cmd_auto:
 		setup_force_cpu_cap(X86_FEATURE_RETHUNK);
 		setup_force_cpu_cap(X86_FEATURE_UNRET);
 
-		if (IS_ENABLED(CONFIG_RETHUNK))
-			x86_return_thunk = retbleed_return_thunk;
+		x86_return_thunk = retbleed_return_thunk;
 
 		if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
 		    boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
@@ -1061,7 +1059,8 @@ do_cmd_auto:
 	case RETBLEED_MITIGATION_STUFF:
 		setup_force_cpu_cap(X86_FEATURE_RETHUNK);
 		setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH);
-		x86_set_skl_return_thunk();
+
+		x86_return_thunk = call_depth_return_thunk;
 		break;
 
 	default:
@@ -1290,6 +1289,8 @@ spectre_v2_user_select_mitigation(void)
 
 		spectre_v2_user_ibpb = mode;
 		switch (cmd) {
+		case SPECTRE_V2_USER_CMD_NONE:
+			break;
 		case SPECTRE_V2_USER_CMD_FORCE:
 		case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
 		case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
@@ -1301,8 +1302,6 @@ spectre_v2_user_select_mitigation(void)
 		case SPECTRE_V2_USER_CMD_SECCOMP:
 			static_branch_enable(&switch_mm_cond_ibpb);
 			break;
-		default:
-			break;
 		}
 
 		pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
@@ -2160,6 +2159,10 @@ static int l1d_flush_prctl_get(struct task_struct *task)
 static int ssb_prctl_get(struct task_struct *task)
 {
 	switch (ssb_mode) {
+	case SPEC_STORE_BYPASS_NONE:
+		if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+			return PR_SPEC_ENABLE;
+		return PR_SPEC_NOT_AFFECTED;
 	case SPEC_STORE_BYPASS_DISABLE:
 		return PR_SPEC_DISABLE;
 	case SPEC_STORE_BYPASS_SECCOMP:
@@ -2171,11 +2174,8 @@ static int ssb_prctl_get(struct task_struct *task)
 		if (task_spec_ssb_disable(task))
 			return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
 		return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
-	default:
-		if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
-			return PR_SPEC_ENABLE;
-		return PR_SPEC_NOT_AFFECTED;
 	}
+	BUG();
 }
 
 static int ib_prctl_get(struct task_struct *task)
@@ -2353,6 +2353,8 @@ early_param("l1tf", l1tf_cmdline);
 
 enum srso_mitigation {
 	SRSO_MITIGATION_NONE,
+	SRSO_MITIGATION_UCODE_NEEDED,
+	SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED,
 	SRSO_MITIGATION_MICROCODE,
 	SRSO_MITIGATION_SAFE_RET,
 	SRSO_MITIGATION_IBPB,
@@ -2368,11 +2370,13 @@ enum srso_mitigation_cmd {
 };
 
 static const char * const srso_strings[] = {
-	[SRSO_MITIGATION_NONE]           = "Vulnerable",
-	[SRSO_MITIGATION_MICROCODE]      = "Mitigation: microcode",
-	[SRSO_MITIGATION_SAFE_RET]	 = "Mitigation: safe RET",
-	[SRSO_MITIGATION_IBPB]		 = "Mitigation: IBPB",
-	[SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only"
+	[SRSO_MITIGATION_NONE]			= "Vulnerable",
+	[SRSO_MITIGATION_UCODE_NEEDED]		= "Vulnerable: No microcode",
+	[SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED]	= "Vulnerable: Safe RET, no microcode",
+	[SRSO_MITIGATION_MICROCODE]		= "Vulnerable: Microcode, no safe RET",
+	[SRSO_MITIGATION_SAFE_RET]		= "Mitigation: Safe RET",
+	[SRSO_MITIGATION_IBPB]			= "Mitigation: IBPB",
+	[SRSO_MITIGATION_IBPB_ON_VMEXIT]	= "Mitigation: IBPB on VMEXIT only"
 };
 
 static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE;
@@ -2406,34 +2410,44 @@ static void __init srso_select_mitigation(void)
 {
 	bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE);
 
-	if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off())
-		goto pred_cmd;
+	if (cpu_mitigations_off())
+		return;
 
-	if (!has_microcode) {
-		pr_warn("IBPB-extending microcode not applied!\n");
-		pr_warn(SRSO_NOTICE);
-	} else {
+	if (!boot_cpu_has_bug(X86_BUG_SRSO)) {
+		if (boot_cpu_has(X86_FEATURE_SBPB))
+			x86_pred_cmd = PRED_CMD_SBPB;
+		return;
+	}
+
+	if (has_microcode) {
 		/*
 		 * Zen1/2 with SMT off aren't vulnerable after the right
 		 * IBPB microcode has been applied.
+		 *
+		 * Zen1/2 don't have SBPB, no need to try to enable it here.
 		 */
 		if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) {
 			setup_force_cpu_cap(X86_FEATURE_SRSO_NO);
 			return;
 		}
-	}
 
-	if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
-		if (has_microcode) {
-			pr_err("Retbleed IBPB mitigation enabled, using same for SRSO\n");
+		if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
 			srso_mitigation = SRSO_MITIGATION_IBPB;
-			goto pred_cmd;
+			goto out;
 		}
+	} else {
+		pr_warn("IBPB-extending microcode not applied!\n");
+		pr_warn(SRSO_NOTICE);
+
+		/* may be overwritten by SRSO_CMD_SAFE_RET below */
+		srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED;
 	}
 
 	switch (srso_cmd) {
 	case SRSO_CMD_OFF:
-		goto pred_cmd;
+		if (boot_cpu_has(X86_FEATURE_SBPB))
+			x86_pred_cmd = PRED_CMD_SBPB;
+		return;
 
 	case SRSO_CMD_MICROCODE:
 		if (has_microcode) {
@@ -2458,10 +2472,12 @@ static void __init srso_select_mitigation(void)
 				setup_force_cpu_cap(X86_FEATURE_SRSO);
 				x86_return_thunk = srso_return_thunk;
 			}
-			srso_mitigation = SRSO_MITIGATION_SAFE_RET;
+			if (has_microcode)
+				srso_mitigation = SRSO_MITIGATION_SAFE_RET;
+			else
+				srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED;
 		} else {
 			pr_err("WARNING: kernel not compiled with CPU_SRSO.\n");
-			goto pred_cmd;
 		}
 		break;
 
@@ -2473,7 +2489,6 @@ static void __init srso_select_mitigation(void)
 			}
 		} else {
 			pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n");
-			goto pred_cmd;
 		}
 		break;
 
@@ -2485,20 +2500,12 @@ static void __init srso_select_mitigation(void)
 			}
 		} else {
 			pr_err("WARNING: kernel not compiled with CPU_SRSO.\n");
-			goto pred_cmd;
                 }
 		break;
-
-	default:
-		break;
 	}
 
-	pr_info("%s%s\n", srso_strings[srso_mitigation], (has_microcode ? "" : ", no microcode"));
-
-pred_cmd:
-	if ((boot_cpu_has(X86_FEATURE_SRSO_NO) || srso_cmd == SRSO_CMD_OFF) &&
-	     boot_cpu_has(X86_FEATURE_SBPB))
-		x86_pred_cmd = PRED_CMD_SBPB;
+out:
+	pr_info("%s\n", srso_strings[srso_mitigation]);
 }
 
 #undef pr_fmt
@@ -2704,9 +2711,7 @@ static ssize_t srso_show_state(char *buf)
 	if (boot_cpu_has(X86_FEATURE_SRSO_NO))
 		return sysfs_emit(buf, "Mitigation: SMT disabled\n");
 
-	return sysfs_emit(buf, "%s%s\n",
-			  srso_strings[srso_mitigation],
-			  boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) ? "" : ", no microcode");
+	return sysfs_emit(buf, "%s\n", srso_strings[srso_mitigation]);
 }
 
 static ssize_t gds_show_state(char *buf)
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index 8f86eacf69f7..c131c412db89 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -661,7 +661,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c)
 	return i;
 }
 
-void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu)
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c)
 {
 	/*
 	 * We may have multiple LLCs if L3 caches exist, so check if we
@@ -672,13 +672,13 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu)
 
 	if (c->x86 < 0x17) {
 		/* LLC is at the node level. */
-		per_cpu(cpu_llc_id, cpu) = c->cpu_die_id;
+		c->topo.llc_id = c->topo.die_id;
 	} else if (c->x86 == 0x17 && c->x86_model <= 0x1F) {
 		/*
 		 * LLC is at the core complex level.
 		 * Core complex ID is ApicId[3] for these processors.
 		 */
-		per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+		c->topo.llc_id = c->topo.apicid >> 3;
 	} else {
 		/*
 		 * LLC ID is calculated from the number of threads sharing the
@@ -694,12 +694,12 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu)
 		if (num_sharing_cache) {
 			int bits = get_count_order(num_sharing_cache);
 
-			per_cpu(cpu_llc_id, cpu) = c->apicid >> bits;
+			c->topo.llc_id = c->topo.apicid >> bits;
 		}
 	}
 }
 
-void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu)
+void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c)
 {
 	/*
 	 * We may have multiple LLCs if L3 caches exist, so check if we
@@ -712,7 +712,7 @@ void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu)
 	 * LLC is at the core complex level.
 	 * Core complex ID is ApicId[3] for these processors.
 	 */
-	per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+	c->topo.llc_id = c->topo.apicid >> 3;
 }
 
 void init_amd_cacheinfo(struct cpuinfo_x86 *c)
@@ -740,9 +740,6 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
 	unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
 	unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
 	unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
-#ifdef CONFIG_SMP
-	unsigned int cpu = c->cpu_index;
-#endif
 
 	if (c->cpuid_level > 3) {
 		static int is_initialized;
@@ -776,13 +773,13 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
 				new_l2 = this_leaf.size/1024;
 				num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
 				index_msb = get_count_order(num_threads_sharing);
-				l2_id = c->apicid & ~((1 << index_msb) - 1);
+				l2_id = c->topo.apicid & ~((1 << index_msb) - 1);
 				break;
 			case 3:
 				new_l3 = this_leaf.size/1024;
 				num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
 				index_msb = get_count_order(num_threads_sharing);
-				l3_id = c->apicid & ~((1 << index_msb) - 1);
+				l3_id = c->topo.apicid & ~((1 << index_msb) - 1);
 				break;
 			default:
 				break;
@@ -856,30 +853,24 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
 
 	if (new_l2) {
 		l2 = new_l2;
-#ifdef CONFIG_SMP
-		per_cpu(cpu_llc_id, cpu) = l2_id;
-		per_cpu(cpu_l2c_id, cpu) = l2_id;
-#endif
+		c->topo.llc_id = l2_id;
+		c->topo.l2c_id = l2_id;
 	}
 
 	if (new_l3) {
 		l3 = new_l3;
-#ifdef CONFIG_SMP
-		per_cpu(cpu_llc_id, cpu) = l3_id;
-#endif
+		c->topo.llc_id = l3_id;
 	}
 
-#ifdef CONFIG_SMP
 	/*
-	 * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in
+	 * If llc_id is not yet set, this means cpuid_level < 4 which in
 	 * turns means that the only possibility is SMT (as indicated in
 	 * cpuid1). Since cpuid2 doesn't specify shared caches, and we know
-	 * that SMT shares all caches, we can unconditionally set cpu_llc_id to
+	 * that SMT shares all caches, we can unconditionally set llc_id to
-	 * c->phys_proc_id.
+	 * c->topo.pkg_id.
 	 */
-	if (per_cpu(cpu_llc_id, cpu) == BAD_APICID)
-		per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
-#endif
+	if (c->topo.llc_id == BAD_APICID)
+		c->topo.llc_id = c->topo.pkg_id;
 
 	c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
 
@@ -915,7 +906,7 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
 		unsigned int apicid, nshared, first, last;
 
 		nshared = base->eax.split.num_threads_sharing + 1;
-		apicid = cpu_data(cpu).apicid;
+		apicid = cpu_data(cpu).topo.apicid;
 		first = apicid - (apicid % nshared);
 		last = first + nshared - 1;
 
@@ -924,14 +915,14 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
 			if (!this_cpu_ci->info_list)
 				continue;
 
-			apicid = cpu_data(i).apicid;
+			apicid = cpu_data(i).topo.apicid;
 			if ((apicid < first) || (apicid > last))
 				continue;
 
 			this_leaf = this_cpu_ci->info_list + index;
 
 			for_each_online_cpu(sibling) {
-				apicid = cpu_data(sibling).apicid;
+				apicid = cpu_data(sibling).topo.apicid;
 				if ((apicid < first) || (apicid > last))
 					continue;
 				cpumask_set_cpu(sibling,
@@ -969,7 +960,7 @@ static void __cache_cpumap_setup(unsigned int cpu, int index,
 	index_msb = get_count_order(num_threads_sharing);
 
 	for_each_online_cpu(i)
-		if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) {
+		if (cpu_data(i).topo.apicid >> index_msb == c->topo.apicid >> index_msb) {
 			struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i);
 
 			if (i == cpu || !sib_cpu_ci->info_list)
@@ -1024,7 +1015,7 @@ static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs)
 
 	num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing;
 	index_msb = get_count_order(num_threads_sharing);
-	id4_regs->id = c->apicid >> index_msb;
+	id4_regs->id = c->topo.apicid >> index_msb;
 }
 
 int populate_cache_leaves(unsigned int cpu)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4e5ffc8b0e46..5d9591146244 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -62,6 +62,7 @@
 #include <asm/intel-family.h>
 #include <asm/cpu_device_id.h>
 #include <asm/uv/uv.h>
+#include <asm/ia32.h>
 #include <asm/set_memory.h>
 #include <asm/traps.h>
 #include <asm/sev.h>
@@ -74,18 +75,6 @@ u32 elf_hwcap2 __read_mostly;
 int smp_num_siblings = 1;
 EXPORT_SYMBOL(smp_num_siblings);
 
-/* Last level cache ID of each logical CPU */
-DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
-
-u16 get_llc_id(unsigned int cpu)
-{
-	return per_cpu(cpu_llc_id, cpu);
-}
-EXPORT_SYMBOL_GPL(get_llc_id);
-
-/* L2 cache ID of each logical CPU */
-DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id) = BAD_APICID;
-
 static struct ppin_info {
 	int	feature;
 	int	msr_ppin_ctl;
@@ -914,7 +903,7 @@ void detect_ht(struct cpuinfo_x86 *c)
 		return;
 
 	index_msb = get_count_order(smp_num_siblings);
-	c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
+	c->topo.pkg_id = apic->phys_pkg_id(c->topo.initial_apicid, index_msb);
 
 	smp_num_siblings = smp_num_siblings / c->x86_max_cores;
 
@@ -922,8 +911,8 @@ void detect_ht(struct cpuinfo_x86 *c)
 
 	core_bits = get_count_order(c->x86_max_cores);
 
-	c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
-				       ((1 << core_bits) - 1);
+	c->topo.core_id = apic->phys_pkg_id(c->topo.initial_apicid, index_msb) &
+		((1 << core_bits) - 1);
 #endif
 }
 
@@ -1114,18 +1103,34 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
 void get_cpu_address_sizes(struct cpuinfo_x86 *c)
 {
 	u32 eax, ebx, ecx, edx;
+	bool vp_bits_from_cpuid = true;
 
-	if (c->extended_cpuid_level >= 0x80000008) {
+	if (!cpu_has(c, X86_FEATURE_CPUID) ||
+	    (c->extended_cpuid_level < 0x80000008))
+		vp_bits_from_cpuid = false;
+
+	if (vp_bits_from_cpuid) {
 		cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
 
 		c->x86_virt_bits = (eax >> 8) & 0xff;
 		c->x86_phys_bits = eax & 0xff;
+	} else {
+		if (IS_ENABLED(CONFIG_X86_64)) {
+			c->x86_clflush_size = 64;
+			c->x86_phys_bits = 36;
+			c->x86_virt_bits = 48;
+		} else {
+			c->x86_clflush_size = 32;
+			c->x86_virt_bits = 32;
+			c->x86_phys_bits = 32;
+
+			if (cpu_has(c, X86_FEATURE_PAE) ||
+			    cpu_has(c, X86_FEATURE_PSE36))
+				c->x86_phys_bits = 36;
+		}
 	}
-#ifdef CONFIG_X86_32
-	else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
-		c->x86_phys_bits = 36;
-#endif
 	c->x86_cache_bits = c->x86_phys_bits;
+	c->x86_cache_alignment = c->x86_clflush_size;
 }
 
 static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@@ -1579,17 +1584,6 @@ static void __init cpu_parse_early_param(void)
  */
 static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 {
-#ifdef CONFIG_X86_64
-	c->x86_clflush_size = 64;
-	c->x86_phys_bits = 36;
-	c->x86_virt_bits = 48;
-#else
-	c->x86_clflush_size = 32;
-	c->x86_phys_bits = 32;
-	c->x86_virt_bits = 32;
-#endif
-	c->x86_cache_alignment = c->x86_clflush_size;
-
 	memset(&c->x86_capability, 0, sizeof(c->x86_capability));
 	c->extended_cpuid_level = 0;
 
@@ -1601,7 +1595,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 		cpu_detect(c);
 		get_cpu_vendor(c);
 		get_cpu_cap(c);
-		get_cpu_address_sizes(c);
 		setup_force_cpu_cap(X86_FEATURE_CPUID);
 		cpu_parse_early_param();
 
@@ -1617,6 +1610,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 		setup_clear_cpu_cap(X86_FEATURE_CPUID);
 	}
 
+	get_cpu_address_sizes(c);
+
 	setup_force_cpu_cap(X86_FEATURE_ALWAYS);
 
 	cpu_set_bug_bits(c);
@@ -1761,15 +1756,15 @@ static void generic_identify(struct cpuinfo_x86 *c)
 	get_cpu_address_sizes(c);
 
 	if (c->cpuid_level >= 0x00000001) {
-		c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
+		c->topo.initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
 #ifdef CONFIG_X86_32
 # ifdef CONFIG_SMP
-		c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+		c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0);
 # else
-		c->apicid = c->initial_apicid;
+		c->topo.apicid = c->topo.initial_apicid;
 # endif
 #endif
-		c->phys_proc_id = c->initial_apicid;
+		c->topo.pkg_id = c->topo.initial_apicid;
 	}
 
 	get_model_name(c); /* Default name */
@@ -1799,18 +1794,19 @@ static void generic_identify(struct cpuinfo_x86 *c)
 static void validate_apic_and_package_id(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
-	unsigned int apicid, cpu = smp_processor_id();
+	unsigned int cpu = smp_processor_id();
+	u32 apicid;
 
 	apicid = apic->cpu_present_to_apicid(cpu);
 
-	if (apicid != c->apicid) {
+	if (apicid != c->topo.apicid) {
 		pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n",
-		       cpu, apicid, c->initial_apicid);
+		       cpu, apicid, c->topo.initial_apicid);
 	}
-	BUG_ON(topology_update_package_map(c->phys_proc_id, cpu));
-	BUG_ON(topology_update_die_map(c->cpu_die_id, cpu));
+	BUG_ON(topology_update_package_map(c->topo.pkg_id, cpu));
+	BUG_ON(topology_update_die_map(c->topo.die_id, cpu));
 #else
-	c->logical_proc_id = 0;
+	c->topo.logical_pkg_id = 0;
 #endif
 }
 
@@ -1829,7 +1825,9 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 	c->x86_model_id[0] = '\0';  /* Unset */
 	c->x86_max_cores = 1;
 	c->x86_coreid_bits = 0;
-	c->cu_id = 0xff;
+	c->topo.cu_id = 0xff;
+	c->topo.llc_id = BAD_APICID;
+	c->topo.l2c_id = BAD_APICID;
 #ifdef CONFIG_X86_64
 	c->x86_clflush_size = 64;
 	c->x86_phys_bits = 36;
@@ -1855,7 +1853,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 	apply_forced_caps(c);
 
 #ifdef CONFIG_X86_64
-	c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+	c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0);
 #endif
 
 	/*
@@ -2074,24 +2072,24 @@ void syscall_init(void)
 	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
 	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
 
-#ifdef CONFIG_IA32_EMULATION
-	wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
-	/*
-	 * This only works on Intel CPUs.
-	 * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
-	 * This does not cause SYSENTER to jump to the wrong location, because
-	 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
-	 */
-	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
-	wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
-		    (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
-	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
-#else
-	wrmsrl_cstar((unsigned long)ignore_sysret);
-	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
-	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
-	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
-#endif
+	if (ia32_enabled()) {
+		wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
+		/*
+		 * This only works on Intel CPUs.
+		 * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
+		 * This does not cause SYSENTER to jump to the wrong location, because
+		 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
+		 */
+		wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+		wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
+			    (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
+		wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
+	} else {
+		wrmsrl_cstar((unsigned long)entry_SYSCALL32_ignore);
+		wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
+		wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+		wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
+	}
 
 	/*
 	 * Flags to clear on syscall; clear as much as possible
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 1dcd7d4e38ef..885281ae79a5 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -78,6 +78,9 @@ extern int detect_ht_early(struct cpuinfo_x86 *c);
 extern void detect_ht(struct cpuinfo_x86 *c);
 extern void check_null_seg_clears_base(struct cpuinfo_x86 *c);
 
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c);
+void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c);
+
 unsigned int aperfmperf_get_khz(int cpu);
 void cpu_select_mitigations(void);
 
diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c
new file mode 100644
index 000000000000..0c179d684b3b
--- /dev/null
+++ b/arch/x86/kernel/cpu/debugfs.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/debugfs.h>
+
+#include <asm/apic.h>
+#include <asm/processor.h>
+
+/*
+ * Show the topology information stored in cpu_info for one CPU. The CPU
+ * number is passed in the seq_file private pointer.
+ */
+static int cpu_debug_show(struct seq_file *m, void *p)
+{
+	unsigned long cpu = (unsigned long)m->private;
+	struct cpuinfo_x86 *c = per_cpu_ptr(&cpu_info, cpu);
+
+	seq_printf(m, "online:              %d\n", cpu_online(cpu));
+	/* Nothing more to report until this CPU's cpu_info is marked initialized */
+	if (!c->initialized)
+		return 0;
+
+	seq_printf(m, "initial_apicid:      %x\n", c->topo.initial_apicid);
+	seq_printf(m, "apicid:              %x\n", c->topo.apicid);
+	seq_printf(m, "pkg_id:              %u\n", c->topo.pkg_id);
+	seq_printf(m, "die_id:              %u\n", c->topo.die_id);
+	seq_printf(m, "cu_id:               %u\n", c->topo.cu_id);
+	seq_printf(m, "core_id:             %u\n", c->topo.core_id);
+	seq_printf(m, "logical_pkg_id:      %u\n", c->topo.logical_pkg_id);
+	seq_printf(m, "logical_die_id:      %u\n", c->topo.logical_die_id);
+	seq_printf(m, "llc_id:              %u\n", c->topo.llc_id);
+	seq_printf(m, "l2c_id:              %u\n", c->topo.l2c_id);
+	seq_printf(m, "max_cores:           %u\n", c->x86_max_cores);
+	seq_printf(m, "max_die_per_pkg:     %u\n", __max_die_per_package);
+	seq_printf(m, "smp_num_siblings:    %u\n", smp_num_siblings);
+	return 0;
+}
+
+/* Pass the inode's private data (the CPU number set at file creation) to cpu_debug_show() */
+static int cpu_debug_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, cpu_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_cpu_ops = {
+	.open		= cpu_debug_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+/*
+ * Create topo/cpus/<N> under arch_debugfs_dir for every possible CPU. The
+ * CPU number is stored as the file's private data.
+ */
+static __init int cpu_init_debugfs(void)
+{
+	struct dentry *dir, *base = debugfs_create_dir("topo", arch_debugfs_dir);
+	unsigned long id;
+	char name[24];
+
+	dir = debugfs_create_dir("cpus", base);
+	for_each_possible_cpu(id) {
+		/* Bound the write to the buffer instead of relying on its size */
+		snprintf(name, sizeof(name), "%lu", id);
+		debugfs_create_file(name, 0444, dir, (void *)id, &dfs_cpu_ops);
+	}
+	return 0;
+}
+late_initcall(cpu_init_debugfs);
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index defdc594be14..6f247d66758d 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -63,8 +63,6 @@ static void hygon_get_topology_early(struct cpuinfo_x86 *c)
  */
 static void hygon_get_topology(struct cpuinfo_x86 *c)
 {
-	int cpu = smp_processor_id();
-
 	/* get information required for multi-node processors */
 	if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
 		int err;
@@ -72,9 +70,9 @@ static void hygon_get_topology(struct cpuinfo_x86 *c)
 
 		cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
 
-		c->cpu_die_id  = ecx & 0xff;
+		c->topo.die_id  = ecx & 0xff;
 
-		c->cpu_core_id = ebx & 0xff;
+		c->topo.core_id = ebx & 0xff;
 
 		if (smp_num_siblings > 1)
 			c->x86_max_cores /= smp_num_siblings;
@@ -87,17 +85,20 @@ static void hygon_get_topology(struct cpuinfo_x86 *c)
 		if (!err)
 			c->x86_coreid_bits = get_count_order(c->x86_max_cores);
 
-		/* Socket ID is ApicId[6] for these processors. */
-		c->phys_proc_id = c->apicid >> APICID_SOCKET_ID_BIT;
+		/*
+		 * Socket ID is ApicId[6] for the processors with model <= 0x3
+		 * when running on host.
+		 */
+		if (!boot_cpu_has(X86_FEATURE_HYPERVISOR) && c->x86_model <= 0x3)
+			c->topo.pkg_id = c->topo.apicid >> APICID_SOCKET_ID_BIT;
 
-		cacheinfo_hygon_init_llc_id(c, cpu);
+		cacheinfo_hygon_init_llc_id(c);
 	} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
 		u64 value;
 
 		rdmsrl(MSR_FAM10H_NODE_ID, value);
-		c->cpu_die_id = value & 7;
-
-		per_cpu(cpu_llc_id, cpu) = c->cpu_die_id;
+		c->topo.die_id = value & 7;
+		c->topo.llc_id = c->topo.die_id;
 	} else
 		return;
 
@@ -112,15 +113,14 @@ static void hygon_get_topology(struct cpuinfo_x86 *c)
 static void hygon_detect_cmp(struct cpuinfo_x86 *c)
 {
 	unsigned int bits;
-	int cpu = smp_processor_id();
 
 	bits = c->x86_coreid_bits;
 	/* Low order bits define the core id (index of core in socket) */
-	c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
+	c->topo.core_id = c->topo.initial_apicid & ((1 << bits)-1);
 	/* Convert the initial APIC ID into the socket ID */
-	c->phys_proc_id = c->initial_apicid >> bits;
-	/* use socket ID also for last level cache */
-	per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id;
+	c->topo.pkg_id = c->topo.initial_apicid >> bits;
+	/* Use package ID also for last level cache */
+	c->topo.llc_id = c->topo.die_id = c->topo.pkg_id;
 }
 
 static void srat_detect_node(struct cpuinfo_x86 *c)
@@ -128,11 +128,11 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
 #ifdef CONFIG_NUMA
 	int cpu = smp_processor_id();
 	int node;
-	unsigned int apicid = c->apicid;
+	unsigned int apicid = c->topo.apicid;
 
 	node = numa_cpu_node(cpu);
 	if (node == NUMA_NO_NODE)
-		node = per_cpu(cpu_llc_id, cpu);
+		node = c->topo.llc_id;
 
 	/*
 	 * On multi-fabric platform (e.g. Numascale NumaChip) a
@@ -161,7 +161,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
 		 * through CPU mapping may alter the outcome, directly
 		 * access __apicid_to_node[].
 		 */
-		int ht_nodeid = c->initial_apicid;
+		int ht_nodeid = c->topo.initial_apicid;
 
 		if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
 			node = __apicid_to_node[ht_nodeid];
@@ -290,6 +290,8 @@ static void early_init_hygon(struct cpuinfo_x86 *c)
 
 static void init_hygon(struct cpuinfo_x86 *c)
 {
+	u64 vm_cr;
+
 	early_init_hygon(c);
 
 	/*
@@ -301,7 +303,7 @@ static void init_hygon(struct cpuinfo_x86 *c)
 	set_cpu_cap(c, X86_FEATURE_REP_GOOD);
 
 	/* get apicid instead of initial apic id from cpuid */
-	c->apicid = read_apic_id();
+	c->topo.apicid = read_apic_id();
 
 	/*
 	 * XXX someone from Hygon needs to confirm this DTRT
@@ -320,6 +322,14 @@ static void init_hygon(struct cpuinfo_x86 *c)
 
 	init_hygon_cacheinfo(c);
 
+	if (cpu_has(c, X86_FEATURE_SVM)) {
+		rdmsrl(MSR_VM_CR, vm_cr);
+		if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) {
+			pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n");
+			clear_cpu_cap(c, X86_FEATURE_SVM);
+		}
+	}
+
 	if (cpu_has(c, X86_FEATURE_XMM2)) {
 		/*
 		 * Use LFENCE for execution serialization.  On families which
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index be4045628fd3..55efadb0e998 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -314,19 +314,6 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 		setup_clear_cpu_cap(X86_FEATURE_PGE);
 	}
 
-	if (c->cpuid_level >= 0x00000001) {
-		u32 eax, ebx, ecx, edx;
-
-		cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
-		/*
-		 * If HTT (EDX[28]) is set EBX[16:23] contain the number of
-		 * apicids which are reserved per package. Store the resulting
-		 * shift value for the package management code.
-		 */
-		if (edx & (1U << 28))
-			c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
-	}
-
 	check_memory_type_self_snoop_errata(c);
 
 	/*
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index c267f43de39e..f3517b8a8e91 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -713,17 +713,75 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
 		deferred_error_interrupt_enable(c);
 }
 
-bool amd_mce_is_memory_error(struct mce *m)
+/*
+ * DRAM ECC errors are reported in the Northbridge (bank 4) with
+ * Extended Error Code 8.
+ */
+static bool legacy_mce_is_memory_error(struct mce *m)
+{
+	return m->bank == 4 && XEC(m->status, 0x1f) == 8;
+}
+
+/*
+ * DRAM ECC errors are reported in Unified Memory Controllers with
+ * Extended Error Code 0.
+ */
+static bool smca_mce_is_memory_error(struct mce *m)
 {
 	enum smca_bank_types bank_type;
-	/* ErrCodeExt[20:16] */
-	u8 xec = (m->status >> 16) & 0x1f;
+
+	if (XEC(m->status, 0x3f))
+		return false;
 
 	bank_type = smca_get_bank_type(m->extcpu, m->bank);
+
+	return bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2;
+}
+
+bool amd_mce_is_memory_error(struct mce *m)
+{
 	if (mce_flags.smca)
-		return (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) && xec == 0x0;
+		return smca_mce_is_memory_error(m);
+	else
+		return legacy_mce_is_memory_error(m);
+}
+
+/*
+ * AMD systems do not have an explicit indicator that the value in MCA_ADDR is
+ * a system physical address. Therefore, individual cases need to be detected.
+ * Future cases and checks will be added as needed.
+ *
+ * 1) General case
+ *	a) Assume address is not usable.
+ * 2) Poison errors
+ *	a) Indicated by MCA_STATUS[43]: poison. Defined for all banks except legacy
+ *	   northbridge (bank 4).
+ *	b) Refers to poison consumption in the core. Does not include "no action",
+ *	   "action optional", or "deferred" error severities.
+ *	c) Will include a usable address so that immediate action can be taken.
+ * 3) Northbridge DRAM ECC errors
+ *	a) Reported in legacy bank 4 with extended error code (XEC) 8.
+ *	b) MCA_STATUS[43] is *not* defined as poison in legacy bank 4. Therefore,
+ *	   this bit should not be checked.
+ *
+ * NOTE: SMCA UMC memory errors fall into case #1.
+ */
+bool amd_mce_usable_address(struct mce *m)
+{
+	/* Check special northbridge case 3) first. */
+	if (!mce_flags.smca) {
+		if (legacy_mce_is_memory_error(m))
+			return true;
+		else if (m->bank == 4)
+			return false;
+	}
 
-	return m->bank == 4 && xec == 0x8;
+	/* Check poison bit for all other bank types. */
+	if (m->status & MCI_STATUS_POISON)
+		return true;
+
+	/* Assume address is not usable for all others. */
+	return false;
 }
 
 static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
index 8ed341714686..7f7309ff67d0 100644
--- a/arch/x86/kernel/cpu/mce/apei.c
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -103,9 +103,9 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
 	m.socketid = -1;
 
 	for_each_possible_cpu(cpu) {
-		if (cpu_data(cpu).initial_apicid == lapic_id) {
+		if (cpu_data(cpu).topo.initial_apicid == lapic_id) {
 			m.extcpu = cpu;
-			m.socketid = cpu_data(m.extcpu).phys_proc_id;
+			m.socketid = cpu_data(m.extcpu).topo.pkg_id;
 			break;
 		}
 	}
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 6f35f724cc14..7b397370b4d6 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -123,8 +123,8 @@ void mce_setup(struct mce *m)
 	m->time = __ktime_get_real_seconds();
 	m->cpuvendor = boot_cpu_data.x86_vendor;
 	m->cpuid = cpuid_eax(1);
-	m->socketid = cpu_data(m->extcpu).phys_proc_id;
-	m->apicid = cpu_data(m->extcpu).initial_apicid;
+	m->socketid = cpu_data(m->extcpu).topo.pkg_id;
+	m->apicid = cpu_data(m->extcpu).topo.initial_apicid;
 	m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
 	m->ppin = cpu_data(m->extcpu).ppin;
 	m->microcode = boot_cpu_data.microcode;
@@ -453,32 +453,22 @@ static void mce_irq_work_cb(struct irq_work *entry)
 	mce_schedule_work();
 }
 
-/*
- * Check if the address reported by the CPU is in a format we can parse.
- * It would be possible to add code for most other cases, but all would
- * be somewhat complicated (e.g. segment offset would require an instruction
- * parser). So only support physical addresses up to page granularity for now.
- */
-int mce_usable_address(struct mce *m)
+bool mce_usable_address(struct mce *m)
 {
 	if (!(m->status & MCI_STATUS_ADDRV))
-		return 0;
-
-	/* Checks after this one are Intel/Zhaoxin-specific: */
-	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
-	    boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
-		return 1;
-
-	if (!(m->status & MCI_STATUS_MISCV))
-		return 0;
+		return false;
 
-	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
-		return 0;
+	switch (m->cpuvendor) {
+	case X86_VENDOR_AMD:
+		return amd_mce_usable_address(m);
 
-	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
-		return 0;
+	case X86_VENDOR_INTEL:
+	case X86_VENDOR_ZHAOXIN:
+		return intel_mce_usable_address(m);
 
-	return 1;
+	default:
+		return true;
+	}
 }
 EXPORT_SYMBOL_GPL(mce_usable_address);
 
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index f5323551c1a9..52bce533ddcc 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -536,3 +536,23 @@ bool intel_filter_mce(struct mce *m)
 
 	return false;
 }
+
+/*
+ * Check if the address reported by the CPU is in a format we can parse.
+ * It would be possible to add code for most other cases, but all would
+ * be somewhat complicated (e.g. segment offset would require an instruction
+ * parser). So only support physical addresses up to page granularity for now.
+ */
+bool intel_mce_usable_address(struct mce *m)
+{
+	if (!(m->status & MCI_STATUS_MISCV))
+		return false;
+
+	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
+		return false;
+
+	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
+		return false;
+
+	return true;
+}
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index bcf1b3c66c9c..e13a26c9c0ac 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -49,6 +49,7 @@ void intel_init_cmci(void);
 void intel_init_lmce(void);
 void intel_clear_lmce(void);
 bool intel_filter_mce(struct mce *m);
+bool intel_mce_usable_address(struct mce *m);
 #else
 # define cmci_intel_adjust_timer mce_adjust_timer_default
 static inline bool mce_intel_cmci_poll(void) { return false; }
@@ -58,6 +59,7 @@ static inline void intel_init_cmci(void) { }
 static inline void intel_init_lmce(void) { }
 static inline void intel_clear_lmce(void) { }
 static inline bool intel_filter_mce(struct mce *m) { return false; }
+static inline bool intel_mce_usable_address(struct mce *m) { return false; }
 #endif
 
 void mce_timer_kick(unsigned long interval);
@@ -210,6 +212,7 @@ extern bool filter_mce(struct mce *m);
 
 #ifdef CONFIG_X86_MCE_AMD
 extern bool amd_filter_mce(struct mce *m);
+bool amd_mce_usable_address(struct mce *m);
 
 /*
  * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits
@@ -237,6 +240,7 @@ static __always_inline void smca_extract_err_addr(struct mce *m)
 
 #else
 static inline bool amd_filter_mce(struct mce *m) { return false; }
+static inline bool amd_mce_usable_address(struct mce *m) { return false; }
 static inline void smca_extract_err_addr(struct mce *m) { }
 #endif
 
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 31c0e68f6227..e65fae63660e 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -20,13 +20,13 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
 			      unsigned int cpu)
 {
 #ifdef CONFIG_SMP
-	seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
+	seq_printf(m, "physical id\t: %d\n", c->topo.pkg_id);
 	seq_printf(m, "siblings\t: %d\n",
 		   cpumask_weight(topology_core_cpumask(cpu)));
-	seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
+	seq_printf(m, "core id\t\t: %d\n", c->topo.core_id);
 	seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
-	seq_printf(m, "apicid\t\t: %d\n", c->apicid);
-	seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
+	seq_printf(m, "apicid\t\t: %d\n", c->topo.apicid);
+	seq_printf(m, "initial apicid\t: %d\n", c->topo.initial_apicid);
 #endif
 }
 
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 030d3b409768..19e0681f0435 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -152,6 +152,7 @@ static inline void cache_alloc_hsw_probe(void)
 	r->cache.cbm_len = 20;
 	r->cache.shareable_bits = 0xc0000;
 	r->cache.min_cbm_bits = 2;
+	r->cache.arch_has_sparse_bitmasks = false;
 	r->alloc_capable = true;
 
 	rdt_alloc_capable = true;
@@ -267,15 +268,18 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
 {
 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
 	union cpuid_0x10_1_eax eax;
+	union cpuid_0x10_x_ecx ecx;
 	union cpuid_0x10_x_edx edx;
-	u32 ebx, ecx;
+	u32 ebx;
 
-	cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full);
+	cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx.full, &edx.full);
 	hw_res->num_closid = edx.split.cos_max + 1;
 	r->cache.cbm_len = eax.split.cbm_len + 1;
 	r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
 	r->cache.shareable_bits = ebx & r->default_ctrl;
 	r->data_width = (r->cache.cbm_len + 3) / 4;
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+		r->cache.arch_has_sparse_bitmasks = ecx.split.noncont;
 	r->alloc_capable = true;
 }
 
@@ -872,7 +876,6 @@ static __init void rdt_init_res_defs_intel(void)
 
 		if (r->rid == RDT_RESOURCE_L3 ||
 		    r->rid == RDT_RESOURCE_L2) {
-			r->cache.arch_has_sparse_bitmaps = false;
 			r->cache.arch_has_per_cpu_cfg = false;
 			r->cache.min_cbm_bits = 1;
 		} else if (r->rid == RDT_RESOURCE_MBA) {
@@ -892,7 +895,7 @@ static __init void rdt_init_res_defs_amd(void)
 
 		if (r->rid == RDT_RESOURCE_L3 ||
 		    r->rid == RDT_RESOURCE_L2) {
-			r->cache.arch_has_sparse_bitmaps = true;
+			r->cache.arch_has_sparse_bitmasks = true;
 			r->cache.arch_has_per_cpu_cfg = true;
 			r->cache.min_cbm_bits = 0;
 		} else if (r->rid == RDT_RESOURCE_MBA) {
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index b44c487727d4..beccb0e87ba7 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -87,10 +87,12 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
 
 /*
  * Check whether a cache bit mask is valid.
- * For Intel the SDM says:
- *	Please note that all (and only) contiguous '1' combinations
- *	are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.).
- * Additionally Haswell requires at least two bits set.
+ * On Intel CPUs, support for non-contiguous 1s in a mask is reported by CPUID:
+ *   - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1
+ *   - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1
+ *
+ * Haswell does not support a non-contiguous 1s value and additionally
+ * requires at least two bits set.
  * AMD allows non-contiguous bitmasks.
  */
 static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
@@ -113,8 +115,8 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
 	first_bit = find_first_bit(&val, cbm_len);
 	zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
 
-	/* Are non-contiguous bitmaps allowed? */
-	if (!r->cache.arch_has_sparse_bitmaps &&
+	/* Are non-contiguous bitmasks allowed? */
+	if (!r->cache.arch_has_sparse_bitmasks &&
 	    (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) {
 		rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val);
 		return false;
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 85ceaf9a31ac..a4f1aa15f0a2 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -59,6 +59,7 @@ struct rdt_fs_context {
 	bool				enable_cdpl2;
 	bool				enable_cdpl3;
 	bool				enable_mba_mbps;
+	bool				enable_debug;
 };
 
 static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
@@ -243,18 +244,17 @@ struct rdtgroup {
  */
 #define RFTYPE_INFO			BIT(0)
 #define RFTYPE_BASE			BIT(1)
-#define RF_CTRLSHIFT			4
-#define RF_MONSHIFT			5
-#define RF_TOPSHIFT			6
-#define RFTYPE_CTRL			BIT(RF_CTRLSHIFT)
-#define RFTYPE_MON			BIT(RF_MONSHIFT)
-#define RFTYPE_TOP			BIT(RF_TOPSHIFT)
+#define RFTYPE_CTRL			BIT(4)
+#define RFTYPE_MON			BIT(5)
+#define RFTYPE_TOP			BIT(6)
 #define RFTYPE_RES_CACHE		BIT(8)
 #define RFTYPE_RES_MB			BIT(9)
-#define RF_CTRL_INFO			(RFTYPE_INFO | RFTYPE_CTRL)
-#define RF_MON_INFO			(RFTYPE_INFO | RFTYPE_MON)
-#define RF_TOP_INFO			(RFTYPE_INFO | RFTYPE_TOP)
-#define RF_CTRL_BASE			(RFTYPE_BASE | RFTYPE_CTRL)
+#define RFTYPE_DEBUG			BIT(10)
+#define RFTYPE_CTRL_INFO		(RFTYPE_INFO | RFTYPE_CTRL)
+#define RFTYPE_MON_INFO			(RFTYPE_INFO | RFTYPE_MON)
+#define RFTYPE_TOP_INFO			(RFTYPE_INFO | RFTYPE_TOP)
+#define RFTYPE_CTRL_BASE		(RFTYPE_BASE | RFTYPE_CTRL)
+#define RFTYPE_MON_BASE			(RFTYPE_BASE | RFTYPE_MON)
 
 /* List of all resource groups */
 extern struct list_head rdt_all_groups;
@@ -270,7 +270,7 @@ void __exit rdtgroup_exit(void);
  * @mode:	Access mode
  * @kf_ops:	File operations
  * @flags:	File specific RFTYPE_FLAGS_* flags
- * @fflags:	File specific RF_* or RFTYPE_* flags
+ * @fflags:	File specific RFTYPE_* flags
  * @seq_show:	Show content of the file
  * @write:	Write to the file
  */
@@ -492,6 +492,15 @@ union cpuid_0x10_3_eax {
 	unsigned int full;
 };
 
+/* CPUID.(EAX=10H, ECX=ResID).ECX */
+union cpuid_0x10_x_ecx {
+	struct {
+		unsigned int reserved:3;
+		unsigned int noncont:1;
+	} split;
+	unsigned int full;
+};
+
 /* CPUID.(EAX=10H, ECX=ResID).EDX */
 union cpuid_0x10_x_edx {
 	struct {
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 725344048f85..69a1de92384a 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -54,8 +54,13 @@ static struct kernfs_node *kn_mondata;
 static struct seq_buf last_cmd_status;
 static char last_cmd_status_buf[512];
 
+static int rdtgroup_setup_root(struct rdt_fs_context *ctx);
+static void rdtgroup_destroy_root(void);
+
 struct dentry *debugfs_resctrl;
 
+static bool resctrl_debug;
+
 void rdt_last_cmd_clear(void)
 {
 	lockdep_assert_held(&rdtgroup_mutex);
@@ -696,11 +701,10 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
 				    char *buf, size_t nbytes, loff_t off)
 {
 	struct rdtgroup *rdtgrp;
+	char *pid_str;
 	int ret = 0;
 	pid_t pid;
 
-	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
-		return -EINVAL;
 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
 	if (!rdtgrp) {
 		rdtgroup_kn_unlock(of->kn);
@@ -715,7 +719,27 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
 		goto unlock;
 	}
 
-	ret = rdtgroup_move_task(pid, rdtgrp, of);
+	while (buf && buf[0] != '\0' && buf[0] != '\n') {
+		pid_str = strim(strsep(&buf, ","));
+
+		if (kstrtoint(pid_str, 0, &pid)) {
+			rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str);
+			ret = -EINVAL;
+			break;
+		}
+
+		if (pid < 0) {
+			rdt_last_cmd_printf("Invalid pid %d\n", pid);
+			ret = -EINVAL;
+			break;
+		}
+
+		ret = rdtgroup_move_task(pid, rdtgrp, of);
+		if (ret) {
+			rdt_last_cmd_printf("Error while processing task %d\n", pid);
+			break;
+		}
+	}
 
 unlock:
 	rdtgroup_kn_unlock(of->kn);
@@ -755,6 +779,38 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of,
 	return ret;
 }
 
+static int rdtgroup_closid_show(struct kernfs_open_file *of,
+				struct seq_file *s, void *v)
+{
+	struct rdtgroup *rdtgrp;
+	int ret = 0;
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	if (rdtgrp)
+		seq_printf(s, "%u\n", rdtgrp->closid);
+	else
+		ret = -ENOENT;
+	rdtgroup_kn_unlock(of->kn);
+
+	return ret;
+}
+
+static int rdtgroup_rmid_show(struct kernfs_open_file *of,
+			      struct seq_file *s, void *v)
+{
+	struct rdtgroup *rdtgrp;
+	int ret = 0;
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	if (rdtgrp)
+		seq_printf(s, "%u\n", rdtgrp->mon.rmid);
+	else
+		ret = -ENOENT;
+	rdtgroup_kn_unlock(of->kn);
+
+	return ret;
+}
+
 #ifdef CONFIG_PROC_CPU_RESCTRL
 
 /*
@@ -895,7 +951,7 @@ static int rdt_shareable_bits_show(struct kernfs_open_file *of,
 	return 0;
 }
 
-/**
+/*
  * rdt_bit_usage_show - Display current usage of resources
  *
  * A domain is a shared resource that can now be allocated differently. Here
@@ -1117,12 +1173,24 @@ static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type)
 	}
 }
 
+static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of,
+					struct seq_file *seq, void *v)
+{
+	struct resctrl_schema *s = of->kn->parent->priv;
+	struct rdt_resource *r = s->res;
+
+	seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks);
+
+	return 0;
+}
+
 /**
  * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
  * @r: Resource to which domain instance @d belongs.
  * @d: The domain instance for which @closid is being tested.
  * @cbm: Capacity bitmask being tested.
  * @closid: Intended closid for @cbm.
+ * @type: CDP type of @r.
  * @exclusive: Only check if overlaps with exclusive resource groups
  *
  * Checks if provided @cbm intended to be used for @closid on domain
@@ -1209,6 +1277,7 @@ bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d,
 
 /**
  * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
+ * @rdtgrp: Resource group identified through its closid.
  *
  * An exclusive resource group implies that there should be no sharing of
  * its allocated resources. At the time this group is considered to be
@@ -1251,9 +1320,8 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
 	return true;
 }
 
-/**
+/*
  * rdtgroup_mode_write - Modify the resource group's mode
- *
  */
 static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
 				   char *buf, size_t nbytes, loff_t off)
@@ -1357,12 +1425,11 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
 	return size;
 }
 
-/**
+/*
  * rdtgroup_size_show - Display size in bytes of allocated regions
  *
  * The "size" file mirrors the layout of the "schemata" file, printing the
  * size in bytes of each region instead of the capacity bitmask.
- *
  */
 static int rdtgroup_size_show(struct kernfs_open_file *of,
 			      struct seq_file *s, void *v)
@@ -1686,77 +1753,77 @@ static struct rftype res_common_files[] = {
 		.mode		= 0444,
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.seq_show	= rdt_last_cmd_status_show,
-		.fflags		= RF_TOP_INFO,
+		.fflags		= RFTYPE_TOP_INFO,
 	},
 	{
 		.name		= "num_closids",
 		.mode		= 0444,
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.seq_show	= rdt_num_closids_show,
-		.fflags		= RF_CTRL_INFO,
+		.fflags		= RFTYPE_CTRL_INFO,
 	},
 	{
 		.name		= "mon_features",
 		.mode		= 0444,
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.seq_show	= rdt_mon_features_show,
-		.fflags		= RF_MON_INFO,
+		.fflags		= RFTYPE_MON_INFO,
 	},
 	{
 		.name		= "num_rmids",
 		.mode		= 0444,
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.seq_show	= rdt_num_rmids_show,
-		.fflags		= RF_MON_INFO,
+		.fflags		= RFTYPE_MON_INFO,
 	},
 	{
 		.name		= "cbm_mask",
 		.mode		= 0444,
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.seq_show	= rdt_default_ctrl_show,
-		.fflags		= RF_CTRL_INFO | RFTYPE_RES_CACHE,
+		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
 	},
 	{
 		.name		= "min_cbm_bits",
 		.mode		= 0444,
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.seq_show	= rdt_min_cbm_bits_show,
-		.fflags		= RF_CTRL_INFO | RFTYPE_RES_CACHE,
+		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
 	},
 	{
 		.name		= "shareable_bits",
 		.mode		= 0444,
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.seq_show	= rdt_shareable_bits_show,
-		.fflags		= RF_CTRL_INFO | RFTYPE_RES_CACHE,
+		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
 	},
 	{
 		.name		= "bit_usage",
 		.mode		= 0444,
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.seq_show	= rdt_bit_usage_show,
-		.fflags		= RF_CTRL_INFO | RFTYPE_RES_CACHE,
+		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
 	},
 	{
 		.name		= "min_bandwidth",
 		.mode		= 0444,
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.seq_show	= rdt_min_bw_show,
-		.fflags		= RF_CTRL_INFO | RFTYPE_RES_MB,
+		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
 	},
 	{
 		.name		= "bandwidth_gran",
 		.mode		= 0444,
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.seq_show	= rdt_bw_gran_show,
-		.fflags		= RF_CTRL_INFO | RFTYPE_RES_MB,
+		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
 	},
 	{
 		.name		= "delay_linear",
 		.mode		= 0444,
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.seq_show	= rdt_delay_linear_show,
-		.fflags		= RF_CTRL_INFO | RFTYPE_RES_MB,
+		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
 	},
 	/*
 	 * Platform specific which (if any) capabilities are provided by
@@ -1775,7 +1842,7 @@ static struct rftype res_common_files[] = {
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.write		= max_threshold_occ_write,
 		.seq_show	= max_threshold_occ_show,
-		.fflags		= RF_MON_INFO | RFTYPE_RES_CACHE,
+		.fflags		= RFTYPE_MON_INFO | RFTYPE_RES_CACHE,
 	},
 	{
 		.name		= "mbm_total_bytes_config",
@@ -1817,12 +1884,19 @@ static struct rftype res_common_files[] = {
 		.fflags		= RFTYPE_BASE,
 	},
 	{
+		.name		= "mon_hw_id",
+		.mode		= 0444,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.seq_show	= rdtgroup_rmid_show,
+		.fflags		= RFTYPE_MON_BASE | RFTYPE_DEBUG,
+	},
+	{
 		.name		= "schemata",
 		.mode		= 0644,
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.write		= rdtgroup_schemata_write,
 		.seq_show	= rdtgroup_schemata_show,
-		.fflags		= RF_CTRL_BASE,
+		.fflags		= RFTYPE_CTRL_BASE,
 	},
 	{
 		.name		= "mode",
@@ -1830,14 +1904,28 @@ static struct rftype res_common_files[] = {
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.write		= rdtgroup_mode_write,
 		.seq_show	= rdtgroup_mode_show,
-		.fflags		= RF_CTRL_BASE,
+		.fflags		= RFTYPE_CTRL_BASE,
 	},
 	{
 		.name		= "size",
 		.mode		= 0444,
 		.kf_ops		= &rdtgroup_kf_single_ops,
 		.seq_show	= rdtgroup_size_show,
-		.fflags		= RF_CTRL_BASE,
+		.fflags		= RFTYPE_CTRL_BASE,
+	},
+	{
+		.name		= "sparse_masks",
+		.mode		= 0444,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.seq_show	= rdt_has_sparse_bitmasks_show,
+		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
+	},
+	{
+		.name		= "ctrl_hw_id",
+		.mode		= 0444,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.seq_show	= rdtgroup_closid_show,
+		.fflags		= RFTYPE_CTRL_BASE | RFTYPE_DEBUG,
 	},
 
 };
@@ -1852,6 +1940,9 @@ static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
 
 	lockdep_assert_held(&rdtgroup_mutex);
 
+	if (resctrl_debug)
+		fflags |= RFTYPE_DEBUG;
+
 	for (rft = rfts; rft < rfts + len; rft++) {
 		if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) {
 			ret = rdtgroup_add_file(kn, rft);
@@ -1894,7 +1985,7 @@ void __init thread_throttle_mode_init(void)
 	if (!rft)
 		return;
 
-	rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB;
+	rft->fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB;
 }
 
 void __init mbm_config_rftype_init(const char *config)
@@ -1903,7 +1994,7 @@ void __init mbm_config_rftype_init(const char *config)
 
 	rft = rdtgroup_get_rftype_by_name(config);
 	if (rft)
-		rft->fflags = RF_MON_INFO | RFTYPE_RES_CACHE;
+		rft->fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE;
 }
 
 /**
@@ -2038,21 +2129,21 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
 	if (IS_ERR(kn_info))
 		return PTR_ERR(kn_info);
 
-	ret = rdtgroup_add_files(kn_info, RF_TOP_INFO);
+	ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO);
 	if (ret)
 		goto out_destroy;
 
 	/* loop over enabled controls, these are all alloc_capable */
 	list_for_each_entry(s, &resctrl_schema_all, list) {
 		r = s->res;
-		fflags =  r->fflags | RF_CTRL_INFO;
+		fflags = r->fflags | RFTYPE_CTRL_INFO;
 		ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags);
 		if (ret)
 			goto out_destroy;
 	}
 
 	for_each_mon_capable_rdt_resource(r) {
-		fflags =  r->fflags | RF_MON_INFO;
+		fflags = r->fflags | RFTYPE_MON_INFO;
 		sprintf(name, "%s_MON", r->name);
 		ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
 		if (ret)
@@ -2271,14 +2362,6 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable)
 	return 0;
 }
 
-static void cdp_disable_all(void)
-{
-	if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
-		resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
-	if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2))
-		resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
-}
-
 /*
  * We don't allow rdtgroup directories to be created anywhere
  * except the root directory. Thus when looking for the rdtgroup
@@ -2358,19 +2441,47 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn,
 			     struct rdtgroup *prgrp,
 			     struct kernfs_node **mon_data_kn);
 
+static void rdt_disable_ctx(void)
+{
+	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
+	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
+	set_mba_sc(false);
+
+	resctrl_debug = false;
+}
+
 static int rdt_enable_ctx(struct rdt_fs_context *ctx)
 {
 	int ret = 0;
 
-	if (ctx->enable_cdpl2)
+	if (ctx->enable_cdpl2) {
 		ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true);
+		if (ret)
+			goto out_done;
+	}
 
-	if (!ret && ctx->enable_cdpl3)
+	if (ctx->enable_cdpl3) {
 		ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true);
+		if (ret)
+			goto out_cdpl2;
+	}
 
-	if (!ret && ctx->enable_mba_mbps)
+	if (ctx->enable_mba_mbps) {
 		ret = set_mba_sc(true);
+		if (ret)
+			goto out_cdpl3;
+	}
+
+	if (ctx->enable_debug)
+		resctrl_debug = true;
 
+	return 0;
+
+out_cdpl3:
+	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
+out_cdpl2:
+	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
+out_done:
 	return ret;
 }
 
@@ -2463,6 +2574,7 @@ static void schemata_list_destroy(void)
 static int rdt_get_tree(struct fs_context *fc)
 {
 	struct rdt_fs_context *ctx = rdt_fc2context(fc);
+	unsigned long flags = RFTYPE_CTRL_BASE;
 	struct rdt_domain *dom;
 	struct rdt_resource *r;
 	int ret;
@@ -2477,18 +2589,31 @@ static int rdt_get_tree(struct fs_context *fc)
 		goto out;
 	}
 
+	ret = rdtgroup_setup_root(ctx);
+	if (ret)
+		goto out;
+
 	ret = rdt_enable_ctx(ctx);
-	if (ret < 0)
-		goto out_cdp;
+	if (ret)
+		goto out_root;
 
 	ret = schemata_list_create();
 	if (ret) {
 		schemata_list_destroy();
-		goto out_mba;
+		goto out_ctx;
 	}
 
 	closid_init();
 
+	if (rdt_mon_capable)
+		flags |= RFTYPE_MON;
+
+	ret = rdtgroup_add_files(rdtgroup_default.kn, flags);
+	if (ret)
+		goto out_schemata_free;
+
+	kernfs_activate(rdtgroup_default.kn);
+
 	ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
 	if (ret < 0)
 		goto out_schemata_free;
@@ -2543,11 +2668,10 @@ out_info:
 	kernfs_remove(kn_info);
 out_schemata_free:
 	schemata_list_destroy();
-out_mba:
-	if (ctx->enable_mba_mbps)
-		set_mba_sc(false);
-out_cdp:
-	cdp_disable_all();
+out_ctx:
+	rdt_disable_ctx();
+out_root:
+	rdtgroup_destroy_root();
 out:
 	rdt_last_cmd_clear();
 	mutex_unlock(&rdtgroup_mutex);
@@ -2559,6 +2683,7 @@ enum rdt_param {
 	Opt_cdp,
 	Opt_cdpl2,
 	Opt_mba_mbps,
+	Opt_debug,
 	nr__rdt_params
 };
 
@@ -2566,6 +2691,7 @@ static const struct fs_parameter_spec rdt_fs_parameters[] = {
 	fsparam_flag("cdp",		Opt_cdp),
 	fsparam_flag("cdpl2",		Opt_cdpl2),
 	fsparam_flag("mba_MBps",	Opt_mba_mbps),
+	fsparam_flag("debug",		Opt_debug),
 	{}
 };
 
@@ -2591,6 +2717,9 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
 			return -EINVAL;
 		ctx->enable_mba_mbps = true;
 		return 0;
+	case Opt_debug:
+		ctx->enable_debug = true;
+		return 0;
 	}
 
 	return -EINVAL;
@@ -2618,7 +2747,6 @@ static int rdt_init_fs_context(struct fs_context *fc)
 	if (!ctx)
 		return -ENOMEM;
 
-	ctx->kfc.root = rdt_root;
 	ctx->kfc.magic = RDTGROUP_SUPER_MAGIC;
 	fc->fs_private = &ctx->kfc;
 	fc->ops = &rdt_fs_context_ops;
@@ -2779,16 +2907,16 @@ static void rdt_kill_sb(struct super_block *sb)
 	cpus_read_lock();
 	mutex_lock(&rdtgroup_mutex);
 
-	set_mba_sc(false);
+	rdt_disable_ctx();
 
 	/*Put everything back to default values. */
 	for_each_alloc_capable_rdt_resource(r)
 		reset_all_ctrls(r);
-	cdp_disable_all();
 	rmdir_all_sub();
 	rdt_pseudo_lock_release();
 	rdtgroup_default.mode = RDT_MODE_SHAREABLE;
 	schemata_list_destroy();
+	rdtgroup_destroy_root();
 	static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
 	static_branch_disable_cpuslocked(&rdt_mon_enable_key);
 	static_branch_disable_cpuslocked(&rdt_enable_key);
@@ -3170,8 +3298,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
 			     enum rdt_group_type rtype, struct rdtgroup **r)
 {
 	struct rdtgroup *prdtgrp, *rdtgrp;
+	unsigned long files = 0;
 	struct kernfs_node *kn;
-	uint files = 0;
 	int ret;
 
 	prdtgrp = rdtgroup_kn_lock_live(parent_kn);
@@ -3223,7 +3351,14 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
 		goto out_destroy;
 	}
 
-	files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype);
+	if (rtype == RDTCTRL_GROUP) {
+		files = RFTYPE_BASE | RFTYPE_CTRL;
+		if (rdt_mon_capable)
+			files |= RFTYPE_MON;
+	} else {
+		files = RFTYPE_BASE | RFTYPE_MON;
+	}
+
 	ret = rdtgroup_add_files(kn, files);
 	if (ret) {
 		rdt_last_cmd_puts("kernfs fill error\n");
@@ -3656,6 +3791,9 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
 	if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl))
 		seq_puts(seq, ",mba_MBps");
 
+	if (resctrl_debug)
+		seq_puts(seq, ",debug");
+
 	return 0;
 }
 
@@ -3666,10 +3804,8 @@ static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
 	.show_options	= rdtgroup_show_options,
 };
 
-static int __init rdtgroup_setup_root(void)
+static int rdtgroup_setup_root(struct rdt_fs_context *ctx)
 {
-	int ret;
-
 	rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
 				      KERNFS_ROOT_CREATE_DEACTIVATED |
 				      KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
@@ -3677,6 +3813,20 @@ static int __init rdtgroup_setup_root(void)
 	if (IS_ERR(rdt_root))
 		return PTR_ERR(rdt_root);
 
+	ctx->kfc.root = rdt_root;
+	rdtgroup_default.kn = kernfs_root_to_node(rdt_root);
+
+	return 0;
+}
+
+static void rdtgroup_destroy_root(void)
+{
+	kernfs_destroy_root(rdt_root);
+	rdtgroup_default.kn = NULL;
+}
+
+static void __init rdtgroup_setup_default(void)
+{
 	mutex_lock(&rdtgroup_mutex);
 
 	rdtgroup_default.closid = 0;
@@ -3686,19 +3836,7 @@ static int __init rdtgroup_setup_root(void)
 
 	list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
 
-	ret = rdtgroup_add_files(kernfs_root_to_node(rdt_root), RF_CTRL_BASE);
-	if (ret) {
-		kernfs_destroy_root(rdt_root);
-		goto out;
-	}
-
-	rdtgroup_default.kn = kernfs_root_to_node(rdt_root);
-	kernfs_activate(rdtgroup_default.kn);
-
-out:
 	mutex_unlock(&rdtgroup_mutex);
-
-	return ret;
 }
 
 static void domain_destroy_mon_state(struct rdt_domain *d)
@@ -3820,13 +3958,11 @@ int __init rdtgroup_init(void)
 	seq_buf_init(&last_cmd_status, last_cmd_status_buf,
 		     sizeof(last_cmd_status_buf));
 
-	ret = rdtgroup_setup_root();
-	if (ret)
-		return ret;
+	rdtgroup_setup_default();
 
 	ret = sysfs_create_mount_point(fs_kobj, "resctrl");
 	if (ret)
-		goto cleanup_root;
+		return ret;
 
 	ret = register_filesystem(&rdt_fs_type);
 	if (ret)
@@ -3859,8 +3995,6 @@ int __init rdtgroup_init(void)
 
 cleanup_mountpoint:
 	sysfs_remove_mount_point(fs_kobj, "resctrl");
-cleanup_root:
-	kernfs_destroy_root(rdt_root);
 
 	return ret;
 }
@@ -3870,5 +4004,4 @@ void __exit rdtgroup_exit(void)
 	debugfs_remove_recursive(debugfs_resctrl);
 	unregister_filesystem(&rdt_fs_type);
 	sysfs_remove_mount_point(fs_kobj, "resctrl");
-	kernfs_destroy_root(rdt_root);
 }
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 0270925fe013..dc136703566f 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -78,7 +78,7 @@ int detect_extended_topology_early(struct cpuinfo_x86 *c)
 	/*
 	 * initial apic id, which also represents 32-bit extended x2apic id.
 	 */
-	c->initial_apicid = edx;
+	c->topo.initial_apicid = edx;
 	smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx));
 #endif
 	return 0;
@@ -108,7 +108,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c)
 	 * Populate HT related information from sub-leaf level 0.
 	 */
 	cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
-	c->initial_apicid = edx;
+	c->topo.initial_apicid = edx;
 	core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
 	smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx));
 	core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
@@ -146,20 +146,19 @@ int detect_extended_topology(struct cpuinfo_x86 *c)
 	die_select_mask = (~(-1 << die_plus_mask_width)) >>
 				core_plus_mask_width;
 
-	c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid,
+	c->topo.core_id = apic->phys_pkg_id(c->topo.initial_apicid,
 				ht_mask_width) & core_select_mask;
 
 	if (die_level_present) {
-		c->cpu_die_id = apic->phys_pkg_id(c->initial_apicid,
+		c->topo.die_id = apic->phys_pkg_id(c->topo.initial_apicid,
 					core_plus_mask_width) & die_select_mask;
 	}
 
-	c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid,
-				pkg_mask_width);
+	c->topo.pkg_id = apic->phys_pkg_id(c->topo.initial_apicid, pkg_mask_width);
 	/*
 	 * Reinit the apicid, now that we have extended initial_apicid.
 	 */
-	c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+	c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0);
 
 	c->x86_max_cores = (core_level_siblings / smp_num_siblings);
 	__max_die_per_package = (die_level_siblings / core_level_siblings);
diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c
index 05fa4ef63490..415564a6523b 100644
--- a/arch/x86/kernel/cpu/zhaoxin.c
+++ b/arch/x86/kernel/cpu/zhaoxin.c
@@ -65,20 +65,6 @@ static void early_init_zhaoxin(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
 	}
-
-	if (c->cpuid_level >= 0x00000001) {
-		u32 eax, ebx, ecx, edx;
-
-		cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
-		/*
-		 * If HTT (EDX[28]) is set EBX[16:23] contain the number of
-		 * apicids which are reserved per package. Store the resulting
-		 * shift value for the package management code.
-		 */
-		if (edx & (1U << 28))
-			c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
-	}
-
 }
 
 static void init_zhaoxin(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 87d38f17ff5c..afd09924094e 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -278,7 +278,7 @@ static void __init dtb_apic_setup(void)
 }
 
 #ifdef CONFIG_OF_EARLY_FLATTREE
-static void __init x86_flattree_get_config(void)
+void __init x86_flattree_get_config(void)
 {
 	u32 size, map_len;
 	void *dt;
@@ -300,14 +300,10 @@ static void __init x86_flattree_get_config(void)
 	unflatten_and_copy_device_tree();
 	early_memunmap(dt, map_len);
 }
-#else
-static inline void x86_flattree_get_config(void) { }
 #endif
 
 void __init x86_dtb_init(void)
 {
-	x86_flattree_get_config();
-
 	if (!of_have_populated_dt())
 		return;
 
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index ef6906107c54..117e74c44e75 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1736,7 +1736,6 @@ EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
 
 /**
  * fpu_xstate_prctl - xstate permission operations
- * @tsk:	Redundant pointer to current
  * @option:	A subfunction of arch_prctl()
  * @arg2:	option argument
  * Return:	0 if successful; otherwise, an error code
diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
index 24c1175a47e2..58d9ed50fe61 100644
--- a/arch/x86/kernel/ftrace_32.S
+++ b/arch/x86/kernel/ftrace_32.S
@@ -3,10 +3,10 @@
  *  Copyright (C) 2017  Steven Rostedt, VMware Inc.
  */
 
+#include <linux/export.h>
 #include <linux/linkage.h>
 #include <asm/page_types.h>
 #include <asm/segment.h>
-#include <asm/export.h>
 #include <asm/ftrace.h>
 #include <asm/nospec-branch.h>
 #include <asm/frame.h>
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 945cfa5f7239..214f30e9f0c0 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -3,12 +3,12 @@
  *  Copyright (C) 2014  Steven Rostedt, Red Hat Inc
  */
 
+#include <linux/export.h>
 #include <linux/cfi_types.h>
 #include <linux/linkage.h>
 #include <asm/asm-offsets.h>
 #include <asm/ptrace.h>
 #include <asm/ftrace.h>
-#include <asm/export.h>
 #include <asm/nospec-branch.h>
 #include <asm/unwind_hints.h>
 #include <asm/frame.h>
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 49f7629b17f7..05a110c97111 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -41,6 +41,7 @@
 #include <asm/trapnr.h>
 #include <asm/sev.h>
 #include <asm/tdx.h>
+#include <asm/init.h>
 
 /*
  * Manage page tables very early on.
@@ -69,7 +70,7 @@ EXPORT_SYMBOL(vmemmap_base);
 /*
  * GDT used on the boot CPU before switching to virtual addresses.
  */
-static struct desc_struct startup_gdt[GDT_ENTRIES] = {
+static struct desc_struct startup_gdt[GDT_ENTRIES] __initdata = {
 	[GDT_ENTRY_KERNEL32_CS]         = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
 	[GDT_ENTRY_KERNEL_CS]           = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
 	[GDT_ENTRY_KERNEL_DS]           = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
@@ -79,13 +80,11 @@ static struct desc_struct startup_gdt[GDT_ENTRIES] = {
  * Address needs to be set at runtime because it references the startup_gdt
  * while the kernel still uses a direct mapping.
  */
-static struct desc_ptr startup_gdt_descr = {
-	.size = sizeof(startup_gdt),
+static struct desc_ptr startup_gdt_descr __initdata = {
+	.size = sizeof(startup_gdt)-1,
 	.address = 0,
 };
 
-#define __head	__section(".head.text")
-
 static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
 {
 	return ptr - (void *)_text + (void *)physaddr;
@@ -211,7 +210,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
 
 	/* Fixup the physical addresses in the page table */
 
-	pgd = fixup_pointer(&early_top_pgt, physaddr);
+	pgd = fixup_pointer(early_top_pgt, physaddr);
 	p = pgd + pgd_index(__START_KERNEL_map);
 	if (la57)
 		*p = (unsigned long)level4_kernel_pgt;
@@ -220,11 +219,11 @@ unsigned long __head __startup_64(unsigned long physaddr,
 	*p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
 
 	if (la57) {
-		p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
+		p4d = fixup_pointer(level4_kernel_pgt, physaddr);
 		p4d[511] += load_delta;
 	}
 
-	pud = fixup_pointer(&level3_kernel_pgt, physaddr);
+	pud = fixup_pointer(level3_kernel_pgt, physaddr);
 	pud[510] += load_delta;
 	pud[511] += load_delta;
 
@@ -588,7 +587,7 @@ static void set_bringup_idt_handler(gate_desc *idt, int n, void *handler)
 }
 
 /* This runs while still in the direct mapping */
-static void startup_64_load_idt(unsigned long physbase)
+static void __head startup_64_load_idt(unsigned long physbase)
 {
 	struct desc_ptr *desc = fixup_pointer(&bringup_idt_descr, physbase);
 	gate_desc *idt = fixup_pointer(bringup_idt_table, physbase);
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index c9318993f959..b6554212b7c7 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -8,6 +8,7 @@
  */
 
 .text
+#include <linux/export.h>
 #include <linux/threads.h>
 #include <linux/init.h>
 #include <linux/linkage.h>
@@ -25,7 +26,6 @@
 #include <asm/nops.h>
 #include <asm/nospec-branch.h>
 #include <asm/bootparam.h>
-#include <asm/export.h>
 #include <asm/pgtable_32.h>
 
 /* Physical address */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index ea6995920b7a..086a2c3aaaa0 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -9,7 +9,7 @@
  *  Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
  */
 
-
+#include <linux/export.h>
 #include <linux/linkage.h>
 #include <linux/threads.h>
 #include <linux/init.h>
@@ -22,7 +22,6 @@
 #include <asm/percpu.h>
 #include <asm/nops.h>
 #include "../entry/calling.h"
-#include <asm/export.h>
 #include <asm/nospec-branch.h>
 #include <asm/apicdef.h>
 #include <asm/fixmap.h>
@@ -180,8 +179,8 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 	movl	$0, %ecx
 #endif
 
-	/* Enable PAE mode, PGE and LA57 */
-	orl	$(X86_CR4_PAE | X86_CR4_PGE), %ecx
+	/* Enable PAE mode, PSE, PGE and LA57 */
+	orl	$(X86_CR4_PAE | X86_CR4_PSE | X86_CR4_PGE), %ecx
 #ifdef CONFIG_X86_5LEVEL
 	testl	$1, __pgtable_l5_enabled(%rip)
 	jz	1f
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 1648aa0204d9..41eecf180b7f 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -52,7 +52,7 @@ unsigned long				hpet_address;
 u8					hpet_blockid; /* OS timer block num */
 bool					hpet_msi_disable;
 
-#ifdef CONFIG_GENERIC_MSI_IRQ
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_GENERIC_MSI_IRQ)
 static DEFINE_PER_CPU(struct hpet_channel *, cpu_hpet_channel);
 static struct irq_domain		*hpet_domain;
 #endif
@@ -469,7 +469,7 @@ static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc)
 /*
  * HPET MSI Support
  */
-#ifdef CONFIG_GENERIC_MSI_IRQ
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_GENERIC_MSI_IRQ)
 static void hpet_msi_unmask(struct irq_data *data)
 {
 	struct hpet_channel *hc = irq_data_get_irq_handler_data(data);
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 30a55207c000..c20d1832c481 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -32,6 +32,7 @@
  */
 static void init_8259A(int auto_eoi);
 
+static bool pcat_compat __ro_after_init;
 static int i8259A_auto_eoi;
 DEFINE_RAW_SPINLOCK(i8259A_lock);
 
@@ -299,15 +300,32 @@ static void unmask_8259A(void)
 
 static int probe_8259A(void)
 {
+	unsigned char new_val, probe_val = ~(1 << PIC_CASCADE_IR);
 	unsigned long flags;
-	unsigned char probe_val = ~(1 << PIC_CASCADE_IR);
-	unsigned char new_val;
+
+	/*
+	 * If MADT has the PCAT_COMPAT flag set, then do not bother probing
+	 * for the PIC. Some BIOSes leave the PIC uninitialized and probing
+	 * fails.
+	 *
+	 * Right now this causes problems as quite a bit of code depends on
+	 * nr_legacy_irqs() > 0 or has_legacy_pic() == true. This is silly
+	 * when the system has an IO/APIC because then PIC is not required
+	 * at all, except for really old machines where the timer interrupt
+	 * must be routed through the PIC. So just pretend that the PIC is
+	 * there and let legacy_pic->init() initialize it for nothing.
+	 *
+	 * Alternatively this could just try to initialize the PIC and
+	 * repeat the probe, but for cases where there is no PIC that's
+	 * just pointless.
+	 */
+	if (pcat_compat)
+		return nr_legacy_irqs();
+
 	/*
-	 * Check to see if we have a PIC.
-	 * Mask all except the cascade and read
-	 * back the value we just wrote. If we don't
-	 * have a PIC, we will read 0xff as opposed to the
-	 * value we wrote.
+	 * Check to see if we have a PIC.  Mask all except the cascade and
+	 * read back the value we just wrote. If we don't have a PIC, we
+	 * will read 0xff as opposed to the value we wrote.
 	 */
 	raw_spin_lock_irqsave(&i8259A_lock, flags);
 
@@ -429,5 +447,9 @@ static int __init i8259A_init_ops(void)
 
 	return 0;
 }
-
 device_initcall(i8259A_init_ops);
+
+void __init legacy_pic_pcat_compat(void)
+{
+	pcat_compat = true;	/* probe_8259A() then skips the hardware probe */
+}
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index b786d48f5a0f..8857abc706e4 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -10,6 +10,7 @@
 #include <asm/proto.h>
 #include <asm/desc.h>
 #include <asm/hw_irq.h>
+#include <asm/ia32.h>
 #include <asm/idtentry.h>
 
 #define DPL0		0x0
@@ -116,6 +117,9 @@ static const __initconst struct idt_data def_idts[] = {
 #endif
 
 	SYSG(X86_TRAP_OF,		asm_exc_overflow),
+};
+
+static const struct idt_data ia32_idt[] __initconst = {
 #if defined(CONFIG_IA32_EMULATION)
 	SYSG(IA32_SYSCALL_VECTOR,	entry_INT80_compat),
 #elif defined(CONFIG_X86_32)
@@ -225,6 +229,9 @@ void __init idt_setup_early_traps(void)
 void __init idt_setup_traps(void)
 {
 	idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true);
+
+	if (ia32_enabled())
+		idt_setup_from_table(idt_table, ia32_idt, ARRAY_SIZE(ia32_idt), true);
 }
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S
index aaf9e776f323..7f542a7799cb 100644
--- a/arch/x86/kernel/irqflags.S
+++ b/arch/x86/kernel/irqflags.S
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 #include <asm/asm.h>
-#include <asm/export.h>
+#include <linux/export.h>
 #include <linux/linkage.h>
 
 /*
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index b8ab9ee5896c..0ddb3bd0f1aa 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -500,13 +500,13 @@ static bool pv_sched_yield_supported(void)
 static void __send_ipi_mask(const struct cpumask *mask, int vector)
 {
 	unsigned long flags;
-	int cpu, apic_id, icr;
-	int min = 0, max = 0;
+	int cpu, min = 0, max = 0;
 #ifdef CONFIG_X86_64
 	__uint128_t ipi_bitmap = 0;
 #else
 	u64 ipi_bitmap = 0;
 #endif
+	u32 apic_id, icr;
 	long ret;
 
 	if (cpumask_empty(mask))
@@ -1028,8 +1028,8 @@ arch_initcall(activate_jump_labels);
 /* Kick a cpu by its apicid. Used to wake up a halted vcpu */
 static void kvm_kick_cpu(int cpu)
 {
-	int apicid;
 	unsigned long flags = 0;
+	u32 apicid;
 
 	apicid = per_cpu(x86_cpu_to_apicid, cpu);
 	kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index a0c551846b35..4766b6bed443 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -507,12 +507,13 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
 	}
 	this_cpu_write(nmi_state, NMI_EXECUTING);
 	this_cpu_write(nmi_cr2, read_cr2());
+
+nmi_restart:
 	if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
 		WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
 		WARN_ON_ONCE(!(nsp->idt_seq & 0x1));
 		WRITE_ONCE(nsp->recv_jiffies, jiffies);
 	}
-nmi_restart:
 
 	/*
 	 * Needs to happen before DR7 is accessed, because the hypervisor can
@@ -548,16 +549,16 @@ nmi_restart:
 
 	if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
 		write_cr2(this_cpu_read(nmi_cr2));
-	if (this_cpu_dec_return(nmi_state))
-		goto nmi_restart;
-
-	if (user_mode(regs))
-		mds_user_clear_cpu_buffers();
 	if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
 		WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
 		WARN_ON_ONCE(nsp->idt_seq & 0x1);
 		WRITE_ONCE(nsp->recv_jiffies, jiffies);
 	}
+	if (this_cpu_dec_return(nmi_state))
+		goto nmi_restart;
+
+	if (user_mode(regs))
+		mds_user_clear_cpu_buffers();
 }
 
 #if IS_ENABLED(CONFIG_KVM_INTEL)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b098b1fa2470..ccd3ad29a1dc 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1120,7 +1120,7 @@ void __init setup_arch(char **cmdline_p)
 	 * Needs to run after memblock setup because it needs the physical
 	 * memory size.
 	 */
-	sev_setup_arch();
+	mem_encrypt_setup_arch();
 
 	efi_fake_memmap();
 	efi_find_mirror();
@@ -1217,6 +1217,8 @@ void __init setup_arch(char **cmdline_p)
 
 	early_acpi_boot_init();
 
+	x86_flattree_get_config();
+
 	initmem_init();
 	dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
 
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 6395bfd87b68..70472eebe719 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -966,7 +966,7 @@ static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa)
 		free_page((unsigned long)vmsa);
 }
 
-static int wakeup_cpu_via_vmgexit(int apic_id, unsigned long start_ip)
+static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
 {
 	struct sev_es_save_area *cur_vmsa, *vmsa;
 	struct ghcb_state state;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 2a187c0cbd5b..c4aca66f0902 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -87,6 +87,7 @@
 #include <asm/hw_irq.h>
 #include <asm/stackprotector.h>
 #include <asm/sev.h>
+#include <asm/spec-ctrl.h>
 
 /* representing HT siblings of each logical CPU */
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
@@ -124,7 +125,20 @@ struct mwait_cpu_dead {
  */
 static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead);
 
-/* Logical package management. We might want to allocate that dynamically */
+/* Logical package management. */
+struct logical_maps {
+	u32	phys_pkg_id;	/* physical package ID, U32_MAX until recorded */
+	u32	phys_die_id;	/* physical die ID, U32_MAX until recorded */
+	u32	logical_pkg_id;	/* logical package ID stored alongside phys_pkg_id */
+	u32	logical_die_id;	/* logical die ID stored alongside phys_die_id */
+};
+
+/* Temporary workaround until the full topology mechanics are in place */
+static DEFINE_PER_CPU_READ_MOSTLY(struct logical_maps, logical_maps) = {
+	.phys_pkg_id	= U32_MAX,
+	.phys_die_id	= U32_MAX,
+};
+
 unsigned int __max_logical_packages __read_mostly;
 EXPORT_SYMBOL(__max_logical_packages);
 static unsigned int logical_packages __read_mostly;
@@ -288,7 +302,7 @@ static void notrace start_secondary(void *unused)
 
 	cpu_init();
 	fpu__init_cpu();
-	rcu_cpu_starting(raw_smp_processor_id());
+	rcutree_report_cpu_starting(raw_smp_processor_id());
 	x86_cpuinit.early_percpu_clock_init();
 
 	ap_starting();
@@ -337,10 +351,8 @@ int topology_phys_to_logical_pkg(unsigned int phys_pkg)
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
-		struct cpuinfo_x86 *c = &cpu_data(cpu);
-
-		if (c->initialized && c->phys_proc_id == phys_pkg)
-			return c->logical_proc_id;
+		if (per_cpu(logical_maps.phys_pkg_id, cpu) == phys_pkg)
+			return per_cpu(logical_maps.logical_pkg_id, cpu);
 	}
 	return -1;
 }
@@ -355,14 +367,12 @@ EXPORT_SYMBOL(topology_phys_to_logical_pkg);
  */
 static int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu)
 {
-	int cpu, proc_id = cpu_data(cur_cpu).phys_proc_id;
+	int cpu, proc_id = cpu_data(cur_cpu).topo.pkg_id;
 
 	for_each_possible_cpu(cpu) {
-		struct cpuinfo_x86 *c = &cpu_data(cpu);
-
-		if (c->initialized && c->cpu_die_id == die_id &&
-		    c->phys_proc_id == proc_id)
-			return c->logical_die_id;
+		if (per_cpu(logical_maps.phys_pkg_id, cpu) == proc_id &&
+		    per_cpu(logical_maps.phys_die_id, cpu) == die_id)
+			return per_cpu(logical_maps.logical_die_id, cpu);
 	}
 	return -1;
 }
@@ -387,7 +397,9 @@ int topology_update_package_map(unsigned int pkg, unsigned int cpu)
 			cpu, pkg, new);
 	}
 found:
-	cpu_data(cpu).logical_proc_id = new;
+	per_cpu(logical_maps.phys_pkg_id, cpu) = pkg;
+	per_cpu(logical_maps.logical_pkg_id, cpu) = new;
+	cpu_data(cpu).topo.logical_pkg_id = new;
 	return 0;
 }
 /**
@@ -410,7 +422,9 @@ int topology_update_die_map(unsigned int die, unsigned int cpu)
 			cpu, die, new);
 	}
 found:
-	cpu_data(cpu).logical_die_id = new;
+	per_cpu(logical_maps.phys_die_id, cpu) = die;
+	per_cpu(logical_maps.logical_die_id, cpu) = new;
+	cpu_data(cpu).topo.logical_die_id = new;
 	return 0;
 }
 
@@ -421,8 +435,8 @@ static void __init smp_store_boot_cpu_info(void)
 
 	*c = boot_cpu_data;
 	c->cpu_index = id;
-	topology_update_package_map(c->phys_proc_id, id);
-	topology_update_die_map(c->cpu_die_id, id);
+	topology_update_package_map(c->topo.pkg_id, id);
+	topology_update_die_map(c->topo.die_id, id);
 	c->initialized = true;
 }
 
@@ -476,21 +490,21 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 	if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
 		int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
 
-		if (c->phys_proc_id == o->phys_proc_id &&
-		    c->cpu_die_id == o->cpu_die_id &&
-		    per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) {
-			if (c->cpu_core_id == o->cpu_core_id)
+		if (c->topo.pkg_id == o->topo.pkg_id &&
+		    c->topo.die_id == o->topo.die_id &&
+		    per_cpu_llc_id(cpu1) == per_cpu_llc_id(cpu2)) {
+			if (c->topo.core_id == o->topo.core_id)
 				return topology_sane(c, o, "smt");
 
-			if ((c->cu_id != 0xff) &&
-			    (o->cu_id != 0xff) &&
-			    (c->cu_id == o->cu_id))
+			if ((c->topo.cu_id != 0xff) &&
+			    (o->topo.cu_id != 0xff) &&
+			    (c->topo.cu_id == o->topo.cu_id))
 				return topology_sane(c, o, "smt");
 		}
 
-	} else if (c->phys_proc_id == o->phys_proc_id &&
-		   c->cpu_die_id == o->cpu_die_id &&
-		   c->cpu_core_id == o->cpu_core_id) {
+	} else if (c->topo.pkg_id == o->topo.pkg_id &&
+		   c->topo.die_id == o->topo.die_id &&
+		   c->topo.core_id == o->topo.core_id) {
 		return topology_sane(c, o, "smt");
 	}
 
@@ -499,8 +513,8 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 
 static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 {
-	if (c->phys_proc_id == o->phys_proc_id &&
-	    c->cpu_die_id == o->cpu_die_id)
+	if (c->topo.pkg_id == o->topo.pkg_id &&
+	    c->topo.die_id == o->topo.die_id)
 		return true;
 	return false;
 }
@@ -510,11 +524,11 @@ static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
 
 	/* If the arch didn't set up l2c_id, fall back to SMT */
-	if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID)
+	if (per_cpu_l2c_id(cpu1) == BAD_APICID)
 		return match_smt(c, o);
 
 	/* Do not match if L2 cache id does not match: */
-	if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2))
+	if (per_cpu_l2c_id(cpu1) != per_cpu_l2c_id(cpu2))
 		return false;
 
 	return topology_sane(c, o, "l2c");
@@ -527,7 +541,7 @@ static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
  */
 static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 {
-	if (c->phys_proc_id == o->phys_proc_id)
+	if (c->topo.pkg_id == o->topo.pkg_id)
 		return true;
 	return false;
 }
@@ -560,11 +574,11 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 	bool intel_snc = id && id->driver_data;
 
 	/* Do not match if we do not have a valid APICID for cpu: */
-	if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID)
+	if (per_cpu_llc_id(cpu1) == BAD_APICID)
 		return false;
 
 	/* Do not match if LLC id does not match: */
-	if (per_cpu(cpu_llc_id, cpu1) != per_cpu(cpu_llc_id, cpu2))
+	if (per_cpu_llc_id(cpu1) != per_cpu_llc_id(cpu2))
 		return false;
 
 	/*
@@ -640,13 +654,13 @@ static void __init build_sched_topology(void)
 	};
 #endif
 	/*
-	 * When there is NUMA topology inside the package skip the DIE domain
+	 * When there is NUMA topology inside the package skip the PKG domain
 	 * since the NUMA domains will auto-magically create the right spanning
 	 * domains based on the SLIT.
 	 */
 	if (!x86_has_numa_in_package) {
 		x86_topology[i++] = (struct sched_domain_topology_level){
-			cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(DIE)
+			cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(PKG)
 		};
 	}
 
@@ -809,7 +823,7 @@ static void __init smp_quirk_init_udelay(void)
 /*
  * Wake up AP by INIT, INIT, STARTUP sequence.
  */
-static void send_init_sequence(int phys_apicid)
+static void send_init_sequence(u32 phys_apicid)
 {
 	int maxlvt = lapic_get_maxlvt();
 
@@ -835,7 +849,7 @@ static void send_init_sequence(int phys_apicid)
 /*
  * Wake up AP by INIT, INIT, STARTUP sequence.
  */
-static int wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
+static int wakeup_secondary_cpu_via_init(u32 phys_apicid, unsigned long start_eip)
 {
 	unsigned long send_status = 0, accept_status = 0;
 	int num_starts, j, maxlvt;
@@ -982,7 +996,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
  * Returns zero if startup was successfully sent, else error code from
  * ->wakeup_secondary_cpu.
  */
-static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
+static int do_boot_cpu(u32 apicid, int cpu, struct task_struct *idle)
 {
 	unsigned long start_ip = real_mode_header->trampoline_start;
 	int ret;
@@ -1050,7 +1064,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 
 int native_kick_ap(unsigned int cpu, struct task_struct *tidle)
 {
-	int apicid = apic->cpu_present_to_apicid(cpu);
+	u32 apicid = apic->cpu_present_to_apicid(cpu);
 	int err;
 
 	lockdep_assert_irqs_enabled();
@@ -1405,7 +1419,7 @@ static void remove_siblinginfo(int cpu)
 	cpumask_clear(topology_sibling_cpumask(cpu));
 	cpumask_clear(topology_core_cpumask(cpu));
 	cpumask_clear(topology_die_cpumask(cpu));
-	c->cpu_core_id = 0;
+	c->topo.core_id = 0;
 	c->booted_cores = 0;
 	cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
 	recompute_smt_state();
@@ -1596,8 +1610,15 @@ void __noreturn hlt_play_dead(void)
 		native_halt();
 }
 
+/*
+ * native_play_dead() is essentially a __noreturn function, but it can't
+ * be marked as such because the compiler may complain about it.
+ */
 void native_play_dead(void)
 {
+	if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
+		__update_spec_ctrl(0);
+
 	play_dead_common();
 	tboot_shutdown(TB_SHUTDOWN_WFS);
 
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index bbc440c93e08..1123ef3ccf90 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -15,6 +15,7 @@
  * ( The serial nature of the boot logic and the CPU hotplug lock
  *   protects against more than 2 CPUs entering this code. )
  */
+#include <linux/workqueue.h>
 #include <linux/topology.h>
 #include <linux/spinlock.h>
 #include <linux/kernel.h>
@@ -342,6 +343,13 @@ static inline unsigned int loop_timeout(int cpu)
 	return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20;
 }
 
+static void tsc_sync_mark_tsc_unstable(struct work_struct *work)
+{
+	mark_tsc_unstable("check_tsc_sync_source failed");	/* work callback for tsc_sync_work; @work is unused */
+}
+
+static DECLARE_WORK(tsc_sync_work, tsc_sync_mark_tsc_unstable);
+
 /*
  * The freshly booted CPU initiates this via an async SMP function call.
  */
@@ -395,7 +403,7 @@ retry:
 			"turning off TSC clock.\n", max_warp);
 		if (random_warps)
 			pr_warn("TSC warped randomly between CPUs\n");
-		mark_tsc_unstable("check_tsc_sync_source failed");
+		schedule_work(&tsc_sync_work);
 	}
 
 	/*
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 7e574cf3bf8a..d00c28aaa5be 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -85,7 +85,7 @@ static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table,
 {
 	int *first = ip_table;
 	int *last = ip_table + num_entries - 1;
-	int *mid = first, *found = first;
+	int *mid, *found = first;
 
 	if (!num_entries)
 		return NULL;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index f15fb71f280e..54a5596adaa6 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -139,10 +139,7 @@ SECTIONS
 		STATIC_CALL_TEXT
 
 		ALIGN_ENTRY_TEXT_BEGIN
-#ifdef CONFIG_CPU_SRSO
 		*(.text..__x86.rethunk_untrain)
-#endif
-
 		ENTRY_TEXT
 
 #ifdef CONFIG_CPU_SRSO
@@ -520,12 +517,12 @@ INIT_PER_CPU(irq_stack_backing_store);
            "fixed_percpu_data is not at start of per-cpu area");
 #endif
 
-#ifdef CONFIG_RETHUNK
+#ifdef CONFIG_CPU_UNRET_ENTRY
 . = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned");
-. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned");
 #endif
 
 #ifdef CONFIG_CPU_SRSO
+. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned");
 /*
  * GNU ld cannot do XOR until 2.41.
  * https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=f6f78318fca803c4907fb8d7f6ded8295f1947b1
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 65e96b76c423..d3fc01770558 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -127,7 +127,7 @@ static void __init vsmp_cap_cpus(void)
 #endif
 }
 
-static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
+static u32 apicid_phys_pkg_id(u32 initial_apic_id, int index_msb)
 {
 	return read_apic_id() >> index_msb;
 }
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index beea99c8e8e0..ded1d80d72cb 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -531,8 +531,6 @@ static bool __kvm_is_svm_supported(void)
 	int cpu = smp_processor_id();
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 
-	u64 vm_cr;
-
 	if (c->x86_vendor != X86_VENDOR_AMD &&
 	    c->x86_vendor != X86_VENDOR_HYGON) {
 		pr_err("CPU %d isn't AMD or Hygon\n", cpu);
@@ -549,12 +547,6 @@ static bool __kvm_is_svm_supported(void)
 		return false;
 	}
 
-	rdmsrl(MSR_VM_CR, vm_cr);
-	if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) {
-		pr_err("SVM disabled (by BIOS) in MSR_VM_CR on CPU %d\n", cpu);
-		return false;
-	}
-
 	return true;
 }
 
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index 23318c338db0..68f7fa3e1322 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -21,10 +21,10 @@
  *                   converted to pure assembler
  */
 
+#include <linux/export.h>
 #include <linux/linkage.h>
 #include <asm/errno.h>
 #include <asm/asm.h>
-#include <asm/export.h>
 #include <asm/nospec-branch.h>
 
 /*
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index f74a3e704a1c..2760a15fbc00 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
+#include <linux/export.h>
 #include <linux/linkage.h>
 #include <asm/asm.h>
-#include <asm/export.h>
 
 /*
  * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S
index 49805257b125..873e4ef23e49 100644
--- a/arch/x86/lib/cmpxchg8b_emu.S
+++ b/arch/x86/lib/cmpxchg8b_emu.S
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 
+#include <linux/export.h>
 #include <linux/linkage.h>
-#include <asm/export.h>
 #include <asm/percpu.h>
 #include <asm/processor-flags.h>
 
diff --git a/arch/x86/lib/copy_mc.c b/arch/x86/lib/copy_mc.c
index 80efd45a7761..6e8b7e600def 100644
--- a/arch/x86/lib/copy_mc.c
+++ b/arch/x86/lib/copy_mc.c
@@ -70,23 +70,23 @@ unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, unsigne
 }
 EXPORT_SYMBOL_GPL(copy_mc_to_kernel);
 
-unsigned long __must_check copy_mc_to_user(void *dst, const void *src, unsigned len)
+unsigned long __must_check copy_mc_to_user(void __user *dst, const void *src, unsigned len)
 {
 	unsigned long ret;
 
 	if (copy_mc_fragile_enabled) {
 		__uaccess_begin();
-		ret = copy_mc_fragile(dst, src, len);
+		ret = copy_mc_fragile((__force void *)dst, src, len);
 		__uaccess_end();
 		return ret;
 	}
 
 	if (static_cpu_has(X86_FEATURE_ERMS)) {
 		__uaccess_begin();
-		ret = copy_mc_enhanced_fast_string(dst, src, len);
+		ret = copy_mc_enhanced_fast_string((__force void *)dst, src, len);
 		__uaccess_end();
 		return ret;
 	}
 
-	return copy_user_generic(dst, src, len);
+	return copy_user_generic((__force void *)dst, src, len);
 }
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 30ea644bf446..d6ae793d08fa 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -1,10 +1,10 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
 
+#include <linux/export.h>
 #include <linux/linkage.h>
 #include <asm/cpufeatures.h>
 #include <asm/alternative.h>
-#include <asm/export.h>
 
 /*
  * Some CPUs run faster using the string copy instructions (sane microcode).
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 0a81aafed7f8..fc9fb5d06174 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -6,11 +6,11 @@
  * Functions to copy from and to user space.
  */
 
+#include <linux/export.h>
 #include <linux/linkage.h>
 #include <asm/cpufeatures.h>
 #include <asm/alternative.h>
 #include <asm/asm.h>
-#include <asm/export.h>
 
 /*
  * rep_movs_alternative - memory copy with exception handling.
diff --git a/arch/x86/lib/copy_user_uncached_64.S b/arch/x86/lib/copy_user_uncached_64.S
index 5c5f38d32672..2918e36eece2 100644
--- a/arch/x86/lib/copy_user_uncached_64.S
+++ b/arch/x86/lib/copy_user_uncached_64.S
@@ -3,9 +3,9 @@
  * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
  */
 
+#include <linux/export.h>
 #include <linux/linkage.h>
 #include <asm/asm.h>
-#include <asm/export.h>
 
 /*
  * copy_user_nocache - Uncached memory copy with exception handling
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
index 145f9a0bde29..f4df4d241526 100644
--- a/arch/x86/lib/csum-wrappers_64.c
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -14,8 +14,6 @@
  * @src: source address (user space)
  * @dst: destination address
  * @len: number of bytes to be copied.
- * @isum: initial sum that is added into the result (32bit unfolded)
- * @errp: set to -EFAULT for an bad source address.
  *
  * Returns an 32bit unfolded checksum of the buffer.
  * src and dst are best aligned to 64bits.
@@ -38,8 +36,6 @@ csum_and_copy_from_user(const void __user *src, void *dst, int len)
  * @src: source address
  * @dst: destination address (user space)
  * @len: number of bytes to be copied.
- * @isum: initial sum that is added into the result (32bit unfolded)
- * @errp: set to -EFAULT for an bad destination address.
  *
  * Returns an 32bit unfolded checksum of the buffer.
  * src and dst are best aligned to 64bits.
@@ -62,7 +58,6 @@ csum_and_copy_to_user(const void *src, void __user *dst, int len)
  * @src: source address
  * @dst: destination address
  * @len: number of bytes to be copied.
- * @sum: initial sum that is added into the result (32bit unfolded)
  *
  * Returns an 32bit unfolded checksum of the buffer.
  */
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index 9c63713477bb..20ef350a60fb 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -26,6 +26,7 @@
  * as they get called from within inline assembly.
  */
 
+#include <linux/export.h>
 #include <linux/linkage.h>
 #include <asm/page_types.h>
 #include <asm/errno.h>
@@ -33,7 +34,6 @@
 #include <asm/thread_info.h>
 #include <asm/asm.h>
 #include <asm/smap.h>
-#include <asm/export.h>
 
 #define ASM_BARRIER_NOSPEC ALTERNATIVE "", "lfence", X86_FEATURE_LFENCE_RDTSC
 
diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S
index 12c16c6aa44a..774bdf3e6f0a 100644
--- a/arch/x86/lib/hweight.S
+++ b/arch/x86/lib/hweight.S
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/export.h>
 #include <linux/linkage.h>
-#include <asm/export.h>
 
 #include <asm/asm.h>
 
@@ -36,8 +36,12 @@ SYM_FUNC_START(__sw_hweight32)
 SYM_FUNC_END(__sw_hweight32)
 EXPORT_SYMBOL(__sw_hweight32)
 
-SYM_FUNC_START(__sw_hweight64)
+/*
+ * No 32-bit variant, because it's implemented as an inline wrapper
+ * on top of __arch_hweight32():
+ */
 #ifdef CONFIG_X86_64
+SYM_FUNC_START(__sw_hweight64)
 	pushq   %rdi
 	pushq   %rdx
 
@@ -66,18 +70,6 @@ SYM_FUNC_START(__sw_hweight64)
 	popq    %rdx
 	popq    %rdi
 	RET
-#else /* CONFIG_X86_32 */
-	/* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */
-	pushl   %ecx
-
-	call    __sw_hweight32
-	movl    %eax, %ecx                      # stash away result
-	movl    %edx, %eax                      # second part of input
-	call    __sw_hweight32
-	addl    %ecx, %eax                      # result
-
-	popl    %ecx
-	RET
-#endif
 SYM_FUNC_END(__sw_hweight64)
 EXPORT_SYMBOL(__sw_hweight64)
+#endif
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 76697df8dfd5..0ae2e1712e2e 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,12 +1,12 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /* Copyright 2002 Andi Kleen */
 
+#include <linux/export.h>
 #include <linux/linkage.h>
 #include <linux/cfi_types.h>
 #include <asm/errno.h>
 #include <asm/cpufeatures.h>
 #include <asm/alternative.h>
-#include <asm/export.h>
 
 .section .noinstr.text, "ax"
 
diff --git a/arch/x86/lib/memmove_32.S b/arch/x86/lib/memmove_32.S
index 0588b2c0fc95..35010ba3dd6f 100644
--- a/arch/x86/lib/memmove_32.S
+++ b/arch/x86/lib/memmove_32.S
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
+#include <linux/export.h>
 #include <linux/linkage.h>
-#include <asm/export.h>
 
 SYM_FUNC_START(memmove)
 /*
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index ccdf3a597045..1b60ae81ecd8 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -6,10 +6,10 @@
  * This assembly file is re-written from memmove_64.c file.
  *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
  */
+#include <linux/export.h>
 #include <linux/linkage.h>
 #include <asm/cpufeatures.h>
 #include <asm/alternative.h>
-#include <asm/export.h>
 
 #undef memmove
 
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 3d818b849ec6..0199d56cb479 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -1,10 +1,10 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright 2002 Andi Kleen, SuSE Labs */
 
+#include <linux/export.h>
 #include <linux/linkage.h>
 #include <asm/cpufeatures.h>
 #include <asm/alternative.h>
-#include <asm/export.h>
 
 .section .noinstr.text, "ax"
 
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index 235bbda6fc82..2877f5934177 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -11,13 +11,12 @@
  * return an error value in addition to the "real"
  * return value.
  */
+#include <linux/export.h>
 #include <linux/linkage.h>
 #include <asm/thread_info.h>
 #include <asm/errno.h>
 #include <asm/asm.h>
 #include <asm/smap.h>
-#include <asm/export.h>
-
 
 /*
  * __put_user_X
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index cd86aeb5fdd3..7b2589877d06 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -1,12 +1,12 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
+#include <linux/export.h>
 #include <linux/stringify.h>
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
 #include <asm/cpufeatures.h>
 #include <asm/alternative.h>
 #include <asm/asm-offsets.h>
-#include <asm/export.h>
 #include <asm/nospec-branch.h>
 #include <asm/unwind_hints.h>
 #include <asm/percpu.h>
@@ -126,11 +126,19 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array)
 #include <asm/GEN-for-each-reg.h>
 #undef GEN
 #endif
+
+#ifdef CONFIG_RETHUNK
+
 /*
- * This function name is magical and is used by -mfunction-return=thunk-extern
- * for the compiler to generate JMPs to it.
+ * Be careful here: that label cannot really be removed because in
+ * some configurations and toolchains, the JMP __x86_return_thunk the
+ * compiler issues is either a short one or the compiler doesn't use
+ * relocations for same-section JMPs and that breaks the returns
+ * detection logic in apply_returns() and in objtool.
  */
-#ifdef CONFIG_RETHUNK
+	.section .text..__x86.return_thunk
+
+#ifdef CONFIG_CPU_SRSO
 
 /*
  * srso_alias_untrain_ret() and srso_alias_safe_ret() are placed at
@@ -147,29 +155,18 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array)
  *
  * As a result, srso_alias_safe_ret() becomes a safe return.
  */
-#ifdef CONFIG_CPU_SRSO
-	.section .text..__x86.rethunk_untrain
-
-SYM_START(srso_alias_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
+	.pushsection .text..__x86.rethunk_untrain
+SYM_CODE_START_NOALIGN(srso_alias_untrain_ret)
 	UNWIND_HINT_FUNC
 	ANNOTATE_NOENDBR
 	ASM_NOP2
 	lfence
 	jmp srso_alias_return_thunk
 SYM_FUNC_END(srso_alias_untrain_ret)
-__EXPORT_THUNK(srso_alias_untrain_ret)
-
-	.section .text..__x86.rethunk_safe
-#else
-/* dummy definition for alternatives */
-SYM_START(srso_alias_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
-	ANNOTATE_UNRET_SAFE
-	ret
-	int3
-SYM_FUNC_END(srso_alias_untrain_ret)
-#endif
+	.popsection
 
-SYM_START(srso_alias_safe_ret, SYM_L_GLOBAL, SYM_A_NONE)
+	.pushsection .text..__x86.rethunk_safe
+SYM_CODE_START_NOALIGN(srso_alias_safe_ret)
 	lea 8(%_ASM_SP), %_ASM_SP
 	UNWIND_HINT_FUNC
 	ANNOTATE_UNRET_SAFE
@@ -177,14 +174,63 @@ SYM_START(srso_alias_safe_ret, SYM_L_GLOBAL, SYM_A_NONE)
 	int3
 SYM_FUNC_END(srso_alias_safe_ret)
 
-	.section .text..__x86.return_thunk
-
-SYM_CODE_START(srso_alias_return_thunk)
+SYM_CODE_START_NOALIGN(srso_alias_return_thunk)
 	UNWIND_HINT_FUNC
 	ANNOTATE_NOENDBR
 	call srso_alias_safe_ret
 	ud2
 SYM_CODE_END(srso_alias_return_thunk)
+	.popsection
+
+/*
+ * SRSO untraining sequence for Zen1/2, similar to retbleed_untrain_ret()
+ * below. On kernel entry, srso_untrain_ret() is executed which is a
+ *
+ * movabs $0xccccc30824648d48,%rax
+ *
+ * and when the return thunk executes the inner label srso_safe_ret()
+ * later, it is a stack manipulation and a RET which is mispredicted and
+ * thus a "safe" one to use.
+ */
+	.align 64
+	.skip 64 - (srso_safe_ret - srso_untrain_ret), 0xcc
+SYM_CODE_START_LOCAL_NOALIGN(srso_untrain_ret)
+	ANNOTATE_NOENDBR
+	.byte 0x48, 0xb8
+
+/*
+ * This forces the function return instruction to speculate into a trap
+ * (UD2 in srso_return_thunk() below).  This RET will then mispredict
+ * and execution will continue at the return site read from the top of
+ * the stack.
+ */
+SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLOBAL)
+	lea 8(%_ASM_SP), %_ASM_SP
+	ret
+	int3
+	int3
+	/* end of movabs */
+	lfence
+	call srso_safe_ret
+	ud2
+SYM_CODE_END(srso_safe_ret)
+SYM_FUNC_END(srso_untrain_ret)
+
+SYM_CODE_START(srso_return_thunk)
+	UNWIND_HINT_FUNC
+	ANNOTATE_NOENDBR
+	call srso_safe_ret
+	ud2
+SYM_CODE_END(srso_return_thunk)
+
+#define JMP_SRSO_UNTRAIN_RET "jmp srso_untrain_ret"
+#define JMP_SRSO_ALIAS_UNTRAIN_RET "jmp srso_alias_untrain_ret"
+#else /* !CONFIG_CPU_SRSO */
+#define JMP_SRSO_UNTRAIN_RET "ud2"
+#define JMP_SRSO_ALIAS_UNTRAIN_RET "ud2"
+#endif /* CONFIG_CPU_SRSO */
+
+#ifdef CONFIG_CPU_UNRET_ENTRY
 
 /*
  * Some generic notes on the untraining sequences:
@@ -216,7 +262,7 @@ SYM_CODE_END(srso_alias_return_thunk)
  */
 	.align 64
 	.skip 64 - (retbleed_return_thunk - retbleed_untrain_ret), 0xcc
-SYM_START(retbleed_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
+SYM_CODE_START_LOCAL_NOALIGN(retbleed_untrain_ret)
 	ANNOTATE_NOENDBR
 	/*
 	 * As executed from retbleed_untrain_ret, this is:
@@ -264,72 +310,27 @@ SYM_CODE_END(retbleed_return_thunk)
 	jmp retbleed_return_thunk
 	int3
 SYM_FUNC_END(retbleed_untrain_ret)
-__EXPORT_THUNK(retbleed_untrain_ret)
 
-/*
- * SRSO untraining sequence for Zen1/2, similar to retbleed_untrain_ret()
- * above. On kernel entry, srso_untrain_ret() is executed which is a
- *
- * movabs $0xccccc30824648d48,%rax
- *
- * and when the return thunk executes the inner label srso_safe_ret()
- * later, it is a stack manipulation and a RET which is mispredicted and
- * thus a "safe" one to use.
- */
-	.align 64
-	.skip 64 - (srso_safe_ret - srso_untrain_ret), 0xcc
-SYM_START(srso_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
-	ANNOTATE_NOENDBR
-	.byte 0x48, 0xb8
+#define JMP_RETBLEED_UNTRAIN_RET "jmp retbleed_untrain_ret"
+#else /* !CONFIG_CPU_UNRET_ENTRY */
+#define JMP_RETBLEED_UNTRAIN_RET "ud2"
+#endif /* CONFIG_CPU_UNRET_ENTRY */
 
-/*
- * This forces the function return instruction to speculate into a trap
- * (UD2 in srso_return_thunk() below).  This RET will then mispredict
- * and execution will continue at the return site read from the top of
- * the stack.
- */
-SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLOBAL)
-	lea 8(%_ASM_SP), %_ASM_SP
-	ret
-	int3
-	int3
-	/* end of movabs */
-	lfence
-	call srso_safe_ret
-	ud2
-SYM_CODE_END(srso_safe_ret)
-SYM_FUNC_END(srso_untrain_ret)
-__EXPORT_THUNK(srso_untrain_ret)
-
-SYM_CODE_START(srso_return_thunk)
-	UNWIND_HINT_FUNC
-	ANNOTATE_NOENDBR
-	call srso_safe_ret
-	ud2
-SYM_CODE_END(srso_return_thunk)
+#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)
 
 SYM_FUNC_START(entry_untrain_ret)
-	ALTERNATIVE_2 "jmp retbleed_untrain_ret", \
-		      "jmp srso_untrain_ret", X86_FEATURE_SRSO, \
-		      "jmp srso_alias_untrain_ret", X86_FEATURE_SRSO_ALIAS
+	ALTERNATIVE_2 JMP_RETBLEED_UNTRAIN_RET,				\
+		      JMP_SRSO_UNTRAIN_RET, X86_FEATURE_SRSO,		\
+		      JMP_SRSO_ALIAS_UNTRAIN_RET, X86_FEATURE_SRSO_ALIAS
 SYM_FUNC_END(entry_untrain_ret)
 __EXPORT_THUNK(entry_untrain_ret)
 
-SYM_CODE_START(__x86_return_thunk)
-	UNWIND_HINT_FUNC
-	ANNOTATE_NOENDBR
-	ANNOTATE_UNRET_SAFE
-	ret
-	int3
-SYM_CODE_END(__x86_return_thunk)
-EXPORT_SYMBOL(__x86_return_thunk)
-
-#endif /* CONFIG_RETHUNK */
+#endif /* CONFIG_CPU_UNRET_ENTRY || CONFIG_CPU_SRSO */
 
 #ifdef CONFIG_CALL_DEPTH_TRACKING
 
 	.align 64
-SYM_FUNC_START(__x86_return_skl)
+SYM_FUNC_START(call_depth_return_thunk)
 	ANNOTATE_NOENDBR
 	/*
 	 * Keep the hotpath in a 16byte I-fetch for the non-debug
@@ -356,6 +357,33 @@ SYM_FUNC_START(__x86_return_skl)
 	ANNOTATE_UNRET_SAFE
 	ret
 	int3
-SYM_FUNC_END(__x86_return_skl)
+SYM_FUNC_END(call_depth_return_thunk)
 
 #endif /* CONFIG_CALL_DEPTH_TRACKING */
+
+/*
+ * This function name is magical and is used by -mfunction-return=thunk-extern
+ * for the compiler to generate JMPs to it.
+ *
+ * This code is only used during kernel boot or module init.  All
+ * 'JMP __x86_return_thunk' sites are changed to something else by
+ * apply_returns().
+ *
+ * This should be converted eventually to call a warning function which
+ * should scream loudly when the default return thunk is called after
+ * alternatives have been applied.
+ *
+ * That warning function cannot BUG() because the bug splat cannot be
+ * displayed in all possible configurations, leading to users not really
+ * knowing why the machine froze.
+ */
+SYM_CODE_START(__x86_return_thunk)
+	UNWIND_HINT_FUNC
+	ANNOTATE_NOENDBR
+	ANNOTATE_UNRET_SAFE
+	ret
+	int3
+SYM_CODE_END(__x86_return_thunk)
+EXPORT_SYMBOL(__x86_return_thunk)
+
+#endif /* CONFIG_RETHUNK */
diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c
index 5a53c2cc169c..6993f026adec 100644
--- a/arch/x86/mm/maccess.c
+++ b/arch/x86/mm/maccess.c
@@ -9,12 +9,21 @@ bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
 	unsigned long vaddr = (unsigned long)unsafe_src;
 
 	/*
-	 * Range covering the highest possible canonical userspace address
-	 * as well as non-canonical address range. For the canonical range
-	 * we also need to include the userspace guard page.
+	 * Do not allow userspace addresses.  This disallows
+	 * normal userspace and the userspace guard page:
 	 */
-	return vaddr >= TASK_SIZE_MAX + PAGE_SIZE &&
-	       __is_canonical_address(vaddr, boot_cpu_data.x86_virt_bits);
+	if (vaddr < TASK_SIZE_MAX + PAGE_SIZE)
+		return false;
+
+	/*
+	 * Allow everything during early boot before 'x86_virt_bits'
+	 * is initialized.  Needed for instruction decoding in early
+	 * exception handlers.
+	 */
+	if (!boot_cpu_data.x86_virt_bits)
+		return true;
+
+	return __is_canonical_address(vaddr, boot_cpu_data.x86_virt_bits);
 }
 #else
 bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 9f27e14e185f..c290c55b632b 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -12,6 +12,7 @@
 #include <linux/swiotlb.h>
 #include <linux/cc_platform.h>
 #include <linux/mem_encrypt.h>
+#include <linux/virtio_anchor.h>
 
 /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
 bool force_dma_unencrypted(struct device *dev)
@@ -86,3 +87,36 @@ void __init mem_encrypt_init(void)
 
 	print_mem_encrypt_feature_info();
 }
+
+void __init mem_encrypt_setup_arch(void)
+{
+	phys_addr_t total_mem = memblock_phys_mem_size();
+	unsigned long size;
+
+	if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+		return;
+
+	/*
+	 * For SEV and TDX, all DMA has to occur via shared/unencrypted pages.
+	 * The kernel uses SWIOTLB to make this happen without changing device
+	 * drivers. However, depending on the workload being run, the
+	 * default 64MB of SWIOTLB may not be enough and SWIOTLB may
+	 * run out of buffers for DMA, resulting in I/O errors and/or
+	 * performance degradation, especially with high I/O workloads.
+	 *
+	 * Adjust the default size of SWIOTLB using a percentage of guest
+	 * memory for SWIOTLB buffers. Also, as the SWIOTLB bounce buffer
+	 * memory is allocated from low memory, ensure that the adjusted size
+	 * is within the limits of low available memory.
+	 *
+	 * The 6% of guest memory used here approximates the static
+	 * adjustment: 64MB for <1G, and ~128M to 256M for 1G-to-4G. The
+	 * result is clamped to [IO_TLB_DEFAULT_SIZE, SZ_1G] below.
+	 */
+	size = total_mem * 6 / 100;
+	size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
+	swiotlb_adjust_size(size);
+
+	/* Set restricted memory access for virtio. */
+	virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
+}
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index 6faea41e99b6..a68f2dda0948 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -19,8 +19,6 @@
 #include <linux/kernel.h>
 #include <linux/bitops.h>
 #include <linux/dma-mapping.h>
-#include <linux/virtio_config.h>
-#include <linux/virtio_anchor.h>
 #include <linux/cc_platform.h>
 
 #include <asm/tlbflush.h>
@@ -215,40 +213,6 @@ void __init sme_map_bootdata(char *real_mode_data)
 	__sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
 }
 
-void __init sev_setup_arch(void)
-{
-	phys_addr_t total_mem = memblock_phys_mem_size();
-	unsigned long size;
-
-	if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
-		return;
-
-	/*
-	 * For SEV, all DMA has to occur via shared/unencrypted pages.
-	 * SEV uses SWIOTLB to make this happen without changing device
-	 * drivers. However, depending on the workload being run, the
-	 * default 64MB of SWIOTLB may not be enough and SWIOTLB may
-	 * run out of buffers for DMA, resulting in I/O errors and/or
-	 * performance degradation especially with high I/O workloads.
-	 *
-	 * Adjust the default size of SWIOTLB for SEV guests using
-	 * a percentage of guest memory for SWIOTLB buffers.
-	 * Also, as the SWIOTLB bounce buffer memory is allocated
-	 * from low memory, ensure that the adjusted size is within
-	 * the limits of low available memory.
-	 *
-	 * The percentage of guest memory used here for SWIOTLB buffers
-	 * is more of an approximation of the static adjustment which
-	 * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6%
-	 */
-	size = total_mem * 6 / 100;
-	size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
-	swiotlb_adjust_size(size);
-
-	/* Set restricted memory access for virtio. */
-	virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
-}
-
 static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot)
 {
 	unsigned long pfn = 0;
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 2aadb2019b4f..b29ceb19e46e 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -3,6 +3,7 @@
 #include <linux/acpi.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/of.h>
 #include <linux/string.h>
 #include <linux/init.h>
 #include <linux/memblock.h>
@@ -11,6 +12,7 @@
 #include <linux/nodemask.h>
 #include <linux/sched.h>
 #include <linux/topology.h>
+#include <linux/sort.h>
 
 #include <asm/e820/api.h>
 #include <asm/proto.h>
@@ -56,7 +58,7 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] = {
 
 int numa_cpu_node(int cpu)
 {
-	int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+	u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
 
 	if (apicid != BAD_APICID)
 		return __apicid_to_node[apicid];
@@ -601,13 +603,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
 		if (start >= end)
 			continue;
 
-		/*
-		 * Don't confuse VM with a node that doesn't have the
-		 * minimum amount of memory:
-		 */
-		if (end && (end - start) < NODE_MIN_SIZE)
-			continue;
-
 		alloc_node_data(nid);
 	}
 
@@ -733,6 +728,8 @@ void __init x86_numa_init(void)
 		if (!numa_init(amd_numa_init))
 			return;
 #endif
+		if (acpi_disabled && !numa_init(of_numa_init))
+			return;
 	}
 
 	numa_init(dummy_numa_init);
@@ -786,7 +783,7 @@ void __init init_gi_nodes(void)
 void __init init_cpu_to_node(void)
 {
 	int cpu;
-	u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
+	u32 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
 
 	BUG_ON(cpu_to_apicid == NULL);
 
@@ -961,4 +958,83 @@ int memory_add_physaddr_to_nid(u64 start)
 	return nid;
 }
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+
+static int __init cmp_memblk(const void *a, const void *b)
+{
+	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
+	const struct numa_memblk *mb = *(const struct numa_memblk **)b;
+	/* Don't subtract: a u64 delta truncated to int can mis-sort. */
+	return (ma->start > mb->start) - (ma->start < mb->start);
+}
+
+static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;
+
+/**
+ * numa_fill_memblks - Fill gaps in numa_meminfo memblks
+ * @start: address to begin fill
+ * @end: address to end fill (exclusive)
+ *
+ * Find and extend numa_meminfo memblks to cover the @start-@end
+ * physical address range, such that the first memblk includes
+ * @start, the last memblk extends to @end, and any gaps in between
+ * are filled.
+ *
+ * RETURNS:
+ * 0		  : Success
+ * NUMA_NO_MEMBLK : No memblk overlaps the @start-@end range
+ */
+
+int __init numa_fill_memblks(u64 start, u64 end)
+{
+	struct numa_memblk **blk = &numa_memblk_list[0];
+	struct numa_meminfo *mi = &numa_meminfo;
+	int count = 0;
+	u64 prev_end;
+
+	/*
+	 * Create a list of pointers to numa_meminfo memblks that
+	 * overlap start, end. The ranges are half-open: end addresses
+	 * in both a CFMWS range and a memblk range are exclusive, so
+	 * neither (start == bi->end) nor (end == bi->start) overlaps.
+	 *
+	 * This list of pointers is used to make in-place changes
+	 * that fill out the numa_meminfo memblks.
+	 */
+	for (int i = 0; i < mi->nr_blks; i++) {
+		struct numa_memblk *bi = &mi->blk[i];
+
+		if (start < bi->end && end > bi->start) {
+			blk[count] = &mi->blk[i];
+			count++;
+		}
+	}
+	if (!count)
+		return NUMA_NO_MEMBLK;
+
+	/* Sort the list of pointers in memblk->start order */
+	sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);
+
+	/* Make sure the first/last memblks include start/end */
+	blk[0]->start = min(blk[0]->start, start);
+	blk[count - 1]->end = max(blk[count - 1]->end, end);
+
+	/*
+	 * Fill any gaps by tracking the previous memblks
+	 * end address and backfilling to it if needed.
+	 */
+	prev_end = blk[0]->end;
+	for (int i = 1; i < count; i++) {
+		struct numa_memblk *curr = blk[i];
+
+		if (prev_end >= curr->start) {
+			if (prev_end < curr->end)
+				prev_end = curr->end;
+		} else {
+			curr->start = prev_end;
+			prev_end = curr->end;
+		}
+	}
+	return 0;
+}
+
 #endif
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 78414c6d1b5e..5dd733944629 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -69,6 +69,7 @@ static void __init pti_print_if_secure(const char *reason)
 		pr_info("%s\n", reason);
 }
 
+/* Assume mode is auto unless overridden via cmdline below. */
 static enum pti_mode {
 	PTI_AUTO = 0,
 	PTI_FORCE_OFF,
@@ -77,50 +78,55 @@ static enum pti_mode {
 
 void __init pti_check_boottime_disable(void)
 {
-	char arg[5];
-	int ret;
-
-	/* Assume mode is auto unless overridden. */
-	pti_mode = PTI_AUTO;
-
 	if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
 		pti_mode = PTI_FORCE_OFF;
 		pti_print_if_insecure("disabled on XEN PV.");
 		return;
 	}
 
-	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
-	if (ret > 0)  {
-		if (ret == 3 && !strncmp(arg, "off", 3)) {
-			pti_mode = PTI_FORCE_OFF;
-			pti_print_if_insecure("disabled on command line.");
-			return;
-		}
-		if (ret == 2 && !strncmp(arg, "on", 2)) {
-			pti_mode = PTI_FORCE_ON;
-			pti_print_if_secure("force enabled on command line.");
-			goto enable;
-		}
-		if (ret == 4 && !strncmp(arg, "auto", 4)) {
-			pti_mode = PTI_AUTO;
-			goto autosel;
-		}
-	}
-
-	if (cmdline_find_option_bool(boot_command_line, "nopti") ||
-	    cpu_mitigations_off()) {
+	if (cpu_mitigations_off())
 		pti_mode = PTI_FORCE_OFF;
+	if (pti_mode == PTI_FORCE_OFF) {
 		pti_print_if_insecure("disabled on command line.");
 		return;
 	}
 
-autosel:
-	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+	if (pti_mode == PTI_FORCE_ON)
+		pti_print_if_secure("force enabled on command line.");
+
+	if (pti_mode == PTI_AUTO && !boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
 		return;
-enable:
+
 	setup_force_cpu_cap(X86_FEATURE_PTI);
 }
 
+/* Parse "pti=": accepts "off", "on" or "auto"; anything else is -EINVAL. */
+static int __init pti_parse_cmdline(char *arg)
+{
+	/* A bare "pti" (no "=value") arrives with arg == NULL. */
+	if (!arg)
+		return -EINVAL;
+
+	if (!strcmp(arg, "off"))
+		pti_mode = PTI_FORCE_OFF;
+	else if (!strcmp(arg, "on"))
+		pti_mode = PTI_FORCE_ON;
+	else if (!strcmp(arg, "auto"))
+		pti_mode = PTI_AUTO;
+	else
+		return -EINVAL;
+	return 0;
+}
+early_param("pti", pti_parse_cmdline);
+
+/* Parse "nopti": force PTI off; the argument is ignored. */
+static int __init pti_parse_cmdline_nopti(char *arg)
+{
+	pti_mode = PTI_FORCE_OFF;
+	return 0;
+}
+early_param("nopti", pti_parse_cmdline_nopti);
+
 pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
 {
 	/*
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index 45d0c17ce77c..e03207de2880 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -17,6 +17,7 @@
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/slab.h>
+#include <linux/string.h>
 #include <linux/clocksource.h>
 
 #include <asm/apic.h>
@@ -178,49 +179,56 @@ module_param_named(debug, uv_nmi_debug, int, 0644);
 	} while (0)
 
 /* Valid NMI Actions */
-#define	ACTION_LEN	16
-static struct nmi_action {
-	char	*action;
-	char	*desc;
-} valid_acts[] = {
-	{	"kdump",	"do kernel crash dump"			},
-	{	"dump",		"dump process stack for each cpu"	},
-	{	"ips",		"dump Inst Ptr info for each cpu"	},
-	{	"kdb",		"enter KDB (needs kgdboc= assignment)"	},
-	{	"kgdb",		"enter KGDB (needs gdb target remote)"	},
-	{	"health",	"check if CPUs respond to NMI"		},
+enum action_t {
+	nmi_act_kdump,
+	nmi_act_dump,
+	nmi_act_ips,
+	nmi_act_kdb,
+	nmi_act_kgdb,
+	nmi_act_health,
+	nmi_act_max
 };
-typedef char action_t[ACTION_LEN];
-static action_t uv_nmi_action = { "dump" };
+
+static const char * const actions[nmi_act_max] = {
+	[nmi_act_kdump] = "kdump",
+	[nmi_act_dump] = "dump",
+	[nmi_act_ips] = "ips",
+	[nmi_act_kdb] = "kdb",
+	[nmi_act_kgdb] = "kgdb",
+	[nmi_act_health] = "health",
+};
+
+static const char * const actions_desc[nmi_act_max] = {
+	[nmi_act_kdump] = "do kernel crash dump",
+	[nmi_act_dump] = "dump process stack for each cpu",
+	[nmi_act_ips] = "dump Inst Ptr info for each cpu",
+	[nmi_act_kdb] = "enter KDB (needs kgdboc= assignment)",
+	[nmi_act_kgdb] = "enter KGDB (needs gdb target remote)",
+	[nmi_act_health] = "check if CPUs respond to NMI",
+};
+
+static enum action_t uv_nmi_action = nmi_act_dump;
 
 static int param_get_action(char *buffer, const struct kernel_param *kp)
 {
-	return sprintf(buffer, "%s\n", uv_nmi_action);
+	return sprintf(buffer, "%s\n", actions[uv_nmi_action]);
 }
 
 static int param_set_action(const char *val, const struct kernel_param *kp)
 {
-	int i;
-	int n = ARRAY_SIZE(valid_acts);
-	char arg[ACTION_LEN];
-
-	/* (remove possible '\n') */
-	strscpy(arg, val, strnchrnul(val, sizeof(arg)-1, '\n') - val + 1);
-
-	for (i = 0; i < n; i++)
-		if (!strcmp(arg, valid_acts[i].action))
-			break;
+	int i, n = ARRAY_SIZE(actions);
 
-	if (i < n) {
-		strscpy(uv_nmi_action, arg, sizeof(uv_nmi_action));
-		pr_info("UV: New NMI action:%s\n", uv_nmi_action);
+	i = sysfs_match_string(actions, val);
+	if (i >= 0) {
+		uv_nmi_action = i;
+		pr_info("UV: New NMI action:%s\n", actions[i]);
 		return 0;
 	}
 
-	pr_err("UV: Invalid NMI action:%s, valid actions are:\n", arg);
+	pr_err("UV: Invalid NMI action. Valid actions are:\n");
 	for (i = 0; i < n; i++)
-		pr_err("UV: %-8s - %s\n",
-			valid_acts[i].action, valid_acts[i].desc);
+		pr_err("UV: %-8s - %s\n", actions[i], actions_desc[i]);
+
 	return -EINVAL;
 }
 
@@ -228,15 +236,10 @@ static const struct kernel_param_ops param_ops_action = {
 	.get = param_get_action,
 	.set = param_set_action,
 };
-#define param_check_action(name, p) __param_check(name, p, action_t)
+#define param_check_action(name, p) __param_check(name, p, enum action_t)
 
 module_param_named(action, uv_nmi_action, action, 0644);
 
-static inline bool uv_nmi_action_is(const char *action)
-{
-	return (strncmp(uv_nmi_action, action, strlen(action)) == 0);
-}
-
 /* Setup which NMI support is present in system */
 static void uv_nmi_setup_mmrs(void)
 {
@@ -727,10 +730,10 @@ static void uv_nmi_dump_state_cpu(int cpu, struct pt_regs *regs)
 	if (cpu == 0)
 		uv_nmi_dump_cpu_ip_hdr();
 
-	if (current->pid != 0 || !uv_nmi_action_is("ips"))
+	if (current->pid != 0 || uv_nmi_action != nmi_act_ips)
 		uv_nmi_dump_cpu_ip(cpu, regs);
 
-	if (uv_nmi_action_is("dump")) {
+	if (uv_nmi_action == nmi_act_dump) {
 		pr_info("UV:%sNMI process trace for CPU %d\n", dots, cpu);
 		show_regs(regs);
 	}
@@ -798,7 +801,7 @@ static void uv_nmi_dump_state(int cpu, struct pt_regs *regs, int master)
 		int saved_console_loglevel = console_loglevel;
 
 		pr_alert("UV: tracing %s for %d CPUs from CPU %d\n",
-			uv_nmi_action_is("ips") ? "IPs" : "processes",
+			uv_nmi_action == nmi_act_ips ? "IPs" : "processes",
 			atomic_read(&uv_nmi_cpus_in_nmi), cpu);
 
 		console_loglevel = uv_nmi_loglevel;
@@ -874,7 +877,7 @@ static inline int uv_nmi_kdb_reason(void)
 static inline int uv_nmi_kdb_reason(void)
 {
 	/* Ensure user is expecting to attach gdb remote */
-	if (uv_nmi_action_is("kgdb"))
+	if (uv_nmi_action == nmi_act_kgdb)
 		return 0;
 
 	pr_err("UV: NMI error: KDB is not enabled in this kernel\n");
@@ -950,28 +953,35 @@ static int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
 	master = (atomic_read(&uv_nmi_cpu) == cpu);
 
 	/* If NMI action is "kdump", then attempt to do it */
-	if (uv_nmi_action_is("kdump")) {
+	if (uv_nmi_action == nmi_act_kdump) {
 		uv_nmi_kdump(cpu, master, regs);
 
 		/* Unexpected return, revert action to "dump" */
 		if (master)
-			strscpy(uv_nmi_action, "dump", sizeof(uv_nmi_action));
+			uv_nmi_action = nmi_act_dump;
 	}
 
 	/* Pause as all CPU's enter the NMI handler */
 	uv_nmi_wait(master);
 
 	/* Process actions other than "kdump": */
-	if (uv_nmi_action_is("health")) {
+	switch (uv_nmi_action) {
+	case nmi_act_health:
 		uv_nmi_action_health(cpu, regs, master);
-	} else if (uv_nmi_action_is("ips") || uv_nmi_action_is("dump")) {
+		break;
+	case nmi_act_ips:
+	case nmi_act_dump:
 		uv_nmi_dump_state(cpu, regs, master);
-	} else if (uv_nmi_action_is("kdb") || uv_nmi_action_is("kgdb")) {
+		break;
+	case nmi_act_kdb:
+	case nmi_act_kgdb:
 		uv_call_kgdb_kdb(cpu, regs, master);
-	} else {
+		break;
+	default:
 		if (master)
-			pr_alert("UV: unknown NMI action: %s\n", uv_nmi_action);
+			pr_alert("UV: unknown NMI action: %d\n", uv_nmi_action);
 		uv_nmi_sync_exit(master);
+		break;
 	}
 
 	/* Clear per_cpu "in_nmi" flag */
diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
index 54663f3e00cb..ff5afc8a5a41 100644
--- a/arch/x86/platform/uv/uv_time.c
+++ b/arch/x86/platform/uv/uv_time.c
@@ -53,7 +53,7 @@ struct uv_rtc_timer_head {
 	struct {
 		int	lcpu;		/* systemwide logical cpu number */
 		u64	expires;	/* next timer expiration for this cpu */
-	} cpu[];
+	} cpu[] __counted_by(ncpus);
 };
 
 /*
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 7ad91225fdf4..9dd5490b3318 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -33,13 +33,13 @@ static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
 	return 0xfd;
 }
 
-static u32 xen_set_apic_id(unsigned int x)
+static u32 xen_set_apic_id(u32 x)
 {
 	WARN_ON(1);
 	return x;
 }
 
-static unsigned int xen_get_apic_id(unsigned long x)
+static u32 xen_get_apic_id(u32 x)
 {
 	return ((x)>>24) & 0xFFu;
 }
@@ -110,15 +110,15 @@ static int xen_madt_oem_check(char *oem_id, char *oem_table_id)
 	return xen_pv_domain();
 }
 
-static int xen_phys_pkg_id(int initial_apic_id, int index_msb)
+static u32 xen_phys_pkg_id(u32 initial_apic_id, int index_msb)
 {
 	return initial_apic_id >> index_msb;
 }
 
-static int xen_cpu_present_to_apicid(int cpu)
+static u32 xen_cpu_present_to_apicid(int cpu)
 {
 	if (cpu_present(cpu))
-		return cpu_data(cpu).apicid;
+		return cpu_data(cpu).topo.apicid;
 	else
 		return BAD_APICID;
 }
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index fc1a4f3c81d9..dd71ecce8b86 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -423,3 +423,6 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/block/bdev.c b/block/bdev.c
index f3b13aa1b7d4..2018d250e131 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -829,6 +829,28 @@ put_blkdev:
 }
 EXPORT_SYMBOL(blkdev_get_by_dev);
 
+struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
+				     const struct blk_holder_ops *hops)
+{
+	struct bdev_handle *handle = kmalloc(sizeof(*handle), GFP_KERNEL);
+	struct block_device *bdev;
+
+	if (!handle)
+		return ERR_PTR(-ENOMEM);
+	bdev = blkdev_get_by_dev(dev, mode, holder, hops);
+	if (IS_ERR(bdev)) {
+		kfree(handle);
+		return ERR_CAST(bdev);
+	}
+	handle->bdev = bdev;
+	handle->holder = holder;
+	if (holder)
+		mode |= BLK_OPEN_EXCL;
+	handle->mode = mode;
+	return handle;
+}
+EXPORT_SYMBOL(bdev_open_by_dev);
+
 /**
  * blkdev_get_by_path - open a block device by name
  * @path: path to the block device to open
@@ -867,6 +889,28 @@ struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
 }
 EXPORT_SYMBOL(blkdev_get_by_path);
 
+struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
+		void *holder, const struct blk_holder_ops *hops)
+{
+	struct bdev_handle *handle;
+	dev_t dev;
+	int error;
+
+	error = lookup_bdev(path, &dev);
+	if (error)
+		return ERR_PTR(error);
+
+	handle = bdev_open_by_dev(dev, mode, holder, hops);
+	if (!IS_ERR(handle) && (mode & BLK_OPEN_WRITE) &&
+	    bdev_read_only(handle->bdev)) {
+		bdev_release(handle);
+		return ERR_PTR(-EACCES);
+	}
+
+	return handle;
+}
+EXPORT_SYMBOL(bdev_open_by_path);
+
 void blkdev_put(struct block_device *bdev, void *holder)
 {
 	struct gendisk *disk = bdev->bd_disk;
@@ -903,6 +947,13 @@ void blkdev_put(struct block_device *bdev, void *holder)
 }
 EXPORT_SYMBOL(blkdev_put);
 
+void bdev_release(struct bdev_handle *handle)
+{
+	blkdev_put(handle->bdev, handle->holder);
+	kfree(handle);
+}
+EXPORT_SYMBOL(bdev_release);
+
 /**
  * lookup_bdev() - Look up a struct block_device by name.
  * @pathname: Name of the block device in the filesystem.
@@ -961,20 +1012,20 @@ void bdev_mark_dead(struct block_device *bdev, bool surprise)
 	mutex_lock(&bdev->bd_holder_lock);
 	if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead)
 		bdev->bd_holder_ops->mark_dead(bdev, surprise);
-	else
+	else {
+		mutex_unlock(&bdev->bd_holder_lock);
 		sync_blockdev(bdev);
-	mutex_unlock(&bdev->bd_holder_lock);
+	}
 
 	invalidate_bdev(bdev);
 }
-#ifdef CONFIG_DASD_MODULE
 /*
- * Drivers should not use this directly, but the DASD driver has historically
- * had a shutdown to offline mode that doesn't actually remove the gendisk
- * that otherwise looks a lot like a safe device removal.
+ * New drivers should not use this directly.  There are some drivers however
+ * that need this for historical reasons. For example, the DASD driver has
+ * historically had a shutdown to offline mode that doesn't actually remove the
+ * gendisk that otherwise looks a lot like a safe device removal.
  */
 EXPORT_SYMBOL_GPL(bdev_mark_dead);
-#endif
 
 void sync_bdevs(bool wait)
 {
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 38a881cf97d0..13e4377a8b28 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -723,6 +723,12 @@ static unsigned int calculate_io_allowed(u32 iops_limit,
 
 static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed)
 {
+	/*
+	 * Can result be wider than 64 bits?
+	 * We check against 62, not 64, due to ilog2 truncation.
+	 */
+	if (ilog2(bps_limit) + ilog2(jiffy_elapsed) - ilog2(HZ) > 62)
+		return U64_MAX;
 	return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ);
 }
 
diff --git a/block/disk-events.c b/block/disk-events.c
index 13c3372c465a..2f697224386a 100644
--- a/block/disk-events.c
+++ b/block/disk-events.c
@@ -266,11 +266,8 @@ static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
  * disk_check_media_change - check if a removable media has been changed
  * @disk: gendisk to check
  *
- * Check whether a removable media has been changed, and attempt to free all
- * dentries and inodes and invalidates all block device page cache entries in
- * that case.
- *
- * Returns %true if the media has changed, or %false if not.
+ * Returns %true and marks the disk for a partition rescan if a removable
+ * media has been changed, and %false if the media did not change.
  */
 bool disk_check_media_change(struct gendisk *disk)
 {
@@ -278,12 +275,11 @@ bool disk_check_media_change(struct gendisk *disk)
 
 	events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
 				   DISK_EVENT_EJECT_REQUEST);
-	if (!(events & DISK_EVENT_MEDIA_CHANGE))
-		return false;
-
-	bdev_mark_dead(disk->part0, true);
-	set_bit(GD_NEED_PART_SCAN, &disk->state);
-	return true;
+	if (events & DISK_EVENT_MEDIA_CHANGE) {
+		set_bit(GD_NEED_PART_SCAN, &disk->state);
+		return true;
+	}
+	return false;
 }
 EXPORT_SYMBOL(disk_check_media_change);
 
diff --git a/block/fops.c b/block/fops.c
index 73e42742543f..0abaac705daf 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -542,15 +542,31 @@ static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
 	return error;
 }
 
+/**
+ * file_to_blk_mode - get block open flags from file flags
+ * @file: file whose open flags should be converted
+ *
+ * Look at file open flags and generate corresponding block open flags from
+ * them. The function works both for a file just being opened (e.g. during
+ * the ->open callback) and for a file that is already open. This is actually
+ * non-trivial (see comment in the function).
+ */
 blk_mode_t file_to_blk_mode(struct file *file)
 {
 	blk_mode_t mode = 0;
+	struct bdev_handle *handle = file->private_data;
 
 	if (file->f_mode & FMODE_READ)
 		mode |= BLK_OPEN_READ;
 	if (file->f_mode & FMODE_WRITE)
 		mode |= BLK_OPEN_WRITE;
-	if (file->private_data)
+	/*
+	 * do_dentry_open() clears O_EXCL from f_flags, use handle->mode to
+	 * determine whether the open was exclusive for already open files.
+	 */
+	if (handle)
+		mode |= handle->mode & BLK_OPEN_EXCL;
+	else if (file->f_flags & O_EXCL)
 		mode |= BLK_OPEN_EXCL;
 	if (file->f_flags & O_NDELAY)
 		mode |= BLK_OPEN_NDELAY;
@@ -568,7 +584,8 @@ blk_mode_t file_to_blk_mode(struct file *file)
 
 static int blkdev_open(struct inode *inode, struct file *filp)
 {
-	struct block_device *bdev;
+	struct bdev_handle *handle;
+	blk_mode_t mode;
 
 	/*
 	 * Preserve backwards compatibility and allow large file access
@@ -579,29 +596,24 @@ static int blkdev_open(struct inode *inode, struct file *filp)
 	filp->f_flags |= O_LARGEFILE;
 	filp->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT;
 
-	/*
-	 * Use the file private data to store the holder for exclusive openes.
-	 * file_to_blk_mode relies on it being present to set BLK_OPEN_EXCL.
-	 */
-	if (filp->f_flags & O_EXCL)
-		filp->private_data = filp;
-
-	bdev = blkdev_get_by_dev(inode->i_rdev, file_to_blk_mode(filp),
-				 filp->private_data, NULL);
-	if (IS_ERR(bdev))
-		return PTR_ERR(bdev);
+	mode = file_to_blk_mode(filp);
+	handle = bdev_open_by_dev(inode->i_rdev, mode,
+			mode & BLK_OPEN_EXCL ? filp : NULL, NULL);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
 
-	if (bdev_nowait(bdev))
+	if (bdev_nowait(handle->bdev))
 		filp->f_mode |= FMODE_NOWAIT;
 
-	filp->f_mapping = bdev->bd_inode->i_mapping;
+	filp->f_mapping = handle->bdev->bd_inode->i_mapping;
 	filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
+	filp->private_data = handle;
 	return 0;
 }
 
 static int blkdev_release(struct inode *inode, struct file *filp)
 {
-	blkdev_put(I_BDEV(filp->f_mapping->host), filp->private_data);
+	bdev_release(filp->private_data);
 	return 0;
 }
 
diff --git a/block/genhd.c b/block/genhd.c
index cc32a0c704eb..c9d06f72c587 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -342,7 +342,7 @@ EXPORT_SYMBOL_GPL(disk_uevent);
 
 int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
 {
-	struct block_device *bdev;
+	struct bdev_handle *handle;
 	int ret = 0;
 
 	if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN))
@@ -366,12 +366,12 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
 	}
 
 	set_bit(GD_NEED_PART_SCAN, &disk->state);
-	bdev = blkdev_get_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, NULL,
-				 NULL);
-	if (IS_ERR(bdev))
-		ret =  PTR_ERR(bdev);
+	handle = bdev_open_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, NULL,
+				  NULL);
+	if (IS_ERR(handle))
+		ret = PTR_ERR(handle);
 	else
-		blkdev_put(bdev, NULL);
+		bdev_release(handle);
 
 	/*
 	 * If blkdev_get_by_dev() failed early, GD_NEED_PART_SCAN is still set,
@@ -559,6 +559,13 @@ static void blk_report_disk_dead(struct gendisk *disk, bool surprise)
 	struct block_device *bdev;
 	unsigned long idx;
 
+	/*
+	 * On surprise disk removal, bdev_mark_dead() may call into file
+	 * systems below. Make it clear that we're expecting to not hold
+	 * disk->open_mutex.
+	 */
+	lockdep_assert_not_held(&disk->open_mutex);
+
 	rcu_read_lock();
 	xa_for_each(&disk->part_tbl, idx, bdev) {
 		if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
diff --git a/block/ioctl.c b/block/ioctl.c
index d5f5cd61efd7..4160f4e6bd5b 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -370,9 +370,10 @@ static int blkdev_flushbuf(struct block_device *bdev, unsigned cmd,
 	mutex_lock(&bdev->bd_holder_lock);
 	if (bdev->bd_holder_ops && bdev->bd_holder_ops->sync)
 		bdev->bd_holder_ops->sync(bdev);
-	else
+	else {
+		mutex_unlock(&bdev->bd_holder_lock);
 		sync_blockdev(bdev);
-	mutex_unlock(&bdev->bd_holder_lock);
+	}
 
 	invalidate_bdev(bdev);
 	return 0;
@@ -467,6 +468,7 @@ static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode,
 		int __user *argp)
 {
 	int ret, n;
+	struct bdev_handle *handle;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
@@ -478,10 +480,11 @@ static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode,
 	if (mode & BLK_OPEN_EXCL)
 		return set_blocksize(bdev, n);
 
-	if (IS_ERR(blkdev_get_by_dev(bdev->bd_dev, mode, &bdev, NULL)))
+	handle = bdev_open_by_dev(bdev->bd_dev, mode, &bdev, NULL);
+	if (IS_ERR(handle))
 		return -EBUSY;
 	ret = set_blocksize(bdev, n);
-	blkdev_put(bdev, &bdev);
+	bdev_release(handle);
 
 	return ret;
 }
diff --git a/block/partitions/core.c b/block/partitions/core.c
index e137a87f4db0..f47ffcfdfcec 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -274,17 +274,6 @@ void drop_partition(struct block_device *part)
 	put_device(&part->bd_device);
 }
 
-static void delete_partition(struct block_device *part)
-{
-	/*
-	 * Remove the block device from the inode hash, so that it cannot be
-	 * looked up any more even when openers still hold references.
-	 */
-	remove_inode_hash(part->bd_inode);
-	bdev_mark_dead(part, false);
-	drop_partition(part);
-}
-
 static ssize_t whole_disk_show(struct device *dev,
 			       struct device_attribute *attr, char *buf)
 {
@@ -485,7 +474,18 @@ int bdev_del_partition(struct gendisk *disk, int partno)
 	if (atomic_read(&part->bd_openers))
 		goto out_unlock;
 
-	delete_partition(part);
+	/*
+	 * We verified that @part->bd_openers is zero above and so
+	 * @part->bd_holder{_ops} can't be set. And since we hold
+	 * @disk->open_mutex the device can't be claimed by anyone.
+	 *
+	 * So no need to call @part->bd_holder_ops->mark_dead() here.
+	 * Just delete the partition and invalidate it.
+	 */
+
+	remove_inode_hash(part->bd_inode);
+	invalidate_bdev(part);
+	drop_partition(part);
 	ret = 0;
 out_unlock:
 	mutex_unlock(&disk->open_mutex);
@@ -663,8 +663,23 @@ rescan:
 	sync_blockdev(disk->part0);
 	invalidate_bdev(disk->part0);
 
-	xa_for_each_start(&disk->part_tbl, idx, part, 1)
-		delete_partition(part);
+	xa_for_each_start(&disk->part_tbl, idx, part, 1) {
+		/*
+		 * Remove the block device from the inode hash, so that
+		 * it cannot be looked up any more even when openers
+		 * still hold references.
+		 */
+		remove_inode_hash(part->bd_inode);
+
+		/*
+		 * If @disk->open_partitions isn't elevated but there's
+		 * still an active holder of that block device things
+		 * are broken.
+		 */
+		WARN_ON_ONCE(atomic_read(&part->bd_openers));
+		invalidate_bdev(part);
+		drop_partition(part);
+	}
 	clear_bit(GD_NEED_PART_SCAN, &disk->state);
 
 	/*
diff --git a/drivers/accel/ivpu/ivpu_hw_37xx.c b/drivers/accel/ivpu/ivpu_hw_37xx.c
index 976019429164..18be8b98e9a8 100644
--- a/drivers/accel/ivpu/ivpu_hw_37xx.c
+++ b/drivers/accel/ivpu/ivpu_hw_37xx.c
@@ -940,9 +940,6 @@ static u32 ivpu_hw_37xx_irqb_handler(struct ivpu_device *vdev, int irq)
 	if (status == 0)
 		return 0;
 
-	/* Disable global interrupt before handling local buttress interrupts */
-	REGB_WR32(VPU_37XX_BUTTRESS_GLOBAL_INT_MASK, 0x1);
-
 	if (REG_TEST_FLD(VPU_37XX_BUTTRESS_INTERRUPT_STAT, FREQ_CHANGE, status))
 		ivpu_dbg(vdev, IRQ, "FREQ_CHANGE irq: %08x",
 			 REGB_RD32(VPU_37XX_BUTTRESS_CURRENT_PLL));
@@ -974,9 +971,6 @@ static u32 ivpu_hw_37xx_irqb_handler(struct ivpu_device *vdev, int irq)
 	else
 		REGB_WR32(VPU_37XX_BUTTRESS_INTERRUPT_STAT, status);
 
-	/* Re-enable global interrupt */
-	REGB_WR32(VPU_37XX_BUTTRESS_GLOBAL_INT_MASK, 0x0);
-
 	if (schedule_recovery)
 		ivpu_pm_schedule_recovery(vdev);
 
@@ -988,9 +982,14 @@ static irqreturn_t ivpu_hw_37xx_irq_handler(int irq, void *ptr)
 	struct ivpu_device *vdev = ptr;
 	u32 ret_irqv, ret_irqb;
 
+	REGB_WR32(VPU_37XX_BUTTRESS_GLOBAL_INT_MASK, 0x1);
+
 	ret_irqv = ivpu_hw_37xx_irqv_handler(vdev, irq);
 	ret_irqb = ivpu_hw_37xx_irqb_handler(vdev, irq);
 
+	/* Re-enable global interrupts to re-trigger MSI for pending interrupts */
+	REGB_WR32(VPU_37XX_BUTTRESS_GLOBAL_INT_MASK, 0x0);
+
 	return IRQ_RETVAL(ret_irqb | ret_irqv);
 }
 
diff --git a/drivers/accel/ivpu/ivpu_job.h b/drivers/accel/ivpu/ivpu_job.h
index aa1f0b9479b0..5514c2d8a609 100644
--- a/drivers/accel/ivpu/ivpu_job.h
+++ b/drivers/accel/ivpu/ivpu_job.h
@@ -51,7 +51,7 @@ struct ivpu_job {
 	u32 job_id;
 	u32 engine_idx;
 	size_t bo_count;
-	struct ivpu_bo *bos[];
+	struct ivpu_bo *bos[] __counted_by(bo_count);
 };
 
 int ivpu_submit_ioctl(struct drm_device *dev, void *data, struct drm_file *file);
diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
index 1f4fc5f8a819..12f330b0eac0 100644
--- a/drivers/acpi/numa/srat.c
+++ b/drivers/acpi/numa/srat.c
@@ -310,11 +310,16 @@ static int __init acpi_parse_cfmws(union acpi_subtable_headers *header,
 	start = cfmws->base_hpa;
 	end = cfmws->base_hpa + cfmws->window_size;
 
-	/* Skip if the SRAT already described the NUMA details for this HPA */
-	node = phys_to_target_node(start);
-	if (node != NUMA_NO_NODE)
+	/*
+	 * The SRAT may have already described NUMA details for all,
+	 * or a portion of, this CFMWS HPA range. Extend the memblks
+	 * found for any portion of the window to cover the entire
+	 * window.
+	 */
+	if (!numa_fill_memblks(start, end))
 		return 0;
 
+	/* No SRAT description. Create a new node. */
 	node = acpi_map_pxm_to_node(*fake_pxm);
 
 	if (node == NUMA_NO_NODE) {
diff --git a/drivers/android/binderfs.c b/drivers/android/binderfs.c
index 81effec17b3d..420dc9cbf774 100644
--- a/drivers/android/binderfs.c
+++ b/drivers/android/binderfs.c
@@ -152,7 +152,7 @@ static int binderfs_binder_device_create(struct inode *ref_inode,
 		goto err;
 
 	inode->i_ino = minor + INODE_OFFSET;
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	init_special_inode(inode, S_IFCHR | 0600,
 			   MKDEV(MAJOR(binderfs_dev), minor));
 	inode->i_fop = &binder_fops;
@@ -431,7 +431,7 @@ static int binderfs_binder_ctl_create(struct super_block *sb)
 	}
 
 	inode->i_ino = SECOND_INODE;
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	init_special_inode(inode, S_IFCHR | 0600,
 			   MKDEV(MAJOR(binderfs_dev), minor));
 	inode->i_fop = &binder_ctl_fops;
@@ -473,7 +473,7 @@ static struct inode *binderfs_make_inode(struct super_block *sb, int mode)
 	if (ret) {
 		ret->i_ino = iunique(sb, BINDERFS_MAX_MINOR + INODE_OFFSET);
 		ret->i_mode = mode;
-		ret->i_atime = ret->i_mtime = inode_set_ctime_current(ret);
+		simple_inode_init_ts(ret);
 	}
 	return ret;
 }
@@ -702,7 +702,7 @@ static int binderfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	inode->i_ino = FIRST_INODE;
 	inode->i_fop = &simple_dir_operations;
 	inode->i_mode = S_IFDIR | 0755;
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	inode->i_op = &binderfs_dir_inode_operations;
 	set_nlink(inode, 2);
 
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index a371b497035e..3a957c4da409 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -1053,10 +1053,11 @@ int ata_scsi_dev_config(struct scsi_device *sdev, struct ata_device *dev)
 
 		/*
 		 * Ask the sd driver to issue START STOP UNIT on runtime suspend
-		 * and resume only. For system level suspend/resume, devices
-		 * power state is handled directly by libata EH.
+		 * and resume and shutdown only. For system level suspend/resume,
+		 * devices power state is handled directly by libata EH.
 		 */
 		sdev->manage_runtime_start_stop = true;
+		sdev->manage_shutdown = true;
 	}
 
 	/*
diff --git a/drivers/auxdisplay/panel.c b/drivers/auxdisplay/panel.c
index eba04c0de7eb..e20d35bdf5fe 100644
--- a/drivers/auxdisplay/panel.c
+++ b/drivers/auxdisplay/panel.c
@@ -1449,10 +1449,9 @@ static struct logical_input *panel_bind_key(const char *name, const char *press,
 	key->rise_time = 1;
 	key->fall_time = 1;
 
-	strncpy(key->u.kbd.press_str, press, sizeof(key->u.kbd.press_str));
-	strncpy(key->u.kbd.repeat_str, repeat, sizeof(key->u.kbd.repeat_str));
-	strncpy(key->u.kbd.release_str, release,
-		sizeof(key->u.kbd.release_str));
+	strtomem_pad(key->u.kbd.press_str, press, '\0');
+	strtomem_pad(key->u.kbd.repeat_str, repeat, '\0');
+	strtomem_pad(key->u.kbd.release_str, release, '\0');
 	list_add(&key->list, &logical_inputs);
 	return key;
 }
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index cd738cab725f..50949207798d 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1760,8 +1760,10 @@ static int fd_locked_ioctl(struct block_device *bdev, blk_mode_t mode,
 		/* invalidate the buffer track to force a reread */
 		BufferDrive = -1;
 		set_bit(drive, &fake_change);
-		if (disk_check_media_change(disk))
+		if (disk_check_media_change(disk)) {
+			bdev_mark_dead(disk->part0, true);
 			floppy_revalidate(disk);
+		}
 		return 0;
 	default:
 		return -EINVAL;
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index a30a5ed811be..c21e3732759e 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -524,7 +524,9 @@ struct drbd_md {
 
 struct drbd_backing_dev {
 	struct block_device *backing_bdev;
+	struct bdev_handle *backing_bdev_handle;
 	struct block_device *md_bdev;
+	struct bdev_handle *md_bdev_handle;
 	struct drbd_md md;
 	struct disk_conf *disk_conf; /* RCU, for updates: resource->conf_update */
 	sector_t known_size; /* last known size of that backing device */
@@ -553,7 +555,7 @@ struct fifo_buffer {
 	unsigned int head_index;
 	unsigned int size;
 	int total; /* sum of all values */
-	int values[];
+	int values[] __counted_by(size);
 };
 extern struct fifo_buffer *fifo_alloc(unsigned int fifo_size);
 
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index d3538bd83fb3..43747a1aae43 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -82,7 +82,7 @@ static atomic_t notify_genl_seq = ATOMIC_INIT(2); /* two. */
 
 DEFINE_MUTEX(notification_mutex);
 
-/* used blkdev_get_by_path, to claim our meta data device(s) */
+/* holder passed to bdev_open_by_path() to claim our meta data device(s) */
 static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
 
 static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
@@ -1635,43 +1635,45 @@ success:
 	return 0;
 }
 
-static struct block_device *open_backing_dev(struct drbd_device *device,
+static struct bdev_handle *open_backing_dev(struct drbd_device *device,
 		const char *bdev_path, void *claim_ptr, bool do_bd_link)
 {
-	struct block_device *bdev;
+	struct bdev_handle *handle;
 	int err = 0;
 
-	bdev = blkdev_get_by_path(bdev_path, BLK_OPEN_READ | BLK_OPEN_WRITE,
-				  claim_ptr, NULL);
-	if (IS_ERR(bdev)) {
+	handle = bdev_open_by_path(bdev_path, BLK_OPEN_READ | BLK_OPEN_WRITE,
+				   claim_ptr, NULL);
+	if (IS_ERR(handle)) {
 		drbd_err(device, "open(\"%s\") failed with %ld\n",
-				bdev_path, PTR_ERR(bdev));
-		return bdev;
+				bdev_path, PTR_ERR(handle));
+		return handle;
 	}
 
 	if (!do_bd_link)
-		return bdev;
+		return handle;
 
-	err = bd_link_disk_holder(bdev, device->vdisk);
+	err = bd_link_disk_holder(handle->bdev, device->vdisk);
 	if (err) {
-		blkdev_put(bdev, claim_ptr);
+		bdev_release(handle);
 		drbd_err(device, "bd_link_disk_holder(\"%s\", ...) failed with %d\n",
 				bdev_path, err);
-		bdev = ERR_PTR(err);
+		handle = ERR_PTR(err);
 	}
-	return bdev;
+	return handle;
 }
 
 static int open_backing_devices(struct drbd_device *device,
 		struct disk_conf *new_disk_conf,
 		struct drbd_backing_dev *nbc)
 {
-	struct block_device *bdev;
+	struct bdev_handle *handle;
 
-	bdev = open_backing_dev(device, new_disk_conf->backing_dev, device, true);
-	if (IS_ERR(bdev))
+	handle = open_backing_dev(device, new_disk_conf->backing_dev, device,
+				  true);
+	if (IS_ERR(handle))
 		return ERR_OPEN_DISK;
-	nbc->backing_bdev = bdev;
+	nbc->backing_bdev = handle->bdev;
+	nbc->backing_bdev_handle = handle;
 
 	/*
 	 * meta_dev_idx >= 0: external fixed size, possibly multiple
@@ -1681,7 +1683,7 @@ static int open_backing_devices(struct drbd_device *device,
 	 * should check it for you already; but if you don't, or
 	 * someone fooled it, we need to double check here)
 	 */
-	bdev = open_backing_dev(device, new_disk_conf->meta_dev,
+	handle = open_backing_dev(device, new_disk_conf->meta_dev,
 		/* claim ptr: device, if claimed exclusively; shared drbd_m_holder,
 		 * if potentially shared with other drbd minors */
 			(new_disk_conf->meta_dev_idx < 0) ? (void*)device : (void*)drbd_m_holder,
@@ -1689,20 +1691,21 @@ static int open_backing_devices(struct drbd_device *device,
 		 * as would happen with internal metadata. */
 			(new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_FLEX_INT &&
 			 new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_INTERNAL));
-	if (IS_ERR(bdev))
+	if (IS_ERR(handle))
 		return ERR_OPEN_MD_DISK;
-	nbc->md_bdev = bdev;
+	nbc->md_bdev = handle->bdev;
+	nbc->md_bdev_handle = handle;
 	return NO_ERROR;
 }
 
-static void close_backing_dev(struct drbd_device *device, struct block_device *bdev,
-		void *claim_ptr, bool do_bd_unlink)
+static void close_backing_dev(struct drbd_device *device,
+		struct bdev_handle *handle, bool do_bd_unlink)
 {
-	if (!bdev)
+	if (!handle)
 		return;
 	if (do_bd_unlink)
-		bd_unlink_disk_holder(bdev, device->vdisk);
-	blkdev_put(bdev, claim_ptr);
+		bd_unlink_disk_holder(handle->bdev, device->vdisk);
+	bdev_release(handle);
 }
 
 void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev)
@@ -1710,11 +1713,9 @@ void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *
 	if (ldev == NULL)
 		return;
 
-	close_backing_dev(device, ldev->md_bdev,
-			  ldev->md.meta_dev_idx < 0 ?
-				(void *)device : (void *)drbd_m_holder,
+	close_backing_dev(device, ldev->md_bdev_handle,
 			  ldev->md_bdev != ldev->backing_bdev);
-	close_backing_dev(device, ldev->backing_bdev, device, true);
+	close_backing_dev(device, ldev->backing_bdev_handle, true);
 
 	kfree(ldev->disk_conf);
 	kfree(ldev);
@@ -2130,11 +2131,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
  fail:
 	conn_reconfig_done(connection);
 	if (nbc) {
-		close_backing_dev(device, nbc->md_bdev,
-			  nbc->disk_conf->meta_dev_idx < 0 ?
-				(void *)device : (void *)drbd_m_holder,
+		close_backing_dev(device, nbc->md_bdev_handle,
 			  nbc->md_bdev != nbc->backing_bdev);
-		close_backing_dev(device, nbc->backing_bdev, device, true);
+		close_backing_dev(device, nbc->backing_bdev_handle, true);
 		kfree(nbc);
 	}
 	kfree(new_disk_conf);
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index ea4eb88a2e45..11114a5d9e5c 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3215,8 +3215,10 @@ static int invalidate_drive(struct gendisk *disk)
 	/* invalidate the buffer track to force a reread */
 	set_bit((long)disk->private_data, &fake_change);
 	process_fd_request();
-	if (disk_check_media_change(disk))
+	if (disk_check_media_change(disk)) {
+		bdev_mark_dead(disk->part0, true);
 		floppy_revalidate(disk);
+	}
 	return 0;
 }
 
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index a1428538bda5..d56d972aadb3 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -340,8 +340,8 @@ static ssize_t device_map_show(const struct class *c, const struct class_attribu
 		n += sysfs_emit_at(data, n, "%s %u:%u %u:%u\n",
 			pd->disk->disk_name,
 			MAJOR(pd->pkt_dev), MINOR(pd->pkt_dev),
-			MAJOR(pd->bdev->bd_dev),
-			MINOR(pd->bdev->bd_dev));
+			MAJOR(pd->bdev_handle->bdev->bd_dev),
+			MINOR(pd->bdev_handle->bdev->bd_dev));
 	}
 	mutex_unlock(&ctl_mutex);
 	return n;
@@ -437,7 +437,8 @@ static int pkt_seq_show(struct seq_file *m, void *p)
 	char *msg;
 	int states[PACKET_NUM_STATES];
 
-	seq_printf(m, "Writer %s mapped to %pg:\n", pd->disk->disk_name, pd->bdev);
+	seq_printf(m, "Writer %s mapped to %pg:\n", pd->disk->disk_name,
+		   pd->bdev_handle->bdev);
 
 	seq_printf(m, "\nSettings:\n");
 	seq_printf(m, "\tpacket size:\t\t%dkB\n", pd->settings.size / 2);
@@ -714,7 +715,7 @@ static void pkt_rbtree_insert(struct pktcdvd_device *pd, struct pkt_rb_node *nod
  */
 static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *cgc)
 {
-	struct request_queue *q = bdev_get_queue(pd->bdev);
+	struct request_queue *q = bdev_get_queue(pd->bdev_handle->bdev);
 	struct scsi_cmnd *scmd;
 	struct request *rq;
 	int ret = 0;
@@ -1047,7 +1048,8 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt)
 			continue;
 
 		bio = pkt->r_bios[f];
-		bio_init(bio, pd->bdev, bio->bi_inline_vecs, 1, REQ_OP_READ);
+		bio_init(bio, pd->bdev_handle->bdev, bio->bi_inline_vecs, 1,
+			 REQ_OP_READ);
 		bio->bi_iter.bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9);
 		bio->bi_end_io = pkt_end_io_read;
 		bio->bi_private = pkt;
@@ -1262,8 +1264,8 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
 	struct device *ddev = disk_to_dev(pd->disk);
 	int f;
 
-	bio_init(pkt->w_bio, pd->bdev, pkt->w_bio->bi_inline_vecs, pkt->frames,
-		 REQ_OP_WRITE);
+	bio_init(pkt->w_bio, pd->bdev_handle->bdev, pkt->w_bio->bi_inline_vecs,
+		 pkt->frames, REQ_OP_WRITE);
 	pkt->w_bio->bi_iter.bi_sector = pkt->sector;
 	pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
 	pkt->w_bio->bi_private = pkt;
@@ -2160,18 +2162,20 @@ static int pkt_open_dev(struct pktcdvd_device *pd, bool write)
 	int ret;
 	long lba;
 	struct request_queue *q;
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 
 	/*
 	 * We need to re-open the cdrom device without O_NONBLOCK to be able
 	 * to read/write from/to it. It is already opened in O_NONBLOCK mode
 	 * so open should not fail.
 	 */
-	bdev = blkdev_get_by_dev(pd->bdev->bd_dev, BLK_OPEN_READ, pd, NULL);
-	if (IS_ERR(bdev)) {
-		ret = PTR_ERR(bdev);
+	bdev_handle = bdev_open_by_dev(pd->bdev_handle->bdev->bd_dev,
+				       BLK_OPEN_READ, pd, NULL);
+	if (IS_ERR(bdev_handle)) {
+		ret = PTR_ERR(bdev_handle);
 		goto out;
 	}
+	pd->open_bdev_handle = bdev_handle;
 
 	ret = pkt_get_last_written(pd, &lba);
 	if (ret) {
@@ -2180,9 +2184,9 @@ static int pkt_open_dev(struct pktcdvd_device *pd, bool write)
 	}
 
 	set_capacity(pd->disk, lba << 2);
-	set_capacity_and_notify(pd->bdev->bd_disk, lba << 2);
+	set_capacity_and_notify(pd->bdev_handle->bdev->bd_disk, lba << 2);
 
-	q = bdev_get_queue(pd->bdev);
+	q = bdev_get_queue(pd->bdev_handle->bdev);
 	if (write) {
 		ret = pkt_open_write(pd);
 		if (ret)
@@ -2214,7 +2218,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, bool write)
 	return 0;
 
 out_putdev:
-	blkdev_put(bdev, pd);
+	bdev_release(bdev_handle);
 out:
 	return ret;
 }
@@ -2233,7 +2237,8 @@ static void pkt_release_dev(struct pktcdvd_device *pd, int flush)
 	pkt_lock_door(pd, 0);
 
 	pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
-	blkdev_put(pd->bdev, pd);
+	bdev_release(pd->open_bdev_handle);
+	pd->open_bdev_handle = NULL;
 
 	pkt_shrink_pktlist(pd);
 }
@@ -2321,8 +2326,8 @@ static void pkt_end_io_read_cloned(struct bio *bio)
 
 static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio)
 {
-	struct bio *cloned_bio =
-		bio_alloc_clone(pd->bdev, bio, GFP_NOIO, &pkt_bio_set);
+	struct bio *cloned_bio = bio_alloc_clone(pd->bdev_handle->bdev, bio,
+		GFP_NOIO, &pkt_bio_set);
 	struct packet_stacked_data *psd = mempool_alloc(&psd_pool, GFP_NOIO);
 
 	psd->pd = pd;
@@ -2492,7 +2497,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
 {
 	struct device *ddev = disk_to_dev(pd->disk);
 	int i;
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	struct scsi_device *sdev;
 
 	if (pd->pkt_dev == dev) {
@@ -2503,8 +2508,9 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
 		struct pktcdvd_device *pd2 = pkt_devs[i];
 		if (!pd2)
 			continue;
-		if (pd2->bdev->bd_dev == dev) {
-			dev_err(ddev, "%pg already setup\n", pd2->bdev);
+		if (pd2->bdev_handle->bdev->bd_dev == dev) {
+			dev_err(ddev, "%pg already setup\n",
+				pd2->bdev_handle->bdev);
 			return -EBUSY;
 		}
 		if (pd2->pkt_dev == dev) {
@@ -2513,13 +2519,13 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
 		}
 	}
 
-	bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_NDELAY, NULL,
-				 NULL);
-	if (IS_ERR(bdev))
-		return PTR_ERR(bdev);
-	sdev = scsi_device_from_queue(bdev->bd_disk->queue);
+	bdev_handle = bdev_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_NDELAY,
+				       NULL, NULL);
+	if (IS_ERR(bdev_handle))
+		return PTR_ERR(bdev_handle);
+	sdev = scsi_device_from_queue(bdev_handle->bdev->bd_disk->queue);
 	if (!sdev) {
-		blkdev_put(bdev, NULL);
+		bdev_release(bdev_handle);
 		return -EINVAL;
 	}
 	put_device(&sdev->sdev_gendev);
@@ -2527,8 +2533,8 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
 	/* This is safe, since we have a reference from open(). */
 	__module_get(THIS_MODULE);
 
-	pd->bdev = bdev;
-	set_blocksize(bdev, CD_FRAMESIZE);
+	pd->bdev_handle = bdev_handle;
+	set_blocksize(bdev_handle->bdev, CD_FRAMESIZE);
 
 	pkt_init_queue(pd);
 
@@ -2540,11 +2546,11 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
 	}
 
 	proc_create_single_data(pd->disk->disk_name, 0, pkt_proc, pkt_seq_show, pd);
-	dev_notice(ddev, "writer mapped to %pg\n", bdev);
+	dev_notice(ddev, "writer mapped to %pg\n", bdev_handle->bdev);
 	return 0;
 
 out_mem:
-	blkdev_put(bdev, NULL);
+	bdev_release(bdev_handle);
 	/* This is safe: open() is still holding a reference. */
 	module_put(THIS_MODULE);
 	return -ENOMEM;
@@ -2599,9 +2605,9 @@ static unsigned int pkt_check_events(struct gendisk *disk,
 
 	if (!pd)
 		return 0;
-	if (!pd->bdev)
+	if (!pd->bdev_handle)
 		return 0;
-	attached_disk = pd->bdev->bd_disk;
+	attached_disk = pd->bdev_handle->bdev->bd_disk;
 	if (!attached_disk || !attached_disk->fops->check_events)
 		return 0;
 	return attached_disk->fops->check_events(attached_disk, clearing);
@@ -2686,7 +2692,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
 		goto out_mem2;
 
 	/* inherit events of the host device */
-	disk->events = pd->bdev->bd_disk->events;
+	disk->events = pd->bdev_handle->bdev->bd_disk->events;
 
 	ret = add_disk(disk);
 	if (ret)
@@ -2751,7 +2757,7 @@ static int pkt_remove_dev(dev_t pkt_dev)
 	pkt_debugfs_dev_remove(pd);
 	pkt_sysfs_dev_remove(pd);
 
-	blkdev_put(pd->bdev, NULL);
+	bdev_release(pd->bdev_handle);
 
 	remove_proc_entry(pd->disk->disk_name, pkt_proc);
 	dev_notice(ddev, "writer unmapped\n");
@@ -2778,7 +2784,7 @@ static void pkt_get_status(struct pkt_ctrl_command *ctrl_cmd)
 
 	pd = pkt_find_dev_from_minor(ctrl_cmd->dev_index);
 	if (pd) {
-		ctrl_cmd->dev = new_encode_dev(pd->bdev->bd_dev);
+		ctrl_cmd->dev = new_encode_dev(pd->bdev_handle->bdev->bd_dev);
 		ctrl_cmd->pkt_dev = new_encode_dev(pd->pkt_dev);
 	} else {
 		ctrl_cmd->dev = 0;
diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c
index c186df0ec641..65de51f3dfd9 100644
--- a/drivers/block/rnbd/rnbd-srv.c
+++ b/drivers/block/rnbd/rnbd-srv.c
@@ -145,7 +145,7 @@ static int process_rdma(struct rnbd_srv_session *srv_sess,
 	priv->sess_dev = sess_dev;
 	priv->id = id;
 
-	bio = bio_alloc(sess_dev->bdev, 1,
+	bio = bio_alloc(sess_dev->bdev_handle->bdev, 1,
 			rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL);
 	if (bio_add_page(bio, virt_to_page(data), datalen,
 			offset_in_page(data)) != datalen) {
@@ -219,7 +219,7 @@ void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id)
 	rnbd_put_sess_dev(sess_dev);
 	wait_for_completion(&dc); /* wait for inflights to drop to zero */
 
-	blkdev_put(sess_dev->bdev, NULL);
+	bdev_release(sess_dev->bdev_handle);
 	mutex_lock(&sess_dev->dev->lock);
 	list_del(&sess_dev->dev_list);
 	if (!sess_dev->readonly)
@@ -534,7 +534,7 @@ rnbd_srv_get_or_create_srv_dev(struct block_device *bdev,
 static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp,
 					struct rnbd_srv_sess_dev *sess_dev)
 {
-	struct block_device *bdev = sess_dev->bdev;
+	struct block_device *bdev = sess_dev->bdev_handle->bdev;
 
 	rsp->hdr.type = cpu_to_le16(RNBD_MSG_OPEN_RSP);
 	rsp->device_id = cpu_to_le32(sess_dev->device_id);
@@ -559,7 +559,7 @@ static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp,
 static struct rnbd_srv_sess_dev *
 rnbd_srv_create_set_sess_dev(struct rnbd_srv_session *srv_sess,
 			      const struct rnbd_msg_open *open_msg,
-			      struct block_device *bdev, bool readonly,
+			      struct bdev_handle *handle, bool readonly,
 			      struct rnbd_srv_dev *srv_dev)
 {
 	struct rnbd_srv_sess_dev *sdev = rnbd_sess_dev_alloc(srv_sess);
@@ -571,7 +571,7 @@ rnbd_srv_create_set_sess_dev(struct rnbd_srv_session *srv_sess,
 
 	strscpy(sdev->pathname, open_msg->dev_name, sizeof(sdev->pathname));
 
-	sdev->bdev		= bdev;
+	sdev->bdev_handle	= handle;
 	sdev->sess		= srv_sess;
 	sdev->dev		= srv_dev;
 	sdev->readonly		= readonly;
@@ -676,7 +676,7 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess,
 	struct rnbd_srv_dev *srv_dev;
 	struct rnbd_srv_sess_dev *srv_sess_dev;
 	const struct rnbd_msg_open *open_msg = msg;
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	blk_mode_t open_flags = BLK_OPEN_READ;
 	char *full_path;
 	struct rnbd_msg_open_rsp *rsp = data;
@@ -714,15 +714,15 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess,
 		goto reject;
 	}
 
-	bdev = blkdev_get_by_path(full_path, open_flags, NULL, NULL);
-	if (IS_ERR(bdev)) {
-		ret = PTR_ERR(bdev);
+	bdev_handle = bdev_open_by_path(full_path, open_flags, NULL, NULL);
+	if (IS_ERR(bdev_handle)) {
+		ret = PTR_ERR(bdev_handle);
 		pr_err("Opening device '%s' on session %s failed, failed to open the block device, err: %d\n",
 		       full_path, srv_sess->sessname, ret);
 		goto free_path;
 	}
 
-	srv_dev = rnbd_srv_get_or_create_srv_dev(bdev, srv_sess,
+	srv_dev = rnbd_srv_get_or_create_srv_dev(bdev_handle->bdev, srv_sess,
 						  open_msg->access_mode);
 	if (IS_ERR(srv_dev)) {
 		pr_err("Opening device '%s' on session %s failed, creating srv_dev failed, err: %ld\n",
@@ -731,7 +731,8 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess,
 		goto blkdev_put;
 	}
 
-	srv_sess_dev = rnbd_srv_create_set_sess_dev(srv_sess, open_msg, bdev,
+	srv_sess_dev = rnbd_srv_create_set_sess_dev(srv_sess, open_msg,
+				bdev_handle,
 				open_msg->access_mode == RNBD_ACCESS_RO,
 				srv_dev);
 	if (IS_ERR(srv_sess_dev)) {
@@ -747,7 +748,7 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess,
 	 */
 	mutex_lock(&srv_dev->lock);
 	if (!srv_dev->dev_kobj.state_in_sysfs) {
-		ret = rnbd_srv_create_dev_sysfs(srv_dev, bdev);
+		ret = rnbd_srv_create_dev_sysfs(srv_dev, bdev_handle->bdev);
 		if (ret) {
 			mutex_unlock(&srv_dev->lock);
 			rnbd_srv_err(srv_sess_dev,
@@ -790,7 +791,7 @@ srv_dev_put:
 	}
 	rnbd_put_srv_dev(srv_dev);
 blkdev_put:
-	blkdev_put(bdev, NULL);
+	bdev_release(bdev_handle);
 free_path:
 	kfree(full_path);
 reject:
diff --git a/drivers/block/rnbd/rnbd-srv.h b/drivers/block/rnbd/rnbd-srv.h
index 1027656dedb0..343cc682b617 100644
--- a/drivers/block/rnbd/rnbd-srv.h
+++ b/drivers/block/rnbd/rnbd-srv.h
@@ -46,7 +46,7 @@ struct rnbd_srv_dev {
 struct rnbd_srv_sess_dev {
 	/* Entry inside rnbd_srv_dev struct */
 	struct list_head		dev_list;
-	struct block_device		*bdev;
+	struct bdev_handle		*bdev_handle;
 	struct rnbd_srv_session		*sess;
 	struct rnbd_srv_dev		*dev;
 	struct kobject                  kobj;
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index c362f4ad80ab..4defd7f387c7 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -465,7 +465,7 @@ static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif,
 	}
 
 	req->dev  = vbd->pdevice;
-	req->bdev = vbd->bdev;
+	req->bdev = vbd->bdev_handle->bdev;
 	rc = 0;
 
  out:
@@ -969,7 +969,7 @@ static int dispatch_discard_io(struct xen_blkif_ring *ring,
 	int err = 0;
 	int status = BLKIF_RSP_OKAY;
 	struct xen_blkif *blkif = ring->blkif;
-	struct block_device *bdev = blkif->vbd.bdev;
+	struct block_device *bdev = blkif->vbd.bdev_handle->bdev;
 	struct phys_req preq;
 
 	xen_blkif_get(blkif);
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 40f67bfc052d..5ff50e76cee5 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -221,7 +221,7 @@ struct xen_vbd {
 	unsigned char		type;
 	/* phys device that this vbd maps to. */
 	u32			pdevice;
-	struct block_device	*bdev;
+	struct bdev_handle	*bdev_handle;
 	/* Cached size parameter. */
 	sector_t		size;
 	unsigned int		flush_support:1;
@@ -360,7 +360,7 @@ struct pending_req {
 };
 
 
-#define vbd_sz(_v)	bdev_nr_sectors((_v)->bdev)
+#define vbd_sz(_v)	bdev_nr_sectors((_v)->bdev_handle->bdev)
 
 #define xen_blkif_get(_b) (atomic_inc(&(_b)->refcnt))
 #define xen_blkif_put(_b)				\
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index bb66178c432b..e34219ea2b05 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -81,7 +81,7 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
 	int i;
 
 	/* Not ready to connect? */
-	if (!blkif->rings || !blkif->rings[0].irq || !blkif->vbd.bdev)
+	if (!blkif->rings || !blkif->rings[0].irq || !blkif->vbd.bdev_handle)
 		return;
 
 	/* Already connected? */
@@ -99,12 +99,13 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
 		return;
 	}
 
-	err = sync_blockdev(blkif->vbd.bdev);
+	err = sync_blockdev(blkif->vbd.bdev_handle->bdev);
 	if (err) {
 		xenbus_dev_error(blkif->be->dev, err, "block flush");
 		return;
 	}
-	invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
+	invalidate_inode_pages2(
+			blkif->vbd.bdev_handle->bdev->bd_inode->i_mapping);
 
 	for (i = 0; i < blkif->nr_rings; i++) {
 		ring = &blkif->rings[i];
@@ -472,9 +473,9 @@ static void xenvbd_sysfs_delif(struct xenbus_device *dev)
 
 static void xen_vbd_free(struct xen_vbd *vbd)
 {
-	if (vbd->bdev)
-		blkdev_put(vbd->bdev, NULL);
-	vbd->bdev = NULL;
+	if (vbd->bdev_handle)
+		bdev_release(vbd->bdev_handle);
+	vbd->bdev_handle = NULL;
 }
 
 static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle,
@@ -482,7 +483,7 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle,
 			  int cdrom)
 {
 	struct xen_vbd *vbd;
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 
 	vbd = &blkif->vbd;
 	vbd->handle   = handle;
@@ -491,17 +492,17 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle,
 
 	vbd->pdevice  = MKDEV(major, minor);
 
-	bdev = blkdev_get_by_dev(vbd->pdevice, vbd->readonly ?
+	bdev_handle = bdev_open_by_dev(vbd->pdevice, vbd->readonly ?
 				 BLK_OPEN_READ : BLK_OPEN_WRITE, NULL, NULL);
 
-	if (IS_ERR(bdev)) {
+	if (IS_ERR(bdev_handle)) {
 		pr_warn("xen_vbd_create: device %08x could not be opened\n",
 			vbd->pdevice);
 		return -ENOENT;
 	}
 
-	vbd->bdev = bdev;
-	if (vbd->bdev->bd_disk == NULL) {
+	vbd->bdev_handle = bdev_handle;
+	if (vbd->bdev_handle->bdev->bd_disk == NULL) {
 		pr_warn("xen_vbd_create: device %08x doesn't exist\n",
 			vbd->pdevice);
 		xen_vbd_free(vbd);
@@ -509,14 +510,14 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle,
 	}
 	vbd->size = vbd_sz(vbd);
 
-	if (cdrom || disk_to_cdi(vbd->bdev->bd_disk))
+	if (cdrom || disk_to_cdi(vbd->bdev_handle->bdev->bd_disk))
 		vbd->type |= VDISK_CDROM;
-	if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
+	if (vbd->bdev_handle->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
 		vbd->type |= VDISK_REMOVABLE;
 
-	if (bdev_write_cache(bdev))
+	if (bdev_write_cache(bdev_handle->bdev))
 		vbd->flush_support = true;
-	if (bdev_max_secure_erase_sectors(bdev))
+	if (bdev_max_secure_erase_sectors(bdev_handle->bdev))
 		vbd->discard_secure = true;
 
 	pr_debug("Successful creation of handle=%04x (dom=%u)\n",
@@ -569,7 +570,7 @@ static void xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info
 	struct xen_blkif *blkif = be->blkif;
 	int err;
 	int state = 0;
-	struct block_device *bdev = be->blkif->vbd.bdev;
+	struct block_device *bdev = be->blkif->vbd.bdev_handle->bdev;
 
 	if (!xenbus_read_unsigned(dev->nodename, "discard-enable", 1))
 		return;
@@ -930,15 +931,16 @@ again:
 		goto abort;
 	}
 	err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
-			    (unsigned long)
-			    bdev_logical_block_size(be->blkif->vbd.bdev));
+			    (unsigned long)bdev_logical_block_size(
+					be->blkif->vbd.bdev_handle->bdev));
 	if (err) {
 		xenbus_dev_fatal(dev, err, "writing %s/sector-size",
 				 dev->nodename);
 		goto abort;
 	}
 	err = xenbus_printf(xbt, dev->nodename, "physical-sector-size", "%u",
-			    bdev_physical_block_size(be->blkif->vbd.bdev));
+			    bdev_physical_block_size(
+					be->blkif->vbd.bdev_handle->bdev));
 	if (err)
 		xenbus_dev_error(dev, err, "writing %s/physical-sector-size",
 				 dev->nodename);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 06673c6ca255..d77d3664ca08 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -414,17 +414,14 @@ static ssize_t writeback_limit_show(struct device *dev,
 
 static void reset_bdev(struct zram *zram)
 {
-	struct block_device *bdev;
-
 	if (!zram->backing_dev)
 		return;
 
-	bdev = zram->bdev;
-	blkdev_put(bdev, zram);
+	bdev_release(zram->bdev_handle);
 	/* hope filp_close flush all of IO */
 	filp_close(zram->backing_dev, NULL);
 	zram->backing_dev = NULL;
-	zram->bdev = NULL;
+	zram->bdev_handle = NULL;
 	zram->disk->fops = &zram_devops;
 	kvfree(zram->bitmap);
 	zram->bitmap = NULL;
@@ -470,7 +467,7 @@ static ssize_t backing_dev_store(struct device *dev,
 	struct address_space *mapping;
 	unsigned int bitmap_sz;
 	unsigned long nr_pages, *bitmap = NULL;
-	struct block_device *bdev = NULL;
+	struct bdev_handle *bdev_handle = NULL;
 	int err;
 	struct zram *zram = dev_to_zram(dev);
 
@@ -507,11 +504,11 @@ static ssize_t backing_dev_store(struct device *dev,
 		goto out;
 	}
 
-	bdev = blkdev_get_by_dev(inode->i_rdev, BLK_OPEN_READ | BLK_OPEN_WRITE,
-				 zram, NULL);
-	if (IS_ERR(bdev)) {
-		err = PTR_ERR(bdev);
-		bdev = NULL;
+	bdev_handle = bdev_open_by_dev(inode->i_rdev,
+				BLK_OPEN_READ | BLK_OPEN_WRITE, zram, NULL);
+	if (IS_ERR(bdev_handle)) {
+		err = PTR_ERR(bdev_handle);
+		bdev_handle = NULL;
 		goto out;
 	}
 
@@ -525,7 +522,7 @@ static ssize_t backing_dev_store(struct device *dev,
 
 	reset_bdev(zram);
 
-	zram->bdev = bdev;
+	zram->bdev_handle = bdev_handle;
 	zram->backing_dev = backing_dev;
 	zram->bitmap = bitmap;
 	zram->nr_pages = nr_pages;
@@ -538,8 +535,8 @@ static ssize_t backing_dev_store(struct device *dev,
 out:
 	kvfree(bitmap);
 
-	if (bdev)
-		blkdev_put(bdev, zram);
+	if (bdev_handle)
+		bdev_release(bdev_handle);
 
 	if (backing_dev)
 		filp_close(backing_dev, NULL);
@@ -581,7 +578,7 @@ static void read_from_bdev_async(struct zram *zram, struct page *page,
 {
 	struct bio *bio;
 
-	bio = bio_alloc(zram->bdev, 1, parent->bi_opf, GFP_NOIO);
+	bio = bio_alloc(zram->bdev_handle->bdev, 1, parent->bi_opf, GFP_NOIO);
 	bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
 	__bio_add_page(bio, page, PAGE_SIZE, 0);
 	bio_chain(bio, parent);
@@ -697,7 +694,7 @@ static ssize_t writeback_store(struct device *dev,
 			continue;
 		}
 
-		bio_init(&bio, zram->bdev, &bio_vec, 1,
+		bio_init(&bio, zram->bdev_handle->bdev, &bio_vec, 1,
 			 REQ_OP_WRITE | REQ_SYNC);
 		bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
 		__bio_add_page(&bio, page, PAGE_SIZE, 0);
@@ -779,7 +776,7 @@ static void zram_sync_read(struct work_struct *work)
 	struct bio_vec bv;
 	struct bio bio;
 
-	bio_init(&bio, zw->zram->bdev, &bv, 1, REQ_OP_READ);
+	bio_init(&bio, zw->zram->bdev_handle->bdev, &bv, 1, REQ_OP_READ);
 	bio.bi_iter.bi_sector = zw->entry * (PAGE_SIZE >> 9);
 	__bio_add_page(&bio, zw->page, PAGE_SIZE, 0);
 	zw->error = submit_bio_wait(&bio);
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index ca7a15bd4845..d090753f97be 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -132,7 +132,7 @@ struct zram {
 	spinlock_t wb_limit_lock;
 	bool wb_limit_enable;
 	u64 bd_wb_limit;
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	unsigned long *bitmap;
 	unsigned long nr_pages;
 #endif
diff --git a/drivers/bus/fsl-mc/dprc.c b/drivers/bus/fsl-mc/dprc.c
index d129338b8bc0..dd1b5c0fb7e2 100644
--- a/drivers/bus/fsl-mc/dprc.c
+++ b/drivers/bus/fsl-mc/dprc.c
@@ -450,10 +450,8 @@ int dprc_get_obj(struct fsl_mc_io *mc_io,
 	obj_desc->ver_major = le16_to_cpu(rsp_params->version_major);
 	obj_desc->ver_minor = le16_to_cpu(rsp_params->version_minor);
 	obj_desc->flags = le16_to_cpu(rsp_params->flags);
-	strncpy(obj_desc->type, rsp_params->type, 16);
-	obj_desc->type[15] = '\0';
-	strncpy(obj_desc->label, rsp_params->label, 16);
-	obj_desc->label[15] = '\0';
+	strscpy_pad(obj_desc->type, rsp_params->type, 16);
+	strscpy_pad(obj_desc->label, rsp_params->label, 16);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(dprc_get_obj);
@@ -491,8 +489,7 @@ int dprc_set_obj_irq(struct fsl_mc_io *mc_io,
 	cmd_params->irq_addr = cpu_to_le64(irq_cfg->paddr);
 	cmd_params->irq_num = cpu_to_le32(irq_cfg->irq_num);
 	cmd_params->obj_id = cpu_to_le32(obj_id);
-	strncpy(cmd_params->obj_type, obj_type, 16);
-	cmd_params->obj_type[15] = '\0';
+	strscpy_pad(cmd_params->obj_type, obj_type, 16);
 
 	/* send command to mc*/
 	return mc_send_command(mc_io, &cmd);
@@ -564,8 +561,7 @@ int dprc_get_obj_region(struct fsl_mc_io *mc_io,
 	cmd_params = (struct dprc_cmd_get_obj_region *)cmd.params;
 	cmd_params->obj_id = cpu_to_le32(obj_id);
 	cmd_params->region_index = region_index;
-	strncpy(cmd_params->obj_type, obj_type, 16);
-	cmd_params->obj_type[15] = '\0';
+	strscpy_pad(cmd_params->obj_type, obj_type, 16);
 
 	/* send command to mc*/
 	err = mc_send_command(mc_io, &cmd);
diff --git a/drivers/cache/Kconfig b/drivers/cache/Kconfig
index a57677f908f3..d6e5e3abaad8 100644
--- a/drivers/cache/Kconfig
+++ b/drivers/cache/Kconfig
@@ -3,7 +3,7 @@ menu "Cache Drivers"
 
 config AX45MP_L2_CACHE
 	bool "Andes Technology AX45MP L2 Cache controller"
-	depends on RISCV_DMA_NONCOHERENT
+	depends on RISCV
 	select RISCV_NONSTANDARD_CACHE_OPS
 	help
 	  Support for the L2 cache controller on Andes Technology AX45MP platforms.
diff --git a/drivers/char/sonypi.c b/drivers/char/sonypi.c
index 9211531689b2..22d249333f53 100644
--- a/drivers/char/sonypi.c
+++ b/drivers/char/sonypi.c
@@ -920,7 +920,7 @@ static ssize_t sonypi_misc_read(struct file *file, char __user *buf,
 
 	if (ret > 0) {
 		struct inode *inode = file_inode(file);
-		inode->i_atime = current_time(inode);
+		inode_set_atime_to_ts(inode, current_time(inode));
 	}
 
 	return ret;
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index 680d1ef2a217..431e9e5bf9c1 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -106,7 +106,7 @@ struct port_buffer {
 	unsigned int sgpages;
 
 	/* sg is used if spages > 0. sg must be the last in is struct */
-	struct scatterlist sg[];
+	struct scatterlist sg[] __counted_by(sgpages);
 };
 
 /*
diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index c249f9791ae8..473563bc7496 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -3416,6 +3416,7 @@ static void possible_parent_show(struct seq_file *s, struct clk_core *core,
 				 unsigned int i, char terminator)
 {
 	struct clk_core *parent;
+	const char *name = NULL;
 
 	/*
 	 * Go through the following options to fetch a parent's name.
@@ -3430,18 +3431,20 @@ static void possible_parent_show(struct seq_file *s, struct clk_core *core,
 	 * registered (yet).
 	 */
 	parent = clk_core_get_parent_by_index(core, i);
-	if (parent)
+	if (parent) {
 		seq_puts(s, parent->name);
-	else if (core->parents[i].name)
+	} else if (core->parents[i].name) {
 		seq_puts(s, core->parents[i].name);
-	else if (core->parents[i].fw_name)
+	} else if (core->parents[i].fw_name) {
 		seq_printf(s, "<%s>(fw)", core->parents[i].fw_name);
-	else if (core->parents[i].index >= 0)
-		seq_puts(s,
-			 of_clk_get_parent_name(core->of_node,
-						core->parents[i].index));
-	else
-		seq_puts(s, "(missing)");
+	} else {
+		if (core->parents[i].index >= 0)
+			name = of_clk_get_parent_name(core->of_node, core->parents[i].index);
+		if (!name)
+			name = "(missing)";
+
+		seq_puts(s, name);
+	}
 
 	seq_putc(s, terminator);
 }
diff --git a/drivers/clk/socfpga/clk-gate.c b/drivers/clk/socfpga/clk-gate.c
index 8dd601bd8538..0a5a95e0267f 100644
--- a/drivers/clk/socfpga/clk-gate.c
+++ b/drivers/clk/socfpga/clk-gate.c
@@ -87,10 +87,8 @@ static int socfpga_clk_set_parent(struct clk_hw *hwclk, u8 parent)
 	return 0;
 }
 
-static unsigned long socfpga_clk_recalc_rate(struct clk_hw *hwclk,
-	unsigned long parent_rate)
+static u32 socfpga_clk_get_div(struct socfpga_gate_clk *socfpgaclk)
 {
-	struct socfpga_gate_clk *socfpgaclk = to_socfpga_gate_clk(hwclk);
 	u32 div = 1, val;
 
 	if (socfpgaclk->fixed_div)
@@ -105,12 +103,33 @@ static unsigned long socfpga_clk_recalc_rate(struct clk_hw *hwclk,
 			div = (1 << val);
 	}
 
+	return div;
+}
+
+/* Report the gate's output rate: the parent rate over the current divider. */
+static unsigned long socfpga_clk_recalc_rate(struct clk_hw *hwclk,
+					     unsigned long parent_rate)
+{
+	u32 divider = socfpga_clk_get_div(to_socfpga_gate_clk(hwclk));
+
+	return parent_rate / divider;
+}
 
+
+/* Set req->rate to best_parent_rate over the current divider; returns 0. */
+static int socfpga_clk_determine_rate(struct clk_hw *hwclk,
+				      struct clk_rate_request *req)
+{
+	u32 divider = socfpga_clk_get_div(to_socfpga_gate_clk(hwclk));
+
+	req->rate = req->best_parent_rate / divider;
+
+	return 0;
+}
+
 static struct clk_ops gateclk_ops = {
 	.recalc_rate = socfpga_clk_recalc_rate,
-	.determine_rate = clk_hw_determine_rate_no_reparent,
+	.determine_rate = socfpga_clk_determine_rate,
 	.get_parent = socfpga_clk_get_parent,
 	.set_parent = socfpga_clk_set_parent,
 };
diff --git a/drivers/clk/stm32/clk-stm32-core.c b/drivers/clk/stm32/clk-stm32-core.c
index d5aa09e9fce4..067b918a8894 100644
--- a/drivers/clk/stm32/clk-stm32-core.c
+++ b/drivers/clk/stm32/clk-stm32-core.c
@@ -431,7 +431,7 @@ static int clk_stm32_composite_determine_rate(struct clk_hw *hw,
 {
 	struct clk_stm32_composite *composite = to_clk_stm32_composite(hw);
 	const struct stm32_div_cfg *divider;
-	unsigned long rate;
+	long rate;
 
 	if (composite->div_id == NO_STM32_DIV)
 		return 0;
diff --git a/drivers/clk/ti/clk-44xx.c b/drivers/clk/ti/clk-44xx.c
index 868bc7af21b0..9b2824ed785b 100644
--- a/drivers/clk/ti/clk-44xx.c
+++ b/drivers/clk/ti/clk-44xx.c
@@ -749,9 +749,14 @@ static struct ti_dt_clk omap44xx_clks[] = {
 	DT_CLK(NULL, "mcbsp1_sync_mux_ck", "abe-clkctrl:0028:26"),
 	DT_CLK(NULL, "mcbsp2_sync_mux_ck", "abe-clkctrl:0030:26"),
 	DT_CLK(NULL, "mcbsp3_sync_mux_ck", "abe-clkctrl:0038:26"),
+	DT_CLK("40122000.mcbsp", "prcm_fck", "abe-clkctrl:0028:26"),
+	DT_CLK("40124000.mcbsp", "prcm_fck", "abe-clkctrl:0030:26"),
+	DT_CLK("40126000.mcbsp", "prcm_fck", "abe-clkctrl:0038:26"),
 	DT_CLK(NULL, "mcbsp4_sync_mux_ck", "l4-per-clkctrl:00c0:26"),
+	DT_CLK("48096000.mcbsp", "prcm_fck", "l4-per-clkctrl:00c0:26"),
 	DT_CLK(NULL, "ocp2scp_usb_phy_phy_48m", "l3-init-clkctrl:00c0:8"),
 	DT_CLK(NULL, "otg_60m_gfclk", "l3-init-clkctrl:0040:24"),
+	DT_CLK(NULL, "pad_fck", "pad_clks_ck"),
 	DT_CLK(NULL, "per_mcbsp4_gfclk", "l4-per-clkctrl:00c0:24"),
 	DT_CLK(NULL, "pmd_stm_clock_mux_ck", "emu-sys-clkctrl:0000:20"),
 	DT_CLK(NULL, "pmd_trace_clk_mux_ck", "emu-sys-clkctrl:0000:22"),
diff --git a/drivers/clk/ti/clk-54xx.c b/drivers/clk/ti/clk-54xx.c
index b4aff76eb373..74dfd5823f83 100644
--- a/drivers/clk/ti/clk-54xx.c
+++ b/drivers/clk/ti/clk-54xx.c
@@ -565,15 +565,19 @@ static struct ti_dt_clk omap54xx_clks[] = {
 	DT_CLK(NULL, "gpio8_dbclk", "l4per-clkctrl:00f8:8"),
 	DT_CLK(NULL, "mcbsp1_gfclk", "abe-clkctrl:0028:24"),
 	DT_CLK(NULL, "mcbsp1_sync_mux_ck", "abe-clkctrl:0028:26"),
+	DT_CLK("40122000.mcbsp", "prcm_fck", "abe-clkctrl:0028:26"),
 	DT_CLK(NULL, "mcbsp2_gfclk", "abe-clkctrl:0030:24"),
 	DT_CLK(NULL, "mcbsp2_sync_mux_ck", "abe-clkctrl:0030:26"),
+	DT_CLK("40124000.mcbsp", "prcm_fck", "abe-clkctrl:0030:26"),
 	DT_CLK(NULL, "mcbsp3_gfclk", "abe-clkctrl:0038:24"),
 	DT_CLK(NULL, "mcbsp3_sync_mux_ck", "abe-clkctrl:0038:26"),
+	DT_CLK("40126000.mcbsp", "prcm_fck", "abe-clkctrl:0038:26"),
 	DT_CLK(NULL, "mmc1_32khz_clk", "l3init-clkctrl:0008:8"),
 	DT_CLK(NULL, "mmc1_fclk", "l3init-clkctrl:0008:25"),
 	DT_CLK(NULL, "mmc1_fclk_mux", "l3init-clkctrl:0008:24"),
 	DT_CLK(NULL, "mmc2_fclk", "l3init-clkctrl:0010:25"),
 	DT_CLK(NULL, "mmc2_fclk_mux", "l3init-clkctrl:0010:24"),
+	DT_CLK(NULL, "pad_fck", "pad_clks_ck"),
 	DT_CLK(NULL, "sata_ref_clk", "l3init-clkctrl:0068:8"),
 	DT_CLK(NULL, "timer10_gfclk_mux", "l4per-clkctrl:0008:24"),
 	DT_CLK(NULL, "timer11_gfclk_mux", "l4per-clkctrl:0010:24"),
diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index 0ba0dc4ecf06..34faa0320ece 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -732,4 +732,15 @@ config GOLDFISH_TIMER
 	help
 	  Support for the timer/counter of goldfish-rtc
 
+config EP93XX_TIMER
+	bool "Cirrus Logic ep93xx timer driver" if COMPILE_TEST
+	depends on ARCH_EP93XX
+	depends on GENERIC_CLOCKEVENTS
+	depends on HAS_IOMEM
+	select CLKSRC_MMIO
+	select TIMER_OF
+	help
+	  Enables support for the timer block found on
+	  Cirrus Logic EP93XX SoCs.
+
 endmenu
diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile
index 368c3461dab8..4bb856e4df55 100644
--- a/drivers/clocksource/Makefile
+++ b/drivers/clocksource/Makefile
@@ -89,3 +89,4 @@ obj-$(CONFIG_MSC313E_TIMER)		+= timer-msc313e.o
 obj-$(CONFIG_GOLDFISH_TIMER)		+= timer-goldfish.o
 obj-$(CONFIG_GXP_TIMER)			+= timer-gxp.o
 obj-$(CONFIG_CLKSRC_LOONGSON1_PWM)	+= timer-loongson1-pwm.o
+obj-$(CONFIG_EP93XX_TIMER)		+= timer-ep93xx.o
diff --git a/drivers/clocksource/timer-atmel-tcb.c b/drivers/clocksource/timer-atmel-tcb.c
index 27af17c99590..2a90c92a9182 100644
--- a/drivers/clocksource/timer-atmel-tcb.c
+++ b/drivers/clocksource/timer-atmel-tcb.c
@@ -315,6 +315,7 @@ static void __init tcb_setup_dual_chan(struct atmel_tc *tc, int mck_divisor_idx)
 	writel(mck_divisor_idx			/* likely divide-by-8 */
 			| ATMEL_TC_WAVE
 			| ATMEL_TC_WAVESEL_UP		/* free-run */
+			| ATMEL_TC_ASWTRG_SET		/* TIOA0 rises at software trigger */
 			| ATMEL_TC_ACPA_SET		/* TIOA0 rises at 0 */
 			| ATMEL_TC_ACPC_CLEAR,		/* (duty cycle 50%) */
 			tcaddr + ATMEL_TC_REG(0, CMR));
diff --git a/drivers/clocksource/timer-ep93xx.c b/drivers/clocksource/timer-ep93xx.c
new file mode 100644
index 000000000000..bc0ca6e12334
--- /dev/null
+++ b/drivers/clocksource/timer-ep93xx.c
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Cirrus Logic EP93xx timer driver.
+ * Copyright (C) 2021 Nikita Shubin <nikita.shubin@maquefel.me>
+ *
+ * Based on a rewrite of arch/arm/mach-ep93xx/timer.c:
+ */
+
+#include <linux/clockchips.h>
+#include <linux/clocksource.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/sched_clock.h>
+
+#include <asm/mach/time.h>
+
+/*************************************************************************
+ * Timer handling for EP93xx
+ *************************************************************************
+ * The ep93xx has four internal timers.  Timers 1, 2 (both 16 bit) and
+ * 3 (32 bit) count down at 508 kHz, are self-reloading, and can generate
+ * an interrupt on underflow.  Timer 4 (40 bit) counts down at 983.04 kHz,
+ * is free-running, and can't generate interrupts.
+ *
+ * The 508 kHz timers are ideal for use for the timer interrupt, as the
+ * most common values of HZ divide 508 kHz nicely.  We pick the 32 bit
+ * timer (timer 3) to get as long sleep intervals as possible when using
+ * CONFIG_NO_HZ.
+ *
+ * The higher clock rate of timer 4 makes it a better choice than the
+ * other timers for use as clock source and for sched_clock(), providing
+ * a stable 40 bit time base.
+ *************************************************************************
+ */
+
+#define EP93XX_TIMER1_LOAD		0x00
+#define EP93XX_TIMER1_VALUE		0x04
+#define EP93XX_TIMER1_CONTROL		0x08
+#define EP93XX_TIMER123_CONTROL_ENABLE	BIT(7)
+#define EP93XX_TIMER123_CONTROL_MODE	BIT(6)
+#define EP93XX_TIMER123_CONTROL_CLKSEL	BIT(3)
+#define EP93XX_TIMER1_CLEAR		0x0c
+#define EP93XX_TIMER2_LOAD		0x20
+#define EP93XX_TIMER2_VALUE		0x24
+#define EP93XX_TIMER2_CONTROL		0x28
+#define EP93XX_TIMER2_CLEAR		0x2c
+/*
+ * This read-only register contains the low word of the time stamp debug timer
+ * (Timer4). When this register is read, the high byte of the Timer4 counter is
+ * saved in the Timer4ValueHigh register.
+ */
+#define EP93XX_TIMER4_VALUE_LOW		0x60
+#define EP93XX_TIMER4_VALUE_HIGH	0x64
+#define EP93XX_TIMER4_VALUE_HIGH_ENABLE	BIT(8)
+#define EP93XX_TIMER3_LOAD		0x80
+#define EP93XX_TIMER3_VALUE		0x84
+#define EP93XX_TIMER3_CONTROL		0x88
+#define EP93XX_TIMER3_CLEAR		0x8c
+
+#define EP93XX_TIMER123_RATE		508469
+#define EP93XX_TIMER4_RATE		983040
+
+struct ep93xx_tcu {
+	void __iomem *base;
+};
+
+static struct ep93xx_tcu *ep93xx_tcu;
+
+/* Read the 40-bit timer 4 count; notrace since the sched_clock hook calls it. */
+static u64 notrace ep93xx_clocksource_read(struct clocksource *c)
+{
+	return lo_hi_readq(ep93xx_tcu->base + EP93XX_TIMER4_VALUE_LOW) & GENMASK_ULL(39, 0);
+}
+
+/* sched_clock hook: returns the same counter; no clocksource is needed. */
+static u64 notrace ep93xx_read_sched_clock(void)
+{
+	return ep93xx_clocksource_read(NULL);
+}
+
+/* Program timer 3 with a new count of @next ticks; always returns 0. */
+static int ep93xx_clkevt_set_next_event(unsigned long next,
+					struct clock_event_device *evt)
+{
+	void __iomem *base = ep93xx_tcu->base;
+	/* Default mode: periodic, off, 508 kHz */
+	const u32 mode = EP93XX_TIMER123_CONTROL_MODE | EP93XX_TIMER123_CONTROL_CLKSEL;
+
+	/* Rewrite the control register without the enable bit first */
+	writel(mode, base + EP93XX_TIMER3_CONTROL);
+
+	/* Load the new count, then set the enable bit again */
+	writel(next, base + EP93XX_TIMER3_LOAD);
+	writel(mode | EP93XX_TIMER123_CONTROL_ENABLE, base + EP93XX_TIMER3_CONTROL);
+
+	return 0;
+}
+
+/* Disable timer 3 by zeroing its control register; always returns 0. */
+static int ep93xx_clkevt_shutdown(struct clock_event_device *evt)
+{
+	void __iomem *ctrl = ep93xx_tcu->base + EP93XX_TIMER3_CONTROL;
+
+	writel(0, ctrl);
+	return 0;
+}
+
+static struct clock_event_device ep93xx_clockevent = {
+	.name			= "timer1",
+	.features		= CLOCK_EVT_FEAT_ONESHOT,
+	.set_state_shutdown	= ep93xx_clkevt_shutdown,
+	.set_state_oneshot	= ep93xx_clkevt_shutdown,
+	.tick_resume		= ep93xx_clkevt_shutdown,
+	.set_next_event		= ep93xx_clkevt_set_next_event,
+	.rating			= 300,
+};
+
+/* Timer 3 IRQ: acknowledge the interrupt, then run the clockevent handler. */
+static irqreturn_t ep93xx_timer_interrupt(int irq, void *dev_id)
+{
+	struct clock_event_device *clkevt = dev_id;
+	void __iomem *clear_reg = ep93xx_tcu->base + EP93XX_TIMER3_CLEAR;
+
+	/* Writing any value clears the timer interrupt */
+	writel(1, clear_reg);
+	clkevt->event_handler(clkevt);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Map the EP93xx timer block from @np; timer 4 becomes the clocksource and
+ * sched_clock, timer 3 the one-shot clockevent.  Returns 0 or a negative errno.
+ */
+static int __init ep93xx_timer_of_init(struct device_node *np)
+{
+	unsigned long flags = IRQF_TIMER | IRQF_IRQPOLL;
+	struct ep93xx_tcu *tcu;
+	int irq, ret;
+
+	tcu = kzalloc(sizeof(*tcu), GFP_KERNEL);
+	if (!tcu)
+		return -ENOMEM;
+
+	tcu->base = of_iomap(np, 0);
+	if (!tcu->base) {
+		pr_err("Can't remap registers\n");
+		ret = -ENXIO;
+		goto out_free;
+	}
+
+	/* irq_of_parse_and_map() returns 0 when no interrupt could be mapped */
+	irq = irq_of_parse_and_map(np, 0);
+	if (irq <= 0) {
+		ret = -EINVAL;
+		pr_err("EP93XX Timer Can't parse IRQ %d\n", ret);
+		goto out_unmap;
+	}
+
+	/* Publish only now: no path below frees tcu, so the global never dangles */
+	ep93xx_tcu = tcu;
+
+	/* Enable and register clocksource and sched_clock on timer 4 */
+	writel(EP93XX_TIMER4_VALUE_HIGH_ENABLE, tcu->base + EP93XX_TIMER4_VALUE_HIGH);
+	clocksource_mmio_init(NULL, "timer4", EP93XX_TIMER4_RATE, 200, 40,
+			      ep93xx_clocksource_read);
+	sched_clock_register(ep93xx_read_sched_clock, 40, EP93XX_TIMER4_RATE);
+
+	/* Set up clockevent on timer 3; a request_irq() failure is only logged */
+	if (request_irq(irq, ep93xx_timer_interrupt, flags, "ep93xx timer",
+			&ep93xx_clockevent))
+		pr_err("Failed to request irq %d (ep93xx timer)\n", irq);
+
+	clockevents_config_and_register(&ep93xx_clockevent, EP93XX_TIMER123_RATE, 1, UINT_MAX);
+
+	return 0;
+
+out_unmap:
+	iounmap(tcu->base);
+out_free:
+	kfree(tcu);
+	return ret;
+}
+TIMER_OF_DECLARE(ep93xx_timer, "cirrus,ep9301-timer", ep93xx_timer_of_init);
diff --git a/drivers/clocksource/timer-imx-gpt.c b/drivers/clocksource/timer-imx-gpt.c
index 28ab4f1a7c71..6a878d227a13 100644
--- a/drivers/clocksource/timer-imx-gpt.c
+++ b/drivers/clocksource/timer-imx-gpt.c
@@ -434,12 +434,16 @@ static int __init mxc_timer_init_dt(struct device_node *np,  enum imx_gpt_type t
 		return -ENOMEM;
 
 	imxtm->base = of_iomap(np, 0);
-	if (!imxtm->base)
-		return -ENXIO;
+	if (!imxtm->base) {
+		ret = -ENXIO;
+		goto err_kfree;
+	}
 
 	imxtm->irq = irq_of_parse_and_map(np, 0);
-	if (imxtm->irq <= 0)
-		return -EINVAL;
+	if (imxtm->irq <= 0) {
+		ret = -EINVAL;
+		goto err_kfree;
+	}
 
 	imxtm->clk_ipg = of_clk_get_by_name(np, "ipg");
 
@@ -452,11 +456,15 @@ static int __init mxc_timer_init_dt(struct device_node *np,  enum imx_gpt_type t
 
 	ret = _mxc_timer_init(imxtm);
 	if (ret)
-		return ret;
+		goto err_kfree;
 
 	initialized = 1;
 
 	return 0;
+
+err_kfree:
+	kfree(imxtm);
+	return ret;
 }
 
 static int __init imx1_timer_init_dt(struct device_node *np)
diff --git a/drivers/clocksource/timer-riscv.c b/drivers/clocksource/timer-riscv.c
index da3071b387eb..50198657230e 100644
--- a/drivers/clocksource/timer-riscv.c
+++ b/drivers/clocksource/timer-riscv.c
@@ -212,6 +212,10 @@ TIMER_OF_DECLARE(riscv_timer, "riscv", riscv_timer_init_dt);
 #ifdef CONFIG_ACPI
 static int __init riscv_timer_acpi_init(struct acpi_table_header *table)
 {
+	struct acpi_table_rhct *rhct = (struct acpi_table_rhct *)table;
+
+	riscv_timer_cannot_wake_cpu = rhct->flags & ACPI_RHCT_TIMER_CANNOT_WAKEUP_CPU;
+
 	return riscv_timer_init_common();
 }
 
diff --git a/drivers/clocksource/timer-sun5i.c b/drivers/clocksource/timer-sun5i.c
index 69fee3540d37..0d229a9058da 100644
--- a/drivers/clocksource/timer-sun5i.c
+++ b/drivers/clocksource/timer-sun5i.c
@@ -256,10 +256,8 @@ static int sun5i_timer_probe(struct platform_device *pdev)
 	}
 
 	irq = platform_get_irq(pdev, 0);
-	if (irq < 0) {
-		dev_err(dev, "Can't get IRQ\n");
+	if (irq < 0)
 		return irq;
-	}
 
 	clk = devm_clk_get_enabled(dev, NULL);
 	if (IS_ERR(clk)) {
diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c
index 09ab29cb7f64..5f60f6bd3386 100644
--- a/drivers/clocksource/timer-ti-dm.c
+++ b/drivers/clocksource/timer-ti-dm.c
@@ -140,6 +140,8 @@ struct dmtimer {
 	struct platform_device *pdev;
 	struct list_head node;
 	struct notifier_block nb;
+	struct notifier_block fclk_nb;
+	unsigned long fclk_rate;
 };
 
 static u32 omap_reserved_systimers;
@@ -253,8 +255,7 @@ static inline void __omap_dm_timer_enable_posted(struct dmtimer *timer)
 	timer->posted = OMAP_TIMER_POSTED;
 }
 
-static inline void __omap_dm_timer_stop(struct dmtimer *timer,
-					unsigned long rate)
+static inline void __omap_dm_timer_stop(struct dmtimer *timer)
 {
 	u32 l;
 
@@ -269,7 +270,7 @@ static inline void __omap_dm_timer_stop(struct dmtimer *timer,
 		 * Wait for functional clock period x 3.5 to make sure that
 		 * timer is stopped
 		 */
-		udelay(3500000 / rate + 1);
+		udelay(3500000 / timer->fclk_rate + 1);
 #endif
 	}
 
@@ -348,6 +349,21 @@ static int omap_timer_context_notifier(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
+static int omap_timer_fclk_notifier(struct notifier_block *nb,
+				    unsigned long event, void *data)
+{
+	struct clk_notifier_data *clk_data = data;
+	struct dmtimer *timer = container_of(nb, struct dmtimer, fclk_nb);
+
+	switch (event) {
+	case POST_RATE_CHANGE:
+		timer->fclk_rate = clk_data->new_rate;
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
 static int omap_dm_timer_reset(struct dmtimer *timer)
 {
 	u32 l, timeout = 100000;
@@ -754,7 +770,6 @@ static int omap_dm_timer_stop(struct omap_dm_timer *cookie)
 {
 	struct dmtimer *timer;
 	struct device *dev;
-	unsigned long rate = 0;
 
 	timer = to_dmtimer(cookie);
 	if (unlikely(!timer))
@@ -762,10 +777,7 @@ static int omap_dm_timer_stop(struct omap_dm_timer *cookie)
 
 	dev = &timer->pdev->dev;
 
-	if (!timer->omap1)
-		rate = clk_get_rate(timer->fclk);
-
-	__omap_dm_timer_stop(timer, rate);
+	__omap_dm_timer_stop(timer);
 
 	pm_runtime_put_sync(dev);
 
@@ -1124,6 +1136,14 @@ static int omap_dm_timer_probe(struct platform_device *pdev)
 		timer->fclk = devm_clk_get(dev, "fck");
 		if (IS_ERR(timer->fclk))
 			return PTR_ERR(timer->fclk);
+
+		timer->fclk_nb.notifier_call = omap_timer_fclk_notifier;
+		ret = devm_clk_notifier_register(dev, timer->fclk,
+						 &timer->fclk_nb);
+		if (ret)
+			return ret;
+
+		timer->fclk_rate = clk_get_rate(timer->fclk);
 	} else {
 		timer->fclk = ERR_PTR(-ENODEV);
 	}
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 60ed89000e82..15c440e5c773 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1650,7 +1650,7 @@ static void __cpufreq_offline(unsigned int cpu, struct cpufreq_policy *policy)
 	}
 
 	if (has_target())
-		strncpy(policy->last_governor, policy->governor->name,
+		strscpy(policy->last_governor, policy->governor->name,
 			CPUFREQ_NAME_LEN);
 	else
 		policy->last_policy = policy->policy;
@@ -2996,7 +2996,7 @@ static int __init cpufreq_core_init(void)
 	BUG_ON(!cpufreq_global_kobject);
 
 	if (!strlen(default_governor))
-		strncpy(default_governor, gov->name, CPUFREQ_NAME_LEN);
+		strscpy(default_governor, gov->name, CPUFREQ_NAME_LEN);
 
 	return 0;
 }
diff --git a/drivers/cpuidle/dt_idle_states.c b/drivers/cpuidle/dt_idle_states.c
index 12fec92a85fd..97feb7d8fb23 100644
--- a/drivers/cpuidle/dt_idle_states.c
+++ b/drivers/cpuidle/dt_idle_states.c
@@ -84,8 +84,8 @@ static int init_state_node(struct cpuidle_state *idle_state,
 	 *	replace with kstrdup and pointer assignment when name
 	 *	and desc become string pointers
 	 */
-	strncpy(idle_state->name, state_node->name, CPUIDLE_NAME_LEN - 1);
-	strncpy(idle_state->desc, desc, CPUIDLE_DESC_LEN - 1);
+	strscpy(idle_state->name, state_node->name, CPUIDLE_NAME_LEN);
+	strscpy(idle_state->desc, desc, CPUIDLE_DESC_LEN);
 	return 0;
 }
 
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 110e99b86a66..5a7f3fabee22 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -561,4 +561,16 @@ config EDAC_NPCM
 	  error detection (in-line ECC in which a section 1/8th of the memory
 	  device used to store data is used for ECC storage).
 
+config EDAC_VERSAL
+	tristate "Xilinx Versal DDR Memory Controller"
+	depends on ARCH_ZYNQMP || COMPILE_TEST
+	help
+	  Support for error detection and correction on the Xilinx Versal DDR
+	  memory controller.
+
+	  Report both single bit errors (CE) and double bit errors (UE).
+	  Support injecting both correctable and uncorrectable errors
+	  for debugging purposes.
+
+
 endif # EDAC
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index 61945d3113cc..9c09893695b7 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -86,3 +86,4 @@ obj-$(CONFIG_EDAC_BLUEFIELD)		+= bluefield_edac.o
 obj-$(CONFIG_EDAC_DMC520)		+= dmc520_edac.o
 obj-$(CONFIG_EDAC_NPCM)			+= npcm_edac.o
 obj-$(CONFIG_EDAC_ZYNQMP)		+= zynqmp_edac.o
+obj-$(CONFIG_EDAC_VERSAL)		+= versal_edac.o
diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
index 15f63452a9be..5116873c3330 100644
--- a/drivers/edac/edac_mc_sysfs.c
+++ b/drivers/edac/edac_mc_sysfs.c
@@ -229,7 +229,7 @@ static ssize_t channel_dimm_label_store(struct device *dev,
 	if (copy_count == 0 || copy_count >= sizeof(rank->dimm->label))
 		return -EINVAL;
 
-	strncpy(rank->dimm->label, data, copy_count);
+	memcpy(rank->dimm->label, data, copy_count);
 	rank->dimm->label[copy_count] = '\0';
 
 	return count;
@@ -535,7 +535,7 @@ static ssize_t dimmdev_label_store(struct device *dev,
 	if (copy_count == 0 || copy_count >= sizeof(dimm->label))
 		return -EINVAL;
 
-	strncpy(dimm->label, data, copy_count);
+	memcpy(dimm->label, data, copy_count);
 	dimm->label[copy_count] = '\0';
 
 	return count;
diff --git a/drivers/edac/versal_edac.c b/drivers/edac/versal_edac.c
new file mode 100644
index 000000000000..87e730dfefa0
--- /dev/null
+++ b/drivers/edac/versal_edac.c
@@ -0,0 +1,1069 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Xilinx Versal memory controller driver
+ * Copyright (C) 2023 Advanced Micro Devices, Inc.
+ */
+#include <linux/bitfield.h>
+#include <linux/edac.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/sizes.h>
+#include <linux/firmware/xlnx-zynqmp.h>
+#include <linux/firmware/xlnx-event-manager.h>
+
+#include "edac_module.h"
+
+/* Granularity of reported error in bytes */
+#define XDDR_EDAC_ERR_GRAIN			1
+
+#define XDDR_EDAC_MSG_SIZE			256
+#define EVENT					2
+
+#define XDDR_PCSR_OFFSET			0xC
+#define XDDR_ISR_OFFSET				0x14
+#define XDDR_IRQ_EN_OFFSET			0x20
+#define XDDR_IRQ1_EN_OFFSET			0x2C
+#define XDDR_IRQ_DIS_OFFSET			0x24
+#define XDDR_IRQ_CE_MASK			GENMASK(18, 15)
+#define XDDR_IRQ_UE_MASK			GENMASK(14, 11)
+
+#define XDDR_REG_CONFIG0_OFFSET			0x258
+#define XDDR_REG_CONFIG0_BUS_WIDTH_MASK		GENMASK(19, 18)
+#define XDDR_REG_CONFIG0_NUM_CHANS_MASK		BIT(17)
+#define XDDR_REG_CONFIG0_NUM_RANKS_MASK		GENMASK(15, 14)
+#define XDDR_REG_CONFIG0_SIZE_MASK		GENMASK(10, 8)
+
+#define XDDR_REG_PINOUT_OFFSET			0x25C
+#define XDDR_REG_PINOUT_ECC_EN_MASK		GENMASK(7, 5)
+
+#define ECCW0_FLIP_CTRL				0x109C
+#define ECCW0_FLIP0_OFFSET			0x10A0
+#define ECCW1_FLIP_CTRL				0x10AC
+#define ECCW1_FLIP0_OFFSET			0x10B0
+#define ECCR0_CERR_STAT_OFFSET			0x10BC
+#define ECCR0_CE_ADDR_LO_OFFSET			0x10C0
+#define ECCR0_CE_ADDR_HI_OFFSET			0x10C4
+#define ECCR0_CE_DATA_LO_OFFSET			0x10C8
+#define ECCR0_CE_DATA_HI_OFFSET			0x10CC
+#define ECCR0_CE_DATA_PAR_OFFSET		0x10D0
+
+#define ECCR0_UERR_STAT_OFFSET			0x10D4
+#define ECCR0_UE_ADDR_LO_OFFSET			0x10D8
+#define ECCR0_UE_ADDR_HI_OFFSET			0x10DC
+#define ECCR0_UE_DATA_LO_OFFSET			0x10E0
+#define ECCR0_UE_DATA_HI_OFFSET			0x10E4
+#define ECCR0_UE_DATA_PAR_OFFSET		0x10E8
+
+#define ECCR1_CERR_STAT_OFFSET			0x10F4
+#define ECCR1_CE_ADDR_LO_OFFSET			0x10F8
+#define ECCR1_CE_ADDR_HI_OFFSET			0x10FC
+#define ECCR1_CE_DATA_LO_OFFSET			0x1100
+#define ECCR1_CE_DATA_HI_OFFSET			0x1104	/* between DATA_LO and DATA_PAR */
+#define ECCR1_CE_DATA_PAR_OFFSET		0x1108
+
+#define ECCR1_UERR_STAT_OFFSET			0x110C
+#define ECCR1_UE_ADDR_LO_OFFSET			0x1110
+#define ECCR1_UE_ADDR_HI_OFFSET			0x1114
+#define ECCR1_UE_DATA_LO_OFFSET			0x1118
+#define ECCR1_UE_DATA_HI_OFFSET			0x111C
+#define ECCR1_UE_DATA_PAR_OFFSET		0x1120
+
+#define XDDR_NOC_REG_ADEC4_OFFSET		0x44
+#define RANK_1_MASK				GENMASK(11, 6)
+#define LRANK_0_MASK				GENMASK(17, 12)
+#define LRANK_1_MASK				GENMASK(23, 18)
+#define MASK_24					GENMASK(29, 24)
+
+#define XDDR_NOC_REG_ADEC5_OFFSET		0x48
+#define XDDR_NOC_REG_ADEC6_OFFSET		0x4C
+#define XDDR_NOC_REG_ADEC7_OFFSET		0x50
+#define XDDR_NOC_REG_ADEC8_OFFSET		0x54
+#define XDDR_NOC_REG_ADEC9_OFFSET		0x58
+#define XDDR_NOC_REG_ADEC10_OFFSET		0x5C
+
+#define XDDR_NOC_REG_ADEC11_OFFSET		0x60
+#define MASK_0					GENMASK(5, 0)
+#define GRP_0_MASK				GENMASK(11, 6)
+#define GRP_1_MASK				GENMASK(17, 12)
+#define CH_0_MASK				GENMASK(23, 18)
+
+#define XDDR_NOC_REG_ADEC12_OFFSET		0x71C
+#define XDDR_NOC_REG_ADEC13_OFFSET		0x720
+
+#define XDDR_NOC_REG_ADEC14_OFFSET		0x724
+#define XDDR_NOC_ROW_MATCH_MASK			GENMASK(17, 0)
+#define XDDR_NOC_COL_MATCH_MASK			GENMASK(27, 18)
+#define XDDR_NOC_BANK_MATCH_MASK		GENMASK(29, 28)
+#define XDDR_NOC_GRP_MATCH_MASK			GENMASK(31, 30)
+
+#define XDDR_NOC_REG_ADEC15_OFFSET		0x728
+#define XDDR_NOC_RANK_MATCH_MASK		GENMASK(1, 0)
+#define XDDR_NOC_LRANK_MATCH_MASK		GENMASK(4, 2)
+#define XDDR_NOC_CH_MATCH_MASK			BIT(5)
+#define XDDR_NOC_MOD_SEL_MASK			BIT(6)
+#define XDDR_NOC_MATCH_EN_MASK			BIT(8)
+
+#define ECCR_UE_CE_ADDR_HI_ROW_MASK		GENMASK(7, 0)
+
+#define XDDR_EDAC_NR_CSROWS			1
+#define XDDR_EDAC_NR_CHANS			1
+
+#define XDDR_BUS_WIDTH_64			0
+#define XDDR_BUS_WIDTH_32			1
+#define XDDR_BUS_WIDTH_16			2
+
+#define ECC_CEPOISON_MASK			0x1
+#define ECC_UEPOISON_MASK			0x3
+
+#define XDDR_MAX_ROW_CNT			18
+#define XDDR_MAX_COL_CNT			10
+#define XDDR_MAX_RANK_CNT			2
+#define XDDR_MAX_LRANK_CNT			3
+#define XDDR_MAX_BANK_CNT			2
+#define XDDR_MAX_GRP_CNT			2
+
+/*
+ * Config and system registers are usually locked. This is the
+ * code which unlocks them in order to accept writes. See
+ *
+ * https://docs.xilinx.com/r/en-US/am012-versal-register-reference/PCSR_LOCK-XRAM_SLCR-Register
+ */
+#define PCSR_UNLOCK_VAL				0xF9E8D7C6
+#define XDDR_ERR_TYPE_CE			0
+#define XDDR_ERR_TYPE_UE			1
+
+#define XILINX_DRAM_SIZE_4G			0
+#define XILINX_DRAM_SIZE_6G			1
+#define XILINX_DRAM_SIZE_8G			2
+#define XILINX_DRAM_SIZE_12G			3
+#define XILINX_DRAM_SIZE_16G			4
+#define XILINX_DRAM_SIZE_32G			5
+
+/**
+ * union ecc_error_info - ECC error log information.
+ * @burstpos:		Burst position.
+ * @lrank:		Logical Rank number.
+ * @rank:		Rank number.
+ * @group:		Group number.
+ * @bank:		Bank number.
+ * @col:		Column number.
+ * @row:		Row number, lower 10 bits.
+ * @rowhi:		Row number higher bits (second 32-bit word).
+ * @i:			Raw 64-bit value overlaying the fields above.
+ */
+union ecc_error_info {
+	struct {
+		u32 burstpos:3;
+		u32 lrank:3;
+		u32 rank:2;
+		u32 group:2;
+		u32 bank:2;
+		u32 col:10;
+		u32 row:10;
+		u32 rowhi;
+	};
+	u64 i;
+} __packed;
+
+union edac_info {
+	struct {
+		u32 row0:6;
+		u32 row1:6;
+		u32 row2:6;
+		u32 row3:6;
+		u32 row4:6;
+		u32 reserved:2;
+	};
+	struct {
+		u32 col1:6;
+		u32 col2:6;
+		u32 col3:6;
+		u32 col4:6;
+		u32 col5:6;
+		u32 reservedcol:2;
+	};
+	u32 i;
+} __packed;
+
+/**
+ * struct ecc_status - ECC status information to report.
+ * @ceinfo:	Correctable error log information.
+ * @ueinfo:	Uncorrectable error log information.
+ * @channel:	Channel number.
+ * @error_type:	Error type information.
+ */
+struct ecc_status {
+	union ecc_error_info ceinfo[2];
+	union ecc_error_info ueinfo[2];
+	u8 channel;
+	u8 error_type;
+};
+
+/**
+ * struct edac_priv - DDR memory controller private instance data.
+ * @ddrmc_baseaddr:	Base address of the DDR controller.
+ * @ddrmc_noc_baseaddr:	Base address of the DDRMC NOC.
+ * @message:		Buffer for framing the event specific info.
+ * @mc_id:		Memory controller ID.
+ * @ce_cnt:		Correctable error count.
+ * @ue_cnt:		UnCorrectable error count.
+ * @stat:		ECC status information.
+ * @lrank_bit:		Bit shifts for lrank bit.
+ * @rank_bit:		Bit shifts for rank bit.
+ * @row_bit:		Bit shifts for row bit.
+ * @col_bit:		Bit shifts for column bit.
+ * @bank_bit:		Bit shifts for bank bit.
+ * @grp_bit:		Bit shifts for group bit.
+ * @ch_bit:		Bit shifts for channel bit.
+ * @err_inject_addr:	Data poison address.
+ * @debugfs:		Debugfs handle.
+ */
+struct edac_priv {
+	void __iomem *ddrmc_baseaddr;
+	void __iomem *ddrmc_noc_baseaddr;
+	char message[XDDR_EDAC_MSG_SIZE];
+	u32 mc_id;
+	u32 ce_cnt;
+	u32 ue_cnt;
+	struct ecc_status stat;
+	u32 lrank_bit[3];
+	u32 rank_bit[2];
+	u32 row_bit[18];
+	u32 col_bit[10];
+	u32 bank_bit[2];
+	u32 grp_bit[2];
+	u32 ch_bit;
+#ifdef CONFIG_EDAC_DEBUG
+	u64 err_inject_addr;
+	struct dentry *debugfs;
+#endif
+};
+
+/* Latch the ECCR0/ECCR1 correctable-error address logs into stat.ceinfo[]. */
+static void get_ce_error_info(struct edac_priv *priv)
+{
+	void __iomem *ddrmc_base = priv->ddrmc_baseaddr;
+	struct ecc_status *p = &priv->stat;
+	u32 regval;
+	u64 reghi;
+
+	p->error_type = XDDR_ERR_TYPE_CE;
+
+	/* The upper row bits live in ADDR_HI, not in the low address word. */
+	regval = readl(ddrmc_base + ECCR0_CE_ADDR_LO_OFFSET);
+	reghi = readl(ddrmc_base + ECCR0_CE_ADDR_HI_OFFSET);
+	reghi &= ECCR_UE_CE_ADDR_HI_ROW_MASK;
+	p->ceinfo[0].i = regval | reghi << 32;
+
+	edac_dbg(2, "ERR DATA: 0x%08X%08X ERR DATA PARITY: 0x%08X\n",
+		 readl(ddrmc_base + ECCR0_CE_DATA_LO_OFFSET),
+		 readl(ddrmc_base + ECCR0_CE_DATA_HI_OFFSET),
+		 readl(ddrmc_base + ECCR0_CE_DATA_PAR_OFFSET));
+
+	regval = readl(ddrmc_base + ECCR1_CE_ADDR_LO_OFFSET);
+	reghi = readl(ddrmc_base + ECCR1_CE_ADDR_HI_OFFSET);
+	p->ceinfo[1].i = regval | reghi << 32;
+	regval = readl(ddrmc_base + ECCR1_CE_ADDR_HI_OFFSET);
+
+	edac_dbg(2, "ERR DATA: 0x%08X%08X ERR DATA PARITY: 0x%08X\n",
+		 readl(ddrmc_base + ECCR1_CE_DATA_LO_OFFSET),
+		 readl(ddrmc_base + ECCR1_CE_DATA_HI_OFFSET),
+		 readl(ddrmc_base + ECCR1_CE_DATA_PAR_OFFSET));
+}
+
+static void get_ue_error_info(struct edac_priv *priv)
+{
+	void __iomem *ddrmc_base;
+	struct ecc_status *p;
+	u32  regval;
+	u64 reghi;
+
+	ddrmc_base = priv->ddrmc_baseaddr;
+	p = &priv->stat;
+
+	p->error_type = XDDR_ERR_TYPE_UE;
+	regval = readl(ddrmc_base + ECCR0_UE_ADDR_LO_OFFSET);
+	reghi = readl(ddrmc_base + ECCR0_UE_ADDR_HI_OFFSET);
+
+	p->ueinfo[0].i = regval | reghi << 32;
+	regval = readl(ddrmc_base + ECCR0_UE_ADDR_HI_OFFSET);
+
+	edac_dbg(2, "ERR DATA: 0x%08X%08X ERR DATA PARITY: 0x%08X\n",
+		 readl(ddrmc_base + ECCR0_UE_DATA_LO_OFFSET),
+		 readl(ddrmc_base + ECCR0_UE_DATA_HI_OFFSET),
+		 readl(ddrmc_base + ECCR0_UE_DATA_PAR_OFFSET));
+
+	regval = readl(ddrmc_base + ECCR1_UE_ADDR_LO_OFFSET);
+	reghi = readl(ddrmc_base + ECCR1_UE_ADDR_HI_OFFSET);
+	p->ueinfo[1].i = regval | reghi << 32;
+
+	edac_dbg(2, "ERR DATA: 0x%08X%08X ERR DATA PARITY: 0x%08X\n",
+		 readl(ddrmc_base + ECCR1_UE_DATA_LO_OFFSET),
+		 readl(ddrmc_base + ECCR1_UE_DATA_HI_OFFSET),
+		 readl(ddrmc_base + ECCR1_UE_DATA_PAR_OFFSET));
+}
+
+/*
+ * get_error_info - Snapshot and clear the ECC error status.
+ *
+ * Return: true when no CE/UE status register was set (nothing to report),
+ * false once the error logs have been latched into priv->stat.
+ */
+static bool get_error_info(struct edac_priv *priv)
+{
+	void __iomem *base = priv->ddrmc_baseaddr;
+	struct ecc_status *stat = &priv->stat;
+	u32 ce0, ce1, ue0, ue1;
+
+	ce0 = readl(base + ECCR0_CERR_STAT_OFFSET);
+	ce1 = readl(base + ECCR1_CERR_STAT_OFFSET);
+	ue0 = readl(base + ECCR0_UERR_STAT_OFFSET);
+	ue1 = readl(base + ECCR1_UERR_STAT_OFFSET);
+
+	if (!(ce0 || ce1 || ue0 || ue1))
+		return true;
+
+	/* Start from the channel implied by the CE status... */
+	stat->channel = ce0 ? 0 : 1;
+
+	if (ce0 || ce1)
+		get_ce_error_info(priv);
+
+	if (ue0 || ue1) {
+		/* ...a pending UE overrides the channel choice. */
+		stat->channel = ue0 ? 0 : 1;
+		get_ue_error_info(priv);
+	}
+
+	/* Unlock the PCSR registers */
+	writel(PCSR_UNLOCK_VAL, base + XDDR_PCSR_OFFSET);
+
+	writel(0, base + ECCR0_CERR_STAT_OFFSET);
+	writel(0, base + ECCR1_CERR_STAT_OFFSET);
+	writel(0, base + ECCR0_UERR_STAT_OFFSET);
+	writel(0, base + ECCR1_UERR_STAT_OFFSET);
+
+	/* Lock the PCSR registers */
+	writel(1, base + XDDR_PCSR_OFFSET);
+
+	return false;
+}
+
+/**
+ * convert_to_physical - Convert to physical address.
+ * @priv:	DDR memory controller private instance data.
+ * @pinf:	ECC error info structure.
+ *
+ * Return: Physical address of the DDR memory.
+ */
+static unsigned long convert_to_physical(struct edac_priv *priv, union ecc_error_info pinf)
+{
+	unsigned long err_addr = 0;
+	u32 index;
+	u32 row;
+
+	row = pinf.rowhi << 10 | pinf.row;
+	for (index = 0; index < XDDR_MAX_ROW_CNT; index++) {
+		err_addr |= (row & BIT(0)) << priv->row_bit[index];
+		row >>= 1;
+	}
+
+	for (index = 0; index < XDDR_MAX_COL_CNT; index++) {
+		err_addr |= (pinf.col & BIT(0)) << priv->col_bit[index];
+		pinf.col >>= 1;
+	}
+
+	for (index = 0; index < XDDR_MAX_BANK_CNT; index++) {
+		err_addr |= (pinf.bank & BIT(0)) << priv->bank_bit[index];
+		pinf.bank >>= 1;
+	}
+
+	for (index = 0; index < XDDR_MAX_GRP_CNT; index++) {
+		err_addr |= (pinf.group & BIT(0)) << priv->grp_bit[index];
+		pinf.group >>= 1;
+	}
+
+	for (index = 0; index < XDDR_MAX_RANK_CNT; index++) {
+		err_addr |= (pinf.rank & BIT(0)) << priv->rank_bit[index];
+		pinf.rank >>= 1;
+	}
+
+	for (index = 0; index < XDDR_MAX_LRANK_CNT; index++) {
+		err_addr |= (pinf.lrank & BIT(0)) << priv->lrank_bit[index];
+		pinf.lrank >>= 1;
+	}
+
+	err_addr |= (priv->stat.channel & BIT(0)) << priv->ch_bit;
+
+	return err_addr;
+}
+
+/**
+ * handle_error - Handle Correctable and Uncorrectable errors.
+ * @mci:	EDAC memory controller instance.
+ * @stat:	ECC status structure.
+ *
+ * Reports one CE or UE event to the EDAC core, then clears @stat.
+ */
+static void handle_error(struct mem_ctl_info *mci, struct ecc_status *stat)
+{
+	struct edac_priv *priv = mci->pvt_info;
+	union ecc_error_info pinf;
+
+	if (stat->error_type == XDDR_ERR_TYPE_CE) {
+		priv->ce_cnt++;
+		pinf = stat->ceinfo[stat->channel];
+		snprintf(priv->message, XDDR_EDAC_MSG_SIZE,
+			 "Error type:%s MC ID: %d Addr at %lx Burst Pos: %d\n",
+			 "CE", priv->mc_id,
+			 convert_to_physical(priv, pinf), pinf.burstpos);
+		/* error_count is per event: report 1, not the running total */
+		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+				     1, 0, 0, 0, 0, 0, -1,
+				     priv->message, "");
+	}
+
+	if (stat->error_type == XDDR_ERR_TYPE_UE) {
+		priv->ue_cnt++;
+		pinf = stat->ueinfo[stat->channel];
+		snprintf(priv->message, XDDR_EDAC_MSG_SIZE,
+			 "Error type:%s MC ID: %d Addr at %lx Burst Pos: %d\n",
+			 "UE", priv->mc_id,
+			 convert_to_physical(priv, pinf), pinf.burstpos);
+		/* error_count is per event: report 1, not the running total */
+		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+				     1, 0, 0, 0, 0, 0, -1,
+				     priv->message, "");
+	}
+
+	memset(stat, 0, sizeof(*stat));
+}
+
+/**
+ * err_callback - Handle Correctable and Uncorrectable errors.
+ * @payload:	payload data.
+ * @data:	mci controller data.
+ *
+ * Handles ECC correctable and uncorrectable errors.
+ */
+static void err_callback(const u32 *payload, void *data)
+{
+	struct mem_ctl_info *mci = (struct mem_ctl_info *)data;
+	struct edac_priv *priv;
+	struct ecc_status *p;
+	int regval;
+
+	priv = mci->pvt_info;
+	p = &priv->stat;
+
+	regval = readl(priv->ddrmc_baseaddr + XDDR_ISR_OFFSET);
+
+	if (payload[EVENT] == XPM_EVENT_ERROR_MASK_DDRMC_CR)
+		p->error_type = XDDR_ERR_TYPE_CE;
+	if (payload[EVENT] == XPM_EVENT_ERROR_MASK_DDRMC_NCR)
+		p->error_type = XDDR_ERR_TYPE_UE;
+
+	if (get_error_info(priv))
+		return;
+
+	handle_error(mci, &priv->stat);
+
+	/* Unlock the PCSR registers */
+	writel(PCSR_UNLOCK_VAL, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
+
+	/* Clear the ISR */
+	writel(regval, priv->ddrmc_baseaddr + XDDR_ISR_OFFSET);
+
+	/* Lock the PCSR registers */
+	writel(1, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
+	edac_dbg(3, "Total error count CE %d UE %d\n",
+		 priv->ce_cnt, priv->ue_cnt);
+}
+
+/**
+ * get_dwidth - Return the controller memory width.
+ * @base:	DDR memory controller base address.
+ *
+ * Get the EDAC device type width appropriate for the controller
+ * configuration.
+ *
+ * Return: a device type width enumeration.
+ */
+static enum dev_type get_dwidth(const void __iomem *base)
+{
+	enum dev_type dt;
+	u32 regval;
+	u32 width;
+
+	regval = readl(base + XDDR_REG_CONFIG0_OFFSET);
+	width  = FIELD_GET(XDDR_REG_CONFIG0_BUS_WIDTH_MASK, regval);
+
+	switch (width) {
+	case XDDR_BUS_WIDTH_16:
+		dt = DEV_X2;
+		break;
+	case XDDR_BUS_WIDTH_32:
+		dt = DEV_X4;
+		break;
+	case XDDR_BUS_WIDTH_64:
+		dt = DEV_X8;
+		break;
+	default:
+		dt = DEV_UNKNOWN;
+	}
+
+	return dt;
+}
+
+/**
+ * get_ecc_state - Return the controller ECC enable/disable status.
+ * @base:	DDR memory controller base address.
+ *
+ * Get the ECC enable/disable status for the controller.
+ *
+ * Return: a ECC status boolean i.e true/false - enabled/disabled.
+ */
+static bool get_ecc_state(void __iomem *base)
+{
+	enum dev_type dt;
+	u32 ecctype;
+
+	dt = get_dwidth(base);
+	if (dt == DEV_UNKNOWN)
+		return false;
+
+	ecctype = readl(base + XDDR_REG_PINOUT_OFFSET);
+	ecctype &= XDDR_REG_PINOUT_ECC_EN_MASK;
+
+	return !!ecctype;
+}
+
+/**
+ * get_memsize - Get the size of the attached memory device.
+ * @priv:	DDR memory controller private instance data.
+ *
+ * Return: the memory size in bytes.
+ */
+static u64 get_memsize(struct edac_priv *priv)
+{
+	u32 regval;
+	u64 size;
+
+	regval = readl(priv->ddrmc_baseaddr + XDDR_REG_CONFIG0_OFFSET);
+	regval  = FIELD_GET(XDDR_REG_CONFIG0_SIZE_MASK, regval);
+
+	switch (regval) {
+	case XILINX_DRAM_SIZE_4G:
+		size = 4U;      break;
+	case XILINX_DRAM_SIZE_6G:
+		size = 6U;      break;
+	case XILINX_DRAM_SIZE_8G:
+		size = 8U;      break;
+	case XILINX_DRAM_SIZE_12G:
+		size = 12U;     break;
+	case XILINX_DRAM_SIZE_16G:
+		size = 16U;     break;
+	case XILINX_DRAM_SIZE_32G:
+		size = 32U;     break;
+	/* Invalid configuration */
+	default:
+		size = 0;	break;
+	}
+
+	size *= SZ_1G;
+	return size;
+}
+
+/**
+ * init_csrows - Initialize the csrow data.
+ * @mci:	EDAC memory controller instance.
+ *
+ * Initialize the chip select rows associated with the EDAC memory
+ * controller instance.
+ */
+static void init_csrows(struct mem_ctl_info *mci)
+{
+	struct edac_priv *priv = mci->pvt_info;
+	struct csrow_info *csi;
+	struct dimm_info *dimm;
+	unsigned long size;
+	u32 row;
+	int ch;
+
+	size = get_memsize(priv);
+	for (row = 0; row < mci->nr_csrows; row++) {
+		csi = mci->csrows[row];
+		for (ch = 0; ch < csi->nr_channels; ch++) {
+			dimm = csi->channels[ch]->dimm;
+			dimm->edac_mode	= EDAC_SECDED;
+			dimm->mtype = MEM_DDR4;
+			dimm->nr_pages = (size >> PAGE_SHIFT) / csi->nr_channels;
+			dimm->grain = XDDR_EDAC_ERR_GRAIN;
+			dimm->dtype = get_dwidth(priv->ddrmc_baseaddr);
+		}
+	}
+}
+
+/**
+ * mc_init - Initialize one driver instance.
+ * @mci:	EDAC memory controller instance.
+ * @pdev:	platform device.
+ *
+ * Perform initialization of the EDAC memory controller instance and
+ * related driver-private data associated with the memory controller the
+ * instance is bound to.
+ */
+static void mc_init(struct mem_ctl_info *mci, struct platform_device *pdev)
+{
+	mci->pdev = &pdev->dev;
+	platform_set_drvdata(pdev, mci);
+
+	/* Initialize controller capabilities and configuration */
+	mci->mtype_cap = MEM_FLAG_DDR4;
+	mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED;
+	mci->scrub_cap = SCRUB_HW_SRC;
+	mci->scrub_mode = SCRUB_NONE;
+
+	mci->edac_cap = EDAC_FLAG_SECDED;
+	mci->ctl_name = "xlnx_ddr_controller";
+	mci->dev_name = dev_name(&pdev->dev);
+	mci->mod_name = "xlnx_edac";
+
+	edac_op_state = EDAC_OPSTATE_INT;
+
+	init_csrows(mci);
+}
+
+static void enable_intr(struct edac_priv *priv)
+{
+	/* Unlock the PCSR registers */
+	writel(PCSR_UNLOCK_VAL, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
+
+	/* Enable UE and CE Interrupts to support the interrupt case */
+	writel(XDDR_IRQ_CE_MASK | XDDR_IRQ_UE_MASK,
+	       priv->ddrmc_baseaddr + XDDR_IRQ_EN_OFFSET);
+
+	writel(XDDR_IRQ_UE_MASK,
+	       priv->ddrmc_baseaddr + XDDR_IRQ1_EN_OFFSET);
+	/* Lock the PCSR registers */
+	writel(1, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
+}
+
+static void disable_intr(struct edac_priv *priv)
+{
+	/* Unlock the PCSR registers */
+	writel(PCSR_UNLOCK_VAL, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
+
+	/* Disable UE/CE Interrupts */
+	writel(XDDR_IRQ_CE_MASK | XDDR_IRQ_UE_MASK,
+	       priv->ddrmc_baseaddr + XDDR_IRQ_DIS_OFFSET);
+
+	/* Lock the PCSR registers */
+	writel(1, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
+}
+
+#define to_mci(k) container_of(k, struct mem_ctl_info, dev)
+
+#ifdef CONFIG_EDAC_DEBUG
+/**
+ * poison_setup - Update poison registers.
+ * @priv:	DDR memory controller private instance data.
+ *
+ * Update poison registers as per DDR mapping upon write of the address
+ * location the fault is injected.
+ * Return: none.
+ */
+static void poison_setup(struct edac_priv *priv)
+{
+	u32 col = 0, row = 0, bank = 0, grp = 0, rank = 0, lrank = 0, ch = 0;
+	u32 index, regval;
+
+	for (index = 0; index < XDDR_MAX_ROW_CNT; index++) {
+		row |= (((priv->err_inject_addr >> priv->row_bit[index]) &
+						BIT(0)) << index);
+	}
+
+	for (index = 0; index < XDDR_MAX_COL_CNT; index++) {
+		col |= (((priv->err_inject_addr >> priv->col_bit[index]) &
+						BIT(0)) << index);
+	}
+
+	for (index = 0; index < XDDR_MAX_BANK_CNT; index++) {
+		bank |= (((priv->err_inject_addr >> priv->bank_bit[index]) &
+						BIT(0)) << index);
+	}
+
+	for (index = 0; index < XDDR_MAX_GRP_CNT; index++) {
+		grp |= (((priv->err_inject_addr >> priv->grp_bit[index]) &
+						BIT(0)) << index);
+	}
+
+	for (index = 0; index < XDDR_MAX_RANK_CNT; index++) {
+		rank |= (((priv->err_inject_addr >> priv->rank_bit[index]) &
+						BIT(0)) << index);
+	}
+
+	for (index = 0; index < XDDR_MAX_LRANK_CNT; index++) {
+		lrank |= (((priv->err_inject_addr >> priv->lrank_bit[index]) &
+						BIT(0)) << index);
+	}
+
+	ch = (priv->err_inject_addr >> priv->ch_bit) & BIT(0);
+	if (ch)
+		writel(0xFF, priv->ddrmc_baseaddr + ECCW1_FLIP_CTRL);
+	else
+		writel(0xFF, priv->ddrmc_baseaddr + ECCW0_FLIP_CTRL);
+
+	writel(0, priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC12_OFFSET);
+	writel(0, priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC13_OFFSET);
+
+	regval = row & XDDR_NOC_ROW_MATCH_MASK;
+	regval |= FIELD_PREP(XDDR_NOC_COL_MATCH_MASK, col);
+	regval |= FIELD_PREP(XDDR_NOC_BANK_MATCH_MASK, bank);
+	regval |= FIELD_PREP(XDDR_NOC_GRP_MATCH_MASK, grp);
+	writel(regval, priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC14_OFFSET);
+
+	regval = rank & XDDR_NOC_RANK_MATCH_MASK;
+	regval |= FIELD_PREP(XDDR_NOC_LRANK_MATCH_MASK, lrank);
+	regval |= FIELD_PREP(XDDR_NOC_CH_MATCH_MASK, ch);
+	regval |= (XDDR_NOC_MOD_SEL_MASK | XDDR_NOC_MATCH_EN_MASK);
+	writel(regval, priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC15_OFFSET);
+}
+
+/* Program both ECC flip registers: "CE" selects the CE mask, otherwise UE. */
+static ssize_t xddr_inject_data_poison_store(struct mem_ctl_info *mci,
+					     const char *data)
+{
+	struct edac_priv *priv = mci->pvt_info;
+	u32 mask = strncmp(data, "CE", 2) ? ECC_UEPOISON_MASK : ECC_CEPOISON_MASK;
+
+	writel(0, priv->ddrmc_baseaddr + ECCW0_FLIP0_OFFSET);
+	writel(0, priv->ddrmc_baseaddr + ECCW1_FLIP0_OFFSET);
+	writel(mask, priv->ddrmc_baseaddr + ECCW0_FLIP0_OFFSET);
+	writel(mask, priv->ddrmc_baseaddr + ECCW1_FLIP0_OFFSET);
+
+	/* Lock the PCSR registers */
+	writel(1, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
+
+	return 0;
+}
+
+/*
+ * debugfs write handler: @data is a user pointer, so the error type is copied
+ * into a kernel buffer (before any register is touched) instead of read in place.
+ */
+static ssize_t inject_data_poison_store(struct file *file, const char __user *data,
+					size_t count, loff_t *ppos)
+{
+	struct device *dev = file->private_data;
+	struct mem_ctl_info *mci = to_mci(dev);
+	struct edac_priv *priv = mci->pvt_info;
+	char type[3] = { 0 };
+	loff_t pos = 0;
+	ssize_t ret;
+
+	ret = simple_write_to_buffer(type, sizeof(type) - 1, &pos, data, count);
+	if (ret < 0)
+		return ret;
+
+	/* Unlock the PCSR registers */
+	writel(PCSR_UNLOCK_VAL, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
+	writel(PCSR_UNLOCK_VAL, priv->ddrmc_noc_baseaddr + XDDR_PCSR_OFFSET);
+	poison_setup(priv);
+	/* Lock the PCSR registers */
+	writel(1, priv->ddrmc_noc_baseaddr + XDDR_PCSR_OFFSET);
+
+	xddr_inject_data_poison_store(mci, type);
+	return count;
+}
+
+static const struct file_operations xddr_inject_enable_fops = {
+	.open = simple_open,
+	.write = inject_data_poison_store,
+	.llseek = generic_file_llseek,
+};
+
+static void create_debugfs_attributes(struct mem_ctl_info *mci)
+{
+	struct edac_priv *priv = mci->pvt_info;
+
+	priv->debugfs = edac_debugfs_create_dir(mci->dev_name);
+	if (!priv->debugfs)
+		return;
+
+	edac_debugfs_create_file("inject_error", 0200, priv->debugfs,
+				 &mci->dev, &xddr_inject_enable_fops);
+	debugfs_create_x64("address", 0600, priv->debugfs,
+			   &priv->err_inject_addr);
+	mci->debugfs = priv->debugfs;
+}
+
+static inline void process_bit(struct edac_priv *priv, unsigned int start, u32 regval)
+{
+	union edac_info rows;
+
+	rows.i  = regval;
+	priv->row_bit[start]	 = rows.row0;
+	priv->row_bit[start + 1] = rows.row1;
+	priv->row_bit[start + 2] = rows.row2;
+	priv->row_bit[start + 3] = rows.row3;
+	priv->row_bit[start + 4] = rows.row4;
+}
+
+static void setup_row_address_map(struct edac_priv *priv)
+{
+	u32 regval;
+	union edac_info rows;
+
+	regval = readl(priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC5_OFFSET);
+	process_bit(priv, 0, regval);
+
+	regval = readl(priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC6_OFFSET);
+	process_bit(priv, 5, regval);
+
+	regval = readl(priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC7_OFFSET);
+	process_bit(priv, 10, regval);
+
+	regval = readl(priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC8_OFFSET);
+	rows.i  = regval;
+
+	priv->row_bit[15] = rows.row0;
+	priv->row_bit[16] = rows.row1;
+	priv->row_bit[17] = rows.row2;
+}
+
+static void setup_column_address_map(struct edac_priv *priv)
+{
+	u32 regval;
+	union edac_info cols;
+
+	regval = readl(priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC8_OFFSET);
+	priv->col_bit[0] = FIELD_GET(MASK_24, regval);
+
+	regval = readl(priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC9_OFFSET);
+	cols.i  = regval;
+	priv->col_bit[1] = cols.col1;
+	priv->col_bit[2] = cols.col2;
+	priv->col_bit[3] = cols.col3;
+	priv->col_bit[4] = cols.col4;
+	priv->col_bit[5] = cols.col5;
+
+	regval = readl(priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC10_OFFSET);
+	cols.i  = regval;
+	priv->col_bit[6] = cols.col1;
+	priv->col_bit[7] = cols.col2;
+	priv->col_bit[8] = cols.col3;
+	priv->col_bit[9] = cols.col4;
+}
+
+static void setup_bank_grp_ch_address_map(struct edac_priv *priv)
+{
+	u32 regval;
+
+	regval = readl(priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC10_OFFSET);
+	priv->bank_bit[0] = FIELD_GET(MASK_24, regval);
+
+	regval = readl(priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC11_OFFSET);
+	priv->bank_bit[1] = (regval & MASK_0);
+	priv->grp_bit[0] = FIELD_GET(GRP_0_MASK, regval);
+	priv->grp_bit[1] = FIELD_GET(GRP_1_MASK, regval);
+	priv->ch_bit = FIELD_GET(CH_0_MASK, regval);
+}
+
+static void setup_rank_lrank_address_map(struct edac_priv *priv)
+{
+	u32 regval;
+
+	regval = readl(priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC4_OFFSET);
+	priv->rank_bit[0] = (regval & MASK_0);
+	priv->rank_bit[1] = FIELD_GET(RANK_1_MASK, regval);
+	priv->lrank_bit[0] = FIELD_GET(LRANK_0_MASK, regval);
+	priv->lrank_bit[1] = FIELD_GET(LRANK_1_MASK, regval);
+	priv->lrank_bit[2] = FIELD_GET(MASK_24, regval);
+}
+
+/**
+ * setup_address_map - Set Address Map by querying ADDRMAP registers.
+ * @priv:	DDR memory controller private instance data.
+ *
+ * Set Address Map by querying ADDRMAP registers.
+ *
+ * Return: none.
+ */
+static void setup_address_map(struct edac_priv *priv)
+{
+	setup_row_address_map(priv);
+
+	setup_column_address_map(priv);
+
+	setup_bank_grp_ch_address_map(priv);
+
+	setup_rank_lrank_address_map(priv);
+}
+#endif /* CONFIG_EDAC_DEBUG */
+
+static const struct of_device_id xlnx_edac_match[] = {
+	{ .compatible = "xlnx,versal-ddrmc", },
+	{
+		/* end of table */
+	}
+};
+
+MODULE_DEVICE_TABLE(of, xlnx_edac_match);
+static u32 emif_get_id(struct device_node *node)
+{
+	u32 addr, my_addr, my_id = 0;
+	struct device_node *np;
+	const __be32 *addrp;
+
+	addrp = of_get_address(node, 0, NULL, NULL);
+	my_addr = (u32)of_translate_address(node, addrp);
+
+	for_each_matching_node(np, xlnx_edac_match) {
+		if (np == node)
+			continue;
+
+		addrp = of_get_address(np, 0, NULL, NULL);
+		addr = (u32)of_translate_address(np, addrp);
+
+		edac_printk(KERN_INFO, EDAC_MC,
+			    "addr=%x, my_addr=%x\n",
+			    addr, my_addr);
+
+		if (addr < my_addr)
+			my_id++;
+	}
+
+	return my_id;
+}
+
+/* Probe one DDRMC: size the EDAC layers from CONFIG0 and register with EDAC. */
+static int mc_probe(struct platform_device *pdev)
+{
+	void __iomem *ddrmc_baseaddr, *ddrmc_noc_baseaddr;
+	struct edac_mc_layer layers[2];
+	struct mem_ctl_info *mci;
+	u8 num_chans, num_csrows;
+	struct edac_priv *priv;
+	u32 edac_mc_id, regval;
+	int rc;
+
+	ddrmc_baseaddr = devm_platform_ioremap_resource_byname(pdev, "base");
+	if (IS_ERR(ddrmc_baseaddr))
+		return PTR_ERR(ddrmc_baseaddr);
+
+	ddrmc_noc_baseaddr = devm_platform_ioremap_resource_byname(pdev, "noc");
+	if (IS_ERR(ddrmc_noc_baseaddr))
+		return PTR_ERR(ddrmc_noc_baseaddr);
+
+	if (!get_ecc_state(ddrmc_baseaddr))
+		return -ENXIO;
+
+	/* Allocate ID number for the EMIF controller */
+	edac_mc_id = emif_get_id(pdev->dev.of_node);
+
+	regval = readl(ddrmc_baseaddr + XDDR_REG_CONFIG0_OFFSET);
+	num_chans = FIELD_GET(XDDR_REG_CONFIG0_NUM_CHANS_MASK, regval) + 1;
+
+	num_csrows = FIELD_GET(XDDR_REG_CONFIG0_NUM_RANKS_MASK, regval);
+	num_csrows *= 2;
+	if (!num_csrows)
+		num_csrows = 1;
+
+	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
+	layers[0].size = num_csrows;
+	layers[0].is_virt_csrow = true;
+	layers[1].type = EDAC_MC_LAYER_CHANNEL;
+	layers[1].size = num_chans;
+	layers[1].is_virt_csrow = false;
+
+	mci = edac_mc_alloc(edac_mc_id, ARRAY_SIZE(layers), layers,
+			    sizeof(struct edac_priv));
+	if (!mci) {
+		edac_printk(KERN_ERR, EDAC_MC,
+			    "Failed memory allocation for mc instance\n");
+		return -ENOMEM;
+	}
+
+	priv = mci->pvt_info;
+	priv->ddrmc_baseaddr = ddrmc_baseaddr;
+	priv->ddrmc_noc_baseaddr = ddrmc_noc_baseaddr;
+	priv->ce_cnt = 0;
+	priv->ue_cnt = 0;
+	priv->mc_id = edac_mc_id;
+
+	mc_init(mci, pdev);
+
+	rc = edac_mc_add_mc(mci);
+	if (rc) {
+		edac_printk(KERN_ERR, EDAC_MC,
+			    "Failed to register with EDAC core\n");
+		goto free_edac_mc;
+	}
+
+	rc = xlnx_register_event(PM_NOTIFY_CB, EVENT_ERROR_PMC_ERR1,
+				 XPM_EVENT_ERROR_MASK_DDRMC_CR | XPM_EVENT_ERROR_MASK_DDRMC_NCR |
+				 XPM_EVENT_ERROR_MASK_NOC_CR | XPM_EVENT_ERROR_MASK_NOC_NCR,
+				 false, err_callback, mci);
+	if (rc) {
+		if (rc == -EACCES)
+			rc = -EPROBE_DEFER;
+
+		goto del_mc;
+	}
+
+#ifdef CONFIG_EDAC_DEBUG
+	create_debugfs_attributes(mci);
+	setup_address_map(priv);
+#endif
+	enable_intr(priv);
+	return rc;
+
+del_mc:
+	edac_mc_del_mc(&pdev->dev);
+free_edac_mc:
+	edac_mc_free(mci);
+
+	return rc;
+}
+
+static int mc_remove(struct platform_device *pdev)
+{
+	struct mem_ctl_info *mci = platform_get_drvdata(pdev);
+	struct edac_priv *priv = mci->pvt_info;
+
+	disable_intr(priv);
+
+#ifdef CONFIG_EDAC_DEBUG
+	debugfs_remove_recursive(priv->debugfs);
+#endif
+
+	xlnx_unregister_event(PM_NOTIFY_CB, EVENT_ERROR_PMC_ERR1,
+			      XPM_EVENT_ERROR_MASK_DDRMC_CR |
+			      XPM_EVENT_ERROR_MASK_NOC_CR |
+			      XPM_EVENT_ERROR_MASK_NOC_NCR |
+			      XPM_EVENT_ERROR_MASK_DDRMC_NCR, err_callback, mci);
+	edac_mc_del_mc(&pdev->dev);
+	edac_mc_free(mci);
+
+	return 0;
+}
+
+static struct platform_driver xilinx_ddr_edac_mc_driver = {
+	.driver = {
+		.name = "xilinx-ddrmc-edac",
+		.of_match_table = xlnx_edac_match,
+	},
+	.probe = mc_probe,
+	.remove = mc_remove,
+};
+
+module_platform_driver(xilinx_ddr_edac_mc_driver);
+
+MODULE_AUTHOR("AMD Inc");
+MODULE_DESCRIPTION("Xilinx DDRMC ECC driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/firewire/sbp2.c b/drivers/firewire/sbp2.c
index 749868b9e80d..7edf2c95282f 100644
--- a/drivers/firewire/sbp2.c
+++ b/drivers/firewire/sbp2.c
@@ -1521,6 +1521,7 @@ static int sbp2_scsi_slave_configure(struct scsi_device *sdev)
 	if (sbp2_param_exclusive_login) {
 		sdev->manage_system_start_stop = true;
 		sdev->manage_runtime_start_stop = true;
+		sdev->manage_shutdown = true;
 	}
 
 	if (sdev->type == TYPE_ROM)
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index a1157c2a7170..ef4c12f0877b 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -108,13 +108,6 @@ lib-y				:= $(patsubst %.o,%.stub.o,$(lib-y))
 # https://bugs.llvm.org/show_bug.cgi?id=46480
 STUBCOPY_FLAGS-y		+= --remove-section=.note.gnu.property
 
-#
-# For x86, bootloaders like systemd-boot or grub-efi do not zero-initialize the
-# .bss section, so the .bss section of the EFI stub needs to be included in the
-# .data section of the compressed kernel to ensure initialization. Rename the
-# .bss section here so it's easy to pick out in the linker script.
-#
-STUBCOPY_FLAGS-$(CONFIG_X86)	+= --rename-section .bss=.bss.efistub,load,alloc
 STUBCOPY_RELOC-$(CONFIG_X86_32)	:= R_386_32
 STUBCOPY_RELOC-$(CONFIG_X86_64)	:= R_X86_64_64
 
diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c
index 9d5df683f882..1bfdae34df39 100644
--- a/drivers/firmware/efi/libstub/x86-stub.c
+++ b/drivers/firmware/efi/libstub/x86-stub.c
@@ -449,9 +449,8 @@ void __noreturn efi_stub_entry(efi_handle_t handle,
 efi_status_t __efiapi efi_pe_entry(efi_handle_t handle,
 				   efi_system_table_t *sys_table_arg)
 {
-	struct boot_params *boot_params;
-	struct setup_header *hdr;
-	void *image_base;
+	static struct boot_params boot_params __page_aligned_bss;
+	struct setup_header *hdr = &boot_params.hdr;
 	efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID;
 	int options_size = 0;
 	efi_status_t status;
@@ -469,30 +468,9 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle,
 		efi_exit(handle, status);
 	}
 
-	image_base = efi_table_attr(image, image_base);
-
-	status = efi_allocate_pages(sizeof(struct boot_params),
-				    (unsigned long *)&boot_params, ULONG_MAX);
-	if (status != EFI_SUCCESS) {
-		efi_err("Failed to allocate lowmem for boot params\n");
-		efi_exit(handle, status);
-	}
-
-	memset(boot_params, 0x0, sizeof(struct boot_params));
-
-	hdr = &boot_params->hdr;
-
-	/* Copy the setup header from the second sector to boot_params */
-	memcpy(&hdr->jump, image_base + 512,
-	       sizeof(struct setup_header) - offsetof(struct setup_header, jump));
-
-	/*
-	 * Fill out some of the header fields ourselves because the
-	 * EFI firmware loader doesn't load the first sector.
-	 */
+	/* Assign the setup_header fields that the kernel actually cares about */
 	hdr->root_flags	= 1;
 	hdr->vid_mode	= 0xffff;
-	hdr->boot_flag	= 0xAA55;
 
 	hdr->type_of_loader = 0x21;
 
@@ -501,25 +479,13 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle,
 	if (!cmdline_ptr)
 		goto fail;
 
-	efi_set_u64_split((unsigned long)cmdline_ptr,
-			  &hdr->cmd_line_ptr, &boot_params->ext_cmd_line_ptr);
-
-	hdr->ramdisk_image = 0;
-	hdr->ramdisk_size = 0;
-
-	/*
-	 * Disregard any setup data that was provided by the bootloader:
-	 * setup_data could be pointing anywhere, and we have no way of
-	 * authenticating or validating the payload.
-	 */
-	hdr->setup_data = 0;
+	efi_set_u64_split((unsigned long)cmdline_ptr, &hdr->cmd_line_ptr,
+			  &boot_params.ext_cmd_line_ptr);
 
-	efi_stub_entry(handle, sys_table_arg, boot_params);
+	efi_stub_entry(handle, sys_table_arg, &boot_params);
 	/* not reached */
 
 fail:
-	efi_free(sizeof(struct boot_params), (unsigned long)boot_params);
-
 	efi_exit(handle, status);
 }
 
@@ -849,7 +815,7 @@ void __noreturn efi_stub_entry(efi_handle_t handle,
 	unsigned long kernel_entry;
 	efi_status_t status;
 
-	boot_params_pointer = boot_params;
+	boot_params_ptr = boot_params;
 
 	efi_system_table = sys_table_arg;
 	/* Check if we were booted by the EFI firmware */
diff --git a/drivers/firmware/efi/libstub/x86-stub.h b/drivers/firmware/efi/libstub/x86-stub.h
index 2748bca192df..37c5a36b9d8c 100644
--- a/drivers/firmware/efi/libstub/x86-stub.h
+++ b/drivers/firmware/efi/libstub/x86-stub.h
@@ -2,8 +2,6 @@
 
 #include <linux/efi.h>
 
-extern struct boot_params *boot_params_pointer asm("boot_params");
-
 extern void trampoline_32bit_src(void *, bool);
 extern const u16 trampoline_ljmp_imm_offset;
 
diff --git a/drivers/firmware/imx/imx-dsp.c b/drivers/firmware/imx/imx-dsp.c
index 508eab346fc6..a48a58e0c61f 100644
--- a/drivers/firmware/imx/imx-dsp.c
+++ b/drivers/firmware/imx/imx-dsp.c
@@ -114,11 +114,11 @@ static int imx_dsp_setup_channels(struct imx_dsp_ipc *dsp_ipc)
 		dsp_chan->idx = i % 2;
 		dsp_chan->ch = mbox_request_channel_byname(cl, chan_name);
 		if (IS_ERR(dsp_chan->ch)) {
-			kfree(dsp_chan->name);
 			ret = PTR_ERR(dsp_chan->ch);
 			if (ret != -EPROBE_DEFER)
 				dev_err(dev, "Failed to request mbox chan %s ret %d\n",
 					chan_name, ret);
+			kfree(dsp_chan->name);
 			goto out;
 		}
 
diff --git a/drivers/firmware/tegra/bpmp-debugfs.c b/drivers/firmware/tegra/bpmp-debugfs.c
index 6dfe3d34109e..bbcdd9fed3fb 100644
--- a/drivers/firmware/tegra/bpmp-debugfs.c
+++ b/drivers/firmware/tegra/bpmp-debugfs.c
@@ -610,7 +610,7 @@ static int debugfs_show(struct seq_file *m, void *p)
 	}
 
 	len = strlen(filename);
-	strncpy(namevirt, filename, namesize);
+	strscpy_pad(namevirt, filename, namesize);
 
 	err = mrq_debugfs_read(bpmp, namephys, len, dataphys, datasize,
 			       &nbytes);
@@ -661,7 +661,7 @@ static ssize_t debugfs_store(struct file *file, const char __user *buf,
 	}
 
 	len = strlen(filename);
-	strncpy(namevirt, filename, namesize);
+	strscpy_pad(namevirt, filename, namesize);
 
 	if (copy_from_user(datavirt, buf, count)) {
 		err = -EFAULT;
diff --git a/drivers/fpga/tests/Kconfig b/drivers/fpga/tests/Kconfig
index e4a64815f16d..d4e55204c092 100644
--- a/drivers/fpga/tests/Kconfig
+++ b/drivers/fpga/tests/Kconfig
@@ -1,6 +1,6 @@
 config FPGA_KUNIT_TESTS
-	tristate "KUnit test for the FPGA subsystem" if !KUNIT_ALL_TESTS
-	depends on FPGA && FPGA_REGION && FPGA_BRIDGE && KUNIT=y
+	bool "KUnit test for the FPGA subsystem" if !KUNIT_ALL_TESTS
+	depends on FPGA=y && FPGA_REGION=y && FPGA_BRIDGE=y && KUNIT=y && MODULES=n
 	default KUNIT_ALL_TESTS
         help
           This builds unit tests for the FPGA subsystem
diff --git a/drivers/fpga/tests/fpga-region-test.c b/drivers/fpga/tests/fpga-region-test.c
index 9f9d50ee7871..baab07e3fc59 100644
--- a/drivers/fpga/tests/fpga-region-test.c
+++ b/drivers/fpga/tests/fpga-region-test.c
@@ -93,6 +93,8 @@ static void fpga_region_test_class_find(struct kunit *test)
 
 	region = fpga_region_class_find(NULL, &ctx->region_pdev->dev, fake_region_match);
 	KUNIT_EXPECT_PTR_EQ(test, region, ctx->region);
+
+	put_device(&region->dev);
 }
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 7d6daf8d2bfa..e036011137aa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1103,7 +1103,7 @@ static int reserve_bo_and_vm(struct kgd_mem *mem,
 		if (unlikely(ret))
 			goto error;
 
-		ret = drm_exec_lock_obj(&ctx->exec, &bo->tbo.base);
+		ret = drm_exec_prepare_obj(&ctx->exec, &bo->tbo.base, 1);
 		drm_exec_retry_on_contention(&ctx->exec);
 		if (unlikely(ret))
 			goto error;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index efdb1c48f431..d93a8961274c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -65,7 +65,8 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p,
 	}
 
 	amdgpu_sync_create(&p->sync);
-	drm_exec_init(&p->exec, DRM_EXEC_INTERRUPTIBLE_WAIT);
+	drm_exec_init(&p->exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
+		      DRM_EXEC_IGNORE_DUPLICATES);
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index aac52d9754e6..76549c2cffeb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -55,6 +55,10 @@ bool amdgpu_ctx_priority_is_valid(int32_t ctx_prio)
 		return true;
 	default:
 	case AMDGPU_CTX_PRIORITY_UNSET:
+		/* UNSET priority is not valid and we don't carry that
+		 * around, but set it to NORMAL in the only place this
+		 * function is called, amdgpu_ctx_ioctl().
+		 */
 		return false;
 	}
 }
@@ -95,9 +99,6 @@ amdgpu_ctx_to_drm_sched_prio(int32_t ctx_prio)
 static int amdgpu_ctx_priority_permit(struct drm_file *filp,
 				      int32_t priority)
 {
-	if (!amdgpu_ctx_priority_is_valid(priority))
-		return -EINVAL;
-
 	/* NORMAL and below are accessible by everyone */
 	if (priority <= AMDGPU_CTX_PRIORITY_NORMAL)
 		return 0;
@@ -632,8 +633,6 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
 	return 0;
 }
 
-
-
 static int amdgpu_ctx_stable_pstate(struct amdgpu_device *adev,
 				    struct amdgpu_fpriv *fpriv, uint32_t id,
 				    bool set, u32 *stable_pstate)
@@ -676,8 +675,10 @@ int amdgpu_ctx_ioctl(struct drm_device *dev, void *data,
 	id = args->in.ctx_id;
 	priority = args->in.priority;
 
-	/* For backwards compatibility reasons, we need to accept
-	 * ioctls with garbage in the priority field */
+	/* For backwards compatibility, we need to accept ioctls with garbage
+	 * in the priority field. Garbage values in the priority field result
+	 * in the priority being set to NORMAL.
+	 */
 	if (!amdgpu_ctx_priority_is_valid(priority))
 		priority = AMDGPU_CTX_PRIORITY_NORMAL;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c
index 6a8494f98d3e..fe8ba9e9837b 100644
--- a/drivers/gpu/drm/amd/amdgpu/vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/vi.c
@@ -1124,7 +1124,7 @@ static void vi_program_aspm(struct amdgpu_device *adev)
 	bool bL1SS = false;
 	bool bClkReqSupport = true;
 
-	if (!amdgpu_device_should_use_aspm(adev) || !amdgpu_device_aspm_support_quirk())
+	if (!amdgpu_device_should_use_aspm(adev) || !amdgpu_device_pcie_dynamic_switching_supported())
 		return;
 
 	if (adev->flags & AMD_IS_APU ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index c8c75ff7cea8..490000992ecd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -2218,7 +2218,7 @@ static int kfd_cpumask_to_apic_id(const struct cpumask *cpumask)
 	if (first_cpu_of_numa_node >= nr_cpu_ids)
 		return -1;
 #ifdef CONFIG_X86_64
-	return cpu_data(first_cpu_of_numa_node).apicid;
+	return cpu_data(first_cpu_of_numa_node).topo.apicid;
 #else
 	return first_cpu_of_numa_node;
 #endif
diff --git a/drivers/gpu/drm/display/drm_dp_mst_topology.c b/drivers/gpu/drm/display/drm_dp_mst_topology.c
index ed96cfcfa304..8c929ef72c72 100644
--- a/drivers/gpu/drm/display/drm_dp_mst_topology.c
+++ b/drivers/gpu/drm/display/drm_dp_mst_topology.c
@@ -2574,14 +2574,14 @@ static struct drm_dp_mst_branch *get_mst_branch_device_by_guid_helper(
 	struct drm_dp_mst_branch *found_mstb;
 	struct drm_dp_mst_port *port;
 
+	if (!mstb)
+		return NULL;
+
 	if (memcmp(mstb->guid, guid, 16) == 0)
 		return mstb;
 
 
 	list_for_each_entry(port, &mstb->ports, next) {
-		if (!port->mstb)
-			continue;
-
 		found_mstb = get_mst_branch_device_by_guid_helper(port->mstb, guid);
 
 		if (found_mstb)
diff --git a/drivers/gpu/drm/gud/gud_pipe.c b/drivers/gpu/drm/gud/gud_pipe.c
index d2f199ea3c11..a02f75be81f0 100644
--- a/drivers/gpu/drm/gud/gud_pipe.c
+++ b/drivers/gpu/drm/gud/gud_pipe.c
@@ -503,7 +503,7 @@ int gud_pipe_check(struct drm_simple_display_pipe *pipe,
 		return -ENOENT;
 
 	len = struct_size(req, properties,
-			  GUD_PROPERTIES_MAX_NUM + GUD_CONNECTOR_PROPERTIES_MAX_NUM);
+			  size_add(GUD_PROPERTIES_MAX_NUM, GUD_CONNECTOR_PROPERTIES_MAX_NUM));
 	req = kzalloc(len, GFP_KERNEL);
 	if (!req)
 		return -ENOMEM;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
index 310654542b42..a2195e28b625 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
@@ -916,11 +916,7 @@ static struct file *mmap_singleton(struct drm_i915_private *i915)
 {
 	struct file *file;
 
-	rcu_read_lock();
-	file = READ_ONCE(i915->gem.mmap_singleton);
-	if (file && !get_file_rcu(file))
-		file = NULL;
-	rcu_read_unlock();
+	file = get_file_active(&i915->gem.mmap_singleton);
 	if (file)
 		return file;
 
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_mcr.c b/drivers/gpu/drm/i915/gt/intel_gt_mcr.c
index 0b414eae1683..2c0f1f3e28ff 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_mcr.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_mcr.c
@@ -376,9 +376,26 @@ void intel_gt_mcr_lock(struct intel_gt *gt, unsigned long *flags)
 	 * driver threads, but also with hardware/firmware agents.  A dedicated
 	 * locking register is used.
 	 */
-	if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 70))
+	if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 70)) {
+		/*
+		 * The steering control and semaphore registers are inside an
+		 * "always on" power domain with respect to RC6.  However there
+		 * are some issues if higher-level platform sleep states are
+		 * entering/exiting at the same time these registers are
+		 * accessed.  Grabbing GT forcewake and holding it over the
+		 * entire lock/steer/unlock cycle ensures that those sleep
+		 * states have been fully exited before we access these
+		 * registers.  This wakeref will be released in the unlock
+		 * routine.
+		 *
+		 * This is expected to become a formally documented/numbered
+		 * workaround soon.
+		 */
+		intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_GT);
+
 		err = wait_for(intel_uncore_read_fw(gt->uncore,
 						    MTL_STEER_SEMAPHORE) == 0x1, 100);
+	}
 
 	/*
 	 * Even on platforms with a hardware lock, we'll continue to grab
@@ -415,8 +432,11 @@ void intel_gt_mcr_unlock(struct intel_gt *gt, unsigned long flags)
 {
 	spin_unlock_irqrestore(&gt->mcr_lock, flags);
 
-	if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 70))
+	if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 70)) {
 		intel_uncore_write_fw(gt->uncore, MTL_STEER_SEMAPHORE, 0x1);
+
+		intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_GT);
+	}
 }
 
 /**
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 04bc1f4a1115..59e1e21df271 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -482,8 +482,7 @@ static void oa_report_id_clear(struct i915_perf_stream *stream, u32 *report)
 static bool oa_report_ctx_invalid(struct i915_perf_stream *stream, void *report)
 {
 	return !(oa_report_id(stream, report) &
-	       stream->perf->gen8_valid_ctx_bit) &&
-	       GRAPHICS_VER(stream->perf->i915) <= 11;
+	       stream->perf->gen8_valid_ctx_bit);
 }
 
 static u64 oa_timestamp(struct i915_perf_stream *stream, void *report)
@@ -5106,6 +5105,7 @@ static void i915_perf_init_info(struct drm_i915_private *i915)
 		perf->gen8_valid_ctx_bit = BIT(16);
 		break;
 	case 12:
+		perf->gen8_valid_ctx_bit = BIT(16);
 		/*
 		 * Calculate offset at runtime in oa_pin_context for gen12 and
 		 * cache the value in perf->ctx_oactxctrl_offset.
diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index d35973b41186..7b1076b5e748 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -832,9 +832,18 @@ static void i915_pmu_event_start(struct perf_event *event, int flags)
 
 static void i915_pmu_event_stop(struct perf_event *event, int flags)
 {
+	struct drm_i915_private *i915 =
+		container_of(event->pmu, typeof(*i915), pmu.base);
+	struct i915_pmu *pmu = &i915->pmu;
+
+	if (pmu->closed)
+		goto out;
+
 	if (flags & PERF_EF_UPDATE)
 		i915_pmu_event_read(event);
 	i915_pmu_disable(event);
+
+out:
 	event->hw.state = PERF_HES_STOPPED;
 }
 
diff --git a/drivers/gpu/drm/logicvc/Kconfig b/drivers/gpu/drm/logicvc/Kconfig
index fa7a88368809..1df22a852a23 100644
--- a/drivers/gpu/drm/logicvc/Kconfig
+++ b/drivers/gpu/drm/logicvc/Kconfig
@@ -5,5 +5,7 @@ config DRM_LOGICVC
 	select DRM_KMS_HELPER
 	select DRM_KMS_DMA_HELPER
 	select DRM_GEM_DMA_HELPER
+	select REGMAP
+	select REGMAP_MMIO
 	help
 	  DRM display driver for the logiCVC programmable logic block from Xylon
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 186351ecf72f..cc03e0c22ff3 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -67,7 +67,7 @@ struct nouveau_svm {
 			struct nouveau_svmm *svmm;
 		} **fault;
 		int fault_nr;
-	} buffer[1];
+	} buffer[];
 };
 
 #define FAULT_ACCESS_READ 0
@@ -1063,7 +1063,8 @@ nouveau_svm_init(struct nouveau_drm *drm)
 	if (drm->client.device.info.family > NV_DEVICE_INFO_V0_PASCAL)
 		return;
 
-	if (!(drm->svm = svm = kzalloc(sizeof(*drm->svm), GFP_KERNEL)))
+	drm->svm = svm = kzalloc(struct_size(drm->svm, buffer, 1), GFP_KERNEL);
+	if (!drm->svm)
 		return;
 
 	drm->svm->drm = drm;
diff --git a/drivers/hid/hid-prodikeys.c b/drivers/hid/hid-prodikeys.c
index e4e9471d0f1e..c16d2ba6ea16 100644
--- a/drivers/hid/hid-prodikeys.c
+++ b/drivers/hid/hid-prodikeys.c
@@ -639,9 +639,9 @@ static int pcmidi_snd_initialise(struct pcmidi_snd *pm)
 		goto fail;
 	}
 
-	strncpy(card->driver, shortname, sizeof(card->driver));
-	strncpy(card->shortname, shortname, sizeof(card->shortname));
-	strncpy(card->longname, longname, sizeof(card->longname));
+	strscpy(card->driver, shortname, sizeof(card->driver));
+	strscpy(card->shortname, shortname, sizeof(card->shortname));
+	strscpy(card->longname, longname, sizeof(card->longname));
 
 	/* Set up rawmidi */
 	err = snd_rawmidi_new(card, card->shortname, 0,
@@ -652,7 +652,7 @@ static int pcmidi_snd_initialise(struct pcmidi_snd *pm)
 		goto fail;
 	}
 	pm->rwmidi = rwmidi;
-	strncpy(rwmidi->name, card->shortname, sizeof(rwmidi->name));
+	strscpy(rwmidi->name, card->shortname, sizeof(rwmidi->name));
 	rwmidi->info_flags = SNDRV_RAWMIDI_INFO_INPUT;
 	rwmidi->private_data = pm;
 
diff --git a/drivers/hwmon/acpi_power_meter.c b/drivers/hwmon/acpi_power_meter.c
index fa28d447f0df..8db740214ffd 100644
--- a/drivers/hwmon/acpi_power_meter.c
+++ b/drivers/hwmon/acpi_power_meter.c
@@ -796,14 +796,13 @@ static int read_capabilities(struct acpi_power_meter_resource *resource)
 			goto error;
 		}
 
-		*str = kcalloc(element->string.length + 1, sizeof(u8),
-			       GFP_KERNEL);
+		*str = kmemdup_nul(element->string.pointer, element->string.length,
+				   GFP_KERNEL);
 		if (!*str) {
 			res = -ENOMEM;
 			goto error;
 		}
 
-		strncpy(*str, element->string.pointer, element->string.length);
 		str++;
 	}
 
diff --git a/drivers/hwmon/asus_wmi_sensors.c b/drivers/hwmon/asus_wmi_sensors.c
index 6e8a908171f0..c2dd7ff882f2 100644
--- a/drivers/hwmon/asus_wmi_sensors.c
+++ b/drivers/hwmon/asus_wmi_sensors.c
@@ -300,7 +300,7 @@ static int asus_wmi_sensor_info(int index, struct asus_wmi_sensor_info *s)
 		goto out_free_obj;
 	}
 
-	strncpy(s->name, name_obj.string.pointer, sizeof(s->name) - 1);
+	strscpy(s->name, name_obj.string.pointer, sizeof(s->name));
 
 	data_type_obj = obj->package.elements[1];
 	if (data_type_obj.type != ACPI_TYPE_INTEGER) {
diff --git a/drivers/hwmon/fam15h_power.c b/drivers/hwmon/fam15h_power.c
index 521534d5c1e5..6307112c2c0c 100644
--- a/drivers/hwmon/fam15h_power.c
+++ b/drivers/hwmon/fam15h_power.c
@@ -17,6 +17,7 @@
 #include <linux/cpumask.h>
 #include <linux/time.h>
 #include <linux/sched.h>
+#include <linux/topology.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
 
@@ -134,15 +135,13 @@ static DEVICE_ATTR_RO(power1_crit);
 static void do_read_registers_on_cu(void *_data)
 {
 	struct fam15h_power_data *data = _data;
-	int cpu, cu;
-
-	cpu = smp_processor_id();
+	int cu;
 
 	/*
 	 * With the new x86 topology modelling, cpu core id actually
 	 * is compute unit id.
 	 */
-	cu = cpu_data(cpu).cpu_core_id;
+	cu = topology_core_id(smp_processor_id());
 
 	rdmsrl_safe(MSR_F15H_CU_PWR_ACCUMULATOR, &data->cu_acc_power[cu]);
 	rdmsrl_safe(MSR_F15H_PTSC, &data->cpu_sw_pwr_ptsc[cu]);
diff --git a/drivers/hwmon/ibmpowernv.c b/drivers/hwmon/ibmpowernv.c
index 594254d6a72d..70ca833259ab 100644
--- a/drivers/hwmon/ibmpowernv.c
+++ b/drivers/hwmon/ibmpowernv.c
@@ -234,7 +234,7 @@ static int get_sensor_index_attr(const char *name, u32 *index, char *attr)
 	if (copy_len >= sizeof(buf))
 		return -EINVAL;
 
-	strncpy(buf, hash_pos + 1, copy_len);
+	memcpy(buf, hash_pos + 1, copy_len);
 
 	err = kstrtou32(buf, 10, index);
 	if (err)
diff --git a/drivers/i2c/busses/i2c-aspeed.c b/drivers/i2c/busses/i2c-aspeed.c
index 5a416b39b818..28e2a5fc4528 100644
--- a/drivers/i2c/busses/i2c-aspeed.c
+++ b/drivers/i2c/busses/i2c-aspeed.c
@@ -749,6 +749,8 @@ static void __aspeed_i2c_reg_slave(struct aspeed_i2c_bus *bus, u16 slave_addr)
 	func_ctrl_reg_val = readl(bus->base + ASPEED_I2C_FUN_CTRL_REG);
 	func_ctrl_reg_val |= ASPEED_I2CD_SLAVE_EN;
 	writel(func_ctrl_reg_val, bus->base + ASPEED_I2C_FUN_CTRL_REG);
+
+	bus->slave_state = ASPEED_I2C_SLAVE_INACTIVE;
 }
 
 static int aspeed_i2c_reg_slave(struct i2c_client *client)
@@ -765,7 +767,6 @@ static int aspeed_i2c_reg_slave(struct i2c_client *client)
 	__aspeed_i2c_reg_slave(bus, client->addr);
 
 	bus->slave = client;
-	bus->slave_state = ASPEED_I2C_SLAVE_INACTIVE;
 	spin_unlock_irqrestore(&bus->lock, flags);
 
 	return 0;
diff --git a/drivers/i2c/busses/i2c-stm32f7.c b/drivers/i2c/busses/i2c-stm32f7.c
index 579b30581725..0d3c9a041b56 100644
--- a/drivers/i2c/busses/i2c-stm32f7.c
+++ b/drivers/i2c/busses/i2c-stm32f7.c
@@ -1059,9 +1059,10 @@ static int stm32f7_i2c_smbus_xfer_msg(struct stm32f7_i2c_dev *i2c_dev,
 	/* Configure PEC */
 	if ((flags & I2C_CLIENT_PEC) && f7_msg->size != I2C_SMBUS_QUICK) {
 		cr1 |= STM32F7_I2C_CR1_PECEN;
-		cr2 |= STM32F7_I2C_CR2_PECBYTE;
-		if (!f7_msg->read_write)
+		if (!f7_msg->read_write) {
+			cr2 |= STM32F7_I2C_CR2_PECBYTE;
 			f7_msg->count++;
+		}
 	} else {
 		cr1 &= ~STM32F7_I2C_CR1_PECEN;
 		cr2 &= ~STM32F7_I2C_CR2_PECBYTE;
@@ -1149,8 +1150,10 @@ static void stm32f7_i2c_smbus_rep_start(struct stm32f7_i2c_dev *i2c_dev)
 	f7_msg->stop = true;
 
 	/* Add one byte for PEC if needed */
-	if (cr1 & STM32F7_I2C_CR1_PECEN)
+	if (cr1 & STM32F7_I2C_CR1_PECEN) {
+		cr2 |= STM32F7_I2C_CR2_PECBYTE;
 		f7_msg->count++;
+	}
 
 	/* Set number of bytes to be transferred */
 	cr2 &= ~(STM32F7_I2C_CR2_NBYTES_MASK);
diff --git a/drivers/i2c/muxes/i2c-demux-pinctrl.c b/drivers/i2c/muxes/i2c-demux-pinctrl.c
index 22f2280eab7f..9f2e4aa28159 100644
--- a/drivers/i2c/muxes/i2c-demux-pinctrl.c
+++ b/drivers/i2c/muxes/i2c-demux-pinctrl.c
@@ -61,7 +61,7 @@ static int i2c_demux_activate_master(struct i2c_demux_pinctrl_priv *priv, u32 ne
 	if (ret)
 		goto err;
 
-	adap = of_find_i2c_adapter_by_node(priv->chan[new_chan].parent_np);
+	adap = of_get_i2c_adapter_by_node(priv->chan[new_chan].parent_np);
 	if (!adap) {
 		ret = -ENODEV;
 		goto err_with_revert;
diff --git a/drivers/i2c/muxes/i2c-mux-gpmux.c b/drivers/i2c/muxes/i2c-mux-gpmux.c
index baccf4bfaf02..8305661e1253 100644
--- a/drivers/i2c/muxes/i2c-mux-gpmux.c
+++ b/drivers/i2c/muxes/i2c-mux-gpmux.c
@@ -52,7 +52,7 @@ static struct i2c_adapter *mux_parent_adapter(struct device *dev)
 		dev_err(dev, "Cannot parse i2c-parent\n");
 		return ERR_PTR(-ENODEV);
 	}
-	parent = of_find_i2c_adapter_by_node(parent_np);
+	parent = of_get_i2c_adapter_by_node(parent_np);
 	of_node_put(parent_np);
 	if (!parent)
 		return ERR_PTR(-EPROBE_DEFER);
diff --git a/drivers/i2c/muxes/i2c-mux-pinctrl.c b/drivers/i2c/muxes/i2c-mux-pinctrl.c
index 18236b9fa14a..6ebca7bfd8a2 100644
--- a/drivers/i2c/muxes/i2c-mux-pinctrl.c
+++ b/drivers/i2c/muxes/i2c-mux-pinctrl.c
@@ -62,7 +62,7 @@ static struct i2c_adapter *i2c_mux_pinctrl_parent_adapter(struct device *dev)
 		dev_err(dev, "Cannot parse i2c-parent\n");
 		return ERR_PTR(-ENODEV);
 	}
-	parent = of_find_i2c_adapter_by_node(parent_np);
+	parent = of_get_i2c_adapter_by_node(parent_np);
 	of_node_put(parent_np);
 	if (!parent)
 		return ERR_PTR(-EPROBE_DEFER);
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index ea5a6a14c553..dcda0afecfc5 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -53,9 +53,8 @@
 #include <linux/moduleparam.h>
 #include <asm/cpu_device_id.h>
 #include <asm/intel-family.h>
-#include <asm/nospec-branch.h>
 #include <asm/mwait.h>
-#include <asm/msr.h>
+#include <asm/spec-ctrl.h>
 #include <asm/fpu/api.h>
 
 #define INTEL_IDLE_VERSION "0.5.1"
@@ -69,6 +68,7 @@ static int max_cstate = CPUIDLE_STATE_MAX - 1;
 static unsigned int disabled_states_mask __read_mostly;
 static unsigned int preferred_states_mask __read_mostly;
 static bool force_irq_on __read_mostly;
+static bool ibrs_off __read_mostly;
 
 static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
 
@@ -182,12 +182,12 @@ static __cpuidle int intel_idle_ibrs(struct cpuidle_device *dev,
 	int ret;
 
 	if (smt_active)
-		native_wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+		__update_spec_ctrl(0);
 
 	ret = __intel_idle(dev, drv, index);
 
 	if (smt_active)
-		native_wrmsrl(MSR_IA32_SPEC_CTRL, spec_ctrl);
+		__update_spec_ctrl(spec_ctrl);
 
 	return ret;
 }
@@ -1853,11 +1853,13 @@ static void state_update_enter_method(struct cpuidle_state *state, int cstate)
 	}
 
 	if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) &&
-			   state->flags & CPUIDLE_FLAG_IBRS) {
+			((state->flags & CPUIDLE_FLAG_IBRS) || ibrs_off)) {
 		/*
 		 * IBRS mitigation requires that C-states are entered
 		 * with interrupts disabled.
 		 */
+		if (ibrs_off && (state->flags & CPUIDLE_FLAG_IRQ_ENABLE))
+			state->flags &= ~CPUIDLE_FLAG_IRQ_ENABLE;
 		WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
 		state->enter = intel_idle_ibrs;
 		return;
@@ -2176,3 +2178,9 @@ MODULE_PARM_DESC(preferred_cstates, "Mask of preferred idle states");
  * 'CPUIDLE_FLAG_INIT_XSTATE' and 'CPUIDLE_FLAG_IBRS' flags.
  */
 module_param(force_irq_on, bool, 0444);
+/*
+ * Force the disabling of IBRS when X86_FEATURE_KERNEL_IBRS is on and
+ * CPUIDLE_FLAG_IRQ_ENABLE isn't set.
+ */
+module_param(ibrs_off, bool, 0444);
+MODULE_PARM_DESC(ibrs_off, "Disable IBRS when idle");
diff --git a/drivers/iio/adc/exynos_adc.c b/drivers/iio/adc/exynos_adc.c
index cff1ba57fb16..43c8af41b4a9 100644
--- a/drivers/iio/adc/exynos_adc.c
+++ b/drivers/iio/adc/exynos_adc.c
@@ -826,16 +826,26 @@ static int exynos_adc_probe(struct platform_device *pdev)
 		}
 	}
 
+	/* leave out any TS related code if unreachable */
+	if (IS_REACHABLE(CONFIG_INPUT)) {
+		has_ts = of_property_read_bool(pdev->dev.of_node,
+					       "has-touchscreen") || pdata;
+	}
+
 	irq = platform_get_irq(pdev, 0);
 	if (irq < 0)
 		return irq;
 	info->irq = irq;
 
-	irq = platform_get_irq(pdev, 1);
-	if (irq == -EPROBE_DEFER)
-		return irq;
+	if (has_ts) {
+		irq = platform_get_irq(pdev, 1);
+		if (irq == -EPROBE_DEFER)
+			return irq;
 
-	info->tsirq = irq;
+		info->tsirq = irq;
+	} else {
+		info->tsirq = -1;
+	}
 
 	info->dev = &pdev->dev;
 
@@ -900,12 +910,6 @@ static int exynos_adc_probe(struct platform_device *pdev)
 	if (info->data->init_hw)
 		info->data->init_hw(info);
 
-	/* leave out any TS related code if unreachable */
-	if (IS_REACHABLE(CONFIG_INPUT)) {
-		has_ts = of_property_read_bool(pdev->dev.of_node,
-					       "has-touchscreen") || pdata;
-	}
-
 	if (pdata)
 		info->delay = pdata->delay;
 	else
diff --git a/drivers/iio/adc/xilinx-xadc-core.c b/drivers/iio/adc/xilinx-xadc-core.c
index dba73300f894..564c0cad0fc7 100644
--- a/drivers/iio/adc/xilinx-xadc-core.c
+++ b/drivers/iio/adc/xilinx-xadc-core.c
@@ -456,6 +456,9 @@ static const struct xadc_ops xadc_zynq_ops = {
 	.interrupt_handler = xadc_zynq_interrupt_handler,
 	.update_alarm = xadc_zynq_update_alarm,
 	.type = XADC_TYPE_S7,
+	/* Temp in C = (val * 503.975) / 2**bits - 273.15 */
+	.temp_scale = 503975,
+	.temp_offset = 273150,
 };
 
 static const unsigned int xadc_axi_reg_offsets[] = {
@@ -566,6 +569,9 @@ static const struct xadc_ops xadc_7s_axi_ops = {
 	.interrupt_handler = xadc_axi_interrupt_handler,
 	.flags = XADC_FLAGS_BUFFERED | XADC_FLAGS_IRQ_OPTIONAL,
 	.type = XADC_TYPE_S7,
+	/* Temp in C = (val * 503.975) / 2**bits - 273.15 */
+	.temp_scale = 503975,
+	.temp_offset = 273150,
 };
 
 static const struct xadc_ops xadc_us_axi_ops = {
@@ -577,6 +583,12 @@ static const struct xadc_ops xadc_us_axi_ops = {
 	.interrupt_handler = xadc_axi_interrupt_handler,
 	.flags = XADC_FLAGS_BUFFERED | XADC_FLAGS_IRQ_OPTIONAL,
 	.type = XADC_TYPE_US,
+	/*
+	 * Values below are for UltraScale+ (SYSMONE4) using internal reference.
+	 * See https://docs.xilinx.com/v/u/en-US/ug580-ultrascale-sysmon
+	 */
+	.temp_scale = 509314,
+	.temp_offset = 280231,
 };
 
 static int _xadc_update_adc_reg(struct xadc *xadc, unsigned int reg,
@@ -945,8 +957,7 @@ static int xadc_read_raw(struct iio_dev *indio_dev,
 			*val2 = bits;
 			return IIO_VAL_FRACTIONAL_LOG2;
 		case IIO_TEMP:
-			/* Temp in C = (val * 503.975) / 2**bits - 273.15 */
-			*val = 503975;
+			*val = xadc->ops->temp_scale;
 			*val2 = bits;
 			return IIO_VAL_FRACTIONAL_LOG2;
 		default:
@@ -954,7 +965,7 @@ static int xadc_read_raw(struct iio_dev *indio_dev,
 		}
 	case IIO_CHAN_INFO_OFFSET:
 		/* Only the temperature channel has an offset */
-		*val = -((273150 << bits) / 503975);
+		*val = -((xadc->ops->temp_offset << bits) / xadc->ops->temp_scale);
 		return IIO_VAL_INT;
 	case IIO_CHAN_INFO_SAMP_FREQ:
 		ret = xadc_read_samplerate(xadc);
@@ -1423,28 +1434,6 @@ static int xadc_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	/* Disable all alarms */
-	ret = xadc_update_adc_reg(xadc, XADC_REG_CONF1, XADC_CONF1_ALARM_MASK,
-				  XADC_CONF1_ALARM_MASK);
-	if (ret)
-		return ret;
-
-	/* Set thresholds to min/max */
-	for (i = 0; i < 16; i++) {
-		/*
-		 * Set max voltage threshold and both temperature thresholds to
-		 * 0xffff, min voltage threshold to 0.
-		 */
-		if (i % 8 < 4 || i == 7)
-			xadc->threshold[i] = 0xffff;
-		else
-			xadc->threshold[i] = 0;
-		ret = xadc_write_adc_reg(xadc, XADC_REG_THRESHOLD(i),
-			xadc->threshold[i]);
-		if (ret)
-			return ret;
-	}
-
 	/* Go to non-buffered mode */
 	xadc_postdisable(indio_dev);
 
diff --git a/drivers/iio/adc/xilinx-xadc.h b/drivers/iio/adc/xilinx-xadc.h
index 7d78ce698967..3036f4d613ff 100644
--- a/drivers/iio/adc/xilinx-xadc.h
+++ b/drivers/iio/adc/xilinx-xadc.h
@@ -85,6 +85,8 @@ struct xadc_ops {
 
 	unsigned int flags;
 	enum xadc_type type;
+	int temp_scale;
+	int temp_offset;
 };
 
 static inline int _xadc_read_adc_reg(struct xadc *xadc, unsigned int reg,
diff --git a/drivers/iio/afe/iio-rescale.c b/drivers/iio/afe/iio-rescale.c
index 1f280c360701..56e5913ab82d 100644
--- a/drivers/iio/afe/iio-rescale.c
+++ b/drivers/iio/afe/iio-rescale.c
@@ -214,8 +214,18 @@ static int rescale_read_raw(struct iio_dev *indio_dev,
 				return ret < 0 ? ret : -EOPNOTSUPP;
 		}
 
-		ret = iio_read_channel_scale(rescale->source, &scale, &scale2);
-		return rescale_process_offset(rescale, ret, scale, scale2,
+		if (iio_channel_has_info(rescale->source->channel,
+					 IIO_CHAN_INFO_SCALE)) {
+			ret = iio_read_channel_scale(rescale->source, &scale, &scale2);
+			return rescale_process_offset(rescale, ret, scale, scale2,
+						      schan_off, val, val2);
+		}
+
+		/*
+		 * If we get here we have no scale so scale 1:1 but apply
+		 * rescaler and offset, if any.
+		 */
+		return rescale_process_offset(rescale, IIO_VAL_FRACTIONAL, 1, 1,
 					      schan_off, val, val2);
 	default:
 		return -EINVAL;
@@ -280,8 +290,9 @@ static int rescale_configure_channel(struct device *dev,
 	chan->type = rescale->cfg->type;
 
 	if (iio_channel_has_info(schan, IIO_CHAN_INFO_RAW) &&
-	    iio_channel_has_info(schan, IIO_CHAN_INFO_SCALE)) {
-		dev_info(dev, "using raw+scale source channel\n");
+	    (iio_channel_has_info(schan, IIO_CHAN_INFO_SCALE) ||
+	     iio_channel_has_info(schan, IIO_CHAN_INFO_OFFSET))) {
+		dev_info(dev, "using raw+scale/offset source channel\n");
 	} else if (iio_channel_has_info(schan, IIO_CHAN_INFO_PROCESSED)) {
 		dev_info(dev, "using processed channel\n");
 		rescale->chan_processed = true;
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
index a5ab22cedd41..788fc249234f 100644
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -267,7 +267,7 @@ static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
 
 	if (!HFI1_CAP_IS_KSET(SDMA))
 		return -EINVAL;
-	if (!from->user_backed)
+	if (!user_backed_iter(from))
 		return -EINVAL;
 	idx = srcu_read_lock(&fd->pq_srcu);
 	pq = srcu_dereference(fd->pq, &fd->pq_srcu);
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index 152952127f13..29e4c59aa23b 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -2244,7 +2244,7 @@ static ssize_t qib_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	struct qib_ctxtdata *rcd = ctxt_fp(iocb->ki_filp);
 	struct qib_user_sdma_queue *pq = fp->pq;
 
-	if (!from->user_backed || !from->nr_segs || !pq)
+	if (!user_backed_iter(from) || !from->nr_segs || !pq)
 		return -EINVAL;
 
 	return qib_user_sdma_writev(rcd, pq, iter_iov(from), from->nr_segs);
diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c
index ed7d4b02f45a..455e966eeff3 100644
--- a/drivers/infiniband/hw/qib/qib_fs.c
+++ b/drivers/infiniband/hw/qib/qib_fs.c
@@ -64,8 +64,7 @@ static int qibfs_mknod(struct inode *dir, struct dentry *dentry,
 	inode->i_uid = GLOBAL_ROOT_UID;
 	inode->i_gid = GLOBAL_ROOT_GID;
 	inode->i_blocks = 0;
-	inode->i_atime = inode_set_ctime_current(inode);
-	inode->i_mtime = inode->i_atime;
+	simple_inode_init_ts(inode);
 	inode->i_private = data;
 	if (S_ISDIR(mode)) {
 		inode->i_op = &simple_dir_inode_operations;
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 3bfc56df4f78..c146378c7d03 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1108,7 +1108,8 @@ map_end:
 
 	}
 
-	iommu_flush_iotlb_all(domain);
+	if (!list_empty(&mappings) && iommu_is_dma_domain(domain))
+		iommu_flush_iotlb_all(domain);
 
 out:
 	iommu_put_resv_regions(dev, &mappings);
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 75a2dd550625..a8c89df1a997 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -5112,8 +5112,6 @@ static int __init its_probe_one(struct its_node *its)
 	}
 	its->cmd_base = (void *)page_address(page);
 	its->cmd_write = its->cmd_base;
-	its->get_msi_base = its_irq_get_msi_base;
-	its->msi_domain_flags = IRQ_DOMAIN_FLAG_ISOLATED_MSI;
 
 	err = its_alloc_tables(its);
 	if (err)
@@ -5362,6 +5360,8 @@ static struct its_node __init *its_node_init(struct resource *res,
 	its->typer = gic_read_typer(its_base + GITS_TYPER);
 	its->base = its_base;
 	its->phys_base = res->start;
+	its->get_msi_base = its_irq_get_msi_base;
+	its->msi_domain_flags = IRQ_DOMAIN_FLAG_ISOLATED_MSI;
 
 	its->numa_node = numa_node;
 	its->fwnode_handle = handle;
diff --git a/drivers/irqchip/irq-imx-intmux.c b/drivers/irqchip/irq-imx-intmux.c
index 6d9a08238c9d..aa041e4dfee0 100644
--- a/drivers/irqchip/irq-imx-intmux.c
+++ b/drivers/irqchip/irq-imx-intmux.c
@@ -73,7 +73,7 @@ struct intmux_data {
 	void __iomem			*regs;
 	struct clk			*ipg_clk;
 	int				channum;
-	struct intmux_irqchip_data	irqchip_data[];
+	struct intmux_irqchip_data	irqchip_data[] __counted_by(channum);
 };
 
 static void imx_intmux_irq_mask(struct irq_data *d)
diff --git a/drivers/irqchip/irq-ls-scfg-msi.c b/drivers/irqchip/irq-ls-scfg-msi.c
index f31a262fe438..15cf80b46322 100644
--- a/drivers/irqchip/irq-ls-scfg-msi.c
+++ b/drivers/irqchip/irq-ls-scfg-msi.c
@@ -17,7 +17,8 @@
 #include <linux/irqdomain.h>
 #include <linux/of_irq.h>
 #include <linux/of_pci.h>
-#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/property.h>
 #include <linux/spinlock.h>
 
 #define MSI_IRQS_PER_MSIR	32
@@ -334,20 +335,17 @@ MODULE_DEVICE_TABLE(of, ls_scfg_msi_id);
 
 static int ls_scfg_msi_probe(struct platform_device *pdev)
 {
-	const struct of_device_id *match;
 	struct ls_scfg_msi *msi_data;
 	struct resource *res;
 	int i, ret;
 
-	match = of_match_device(ls_scfg_msi_id, &pdev->dev);
-	if (!match)
-		return -ENODEV;
-
 	msi_data = devm_kzalloc(&pdev->dev, sizeof(*msi_data), GFP_KERNEL);
 	if (!msi_data)
 		return -ENOMEM;
 
-	msi_data->cfg = (struct ls_scfg_msi_cfg *) match->data;
+	msi_data->cfg = (struct ls_scfg_msi_cfg *)device_get_match_data(&pdev->dev);
+	if (!msi_data->cfg)
+		return -ENODEV;
 
 	msi_data->regs = devm_platform_get_and_ioremap_resource(pdev, 0, &res);
 	if (IS_ERR(msi_data->regs)) {
diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c
index 96f4e322ed6b..fe8d516f3614 100644
--- a/drivers/irqchip/irq-renesas-rzg2l.c
+++ b/drivers/irqchip/irq-renesas-rzg2l.c
@@ -247,6 +247,7 @@ static const struct irq_chip irqc_chip = {
 	.irq_set_irqchip_state	= irq_chip_set_parent_state,
 	.irq_retrigger		= irq_chip_retrigger_hierarchy,
 	.irq_set_type		= rzg2l_irqc_set_type,
+	.irq_set_affinity	= irq_chip_set_affinity_parent,
 	.flags			= IRQCHIP_MASK_ON_SUSPEND |
 				  IRQCHIP_SET_TYPE_MASKED |
 				  IRQCHIP_SKIP_SET_WAKE,
diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c
index e1484905b7bd..5b7bc4fd9517 100644
--- a/drivers/irqchip/irq-sifive-plic.c
+++ b/drivers/irqchip/irq-sifive-plic.c
@@ -532,17 +532,18 @@ done:
 	}
 
 	/*
-	 * We can have multiple PLIC instances so setup cpuhp state only
-	 * when context handler for current/boot CPU is present.
+	 * We can have multiple PLIC instances so setup cpuhp state
+	 * and register syscore operations only when context handler
+	 * for current/boot CPU is present.
 	 */
 	handler = this_cpu_ptr(&plic_handlers);
 	if (handler->present && !plic_cpuhp_setup_done) {
 		cpuhp_setup_state(CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING,
 				  "irqchip/sifive/plic:starting",
 				  plic_starting_cpu, plic_dying_cpu);
+		register_syscore_ops(&plic_irq_syscore_ops);
 		plic_cpuhp_setup_done = true;
 	}
-	register_syscore_ops(&plic_irq_syscore_ops);
 
 	pr_info("%pOFP: mapped %d interrupts with %d handlers for"
 		" %d contexts.\n", node, nr_irqs, nr_handlers, nr_contexts);
diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c
index ae24848af233..136ba9fe55e0 100644
--- a/drivers/isdn/capi/kcapi.c
+++ b/drivers/isdn/capi/kcapi.c
@@ -732,7 +732,7 @@ u16 capi20_get_manufacturer(u32 contr, u8 buf[CAPI_MANUFACTURER_LEN])
 	u16 ret;
 
 	if (contr == 0) {
-		strncpy(buf, capi_manufakturer, CAPI_MANUFACTURER_LEN);
+		strscpy_pad(buf, capi_manufakturer, CAPI_MANUFACTURER_LEN);
 		return CAPI_NOERROR;
 	}
 
@@ -740,7 +740,7 @@ u16 capi20_get_manufacturer(u32 contr, u8 buf[CAPI_MANUFACTURER_LEN])
 
 	ctr = get_capi_ctr_by_nr(contr);
 	if (ctr && ctr->state == CAPI_CTR_RUNNING) {
-		strncpy(buf, ctr->manu, CAPI_MANUFACTURER_LEN);
+		strscpy_pad(buf, ctr->manu, CAPI_MANUFACTURER_LEN);
 		ret = CAPI_NOERROR;
 	} else
 		ret = CAPI_REGNOTINSTALLED;
diff --git a/drivers/isdn/mISDN/clock.c b/drivers/isdn/mISDN/clock.c
index 01d878168ef2..f71eb61db131 100644
--- a/drivers/isdn/mISDN/clock.c
+++ b/drivers/isdn/mISDN/clock.c
@@ -96,7 +96,7 @@ struct mISDNclock
 		printk(KERN_ERR "%s: No memory for clock entry.\n", __func__);
 		return NULL;
 	}
-	strncpy(iclock->name, name, sizeof(iclock->name) - 1);
+	strscpy(iclock->name, name, sizeof(iclock->name));
 	iclock->pri = pri;
 	iclock->priv = priv;
 	iclock->ctl = ctl;
diff --git a/drivers/mailbox/zynqmp-ipi-mailbox.c b/drivers/mailbox/zynqmp-ipi-mailbox.c
index e4fcac97dbfa..7fa533e80dd9 100644
--- a/drivers/mailbox/zynqmp-ipi-mailbox.c
+++ b/drivers/mailbox/zynqmp-ipi-mailbox.c
@@ -108,7 +108,7 @@ struct zynqmp_ipi_pdata {
 	unsigned int method;
 	u32 local_id;
 	int num_mboxes;
-	struct zynqmp_ipi_mbox ipi_mboxes[];
+	struct zynqmp_ipi_mbox ipi_mboxes[] __counted_by(num_mboxes);
 };
 
 static struct device_driver zynqmp_ipi_mbox_driver = {
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index 529c9d04e9a4..b2d10063d35f 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -4,6 +4,7 @@ config BCACHE
 	tristate "Block device as cache"
 	select BLOCK_HOLDER_DEPRECATED if SYSFS
 	select CRC64
+	select CLOSURES
 	help
 	Allows a block device to be used as cache for other devices; uses
 	a btree for indexing and the layout is optimized for SSDs.
@@ -19,15 +20,6 @@ config BCACHE_DEBUG
 	Enables extra debugging tools, allows expensive runtime checks to be
 	turned on.
 
-config BCACHE_CLOSURES_DEBUG
-	bool "Debug closures"
-	depends on BCACHE
-	select DEBUG_FS
-	help
-	Keeps all active closures in a linked list and provides a debugfs
-	interface to list them, which makes it possible to see asynchronous
-	operations that get stuck.
-
 config BCACHE_ASYNC_REGISTRATION
 	bool "Asynchronous device registration"
 	depends on BCACHE
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index 5b87e59676b8..054e8a33a7ab 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -2,6 +2,6 @@
 
 obj-$(CONFIG_BCACHE)	+= bcache.o
 
-bcache-y		:= alloc.o bset.o btree.o closure.o debug.o extents.o\
-	io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
+bcache-y		:= alloc.o bset.o btree.o debug.o extents.o io.o\
+	journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
 	util.o writeback.o features.o
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 5a79bb3c272f..313cee6ad009 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -179,6 +179,7 @@
 #define pr_fmt(fmt) "bcache: %s() " fmt, __func__
 
 #include <linux/bio.h>
+#include <linux/closure.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
@@ -192,7 +193,6 @@
 #include "bcache_ondisk.h"
 #include "bset.h"
 #include "util.h"
-#include "closure.h"
 
 struct bucket {
 	atomic_t	pin;
@@ -299,6 +299,7 @@ struct cached_dev {
 	struct list_head	list;
 	struct bcache_device	disk;
 	struct block_device	*bdev;
+	struct bdev_handle	*bdev_handle;
 
 	struct cache_sb		sb;
 	struct cache_sb_disk	*sb_disk;
@@ -421,6 +422,7 @@ struct cache {
 
 	struct kobject		kobj;
 	struct block_device	*bdev;
+	struct bdev_handle	*bdev_handle;
 
 	struct task_struct	*alloc_thread;
 
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 0ae2b3676293..8bd899766372 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1368,8 +1368,8 @@ static void cached_dev_free(struct closure *cl)
 	if (dc->sb_disk)
 		put_page(virt_to_page(dc->sb_disk));
 
-	if (!IS_ERR_OR_NULL(dc->bdev))
-		blkdev_put(dc->bdev, dc);
+	if (dc->bdev_handle)
+		bdev_release(dc->bdev_handle);
 
 	wake_up(&unregister_wait);
 
@@ -1444,7 +1444,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
 /* Cached device - bcache superblock */
 
 static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
-				 struct block_device *bdev,
+				 struct bdev_handle *bdev_handle,
 				 struct cached_dev *dc)
 {
 	const char *err = "cannot allocate memory";
@@ -1452,14 +1452,15 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
 	int ret = -ENOMEM;
 
 	memcpy(&dc->sb, sb, sizeof(struct cache_sb));
-	dc->bdev = bdev;
+	dc->bdev_handle = bdev_handle;
+	dc->bdev = bdev_handle->bdev;
 	dc->sb_disk = sb_disk;
 
 	if (cached_dev_init(dc, sb->block_size << 9))
 		goto err;
 
 	err = "error creating kobject";
-	if (kobject_add(&dc->disk.kobj, bdev_kobj(bdev), "bcache"))
+	if (kobject_add(&dc->disk.kobj, bdev_kobj(dc->bdev), "bcache"))
 		goto err;
 	if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
 		goto err;
@@ -2216,8 +2217,8 @@ void bch_cache_release(struct kobject *kobj)
 	if (ca->sb_disk)
 		put_page(virt_to_page(ca->sb_disk));
 
-	if (!IS_ERR_OR_NULL(ca->bdev))
-		blkdev_put(ca->bdev, ca);
+	if (ca->bdev_handle)
+		bdev_release(ca->bdev_handle);
 
 	kfree(ca);
 	module_put(THIS_MODULE);
@@ -2337,38 +2338,42 @@ err_free:
 }
 
 static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
-				struct block_device *bdev, struct cache *ca)
+				struct bdev_handle *bdev_handle,
+				struct cache *ca)
 {
 	const char *err = NULL; /* must be set for any error case */
 	int ret = 0;
 
 	memcpy(&ca->sb, sb, sizeof(struct cache_sb));
-	ca->bdev = bdev;
+	ca->bdev_handle = bdev_handle;
+	ca->bdev = bdev_handle->bdev;
 	ca->sb_disk = sb_disk;
 
-	if (bdev_max_discard_sectors((bdev)))
+	if (bdev_max_discard_sectors((bdev_handle->bdev)))
 		ca->discard = CACHE_DISCARD(&ca->sb);
 
 	ret = cache_alloc(ca);
 	if (ret != 0) {
-		/*
-		 * If we failed here, it means ca->kobj is not initialized yet,
-		 * kobject_put() won't be called and there is no chance to
-		 * call blkdev_put() to bdev in bch_cache_release(). So we
-		 * explicitly call blkdev_put() here.
-		 */
-		blkdev_put(bdev, ca);
 		if (ret == -ENOMEM)
 			err = "cache_alloc(): -ENOMEM";
 		else if (ret == -EPERM)
 			err = "cache_alloc(): cache device is too small";
 		else
 			err = "cache_alloc(): unknown error";
-		goto err;
+		pr_notice("error %pg: %s\n", bdev_handle->bdev, err);
+		/*
+		 * If we failed here, it means ca->kobj is not initialized yet,
+		 * kobject_put() won't be called and there is no chance to
+		 * call bdev_release() to bdev in bch_cache_release(). So
+		 * we explicitly call bdev_release() here.
+		 */
+		bdev_release(bdev_handle);
+		return ret;
 	}
 
-	if (kobject_add(&ca->kobj, bdev_kobj(bdev), "bcache")) {
-		err = "error calling kobject_add";
+	if (kobject_add(&ca->kobj, bdev_kobj(bdev_handle->bdev), "bcache")) {
+		pr_notice("error %pg: error calling kobject_add\n",
+			  bdev_handle->bdev);
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -2382,15 +2387,10 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
 		goto out;
 	}
 
-	pr_info("registered cache device %pg\n", ca->bdev);
+	pr_info("registered cache device %pg\n", ca->bdev_handle->bdev);
 
 out:
 	kobject_put(&ca->kobj);
-
-err:
-	if (err)
-		pr_notice("error %pg: %s\n", ca->bdev, err);
-
 	return ret;
 }
 
@@ -2445,7 +2445,7 @@ struct async_reg_args {
 	char *path;
 	struct cache_sb *sb;
 	struct cache_sb_disk *sb_disk;
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	void *holder;
 };
 
@@ -2456,8 +2456,8 @@ static void register_bdev_worker(struct work_struct *work)
 		container_of(work, struct async_reg_args, reg_work.work);
 
 	mutex_lock(&bch_register_lock);
-	if (register_bdev(args->sb, args->sb_disk, args->bdev, args->holder)
-	    < 0)
+	if (register_bdev(args->sb, args->sb_disk, args->bdev_handle,
+			  args->holder) < 0)
 		fail = true;
 	mutex_unlock(&bch_register_lock);
 
@@ -2477,7 +2477,8 @@ static void register_cache_worker(struct work_struct *work)
 		container_of(work, struct async_reg_args, reg_work.work);
 
-	/* blkdev_put() will be called in bch_cache_release() */
-	if (register_cache(args->sb, args->sb_disk, args->bdev, args->holder))
+	/* bdev_release() will be called in bch_cache_release() */
+	if (register_cache(args->sb, args->sb_disk, args->bdev_handle,
+			   args->holder))
 		fail = true;
 
 	if (fail)
@@ -2514,7 +2515,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 	char *path = NULL;
 	struct cache_sb *sb;
 	struct cache_sb_disk *sb_disk;
-	struct block_device *bdev, *bdev2;
+	struct bdev_handle *bdev_handle, *bdev_handle2;
 	void *holder = NULL;
 	ssize_t ret;
 	bool async_registration = false;
@@ -2547,15 +2548,15 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 
 	ret = -EINVAL;
 	err = "failed to open device";
-	bdev = blkdev_get_by_path(strim(path), BLK_OPEN_READ, NULL, NULL);
-	if (IS_ERR(bdev))
+	bdev_handle = bdev_open_by_path(strim(path), BLK_OPEN_READ, NULL, NULL);
+	if (IS_ERR(bdev_handle))
 		goto out_free_sb;
 
 	err = "failed to set blocksize";
-	if (set_blocksize(bdev, 4096))
+	if (set_blocksize(bdev_handle->bdev, 4096))
 		goto out_blkdev_put;
 
-	err = read_super(sb, bdev, &sb_disk);
+	err = read_super(sb, bdev_handle->bdev, &sb_disk);
 	if (err)
 		goto out_blkdev_put;
 
@@ -2567,13 +2568,13 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 	}
 
 	/* Now reopen in exclusive mode with proper holder */
-	bdev2 = blkdev_get_by_dev(bdev->bd_dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
-				  holder, NULL);
-	blkdev_put(bdev, NULL);
-	bdev = bdev2;
-	if (IS_ERR(bdev)) {
-		ret = PTR_ERR(bdev);
-		bdev = NULL;
+	bdev_handle2 = bdev_open_by_dev(bdev_handle->bdev->bd_dev,
+			BLK_OPEN_READ | BLK_OPEN_WRITE, holder, NULL);
+	bdev_release(bdev_handle);
+	bdev_handle = bdev_handle2;
+	if (IS_ERR(bdev_handle)) {
+		ret = PTR_ERR(bdev_handle);
+		bdev_handle = NULL;
 		if (ret == -EBUSY) {
 			dev_t dev;
 
@@ -2608,7 +2609,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 		args->path	= path;
 		args->sb	= sb;
 		args->sb_disk	= sb_disk;
-		args->bdev	= bdev;
+		args->bdev_handle	= bdev_handle;
 		args->holder	= holder;
 		register_device_async(args);
 		/* No wait and returns to user space */
@@ -2617,14 +2618,14 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 
 	if (SB_IS_BDEV(sb)) {
 		mutex_lock(&bch_register_lock);
-		ret = register_bdev(sb, sb_disk, bdev, holder);
+		ret = register_bdev(sb, sb_disk, bdev_handle, holder);
 		mutex_unlock(&bch_register_lock);
-		/* blkdev_put() will be called in cached_dev_free() */
+		/* bdev_release() will be called in cached_dev_free() */
 		if (ret < 0)
 			goto out_free_sb;
 	} else {
-		/* blkdev_put() will be called in bch_cache_release() */
-		ret = register_cache(sb, sb_disk, bdev, holder);
+		/* bdev_release() will be called in bch_cache_release() */
+		ret = register_cache(sb, sb_disk, bdev_handle, holder);
 		if (ret)
 			goto out_free_sb;
 	}
@@ -2640,8 +2641,8 @@ out_free_holder:
 out_put_sb_page:
 	put_page(virt_to_page(sb_disk));
 out_blkdev_put:
-	if (bdev)
-		blkdev_put(bdev, holder);
+	if (bdev_handle)
+		bdev_release(bdev_handle);
 out_free_sb:
 	kfree(sb);
 out_free_path:
@@ -2905,7 +2906,6 @@ static int __init bcache_init(void)
 		goto err;
 
 	bch_debug_init();
-	closure_debug_init();
 
 	bcache_is_reboot = false;
 
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 6f3cb7c92130..f61ab1bada6c 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -4,6 +4,7 @@
 #define _BCACHE_UTIL_H
 
 #include <linux/blkdev.h>
+#include <linux/closure.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/sched/clock.h>
@@ -13,8 +14,6 @@
 #include <linux/workqueue.h>
 #include <linux/crc64.h>
 
-#include "closure.h"
-
 struct closure;
 
 #ifdef CONFIG_BCACHE_DEBUG
diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c
index 92afdca760ae..9ab32abe5ed4 100644
--- a/drivers/md/dm-bio-prison-v1.c
+++ b/drivers/md/dm-bio-prison-v1.c
@@ -26,7 +26,7 @@ struct prison_region {
 struct dm_bio_prison {
 	mempool_t cell_pool;
 	unsigned int num_locks;
-	struct prison_region regions[];
+	struct prison_region regions[] __counted_by(num_locks);
 };
 
 static struct kmem_cache *_cell_cache;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 5315fd261c23..be32a290c90a 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -224,7 +224,7 @@ struct crypt_config {
 	struct mutex bio_alloc_lock;
 
 	u8 *authenc_key; /* space for keys in authenc() format (if used) */
-	u8 key[];
+	u8 key[] __counted_by(key_size);
 };
 
 #define MIN_IOS		64
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 5f9991765f27..9755788e8b78 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -254,7 +254,7 @@ struct raid_set {
 		int mode;
 	} journal_dev;
 
-	struct raid_dev dev[];
+	struct raid_dev dev[] __counted_by(raid_disks);
 };
 
 static void rs_config_backup(struct raid_set *rs, struct rs_layout *l)
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index db2d997a6c18..bdc14ec99814 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -56,7 +56,7 @@ struct dm_stat {
 	size_t percpu_alloc_size;
 	size_t histogram_alloc_size;
 	struct dm_stat_percpu *stat_percpu[NR_CPUS];
-	struct dm_stat_shared stat_shared[];
+	struct dm_stat_shared stat_shared[] __counted_by(n_entries);
 };
 
 #define STAT_PRECISE_TIMESTAMPS		1
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index e2854a3cbd28..5e70f5ae394d 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -44,7 +44,7 @@ struct stripe_c {
 	/* Work struct used for triggering events*/
 	struct work_struct trigger_event;
 
-	struct stripe stripe[];
+	struct stripe stripe[] __counted_by(stripes);
 };
 
 /*
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 64a1f306c96c..f7212e8fc27f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -724,7 +724,7 @@ static struct table_device *open_table_device(struct mapped_device *md,
 		dev_t dev, blk_mode_t mode)
 {
 	struct table_device *td;
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	u64 part_off;
 	int r;
 
@@ -733,9 +733,9 @@ static struct table_device *open_table_device(struct mapped_device *md,
 		return ERR_PTR(-ENOMEM);
 	refcount_set(&td->count, 1);
 
-	bdev = blkdev_get_by_dev(dev, mode, _dm_claim_ptr, NULL);
-	if (IS_ERR(bdev)) {
-		r = PTR_ERR(bdev);
+	bdev_handle = bdev_open_by_dev(dev, mode, _dm_claim_ptr, NULL);
+	if (IS_ERR(bdev_handle)) {
+		r = PTR_ERR(bdev_handle);
 		goto out_free_td;
 	}
 
@@ -745,20 +745,22 @@ static struct table_device *open_table_device(struct mapped_device *md,
 	 * called.
 	 */
 	if (md->disk->slave_dir) {
-		r = bd_link_disk_holder(bdev, md->disk);
+		r = bd_link_disk_holder(bdev_handle->bdev, md->disk);
 		if (r)
 			goto out_blkdev_put;
 	}
 
 	td->dm_dev.mode = mode;
-	td->dm_dev.bdev = bdev;
-	td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL);
+	td->dm_dev.bdev = bdev_handle->bdev;
+	td->dm_dev.bdev_handle = bdev_handle;
+	td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev, &part_off,
+						NULL, NULL);
 	format_dev_t(td->dm_dev.name, dev);
 	list_add(&td->list, &md->table_devices);
 	return td;
 
 out_blkdev_put:
-	blkdev_put(bdev, _dm_claim_ptr);
+	bdev_release(bdev_handle);
 out_free_td:
 	kfree(td);
 	return ERR_PTR(r);
@@ -771,7 +773,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
 {
 	if (md->disk->slave_dir)
 		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
-	blkdev_put(td->dm_dev.bdev, _dm_claim_ptr);
+	bdev_release(td->dm_dev.bdev_handle);
 	put_dax(td->dm_dev.dax_dev);
 	list_del(&td->list);
 	kfree(td);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index a104a025084d..839e79e567ee 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2452,8 +2452,7 @@ static void export_rdev(struct md_rdev *rdev, struct mddev *mddev)
 	if (test_bit(AutoDetected, &rdev->flags))
 		md_autodetect_dev(rdev->bdev->bd_dev);
 #endif
-	blkdev_put(rdev->bdev,
-		   test_bit(Holder, &rdev->flags) ? rdev : &claim_rdev);
+	bdev_release(rdev->bdev_handle);
 	rdev->bdev = NULL;
 	kobject_put(&rdev->kobj);
 }
@@ -3633,7 +3632,6 @@ EXPORT_SYMBOL_GPL(md_rdev_init);
 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
 {
 	struct md_rdev *rdev;
-	struct md_rdev *holder;
 	sector_t size;
 	int err;
 
@@ -3648,21 +3646,16 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
 	if (err)
 		goto out_clear_rdev;
 
-	if (super_format == -2) {
-		holder = &claim_rdev;
-	} else {
-		holder = rdev;
-		set_bit(Holder, &rdev->flags);
-	}
-
-	rdev->bdev = blkdev_get_by_dev(newdev, BLK_OPEN_READ | BLK_OPEN_WRITE,
-				       holder, NULL);
-	if (IS_ERR(rdev->bdev)) {
+	rdev->bdev_handle = bdev_open_by_dev(newdev,
+			BLK_OPEN_READ | BLK_OPEN_WRITE,
+			super_format == -2 ? &claim_rdev : rdev, NULL);
+	if (IS_ERR(rdev->bdev_handle)) {
 		pr_warn("md: could not open device unknown-block(%u,%u).\n",
 			MAJOR(newdev), MINOR(newdev));
-		err = PTR_ERR(rdev->bdev);
+		err = PTR_ERR(rdev->bdev_handle);
 		goto out_clear_rdev;
 	}
+	rdev->bdev = rdev->bdev_handle->bdev;
 
 	kobject_init(&rdev->kobj, &rdev_ktype);
 
@@ -3693,7 +3686,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
 	return rdev;
 
 out_blkdev_put:
-	blkdev_put(rdev->bdev, holder);
+	bdev_release(rdev->bdev_handle);
 out_clear_rdev:
 	md_rdev_clear(rdev);
 out_free_rdev:
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 7c9c13abd7ca..274e7d61d19f 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -59,6 +59,7 @@ struct md_rdev {
 	 */
 	struct block_device *meta_bdev;
 	struct block_device *bdev;	/* block device handle */
+	struct bdev_handle *bdev_handle;	/* Handle from open for bdev */
 
 	struct page	*sb_page, *bb_page;
 	int		sb_loaded;
@@ -211,9 +212,6 @@ enum flag_bits {
 				 * check if there is collision between raid1
 				 * serial bios.
 				 */
-	Holder,			/* rdev is used as holder while opening
-				 * underlying disk exclusively.
-				 */
 };
 
 static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c
index a66b7c111cd5..1c6c62a7f7f5 100644
--- a/drivers/misc/fastrpc.c
+++ b/drivers/misc/fastrpc.c
@@ -958,6 +958,7 @@ static int fastrpc_get_args(u32 kernel, struct fastrpc_invoke_ctx *ctx)
 	if (err)
 		return err;
 
+	memset(ctx->buf->virt, 0, pkt_size);
 	rpra = ctx->buf->virt;
 	list = fastrpc_invoke_buf_start(rpra, ctx->nscalars);
 	pages = fastrpc_phy_page_start(list, ctx->nscalars);
@@ -1090,6 +1091,7 @@ static int fastrpc_put_args(struct fastrpc_invoke_ctx *ctx,
 		}
 	}
 
+	/* Clean up fdlist which is updated by DSP */
 	for (i = 0; i < FASTRPC_MAX_FDLIST; i++) {
 		if (!fdlist[i])
 			break;
@@ -1156,11 +1158,9 @@ static int fastrpc_internal_invoke(struct fastrpc_user *fl,  u32 kernel,
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
-	if (ctx->nscalars) {
-		err = fastrpc_get_args(kernel, ctx);
-		if (err)
-			goto bail;
-	}
+	err = fastrpc_get_args(kernel, ctx);
+	if (err)
+		goto bail;
 
 	/* make sure that all CPU memory writes are seen by DSP */
 	dma_wmb();
@@ -1179,20 +1179,18 @@ static int fastrpc_internal_invoke(struct fastrpc_user *fl,  u32 kernel,
 	if (err)
 		goto bail;
 
+	/* make sure that all memory writes by DSP are seen by CPU */
+	dma_rmb();
+	/* populate all the output buffers with results */
+	err = fastrpc_put_args(ctx, kernel);
+	if (err)
+		goto bail;
+
 	/* Check the response from remote dsp */
 	err = ctx->retval;
 	if (err)
 		goto bail;
 
-	if (ctx->nscalars) {
-		/* make sure that all memory writes by DSP are seen by CPU */
-		dma_rmb();
-		/* populate all the output buffers with results */
-		err = fastrpc_put_args(ctx, kernel);
-		if (err)
-			goto bail;
-	}
-
 bail:
 	if (err != -ERESTARTSYS && err != -ETIMEDOUT) {
 		/* We are done with this compute context */
@@ -1983,11 +1981,13 @@ static int fastrpc_req_mem_unmap_impl(struct fastrpc_user *fl, struct fastrpc_me
 	sc = FASTRPC_SCALARS(FASTRPC_RMID_INIT_MEM_UNMAP, 1, 0);
 	err = fastrpc_internal_invoke(fl, true, FASTRPC_INIT_HANDLE, sc,
 				      &args[0]);
-	fastrpc_map_put(map);
-	if (err)
+	if (err) {
 		dev_err(dev, "unmmap\tpt fd = %d, 0x%09llx error\n",  map->fd, map->raddr);
+		return err;
+	}
+	fastrpc_map_put(map);
 
-	return err;
+	return 0;
 }
 
 static int fastrpc_req_mem_unmap(struct fastrpc_user *fl, char __user *argp)
diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c
index 5867af9f592c..c44de892a61e 100644
--- a/drivers/misc/ibmasm/ibmasmfs.c
+++ b/drivers/misc/ibmasm/ibmasmfs.c
@@ -139,7 +139,7 @@ static struct inode *ibmasmfs_make_inode(struct super_block *sb, int mode)
 	if (ret) {
 		ret->i_ino = get_next_ino();
 		ret->i_mode = mode;
-		ret->i_atime = ret->i_mtime = inode_set_ctime_current(ret);
+		simple_inode_init_ts(ret);
 	}
 	return ret;
 }
diff --git a/drivers/misc/ibmvmc.c b/drivers/misc/ibmvmc.c
index 2101eb12bcba..7739b783c2db 100644
--- a/drivers/misc/ibmvmc.c
+++ b/drivers/misc/ibmvmc.c
@@ -1124,7 +1124,7 @@ static ssize_t ibmvmc_write(struct file *file, const char *buffer,
 		goto out;
 
 	inode = file_inode(file);
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	mark_inode_dirty(inode);
 
 	dev_dbg(adapter->dev, "write: file = 0x%lx, count = 0x%lx\n",
diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c
index c66cc05a68c4..b080eb2335eb 100644
--- a/drivers/misc/lkdtm/bugs.c
+++ b/drivers/misc/lkdtm/bugs.c
@@ -6,12 +6,14 @@
  * test source files.
  */
 #include "lkdtm.h"
+#include <linux/cpu.h>
 #include <linux/list.h>
 #include <linux/sched.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/task_stack.h>
-#include <linux/uaccess.h>
 #include <linux/slab.h>
+#include <linux/stop_machine.h>
+#include <linux/uaccess.h>
 
 #if IS_ENABLED(CONFIG_X86_32) && !IS_ENABLED(CONFIG_UML)
 #include <asm/desc.h>
@@ -73,6 +75,32 @@ static void lkdtm_PANIC(void)
 	panic("dumptest");
 }
 
+static int panic_stop_irqoff_fn(void *arg)
+{
+	atomic_t *v = arg;
+
+	/*
+	 * As stop_machine() disables interrupts, all CPUs within this function
+	 * have interrupts disabled and cannot take a regular IPI.
+	 *
+	 * The last CPU which enters here will trigger a panic, and as all CPUs
+	 * cannot take a regular IPI, we'll only be able to stop secondaries if
+	 * smp_send_stop() or crash_smp_send_stop() uses an NMI.
+	 */
+	if (atomic_inc_return(v) == num_online_cpus())
+		panic("panic stop irqoff test");
+
+	for (;;)
+		cpu_relax();
+}
+
+static void lkdtm_PANIC_STOP_IRQOFF(void)
+{
+	atomic_t v = ATOMIC_INIT(0);
+
+	stop_machine(panic_stop_irqoff_fn, &v, cpu_online_mask);
+}
+
 static void lkdtm_BUG(void)
 {
 	BUG();
@@ -638,6 +666,7 @@ static noinline void lkdtm_CORRUPT_PAC(void)
 
 static struct crashtype crashtypes[] = {
 	CRASHTYPE(PANIC),
+	CRASHTYPE(PANIC_STOP_IRQOFF),
 	CRASHTYPE(BUG),
 	CRASHTYPE(WARNING),
 	CRASHTYPE(WARNING_MESSAGE),
diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c
index be106dc20ff3..aa44a23ec045 100644
--- a/drivers/mtd/devices/block2mtd.c
+++ b/drivers/mtd/devices/block2mtd.c
@@ -37,7 +37,7 @@
 /* Info for the block device */
 struct block2mtd_dev {
 	struct list_head list;
-	struct block_device *blkdev;
+	struct bdev_handle *bdev_handle;
 	struct mtd_info mtd;
 	struct mutex write_mutex;
 };
@@ -55,7 +55,8 @@ static struct page *page_read(struct address_space *mapping, pgoff_t index)
 /* erase a specified part of the device */
 static int _block2mtd_erase(struct block2mtd_dev *dev, loff_t to, size_t len)
 {
-	struct address_space *mapping = dev->blkdev->bd_inode->i_mapping;
+	struct address_space *mapping =
+				dev->bdev_handle->bdev->bd_inode->i_mapping;
 	struct page *page;
 	pgoff_t index = to >> PAGE_SHIFT;	// page index
 	int pages = len >> PAGE_SHIFT;
@@ -105,6 +106,8 @@ static int block2mtd_read(struct mtd_info *mtd, loff_t from, size_t len,
 		size_t *retlen, u_char *buf)
 {
 	struct block2mtd_dev *dev = mtd->priv;
+	struct address_space *mapping =
+				dev->bdev_handle->bdev->bd_inode->i_mapping;
 	struct page *page;
 	pgoff_t index = from >> PAGE_SHIFT;
 	int offset = from & (PAGE_SIZE-1);
@@ -117,7 +120,7 @@ static int block2mtd_read(struct mtd_info *mtd, loff_t from, size_t len,
 			cpylen = len;	// this page
 		len = len - cpylen;
 
-		page = page_read(dev->blkdev->bd_inode->i_mapping, index);
+		page = page_read(mapping, index);
 		if (IS_ERR(page))
 			return PTR_ERR(page);
 
@@ -139,7 +142,8 @@ static int _block2mtd_write(struct block2mtd_dev *dev, const u_char *buf,
 		loff_t to, size_t len, size_t *retlen)
 {
 	struct page *page;
-	struct address_space *mapping = dev->blkdev->bd_inode->i_mapping;
+	struct address_space *mapping =
+				dev->bdev_handle->bdev->bd_inode->i_mapping;
 	pgoff_t index = to >> PAGE_SHIFT;	// page index
 	int offset = to & ~PAGE_MASK;	// page offset
 	int cpylen;
@@ -194,7 +198,7 @@ static int block2mtd_write(struct mtd_info *mtd, loff_t to, size_t len,
 static void block2mtd_sync(struct mtd_info *mtd)
 {
 	struct block2mtd_dev *dev = mtd->priv;
-	sync_blockdev(dev->blkdev);
+	sync_blockdev(dev->bdev_handle->bdev);
 	return;
 }
 
@@ -206,10 +210,10 @@ static void block2mtd_free_device(struct block2mtd_dev *dev)
 
 	kfree(dev->mtd.name);
 
-	if (dev->blkdev) {
-		invalidate_mapping_pages(dev->blkdev->bd_inode->i_mapping,
-					0, -1);
-		blkdev_put(dev->blkdev, NULL);
+	if (dev->bdev_handle) {
+		invalidate_mapping_pages(
+			dev->bdev_handle->bdev->bd_inode->i_mapping, 0, -1);
+		bdev_release(dev->bdev_handle);
 	}
 
 	kfree(dev);
@@ -219,10 +223,10 @@ static void block2mtd_free_device(struct block2mtd_dev *dev)
  * This function is marked __ref because it calls the __init marked
  * early_lookup_bdev when called from the early boot code.
  */
-static struct block_device __ref *mdtblock_early_get_bdev(const char *devname,
+static struct bdev_handle __ref *mdtblock_early_get_bdev(const char *devname,
 		blk_mode_t mode, int timeout, struct block2mtd_dev *dev)
 {
-	struct block_device *bdev = ERR_PTR(-ENODEV);
+	struct bdev_handle *bdev_handle = ERR_PTR(-ENODEV);
 #ifndef MODULE
 	int i;
 
@@ -230,7 +234,7 @@ static struct block_device __ref *mdtblock_early_get_bdev(const char *devname,
 	 * We can't use early_lookup_bdev from a running system.
 	 */
 	if (system_state >= SYSTEM_RUNNING)
-		return bdev;
+		return bdev_handle;
 
 	/*
 	 * We might not have the root device mounted at this point.
@@ -249,19 +253,20 @@ static struct block_device __ref *mdtblock_early_get_bdev(const char *devname,
 		wait_for_device_probe();
 
 		if (!early_lookup_bdev(devname, &devt)) {
-			bdev = blkdev_get_by_dev(devt, mode, dev, NULL);
-			if (!IS_ERR(bdev))
+			bdev_handle = bdev_open_by_dev(devt, mode, dev, NULL);
+			if (!IS_ERR(bdev_handle))
 				break;
 		}
 	}
 #endif
-	return bdev;
+	return bdev_handle;
 }
 
 static struct block2mtd_dev *add_device(char *devname, int erase_size,
 		char *label, int timeout)
 {
 	const blk_mode_t mode = BLK_OPEN_READ | BLK_OPEN_WRITE;
+	struct bdev_handle *bdev_handle;
 	struct block_device *bdev;
 	struct block2mtd_dev *dev;
 	char *name;
@@ -274,21 +279,23 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size,
 		return NULL;
 
 	/* Get a handle on the device */
-	bdev = blkdev_get_by_path(devname, mode, dev, NULL);
-	if (IS_ERR(bdev))
-		bdev = mdtblock_early_get_bdev(devname, mode, timeout, dev);
-	if (IS_ERR(bdev)) {
+	bdev_handle = bdev_open_by_path(devname, mode, dev, NULL);
+	if (IS_ERR(bdev_handle))
+		bdev_handle = mdtblock_early_get_bdev(devname, mode, timeout,
+						      dev);
+	if (IS_ERR(bdev_handle)) {
 		pr_err("error: cannot open device %s\n", devname);
 		goto err_free_block2mtd;
 	}
-	dev->blkdev = bdev;
+	dev->bdev_handle = bdev_handle;
+	bdev = bdev_handle->bdev;
 
 	if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
 		pr_err("attempting to use an MTD device as a block device\n");
 		goto err_free_block2mtd;
 	}
 
-	if ((long)dev->blkdev->bd_inode->i_size % erase_size) {
+	if ((long)bdev->bd_inode->i_size % erase_size) {
 		pr_err("erasesize must be a divisor of device size\n");
 		goto err_free_block2mtd;
 	}
@@ -306,7 +313,7 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size,
 
 	dev->mtd.name = name;
 
-	dev->mtd.size = dev->blkdev->bd_inode->i_size & PAGE_MASK;
+	dev->mtd.size = bdev->bd_inode->i_size & PAGE_MASK;
 	dev->mtd.erasesize = erase_size;
 	dev->mtd.writesize = 1;
 	dev->mtd.writebufsize = PAGE_SIZE;
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 468833675cc9..f11400a908f2 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -50,9 +50,10 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id)
 
 void nvmet_bdev_ns_disable(struct nvmet_ns *ns)
 {
-	if (ns->bdev) {
-		blkdev_put(ns->bdev, NULL);
+	if (ns->bdev_handle) {
+		bdev_release(ns->bdev_handle);
 		ns->bdev = NULL;
+		ns->bdev_handle = NULL;
 	}
 }
 
@@ -84,17 +85,18 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns)
 	if (ns->buffered_io)
 		return -ENOTBLK;
 
-	ns->bdev = blkdev_get_by_path(ns->device_path,
-			BLK_OPEN_READ | BLK_OPEN_WRITE, NULL, NULL);
-	if (IS_ERR(ns->bdev)) {
-		ret = PTR_ERR(ns->bdev);
+	ns->bdev_handle = bdev_open_by_path(ns->device_path,
+				BLK_OPEN_READ | BLK_OPEN_WRITE, NULL, NULL);
+	if (IS_ERR(ns->bdev_handle)) {
+		ret = PTR_ERR(ns->bdev_handle);
 		if (ret != -ENOTBLK) {
-			pr_err("failed to open block device %s: (%ld)\n",
-					ns->device_path, PTR_ERR(ns->bdev));
+			pr_err("failed to open block device %s: (%d)\n",
+					ns->device_path, ret);
 		}
-		ns->bdev = NULL;
+		ns->bdev_handle = NULL;
 		return ret;
 	}
+	ns->bdev = ns->bdev_handle->bdev;
 	ns->size = bdev_nr_bytes(ns->bdev);
 	ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev));
 
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 8cfd60f3b564..360e385be33b 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -58,6 +58,7 @@
 
 struct nvmet_ns {
 	struct percpu_ref	ref;
+	struct bdev_handle	*bdev_handle;
 	struct block_device	*bdev;
 	struct file		*file;
 	bool			readonly;
diff --git a/drivers/nvmem/imx-ocotp.c b/drivers/nvmem/imx-ocotp.c
index a223d9537f22..e8b6f194925d 100644
--- a/drivers/nvmem/imx-ocotp.c
+++ b/drivers/nvmem/imx-ocotp.c
@@ -498,7 +498,7 @@ static const struct ocotp_params imx6sl_params = {
 };
 
 static const struct ocotp_params imx6sll_params = {
-	.nregs = 128,
+	.nregs = 80,
 	.bank_address_words = 0,
 	.set_timing = imx_ocotp_set_imx6_timing,
 	.ctrl = IMX_OCOTP_BM_CTRL_DEFAULT,
@@ -512,14 +512,14 @@ static const struct ocotp_params imx6sx_params = {
 };
 
 static const struct ocotp_params imx6ul_params = {
-	.nregs = 128,
+	.nregs = 144,
 	.bank_address_words = 0,
 	.set_timing = imx_ocotp_set_imx6_timing,
 	.ctrl = IMX_OCOTP_BM_CTRL_DEFAULT,
 };
 
 static const struct ocotp_params imx6ull_params = {
-	.nregs = 64,
+	.nregs = 80,
 	.bank_address_words = 0,
 	.set_timing = imx_ocotp_set_imx6_timing,
 	.ctrl = IMX_OCOTP_BM_CTRL_DEFAULT,
diff --git a/drivers/platform/x86/amd/pmc/pmc-quirks.c b/drivers/platform/x86/amd/pmc/pmc-quirks.c
index ad702463a65d..6bbffb081053 100644
--- a/drivers/platform/x86/amd/pmc/pmc-quirks.c
+++ b/drivers/platform/x86/amd/pmc/pmc-quirks.c
@@ -111,6 +111,79 @@ static const struct dmi_system_id fwbug_list[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "21A1"),
 		}
 	},
+	/* https://bugzilla.kernel.org/show_bug.cgi?id=218024 */
+	{
+		.ident = "V14 G4 AMN",
+		.driver_data = &quirk_s2idle_bug,
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "82YT"),
+		}
+	},
+	{
+		.ident = "V14 G4 AMN",
+		.driver_data = &quirk_s2idle_bug,
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "83GE"),
+		}
+	},
+	{
+		.ident = "V15 G4 AMN",
+		.driver_data = &quirk_s2idle_bug,
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "82YU"),
+		}
+	},
+	{
+		.ident = "V15 G4 AMN",
+		.driver_data = &quirk_s2idle_bug,
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "83CQ"),
+		}
+	},
+	{
+		.ident = "IdeaPad 1 14AMN7",
+		.driver_data = &quirk_s2idle_bug,
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "82VF"),
+		}
+	},
+	{
+		.ident = "IdeaPad 1 15AMN7",
+		.driver_data = &quirk_s2idle_bug,
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "82VG"),
+		}
+	},
+	{
+		.ident = "IdeaPad 1 15AMN7",
+		.driver_data = &quirk_s2idle_bug,
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "82X5"),
+		}
+	},
+	{
+		.ident = "IdeaPad Slim 3 14AMN8",
+		.driver_data = &quirk_s2idle_bug,
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "82XN"),
+		}
+	},
+	{
+		.ident = "IdeaPad Slim 3 15AMN8",
+		.driver_data = &quirk_s2idle_bug,
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "82XQ"),
+		}
+	},
 	/* https://gitlab.freedesktop.org/drm/amd/-/issues/2684 */
 	{
 		.ident = "HP Laptop 15s-eq2xxx",
diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c
index 9569f11dec8c..40878e327afd 100644
--- a/drivers/platform/x86/sony-laptop.c
+++ b/drivers/platform/x86/sony-laptop.c
@@ -4092,7 +4092,7 @@ static ssize_t sonypi_misc_read(struct file *file, char __user *buf,
 
 	if (ret > 0) {
 		struct inode *inode = file_inode(file);
-		inode->i_atime = current_time(inode);
+		inode_set_atime_to_ts(inode, current_time(inode));
 	}
 
 	return ret;
diff --git a/drivers/reset/core.c b/drivers/reset/core.c
index f0a076e94118..7ece6a8e9858 100644
--- a/drivers/reset/core.c
+++ b/drivers/reset/core.c
@@ -60,7 +60,7 @@ struct reset_control {
 struct reset_control_array {
 	struct reset_control base;
 	unsigned int num_rstcs;
-	struct reset_control *rstc[];
+	struct reset_control *rstc[] __counted_by(num_rstcs);
 };
 
 static const char *rcdev_name(struct reset_controller_dev *rcdev)
@@ -1185,6 +1185,7 @@ of_reset_control_array_get(struct device_node *np, bool shared, bool optional,
 	resets = kzalloc(struct_size(resets, rstc, num), GFP_KERNEL);
 	if (!resets)
 		return ERR_PTR(-ENOMEM);
+	resets->num_rstcs = num;
 
 	for (i = 0; i < num; i++) {
 		rstc = __of_reset_control_get(np, NULL, i, shared, optional,
@@ -1193,7 +1194,6 @@ of_reset_control_array_get(struct device_node *np, bool shared, bool optional,
 			goto err_rst;
 		resets->rstc[i] = rstc;
 	}
-	resets->num_rstcs = num;
 	resets->base.array = true;
 
 	return &resets->base;
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 215597f73be4..d440319a7945 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -412,7 +412,8 @@ dasd_state_ready_to_online(struct dasd_device * device)
 					KOBJ_CHANGE);
 			return 0;
 		}
-		disk_uevent(device->block->bdev->bd_disk, KOBJ_CHANGE);
+		disk_uevent(device->block->bdev_handle->bdev->bd_disk,
+			    KOBJ_CHANGE);
 	}
 	return 0;
 }
@@ -432,7 +433,8 @@ static int dasd_state_online_to_ready(struct dasd_device *device)
 
 	device->state = DASD_STATE_READY;
 	if (device->block && !(device->features & DASD_FEATURE_USERAW))
-		disk_uevent(device->block->bdev->bd_disk, KOBJ_CHANGE);
+		disk_uevent(device->block->bdev_handle->bdev->bd_disk,
+			    KOBJ_CHANGE);
 	return 0;
 }
 
@@ -3590,7 +3592,7 @@ int dasd_generic_set_offline(struct ccw_device *cdev)
 	 * in the other openers.
 	 */
 	if (device->block) {
-		max_count = device->block->bdev ? 0 : -1;
+		max_count = device->block->bdev_handle ? 0 : -1;
 		open_count = atomic_read(&device->block->open_count);
 		if (open_count > max_count) {
 			if (open_count > 0)
@@ -3636,8 +3638,8 @@ int dasd_generic_set_offline(struct ccw_device *cdev)
 		 * so sync bdev first and then wait for our queues to become
 		 * empty
 		 */
-		if (device->block)
-			bdev_mark_dead(device->block->bdev, false);
+		if (device->block && device->block->bdev_handle)
+			bdev_mark_dead(device->block->bdev_handle->bdev, false);
 		dasd_schedule_device_bh(device);
 		rc = wait_event_interruptible(shutdown_waitq,
 					      _wait_for_empty_queues(device));
diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c
index fe5108a1b332..55e3abe94cde 100644
--- a/drivers/s390/block/dasd_genhd.c
+++ b/drivers/s390/block/dasd_genhd.c
@@ -127,15 +127,15 @@ void dasd_gendisk_free(struct dasd_block *block)
  */
 int dasd_scan_partitions(struct dasd_block *block)
 {
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	int rc;
 
-	bdev = blkdev_get_by_dev(disk_devt(block->gdp), BLK_OPEN_READ, NULL,
-				 NULL);
-	if (IS_ERR(bdev)) {
+	bdev_handle = bdev_open_by_dev(disk_devt(block->gdp), BLK_OPEN_READ,
+				       NULL, NULL);
+	if (IS_ERR(bdev_handle)) {
 		DBF_DEV_EVENT(DBF_ERR, block->base,
 			      "scan partitions error, blkdev_get returned %ld",
-			      PTR_ERR(bdev));
+			      PTR_ERR(bdev_handle));
 		return -ENODEV;
 	}
 
@@ -147,16 +147,15 @@ int dasd_scan_partitions(struct dasd_block *block)
 				"scan partitions error, rc %d", rc);
 
 	/*
-	 * Since the matching blkdev_put call to the blkdev_get in
-	 * this function is not called before dasd_destroy_partitions
-	 * the offline open_count limit needs to be increased from
-	 * 0 to 1. This is done by setting device->bdev (see
-	 * dasd_generic_set_offline). As long as the partition
-	 * detection is running no offline should be allowed. That
-	 * is why the assignment to device->bdev is done AFTER
-	 * the BLKRRPART ioctl.
+	 * Since the matching bdev_release() call to the
+	 * bdev_open_by_dev() in this function is not called before
+	 * dasd_destroy_partitions the offline open_count limit needs to be
+	 * increased from 0 to 1. This is done by setting block->bdev_handle
+	 * (see dasd_generic_set_offline). As long as the partition detection
+	 * is running no offline should be allowed. That is why the assignment
+	 * to block->bdev_handle is done AFTER the BLKRRPART ioctl.
 	 */
-	block->bdev = bdev;
+	block->bdev_handle = bdev_handle;
 	return 0;
 }
 
@@ -166,21 +165,21 @@ int dasd_scan_partitions(struct dasd_block *block)
  */
 void dasd_destroy_partitions(struct dasd_block *block)
 {
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 
 	/*
-	 * Get the bdev pointer from the device structure and clear
-	 * device->bdev to lower the offline open_count limit again.
+	 * Get the bdev_handle pointer from the block structure and clear
+	 * block->bdev_handle to lower the offline open_count limit again.
 	 */
-	bdev = block->bdev;
-	block->bdev = NULL;
+	bdev_handle = block->bdev_handle;
+	block->bdev_handle = NULL;
 
-	mutex_lock(&bdev->bd_disk->open_mutex);
-	bdev_disk_changed(bdev->bd_disk, true);
-	mutex_unlock(&bdev->bd_disk->open_mutex);
+	mutex_lock(&bdev_handle->bdev->bd_disk->open_mutex);
+	bdev_disk_changed(bdev_handle->bdev->bd_disk, true);
+	mutex_unlock(&bdev_handle->bdev->bd_disk->open_mutex);
 
 	/* Matching blkdev_put to the blkdev_get in dasd_scan_partitions. */
-	blkdev_put(bdev, NULL);
+	bdev_release(bdev_handle);
 }
 
 int dasd_gendisk_init(void)
diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h
index 8a4dbe9d7741..2e663131adaf 100644
--- a/drivers/s390/block/dasd_int.h
+++ b/drivers/s390/block/dasd_int.h
@@ -650,7 +650,7 @@ struct dasd_block {
 	struct gendisk *gdp;
 	spinlock_t request_queue_lock;
 	struct blk_mq_tag_set tag_set;
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	atomic_t open_count;
 
 	unsigned long blocks;	   /* size of volume in blocks */
diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c
index d55862605b82..61b9675e2a67 100644
--- a/drivers/s390/block/dasd_ioctl.c
+++ b/drivers/s390/block/dasd_ioctl.c
@@ -537,7 +537,7 @@ static int __dasd_ioctl_information(struct dasd_block *block,
 	 * This must be hidden from user-space.
 	 */
 	dasd_info->open_count = atomic_read(&block->open_count);
-	if (!block->bdev)
+	if (!block->bdev_handle)
 		dasd_info->open_count++;
 
 	/*
diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index 9e59c050103d..e7c47ee185a4 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -12442,9 +12442,6 @@ lpfc_cpu_affinity_check(struct lpfc_hba *phba, int vectors)
 	int max_core_id, min_core_id;
 	struct lpfc_vector_map_info *cpup;
 	struct lpfc_vector_map_info *new_cpup;
-#ifdef CONFIG_X86
-	struct cpuinfo_x86 *cpuinfo;
-#endif
 #ifdef CONFIG_SCSI_LPFC_DEBUG_FS
 	struct lpfc_hdwq_stat *c_stat;
 #endif
@@ -12458,9 +12455,8 @@ lpfc_cpu_affinity_check(struct lpfc_hba *phba, int vectors)
 	for_each_present_cpu(cpu) {
 		cpup = &phba->sli4_hba.cpu_map[cpu];
 #ifdef CONFIG_X86
-		cpuinfo = &cpu_data(cpu);
-		cpup->phys_id = cpuinfo->phys_proc_id;
-		cpup->core_id = cpuinfo->cpu_core_id;
+		cpup->phys_id = topology_physical_package_id(cpu);
+		cpup->core_id = topology_core_id(cpu);
 		if (lpfc_find_hyper(phba, cpu, cpup->phys_id, cpup->core_id))
 			cpup->flag |= LPFC_CPU_MAP_HYPER;
 #else
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 83b6a3f3863b..6effa13039f3 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -209,7 +209,8 @@ manage_start_stop_show(struct device *dev,
 
 	return sysfs_emit(buf, "%u\n",
 			  sdp->manage_system_start_stop &&
-			  sdp->manage_runtime_start_stop);
+			  sdp->manage_runtime_start_stop &&
+			  sdp->manage_shutdown);
 }
 static DEVICE_ATTR_RO(manage_start_stop);
 
@@ -275,6 +276,35 @@ manage_runtime_start_stop_store(struct device *dev,
 }
 static DEVICE_ATTR_RW(manage_runtime_start_stop);
 
+static ssize_t manage_shutdown_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct scsi_disk *sdkp = to_scsi_disk(dev);
+	struct scsi_device *sdp = sdkp->device;
+
+	return sysfs_emit(buf, "%u\n", sdp->manage_shutdown);
+}
+
+static ssize_t manage_shutdown_store(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf, size_t count)
+{
+	struct scsi_disk *sdkp = to_scsi_disk(dev);
+	struct scsi_device *sdp = sdkp->device;
+	bool v;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (kstrtobool(buf, &v))
+		return -EINVAL;
+
+	sdp->manage_shutdown = v;
+
+	return count;
+}
+static DEVICE_ATTR_RW(manage_shutdown);
+
 static ssize_t
 allow_restart_show(struct device *dev, struct device_attribute *attr, char *buf)
 {
@@ -607,6 +637,7 @@ static struct attribute *sd_disk_attrs[] = {
 	&dev_attr_manage_start_stop.attr,
 	&dev_attr_manage_system_start_stop.attr,
 	&dev_attr_manage_runtime_start_stop.attr,
+	&dev_attr_manage_shutdown.attr,
 	&dev_attr_protection_type.attr,
 	&dev_attr_protection_mode.attr,
 	&dev_attr_app_tag_own.attr,
@@ -3819,8 +3850,10 @@ static void sd_shutdown(struct device *dev)
 		sd_sync_cache(sdkp, NULL);
 	}
 
-	if (system_state != SYSTEM_RESTART &&
-	    sdkp->device->manage_system_start_stop) {
+	if ((system_state != SYSTEM_RESTART &&
+	     sdkp->device->manage_system_start_stop) ||
+	    (system_state == SYSTEM_POWER_OFF &&
+	     sdkp->device->manage_shutdown)) {
 		sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n");
 		sd_start_stop_device(sdkp, 0);
 	}
diff --git a/drivers/soc/renesas/Kconfig b/drivers/soc/renesas/Kconfig
index 12040ce116a5..acc812e490d0 100644
--- a/drivers/soc/renesas/Kconfig
+++ b/drivers/soc/renesas/Kconfig
@@ -334,12 +334,14 @@ if RISCV
 config ARCH_R9A07G043
 	bool "RISC-V Platform support for RZ/Five"
 	depends on NONPORTABLE
+	depends on RISCV_ALTERNATIVE
+	depends on !RISCV_ISA_ZICBOM
+	depends on RISCV_SBI
 	select ARCH_RZG2L
-	select AX45MP_L2_CACHE if RISCV_DMA_NONCOHERENT
+	select AX45MP_L2_CACHE
 	select DMA_GLOBAL_POOL
-	select ERRATA_ANDES if RISCV_SBI
-	select ERRATA_ANDES_CMO if ERRATA_ANDES
-
+	select ERRATA_ANDES
+	select ERRATA_ANDES_CMO
 	help
 	  This enables support for the Renesas RZ/Five SoC.
 
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index a6a06a5f7483..8eb9eb7ce5df 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -91,7 +91,8 @@ static int iblock_configure_device(struct se_device *dev)
 {
 	struct iblock_dev *ib_dev = IBLOCK_DEV(dev);
 	struct request_queue *q;
-	struct block_device *bd = NULL;
+	struct bdev_handle *bdev_handle;
+	struct block_device *bd;
 	struct blk_integrity *bi;
 	blk_mode_t mode = BLK_OPEN_READ;
 	unsigned int max_write_zeroes_sectors;
@@ -116,12 +117,14 @@ static int iblock_configure_device(struct se_device *dev)
 	else
 		dev->dev_flags |= DF_READ_ONLY;
 
-	bd = blkdev_get_by_path(ib_dev->ibd_udev_path, mode, ib_dev, NULL);
-	if (IS_ERR(bd)) {
-		ret = PTR_ERR(bd);
+	bdev_handle = bdev_open_by_path(ib_dev->ibd_udev_path, mode, ib_dev,
+					NULL);
+	if (IS_ERR(bdev_handle)) {
+		ret = PTR_ERR(bdev_handle);
 		goto out_free_bioset;
 	}
-	ib_dev->ibd_bd = bd;
+	ib_dev->ibd_bdev_handle = bdev_handle;
+	ib_dev->ibd_bd = bd = bdev_handle->bdev;
 
 	q = bdev_get_queue(bd);
 
@@ -177,7 +180,7 @@ static int iblock_configure_device(struct se_device *dev)
 	return 0;
 
 out_blkdev_put:
-	blkdev_put(ib_dev->ibd_bd, ib_dev);
+	bdev_release(ib_dev->ibd_bdev_handle);
 out_free_bioset:
 	bioset_exit(&ib_dev->ibd_bio_set);
 out:
@@ -202,8 +205,8 @@ static void iblock_destroy_device(struct se_device *dev)
 {
 	struct iblock_dev *ib_dev = IBLOCK_DEV(dev);
 
-	if (ib_dev->ibd_bd != NULL)
-		blkdev_put(ib_dev->ibd_bd, ib_dev);
+	if (ib_dev->ibd_bdev_handle)
+		bdev_release(ib_dev->ibd_bdev_handle);
 	bioset_exit(&ib_dev->ibd_bio_set);
 }
 
diff --git a/drivers/target/target_core_iblock.h b/drivers/target/target_core_iblock.h
index 8c55375d2f75..683f9a55945b 100644
--- a/drivers/target/target_core_iblock.h
+++ b/drivers/target/target_core_iblock.h
@@ -32,6 +32,7 @@ struct iblock_dev {
 	u32	ibd_flags;
 	struct bio_set	ibd_bio_set;
 	struct block_device *ibd_bd;
+	struct bdev_handle *ibd_bdev_handle;
 	bool ibd_readonly;
 	struct iblock_dev_plug *ibd_plug;
 } ____cacheline_aligned;
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 0d4f09693ef4..41b7489d37ce 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -352,7 +352,7 @@ static int pscsi_create_type_disk(struct se_device *dev, struct scsi_device *sd)
 	struct pscsi_hba_virt *phv = dev->se_hba->hba_ptr;
 	struct pscsi_dev_virt *pdv = PSCSI_DEV(dev);
 	struct Scsi_Host *sh = sd->host;
-	struct block_device *bd;
+	struct bdev_handle *bdev_handle;
 	int ret;
 
 	if (scsi_device_get(sd)) {
@@ -366,18 +366,18 @@ static int pscsi_create_type_disk(struct se_device *dev, struct scsi_device *sd)
 	 * Claim exclusive struct block_device access to struct scsi_device
 	 * for TYPE_DISK and TYPE_ZBC using supplied udev_path
 	 */
-	bd = blkdev_get_by_path(dev->udev_path, BLK_OPEN_WRITE | BLK_OPEN_READ,
-				pdv, NULL);
-	if (IS_ERR(bd)) {
-		pr_err("pSCSI: blkdev_get_by_path() failed\n");
+	bdev_handle = bdev_open_by_path(dev->udev_path,
+				BLK_OPEN_WRITE | BLK_OPEN_READ, pdv, NULL);
+	if (IS_ERR(bdev_handle)) {
+		pr_err("pSCSI: bdev_open_by_path() failed\n");
 		scsi_device_put(sd);
-		return PTR_ERR(bd);
+		return PTR_ERR(bdev_handle);
 	}
-	pdv->pdv_bd = bd;
+	pdv->pdv_bdev_handle = bdev_handle;
 
 	ret = pscsi_add_device_to_list(dev, sd);
 	if (ret) {
-		blkdev_put(pdv->pdv_bd, pdv);
+		bdev_release(bdev_handle);
 		scsi_device_put(sd);
 		return ret;
 	}
@@ -564,9 +564,9 @@ static void pscsi_destroy_device(struct se_device *dev)
 		 * from pscsi_create_type_disk()
 		 */
 		if ((sd->type == TYPE_DISK || sd->type == TYPE_ZBC) &&
-		    pdv->pdv_bd) {
-			blkdev_put(pdv->pdv_bd, pdv);
-			pdv->pdv_bd = NULL;
+		    pdv->pdv_bdev_handle) {
+			bdev_release(pdv->pdv_bdev_handle);
+			pdv->pdv_bdev_handle = NULL;
 		}
 		/*
 		 * For HBA mode PHV_LLD_SCSI_HOST_NO, release the reference
@@ -994,8 +994,8 @@ static sector_t pscsi_get_blocks(struct se_device *dev)
 {
 	struct pscsi_dev_virt *pdv = PSCSI_DEV(dev);
 
-	if (pdv->pdv_bd)
-		return bdev_nr_sectors(pdv->pdv_bd);
+	if (pdv->pdv_bdev_handle)
+		return bdev_nr_sectors(pdv->pdv_bdev_handle->bdev);
 	return 0;
 }
 
diff --git a/drivers/target/target_core_pscsi.h b/drivers/target/target_core_pscsi.h
index 23d9a6e340d4..b0a3ef136592 100644
--- a/drivers/target/target_core_pscsi.h
+++ b/drivers/target/target_core_pscsi.h
@@ -37,7 +37,7 @@ struct pscsi_dev_virt {
 	int	pdv_channel_id;
 	int	pdv_target_id;
 	int	pdv_lun_id;
-	struct block_device *pdv_bd;
+	struct bdev_handle *pdv_bdev_handle;
 	struct scsi_device *pdv_sd;
 	struct Scsi_Host *pdv_lld_host;
 } ____cacheline_aligned;
diff --git a/drivers/thermal/qcom/tsens.h b/drivers/thermal/qcom/tsens.h
index 2805de1c6827..cb637fa289ca 100644
--- a/drivers/thermal/qcom/tsens.h
+++ b/drivers/thermal/qcom/tsens.h
@@ -585,7 +585,7 @@ struct tsens_priv {
 	struct dentry			*debug_root;
 	struct dentry			*debug;
 
-	struct tsens_sensor		sensor[];
+	struct tsens_sensor		sensor[] __counted_by(num_sensors);
 };
 
 /**
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 8a94e5a43c6d..d13d2f2e76c7 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -818,7 +818,7 @@ static void tty_update_time(struct tty_struct *tty, bool mtime)
 	spin_lock(&tty->files_lock);
 	list_for_each_entry(priv, &tty->tty_files, list) {
 		struct inode *inode = file_inode(priv->file);
-		struct timespec64 *time = mtime ? &inode->i_mtime : &inode->i_atime;
+		struct timespec64 time = mtime ? inode_get_mtime(inode) : inode_get_atime(inode);
 
 		/*
 		 * We only care if the two values differ in anything other than the
@@ -826,8 +826,12 @@ static void tty_update_time(struct tty_struct *tty, bool mtime)
 		 * the time of the tty device, otherwise it could be construded as a
 		 * security leak to let userspace know the exact timing of the tty.
 		 */
-		if ((sec ^ time->tv_sec) & ~7)
-			time->tv_sec = sec;
+		if ((sec ^ time.tv_sec) & ~7) {
+			if (mtime)
+				inode_set_mtime(inode, sec, 0);
+			else
+				inode_set_atime(inode, sec, 0);
+		}
 	}
 	spin_unlock(&tty->files_lock);
 }
diff --git a/drivers/usb/atm/usbatm.c b/drivers/usb/atm/usbatm.c
index 1cdb8758ae01..2da6615fbb6f 100644
--- a/drivers/usb/atm/usbatm.c
+++ b/drivers/usb/atm/usbatm.c
@@ -1018,7 +1018,8 @@ int usbatm_usb_probe(struct usb_interface *intf, const struct usb_device_id *id,
 	size_t size;
 
 	/* instance init */
-	size = struct_size(instance, urbs, num_rcv_urbs + num_snd_urbs);
+	size = struct_size(instance, urbs,
+			   size_add(num_rcv_urbs, num_snd_urbs));
 	instance = kzalloc(size, GFP_KERNEL);
 	if (!instance)
 		return -ENOMEM;
diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index 4f68f6ef3cc1..3beb6a862e80 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -2642,21 +2642,24 @@ static long usbdev_do_ioctl(struct file *file, unsigned int cmd,
 		snoop(&dev->dev, "%s: CONTROL\n", __func__);
 		ret = proc_control(ps, p);
 		if (ret >= 0)
-			inode->i_mtime = inode_set_ctime_current(inode);
+			inode_set_mtime_to_ts(inode,
+					      inode_set_ctime_current(inode));
 		break;
 
 	case USBDEVFS_BULK:
 		snoop(&dev->dev, "%s: BULK\n", __func__);
 		ret = proc_bulk(ps, p);
 		if (ret >= 0)
-			inode->i_mtime = inode_set_ctime_current(inode);
+			inode_set_mtime_to_ts(inode,
+					      inode_set_ctime_current(inode));
 		break;
 
 	case USBDEVFS_RESETEP:
 		snoop(&dev->dev, "%s: RESETEP\n", __func__);
 		ret = proc_resetep(ps, p);
 		if (ret >= 0)
-			inode->i_mtime = inode_set_ctime_current(inode);
+			inode_set_mtime_to_ts(inode,
+					      inode_set_ctime_current(inode));
 		break;
 
 	case USBDEVFS_RESET:
@@ -2668,7 +2671,8 @@ static long usbdev_do_ioctl(struct file *file, unsigned int cmd,
 		snoop(&dev->dev, "%s: CLEAR_HALT\n", __func__);
 		ret = proc_clearhalt(ps, p);
 		if (ret >= 0)
-			inode->i_mtime = inode_set_ctime_current(inode);
+			inode_set_mtime_to_ts(inode,
+					      inode_set_ctime_current(inode));
 		break;
 
 	case USBDEVFS_GETDRIVER:
@@ -2695,7 +2699,8 @@ static long usbdev_do_ioctl(struct file *file, unsigned int cmd,
 		snoop(&dev->dev, "%s: SUBMITURB\n", __func__);
 		ret = proc_submiturb(ps, p);
 		if (ret >= 0)
-			inode->i_mtime = inode_set_ctime_current(inode);
+			inode_set_mtime_to_ts(inode,
+					      inode_set_ctime_current(inode));
 		break;
 
 #ifdef CONFIG_COMPAT
@@ -2703,14 +2708,16 @@ static long usbdev_do_ioctl(struct file *file, unsigned int cmd,
 		snoop(&dev->dev, "%s: CONTROL32\n", __func__);
 		ret = proc_control_compat(ps, p);
 		if (ret >= 0)
-			inode->i_mtime = inode_set_ctime_current(inode);
+			inode_set_mtime_to_ts(inode,
+					      inode_set_ctime_current(inode));
 		break;
 
 	case USBDEVFS_BULK32:
 		snoop(&dev->dev, "%s: BULK32\n", __func__);
 		ret = proc_bulk_compat(ps, p);
 		if (ret >= 0)
-			inode->i_mtime = inode_set_ctime_current(inode);
+			inode_set_mtime_to_ts(inode,
+					      inode_set_ctime_current(inode));
 		break;
 
 	case USBDEVFS_DISCSIGNAL32:
@@ -2722,7 +2729,8 @@ static long usbdev_do_ioctl(struct file *file, unsigned int cmd,
 		snoop(&dev->dev, "%s: SUBMITURB32\n", __func__);
 		ret = proc_submiturb_compat(ps, p);
 		if (ret >= 0)
-			inode->i_mtime = inode_set_ctime_current(inode);
+			inode_set_mtime_to_ts(inode,
+					      inode_set_ctime_current(inode));
 		break;
 
 	case USBDEVFS_IOCTL32:
@@ -2804,7 +2812,7 @@ static long usbdev_do_ioctl(struct file *file, unsigned int cmd,
  done:
 	usb_unlock_device(dev);
 	if (ret >= 0)
-		inode->i_atime = current_time(inode);
+		inode_set_atime_to_ts(inode, current_time(inode));
 	return ret;
 }
 
diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index 6e9ef35a43a7..efe3e3b85769 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -202,7 +202,7 @@ struct ffs_epfile {
 struct ffs_buffer {
 	size_t length;
 	char *data;
-	char storage[];
+	char storage[] __counted_by(length);
 };
 
 /*  ffs_io_data structure ***************************************************/
@@ -1383,8 +1383,8 @@ ffs_sb_make_inode(struct super_block *sb, void *data,
 		inode->i_mode    = perms->mode;
 		inode->i_uid     = perms->uid;
 		inode->i_gid     = perms->gid;
-		inode->i_atime   = ts;
-		inode->i_mtime   = ts;
+		inode_set_atime_to_ts(inode, ts);
+		inode_set_mtime_to_ts(inode, ts);
 		inode->i_private = data;
 		if (fops)
 			inode->i_fop = fops;
diff --git a/drivers/usb/gadget/function/f_midi.c b/drivers/usb/gadget/function/f_midi.c
index 2d02f25f9597..5335845d697b 100644
--- a/drivers/usb/gadget/function/f_midi.c
+++ b/drivers/usb/gadget/function/f_midi.c
@@ -99,7 +99,7 @@ struct f_midi {
 	unsigned int in_last_port;
 	unsigned char free_ref;
 
-	struct gmidi_in_port	in_ports_array[/* in_ports */];
+	struct gmidi_in_port	in_ports_array[] __counted_by(in_ports);
 };
 
 static inline struct f_midi *func_to_midi(struct usb_function *f)
@@ -1349,6 +1349,7 @@ static struct usb_function *f_midi_alloc(struct usb_function_instance *fi)
 		status = -ENOMEM;
 		goto setup_fail;
 	}
+	midi->in_ports = opts->in_ports;
 
 	for (i = 0; i < opts->in_ports; i++)
 		midi->in_ports_array[i].cable = i;
@@ -1359,7 +1360,6 @@ static struct usb_function *f_midi_alloc(struct usb_function_instance *fi)
 		status = -ENOMEM;
 		goto midi_free;
 	}
-	midi->in_ports = opts->in_ports;
 	midi->out_ports = opts->out_ports;
 	midi->index = opts->index;
 	midi->buflen = opts->buflen;
diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
index ce9e31f3d26b..cdc0926100fd 100644
--- a/drivers/usb/gadget/legacy/inode.c
+++ b/drivers/usb/gadget/legacy/inode.c
@@ -1969,7 +1969,7 @@ gadgetfs_make_inode (struct super_block *sb,
 		inode->i_mode = mode;
 		inode->i_uid = make_kuid(&init_user_ns, default_uid);
 		inode->i_gid = make_kgid(&init_user_ns, default_gid);
-		inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+		simple_inode_init_ts(inode);
 		inode->i_private = data;
 		inode->i_fop = fops;
 	}
diff --git a/drivers/usb/host/ohci.h b/drivers/usb/host/ohci.h
index aac6285b37f8..631dda6174b4 100644
--- a/drivers/usb/host/ohci.h
+++ b/drivers/usb/host/ohci.h
@@ -337,7 +337,7 @@ typedef struct urb_priv {
 	u16			length;		// # tds in this request
 	u16			td_cnt;		// tds already serviced
 	struct list_head	pending;
-	struct td		*td[];		// all TDs in this request
+	struct td		*td[] __counted_by(length); // all TDs in this request
 
 } urb_priv_t;
 
diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
index 5df370482521..3d0451f5880a 100644
--- a/drivers/usb/host/xhci.h
+++ b/drivers/usb/host/xhci.h
@@ -1666,7 +1666,7 @@ struct xhci_scratchpad {
 struct urb_priv {
 	int	num_tds;
 	int	num_tds_done;
-	struct	xhci_td	td[];
+	struct	xhci_td	td[] __counted_by(num_tds);
 };
 
 /*
diff --git a/drivers/virt/acrn/acrn_drv.h b/drivers/virt/acrn/acrn_drv.h
index 5663c17ad37c..fb8438094f6f 100644
--- a/drivers/virt/acrn/acrn_drv.h
+++ b/drivers/virt/acrn/acrn_drv.h
@@ -60,7 +60,7 @@ struct vm_memory_region_batch {
 	u16			   reserved[3];
 	u32			   regions_num;
 	u64			   regions_gpa;
-	struct vm_memory_region_op regions_op[];
+	struct vm_memory_region_op regions_op[] __counted_by(regions_num);
 };
 
 /**
diff --git a/drivers/virt/acrn/hsm.c b/drivers/virt/acrn/hsm.c
index 423ea888d79a..c24036c4e51e 100644
--- a/drivers/virt/acrn/hsm.c
+++ b/drivers/virt/acrn/hsm.c
@@ -447,7 +447,7 @@ static ssize_t remove_cpu_store(struct device *dev,
 	if (cpu_online(cpu))
 		remove_cpu(cpu);
 
-	lapicid = cpu_data(cpu).apicid;
+	lapicid = cpu_data(cpu).topo.apicid;
 	dev_dbg(dev, "Try to remove cpu %lld with lapicid %lld\n", cpu, lapicid);
 	ret = hcall_sos_remove_cpu(lapicid);
 	if (ret < 0) {
diff --git a/drivers/virt/acrn/mm.c b/drivers/virt/acrn/mm.c
index b4ad8d452e9a..fa5d9ca6be57 100644
--- a/drivers/virt/acrn/mm.c
+++ b/drivers/virt/acrn/mm.c
@@ -250,11 +250,11 @@ int acrn_vm_ram_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap)
 		ret = -ENOMEM;
 		goto unmap_kernel_map;
 	}
+	regions_info->regions_num = nr_regions;
 
 	/* Fill each vm_memory_region_op */
 	vm_region = regions_info->regions_op;
 	regions_info->vmid = vm->vmid;
-	regions_info->regions_num = nr_regions;
 	regions_info->regions_gpa = virt_to_phys(vm_region);
 	user_vm_pa = memmap->user_vm_pa;
 	i = 0;
diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c
index ad9fe51d3fb3..655775db7caf 100644
--- a/drivers/xen/events/events_fifo.c
+++ b/drivers/xen/events/events_fifo.c
@@ -226,21 +226,20 @@ static bool evtchn_fifo_is_masked(evtchn_port_t port)
  */
 static bool clear_masked_cond(volatile event_word_t *word)
 {
-	event_word_t new, old, w;
+	event_word_t new, old;
 
-	w = *word;
+	old = *word;
 
 	do {
-		if (!(w & (1 << EVTCHN_FIFO_MASKED)))
+		if (!(old & (1 << EVTCHN_FIFO_MASKED)))
 			return true;
 
-		if (w & (1 << EVTCHN_FIFO_PENDING))
+		if (old & (1 << EVTCHN_FIFO_PENDING))
 			return false;
 
-		old = w & ~(1 << EVTCHN_FIFO_BUSY);
+		old = old & ~(1 << EVTCHN_FIFO_BUSY);
 		new = old & ~(1 << EVTCHN_FIFO_MASKED);
-		w = sync_cmpxchg(word, old, new);
-	} while (w != old);
+	} while (!sync_try_cmpxchg(word, &old, new));
 
 	return true;
 }
@@ -259,17 +258,16 @@ static void evtchn_fifo_unmask(evtchn_port_t port)
 
 static uint32_t clear_linked(volatile event_word_t *word)
 {
-	event_word_t new, old, w;
+	event_word_t new, old;
 
-	w = *word;
+	old = *word;
 
 	do {
-		old = w;
-		new = (w & ~((1 << EVTCHN_FIFO_LINKED)
-			     | EVTCHN_FIFO_LINK_MASK));
-	} while ((w = sync_cmpxchg(word, old, new)) != old);
+		new = (old & ~((1 << EVTCHN_FIFO_LINKED)
+			       | EVTCHN_FIFO_LINK_MASK));
+	} while (!sync_try_cmpxchg(word, &old, new));
 
-	return w & EVTCHN_FIFO_LINK_MASK;
+	return old & EVTCHN_FIFO_LINK_MASK;
 }
 
 static void consume_one_event(unsigned cpu, struct evtchn_loop_ctrl *ctrl,
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 35659bf70746..04a6b470b15d 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -427,16 +427,14 @@ EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
 
 static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref)
 {
-	u16 flags, nflags;
-	u16 *pflags;
+	u16 *pflags = &gnttab_shared.v1[ref].flags;
+	u16 flags;
 
-	pflags = &gnttab_shared.v1[ref].flags;
-	nflags = *pflags;
+	flags = *pflags;
 	do {
-		flags = nflags;
 		if (flags & (GTF_reading|GTF_writing))
 			return 0;
-	} while ((nflags = sync_cmpxchg(pflags, flags, 0)) != flags);
+	} while (!sync_try_cmpxchg(pflags, &flags, 0));
 
 	return 1;
 }
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 0d28ecf668d0..b845ee18a80b 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -260,7 +260,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
 	inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
 	inode->i_blocks = 0;
 	inode->i_rdev = rdev;
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	inode->i_mapping->a_ops = &v9fs_addr_operations;
 	inode->i_private = NULL;
 
@@ -1150,8 +1150,8 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 
 	set_nlink(inode, 1);
 
-	inode->i_atime.tv_sec = stat->atime;
-	inode->i_mtime.tv_sec = stat->mtime;
+	inode_set_atime(inode, stat->atime, 0);
+	inode_set_mtime(inode, stat->mtime, 0);
 	inode_set_ctime(inode, stat->mtime, 0);
 
 	inode->i_uid = v9ses->dfltuid;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 1312f68965ac..c7319af2f471 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -641,10 +641,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
 	struct v9fs_inode *v9inode = V9FS_I(inode);
 
 	if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
-		inode->i_atime.tv_sec = stat->st_atime_sec;
-		inode->i_atime.tv_nsec = stat->st_atime_nsec;
-		inode->i_mtime.tv_sec = stat->st_mtime_sec;
-		inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
+		inode_set_atime(inode, stat->st_atime_sec,
+				stat->st_atime_nsec);
+		inode_set_mtime(inode, stat->st_mtime_sec,
+				stat->st_mtime_nsec);
 		inode_set_ctime(inode, stat->st_ctime_sec,
 				stat->st_ctime_nsec);
 		inode->i_uid = stat->st_uid;
@@ -660,12 +660,12 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
 		inode->i_blocks = stat->st_blocks;
 	} else {
 		if (stat->st_result_mask & P9_STATS_ATIME) {
-			inode->i_atime.tv_sec = stat->st_atime_sec;
-			inode->i_atime.tv_nsec = stat->st_atime_nsec;
+			inode_set_atime(inode, stat->st_atime_sec,
+					stat->st_atime_nsec);
 		}
 		if (stat->st_result_mask & P9_STATS_MTIME) {
-			inode->i_mtime.tv_sec = stat->st_mtime_sec;
-			inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
+			inode_set_mtime(inode, stat->st_mtime_sec,
+					stat->st_mtime_nsec);
 		}
 		if (stat->st_result_mask & P9_STATS_CTIME) {
 			inode_set_ctime(inode, stat->st_ctime_sec,
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index e00cf8109b3f..053d1cef6e13 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -162,27 +162,27 @@ static int v9fs_xattr_handler_set(const struct xattr_handler *handler,
 	return v9fs_xattr_set(dentry, full_name, value, size, flags);
 }
 
-static struct xattr_handler v9fs_xattr_user_handler = {
+static const struct xattr_handler v9fs_xattr_user_handler = {
 	.prefix	= XATTR_USER_PREFIX,
 	.get	= v9fs_xattr_handler_get,
 	.set	= v9fs_xattr_handler_set,
 };
 
-static struct xattr_handler v9fs_xattr_trusted_handler = {
+static const struct xattr_handler v9fs_xattr_trusted_handler = {
 	.prefix	= XATTR_TRUSTED_PREFIX,
 	.get	= v9fs_xattr_handler_get,
 	.set	= v9fs_xattr_handler_set,
 };
 
 #ifdef CONFIG_9P_FS_SECURITY
-static struct xattr_handler v9fs_xattr_security_handler = {
+static const struct xattr_handler v9fs_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.get	= v9fs_xattr_handler_get,
 	.set	= v9fs_xattr_handler_set,
 };
 #endif
 
-const struct xattr_handler *v9fs_xattr_handlers[] = {
+const struct xattr_handler * const v9fs_xattr_handlers[] = {
 	&v9fs_xattr_user_handler,
 	&v9fs_xattr_trusted_handler,
 #ifdef CONFIG_9P_FS_SECURITY
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index b5636e544c8a..3ad5a802352a 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -10,7 +10,7 @@
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
-extern const struct xattr_handler *v9fs_xattr_handlers[];
+extern const struct xattr_handler * const v9fs_xattr_handlers[];
 
 ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
 			   void *buffer, size_t buffer_size);
diff --git a/fs/Kconfig b/fs/Kconfig
index aa7e03cc1941..0d6cb927872a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -48,6 +48,7 @@ source "fs/ocfs2/Kconfig"
 source "fs/btrfs/Kconfig"
 source "fs/nilfs2/Kconfig"
 source "fs/f2fs/Kconfig"
+source "fs/bcachefs/Kconfig"
 source "fs/zonefs/Kconfig"
 
 endif # BLOCK
diff --git a/fs/Makefile b/fs/Makefile
index f9541f40be4e..75522f88e763 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -123,6 +123,7 @@ obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
 obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)           += gfs2/
 obj-$(CONFIG_F2FS_FS)		+= f2fs/
+obj-$(CONFIG_BCACHEFS_FS)	+= bcachefs/
 obj-$(CONFIG_CEPH_FS)		+= ceph/
 obj-$(CONFIG_PSTORE)		+= pstore/
 obj-$(CONFIG_EFIVAR_FS)		+= efivarfs/
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 20963002578a..3081edb09e46 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -242,6 +242,7 @@ struct inode *
 adfs_iget(struct super_block *sb, struct object_info *obj)
 {
 	struct inode *inode;
+	struct timespec64 ts;
 
 	inode = new_inode(sb);
 	if (!inode)
@@ -268,9 +269,10 @@ adfs_iget(struct super_block *sb, struct object_info *obj)
 	ADFS_I(inode)->attr      = obj->attr;
 
 	inode->i_mode	 = adfs_atts2mode(sb, inode);
-	adfs_adfs2unix_time(&inode->i_mtime, inode);
-	inode->i_atime = inode->i_mtime;
-	inode_set_ctime_to_ts(inode, inode->i_mtime);
+	adfs_adfs2unix_time(&ts, inode);
+	inode_set_atime_to_ts(inode, ts);
+	inode_set_mtime_to_ts(inode, ts);
+	inode_set_ctime_to_ts(inode, ts);
 
 	if (S_ISDIR(inode->i_mode)) {
 		inode->i_op	= &adfs_dir_inode_operations;
@@ -321,7 +323,8 @@ adfs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
 
 	if (ia_valid & ATTR_MTIME && adfs_inode_is_stamped(inode)) {
 		adfs_unix2adfs_time(inode, &attr->ia_mtime);
-		adfs_adfs2unix_time(&inode->i_mtime, inode);
+		adfs_adfs2unix_time(&attr->ia_mtime, inode);
+		inode_set_mtime_to_ts(inode, attr->ia_mtime);
 	}
 
 	/*
@@ -329,7 +332,7 @@ adfs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
 	 * have the ability to represent them in our filesystem?
 	 */
 	if (ia_valid & ATTR_ATIME)
-		inode->i_atime = attr->ia_atime;
+		inode_set_atime_to_ts(inode, attr->ia_atime);
 	if (ia_valid & ATTR_CTIME)
 		inode_set_ctime_to_ts(inode, attr->ia_ctime);
 	if (ia_valid & ATTR_MODE) {
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 7ba93efc1143..fd669daa4e7b 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -60,7 +60,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh)
 	mark_buffer_dirty_inode(dir_bh, dir);
 	affs_brelse(dir_bh);
 
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	inode_inc_iversion(dir);
 	mark_inode_dirty(dir);
 
@@ -114,7 +114,7 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh)
 
 	affs_brelse(bh);
 
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	inode_inc_iversion(dir);
 	mark_inode_dirty(dir);
 
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 060746c63151..0210df8d3500 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -149,13 +149,9 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
 		break;
 	}
 
-	inode->i_mtime.tv_sec = inode->i_atime.tv_sec =
-		inode_set_ctime(inode,
-				(be32_to_cpu(tail->change.days) * 86400LL +
-				 be32_to_cpu(tail->change.mins) * 60 +
-				 be32_to_cpu(tail->change.ticks) / 50 + AFFS_EPOCH_DELTA)
-				+ sys_tz.tz_minuteswest * 60, 0).tv_sec;
-	inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = 0;
+	inode_set_mtime(inode,
+			inode_set_atime(inode, inode_set_ctime(inode, (be32_to_cpu(tail->change.days) * 86400LL + be32_to_cpu(tail->change.mins) * 60 + be32_to_cpu(tail->change.ticks) / 50 + AFFS_EPOCH_DELTA) + sys_tz.tz_minuteswest * 60, 0).tv_sec, 0).tv_sec,
+			0);
 	affs_brelse(bh);
 	unlock_new_inode(inode);
 	return inode;
@@ -187,12 +183,13 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	}
 	tail = AFFS_TAIL(sb, bh);
 	if (tail->stype == cpu_to_be32(ST_ROOT)) {
-		affs_secs_to_datestamp(inode->i_mtime.tv_sec,
+		affs_secs_to_datestamp(inode_get_mtime_sec(inode),
 				       &AFFS_ROOT_TAIL(sb, bh)->root_change);
 	} else {
 		tail->protect = cpu_to_be32(AFFS_I(inode)->i_protect);
 		tail->size = cpu_to_be32(inode->i_size);
-		affs_secs_to_datestamp(inode->i_mtime.tv_sec, &tail->change);
+		affs_secs_to_datestamp(inode_get_mtime_sec(inode),
+				       &tail->change);
 		if (!(inode->i_ino == AFFS_SB(sb)->s_root_block)) {
 			uid = i_uid_read(inode);
 			gid = i_gid_read(inode);
@@ -314,7 +311,7 @@ affs_new_inode(struct inode *dir)
 	inode->i_gid     = current_fsgid();
 	inode->i_ino     = block;
 	set_nlink(inode, 1);
-	inode->i_mtime   = inode->i_atime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	atomic_set(&AFFS_I(inode)->i_opencnt, 0);
 	AFFS_I(inode)->i_blkcnt = 0;
 	AFFS_I(inode)->i_lc = NULL;
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index 95bcbd7654d1..4d04ef2d3ae7 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -88,7 +88,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
 	set_nlink(inode, 2);
 	inode->i_uid		= GLOBAL_ROOT_UID;
 	inode->i_gid		= GLOBAL_ROOT_GID;
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	inode->i_blocks		= 0;
 	inode->i_generation	= 0;
 
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 1c794a1896aa..78efc9719349 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -91,8 +91,8 @@ static int afs_inode_init_from_status(struct afs_operation *op,
 
 	t = status->mtime_client;
 	inode_set_ctime_to_ts(inode, t);
-	inode->i_mtime = t;
-	inode->i_atime = t;
+	inode_set_mtime_to_ts(inode, t);
+	inode_set_atime_to_ts(inode, t);
 	inode->i_flags |= S_NOATIME;
 	inode->i_uid = make_kuid(&init_user_ns, status->owner);
 	inode->i_gid = make_kgid(&init_user_ns, status->group);
@@ -204,7 +204,7 @@ static void afs_apply_status(struct afs_operation *op,
 	}
 
 	t = status->mtime_client;
-	inode->i_mtime = t;
+	inode_set_mtime_to_ts(inode, t);
 	if (vp->update_ctime)
 		inode_set_ctime_to_ts(inode, op->ctime);
 
@@ -253,7 +253,7 @@ static void afs_apply_status(struct afs_operation *op,
 		if (change_size) {
 			afs_set_i_size(vnode, status->size);
 			inode_set_ctime_to_ts(inode, t);
-			inode->i_atime = t;
+			inode_set_atime_to_ts(inode, t);
 		}
 	}
 }
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index da73b97e19a9..c9cef3782b4a 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -87,7 +87,7 @@ struct afs_addr_list {
 	enum dns_lookup_status	status:8;
 	unsigned long		failed;		/* Mask of addrs that failed locally/ICMP */
 	unsigned long		responded;	/* Mask of addrs that responded */
-	struct sockaddr_rxrpc	addrs[];
+	struct sockaddr_rxrpc	addrs[] __counted_by(max_addrs);
 #define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8))
 };
 
@@ -705,7 +705,7 @@ struct afs_permits {
 	refcount_t		usage;
 	unsigned short		nr_permits;	/* Number of records */
 	bool			invalidated;	/* Invalidated due to key change */
-	struct afs_permit	permits[];	/* List of permits sorted by key pointer */
+	struct afs_permit	permits[] __counted_by(nr_permits);	/* List of permits sorted by key pointer */
 };
 
 /*
@@ -1541,7 +1541,7 @@ int afs_launder_folio(struct folio *);
 /*
  * xattr.c
  */
-extern const struct xattr_handler *afs_xattr_handlers[];
+extern const struct xattr_handler * const afs_xattr_handlers[];
 
 /*
  * yfsclient.c
diff --git a/fs/afs/write.c b/fs/afs/write.c
index e1c45341719b..4a168781936b 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -424,7 +424,7 @@ try_next_key:
 
 	op->store.write_iter = iter;
 	op->store.i_size = max(pos + size, vnode->netfs.remote_i_size);
-	op->mtime = vnode->netfs.inode.i_mtime;
+	op->mtime = inode_get_mtime(&vnode->netfs.inode);
 
 	afs_wait_for_operation(op);
 
diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c
index 9048d8ccc715..64b2c0224f62 100644
--- a/fs/afs/xattr.c
+++ b/fs/afs/xattr.c
@@ -353,7 +353,7 @@ static const struct xattr_handler afs_xattr_afs_volume_handler = {
 	.get	= afs_xattr_get_volume,
 };
 
-const struct xattr_handler *afs_xattr_handlers[] = {
+const struct xattr_handler * const afs_xattr_handlers[] = {
 	&afs_xattr_afs_acl_handler,
 	&afs_xattr_afs_cell_handler,
 	&afs_xattr_afs_fid_handler,
diff --git a/fs/attr.c b/fs/attr.c
index a8ae5f6d9b16..bdf5deb06ea9 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -308,9 +308,9 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode,
 	i_uid_update(idmap, attr, inode);
 	i_gid_update(idmap, attr, inode);
 	if (ia_valid & ATTR_ATIME)
-		inode->i_atime = attr->ia_atime;
+		inode_set_atime_to_ts(inode, attr->ia_atime);
 	if (ia_valid & ATTR_MTIME)
-		inode->i_mtime = attr->ia_mtime;
+		inode_set_mtime_to_ts(inode, attr->ia_mtime);
 	if (ia_valid & ATTR_CTIME)
 		inode_set_ctime_to_ts(inode, attr->ia_ctime);
 	if (ia_valid & ATTR_MODE) {
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index d5a44fa88acf..8c1d587b3eef 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -25,6 +25,8 @@
 #include <linux/completion.h>
 #include <linux/file.h>
 #include <linux/magic.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
 
 /* This is the range of ioctl() numbers we claim as ours */
 #define AUTOFS_IOC_FIRST     AUTOFS_IOC_READY
@@ -205,20 +207,34 @@ static inline void managed_dentry_clear_managed(struct dentry *dentry)
 
 /* Initializing function */
 
-int autofs_fill_super(struct super_block *, void *, int);
+extern const struct fs_parameter_spec autofs_param_specs[];
+int autofs_init_fs_context(struct fs_context *fc);
 struct autofs_info *autofs_new_ino(struct autofs_sb_info *);
 void autofs_clean_ino(struct autofs_info *);
 
-static inline int autofs_prepare_pipe(struct file *pipe)
+static inline int autofs_check_pipe(struct file *pipe)
 {
 	if (!(pipe->f_mode & FMODE_CAN_WRITE))
 		return -EINVAL;
 	if (!S_ISFIFO(file_inode(pipe)->i_mode))
 		return -EINVAL;
+	return 0;
+}
+
+static inline void autofs_set_packet_pipe_flags(struct file *pipe)
+{
 	/* We want a packet pipe */
 	pipe->f_flags |= O_DIRECT;
 	/* We don't expect -EAGAIN */
 	pipe->f_flags &= ~O_NONBLOCK;
+}
+
+static inline int autofs_prepare_pipe(struct file *pipe)
+{
+	int ret = autofs_check_pipe(pipe);
+	if (ret < 0)
+		return ret;
+	autofs_set_packet_pipe_flags(pipe);
 	return 0;
 }
 
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
index d3f55e874338..b5e4dfa04ed0 100644
--- a/fs/autofs/init.c
+++ b/fs/autofs/init.c
@@ -7,16 +7,11 @@
 #include <linux/init.h>
 #include "autofs_i.h"
 
-static struct dentry *autofs_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
-{
-	return mount_nodev(fs_type, flags, data, autofs_fill_super);
-}
-
 struct file_system_type autofs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "autofs",
-	.mount		= autofs_mount,
+	.init_fs_context = autofs_init_fs_context,
+	.parameters	= autofs_param_specs,
 	.kill_sb	= autofs_kill_sb,
 };
 MODULE_ALIAS_FS("autofs");
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 2b49662ed237..a5083d447a62 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -6,7 +6,6 @@
 
 #include <linux/seq_file.h>
 #include <linux/pagemap.h>
-#include <linux/parser.h>
 
 #include "autofs_i.h"
 
@@ -110,189 +109,179 @@ static const struct super_operations autofs_sops = {
 	.evict_inode	= autofs_evict_inode,
 };
 
-enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
-	Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire,
-	Opt_ignore};
-
-static const match_table_t tokens = {
-	{Opt_fd, "fd=%u"},
-	{Opt_uid, "uid=%u"},
-	{Opt_gid, "gid=%u"},
-	{Opt_pgrp, "pgrp=%u"},
-	{Opt_minproto, "minproto=%u"},
-	{Opt_maxproto, "maxproto=%u"},
-	{Opt_indirect, "indirect"},
-	{Opt_direct, "direct"},
-	{Opt_offset, "offset"},
-	{Opt_strictexpire, "strictexpire"},
-	{Opt_ignore, "ignore"},
-	{Opt_err, NULL}
+enum {
+	Opt_direct,
+	Opt_fd,
+	Opt_gid,
+	Opt_ignore,
+	Opt_indirect,
+	Opt_maxproto,
+	Opt_minproto,
+	Opt_offset,
+	Opt_pgrp,
+	Opt_strictexpire,
+	Opt_uid,
 };
 
-static int parse_options(char *options,
-			 struct inode *root, int *pgrp, bool *pgrp_set,
-			 struct autofs_sb_info *sbi)
+const struct fs_parameter_spec autofs_param_specs[] = {
+	fsparam_flag	("direct",		Opt_direct),
+	fsparam_fd	("fd",			Opt_fd),
+	fsparam_u32	("gid",			Opt_gid),
+	fsparam_flag	("ignore",		Opt_ignore),
+	fsparam_flag	("indirect",		Opt_indirect),
+	fsparam_u32	("maxproto",		Opt_maxproto),
+	fsparam_u32	("minproto",		Opt_minproto),
+	fsparam_flag	("offset",		Opt_offset),
+	fsparam_u32	("pgrp",		Opt_pgrp),
+	fsparam_flag	("strictexpire",	Opt_strictexpire),
+	fsparam_u32	("uid",			Opt_uid),
+	{}
+};
+
+struct autofs_fs_context {
+	kuid_t	uid;
+	kgid_t	gid;
+	int	pgrp;
+	bool	pgrp_set;
+};
+
+/*
+ * Open the fd.  We do it here rather than in get_tree so that it's done in the
+ * context of the system call that passed the data and not the one that
+ * triggered the superblock creation, lest the fd gets reassigned.
+ */
+static int autofs_parse_fd(struct fs_context *fc, struct autofs_sb_info *sbi,
+			   struct fs_parameter *param,
+			   struct fs_parse_result *result)
 {
-	char *p;
-	substring_t args[MAX_OPT_ARGS];
-	int option;
-	int pipefd = -1;
-	kuid_t uid;
-	kgid_t gid;
+	struct file *pipe;
+	int ret;
 
-	root->i_uid = current_uid();
-	root->i_gid = current_gid();
+	if (param->type == fs_value_is_file) {
+		/* came through the new api */
+		pipe = param->file;
+		param->file = NULL;
+	} else {
+		pipe = fget(result->uint_32);
+	}
+	if (!pipe) {
+		errorf(fc, "could not open pipe file descriptor");
+		return -EBADF;
+	}
 
-	sbi->min_proto = AUTOFS_MIN_PROTO_VERSION;
-	sbi->max_proto = AUTOFS_MAX_PROTO_VERSION;
+	ret = autofs_check_pipe(pipe);
+	if (ret < 0) {
+		errorf(fc, "Invalid/unusable pipe");
+		/* We own the ref on both paths (param->file or fget). */
+		fput(pipe);
+		return -EBADF;
+	}
 
-	sbi->pipefd = -1;
+	autofs_set_packet_pipe_flags(pipe);
 
-	if (!options)
-		return 1;
-
-	while ((p = strsep(&options, ",")) != NULL) {
-		int token;
-
-		if (!*p)
-			continue;
-
-		token = match_token(p, tokens, args);
-		switch (token) {
-		case Opt_fd:
-			if (match_int(args, &pipefd))
-				return 1;
-			sbi->pipefd = pipefd;
-			break;
-		case Opt_uid:
-			if (match_int(args, &option))
-				return 1;
-			uid = make_kuid(current_user_ns(), option);
-			if (!uid_valid(uid))
-				return 1;
-			root->i_uid = uid;
-			break;
-		case Opt_gid:
-			if (match_int(args, &option))
-				return 1;
-			gid = make_kgid(current_user_ns(), option);
-			if (!gid_valid(gid))
-				return 1;
-			root->i_gid = gid;
-			break;
-		case Opt_pgrp:
-			if (match_int(args, &option))
-				return 1;
-			*pgrp = option;
-			*pgrp_set = true;
-			break;
-		case Opt_minproto:
-			if (match_int(args, &option))
-				return 1;
-			sbi->min_proto = option;
-			break;
-		case Opt_maxproto:
-			if (match_int(args, &option))
-				return 1;
-			sbi->max_proto = option;
-			break;
-		case Opt_indirect:
-			set_autofs_type_indirect(&sbi->type);
-			break;
-		case Opt_direct:
-			set_autofs_type_direct(&sbi->type);
-			break;
-		case Opt_offset:
-			set_autofs_type_offset(&sbi->type);
-			break;
-		case Opt_strictexpire:
-			sbi->flags |= AUTOFS_SBI_STRICTEXPIRE;
-			break;
-		case Opt_ignore:
-			sbi->flags |= AUTOFS_SBI_IGNORE;
-			break;
-		default:
-			return 1;
-		}
+	if (sbi->pipe)
+		fput(sbi->pipe);
+
+	sbi->pipefd = result->uint_32;
+	sbi->pipe = pipe;
+
+	return 0;
+}
+
+static int autofs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+	struct autofs_fs_context *ctx = fc->fs_private;
+	struct autofs_sb_info *sbi = fc->s_fs_info;
+	struct fs_parse_result result;
+	kuid_t uid;
+	kgid_t gid;
+	int opt;
+
+	opt = fs_parse(fc, autofs_param_specs, param, &result);
+	if (opt < 0)
+		return opt;
+
+	switch (opt) {
+	case Opt_fd:
+		return autofs_parse_fd(fc, sbi, param, &result);
+	case Opt_uid:
+		uid = make_kuid(current_user_ns(), result.uint_32);
+		if (!uid_valid(uid))
+			return invalfc(fc, "Invalid uid");
+		ctx->uid = uid;
+		break;
+	case Opt_gid:
+		gid = make_kgid(current_user_ns(), result.uint_32);
+		if (!gid_valid(gid))
+			return invalfc(fc, "Invalid gid");
+		ctx->gid = gid;
+		break;
+	case Opt_pgrp:
+		ctx->pgrp = result.uint_32;
+		ctx->pgrp_set = true;
+		break;
+	case Opt_minproto:
+		sbi->min_proto = result.uint_32;
+		break;
+	case Opt_maxproto:
+		sbi->max_proto = result.uint_32;
+		break;
+	case Opt_indirect:
+		set_autofs_type_indirect(&sbi->type);
+		break;
+	case Opt_direct:
+		set_autofs_type_direct(&sbi->type);
+		break;
+	case Opt_offset:
+		set_autofs_type_offset(&sbi->type);
+		break;
+	case Opt_strictexpire:
+		sbi->flags |= AUTOFS_SBI_STRICTEXPIRE;
+		break;
+	case Opt_ignore:
+		sbi->flags |= AUTOFS_SBI_IGNORE;
 	}
-	return (sbi->pipefd < 0);
+
+	return 0;
 }
 
-int autofs_fill_super(struct super_block *s, void *data, int silent)
+static struct autofs_sb_info *autofs_alloc_sbi(void)
 {
-	struct inode *root_inode;
-	struct dentry *root;
-	struct file *pipe;
 	struct autofs_sb_info *sbi;
-	struct autofs_info *ino;
-	int pgrp = 0;
-	bool pgrp_set = false;
-	int ret = -EINVAL;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
-		return -ENOMEM;
-	pr_debug("starting up, sbi = %p\n", sbi);
+		return NULL;
 
-	s->s_fs_info = sbi;
 	sbi->magic = AUTOFS_SBI_MAGIC;
-	sbi->pipefd = -1;
-	sbi->pipe = NULL;
-	sbi->exp_timeout = 0;
-	sbi->oz_pgrp = NULL;
-	sbi->sb = s;
-	sbi->version = 0;
-	sbi->sub_version = 0;
 	sbi->flags = AUTOFS_SBI_CATATONIC;
+	sbi->min_proto = AUTOFS_MIN_PROTO_VERSION;
+	sbi->max_proto = AUTOFS_MAX_PROTO_VERSION;
+	sbi->pipefd = -1;
+
 	set_autofs_type_indirect(&sbi->type);
-	sbi->min_proto = 0;
-	sbi->max_proto = 0;
 	mutex_init(&sbi->wq_mutex);
 	mutex_init(&sbi->pipe_mutex);
 	spin_lock_init(&sbi->fs_lock);
-	sbi->queues = NULL;
 	spin_lock_init(&sbi->lookup_lock);
 	INIT_LIST_HEAD(&sbi->active_list);
 	INIT_LIST_HEAD(&sbi->expiring_list);
-	s->s_blocksize = 1024;
-	s->s_blocksize_bits = 10;
-	s->s_magic = AUTOFS_SUPER_MAGIC;
-	s->s_op = &autofs_sops;
-	s->s_d_op = &autofs_dentry_operations;
-	s->s_time_gran = 1;
 
-	/*
-	 * Get the root inode and dentry, but defer checking for errors.
-	 */
-	ino = autofs_new_ino(sbi);
-	if (!ino) {
-		ret = -ENOMEM;
-		goto fail_free;
-	}
-	root_inode = autofs_get_inode(s, S_IFDIR | 0755);
-	root = d_make_root(root_inode);
-	if (!root) {
-		ret = -ENOMEM;
-		goto fail_ino;
-	}
-	pipe = NULL;
-
-	root->d_fsdata = ino;
+	return sbi;
+}
 
-	/* Can this call block? */
-	if (parse_options(data, root_inode, &pgrp, &pgrp_set, sbi)) {
-		pr_err("called with bogus options\n");
-		goto fail_dput;
-	}
+static int autofs_validate_protocol(struct fs_context *fc)
+{
+	struct autofs_sb_info *sbi = fc->s_fs_info;
 
 	/* Test versions first */
 	if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
 	    sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) {
-		pr_err("kernel does not match daemon version "
+		errorf(fc, "kernel does not match daemon version "
 		       "daemon (%d, %d) kernel (%d, %d)\n",
 		       sbi->min_proto, sbi->max_proto,
 		       AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
-		goto fail_dput;
+		return -EINVAL;
 	}
 
 	/* Establish highest kernel protocol version */
@@ -300,13 +289,62 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
 		sbi->version = AUTOFS_MAX_PROTO_VERSION;
 	else
 		sbi->version = sbi->max_proto;
-	sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
 
-	if (pgrp_set) {
-		sbi->oz_pgrp = find_get_pid(pgrp);
+	switch (sbi->version) {
+	case 4:
+		sbi->sub_version = 7;
+		break;
+	case 5:
+		sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
+		break;
+	default:
+		sbi->sub_version = 0;
+	}
+
+	return 0;
+}
+
+static int autofs_fill_super(struct super_block *s, struct fs_context *fc)
+{
+	struct autofs_fs_context *ctx = fc->fs_private;
+	struct autofs_sb_info *sbi = s->s_fs_info;
+	struct inode *root_inode;
+	struct dentry *root;
+	struct autofs_info *ino;
+	int ret = -ENOMEM;
+
+	pr_debug("starting up, sbi = %p\n", sbi);
+
+	sbi->sb = s;
+	s->s_blocksize = 1024;
+	s->s_blocksize_bits = 10;
+	s->s_magic = AUTOFS_SUPER_MAGIC;
+	s->s_op = &autofs_sops;
+	s->s_d_op = &autofs_dentry_operations;
+	s->s_time_gran = 1;
+
+	/*
+	 * Get the root inode and dentry; ret is -ENOMEM if either is missing.
+	 */
+	ino = autofs_new_ino(sbi);
+	if (!ino)
+		goto fail;
+
+	root_inode = autofs_get_inode(s, S_IFDIR | 0755);
+	if (!root_inode)
+		goto fail_ino;
+	root_inode->i_uid = ctx->uid;
+	root_inode->i_gid = ctx->gid;
+	root = d_make_root(root_inode);
+	if (!root)
+		goto fail_ino;
+	root->d_fsdata = ino;
+
+	if (ctx->pgrp_set) {
+		sbi->oz_pgrp = find_get_pid(ctx->pgrp);
 		if (!sbi->oz_pgrp) {
-			pr_err("could not find process group %d\n",
-				pgrp);
+			ret = invalf(fc, "Could not find process group %d",
+				     ctx->pgrp);
 			goto fail_dput;
 		}
 	} else {
@@ -321,16 +359,7 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
 
 	pr_debug("pipe fd = %d, pgrp = %u\n",
 		 sbi->pipefd, pid_nr(sbi->oz_pgrp));
-	pipe = fget(sbi->pipefd);
 
-	if (!pipe) {
-		pr_err("could not open pipe file descriptor\n");
-		goto fail_put_pid;
-	}
-	ret = autofs_prepare_pipe(pipe);
-	if (ret < 0)
-		goto fail_fput;
-	sbi->pipe = pipe;
 	sbi->flags &= ~AUTOFS_SBI_CATATONIC;
 
 	/*
@@ -342,22 +371,82 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
 	/*
 	 * Failure ... clean up.
 	 */
-fail_fput:
-	pr_err("pipe file descriptor does not contain proper ops\n");
-	fput(pipe);
-fail_put_pid:
-	put_pid(sbi->oz_pgrp);
 fail_dput:
 	dput(root);
-	goto fail_free;
+	goto fail;
 fail_ino:
 	autofs_free_ino(ino);
-fail_free:
-	kfree(sbi);
-	s->s_fs_info = NULL;
+fail:
 	return ret;
 }
 
+/*
+ * Validate the parameters and then request a superblock.
+ */
+static int autofs_get_tree(struct fs_context *fc)
+{
+	struct autofs_sb_info *sbi = fc->s_fs_info;
+	int ret;
+
+	ret = autofs_validate_protocol(fc);
+	if (ret)
+		return ret;
+
+	if (sbi->pipefd < 0)
+		return invalf(fc, "No control pipe specified");
+
+	return get_tree_nodev(fc, autofs_fill_super);
+}
+
+static void autofs_free_fc(struct fs_context *fc)
+{
+	struct autofs_fs_context *ctx = fc->fs_private;
+	struct autofs_sb_info *sbi = fc->s_fs_info;
+
+	if (sbi) {
+		if (sbi->pipe)
+			fput(sbi->pipe);
+		kfree(sbi);
+	}
+	kfree(ctx);
+}
+
+static const struct fs_context_operations autofs_context_ops = {
+	.free		= autofs_free_fc,
+	.parse_param	= autofs_parse_param,
+	.get_tree	= autofs_get_tree,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+int autofs_init_fs_context(struct fs_context *fc)
+{
+	struct autofs_fs_context *ctx;
+	struct autofs_sb_info *sbi;
+
+	ctx = kzalloc(sizeof(struct autofs_fs_context), GFP_KERNEL);
+	if (!ctx)
+		goto nomem;
+
+	ctx->uid = current_uid();
+	ctx->gid = current_gid();
+
+	sbi = autofs_alloc_sbi();
+	if (!sbi)
+		goto nomem_ctx;
+
+	fc->fs_private = ctx;
+	fc->s_fs_info = sbi;
+	fc->ops = &autofs_context_ops;
+	return 0;
+
+nomem_ctx:
+	kfree(ctx);
+nomem:
+	return -ENOMEM;
+}
+
 struct inode *autofs_get_inode(struct super_block *sb, umode_t mode)
 {
 	struct inode *inode = new_inode(sb);
@@ -370,7 +459,7 @@ struct inode *autofs_get_inode(struct super_block *sb, umode_t mode)
 		inode->i_uid = d_inode(sb->s_root)->i_uid;
 		inode->i_gid = d_inode(sb->s_root)->i_gid;
 	}
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	inode->i_ino = get_next_ino();
 
 	if (S_ISDIR(mode)) {
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 512b9a26c63d..530d18827e35 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -600,7 +600,7 @@ static int autofs_dir_symlink(struct mnt_idmap *idmap,
 	p_ino = autofs_dentry_ino(dentry->d_parent);
 	p_ino->count++;
 
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 
 	return 0;
 }
@@ -633,7 +633,7 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry)
 	d_inode(dentry)->i_size = 0;
 	clear_nlink(d_inode(dentry));
 
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 
 	spin_lock(&sbi->lookup_lock);
 	__autofs_add_expiring(dentry);
@@ -749,7 +749,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap,
 	p_ino = autofs_dentry_ino(dentry->d_parent);
 	p_ino->count++;
 	inc_nlink(dir);
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 
 	return 0;
 }
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 83f9566c973b..316d88da2ce1 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -208,7 +208,7 @@ void make_bad_inode(struct inode *inode)
 	remove_inode_hash(inode);
 
 	inode->i_mode = S_IFREG;
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	inode->i_op = &bad_inode_ops;	
 	inode->i_opflags &= ~IOP_XATTR;
 	inode->i_fop = &bad_file_ops;	
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
new file mode 100644
index 000000000000..df13a4f9a6e3
--- /dev/null
+++ b/fs/bcachefs/Kconfig
@@ -0,0 +1,85 @@
+
+config BCACHEFS_FS
+	tristate "bcachefs filesystem support (EXPERIMENTAL)"
+	depends on BLOCK
+	select EXPORTFS
+	select CLOSURES
+	select LIBCRC32C
+	select CRC64
+	select FS_POSIX_ACL
+	select LZ4_COMPRESS
+	select LZ4_DECOMPRESS
+	select LZ4HC_COMPRESS
+	select LZ4HC_DECOMPRESS
+	select ZLIB_DEFLATE
+	select ZLIB_INFLATE
+	select ZSTD_COMPRESS
+	select ZSTD_DECOMPRESS
+	select CRYPTO_SHA256
+	select CRYPTO_CHACHA20
+	select CRYPTO_POLY1305
+	select KEYS
+	select RAID6_PQ
+	select XOR_BLOCKS
+	select XXHASH
+	select SRCU
+	select SYMBOLIC_ERRNAME
+	select MEAN_AND_VARIANCE
+	help
+	The bcachefs filesystem - a modern, copy on write filesystem, with
+	support for multiple devices, compression, checksumming, etc.
+
+config BCACHEFS_QUOTA
+	bool "bcachefs quota support"
+	depends on BCACHEFS_FS
+	select QUOTACTL
+
+config BCACHEFS_POSIX_ACL
+	bool "bcachefs POSIX ACL support"
+	depends on BCACHEFS_FS
+	select FS_POSIX_ACL
+
+config BCACHEFS_DEBUG_TRANSACTIONS
+	bool "bcachefs runtime info"
+	depends on BCACHEFS_FS
+	default y
+	help
+	This makes the list of running btree transactions available in debugfs.
+
+	This is a highly useful debugging feature but does add a small amount of overhead.
+
+config BCACHEFS_DEBUG
+	bool "bcachefs debugging"
+	depends on BCACHEFS_FS
+	help
+	Enables many extra debugging checks and assertions.
+
+	The resulting code will be significantly slower than normal; you
+	probably shouldn't select this option unless you're a developer.
+
+config BCACHEFS_TESTS
+	bool "bcachefs unit and performance tests"
+	depends on BCACHEFS_FS
+	help
+	Include some unit and performance tests for the core btree code
+
+config BCACHEFS_LOCK_TIME_STATS
+       bool "bcachefs lock time statistics"
+       depends on BCACHEFS_FS
+       help
+       Expose statistics for how long we held a lock in debugfs
+
+config BCACHEFS_NO_LATENCY_ACCT
+	bool "disable latency accounting and time stats"
+	depends on BCACHEFS_FS
+	help
+	This disables device latency tracking and time stats, only for performance testing
+
+config MEAN_AND_VARIANCE_UNIT_TEST
+	tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
+	depends on KUNIT
+	select MEAN_AND_VARIANCE
+	default KUNIT_ALL_TESTS
+	help
+	  This option enables the kunit tests for mean_and_variance module.
+	  If unsure, say N.
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
new file mode 100644
index 000000000000..0749731b9072
--- /dev/null
+++ b/fs/bcachefs/Makefile
@@ -0,0 +1,88 @@
+
+obj-$(CONFIG_BCACHEFS_FS)	+= bcachefs.o
+
+bcachefs-y		:=	\
+	acl.o			\
+	alloc_background.o	\
+	alloc_foreground.o	\
+	backpointers.o		\
+	bkey.o			\
+	bkey_methods.o		\
+	bkey_sort.o		\
+	bset.o			\
+	btree_cache.o		\
+	btree_gc.o		\
+	btree_io.o		\
+	btree_iter.o		\
+	btree_journal_iter.o	\
+	btree_key_cache.o	\
+	btree_locking.o		\
+	btree_trans_commit.o	\
+	btree_update.o		\
+	btree_update_interior.o	\
+	btree_write_buffer.o	\
+	buckets.o		\
+	buckets_waiting_for_journal.o	\
+	chardev.o		\
+	checksum.o		\
+	clock.o			\
+	compress.o		\
+	counters.o		\
+	debug.o			\
+	dirent.o		\
+	disk_groups.o		\
+	data_update.o		\
+	ec.o			\
+	errcode.o		\
+	error.o			\
+	extents.o		\
+	extent_update.o		\
+	fs.o			\
+	fs-common.o		\
+	fs-ioctl.o		\
+	fs-io.o			\
+	fs-io-buffered.o	\
+	fs-io-direct.o		\
+	fs-io-pagecache.o	\
+	fsck.o			\
+	inode.o			\
+	io_read.o		\
+	io_misc.o		\
+	io_write.o		\
+	journal.o		\
+	journal_io.o		\
+	journal_reclaim.o	\
+	journal_sb.o		\
+	journal_seq_blacklist.o	\
+	keylist.o		\
+	logged_ops.o		\
+	lru.o			\
+	mean_and_variance.o	\
+	migrate.o		\
+	move.o			\
+	movinggc.o		\
+	nocow_locking.o		\
+	opts.o			\
+	printbuf.o		\
+	quota.o			\
+	rebalance.o		\
+	recovery.o		\
+	reflink.o		\
+	replicas.o		\
+	sb-clean.o		\
+	sb-members.o		\
+	siphash.o		\
+	six.o			\
+	snapshot.o		\
+	subvolume.o		\
+	super.o			\
+	super-io.o		\
+	sysfs.o			\
+	tests.o			\
+	trace.o			\
+	two_state_shared_lock.o	\
+	util.o			\
+	varint.o		\
+	xattr.o
+
+obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST)   += mean_and_variance_test.o
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
new file mode 100644
index 000000000000..f3809897f00a
--- /dev/null
+++ b/fs/bcachefs/acl.c
@@ -0,0 +1,463 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+
+#include "acl.h"
+#include "xattr.h"
+
+#include <linux/posix_acl.h>
+
+static const char * const acl_types[] = {	/* printable name per ACL_* tag, indexed by tag value */
+	[ACL_USER_OBJ]	= "user_obj",
+	[ACL_USER]	= "user",
+	[ACL_GROUP_OBJ]	= "group_obj",
+	[ACL_GROUP]	= "group",
+	[ACL_MASK]	= "mask",
+	[ACL_OTHER]	= "other",
+	NULL,	/* slot after ACL_OTHER; any index not designated above is NULL too */
+};
+
+void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size)
+{
+	const void *p, *end = value + size;
+	/* Print the ACL xattr @value (@size bytes) to @out; stops at the first malformed entry. */
+	if (!value ||
+	    size < sizeof(bch_acl_header) ||
+	    ((bch_acl_header *)value)->a_version != cpu_to_le32(BCH_ACL_VERSION))
+		return;
+
+	p = value + sizeof(bch_acl_header);
+	while (p + sizeof(bch_acl_entry_short) <= end) {	/* e_tag/e_perm need a whole short entry */
+		const bch_acl_entry *in = p;
+		unsigned tag = le16_to_cpu(in->e_tag);
+
+		switch (tag) {
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			prt_str(out, acl_types[tag]);
+			p += sizeof(bch_acl_entry_short);
+			break;
+		case ACL_USER:
+		case ACL_GROUP:
+			if (p + sizeof(bch_acl_entry) > end)
+				return;	/* truncated: e_id lies past the end of the value */
+			prt_printf(out, "%s %s %u", acl_types[tag],
+				   tag == ACL_USER ? "uid" : "gid", le32_to_cpu(in->e_id));
+			p += sizeof(bch_acl_entry);
+			break;
+		default:	/* tag has no acl_types[] name and no known entry size */
+			prt_printf(out, "(invalid tag %u)", tag);
+			return;
+		}
+		prt_printf(out, " %o", le16_to_cpu(in->e_perm));
+		if (p != end)
+			prt_char(out, ' ');
+	}
+}
+
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+
+#include "fs.h"
+
+#include <linux/fs.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long)
+{	/* bytes taken by one header plus nr_short short and nr_long full entries */
+	return sizeof(bch_acl_header) +
+		sizeof(bch_acl_entry_short) * nr_short +
+		sizeof(bch_acl_entry) * nr_long;
+}
+
+static inline int acl_to_xattr_type(int type)
+{	/* map ACL_TYPE_ACCESS/ACL_TYPE_DEFAULT to the xattr index the ACL is stored under */
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS;
+	case ACL_TYPE_DEFAULT:
+		return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT;
+	default:
+		BUG();	/* callers must only pass the two types above */
+	}
+}
+
+/* Convert from filesystem to in-memory representation.
+ * Returns a new posix_acl, NULL if @value is NULL or holds no entries,
+ * ERR_PTR(-EINVAL) if the value is malformed, or an ERR_PTR if allocation fails. */
+static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans,
+					    const void *value, size_t size)
+{
+	const void *p, *end = value + size;
+	struct posix_acl *acl;
+	struct posix_acl_entry *out;
+	unsigned count = 0;
+	int ret;
+
+	if (!value)
+		return NULL;
+	if (size < sizeof(bch_acl_header))
+		goto invalid;
+	if (((bch_acl_header *)value)->a_version !=
+	    cpu_to_le32(BCH_ACL_VERSION))
+		goto invalid;
+	/* Pass 1: validate every tag and entry size, and count the entries. */
+	p = value + sizeof(bch_acl_header);
+	while (p < end) {
+		const bch_acl_entry *entry = p;
+
+		if (p + sizeof(bch_acl_entry_short) > end)	/* not even a short entry left */
+			goto invalid;
+
+		switch (le16_to_cpu(entry->e_tag)) {
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			p += sizeof(bch_acl_entry_short);
+			break;
+		case ACL_USER:
+		case ACL_GROUP:
+			p += sizeof(bch_acl_entry);
+			break;
+		default:
+			goto invalid;
+		}
+
+		count++;
+	}
+
+	if (p > end)	/* the last full-size entry overran the value */
+		goto invalid;
+
+	if (!count)	/* header only: no ACL */
+		return NULL;
+	/* NOTE(review): allocate_dropping_locks() is defined elsewhere; it supplies _gfp and sets ret - confirm its restart semantics. */
+	acl = allocate_dropping_locks(trans, ret,
+			posix_acl_alloc(count, _gfp));
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+	if (ret) {
+		kfree(acl);
+		return ERR_PTR(ret);
+	}
+
+	out = acl->a_entries;
+	/* Pass 2: convert each entry; tags and sizes were validated in pass 1. */
+	p = value + sizeof(bch_acl_header);
+	while (p < end) {
+		const bch_acl_entry *in = p;
+
+		out->e_tag  = le16_to_cpu(in->e_tag);
+		out->e_perm = le16_to_cpu(in->e_perm);
+
+		switch (out->e_tag) {
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			p += sizeof(bch_acl_entry_short);
+			break;
+		case ACL_USER:
+			out->e_uid = make_kuid(&init_user_ns,
+					       le32_to_cpu(in->e_id));
+			p += sizeof(bch_acl_entry);
+			break;
+		case ACL_GROUP:
+			out->e_gid = make_kgid(&init_user_ns,
+					       le32_to_cpu(in->e_id));
+			p += sizeof(bch_acl_entry);
+			break;
+		}
+
+		out++;
+	}
+
+	BUG_ON(out != acl->a_entries + acl->a_count);	/* pass 2 must have filled exactly a_count entries */
+
+	return acl;
+invalid:
+	pr_err("invalid acl entry");
+	return ERR_PTR(-EINVAL);
+}
+
+#define acl_for_each_entry(acl, acl_e)			\
+	for (acl_e = acl->a_entries;			\
+	     acl_e < acl->a_entries + acl->a_count;	\
+	     acl_e++)	/* walks acl_e over acl's a_count entries, in order */
+
+/* Convert from in-memory to filesystem representation: builds an xattr key for
+ * @acl of ACL @type in memory from bch2_trans_kmalloc(). Returns ERR_PTR(-EINVAL)
+ * for an unknown tag, ERR_PTR(-E2BIG) if the key is too big, or the kmalloc error. */
+static struct bkey_i_xattr *
+bch2_acl_to_xattr(struct btree_trans *trans,
+		  const struct posix_acl *acl,
+		  int type)
+{
+	struct bkey_i_xattr *xattr;
+	bch_acl_header *acl_header;
+	const struct posix_acl_entry *acl_e;
+	void *outptr;
+	unsigned nr_short = 0, nr_long = 0, acl_len, u64s;
+	/* Pass 1: count short and full-size entries to size the value. */
+	acl_for_each_entry(acl, acl_e) {
+		switch (acl_e->e_tag) {
+		case ACL_USER:
+		case ACL_GROUP:
+			nr_long++;
+			break;
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			nr_short++;
+			break;
+		default:
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
+	acl_len = bch2_acl_size(nr_short, nr_long);
+	u64s = BKEY_U64s + xattr_val_u64s(0, acl_len);
+
+	if (u64s > U8_MAX)	/* reject keys larger than U8_MAX u64s */
+		return ERR_PTR(-E2BIG);
+
+	xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
+	if (IS_ERR(xattr))
+		return xattr;
+
+	bkey_xattr_init(&xattr->k_i);
+	xattr->k.u64s		= u64s;
+	xattr->v.x_type		= acl_to_xattr_type(type);
+	xattr->v.x_name_len	= 0;	/* the ACL xattr is stored with an empty name */
+	xattr->v.x_val_len	= cpu_to_le16(acl_len);
+
+	acl_header = xattr_val(&xattr->v);
+	acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION);
+
+	outptr = (void *) acl_header + sizeof(*acl_header);
+	/* Pass 2: emit each entry; only ACL_USER/ACL_GROUP carry an e_id. */
+	acl_for_each_entry(acl, acl_e) {
+		bch_acl_entry *entry = outptr;
+
+		entry->e_tag = cpu_to_le16(acl_e->e_tag);
+		entry->e_perm = cpu_to_le16(acl_e->e_perm);
+		switch (acl_e->e_tag) {
+		case ACL_USER:
+			entry->e_id = cpu_to_le32(
+				from_kuid(&init_user_ns, acl_e->e_uid));
+			outptr += sizeof(bch_acl_entry);
+			break;
+		case ACL_GROUP:
+			entry->e_id = cpu_to_le32(
+				from_kgid(&init_user_ns, acl_e->e_gid));
+			outptr += sizeof(bch_acl_entry);
+			break;
+
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			outptr += sizeof(bch_acl_entry_short);
+			break;
+		}
+	}
+
+	BUG_ON(outptr != xattr_val(&xattr->v) + acl_len);	/* must have written exactly acl_len bytes */
+
+	return xattr;
+}
+
+struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
+			       struct dentry *dentry, int type)
+{	/* Read the ACL of @type for @dentry's inode from its xattr; NULL if none, ERR_PTR on error. */
+	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
+	struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter = { NULL };
+	struct bkey_s_c_xattr xattr;
+	struct posix_acl *acl = NULL;
+	struct bkey_s_c k;
+	int ret;
+retry:
+	bch2_trans_begin(trans);
+
+	ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
+			&hash, inode_inum(inode), &search, 0);
+	if (ret) {
+		if (!bch2_err_matches(ret, ENOENT))	/* ENOENT: no ACL xattr, acl stays NULL */
+			acl = ERR_PTR(ret);
+		goto out;
+	}
+
+	k = bch2_btree_iter_peek_slot(&iter);
+	ret = bkey_err(k);
+	if (ret) {
+		acl = ERR_PTR(ret);
+		goto out;
+	}
+
+	xattr = bkey_s_c_to_xattr(k);
+	acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
+			le16_to_cpu(xattr.v->x_val_len));
+
+	if (!IS_ERR(acl))	/* cache the converted result (possibly NULL) on the VFS inode */
+		set_cached_acl(&inode->v, type, acl);
+out:
+	if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart))
+		goto retry;	/* a transaction restart repeats the whole lookup */
+
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
+	return acl;
+}
+
+int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
+		       struct bch_inode_unpacked *inode_u,
+		       struct posix_acl *acl, int type)
+{
+	struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u);
+	int ret;
+	/* Within @trans: store @acl as the xattr for @type, or delete that xattr if @acl is NULL. */
+	if (type == ACL_TYPE_DEFAULT &&
+	    !S_ISDIR(inode_u->bi_mode))
+		return acl ? -EACCES : 0;	/* a default ACL is only accepted on a directory */
+
+	if (acl) {
+		struct bkey_i_xattr *xattr =
+			bch2_acl_to_xattr(trans, acl, type);
+		if (IS_ERR(xattr))
+			return PTR_ERR(xattr);
+
+		ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info,
+				    inum, &xattr->k_i, 0);
+	} else {
+		struct xattr_search_key search =
+			X_SEARCH(acl_to_xattr_type(type), "", 0);
+
+		ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info,
+				       inum, &search);
+	}
+
+	return bch2_err_matches(ret, ENOENT) ? 0 : ret;	/* ENOENT is reported as success */
+}
+
+int bch2_set_acl(struct mnt_idmap *idmap,
+		 struct dentry *dentry,
+		 struct posix_acl *_acl, int type)
+{
+	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter inode_iter = { NULL };
+	struct bch_inode_unpacked inode_u;
+	struct posix_acl *acl;
+	umode_t mode;
+	int ret;
+	/* Set (or, for a NULL ACL, remove) the ACL of @type and update mode/ctime in one transaction. */
+	mutex_lock(&inode->ei_update_lock);
+retry:
+	bch2_trans_begin(trans);
+	acl = _acl;	/* reset on every retry: posix_acl_update_mode() is given &acl and may change it */
+
+	ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
+			      BTREE_ITER_INTENT);
+	if (ret)
+		goto btree_err;
+
+	mode = inode_u.bi_mode;
+
+	if (type == ACL_TYPE_ACCESS) {
+		ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl);
+		if (ret)
+			goto btree_err;
+	}
+
+	ret = bch2_set_acl_trans(trans, inode_inum(inode), &inode_u, acl, type);
+	if (ret)
+		goto btree_err;
+
+	inode_u.bi_ctime	= bch2_current_time(c);
+	inode_u.bi_mode		= mode;
+
+	ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+		bch2_trans_commit(trans, NULL, NULL, 0);
+btree_err:
+	bch2_trans_iter_exit(trans, &inode_iter);
+
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		goto retry;
+	if (unlikely(ret))
+		goto err;
+	/* Committed: bring the in-memory inode and the cached ACL in line with the btree. */
+	bch2_inode_update_after_write(trans, inode, &inode_u,
+				      ATTR_CTIME|ATTR_MODE);
+
+	set_cached_acl(&inode->v, type, acl);
+err:
+	mutex_unlock(&inode->ei_update_lock);
+	bch2_trans_put(trans);
+
+	return ret;
+}
+
+int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
+		   struct bch_inode_unpacked *inode,
+		   umode_t mode,
+		   struct posix_acl **new_acl)
+{
+	struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode);
+	struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0);
+	struct btree_iter iter;
+	struct bkey_s_c_xattr xattr;
+	struct bkey_i_xattr *new;
+	struct posix_acl *acl = NULL;
+	struct bkey_s_c k;
+	int ret;
+	/* Rewrite the access ACL xattr for @mode within @trans; the new ACL is returned in *@new_acl. */
+	ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
+			       &hash_info, inum, &search, BTREE_ITER_INTENT);
+	if (ret)
+		return bch2_err_matches(ret, ENOENT) ? 0 : ret;	/* no access ACL xattr: nothing to update */
+
+	k = bch2_btree_iter_peek_slot(&iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+	xattr = bkey_s_c_to_xattr(k);
+
+	acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
+			le16_to_cpu(xattr.v->x_val_len));
+	ret = PTR_ERR_OR_ZERO(acl);
+	if (IS_ERR_OR_NULL(acl))	/* error, or an empty ACL (then ret == 0) */
+		goto err;
+
+	ret = allocate_dropping_locks_errcode(trans,
+				__posix_acl_chmod(&acl, _gfp, mode));
+	if (ret)
+		goto err;
+
+	new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
+	if (IS_ERR(new)) {
+		ret = PTR_ERR(new);
+		goto err;
+	}
+
+	new->k.p = iter.pos;	/* overwrite the xattr at the slot the lookup found */
+	ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
+	*new_acl = acl;	/* handed to the caller whatever ret is; not freed below */
+	acl = NULL;
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	if (!IS_ERR_OR_NULL(acl))
+		kfree(acl);
+	return ret;
+}
+
+#endif /* CONFIG_BCACHEFS_POSIX_ACL */
diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h
new file mode 100644
index 000000000000..27e7eec0f278
--- /dev/null
+++ b/fs/bcachefs/acl.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ACL_H
+#define _BCACHEFS_ACL_H
+
+struct bch_inode_unpacked;
+struct bch_hash_info;
+struct bch_inode_info;
+struct posix_acl;
+
+#define BCH_ACL_VERSION	0x0001
+
+typedef struct {	/* full ACL entry, little endian; acl.c uses it for ACL_USER and ACL_GROUP */
+	__le16		e_tag;
+	__le16		e_perm;
+	__le32		e_id;
+} bch_acl_entry;
+
+typedef struct {	/* ACL entry without an id: same leading fields as bch_acl_entry */
+	__le16		e_tag;
+	__le16		e_perm;
+} bch_acl_entry_short;
+
+typedef struct {	/* leads every stored ACL value; acl.c checks a_version against BCH_ACL_VERSION */
+	__le32		a_version;
+} bch_acl_header;
+
+void bch2_acl_to_text(struct printbuf *, const void *, size_t);
+
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+
+struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int);
+
+int bch2_set_acl_trans(struct btree_trans *, subvol_inum,
+		       struct bch_inode_unpacked *,
+		       struct posix_acl *, int);
+int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int);
+int bch2_acl_chmod(struct btree_trans *, subvol_inum,
+		   struct bch_inode_unpacked *,
+		   umode_t, struct posix_acl **);
+
+#else
+
+static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
+				     struct bch_inode_unpacked *inode_u,
+				     struct posix_acl *acl, int type)
+{	/* stub used when CONFIG_BCACHEFS_POSIX_ACL is not set: does nothing */
+	return 0;
+}
+
+static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
+				 struct bch_inode_unpacked *inode,
+				 umode_t mode,
+				 struct posix_acl **new_acl)
+{	/* stub used when CONFIG_BCACHEFS_POSIX_ACL is not set: *new_acl is left untouched */
+	return 0;
+}
+
+#endif /* CONFIG_BCACHEFS_POSIX_ACL */
+
+#endif /* _BCACHEFS_ACL_H */
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
new file mode 100644
index 000000000000..2d516207e223
--- /dev/null
+++ b/fs/bcachefs/alloc_background.c
@@ -0,0 +1,2146 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "backpointers.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_gc.h"
+#include "btree_write_buffer.h"
+#include "buckets.h"
+#include "buckets_waiting_for_journal.h"
+#include "clock.h"
+#include "debug.h"
+#include "ec.h"
+#include "error.h"
+#include "lru.h"
+#include "recovery.h"
+#include "trace.h"
+#include "varint.h"
+
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <linux/random.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/sched/task.h>
+#include <linux/sort.h>
+
+/* Persistent alloc info: */
+
+static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {	/* size in bytes of each v1 field, by field number */
+#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
+	BCH_ALLOC_FIELDS_V1()
+#undef x
+};
+
+struct bkey_alloc_unpacked {	/* decoded form of alloc v1/v2/v3 keys, filled by bch2_alloc_unpack*() */
+	u64		journal_seq;
+	u8		gen;
+	u8		oldest_gen;
+	u8		data_type;
+	bool		need_discard:1;
+	bool		need_inc_gen:1;
+#define x(_name, _bits)	u##_bits _name;
+	BCH_ALLOC_FIELDS_V2()
+#undef  x
+};
+
+static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
+				     const void **p, unsigned field)
+{
+	unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
+	u64 v;
+	/* Read v1 field @field (little endian) at *@p and advance *@p past it. */
+	if (!(a->fields & (1 << field)))
+		return 0;	/* field not present in this key: reads as 0, *p is not advanced */
+
+	switch (bytes) {
+	case 1:
+		v = *((const u8 *) *p);
+		break;
+	case 2:
+		v = le16_to_cpup(*p);
+		break;
+	case 4:
+		v = le32_to_cpup(*p);
+		break;
+	case 8:
+		v = le64_to_cpup(*p);
+		break;
+	default:
+		BUG();	/* the size table holds a width this function cannot read */
+	}
+
+	*p += bytes;
+	return v;
+}
+
+static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
+				 struct bkey_s_c k)
+{
+	const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
+	const void *d = in->data;
+	unsigned idx = 0;
+	/* Fields are read in BCH_ALLOC_FIELDS_V1() order; idx is the field number passed to the reader. */
+	out->gen = in->gen;
+
+#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
+	BCH_ALLOC_FIELDS_V1()
+#undef  x
+}
+
+static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
+				struct bkey_s_c k)
+{
+	struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
+	const u8 *in = a.v->data;
+	const u8 *end = bkey_val_end(a);
+	unsigned fieldnr = 0;
+	int ret;
+	u64 v;
+	/* Unpack an alloc_v2 key into @out; returns 0, or a negative value if decoding fails. */
+	out->gen	= a.v->gen;
+	out->oldest_gen	= a.v->oldest_gen;
+	out->data_type	= a.v->data_type;
+	/* One varint per stored field; fields past nr_fields read as 0; -1 if a value does not fit its member. */
+#define x(_name, _bits)							\
+	if (fieldnr < a.v->nr_fields) {					\
+		ret = bch2_varint_decode_fast(in, end, &v);		\
+		if (ret < 0)						\
+			return ret;					\
+		in += ret;						\
+	} else {							\
+		v = 0;							\
+	}								\
+	out->_name = v;							\
+	if (v != out->_name)						\
+		return -1;						\
+	fieldnr++;
+
+	BCH_ALLOC_FIELDS_V2()
+#undef  x
+	return 0;
+}
+
+static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
+				struct bkey_s_c k)
+{
+	struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
+	const u8 *in = a.v->data;
+	const u8 *end = bkey_val_end(a);
+	unsigned fieldnr = 0;
+	int ret;
+	u64 v;
+	/* Unpack an alloc_v3 key into @out; returns 0, or a negative value if decoding fails. */
+	out->gen	= a.v->gen;
+	out->oldest_gen	= a.v->oldest_gen;
+	out->data_type	= a.v->data_type;
+	out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
+	out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
+	out->journal_seq = le64_to_cpu(a.v->journal_seq);
+	/* One varint per stored field; fields past nr_fields read as 0; -1 if a value does not fit its member. */
+#define x(_name, _bits)							\
+	if (fieldnr < a.v->nr_fields) {					\
+		ret = bch2_varint_decode_fast(in, end, &v);		\
+		if (ret < 0)						\
+			return ret;					\
+		in += ret;						\
+	} else {							\
+		v = 0;							\
+	}								\
+	out->_name = v;							\
+	if (v != out->_name)						\
+		return -1;						\
+	fieldnr++;
+
+	BCH_ALLOC_FIELDS_V2()
+#undef  x
+	return 0;
+}
+
+static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
+{
+	struct bkey_alloc_unpacked ret = { .gen	= 0 };
+	/* Dispatch on key type; any other type yields the zeroed struct. Unpack errors are ignored here. */
+	switch (k.k->type) {
+	case KEY_TYPE_alloc:
+		bch2_alloc_unpack_v1(&ret, k);
+		break;
+	case KEY_TYPE_alloc_v2:
+		bch2_alloc_unpack_v2(&ret, k);	/* NOTE(review): return value dropped - presumably keys were validated earlier, confirm */
+		break;
+	case KEY_TYPE_alloc_v3:
+		bch2_alloc_unpack_v3(&ret, k);
+		break;
+	}
+
+	return ret;
+}
+
+static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
+{
+	unsigned i, bytes = offsetof(struct bch_alloc, data);
+	/* Value size implied by the fields bitmap: fixed part plus every present field, in whole u64s. */
+	for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
+		if (a->fields & (1 << i))
+			bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
+
+	return DIV_ROUND_UP(bytes, sizeof(u64));
+}
+
+int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k,
+			  enum bkey_invalid_flags flags,
+			  struct printbuf *err)
+{	/* returns 0 if the v1 alloc key is acceptable, else -BCH_ERR_invalid_bkey with a message in @err */
+	struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
+
+	/* allow for unknown fields: the value may be larger than the known fields need, but not smaller */
+	if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) {
+		prt_printf(err, "incorrect value size (%zu < %u)",
+		       bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	return 0;
+}
+
+int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
+			  enum bkey_invalid_flags flags,
+			  struct printbuf *err)
+{
+	struct bkey_alloc_unpacked u;
+	/* A v2 key is accepted iff it unpacks without error; the unpacked result is discarded. */
+	if (bch2_alloc_unpack_v2(&u, k)) {
+		prt_printf(err, "unpack error");
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	return 0;
+}
+
+int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
+			  enum bkey_invalid_flags flags,
+			  struct printbuf *err)
+{
+	struct bkey_alloc_unpacked u;
+	/* A v3 key is accepted iff it unpacks without error; the unpacked result is discarded. */
+	if (bch2_alloc_unpack_v3(&u, k)) {
+		prt_printf(err, "unpack error");
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	return 0;
+}
+
+int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
+			  enum bkey_invalid_flags flags, struct printbuf *err)
+{
+	struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
+	/* Returns 0 if the key passes every check, else -BCH_ERR_invalid_bkey with the reason in @err. */
+	if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) {	/* value shorter than its own contents require */
+		prt_printf(err, "bad val size (%u > %zu)",
+		       alloc_v4_u64s(a.v), bkey_val_u64s(k.k));
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	if (!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
+	    BCH_ALLOC_V4_NR_BACKPOINTERS(a.v)) {	/* backpointers counted, but no start offset recorded */
+		prt_printf(err, "invalid backpointers_start");
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) {	/* stored type must match the derived one */
+		prt_printf(err, "invalid data type (got %u should be %u)",
+		       a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	switch (a.v->data_type) {
+	case BCH_DATA_free:
+	case BCH_DATA_need_gc_gens:
+	case BCH_DATA_need_discard:
+		if (a.v->dirty_sectors ||
+		    a.v->cached_sectors ||
+		    a.v->stripe) {	/* these types must hold no sectors and no stripe */
+			prt_printf(err, "empty data type free but have data");
+			return -BCH_ERR_invalid_bkey;
+		}
+		break;
+	case BCH_DATA_sb:
+	case BCH_DATA_journal:
+	case BCH_DATA_btree:
+	case BCH_DATA_user:
+	case BCH_DATA_parity:
+		if (!a.v->dirty_sectors) {	/* these types must have dirty sectors */
+			prt_printf(err, "data_type %s but dirty_sectors==0",
+			       bch2_data_types[a.v->data_type]);
+			return -BCH_ERR_invalid_bkey;
+		}
+		break;
+	case BCH_DATA_cached:
+		if (!a.v->cached_sectors ||
+		    a.v->dirty_sectors ||
+		    a.v->stripe) {	/* cached: cached sectors only */
+			prt_printf(err, "data type inconsistency");
+			return -BCH_ERR_invalid_bkey;
+		}
+
+		if (!a.v->io_time[READ] &&
+		    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) {	/* only enforced after that pass */
+			prt_printf(err, "cached bucket with read_time == 0");
+			return -BCH_ERR_invalid_bkey;
+		}
+		break;
+	case BCH_DATA_stripe:	/* no additional checks */
+		break;
+	}
+
+	return 0;
+}
+
+static inline u64 swab40(u64 x)
+{	/* reverse the order of the low five bytes (40 bits) of x; any higher bits are dropped */
+	return (((x & 0x00000000ffULL) << 32)|
+		((x & 0x000000ff00ULL) << 16)|
+		((x & 0x0000ff0000ULL) >>  0)|
+		((x & 0x00ff000000ULL) >> 16)|
+		((x & 0xff00000000ULL) >> 32));
+}
+
+void bch2_alloc_v4_swab(struct bkey_s k)
+{
+	struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
+	struct bch_backpointer *bp, *bps;
+	/* Byte-swap the multi-byte fields of the alloc_v4 value in place, then each backpointer. */
+	a->journal_seq		= swab64(a->journal_seq);
+	a->flags		= swab32(a->flags);
+	a->dirty_sectors	= swab32(a->dirty_sectors);
+	a->cached_sectors	= swab32(a->cached_sectors);
+	a->io_time[0]		= swab64(a->io_time[0]);
+	a->io_time[1]		= swab64(a->io_time[1]);
+	a->stripe		= swab32(a->stripe);
+	a->nr_external_backpointers = swab32(a->nr_external_backpointers);
+
+	bps = alloc_v4_backpointers(a);
+	for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
+		bp->bucket_offset	= swab40(bp->bucket_offset);	/* swapped as a 40 bit quantity */
+		bp->bucket_len		= swab32(bp->bucket_len);
+		bch2_bpos_swab(&bp->pos);
+	}
+}
+
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bch_alloc_v4 _a;
+	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
+	unsigned i;
+	/* Print the key as alloc_v4, one field per line at indent +2; backpointers, if any, come last. */
+	prt_newline(out);
+	printbuf_indent_add(out, 2);
+
+	prt_printf(out, "gen %u oldest_gen %u data_type %s",
+	       a->gen, a->oldest_gen,
+	       a->data_type < BCH_DATA_NR
+	       ? bch2_data_types[a->data_type]
+	       : "(invalid data type)");
+	prt_newline(out);
+	prt_printf(out, "journal_seq       %llu",	a->journal_seq);
+	prt_newline(out);
+	prt_printf(out, "need_discard      %llu",	BCH_ALLOC_V4_NEED_DISCARD(a));
+	prt_newline(out);
+	prt_printf(out, "need_inc_gen      %llu",	BCH_ALLOC_V4_NEED_INC_GEN(a));
+	prt_newline(out);
+	prt_printf(out, "dirty_sectors     %u",	a->dirty_sectors);
+	prt_newline(out);
+	prt_printf(out, "cached_sectors    %u",	a->cached_sectors);
+	prt_newline(out);
+	prt_printf(out, "stripe            %u",	a->stripe);
+	prt_newline(out);
+	prt_printf(out, "stripe_redundancy %u",	a->stripe_redundancy);
+	prt_newline(out);
+	prt_printf(out, "io_time[READ]     %llu",	a->io_time[READ]);
+	prt_newline(out);
+	prt_printf(out, "io_time[WRITE]    %llu",	a->io_time[WRITE]);
+	prt_newline(out);
+	prt_printf(out, "fragmentation     %llu",	a->fragmentation_lru);
+	prt_newline(out);
+	prt_printf(out, "bp_start          %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a));
+	prt_newline(out);
+
+	if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) {
+		struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k);	/* backpointers are read from the key itself */
+		const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v);
+
+		prt_printf(out, "backpointers:     %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v));
+		printbuf_indent_add(out, 2);
+
+		for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) {
+			prt_newline(out);
+			bch2_backpointer_to_text(out, &bps[i]);
+		}
+
+		printbuf_indent_sub(out, 2);
+	}
+
+	printbuf_indent_sub(out, 2);
+}
+
+void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
+{	/* Convert an alloc key of any version into an alloc_v4 value in @out, without backpointers. */
+	if (k.k->type == KEY_TYPE_alloc_v4) {
+		void *src, *dst;
+
+		*out = *bkey_s_c_to_alloc_v4(k).v;
+
+		src = alloc_v4_backpointers(out);	/* where the key said its backpointers start */
+		SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
+		dst = alloc_v4_backpointers(out);	/* where they start for the current struct size */
+
+		if (src < dst)	/* zero the part of the struct the key did not cover */
+			memset(src, 0, dst - src);
+
+		SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0);
+	} else {
+		struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
+
+		*out = (struct bch_alloc_v4) {
+			.journal_seq		= u.journal_seq,
+			.flags			= u.need_discard,	/* u.need_inc_gen is not carried over */
+			.gen			= u.gen,
+			.oldest_gen		= u.oldest_gen,
+			.data_type		= u.data_type,
+			.stripe_redundancy	= u.stripe_redundancy,
+			.dirty_sectors		= u.dirty_sectors,
+			.cached_sectors		= u.cached_sectors,
+			.io_time[READ]		= u.read_time,
+			.io_time[WRITE]		= u.write_time,
+			.stripe			= u.stripe,
+		};
+
+		SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
+	}
+}
+
+static noinline struct bkey_i_alloc_v4 *
+__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
+{
+	struct bkey_i_alloc_v4 *ret;
+	/* Slow path: build an alloc_v4 copy of @k in bch2_trans_kmalloc() memory, without backpointers. */
+	ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4)));
+	if (IS_ERR(ret))
+		return ret;
+
+	if (k.k->type == KEY_TYPE_alloc_v4) {
+		void *src, *dst;
+
+		bkey_reassemble(&ret->k_i, k);
+
+		src = alloc_v4_backpointers(&ret->v);
+		SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
+		dst = alloc_v4_backpointers(&ret->v);
+
+		if (src < dst)	/* zero the part of the struct the key did not cover */
+			memset(src, 0, dst - src);
+
+		SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0);
+		set_alloc_v4_u64s(ret);
+	} else {
+		bkey_alloc_v4_init(&ret->k_i);
+		ret->k.p = k.k->p;	/* keep the position of the key being converted */
+		bch2_alloc_to_v4(k, &ret->v);
+	}
+	return ret;
+}
+
+static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
+{
+	struct bkey_s_c_alloc_v4 a;
+	/* Fast path: a v4 key with no backpointers goes straight to bch2_bkey_make_mut_noupdate_typed(). */
+	if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
+	    ((a = bkey_s_c_to_alloc_v4(k), true) &&
+	     BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0))
+		return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4);
+
+	return __bch2_alloc_to_v4_mut(trans, k);
+}
+
+struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
+{	/* non-static wrapper around bch2_alloc_to_v4_mut_inlined() */
+	return bch2_alloc_to_v4_mut_inlined(trans, k);
+}
+
+struct bkey_i_alloc_v4 *
+bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
+			      struct bpos pos)
+{
+	struct bkey_s_c k;
+	struct bkey_i_alloc_v4 *a;
+	int ret;
+	/* Point @iter at the alloc key at @pos and return a mutable alloc_v4 copy of it, or an ERR_PTR. */
+	k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
+			     BTREE_ITER_WITH_UPDATES|
+			     BTREE_ITER_CACHED|
+			     BTREE_ITER_INTENT);
+	ret = bkey_err(k);
+	if (unlikely(ret))
+		return ERR_PTR(ret);
+
+	a = bch2_alloc_to_v4_mut_inlined(trans, k);
+	ret = PTR_ERR_OR_ZERO(a);
+	if (unlikely(ret))
+		goto err;
+	return a;	/* success: @iter is left initialised for the caller */
+err:
+	bch2_trans_iter_exit(trans, iter);
+	return ERR_PTR(ret);
+}
+
+static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
+{	/* split an alloc position: low offset bits go to *offset, the rest form the returned position */
+	*offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;
+
+	pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS;
+	return pos;
+}
+
+static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset)
+{	/* inverse of alloc_gens_pos(): shift the offset back up and add the index */
+	pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS;
+	pos.offset += offset;
+	return pos;
+}
+
+static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
+{	/* gens[offset] of a bucket_gens key; any other key type reads as generation 0 */
+	return k.k->type == KEY_TYPE_bucket_gens
+		? bkey_s_c_to_bucket_gens(k).v->gens[offset]
+		: 0;
+}
+
+int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k,
+			     enum bkey_invalid_flags flags,
+			     struct printbuf *err)
+{	/* a bucket_gens value must be exactly sizeof(struct bch_bucket_gens) bytes */
+	if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) {
+		prt_printf(err, "bad val size (%zu != %zu)",
+		       bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	return 0;
+}
+
+void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k);
+	unsigned i;
+	/* Print every generation in the key as a space separated list of decimal numbers. */
+	for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) {
+		if (i)
+			prt_char(out, ' ');
+		prt_printf(out, "%u", g.v->gens[i]);
+	}
+}
+
+int bch2_bucket_gens_init(struct bch_fs *c)
+{	/* Build the bucket_gens btree from the generation numbers found in the alloc btree */
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bch_alloc_v4 a;
+	struct bkey_i_bucket_gens g;	/* bucket_gens key currently being filled in */
+	bool have_bucket_gens_key = false;	/* true while @g holds unwritten gens */
+	unsigned offset;
+	struct bpos pos;
+	u8 gen;
+	int ret;
+
+	for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
+			   BTREE_ITER_PREFETCH, k, ret) {
+		/*
+		 * Not a fsck error because this is checked/repaired by
+		 * bch2_check_alloc_key() which runs later:
+		 */
+		if (!bch2_dev_bucket_exists(c, k.k->p))
+			continue;
+
+		gen = bch2_alloc_to_v4(k, &a)->gen;
+		pos = alloc_gens_pos(iter.pos, &offset);
+		/* Flush @g only when this bucket belongs to a different bucket_gens key: */
+		if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) {
+			ret = commit_do(trans, NULL, NULL,
+					BTREE_INSERT_NOFAIL|
+					BTREE_INSERT_LAZY_RW,
+				bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
+			if (ret)
+				break;
+			have_bucket_gens_key = false;
+		}
+
+		if (!have_bucket_gens_key) {	/* start a fresh key at the new position */
+			bkey_bucket_gens_init(&g.k_i);
+			g.k.p = pos;
+			have_bucket_gens_key = true;
+		}
+
+		g.v.gens[offset] = gen;	/* accumulate this bucket's gen in its slot */
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
+	if (have_bucket_gens_key && !ret)	/* write out the final, still-pending key */
+		ret = commit_do(trans, NULL, NULL,
+				BTREE_INSERT_NOFAIL|
+				BTREE_INSERT_LAZY_RW,
+			bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
+
+	bch2_trans_put(trans);
+
+	if (ret)
+		bch_err_fn(c, ret);
+	return ret;
+}
+
+int bch2_alloc_read(struct bch_fs *c)
+{	/* Fill in *bucket_gen() for each bucket, from the bucket_gens btree or the alloc btree */
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bch_dev *ca;
+	int ret;
+
+	down_read(&c->gc_lock);	/* held for read across whichever scan runs below */
+
+	if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
+		const struct bch_bucket_gens *g;
+		u64 b;
+
+		for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
+				   BTREE_ITER_PREFETCH, k, ret) {
+			u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;	/* first bucket covered by k */
+			u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
+
+			if (k.k->type != KEY_TYPE_bucket_gens)
+				continue;
+
+			g = bkey_s_c_to_bucket_gens(k).v;
+
+			/*
+			 * Not a fsck error because this is checked/repaired by
+			 * bch2_check_alloc_key() which runs later:
+			 */
+			if (!bch2_dev_exists2(c, k.k->p.inode))
+				continue;
+
+			ca = bch_dev_bkey_exists(c, k.k->p.inode);
+
+			for (b = max_t(u64, ca->mi.first_bucket, start);	/* clamp to [first_bucket, nbuckets) */
+			     b < min_t(u64, ca->mi.nbuckets, end);
+			     b++)
+				*bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
+		}
+		bch2_trans_iter_exit(trans, &iter);
+	} else {	/* otherwise take each gen straight from the alloc keys */
+		struct bch_alloc_v4 a;
+
+		for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
+				   BTREE_ITER_PREFETCH, k, ret) {
+			/*
+			 * Not a fsck error because this is checked/repaired by
+			 * bch2_check_alloc_key() which runs later:
+			 */
+			if (!bch2_dev_bucket_exists(c, k.k->p))
+				continue;
+
+			ca = bch_dev_bkey_exists(c, k.k->p.inode);
+
+			*bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
+		}
+		bch2_trans_iter_exit(trans, &iter);
+	}
+
+	bch2_trans_put(trans);
+	up_read(&c->gc_lock);
+
+	if (ret)
+		bch_err_fn(c, ret);
+
+	return ret;
+}
+
+/* Free space/discard btree: */
+
+static int bch2_bucket_do_index(struct btree_trans *trans,
+				struct bkey_s_c alloc_k,
+				const struct bch_alloc_v4 *a,
+				bool set)
+{	/* Set (@set) or clear (!@set) the freespace/need_discard index key for one bucket */
+	struct bch_fs *c = trans->c;
+	struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
+	struct btree_iter iter;
+	struct bkey_s_c old;
+	struct bkey_i *k;
+	enum btree_id btree;
+	enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;	/* type expected in the btree now */
+	enum bch_bkey_type new_type =  set ? KEY_TYPE_set : KEY_TYPE_deleted;	/* type being written */
+	struct printbuf buf = PRINTBUF;
+	int ret;
+
+	if (a->data_type != BCH_DATA_free &&
+	    a->data_type != BCH_DATA_need_discard)
+		return 0;	/* other data types have no index entry to maintain */
+
+	k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
+	if (IS_ERR(k))
+		return PTR_ERR(k);
+
+	bkey_init(&k->k);
+	k->k.type = new_type;
+
+	switch (a->data_type) {	/* pick the index btree and the position within it */
+	case BCH_DATA_free:
+		btree = BTREE_ID_freespace;
+		k->k.p = alloc_freespace_pos(alloc_k.k->p, *a);
+		bch2_key_resize(&k->k, 1);
+		break;
+	case BCH_DATA_need_discard:
+		btree = BTREE_ID_need_discard;
+		k->k.p = alloc_k.k->p;
+		break;
+	default:
+		return 0;
+	}
+
+	old = bch2_bkey_get_iter(trans, &iter, btree,
+			     bkey_start_pos(&k->k),
+			     BTREE_ITER_INTENT);
+	ret = bkey_err(old);
+	if (ret)
+		return ret;
+
+	if (ca->mi.freespace_initialized &&	/* the existing key is only checked under these two conditions */
+	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info &&
+	    bch2_trans_inconsistent_on(old.k->type != old_type, trans,
+			"incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n"
+			"  for %s",
+			set ? "setting" : "clearing",
+			bch2_btree_ids[btree],
+			iter.pos.inode,
+			iter.pos.offset,
+			bch2_bkey_types[old.k->type],
+			bch2_bkey_types[old_type],
+			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+		ret = -EIO;
+		goto err;
+	}
+
+	ret = bch2_trans_update(trans, &iter, k, 0);
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	printbuf_exit(&buf);
+	return ret;
+}
+
+static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
+					   struct bpos bucket, u8 gen)
+{	/* Queue an update storing @gen in @bucket's slot of its bucket_gens btree key */
+	struct btree_iter iter;
+	unsigned offset;	/* slot of @bucket inside the bucket_gens key */
+	struct bpos pos = alloc_gens_pos(bucket, &offset);
+	struct bkey_i_bucket_gens *g;
+	struct bkey_s_c k;
+	int ret;
+
+	g = bch2_trans_kmalloc(trans, sizeof(*g));
+	ret = PTR_ERR_OR_ZERO(g);
+	if (ret)
+		return ret;
+
+	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos,
+			       BTREE_ITER_INTENT|
+			       BTREE_ITER_WITH_UPDATES);
+	ret = bkey_err(k);
+	if (ret)
+		return ret;
+
+	if (k.k->type != KEY_TYPE_bucket_gens) {	/* no bucket_gens key here yet: initialize one */
+		bkey_bucket_gens_init(&g->k_i);
+		g->k.p = iter.pos;
+	} else {	/* otherwise modify a copy of the existing key */
+		bkey_reassemble(&g->k_i, k);
+	}
+
+	g->v.gens[offset] = gen;
+
+	ret = bch2_trans_update(trans, &iter, &g->k_i, 0);
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+int bch2_trans_mark_alloc(struct btree_trans *trans,
+			  enum btree_id btree_id, unsigned level,
+			  struct bkey_s_c old, struct bkey_i *new,
+			  unsigned flags)
+{	/* For an alloc key change @old -> @new: fix up @new, then update index, lru and bucket_gens */
+	struct bch_fs *c = trans->c;
+	struct bch_alloc_v4 old_a_convert, *new_a;
+	const struct bch_alloc_v4 *old_a;
+	u64 old_lru, new_lru;
+	int ret = 0;
+
+	/*
+	 * Deletion only happens in the device removal path, with
+	 * BTREE_TRIGGER_NORUN:
+	 */
+	BUG_ON(new->k.type != KEY_TYPE_alloc_v4);
+
+	old_a = bch2_alloc_to_v4(old, &old_a_convert);
+	new_a = &bkey_i_to_alloc_v4(new)->v;	/* modified in place below */
+
+	new_a->data_type = alloc_data_type(*new_a, new_a->data_type);
+
+	if (new_a->dirty_sectors > old_a->dirty_sectors ||
+	    new_a->cached_sectors > old_a->cached_sectors) {	/* sector count grew */
+		new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+		new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
+		SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
+		SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
+	}
+
+	if (data_type_is_empty(new_a->data_type) &&
+	    BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
+	    !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) {
+		new_a->gen++;	/* empty and not open: apply the pending gen bump */
+		SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
+	}
+
+	if (old_a->data_type != new_a->data_type ||
+	    (new_a->data_type == BCH_DATA_free &&
+	     alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
+		ret =   bch2_bucket_do_index(trans, old, old_a, false) ?:	/* clear old entry, then set new */
+			bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true);
+		if (ret)
+			return ret;
+	}
+
+	if (new_a->data_type == BCH_DATA_cached &&
+	    !new_a->io_time[READ])	/* cached buckets always get a nonzero read time */
+		new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+
+	old_lru = alloc_lru_idx_read(*old_a);
+	new_lru = alloc_lru_idx_read(*new_a);
+
+	if (old_lru != new_lru) {
+		ret = bch2_lru_change(trans, new->k.p.inode,
+				      bucket_to_u64(new->k.p),
+				      old_lru, new_lru);
+		if (ret)
+			return ret;
+	}
+
+	new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a,
+					bch_dev_bkey_exists(c, new->k.p.inode));
+
+	if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
+		ret = bch2_lru_change(trans,
+				BCH_LRU_FRAGMENTATION_START,
+				bucket_to_u64(new->k.p),
+				old_a->fragmentation_lru, new_a->fragmentation_lru);
+		if (ret)
+			return ret;
+	}
+
+	if (old_a->gen != new_a->gen) {	/* mirror the gen change into the bucket_gens btree */
+		ret = bch2_bucket_gen_update(trans, new->k.p, new_a->gen);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for
+ * extents style btrees, but works on non-extents btrees:
+ */
+static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
+{	/* Return the key at iter->pos, or fill *hole with a deleted key spanning the gap */
+	struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+
+	if (bkey_err(k))
+		return k;
+
+	if (k.k->type) {	/* a real key exists at this position */
+		return k;
+	} else {
+		struct btree_iter iter2;	/* scratch copy used to find where the hole ends */
+		struct bpos next;
+
+		bch2_trans_copy_iter(&iter2, iter);
+
+		if (!bpos_eq(iter->path->l[0].b->key.k.p, SPOS_MAX))
+			end = bkey_min(end, bpos_nosnap_successor(iter->path->l[0].b->key.k.p));
+
+		end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));	/* cap hole length below U32_MAX */
+
+		/*
+		 * btree node min/max is a closed interval, upto takes a half
+		 * open interval:
+		 */
+		k = bch2_btree_iter_peek_upto(&iter2, end);
+		next = iter2.pos;	/* position the hole extends to */
+		bch2_trans_iter_exit(iter->trans, &iter2);
+
+		BUG_ON(next.offset >= iter->pos.offset + U32_MAX);
+
+		if (bkey_err(k))
+			return k;
+
+		bkey_init(hole);
+		hole->p = iter->pos;
+
+		bch2_key_resize(hole, next.offset - iter->pos.offset);
+		return (struct bkey_s_c) { hole, NULL };	/* synthesized key: no value pointer */
+	}
+}
+
+static bool next_bucket(struct bch_fs *c, struct bpos *bucket)
+{	/* Move *bucket onto a bucket that exists; false when __bch2_next_dev() finds no device */
+	struct bch_dev *dev;
+	unsigned dev_idx;
+	bool found;
+
+	if (bch2_dev_bucket_exists(c, *bucket))
+		return true;
+
+	if (bch2_dev_exists2(c, bucket->inode)) {
+		dev = bch_dev_bkey_exists(c, bucket->inode);
+		if (bucket->offset < dev->mi.first_bucket) {
+			/* before this device's first bucket: skip ahead to it */
+			bucket->offset = dev->mi.first_bucket;
+			return true;
+		}
+		bucket->inode += 1;	/* else: restart from bucket 0 of the next device index */
+		bucket->offset = 0;
+	}
+
+	rcu_read_lock();
+	dev_idx = bucket->inode;
+	dev = __bch2_next_dev(c, &dev_idx, NULL);
+	found = dev != NULL;
+	if (found)
+		*bucket = POS(dev->dev_idx, dev->mi.first_bucket);
+	rcu_read_unlock();
+	return found;
+}
+
+static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole)
+{	/* Like bch2_get_key_or_hole(), but a returned hole starts and ends on existing buckets */
+	struct bch_fs *c = iter->trans->c;
+	struct bkey_s_c k;
+again:
+	k = bch2_get_key_or_hole(iter, POS_MAX, hole);
+	if (bkey_err(k))
+		return k;
+
+	if (!k.k->type) {	/* got a hole: validate its start and end */
+		struct bpos bucket = bkey_start_pos(k.k);
+
+		if (!bch2_dev_bucket_exists(c, bucket)) {
+			if (!next_bucket(c, &bucket))
+				return bkey_s_c_null;	/* no further bucket to move to */
+
+			bch2_btree_iter_set_pos(iter, bucket);	/* retry from the bucket next_bucket() chose */
+			goto again;
+		}
+
+		if (!bch2_dev_bucket_exists(c, k.k->p)) {
+			struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
+
+			bch2_key_resize(hole, ca->mi.nbuckets - bucket.offset);	/* trim hole to end at nbuckets */
+		}
+	}
+
+	return k;
+}
+
+static noinline_for_stack
+int bch2_check_alloc_key(struct btree_trans *trans,
+			 struct bkey_s_c alloc_k,
+			 struct btree_iter *alloc_iter,
+			 struct btree_iter *discard_iter,
+			 struct btree_iter *freespace_iter,
+			 struct btree_iter *bucket_gens_iter)
+{	/* Check one alloc key against the need_discard, freespace and bucket_gens btrees */
+	struct bch_fs *c = trans->c;
+	struct bch_dev *ca;
+	struct bch_alloc_v4 a_convert;
+	const struct bch_alloc_v4 *a;
+	unsigned discard_key_type, freespace_key_type;	/* key types the other btrees should hold */
+	unsigned gens_offset;
+	struct bkey_s_c k;
+	struct printbuf buf = PRINTBUF;
+	int ret;
+
+	if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c,
+			"alloc key for invalid device:bucket %llu:%llu",
+			alloc_k.k->p.inode, alloc_k.k->p.offset))
+		return bch2_btree_delete_at(trans, alloc_iter, 0);	/* repair: delete the alloc key */
+
+	ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
+	if (!ca->mi.freespace_initialized)
+		return 0;	/* nothing is checked until this device flag is set */
+
+	a = bch2_alloc_to_v4(alloc_k, &a_convert);
+
+	discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0;
+	bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
+	k = bch2_btree_iter_peek_slot(discard_iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	if (k.k->type != discard_key_type &&	/* with reconstruct_alloc set, fix without reporting */
+	    (c->opts.reconstruct_alloc ||
+	     fsck_err(c, "incorrect key in need_discard btree (got %s should be %s)\n"
+		      "  %s",
+		      bch2_bkey_types[k.k->type],
+		      bch2_bkey_types[discard_key_type],
+		      (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+		struct bkey_i *update =
+			bch2_trans_kmalloc(trans, sizeof(*update));
+
+		ret = PTR_ERR_OR_ZERO(update);
+		if (ret)
+			goto err;
+
+		bkey_init(&update->k);
+		update->k.type	= discard_key_type;
+		update->k.p	= discard_iter->pos;
+
+		ret = bch2_trans_update(trans, discard_iter, update, 0);
+		if (ret)
+			goto err;
+	}
+
+	freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0;
+	bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
+	k = bch2_btree_iter_peek_slot(freespace_iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	if (k.k->type != freespace_key_type &&
+	    (c->opts.reconstruct_alloc ||
+	     fsck_err(c, "incorrect key in freespace btree (got %s should be %s)\n"
+		      "  %s",
+		      bch2_bkey_types[k.k->type],
+		      bch2_bkey_types[freespace_key_type],
+		      (printbuf_reset(&buf),
+		       bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+		struct bkey_i *update =
+			bch2_trans_kmalloc(trans, sizeof(*update));
+
+		ret = PTR_ERR_OR_ZERO(update);
+		if (ret)
+			goto err;
+
+		bkey_init(&update->k);
+		update->k.type	= freespace_key_type;
+		update->k.p	= freespace_iter->pos;
+		bch2_key_resize(&update->k, 1);	/* freespace entries are written with size 1 */
+
+		ret = bch2_trans_update(trans, freespace_iter, update, 0);
+		if (ret)
+			goto err;
+	}
+
+	bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
+	k = bch2_btree_iter_peek_slot(bucket_gens_iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	if (a->gen != alloc_gen(k, gens_offset) &&	/* alloc key's gen vs. the bucket_gens slot */
+	    (c->opts.reconstruct_alloc ||
+	     fsck_err(c, "incorrect gen in bucket_gens btree (got %u should be %u)\n"
+		      "  %s",
+		      alloc_gen(k, gens_offset), a->gen,
+		      (printbuf_reset(&buf),
+		       bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+		struct bkey_i_bucket_gens *g =
+			bch2_trans_kmalloc(trans, sizeof(*g));
+
+		ret = PTR_ERR_OR_ZERO(g);
+		if (ret)
+			goto err;
+
+		if (k.k->type == KEY_TYPE_bucket_gens) {	/* keep the other slots of an existing key */
+			bkey_reassemble(&g->k_i, k);
+		} else {
+			bkey_bucket_gens_init(&g->k_i);
+			g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset);
+		}
+
+		g->v.gens[gens_offset] = a->gen;
+
+		ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0);
+		if (ret)
+			goto err;
+	}
+err:
+fsck_err:
+	printbuf_exit(&buf);
+	return ret;
+}
+
+static noinline_for_stack
+int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
+				    struct bpos start,
+				    struct bpos *end,
+				    struct btree_iter *freespace_iter)
+{	/* For a hole in the alloc btree starting at @start: the freespace btree must have a set key */
+	struct bch_fs *c = trans->c;
+	struct bch_dev *ca;
+	struct bkey_s_c k;
+	struct printbuf buf = PRINTBUF;
+	int ret;
+
+	ca = bch_dev_bkey_exists(c, start.inode);
+	if (!ca->mi.freespace_initialized)
+		return 0;	/* nothing is checked until this device flag is set */
+
+	bch2_btree_iter_set_pos(freespace_iter, start);
+
+	k = bch2_btree_iter_peek_slot(freespace_iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	*end = bkey_min(k.k->p, *end);	/* shrink *end to the end of the freespace slot just read */
+
+	if (k.k->type != KEY_TYPE_set &&
+	    (c->opts.reconstruct_alloc ||
+	     fsck_err(c, "hole in alloc btree missing in freespace btree\n"
+		      "  device %llu buckets %llu-%llu",
+		      freespace_iter->pos.inode,
+		      freespace_iter->pos.offset,
+		      end->offset))) {
+		struct bkey_i *update =
+			bch2_trans_kmalloc(trans, sizeof(*update));
+
+		ret = PTR_ERR_OR_ZERO(update);
+		if (ret)
+			goto err;
+
+		bkey_init(&update->k);
+		update->k.type	= KEY_TYPE_set;
+		update->k.p	= freespace_iter->pos;
+		bch2_key_resize(&update->k,	/* size covers the hole, capped at U32_MAX */
+				min_t(u64, U32_MAX, end->offset -
+				      freespace_iter->pos.offset));
+
+		ret = bch2_trans_update(trans, freespace_iter, update, 0);
+		if (ret)
+			goto err;
+	}
+err:
+fsck_err:
+	printbuf_exit(&buf);
+	return ret;
+}
+
+static noinline_for_stack
+int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
+				      struct bpos start,
+				      struct bpos *end,
+				      struct btree_iter *bucket_gens_iter)
+{	/* For a hole in the alloc btree: the matching bucket_gens slots must all be zero */
+	struct bch_fs *c = trans->c;
+	struct bkey_s_c k;
+	struct printbuf buf = PRINTBUF;
+	unsigned i, gens_offset, gens_end_offset;	/* slot range [gens_offset, gens_end_offset) to check */
+	int ret;
+
+	if (c->sb.version < bcachefs_metadata_version_bucket_gens)
+		return 0;	/* skipped when the superblock version predates bucket_gens */
+
+	bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
+
+	k = bch2_btree_iter_peek_slot(bucket_gens_iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	if (bkey_cmp(alloc_gens_pos(start, &gens_offset),
+		     alloc_gens_pos(*end,  &gens_end_offset)))
+		gens_end_offset = KEY_TYPE_BUCKET_GENS_NR;	/* hole ends in another gens key: check to this key's end */
+
+	if (k.k->type == KEY_TYPE_bucket_gens) {
+		struct bkey_i_bucket_gens g;	/* stack copy that receives the repairs */
+		bool need_update = false;
+
+		bkey_reassemble(&g.k_i, k);
+
+		for (i = gens_offset; i < gens_end_offset; i++) {
+			if (fsck_err_on(g.v.gens[i], c,
+					"hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
+					bucket_gens_pos_to_alloc(k.k->p, i).inode,
+					bucket_gens_pos_to_alloc(k.k->p, i).offset,
+					g.v.gens[i])) {
+				g.v.gens[i] = 0;
+				need_update = true;
+			}
+		}
+
+		if (need_update) {
+			struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
+
+			ret = PTR_ERR_OR_ZERO(u);
+			if (ret)
+				goto err;
+
+			memcpy(u, &g, sizeof(g));	/* the update is queued from transaction memory */
+
+			ret = bch2_trans_update(trans, bucket_gens_iter, u, 0);
+			if (ret)
+				goto err;
+		}
+	}
+
+	*end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0));	/* stop at the next gens key */
+err:
+fsck_err:
+	printbuf_exit(&buf);
+	return ret;
+}
+
+static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_trans *trans,
+					      struct btree_iter *iter)
+{	/* Check the need_discard/freespace entry at iter->pos against the alloc key it refers to */
+	struct bch_fs *c = trans->c;
+	struct btree_iter alloc_iter;
+	struct bkey_s_c alloc_k;
+	struct bch_alloc_v4 a_convert;
+	const struct bch_alloc_v4 *a;
+	u64 genbits;
+	struct bpos pos;
+	enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard	/* data_type the alloc key must have */
+		? BCH_DATA_need_discard
+		: BCH_DATA_free;
+	struct printbuf buf = PRINTBUF;
+	int ret;
+
+	pos = iter->pos;
+	pos.offset &= ~(~0ULL << 56);	/* low 56 bits of the offset: used as the alloc btree offset */
+	genbits = iter->pos.offset & (~0ULL << 56);	/* top 8 bits: compared with alloc_freespace_genbits() */
+
+	alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);
+	ret = bkey_err(alloc_k);
+	if (ret)
+		return ret;
+
+	if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
+			"entry in %s btree for nonexistant dev:bucket %llu:%llu",
+			bch2_btree_ids[iter->btree_id], pos.inode, pos.offset))
+		goto delete;
+
+	a = bch2_alloc_to_v4(alloc_k, &a_convert);
+
+	if (fsck_err_on(a->data_type != state ||
+			(state == BCH_DATA_free &&
+			 genbits != alloc_freespace_genbits(*a)), c,
+			"%s\n  incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
+			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
+			bch2_btree_ids[iter->btree_id],
+			iter->pos.inode,
+			iter->pos.offset,
+			a->data_type == state,
+			genbits >> 56, alloc_freespace_genbits(*a) >> 56))
+		goto delete;
+out:
+fsck_err:
+	set_btree_iter_dontneed(&alloc_iter);
+	bch2_trans_iter_exit(trans, &alloc_iter);
+	printbuf_exit(&buf);
+	return ret;
+delete:	/* repair: delete the entry at @iter and commit immediately, then clean up via out */
+	ret =   bch2_btree_delete_extent_at(trans, iter,
+			iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
+		bch2_trans_commit(trans, NULL, NULL,
+			BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW);
+	goto out;
+}
+
+static int bch2_check_discard_freespace_key(struct btree_trans *trans,
+					    struct btree_iter *iter,
+					    struct bpos end)
+{	/* Non-extents btree: check the one key. Extents: check each position before @end */
+	int ret = 0;
+
+	if (!btree_id_is_extents(iter->btree_id))
+		return __bch2_check_discard_freespace_key(trans, iter);
+	for (; !bkey_eq(iter->pos, end);
+	     bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos))) {
+		ret = btree_trans_too_many_iters(trans) ?:
+			__bch2_check_discard_freespace_key(trans, iter);
+		if (ret)	/* stop on the first nonzero return */
+			break;
+	}
+	return ret;
+}
+
+/*
+ * We've already checked that generation numbers in the bucket_gens btree are
+ * valid for buckets that exist; this just checks for keys for nonexistent
+ * buckets.
+ */
+static noinline_for_stack
+int bch2_check_bucket_gens_key(struct btree_trans *trans,
+			       struct btree_iter *iter,
+			       struct bkey_s_c k)
+{	/* Delete @k, or zero some of its gens, where it covers a device or buckets that don't exist */
+	struct bch_fs *c = trans->c;
+	struct bkey_i_bucket_gens g;	/* stack copy that receives the repairs */
+	struct bch_dev *ca;
+	u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;	/* first bucket covered by @k */
+	u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;	/* one past the last */
+	u64 b;
+	bool need_update = false, dev_exists;
+	struct printbuf buf = PRINTBUF;
+	int ret = 0;
+
+	BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
+	bkey_reassemble(&g.k_i, k);
+
+	/* if no bch_dev, skip out whether we repair or not */
+	dev_exists = bch2_dev_exists2(c, k.k->p.inode);
+	if (!dev_exists) {
+		if (fsck_err_on(!dev_exists, c,
+				"bucket_gens key for invalid device:\n  %s",
+				(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+			ret = bch2_btree_delete_at(trans, iter, 0);
+		}
+		goto out;
+	}
+
+	ca = bch_dev_bkey_exists(c, k.k->p.inode);
+	if (fsck_err_on(end <= ca->mi.first_bucket ||	/* key lies entirely outside [first_bucket, nbuckets) */
+			start >= ca->mi.nbuckets, c,
+			"bucket_gens key for invalid buckets:\n  %s",
+			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+		ret = bch2_btree_delete_at(trans, iter, 0);
+		goto out;
+	}
+
+	for (b = start; b < ca->mi.first_bucket; b++)	/* slots below the device's first bucket */
+		if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
+				"bucket_gens key has nonzero gen for invalid bucket")) {
+			g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
+			need_update = true;
+		}
+
+	for (b = ca->mi.nbuckets; b < end; b++)	/* slots at or beyond nbuckets */
+		if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
+				"bucket_gens key has nonzero gen for invalid bucket")) {
+			g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
+			need_update = true;
+		}
+
+	if (need_update) {
+		struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
+
+		ret = PTR_ERR_OR_ZERO(u);
+		if (ret)
+			goto out;
+
+		memcpy(u, &g, sizeof(g));	/* the update is queued from transaction memory */
+		ret = bch2_trans_update(trans, iter, u, 0);
+	}
+out:
+fsck_err:
+	printbuf_exit(&buf);
+	return ret;
+}
+
+int bch2_check_alloc_info(struct bch_fs *c)
+{	/* Cross-check the alloc btree against the need_discard, freespace and bucket_gens btrees */
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
+	struct bkey hole;	/* storage for the synthesized key when the alloc btree has a gap */
+	struct bkey_s_c k;
+	int ret = 0;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
+			     BTREE_ITER_PREFETCH);
+	bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
+			     BTREE_ITER_PREFETCH);
+	bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
+			     BTREE_ITER_PREFETCH);
+	bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
+			     BTREE_ITER_PREFETCH);
+
+	while (1) {	/* first phase: walk alloc keys and holes, one commit per step */
+		struct bpos next;	/* where @iter moves once this step has committed */
+
+		bch2_trans_begin(trans);
+
+		k = bch2_get_key_or_real_bucket_hole(&iter, &hole);
+		ret = bkey_err(k);
+		if (ret)
+			goto bkey_err;
+
+		if (!k.k)	/* null key: no further buckets to check */
+			break;
+
+		if (k.k->type) {	/* a real alloc key */
+			next = bpos_nosnap_successor(k.k->p);
+
+			ret = bch2_check_alloc_key(trans,
+						   k, &iter,
+						   &discard_iter,
+						   &freespace_iter,
+						   &bucket_gens_iter);
+			if (ret)
+				goto bkey_err;
+		} else {	/* a hole; either helper may pull @next earlier */
+			next = k.k->p;
+
+			ret = bch2_check_alloc_hole_freespace(trans,
+						    bkey_start_pos(k.k),
+						    &next,
+						    &freespace_iter) ?:
+				bch2_check_alloc_hole_bucket_gens(trans,
+						    bkey_start_pos(k.k),
+						    &next,
+						    &bucket_gens_iter);
+			if (ret)
+				goto bkey_err;
+		}
+
+		ret = bch2_trans_commit(trans, NULL, NULL,
+					BTREE_INSERT_NOFAIL|
+					BTREE_INSERT_LAZY_RW);
+		if (ret)
+			goto bkey_err;
+
+		bch2_btree_iter_set_pos(&iter, next);
+bkey_err:
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			continue;	/* redo this step; @iter has not been advanced */
+		if (ret)
+			break;
+	}
+	bch2_trans_iter_exit(trans, &bucket_gens_iter);
+	bch2_trans_iter_exit(trans, &freespace_iter);
+	bch2_trans_iter_exit(trans, &discard_iter);
+	bch2_trans_iter_exit(trans, &iter);
+
+	if (ret < 0)
+		goto err;
+
+	ret = for_each_btree_key2(trans, iter,	/* second phase: check each need_discard, freespace, bucket_gens key */
+			BTREE_ID_need_discard, POS_MIN,
+			BTREE_ITER_PREFETCH, k,
+		bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
+	      for_each_btree_key2(trans, iter,
+			BTREE_ID_freespace, POS_MIN,
+			BTREE_ITER_PREFETCH, k,
+		bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
+	      for_each_btree_key_commit(trans, iter,
+			BTREE_ID_bucket_gens, POS_MIN,
+			BTREE_ITER_PREFETCH, k,
+			NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+		bch2_check_bucket_gens_key(trans, &iter, k));
+err:
+	bch2_trans_put(trans);
+	if (ret)
+		bch_err_fn(c, ret);
+	return ret;
+}
+
+static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
+				       struct btree_iter *alloc_iter)
+{	/* A cached bucket must have a nonzero read time and a set key at its lru position */
+	struct bch_fs *c = trans->c;
+	struct btree_iter lru_iter;
+	struct bch_alloc_v4 a_convert;
+	const struct bch_alloc_v4 *a;
+	struct bkey_s_c alloc_k, lru_k;
+	struct printbuf buf = PRINTBUF;
+	int ret;
+
+	alloc_k = bch2_btree_iter_peek(alloc_iter);
+	if (!alloc_k.k)
+		return 0;
+
+	ret = bkey_err(alloc_k);
+	if (ret)
+		return ret;
+
+	a = bch2_alloc_to_v4(alloc_k, &a_convert);
+
+	if (a->data_type != BCH_DATA_cached)
+		return 0;	/* only cached buckets are checked here */
+
+	lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
+			     lru_pos(alloc_k.k->p.inode,
+				     bucket_to_u64(alloc_k.k->p),
+				     a->io_time[READ]), 0);
+	ret = bkey_err(lru_k);
+	if (ret)
+		return ret;
+
+	if (fsck_err_on(!a->io_time[READ], c,
+			"cached bucket with read_time 0\n"
+			"  %s",
+		(printbuf_reset(&buf),
+		 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) ||
+	    fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
+			"missing lru entry\n"
+			"  %s",
+			(printbuf_reset(&buf),
+			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+		u64 read_time = a->io_time[READ] ?:	/* zero read time: use the current read clock instead */
+			atomic64_read(&c->io_clock[READ].now);
+
+		ret = bch2_lru_set(trans,
+				   alloc_k.k->p.inode,
+				   bucket_to_u64(alloc_k.k->p),
+				   read_time);
+		if (ret)
+			goto err;
+
+		if (a->io_time[READ] != read_time) {	/* also store the substituted time in the alloc key */
+			struct bkey_i_alloc_v4 *a_mut =
+				bch2_alloc_to_v4_mut(trans, alloc_k);
+			ret = PTR_ERR_OR_ZERO(a_mut);
+			if (ret)
+				goto err;
+
+			a_mut->v.io_time[READ] = read_time;
+			ret = bch2_trans_update(trans, alloc_iter,
+						&a_mut->k_i, BTREE_TRIGGER_NORUN);
+			if (ret)
+				goto err;
+		}
+	}
+err:
+fsck_err:
+	bch2_trans_iter_exit(trans, &lru_iter);
+	printbuf_exit(&buf);
+	return ret;
+}
+
+int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
+{	/* Run bch2_check_alloc_to_lru_ref() across the alloc btree via for_each_btree_key_commit() */
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	ret = bch2_trans_run(c,
+		for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
+				POS_MIN, BTREE_ITER_PREFETCH, k,
+				NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+			bch2_check_alloc_to_lru_ref(trans, &iter)));
+	if (ret)
+		bch_err_fn(c, ret);	/* log any failure before returning it */
+	return ret;
+}
+
+static int bch2_discard_one_bucket(struct btree_trans *trans,
+				   struct btree_iter *need_discard_iter,
+				   struct bpos *discard_pos_done,
+				   u64 *seen,
+				   u64 *open,
+				   u64 *need_journal_commit,
+				   u64 *discarded)
+{	/* Handle the need_discard entry at need_discard_iter->pos; the u64 pointers are running counters */
+	struct bch_fs *c = trans->c;
+	struct bpos pos = need_discard_iter->pos;
+	struct btree_iter iter = { NULL };
+	struct bkey_s_c k;
+	struct bch_dev *ca;
+	struct bkey_i_alloc_v4 *a;
+	struct printbuf buf = PRINTBUF;
+	int ret = 0;
+
+	ca = bch_dev_bkey_exists(c, pos.inode);
+	if (!percpu_ref_tryget(&ca->io_ref)) {	/* no io ref: skip ahead to the next device index */
+		bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
+		return 0;
+	}
+
+	if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
+		(*open)++;	/* bucket is open: count it and leave it alone */
+		goto out;
+	}
+
+	if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+			c->journal.flushed_seq_ondisk,
+			pos.inode, pos.offset)) {
+		(*need_journal_commit)++;	/* count it and leave it alone */
+		goto out;
+	}
+
+	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
+			       need_discard_iter->pos,
+			       BTREE_ITER_CACHED);
+	ret = bkey_err(k);
+	if (ret)
+		goto out;
+
+	a = bch2_alloc_to_v4_mut(trans, k);
+	ret = PTR_ERR_OR_ZERO(a);
+	if (ret)
+		goto out;
+
+	if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {	/* pending gen bump: write it now, no discard on this call */
+		a->v.gen++;
+		SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
+		goto write;
+	}
+
+	if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
+		if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {	/* only an error after that pass */
+			bch2_trans_inconsistent(trans,
+				"clearing need_discard but journal_seq %llu > flushed_seq %llu\n"
+				"%s",
+				a->v.journal_seq,
+				c->journal.flushed_seq_ondisk,
+				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+			ret = -EIO;
+		}
+		goto out;
+	}
+
+	if (a->v.data_type != BCH_DATA_need_discard) {
+		if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {	/* only an error after that pass */
+			bch2_trans_inconsistent(trans,
+				"bucket incorrectly set in need_discard btree\n"
+				"%s",
+				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+			ret = -EIO;
+		}
+
+		goto out;
+	}
+
+	if (!bkey_eq(*discard_pos_done, iter.pos) &&	/* skip if a discard was already issued at this position */
+	    ca->mi.discard && !c->opts.nochanges) {
+		/*
+		 * This works without any other locks because this is the only
+		 * thread that removes items from the need_discard tree
+		 */
+		bch2_trans_unlock(trans);
+		blkdev_issue_discard(ca->disk_sb.bdev,
+				     k.k->p.offset * ca->mi.bucket_size,	/* NOTE(review): k is read after bch2_trans_unlock() — confirm it is still valid */
+				     ca->mi.bucket_size,
+				     GFP_KERNEL);
+		*discard_pos_done = iter.pos;	/* remembered by the caller across calls */
+
+		ret = bch2_trans_relock_notrace(trans);
+		if (ret)
+			goto out;
+	}
+
+	SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
+	a->v.data_type = alloc_data_type(a->v, a->v.data_type);
+write:
+	ret =   bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
+		bch2_trans_commit(trans, NULL, NULL,
+				  BCH_WATERMARK_btree|
+				  BTREE_INSERT_NOFAIL);
+	if (ret)
+		goto out;
+
+	this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]);
+	(*discarded)++;	/* counted only after the commit succeeded */
+out:
+	(*seen)++;	/* counted on every path that took the io ref */
+	bch2_trans_iter_exit(trans, &iter);
+	percpu_ref_put(&ca->io_ref);
+	printbuf_exit(&buf);
+	return ret;
+}
+
+static void bch2_do_discards_work(struct work_struct *work)
+{	/* Work item: run bch2_discard_one_bucket() over the need_discard btree, then drop the write ref */
+	struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;	/* totals reported to the tracepoint */
+	struct bpos discard_pos_done = POS_MAX;	/* updated by bch2_discard_one_bucket() after each discard */
+	int ret;
+
+	/*
+	 * We're doing the commit in bch2_discard_one_bucket instead of using
+	 * for_each_btree_key_commit() so that we can increment counters after
+	 * successful commit:
+	 */
+	ret = bch2_trans_run(c,
+		for_each_btree_key2(trans, iter,
+				BTREE_ID_need_discard, POS_MIN, 0, k,
+			bch2_discard_one_bucket(trans, &iter, &discard_pos_done,
+						&seen,
+						&open,
+						&need_journal_commit,
+						&discarded)));
+
+	if (need_journal_commit * 2 > seen)	/* more than half the buckets seen were waiting on the journal */
+		bch2_journal_flush_async(&c->journal, NULL);
+
+	bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+
+	trace_discard_buckets(c, seen, open, need_journal_commit, discarded,
+			      bch2_err_str(ret));
+}
+
+void bch2_do_discards(struct bch_fs *c)
+{	/* Queue c->discard_work on c->write_ref_wq while holding a BCH_WRITE_REF_discard ref */
+	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) &&
+	    !queue_work(c->write_ref_wq, &c->discard_work))
+		bch2_write_ref_put(c, BCH_WRITE_REF_discard);	/* queue_work() returned false: drop the ref just taken */
+}
+
+static int invalidate_one_bucket(struct btree_trans *trans,
+				 struct btree_iter *lru_iter,
+				 struct bkey_s_c lru_k,
+				 s64 *nr_to_invalidate)
+{	/* Invalidate the bucket that @lru_k points at; returns 1 once *nr_to_invalidate is used up */
+	struct bch_fs *c = trans->c;
+	struct btree_iter alloc_iter = { NULL };
+	struct bkey_i_alloc_v4 *a = NULL;
+	struct printbuf buf = PRINTBUF;
+	struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
+	unsigned cached_sectors;
+	int ret = 0;
+
+	if (*nr_to_invalidate <= 0)
+		return 1;
+
+	if (!bch2_dev_bucket_exists(c, bucket)) {
+		prt_str(&buf, "lru entry points to invalid bucket");
+		goto err;
+	}
+
+	if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
+		return 0;	/* open buckets are left alone */
+
+	a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket);
+	ret = PTR_ERR_OR_ZERO(a);
+	if (ret)
+		goto out;
+
+	/* We expect harmless races here due to the btree write buffer: */
+	if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
+		goto out;
+
+	BUG_ON(a->v.data_type != BCH_DATA_cached);
+
+	if (!a->v.cached_sectors)
+		bch_err(c, "invalidating empty bucket, confused");
+
+	cached_sectors = a->v.cached_sectors;	/* saved for the tracepoint before being zeroed */
+
+	SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
+	a->v.gen++;
+	a->v.data_type		= 0;
+	a->v.dirty_sectors	= 0;
+	a->v.cached_sectors	= 0;
+	a->v.io_time[READ]	= atomic64_read(&c->io_clock[READ].now);
+	a->v.io_time[WRITE]	= atomic64_read(&c->io_clock[WRITE].now);
+
+	ret =   bch2_trans_update(trans, &alloc_iter, &a->k_i,
+				BTREE_TRIGGER_BUCKET_INVALIDATE) ?:
+		bch2_trans_commit(trans, NULL, NULL,
+				  BCH_WATERMARK_btree|
+				  BTREE_INSERT_NOFAIL);
+	if (ret)
+		goto out;
+
+	trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
+	--*nr_to_invalidate;	/* one fewer bucket left to invalidate */
+out:
+	bch2_trans_iter_exit(trans, &alloc_iter);
+	printbuf_exit(&buf);
+	return ret;
+err:	/* append context to the message in @buf, log it, then leave through out */
+	prt_str(&buf, "\n  lru key: ");
+	bch2_bkey_val_to_text(&buf, c, lru_k);
+
+	prt_str(&buf, "\n  lru entry: ");
+	bch2_lru_pos_to_text(&buf, lru_iter->pos);
+
+	prt_str(&buf, "\n  alloc key: ");
+	if (!a)
+		bch2_bpos_to_text(&buf, bucket);
+	else
+		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
+
+	bch_err(c, "%s", buf.buf);
+	if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) {	/* otherwise it is logged only and ret stays 0 */
+		bch2_inconsistent_error(c);
+		ret = -EINVAL;
+	}
+
+	goto out;
+}
+
+static void bch2_do_invalidates_work(struct work_struct *work)
+{	/* Work item: for each member device, walk its lru btree entries and invalidate cached buckets */
+	struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
+	struct bch_dev *ca;
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	unsigned i;
+	int ret = 0;
+
+	ret = bch2_btree_write_buffer_flush(trans);	/* done before the lru btree is read */
+	if (ret)
+		goto err;
+
+	for_each_member_device(ca, c, i) {
+		s64 nr_to_invalidate =
+			should_invalidate_buckets(ca, bch2_dev_usage_read(ca));	/* quota for this device */
+
+		ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru,
+				lru_pos(ca->dev_idx, 0, 0),
+				lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
+				BTREE_ITER_INTENT, k,
+			invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate));
+
+		if (ret < 0) {
+			percpu_ref_put(&ca->ref);	/* presumably the ref for_each_member_device() holds; confirm */
+			break;
+		}
+	}
+err:
+	bch2_trans_put(trans);
+	bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);	/* pairs with the tryget in bch2_do_invalidates() */
+}
+
+void bch2_do_invalidates(struct bch_fs *c)
+{
+	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))	/* ref owned by the work item */
+		if (!queue_work(c->write_ref_wq, &c->invalidate_work))
+			bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);	/* nothing newly queued */
+}
+
+int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
+			    u64 bucket_start, u64 bucket_end)
+{	/* Index buckets up to @bucket_end on @ca, then set the member's FREESPACE_INITIALIZED flag */
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bkey hole;
+	struct bpos end = POS(ca->dev_idx, bucket_end);
+	struct bch_member *m;
+	unsigned long last_updated = jiffies;
+	int ret;
+
+	BUG_ON(bucket_start > bucket_end);
+	BUG_ON(bucket_end > ca->mi.nbuckets);
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+		POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
+		BTREE_ITER_PREFETCH);
+	/*
+	 * Scan the alloc btree for every bucket on @ca, and add buckets to the
+	 * freespace/need_discard/need_gc_gens btrees as needed:
+	 */
+	while (1) {
+		if (time_after(jiffies, last_updated + HZ * 10)) {	/* wrap-safe: progress line every 10s */
+			bch_info(ca, "%s: currently at %llu/%llu",
+				 __func__, iter.pos.offset, ca->mi.nbuckets);
+			last_updated = jiffies;
+		}
+
+		bch2_trans_begin(trans);
+
+		if (bkey_ge(iter.pos, end)) {
+			ret = 0;	/* reached the end of the range: done */
+			break;
+		}
+
+		k = bch2_get_key_or_hole(&iter, end, &hole);
+		ret = bkey_err(k);
+		if (ret)
+			goto bkey_err;
+
+		if (k.k->type) {
+			/*
+			 * We process live keys in the alloc btree one at a
+			 * time:
+			 */
+			struct bch_alloc_v4 a_convert;
+			const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
+
+			ret =   bch2_bucket_do_index(trans, k, a, true) ?:
+				bch2_trans_commit(trans, NULL, NULL,
+						  BTREE_INSERT_LAZY_RW|
+						  BTREE_INSERT_NOFAIL);
+			if (ret)
+				goto bkey_err;
+
+			bch2_btree_iter_advance(&iter);
+		} else {	/* a hole: one freespace key covers the whole run */
+			struct bkey_i *freespace;
+
+			freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
+			ret = PTR_ERR_OR_ZERO(freespace);
+			if (ret)
+				goto bkey_err;
+
+			bkey_init(&freespace->k);
+			freespace->k.type	= KEY_TYPE_set;
+			freespace->k.p		= k.k->p;
+			freespace->k.size	= k.k->size;
+
+			ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
+				bch2_trans_commit(trans, NULL, NULL,
+						  BTREE_INSERT_LAZY_RW|
+						  BTREE_INSERT_NOFAIL);
+			if (ret)
+				goto bkey_err;
+
+			bch2_btree_iter_set_pos(&iter, k.k->p);	/* continue from the end of the hole */
+		}
+bkey_err:
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			continue;	/* iterator position unchanged: redo this step */
+		if (ret)
+			break;
+	}
+
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
+
+	if (ret < 0) {
+		bch_err_msg(ca, ret, "initializing free space");
+		return ret;
+	}
+
+	mutex_lock(&c->sb_lock);	/* flag is only set here; no superblock write in this function */
+	m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+	SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
+	mutex_unlock(&c->sb_lock);
+
+	return 0;
+}
+
+int bch2_fs_freespace_init(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned i;
+	bool announced = false;
+	int ret;
+
+	/*
+	 * Device add can crash part way through, so this is rechecked on every
+	 * mount for each member that is not yet flagged as initialized:
+	 */
+	for_each_member_device(ca, c, i) {
+		if (ca->mi.freespace_initialized)
+			continue;
+
+		if (!announced) {
+			announced = true;
+			bch_info(c, "initializing freespace");
+		}
+
+		ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
+		if (ret) {
+			percpu_ref_put(&ca->ref);
+			bch_err_fn(c, ret);
+			return ret;
+		}
+	}
+
+	if (!announced)
+		return 0;
+
+	/* Write out the flags bch2_dev_freespace_init() set on the members: */
+	mutex_lock(&c->sb_lock);
+	bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+	bch_verbose(c, "done initializing freespace");
+	return 0;
+}
+
+/* Bucket IO clocks: */
+
+int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
+			      size_t bucket_nr, int rw)
+{
+	struct btree_iter iter;
+	struct bkey_i_alloc_v4 *a =
+		bch2_trans_start_alloc_update(trans, &iter,  POS(dev, bucket_nr));
+	u64 now;
+	int ret = PTR_ERR_OR_ZERO(a);
+
+	if (ret)
+		return ret;
+
+	/* Stamp the bucket with the current io clock for direction @rw: */
+	now = atomic64_read(&trans->c->io_clock[rw].now);
+	if (a->v.io_time[rw] != now) {
+		a->v.io_time[rw] = now;
+
+		ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+		if (!ret)
+			ret = bch2_trans_commit(trans, NULL, NULL, 0);
+	}
+
+	/* Reached both when nothing changed and after the commit attempt: */
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+/* Startup/shutdown (ro/rw): */
+
+void bch2_recalc_capacity(struct bch_fs *c)
+{	/* Recompute c->capacity, c->bucket_size_max and readahead from the current member devices */
+	struct bch_dev *ca;
+	u64 capacity = 0, reserved_sectors = 0, gc_reserve;
+	unsigned bucket_size_max = 0;
+	unsigned long ra_pages = 0;
+	unsigned i;
+
+	lockdep_assert_held(&c->state_lock);
+
+	for_each_online_member(ca, c, i) {	/* readahead: sum over online members */
+		struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;
+
+		ra_pages += bdi->ra_pages;
+	}
+
+	bch2_set_ra_pages(c, ra_pages);
+
+	for_each_rw_member(ca, c, i) {	/* capacity and reserve: rw members only */
+		u64 dev_reserve = 0;
+
+		/*
+		 * We need to reserve buckets (from the number
+		 * of currently available buckets) against
+		 * foreground writes so that mainly copygc can
+		 * make forward progress.
+		 *
+		 * We need enough to refill the various reserves
+		 * from scratch - copygc will use its entire
+		 * reserve all at once, then run again when
+		 * its reserve is refilled (from the formerly
+		 * available buckets).
+		 *
+		 * This reserve is just used when considering if
+		 * allocations for foreground writes must wait -
+		 * not -ENOSPC calculations.
+		 */
+
+		dev_reserve += ca->nr_btree_reserve * 2;
+		dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
+
+		dev_reserve += 1;	/* btree write point */
+		dev_reserve += 1;	/* copygc write point */
+		dev_reserve += 1;	/* rebalance write point */
+
+		dev_reserve *= ca->mi.bucket_size;	/* buckets -> sectors */
+
+		capacity += bucket_to_sector(ca, ca->mi.nbuckets -
+					     ca->mi.first_bucket);
+
+		reserved_sectors += dev_reserve * 2;
+
+		bucket_size_max = max_t(unsigned, bucket_size_max,
+					ca->mi.bucket_size);
+	}
+
+	gc_reserve = c->opts.gc_reserve_bytes	/* an explicit byte count overrides the percentage */
+		? c->opts.gc_reserve_bytes >> 9
+		: div64_u64(capacity * c->opts.gc_reserve_percent, 100);
+
+	reserved_sectors = max(gc_reserve, reserved_sectors);
+
+	reserved_sectors = min(reserved_sectors, capacity);	/* keeps the subtraction below from underflowing */
+
+	c->capacity = capacity - reserved_sectors;
+
+	c->bucket_size_max = bucket_size_max;
+
+	/* Wake up in case someone was waiting for buckets */
+	closure_wake_up(&c->freelist_wait);
+}
+
+static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
+{
+	bool found = false;
+	size_t idx;
+
+	for (idx = 0; idx < ARRAY_SIZE(c->open_buckets); idx++) {
+		struct open_bucket *ob = &c->open_buckets[idx];
+
+		spin_lock(&ob->lock);
+		/* Valid slots parked on the partial list do not count: */
+		if (ob->valid && !ob->on_partial_list && ob->dev == ca->dev_idx)
+			found = true;
+		spin_unlock(&ob->lock);
+	}
+
+	return found;
+}
+
+/* device goes ro: take @ca out of the allocation groups and wait for its open buckets to drain */
+void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
+{
+	unsigned i;
+
+	/* First, remove device from allocation groups: */
+
+	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+		clear_bit(ca->dev_idx, c->rw_devs[i].d);
+
+	/*
+	 * Capacity is calculated based off of devices in allocation groups:
+	 */
+	bch2_recalc_capacity(c);
+
+	bch2_open_buckets_stop(c, ca, false);
+
+	/*
+	 * Wake up threads that were blocked on allocation, so they can notice
+	 * the device has left the allocation groups and the capacity has changed:
+	 */
+	closure_wake_up(&c->freelist_wait);
+
+	/*
+	 * journal_res_get() can block waiting for free space in the journal -
+	 * it needs to notice there may not be devices to allocate from anymore:
+	 */
+	wake_up(&c->journal.wait);
+
+	/* Now wait for any in flight writes: */
+
+	closure_wait_event(&c->open_buckets_wait,
+			   !bch2_dev_has_open_write_point(c, ca));
+}
+
+/* device goes rw: join every rw_devs[] group whose bit is set in ca->mi.data_allowed */
+void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
+{
+	unsigned group = 0;
+
+	for (; group < ARRAY_SIZE(c->rw_devs); group++)
+		if (ca->mi.data_allowed & (1 << group))
+			set_bit(ca->dev_idx, c->rw_devs[group].d);
+}
+
+void bch2_fs_allocator_background_init(struct bch_fs *c)
+{
+	INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);	/* queued by bch2_do_invalidates() */
+	INIT_WORK(&c->discard_work, bch2_do_discards_work);	/* queued by bch2_do_discards() */
+	spin_lock_init(&c->freelist_lock);
+}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
new file mode 100644
index 000000000000..97042067d2a9
--- /dev/null
+++ b/fs/bcachefs/alloc_background.h
@@ -0,0 +1,258 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ALLOC_BACKGROUND_H
+#define _BCACHEFS_ALLOC_BACKGROUND_H
+
+#include "bcachefs.h"
+#include "alloc_types.h"
+#include "buckets.h"
+#include "debug.h"
+#include "super.h"
+
+enum bkey_invalid_flags;
+
+/* How out of date a pointer gen is allowed to be: */
+#define BUCKET_GC_GEN_MAX	96U
+
+static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos)
+{
+	struct bch_dev *ca;
+
+	/* pos.inode names the device, pos.offset the bucket on it: */
+	if (!bch2_dev_exists2(c, pos.inode))
+		return false;
+	ca = bch_dev_bkey_exists(c, pos.inode);
+	return !(pos.offset < ca->mi.first_bucket ||
+		 pos.offset >= ca->mi.nbuckets);
+}
+
+static inline u64 bucket_to_u64(struct bpos bucket)
+{
+	return bucket.offset | (bucket.inode << 48);	/* device index goes above bit 48 */
+}
+
+static inline struct bpos u64_to_bucket(u64 bucket)
+{
+	return POS(bucket >> 48, bucket & ((1ULL << 48) - 1));	/* split at bit 48: device, bucket */
+}
+
+static inline u8 alloc_gc_gen(struct bch_alloc_v4 a)
+{
+	return (u8) (a.gen - a.oldest_gen);	/* distance between gen and oldest_gen, modulo 256 */
+}
+
+static inline enum bch_data_type __alloc_data_type(u32 dirty_sectors,
+						   u32 cached_sectors,
+						   u32 stripe,
+						   struct bch_alloc_v4 a,
+						   enum bch_data_type data_type)
+{
+	/* First match wins, checked in this order: */
+	if (stripe)
+		return data_type == BCH_DATA_parity ? BCH_DATA_parity : BCH_DATA_stripe;
+	if (dirty_sectors)
+		return data_type;
+	if (cached_sectors)
+		return BCH_DATA_cached;
+	if (BCH_ALLOC_V4_NEED_DISCARD(&a))
+		return BCH_DATA_need_discard;
+	return alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX
+		? BCH_DATA_need_gc_gens : BCH_DATA_free;
+}
+
+static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
+						 enum bch_data_type data_type)
+{	/* Classify @a using its own dirty/cached sector counts and stripe field */
+	return __alloc_data_type(a.dirty_sectors, a.cached_sectors,
+				 a.stripe, a, data_type);
+}
+
+static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type)
+{
+	return data_type != BCH_DATA_stripe ? data_type : BCH_DATA_user;	/* stripe is reported as user */
+}
+
+static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a)
+{
+	return a.data_type != BCH_DATA_cached ? 0 : a.io_time[READ];	/* 0 unless the bucket is cached */
+}
+
+#define DATA_TYPES_MOVABLE		\
+	((1U << BCH_DATA_stripe)|	\
+	 (1U << BCH_DATA_user)|		\
+	 (1U << BCH_DATA_btree))
+
+static inline bool data_type_movable(enum bch_data_type type)
+{
+	return !!(DATA_TYPES_MOVABLE & (1U << type));	/* btree, user and stripe only */
+}
+
+static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
+					      struct bch_dev *ca)
+{
+	/* dirty_sectors / bucket_size scaled by 2^31; 0 if not movable or already full: */
+	if (data_type_movable(a.data_type) &&
+	    a.dirty_sectors < ca->mi.bucket_size)
+		return div_u64((u64) a.dirty_sectors << 31, ca->mi.bucket_size);
+	return 0;
+}
+
+static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a)
+{
+	return (u64) (alloc_gc_gen(a) >> 4) << 56;	/* high nibble of the gc gen, moved to bit 56 and up */
+}
+
+static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a)
+{	/* Freespace position for a bucket: @pos with the gen bits of @a OR'd into the offset */
+	pos.offset |= alloc_freespace_genbits(a);	/* other fields of the by-value copy are untouched */
+	return pos;
+}
+
+static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a)
+{
+	/* Fixed part (the V0 length when no start is recorded), then the backpointers: */
+	unsigned hdr = BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: BCH_ALLOC_V4_U64s_V0;
+	unsigned bps = BCH_ALLOC_V4_NR_BACKPOINTERS(a) *
+		(sizeof(struct bch_backpointer) / sizeof(u64));
+
+	BUG_ON(hdr + bps > U8_MAX - BKEY_U64s);
+	return hdr + bps;
+}
+
+static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a)
+{	/* Set the key's value size from the length alloc_v4_u64s() computes for its value */
+	set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v));
+}
+
+struct bkey_i_alloc_v4 *
+bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos);
+
+void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *);
+
+static inline const struct bch_alloc_v4 *bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *convert)
+{
+	/* Fast path: a v4 key whose backpointers start where expected is used in place */
+	if (likely(k.k->type == KEY_TYPE_alloc_v4)) {
+		const struct bch_alloc_v4 *v4 = bkey_s_c_to_alloc_v4(k).v;
+
+		if (BCH_ALLOC_V4_BACKPOINTERS_START(v4) == BCH_ALLOC_V4_U64s)
+			return v4;
+	}
+
+	/*
+	 * Everything else is run through __bch2_alloc_to_v4() into @convert:
+	 */
+	__bch2_alloc_to_v4(k, convert);
+	return convert;
+}
+
+struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c);
+
+int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
+
+int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c,
+			  enum bkey_invalid_flags, struct printbuf *);
+int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c,
+			  enum bkey_invalid_flags, struct printbuf *);
+int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c,
+			  enum bkey_invalid_flags, struct printbuf *);
+int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c,
+			  enum bkey_invalid_flags, struct printbuf *);
+void bch2_alloc_v4_swab(struct bkey_s);
+void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_alloc ((struct bkey_ops) {	/* checked by bch2_alloc_v1_invalid */ \
+	.key_invalid	= bch2_alloc_v1_invalid,	\
+	.val_to_text	= bch2_alloc_to_text,		\
+	.trans_trigger	= bch2_trans_mark_alloc,	\
+	.atomic_trigger	= bch2_mark_alloc,		\
+	.min_val_size	= 8,				\
+})
+
+#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) {	/* differs from the first only in key_invalid */ \
+	.key_invalid	= bch2_alloc_v2_invalid,	\
+	.val_to_text	= bch2_alloc_to_text,		\
+	.trans_trigger	= bch2_trans_mark_alloc,	\
+	.atomic_trigger	= bch2_mark_alloc,		\
+	.min_val_size	= 8,				\
+})
+
+#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) {	/* larger min_val_size than v1/v2 */ \
+	.key_invalid	= bch2_alloc_v3_invalid,	\
+	.val_to_text	= bch2_alloc_to_text,		\
+	.trans_trigger	= bch2_trans_mark_alloc,	\
+	.atomic_trigger	= bch2_mark_alloc,		\
+	.min_val_size	= 16,				\
+})
+
+#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) {	/* the only variant with a .swab hook */ \
+	.key_invalid	= bch2_alloc_v4_invalid,	\
+	.val_to_text	= bch2_alloc_to_text,		\
+	.swab		= bch2_alloc_v4_swab,		\
+	.trans_trigger	= bch2_trans_mark_alloc,	\
+	.atomic_trigger	= bch2_mark_alloc,		\
+	.min_val_size	= 48,				\
+})
+
+int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c,
+			     enum bkey_invalid_flags, struct printbuf *);
+void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) {	/* validation and printing only: no triggers set */ \
+	.key_invalid	= bch2_bucket_gens_invalid,	\
+	.val_to_text	= bch2_bucket_gens_to_text,	\
+})
+
+int bch2_bucket_gens_init(struct bch_fs *);
+
+static inline bool bkey_is_alloc(const struct bkey *k)
+{
+	/* Matches the three older alloc key types only; KEY_TYPE_alloc_v4 is not tested for */
+	return !(k->type != KEY_TYPE_alloc && k->type != KEY_TYPE_alloc_v2 &&
+		 k->type != KEY_TYPE_alloc_v3);
+}
+
+int bch2_alloc_read(struct bch_fs *);
+
+int bch2_trans_mark_alloc(struct btree_trans *, enum btree_id, unsigned,
+			  struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_check_alloc_info(struct bch_fs *);
+int bch2_check_alloc_to_lru_refs(struct bch_fs *);
+void bch2_do_discards(struct bch_fs *);
+
+static inline u64 should_invalidate_buckets(struct bch_dev *ca,
+					    struct bch_dev_usage u)
+{	/* Number of cached buckets to invalidate so that free buckets reach the target below */
+	u64 want_free = ca->mi.nbuckets >> 7;	/* target: 1/128th of the device's buckets */
+	u64 free = max_t(s64, 0,	/* free + need_discard, less the stripe-watermark reserve, floored at 0 */
+			   u.d[BCH_DATA_free].buckets
+			 + u.d[BCH_DATA_need_discard].buckets
+			 - bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe));
+
+	return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets);	/* shortfall, capped at the cached count */
+}
+
+void bch2_do_invalidates(struct bch_fs *);
+
+static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a)
+{	/* Pointer to the backpointers area: BACKPOINTERS_START u64s past &a->v */
+	return (void *) ((u64 *) &a->v +
+			 (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:	/* a start of 0 falls back to the V0 length */
+			  BCH_ALLOC_V4_U64s_V0));
+}
+
+static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a)
+{	/* Const variant. NOTE(review): unlike alloc_v4_backpointers() there is no V0 fallback for a start of 0 - confirm callers */
+	return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a));
+}
+
+int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64);
+int bch2_fs_freespace_init(struct bch_fs *);
+
+void bch2_recalc_capacity(struct bch_fs *);
+
+void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
+void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
+
+void bch2_fs_allocator_background_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
new file mode 100644
index 000000000000..3bc4abd3d7d5
--- /dev/null
+++ b/fs/bcachefs/alloc_foreground.c
@@ -0,0 +1,1576 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2012 Google, Inc.
+ *
+ * Foreground allocator code: allocate buckets from freelist, and allocate in
+ * sector granularity from writepoints.
+ *
+ * bch2_bucket_alloc() allocates a single bucket from a specific device.
+ *
+ * bch2_bucket_alloc_set() allocates one or more buckets from different devices
+ * in a given filesystem.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "backpointers.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "buckets_waiting_for_journal.h"
+#include "clock.h"
+#include "debug.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "io_write.h"
+#include "journal.h"
+#include "movinggc.h"
+#include "nocow_locking.h"
+#include "trace.h"
+
+#include <linux/math64.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+
+static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
+					   struct mutex *lock)
+{
+	if (mutex_trylock(lock))
+		return;			/* got it without blocking */
+	bch2_trans_unlock(trans);	/* unlock the transaction before blocking; not relocked here */
+	mutex_lock(lock);
+}
+
+const char * const bch2_watermarks[] = {	/* one string per BCH_WATERMARKS() entry, NULL terminated */
+#define x(t) #t,
+	BCH_WATERMARKS()
+#undef x
+	NULL
+};
+
+/*
+ * Open buckets represent a bucket that's currently being allocated from.  They
+ * serve two purposes:
+ *
+ *  - They track buckets that have been partially allocated, allowing for
+ *    sub-bucket sized allocations - they're used by the sector allocator below
+ *
+ *  - They provide a reference to the buckets they own that mark and sweep GC
+ *    can find, until the new allocation has a pointer to it inserted into the
+ *    btree
+ *
+ * When allocating some space with the sector allocator, the allocation comes
+ * with a reference to an open bucket - the caller is required to put that
+ * reference _after_ doing the index update that makes its allocation reachable.
+ */
+
+void bch2_reset_alloc_cursors(struct bch_fs *c)
+{	/* Zero the allocation cursor of every member device */
+	struct bch_dev *ca;
+	unsigned i;
+
+	rcu_read_lock();	/* required by the _rcu device iterator */
+	for_each_member_device_rcu(ca, c, i, NULL)
+		ca->alloc_cursor = 0;
+	rcu_read_unlock();
+}
+
+static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
+{
+	open_bucket_idx_t *head = open_bucket_hashslot(c, ob->dev, ob->bucket);
+
+	/* Push @ob on the front of its chain; entries link by index via ->hash */
+	ob->hash = *head;
+	*head = ob - c->open_buckets;
+}
+
+static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
+{
+	open_bucket_idx_t self = ob - c->open_buckets;
+	open_bucket_idx_t *link;
+
+	for (link = open_bucket_hashslot(c, ob->dev, ob->bucket);
+	     *link != self;
+	     link = &c->open_buckets[*link].hash)
+		BUG_ON(!*link);	/* index 0 ends a chain: @ob was not on it */
+
+	*link = ob->hash;
+	ob->hash = 0;
+}
+
+void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
+{	/* Release @ob: invalidate it, unhash it and push its slot back on the open bucket freelist */
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+
+	if (ob->ec) {
+		ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);	/* only the stripe ref is dropped; slot not freed here */
+		return;
+	}
+
+	percpu_down_read(&c->mark_lock);
+	spin_lock(&ob->lock);
+
+	ob->valid = false;
+	ob->data_type = 0;
+
+	spin_unlock(&ob->lock);
+	percpu_up_read(&c->mark_lock);
+
+	spin_lock(&c->freelist_lock);	/* hash table, freelist and counters change under this lock */
+	bch2_open_bucket_hash_remove(c, ob);
+
+	ob->freelist = c->open_buckets_freelist;
+	c->open_buckets_freelist = ob - c->open_buckets;
+
+	c->open_buckets_nr_free++;
+	ca->nr_open_buckets--;
+	spin_unlock(&c->freelist_lock);
+
+	closure_wake_up(&c->open_buckets_wait);	/* a slot is free again */
+}
+
+void bch2_open_bucket_write_error(struct bch_fs *c,
+				  struct open_buckets *obs,
+				  unsigned dev)
+{	/* Call bch2_ec_bucket_cancel() on each bucket in @obs that is on @dev and has ->ec set */
+	struct open_bucket *ob;
+	unsigned i;
+
+	open_bucket_for_each(c, obs, ob, i)
+		if (ob->dev == dev && ob->ec)
+			bch2_ec_bucket_cancel(c, ob);
+}
+
+static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
+{
+	struct open_bucket *ob = &c->open_buckets[c->open_buckets_freelist];
+
+	BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
+
+	/* Pop the head of the index-linked freelist: */
+	c->open_buckets_freelist = ob->freelist;
+	c->open_buckets_nr_free--;
+
+	ob->data_type = 0;
+	atomic_set(&ob->pin, 1);
+	return ob;
+}
+
+static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
+{	/* Park @ob on the partial list instead of freeing it, then wake allocation waiters */
+	BUG_ON(c->open_buckets_partial_nr >=	/* NOTE(review): read before freelist_lock is taken - confirm this is safe */
+	       ARRAY_SIZE(c->open_buckets_partial));
+
+	spin_lock(&c->freelist_lock);
+	ob->on_partial_list = true;
+	c->open_buckets_partial[c->open_buckets_partial_nr++] =
+		ob - c->open_buckets;	/* the list stores slot indices */
+	spin_unlock(&c->freelist_lock);
+
+	closure_wake_up(&c->open_buckets_wait);
+	closure_wake_up(&c->freelist_wait);
+}
+
+/* _only_ for allocating the journal on a new device: */
+long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
+{
+	while (ca->new_fs_bucket_idx < ca->mi.nbuckets) {
+		u64 b = ca->new_fs_bucket_idx++;
+
+		if (is_superblock_bucket(ca, b) ||
+		    (ca->buckets_nouse && test_bit(b, ca->buckets_nouse)))
+			continue;	/* skip superblock and nouse buckets */
+		return b;
+	}
+	return -1;	/* ran past the last bucket of the device */
+}
+
+static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
+{
+	/* Free slots an allocation at @watermark must leave untouched: */
+	if (watermark == BCH_WATERMARK_reclaim)
+		return 0;
+	if (watermark == BCH_WATERMARK_btree ||
+	    watermark == BCH_WATERMARK_btree_copygc)
+		return OPEN_BUCKETS_COUNT / 4;
+	if (watermark == BCH_WATERMARK_copygc)
+		return OPEN_BUCKETS_COUNT / 3;
+
+	/* every other watermark: */
+	return OPEN_BUCKETS_COUNT / 2;
+}
+
+static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+					      u64 bucket,
+					      enum bch_watermark watermark,
+					      const struct bch_alloc_v4 *a,
+					      struct bucket_alloc_state *s,
+					      struct closure *cl)
+{	/* Try to open @bucket on @ca: returns the open bucket, NULL if the bucket was skipped, or an ERR_PTR() */
+	struct open_bucket *ob;
+
+	if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
+		s->skipped_nouse++;
+		return NULL;
+	}
+
+	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {	/* unlocked check; repeated under freelist_lock below */
+		s->skipped_open++;
+		return NULL;
+	}
+
+	if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+			c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
+		s->skipped_need_journal_commit++;
+		return NULL;
+	}
+
+	if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) {
+		s->skipped_nocow++;
+		return NULL;
+	}
+
+	spin_lock(&c->freelist_lock);
+
+	if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) {	/* only reserved slots are left */
+		if (cl)
+			closure_wait(&c->open_buckets_wait, cl);
+
+		if (!c->blocked_allocate_open_bucket)
+			c->blocked_allocate_open_bucket = local_clock();	/* start of the blocked interval */
+
+		spin_unlock(&c->freelist_lock);
+		return ERR_PTR(-BCH_ERR_open_buckets_empty);
+	}
+
+	/* Recheck under lock: */
+	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
+		spin_unlock(&c->freelist_lock);
+		s->skipped_open++;
+		return NULL;
+	}
+
+	ob = bch2_open_bucket_alloc(c);
+
+	spin_lock(&ob->lock);
+
+	ob->valid	= true;
+	ob->sectors_free = ca->mi.bucket_size;	/* starts with the whole bucket free */
+	ob->dev		= ca->dev_idx;
+	ob->gen		= a->gen;
+	ob->bucket	= bucket;
+	spin_unlock(&ob->lock);
+
+	ca->nr_open_buckets++;
+	bch2_open_bucket_hash_add(c, ob);
+
+	if (c->blocked_allocate_open_bucket) {	/* success ends a blocked interval: record it */
+		bch2_time_stats_update(
+			&c->times[BCH_TIME_blocked_allocate_open_bucket],
+			c->blocked_allocate_open_bucket);
+		c->blocked_allocate_open_bucket = 0;
+	}
+
+	if (c->blocked_allocate) {
+		bch2_time_stats_update(
+			&c->times[BCH_TIME_blocked_allocate],
+			c->blocked_allocate);
+		c->blocked_allocate = 0;
+	}
+
+	spin_unlock(&c->freelist_lock);
+	return ob;
+}
+
+static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
+					    enum bch_watermark watermark, u64 free_entry,
+					    struct bucket_alloc_state *s,
+					    struct bkey_s_c freespace_k,
+					    struct closure *cl)
+{	/* Check one freespace entry against the alloc btree, then try to open the bucket it names */
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter = { NULL };
+	struct bkey_s_c k;
+	struct open_bucket *ob;
+	struct bch_alloc_v4 a_convert;
+	const struct bch_alloc_v4 *a;
+	u64 b = free_entry & ~(~0ULL << 56);	/* low 56 bits: bucket number */
+	unsigned genbits = free_entry >> 56;	/* top 8 bits: gen bits */
+	struct printbuf buf = PRINTBUF;
+	int ret;
+
+	if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
+		prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
+		       "  freespace key ",
+			ca->mi.first_bucket, ca->mi.nbuckets);
+		bch2_bkey_val_to_text(&buf, c, freespace_k);
+		bch2_trans_inconsistent(trans, "%s", buf.buf);
+		ob = ERR_PTR(-EIO);
+		goto err;
+	}
+
+	k = bch2_bkey_get_iter(trans, &iter,
+			       BTREE_ID_alloc, POS(ca->dev_idx, b),
+			       BTREE_ITER_CACHED);
+	ret = bkey_err(k);
+	if (ret) {
+		ob = ERR_PTR(ret);
+		goto err;
+	}
+
+	a = bch2_alloc_to_v4(k, &a_convert);
+
+	if (a->data_type != BCH_DATA_free) {
+		if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
+			ob = NULL;	/* up to and including check_alloc_info: skip without reporting */
+			goto err;
+		}
+
+		prt_printf(&buf, "non free bucket in freespace btree\n"
+		       "  freespace key ");
+		bch2_bkey_val_to_text(&buf, c, freespace_k);
+		prt_printf(&buf, "\n  ");
+		bch2_bkey_val_to_text(&buf, c, k);
+		bch2_trans_inconsistent(trans, "%s", buf.buf);
+		ob = ERR_PTR(-EIO);
+		goto err;
+	}
+
+	if (genbits != (alloc_freespace_genbits(*a) >> 56) &&
+	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
+		prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
+		       "  freespace key ",
+		       genbits, alloc_freespace_genbits(*a) >> 56);
+		bch2_bkey_val_to_text(&buf, c, freespace_k);
+		prt_printf(&buf, "\n  ");
+		bch2_bkey_val_to_text(&buf, c, k);
+		bch2_trans_inconsistent(trans, "%s", buf.buf);
+		ob = ERR_PTR(-EIO);
+		goto err;
+	}
+
+	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) {
+		struct bch_backpointer bp;
+		struct bpos bp_pos = POS_MIN;
+
+		ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1,
+						&bp_pos, &bp,
+						BTREE_ITER_NOPRESERVE);
+		if (ret) {
+			ob = ERR_PTR(ret);
+			goto err;
+		}
+
+		if (!bkey_eq(bp_pos, POS_MAX)) {
+			/*
+			 * Bucket may have data in it - we don't call
+			 * bch2_trans_inconsistent() because fsck hasn't
+			 * finished yet
+			 */
+			ob = NULL;
+			goto err;
+		}
+	}
+
+	ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl);
+	if (!ob)
+		iter.path->preserve = false;
+err:
+	if (iter.trans && iter.path)	/* iter is still zeroed if the range check failed */
+		set_btree_iter_dontneed(&iter);
+	bch2_trans_iter_exit(trans, &iter);
+	printbuf_exit(&buf);
+	return ob;
+}
+
+/*
+ * Allocate by scanning the alloc btree directly, for use before the freespace
+ * btree is initialized. Returns an open bucket, NULL, or an ERR_PTR().
+ * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
+ * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
+ */
+static noinline struct open_bucket *
+bch2_bucket_alloc_early(struct btree_trans *trans,
+			struct bch_dev *ca,
+			enum bch_watermark watermark,
+			struct bucket_alloc_state *s,
+			struct closure *cl)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct open_bucket *ob = NULL;
+	u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
+	u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor));
+	u64 alloc_cursor = alloc_start;
+	int ret;
+again:
+	for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
+			   BTREE_ITER_SLOTS, k, ret) {
+		struct bch_alloc_v4 a_convert;
+		const struct bch_alloc_v4 *a;
+
+		if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
+			break;
+
+		/* Remember how far the scan got, so the next call resumes here: */
+		alloc_cursor = k.k->p.offset;
+		if (ca->new_fs_bucket_idx &&
+		    is_superblock_bucket(ca, k.k->p.offset))
+			continue;
+
+		a = bch2_alloc_to_v4(k, &a_convert);
+		if (a->data_type != BCH_DATA_free)
+			continue;
+
+		s->buckets_seen++;
+		ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
+		if (ob)
+			break;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
+	ca->alloc_cursor = alloc_cursor;
+	if (!ob && ret)
+		ob = ERR_PTR(ret);
+
+	/* Nothing from the cursor to the end: retry once from the first usable bucket */
+	if (!ob && alloc_start > first_bucket) {
+		alloc_cursor = alloc_start = first_bucket;
+		goto again;
+	}
+	return ob;
+}
+
+static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
+						   struct bch_dev *ca,
+						   enum bch_watermark watermark,
+						   struct bucket_alloc_state *s,
+						   struct closure *cl)
+{	/* Allocate by walking @ca's keys in the freespace btree from the saved cursor, wrapping around once */
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct open_bucket *ob = NULL;
+	u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor));
+	u64 alloc_cursor = alloc_start;	/* a freespace offset; try_alloc_bucket() splits it into bucket and gen bits */
+	int ret;
+
+	BUG_ON(ca->new_fs_bucket_idx);
+again:
+	for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
+				     POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
+		if (k.k->p.inode != ca->dev_idx)
+			break;	/* walked off the end of this device's keys */
+
+		for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));	/* each key covers a range of offsets */
+		     alloc_cursor < k.k->p.offset;
+		     alloc_cursor++) {
+			ret = btree_trans_too_many_iters(trans);
+			if (ret) {
+				ob = ERR_PTR(ret);
+				break;
+			}
+
+			s->buckets_seen++;
+
+			ob = try_alloc_bucket(trans, ca, watermark,
+					      alloc_cursor, s, k, cl);
+			if (ob) {	/* a bucket or an ERR_PTR(): either ends the scan */
+				iter.path->preserve = false;
+				break;
+			}
+		}
+
+		if (ob || ret)
+			break;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
+	ca->alloc_cursor = alloc_cursor;	/* the next call resumes from here */
+
+	if (!ob && ret)
+		ob = ERR_PTR(ret);
+
+	if (!ob && alloc_start > ca->mi.first_bucket) {	/* nothing past the cursor: rescan from the first bucket */
+		alloc_cursor = alloc_start = ca->mi.first_bucket;
+		goto again;
+	}
+
+	return ob;
+}
+
+/**
+ * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
+ * @trans:	transaction object
+ * @ca:		device to allocate from
+ * @watermark:	how important is this allocation?
+ * @cl:		if not NULL, closure to be used to wait if buckets not available
+ * @usage:	for secondarily also returning the current device usage
+ *
+ * Returns:	an open_bucket on success, or an ERR_PTR() on failure.
+ */
+static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
+				      struct bch_dev *ca,
+				      enum bch_watermark watermark,
+				      struct closure *cl,
+				      struct bch_dev_usage *usage)
+{
+	struct bch_fs *c = trans->c;
+	struct open_bucket *ob = NULL;
+	bool freespace = READ_ONCE(ca->mi.freespace_initialized);	/* picks the freelist or the early allocator */
+	u64 avail;
+	struct bucket_alloc_state s = { 0 };
+	bool waiting = false;
+again:
+	bch2_dev_usage_read_fast(ca, usage);
+	avail = dev_buckets_free(ca, *usage, watermark);
+
+	if (usage->d[BCH_DATA_need_discard].buckets > avail)	/* kick background work that can free buckets */
+		bch2_do_discards(c);
+
+	if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
+		bch2_do_gc_gens(c);
+
+	if (should_invalidate_buckets(ca, *usage))
+		bch2_do_invalidates(c);
+
+	if (!avail) {
+		if (cl && !waiting) {	/* get on the waitlist first, then recheck usage once */
+			closure_wait(&c->freelist_wait, cl);
+			waiting = true;
+			goto again;
+		}
+
+		if (!c->blocked_allocate)
+			c->blocked_allocate = local_clock();	/* start of the blocked interval */
+
+		ob = ERR_PTR(-BCH_ERR_freelist_empty);
+		goto err;
+	}
+
+	if (waiting)	/* buckets turned up on the recheck after we started waiting */
+		closure_wake_up(&c->freelist_wait);
+alloc:
+	ob = likely(freespace)
+		? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl)
+		: bch2_bucket_alloc_early(trans, ca, watermark, &s, cl);
+
+	if (s.skipped_need_journal_commit * 2 > avail)
+		bch2_journal_flush_async(&c->journal, NULL);
+
+	if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
+		freespace = false;	/* fall back to the early allocator, once */
+		goto alloc;
+	}
+err:
+	if (!ob)	/* from here on the result is a bucket or an ERR_PTR(), never NULL */
+		ob = ERR_PTR(-BCH_ERR_no_buckets_found);
+
+	if (!IS_ERR(ob))
+		trace_and_count(c, bucket_alloc, ca,
+				bch2_watermarks[watermark],
+				ob->bucket,
+				usage->d[BCH_DATA_free].buckets,
+				avail,
+				bch2_copygc_wait_amount(c),
+				c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
+				&s,
+				cl == NULL,
+				"");
+	else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))	/* restarts are not traced as failures */
+		trace_and_count(c, bucket_alloc_fail, ca,
+				bch2_watermarks[watermark],
+				0,
+				usage->d[BCH_DATA_free].buckets,
+				avail,
+				bch2_copygc_wait_amount(c),
+				c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
+				&s,
+				cl == NULL,
+				bch2_err_str(PTR_ERR(ob)));
+
+	return ob;
+}
+
+struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
+				      enum bch_watermark watermark,
+				      struct closure *cl)
+{
+	struct bch_dev_usage usage;
+	struct open_bucket *ob;
+
+	bch2_trans_do(c, NULL, NULL, 0,
+		      PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
+							cl, &usage)));
+	return ob;
+}
+
+static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
+			    unsigned l, unsigned r)
+{	/* three-way compare of next_alloc[l] and next_alloc[r]: 1, 0 or -1 */
+	return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
+		(stripe->next_alloc[l] < stripe->next_alloc[r]));
+}
+
+#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) /* uses the 'stripe' in scope at the call site */
+
+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
+					  struct dev_stripe_state *stripe,
+					  struct bch_devs_mask *devs)
+{
+	struct dev_alloc_list ret = { .nr = 0 };
+	unsigned i;
+
+	for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
+		ret.devs[ret.nr++] = i;
+
+	bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
+	return ret;
+}
+
+static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
+			       struct dev_stripe_state *stripe,
+			       struct bch_dev_usage *usage)	/* NOTE(review): @usage is not referenced in the body */
+{
+	u64 *v = stripe->next_alloc + ca->dev_idx;	/* this device's slot */
+	u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal);
+	u64 free_space_inv = free_space		/* 2^48 / free_space, or 2^48 when free_space is 0 */
+		? div64_u64(1ULL << 48, free_space)
+		: 1ULL << 48;
+	u64 scale = *v / 4;	/* a quarter of this device's value before the bump */
+
+	if (*v + free_space_inv >= *v)	/* the u64 addition does not wrap */
+		*v += free_space_inv;
+	else
+		*v = U64_MAX;	/* saturate instead of wrapping */
+
+	for (v = stripe->next_alloc;	/* lower every slot by scale, clamping at 0 */
+	     v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
+		*v = *v < scale ? 0 : *v - scale;
+}
+
+void bch2_dev_stripe_increment(struct bch_dev *ca,
+			       struct dev_stripe_state *stripe)
+{
+	struct bch_dev_usage usage;
+
+	bch2_dev_usage_read_fast(ca, &usage);
+	bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
+}
+
+static int add_new_bucket(struct bch_fs *c,	/* pushes @ob onto @ptrs; returns 1 when the caller should stop adding, else 0 */
+			   struct open_buckets *ptrs,
+			   struct bch_devs_mask *devs_may_alloc,
+			   unsigned nr_replicas,
+			   unsigned *nr_effective,
+			   bool *have_cache,
+			   unsigned flags,
+			   struct open_bucket *ob)
+{
+	unsigned durability =
+		bch_dev_bkey_exists(c, ob->dev)->mi.durability;
+
+	BUG_ON(*nr_effective >= nr_replicas);
+	BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);	/* NOTE(review): makes the flag test below always false */
+
+	__clear_bit(ob->dev, devs_may_alloc->d);	/* don't pick this device again */
+	*nr_effective	+= (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
+		? durability : 1;	/* NOTE(review): past the BUG_ON above this always adds 1 - confirm intent */
+	*have_cache	|= !durability;	/* a zero-durability device has now been used */
+
+	ob_push(c, ptrs, ob);
+
+	if (*nr_effective >= nr_replicas)	/* replica target reached */
+		return 1;
+	if (ob->ec)	/* bucket has an ec stripe attached */
+		return 1;
+	return 0;
+}
+
+int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
+		      struct open_buckets *ptrs,
+		      struct dev_stripe_state *stripe,
+		      struct bch_devs_mask *devs_may_alloc,
+		      unsigned nr_replicas,
+		      unsigned *nr_effective,
+		      bool *have_cache,
+		      unsigned flags,
+		      enum bch_data_type data_type,
+		      enum bch_watermark watermark,
+		      struct closure *cl)
+{
+	struct bch_fs *c = trans->c;
+	struct dev_alloc_list devs_sorted =
+		bch2_dev_alloc_list(c, stripe, devs_may_alloc);
+	unsigned dev;
+	struct bch_dev *ca;
+	int ret = -BCH_ERR_insufficient_devices;
+	unsigned i;
+
+	BUG_ON(*nr_effective >= nr_replicas);
+
+	for (i = 0; i < devs_sorted.nr; i++) {
+		struct bch_dev_usage usage;
+		struct open_bucket *ob;
+
+		dev = devs_sorted.devs[i];
+
+		rcu_read_lock();
+		ca = rcu_dereference(c->devs[dev]);
+		if (ca)
+			percpu_ref_get(&ca->ref);
+		rcu_read_unlock();
+
+		if (!ca)
+			continue;
+
+		if (!ca->mi.durability && *have_cache) {
+			percpu_ref_put(&ca->ref);
+			continue;
+		}
+
+		ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage);
+		if (!IS_ERR(ob))
+			bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
+		percpu_ref_put(&ca->ref);
+
+		if (IS_ERR(ob)) {
+			ret = PTR_ERR(ob);
+			if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
+				break;
+			continue;
+		}
+
+		ob->data_type = data_type;
+
+		if (add_new_bucket(c, ptrs, devs_may_alloc,
+				   nr_replicas, nr_effective,
+				   have_cache, flags, ob)) {
+			ret = 0;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/* Allocate from stripes: */
+
+/*
+ * if we can't allocate a new stripe because there are already too many
+ * partially filled stripes, force allocating from an existing stripe even when
+ * it's to a device we don't want:
+ */
+
+static int bucket_alloc_from_stripe(struct btree_trans *trans,
+			 struct open_buckets *ptrs,
+			 struct write_point *wp,
+			 struct bch_devs_mask *devs_may_alloc,
+			 u16 target,
+			 unsigned nr_replicas,
+			 unsigned *nr_effective,
+			 bool *have_cache,
+			 enum bch_watermark watermark,
+			 unsigned flags,
+			 struct closure *cl)
+{
+	struct bch_fs *c = trans->c;
+	struct dev_alloc_list devs_sorted;
+	struct ec_stripe_head *h;
+	struct open_bucket *ob;
+	unsigned i, ec_idx;
+	int ret = 0;
+
+	if (nr_replicas < 2)
+		return 0;
+
+	if (ec_open_bucket(c, ptrs))
+		return 0;
+
+	h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+	if (!h)
+		return 0;
+
+	devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
+
+	for (i = 0; i < devs_sorted.nr; i++)
+		for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
+			if (!h->s->blocks[ec_idx])
+				continue;
+
+			ob = c->open_buckets + h->s->blocks[ec_idx];
+			if (ob->dev == devs_sorted.devs[i] &&
+			    !test_and_set_bit(ec_idx, h->s->blocks_allocated))
+				goto got_bucket;
+		}
+	goto out_put_head;
+got_bucket:
+	ob->ec_idx	= ec_idx;
+	ob->ec		= h->s;
+	ec_stripe_new_get(h->s, STRIPE_REF_io);
+
+	ret = add_new_bucket(c, ptrs, devs_may_alloc,
+			     nr_replicas, nr_effective,
+			     have_cache, flags, ob);
+out_put_head:
+	bch2_ec_stripe_head_put(c, h);
+	return ret;
+}
+
+/* Sector allocator */
+
+static bool want_bucket(struct bch_fs *c,	/* true if @ob passes every check below for this allocation on @wp */
+			struct write_point *wp,
+			struct bch_devs_mask *devs_may_alloc,
+			bool *have_cache, bool ec,
+			struct open_bucket *ob)
+{
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+
+	if (!test_bit(ob->dev, devs_may_alloc->d))	/* device is not in the allowed mask */
+		return false;
+
+	if (ob->data_type != wp->data_type)	/* bucket's data type differs from the write point's */
+		return false;
+
+	if (!ca->mi.durability &&	/* zero-durability device: refused for btree data, for ec, or once *have_cache is set */
+	    (wp->data_type == BCH_DATA_btree || ec || *have_cache))
+		return false;
+
+	if (ec != (ob->ec != NULL))	/* @ec must match whether the bucket has a stripe attached */
+		return false;
+
+	return true;
+}
+
+static int bucket_alloc_set_writepoint(struct bch_fs *c,
+				       struct open_buckets *ptrs,
+				       struct write_point *wp,
+				       struct bch_devs_mask *devs_may_alloc,
+				       unsigned nr_replicas,
+				       unsigned *nr_effective,
+				       bool *have_cache,
+				       bool ec, unsigned flags)
+{
+	struct open_buckets ptrs_skip = { .nr = 0 };
+	struct open_bucket *ob;
+	unsigned i;
+	int ret = 0;
+
+	open_bucket_for_each(c, &wp->ptrs, ob, i) {
+		if (!ret && want_bucket(c, wp, devs_may_alloc,
+					have_cache, ec, ob))
+			ret = add_new_bucket(c, ptrs, devs_may_alloc,
+				       nr_replicas, nr_effective,
+				       have_cache, flags, ob);
+		else
+			ob_push(c, &ptrs_skip, ob);
+	}
+	wp->ptrs = ptrs_skip;
+
+	return ret;
+}
+
+static int bucket_alloc_set_partial(struct bch_fs *c,
+				    struct open_buckets *ptrs,
+				    struct write_point *wp,
+				    struct bch_devs_mask *devs_may_alloc,
+				    unsigned nr_replicas,
+				    unsigned *nr_effective,
+				    bool *have_cache, bool ec,
+				    enum bch_watermark watermark,
+				    unsigned flags)
+{
+	int i, ret = 0;
+
+	if (!c->open_buckets_partial_nr)
+		return 0;
+
+	spin_lock(&c->freelist_lock);
+
+	if (!c->open_buckets_partial_nr)
+		goto unlock;
+
+	for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
+		struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
+
+		if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
+			struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+			struct bch_dev_usage usage;
+			u64 avail;
+
+			bch2_dev_usage_read_fast(ca, &usage);
+			avail = dev_buckets_free(ca, usage, watermark);
+			if (!avail)
+				continue;
+
+			array_remove_item(c->open_buckets_partial,
+					  c->open_buckets_partial_nr,
+					  i);
+			ob->on_partial_list = false;
+
+			ret = add_new_bucket(c, ptrs, devs_may_alloc,
+					     nr_replicas, nr_effective,
+					     have_cache, flags, ob);
+			if (ret)
+				break;
+		}
+	}
+unlock:
+	spin_unlock(&c->freelist_lock);
+	return ret;
+}
+
+static int __open_bucket_add_buckets(struct btree_trans *trans,
+			struct open_buckets *ptrs,
+			struct write_point *wp,
+			struct bch_devs_list *devs_have,
+			u16 target,
+			bool erasure_code,
+			unsigned nr_replicas,
+			unsigned *nr_effective,
+			bool *have_cache,
+			enum bch_watermark watermark,
+			unsigned flags,
+			struct closure *_cl)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_devs_mask devs;
+	struct open_bucket *ob;
+	struct closure *cl = NULL;
+	unsigned i;
+	int ret;
+
+	devs = target_rw_devs(c, wp->data_type, target);
+
+	/* Don't allocate from devices we already have pointers to: */
+	for (i = 0; i < devs_have->nr; i++)
+		__clear_bit(devs_have->devs[i], devs.d);
+
+	open_bucket_for_each(c, ptrs, ob, i)
+		__clear_bit(ob->dev, devs.d);
+
+	if (erasure_code && ec_open_bucket(c, ptrs))
+		return 0;
+
+	ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
+				 nr_replicas, nr_effective,
+				 have_cache, erasure_code, flags);
+	if (ret)
+		return ret;
+
+	ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
+				 nr_replicas, nr_effective,
+				 have_cache, erasure_code, watermark, flags);
+	if (ret)
+		return ret;
+
+	if (erasure_code) {
+		ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
+					 target,
+					 nr_replicas, nr_effective,
+					 have_cache,
+					 watermark, flags, _cl);
+	} else {
+retry_blocking:
+		/*
+		 * Try nonblocking first, so that if one device is full we'll try from
+		 * other devices:
+		 */
+		ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
+					nr_replicas, nr_effective, have_cache,
+					flags, wp->data_type, watermark, cl);
+		if (ret &&
+		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
+		    !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
+		    !cl && _cl) {
+			cl = _cl;
+			goto retry_blocking;
+		}
+	}
+
+	return ret;
+}
+
+static int open_bucket_add_buckets(struct btree_trans *trans,
+			struct open_buckets *ptrs,
+			struct write_point *wp,
+			struct bch_devs_list *devs_have,
+			u16 target,
+			unsigned erasure_code,
+			unsigned nr_replicas,
+			unsigned *nr_effective,
+			bool *have_cache,
+			enum bch_watermark watermark,
+			unsigned flags,
+			struct closure *cl)
+{
+	int ret;
+
+	if (erasure_code) {
+		ret = __open_bucket_add_buckets(trans, ptrs, wp,
+				devs_have, target, erasure_code,
+				nr_replicas, nr_effective, have_cache,
+				watermark, flags, cl);
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+		    bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
+		    bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
+		    bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+			return ret;
+		if (*nr_effective >= nr_replicas)
+			return 0;
+	}
+
+	ret = __open_bucket_add_buckets(trans, ptrs, wp,
+			devs_have, target, false,
+			nr_replicas, nr_effective, have_cache,
+			watermark, flags, cl);
+	return ret < 0 ? ret : 0;
+}
+
+/**
+ * should_drop_bucket - check if this open_bucket should go away
+ * @ob:		open_bucket to predicate on
+ * @c:		filesystem handle
+ * @ca:		if set, we're killing buckets for a particular device
+ * @ec:		if true, we're shutting down erasure coding and killing all ec
+ *		open_buckets (checked before @ca)
+ *
+ * Returns: true if we should kill this open_bucket
+ *
+ * We're killing open_buckets because we're shutting down a device, erasure
+ * coding, or the entire filesystem (no @ec, no @ca) - check if @ob matches:
+ */
+static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
+			       struct bch_dev *ca, bool ec)
+{
+	if (ec) {
+		return ob->ec != NULL;
+	} else if (ca) {
+		bool drop = ob->dev == ca->dev_idx;
+		struct open_bucket *ob2;
+		unsigned i;
+
+		if (!drop && ob->ec) {	/* different device, but its stripe may still have a block on @ca */
+			unsigned nr_blocks;
+
+			mutex_lock(&ob->ec->lock);
+			nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;
+
+			for (i = 0; i < nr_blocks; i++) {
+				if (!ob->ec->blocks[i])	/* index 0 means no open_bucket in this slot */
+					continue;
+
+				ob2 = c->open_buckets + ob->ec->blocks[i];
+				drop |= ob2->dev == ca->dev_idx;
+			}
+			mutex_unlock(&ob->ec->lock);
+		}
+
+		return drop;
+	} else {
+		return true;	/* neither @ec nor @ca: everything is dropped */
+	}
+}
+
+static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
+				 bool ec, struct write_point *wp)
+{
+	struct open_buckets ptrs = { .nr = 0 };
+	struct open_bucket *ob;
+	unsigned i;
+
+	mutex_lock(&wp->lock);
+	open_bucket_for_each(c, &wp->ptrs, ob, i)
+		if (should_drop_bucket(ob, c, ca, ec))
+			bch2_open_bucket_put(c, ob);
+		else
+			ob_push(c, &ptrs, ob);
+	wp->ptrs = ptrs;
+	mutex_unlock(&wp->lock);
+}
+
+void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
+			    bool ec)
+{
+	unsigned i;
+
+	/* Next, close write points that point to this device... */
+	for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
+		bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
+
+	bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
+	bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
+	bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
+
+	mutex_lock(&c->btree_reserve_cache_lock);
+	while (c->btree_reserve_cache_nr) {
+		struct btree_alloc *a =
+			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];
+
+		bch2_open_buckets_put(c, &a->ob);
+	}
+	mutex_unlock(&c->btree_reserve_cache_lock);
+
+	spin_lock(&c->freelist_lock);
+	i = 0;
+	while (i < c->open_buckets_partial_nr) {
+		struct open_bucket *ob =
+			c->open_buckets + c->open_buckets_partial[i];
+
+		if (should_drop_bucket(ob, c, ca, ec)) {
+			--c->open_buckets_partial_nr;
+			swap(c->open_buckets_partial[i],
+			     c->open_buckets_partial[c->open_buckets_partial_nr]);
+			ob->on_partial_list = false;
+			spin_unlock(&c->freelist_lock);
+			bch2_open_bucket_put(c, ob);
+			spin_lock(&c->freelist_lock);
+		} else {
+			i++;
+		}
+	}
+	spin_unlock(&c->freelist_lock);
+
+	bch2_ec_stop_dev(c, ca);
+}
+
+static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
+						 unsigned long write_point)
+{
+	unsigned hash =
+		hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
+
+	return &c->write_points_hash[hash];
+}
+
+static struct write_point *__writepoint_find(struct hlist_head *head,
+					     unsigned long write_point)
+{
+	struct write_point *wp;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(wp, head, node)
+		if (wp->write_point == write_point)
+			goto out;
+	wp = NULL;
+out:
+	rcu_read_unlock();
+	return wp;
+}
+
+static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
+{
+	u64 stranded	= c->write_points_nr * c->bucket_size_max;	/* current write point count times the max bucket size */
+	u64 free	= bch2_fs_usage_read_short(c).free;
+
+	return stranded * factor > free;	/* true once @factor times that amount exceeds the free figure */
+}
+
+static bool try_increase_writepoints(struct bch_fs *c)
+{
+	struct write_point *wp;
+
+	if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
+	    too_many_writepoints(c, 32))
+		return false;
+
+	wp = c->write_points + c->write_points_nr++;
+	hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
+	return true;
+}
+
+static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
+{
+	struct bch_fs *c = trans->c;
+	struct write_point *wp;
+	struct open_bucket *ob;
+	unsigned i;
+
+	mutex_lock(&c->write_points_hash_lock);
+	if (c->write_points_nr < old_nr) {
+		mutex_unlock(&c->write_points_hash_lock);
+		return true;
+	}
+
+	if (c->write_points_nr == 1 ||
+	    !too_many_writepoints(c, 8)) {
+		mutex_unlock(&c->write_points_hash_lock);
+		return false;
+	}
+
+	wp = c->write_points + --c->write_points_nr;
+
+	hlist_del_rcu(&wp->node);
+	mutex_unlock(&c->write_points_hash_lock);
+
+	bch2_trans_mutex_lock_norelock(trans, &wp->lock);
+	open_bucket_for_each(c, &wp->ptrs, ob, i)
+		open_bucket_free_unused(c, ob);
+	wp->ptrs.nr = 0;
+	mutex_unlock(&wp->lock);
+	return true;
+}
+
+static struct write_point *writepoint_find(struct btree_trans *trans,
+					   unsigned long write_point)
+{
+	struct bch_fs *c = trans->c;
+	struct write_point *wp, *oldest;
+	struct hlist_head *head;
+
+	if (!(write_point & 1UL)) {
+		wp = (struct write_point *) write_point;
+		bch2_trans_mutex_lock_norelock(trans, &wp->lock);
+		return wp;
+	}
+
+	head = writepoint_hash(c, write_point);
+restart_find:
+	wp = __writepoint_find(head, write_point);
+	if (wp) {
+lock_wp:
+		bch2_trans_mutex_lock_norelock(trans, &wp->lock);
+		if (wp->write_point == write_point)
+			goto out;
+		mutex_unlock(&wp->lock);
+		goto restart_find;
+	}
+restart_find_oldest:
+	oldest = NULL;
+	for (wp = c->write_points;
+	     wp < c->write_points + c->write_points_nr; wp++)
+		if (!oldest || time_before64(wp->last_used, oldest->last_used))
+			oldest = wp;
+
+	bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
+	bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
+	if (oldest >= c->write_points + c->write_points_nr ||
+	    try_increase_writepoints(c)) {
+		mutex_unlock(&c->write_points_hash_lock);
+		mutex_unlock(&oldest->lock);
+		goto restart_find_oldest;
+	}
+
+	wp = __writepoint_find(head, write_point);
+	if (wp && wp != oldest) {
+		mutex_unlock(&c->write_points_hash_lock);
+		mutex_unlock(&oldest->lock);
+		goto lock_wp;
+	}
+
+	wp = oldest;
+	hlist_del_rcu(&wp->node);
+	wp->write_point = write_point;
+	hlist_add_head_rcu(&wp->node, head);
+	mutex_unlock(&c->write_points_hash_lock);
+out:
+	wp->last_used = local_clock();
+	return wp;
+}
+
+/*
+ * Get us open_buckets we can allocate from; on success *wp_ret is left locked:
+ */
+int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
+			     unsigned target,
+			     unsigned erasure_code,
+			     struct write_point_specifier write_point,
+			     struct bch_devs_list *devs_have,
+			     unsigned nr_replicas,
+			     unsigned nr_replicas_required,
+			     enum bch_watermark watermark,
+			     unsigned flags,
+			     struct closure *cl,
+			     struct write_point **wp_ret)
+{
+	struct bch_fs *c = trans->c;
+	struct write_point *wp;
+	struct open_bucket *ob;
+	struct open_buckets ptrs;
+	unsigned nr_effective, write_points_nr;
+	bool have_cache;
+	int ret;
+	int i;
+
+	BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
+
+	BUG_ON(!nr_replicas || !nr_replicas_required);
+retry:
+	ptrs.nr		= 0;
+	nr_effective	= 0;
+	write_points_nr = c->write_points_nr;	/* snapshot, handed to try_decrease_writepoints() on failure */
+	have_cache	= false;
+
+	*wp_ret = wp = writepoint_find(trans, write_point.v);	/* comes back with wp->lock held */
+
+	/* metadata may not allocate on cache devices: */
+	if (wp->data_type != BCH_DATA_user)
+		have_cache = true;
+
+	if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
+		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+					      target, erasure_code,
+					      nr_replicas, &nr_effective,
+					      &have_cache, watermark,
+					      flags, NULL);	/* NULL closure: first attempt on @target never waits */
+		if (!ret ||
+		    bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			goto alloc_done;
+
+		/* Don't retry from all devices if we're out of open buckets: */
+		if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+			goto allocate_blocking;
+
+		/*
+		 * Only try to allocate cache (durability = 0 devices) from the
+		 * specified target:
+		 */
+		have_cache = true;
+
+		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+					      0, erasure_code,
+					      nr_replicas, &nr_effective,
+					      &have_cache, watermark,
+					      flags, cl);
+	} else {
+allocate_blocking:
+		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+					      target, erasure_code,
+					      nr_replicas, &nr_effective,
+					      &have_cache, watermark,
+					      flags, cl);
+	}
+alloc_done:
+	BUG_ON(!ret && nr_effective < nr_replicas);
+
+	if (erasure_code && !ec_open_bucket(c, &ptrs))
+		pr_debug("failed to get ec bucket: ret %i", ret);	/* ret is a signed (negative) error code */
+
+	if (ret == -BCH_ERR_insufficient_devices &&
+	    nr_effective >= nr_replicas_required)	/* short of nr_replicas but the required minimum is met */
+		ret = 0;
+
+	if (ret)
+		goto err;
+
+	/* Free buckets we didn't use: */
+	open_bucket_for_each(c, &wp->ptrs, ob, i)
+		open_bucket_free_unused(c, ob);
+
+	wp->ptrs = ptrs;
+
+	wp->sectors_free = UINT_MAX;
+
+	open_bucket_for_each(c, &wp->ptrs, ob, i)	/* write point can hand out only as much as its fullest bucket allows */
+		wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
+
+	BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
+
+	return 0;	/* wp->lock is still held here */
+err:
+	open_bucket_for_each(c, &wp->ptrs, ob, i)	/* keep what fits in ptrs on the write point, free the overflow */
+		if (ptrs.nr < ARRAY_SIZE(ptrs.v))
+			ob_push(c, &ptrs, ob);
+		else
+			open_bucket_free_unused(c, ob);
+	wp->ptrs = ptrs;
+
+	mutex_unlock(&wp->lock);	/* error paths return with the write point unlocked */
+
+	if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
+	    try_decrease_writepoints(trans, write_points_nr))
+		goto retry;
+
+	if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
+	    bch2_err_matches(ret, BCH_ERR_freelist_empty))
+		return cl	/* with a closure report "blocked", without one report ENOSPC */
+			? -BCH_ERR_bucket_alloc_blocked
+			: -BCH_ERR_ENOSPC_bucket_alloc;
+
+	return ret;
+}
+
+struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)	/* build an extent pointer from @ob's dev, gen and bucket */
+{
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+
+	return (struct bch_extent_ptr) {
+		.type	= 1 << BCH_EXTENT_ENTRY_ptr,
+		.gen	= ob->gen,
+		.dev	= ob->dev,
+		.offset	= bucket_to_sector(ca, ob->bucket) +	/* bucket's sector plus (bucket_size - sectors_free) */
+			ca->mi.bucket_size -
+			ob->sectors_free,
+	};
+}
+
+void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
+				    struct bkey_i *k, unsigned sectors,
+				    bool cached)
+{
+	bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
+}
+
+/*
+ * Finish with @wp: open_buckets with no sectors free are removed from it and
+ * put, the remaining ones are kept, and wp->lock is released
+ */
+void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
+{
+	bch2_alloc_sectors_done_inlined(c, wp);
+}
+
+static inline void writepoint_init(struct write_point *wp,
+				   enum bch_data_type type)
+{
+	mutex_init(&wp->lock);
+	wp->data_type = type;
+
+	INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
+	INIT_LIST_HEAD(&wp->writes);
+	spin_lock_init(&wp->writes_lock);
+}
+
+void bch2_fs_allocator_foreground_init(struct bch_fs *c)
+{
+	struct open_bucket *ob;
+	struct write_point *wp;
+
+	mutex_init(&c->write_points_hash_lock);
+	c->write_points_nr = ARRAY_SIZE(c->write_points);	/* start with every write point slot in use */
+
+	/* open bucket 0 is a sentinel NULL: */
+	spin_lock_init(&c->open_buckets[0].lock);
+
+	for (ob = c->open_buckets + 1;	/* entries 1..N-1 are pushed onto the index-linked freelist */
+	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
+		spin_lock_init(&ob->lock);
+		c->open_buckets_nr_free++;
+
+		ob->freelist = c->open_buckets_freelist;	/* link to the previous head ... */
+		c->open_buckets_freelist = ob - c->open_buckets;	/* ... and make this index the new head */
+	}
+
+	writepoint_init(&c->btree_write_point,		BCH_DATA_btree);
+	writepoint_init(&c->rebalance_write_point,	BCH_DATA_user);
+	writepoint_init(&c->copygc_write_point,		BCH_DATA_user);
+
+	for (wp = c->write_points;
+	     wp < c->write_points + c->write_points_nr; wp++) {
+		writepoint_init(wp, BCH_DATA_user);
+
+		wp->last_used	= local_clock();
+		wp->write_point	= (unsigned long) wp;	/* initial id is the write point's own address */
+		hlist_add_head_rcu(&wp->node,
+				   writepoint_hash(c, wp->write_point));
+	}
+}
+
+static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
+{
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+	unsigned data_type = ob->data_type;
+	barrier(); /* READ_ONCE() doesn't work on bitfields */
+
+	prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u",
+		   ob - c->open_buckets,
+		   atomic_read(&ob->pin),
+		   data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type",
+		   ob->dev, ob->bucket, ob->gen,
+		   ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
+	if (ob->ec)
+		prt_printf(out, " ec idx %llu", ob->ec->idx);
+	if (ob->on_partial_list)
+		prt_str(out, " partial");
+	prt_newline(out);
+}
+
+void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
+{
+	struct open_bucket *ob;
+
+	out->atomic++;
+
+	for (ob = c->open_buckets;
+	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
+	     ob++) {
+		spin_lock(&ob->lock);
+		if (ob->valid && !ob->on_partial_list)
+			bch2_open_bucket_to_text(out, c, ob);
+		spin_unlock(&ob->lock);
+	}
+
+	--out->atomic;
+}
+
+void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
+{
+	unsigned i;
+
+	out->atomic++;
+	spin_lock(&c->freelist_lock);
+
+	for (i = 0; i < c->open_buckets_partial_nr; i++)
+		bch2_open_bucket_to_text(out, c,
+				c->open_buckets + c->open_buckets_partial[i]);
+
+	spin_unlock(&c->freelist_lock);
+	--out->atomic;
+}
+
+static const char * const bch2_write_point_states[] = {
+#define x(n)	#n,
+	WRITE_POINT_STATES()
+#undef x
+	NULL
+};
+
+static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
+				     struct write_point *wp)
+{
+	struct open_bucket *ob;
+	unsigned i;
+
+	prt_printf(out, "%lu: ", wp->write_point);
+	prt_human_readable_u64(out, wp->sectors_allocated);
+
+	prt_printf(out, " last wrote: ");
+	bch2_pr_time_units(out, sched_clock() - wp->last_used);
+
+	for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
+		prt_printf(out, " %s: ", bch2_write_point_states[i]);
+		bch2_pr_time_units(out, wp->time[i]);
+	}
+
+	prt_newline(out);
+
+	printbuf_indent_add(out, 2);
+	open_bucket_for_each(c, &wp->ptrs, ob, i)
+		bch2_open_bucket_to_text(out, c, ob);
+	printbuf_indent_sub(out, 2);
+}
+
+void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
+{
+	struct write_point *wp;
+
+	prt_str(out, "Foreground write points\n");
+	for (wp = c->write_points;
+	     wp < c->write_points + ARRAY_SIZE(c->write_points);
+	     wp++)
+		bch2_write_point_to_text(out, c, wp);
+
+	prt_str(out, "Copygc write point\n");
+	bch2_write_point_to_text(out, c, &c->copygc_write_point);
+
+	prt_str(out, "Rebalance write point\n");
+	bch2_write_point_to_text(out, c, &c->rebalance_write_point);
+
+	prt_str(out, "Btree write point\n");
+	bch2_write_point_to_text(out, c, &c->btree_write_point);
+}
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
new file mode 100644
index 000000000000..7aaeec44c746
--- /dev/null
+++ b/fs/bcachefs/alloc_foreground.h
@@ -0,0 +1,224 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ALLOC_FOREGROUND_H
+#define _BCACHEFS_ALLOC_FOREGROUND_H
+
+#include "bcachefs.h"
+#include "alloc_types.h"
+#include "extents.h"
+#include "sb-members.h"
+
+#include <linux/hash.h>
+
+struct bkey;
+struct bch_dev;
+struct bch_fs;
+struct bch_devs_list;
+
+extern const char * const bch2_watermarks[];
+
+void bch2_reset_alloc_cursors(struct bch_fs *);
+
+struct dev_alloc_list {
+	unsigned	nr;
+	u8		devs[BCH_SB_MEMBERS_MAX];
+};
+
+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
+					  struct dev_stripe_state *,
+					  struct bch_devs_mask *);
+void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *);
+
+long bch2_bucket_alloc_new_fs(struct bch_dev *);
+
+struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *,
+				      enum bch_watermark, struct closure *);
+
+static inline void ob_push(struct bch_fs *c, struct open_buckets *obs,
+			   struct open_bucket *ob)
+{
+	BUG_ON(obs->nr >= ARRAY_SIZE(obs->v));
+
+	obs->v[obs->nr++] = ob - c->open_buckets;
+}
+
+#define open_bucket_for_each(_c, _obs, _ob, _i)				\
+	for ((_i) = 0;							\
+	     (_i) < (_obs)->nr &&					\
+	     ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true);	\
+	     (_i)++)
+
+static inline struct open_bucket *ec_open_bucket(struct bch_fs *c,
+						 struct open_buckets *obs)
+{
+	struct open_bucket *ob;
+	unsigned i;
+
+	open_bucket_for_each(c, obs, ob, i)
+		if (ob->ec)
+			return ob;
+
+	return NULL;
+}
+
+void bch2_open_bucket_write_error(struct bch_fs *,
+			struct open_buckets *, unsigned);
+
+void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
+
+static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
+{
+	if (atomic_dec_and_test(&ob->pin))
+		__bch2_open_bucket_put(c, ob);
+}
+
+static inline void bch2_open_buckets_put(struct bch_fs *c,
+					 struct open_buckets *ptrs)
+{
+	struct open_bucket *ob;
+	unsigned i;
+
+	open_bucket_for_each(c, ptrs, ob, i)
+		bch2_open_bucket_put(c, ob);
+	ptrs->nr = 0;
+}
+
+static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct write_point *wp)
+{
+	struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 };
+	struct open_bucket *ob;
+	unsigned i;
+
+	open_bucket_for_each(c, &wp->ptrs, ob, i)
+		ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob);
+	wp->ptrs = keep;
+
+	mutex_unlock(&wp->lock);
+
+	bch2_open_buckets_put(c, &ptrs);
+}
+
+static inline void bch2_open_bucket_get(struct bch_fs *c,
+					struct write_point *wp,
+					struct open_buckets *ptrs)
+{
+	struct open_bucket *ob;
+	unsigned i;
+
+	open_bucket_for_each(c, &wp->ptrs, ob, i) {
+		ob->data_type = wp->data_type;
+		atomic_inc(&ob->pin);
+		ob_push(c, ptrs, ob);
+	}
+}
+
+static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c,
+						  unsigned dev, u64 bucket)
+{
+	return c->open_buckets_hash +
+		(jhash_3words(dev, bucket, bucket >> 32, 0) &
+		 (OPEN_BUCKETS_COUNT - 1));
+}
+
+static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket)
+{
+	open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket);
+
+	while (slot) {
+		struct open_bucket *ob = &c->open_buckets[slot];
+
+		if (ob->dev == dev && ob->bucket == bucket)
+			return true;
+
+		slot = ob->hash;
+	}
+
+	return false;
+}
+
+static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket)
+{
+	bool ret;
+
+	if (bch2_bucket_is_open(c, dev, bucket))	/* first look without taking the lock */
+		return true;
+
+	spin_lock(&c->freelist_lock);	/* not found: repeat the lookup under freelist_lock */
+	ret = bch2_bucket_is_open(c, dev, bucket);
+	spin_unlock(&c->freelist_lock);
+
+	return ret;
+}
+
+int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *,
+		      struct dev_stripe_state *, struct bch_devs_mask *,
+		      unsigned, unsigned *, bool *, unsigned,
+		      enum bch_data_type, enum bch_watermark,
+		      struct closure *);
+
+int bch2_alloc_sectors_start_trans(struct btree_trans *,
+				   unsigned, unsigned,
+				   struct write_point_specifier,
+				   struct bch_devs_list *,
+				   unsigned, unsigned,
+				   enum bch_watermark,
+				   unsigned,
+				   struct closure *,
+				   struct write_point **);
+
+struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *);
+
+/*
+ * Append pointers to the space we just allocated to @k, and mark @sectors space
+ * as allocated out of @ob
+ */
+static inline void
+bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp,
+				       struct bkey_i *k, unsigned sectors,
+				       bool cached)
+{
+	struct open_bucket *ob;
+	unsigned i;
+
+	BUG_ON(sectors > wp->sectors_free);
+	wp->sectors_free	-= sectors;
+	wp->sectors_allocated	+= sectors;
+
+	open_bucket_for_each(c, &wp->ptrs, ob, i) {
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+		struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob);
+
+		ptr.cached = cached ||
+			(!ca->mi.durability &&
+			 wp->data_type == BCH_DATA_user);
+
+		bch2_bkey_append_ptr(k, ptr);
+
+		BUG_ON(sectors > ob->sectors_free);
+		ob->sectors_free -= sectors;
+	}
+}
+
+void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
+				    struct bkey_i *, unsigned, bool);
+void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
+
+void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *, bool);
+
+static inline struct write_point_specifier writepoint_hashed(unsigned long v)
+{
+	return (struct write_point_specifier) { .v = v | 1 };	/* low bit set: writepoint_find() resolves it through the hash table */
+}
+
+static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
+{
+	return (struct write_point_specifier) { .v = (unsigned long) wp };	/* relies on the pointer's low bit being clear, so writepoint_find() casts it straight back */
+}
+
+void bch2_fs_allocator_foreground_init(struct bch_fs *);
+
+void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
+void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *);
+
+void bch2_write_points_to_text(struct printbuf *, struct bch_fs *);
+
+#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
new file mode 100644
index 000000000000..b91b7a461056
--- /dev/null
+++ b/fs/bcachefs/alloc_types.h
@@ -0,0 +1,126 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ALLOC_TYPES_H
+#define _BCACHEFS_ALLOC_TYPES_H
+
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+
+#include "clock_types.h"
+#include "fifo.h"
+
+struct bucket_alloc_state {
+	u64	buckets_seen;
+	u64	skipped_open;
+	u64	skipped_need_journal_commit;
+	u64	skipped_nocow;
+	u64	skipped_nouse;
+};
+
+#define BCH_WATERMARKS()		\
+	x(stripe)			\
+	x(normal)			\
+	x(copygc)			\
+	x(btree)			\
+	x(btree_copygc)			\
+	x(reclaim)
+
+enum bch_watermark {
+#define x(name)	BCH_WATERMARK_##name,
+	BCH_WATERMARKS()
+#undef x
+	BCH_WATERMARK_NR,
+};
+
+#define BCH_WATERMARK_BITS	3
+#define BCH_WATERMARK_MASK	~(~0U << BCH_WATERMARK_BITS)
+
+#define OPEN_BUCKETS_COUNT	1024
+
+#define WRITE_POINT_HASH_NR	32
+#define WRITE_POINT_MAX		32
+
+/*
+ * 0 is never a valid open_bucket_idx_t:
+ */
+typedef u16			open_bucket_idx_t;
+
+struct open_bucket {
+	spinlock_t		lock;
+	atomic_t		pin;
+	open_bucket_idx_t	freelist;
+	open_bucket_idx_t	hash;
+
+	/*
+	 * When an open bucket has an ec_stripe attached, this is the index of
+	 * the block in the stripe this open_bucket corresponds to:
+	 */
+	u8			ec_idx;
+	enum bch_data_type	data_type:6;
+	unsigned		valid:1;
+	unsigned		on_partial_list:1;
+
+	u8			dev;
+	u8			gen;
+	u32			sectors_free;
+	u64			bucket;
+	struct ec_stripe_new	*ec;
+};
+
+#define OPEN_BUCKET_LIST_MAX	15
+
+struct open_buckets {
+	open_bucket_idx_t	nr;
+	open_bucket_idx_t	v[OPEN_BUCKET_LIST_MAX];
+};
+
+struct dev_stripe_state {
+	u64			next_alloc[BCH_SB_MEMBERS_MAX];
+};
+
+#define WRITE_POINT_STATES()		\
+	x(stopped)			\
+	x(waiting_io)			\
+	x(waiting_work)			\
+	x(running)
+
+enum write_point_state {
+#define x(n)	WRITE_POINT_##n,
+	WRITE_POINT_STATES()
+#undef x
+	WRITE_POINT_STATE_NR
+};
+
+struct write_point {
+	struct {
+		struct hlist_node	node;
+		struct mutex		lock;
+		u64			last_used;
+		unsigned long		write_point;
+		enum bch_data_type	data_type;
+
+		/* calculated based on how many pointers we're actually going to use: */
+		unsigned		sectors_free;
+
+		struct open_buckets	ptrs;
+		struct dev_stripe_state	stripe;
+
+		u64			sectors_allocated;
+	} __aligned(SMP_CACHE_BYTES);
+
+	struct {
+		struct work_struct	index_update_work;
+
+		struct list_head	writes;
+		spinlock_t		writes_lock;
+
+		enum write_point_state	state;
+		u64			last_state_change;
+		u64			time[WRITE_POINT_STATE_NR];
+	} __aligned(SMP_CACHE_BYTES);
+};
+
+struct write_point_specifier {
+	unsigned long		v;
+};
+
+#endif /* _BCACHEFS_ALLOC_TYPES_H */
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
new file mode 100644
index 000000000000..cc856150a948
--- /dev/null
+++ b/fs/bcachefs/backpointers.c
@@ -0,0 +1,868 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "bbpos.h"
+#include "alloc_background.h"
+#include "backpointers.h"
+#include "btree_cache.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
+#include "error.h"
+
+#include <linux/mm.h>
+
+static bool extent_matches_bp(struct bch_fs *c,
+			      enum btree_id btree_id, unsigned level,
+			      struct bkey_s_c k,
+			      struct bpos bucket,
+			      struct bch_backpointer bp)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	/* True if any non-cached pointer in @k converts to exactly (@bucket, @bp) */
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		struct bpos bucket2;
+		struct bch_backpointer bp2;
+
+		if (p.ptr.cached)	/* cached pointers are never compared */
+			continue;
+
+		bch2_extent_ptr_to_bp(c, btree_id, level, k, p,
+				      &bucket2, &bp2);
+		if (bpos_eq(bucket, bucket2) &&
+		    !memcmp(&bp, &bp2, sizeof(bp)))	/* whole struct, byte for byte */
+			return true;
+	}
+
+	return false;
+}
+
+int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k,
+			     enum bkey_invalid_flags flags,
+			     struct printbuf *err)
+{
+	struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
+	struct bpos expected = bucket_pos_to_bp(c, bp_pos_to_bucket(c, bp.k->p),
+						bp.v->bucket_offset);
+	/* Valid only if the key sits where its bucket + bucket_offset map back to */
+	if (bpos_eq(bp.k->p, expected))
+		return 0;
+
+	prt_str(err, "backpointer at wrong pos");
+	return -BCH_ERR_invalid_bkey;
+}
+
+void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp)
+{
+	prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=",
+	       bch2_btree_ids[bp->btree_id],
+	       bp->level,
+	       (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT),
+	       (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
+	       bp->bucket_len);
+	bch2_bpos_to_text(out, bp->pos);
+}
+
+void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+	prt_str(out, "bucket=");
+	bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p));
+	prt_str(out, " ");
+
+	bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v);
+}
+
+void bch2_backpointer_swab(struct bkey_s k)	/* byte-swap a backpointer value in place */
+{
+	struct bkey_s_backpointer bp = bkey_s_to_backpointer(k);
+	/* NOTE(review): bch2_backpointer_to_text() treats bucket_offset as 64-bit; confirm swab32 covers the whole field */
+	bp.v->bucket_offset	= swab32(bp.v->bucket_offset);
+	bp.v->bucket_len	= swab32(bp.v->bucket_len);
+	bch2_bpos_swab(&bp.v->pos);
+}
+
+static noinline int backpointer_mod_err(struct btree_trans *trans,
+					struct bch_backpointer bp,
+					struct bkey_s_c bp_k,
+					struct bkey_s_c orig_k,
+					bool insert)
+{
+	struct bch_fs *c = trans->c;
+	struct printbuf buf = PRINTBUF;
+
+	if (insert) {
+		prt_printf(&buf, "existing backpointer found when inserting ");
+		bch2_backpointer_to_text(&buf, &bp);
+		prt_newline(&buf);
+		printbuf_indent_add(&buf, 2);
+
+		prt_printf(&buf, "found ");
+		bch2_bkey_val_to_text(&buf, c, bp_k);
+		prt_newline(&buf);
+
+		prt_printf(&buf, "for ");
+		bch2_bkey_val_to_text(&buf, c, orig_k);
+
+		bch_err(c, "%s", buf.buf);
+	} else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
+		prt_printf(&buf, "backpointer not found when deleting");
+		prt_newline(&buf);
+		printbuf_indent_add(&buf, 2);
+
+		prt_printf(&buf, "searching for ");
+		bch2_backpointer_to_text(&buf, &bp);
+		prt_newline(&buf);
+
+		prt_printf(&buf, "got ");
+		bch2_bkey_val_to_text(&buf, c, bp_k);
+		prt_newline(&buf);
+
+		prt_printf(&buf, "for ");
+		bch2_bkey_val_to_text(&buf, c, orig_k);
+
+		bch_err(c, "%s", buf.buf);
+	}
+
+	printbuf_exit(&buf);
+
+	if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
+		bch2_inconsistent_error(c);
+		return -EIO;
+	} else {
+		return 0;
+	}
+}
+
+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
+				struct bkey_i_backpointer *bp_k,
+				struct bch_backpointer bp,
+				struct bkey_s_c orig_k,
+				bool insert)
+{
+	struct btree_iter bp_iter;
+	struct bkey_s_c k;
+	int ret;
+
+	k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
+			       bp_k->k.p,
+			       BTREE_ITER_INTENT|
+			       BTREE_ITER_SLOTS|
+			       BTREE_ITER_WITH_UPDATES);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	if (insert
+	    ? k.k->type
+	    : (k.k->type != KEY_TYPE_backpointer ||
+	       memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp)))) {
+		ret = backpointer_mod_err(trans, bp, k, orig_k, insert);
+		if (ret)
+			goto err;
+	}
+
+	ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0);
+err:
+	bch2_trans_iter_exit(trans, &bp_iter);
+	return ret;
+}
+
+/*
+ * Find the next backpointer in @bucket >= *bp_pos (*bp_pos = SPOS_MAX if none):
+ */
+int bch2_get_next_backpointer(struct btree_trans *trans,
+			      struct bpos bucket, int gen,
+			      struct bpos *bp_pos,
+			      struct bch_backpointer *bp,
+			      unsigned iter_flags)
+{
+	struct bch_fs *c = trans->c;
+	struct bpos bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0);
+	struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL };
+	struct bkey_s_c k;
+	int ret = 0;
+
+	if (bpos_ge(*bp_pos, bp_end_pos))
+		goto done;
+
+	if (gen >= 0) {
+		k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc,
+				       bucket, BTREE_ITER_CACHED|iter_flags);
+		ret = bkey_err(k);
+		if (ret)
+			goto out;
+
+		if (k.k->type != KEY_TYPE_alloc_v4 ||
+		    bkey_s_c_to_alloc_v4(k).v->gen != gen)
+			goto done;
+	}
+
+	*bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(c, bucket, 0));
+
+	for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers,
+				     *bp_pos, iter_flags, k, ret) {
+		if (bpos_ge(k.k->p, bp_end_pos))
+			break;
+
+		*bp_pos = k.k->p;
+		*bp = *bkey_s_c_to_backpointer(k).v;
+		goto out;
+	}
+done:
+	*bp_pos = SPOS_MAX;
+out:
+	bch2_trans_iter_exit(trans, &bp_iter);
+	bch2_trans_iter_exit(trans, &alloc_iter);
+	return ret;
+}
+
+static void backpointer_not_found(struct btree_trans *trans,
+				  struct bpos bp_pos,
+				  struct bch_backpointer bp,
+				  struct bkey_s_c k,
+				  const char *thing_it_points_to)
+{
+	struct bch_fs *c = trans->c;
+	struct printbuf buf = PRINTBUF;
+	struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
+	/* Nothing is reported unless bch2_backpointers_no_use_write_buffer is set */
+	if (likely(!bch2_backpointers_no_use_write_buffer))
+		return;
+	/* Message: bucket, backpointer pos, backpointer contents, then the key @k that was found */
+	prt_printf(&buf, "backpointer doesn't match %s it points to:\n  ",
+		   thing_it_points_to);
+	prt_printf(&buf, "bucket: ");
+	bch2_bpos_to_text(&buf, bucket);
+	prt_printf(&buf, "\n  ");
+
+	prt_printf(&buf, "backpointer pos: ");
+	bch2_bpos_to_text(&buf, bp_pos);
+	prt_printf(&buf, "\n  ");
+
+	bch2_backpointer_to_text(&buf, &bp);
+	prt_printf(&buf, "\n  ");
+	bch2_bkey_val_to_text(&buf, c, k);
+	if (c->curr_recovery_pass >= BCH_RECOVERY_PASS_check_extents_to_backpointers)
+		bch_err_ratelimited(c, "%s", buf.buf);	/* at or past that pass: rate-limited log only */
+	else
+		bch2_trans_inconsistent(trans, "%s", buf.buf);	/* before that pass: flagged as an inconsistency */
+
+	printbuf_exit(&buf);
+}
+
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
+					 struct btree_iter *iter,
+					 struct bpos bp_pos,
+					 struct bch_backpointer bp,
+					 unsigned iter_flags)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_root *r = bch2_btree_id_root(c, bp.btree_id);
+	struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
+	struct bkey_s_c k;
+
+	bch2_trans_node_iter_init(trans, iter,
+				  bp.btree_id,
+				  bp.pos,
+				  0,
+				  min(bp.level, r->level),
+				  iter_flags);
+	k = bch2_btree_iter_peek_slot(iter);
+	if (bkey_err(k)) {
+		bch2_trans_iter_exit(trans, iter);
+		return k;
+	}
+
+	if (bp.level == r->level + 1)
+		k = bkey_i_to_s_c(&r->key);
+
+	if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
+		return k;
+
+	bch2_trans_iter_exit(trans, iter);
+
+	if (unlikely(bch2_backpointers_no_use_write_buffer)) {
+		if (bp.level) {
+			struct btree *b;
+
+			/*
+			 * If a backpointer for a btree node wasn't found, it may be
+			 * because it was overwritten by a new btree node that hasn't
+			 * been written out yet - backpointer_get_node() checks for
+			 * this:
+			 */
+			b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
+			if (!IS_ERR_OR_NULL(b))
+				return bkey_i_to_s_c(&b->key);
+
+			bch2_trans_iter_exit(trans, iter);
+
+			if (IS_ERR(b))
+				return bkey_s_c_err(PTR_ERR(b));
+			return bkey_s_c_null;
+		}
+
+		backpointer_not_found(trans, bp_pos, bp, k, "extent");
+	}
+
+	return bkey_s_c_null;
+}
+
+struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
+					struct btree_iter *iter,
+					struct bpos bp_pos,
+					struct bch_backpointer bp)
+{
+	struct bch_fs *c = trans->c;
+	struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
+	struct btree *b;
+
+	BUG_ON(!bp.level);	/* bp.level - 1 is used as the iterator depth below */
+	/* Returns the matching node with @iter still live; every other path exits @iter and returns NULL or an ERR_PTR */
+	bch2_trans_node_iter_init(trans, iter,
+				  bp.btree_id,
+				  bp.pos,
+				  0,
+				  bp.level - 1,
+				  0);
+	b = bch2_btree_iter_peek_node(iter);
+	if (IS_ERR_OR_NULL(b))	/* no node: &b->key must not be formed or printed */
+		goto err;
+
+	if (extent_matches_bp(c, bp.btree_id, bp.level,
+			      bkey_i_to_s_c(&b->key),
+			      bucket, bp))
+		return b;
+
+	if (btree_node_will_make_reachable(b)) {
+		b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
+	} else {
+		backpointer_not_found(trans, bp_pos, bp,
+				      bkey_i_to_s_c(&b->key), "btree node");
+		b = NULL;
+	}
+err:
+	bch2_trans_iter_exit(trans, iter);
+	return b;
+}
+
+static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter,
+					struct bkey_s_c k)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter alloc_iter = { NULL };
+	struct bkey_s_c alloc_k;
+	struct printbuf buf = PRINTBUF;
+	int ret = 0;
+
+	if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c,
+			"backpointer for missing device:\n%s",
+			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+		ret = bch2_btree_delete_at(trans, bp_iter, 0);
+		goto out;
+	}
+
+	alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc,
+				     bp_pos_to_bucket(c, k.k->p), 0);
+	ret = bkey_err(alloc_k);
+	if (ret)
+		goto out;
+
+	if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c,
+			"backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
+			alloc_iter.pos.inode, alloc_iter.pos.offset,
+			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+		ret = bch2_btree_delete_at(trans, bp_iter, 0);
+		goto out;
+	}
+out:
+fsck_err:
+	bch2_trans_iter_exit(trans, &alloc_iter);
+	printbuf_exit(&buf);
+	return ret;
+}
+
+/* verify that every backpointer has a corresponding alloc key */
+int bch2_check_btree_backpointers(struct bch_fs *c)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	ret = bch2_trans_run(c,
+		for_each_btree_key_commit(trans, iter,
+			BTREE_ID_backpointers, POS_MIN, 0, k,
+			NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+		  bch2_check_btree_backpointer(trans, &iter, k)));
+	if (ret)
+		bch_err_fn(c, ret);
+	return ret;
+}
+
+struct bpos_level {
+	unsigned	level;
+	struct bpos	pos;
+};
+
+static int check_bp_exists(struct btree_trans *trans,
+			   struct bpos bucket,
+			   struct bch_backpointer bp,
+			   struct bkey_s_c orig_k,
+			   struct bpos bucket_start,
+			   struct bpos bucket_end,
+			   struct bpos_level *last_flushed)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter bp_iter = { NULL };
+	struct printbuf buf = PRINTBUF;
+	struct bkey_s_c bp_k;
+	int ret = 0;	/* "missing" can be reached before any other assignment to ret */
+
+	if (bpos_lt(bucket, bucket_start) ||
+	    bpos_gt(bucket, bucket_end))
+		return 0;	/* bucket is outside [bucket_start, bucket_end] */
+
+	if (!bch2_dev_bucket_exists(c, bucket))
+		goto missing;
+
+	bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
+				  bucket_pos_to_bp(c, bucket, bp.bucket_offset),
+				  0);
+	ret = bkey_err(bp_k);
+	if (ret)
+		goto err;
+
+	if (bp_k.k->type != KEY_TYPE_backpointer ||
+	    memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) {
+		if (last_flushed->level != bp.level ||
+		    !bpos_eq(last_flushed->pos, orig_k.k->p)) {
+			last_flushed->level = bp.level;
+			last_flushed->pos = orig_k.k->p;
+			/* First miss for this key: flush the write buffer and restart before reporting */
+			ret = bch2_btree_write_buffer_flush_sync(trans) ?:
+				-BCH_ERR_transaction_restart_write_buffer_flush;
+			goto out;
+		}
+		goto missing;
+	}
+out:
+err:
+fsck_err:
+	bch2_trans_iter_exit(trans, &bp_iter);
+	printbuf_exit(&buf);
+	return ret;
+missing:
+	prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
+	       bch2_btree_ids[bp.btree_id], bp.level);
+	bch2_bkey_val_to_text(&buf, c, orig_k);
+	prt_printf(&buf, "\nbp pos ");
+	bch2_bpos_to_text(&buf, bp_iter.pos);
+
+	if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointers ||
+	    c->opts.reconstruct_alloc ||
+	    fsck_err(c, "%s", buf.buf))
+		ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
+
+	goto out;
+}
+
+static int check_extent_to_backpointers(struct btree_trans *trans,
+					struct btree_iter *iter,
+					struct bpos bucket_start,
+					struct bpos bucket_end,
+					struct bpos_level *last_flushed)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_ptrs_c ptrs;
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	struct bkey_s_c k;
+	int ret;
+
+	k = bch2_btree_iter_peek_all_levels(iter);
+	ret = bkey_err(k);
+	if (ret)
+		return ret;
+	if (!k.k)
+		return 0;
+
+	ptrs = bch2_bkey_ptrs_c(k);
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		struct bpos bucket_pos;
+		struct bch_backpointer bp;
+
+		if (p.ptr.cached)
+			continue;
+
+		bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level,
+				      k, p, &bucket_pos, &bp);
+
+		ret = check_bp_exists(trans, bucket_pos, bp, k,
+				      bucket_start, bucket_end,
+				      last_flushed);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int check_btree_root_to_backpointers(struct btree_trans *trans,
+					    enum btree_id btree_id,
+					    struct bpos bucket_start,
+					    struct bpos bucket_end,
+					    struct bpos_level *last_flushed)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_root *r = bch2_btree_id_root(c, btree_id);
+	struct btree_iter iter;
+	struct btree *b;
+	struct bkey_s_c k;
+	struct bkey_ptrs_c ptrs;
+	struct extent_ptr_decoded p;
+	const union bch_extent_entry *entry;
+	int ret;
+
+	bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, r->level, 0);
+	b = bch2_btree_iter_peek_node(&iter);
+	ret = PTR_ERR_OR_ZERO(b);
+	if (ret)
+		goto err;
+
+	BUG_ON(b != btree_node_root(c, b));
+
+	k = bkey_i_to_s_c(&b->key);
+	ptrs = bch2_bkey_ptrs_c(k);
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		struct bpos bucket_pos;
+		struct bch_backpointer bp;
+
+		if (p.ptr.cached)
+			continue;
+
+		bch2_extent_ptr_to_bp(c, iter.btree_id, b->c.level + 1,
+				      k, p, &bucket_pos, &bp);
+
+		ret = check_bp_exists(trans, bucket_pos, bp, k,
+				      bucket_start, bucket_end,
+				      last_flushed);
+		if (ret)
+			goto err;
+	}
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
+{
+	/* Only the btree id and position recorded in @bp are carried over */
+	struct bbpos ret = BBPOS(bp.btree_id, bp.pos);
+
+	return ret;
+}
+
+static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
+{
+	struct sysinfo i;
+	u64 mem_bytes;
+	/* Number of btree nodes (btree_bytes(c) each) that fit in half of i.totalram * i.mem_unit */
+	si_meminfo(&i);
+	mem_bytes = (u64) i.totalram * i.mem_unit;	/* widen first so the product cannot wrap on 32-bit builds */
+	return div_u64(mem_bytes >> 1, btree_bytes(c));
+}
+
+static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
+					unsigned btree_leaf_mask,
+					unsigned btree_interior_mask,
+					struct bbpos start, struct bbpos *end)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);	/* budget, decremented once per key seen below */
+	enum btree_id btree;
+	int ret = 0;
+
+	for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) {
+		unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2;	/* 1 if this btree is in the leaf mask, else 2 */
+
+		if (!((1U << btree) & btree_leaf_mask) &&
+		    !((1U << btree) & btree_interior_mask))
+			continue;	/* btree selected by neither mask */
+
+		bch2_trans_node_iter_init(trans, &iter, btree,
+					  btree == start.btree ? start.pos : POS_MIN,
+					  0, depth, 0);
+		/*
+		 * for_each_btree_key_continue() doesn't check the return value
+		 * from bch2_btree_iter_advance(), which is needed when
+		 * iterating over interior nodes where we'll see keys at
+		 * SPOS_MAX:
+		 */
+		do {
+			k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0);
+			ret = bkey_err(k);
+			if (!k.k || ret)
+				break;
+
+			--btree_nodes;
+			if (!btree_nodes) {	/* budget used up: this key's position becomes *end */
+				*end = BBPOS(btree, k.k->p);
+				bch2_trans_iter_exit(trans, &iter);
+				return 0;
+			}
+		} while (bch2_btree_iter_advance(&iter));
+		bch2_trans_iter_exit(trans, &iter);
+	}
+
+	*end = BBPOS_MAX;
+	return ret;
+}
+
+static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
+						   struct bpos bucket_start,
+						   struct bpos bucket_end)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	enum btree_id btree_id;
+	struct bpos_level last_flushed = { UINT_MAX, POS_MIN };
+	int ret = 0;
+
+	for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) {
+		unsigned depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
+
+		bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
+					  depth,
+					  BTREE_ITER_ALL_LEVELS|
+					  BTREE_ITER_PREFETCH);
+
+		do {
+			ret = commit_do(trans, NULL, NULL,
+					BTREE_INSERT_LAZY_RW|
+					BTREE_INSERT_NOFAIL,
+					check_extent_to_backpointers(trans, &iter,
+								bucket_start, bucket_end,
+								&last_flushed));
+			if (ret)
+				break;
+		} while (!bch2_btree_iter_advance(&iter));
+
+		bch2_trans_iter_exit(trans, &iter);
+
+		if (ret)
+			break;
+
+		ret = commit_do(trans, NULL, NULL,
+				BTREE_INSERT_LAZY_RW|
+				BTREE_INSERT_NOFAIL,
+				check_btree_root_to_backpointers(trans, btree_id,
+							bucket_start, bucket_end,
+							&last_flushed));
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
+static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c,
+					 struct bpos bucket)
+{
+	if (!bch2_dev_exists2(c, bucket.inode))
+		return bucket;	/* device check failed: hand the position back unconverted */
+	return bucket_pos_to_bp(c, bucket, 0);
+}
+
+static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
+					struct bpos start, struct bpos *end)
+{
+	struct btree_iter alloc_iter;
+	struct btree_iter bp_iter;
+	struct bkey_s_c alloc_k, bp_k;
+	size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
+	bool alloc_end = false, bp_end = false;
+	int ret = 0;
+
+	bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
+				  start, 0, 1, 0);
+	bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
+				  bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0);
+	while (1) {
+		alloc_k = !alloc_end
+			? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0)
+			: bkey_s_c_null;
+		bp_k = !bp_end
+			? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0)
+			: bkey_s_c_null;
+
+		ret = bkey_err(alloc_k) ?: bkey_err(bp_k);
+		if ((!alloc_k.k && !bp_k.k) || ret) {
+			*end = SPOS_MAX;
+			break;
+		}
+
+		--btree_nodes;
+		if (!btree_nodes) {
+			*end = alloc_k.k ? alloc_k.k->p : SPOS_MAX;
+			break;
+		}
+
+		if (bpos_lt(alloc_iter.pos, SPOS_MAX) &&
+		    bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) {
+			if (!bch2_btree_iter_advance(&alloc_iter))
+				alloc_end = true;
+		} else {
+			if (!bch2_btree_iter_advance(&bp_iter))
+				bp_end = true;
+		}
+	}
+	bch2_trans_iter_exit(trans, &bp_iter);
+	bch2_trans_iter_exit(trans, &alloc_iter);
+	return ret;
+}
+
+int bch2_check_extents_to_backpointers(struct bch_fs *c)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct bpos start = POS_MIN, end;
+	int ret;
+
+	while (1) {
+		ret = bch2_get_alloc_in_memory_pos(trans, start, &end);
+		if (ret)
+			break;
+
+		if (bpos_eq(start, POS_MIN) && !bpos_eq(end, SPOS_MAX))
+			bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
+				    __func__, btree_nodes_fit_in_ram(c));
+
+		if (!bpos_eq(start, POS_MIN) || !bpos_eq(end, SPOS_MAX)) {
+			struct printbuf buf = PRINTBUF;
+
+			prt_str(&buf, "check_extents_to_backpointers(): ");
+			bch2_bpos_to_text(&buf, start);
+			prt_str(&buf, "-");
+			bch2_bpos_to_text(&buf, end);
+
+			bch_verbose(c, "%s", buf.buf);
+			printbuf_exit(&buf);
+		}
+
+		ret = bch2_check_extents_to_backpointers_pass(trans, start, end);
+		if (ret || bpos_eq(end, SPOS_MAX))
+			break;
+
+		start = bpos_successor(end);
+	}
+	bch2_trans_put(trans);
+
+	if (ret)
+		bch_err_fn(c, ret);
+	return ret;
+}
+
+static int check_one_backpointer(struct btree_trans *trans,
+				 struct bbpos start,
+				 struct bbpos end,
+				 struct bkey_s_c_backpointer bp,
+				 struct bpos *last_flushed_pos)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct bbpos pos = bp_to_bbpos(*bp.v);
+	struct bkey_s_c k;
+	struct printbuf buf = PRINTBUF;
+	int ret;
+
+	if (bbpos_cmp(pos, start) < 0 ||
+	    bbpos_cmp(pos, end) > 0)
+		return 0;	/* target lies outside [start, end]: not checked in this call */
+
+	k = bch2_backpointer_get_key(trans, &iter, bp.k->p, *bp.v, 0);
+	ret = bkey_err(k);
+	if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+		return 0;	/* this particular error is deliberately ignored */
+	if (ret)
+		return ret;
+
+	if (!k.k && !bpos_eq(*last_flushed_pos, bp.k->p)) {
+		*last_flushed_pos = bp.k->p;	/* remembered so a second miss at this pos skips the flush */
+		ret = bch2_btree_write_buffer_flush_sync(trans) ?:
+			-BCH_ERR_transaction_restart_write_buffer_flush;
+		goto out;
+	}
+
+	if (fsck_err_on(!k.k, c,
+			"backpointer for missing extent\n  %s",
+			(bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) {
+		ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p);
+		goto out;
+	}
+out:
+fsck_err:
+	bch2_trans_iter_exit(trans, &iter);
+	printbuf_exit(&buf);
+	return ret;
+}
+
+static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
+						   struct bbpos start,
+						   struct bbpos end)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bpos last_flushed_pos = SPOS_MAX;
+
+	return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
+				  POS_MIN, BTREE_ITER_PREFETCH, k,
+				  NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+		check_one_backpointer(trans, start, end,
+				      bkey_s_c_to_backpointer(k),
+				      &last_flushed_pos));
+}
+
+int bch2_check_backpointers_to_extents(struct bch_fs *c)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end;
+	int ret;
+
+	while (1) {
+		ret = bch2_get_btree_in_memory_pos(trans,
+						   (1U << BTREE_ID_extents)|
+						   (1U << BTREE_ID_reflink),
+						   ~0,
+						   start, &end);
+		if (ret)
+			break;
+
+		if (!bbpos_cmp(start, BBPOS_MIN) &&
+		    bbpos_cmp(end, BBPOS_MAX))
+			bch_verbose(c, "%s(): extents do not fit in ram, running in multiple passes with %zu nodes per pass",
+				    __func__, btree_nodes_fit_in_ram(c));
+
+		if (bbpos_cmp(start, BBPOS_MIN) ||
+		    bbpos_cmp(end, BBPOS_MAX)) {
+			struct printbuf buf = PRINTBUF;
+
+			prt_str(&buf, "check_backpointers_to_extents(): ");
+			bch2_bbpos_to_text(&buf, start);
+			prt_str(&buf, "-");
+			bch2_bbpos_to_text(&buf, end);
+
+			bch_verbose(c, "%s", buf.buf);
+			printbuf_exit(&buf);
+		}
+
+		ret = bch2_check_backpointers_to_extents_pass(trans, start, end);
+		if (ret || !bbpos_cmp(end, BBPOS_MAX))
+			break;
+
+		start = bbpos_successor(end);
+	}
+	bch2_trans_put(trans);
+
+	if (ret)
+		bch_err_fn(c, ret);
+	return ret;
+}
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
new file mode 100644
index 000000000000..547e0617602a
--- /dev/null
+++ b/fs/bcachefs/backpointers.h
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "super.h"
+
+int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k,
+			     enum bkey_invalid_flags, struct printbuf *);
+void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *);
+void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+void bch2_backpointer_swab(struct bkey_s);
+
+#define bch2_bkey_ops_backpointer ((struct bkey_ops) {	\
+	.key_invalid	= bch2_backpointer_invalid,	\
+	.val_to_text	= bch2_backpointer_k_to_text,	\
+	.swab		= bch2_backpointer_swab,	\
+	.min_val_size	= 32,				\
+})
+
+#define MAX_EXTENT_COMPRESS_RATIO_SHIFT		10
+
+/*
+ * Convert from pos in backpointer btree to pos of corresponding bucket in alloc
+ * btree:
+ */
+static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c,
+					   struct bpos bp_pos)
+{
+	struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode);
+	u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
+
+	return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector));
+}
+
+/*
+ * Convert from pos in alloc btree + bucket offset to pos in backpointer btree:
+ */
+static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c,
+					   struct bpos bucket,
+					   u64 bucket_offset)
+{
+	struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
+	struct bpos ret;
+
+	ret = POS(bucket.inode,
+		  (bucket_to_sector(ca, bucket.offset) <<
+		   MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
+
+	EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret)));
+
+	return ret;
+}
+
+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bkey_i_backpointer *,
+				struct bch_backpointer, struct bkey_s_c, bool);
+
+static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
+				struct bpos bucket,
+				struct bch_backpointer bp,
+				struct bkey_s_c orig_k,
+				bool insert)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_i_backpointer *bp_k;
+	int ret;
+
+	bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer));
+	ret = PTR_ERR_OR_ZERO(bp_k);
+	if (ret)
+		return ret;
+
+	bkey_backpointer_init(&bp_k->k_i);
+	bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset);
+	bp_k->v = bp;
+
+	if (!insert) {
+		bp_k->k.type = KEY_TYPE_deleted;
+		set_bkey_val_u64s(&bp_k->k, 0);
+	}
+
+	if (unlikely(bch2_backpointers_no_use_write_buffer))
+		return bch2_bucket_backpointer_mod_nowritebuffer(trans, bp_k, bp, orig_k, insert);
+
+	return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k->k_i);
+}
+
+static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level,
+						    struct bkey_s_c k, struct extent_ptr_decoded p)
+{
+	if (level)	/* any nonzero level is classified as btree data */
+		return BCH_DATA_btree;
+	return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user;
+}
+
+static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
+			   enum btree_id btree_id, unsigned level,
+			   struct bkey_s_c k, struct extent_ptr_decoded p,
+			   struct bpos *bucket_pos, struct bch_backpointer *bp)
+{
+	enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
+	s64 sectors = level ? btree_sectors(c) : k.k->size;
+	u32 bucket_offset;
+
+	*bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset);
+	*bp = (struct bch_backpointer) {
+		.btree_id	= btree_id,
+		.level		= level,
+		.data_type	= data_type,
+		.bucket_offset	= ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) +
+			p.crc.offset,
+		.bucket_len	= ptr_disk_sectors(sectors, p),
+		.pos		= k.k->p,
+	};
+}
+
+int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int,
+			      struct bpos *, struct bch_backpointer *, unsigned);
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *,
+					 struct bpos, struct bch_backpointer,
+					 unsigned);
+struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *,
+					struct bpos, struct bch_backpointer);
+
+int bch2_check_btree_backpointers(struct bch_fs *);
+int bch2_check_extents_to_backpointers(struct bch_fs *);
+int bch2_check_backpointers_to_extents(struct bch_fs *);
+
+#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */
diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h
new file mode 100644
index 000000000000..1fbed1f8378d
--- /dev/null
+++ b/fs/bcachefs/bbpos.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BBPOS_H
+#define _BCACHEFS_BBPOS_H
+
+#include "bkey_methods.h"
+
+struct bbpos {
+	enum btree_id		btree;
+	struct bpos		pos;
+};
+
+static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
+{
+	return (struct bbpos) { btree, pos };
+}
+
+#define BBPOS_MIN	BBPOS(0, POS_MIN)
+#define BBPOS_MAX	BBPOS(BTREE_ID_NR - 1, POS_MAX)
+
+static inline int bbpos_cmp(struct bbpos l, struct bbpos r)
+{
+	return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos);
+}
+
+static inline struct bbpos bbpos_successor(struct bbpos pos)
+{
+	if (bpos_cmp(pos.pos, SPOS_MAX)) {	/* not at SPOS_MAX yet: step within the same btree */
+		pos.pos = bpos_successor(pos.pos);
+		return pos;
+	}
+	/* NOTE(review): bound is BTREE_ID_NR while BBPOS_MAX uses BTREE_ID_NR - 1; confirm btree == BTREE_ID_NR is an intended result */
+	if (pos.btree != BTREE_ID_NR) {
+		pos.btree++;
+		pos.pos = POS_MIN;	/* restart at the beginning of the next btree */
+		return pos;
+	}
+
+	BUG();
+}
+
+static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos)
+{
+	prt_str(out, bch2_btree_ids[pos.btree]);
+	prt_char(out, ':');
+	bch2_bpos_to_text(out, pos.pos);
+}
+
+#endif /* _BCACHEFS_BBPOS_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
new file mode 100644
index 000000000000..53ffa88cae16
--- /dev/null
+++ b/fs/bcachefs/bcachefs.h
@@ -0,0 +1,1156 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_H
+#define _BCACHEFS_H
+
+/*
+ * SOME HIGH LEVEL CODE DOCUMENTATION:
+ *
+ * Bcache mostly works with cache sets, cache devices, and backing devices.
+ *
+ * Support for multiple cache devices hasn't quite been finished off yet, but
+ * it's about 95% plumbed through. A cache set and its cache devices are sort of
+ * like a md raid array and its component devices. Most of the code doesn't care
+ * about individual cache devices, the main abstraction is the cache set.
+ *
+ * Multiple cache devices is intended to give us the ability to mirror dirty
+ * cached data and metadata, without mirroring clean cached data.
+ *
+ * Backing devices are different, in that they have a lifetime independent of a
+ * cache set. When you register a newly formatted backing device it'll come up
+ * in passthrough mode, and then you can attach and detach a backing device from
+ * a cache set at runtime - while it's mounted and in use. Detaching implicitly
+ * invalidates any cached data for that backing device.
+ *
+ * A cache set can have multiple (many) backing devices attached to it.
+ *
+ * There's also flash only volumes - this is the reason for the distinction
+ * between struct cached_dev and struct bcache_device. A flash only volume
+ * works much like a bcache device that has a backing device, except the
+ * "cached" data is always dirty. The end result is that we get thin
+ * provisioning with very little additional code.
+ *
+ * Flash only volumes work but they're not production ready because the moving
+ * garbage collector needs more work. More on that later.
+ *
+ * BUCKETS/ALLOCATION:
+ *
+ * Bcache is primarily designed for caching, which means that in normal
+ * operation all of our available space will be allocated. Thus, we need an
+ * efficient way of deleting things from the cache so we can write new things to
+ * it.
+ *
+ * To do this, we first divide the cache device up into buckets. A bucket is the
+ * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
+ * works efficiently.
+ *
+ * Each bucket has a 16 bit priority, and an 8 bit generation associated with
+ * it. The gens and priorities for all the buckets are stored contiguously and
+ * packed on disk (in a linked list of buckets - aside from the superblock, all
+ * of bcache's metadata is stored in buckets).
+ *
+ * The priority is used to implement an LRU. We reset a bucket's priority when
+ * we allocate it or on a cache hit, and every so often we decrement the priority
+ * of each bucket. It could be used to implement something more sophisticated,
+ * if anyone ever gets around to it.
+ *
+ * The generation is used for invalidating buckets. Each pointer also has an 8
+ * bit generation embedded in it; for a pointer to be considered valid, its gen
+ * must match the gen of the bucket it points into.  Thus, to reuse a bucket all
+ * we have to do is increment its gen (and write its new gen to disk; we batch
+ * this up).
+ *
+ * Bcache is entirely COW - we never write twice to a bucket, even buckets that
+ * contain metadata (including btree nodes).
+ *
+ * THE BTREE:
+ *
+ * Bcache is in large part designed around the btree.
+ *
+ * At a high level, the btree is just an index of key -> ptr tuples.
+ *
+ * Keys represent extents, and thus have a size field. Keys also have a variable
+ * number of pointers attached to them (potentially zero, which is handy for
+ * invalidating the cache).
+ *
+ * The key itself is an inode:offset pair. The inode number corresponds to a
+ * backing device or a flash only volume. The offset is the ending offset of the
+ * extent within the inode - not the starting offset; this makes lookups
+ * slightly more convenient.
+ *
+ * Pointers contain the cache device id, the offset on that device, and an 8 bit
+ * generation number. More on the gen later.
+ *
+ * Index lookups are not fully abstracted - cache lookups in particular are
+ * still somewhat mixed in with the btree code, but things are headed in that
+ * direction.
+ *
+ * Updates are fairly well abstracted, though. There are two different ways of
+ * updating the btree; insert and replace.
+ *
+ * BTREE_INSERT will just take a list of keys and insert them into the btree -
+ * overwriting (possibly only partially) any extents they overlap with. This is
+ * used to update the index after a write.
+ *
+ * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
+ * overwriting a key that matches another given key. This is used for inserting
+ * data into the cache after a cache miss, and for background writeback, and for
+ * the moving garbage collector.
+ *
+ * There is no "delete" operation; deleting things from the index is
+ * accomplished either by invalidating pointers (by incrementing a bucket's
+ * gen) or by inserting a key with 0 pointers - which will overwrite anything
+ * previously present at that location in the index.
+ *
+ * This means that there are always stale/invalid keys in the btree. They're
+ * filtered out by the code that iterates through a btree node, and removed when
+ * a btree node is rewritten.
+ *
+ * BTREE NODES:
+ *
+ * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
+ * free smaller than a bucket - so, that's how big our btree nodes are.
+ *
+ * (If buckets are really big we'll only use part of the bucket for a btree node
+ * - no less than 1/4th - but a bucket still contains no more than a single
+ * btree node. I'd actually like to change this, but for now we rely on the
+ * bucket's gen for deleting btree nodes when we rewrite/split a node.)
+ *
+ * Anyways, btree nodes are big - big enough to be inefficient with a textbook
+ * btree implementation.
+ *
+ * The way this is solved is that btree nodes are internally log structured; we
+ * can append new keys to an existing btree node without rewriting it. This
+ * means each set of keys we write is sorted, but the node is not.
+ *
+ * We maintain this log structure in memory - keeping 1Mb of keys sorted would
+ * be expensive, and we have to distinguish between the keys we have written and
+ * the keys we haven't. So to do a lookup in a btree node, we have to search
+ * each sorted set. But we do merge written sets together lazily, so the cost of
+ * these extra searches is quite low (normally most of the keys in a btree node
+ * will be in one big set, and then there'll be one or two sets that are much
+ * smaller).
+ *
+ * This log structure makes bcache's btree more of a hybrid between a
+ * conventional btree and a compacting data structure, with some of the
+ * advantages of both.
+ *
+ * GARBAGE COLLECTION:
+ *
+ * We can't just invalidate any bucket - it might contain dirty data or
+ * metadata. If it once contained dirty data, other writes might overwrite it
+ * later, leaving no valid pointers into that bucket in the index.
+ *
+ * Thus, the primary purpose of garbage collection is to find buckets to reuse.
+ * It also counts how much valid data each bucket currently contains, so that
+ * allocation can reuse buckets sooner when they've been mostly overwritten.
+ *
+ * It also does some things that are really internal to the btree
+ * implementation. If a btree node contains pointers that are stale by more than
+ * some threshold, it rewrites the btree node to avoid the bucket's generation
+ * wrapping around. It also merges adjacent btree nodes if they're empty enough.
+ *
+ * THE JOURNAL:
+ *
+ * Bcache's journal is not necessary for consistency; we always strictly
+ * order metadata writes so that the btree and everything else is consistent on
+ * disk in the event of an unclean shutdown, and in fact bcache had writeback
+ * caching (with recovery from unclean shutdown) before journalling was
+ * implemented.
+ *
+ * Rather, the journal is purely a performance optimization; we can't complete a
+ * write until we've updated the index on disk, otherwise the cache would be
+ * inconsistent in the event of an unclean shutdown. This means that without the
+ * journal, on random write workloads we constantly have to update all the leaf
+ * nodes in the btree, and those writes will be mostly empty (appending at most
+ * a few keys each) - highly inefficient in terms of amount of metadata writes,
+ * and it puts more strain on the various btree resorting/compacting code.
+ *
+ * The journal is just a log of keys we've inserted; on startup we just reinsert
+ * all the keys in the open journal entries. That means that when we're updating
+ * a node in the btree, we can wait until a 4k block of keys fills up before
+ * writing them out.
+ *
+ * For simplicity, we only journal updates to leaf nodes; updates to parent
+ * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
+ * the complexity to deal with journalling them (in particular, journal replay)
+ * - updates to non leaf nodes just happen synchronously (see btree_split()).
+ */
+
+#undef pr_fmt
+#ifdef __KERNEL__
+#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__
+#else /* !__KERNEL__: same format without the "bcachefs: " prefix */
+#define pr_fmt(fmt) "%s() " fmt "\n", __func__
+#endif /* __KERNEL__ */
+
+#include <linux/backing-dev-defs.h>
+#include <linux/bug.h>
+#include <linux/bio.h>
+#include <linux/closure.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/math64.h>
+#include <linux/mutex.h>
+#include <linux/percpu-refcount.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/rhashtable.h>
+#include <linux/rwsem.h>
+#include <linux/semaphore.h>
+#include <linux/seqlock.h>
+#include <linux/shrinker.h>
+#include <linux/srcu.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/zstd.h>
+
+#include "bcachefs_format.h"
+#include "errcode.h"
+#include "fifo.h"
+#include "nocow_locking_types.h"
+#include "opts.h"
+#include "recovery_types.h"
+#include "seqmutex.h"
+#include "util.h"
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define BCH_WRITE_REF_DEBUG
+#endif
+
+#ifndef dynamic_fault
+#define dynamic_fault(...)		0	/* fallback: every fault point evaluates to 0 */
+#endif /* dynamic_fault */
+
+#define race_fault(...)			dynamic_fault("bcachefs:race")	/* arguments are ignored */
+
+#define trace_and_count(_c, _name, ...)					\
+do {	/* this_cpu_inc counters[BCH_COUNTER_<_name>], then call trace_<_name>(args) */ \
+	this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]);		\
+	trace_##_name(__VA_ARGS__);					\
+} while (0)
+
+#define bch2_fs_init_fault(name)	/* tag: "bcachefs:bch_fs_init:<name>" */ \
+	dynamic_fault("bcachefs:bch_fs_init:" name)
+#define bch2_meta_read_fault(name)	/* tag: "bcachefs:meta:read:<name>" */	\
+	dynamic_fault("bcachefs:meta:read:" name)
+#define bch2_meta_write_fault(name)	/* tag: "bcachefs:meta:write:<name>" */ \
+	dynamic_fault("bcachefs:meta:write:" name)
+
+#ifdef __KERNEL__
+#define BCACHEFS_LOG_PREFIX
+#endif /* __KERNEL__ */
+
+#ifdef BCACHEFS_LOG_PREFIX
+/* Prefixed variants: every message starts with "bcachefs (<name> ...): " */
+#define bch2_log_msg(_c, fmt)			"bcachefs (%s): " fmt, ((_c)->name)
+#define bch2_fmt_dev(_ca, fmt)			"bcachefs (%s): " fmt "\n", ((_ca)->name)
+#define bch2_fmt_dev_offset(_ca, _offset, fmt)	"bcachefs (%s sector %llu): " fmt "\n", ((_ca)->name), (_offset)
+#define bch2_fmt_inum(_c, _inum, fmt)		"bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
+#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt)			\
+	 "bcachefs (%s inum %llu offset %llu): " fmt "\n", ((_c)->name), (_inum), (_offset)
+
+#else /* !BCACHEFS_LOG_PREFIX */
+/* Unprefixed variants: _c is accepted but unused by bch2_log_msg and bch2_fmt_inum* */
+#define bch2_log_msg(_c, fmt)			fmt
+#define bch2_fmt_dev(_ca, fmt)			"%s: " fmt "\n", ((_ca)->name)
+#define bch2_fmt_dev_offset(_ca, _offset, fmt)	"%s sector %llu: " fmt "\n", ((_ca)->name), (_offset)
+#define bch2_fmt_inum(_c, _inum, fmt)		"inum %llu: " fmt "\n", (_inum)
+#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt)				\
+	 "inum %llu offset %llu: " fmt "\n", (_inum), (_offset)
+
+#endif /* BCACHEFS_LOG_PREFIX */
+
+#define bch2_fmt(_c, fmt)		bch2_log_msg(_c, fmt "\n")
+
+#define bch_info(c, fmt, ...) \
+	printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_notice(c, fmt, ...) \
+	printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_warn(c, fmt, ...) \
+	printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_warn_ratelimited(c, fmt, ...) \
+	printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+
+#define bch_err(c, fmt, ...) \
+	printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_err_dev(ca, fmt, ...) \
+	printk(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+#define bch_err_dev_offset(ca, _offset, fmt, ...) \
+	printk(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
+#define bch_err_inum(c, _inum, fmt, ...) \
+	printk(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
+#define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \
+	printk(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
+
+#define bch_err_ratelimited(c, fmt, ...) \
+	printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_err_dev_ratelimited(ca, fmt, ...) \
+	printk_ratelimited(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+#define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \
+	printk_ratelimited(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
+#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \
+	printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
+#define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \
+	printk_ratelimited(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
+
+#define bch_err_fn(_c, _ret)	/* log _ret unless it is 0 or a transaction restart */	\
+do {	/* (_ret) is parenthesized so an expression argument binds correctly to && */ \
+	if ((_ret) && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+		bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
+} while (0)
+
+#define bch_err_msg(_c, _ret, _msg, ...)	/* log _msg + error string unless _ret is 0 or a restart */ \
+do {	/* (_ret) is parenthesized so an expression argument binds correctly to && */ \
+	if ((_ret) && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+		bch_err(_c, "%s(): error " _msg " %s", __func__,	\
+			##__VA_ARGS__, bch2_err_str(_ret));		\
+} while (0)
+
+#define bch_verbose(c, fmt, ...)	/* bch_info(), gated on (c)->opts.verbose */	\
+do {									\
+	if ((c)->opts.verbose)						\
+		bch_info(c, fmt, ##__VA_ARGS__);			\
+} while (0)
+
+#define pr_verbose_init(opts, fmt, ...)	/* pr_info(), gated on opt_get(opts, verbose) */ \
+do {									\
+	if (opt_get(opts, verbose))					\
+		pr_info(fmt, ##__VA_ARGS__);				\
+} while (0)
+
+/* Parameters that are useful for debugging, but should always be compiled in: */
+#define BCH_DEBUG_PARAMS_ALWAYS()					\
+	BCH_DEBUG_PARAM(key_merging_disabled,				\
+		"Disables merging of extents")				\
+	BCH_DEBUG_PARAM(btree_gc_always_rewrite,			\
+		"Causes mark and sweep to compact and rewrite every "	\
+		"btree node it traverses")				\
+	BCH_DEBUG_PARAM(btree_gc_rewrite_disabled,			\
+		"Disables rewriting of btree nodes during mark and sweep")\
+	BCH_DEBUG_PARAM(btree_shrinker_disabled,			\
+		"Disables the shrinker callback for the btree node cache")\
+	BCH_DEBUG_PARAM(verify_btree_ondisk,				\
+		"Reread btree nodes at various points to verify the "	\
+		"mergesort in the read path against modifications "	\
+		"done in memory")					\
+	BCH_DEBUG_PARAM(verify_all_btree_replicas,			\
+		"When reading btree nodes, read all replicas and "	\
+		"compare them")						\
+	BCH_DEBUG_PARAM(backpointers_no_use_write_buffer,		\
+		"Don't use the write buffer for backpointers, enabling "\
+		"extra runtime checks")
+
+/* Parameters that should only be compiled in debug mode: */
+#define BCH_DEBUG_PARAMS_DEBUG()					\
+	BCH_DEBUG_PARAM(expensive_debug_checks,				\
+		"Enables various runtime debugging checks that "	\
+		"significantly affect performance")			\
+	BCH_DEBUG_PARAM(debug_check_iterators,				\
+		"Enables extra verification for btree iterators")	\
+	BCH_DEBUG_PARAM(debug_check_btree_accounting,			\
+		"Verify btree accounting for keys within a node")	\
+	BCH_DEBUG_PARAM(journal_seq_verify,				\
+		"Store the journal sequence number in the version "	\
+		"number of every btree key, and verify that btree "	\
+		"update ordering is preserved during recovery")		\
+	BCH_DEBUG_PARAM(inject_invalid_keys,				\
+		/* NOTE(review): wording inferred from the name - confirm */ \
+		"Deliberately insert invalid keys, to exercise the "	\
+		"code paths that handle invalid keys")			\
+	BCH_DEBUG_PARAM(test_alloc_startup,				\
+		"Force allocator startup to use the slowpath where it "	\
+		"can't find enough free buckets without invalidating "	\
+		"cached data")						\
+	BCH_DEBUG_PARAM(force_reconstruct_read,				\
+		"Force reads to use the reconstruct path, when reading "\
+		"from erasure coded extents")				\
+	BCH_DEBUG_PARAM(test_restart_gc,				\
+		"Test restarting mark and sweep gc when bucket gens change")
+
+#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL()
+#else /* !CONFIG_BCACHEFS_DEBUG */
+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
+#endif /* CONFIG_BCACHEFS_DEBUG */
+/* Declare an extern bool bch2_<name> for every parameter enabled in this build: */
+#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
+BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+/* Non-debug builds: debug-only params are static const bools with no initializer (false): */
+#ifndef CONFIG_BCACHEFS_DEBUG
+#define BCH_DEBUG_PARAM(name, description) static const __maybe_unused bool bch2_##name;
+BCH_DEBUG_PARAMS_DEBUG()
+#undef BCH_DEBUG_PARAM
+#endif /* !CONFIG_BCACHEFS_DEBUG */
+
+#define BCH_TIME_STATS()			\
+	x(btree_node_mem_alloc)			\
+	x(btree_node_split)			\
+	x(btree_node_compact)			\
+	x(btree_node_merge)			\
+	x(btree_node_sort)			\
+	x(btree_node_read)			\
+	x(btree_interior_update_foreground)	\
+	x(btree_interior_update_total)		\
+	x(btree_gc)				\
+	x(data_write)				\
+	x(data_read)				\
+	x(data_promote)				\
+	x(journal_flush_write)			\
+	x(journal_noflush_write)		\
+	x(journal_flush_seq)			\
+	x(blocked_journal)			\
+	x(blocked_allocate)			\
+	x(blocked_allocate_open_bucket)		\
+	x(nocow_lock_contended)
+
+enum bch_time_stats {
+#define x(name) BCH_TIME_##name,
+	BCH_TIME_STATS()
+#undef x
+	BCH_TIME_STAT_NR
+};
+
+#include "alloc_types.h"
+#include "btree_types.h"
+#include "btree_write_buffer_types.h"
+#include "buckets_types.h"
+#include "buckets_waiting_for_journal_types.h"
+#include "clock_types.h"
+#include "ec_types.h"
+#include "journal_types.h"
+#include "keylist_types.h"
+#include "quota_types.h"
+#include "rebalance_types.h"
+#include "replicas_types.h"
+#include "subvolume_types.h"
+#include "super_types.h"
+
+/* Number of nodes btree coalesce will try to coalesce at once */
+#define GC_MERGE_NODES		4U
+
+/* Maximum number of nodes we might need to allocate atomically: */
+#define BTREE_RESERVE_MAX	(BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1))
+
+/* Size of the freelist we allocate btree nodes from: */
+#define BTREE_NODE_RESERVE	(BTREE_RESERVE_MAX * 4)
+
+#define BTREE_NODE_OPEN_BUCKET_RESERVE	(BTREE_RESERVE_MAX * BCH_REPLICAS_MAX)
+
+struct btree;
+
+enum gc_phase {
+	GC_PHASE_NOT_RUNNING,
+	GC_PHASE_START,
+	GC_PHASE_SB,
+
+	GC_PHASE_BTREE_stripes,
+	GC_PHASE_BTREE_extents,
+	GC_PHASE_BTREE_inodes,
+	GC_PHASE_BTREE_dirents,
+	GC_PHASE_BTREE_xattrs,
+	GC_PHASE_BTREE_alloc,
+	GC_PHASE_BTREE_quotas,
+	GC_PHASE_BTREE_reflink,
+	GC_PHASE_BTREE_subvolumes,
+	GC_PHASE_BTREE_snapshots,
+	GC_PHASE_BTREE_lru,
+	GC_PHASE_BTREE_freespace,
+	GC_PHASE_BTREE_need_discard,
+	GC_PHASE_BTREE_backpointers,
+	GC_PHASE_BTREE_bucket_gens,
+	GC_PHASE_BTREE_snapshot_trees,
+	GC_PHASE_BTREE_deleted_inodes,
+	GC_PHASE_BTREE_logged_ops,
+
+	GC_PHASE_PENDING_DELETE,
+};
+
+struct gc_pos {
+	enum gc_phase		phase;
+	struct bpos		pos;
+	unsigned		level;
+};
+
+struct reflink_gc {
+	u64		offset;
+	u32		size;
+	u32		refcount;
+};
+
+typedef GENRADIX(struct reflink_gc) reflink_gc_table;
+
+struct io_count {
+	u64			sectors[2][BCH_DATA_NR];
+};
+
+struct bch_dev {
+	struct kobject		kobj;
+	struct percpu_ref	ref;
+	struct completion	ref_completion;
+	struct percpu_ref	io_ref;
+	struct completion	io_ref_completion;
+
+	struct bch_fs		*fs;
+
+	u8			dev_idx;
+	/*
+	 * Cached version of this device's member info from superblock
+	 * Committed by bch2_write_super() -> bch_fs_mi_update()
+	 */
+	struct bch_member_cpu	mi;
+	__uuid_t		uuid;
+	char			name[BDEVNAME_SIZE];
+
+	struct bch_sb_handle	disk_sb;
+	struct bch_sb		*sb_read_scratch;
+	int			sb_write_error;
+	dev_t			dev;
+	atomic_t		flush_seq;
+
+	struct bch_devs_mask	self;
+
+	/* biosets used in cloned bios for writing multiple replicas */
+	struct bio_set		replica_set;
+
+	/*
+	 * Buckets:
+	 * Per-bucket arrays are protected by c->mark_lock, bucket_lock and
+	 * gc_lock, for device resize - holding any is sufficient for access:
+	 * Or rcu_read_lock(), but only for ptr_stale():
+	 */
+	struct bucket_array __rcu *buckets_gc;
+	struct bucket_gens __rcu *bucket_gens;
+	u8			*oldest_gen;
+	unsigned long		*buckets_nouse;
+	struct rw_semaphore	bucket_lock;
+
+	struct bch_dev_usage		*usage_base;
+	struct bch_dev_usage __percpu	*usage[JOURNAL_BUF_NR];
+	struct bch_dev_usage __percpu	*usage_gc;
+
+	/* Allocator: */
+	u64			new_fs_bucket_idx;
+	u64			alloc_cursor;
+
+	unsigned		nr_open_buckets;
+	unsigned		nr_btree_reserve;
+
+	size_t			inc_gen_needs_gc;
+	size_t			inc_gen_really_needs_gc;
+	size_t			buckets_waiting_on_journal;
+
+	atomic64_t		rebalance_work;
+
+	struct journal_device	journal;
+	u64			prev_journal_sector;
+
+	struct work_struct	io_error_work;
+
+	/* The rest of this all shows up in sysfs */
+	atomic64_t		cur_latency[2];
+	struct bch2_time_stats	io_latency[2];
+
+#define CONGESTED_MAX		1024
+	atomic_t		congested;
+	u64			congested_last;
+
+	struct io_count __percpu *io_done;
+};
+
+enum {
+	/* startup: */
+	BCH_FS_STARTED,
+	BCH_FS_MAY_GO_RW,
+	BCH_FS_RW,
+	BCH_FS_WAS_RW,
+
+	/* shutdown: */
+	BCH_FS_STOPPING,
+	BCH_FS_EMERGENCY_RO,
+	BCH_FS_GOING_RO,
+	BCH_FS_WRITE_DISABLE_COMPLETE,
+	BCH_FS_CLEAN_SHUTDOWN,
+
+	/* fsck passes: */
+	BCH_FS_FSCK_DONE,
+	BCH_FS_INITIAL_GC_UNFIXED,	/* kill when we enumerate fsck errors */
+	BCH_FS_NEED_ANOTHER_GC,
+
+	BCH_FS_HAVE_DELETED_SNAPSHOTS,
+
+	/* errors: */
+	BCH_FS_ERROR,
+	BCH_FS_TOPOLOGY_ERROR,
+	BCH_FS_ERRORS_FIXED,
+	BCH_FS_ERRORS_NOT_FIXED,
+};
+
+struct btree_debug {
+	unsigned		id;
+};
+
+#define BCH_TRANSACTIONS_NR 128
+
+struct btree_transaction_stats {
+	struct bch2_time_stats	lock_hold_times;
+	struct mutex		lock;
+	unsigned		nr_max_paths;
+	unsigned		wb_updates_size;
+	unsigned		max_mem;
+	char			*max_paths_text;
+};
+
+struct bch_fs_pcpu {
+	u64			sectors_available;
+};
+
+struct journal_seq_blacklist_table {
+	size_t			nr;		/* presumably the number of entries[] - confirm */
+	struct journal_seq_blacklist_table_entry {
+		u64		start;
+		u64		end;
+		bool		dirty;
+	}			entries[];	/* C99 flexible array member; must stay last */
+};
+
+struct journal_keys {
+	struct journal_key {
+		u64		journal_seq;
+		u32		journal_offset;
+		enum btree_id	btree_id:8;
+		unsigned	level:8;
+		bool		allocated;
+		bool		overwritten;
+		struct bkey_i	*k;
+	}			*d;
+	/*
+	 * Gap buffer: instead of all the empty space in the array being at the
+	 * end of the buffer - from @nr to @size - the empty space is at @gap.
+	 * This means that sequential insertions are O(n) instead of O(n^2).
+	 */
+	size_t			gap;
+	size_t			nr;
+	size_t			size;
+};
+
+struct btree_trans_buf {
+	struct btree_trans	*trans;
+};
+
+#define REPLICAS_DELTA_LIST_MAX	(1U << 16)
+
+#define BCACHEFS_ROOT_SUBVOL_INUM					\
+	((subvol_inum) { BCACHEFS_ROOT_SUBVOL,	BCACHEFS_ROOT_INO })
+
+#define BCH_WRITE_REFS()						\
+	x(trans)							\
+	x(write)							\
+	x(promote)							\
+	x(node_rewrite)							\
+	x(stripe_create)						\
+	x(stripe_delete)						\
+	x(reflink)							\
+	x(fallocate)							\
+	x(discard)							\
+	x(invalidate)							\
+	x(delete_dead_snapshots)					\
+	x(snapshot_delete_pagecache)					\
+	x(sysfs)
+
+enum bch_write_ref {
+#define x(n) BCH_WRITE_REF_##n,
+	BCH_WRITE_REFS()
+#undef x
+	BCH_WRITE_REF_NR,
+};
+
+struct bch_fs {
+	struct closure		cl;
+
+	struct list_head	list;
+	struct kobject		kobj;
+	struct kobject		counters_kobj;
+	struct kobject		internal;
+	struct kobject		opts_dir;
+	struct kobject		time_stats;
+	unsigned long		flags;
+
+	int			minor;
+	struct device		*chardev;
+	struct super_block	*vfs_sb;
+	dev_t			dev;
+	char			name[40];
+
+	/* ro/rw, add/remove/resize devices: */
+	struct rw_semaphore	state_lock;
+
+	/* Counts outstanding writes, for clean transition to read-only */
+#ifdef BCH_WRITE_REF_DEBUG
+	atomic_long_t		writes[BCH_WRITE_REF_NR];
+#else
+	struct percpu_ref	writes;
+#endif
+	struct work_struct	read_only_work;
+
+	struct bch_dev __rcu	*devs[BCH_SB_MEMBERS_MAX];
+
+	struct bch_replicas_cpu replicas;
+	struct bch_replicas_cpu replicas_gc;
+	struct mutex		replicas_gc_lock;
+	mempool_t		replicas_delta_pool;
+
+	struct journal_entry_res btree_root_journal_res;
+	struct journal_entry_res replicas_journal_res;
+	struct journal_entry_res clock_journal_res;
+	struct journal_entry_res dev_usage_journal_res;
+
+	struct bch_disk_groups_cpu __rcu *disk_groups;
+
+	struct bch_opts		opts;
+
+	/* Updated by bch2_sb_update():*/
+	struct {
+		__uuid_t	uuid;
+		__uuid_t	user_uuid;
+
+		u16		version;
+		u16		version_min;
+		u16		version_upgrade_complete;
+
+		u8		nr_devices;
+		u8		clean;
+
+		u8		encryption_type;
+
+		u64		time_base_lo;
+		u32		time_base_hi;
+		unsigned	time_units_per_sec;
+		unsigned	nsec_per_time_unit;
+		u64		features;
+		u64		compat;
+	}			sb;
+
+
+	struct bch_sb_handle	disk_sb;
+
+	unsigned short		block_bits;	/* ilog2(block_size) */
+
+	u16			btree_foreground_merge_threshold;
+
+	struct closure		sb_write;
+	struct mutex		sb_lock;
+
+	/* snapshot.c: */
+	struct snapshot_table __rcu *snapshots;
+	size_t			snapshot_table_size;
+	struct mutex		snapshot_table_lock;
+	struct rw_semaphore	snapshot_create_lock;
+
+	struct work_struct	snapshot_delete_work;
+	struct work_struct	snapshot_wait_for_pagecache_and_delete_work;
+	snapshot_id_list	snapshots_unlinked;
+	struct mutex		snapshots_unlinked_lock;
+
+	/* BTREE CACHE */
+	struct bio_set		btree_bio;
+	struct workqueue_struct	*io_complete_wq;
+
+	struct btree_root	btree_roots_known[BTREE_ID_NR];
+	DARRAY(struct btree_root) btree_roots_extra;
+	struct mutex		btree_root_lock;
+
+	struct btree_cache	btree_cache;
+
+	/*
+	 * Cache of allocated btree nodes - if we allocate a btree node and
+	 * don't use it, if we free it that space can't be reused until going
+	 * _all_ the way through the allocator (which exposes us to a livelock
+	 * when allocating btree reserves fail halfway through) - instead, we
+	 * can stick them here:
+	 */
+	struct btree_alloc	btree_reserve_cache[BTREE_NODE_RESERVE * 2];
+	unsigned		btree_reserve_cache_nr;
+	struct mutex		btree_reserve_cache_lock;
+
+	mempool_t		btree_interior_update_pool;
+	struct list_head	btree_interior_update_list;
+	struct list_head	btree_interior_updates_unwritten;
+	struct mutex		btree_interior_update_lock;
+	struct closure_waitlist	btree_interior_update_wait;
+
+	struct workqueue_struct	*btree_interior_update_worker;
+	struct work_struct	btree_interior_update_work;
+
+	struct list_head	pending_node_rewrites;
+	struct mutex		pending_node_rewrites_lock;
+
+	/* btree_io.c: */
+	spinlock_t		btree_write_error_lock;
+	struct btree_write_stats {
+		atomic64_t	nr;
+		atomic64_t	bytes;
+	}			btree_write_stats[BTREE_WRITE_TYPE_NR];
+
+	/* btree_iter.c: */
+	struct seqmutex		btree_trans_lock;
+	struct list_head	btree_trans_list;
+	mempool_t		btree_trans_pool;
+	mempool_t		btree_trans_mem_pool;
+	struct btree_trans_buf  __percpu	*btree_trans_bufs;
+
+	struct srcu_struct	btree_trans_barrier;
+	bool			btree_trans_barrier_initialized;
+
+	struct btree_key_cache	btree_key_cache;
+	unsigned		btree_key_cache_btrees;
+
+	struct btree_write_buffer btree_write_buffer;
+
+	struct workqueue_struct	*btree_update_wq;
+	struct workqueue_struct	*btree_io_complete_wq;
+	/* copygc needs its own workqueue for index updates.. */
+	struct workqueue_struct	*copygc_wq;
+	/*
+	 * Use a dedicated wq for write ref holder tasks. Required to avoid
+	 * dependency problems with other wq tasks that can block on ref
+	 * draining, such as read-only transition.
+	 */
+	struct workqueue_struct *write_ref_wq;
+
+	/* ALLOCATION */
+	struct bch_devs_mask	rw_devs[BCH_DATA_NR];
+
+	u64			capacity; /* sectors */
+
+	/*
+	 * When capacity _decreases_ (due to a disk being removed), we
+	 * increment capacity_gen - this invalidates outstanding reservations
+	 * and forces them to be revalidated
+	 */
+	u32			capacity_gen;
+	unsigned		bucket_size_max;
+
+	atomic64_t		sectors_available;
+	struct mutex		sectors_available_lock;
+
+	struct bch_fs_pcpu __percpu	*pcpu;
+
+	struct percpu_rw_semaphore	mark_lock;
+
+	seqcount_t			usage_lock;
+	struct bch_fs_usage		*usage_base;
+	struct bch_fs_usage __percpu	*usage[JOURNAL_BUF_NR];
+	struct bch_fs_usage __percpu	*usage_gc;
+	u64 __percpu		*online_reserved;
+
+	/* single element mempool: */
+	struct mutex		usage_scratch_lock;
+	struct bch_fs_usage_online *usage_scratch;
+
+	struct io_clock		io_clock[2];
+
+	/* JOURNAL SEQ BLACKLIST */
+	struct journal_seq_blacklist_table *
+				journal_seq_blacklist_table;
+	struct work_struct	journal_seq_blacklist_gc_work;
+
+	/* ALLOCATOR */
+	spinlock_t		freelist_lock;
+	struct closure_waitlist	freelist_wait;
+	u64			blocked_allocate;
+	u64			blocked_allocate_open_bucket;
+
+	open_bucket_idx_t	open_buckets_freelist;
+	open_bucket_idx_t	open_buckets_nr_free;
+	struct closure_waitlist	open_buckets_wait;
+	struct open_bucket	open_buckets[OPEN_BUCKETS_COUNT];
+	open_bucket_idx_t	open_buckets_hash[OPEN_BUCKETS_COUNT];
+
+	open_bucket_idx_t	open_buckets_partial[OPEN_BUCKETS_COUNT];
+	open_bucket_idx_t	open_buckets_partial_nr;
+
+	struct write_point	btree_write_point;
+	struct write_point	rebalance_write_point;
+
+	struct write_point	write_points[WRITE_POINT_MAX];
+	struct hlist_head	write_points_hash[WRITE_POINT_HASH_NR];
+	struct mutex		write_points_hash_lock;
+	unsigned		write_points_nr;
+
+	struct buckets_waiting_for_journal buckets_waiting_for_journal;
+	struct work_struct	discard_work;
+	struct work_struct	invalidate_work;
+
+	/* GARBAGE COLLECTION */
+	struct task_struct	*gc_thread;
+	atomic_t		kick_gc;
+	unsigned long		gc_count;
+
+	enum btree_id		gc_gens_btree;
+	struct bpos		gc_gens_pos;
+
+	/*
+	 * Tracks GC's progress - everything in the range [ZERO_KEY..gc_pos]
+	 * has been marked by GC.
+	 *
+	 * gc_pos.phase is a superset of btree_ids (BTREE_ID_extents etc.)
+	 *
+	 * Protected by gc_pos_lock. Only written to by GC thread, so GC thread
+	 * can read without a lock.
+	 */
+	seqcount_t		gc_pos_lock;
+	struct gc_pos		gc_pos;
+
+	/*
+	 * The allocation code needs gc_mark in struct bucket to be correct, but
+	 * it's not while a gc is in progress.
+	 */
+	struct rw_semaphore	gc_lock;
+	struct mutex		gc_gens_lock;
+
+	/* IO PATH */
+	struct semaphore	io_in_flight;
+	struct bio_set		bio_read;
+	struct bio_set		bio_read_split;
+	struct bio_set		bio_write;
+	struct mutex		bio_bounce_pages_lock;
+	mempool_t		bio_bounce_pages;
+	struct bucket_nocow_lock_table
+				nocow_locks;
+	struct rhashtable	promote_table;
+
+	mempool_t		compression_bounce[2];
+	mempool_t		compress_workspace[BCH_COMPRESSION_TYPE_NR];
+	mempool_t		decompress_workspace;
+	ZSTD_parameters		zstd_params;
+
+	struct crypto_shash	*sha256;
+	struct crypto_sync_skcipher *chacha20;
+	struct crypto_shash	*poly1305;
+
+	atomic64_t		key_version;
+
+	mempool_t		large_bkey_pool;
+
+	/* MOVE.C */
+	struct list_head	moving_context_list;
+	struct mutex		moving_context_lock;
+
+	struct list_head	data_progress_list;
+	struct mutex		data_progress_lock;
+
+	/* REBALANCE */
+	struct bch_fs_rebalance	rebalance;
+
+	/* COPYGC */
+	struct task_struct	*copygc_thread;
+	struct write_point	copygc_write_point;
+	s64			copygc_wait_at;
+	s64			copygc_wait;
+	bool			copygc_running;
+	wait_queue_head_t	copygc_running_wq;
+
+	/* STRIPES: */
+	GENRADIX(struct stripe) stripes;
+	GENRADIX(struct gc_stripe) gc_stripes;
+
+	struct hlist_head	ec_stripes_new[32];
+	spinlock_t		ec_stripes_new_lock;
+
+	ec_stripes_heap		ec_stripes_heap;
+	struct mutex		ec_stripes_heap_lock;
+
+	/* ERASURE CODING */
+	struct list_head	ec_stripe_head_list;
+	struct mutex		ec_stripe_head_lock;
+
+	struct list_head	ec_stripe_new_list;
+	struct mutex		ec_stripe_new_lock;
+	wait_queue_head_t	ec_stripe_new_wait;
+
+	struct work_struct	ec_stripe_create_work;
+	u64			ec_stripe_hint;
+
+	struct work_struct	ec_stripe_delete_work;
+
+	struct bio_set		ec_bioset;
+
+	/* REFLINK */
+	reflink_gc_table	reflink_gc_table;
+	size_t			reflink_gc_nr;
+
+	/* fs.c */
+	struct list_head	vfs_inodes_list;
+	struct mutex		vfs_inodes_lock;
+
+	/* VFS IO PATH - fs-io.c */
+	struct bio_set		writepage_bioset;
+	struct bio_set		dio_write_bioset;
+	struct bio_set		dio_read_bioset;
+	struct bio_set		nocow_flush_bioset;
+
+	/* ERRORS */
+	struct list_head	fsck_errors;
+	struct mutex		fsck_error_lock;
+	bool			fsck_alloc_err;
+
+	/* QUOTAS */
+	struct bch_memquota_type quotas[QTYP_NR];
+
+	/* RECOVERY */
+	u64			journal_replay_seq_start;
+	u64			journal_replay_seq_end;
+	enum bch_recovery_pass	curr_recovery_pass;
+	/* bitmap of explicitly enabled recovery passes: */
+	u64			recovery_passes_explicit;
+	u64			recovery_passes_complete;
+
+	/* DEBUG JUNK */
+	struct dentry		*fs_debug_dir;
+	struct dentry		*btree_debug_dir;
+	struct btree_debug	btree_debug[BTREE_ID_NR];
+	struct btree		*verify_data;
+	struct btree_node	*verify_ondisk;
+	struct mutex		verify_lock;
+
+	u64			*unused_inode_hints;
+	unsigned		inode_shard_bits;
+
+	/*
+	 * A btree node on disk could have too many bsets for an iterator to fit
+	 * on the stack - have to dynamically allocate them
+	 */
+	mempool_t		fill_iter;
+
+	mempool_t		btree_bounce_pool;
+
+	struct journal		journal;
+	GENRADIX(struct journal_replay *) journal_entries;
+	u64			journal_entries_base_seq;
+	struct journal_keys	journal_keys;
+	struct list_head	journal_iters;
+
+	u64			last_bucket_seq_cleanup;
+
+	u64			counters_on_mount[BCH_COUNTER_NR];
+	u64 __percpu		*counters;
+
+	unsigned		btree_gc_periodic:1;
+	unsigned		copy_gc_enabled:1;
+	bool			promote_whole_extents;
+
+	struct bch2_time_stats	times[BCH_TIME_STAT_NR];
+
+	struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
+};
+
+extern struct wait_queue_head bch2_read_only_wait;
+
+static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+	atomic_long_inc(&c->writes[ref]);	/* one atomic counter per ref type */
+#else
+	percpu_ref_get(&c->writes);		/* single percpu ref; @ref is unused here */
+#endif
+}
+
+static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+	return !test_bit(BCH_FS_GOING_RO, &c->flags) &&	/* fail once BCH_FS_GOING_RO is set */
+		atomic_long_inc_not_zero(&c->writes[ref]);	/* never revives a zero count */
+#else
+	return percpu_ref_tryget_live(&c->writes);	/* single percpu ref; @ref is unused here */
+#endif
+}
+
+static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+	long v = atomic_long_dec_return(&c->writes[ref]);	/* count for @ref after this put */
+
+	BUG_ON(v < 0);	/* more puts than gets for this ref type */
+	if (v)
+		return;	/* this ref type is still held */
+	for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
+		if (atomic_long_read(&c->writes[i]))
+			return;	/* some other ref type is still held */
+
+	set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);	/* every counter read as zero */
+	wake_up(&bch2_read_only_wait);
+#else
+	percpu_ref_put(&c->writes);	/* single percpu ref; @ref is unused here */
+#endif
+}
+
+static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
+{
+#ifndef NO_BCACHEFS_FS	/* compiles to an empty function when NO_BCACHEFS_FS is defined */
+	if (c->vfs_sb)	/* nothing to update while no VFS superblock is attached */
+		c->vfs_sb->s_bdi->ra_pages = ra_pages;
+#endif
+}
+
+static inline unsigned bucket_bytes(const struct bch_dev *ca)
+{
+	return ca->mi.bucket_size << 9;	/* x512; bucket_size is presumably in 512-byte sectors */
+}
+
+static inline unsigned block_bytes(const struct bch_fs *c)
+{
+	return c->opts.block_size;	/* returned unscaled: the option value is used as a byte count */
+}
+
+static inline unsigned block_sectors(const struct bch_fs *c)
+{
+	return c->opts.block_size >> 9;	/* block size divided by 512 */
+}
+
+static inline size_t btree_sectors(const struct bch_fs *c)
+{
+	return c->opts.btree_node_size >> 9;	/* btree node size divided by 512 */
+}
+
+static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
+{
+	return c->btree_key_cache_btrees & (1U << btree);	/* bitmask test; 1U << btree requires btree < 32 */
+}
+
+/* Convert an fs timestamp (units relative to sb.time_base_lo) to a normalized timespec64 */
+static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
+{
+	struct timespec64 t;
+	s32 rem;
+	s64 sec = div_s64_rem(time + c->sb.time_base_lo, c->sb.time_units_per_sec, &rem);
+
+	/* rem < 0 for times before the epoch: multiply signed, then normalize tv_nsec */
+	set_normalized_timespec64(&t, sec, rem * (s64) c->sb.nsec_per_time_unit);
+	return t;
+}
+
+static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts)
+{
+	return (ts.tv_sec * c->sb.time_units_per_sec +	/* whole seconds scaled to fs time units */
+		(int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo;	/* cast binds to tv_nsec only; division truncates */
+}
+
+static inline s64 bch2_current_time(const struct bch_fs *c)
+{
+	struct timespec64 ts;	/* filled in by the coarse realtime clock read below */
+
+	ktime_get_coarse_real_ts64(&ts);
+	return timespec_to_bch2_time(c, ts);	/* scale and rebase into fs time units */
+}
+
+static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev)
+{
+	return dev < c->sb.nr_devices && c->devs[dev] != NULL;	/* range check runs before devs[] is read */
+}
+
+#define BKEY_PADDED_ONSTACK(key, pad)				\
+	struct { struct bkey_i key; __u64 key ## _pad[pad]; }
+
+#endif /* _BCACHEFS_H */
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
new file mode 100644
index 000000000000..99749f3315fe
--- /dev/null
+++ b/fs/bcachefs/bcachefs_format.h
@@ -0,0 +1,2413 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FORMAT_H
+#define _BCACHEFS_FORMAT_H
+
+/*
+ * bcachefs on disk data structures
+ *
+ * OVERVIEW:
+ *
+ * There are three main types of on disk data structures in bcachefs (this is
+ * reduced from 5 in bcache)
+ *
+ *  - superblock
+ *  - journal
+ *  - btree
+ *
+ * The btree is the primary structure; most metadata exists as keys in the
+ * various btrees. There are only a small number of btrees, they're not
+ * sharded - we have one btree for extents, another for inodes, et cetera.
+ *
+ * SUPERBLOCK:
+ *
+ * The superblock contains the location of the journal, the list of devices in
+ * the filesystem, and in general any metadata we need in order to decide
+ * whether we can start a filesystem or prior to reading the journal/btree
+ * roots.
+ *
+ * The superblock is extensible, and most of the contents of the superblock are
+ * in variable length, type tagged fields; see struct bch_sb_field.
+ *
+ * Backup superblocks do not reside in a fixed location; also, superblocks do
+ * not have a fixed size. To locate backup superblocks we have struct
+ * bch_sb_layout; we store a copy of this inside every superblock, and also
+ * before the first superblock.
+ *
+ * JOURNAL:
+ *
+ * The journal primarily records btree updates in the order they occurred;
+ * journal replay consists of just iterating over all the keys in the open
+ * journal entries and re-inserting them into the btrees.
+ *
+ * The journal also contains entry types for the btree roots, and blacklisted
+ * journal sequence numbers (see journal_seq_blacklist.c).
+ *
+ * BTREE:
+ *
+ * bcachefs btrees are copy on write b+ trees, where nodes are big (typically
+ * 128k-256k) and log structured. We use struct btree_node for writing the first
+ * entry in a given node (offset 0), and struct btree_node_entry for all
+ * subsequent writes.
+ *
+ * After the header, btree node entries contain a list of keys in sorted order.
+ * Values are stored inline with the keys; since values are variable length (and
+ * keys effectively are variable length too, due to packing) we can't do random
+ * access without building up additional in memory tables in the btree node read
+ * path.
+ *
+ * BTREE KEYS (struct bkey):
+ *
+ * The various btrees share a common format for the key - so as to avoid
+ * switching in fastpath lookup/comparison code - but define their own
+ * structures for the key values.
+ *
+ * The size of a key/value pair is stored as a u8 in units of u64s, so the max
+ * size is just under 2k. The common part also contains a type tag for the
+ * value, and a format field indicating whether the key is packed or not (and
+ * also meant to allow adding new key fields in the future, if desired).
+ *
+ * bkeys, when stored within a btree node, may also be packed. In that case, the
+ * bkey_format in that node is used to unpack it. Packed bkeys mean that we can
+ * be generous with field sizes in the common part of the key format (64 bit
+ * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
+ */
+
+#include <asm/types.h>
+#include <asm/byteorder.h>
+#include <linux/kernel.h>
+#include <linux/uuid.h>
+#include "vstructs.h"
+
+#ifdef __KERNEL__
+typedef uuid_t __uuid_t;
+#endif
+
+#define BITMASK(name, type, field, offset, end)				\
+static const __maybe_unused unsigned	name##_OFFSET = offset;		\
+static const __maybe_unused unsigned	name##_BITS = (end - offset);	\
+									\
+static inline __u64 name(const type *k)					\
+{									\
+	return (k->field >> offset) & ~(~0ULL << (end - offset));	\
+}									\
+									\
+static inline void SET_##name(type *k, __u64 v)				\
+{									\
+	k->field &= ~(~(~0ULL << (end - offset)) << offset);		\
+	k->field |= (v & ~(~0ULL << (end - offset))) << offset;		\
+}
+
+#define LE_BITMASK(_bits, name, type, field, offset, end)		\
+static const __maybe_unused unsigned	name##_OFFSET = offset;		\
+static const __maybe_unused unsigned	name##_BITS = (end - offset);	\
+static const __maybe_unused __u##_bits	name##_MAX = (1ULL << (end - offset)) - 1;\
+									\
+static inline __u64 name(const type *k)					\
+{									\
+	return (__le##_bits##_to_cpu(k->field) >> offset) &		\
+		~(~0ULL << (end - offset));				\
+}									\
+									\
+static inline void SET_##name(type *k, __u64 v)				\
+{									\
+	__u##_bits new = __le##_bits##_to_cpu(k->field);		\
+									\
+	new &= ~(~(~0ULL << (end - offset)) << offset);			\
+	new |= (v & ~(~0ULL << (end - offset))) << offset;		\
+	k->field = __cpu_to_le##_bits(new);				\
+}
+
+#define LE16_BITMASK(n, t, f, o, e)	LE_BITMASK(16, n, t, f, o, e)
+#define LE32_BITMASK(n, t, f, o, e)	LE_BITMASK(32, n, t, f, o, e)
+#define LE64_BITMASK(n, t, f, o, e)	LE_BITMASK(64, n, t, f, o, e)
+
+struct bkey_format {
+	__u8		key_u64s;	/* BKEY_FORMAT_CURRENT sets this to BKEY_U64s */
+	__u8		nr_fields;	/* BKEY_FORMAT_CURRENT sets this to BKEY_NR_FIELDS */
+	/* Indexed by enum bch_bkey_fields; BKEY_NR_FIELDS is 6, so no slot is spare: */
+	__u8		bits_per_field[6];
+	__le64		field_offset[6];
+};
+
+/* Btree keys - all units are in sectors */
+
+struct bpos {
+	/*
+	 * Word order matches machine byte order - btree code treats a bpos as a
+	 * single large integer, for search/comparison purposes
+	 *
+	 * Note that wherever a bpos is embedded in another on disk data
+	 * structure, it has to be byte swabbed when reading in metadata that
+	 * wasn't written in native endian order:
+	 */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	__u32		snapshot;	/* lowest address on little endian */
+	__u64		offset;		/* Points to end of extent - sectors */
+	__u64		inode;		/* highest address on little endian */
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	__u64		inode;
+	__u64		offset;		/* Points to end of extent - sectors */
+	__u32		snapshot;
+#else
+#error edit for your odd byteorder.
+#endif
+} __packed __aligned(4);	/* 4 + 8 + 8 = 20 bytes, no padding */
+
+#define KEY_INODE_MAX			((__u64)~0ULL)
+#define KEY_OFFSET_MAX			((__u64)~0ULL)
+#define KEY_SNAPSHOT_MAX		((__u32)~0U)
+#define KEY_SIZE_MAX			((__u32)~0U)
+
+/* Build a position from all three components: inode, offset and snapshot */
+static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
+{
+	/* Named members, so this does not depend on the endian-specific member order */
+	struct bpos pos = { .inode = inode, .offset = offset, .snapshot = snapshot };
+
+	return pos;
+}
+
+#define POS_MIN				SPOS(0, 0, 0)
+#define POS_MAX				SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0)
+#define SPOS_MAX			SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
+#define POS(_inode, _offset)		SPOS(_inode, _offset, 0)
+
+/* Empty placeholder struct, for container_of() */
+struct bch_val {
+	__u64		__nothing[0];
+};
+
+struct bversion {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	__u64		lo;
+	__u32		hi;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	__u32		hi;
+	__u64		lo;
+#endif	/* NOTE(review): unlike struct bpos, no #else/#error for other byte orders */
+} __packed __aligned(4);	/* 8 + 4 = 12 bytes (96 bits) */
+
+struct bkey {
+	/* Size of combined key and value, in u64s */
+	__u8		u64s;
+
+	/* Format of key (0 for format local to btree node) */
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8		format:7,
+			needs_whiteout:1;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	__u8		needs_whiteout:1,
+			format:7;
+#else
+#error edit for your odd byteorder.
+#endif
+
+	/* Type of the value (presumably a KEY_TYPE_* from enum bch_bkey_type) */
+	__u8		type;
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	__u8		pad[1];		/* rounds the three header bytes up to 4 */
+
+	struct bversion	version;
+	__u32		size;		/* extent size, in sectors */
+	struct bpos	p;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	struct bpos	p;
+	__u32		size;		/* extent size, in sectors */
+	struct bversion	version;
+
+	__u8		pad[1];		/* trailing pad byte in this layout */
+#endif	/* NOTE(review): no #else/#error here, unlike the bitfield block above */
+} __packed __aligned(8);	/* 3 + 1 + 12 + 4 + 20 = 40 bytes in either layout, i.e. 5 u64s */
+
+struct bkey_packed {
+	__u64		_data[0];	/* zero-length u64 view of the start of the key */
+
+	/* Size of combined key and value, in u64s */
+	__u8		u64s;
+
+	/* Format of key (0 for format local to btree node) */
+
+	/*
+	 * XXX: next incompat on disk format change, switch format and
+	 * needs_whiteout - bkey_packed() will be cheaper if format is the high
+	 * bits of the bitfield
+	 */
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8		format:7,
+			needs_whiteout:1;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	__u8		needs_whiteout:1,
+			format:7;
+#endif	/* NOTE(review): no #else/#error here, unlike struct bkey */
+
+	/* Type of the value */
+	__u8		type;
+	__u8		key_start[0];	/* byte 3, i.e. bit 24 (cf. KEY_PACKED_BITS_START) */
+
+	/*
+	 * We copy bkeys with struct assignment in various places, and while
+	 * that shouldn't be done with packed bkeys we can't disallow it in C,
+	 * and it's legal to cast a bkey to a bkey_packed  - so padding it out
+	 * to the same size as struct bkey should hopefully be safest.
+	 */
+	__u8		pad[sizeof(struct bkey) - 3];	/* 3 == u64s + format/needs_whiteout byte + type */
+} __packed __aligned(8);
+
+typedef struct {
+	__le64			lo;
+	__le64			hi;
+} bch_le128;
+
+#define BKEY_U64s			(sizeof(struct bkey) / sizeof(__u64))
+#define BKEY_U64s_MAX			U8_MAX
+#define BKEY_VAL_U64s_MAX		(BKEY_U64s_MAX - BKEY_U64s)
+
+#define KEY_PACKED_BITS_START		24
+
+#define KEY_FORMAT_LOCAL_BTREE		0
+#define KEY_FORMAT_CURRENT		1
+
+enum bch_bkey_fields {
+	BKEY_FIELD_INODE,
+	BKEY_FIELD_OFFSET,
+	BKEY_FIELD_SNAPSHOT,
+	BKEY_FIELD_SIZE,
+	BKEY_FIELD_VERSION_HI,
+	BKEY_FIELD_VERSION_LO,
+	BKEY_NR_FIELDS,
+};
+
+#define bkey_format_field(name, field)					\
+	[BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8)
+
+#define BKEY_FORMAT_CURRENT						\
+((struct bkey_format) {							\
+	.key_u64s	= BKEY_U64s,					\
+	.nr_fields	= BKEY_NR_FIELDS,				\
+	.bits_per_field = {						\
+		bkey_format_field(INODE,	p.inode),		\
+		bkey_format_field(OFFSET,	p.offset),		\
+		bkey_format_field(SNAPSHOT,	p.snapshot),		\
+		bkey_format_field(SIZE,		size),			\
+		bkey_format_field(VERSION_HI,	version.hi),		\
+		bkey_format_field(VERSION_LO,	version.lo),		\
+	},								\
+})
+
+/* bkey with inline value */
+struct bkey_i {
+	__u64			_data[0];
+
+	struct bkey	k;
+	struct bch_val	v;
+};
+
+#define KEY(_inode, _offset, _size)					\
+((struct bkey) {							\
+	.u64s		= BKEY_U64s,					\
+	.format		= KEY_FORMAT_CURRENT,				\
+	.p		= POS(_inode, _offset),				\
+	.size		= _size,					\
+})
+
+static inline void bkey_init(struct bkey *k)
+{
+	*k = KEY(0, 0, 0);	/* u64s = BKEY_U64s, current format, position and size 0; members KEY() omits are zeroed */
+}
+
+#define bkey_bytes(_k)		((_k)->u64s * sizeof(__u64))
+
+#define __BKEY_PADDED(key, pad)					\
+	struct bkey_i key; __u64 key ## _pad[pad]
+
+/*
+ * - DELETED keys are used internally to mark keys that should be ignored but
+ *   override keys in composition order.  Their version number is ignored.
+ *
+ * - DISCARDED keys indicate that the data is all 0s because it has been
+ *   discarded. DISCARDs may have a version; if the version is nonzero the key
+ *   will be persistent, otherwise the key will be dropped whenever the btree
+ *   node is rewritten (like DELETED keys).
+ *
+ * - ERROR: any read of the data returns a read error, as the data was lost due
+ *   to a failing device. Like DISCARDED keys, they can be removed (overridden)
+ *   by new writes or cluster-wide GC. Node repair can also overwrite them with
+ *   the same or a more recent version number, but not with an older version
+ *   number.
+ *
+ * - WHITEOUT: for hash table btrees
+ */
+#define BCH_BKEY_TYPES()				\
+	x(deleted,		0)			\
+	x(whiteout,		1)			\
+	x(error,		2)			\
+	x(cookie,		3)			\
+	x(hash_whiteout,	4)			\
+	x(btree_ptr,		5)			\
+	x(extent,		6)			\
+	x(reservation,		7)			\
+	x(inode,		8)			\
+	x(inode_generation,	9)			\
+	x(dirent,		10)			\
+	x(xattr,		11)			\
+	x(alloc,		12)			\
+	x(quota,		13)			\
+	x(stripe,		14)			\
+	x(reflink_p,		15)			\
+	x(reflink_v,		16)			\
+	x(inline_data,		17)			\
+	x(btree_ptr_v2,		18)			\
+	x(indirect_inline_data,	19)			\
+	x(alloc_v2,		20)			\
+	x(subvolume,		21)			\
+	x(snapshot,		22)			\
+	x(inode_v2,		23)			\
+	x(alloc_v3,		24)			\
+	x(set,			25)			\
+	x(lru,			26)			\
+	x(alloc_v4,		27)			\
+	x(backpointer,		28)			\
+	x(inode_v3,		29)			\
+	x(bucket_gens,		30)			\
+	x(snapshot_tree,	31)			\
+	x(logged_op_truncate,	32)			\
+	x(logged_op_finsert,	33)
+
+enum bch_bkey_type {
+#define x(name, nr) KEY_TYPE_##name	= nr,
+	BCH_BKEY_TYPES()
+#undef x
+	KEY_TYPE_MAX,
+};
+
+struct bch_deleted {
+	struct bch_val		v;
+};
+
+struct bch_whiteout {
+	struct bch_val		v;
+};
+
+struct bch_error {
+	struct bch_val		v;
+};
+
+struct bch_cookie {
+	struct bch_val		v;
+	__le64			cookie;
+};
+
+struct bch_hash_whiteout {
+	struct bch_val		v;
+};
+
+struct bch_set {
+	struct bch_val		v;
+};
+
+/* Extents */
+
+/*
+ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
+ * preceded by checksum/compression information (bch_extent_crc32 or
+ * bch_extent_crc64).
+ *
+ * One major determining factor in the format of extents is how we handle and
+ * represent extents that have been partially overwritten and thus trimmed:
+ *
+ * If an extent is not checksummed or compressed, when the extent is trimmed we
+ * don't have to remember the extent we originally allocated and wrote: we can
+ * merely adjust ptr->offset to point to the start of the data that is currently
+ * live. The size field in struct bkey records the current (live) size of the
+ * extent, and is also used to mean "size of region on disk that we point to" in
+ * this case.
+ *
+ * Thus an extent that is not checksummed or compressed will consist only of a
+ * list of bch_extent_ptrs, with none of the fields in
+ * bch_extent_crc32/bch_extent_crc64.
+ *
+ * When an extent is checksummed or compressed, it's not possible to read only
+ * the data that is currently live: we have to read the entire extent that was
+ * originally written, and then return only the part of the extent that is
+ * currently live.
+ *
+ * Thus, in addition to the current size of the extent in struct bkey, we need
+ * to store the size of the originally allocated space - this is the
+ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
+ * when the extent is trimmed, instead of modifying the offset field of the
+ * pointer, we keep a second smaller offset field - "offset into the original
+ * extent of the currently live region".
+ *
+ * The other major determining factor is replication and data migration:
+ *
+ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
+ * write, we will initially write all the replicas in the same format, with the
+ * same checksum type and compression format - however, when copygc runs later (or
+ * tiering/cache promotion, anything that moves data), it is not in general
+ * going to rewrite all the pointers at once - one of the replicas may be in a
+ * bucket on one device that has very little fragmentation while another lives
+ * in a bucket that has become heavily fragmented, and thus is being rewritten
+ * sooner than the rest.
+ *
+ * Thus it will only move a subset of the pointers (or in the case of
+ * tiering/cache promotion perhaps add a single pointer without dropping any
+ * current pointers), and if the extent has been partially overwritten it must
+ * write only the currently live portion (or copygc would not be able to reduce
+ * fragmentation!) - which necessitates a different bch_extent_crc format for
+ * the new pointer.
+ *
+ * But in the interests of space efficiency, we don't want to store one
+ * bch_extent_crc for each pointer if we don't have to.
+ *
+ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
+ * bch_extent_ptrs appended arbitrarily one after the other. We determine the
+ * type of a given entry with a scheme similar to utf8 (except we're encoding a
+ * type, not a size), encoding the type in the position of the first set bit:
+ *
+ * bch_extent_ptr	- 0b1
+ * bch_extent_crc32	- 0b10
+ * bch_extent_crc64	- 0b100
+ *
+ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
+ * bch_extent_crc64 is the least constrained).
+ *
+ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
+ * until the next bch_extent_crc32/64.
+ *
+ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
+ * is neither checksummed nor compressed.
+ */
+
+/* 128 bits, sufficient for cryptographic MACs: */
+struct bch_csum {
+	__le64			lo;
+	__le64			hi;
+} __packed __aligned(8);
+
+#define BCH_EXTENT_ENTRY_TYPES()		\
+	x(ptr,			0)		\
+	x(crc32,		1)		\
+	x(crc64,		2)		\
+	x(crc128,		3)		\
+	x(stripe_ptr,		4)		\
+	x(rebalance,		5)
+#define BCH_EXTENT_ENTRY_MAX	6	/* one past the highest value listed above; keep in sync by hand */
+
+enum bch_extent_entry_type {
+#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
+	BCH_EXTENT_ENTRY_TYPES()
+#undef x
+};
+
+/* Compressed/uncompressed size are stored biased by 1: */
+struct bch_extent_crc32 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u32			type:2,			/* room for bit BCH_EXTENT_ENTRY_crc32 (1) */
+				_compressed_size:7,	/* biased by 1, see CRC32_SIZE_MAX */
+				_uncompressed_size:7,	/* biased by 1, see CRC32_SIZE_MAX */
+				offset:7,
+				_unused:1,
+				csum_type:4,
+				compression_type:4;	/* 2 + 7 + 7 + 7 + 1 + 4 + 4 = 32 bits */
+	__u32			csum;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	__u32			csum;
+	__u32			compression_type:4,
+				csum_type:4,
+				_unused:1,
+				offset:7,
+				_uncompressed_size:7,
+				_compressed_size:7,
+				type:2;
+#endif
+} __packed __aligned(8);	/* 8 bytes: one u64 */
+
+#define CRC32_SIZE_MAX		(1U << 7)
+#define CRC32_NONCE_MAX		0
+
+struct bch_extent_crc64 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u64			type:3,			/* room for bit BCH_EXTENT_ENTRY_crc64 (2) */
+				_compressed_size:9,	/* biased by 1, see CRC64_SIZE_MAX */
+				_uncompressed_size:9,	/* biased by 1, see CRC64_SIZE_MAX */
+				offset:9,
+				nonce:10,		/* see CRC64_NONCE_MAX */
+				csum_type:4,
+				compression_type:4,
+				csum_hi:16;		/* 3 + 9 + 9 + 9 + 10 + 4 + 4 + 16 = 64 bits */
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	__u64			csum_hi:16,
+				compression_type:4,
+				csum_type:4,
+				nonce:10,
+				offset:9,
+				_uncompressed_size:9,
+				_compressed_size:9,
+				type:3;
+#endif
+	__u64			csum_lo;	/* with csum_hi: 64 + 16 = 80 checksum bits */
+} __packed __aligned(8);	/* 16 bytes: two u64s */
+
+#define CRC64_SIZE_MAX		(1U << 9)
+#define CRC64_NONCE_MAX		((1U << 10) - 1)
+
+struct bch_extent_crc128 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u64			type:4,			/* room for bit BCH_EXTENT_ENTRY_crc128 (3) */
+				_compressed_size:13,	/* biased by 1, see CRC128_SIZE_MAX */
+				_uncompressed_size:13,	/* biased by 1, see CRC128_SIZE_MAX */
+				offset:13,
+				nonce:13,		/* see CRC128_NONCE_MAX */
+				csum_type:4,
+				compression_type:4;	/* 4 + 13 * 4 + 4 + 4 = 64 bits */
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	__u64			compression_type:4,
+				csum_type:4,
+				nonce:13,
+				offset:13,
+				_uncompressed_size:13,
+				_compressed_size:13,
+				type:4;
+#endif
+	struct bch_csum		csum;	/* full 128-bit checksum */
+} __packed __aligned(8);	/* 8 + 16 = 24 bytes: three u64s */
+
+#define CRC128_SIZE_MAX		(1U << 13)
+#define CRC128_NONCE_MAX	((1U << 13) - 1)
+
+/*
+ * @unwritten - pointer hasn't been written to, just reserved
+ */
+struct bch_extent_ptr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u64			type:1,		/* bit BCH_EXTENT_ENTRY_ptr (0) */
+				cached:1,
+				unused:1,
+				unwritten:1,
+				offset:44, /* 2^44 sectors of 512 bytes: 8 petabytes */
+				dev:8,
+				gen:8;		/* 1 + 1 + 1 + 1 + 44 + 8 + 8 = 64 bits */
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	__u64			gen:8,
+				dev:8,
+				offset:44,
+				unwritten:1,
+				unused:1,
+				cached:1,
+				type:1;
+#endif
+} __packed __aligned(8);
+
+struct bch_extent_stripe_ptr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u64			type:5,		/* room for bit BCH_EXTENT_ENTRY_stripe_ptr (4) */
+				block:8,
+				redundancy:4,
+				idx:47;		/* 5 + 8 + 4 + 47 = 64 bits */
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	__u64			idx:47,
+				redundancy:4,
+				block:8,
+				type:5;
+#endif
+};	/* NOTE(review): no __packed __aligned(8), unlike the crc and ptr entries */
+
+struct bch_extent_reservation {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u64			type:6,		/* NOTE(review): no matching BCH_EXTENT_ENTRY_TYPES() entry, so no union member */
+				unused:22,
+				replicas:4,
+				generation:32;	/* 6 + 22 + 4 + 32 = 64 bits */
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	__u64			generation:32,
+				replicas:4,
+				unused:22,
+				type:6;
+#endif
+};
+
+struct bch_extent_rebalance {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u64			type:7,		/* NOTE(review): BCH_EXTENT_ENTRY_rebalance is 5; other entries use value + 1 bits (6 here) */
+				unused:33,
+				compression:8,
+				target:16;	/* 7 + 33 + 8 + 16 = 64 bits */
+#elif defined (__BIG_ENDIAN_BITFIELD)
+	__u64			target:16,
+				compression:8,
+				unused:33,
+				type:7;
+#endif
+};
+
+union bch_extent_entry {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ||  __BITS_PER_LONG == 64
+	unsigned long			type;	/* overlays the start of every entry struct below */
+#elif __BITS_PER_LONG == 32
+	struct {	/* big endian with 32-bit long: skip one word so type overlays bytes 4-7 */
+		unsigned long		pad;
+		unsigned long		type;
+	};
+#else
+#error edit for your odd byteorder.
+#endif
+
+#define x(f, n) struct bch_extent_##f	f;	/* one member per BCH_EXTENT_ENTRY_TYPES() entry */
+	BCH_EXTENT_ENTRY_TYPES()
+#undef x
+};
+
+struct bch_btree_ptr {
+	struct bch_val		v;
+
+	__u64			_data[0];
+	struct bch_extent_ptr	start[];
+} __packed __aligned(8);
+
+struct bch_btree_ptr_v2 {
+	struct bch_val		v;
+
+	__u64			mem_ptr;	/* NOTE(review): native-endian, unlike the __le fields below; presumably in-memory use only */
+	__le64			seq;
+	__le16			sectors_written;
+	__le16			flags;		/* bit 0: BTREE_PTR_RANGE_UPDATED */
+	struct bpos		min_key;	/* 20 bytes; fixed part totals 8 + 8 + 2 + 2 + 20 = 40 */
+	__u64			_data[0];	/* zero-length u64 view at byte 40, where start[] begins */
+	struct bch_extent_ptr	start[];
+} __packed __aligned(8);
+
+LE16_BITMASK(BTREE_PTR_RANGE_UPDATED,	struct bch_btree_ptr_v2, flags, 0, 1);
+
+struct bch_extent {
+	struct bch_val		v;
+
+	__u64			_data[0];
+	union bch_extent_entry	start[];
+} __packed __aligned(8);
+
+struct bch_reservation {
+	struct bch_val		v;
+
+	__le32			generation;
+	__u8			nr_replicas;
+	__u8			pad[3];
+} __packed __aligned(8);
+
+/* Maximum size (in u64s) a single pointer could be: */
+#define BKEY_EXTENT_PTR_U64s_MAX\
+	((sizeof(struct bch_extent_crc128) +			\
+	  sizeof(struct bch_extent_ptr)) / sizeof(__u64))
+
+/* Maximum possible size of an entire extent value: */
+#define BKEY_EXTENT_VAL_U64s_MAX				\
+	(1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
+
+/* Maximum possible size of an entire extent, key + value: */
+#define BKEY_EXTENT_U64s_MAX		(BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
+
+/* Btree pointers don't carry around checksums: */
+#define BKEY_BTREE_PTR_VAL_U64s_MAX				\
+	((sizeof(struct bch_btree_ptr_v2) +			\
+	  sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
+#define BKEY_BTREE_PTR_U64s_MAX					\
+	(BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
+
+/* Inodes */
+
+#define BLOCKDEV_INODE_MAX	4096
+
+#define BCACHEFS_ROOT_INO	4096
+
+struct bch_inode {
+	struct bch_val		v;
+
+	__le64			bi_hash_seed;
+	__le32			bi_flags;
+	__le16			bi_mode;
+	__u8			fields[];
+} __packed __aligned(8);
+
+struct bch_inode_v2 {
+	struct bch_val		v;
+
+	__le64			bi_journal_seq;
+	__le64			bi_hash_seed;
+	__le64			bi_flags;
+	__le16			bi_mode;
+	__u8			fields[];
+} __packed __aligned(8);
+
+struct bch_inode_v3 {
+	struct bch_val		v;
+
+	__le64			bi_journal_seq;
+	__le64			bi_hash_seed;
+	__le64			bi_flags;	/* flag bits plus the INODEv3_* packed fields (bits 20-51) */
+	__le64			bi_sectors;
+	__le64			bi_size;
+	__le64			bi_version;
+	__u8			fields[];	/* starts at byte 48, i.e. u64 index 6 (INODEv3_FIELDS_START_CUR) */
+} __packed __aligned(8);
+
+#define INODEv3_FIELDS_START_INITIAL	6
+#define INODEv3_FIELDS_START_CUR	(offsetof(struct bch_inode_v3, fields) / sizeof(__u64))
+
+struct bch_inode_generation {
+	struct bch_val		v;
+
+	__le32			bi_generation;
+	__le32			pad;
+} __packed __aligned(8);
+
+/*
+ * bi_subvol and bi_parent_subvol are only set for subvolume roots:
+ */
+
+#define BCH_INODE_FIELDS_v2()			\
+	x(bi_atime,			96)	\
+	x(bi_ctime,			96)	\
+	x(bi_mtime,			96)	\
+	x(bi_otime,			96)	\
+	x(bi_size,			64)	\
+	x(bi_sectors,			64)	\
+	x(bi_uid,			32)	\
+	x(bi_gid,			32)	\
+	x(bi_nlink,			32)	\
+	x(bi_generation,		32)	\
+	x(bi_dev,			32)	\
+	x(bi_data_checksum,		8)	\
+	x(bi_compression,		8)	\
+	x(bi_project,			32)	\
+	x(bi_background_compression,	8)	\
+	x(bi_data_replicas,		8)	\
+	x(bi_promote_target,		16)	\
+	x(bi_foreground_target,		16)	\
+	x(bi_background_target,		16)	\
+	x(bi_erasure_code,		16)	\
+	x(bi_fields_set,		16)	\
+	x(bi_dir,			64)	\
+	x(bi_dir_offset,		64)	\
+	x(bi_subvol,			32)	\
+	x(bi_parent_subvol,		32)
+
+#define BCH_INODE_FIELDS_v3()			\
+	x(bi_atime,			96)	\
+	x(bi_ctime,			96)	\
+	x(bi_mtime,			96)	\
+	x(bi_otime,			96)	\
+	x(bi_uid,			32)	\
+	x(bi_gid,			32)	\
+	x(bi_nlink,			32)	\
+	x(bi_generation,		32)	\
+	x(bi_dev,			32)	\
+	x(bi_data_checksum,		8)	\
+	x(bi_compression,		8)	\
+	x(bi_project,			32)	\
+	x(bi_background_compression,	8)	\
+	x(bi_data_replicas,		8)	\
+	x(bi_promote_target,		16)	\
+	x(bi_foreground_target,		16)	\
+	x(bi_background_target,		16)	\
+	x(bi_erasure_code,		16)	\
+	x(bi_fields_set,		16)	\
+	x(bi_dir,			64)	\
+	x(bi_dir_offset,		64)	\
+	x(bi_subvol,			32)	\
+	x(bi_parent_subvol,		32)	\
+	x(bi_nocow,			8)
+
+/* subset of BCH_INODE_FIELDS_v3(), named without the bi_ prefix */
+#define BCH_INODE_OPTS()			\
+	x(data_checksum,		8)	\
+	x(compression,			8)	\
+	x(project,			32)	\
+	x(background_compression,	8)	\
+	x(data_replicas,		8)	\
+	x(promote_target,		16)	\
+	x(foreground_target,		16)	\
+	x(background_target,		16)	\
+	x(erasure_code,			16)	\
+	x(nocow,			8)
+
+enum inode_opt_id {
+#define x(name, ...)				\
+	Inode_opt_##name,
+	BCH_INODE_OPTS()
+#undef  x
+	Inode_opt_nr,
+};
+
+enum {
+	/*
+	 * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL
+	 * flags)
+	 */
+	__BCH_INODE_SYNC		= 0,
+	__BCH_INODE_IMMUTABLE		= 1,
+	__BCH_INODE_APPEND		= 2,
+	__BCH_INODE_NODUMP		= 3,
+	__BCH_INODE_NOATIME		= 4,
+
+	__BCH_INODE_I_SIZE_DIRTY	= 5, /* obsolete */
+	__BCH_INODE_I_SECTORS_DIRTY	= 6, /* obsolete */
+	__BCH_INODE_UNLINKED		= 7,
+	__BCH_INODE_BACKPTR_UNTRUSTED	= 8,
+
+	/* bits 20+ reserved for packed fields below: */
+};
+
+#define BCH_INODE_SYNC		(1 << __BCH_INODE_SYNC)
+#define BCH_INODE_IMMUTABLE	(1 << __BCH_INODE_IMMUTABLE)
+#define BCH_INODE_APPEND	(1 << __BCH_INODE_APPEND)
+#define BCH_INODE_NODUMP	(1 << __BCH_INODE_NODUMP)
+#define BCH_INODE_NOATIME	(1 << __BCH_INODE_NOATIME)
+#define BCH_INODE_I_SIZE_DIRTY	(1 << __BCH_INODE_I_SIZE_DIRTY)
+#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY)
+#define BCH_INODE_UNLINKED	(1 << __BCH_INODE_UNLINKED)
+#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED)
+
+LE32_BITMASK(INODE_STR_HASH,	struct bch_inode, bi_flags, 20, 24);
+LE32_BITMASK(INODE_NR_FIELDS,	struct bch_inode, bi_flags, 24, 31);
+LE32_BITMASK(INODE_NEW_VARINT,	struct bch_inode, bi_flags, 31, 32);
+
+LE64_BITMASK(INODEv2_STR_HASH,	struct bch_inode_v2, bi_flags, 20, 24);
+LE64_BITMASK(INODEv2_NR_FIELDS,	struct bch_inode_v2, bi_flags, 24, 31);
+
+LE64_BITMASK(INODEv3_STR_HASH,	struct bch_inode_v3, bi_flags, 20, 24);
+LE64_BITMASK(INODEv3_NR_FIELDS,	struct bch_inode_v3, bi_flags, 24, 31);
+
+LE64_BITMASK(INODEv3_FIELDS_START,
+				struct bch_inode_v3, bi_flags, 31, 36);
+LE64_BITMASK(INODEv3_MODE,	struct bch_inode_v3, bi_flags, 36, 52);
+
+/* Dirents */
+
+/*
+ * Dirents (and xattrs) have to implement string lookups; since our b-tree
+ * doesn't support arbitrary length strings for the key, we instead index by a
+ * 64 bit hash (currently truncated sha1) of the string, stored in the offset
+ * field of the key - using linear probing to resolve hash collisions. This also
+ * provides us with the readdir cookie posix requires.
+ *
+ * Linear probing requires us to use whiteouts for deletions, in the event of a
+ * collision:
+ */
+
+struct bch_dirent {
+	struct bch_val		v;
+
+	/* Target inode number: */
+	union {
+	__le64			d_inum;
+	struct {		/* DT_SUBVOL */
+	__le32			d_child_subvol;
+	__le32			d_parent_subvol;
+	};
+	};
+
+	/*
+	 * Copy of mode bits 12-15 from the target inode - so userspace can get
+	 * the filetype without having to do a stat()
+	 */
+	__u8			d_type;
+
+	__u8			d_name[];
+} __packed __aligned(8);
+
+#define DT_SUBVOL	16
+#define BCH_DT_MAX	17
+
+#define BCH_NAME_MAX	512
+
+/* Xattrs */
+
+#define KEY_TYPE_XATTR_INDEX_USER			0
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS	1
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT	2
+#define KEY_TYPE_XATTR_INDEX_TRUSTED			3
+#define KEY_TYPE_XATTR_INDEX_SECURITY	        4
+
+struct bch_xattr {
+	struct bch_val		v;
+	__u8			x_type;
+	__u8			x_name_len;
+	__le16			x_val_len;
+	__u8			x_name[];
+} __packed __aligned(8);
+
+/* Bucket/allocation information: */
+
+struct bch_alloc {
+	struct bch_val		v;
+	__u8			fields;
+	__u8			gen;
+	__u8			data[];
+} __packed __aligned(8);
+
+#define BCH_ALLOC_FIELDS_V1()			\
+	x(read_time,		16)		\
+	x(write_time,		16)		\
+	x(data_type,		8)		\
+	x(dirty_sectors,	16)		\
+	x(cached_sectors,	16)		\
+	x(oldest_gen,		8)		\
+	x(stripe,		32)		\
+	x(stripe_redundancy,	8)
+
+enum {
+#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
+	BCH_ALLOC_FIELDS_V1()
+#undef x
+};
+
+struct bch_alloc_v2 {
+	struct bch_val		v;
+	__u8			nr_fields;
+	__u8			gen;
+	__u8			oldest_gen;
+	__u8			data_type;
+	__u8			data[];
+} __packed __aligned(8);
+
+#define BCH_ALLOC_FIELDS_V2()			\
+	x(read_time,		64)		\
+	x(write_time,		64)		\
+	x(dirty_sectors,	32)		\
+	x(cached_sectors,	32)		\
+	x(stripe,		32)		\
+	x(stripe_redundancy,	8)
+
+struct bch_alloc_v3 {
+	struct bch_val		v;
+	__le64			journal_seq;
+	__le32			flags;
+	__u8			nr_fields;
+	__u8			gen;
+	__u8			oldest_gen;
+	__u8			data_type;
+	__u8			data[];
+} __packed __aligned(8);
+
+LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags,  0,  1)
+LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags,  1,  2)
+
+struct bch_alloc_v4 {
+	struct bch_val		v;
+	__u64			journal_seq;
+	__u32			flags;
+	__u8			gen;
+	__u8			oldest_gen;
+	__u8			data_type;
+	__u8			stripe_redundancy;
+	__u32			dirty_sectors;
+	__u32			cached_sectors;
+	__u64			io_time[2];
+	__u32			stripe;
+	__u32			nr_external_backpointers;
+	__u64			fragmentation_lru;
+} __packed __aligned(8);
+
+#define BCH_ALLOC_V4_U64s_V0	6
+#define BCH_ALLOC_V4_U64s	(sizeof(struct bch_alloc_v4) / sizeof(__u64))
+
+BITMASK(BCH_ALLOC_V4_NEED_DISCARD,	struct bch_alloc_v4, flags,  0,  1)
+BITMASK(BCH_ALLOC_V4_NEED_INC_GEN,	struct bch_alloc_v4, flags,  1,  2)
+BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags,  2,  8)
+BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS,	struct bch_alloc_v4, flags,  8,  14)
+
+#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX	40
+
+struct bch_backpointer {
+	struct bch_val		v;
+	__u8			btree_id;
+	__u8			level;
+	__u8			data_type;
+	__u64			bucket_offset:40;
+	__u32			bucket_len;
+	struct bpos		pos;
+} __packed __aligned(8);
+
+#define KEY_TYPE_BUCKET_GENS_BITS	8
+#define KEY_TYPE_BUCKET_GENS_NR		(1U << KEY_TYPE_BUCKET_GENS_BITS)
+#define KEY_TYPE_BUCKET_GENS_MASK	(KEY_TYPE_BUCKET_GENS_NR - 1)
+
+struct bch_bucket_gens {
+	struct bch_val		v;
+	__u8			gens[KEY_TYPE_BUCKET_GENS_NR];	/* 1 << 8 = 256 entries */
+} __packed __aligned(8);
+
+/* Quotas: */
+
+enum quota_types {
+	QTYP_USR		= 0,
+	QTYP_GRP		= 1,
+	QTYP_PRJ		= 2,
+	QTYP_NR			= 3,
+};
+
+enum quota_counters {
+	Q_SPC			= 0,
+	Q_INO			= 1,
+	Q_COUNTERS		= 2,
+};
+
+struct bch_quota_counter {
+	__le64			hardlimit;
+	__le64			softlimit;
+};
+
+struct bch_quota {
+	struct bch_val		v;
+	struct bch_quota_counter c[Q_COUNTERS];
+} __packed __aligned(8);
+
+/* Erasure coding */
+
+struct bch_stripe {
+	struct bch_val		v;
+	__le16			sectors;
+	__u8			algorithm;
+	__u8			nr_blocks;
+	__u8			nr_redundant;
+
+	__u8			csum_granularity_bits;
+	__u8			csum_type;
+	__u8			pad;
+
+	struct bch_extent_ptr	ptrs[];
+} __packed __aligned(8);
+
+/* Reflink: */
+
+struct bch_reflink_p {
+	struct bch_val		v;
+	__le64			idx;
+	/*
+	 * A reflink pointer might point to an indirect extent which is then
+	 * later split (by copygc or rebalance). If we only pointed to part of
+	 * the original indirect extent, and then one of the fragments is
+	 * outside the range we point to, we'd leak a refcount: so when creating
+	 * reflink pointers, we need to store pad values to remember the full
+	 * range we were taking a reference on.
+	 */
+	__le32			front_pad;
+	__le32			back_pad;
+} __packed __aligned(8);
+
+struct bch_reflink_v {
+	struct bch_val		v;
+	__le64			refcount;
+	union bch_extent_entry	start[0];
+	__u64			_data[];
+} __packed __aligned(8);
+
+struct bch_indirect_inline_data {
+	struct bch_val		v;
+	__le64			refcount;
+	__u8			data[];		/* variable-length payload */
+};
+
+/* Inline data */
+
+struct bch_inline_data {
+	struct bch_val		v;
+	__u8			data[];		/* variable-length payload */
+};
+
+/* Subvolumes: */
+
+#define SUBVOL_POS_MIN		POS(0, 1)
+#define SUBVOL_POS_MAX		POS(0, S32_MAX)
+#define BCACHEFS_ROOT_SUBVOL	1
+
+struct bch_subvolume {
+	struct bch_val		v;
+	__le32			flags;
+	__le32			snapshot;
+	__le64			inode;
+	/*
+	 * Snapshot subvolumes form a tree, separate from the snapshot nodes
+	 * tree - if this subvolume is a snapshot, this is the ID of the
+	 * subvolume it was created from:
+	 */
+	__le32			parent;
+	__le32			pad;
+	bch_le128		otime;
+};
+
+LE32_BITMASK(BCH_SUBVOLUME_RO,		struct bch_subvolume, flags,  0,  1)
+/*
+ * We need to know whether a subvolume is a snapshot so we can know whether we
+ * can delete it (or whether it should just be rm -rf'd)
+ */
+LE32_BITMASK(BCH_SUBVOLUME_SNAP,	struct bch_subvolume, flags,  1,  2)
+LE32_BITMASK(BCH_SUBVOLUME_UNLINKED,	struct bch_subvolume, flags,  2,  3)
+
+/* Snapshots */
+
+struct bch_snapshot {
+	struct bch_val		v;
+	__le32			flags;
+	__le32			parent;
+	__le32			children[2];
+	__le32			subvol;
+	/* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
+	__le32			tree;
+	__le32			depth;
+	__le32			skip[3];
+};
+
+LE32_BITMASK(BCH_SNAPSHOT_DELETED,	struct bch_snapshot, flags,  0,  1)
+
+/* True if a subvolume points to this snapshot node: */
+LE32_BITMASK(BCH_SNAPSHOT_SUBVOL,	struct bch_snapshot, flags,  1,  2)
+
+/*
+ * Snapshot trees:
+ *
+ * The snapshot_trees btree gives us a persistent identifier for each tree of
+ * bch_snapshot nodes, and allows us to record and easily find the root/master
+ * subvolume that other snapshots were created from:
+ */
+struct bch_snapshot_tree {
+	struct bch_val		v;
+	__le32			master_subvol;
+	__le32			root_snapshot;
+};
+
+/* LRU btree: */
+
+struct bch_lru {
+	struct bch_val		v;
+	__le64			idx;
+} __packed __aligned(8);
+
+#define LRU_ID_STRIPES		(1U << 16)
+
+/* Logged operations btree: */
+
+struct bch_logged_op_truncate {
+	struct bch_val		v;
+	__le32			subvol;
+	__le32			pad;
+	__le64			inum;
+	__le64			new_i_size;
+};
+
+enum logged_op_finsert_state {
+	LOGGED_OP_FINSERT_start,
+	LOGGED_OP_FINSERT_shift_extents,
+	LOGGED_OP_FINSERT_finish,
+};
+
+struct bch_logged_op_finsert {
+	struct bch_val		v;
+	__u8			state;
+	__u8			pad[3];
+	__le32			subvol;
+	__le64			inum;
+	__le64			dst_offset;
+	__le64			src_offset;
+	__le64			pos;
+};
+
+/* Optional/variable size superblock sections: */
+
+struct bch_sb_field {
+	__u64			_data[0];
+	__le32			u64s;
+	__le32			type;
+};
+
+#define BCH_SB_FIELDS()				\
+	x(journal,	0)			\
+	x(members_v1,	1)			\
+	x(crypt,	2)			\
+	x(replicas_v0,	3)			\
+	x(quota,	4)			\
+	x(disk_groups,	5)			\
+	x(clean,	6)			\
+	x(replicas,	7)			\
+	x(journal_seq_blacklist, 8)		\
+	x(journal_v2,	9)			\
+	x(counters,	10)			\
+	x(members_v2,	11)
+
+enum bch_sb_field_type {
+#define x(f, nr)	BCH_SB_FIELD_##f = nr,
+	BCH_SB_FIELDS()
+#undef x
+	BCH_SB_FIELD_NR
+};
+
+/*
+ * Most superblock fields are replicated in all devices' superblocks - a few are
+ * not:
+ */
+#define BCH_SINGLE_DEVICE_SB_FIELDS		\
+	((1U << BCH_SB_FIELD_journal)|		\
+	 (1U << BCH_SB_FIELD_journal_v2))
+
+/* BCH_SB_FIELD_journal: */
+
+struct bch_sb_field_journal {
+	struct bch_sb_field	field;
+	__le64			buckets[];
+};
+
+struct bch_sb_field_journal_v2 {
+	struct bch_sb_field	field;
+
+	struct bch_sb_field_journal_v2_entry {
+		__le64		start;
+		__le64		nr;
+	}			d[];
+};
+
+/* BCH_SB_FIELD_members_v1: */
+
+#define BCH_MIN_NR_NBUCKETS	(1 << 6)
+
+#define BCH_IOPS_MEASUREMENTS()			\
+	x(seqread,	0)			\
+	x(seqwrite,	1)			\
+	x(randread,	2)			\
+	x(randwrite,	3)
+
+enum bch_iops_measurement {
+#define x(t, n) BCH_IOPS_##t = n,
+	BCH_IOPS_MEASUREMENTS()
+#undef x
+	BCH_IOPS_NR
+};
+
+struct bch_member {
+	__uuid_t		uuid;
+	__le64			nbuckets;	/* device size */
+	__le16			first_bucket;   /* index of first bucket used */
+	__le16			bucket_size;	/* sectors */
+	__le32			pad;
+	__le64			last_mount;	/* time_t */
+
+	__le64			flags;
+	__le32			iops[4];
+};
+
+#define BCH_MEMBER_V1_BYTES	56
+
+LE64_BITMASK(BCH_MEMBER_STATE,		struct bch_member, flags,  0,  4)
+/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
+LE64_BITMASK(BCH_MEMBER_DISCARD,	struct bch_member, flags, 14, 15)
+LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED,	struct bch_member, flags, 15, 20)
+LE64_BITMASK(BCH_MEMBER_GROUP,		struct bch_member, flags, 20, 28)
+LE64_BITMASK(BCH_MEMBER_DURABILITY,	struct bch_member, flags, 28, 30)
+LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
+					struct bch_member, flags, 30, 31)
+
+#if 0
+LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS,	struct bch_member, flags[1], 0,  20);
+LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
+#endif
+
+#define BCH_MEMBER_STATES()			\
+	x(rw,		0)			\
+	x(ro,		1)			\
+	x(failed,	2)			\
+	x(spare,	3)
+
+enum bch_member_state {
+#define x(t, n) BCH_MEMBER_STATE_##t = n,
+	BCH_MEMBER_STATES()
+#undef x
+	BCH_MEMBER_STATE_NR
+};
+
+struct bch_sb_field_members_v1 {
+	struct bch_sb_field	field;
+	struct bch_member	_members[]; /* members are now variable size */
+};
+
+struct bch_sb_field_members_v2 {
+	struct bch_sb_field	field;
+	__le16			member_bytes; /* size of a single member entry */
+	__u8			pad[6];
+	struct bch_member	_members[]; /* presumably member_bytes apart - confirm */
+};
+
+/* BCH_SB_FIELD_crypt: */
+
+struct nonce {
+	__le32			d[4];
+};
+
+struct bch_key {
+	__le64			key[4];
+};
+
+#define BCH_KEY_MAGIC					\
+	(((__u64) 'b' <<  0)|((__u64) 'c' <<  8)|		\
+	 ((__u64) 'h' << 16)|((__u64) '*' << 24)|		\
+	 ((__u64) '*' << 32)|((__u64) 'k' << 40)|		\
+	 ((__u64) 'e' << 48)|((__u64) 'y' << 56))
+
+struct bch_encrypted_key {
+	__le64			magic;
+	struct bch_key		key;
+};
+
+/*
+ * If this field is present in the superblock, it stores an encryption key which
+ * is used to encrypt all other data/metadata. The key will normally be encrypted
+ * with the key userspace provides, but if encryption has been turned off we'll
+ * just store the master key unencrypted in the superblock so we can access the
+ * previously encrypted data.
+ */
+struct bch_sb_field_crypt {
+	struct bch_sb_field	field;
+
+	__le64			flags;
+	__le64			kdf_flags;
+	struct bch_encrypted_key key;
+};
+
+LE64_BITMASK(BCH_CRYPT_KDF_TYPE,	struct bch_sb_field_crypt, flags, 0, 4);
+
+enum bch_kdf_types {
+	BCH_KDF_SCRYPT		= 0,
+	BCH_KDF_NR		= 1,
+};
+
+/* stored as base 2 log of scrypt params: */
+LE64_BITMASK(BCH_KDF_SCRYPT_N,	struct bch_sb_field_crypt, kdf_flags,  0, 16);
+LE64_BITMASK(BCH_KDF_SCRYPT_R,	struct bch_sb_field_crypt, kdf_flags, 16, 32);
+LE64_BITMASK(BCH_KDF_SCRYPT_P,	struct bch_sb_field_crypt, kdf_flags, 32, 48);
+
+/* BCH_SB_FIELD_replicas: */
+
+#define BCH_DATA_TYPES()		\
+	x(free,		0)		\
+	x(sb,		1)		\
+	x(journal,	2)		\
+	x(btree,	3)		\
+	x(user,		4)		\
+	x(cached,	5)		\
+	x(parity,	6)		\
+	x(stripe,	7)		\
+	x(need_gc_gens,	8)		\
+	x(need_discard,	9)
+
+enum bch_data_type {	/* values follow list order; n is ignored */
+#define x(t, n) BCH_DATA_##t,
+	BCH_DATA_TYPES()	/* listed in n order, so both numberings agree */
+#undef x
+	BCH_DATA_NR
+};
+
+/*
+ * True only for the free, need_gc_gens and need_discard data types; any other
+ * value (in range or not) yields false.
+ */
+static inline bool data_type_is_empty(enum bch_data_type type)
+{
+	if (type == BCH_DATA_free)
+		return true;
+
+	return type == BCH_DATA_need_gc_gens || type == BCH_DATA_need_discard;
+}
+
+/*
+ * True only for the sb and journal data types; any other value (in range or
+ * not) yields false.
+ */
+static inline bool data_type_is_hidden(enum bch_data_type type)
+{
+	bool hidden = type == BCH_DATA_sb || type == BCH_DATA_journal;
+
+	return hidden;
+}
+
+struct bch_replicas_entry_v0 {
+	__u8			data_type;
+	__u8			nr_devs;
+	__u8			devs[];
+} __packed;
+
+struct bch_sb_field_replicas_v0 {
+	struct bch_sb_field	field;
+	struct bch_replicas_entry_v0 entries[];
+} __packed __aligned(8);
+
+struct bch_replicas_entry {
+	__u8			data_type;
+	__u8			nr_devs;
+	__u8			nr_required;
+	__u8			devs[];
+} __packed;
+
+#define replicas_entry_bytes(_i)					\
+	(offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
+
+struct bch_sb_field_replicas {
+	struct bch_sb_field	field;
+	struct bch_replicas_entry entries[];
+} __packed __aligned(8);
+
+/* BCH_SB_FIELD_quota: */
+
+struct bch_sb_quota_counter {
+	__le32				timelimit;
+	__le32				warnlimit;
+};
+
+struct bch_sb_quota_type {
+	__le64				flags;
+	struct bch_sb_quota_counter	c[Q_COUNTERS];
+};
+
+struct bch_sb_field_quota {
+	struct bch_sb_field		field;
+	struct bch_sb_quota_type	q[QTYP_NR];
+} __packed __aligned(8);
+
+/* BCH_SB_FIELD_disk_groups: */
+
+#define BCH_SB_LABEL_SIZE		32
+
+struct bch_disk_group {
+	__u8			label[BCH_SB_LABEL_SIZE];
+	__le64			flags[2];
+} __packed __aligned(8);
+
+LE64_BITMASK(BCH_GROUP_DELETED,		struct bch_disk_group, flags[0], 0,  1)
+LE64_BITMASK(BCH_GROUP_DATA_ALLOWED,	struct bch_disk_group, flags[0], 1,  6)
+LE64_BITMASK(BCH_GROUP_PARENT,		struct bch_disk_group, flags[0], 6, 24)
+
+struct bch_sb_field_disk_groups {
+	struct bch_sb_field	field;
+	struct bch_disk_group	entries[];
+} __packed __aligned(8);
+
+/* BCH_SB_FIELD_counters */
+
+#define BCH_PERSISTENT_COUNTERS()				\
+	x(io_read,					0)	\
+	x(io_write,					1)	\
+	x(io_move,					2)	\
+	x(bucket_invalidate,				3)	\
+	x(bucket_discard,				4)	\
+	x(bucket_alloc,					5)	\
+	x(bucket_alloc_fail,				6)	\
+	x(btree_cache_scan,				7)	\
+	x(btree_cache_reap,				8)	\
+	x(btree_cache_cannibalize,			9)	\
+	x(btree_cache_cannibalize_lock,			10)	\
+	x(btree_cache_cannibalize_lock_fail,		11)	\
+	x(btree_cache_cannibalize_unlock,		12)	\
+	x(btree_node_write,				13)	\
+	x(btree_node_read,				14)	\
+	x(btree_node_compact,				15)	\
+	x(btree_node_merge,				16)	\
+	x(btree_node_split,				17)	\
+	x(btree_node_rewrite,				18)	\
+	x(btree_node_alloc,				19)	\
+	x(btree_node_free,				20)	\
+	x(btree_node_set_root,				21)	\
+	x(btree_path_relock_fail,			22)	\
+	x(btree_path_upgrade_fail,			23)	\
+	x(btree_reserve_get_fail,			24)	\
+	x(journal_entry_full,				25)	\
+	x(journal_full,					26)	\
+	x(journal_reclaim_finish,			27)	\
+	x(journal_reclaim_start,			28)	\
+	x(journal_write,				29)	\
+	x(read_promote,					30)	\
+	x(read_bounce,					31)	\
+	x(read_split,					33)	\
+	x(read_retry,					32)	\
+	x(read_reuse_race,				34)	\
+	x(move_extent_read,				35)	\
+	x(move_extent_write,				36)	\
+	x(move_extent_finish,				37)	\
+	x(move_extent_fail,				38)	\
+	x(move_extent_alloc_mem_fail,			39)	\
+	x(copygc,					40)	\
+	x(copygc_wait,					41)	\
+	x(gc_gens_end,					42)	\
+	x(gc_gens_start,				43)	\
+	x(trans_blocked_journal_reclaim,		44)	\
+	x(trans_restart_btree_node_reused,		45)	\
+	x(trans_restart_btree_node_split,		46)	\
+	x(trans_restart_fault_inject,			47)	\
+	x(trans_restart_iter_upgrade,			48)	\
+	x(trans_restart_journal_preres_get,		49)	\
+	x(trans_restart_journal_reclaim,		50)	\
+	x(trans_restart_journal_res_get,		51)	\
+	x(trans_restart_key_cache_key_realloced,	52)	\
+	x(trans_restart_key_cache_raced,		53)	\
+	x(trans_restart_mark_replicas,			54)	\
+	x(trans_restart_mem_realloced,			55)	\
+	x(trans_restart_memory_allocation_failure,	56)	\
+	x(trans_restart_relock,				57)	\
+	x(trans_restart_relock_after_fill,		58)	\
+	x(trans_restart_relock_key_cache_fill,		59)	\
+	x(trans_restart_relock_next_node,		60)	\
+	x(trans_restart_relock_parent_for_fill,		61)	\
+	x(trans_restart_relock_path,			62)	\
+	x(trans_restart_relock_path_intent,		63)	\
+	x(trans_restart_too_many_iters,			64)	\
+	x(trans_restart_traverse,			65)	\
+	x(trans_restart_upgrade,			66)	\
+	x(trans_restart_would_deadlock,			67)	\
+	x(trans_restart_would_deadlock_write,		68)	\
+	x(trans_restart_injected,			69)	\
+	x(trans_restart_key_cache_upgrade,		70)	\
+	x(trans_traverse_all,				71)	\
+	x(transaction_commit,				72)	\
+	x(write_super,					73)	\
+	x(trans_restart_would_deadlock_recursion_limit,	74)	\
+	x(trans_restart_write_buffer_flush,		75)	\
+	x(trans_restart_split_race,			76)
+
+enum bch_persistent_counters {	/* values follow list order; n is ignored */
+#define x(t, n, ...) BCH_COUNTER_##t,
+	BCH_PERSISTENT_COUNTERS()	/* hence read_split == 32, read_retry == 33 */
+#undef x
+	BCH_COUNTER_NR
+};
+
+struct bch_sb_field_counters {
+	struct bch_sb_field	field;
+	__le64			d[];
+};
+
+/*
+ * On clean shutdown, store btree roots and current journal sequence number in
+ * the superblock:
+ */
+struct jset_entry {
+	__le16			u64s;
+	__u8			btree_id;
+	__u8			level;
+	__u8			type; /* designates what this jset holds */
+	__u8			pad[3];
+
+	struct bkey_i		start[0];
+	__u64			_data[];
+};
+
+struct bch_sb_field_clean {
+	struct bch_sb_field	field;
+
+	__le32			flags;
+	__le16			_read_clock; /* no longer used */
+	__le16			_write_clock;
+	__le64			journal_seq;
+
+	struct jset_entry	start[0];
+	__u64			_data[];
+};
+
+struct journal_seq_blacklist_entry {
+	__le64			start;
+	__le64			end;
+};
+
+struct bch_sb_field_journal_seq_blacklist {
+	struct bch_sb_field	field;
+
+	struct journal_seq_blacklist_entry start[0];
+	__u64			_data[];
+};
+
+/* Superblock: */
+
+/*
+ * New versioning scheme:
+ * One common version number for all on disk data structures - superblock, btree
+ * nodes, journal entries
+ */
+#define BCH_VERSION_MAJOR(_v)		((__u16) ((_v) >> 10))	/* bits 10 and up */
+#define BCH_VERSION_MINOR(_v)		((__u16) ((_v) & ~(~0U << 10)))	/* low 10 bits */
+#define BCH_VERSION(_major, _minor)	(((_major) << 10)|(_minor) << 0)
+
+#define RECOVERY_PASS_ALL_FSCK		(1ULL << 63)
+
+#define BCH_METADATA_VERSIONS()						\
+	x(bkey_renumber,		BCH_VERSION(0, 10),		\
+	  RECOVERY_PASS_ALL_FSCK)					\
+	x(inode_btree_change,		BCH_VERSION(0, 11),		\
+	  RECOVERY_PASS_ALL_FSCK)					\
+	x(snapshot,			BCH_VERSION(0, 12),		\
+	  RECOVERY_PASS_ALL_FSCK)					\
+	x(inode_backpointers,		BCH_VERSION(0, 13),		\
+	  RECOVERY_PASS_ALL_FSCK)					\
+	x(btree_ptr_sectors_written,	BCH_VERSION(0, 14),		\
+	  RECOVERY_PASS_ALL_FSCK)					\
+	x(snapshot_2,			BCH_VERSION(0, 15),		\
+	  BIT_ULL(BCH_RECOVERY_PASS_fs_upgrade_for_subvolumes)|		\
+	  BIT_ULL(BCH_RECOVERY_PASS_initialize_subvolumes)|		\
+	  RECOVERY_PASS_ALL_FSCK)					\
+	x(reflink_p_fix,		BCH_VERSION(0, 16),		\
+	  BIT_ULL(BCH_RECOVERY_PASS_fix_reflink_p))			\
+	x(subvol_dirent,		BCH_VERSION(0, 17),		\
+	  RECOVERY_PASS_ALL_FSCK)					\
+	x(inode_v2,			BCH_VERSION(0, 18),		\
+	  RECOVERY_PASS_ALL_FSCK)					\
+	x(freespace,			BCH_VERSION(0, 19),		\
+	  RECOVERY_PASS_ALL_FSCK)					\
+	x(alloc_v4,			BCH_VERSION(0, 20),		\
+	  RECOVERY_PASS_ALL_FSCK)					\
+	x(new_data_types,		BCH_VERSION(0, 21),		\
+	  RECOVERY_PASS_ALL_FSCK)					\
+	x(backpointers,			BCH_VERSION(0, 22),		\
+	  RECOVERY_PASS_ALL_FSCK)					\
+	x(inode_v3,			BCH_VERSION(0, 23),		\
+	  RECOVERY_PASS_ALL_FSCK)					\
+	x(unwritten_extents,		BCH_VERSION(0, 24),		\
+	  RECOVERY_PASS_ALL_FSCK)					\
+	x(bucket_gens,			BCH_VERSION(0, 25),		\
+	  BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)|			\
+	  RECOVERY_PASS_ALL_FSCK)					\
+	x(lru_v2,			BCH_VERSION(0, 26),		\
+	  RECOVERY_PASS_ALL_FSCK)					\
+	x(fragmentation_lru,		BCH_VERSION(0, 27),		\
+	  RECOVERY_PASS_ALL_FSCK)					\
+	x(no_bps_in_alloc_keys,		BCH_VERSION(0, 28),		\
+	  RECOVERY_PASS_ALL_FSCK)					\
+	x(snapshot_trees,		BCH_VERSION(0, 29),		\
+	  RECOVERY_PASS_ALL_FSCK)					\
+	x(major_minor,			BCH_VERSION(1,  0),		\
+	  0)								\
+	x(snapshot_skiplists,		BCH_VERSION(1,  1),		\
+	  BIT_ULL(BCH_RECOVERY_PASS_check_snapshots))			\
+	x(deleted_inodes,		BCH_VERSION(1,  2),		\
+	  BIT_ULL(BCH_RECOVERY_PASS_check_inodes))
+
+enum bcachefs_metadata_version {	/* the upgrade_passes column is not used here */
+	bcachefs_metadata_version_min = 9,
+#define x(t, n, upgrade_passes)	bcachefs_metadata_version_##t = n,
+	BCH_METADATA_VERSIONS()
+#undef x
+	bcachefs_metadata_version_max	/* last listed version + 1 */
+};
+
+static const __maybe_unused
+unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor;
+
+#define bcachefs_metadata_version_current	(bcachefs_metadata_version_max - 1)
+
+#define BCH_SB_SECTOR			8
+#define BCH_SB_MEMBERS_MAX		64 /* XXX kill */
+
+struct bch_sb_layout {
+	__uuid_t		magic;	/* bcachefs superblock UUID */
+	__u8			layout_type;
+	__u8			sb_max_size_bits; /* base 2 log, in 512 byte sectors */
+	__u8			nr_superblocks;
+	__u8			pad[5];
+	__le64			sb_offset[61];	/* 24 + 61 * 8 = 512 bytes, given a 16-byte __uuid_t */
+} __packed __aligned(8);
+
+#define BCH_SB_LAYOUT_SECTOR	7
+
+/*
+ * @offset	- sector where this sb was written
+ * @version	- on disk format version
+ * @version_min	- Oldest metadata version this filesystem contains; so we can
+ *		  safely drop compatibility code and refuse to mount filesystems
+ *		  we'd need it for
+ * @magic	- identifies as a bcachefs superblock (BCHFS_MAGIC)
+ * @seq		- incremented each time superblock is written
+ * @uuid	- used for generating various magic numbers and identifying
+ *                member devices, never changes
+ * @user_uuid	- user visible UUID, may be changed
+ * @label	- filesystem label
+ * @seq		- identifies most recent superblock, incremented each time
+ *		  superblock is written
+ * @features	- enabled incompatible features
+ */
+struct bch_sb {
+	struct bch_csum		csum;
+	__le16			version;
+	__le16			version_min;
+	__le16			pad[2];
+	__uuid_t		magic;
+	__uuid_t		uuid;
+	__uuid_t		user_uuid;
+	__u8			label[BCH_SB_LABEL_SIZE];
+	__le64			offset;
+	__le64			seq;
+
+	__le16			block_size;
+	__u8			dev_idx;
+	__u8			nr_devices;
+	__le32			u64s;
+
+	__le64			time_base_lo;
+	__le32			time_base_hi;
+	__le32			time_precision;
+
+	__le64			flags[8];
+	__le64			features[2];
+	__le64			compat[2];
+
+	struct bch_sb_layout	layout;
+
+	struct bch_sb_field	start[0];
+	__le64			_data[];
+} __packed __aligned(8);
+
+/*
+ * Flags:
+ * BCH_SB_INITIALIZED	- set on first mount
+ * BCH_SB_CLEAN		- did we shut down cleanly? Just a hint, doesn't affect
+ *			  behaviour of mount/recovery path:
+ * BCH_SB_INODE_32BIT	- limit inode numbers to 32 bits
+ * BCH_SB_128_BIT_MACS	- 128 bit macs instead of 80
+ * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides
+ *			   DATA/META_CSUM_TYPE. Also indicates encryption
+ *			   algorithm in use, if/when we get more than one
+ */
+
+LE16_BITMASK(BCH_SB_BLOCK_SIZE,		struct bch_sb, block_size, 0, 16);
+
+LE64_BITMASK(BCH_SB_INITIALIZED,	struct bch_sb, flags[0],  0,  1);
+LE64_BITMASK(BCH_SB_CLEAN,		struct bch_sb, flags[0],  1,  2);
+LE64_BITMASK(BCH_SB_CSUM_TYPE,		struct bch_sb, flags[0],  2,  8);
+LE64_BITMASK(BCH_SB_ERROR_ACTION,	struct bch_sb, flags[0],  8, 12);
+
+LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE,	struct bch_sb, flags[0], 12, 28);
+
+LE64_BITMASK(BCH_SB_GC_RESERVE,		struct bch_sb, flags[0], 28, 33);
+LE64_BITMASK(BCH_SB_ROOT_RESERVE,	struct bch_sb, flags[0], 33, 40);
+
+LE64_BITMASK(BCH_SB_META_CSUM_TYPE,	struct bch_sb, flags[0], 40, 44);
+LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE,	struct bch_sb, flags[0], 44, 48);
+
+LE64_BITMASK(BCH_SB_META_REPLICAS_WANT,	struct bch_sb, flags[0], 48, 52);
+LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT,	struct bch_sb, flags[0], 52, 56);
+
+LE64_BITMASK(BCH_SB_POSIX_ACL,		struct bch_sb, flags[0], 56, 57);
+LE64_BITMASK(BCH_SB_USRQUOTA,		struct bch_sb, flags[0], 57, 58);
+LE64_BITMASK(BCH_SB_GRPQUOTA,		struct bch_sb, flags[0], 58, 59);
+LE64_BITMASK(BCH_SB_PRJQUOTA,		struct bch_sb, flags[0], 59, 60);
+
+LE64_BITMASK(BCH_SB_HAS_ERRORS,		struct bch_sb, flags[0], 60, 61);
+LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62);
+
+LE64_BITMASK(BCH_SB_BIG_ENDIAN,		struct bch_sb, flags[0], 62, 63);
+
+LE64_BITMASK(BCH_SB_STR_HASH_TYPE,	struct bch_sb, flags[1],  0,  4);
+LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1],  4,  8);
+LE64_BITMASK(BCH_SB_INODE_32BIT,	struct bch_sb, flags[1],  8,  9);
+
+LE64_BITMASK(BCH_SB_128_BIT_MACS,	struct bch_sb, flags[1],  9, 10);
+LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE,	struct bch_sb, flags[1], 10, 14);
+
+/*
+ * Max size of an extent that may require bouncing to read or write
+ * (checksummed, compressed): 64k
+ */
+LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS,
+					struct bch_sb, flags[1], 14, 20);
+
+LE64_BITMASK(BCH_SB_META_REPLICAS_REQ,	struct bch_sb, flags[1], 20, 24);
+LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ,	struct bch_sb, flags[1], 24, 28);
+
+LE64_BITMASK(BCH_SB_PROMOTE_TARGET,	struct bch_sb, flags[1], 28, 40);
+LE64_BITMASK(BCH_SB_FOREGROUND_TARGET,	struct bch_sb, flags[1], 40, 52);
+LE64_BITMASK(BCH_SB_BACKGROUND_TARGET,	struct bch_sb, flags[1], 52, 64);
+
+LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO,
+					struct bch_sb, flags[2],  0,  4);
+LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES,	struct bch_sb, flags[2],  4, 64);
+
+LE64_BITMASK(BCH_SB_ERASURE_CODE,	struct bch_sb, flags[3],  0, 16);
+LE64_BITMASK(BCH_SB_METADATA_TARGET,	struct bch_sb, flags[3], 16, 28);
+LE64_BITMASK(BCH_SB_SHARD_INUMS,	struct bch_sb, flags[3], 28, 29);
+LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
+LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
+LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
+LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
+LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
+LE64_BITMASK(BCH_SB_NOCOW,		struct bch_sb, flags[4], 33, 34);
+LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE,	struct bch_sb, flags[4], 34, 54);
+LE64_BITMASK(BCH_SB_VERSION_UPGRADE,	struct bch_sb, flags[4], 54, 56);
+
+LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_HI,struct bch_sb, flags[4], 56, 60);
+LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI,
+					struct bch_sb, flags[4], 60, 64);
+
+LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE,
+					struct bch_sb, flags[5],  0, 16);
+
+static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
+{
+	return (BCH_SB_COMPRESSION_TYPE_HI(sb) << 4) | BCH_SB_COMPRESSION_TYPE_LO(sb);
+}
+/* The value is stored as two 4-bit halves: _LO is bits 0-3, _HI is bits 4-7. */
+static inline void SET_BCH_SB_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
+{
+	SET_BCH_SB_COMPRESSION_TYPE_HI(sb, v >> 4);
+	SET_BCH_SB_COMPRESSION_TYPE_LO(sb, v);
+}
+
+static inline __u64 BCH_SB_BACKGROUND_COMPRESSION_TYPE(const struct bch_sb *sb)
+{
+	return (BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb) << 4) |
+		BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb);
+}
+/* The value is stored as two 4-bit halves: _LO is bits 0-3, _HI is bits 4-7. */
+static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
+{
+	SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb, v >> 4);
+	SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb, v);
+}
+
+/*
+ * Features:
+ *
+ * journal_seq_blacklist_v3:	gates BCH_SB_FIELD_journal_seq_blacklist
+ * reflink:			gates KEY_TYPE_reflink
+ * inline_data:			gates KEY_TYPE_inline_data
+ * new_siphash:			gates BCH_STR_HASH_siphash
+ * new_extent_overwrite:	gates BTREE_NODE_NEW_EXTENT_OVERWRITE
+ */
+#define BCH_SB_FEATURES()			\
+	x(lz4,				0)	\
+	x(gzip,				1)	\
+	x(zstd,				2)	\
+	x(atomic_nlink,			3)	\
+	x(ec,				4)	\
+	x(journal_seq_blacklist_v3,	5)	\
+	x(reflink,			6)	\
+	x(new_siphash,			7)	\
+	x(inline_data,			8)	\
+	x(new_extent_overwrite,		9)	\
+	x(incompressible,		10)	\
+	x(btree_ptr_v2,			11)	\
+	x(extents_above_btree_updates,	12)	\
+	x(btree_updates_journalled,	13)	\
+	x(reflink_inline_data,		14)	\
+	x(new_varint,			15)	\
+	x(journal_no_flush,		16)	\
+	x(alloc_v2,			17)	\
+	x(extents_across_btree_nodes,	18)
+
+#define BCH_SB_FEATURES_ALWAYS				\
+	((1ULL << BCH_FEATURE_new_extent_overwrite)|	\
+	 (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
+	 (1ULL << BCH_FEATURE_btree_updates_journalled)|\
+	 (1ULL << BCH_FEATURE_alloc_v2)|\
+	 (1ULL << BCH_FEATURE_extents_across_btree_nodes))
+
+#define BCH_SB_FEATURES_ALL				\
+	(BCH_SB_FEATURES_ALWAYS|			\
+	 (1ULL << BCH_FEATURE_new_siphash)|		\
+	 (1ULL << BCH_FEATURE_btree_ptr_v2)|		\
+	 (1ULL << BCH_FEATURE_new_varint)|		\
+	 (1ULL << BCH_FEATURE_journal_no_flush))
+
+enum bch_sb_feature {	/* values follow list order; n is ignored */
+#define x(f, n) BCH_FEATURE_##f,
+	BCH_SB_FEATURES()	/* listed in n order, so both numberings agree */
+#undef x
+	BCH_FEATURE_NR,
+};
+
+#define BCH_SB_COMPAT()					\
+	x(alloc_info,				0)	\
+	x(alloc_metadata,			1)	\
+	x(extents_above_btree_updates_done,	2)	\
+	x(bformat_overflow_done,		3)
+
+enum bch_sb_compat {	/* values follow list order; n is ignored */
+#define x(f, n) BCH_COMPAT_##f,
+	BCH_SB_COMPAT()		/* listed in n order, so both numberings agree */
+#undef x
+	BCH_COMPAT_NR,
+};
+
+/* options: */
+
+#define BCH_VERSION_UPGRADE_OPTS()	\
+	x(compatible,		0)	\
+	x(incompatible,		1)	\
+	x(none,			2)
+
+enum bch_version_upgrade_opts {
+#define x(t, n) BCH_VERSION_UPGRADE_##t = n,
+	BCH_VERSION_UPGRADE_OPTS()
+#undef x
+};
+
+#define BCH_REPLICAS_MAX		4U
+
+#define BCH_BKEY_PTRS_MAX		16U
+
+#define BCH_ERROR_ACTIONS()		\
+	x(continue,		0)	\
+	x(ro,			1)	\
+	x(panic,		2)
+
+enum bch_error_actions {
+#define x(t, n) BCH_ON_ERROR_##t = n,
+	BCH_ERROR_ACTIONS()
+#undef x
+	BCH_ON_ERROR_NR
+};
+
+#define BCH_STR_HASH_TYPES()		\
+	x(crc32c,		0)	\
+	x(crc64,		1)	\
+	x(siphash_old,		2)	\
+	x(siphash,		3)
+
+enum bch_str_hash_type {
+#define x(t, n) BCH_STR_HASH_##t = n,
+	BCH_STR_HASH_TYPES()
+#undef x
+	BCH_STR_HASH_NR
+};
+
+#define BCH_STR_HASH_OPTS()		\
+	x(crc32c,		0)	\
+	x(crc64,		1)	\
+	x(siphash,		2)
+
+enum bch_str_hash_opts {
+#define x(t, n) BCH_STR_HASH_OPT_##t = n,
+	BCH_STR_HASH_OPTS()
+#undef x
+	BCH_STR_HASH_OPT_NR
+};
+
+#define BCH_CSUM_TYPES()			\
+	x(none,				0)	\
+	x(crc32c_nonzero,		1)	\
+	x(crc64_nonzero,		2)	\
+	x(chacha20_poly1305_80,		3)	\
+	x(chacha20_poly1305_128,	4)	\
+	x(crc32c,			5)	\
+	x(crc64,			6)	\
+	x(xxhash,			7)
+
+enum bch_csum_type {
+#define x(t, n) BCH_CSUM_##t = n,
+	BCH_CSUM_TYPES()
+#undef x
+	BCH_CSUM_NR
+};
+
+static const __maybe_unused unsigned bch_crc_bytes[] = {	/* by enum bch_csum_type */
+	[BCH_CSUM_none]				= 0,
+	[BCH_CSUM_crc32c_nonzero]		= 4,
+	[BCH_CSUM_crc32c]			= 4,
+	[BCH_CSUM_crc64_nonzero]		= 8,
+	[BCH_CSUM_crc64]			= 8,
+	[BCH_CSUM_xxhash]			= 8,
+	[BCH_CSUM_chacha20_poly1305_80]		= 10,	/* 80 bits */
+	[BCH_CSUM_chacha20_poly1305_128]	= 16,	/* 128 bits */
+};
+
+static inline bool bch2_csum_type_is_encryption(enum bch_csum_type type)
+{
+	switch (type) {
+	case BCH_CSUM_chacha20_poly1305_80:
+	case BCH_CSUM_chacha20_poly1305_128:
+		return true;	/* only the two chacha20_poly1305 types */
+	default:
+		return false;
+	}
+}
+
+#define BCH_CSUM_OPTS()			\
+	x(none,			0)	\
+	x(crc32c,		1)	\
+	x(crc64,		2)	\
+	x(xxhash,		3)
+
+enum bch_csum_opts {
+#define x(t, n) BCH_CSUM_OPT_##t = n,
+	BCH_CSUM_OPTS()
+#undef x
+	BCH_CSUM_OPT_NR
+};
+
+#define BCH_COMPRESSION_TYPES()		\
+	x(none,			0)	\
+	x(lz4_old,		1)	\
+	x(gzip,			2)	\
+	x(lz4,			3)	\
+	x(zstd,			4)	\
+	x(incompressible,	5)
+
+enum bch_compression_type {
+#define x(t, n) BCH_COMPRESSION_TYPE_##t = n,
+	BCH_COMPRESSION_TYPES()
+#undef x
+	BCH_COMPRESSION_TYPE_NR
+};
+
+#define BCH_COMPRESSION_OPTS()		\
+	x(none,		0)		\
+	x(lz4,		1)		\
+	x(gzip,		2)		\
+	x(zstd,		3)
+
+enum bch_compression_opts {
+#define x(t, n) BCH_COMPRESSION_OPT_##t = n,
+	BCH_COMPRESSION_OPTS()
+#undef x
+	BCH_COMPRESSION_OPT_NR
+};
+
+/*
+ * Magic numbers
+ *
+ * The various other data structures have their own magic numbers, which are
+ * xored with the first part of the cache set's UUID
+ */
+
+#define BCACHE_MAGIC							\
+	UUID_INIT(0xc68573f6, 0x4e1a, 0x45ca,				\
+		  0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81)
+#define BCHFS_MAGIC							\
+	UUID_INIT(0xc68573f6, 0x66ce, 0x90a9,				\
+		  0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
+
+#define BCACHEFS_STATFS_MAGIC		0xca451a4e
+
+#define JSET_MAGIC		__cpu_to_le64(0x245235c1a3625032ULL)
+#define BSET_MAGIC		__cpu_to_le64(0x90135c78b99e07f5ULL)
+
+static inline __le64 __bch2_sb_magic(const struct bch_sb *sb)
+{
+	__le64 ret;
+
+	memcpy(&ret, &sb->uuid, sizeof(ret));	/* leading bytes of the fs UUID */
+	return ret;
+}
+
+static inline __u64 __jset_magic(const struct bch_sb *sb)
+{
+	return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC);
+}
+
+static inline __u64 __bset_magic(const struct bch_sb *sb)
+{
+	return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC);
+}
+
+/* Journal */
+
+#define JSET_KEYS_U64s	(sizeof(struct jset_entry) / sizeof(__u64))
+
+#define BCH_JSET_ENTRY_TYPES()			\
+	x(btree_keys,		0)		\
+	x(btree_root,		1)		\
+	x(prio_ptrs,		2)		\
+	x(blacklist,		3)		\
+	x(blacklist_v2,		4)		\
+	x(usage,		5)		\
+	x(data_usage,		6)		\
+	x(clock,		7)		\
+	x(dev_usage,		8)		\
+	x(log,			9)		\
+	x(overwrite,		10)
+
+enum {
+#define x(f, nr)	BCH_JSET_ENTRY_##f	= nr,
+	BCH_JSET_ENTRY_TYPES()
+#undef x
+	BCH_JSET_ENTRY_NR
+};
+
+/*
+ * Journal sequence numbers can be blacklisted: bsets record the max sequence
+ * number of all the journal entries they contain updates for, so that on
+ * recovery we can ignore those bsets that contain index updates newer than what
+ * made it into the journal.
+ *
+ * This means that we can't reuse that journal_seq - we have to skip it, and
+ * then record that we skipped it so that the next time we crash and recover we
+ * don't think there was a missing journal entry.
+ */
+struct jset_entry_blacklist {
+	struct jset_entry	entry;
+	__le64			seq;
+};
+
+struct jset_entry_blacklist_v2 {
+	struct jset_entry	entry;
+	__le64			start;
+	__le64			end;
+};
+
+#define BCH_FS_USAGE_TYPES()			\
+	x(reserved,		0)		\
+	x(inodes,		1)		\
+	x(key_version,		2)
+
+enum {
+#define x(f, nr)	BCH_FS_USAGE_##f	= nr,
+	BCH_FS_USAGE_TYPES()
+#undef x
+	BCH_FS_USAGE_NR
+};
+
+struct jset_entry_usage {
+	struct jset_entry	entry;
+	__le64			v;
+} __packed;
+
+struct jset_entry_data_usage {
+	struct jset_entry	entry;
+	__le64			v;
+	struct bch_replicas_entry r;
+} __packed;
+
+struct jset_entry_clock {
+	struct jset_entry	entry;
+	__u8			rw;
+	__u8			pad[7];
+	__le64			time;
+} __packed;
+
+struct jset_entry_dev_usage_type {
+	__le64			buckets;
+	__le64			sectors;
+	__le64			fragmented;
+} __packed;
+
+struct jset_entry_dev_usage {
+	struct jset_entry	entry;
+	__le32			dev;
+	__u32			pad;
+
+	__le64			buckets_ec;
+	__le64			_buckets_unavailable; /* No longer used */
+
+	struct jset_entry_dev_usage_type d[];
+};
+
+static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u)
+{
+	/* bytes past the fixed part of *u, counted in d[] elements */
+	return (vstruct_bytes(&u->entry) - sizeof(*u)) / sizeof(u->d[0]);
+}
+
+struct jset_entry_log {
+	struct jset_entry	entry;
+	u8			d[];
+} __packed;
+
+/*
+ * On disk format for a journal entry:
+ * seq is monotonically increasing; every journal entry has its own unique
+ * sequence number.
+ *
+ * last_seq is the oldest journal entry that still has keys the btree hasn't
+ * flushed to disk yet.
+ *
+ * version is for on disk format changes.
+ */
+struct jset {
+	struct bch_csum		csum;
+
+	__le64			magic;
+	__le64			seq;
+	__le32			version;
+	__le32			flags;
+
+	__le32			u64s; /* size of d[] in u64s */
+
+	__u8			encrypted_start[0];
+
+	__le16			_read_clock; /* no longer used */
+	__le16			_write_clock;
+
+	/* Sequence number of oldest dirty journal entry */
+	__le64			last_seq;
+
+
+	struct jset_entry	start[0];
+	__u64			_data[];
+} __packed __aligned(8);
+
+LE32_BITMASK(JSET_CSUM_TYPE,	struct jset, flags, 0, 4);
+LE32_BITMASK(JSET_BIG_ENDIAN,	struct jset, flags, 4, 5);
+LE32_BITMASK(JSET_NO_FLUSH,	struct jset, flags, 5, 6);
+
+#define BCH_JOURNAL_BUCKETS_MIN		8
+
+/* Btree: */
+
+enum btree_id_flags {
+	BTREE_ID_EXTENTS	= BIT(0),
+	BTREE_ID_SNAPSHOTS	= BIT(1),
+	BTREE_ID_DATA		= BIT(2),
+};
+
+#define BCH_BTREE_IDS()								\
+	x(extents,		0,	BTREE_ID_EXTENTS|BTREE_ID_SNAPSHOTS|BTREE_ID_DATA,\
+	  BIT_ULL(KEY_TYPE_whiteout)|						\
+	  BIT_ULL(KEY_TYPE_error)|						\
+	  BIT_ULL(KEY_TYPE_cookie)|						\
+	  BIT_ULL(KEY_TYPE_extent)|						\
+	  BIT_ULL(KEY_TYPE_reservation)|					\
+	  BIT_ULL(KEY_TYPE_reflink_p)|						\
+	  BIT_ULL(KEY_TYPE_inline_data))					\
+	x(inodes,		1,	BTREE_ID_SNAPSHOTS,			\
+	  BIT_ULL(KEY_TYPE_whiteout)|						\
+	  BIT_ULL(KEY_TYPE_inode)|						\
+	  BIT_ULL(KEY_TYPE_inode_v2)|						\
+	  BIT_ULL(KEY_TYPE_inode_v3)|						\
+	  BIT_ULL(KEY_TYPE_inode_generation))					\
+	x(dirents,		2,	BTREE_ID_SNAPSHOTS,			\
+	  BIT_ULL(KEY_TYPE_whiteout)|						\
+	  BIT_ULL(KEY_TYPE_hash_whiteout)|					\
+	  BIT_ULL(KEY_TYPE_dirent))						\
+	x(xattrs,		3,	BTREE_ID_SNAPSHOTS,			\
+	  BIT_ULL(KEY_TYPE_whiteout)|						\
+	  BIT_ULL(KEY_TYPE_cookie)|						\
+	  BIT_ULL(KEY_TYPE_hash_whiteout)|					\
+	  BIT_ULL(KEY_TYPE_xattr))						\
+	x(alloc,		4,	0,					\
+	  BIT_ULL(KEY_TYPE_alloc)|						\
+	  BIT_ULL(KEY_TYPE_alloc_v2)|						\
+	  BIT_ULL(KEY_TYPE_alloc_v3)|						\
+	  BIT_ULL(KEY_TYPE_alloc_v4))						\
+	x(quotas,		5,	0,					\
+	  BIT_ULL(KEY_TYPE_quota))						\
+	x(stripes,		6,	0,					\
+	  BIT_ULL(KEY_TYPE_stripe))						\
+	x(reflink,		7,	BTREE_ID_EXTENTS|BTREE_ID_DATA,		\
+	  BIT_ULL(KEY_TYPE_reflink_v)|						\
+	  BIT_ULL(KEY_TYPE_indirect_inline_data))				\
+	x(subvolumes,		8,	0,					\
+	  BIT_ULL(KEY_TYPE_subvolume))						\
+	x(snapshots,		9,	0,					\
+	  BIT_ULL(KEY_TYPE_snapshot))						\
+	x(lru,			10,	0,					\
+	  BIT_ULL(KEY_TYPE_set))						\
+	x(freespace,		11,	BTREE_ID_EXTENTS,			\
+	  BIT_ULL(KEY_TYPE_set))						\
+	x(need_discard,		12,	0,					\
+	  BIT_ULL(KEY_TYPE_set))						\
+	x(backpointers,		13,	0,					\
+	  BIT_ULL(KEY_TYPE_backpointer))					\
+	x(bucket_gens,		14,	0,					\
+	  BIT_ULL(KEY_TYPE_bucket_gens))					\
+	x(snapshot_trees,	15,	0,					\
+	  BIT_ULL(KEY_TYPE_snapshot_tree))					\
+	x(deleted_inodes,	16,	BTREE_ID_SNAPSHOTS,			\
+	  BIT_ULL(KEY_TYPE_set))						\
+	x(logged_ops,		17,	0,					\
+	  BIT_ULL(KEY_TYPE_logged_op_truncate)|					\
+	  BIT_ULL(KEY_TYPE_logged_op_finsert))
+
+enum btree_id {
+#define x(name, nr, ...) BTREE_ID_##name = nr,
+	BCH_BTREE_IDS()
+#undef x
+	BTREE_ID_NR
+};
+
+#define BTREE_MAX_DEPTH		4U
+
+/* Btree nodes */
+
+/*
+ * Btree nodes
+ *
+ * On disk a btree node is a list/log of these; within each set the keys are
+ * sorted
+ */
+struct bset {
+	__le64			seq;
+
+	/*
+	 * Highest journal entry this bset contains keys for.
+	 * If on recovery we don't see that journal entry, this bset is ignored:
+	 * this allows us to preserve the order of all index updates after a
+	 * crash, since the journal records a total order of all index updates
+	 * and anything that didn't make it to the journal doesn't get used.
+	 */
+	__le64			journal_seq;
+
+	__le32			flags;
+	__le16			version;
+	__le16			u64s; /* count of d[] in u64s */
+
+	struct bkey_packed	start[0];
+	__u64			_data[];
+} __packed __aligned(8);
+
+LE32_BITMASK(BSET_CSUM_TYPE,	struct bset, flags, 0, 4);
+
+LE32_BITMASK(BSET_BIG_ENDIAN,	struct bset, flags, 4, 5);
+LE32_BITMASK(BSET_SEPARATE_WHITEOUTS,
+				struct bset, flags, 5, 6);
+
+/* Sector offset within the btree node: */
+LE32_BITMASK(BSET_OFFSET,	struct bset, flags, 16, 32);
+
+struct btree_node {
+	struct bch_csum		csum;
+	__le64			magic;
+
+	/* this flags field is encrypted, unlike bset->flags: */
+	__le64			flags;
+
+	/* Closed interval: */
+	struct bpos		min_key;
+	struct bpos		max_key;
+	struct bch_extent_ptr	_ptr; /* not used anymore */
+	struct bkey_format	format;
+
+	union {
+	struct bset		keys;
+	struct {
+		__u8		pad[22];
+		__le16		u64s;
+		__u64		_data[0];
+
+	};
+	};
+} __packed __aligned(8);
+
+LE64_BITMASK(BTREE_NODE_ID_LO,	struct btree_node, flags,  0,  4);
+LE64_BITMASK(BTREE_NODE_LEVEL,	struct btree_node, flags,  4,  8);
+LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE,
+				struct btree_node, flags,  8,  9);
+LE64_BITMASK(BTREE_NODE_ID_HI,	struct btree_node, flags,  9, 25);
+/* 25-32 unused */
+LE64_BITMASK(BTREE_NODE_SEQ,	struct btree_node, flags, 32, 64);
+
+static inline __u64 BTREE_NODE_ID(struct btree_node *n)
+{
+	return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4);
+}
+
+static inline void SET_BTREE_NODE_ID(struct btree_node *n, __u64 v)
+{
+	SET_BTREE_NODE_ID_LO(n, v);
+	SET_BTREE_NODE_ID_HI(n, v >> 4);
+}
+
+struct btree_node_entry {
+	struct bch_csum		csum;
+
+	union {
+	struct bset		keys;
+	struct {
+		__u8		pad[22];
+		__le16		u64s;
+		__u64		_data[0];
+	};
+	};
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_FORMAT_H */
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
new file mode 100644
index 000000000000..f05881f7e113
--- /dev/null
+++ b/fs/bcachefs/bcachefs_ioctl.h
@@ -0,0 +1,368 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IOCTL_H
+#define _BCACHEFS_IOCTL_H
+
+#include <linux/uuid.h>
+#include <asm/ioctl.h>
+#include "bcachefs_format.h"
+
+/*
+ * Flags common to multiple ioctls:
+ */
+#define BCH_FORCE_IF_DATA_LOST		(1 << 0)
+#define BCH_FORCE_IF_METADATA_LOST	(1 << 1)
+#define BCH_FORCE_IF_DATA_DEGRADED	(1 << 2)
+#define BCH_FORCE_IF_METADATA_DEGRADED	(1 << 3)
+
+#define BCH_FORCE_IF_LOST			\
+	(BCH_FORCE_IF_DATA_LOST|		\
+	 BCH_FORCE_IF_METADATA_LOST)
+#define BCH_FORCE_IF_DEGRADED			\
+	(BCH_FORCE_IF_DATA_DEGRADED|		\
+	 BCH_FORCE_IF_METADATA_DEGRADED)
+
+/*
+ * If cleared, ioctls that refer to a device pass it as a pointer to a pathname
+ * (e.g. /dev/sda1); if set, the dev field is the device's index within the
+ * filesystem:
+ */
+#define BCH_BY_INDEX			(1 << 4)
+
+/*
+ * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem
+ * wide superblock:
+ */
+#define BCH_READ_DEV			(1 << 5)
+
+/* global control dev: */
+
+/* These are currently broken, and probably unnecessary: */
+#if 0
+#define BCH_IOCTL_ASSEMBLE	_IOW(0xbc, 1, struct bch_ioctl_assemble)
+#define BCH_IOCTL_INCREMENTAL	_IOW(0xbc, 2, struct bch_ioctl_incremental)
+
+struct bch_ioctl_assemble {
+	__u32			flags;
+	__u32			nr_devs;
+	__u64			pad;
+	__u64			devs[];
+};
+
+struct bch_ioctl_incremental {
+	__u32			flags;
+	__u64			pad;
+	__u64			dev;
+};
+#endif
+
+/* filesystem ioctls: */
+
+#define BCH_IOCTL_QUERY_UUID	_IOR(0xbc,	1,  struct bch_ioctl_query_uuid)
+
+/* These only make sense when we also have incremental assembly */
+#if 0
+#define BCH_IOCTL_START		_IOW(0xbc,	2,  struct bch_ioctl_start)
+#define BCH_IOCTL_STOP		_IO(0xbc,	3)
+#endif
+
+#define BCH_IOCTL_DISK_ADD	_IOW(0xbc,	4,  struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_REMOVE	_IOW(0xbc,	5,  struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_ONLINE	_IOW(0xbc,	6,  struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_OFFLINE	_IOW(0xbc,	7,  struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc,	8,  struct bch_ioctl_disk_set_state)
+#define BCH_IOCTL_DATA		_IOW(0xbc,	10, struct bch_ioctl_data)
+#define BCH_IOCTL_FS_USAGE	_IOWR(0xbc,	11, struct bch_ioctl_fs_usage)
+#define BCH_IOCTL_DEV_USAGE	_IOWR(0xbc,	11, struct bch_ioctl_dev_usage)
+#define BCH_IOCTL_READ_SUPER	_IOW(0xbc,	12, struct bch_ioctl_read_super)
+#define BCH_IOCTL_DISK_GET_IDX	_IOW(0xbc,	13,  struct bch_ioctl_disk_get_idx)
+#define BCH_IOCTL_DISK_RESIZE	_IOW(0xbc,	14,  struct bch_ioctl_disk_resize)
+#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15,  struct bch_ioctl_disk_resize_journal)
+
+#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc,	16,  struct bch_ioctl_subvolume)
+#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc,	17,  struct bch_ioctl_subvolume)
+
+/* ioctls below act on a particular file, not the filesystem as a whole: */
+
+#define BCHFS_IOC_REINHERIT_ATTRS	_IOR(0xbc, 64, const char __user *)
+
+/*
+ * BCH_IOCTL_QUERY_UUID: get filesystem UUID
+ *
+ * Returns user visible UUID, not internal UUID (which may not ever be changed);
+ * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
+ * this UUID.
+ */
+struct bch_ioctl_query_uuid {
+	__uuid_t		uuid;
+};
+
+#if 0
+struct bch_ioctl_start {
+	__u32			flags;
+	__u32			pad;
+};
+#endif
+
+/*
+ * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
+ *
+ * The specified device must not be open or in use. On success, the new device
+ * will be an online member of the filesystem just like any other member.
+ *
+ * The device must first be prepared by userspace by formatting with a bcachefs
+ * superblock, which is only used for passing in superblock options/parameters
+ * for that device (in struct bch_member). The new device's superblock should
+ * not claim to be a member of any existing filesystem - UUIDs on it will be
+ * ignored.
+ */
+
+/*
+ * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
+ *
+ * Any data present on @dev will be permanently deleted, and @dev will be
+ * removed from its slot in the filesystem's list of member devices. The device
+ * may be either online or offline.
+ *
+ * Will fail if removing @dev would leave us with insufficient read write
+ * devices or degraded/unavailable data, unless the appropriate BCH_FORCE_IF_*
+ * flags are set.
+ */
+
+/*
+ * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
+ * but is not open (e.g. because we started in degraded mode), bring it online
+ *
+ * all existing data on @dev will be available once the device is online,
+ * exactly as if @dev was present when the filesystem was first mounted
+ */
+
+/*
+ * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
+ * block device, without removing it from the filesystem (so it can be brought
+ * back online later)
+ *
+ * Data present on @dev will be unavailable while @dev is offline (unless
+ * replicated), but will still be intact and untouched if @dev is brought back
+ * online
+ *
+ * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
+ * leave us with insufficient read write devices or degraded/unavailable data,
+ * unless the appropriate BCH_FORCE_IF_* flags are set.
+ */
+
+struct bch_ioctl_disk {
+	__u32			flags;
+	__u32			pad;
+	__u64			dev;
+};
+
+/*
+ * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
+ *
+ * @new_state		- one of the bch_member_state states (rw, ro, failed,
+ *			  spare)
+ *
+ * Will refuse to change member state if we would then have insufficient devices
+ * to write to, or if it would result in degraded data (when @new_state is
+ * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
+ */
+struct bch_ioctl_disk_set_state {
+	__u32			flags;
+	__u8			new_state;
+	__u8			pad[3];
+	__u64			dev;
+};
+
+enum bch_data_ops {
+	BCH_DATA_OP_SCRUB		= 0,
+	BCH_DATA_OP_REREPLICATE		= 1,
+	BCH_DATA_OP_MIGRATE		= 2,
+	BCH_DATA_OP_REWRITE_OLD_NODES	= 3,
+	BCH_DATA_OP_NR			= 4,
+};
+
+/*
+ * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
+ * scrub, rereplicate, migrate).
+ *
+ * This ioctl kicks off a job in the background, and returns a file descriptor.
+ * Reading from the file descriptor returns a struct bch_ioctl_data_event,
+ * indicating current progress, and closing the file descriptor will stop the
+ * job. The file descriptor is O_CLOEXEC.
+ */
+struct bch_ioctl_data {
+	__u16			op;
+	__u8			start_btree;
+	__u8			end_btree;
+	__u32			flags;
+
+	struct bpos		start_pos;
+	struct bpos		end_pos;
+
+	union {
+	struct {
+		__u32		dev;
+		__u32		pad;
+	}			migrate;
+	struct {
+		__u64		pad[8];
+	};
+	};
+} __packed __aligned(8);
+
+enum bch_data_event {
+	BCH_DATA_EVENT_PROGRESS	= 0,
+	/* XXX: add an event for reporting errors */
+	BCH_DATA_EVENT_NR	= 1,
+};
+
+struct bch_ioctl_data_progress {
+	__u8			data_type;
+	__u8			btree_id;
+	__u8			pad[2];
+	struct bpos		pos;
+
+	__u64			sectors_done;
+	__u64			sectors_total;
+} __packed __aligned(8);
+
+struct bch_ioctl_data_event {
+	__u8			type;
+	__u8			pad[7];
+	union {
+	struct bch_ioctl_data_progress p;
+	__u64			pad2[15];
+	};
+} __packed __aligned(8);
+
+struct bch_replicas_usage {
+	__u64			sectors;
+	struct bch_replicas_entry r;
+} __packed;
+
+static inline struct bch_replicas_usage *
+replicas_usage_next(struct bch_replicas_usage *u)
+{
+	return (void *) u + replicas_entry_bytes(&u->r) + 8;
+}
+
+/*
+ * BCH_IOCTL_FS_USAGE: query filesystem disk space usage
+ *
+ * Returns disk space usage broken out by data type, number of replicas, and
+ * by component device
+ *
+ * @replica_entries_bytes - size, in bytes, allocated for replica usage entries
+ *
+ * On success, @replica_entries_bytes will be changed to indicate the number of
+ * bytes actually used.
+ *
+ * Returns -ERANGE if @replica_entries_bytes was too small
+ */
+struct bch_ioctl_fs_usage {
+	__u64			capacity;
+	__u64			used;
+	__u64			online_reserved;
+	__u64			persistent_reserved[BCH_REPLICAS_MAX];
+
+	__u32			replica_entries_bytes;
+	__u32			pad;
+
+	struct bch_replicas_usage replicas[0];
+};
+
+/*
+ * BCH_IOCTL_DEV_USAGE: query device disk space usage
+ *
+ * Returns disk space usage broken out by data type - both by buckets and
+ * sectors.
+ */
+struct bch_ioctl_dev_usage {
+	__u64			dev;
+	__u32			flags;
+	__u8			state;
+	__u8			pad[7];
+
+	__u32			bucket_size;
+	__u64			nr_buckets;
+
+	__u64			buckets_ec;
+
+	struct bch_ioctl_dev_usage_type {
+		__u64		buckets;
+		__u64		sectors;
+		__u64		fragmented;
+	}			d[BCH_DATA_NR];
+};
+
+/*
+ * BCH_IOCTL_READ_SUPER: read filesystem superblock
+ *
+ * Equivalent to reading the superblock directly from the block device, except
+ * avoids racing with the kernel writing the superblock or having to figure out
+ * which block device to read
+ *
+ * @sb		- buffer to read into
+ * @size	- size of userspace allocated buffer
+ * @dev		- device to read superblock for, if BCH_READ_DEV flag is
+ *		  specified
+ *
+ * Returns -ERANGE if buffer provided is too small
+ */
+struct bch_ioctl_read_super {
+	__u32			flags;
+	__u32			pad;
+	__u64			dev;
+	__u64			size;
+	__u64			sb;
+};
+
+/*
+ * BCH_IOCTL_DISK_GET_IDX: given a path to a block device, query filesystem to
+ * determine if disk is an (online) member - if so, returns device's index
+ *
+ * Returns -ENOENT if not found
+ */
+struct bch_ioctl_disk_get_idx {
+	__u64			dev;
+};
+
+/*
+ * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device
+ *
+ * @dev		- member to resize
+ * @nbuckets	- new number of buckets
+ */
+struct bch_ioctl_disk_resize {
+	__u32			flags;
+	__u32			pad;
+	__u64			dev;
+	__u64			nbuckets;
+};
+
+/*
+ * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device
+ *
+ * @dev		- member to resize
+ * @nbuckets	- new number of buckets
+ */
+struct bch_ioctl_disk_resize_journal {
+	__u32			flags;
+	__u32			pad;
+	__u64			dev;
+	__u64			nbuckets;
+};
+
+struct bch_ioctl_subvolume {
+	__u32			flags;
+	__u32			dirfd;
+	__u16			mode;
+	__u16			pad[3];
+	__u64			dst_ptr;
+	__u64			src_ptr;
+};
+
+#define BCH_SUBVOL_SNAPSHOT_CREATE	(1U << 0)
+#define BCH_SUBVOL_SNAPSHOT_RO		(1U << 1)
+
+#endif /* _BCACHEFS_IOCTL_H */
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
new file mode 100644
index 000000000000..abdb05507d16
--- /dev/null
+++ b/fs/bcachefs/bkey.c
@@ -0,0 +1,1120 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey.h"
+#include "bkey_cmp.h"
+#include "bkey_methods.h"
+#include "bset.h"
+#include "util.h"
+
+const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT;
+
+void bch2_bkey_packed_to_binary_text(struct printbuf *out,
+				     const struct bkey_format *f,
+				     const struct bkey_packed *k)
+{
+	const u64 *p = high_word(f, k);
+	unsigned word_bits = 64 - high_bit_offset;
+	unsigned nr_key_bits = bkey_format_key_bits(f) + high_bit_offset;
+	u64 v = *p & (~0ULL >> high_bit_offset);
+
+	if (!nr_key_bits) {
+		prt_str(out, "(empty)");
+		return;
+	}
+
+	while (1) {
+		unsigned next_key_bits = nr_key_bits;
+
+		if (nr_key_bits < 64) {
+			v >>= 64 - nr_key_bits;
+			next_key_bits = 0;
+		} else {
+			next_key_bits -= 64;
+		}
+
+		bch2_prt_u64_binary(out, v, min(word_bits, nr_key_bits));
+
+		if (!next_key_bits)
+			break;
+
+		prt_char(out, ' ');
+
+		p = next_word(p);
+		v = *p;
+		word_bits = 64;
+		nr_key_bits = next_key_bits;
+	}
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+static void bch2_bkey_pack_verify(const struct bkey_packed *packed,
+				  const struct bkey *unpacked,
+				  const struct bkey_format *format)
+{
+	struct bkey tmp;
+
+	BUG_ON(bkeyp_val_u64s(format, packed) !=
+	       bkey_val_u64s(unpacked));
+
+	BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed));
+
+	tmp = __bch2_bkey_unpack_key(format, packed);
+
+	if (memcmp(&tmp, unpacked, sizeof(struct bkey))) {
+		struct printbuf buf = PRINTBUF;
+
+		prt_printf(&buf, "keys differ: format u64s %u fields %u %u %u %u %u\n",
+		      format->key_u64s,
+		      format->bits_per_field[0],
+		      format->bits_per_field[1],
+		      format->bits_per_field[2],
+		      format->bits_per_field[3],
+		      format->bits_per_field[4]);
+
+		prt_printf(&buf, "compiled unpack: ");
+		bch2_bkey_to_text(&buf, unpacked);
+		prt_newline(&buf);
+
+		prt_printf(&buf, "c unpack:        ");
+		bch2_bkey_to_text(&buf, &tmp);
+		prt_newline(&buf);
+
+		prt_printf(&buf, "compiled unpack: ");
+		bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
+						(struct bkey_packed *) unpacked);
+		prt_newline(&buf);
+
+		prt_printf(&buf, "c unpack:        ");
+		bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
+						(struct bkey_packed *) &tmp);
+		prt_newline(&buf);
+
+		panic("%s", buf.buf);
+	}
+}
+
+#else
+static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed,
+					const struct bkey *unpacked,
+					const struct bkey_format *format) {}
+#endif
+
+struct pack_state {
+	const struct bkey_format *format;
+	unsigned		bits;	/* bits remaining in current word */
+	u64			w;	/* current word */
+	u64			*p;	/* pointer to next word */
+};
+
+__always_inline
+static struct pack_state pack_state_init(const struct bkey_format *format,
+					 struct bkey_packed *k)
+{
+	u64 *p = high_word(format, k);
+
+	return (struct pack_state) {
+		.format	= format,
+		.bits	= 64 - high_bit_offset,
+		.w	= 0,
+		.p	= p,
+	};
+}
+
+__always_inline
+static void pack_state_finish(struct pack_state *state,
+			      struct bkey_packed *k)
+{
+	EBUG_ON(state->p <  k->_data);
+	EBUG_ON(state->p >= (u64 *) k->_data + state->format->key_u64s);
+
+	*state->p = state->w;
+}
+
+struct unpack_state {
+	const struct bkey_format *format;
+	unsigned		bits;	/* bits remaining in current word */
+	u64			w;	/* current word */
+	const u64		*p;	/* pointer to next word */
+};
+
+__always_inline
+static struct unpack_state unpack_state_init(const struct bkey_format *format,
+					     const struct bkey_packed *k)
+{
+	const u64 *p = high_word(format, k);
+
+	return (struct unpack_state) {
+		.format	= format,
+		.bits	= 64 - high_bit_offset,
+		.w	= *p << high_bit_offset,
+		.p	= p,
+	};
+}
+
+__always_inline
+static u64 get_inc_field(struct unpack_state *state, unsigned field)
+{
+	unsigned bits = state->format->bits_per_field[field];
+	u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]);
+
+	if (bits >= state->bits) {
+		v = state->w >> (64 - bits);
+		bits -= state->bits;
+
+		state->p = next_word(state->p);
+		state->w = *state->p;
+		state->bits = 64;
+	}
+
+	/* avoid shift by 64 if bits is 0 - bits is never 64 here: */
+	v |= (state->w >> 1) >> (63 - bits);
+	state->w <<= bits;
+	state->bits -= bits;
+
+	return v + offset;
+}
+
+__always_inline
+static void __set_inc_field(struct pack_state *state, unsigned field, u64 v)
+{
+	unsigned bits = state->format->bits_per_field[field];
+
+	if (bits) {
+		if (bits > state->bits) {
+			bits -= state->bits;
+			/* avoid shift by 64 if bits is 64 - bits is never 0 here: */
+			state->w |= (v >> 1) >> (bits - 1);
+
+			*state->p = state->w;
+			state->p = next_word(state->p);
+			state->w = 0;
+			state->bits = 64;
+		}
+
+		state->bits -= bits;
+		state->w |= v << state->bits;
+	}
+}
+
+__always_inline
+static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
+{
+	unsigned bits = state->format->bits_per_field[field];
+	u64 offset = le64_to_cpu(state->format->field_offset[field]);
+
+	if (v < offset)
+		return false;
+
+	v -= offset;
+
+	if (fls64(v) > bits)
+		return false;
+
+	__set_inc_field(state, field, v);
+	return true;
+}
+
+/*
+ * Note: does NOT set out->format (we don't know what it should be here!)
+ *
+ * Also: doesn't work on extents - it doesn't preserve the invariant that
+ * if k is packed bkey_start_pos(k) will successfully pack
+ */
+static bool bch2_bkey_transform_key(const struct bkey_format *out_f,
+				   struct bkey_packed *out,
+				   const struct bkey_format *in_f,
+				   const struct bkey_packed *in)
+{
+	struct pack_state out_s = pack_state_init(out_f, out);
+	struct unpack_state in_s = unpack_state_init(in_f, in);
+	u64 *w = out->_data;
+	unsigned i;
+
+	*w = 0;
+
+	for (i = 0; i < BKEY_NR_FIELDS; i++)
+		if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i)))
+			return false;
+
+	/* Can't happen because the val would be too big to unpack: */
+	EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX);
+
+	pack_state_finish(&out_s, out);
+	out->u64s	= out_f->key_u64s + in->u64s - in_f->key_u64s;
+	out->needs_whiteout = in->needs_whiteout;
+	out->type	= in->type;
+
+	return true;
+}
+
+bool bch2_bkey_transform(const struct bkey_format *out_f,
+			struct bkey_packed *out,
+			const struct bkey_format *in_f,
+			const struct bkey_packed *in)
+{
+	if (!bch2_bkey_transform_key(out_f, out, in_f, in))
+		return false;
+
+	memcpy_u64s((u64 *) out + out_f->key_u64s,
+		    (u64 *) in + in_f->key_u64s,
+		    (in->u64s - in_f->key_u64s));
+	return true;
+}
+
+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format,
+			      const struct bkey_packed *in)
+{
+	struct unpack_state state = unpack_state_init(format, in);
+	struct bkey out;
+
+	EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
+	EBUG_ON(in->u64s < format->key_u64s);
+	EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
+	EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX);
+
+	out.u64s	= BKEY_U64s + in->u64s - format->key_u64s;
+	out.format	= KEY_FORMAT_CURRENT;
+	out.needs_whiteout = in->needs_whiteout;
+	out.type	= in->type;
+	out.pad[0]	= 0;
+
+#define x(id, field)	out.field = get_inc_field(&state, id);
+	bkey_fields()
+#undef x
+
+	return out;
+}
+
+#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
+struct bpos __bkey_unpack_pos(const struct bkey_format *format,
+				     const struct bkey_packed *in)
+{
+	struct unpack_state state = unpack_state_init(format, in);
+	struct bpos out;
+
+	EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
+	EBUG_ON(in->u64s < format->key_u64s);
+	EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
+
+	out.inode	= get_inc_field(&state, BKEY_FIELD_INODE);
+	out.offset	= get_inc_field(&state, BKEY_FIELD_OFFSET);
+	out.snapshot	= get_inc_field(&state, BKEY_FIELD_SNAPSHOT);
+
+	return out;
+}
+#endif
+
+/**
+ * bch2_bkey_pack_key -- pack just the key, not the value
+ * @out:	packed result
+ * @in:		key to pack
+ * @format:	format of packed result
+ *
+ * Returns: true on success, false on failure
+ */
+bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
+			const struct bkey_format *format)
+{
+	struct pack_state state = pack_state_init(format, out);
+	u64 *w = out->_data;
+
+	EBUG_ON((void *) in == (void *) out);
+	EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
+	EBUG_ON(in->format != KEY_FORMAT_CURRENT);
+
+	*w = 0;
+
+#define x(id, field)	if (!set_inc_field(&state, id, in->field)) return false;
+	bkey_fields()
+#undef x
+	pack_state_finish(&state, out);
+	out->u64s	= format->key_u64s + in->u64s - BKEY_U64s;
+	out->format	= KEY_FORMAT_LOCAL_BTREE;
+	out->needs_whiteout = in->needs_whiteout;
+	out->type	= in->type;
+
+	bch2_bkey_pack_verify(out, in, format);
+	return true;
+}
+
+/**
+ * bch2_bkey_unpack -- unpack the key and the value
+ * @b:		btree node of @src key (for packed format)
+ * @dst:	unpacked result
+ * @src:	packed input
+ */
+void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst,
+		      const struct bkey_packed *src)
+{
+	__bkey_unpack_key(b, &dst->k, src);
+
+	memcpy_u64s(&dst->v,
+		    bkeyp_val(&b->format, src),
+		    bkeyp_val_u64s(&b->format, src));
+}
+
+/**
+ * bch2_bkey_pack -- pack the key and the value
+ * @dst:	packed result
+ * @src:	unpacked input
+ * @format:	format of packed result
+ *
+ * Returns: true on success, false on failure
+ */
+bool bch2_bkey_pack(struct bkey_packed *dst, const struct bkey_i *src,
+		    const struct bkey_format *format)
+{
+	struct bkey_packed tmp;
+
+	if (!bch2_bkey_pack_key(&tmp, &src->k, format))
+		return false;
+
+	memmove_u64s((u64 *) dst + format->key_u64s,
+		     &src->v,
+		     bkey_val_u64s(&src->k));
+	memcpy_u64s_small(dst, &tmp, format->key_u64s);
+
+	return true;
+}
+
+__always_inline
+static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v)
+{
+	unsigned bits = state->format->bits_per_field[field];
+	u64 offset = le64_to_cpu(state->format->field_offset[field]);
+	bool ret = true;
+
+	EBUG_ON(v < offset);
+	v -= offset;
+
+	if (fls64(v) > bits) {
+		v = ~(~0ULL << bits);
+		ret = false;
+	}
+
+	__set_inc_field(state, field, v);
+	return ret;
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+static bool bkey_packed_successor(struct bkey_packed *out,
+				  const struct btree *b,
+				  struct bkey_packed k)
+{
+	const struct bkey_format *f = &b->format;
+	unsigned nr_key_bits = b->nr_key_bits;
+	unsigned first_bit, offset;
+	u64 *p;
+
+	EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
+
+	if (!nr_key_bits)
+		return false;
+
+	*out = k;
+
+	first_bit = high_bit_offset + nr_key_bits - 1;
+	p = nth_word(high_word(f, out), first_bit >> 6);
+	offset = 63 - (first_bit & 63);
+
+	while (nr_key_bits) {
+		unsigned bits = min(64 - offset, nr_key_bits);
+		u64 mask = (~0ULL >> (64 - bits)) << offset;
+
+		if ((*p & mask) != mask) {
+			*p += 1ULL << offset;
+			EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0);
+			return true;
+		}
+
+		*p &= ~mask;
+		p = prev_word(p);
+		nr_key_bits -= bits;
+		offset = 0;
+	}
+
+	return false;
+}
+
+static bool bkey_format_has_too_big_fields(const struct bkey_format *f)
+{
+	for (unsigned i = 0; i < f->nr_fields; i++) {
+		unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+		u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+		u64 packed_max = f->bits_per_field[i]
+			? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
+			: 0;
+		u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+		if (packed_max + field_offset < packed_max ||
+		    packed_max + field_offset > unpacked_max)
+			return true;
+	}
+
+	return false;
+}
+#endif
+
+/*
+ * Returns a packed key that compares <= in
+ *
+ * This is used in bset_search_tree(), where we need a packed pos in order to be
+ * able to compare against the keys in the auxiliary search tree - and it's
+ * legal to use a packed pos that isn't equivalent to the original pos,
+ * _provided_ it compares <= to the original pos.
+ */
+enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
+					   struct bpos in,
+					   const struct btree *b)
+{
+	const struct bkey_format *f = &b->format;
+	struct pack_state state = pack_state_init(f, out);
+	u64 *w = out->_data;
+#ifdef CONFIG_BCACHEFS_DEBUG
+	struct bpos orig = in;
+#endif
+	bool exact = true;
+	unsigned i;
+
+	/*
+	 * bch2_bkey_pack_key() will write to all of f->key_u64s, minus the 3
+	 * byte header, but pack_pos() won't if the len/version fields are big
+	 * enough - we need to make sure to zero them out:
+	 */
+	for (i = 0; i < f->key_u64s; i++)
+		w[i] = 0;
+
+	if (unlikely(in.snapshot <
+		     le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) {
+		if (!in.offset-- &&
+		    !in.inode--)
+			return BKEY_PACK_POS_FAIL;
+		in.snapshot	= KEY_SNAPSHOT_MAX;
+		exact = false;
+	}
+
+	if (unlikely(in.offset <
+		     le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) {
+		if (!in.inode--)
+			return BKEY_PACK_POS_FAIL;
+		in.offset	= KEY_OFFSET_MAX;
+		in.snapshot	= KEY_SNAPSHOT_MAX;
+		exact = false;
+	}
+
+	if (unlikely(in.inode <
+		     le64_to_cpu(f->field_offset[BKEY_FIELD_INODE])))
+		return BKEY_PACK_POS_FAIL;
+
+	if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode))) {
+		in.offset	= KEY_OFFSET_MAX;
+		in.snapshot	= KEY_SNAPSHOT_MAX;
+		exact = false;
+	}
+
+	if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset))) {
+		in.snapshot	= KEY_SNAPSHOT_MAX;
+		exact = false;
+	}
+
+	if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)))
+		exact = false;
+
+	pack_state_finish(&state, out);
+	out->u64s	= f->key_u64s;
+	out->format	= KEY_FORMAT_LOCAL_BTREE;
+	out->type	= KEY_TYPE_deleted;
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+	if (exact) {
+		BUG_ON(bkey_cmp_left_packed(b, out, &orig));
+	} else {
+		struct bkey_packed successor;
+
+		BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
+		BUG_ON(bkey_packed_successor(&successor, b, *out) &&
+		       bkey_cmp_left_packed(b, &successor, &orig) < 0 &&
+		       !bkey_format_has_too_big_fields(f));
+	}
+#endif
+
+	return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER;
+}
+
+void bch2_bkey_format_init(struct bkey_format_state *s)
+{
+	unsigned i;
+
+	for (i = 0; i < ARRAY_SIZE(s->field_min); i++)
+		s->field_min[i] = U64_MAX;
+
+	for (i = 0; i < ARRAY_SIZE(s->field_max); i++)
+		s->field_max[i] = 0;
+
+	/* Make sure we can store a size of 0: */
+	s->field_min[BKEY_FIELD_SIZE] = 0;
+}
+
+void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p)
+{
+	unsigned field = 0;
+
+	__bkey_format_add(s, field++, p.inode);
+	__bkey_format_add(s, field++, p.offset);
+	__bkey_format_add(s, field++, p.snapshot);
+}
+
+/*
+ * We don't want it to be possible for the packed format to represent fields
+ * bigger than a u64... that will cause confusion and issues (like with
+ * bkey_packed_successor())
+ */
+static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i,
+			     unsigned bits, u64 offset)
+{
+	unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+	u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+
+	bits = min(bits, unpacked_bits);
+
+	offset = bits == unpacked_bits ? 0 : min(offset, unpacked_max - ((1ULL << bits) - 1));
+
+	f->bits_per_field[i]	= bits;
+	f->field_offset[i]	= cpu_to_le64(offset);
+}
+
+struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s)
+{
+	unsigned i, bits = KEY_PACKED_BITS_START;
+	struct bkey_format ret = {
+		.nr_fields = BKEY_NR_FIELDS,
+	};
+	/* Build a format from the per-field min/max gathered in @s: */
+	for (i = 0; i < ARRAY_SIZE(s->field_min); i++) {
+		s->field_min[i] = min(s->field_min[i], s->field_max[i]); /* min > max: use max */
+		/* width = bits needed for (max - min), offset = min: */
+		set_format_field(&ret, i,
+				 fls64(s->field_max[i] - s->field_min[i]),
+				 s->field_min[i]);
+
+		bits += ret.bits_per_field[i];
+	}
+
+	/* allow for extent merging: */
+	if (ret.bits_per_field[BKEY_FIELD_SIZE]) {
+		unsigned b = min(4U, 32U - ret.bits_per_field[BKEY_FIELD_SIZE]);
+
+		ret.bits_per_field[BKEY_FIELD_SIZE] += b;
+		bits += b;
+	}
+	/* u64s needed for KEY_PACKED_BITS_START plus all field bits: */
+	ret.key_u64s = DIV_ROUND_UP(bits, 64);
+
+	/* if we have enough spare bits, round fields up to nearest byte */
+	bits = ret.key_u64s * 64 - bits;
+
+	for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) {
+		unsigned r = round_up(ret.bits_per_field[i], 8) -
+			ret.bits_per_field[i];
+
+		if (r <= bits) {
+			set_format_field(&ret, i,
+					 ret.bits_per_field[i] + r,
+					 le64_to_cpu(ret.field_offset[i]));
+			bits -= r;
+		}
+	}
+	/* debug builds: the result must pass bch2_bkey_format_invalid() */
+#ifdef CONFIG_BCACHEFS_DEBUG
+	{
+		struct printbuf buf = PRINTBUF;
+
+		BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf));
+		printbuf_exit(&buf);
+	}
+#endif
+	return ret;
+}
+
+/* Validate @f: 0 if ok, else -BCH_ERR_invalid with the reason in @err; @flags is unused. */
+int bch2_bkey_format_invalid(struct bch_fs *c, struct bkey_format *f,
+			     enum bkey_invalid_flags flags, struct printbuf *err)
+{
+	unsigned i, bits = KEY_PACKED_BITS_START;
+
+	if (f->nr_fields != BKEY_NR_FIELDS) {
+		prt_printf(err, "incorrect number of fields: got %u, should be %u",
+			   f->nr_fields, BKEY_NR_FIELDS);
+		return -BCH_ERR_invalid;
+	}
+
+	/* The packed format must not represent fields larger than the unpacked one: */
+	for (i = 0; i < f->nr_fields; i++) {
+		if (!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) {
+			unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+			unsigned packed_bits = f->bits_per_field[i];
+			u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+			u64 field_offset = le64_to_cpu(f->field_offset[i]);
+			u64 packed_max;
+
+			if (packed_bits > unpacked_bits) { /* also keeps the shift below defined */
+				prt_printf(err, "field %u too large: %u > %u bits",
+					   i, packed_bits, unpacked_bits);
+				return -BCH_ERR_invalid;
+			}
+			packed_max = packed_bits ? ~((~0ULL << 1) << (packed_bits - 1)) : 0;
+			if (packed_max + field_offset < packed_max ||
+			    packed_max + field_offset > unpacked_max) {
+				prt_printf(err, "field %u too large: %llu + %llu > %llu",
+					   i, packed_max, field_offset, unpacked_max);
+				return -BCH_ERR_invalid;
+			}
+		}
+		bits += f->bits_per_field[i];
+	}
+
+	if (f->key_u64s != DIV_ROUND_UP(bits, 64)) {
+		prt_printf(err, "incorrect key_u64s: got %u, should be %u",
+			   f->key_u64s, DIV_ROUND_UP(bits, 64));
+		return -BCH_ERR_invalid;
+	}
+
+	return 0;
+}
+
+/* Print @f as "u64s N fields " followed by comma separated bits:offset pairs. */
+void bch2_bkey_format_to_text(struct printbuf *out, const struct bkey_format *f)
+{
+	unsigned nr;
+
+	prt_printf(out, "u64s %u fields ", f->key_u64s);
+	for (nr = 0; nr < ARRAY_SIZE(f->bits_per_field); nr++) {
+		if (nr != 0)
+			prt_str(out, ", ");
+		prt_printf(out, "%u:%llu", f->bits_per_field[nr], le64_to_cpu(f->field_offset[nr]));
+	}
+}
+
+/*
+ * Most significant bit at which the key bits of @l_k and @r_k differ.
+ * Bits are indexed from 0 - return is [0, nr_key_bits); 0 also if none differ.
+ */
+__pure
+unsigned bch2_bkey_greatest_differing_bit(const struct btree *b,
+					  const struct bkey_packed *l_k,
+					  const struct bkey_packed *r_k)
+{
+	const u64 *l = high_word(&b->format, l_k);
+	const u64 *r = high_word(&b->format, r_k);
+	unsigned nr_key_bits = b->nr_key_bits;
+	unsigned word_bits = 64 - high_bit_offset;
+	u64 l_v, r_v;
+
+	EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
+
+	/* for big endian, skip past header */
+	l_v = *l & (~0ULL >> high_bit_offset);
+	r_v = *r & (~0ULL >> high_bit_offset);
+
+	while (nr_key_bits) {
+		if (nr_key_bits < word_bits) {
+			l_v >>= word_bits - nr_key_bits;
+			r_v >>= word_bits - nr_key_bits;
+			nr_key_bits = 0;
+		} else {
+			nr_key_bits -= word_bits;
+		}
+
+		if (l_v != r_v)
+			return fls64(l_v ^ r_v) - 1 + nr_key_bits;
+		if (!nr_key_bits)
+			break;	/* all key bits compared: don't load a word past them */
+		l = next_word(l);
+		r = next_word(r);
+		l_v = *l;
+		r_v = *r;
+		word_bits = 64;
+	}
+
+	return 0;
+}
+
+/*
+ * First set bit among the key bits of @k (0 is also returned if none is set)
+ * Bits are indexed from 0 - return is [0, nr_key_bits)
+ */
+__pure
+unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k)
+{
+	const u64 *p = high_word(&b->format, k);
+	unsigned nr_key_bits = b->nr_key_bits;
+	unsigned ret = 0, offset;
+
+	EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
+	/* skip whole words while more than 64 key bits remain: */
+	offset = nr_key_bits;
+	while (offset > 64) {
+		p = next_word(p);
+		offset -= 64;
+	}
+	/* bit position in *p at which the scan starts: */
+	offset = 64 - offset;
+
+	while (nr_key_bits) {
+		unsigned bits = nr_key_bits + offset < 64
+			? nr_key_bits
+			: 64 - offset;
+		/* select @bits bits of this word, starting at bit @offset: */
+		u64 mask = (~0ULL >> (64 - bits)) << offset;
+
+		if (*p & mask)
+			return ret + __ffs64(*p & mask) - offset;
+		/* nothing set in this word: account for its bits and step back */
+		p = prev_word(p);
+		nr_key_bits -= bits;
+		ret += bits;
+		offset = 0;
+	}
+
+	return 0;
+}
+
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
+
+#define I(_x)			(*(out)++ = (_x))	/* store one value at *out, then advance out */
+#define I1(i0)						I(i0)
+#define I2(i0, i1)		(I1(i0),		I(i1))
+#define I3(i0, i1, i2)		(I2(i0, i1),		I(i2))
+#define I4(i0, i1, i2, i3)	(I3(i0, i1, i2),	I(i3))
+#define I5(i0, i1, i2, i3, i4)	(I4(i0, i1, i2, i3),	I(i4))
+
+static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
+			      enum bch_bkey_fields field,
+			      unsigned dst_offset, unsigned dst_size,
+			      bool *eax_zeroed)
+{
+	unsigned bits = format->bits_per_field[field];
+	u64 offset = le64_to_cpu(format->field_offset[field]);
+	unsigned i, byte, bit_offset, align, shl, shr;
+	/* emits instruction bytes at @out that unpack @field; returns the advanced @out */
+	if (!bits && !offset) {
+		if (!*eax_zeroed) {
+			/* xor eax, eax */
+			I2(0x31, 0xc0);
+		}
+
+		*eax_zeroed = true;
+		goto set_field;
+	}
+
+	if (!bits) {
+		/* just return offset: */
+
+		switch (dst_size) {
+		case 8:
+			if (offset > S32_MAX) {
+				/* mov [rdi + dst_offset], offset */
+				I3(0xc7, 0x47, dst_offset);
+				memcpy(out, &offset, 4);
+				out += 4;
+
+				I3(0xc7, 0x47, dst_offset + 4);
+				memcpy(out, (void *) &offset + 4, 4);
+				out += 4;
+			} else {
+				/* mov [rdi + dst_offset], offset */
+				/* sign extended */
+				I4(0x48, 0xc7, 0x47, dst_offset);
+				memcpy(out, &offset, 4);
+				out += 4;
+			}
+			break;
+		case 4:
+			/* mov [rdi + dst_offset], offset */
+			I3(0xc7, 0x47, dst_offset);
+			memcpy(out, &offset, 4);
+			out += 4;
+			break;
+		default:
+			BUG();
+		}
+
+		return out;
+	}
+	/* bit position of the field: total key bits minus the widths of fields 0..@field */
+	bit_offset = format->key_u64s * 64;
+	for (i = 0; i <= field; i++)
+		bit_offset -= format->bits_per_field[i];
+
+	byte = bit_offset / 8;
+	bit_offset -= byte * 8;
+
+	*eax_zeroed = false;
+
+	if (bit_offset == 0 && bits == 8) {
+		/* movzx eax, BYTE PTR [rsi + imm8] */
+		I4(0x0f, 0xb6, 0x46, byte);
+	} else if (bit_offset == 0 && bits == 16) {
+		/* movzx eax, WORD PTR [rsi + imm8] */
+		I4(0x0f, 0xb7, 0x46, byte);
+	} else if (bit_offset + bits <= 32) {
+		align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
+		byte -= align;
+		bit_offset += align * 8;
+
+		BUG_ON(bit_offset + bits > 32);
+
+		/* mov eax, [rsi + imm8] */
+		I3(0x8b, 0x46, byte);
+
+		if (bit_offset) {
+			/* shr eax, imm8 */
+			I3(0xc1, 0xe8, bit_offset);
+		}
+
+		if (bit_offset + bits < 32) {
+			unsigned mask = ~0U >> (32 - bits);
+
+			/* and eax, imm32 */
+			I1(0x25);
+			memcpy(out, &mask, 4);
+			out += 4;
+		}
+	} else if (bit_offset + bits <= 64) {
+		align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7);
+		byte -= align;
+		bit_offset += align * 8;
+
+		BUG_ON(bit_offset + bits > 64);
+
+		/* mov rax, [rsi + imm8] */
+		I4(0x48, 0x8b, 0x46, byte);
+
+		shl = 64 - bit_offset - bits;
+		shr = bit_offset + shl;
+
+		if (shl) {
+			/* shl rax, imm8 */
+			I4(0x48, 0xc1, 0xe0, shl);
+		}
+
+		if (shr) {
+			/* shr rax, imm8 */
+			I4(0x48, 0xc1, 0xe8, shr);
+		}
+	} else {
+		align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
+		byte -= align;
+		bit_offset += align * 8;
+
+		BUG_ON(bit_offset + bits > 96);
+
+		/* mov rax, [rsi + byte] */
+		I4(0x48, 0x8b, 0x46, byte);
+
+		/* mov edx, [rsi + byte + 8] */
+		I3(0x8b, 0x56, byte + 8);
+
+		/* bits from next word: */
+		shr = bit_offset + bits - 64;
+		BUG_ON(shr > bit_offset);
+
+		/* shr rax, imm8 (by the number of bits taken from the next word) */
+		I4(0x48, 0xc1, 0xe8, shr);
+
+		/* shl rdx, imm8 */
+		I4(0x48, 0xc1, 0xe2, 64 - shr);
+
+		/* or rax, rdx */
+		I3(0x48, 0x09, 0xd0);
+
+		shr = bit_offset - shr;
+
+		if (shr) {
+			/* shr rax, imm8 */
+			I4(0x48, 0xc1, 0xe8, shr);
+		}
+	}
+
+	/* rax += offset: */
+	if (offset > S32_MAX) {
+		/* mov rdx, imm64 */
+		I2(0x48, 0xba);
+		memcpy(out, &offset, 8);
+		out += 8;
+		/* add %rdx, %rax */
+		I3(0x48, 0x01, 0xd0);
+	} else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) {
+		/* add rax, imm32 */
+		I2(0x48, 0x05);
+		memcpy(out, &offset, 4);
+		out += 4;
+	} else if (offset) {
+		/* add eax, imm32 */
+		I1(0x05);
+		memcpy(out, &offset, 4);
+		out += 4;
+	}
+set_field:
+	switch (dst_size) {
+	case 8:
+		/* mov [rdi + dst_offset], rax */
+		I4(0x48, 0x89, 0x47, dst_offset);
+		break;
+	case 4:
+		/* mov [rdi + dst_offset], eax */
+		I3(0x89, 0x47, dst_offset);
+		break;
+	default:
+		BUG();
+	}
+
+	return out;
+}
+
+int bch2_compile_bkey_format(const struct bkey_format *format, void *_out)
+{
+	bool eax_zeroed = false;
+	u8 *out = _out;
+	/* Emit the unpack routine for @format at @_out; returns the bytes written. */
+	/*
+	 * rdi: dst - unpacked key
+	 * rsi: src - packed key
+	 */
+
+	/* k->u64s, k->format, k->type */
+
+	/* mov eax, [rsi] */
+	I2(0x8b, 0x06);
+
+	/* add eax, BKEY_U64s - format->key_u64s */
+	I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0);
+
+	/* and eax, imm32: mask out k->pad: */
+	I5(0x25, 0xff, 0xff, 0xff, 0);
+
+	/* mov [rdi], eax */
+	I2(0x89, 0x07);
+
+#define x(id, field)							\
+	out = compile_bkey_field(format, out, id,			\
+				 offsetof(struct bkey, field),		\
+				 sizeof(((struct bkey *) NULL)->field),	\
+				 &eax_zeroed);
+	bkey_fields()
+#undef x
+
+	/* retq */
+	I1(0xc3);
+
+	return (void *) out - _out;
+}
+
+#else
+#endif
+
+__pure
+int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l,
+					  const struct bkey_packed *r,
+					  const struct btree *b)
+{
+	return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b); /* out-of-line entry point */
+}
+
+__pure __flatten
+int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b,
+					       const struct bkey_packed *l,
+					       const struct bpos *r)
+{
+	return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r); /* unpack @l's pos, then compare */
+}
+
+__pure __flatten
+int bch2_bkey_cmp_packed(const struct btree *b,
+			 const struct bkey_packed *l,
+			 const struct bkey_packed *r)
+{
+	return bch2_bkey_cmp_packed_inlined(b, l, r); /* out-of-line entry point */
+}
+
+__pure __flatten
+int __bch2_bkey_cmp_left_packed(const struct btree *b,
+				const struct bkey_packed *l,
+				const struct bpos *r)
+{
+	const struct bkey *l_unpacked = packed_to_bkey_c(l);
+	/* non-NULL means @l is stored unpacked and can be compared directly */
+	if (unlikely(l_unpacked))
+		return bpos_cmp(l_unpacked->p, *r);
+	return __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
+}
+
+/*
+ * Reverse the byte order of the whole struct bpos in place.
+ */
+void bch2_bpos_swab(struct bpos *p)
+{
+	u8 *lo = (u8 *) p;
+	u8 *hi = lo + sizeof(*p) - 1;
+
+	for (; lo < hi; lo++, hi--)
+		swap(*lo, *hi);
+}
+
+/*
+ * Reverse the bytes from k->key_start to the end of the key's key_u64s words;
+ * unpacked keys use bch2_bkey_format_current instead of @_f.
+ */
+void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k)
+{
+	const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current;
+	u8 *lo = k->key_start;
+	u8 *hi = (u8 *) (k->_data + f->key_u64s) - 1;
+	for (; lo < hi; lo++, hi--)
+		swap(*lo, *hi);
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_bkey_pack_test(void)
+{
+	struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0);
+	struct bkey_packed p;
+	/* Self test: walk a fixed key field by field, then pack it with test_format. */
+	struct bkey_format test_format = {
+		.key_u64s	= 3,
+		.nr_fields	= BKEY_NR_FIELDS,
+		.bits_per_field = {
+			13,
+			64,
+			32,
+		},
+	};
+	/* read @t through the current (unpacked) format, write through test_format: */
+	struct unpack_state in_s =
+		unpack_state_init(&bch2_bkey_format_current, (void *) &t);
+	struct pack_state out_s = pack_state_init(&test_format, &p);
+	unsigned i;
+
+	for (i = 0; i < out_s.format->nr_fields; i++) {
+		u64 a, v = get_inc_field(&in_s, i);
+
+		switch (i) {
+#define x(id, field)	case id: a = t.field; break;
+	bkey_fields()
+#undef x
+		default:
+			BUG();
+		}
+
+		if (a != v)
+			panic("got %llu actual %llu i %u\n", v, a, i);
+
+		if (!set_inc_field(&out_s, i, v))
+			panic("failed at %u\n", i);
+	}
+	/* packing the whole key in one call must succeed as well */
+	BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format));
+}
+#endif
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
new file mode 100644
index 000000000000..518450209236
--- /dev/null
+++ b/fs/bcachefs/bkey.h
@@ -0,0 +1,782 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_H
+#define _BCACHEFS_BKEY_H
+
+#include <linux/bug.h>
+#include "bcachefs_format.h"
+
+#include "btree_types.h"
+#include "util.h"
+#include "vstructs.h"
+
+enum bkey_invalid_flags {
+	BKEY_INVALID_WRITE		= (1U << 0),
+	BKEY_INVALID_COMMIT		= (1U << 1),
+	BKEY_INVALID_JOURNAL		= (1U << 2),
+};
+
+#if 0
+
+/*
+ * compiled unpack functions are disabled, pending a new interface for
+ * dynamically allocating executable memory:
+ */
+
+#ifdef CONFIG_X86_64
+#define HAVE_BCACHEFS_COMPILED_UNPACK	1
+#endif
+#endif
+
+void bch2_bkey_packed_to_binary_text(struct printbuf *,
+				     const struct bkey_format *,
+				     const struct bkey_packed *);
+
+/* bkey with split value, const */
+struct bkey_s_c {
+	const struct bkey	*k;	/* the key */
+	const struct bch_val	*v;	/* its value, held as a separate pointer */
+};
+
+/* bkey with split value */
+struct bkey_s {
+	union {
+	struct {
+		struct bkey	*k;	/* the key */
+		struct bch_val	*v;	/* its value, held as a separate pointer */
+	};
+	struct bkey_s_c		s_c;	/* const view overlaying the same two pointers */
+	};
+};
+
+#define bkey_p_next(_k)		vstruct_next(_k)
+
+static inline struct bkey_i *bkey_next(struct bkey_i *k)
+{	/* the following key starts k->k.u64s 64-bit words past k->_data */
+	return (struct bkey_i *) &((u64 *) k->_data)[k->k.u64s];
+}
+
+#define bkey_val_u64s(_k)	((_k)->u64s - BKEY_U64s)
+
+static inline size_t bkey_val_bytes(const struct bkey *k)
+{	/* value length in bytes: value u64s times sizeof(u64) */
+	return sizeof(u64) * bkey_val_u64s(k);
+}
+
+/* Set the key's total size for a value of @val_u64s words; must fit in a u8. */
+static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
+{
+	unsigned total = val_u64s + BKEY_U64s;
+	BUG_ON(total > U8_MAX);
+	k->u64s = total;
+}
+
+static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
+{	/* round the byte count up to whole u64s */
+	set_bkey_val_u64s(k, (bytes + sizeof(u64) - 1) / sizeof(u64));
+}
+
+#define bkey_val_end(_k)	((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
+
+#define bkey_deleted(_k)	((_k)->type == KEY_TYPE_deleted)
+
+#define bkey_whiteout(_k)				\
+	((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
+
+enum bkey_lr_packed {
+	BKEY_PACKED_BOTH,
+	BKEY_PACKED_RIGHT,
+	BKEY_PACKED_LEFT,
+	BKEY_PACKED_NONE,
+};
+/* NOTE(review): presumably ->format is 0 (packed) or 1, yielding the values above - confirm */
+#define bkey_lr_packed(_l, _r)						\
+	((_l)->format + ((_r)->format << 1))
+
+#define bkey_copy(_dst, _src)					\
+do {								\
+	BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) &&		\
+		     !type_is(_dst, struct bkey_packed *));	\
+	BUILD_BUG_ON(!type_is(_src, struct bkey_i *) &&		\
+		     !type_is(_src, struct bkey_packed *));	\
+	EBUG_ON((u64 *) (_dst) > (u64 *) (_src) &&		\
+		(u64 *) (_dst) < (u64 *) (_src) +		\
+		((struct bkey *) (_src))->u64s);		\
+	/* _dst must not start inside _src (checked above) */	\
+	memcpy_u64s_small((_dst), (_src),			\
+			  ((struct bkey *) (_src))->u64s);	\
+} while (0)
+
+struct btree;
+
+__pure
+unsigned bch2_bkey_greatest_differing_bit(const struct btree *,
+					  const struct bkey_packed *,
+					  const struct bkey_packed *);
+__pure
+unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *);
+
+__pure
+int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *,
+				     const struct bkey_packed *,
+				     const struct btree *);
+
+__pure
+int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *,
+					  const struct bkey_packed *,
+					  const struct bpos *);
+
+__pure
+int bch2_bkey_cmp_packed(const struct btree *,
+			 const struct bkey_packed *,
+			 const struct bkey_packed *);
+
+__pure
+int __bch2_bkey_cmp_left_packed(const struct btree *,
+				const struct bkey_packed *,
+				const struct bpos *);
+
+static inline __pure
+int bkey_cmp_left_packed(const struct btree *b,
+			 const struct bkey_packed *l, const struct bpos *r)
+{
+	return __bch2_bkey_cmp_left_packed(b, l, r);	/* handles packed and unpacked @l */
+}
+
+/*
+ * The compiler generates better code when we pass bpos by ref, but it's often
+ * terribly convenient to pass it by val... as much as I hate c++, const
+ * ref would be nice here:
+ */
+__pure __flatten
+static inline int bkey_cmp_left_packed_byval(const struct btree *b,
+					     const struct bkey_packed *l,
+					     struct bpos r)
+{
+	return bkey_cmp_left_packed(b, l, &r);
+}
+
+static __always_inline bool bpos_eq(struct bpos l, struct bpos r)
+{	/* equal only if inode, offset and snapshot all match */
+	return l.inode == r.inode &&
+	       l.offset == r.offset &&
+	       l.snapshot == r.snapshot;
+}
+
+static __always_inline bool bpos_lt(struct bpos l, struct bpos r)
+{	/* lexicographic order on (inode, offset, snapshot) */
+	if (l.inode != r.inode)
+		return l.inode < r.inode;
+	return l.offset != r.offset ? l.offset < r.offset : l.snapshot < r.snapshot;
+}
+
+static __always_inline bool bpos_le(struct bpos l, struct bpos r)
+{	/* lexicographic order on (inode, offset, snapshot); equal positions are true */
+	if (l.inode != r.inode)
+		return l.inode < r.inode;
+	return l.offset != r.offset ? l.offset < r.offset : l.snapshot <= r.snapshot;
+}
+
+static __always_inline bool bpos_gt(struct bpos l, struct bpos r)
+{	/* the order is total, so "greater" is exactly "not less-or-equal" */
+	return !bpos_le(l, r);
+}
+
+static __always_inline bool bpos_ge(struct bpos l, struct bpos r)
+{	/* the order is total, so "greater-or-equal" is exactly "not less" */
+	return !bpos_lt(l, r);
+}
+
+static __always_inline int bpos_cmp(struct bpos l, struct bpos r)
+{	/* three-way compare: the first differing of inode, offset, snapshot decides */
+	int cmp = cmp_int(l.inode, r.inode);
+	if (!cmp) cmp = cmp_int(l.offset, r.offset);
+	return cmp ? cmp : cmp_int(l.snapshot, r.snapshot);
+}
+
+static inline struct bpos bpos_min(struct bpos l, struct bpos r)
+{	/* @r is returned when the two positions are equal */
+	return bpos_le(r, l) ? r : l;
+}
+
+static inline struct bpos bpos_max(struct bpos l, struct bpos r)
+{	/* @r is returned when the two positions are equal */
+	return bpos_le(l, r) ? r : l;
+}
+
+static __always_inline bool bkey_eq(struct bpos l, struct bpos r)
+{	/* snapshot is ignored: only inode and offset are compared */
+	return l.inode == r.inode &&
+	       l.offset == r.offset;
+}
+
+static __always_inline bool bkey_lt(struct bpos l, struct bpos r)
+{	/* order by (inode, offset); snapshot does not take part */
+	if (l.inode == r.inode)
+		return l.offset < r.offset;
+	return l.inode < r.inode;
+}
+
+static __always_inline bool bkey_le(struct bpos l, struct bpos r)
+{	/* order by (inode, offset); snapshot does not take part */
+	if (l.inode == r.inode)
+		return l.offset <= r.offset;
+	return l.inode < r.inode;
+}
+
+static __always_inline bool bkey_gt(struct bpos l, struct bpos r)
+{	/* "greater" is exactly "not less-or-equal" on (inode, offset) */
+	return !bkey_le(l, r);
+}
+
+static __always_inline bool bkey_ge(struct bpos l, struct bpos r)
+{	/* "greater-or-equal" is exactly "not less" on (inode, offset) */
+	return !bkey_lt(l, r);
+}
+
+static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
+{	/* three-way compare on (inode, offset) only */
+	int cmp = cmp_int(l.inode, r.inode);
+	return cmp ? cmp : cmp_int(l.offset, r.offset);
+}
+
+static inline struct bpos bkey_min(struct bpos l, struct bpos r)
+{	/* @r is returned when inode and offset are equal */
+	return bkey_le(r, l) ? r : l;
+}
+
+static inline struct bpos bkey_max(struct bpos l, struct bpos r)
+{	/* @r is returned when inode and offset are equal */
+	return bkey_le(l, r) ? r : l;
+}
+
+void bch2_bpos_swab(struct bpos *);
+void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
+
+static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
+{	/* three-way compare: .hi first, .lo breaks ties */
+	int cmp = cmp_int(l.hi, r.hi);
+	return cmp ? cmp : cmp_int(l.lo, r.lo);
+}
+
+#define ZERO_VERSION	((struct bversion) { .hi = 0, .lo = 0 })
+#define MAX_VERSION	((struct bversion) { .hi = ~0, .lo = ~0ULL })
+
+static __always_inline int bversion_zero(struct bversion v)
+{
+	return !bversion_cmp(v, ZERO_VERSION);	/* nonzero iff @v compares equal to ZERO_VERSION */
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+/* statement expressions confusing unlikely()? */
+#define bkey_packed(_k)							\
+	({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT);			\
+	 (_k)->format != KEY_FORMAT_CURRENT; })
+#else
+#define bkey_packed(_k)		((_k)->format != KEY_FORMAT_CURRENT)
+#endif
+
+/*
+ * It's safe to treat an unpacked bkey as a packed one, but not the reverse
+ */
+static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k)
+{
+	return (struct bkey_packed *) k;	/* plain pointer cast, nothing is converted */
+}
+
+static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k)
+{
+	return (const struct bkey_packed *) k;	/* const variant of bkey_to_packed() */
+}
+
+static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k)
+{	/* NULL unless @k is actually stored in the unpacked format */
+	return !bkey_packed(k) ? (struct bkey_i *) k : NULL;
+}
+
+static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k)
+{	/* NULL unless @k is actually stored in the unpacked format */
+	return !bkey_packed(k) ? (const struct bkey *) k : NULL;
+}
+
+static inline unsigned bkey_format_key_bits(const struct bkey_format *format)
+{	/* bits taken by the packed position: inode + offset + snapshot widths */
+	unsigned nr = format->bits_per_field[BKEY_FIELD_INODE];
+	nr += format->bits_per_field[BKEY_FIELD_OFFSET];
+	return nr + format->bits_per_field[BKEY_FIELD_SNAPSHOT];
+}
+
+/* Next position in (inode, offset, snapshot) order; BUG()s on wrapping past the maximum. */
+static inline struct bpos bpos_successor(struct bpos p)
+{
+	if (++p.snapshot)
+		return p;
+	if (!++p.offset && !++p.inode)
+		BUG();
+	return p;
+}
+
+/* Previous position in (inode, offset, snapshot) order; BUG()s when @p is all zeroes. */
+static inline struct bpos bpos_predecessor(struct bpos p)
+{
+	if (p.snapshot--)
+		return p;
+	if (!p.offset-- && !p.inode--)
+		BUG();
+	return p;
+}
+
+/* Next (inode, offset) with snapshot cleared; BUG()s on wrapping past the maximum. */
+static inline struct bpos bpos_nosnap_successor(struct bpos p)
+{
+	p.snapshot = 0;
+	p.offset++;
+	if (!p.offset && !++p.inode)
+		BUG();
+
+	return p;
+}
+
+/* Previous (inode, offset) with snapshot cleared; BUG()s if both are already zero. */
+static inline struct bpos bpos_nosnap_predecessor(struct bpos p)
+{
+	bool offset_was_zero = !p.offset;
+	p.snapshot = 0;
+	p.offset--;
+	if (offset_was_zero && !p.inode--)
+		BUG();
+	return p;
+}
+
+static inline u64 bkey_start_offset(const struct bkey *k)
+{
+	return k->p.offset - k->size;	/* p.offset moved back by the key's size */
+}
+
+static inline struct bpos bkey_start_pos(const struct bkey *k)
+{	/* same inode and snapshot as @k; offset moved back by k->size */
+	return (struct bpos) {
+		.snapshot	= k->p.snapshot,
+		.offset		= bkey_start_offset(k),
+		.inode		= k->p.inode,
+	};
+}
+
+/* Packed helpers */
+
+static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
+				      const struct bkey_packed *k)
+{
+	unsigned key_u64s = BKEY_U64s;	/* size used when @k is not packed */
+	if (bkey_packed(k))
+		key_u64s = format->key_u64s;
+	EBUG_ON(k->u64s < key_u64s);
+	return key_u64s;
+}
+
+static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
+				       const struct bkey_packed *k)
+{	/* key part of @k in bytes */
+	return sizeof(u64) * bkeyp_key_u64s(format, k);
+}
+
+static inline unsigned bkeyp_val_u64s(const struct bkey_format *format,
+				      const struct bkey_packed *k)
+{
+	return k->u64s - bkeyp_key_u64s(format, k);	/* total u64s minus the key part */
+}
+
+static inline size_t bkeyp_val_bytes(const struct bkey_format *format,
+				     const struct bkey_packed *k)
+{	/* value part of @k in bytes */
+	return sizeof(u64) * bkeyp_val_u64s(format, k);
+}
+
+static inline void set_bkeyp_val_u64s(const struct bkey_format *format,
+				      struct bkey_packed *k, unsigned val_u64s)
+{
+	k->u64s = bkeyp_key_u64s(format, k) + val_u64s;	/* key part plus the new value size */
+}
+
+#define bkeyp_val(_format, _k)						\
+	 ((struct bch_val *) ((u64 *) (_k)->_data + bkeyp_key_u64s(_format, _k)))
+
+extern const struct bkey_format bch2_bkey_format_current;
+
+bool bch2_bkey_transform(const struct bkey_format *,
+			 struct bkey_packed *,
+			 const struct bkey_format *,
+			 const struct bkey_packed *);
+
+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
+				   const struct bkey_packed *);
+
+#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
+struct bpos __bkey_unpack_pos(const struct bkey_format *,
+			      const struct bkey_packed *);
+#endif
+
+bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *,
+		   const struct bkey_format *);
+
+enum bkey_pack_pos_ret {
+	BKEY_PACK_POS_EXACT,
+	BKEY_PACK_POS_SMALLER,
+	BKEY_PACK_POS_FAIL,
+};
+
+enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos,
+					   const struct btree *);
+
+static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in,
+				 const struct btree *b)
+{
+	return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT; /* true only if exact */
+}
+
+void bch2_bkey_unpack(const struct btree *, struct bkey_i *,
+		 const struct bkey_packed *);
+bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *,
+	       const struct bkey_format *);
+
+typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
+
+static inline void
+__bkey_unpack_key_format_checked(const struct btree *b,
+			       struct bkey *dst,
+			       const struct bkey_packed *src)
+{
+	if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) {
+		compiled_unpack_fn unpack_fn = b->aux_data;
+		unpack_fn(dst, src);
+		/* debug: the generic unpack must produce the same key */
+		if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+		    bch2_expensive_debug_checks) {
+			struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
+
+			BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
+		}
+	} else {
+		*dst = __bch2_bkey_unpack_key(&b->format, src);
+	}
+}
+
+static inline struct bkey
+bkey_unpack_key_format_checked(const struct btree *b,
+			       const struct bkey_packed *src)
+{
+	struct bkey dst;
+	/* by-value variant of __bkey_unpack_key_format_checked() */
+	__bkey_unpack_key_format_checked(b, &dst, src);
+	return dst;
+}
+
+static inline void __bkey_unpack_key(const struct btree *b,
+				     struct bkey *dst,
+				     const struct bkey_packed *src)
+{	/* keys already in the unpacked format are simply copied */
+	if (unlikely(!bkey_packed(src)))
+		*dst = *packed_to_bkey_c(src);
+	else
+		__bkey_unpack_key_format_checked(b, dst, src);
+}
+
+/**
+ * bkey_unpack_key -- return the key part of @src unpacked, not the value
+ */
+static inline struct bkey bkey_unpack_key(const struct btree *b,
+					  const struct bkey_packed *src)
+{
+	if (unlikely(!bkey_packed(src)))
+		return *packed_to_bkey_c(src);
+	return bkey_unpack_key_format_checked(b, src);
+}
+
+static inline struct bpos
+bkey_unpack_pos_format_checked(const struct btree *b,
+			       const struct bkey_packed *src)
+{
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
+	return bkey_unpack_key_format_checked(b, src).p;	/* unpack whole key, keep .p */
+#else
+	return __bkey_unpack_pos(&b->format, src);	/* position-only unpack */
+#endif
+}
+
+static inline struct bpos bkey_unpack_pos(const struct btree *b,
+					  const struct bkey_packed *src)
+{
+	if (unlikely(!bkey_packed(src)))
+		return packed_to_bkey_c(src)->p;
+	return bkey_unpack_pos_format_checked(b, src);
+}
+
+/* Disassembled bkeys */
+
+static inline struct bkey_s_c bkey_disassemble(const struct btree *b,
+					       const struct bkey_packed *k,
+					       struct bkey *u)
+{
+	__bkey_unpack_key(b, u, k);
+	/* @u receives the unpacked key; the value pointer is taken from @k */
+	return (struct bkey_s_c) { .k = u, .v = bkeyp_val(&b->format, k), };
+}
+
+/* non const version: */
+static inline struct bkey_s __bkey_disassemble(const struct btree *b,
+					       struct bkey_packed *k,
+					       struct bkey *u)
+{
+	__bkey_unpack_key(b, u, k);
+	/* @u receives the unpacked key; the value pointer is taken from @k */
+	return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
+}
+
+static inline u64 bkey_field_max(const struct bkey_format *f,
+				 enum bch_bkey_fields nr)
+{	/* largest value @f can encode in field @nr: offset + all-ones of its width */
+	if (f->bits_per_field[nr] >= 64)
+		return U64_MAX;
+	return le64_to_cpu(f->field_offset[nr]) +
+		~(~0ULL << f->bits_per_field[nr]);
+}
+
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
+
+int bch2_compile_bkey_format(const struct bkey_format *, void *);
+
+#else
+
+static inline int bch2_compile_bkey_format(const struct bkey_format *format,
+					  void *out) { return 0; }
+
+#endif
+
+static inline void bkey_reassemble(struct bkey_i *dst,
+				   struct bkey_s_c src)
+{
+	dst->k = *src.k;	/* copy the key itself */
+	memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));	/* then the value's u64s */
+}
+
+#define bkey_s_null		((struct bkey_s)   { .k = NULL })
+#define bkey_s_c_null		((struct bkey_s_c) { .k = NULL })
+
+#define bkey_s_err(err)		((struct bkey_s)   { .k = ERR_PTR(err) })
+#define bkey_s_c_err(err)	((struct bkey_s_c) { .k = ERR_PTR(err) })
+
+static inline struct bkey_s bkey_to_s(struct bkey *k)
+{
+	return (struct bkey_s) { .k = k, .v = NULL };	/* key only: value pointer is NULL */
+}
+
+static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
+{
+	return (struct bkey_s_c) { .k = k, .v = NULL };	/* key only: value pointer is NULL */
+}
+
+static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
+{
+	return (struct bkey_s) { .k = &k->k, .v = &k->v };	/* both pointers refer into @k */
+}
+
+static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
+{
+	return (struct bkey_s_c) { .k = &k->k, .v = &k->v };	/* both pointers refer into @k */
+}
+
+/*
+ * For a given type of value (e.g. struct bch_extent), generates the types for
+ * bkey + bch_extent - inline (bkey_i_*), split (bkey_s_*) and split const
+ * (bkey_s_c_*) - plus the conversion functions between them and a
+ * bkey_*_init() helper. With debug checks enabled the conversions also verify
+ * (EBUG_ON) that the key really is of type KEY_TYPE_<name>.
+ *
+ * Upcasting - e.g. from a bkey_i_extent to a bkey_i - is always safe, so it is
+ * done through the anonymous unions rather than through conversion functions.
+ */
+#define x(name, ...)					\
+struct bkey_i_##name {							\
+	union {								\
+		struct bkey		k;				\
+		struct bkey_i		k_i;				\
+	};								\
+	struct bch_##name		v;				\
+};									\
+									\
+struct bkey_s_c_##name {						\
+	union {								\
+	struct {							\
+		const struct bkey	*k;				\
+		const struct bch_##name	*v;				\
+	};								\
+	struct bkey_s_c			s_c;				\
+	};								\
+};									\
+									\
+struct bkey_s_##name {							\
+	union {								\
+	struct {							\
+		struct bkey		*k;				\
+		struct bch_##name	*v;				\
+	};								\
+	struct bkey_s_c_##name		c;				\
+	struct bkey_s			s;				\
+	struct bkey_s_c			s_c;				\
+	};								\
+};									\
+									\
+static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k)	\
+{									\
+	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
+	return container_of(&k->k, struct bkey_i_##name, k);		\
+}									\
+									\
+static inline const struct bkey_i_##name *				\
+bkey_i_to_##name##_c(const struct bkey_i *k)				\
+{									\
+	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
+	return container_of(&k->k, struct bkey_i_##name, k);		\
+}									\
+									\
+static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k)	\
+{									\
+	EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name);	\
+	return (struct bkey_s_##name) {					\
+		.k = k.k,						\
+		.v = container_of(k.v, struct bch_##name, v),		\
+	};								\
+}									\
+									\
+static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
+{									\
+	EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name);	\
+	return (struct bkey_s_c_##name) {				\
+		.k = k.k,						\
+		.v = container_of(k.v, struct bch_##name, v),		\
+	};								\
+}									\
+									\
+static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
+{									\
+	return (struct bkey_s_##name) {					\
+		.k = &k->k,						\
+		.v = &k->v,						\
+	};								\
+}									\
+									\
+static inline struct bkey_s_c_##name					\
+name##_i_to_s_c(const struct bkey_i_##name *k)				\
+{									\
+	return (struct bkey_s_c_##name) {				\
+		.k = &k->k,						\
+		.v = &k->v,						\
+	};								\
+}									\
+									\
+static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k)	\
+{									\
+	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
+	return (struct bkey_s_##name) {					\
+		.k = &k->k,						\
+		.v = container_of(&k->v, struct bch_##name, v),		\
+	};								\
+}									\
+									\
+static inline struct bkey_s_c_##name					\
+bkey_i_to_s_c_##name(const struct bkey_i *k)				\
+{									\
+	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
+	return (struct bkey_s_c_##name) {				\
+		.k = &k->k,						\
+		.v = container_of(&k->v, struct bch_##name, v),		\
+	};								\
+}									\
+									\
+static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
+{									\
+	struct bkey_i_##name *k =					\
+		container_of(&_k->k, struct bkey_i_##name, k);		\
+									\
+	bkey_init(&k->k);						\
+	memset(&k->v, 0, sizeof(k->v));					\
+	k->k.type = KEY_TYPE_##name;					\
+	set_bkey_val_bytes(&k->k, sizeof(k->v));			\
+									\
+	return k;							\
+}
+
+BCH_BKEY_TYPES();
+#undef x
+
+/* byte order helpers */
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+
+static inline unsigned high_word_offset(const struct bkey_format *f)
+{
+	return f->key_u64s - 1;	/* little endian: the high word is the last key u64 */
+}
+
+#define high_bit_offset		0
+#define nth_word(p, n)		((p) - (n))
+
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+
+static inline unsigned high_word_offset(const struct bkey_format *f)
+{
+	return 0;	/* big endian: the high word is the first key u64 */
+}
+
+#define high_bit_offset		KEY_PACKED_BITS_START
+#define nth_word(p, n)		((p) + (n))
+
+#else
+#error edit for your odd byteorder.
+#endif
+
+#define high_word(f, k)		((u64 *) (k)->_data + high_word_offset(f))
+#define next_word(p)		nth_word(p, 1)
+#define prev_word(p)		nth_word(p, -1)
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_bkey_pack_test(void);
+#else
+static inline void bch2_bkey_pack_test(void) {}
+#endif
+
+#define bkey_fields()							\
+	x(BKEY_FIELD_INODE,		p.inode)			\
+	x(BKEY_FIELD_OFFSET,		p.offset)			\
+	x(BKEY_FIELD_SNAPSHOT,		p.snapshot)			\
+	x(BKEY_FIELD_SIZE,		size)				\
+	x(BKEY_FIELD_VERSION_HI,	version.hi)			\
+	x(BKEY_FIELD_VERSION_LO,	version.lo)
+
+struct bkey_format_state {
+	u64 field_min[BKEY_NR_FIELDS];	/* smallest value added so far, per field */
+	u64 field_max[BKEY_NR_FIELDS];	/* largest value added so far, per field */
+};
+
+void bch2_bkey_format_init(struct bkey_format_state *);
+
+static inline void __bkey_format_add(struct bkey_format_state *s, unsigned field, u64 v)
+{	/* grow the [min, max] range tracked for @field so that it includes @v */
+	u64 *lo = &s->field_min[field], *hi = &s->field_max[field];
+	*lo = min(*lo, v); *hi = max(*hi, v);
+}
+
+/*
+ * Widens the per-field ranges in @s so that they cover every field of @k
+ */
+static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k)
+{
+#define x(id, field) __bkey_format_add(s, id, k->field);
+	bkey_fields()
+#undef x
+}
+
+void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
+struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
+int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *,
+			     enum bkey_invalid_flags, struct printbuf *);
+void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *);
+
+#endif /* _BCACHEFS_BKEY_H */
diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h
new file mode 100644
index 000000000000..a30c4ae8eb36
--- /dev/null
+++ b/fs/bcachefs/bkey_buf.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_BUF_H
+#define _BCACHEFS_BKEY_BUF_H
+
+#include "bcachefs.h"
+#include "bkey.h"
+
+struct bkey_buf {
+	struct bkey_i	*k;
+	u64		onstack[12];
+};
+
+/* Switch @s from its inline buffer to a mempool one when @u64s won't fit inline. */
+static inline void bch2_bkey_buf_realloc(struct bkey_buf *s,
+					 struct bch_fs *c, unsigned u64s)
+{
+	if (s->k != (void *) s->onstack || u64s <= ARRAY_SIZE(s->onstack))
+		return;
+	s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
+	memcpy(s->k, s->onstack, sizeof(s->onstack));	/* keep inline contents */
+}
+
+static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s,
+					    struct bch_fs *c,
+					    struct bkey_s_c k)
+{
+	bch2_bkey_buf_realloc(s, c, k.k->u64s);
+	bkey_reassemble(s->k, k);
+}
+
+static inline void bch2_bkey_buf_copy(struct bkey_buf *s,
+				      struct bch_fs *c,
+				      struct bkey_i *src)
+{
+	bch2_bkey_buf_realloc(s, c, src->k.u64s);
+	bkey_copy(s->k, src);
+}
+
+static inline void bch2_bkey_buf_unpack(struct bkey_buf *s,
+					struct bch_fs *c,
+					struct btree *b,
+					struct bkey_packed *src)
+{
+	bch2_bkey_buf_realloc(s, c, BKEY_U64s +
+			      bkeyp_val_u64s(&b->format, src));
+	bch2_bkey_unpack(b, s->k, src);
+}
+
+static inline void bch2_bkey_buf_init(struct bkey_buf *s)
+{
+	s->k = (void *) s->onstack;
+}
+
+static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c)
+{
+	if (s->k != (void *) s->onstack)
+		mempool_free(s->k, &c->large_bkey_pool);
+	s->k = NULL;
+}
+
+#endif /* _BCACHEFS_BKEY_BUF_H */
diff --git a/fs/bcachefs/bkey_cmp.h b/fs/bcachefs/bkey_cmp.h
new file mode 100644
index 000000000000..5f42a6e69360
--- /dev/null
+++ b/fs/bcachefs/bkey_cmp.h
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_CMP_H
+#define _BCACHEFS_BKEY_CMP_H
+
+#include "bkey.h"
+
+#ifdef CONFIG_X86_64
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
+				  unsigned nr_key_bits)
+{
+	long d0, d1, d2, d3;
+	int cmp;
+
+	/* we shouldn't need asm for this, but gcc does not compile the C version well: */
+
+	asm(".intel_syntax noprefix;"
+	    "xor eax, eax;"
+	    "xor edx, edx;"
+	    "1:;"
+	    "mov r8, [rdi];"
+	    "mov r9, [rsi];"
+	    "sub ecx, 64;"
+	    "jl 2f;"
+
+	    "cmp r8, r9;"
+	    "jnz 3f;"
+
+	    "lea rdi, [rdi - 8];"
+	    "lea rsi, [rsi - 8];"
+	    "jmp 1b;"
+
+	    "2:;"
+	    "not ecx;"
+	    "shr r8, 1;"
+	    "shr r9, 1;"
+	    "shr r8, cl;"
+	    "shr r9, cl;"
+	    "cmp r8, r9;"
+
+	    "3:\n"
+	    "seta al;"
+	    "setb dl;"
+	    "sub eax, edx;"
+	    ".att_syntax prefix;"
+	    : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp)
+	    : "0" (l), "1" (r), "3" (nr_key_bits)
+	    : "r8", "r9", "cc", "memory");
+
+	return cmp;
+}
+#else
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
+				  unsigned nr_key_bits)
+{
+	u64 l_v, r_v;
+
+	if (!nr_key_bits)
+		return 0;
+
+	/* for big endian, skip past header */
+	nr_key_bits += high_bit_offset;
+	l_v = *l & (~0ULL >> high_bit_offset);
+	r_v = *r & (~0ULL >> high_bit_offset);
+
+	while (1) {
+		if (nr_key_bits < 64) {
+			l_v >>= 64 - nr_key_bits;
+			r_v >>= 64 - nr_key_bits;
+			nr_key_bits = 0;
+		} else {
+			nr_key_bits -= 64;
+		}
+
+		if (!nr_key_bits || l_v != r_v)
+			break;
+
+		l = next_word(l);
+		r = next_word(r);
+
+		l_v = *l;
+		r_v = *r;
+	}
+
+	return cmp_int(l_v, r_v);
+}
+#endif
+
+static inline __pure __flatten
+int __bch2_bkey_cmp_packed_format_checked_inlined(const struct bkey_packed *l,
+					  const struct bkey_packed *r,
+					  const struct btree *b)
+{
+	const struct bkey_format *f = &b->format;
+	int ret;
+
+	EBUG_ON(!bkey_packed(l) || !bkey_packed(r));
+	EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
+
+	ret = __bkey_cmp_bits(high_word(f, l),
+			      high_word(f, r),
+			      b->nr_key_bits);
+
+	EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l),
+				bkey_unpack_pos(b, r)));
+	return ret;
+}
+
+static inline __pure __flatten
+int bch2_bkey_cmp_packed_inlined(const struct btree *b,
+			 const struct bkey_packed *l,
+			 const struct bkey_packed *r)
+{
+	struct bkey unpacked;
+
+	if (likely(bkey_packed(l) && bkey_packed(r)))
+		return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b);
+
+	if (bkey_packed(l)) {
+		__bkey_unpack_key_format_checked(b, &unpacked, l);
+		l = (void *) &unpacked;
+	} else if (bkey_packed(r)) {
+		__bkey_unpack_key_format_checked(b, &unpacked, r);
+		r = (void *) &unpacked;
+	}
+
+	return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
+}
+
+#endif /* _BCACHEFS_BKEY_CMP_H */
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
new file mode 100644
index 000000000000..d9fb1fc81f1e
--- /dev/null
+++ b/fs/bcachefs/bkey_methods.c
@@ -0,0 +1,458 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "backpointers.h"
+#include "bkey_methods.h"
+#include "btree_types.h"
+#include "alloc_background.h"
+#include "dirent.h"
+#include "ec.h"
+#include "error.h"
+#include "extents.h"
+#include "inode.h"
+#include "io_misc.h"
+#include "lru.h"
+#include "quota.h"
+#include "reflink.h"
+#include "snapshot.h"
+#include "subvolume.h"
+#include "xattr.h"
+
+const char * const bch2_bkey_types[] = {
+#define x(name, nr) #name,
+	BCH_BKEY_TYPES()
+#undef x
+	NULL
+};
+
+static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
+			       enum bkey_invalid_flags flags, struct printbuf *err)
+{
+	return 0;
+}
+
+#define bch2_bkey_ops_deleted ((struct bkey_ops) {	\
+	.key_invalid = deleted_key_invalid,		\
+})
+
+#define bch2_bkey_ops_whiteout ((struct bkey_ops) {	\
+	.key_invalid = deleted_key_invalid,		\
+})
+
+static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
+				 enum bkey_invalid_flags flags, struct printbuf *err)
+{
+	if (bkey_val_bytes(k.k)) {
+		prt_printf(err, "incorrect value size (%zu != 0)",
+		       bkey_val_bytes(k.k));
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	return 0;
+}
+
+#define bch2_bkey_ops_error ((struct bkey_ops) {	\
+	.key_invalid = empty_val_key_invalid,		\
+})
+
+static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k,
+				   enum bkey_invalid_flags flags, struct printbuf *err)
+{
+	return 0;
+}
+
+#define bch2_bkey_ops_cookie ((struct bkey_ops) {	\
+	.key_invalid	= key_type_cookie_invalid,	\
+	.min_val_size	= 8,				\
+})
+
+#define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\
+	.key_invalid = empty_val_key_invalid,		\
+})
+
+static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
+					enum bkey_invalid_flags flags, struct printbuf *err)
+{
+	return 0;
+}
+
+static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
+					 struct bkey_s_c k)
+{
+	struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
+	unsigned datalen = bkey_inline_data_bytes(k.k);
+
+	prt_printf(out, "datalen %u: %*phN",
+	       datalen, min(datalen, 32U), d.v->data);
+}
+
+#define bch2_bkey_ops_inline_data ((struct bkey_ops) {	\
+	.key_invalid	= key_type_inline_data_invalid,	\
+	.val_to_text	= key_type_inline_data_to_text,	\
+})
+
+/* key_invalid hook for KEY_TYPE_set: any non-empty value is rejected. */
+static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k,
+				enum bkey_invalid_flags flags, struct printbuf *err)
+{
+	if (bkey_val_bytes(k.k)) {
+		prt_printf(err, "incorrect value size (%zu != 0)", bkey_val_bytes(k.k));
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	return 0;
+}
+
+static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
+{
+	bch2_key_resize(l.k, l.k->size + r.k->size);
+	return true;
+}
+
+#define bch2_bkey_ops_set ((struct bkey_ops) {		\
+	.key_invalid	= key_type_set_invalid,		\
+	.key_merge	= key_type_set_merge,		\
+})
+
+const struct bkey_ops bch2_bkey_ops[] = {
+#define x(name, nr) [KEY_TYPE_##name]	= bch2_bkey_ops_##name,
+	BCH_BKEY_TYPES()
+#undef x
+};
+
+const struct bkey_ops bch2_bkey_null_ops = {
+};
+
+int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k,
+			  enum bkey_invalid_flags flags,
+			  struct printbuf *err)
+{
+	const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
+
+	if (bkey_val_bytes(k.k) < ops->min_val_size) {
+		prt_printf(err, "bad val size (%zu < %u)",
+			   bkey_val_bytes(k.k), ops->min_val_size);
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	if (!ops->key_invalid)
+		return 0;
+
+	return ops->key_invalid(c, k, flags, err);
+}
+
+/* Indexed by btree node type: bitmask of key types accepted at commit time. */
+static const u64 bch2_key_types_allowed[] = {
+#define x(name, nr, flags, keys)	[BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys,
+	BCH_BTREE_IDS()
+#undef x
+	[BKEY_TYPE_btree] = BIT_ULL(KEY_TYPE_deleted)|
+		BIT_ULL(KEY_TYPE_btree_ptr)|
+		BIT_ULL(KEY_TYPE_btree_ptr_v2),
+};
+
+/* Key checks that don't look at the value; returns 0 or -BCH_ERR_invalid_bkey. */
+int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
+			enum btree_node_type type,
+			enum bkey_invalid_flags flags, struct printbuf *err)
+{
+	if (k.k->u64s < BKEY_U64s) {
+		prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s);
+		return -BCH_ERR_invalid_bkey;
+	}
+	/* Unknown types must not be shifted by BIT_ULL() or index bch2_bkey_types[]: */
+	if (flags & BKEY_INVALID_COMMIT && k.k->type < KEY_TYPE_MAX &&
+	    !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type))) {
+		prt_printf(err, "invalid key type for btree %s (%s)",
+			   bch2_btree_ids[type], bch2_bkey_types[k.k->type]);
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
+		if (k.k->size == 0) {
+			prt_printf(err, "size == 0");
+			return -BCH_ERR_invalid_bkey;
+		}
+
+		if (k.k->size > k.k->p.offset) {
+			prt_printf(err, "size greater than offset (%u > %llu)",
+			       k.k->size, k.k->p.offset);
+			return -BCH_ERR_invalid_bkey;
+		}
+	} else {
+		if (k.k->size) {
+			prt_printf(err, "size != 0");
+			return -BCH_ERR_invalid_bkey;
+		}
+	}
+
+	if (type != BKEY_TYPE_btree) {
+		if (!btree_type_has_snapshots((enum btree_id) type) &&
+		    k.k->p.snapshot) {
+			prt_printf(err, "nonzero snapshot");
+			return -BCH_ERR_invalid_bkey;
+		}
+
+		if (btree_type_has_snapshots((enum btree_id) type) &&
+		    !k.k->p.snapshot) {
+			prt_printf(err, "snapshot == 0");
+			return -BCH_ERR_invalid_bkey;
+		}
+
+		if (bkey_eq(k.k->p, POS_MAX)) {
+			prt_printf(err, "key at POS_MAX");
+			return -BCH_ERR_invalid_bkey;
+		}
+	}
+
+	return 0;
+}
+
+int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
+		      enum btree_node_type type,
+		      enum bkey_invalid_flags flags,
+		      struct printbuf *err)
+{
+	return __bch2_bkey_invalid(c, k, type, flags, err) ?:
+		bch2_bkey_val_invalid(c, k, flags, err);
+}
+
+int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k,
+			    struct printbuf *err)
+{
+	if (bpos_lt(k.k->p, b->data->min_key)) {
+		prt_printf(err, "key before start of btree node");
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	if (bpos_gt(k.k->p, b->data->max_key)) {
+		prt_printf(err, "key past end of btree node");
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	return 0;
+}
+
+void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
+{
+	if (bpos_eq(pos, POS_MIN))
+		prt_printf(out, "POS_MIN");
+	else if (bpos_eq(pos, POS_MAX))
+		prt_printf(out, "POS_MAX");
+	else if (bpos_eq(pos, SPOS_MAX))
+		prt_printf(out, "SPOS_MAX");
+	else {
+		if (pos.inode == U64_MAX)
+			prt_printf(out, "U64_MAX");
+		else
+			prt_printf(out, "%llu", pos.inode);
+		prt_printf(out, ":");
+		if (pos.offset == U64_MAX)
+			prt_printf(out, "U64_MAX");
+		else
+			prt_printf(out, "%llu", pos.offset);
+		prt_printf(out, ":");
+		if (pos.snapshot == U32_MAX)
+			prt_printf(out, "U32_MAX");
+		else
+			prt_printf(out, "%u", pos.snapshot);
+	}
+}
+
+/* Print @k's header (u64s, type, pos, size, version.lo) to @out, or "(null)". */
+void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
+{
+	if (!k) {
+		prt_printf(out, "(null)");
+		return;
+	}
+
+	prt_printf(out, "u64s %u type ", k->u64s);
+	if (k->type < KEY_TYPE_MAX)
+		prt_printf(out, "%s ", bch2_bkey_types[k->type]);
+	else
+		prt_printf(out, "%u ", k->type);
+
+	bch2_bpos_to_text(out, k->p);
+	prt_printf(out, " len %u ver %llu", k->size, k->version.lo);
+}
+
+void bch2_val_to_text(struct printbuf *out, struct bch_fs *c,
+		      struct bkey_s_c k)
+{
+	const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
+
+	if (likely(ops->val_to_text))
+		ops->val_to_text(out, c, k);
+}
+
+void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
+			   struct bkey_s_c k)
+{
+	bch2_bkey_to_text(out, k.k);
+
+	if (bkey_val_bytes(k.k)) {
+		prt_printf(out, ": ");
+		bch2_val_to_text(out, c, k);
+	}
+}
+
+void bch2_bkey_swab_val(struct bkey_s k)
+{
+	const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
+
+	if (ops->swab)
+		ops->swab(k);
+}
+
+/* Run the key type's key_normalize hook on @k; false when the type has none. */
+bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k)
+{
+	const struct bkey_ops *type_ops = bch2_bkey_type_ops(k.k->type);
+
+	if (!type_ops->key_normalize)
+		return false;
+	return type_ops->key_normalize(c, k);
+}
+
+bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
+{
+	const struct bkey_ops *ops = bch2_bkey_type_ops(l.k->type);
+
+	return ops->key_merge &&
+		bch2_bkey_maybe_mergable(l.k, r.k) &&
+		(u64) l.k->size + r.k->size <= KEY_SIZE_MAX &&
+		!bch2_key_merging_disabled &&
+		ops->key_merge(c, l, r);
+}
+
+static const struct old_bkey_type {
+	u8		btree_node_type;
+	u8		old;
+	u8		new;
+} bkey_renumber_table[] = {
+	{BKEY_TYPE_btree,	128, KEY_TYPE_btree_ptr		},
+	{BKEY_TYPE_extents,	128, KEY_TYPE_extent		},
+	{BKEY_TYPE_extents,	129, KEY_TYPE_extent		},
+	{BKEY_TYPE_extents,	130, KEY_TYPE_reservation	},
+	{BKEY_TYPE_inodes,	128, KEY_TYPE_inode		},
+	{BKEY_TYPE_inodes,	130, KEY_TYPE_inode_generation	},
+	{BKEY_TYPE_dirents,	128, KEY_TYPE_dirent		},
+	{BKEY_TYPE_dirents,	129, KEY_TYPE_hash_whiteout	},
+	{BKEY_TYPE_xattrs,	128, KEY_TYPE_xattr		},
+	{BKEY_TYPE_xattrs,	129, KEY_TYPE_hash_whiteout	},
+	{BKEY_TYPE_alloc,	128, KEY_TYPE_alloc		},
+	{BKEY_TYPE_quotas,	128, KEY_TYPE_quota		},
+};
+
+/* Renumber @k's type via bkey_renumber_table: old -> new if !@write, else new -> old. */
+void bch2_bkey_renumber(enum btree_node_type btree_node_type,
+			struct bkey_packed *k, int write)
+{
+	const struct old_bkey_type *ent = bkey_renumber_table;
+	const struct old_bkey_type *end = ent + ARRAY_SIZE(bkey_renumber_table);
+
+	for (; ent < end; ent++) {
+		if (btree_node_type != ent->btree_node_type ||
+		    k->type != (write ? ent->new : ent->old))
+			continue;
+		k->type = write ? ent->old : ent->new;
+		return;
+	}
+}
+
+void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
+			unsigned version, unsigned big_endian,
+			int write,
+			struct bkey_format *f,
+			struct bkey_packed *k)
+{
+	const struct bkey_ops *ops;
+	struct bkey uk;
+	unsigned nr_compat = 5;
+	int i;
+
+	/*
+	 * Do these operations in reverse order in the write path:
+	 */
+
+	for (i = 0; i < nr_compat; i++)
+	switch (!write ? i : nr_compat - 1 - i) {
+	case 0:
+		if (big_endian != CPU_BIG_ENDIAN)
+			bch2_bkey_swab_key(f, k);
+		break;
+	case 1:
+		if (version < bcachefs_metadata_version_bkey_renumber)
+			bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write);
+		break;
+	case 2:
+		if (version < bcachefs_metadata_version_inode_btree_change &&
+		    btree_id == BTREE_ID_inodes) {
+			if (!bkey_packed(k)) {
+				struct bkey_i *u = packed_to_bkey(k);
+
+				swap(u->k.p.inode, u->k.p.offset);
+			} else if (f->bits_per_field[BKEY_FIELD_INODE] &&
+				   f->bits_per_field[BKEY_FIELD_OFFSET]) {
+				struct bkey_format tmp = *f, *in = f, *out = &tmp;
+
+				swap(tmp.bits_per_field[BKEY_FIELD_INODE],
+				     tmp.bits_per_field[BKEY_FIELD_OFFSET]);
+				swap(tmp.field_offset[BKEY_FIELD_INODE],
+				     tmp.field_offset[BKEY_FIELD_OFFSET]);
+
+				if (!write)
+					swap(in, out);
+
+				uk = __bch2_bkey_unpack_key(in, k);
+				swap(uk.p.inode, uk.p.offset);
+				BUG_ON(!bch2_bkey_pack_key(k, &uk, out));
+			}
+		}
+		break;
+	case 3:
+		if (version < bcachefs_metadata_version_snapshot &&
+		    (level || btree_type_has_snapshots(btree_id))) {
+			struct bkey_i *u = packed_to_bkey(k);
+
+			if (u) {
+				u->k.p.snapshot = write
+					? 0 : U32_MAX;
+			} else {
+				u64 min_packed = le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]);
+				u64 max_packed = min_packed +
+					~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
+
+				uk = __bch2_bkey_unpack_key(f, k);
+				uk.p.snapshot = write
+					? min_packed : min_t(u64, U32_MAX, max_packed);
+
+				BUG_ON(!bch2_bkey_pack_key(k, &uk, f));
+			}
+		}
+
+		break;
+	case 4: {
+		struct bkey_s u;
+
+		if (!bkey_packed(k)) {
+			u = bkey_i_to_s(packed_to_bkey(k));
+		} else {
+			uk = __bch2_bkey_unpack_key(f, k);
+			u.k = &uk;
+			u.v = bkeyp_val(f, k);
+		}
+
+		if (big_endian != CPU_BIG_ENDIAN)
+			bch2_bkey_swab_val(u);
+
+		ops = bch2_bkey_type_ops(k->type);
+
+		if (ops->compat)
+			ops->compat(btree_id, version, big_endian, write, u);
+		break;
+	}
+	default:
+		BUG();
+	}
+}
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
new file mode 100644
index 000000000000..668f595e2fcf
--- /dev/null
+++ b/fs/bcachefs/bkey_methods.h
@@ -0,0 +1,188 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_METHODS_H
+#define _BCACHEFS_BKEY_METHODS_H
+
+#include "bkey.h"
+
+struct bch_fs;
+struct btree;
+struct btree_trans;
+struct bkey;
+enum btree_node_type;
+
+extern const char * const bch2_bkey_types[];
+extern const struct bkey_ops bch2_bkey_null_ops;
+
+/*
+ * key_invalid: checks validity of @k, returns 0 if good or a negative error code
+ * (e.g. -BCH_ERR_invalid_bkey) if bad. If invalid, entire key will be deleted.
+ *
+ * When invalid, error string is returned via @err. @flags is a set of enum
+ * bkey_invalid_flags bits; flags such as BKEY_INVALID_COMMIT enable stricter checks.
+ */
+struct bkey_ops {
+	int		(*key_invalid)(const struct bch_fs *c, struct bkey_s_c k,
+				       enum bkey_invalid_flags flags, struct printbuf *err);
+	void		(*val_to_text)(struct printbuf *, struct bch_fs *,
+				       struct bkey_s_c);
+	void		(*swab)(struct bkey_s);
+	bool		(*key_normalize)(struct bch_fs *, struct bkey_s);
+	bool		(*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+	int		(*trans_trigger)(struct btree_trans *, enum btree_id, unsigned,
+					 struct bkey_s_c, struct bkey_i *, unsigned);
+	int		(*atomic_trigger)(struct btree_trans *, enum btree_id, unsigned,
+					  struct bkey_s_c, struct bkey_s_c, unsigned);
+	void		(*compat)(enum btree_id id, unsigned version,
+				  unsigned big_endian, int write,
+				  struct bkey_s);
+
+	/* Size of value type when first created: */
+	unsigned	min_val_size;
+};
+
+extern const struct bkey_ops bch2_bkey_ops[];
+
+static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type)
+{
+	return likely(type < KEY_TYPE_MAX)
+		? &bch2_bkey_ops[type]
+		: &bch2_bkey_null_ops;
+}
+
+int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c,
+			  enum bkey_invalid_flags, struct printbuf *);
+int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
+			enum bkey_invalid_flags, struct printbuf *);
+int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
+		      enum bkey_invalid_flags, struct printbuf *);
+int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *);
+
+void bch2_bpos_to_text(struct printbuf *, struct bpos);
+void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
+void bch2_val_to_text(struct printbuf *, struct bch_fs *,
+		      struct bkey_s_c);
+void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *,
+			   struct bkey_s_c);
+
+void bch2_bkey_swab_val(struct bkey_s);
+
+bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s);
+
+static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r)
+{
+	return l->type == r->type &&
+		!bversion_cmp(l->version, r->version) &&
+		bpos_eq(l->p, bkey_start_pos(r));
+}
+
+bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+
+static inline int bch2_mark_key(struct btree_trans *trans,
+		enum btree_id btree, unsigned level,
+		struct bkey_s_c old, struct bkey_s_c new,
+		unsigned flags)
+{
+	const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type);
+
+	return ops->atomic_trigger
+		? ops->atomic_trigger(trans, btree, level, old, new, flags)
+		: 0;
+}
+
+enum btree_update_flags {
+	__BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END,
+	__BTREE_UPDATE_NOJOURNAL,
+	__BTREE_UPDATE_PREJOURNAL,
+	__BTREE_UPDATE_KEY_CACHE_RECLAIM,
+
+	__BTREE_TRIGGER_NORUN,		/* Don't run triggers at all */
+
+	__BTREE_TRIGGER_INSERT,
+	__BTREE_TRIGGER_OVERWRITE,
+
+	__BTREE_TRIGGER_GC,
+	__BTREE_TRIGGER_BUCKET_INVALIDATE,
+	__BTREE_TRIGGER_NOATOMIC,
+};
+
+#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
+#define BTREE_UPDATE_NOJOURNAL		(1U << __BTREE_UPDATE_NOJOURNAL)
+#define BTREE_UPDATE_PREJOURNAL		(1U << __BTREE_UPDATE_PREJOURNAL)
+#define BTREE_UPDATE_KEY_CACHE_RECLAIM	(1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
+
+#define BTREE_TRIGGER_NORUN		(1U << __BTREE_TRIGGER_NORUN)
+
+#define BTREE_TRIGGER_INSERT		(1U << __BTREE_TRIGGER_INSERT)
+#define BTREE_TRIGGER_OVERWRITE		(1U << __BTREE_TRIGGER_OVERWRITE)
+
+#define BTREE_TRIGGER_GC		(1U << __BTREE_TRIGGER_GC)
+#define BTREE_TRIGGER_BUCKET_INVALIDATE	(1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
+#define BTREE_TRIGGER_NOATOMIC		(1U << __BTREE_TRIGGER_NOATOMIC)
+
+#define BTREE_TRIGGER_WANTS_OLD_AND_NEW		\
+	((1U << KEY_TYPE_alloc)|		\
+	 (1U << KEY_TYPE_alloc_v2)|		\
+	 (1U << KEY_TYPE_alloc_v3)|		\
+	 (1U << KEY_TYPE_alloc_v4)|		\
+	 (1U << KEY_TYPE_stripe)|		\
+	 (1U << KEY_TYPE_inode)|		\
+	 (1U << KEY_TYPE_inode_v2)|		\
+	 (1U << KEY_TYPE_snapshot))
+
+static inline int bch2_trans_mark_key(struct btree_trans *trans,
+				      enum btree_id btree_id, unsigned level,
+				      struct bkey_s_c old, struct bkey_i *new,
+				      unsigned flags)
+{
+	const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new->k.type);
+
+	return ops->trans_trigger
+		? ops->trans_trigger(trans, btree_id, level, old, new, flags)
+		: 0;
+}
+
+static inline int bch2_trans_mark_old(struct btree_trans *trans,
+				      enum btree_id btree_id, unsigned level,
+				      struct bkey_s_c old, unsigned flags)
+{
+	struct bkey_i deleted;
+
+	bkey_init(&deleted.k);
+	deleted.k.p = old.k->p;
+
+	return bch2_trans_mark_key(trans, btree_id, level, old, &deleted,
+				   BTREE_TRIGGER_OVERWRITE|flags);
+}
+
+static inline int bch2_trans_mark_new(struct btree_trans *trans,
+				      enum btree_id btree_id, unsigned level,
+				      struct bkey_i *new, unsigned flags)
+{
+	struct bkey_i deleted;
+
+	bkey_init(&deleted.k);
+	deleted.k.p = new->k.p;
+
+	return bch2_trans_mark_key(trans, btree_id, level, bkey_i_to_s_c(&deleted), new,
+				   BTREE_TRIGGER_INSERT|flags);
+}
+
+void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
+
+void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned,
+			int, struct bkey_format *, struct bkey_packed *);
+
+static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id,
+			       unsigned version, unsigned big_endian,
+			       int write,
+			       struct bkey_format *f,
+			       struct bkey_packed *k)
+{
+	if (version < bcachefs_metadata_version_current ||
+	    big_endian != CPU_BIG_ENDIAN)
+		__bch2_bkey_compat(level, btree_id, version,
+				   big_endian, write, f, k);
+
+}
+
+#endif /* _BCACHEFS_BKEY_METHODS_H */
diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c
new file mode 100644
index 000000000000..b9aa027c881b
--- /dev/null
+++ b/fs/bcachefs/bkey_sort.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "bkey_cmp.h"
+#include "bkey_sort.h"
+#include "bset.h"
+#include "extents.h"
+
+typedef int (*sort_cmp_fn)(struct btree *,
+			   struct bkey_packed *,
+			   struct bkey_packed *);
+
+static inline bool sort_iter_end(struct sort_iter *iter)
+{
+	return !iter->used;
+}
+
+static inline void sort_iter_sift(struct sort_iter *iter, unsigned from,
+				  sort_cmp_fn cmp)
+{
+	unsigned i;
+
+	for (i = from;
+	     i + 1 < iter->used &&
+	     cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
+	     i++)
+		swap(iter->data[i], iter->data[i + 1]);
+}
+
+static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+	unsigned i = iter->used;
+
+	while (i--)
+		sort_iter_sift(iter, i, cmp);
+}
+
+static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
+{
+	return !sort_iter_end(iter) ? iter->data->k : NULL;
+}
+
+static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+	struct sort_iter_set *i = iter->data;
+
+	BUG_ON(!iter->used);
+
+	i->k = bkey_p_next(i->k);
+
+	BUG_ON(i->k > i->end);
+
+	if (i->k == i->end)
+		array_remove_item(iter->data, iter->used, 0);
+	else
+		sort_iter_sift(iter, 0, cmp);
+}
+
+static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
+						 sort_cmp_fn cmp)
+{
+	struct bkey_packed *ret = sort_iter_peek(iter);
+
+	if (ret)
+		sort_iter_advance(iter, cmp);
+
+	return ret;
+}
+
+/*
+ * If keys compare equal, compare by pointer order:
+ */
+static inline int key_sort_fix_overlapping_cmp(struct btree *b,
+					       struct bkey_packed *l,
+					       struct bkey_packed *r)
+{
+	return bch2_bkey_cmp_packed(b, l, r) ?:
+		cmp_int((unsigned long) l, (unsigned long) r);
+}
+
+static inline bool should_drop_next_key(struct sort_iter *iter)
+{
+	/*
+	 * key_sort_fix_overlapping_cmp() orders keys that compare equal by
+	 * pointer, which is relied on to put the older key first; so if
+	 * data[0].k compares equal to data[1].k it is older and gets dropped.
+	 */
+	return iter->used >= 2 &&
+		!bch2_bkey_cmp_packed(iter->b,
+				 iter->data[0].k,
+				 iter->data[1].k);
+}
+
+struct btree_nr_keys
+bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
+			      struct sort_iter *iter)
+{
+	struct bkey_packed *out = dst->start;
+	struct bkey_packed *k;
+	struct btree_nr_keys nr;
+
+	memset(&nr, 0, sizeof(nr));
+
+	sort_iter_sort(iter, key_sort_fix_overlapping_cmp);
+
+	while ((k = sort_iter_peek(iter))) {
+		if (!bkey_deleted(k) &&
+		    !should_drop_next_key(iter)) {
+			bkey_copy(out, k);
+			btree_keys_account_key_add(&nr, 0, out);
+			out = bkey_p_next(out);
+		}
+
+		sort_iter_advance(iter, key_sort_fix_overlapping_cmp);
+	}
+
+	dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+	return nr;
+}
+
+/* Sort + repack in a new format: */
+struct btree_nr_keys
+bch2_sort_repack(struct bset *dst, struct btree *src,
+		 struct btree_node_iter *src_iter,
+		 struct bkey_format *out_f,
+		 bool filter_whiteouts)
+{
+	struct bkey_format *in_f = &src->format;
+	struct bkey_packed *in, *out = vstruct_last(dst);
+	struct btree_nr_keys nr;
+	bool transform = memcmp(out_f, &src->format, sizeof(*out_f));
+
+	memset(&nr, 0, sizeof(nr));
+
+	while ((in = bch2_btree_node_iter_next_all(src_iter, src))) {
+		if (filter_whiteouts && bkey_deleted(in))
+			continue;
+
+		if (!transform)
+			bkey_copy(out, in);
+		else if (bch2_bkey_transform(out_f, out, bkey_packed(in)
+					     ? in_f : &bch2_bkey_format_current, in))
+			out->format = KEY_FORMAT_LOCAL_BTREE;
+		else
+			bch2_bkey_unpack(src, (void *) out, in);
+
+		out->needs_whiteout = false;
+
+		btree_keys_account_key_add(&nr, 0, out);
+		out = bkey_p_next(out);
+	}
+
+	dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+	return nr;
+}
+
+static inline int sort_keys_cmp(struct btree *b,
+				struct bkey_packed *l,
+				struct bkey_packed *r)
+{
+	return bch2_bkey_cmp_packed_inlined(b, l, r) ?:
+		(int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
+		(int) l->needs_whiteout - (int) r->needs_whiteout;
+}
+
+unsigned bch2_sort_keys(struct bkey_packed *dst,
+			struct sort_iter *iter,
+			bool filter_whiteouts)
+{
+	const struct bkey_format *f = &iter->b->format;
+	struct bkey_packed *in, *next, *out = dst;
+
+	sort_iter_sort(iter, sort_keys_cmp);
+
+	while ((in = sort_iter_next(iter, sort_keys_cmp))) {
+		bool needs_whiteout = false;
+
+		if (bkey_deleted(in) &&
+		    (filter_whiteouts || !in->needs_whiteout))
+			continue;
+
+		while ((next = sort_iter_peek(iter)) &&
+		       !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) {
+			BUG_ON(in->needs_whiteout &&
+			       next->needs_whiteout);
+			needs_whiteout |= in->needs_whiteout;
+			in = sort_iter_next(iter, sort_keys_cmp);
+		}
+
+		if (bkey_deleted(in)) {
+			memcpy_u64s_small(out, in, bkeyp_key_u64s(f, in));
+			set_bkeyp_val_u64s(f, out, 0);
+		} else {
+			bkey_copy(out, in);
+		}
+		out->needs_whiteout |= needs_whiteout;
+		out = bkey_p_next(out);
+	}
+
+	return (u64 *) out - (u64 *) dst;
+}
diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h
new file mode 100644
index 000000000000..7c0f0b160f18
--- /dev/null
+++ b/fs/bcachefs/bkey_sort.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_SORT_H
+#define _BCACHEFS_BKEY_SORT_H
+
+struct sort_iter {
+	struct btree		*b;
+	unsigned		used;
+	unsigned		size;
+
+	struct sort_iter_set {
+		struct bkey_packed *k, *end;
+	} data[];
+};
+
+static inline void sort_iter_init(struct sort_iter *iter, struct btree *b, unsigned size)
+{
+	iter->b = b;
+	iter->used = 0;
+	iter->size = size;
+}
+
+struct sort_iter_stack {
+	struct sort_iter	iter;
+	struct sort_iter_set	sets[MAX_BSETS + 1];
+};
+
+static inline void sort_iter_stack_init(struct sort_iter_stack *iter, struct btree *b)
+{
+	sort_iter_init(&iter->iter, b, ARRAY_SIZE(iter->sets));
+}
+
+static inline void sort_iter_add(struct sort_iter *iter,
+				 struct bkey_packed *k,
+				 struct bkey_packed *end)
+{
+	BUG_ON(iter->used >= iter->size);
+
+	if (k != end)
+		iter->data[iter->used++] = (struct sort_iter_set) { k, end };
+}
+
+struct btree_nr_keys
+bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *,
+			      struct sort_iter *);
+
+struct btree_nr_keys
+bch2_sort_repack(struct bset *, struct btree *,
+		 struct btree_node_iter *,
+		 struct bkey_format *, bool);
+
+unsigned bch2_sort_keys(struct bkey_packed *,
+			struct sort_iter *, bool);
+
+#endif /* _BCACHEFS_BKEY_SORT_H */
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
new file mode 100644
index 000000000000..bb73ba9017b0
--- /dev/null
+++ b/fs/bcachefs/bset.c
@@ -0,0 +1,1592 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Code for working with individual keys, and sorted sets of keys within a
+ * btree node
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "btree_cache.h"
+#include "bset.h"
+#include "eytzinger.h"
+#include "trace.h"
+#include "util.h"
+
+#include <asm/unaligned.h>
+#include <linux/console.h>
+#include <linux/random.h>
+#include <linux/prefetch.h>
+
+static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *,
+						  struct btree *);
+
+/* iter->data[] slot count once trailing slots reported as ended are dropped */
+static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
+{
+	unsigned used = ARRAY_SIZE(iter->data);
+
+	for (; used && __btree_node_iter_set_end(iter, used - 1); used--)
+		;
+	return used;
+}
+
+struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
+{
+	return bch2_bkey_to_bset_inlined(b, k);	/* out-of-line wrapper */
+}
+
+/*
+ * There are never duplicate live keys in the btree - but including keys that
+ * have been flagged as deleted (and will be cleaned up later) we _will_ see
+ * duplicates.
+ *
+ * Thus the sort order is: usual key comparison first, but for keys that compare
+ * equal the deleted key(s) come first, and the (at most one) live version comes
+ * last.
+ *
+ * The main reason for this is insertion: to handle overwrites, we first iterate
+ * over keys that compare equal to our insert key, and then insert immediately
+ * prior to the first key greater than the key we're inserting - our insert
+ * position will be after all keys that compare equal to our insert key, which
+ * by the time we actually do the insert will all be deleted.
+ */
+
+void bch2_dump_bset(struct bch_fs *c, struct btree *b,
+		    struct bset *i, unsigned set)
+{	/* printk every key of bset @i and flag ordering problems between neighbours */
+	struct bkey_packed *_k, *_n;
+	struct bkey uk, n;
+	struct bkey_s_c k;
+	struct printbuf buf = PRINTBUF;
+
+	if (!i->u64s)
+		return;
+
+	for (_k = i->start;
+	     _k < vstruct_last(i);
+	     _k = _n) {
+		_n = bkey_p_next(_k);
+
+		k = bkey_disassemble(b, _k, &uk);
+
+		printbuf_reset(&buf);
+		if (c)	/* @c may be NULL, e.g. via bch2_dump_btree_node(NULL, b) */
+			bch2_bkey_val_to_text(&buf, c, k);
+		else
+			bch2_bkey_to_text(&buf, k.k);
+		printk(KERN_ERR "block %u key %5zu: %s\n", set,
+		       _k->_data - i->_data, buf.buf);
+
+		if (_n == vstruct_last(i))	/* last key: no successor to compare */
+			continue;
+
+		n = bkey_unpack_key(b, _n);
+
+		if (bpos_lt(n.p, k.k->p)) {	/* successor sorts before this key */
+			printk(KERN_ERR "Key skipped backwards\n");
+			continue;
+		}
+
+		if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p)) /* live key followed by same pos */
+			printk(KERN_ERR "Duplicate keys\n");
+	}
+
+	printbuf_exit(&buf);
+}
+
+void bch2_dump_btree_node(struct bch_fs *c, struct btree *b)
+{	/* Dump every bset of @b with bch2_dump_bset() */
+	struct bset_tree *t;
+
+	console_lock();		/* held across the dump of all bsets */
+	for_each_bset(b, t)
+		bch2_dump_bset(c, b, bset(b, t), t - b->set);	/* t - b->set: bset index */
+	console_unlock();
+}
+
+void bch2_dump_btree_node_iter(struct btree *b,
+			      struct btree_node_iter *iter)
+{	/* printk the current key of every set in @iter */
+	struct btree_node_iter_set *set;
+	struct printbuf buf = PRINTBUF;
+
+	printk(KERN_ERR "btree node iter with %u/%u sets:\n",
+	       __btree_node_iter_used(iter), b->nsets);
+
+	btree_node_iter_for_each(iter, set) {
+		struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
+		struct bset_tree *t = bch2_bkey_to_bset(b, k);
+		struct bkey uk = bkey_unpack_key(b, k);
+
+		printbuf_reset(&buf);	/* buf is reused for each set */
+		bch2_bkey_to_text(&buf, &uk);
+		printk(KERN_ERR "set %zu key %u: %s\n",
+		       t - b->set, set->k, buf.buf);
+	}
+
+	printbuf_exit(&buf);
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+/* Debug check: recount the non-deleted keys of every bset, BUG if b->nr differs */
+void __bch2_verify_btree_nr_keys(struct btree *b)
+{
+	struct btree_nr_keys sum = { 0 };
+	struct bkey_packed *k;
+	struct bset_tree *t;
+
+	for_each_bset(b, t)
+		bset_tree_for_each_key(b, t, k)
+			if (!bkey_deleted(k))
+				btree_keys_account_key_add(&sum, t - b->set, k);
+	BUG_ON(memcmp(&sum, &b->nr, sizeof(sum)));
+}
+
+static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
+					    struct btree *b)
+{	/* Debug check: panic if the key after the current one sorts before it */
+	struct btree_node_iter iter = *_iter;	/* work on a copy of the caller's iter */
+	const struct bkey_packed *k, *n;
+
+	k = bch2_btree_node_iter_peek_all(&iter, b);
+	__bch2_btree_node_iter_advance(&iter, b);
+	n = bch2_btree_node_iter_peek_all(&iter, b);
+
+	bkey_unpack_key(b, k);	/* return value is discarded */
+
+	if (n &&
+	    bkey_iter_cmp(b, k, n) > 0) {
+		struct btree_node_iter_set *set;
+		struct bkey ku = bkey_unpack_key(b, k);
+		struct bkey nu = bkey_unpack_key(b, n);
+		struct printbuf buf1 = PRINTBUF;
+		struct printbuf buf2 = PRINTBUF;
+
+		bch2_dump_btree_node(NULL, b);
+		bch2_bkey_to_text(&buf1, &ku);
+		bch2_bkey_to_text(&buf2, &nu);
+		printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n",
+		       buf1.buf, buf2.buf);
+		printk(KERN_ERR "iter was:");
+
+		btree_node_iter_for_each(_iter, set) {	/* the caller's iter, not the copy */
+			struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k);
+			struct bset_tree *t = bch2_bkey_to_bset(b, k2);
+			printk(" [%zi %zi]", t - b->set,
+			       k2->_data - bset(b, t)->_data);
+		}
+		panic("\n");
+	}
+}
+
+void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
+				 struct btree *b)
+{	/* Debug check: BUG if @iter is internally inconsistent or disagrees with @b */
+	struct btree_node_iter_set *set, *s2;
+	struct bkey_packed *k, *p;
+	struct bset_tree *t;
+
+	if (bch2_btree_node_iter_end(iter))
+		return;
+
+	/* Verify no duplicates: */
+	btree_node_iter_for_each(iter, set) {
+		BUG_ON(set->k > set->end);
+		btree_node_iter_for_each(iter, s2)
+			BUG_ON(set != s2 && set->end == s2->end); /* two sets share an end */
+	}
+
+	/* Verify that set->end is correct: */
+	btree_node_iter_for_each(iter, set) {
+		for_each_bset(b, t)
+			if (set->end == t->end_offset)
+				goto found;
+		BUG();	/* no bset ends at set->end */
+found:
+		BUG_ON(set->k < btree_bkey_first_offset(t) ||
+		       set->k >= t->end_offset);
+	}
+
+	/* Verify iterator is sorted: */
+	btree_node_iter_for_each(iter, set)
+		BUG_ON(set != iter->data &&
+		       btree_node_iter_cmp(b, set[-1], set[0]) > 0);
+
+	k = bch2_btree_node_iter_peek_all(iter, b);
+
+	for_each_bset(b, t) {	/* in every other bset, the key before iter's pos is <= k */
+		if (iter->data[0].end == t->end_offset)
+			continue;
+
+		p = bch2_bkey_prev_all(b, t,
+			bch2_btree_node_iter_bset_pos(iter, b, t));
+
+		BUG_ON(p && bkey_iter_cmp(b, k, p) < 0);
+	}
+}
+
+void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
+			    struct bkey_packed *insert, unsigned clobber_u64s)
+{	/* Debug check: panic unless prev <= @insert <= next at position @where */
+	struct bset_tree *t = bch2_bkey_to_bset(b, where);
+	struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
+	struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s); /* past clobbered u64s */
+	struct printbuf buf1 = PRINTBUF;
+	struct printbuf buf2 = PRINTBUF;
+#if 0
+	BUG_ON(prev &&
+	       bkey_iter_cmp(b, prev, insert) > 0);
+#else
+	if (prev &&
+	    bkey_iter_cmp(b, prev, insert) > 0) {
+		struct bkey k1 = bkey_unpack_key(b, prev);
+		struct bkey k2 = bkey_unpack_key(b, insert);
+
+		bch2_dump_btree_node(NULL, b);
+		bch2_bkey_to_text(&buf1, &k1);
+		bch2_bkey_to_text(&buf2, &k2);
+
+		panic("prev > insert:\n"
+		      "prev    key %s\n"
+		      "insert  key %s\n",
+		      buf1.buf, buf2.buf);
+	}
+#endif
+#if 0
+	BUG_ON(next != btree_bkey_last(b, t) &&
+	       bkey_iter_cmp(b, insert, next) > 0);
+#else
+	if (next != btree_bkey_last(b, t) &&	/* nothing to compare at the end of the bset */
+	    bkey_iter_cmp(b, insert, next) > 0) {
+		struct bkey k1 = bkey_unpack_key(b, insert);
+		struct bkey k2 = bkey_unpack_key(b, next);
+
+		bch2_dump_btree_node(NULL, b);
+		bch2_bkey_to_text(&buf1, &k1);
+		bch2_bkey_to_text(&buf2, &k2);
+
+		panic("insert > next:\n"
+		      "insert  key %s\n"
+		      "next    key %s\n",
+		      buf1.buf, buf2.buf);
+	}
+#endif
+}
+
+#else
+
+static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
+						   struct btree *b) {} /* no-op without CONFIG_BCACHEFS_DEBUG */
+
+#endif
+
+/* Auxiliary search trees */
+
+#define BFLOAT_FAILED_UNPACKED	U8_MAX
+#define BFLOAT_FAILED		U8_MAX
+
+struct bkey_float {
+	u8		exponent;	/* bit offset used by bkey_mantissa(), or BFLOAT_FAILED* */
+	u8		key_offset;	/* key's offset within its cacheline, in u64s */
+	u16		mantissa;	/* BKEY_MANTISSA_BITS bits taken from the key */
+};
+#define BKEY_MANTISSA_BITS	16
+
+static unsigned bkey_float_byte_offset(unsigned idx)
+{
+	return idx * sizeof(struct bkey_float);	/* size in bytes of idx bkey_floats */
+}
+
+struct ro_aux_tree {
+	u8			nothing[0];	/* zero-sized member; never accessed in this file */
+	struct bkey_float	f[];		/* indexed via bkey_float(b, t, idx) */
+};
+
+struct rw_aux_tree {
+	u16		offset;	/* key's offset, from __btree_node_key_to_offset() */
+	struct bpos	k;	/* that key's position, from bkey_unpack_pos() */
+};
+
+static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
+{	/* End of @t's aux data, in the same 8-byte units as t->aux_data_offset */
+	BUG_ON(t->aux_data_offset == U16_MAX);
+
+	switch (bset_aux_tree_type(t)) {
+	case BSET_NO_AUX_TREE:
+		return t->aux_data_offset;
+	case BSET_RO_AUX_TREE:	/* t->size bkey_floats plus t->size u8 entries */
+		return t->aux_data_offset +
+			DIV_ROUND_UP(t->size * sizeof(struct bkey_float) +
+				     t->size * sizeof(u8), 8);
+	case BSET_RW_AUX_TREE:	/* t->size struct rw_aux_tree entries */
+		return t->aux_data_offset +
+			DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
+	default:
+		BUG();
+	}
+}
+
+static unsigned bset_aux_tree_buf_start(const struct btree *b,
+					const struct bset_tree *t)
+{
+	if (t != b->set)	/* later bsets start where the previous one's aux data ends */
+		return bset_aux_tree_buf_end(t - 1);
+	return DIV_ROUND_UP(b->unpack_fn_len, 8);	/* first bset */
+}
+
+static void *__aux_tree_base(const struct btree *b,
+			     const struct bset_tree *t)
+{
+	return b->aux_data + t->aux_data_offset * 8;	/* aux_data_offset counts 8-byte units */
+}
+
+static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
+					    const struct bset_tree *t)
+{
+	EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);	/* only valid for ro trees */
+
+	return __aux_tree_base(b, t);
+}
+
+static u8 *ro_aux_tree_prev(const struct btree *b,
+			    const struct bset_tree *t)
+{
+	EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
+
+	return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); /* u8 array after t->size floats */
+}
+
+static struct bkey_float *bkey_float(const struct btree *b,
+				     const struct bset_tree *t,
+				     unsigned idx)
+{
+	return &ro_aux_tree_base(b, t)->f[idx];	/* callers pass eytzinger1 positions */
+}
+
+static void bset_aux_tree_verify(const struct btree *b)
+{	/* Debug check of each bset's aux data range; empty without CONFIG_BCACHEFS_DEBUG */
+#ifdef CONFIG_BCACHEFS_DEBUG
+	const struct bset_tree *t;
+
+	for_each_bset(b, t) {
+		if (t->aux_data_offset == U16_MAX)	/* bsets with this offset are not checked */
+			continue;
+
+		BUG_ON(t != b->set &&
+		       t[-1].aux_data_offset == U16_MAX);
+
+		BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); /* overlaps previous */
+		BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
+		BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
+	}
+#endif
+}
+
+void bch2_btree_keys_init(struct btree *b)
+{
+	struct bset_tree *t;
+
+	memset(&b->nr, 0, sizeof(b->nr));
+	b->nsets = 0;
+
+	for (t = b->set; t < b->set + MAX_BSETS; t++)
+		t->data_offset = U16_MAX;
+
+	bch2_bset_set_no_aux_tree(b, b->set);
+}
+
+/* Binary tree stuff for auxiliary search trees */
+
+/*
+ * Cacheline/offset <-> bkey pointer arithmetic:
+ *
+ * t->tree is a binary search tree in an array; each node corresponds to a key
+ * in one cacheline in t->set (BSET_CACHELINE bytes).
+ *
+ * This means we don't have to store the full index of the key that a node in
+ * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and
+ * then bkey_float->key_offset gives us the offset within that cacheline, in
+ * units of 8 bytes.
+ *
+ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
+ * make this work.
+ *
+ * To construct the bfloat for an arbitrary key we need to know what the key
+ * immediately preceding it is: we have to check if the two keys differ in the
+ * bits we're going to store in bkey_float->mantissa. ro_aux_tree_prev() stores
+ * the size of the previous key so tree_to_prev_bkey() can walk backwards to it.
+ */
+
+static inline void *bset_cacheline(const struct btree *b,
+				   const struct bset_tree *t,
+				   unsigned cacheline)
+{	/* Base is the first key rounded down to L1_CACHE_BYTES; stride is BSET_CACHELINE */
+	return (void *) round_down((unsigned long) btree_bkey_first(b, t),
+				   L1_CACHE_BYTES) +
+		cacheline * BSET_CACHELINE;
+}
+
+static struct bkey_packed *cacheline_to_bkey(const struct btree *b,
+					     const struct bset_tree *t,
+					     unsigned cacheline,
+					     unsigned offset)
+{
+	return bset_cacheline(b, t, cacheline) + offset * 8;	/* offset is in 8-byte units */
+}
+
+static unsigned bkey_to_cacheline(const struct btree *b,
+				  const struct bset_tree *t,
+				  const struct bkey_packed *k)
+{
+	return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE;	/* index of k's cacheline */
+}
+
+static ssize_t __bkey_to_cacheline_offset(const struct btree *b,
+					  const struct bset_tree *t,
+					  unsigned cacheline,
+					  const struct bkey_packed *k)
+{
+	return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline);	/* in u64s; may be negative */
+}
+
+static unsigned bkey_to_cacheline_offset(const struct btree *b,
+					 const struct bset_tree *t,
+					 unsigned cacheline,
+					 const struct bkey_packed *k)
+{
+	size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k); /* negative wraps to huge */
+
+	EBUG_ON(m > U8_MAX);	/* must fit the u8 bkey_float->key_offset */
+	return m;
+}
+
+static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
+					       const struct bset_tree *t,
+					       unsigned j)
+{	/* Key that tree node @j refers to: inorder index = cacheline, plus key_offset */
+	return cacheline_to_bkey(b, t,
+			__eytzinger1_to_inorder(j, t->size - 1, t->extra),
+			bkey_float(b, t, j)->key_offset);
+}
+
+static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
+					     const struct bset_tree *t,
+					     unsigned j)
+{
+	unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];	/* u64s of the key before node j's */
+
+	return (void *) ((u64 *) tree_to_bkey(b, t, j)->_data - prev_u64s);
+}
+
+static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
+				       const struct bset_tree *t)
+{
+	EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);	/* only valid for rw trees */
+
+	return __aux_tree_base(b, t);
+}
+
+/*
+ * For the write set - the one we're currently inserting keys into - we don't
+ * maintain a full search tree, just a simple lookup table: see rw_aux_tree().
+ */
+static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
+					  struct bset_tree *t,
+					  unsigned j)
+{
+	return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset); /* key of table entry j */
+}
+
+static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
+			    unsigned j, struct bkey_packed *k)
+{	/* Point lookup table entry @j at key @k */
+	EBUG_ON(k >= btree_bkey_last(b, t));
+
+	rw_aux_tree(b, t)[j] = (struct rw_aux_tree) {
+		.offset	= __btree_node_key_to_offset(b, k),
+		.k	= bkey_unpack_pos(b, k),
+	};
+}
+
+static void bch2_bset_verify_rw_aux_tree(struct btree *b,
+					struct bset_tree *t)
+{	/* Expensive debug check: walk the bset's keys and match them against the rw table */
+	struct bkey_packed *k = btree_bkey_first(b, t);
+	unsigned j = 0;
+
+	if (!bch2_expensive_debug_checks)
+		return;
+
+	BUG_ON(bset_has_ro_aux_tree(t));
+
+	if (!bset_has_rw_aux_tree(t))
+		return;
+
+	BUG_ON(t->size < 1);
+	BUG_ON(rw_aux_to_bkey(b, t, j) != k);	/* entry 0 must be the first key */
+
+	goto start;	/* entry 0's .k is not compared: jump past that check */
+	while (1) {
+		if (rw_aux_to_bkey(b, t, j) == k) {
+			BUG_ON(!bpos_eq(rw_aux_tree(b, t)[j].k,
+					bkey_unpack_pos(b, k)));
+start:
+			if (++j == t->size)
+				break;
+
+			BUG_ON(rw_aux_tree(b, t)[j].offset <=
+			       rw_aux_tree(b, t)[j - 1].offset);	/* offsets strictly increase */
+		}
+
+		k = bkey_p_next(k);
+		BUG_ON(k >= btree_bkey_last(b, t));	/* an entry points at no key in the bset */
+	}
+}
+
+/* returns idx of first entry >= offset (t->size if there is none): */
+static unsigned rw_aux_tree_bsearch(struct btree *b,
+				    struct bset_tree *t,
+				    unsigned offset)
+{
+	unsigned bset_offs = offset - btree_bkey_first_offset(t);
+	unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t);
+	unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0; /* interpolated guess */
+
+	EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
+	EBUG_ON(!t->size);
+	EBUG_ON(idx > t->size);
+
+	while (idx < t->size &&		/* guess too low: scan forwards */
+	       rw_aux_tree(b, t)[idx].offset < offset)
+		idx++;
+
+	while (idx &&			/* guess too high: scan backwards */
+	       rw_aux_tree(b, t)[idx - 1].offset >= offset)
+		idx--;
+
+	EBUG_ON(idx < t->size &&
+		rw_aux_tree(b, t)[idx].offset < offset);
+	EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset);
+	EBUG_ON(idx + 1 < t->size &&
+		rw_aux_tree(b, t)[idx].offset ==
+		rw_aux_tree(b, t)[idx + 1].offset);
+
+	return idx;
+}
+
+static inline unsigned bkey_mantissa(const struct bkey_packed *k,
+				     const struct bkey_float *f,
+				     unsigned idx)
+{	/* Extract 16 bits of packed key @k at bit offset f->exponent; @idx is unused */
+	u64 v;
+
+	EBUG_ON(!bkey_packed(k));
+
+	v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3))); /* >> 3: byte offset */
+
+	/*
+	 * In little endian, we're shifting off low bits (and then the bits we
+	 * want are at the low end), in big endian we're shifting off high bits
+	 * (and then the bits we want are at the high end, so we shift them
+	 * back down):
+	 */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	v >>= f->exponent & 7;	/* & 7: remaining bit offset within the byte */
+#else
+	v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS;
+#endif
+	return (u16) v;
+}
+
+static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
+					unsigned j,
+					struct bkey_packed *min_key,
+					struct bkey_packed *max_key)
+{	/* Fill in exponent and mantissa of tree node @j; key_offset must already be set */
+	struct bkey_float *f = bkey_float(b, t, j);
+	struct bkey_packed *m = tree_to_bkey(b, t, j);
+	struct bkey_packed *l = is_power_of_2(j)	/* lower bound for this node */
+		? min_key
+		: tree_to_prev_bkey(b, t, j >> ffs(j));
+	struct bkey_packed *r = is_power_of_2(j + 1)	/* upper bound for this node */
+		? max_key
+		: tree_to_bkey(b, t, j >> (ffz(j) + 1));
+	unsigned mantissa;
+	int shift, exponent, high_bit;
+
+	/*
+	 * for failed bfloats, the lookup code falls back to comparing against
+	 * the original key.
+	 */
+
+	if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) ||
+	    !b->nr_key_bits) {
+		f->exponent = BFLOAT_FAILED_UNPACKED;
+		return;
+	}
+
+	/*
+	 * The greatest differing bit of l and r is the first bit we must
+	 * include in the bfloat mantissa we're creating in order to do
+	 * comparisons - that bit always becomes the high bit of
+	 * bfloat->mantissa, and thus the exponent we're calculating here is
+	 * the position of what will become the low bit in bfloat->mantissa:
+	 *
+	 * Note that this may be negative - we may be running off the low end
+	 * of the key: we handle this later:
+	 */
+	high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
+		       min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1);
+	exponent = high_bit - (BKEY_MANTISSA_BITS - 1);
+
+	/*
+	 * Then we calculate the actual shift value, from the start of the key
+	 * (k->_data), to get the key bits starting at exponent:
+	 */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
+
+	EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64);
+#else
+	shift = high_bit_offset +
+		b->nr_key_bits -
+		exponent -
+		BKEY_MANTISSA_BITS;
+
+	EBUG_ON(shift < KEY_PACKED_BITS_START);
+#endif
+	EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);	/* must fit in u8, below the failed marker */
+
+	f->exponent = shift;	/* stored value is the shift, not the local 'exponent' */
+	mantissa = bkey_mantissa(m, f, j);
+
+	/*
+	 * If we've got garbage bits, set them to all 1s - it's legal for the
+	 * bfloat to compare larger than the original key, but not smaller:
+	 */
+	if (exponent < 0)
+		mantissa |= ~(~0U << -exponent);	/* low -exponent bits set to 1 */
+
+	f->mantissa = mantissa;
+}
+
+/* bytes of aux data from @t's aux_data_offset to the end - only valid for last bset: */
+static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t)
+{
+	bset_aux_tree_verify(b);
+
+	return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
+}
+
+static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t)
+{
+	return __bset_tree_capacity(b, t) /	/* each node costs one bkey_float plus one u8 */
+		(sizeof(struct bkey_float) + sizeof(u8));
+}
+
+static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t)
+{
+	return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);	/* max table entries */
+}
+
+static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
+{	/* Build the lookup table for a bset being written: entries spread across the keys */
+	struct bkey_packed *k;
+
+	t->size = 1;
+	t->extra = BSET_RW_AUX_TREE_VAL;
+	rw_aux_tree(b, t)[0].offset =	/* entry 0 is the first key; its .k is not set here */
+		__btree_node_key_to_offset(b, btree_bkey_first(b, t));
+
+	bset_tree_for_each_key(b, t, k) {
+		if (t->size == bset_rw_tree_capacity(b, t))	/* table is full */
+			break;
+
+		if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) >
+		    L1_CACHE_BYTES)	/* more than L1_CACHE_BYTES past the last entry's key */
+			rw_aux_tree_set(b, t, t->size++, k);
+	}
+}
+
+static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
+{	/* Build the read-only search tree for @t, or leave @t with no aux tree */
+	struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
+	struct bkey_i min_key, max_key;
+	unsigned j, cacheline = 1;
+
+	t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
+		      bset_ro_tree_capacity(b, t));
+retry:
+	if (t->size < 2) {
+		t->size = 0;
+		t->extra = BSET_NO_AUX_TREE_VAL;
+		return;
+	}
+
+	t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
+
+	/* First we figure out where the first key in each cacheline is */
+	eytzinger1_for_each(j, t->size - 1) {
+		while (bkey_to_cacheline(b, t, k) < cacheline)
+			prev = k, k = bkey_p_next(k);
+
+		if (k >= btree_bkey_last(b, t)) {
+			/* Out of keys: shrink, rescan from the first key (XXX: slow) */
+			prev = NULL, k = btree_bkey_first(b, t), cacheline = 1;
+			t->size--;
+			goto retry;
+		}
+
+		ro_aux_tree_prev(b, t)[j] = prev->u64s;	/* first key is in cacheline 0, so prev is set */
+		bkey_float(b, t, j)->key_offset =
+			bkey_to_cacheline_offset(b, t, cacheline++, k);
+
+		EBUG_ON(tree_to_prev_bkey(b, t, j) != prev);
+		EBUG_ON(tree_to_bkey(b, t, j) != k);
+	}
+
+	while (k != btree_bkey_last(b, t))
+		prev = k, k = bkey_p_next(k);
+
+	if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
+		bkey_init(&min_key.k);	/* could not pack: fall back to an unpacked key */
+		min_key.k.p = b->data->min_key;
+	}
+	if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) {
+		bkey_init(&max_key.k);
+		max_key.k.p = b->data->max_key;
+	}
+
+	/* Then we build the tree */
+	eytzinger1_for_each(j, t->size - 1)
+		make_bfloat(b, t, j,
+			    bkey_to_packed(&min_key),
+			    bkey_to_packed(&max_key));
+}
+
+static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
+{	/* Reset @t to no aux tree and choose where its aux data will start */
+	struct bset_tree *i;
+
+	for (i = b->set; i != t; i++)
+		BUG_ON(bset_has_rw_aux_tree(i));	/* no earlier bset may have an rw tree */
+
+	bch2_bset_set_no_aux_tree(b, t);
+
+	/* round up to next cacheline: */
+	t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t),
+				      SMP_CACHE_BYTES / sizeof(u64));	/* offset is in u64s */
+
+	bset_aux_tree_verify(b);
+}
+
+void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t,
+			     bool writeable)
+{
+	/* Nothing to do if the requested kind of aux tree already exists: */
+	if ((writeable && bset_has_rw_aux_tree(t)) ||
+	    (!writeable && bset_has_ro_aux_tree(t)))
+		return;
+
+	bset_alloc_tree(b, t);
+
+	if (!__bset_tree_capacity(b, t))	/* no aux data space left for this bset */
+		return;
+
+	if (!writeable)
+		__build_ro_aux_tree(b, t);
+	else
+		__build_rw_aux_tree(b, t);
+
+	bset_aux_tree_verify(b);
+}
+
+void bch2_bset_init_first(struct btree *b, struct bset *i)
+{	/* Initialize @i as the first bset of @b */
+	struct bset_tree *t;
+
+	BUG_ON(b->nsets);	/* the node must not have any bsets yet */
+
+	memset(i, 0, sizeof(*i));
+	get_random_bytes(&i->seq, sizeof(i->seq));	/* the first bset gets a random seq */
+	SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+
+	t = &b->set[b->nsets++];
+	set_btree_bset(b, t, i);
+}
+
+void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
+			 struct btree_node_entry *bne)
+{	/* Initialize the bset inside @bne as the next bset of @b */
+	struct bset *i = &bne->keys;
+	struct bset_tree *t;
+
+	BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
+	BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); /* overlaps last bset */
+	BUG_ON(b->nsets >= MAX_BSETS);
+
+	memset(i, 0, sizeof(*i));
+	i->seq = btree_bset_first(b)->seq;	/* later bsets copy the first bset's seq */
+	SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+
+	t = &b->set[b->nsets++];
+	set_btree_bset(b, t, i);
+}
+
+/*
+ * find _some_ key in the same bset as @k that precedes @k - not necessarily the
+ * immediate predecessor:
+ */
+static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
+				       struct bkey_packed *k)
+{
+	struct bkey_packed *p;
+	int j;
+
+	EBUG_ON(k < btree_bkey_first(b, t) ||
+		k > btree_bkey_last(b, t));
+
+	if (k == btree_bkey_first(b, t))	/* nothing precedes the first key */
+		return NULL;
+
+	switch (bset_aux_tree_type(t)) {
+	case BSET_NO_AUX_TREE:
+		p = btree_bkey_first(b, t);
+		break;
+	case BSET_RO_AUX_TREE:
+		j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k));
+
+		do {	/* step back a cacheline at a time until the key is before k */
+			p = j ? tree_to_bkey(b, t,
+					__inorder_to_eytzinger1(j--,
+							t->size - 1, t->extra))
+			      : btree_bkey_first(b, t);
+		} while (p >= k);
+		break;
+	case BSET_RW_AUX_TREE:
+		j = rw_aux_tree_bsearch(b, t, __btree_node_key_to_offset(b, k));
+		p = j ? rw_aux_to_bkey(b, t, j - 1)
+		      : btree_bkey_first(b, t);
+		break;
+	default:
+		BUG();	/* unknown aux tree type: never return an uninitialized p */
+	}
+
+	return p;
+}
+
+struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
+					  struct bset_tree *t,
+					  struct bkey_packed *k,
+					  unsigned min_key_type)
+{	/* Last key before @k in @t with type >= @min_key_type, or NULL if there is none */
+	struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
+
+	while ((p = __bkey_prev(b, t, k)) && !ret) {
+		for (i = p; i != k; i = bkey_p_next(i))	/* scan [p, k), keeping the last match */
+			if (i->type >= min_key_type)
+				ret = i;
+
+		k = p;	/* no match in this stretch: continue further back */
+	}
+
+	if (bch2_expensive_debug_checks) {
+		BUG_ON(ret >= orig_k);
+
+		for (i = ret
+			? bkey_p_next(ret)
+			: btree_bkey_first(b, t);
+		     i != orig_k;
+		     i = bkey_p_next(i))
+			BUG_ON(i->type >= min_key_type);	/* a later match was missed */
+	}
+
+	return ret;
+}
+
+/* Insert */
+
+static void bch2_bset_fix_lookup_table(struct btree *b,
+				       struct bset_tree *t,
+				       struct bkey_packed *_where,
+				       unsigned clobber_u64s,
+				       unsigned new_u64s)
+{	/* Update @t's rw table after clobber_u64s at @_where were replaced by new_u64s */
+	int shift = new_u64s - clobber_u64s;	/* how far keys after @_where moved, in u64s */
+	unsigned l, j, where = __btree_node_key_to_offset(b, _where);
+
+	EBUG_ON(bset_has_ro_aux_tree(t));
+
+	if (!bset_has_rw_aux_tree(t))
+		return;
+
+	/* returns first entry >= where */
+	l = rw_aux_tree_bsearch(b, t, where);
+
+	if (!l) /* never delete first entry */
+		l++;
+	else if (l < t->size &&
+		 where < t->end_offset &&
+		 rw_aux_tree(b, t)[l].offset == where)
+		rw_aux_tree_set(b, t, l++, _where);	/* entry at @where: refresh it in place */
+
+	/* l now > where */
+
+	for (j = l;	/* find the first entry at or past the end of the clobbered range */
+	     j < t->size &&
+	     rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
+	     j++)
+		;
+
+	if (j < t->size &&	/* skip an entry that would duplicate entry l - 1 after shifting */
+	    rw_aux_tree(b, t)[j].offset + shift ==
+	    rw_aux_tree(b, t)[l - 1].offset)
+		j++;
+
+	memmove(&rw_aux_tree(b, t)[l],	/* drop entries [l, j) */
+		&rw_aux_tree(b, t)[j],
+		(void *) &rw_aux_tree(b, t)[t->size] -
+		(void *) &rw_aux_tree(b, t)[j]);
+	t->size -= j - l;
+
+	for (j = l; j < t->size; j++)
+		rw_aux_tree(b, t)[j].offset += shift;
+
+	EBUG_ON(l < t->size &&
+		rw_aux_tree(b, t)[l].offset ==
+		rw_aux_tree(b, t)[l - 1].offset);
+
+	if (t->size < bset_rw_tree_capacity(b, t) &&	/* room for one more entry, and ... */
+	    (l < t->size
+	     ? rw_aux_tree(b, t)[l].offset
+	     : t->end_offset) -
+	    rw_aux_tree(b, t)[l - 1].offset >
+	    L1_CACHE_BYTES / sizeof(u64)) {	/* ... the gap after entry l - 1 is too wide */
+		struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
+		struct bkey_packed *end = l < t->size
+			? rw_aux_to_bkey(b, t, l)
+			: btree_bkey_last(b, t);
+		struct bkey_packed *k = start;
+
+		while (1) {
+			k = bkey_p_next(k);
+			if (k == end)
+				break;
+
+			if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
+				memmove(&rw_aux_tree(b, t)[l + 1],	/* open a slot at l */
+					&rw_aux_tree(b, t)[l],
+					(void *) &rw_aux_tree(b, t)[t->size] -
+					(void *) &rw_aux_tree(b, t)[l]);
+				t->size++;
+				rw_aux_tree_set(b, t, l, k);
+				break;
+			}
+		}
+	}
+
+	bch2_bset_verify_rw_aux_tree(b, t);
+	bset_aux_tree_verify(b);
+}
+
+void bch2_bset_insert(struct btree *b,
+		      struct btree_node_iter *iter,
+		      struct bkey_packed *where,
+		      struct bkey_i *insert,
+		      unsigned clobber_u64s)
+{	/* Write @insert at @where in the last bset, replacing clobber_u64s u64s there */
+	struct bkey_format *f = &b->format;
+	struct bset_tree *t = bset_tree_last(b);
+	struct bkey_packed packed, *src = bkey_to_packed(insert);
+
+	bch2_bset_verify_rw_aux_tree(b, t);
+	bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s);
+
+	if (bch2_bkey_pack_key(&packed, &insert->k, f))	/* nonzero: use the packed copy */
+		src = &packed;
+
+	if (!bkey_deleted(&insert->k))
+		btree_keys_account_key_add(&b->nr, t - b->set, src);
+
+	if (src->u64s != clobber_u64s) {	/* size changed: move the tail of the bset */
+		u64 *src_p = (u64 *) where->_data + clobber_u64s;
+		u64 *dst_p = (u64 *) where->_data + src->u64s;
+
+		EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
+			(int) clobber_u64s - src->u64s);
+
+		memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
+		le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
+		set_btree_bset_end(b, t);
+	}
+
+	memcpy_u64s_small(where, src,	/* key part comes from src ... */
+		    bkeyp_key_u64s(f, src));
+	memcpy_u64s(bkeyp_val(f, where), &insert->v,	/* ... the value from @insert */
+		    bkeyp_val_u64s(f, src));
+
+	if (src->u64s != clobber_u64s)
+		bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
+
+	bch2_verify_btree_nr_keys(b);
+}
+
+void bch2_bset_delete(struct btree *b,
+		      struct bkey_packed *where,
+		      unsigned clobber_u64s)
+{	/* Remove clobber_u64s u64s at @where from the last bset */
+	struct bset_tree *t = bset_tree_last(b);
+	u64 *src_p = (u64 *) where->_data + clobber_u64s;
+	u64 *dst_p = where->_data;
+
+	bch2_bset_verify_rw_aux_tree(b, t);
+
+	EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);
+
+	memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); /* close the gap */
+	le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
+	set_btree_bset_end(b, t);
+
+	bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0);	/* new size is 0 */
+}
+
+/* Lookup */
+
+__flatten
+static struct bkey_packed *bset_search_write_set(const struct btree *b,
+				struct bset_tree *t,
+				struct bpos *search)
+{	/* Binary search of the rw table; the caller continues linearly from the result */
+	unsigned l = 0, r = t->size;
+
+	while (l + 1 != r) {	/* m is always >= 1, so entry 0's .k is never compared */
+		unsigned m = (l + r) >> 1;
+
+		if (bpos_lt(rw_aux_tree(b, t)[m].k, *search))
+			l = m;
+		else
+			r = m;
+	}
+
+	return rw_aux_to_bkey(b, t, l);	/* entry 0, or the last entry with pos < *search */
+}
+
+static inline void prefetch_four_cachelines(void *p)
+{	/* Prefetch four consecutive cachelines starting at @p */
+#ifdef CONFIG_X86_64
+	asm("prefetcht0 (-127 + 64 * 0)(%0);"	/* the -127 and p + 127 cancel: p + 64 * i */
+	    "prefetcht0 (-127 + 64 * 1)(%0);"
+	    "prefetcht0 (-127 + 64 * 2)(%0);"
+	    "prefetcht0 (-127 + 64 * 3)(%0);"
+	    :
+	    : "r" (p + 127));
+#else
+	prefetch(p + L1_CACHE_BYTES * 0);
+	prefetch(p + L1_CACHE_BYTES * 1);
+	prefetch(p + L1_CACHE_BYTES * 2);
+	prefetch(p + L1_CACHE_BYTES * 3);
+#endif
+}
+
+static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
+					      const struct bkey_float *f,
+					      unsigned idx)
+{	/* True if key bits exist below the mantissa's low bit; @idx is unused */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits;
+
+	return f->exponent > key_bits_start;
+#else
+	unsigned key_bits_end = high_bit_offset + b->nr_key_bits;
+
+	return f->exponent + BKEY_MANTISSA_BITS < key_bits_end;
+#endif
+}
+
+__flatten
+static struct bkey_packed *bset_search_tree(const struct btree *b,
+				const struct bset_tree *t,
+				const struct bpos *search,
+				const struct bkey_packed *packed_search)
+{	/* Walk the ro aux tree; returns a key for the caller's linear search to start at */
+	struct ro_aux_tree *base = ro_aux_tree_base(b, t);
+	struct bkey_float *f;
+	struct bkey_packed *k;
+	unsigned inorder, n = 1, l, r;	/* n: current node; its children are 2n and 2n + 1 */
+	int cmp;
+
+	do {
+		if (likely(n << 4 < t->size))	/* prefetch the node four levels down */
+			prefetch(&base->f[n << 4]);
+
+		f = &base->f[n];
+		if (unlikely(f->exponent >= BFLOAT_FAILED))
+			goto slowpath;
+
+		l = f->mantissa;
+		r = bkey_mantissa(packed_search, f, n);
+
+		if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n)) /* tie may be false */
+			goto slowpath;
+
+		n = n * 2 + (l < r);
+		continue;
+slowpath:	/* compare against the real key instead of the bfloat */
+		k = tree_to_bkey(b, t, n);
+		cmp = bkey_cmp_p_or_unp(b, k, packed_search, search);
+		if (!cmp)
+			return k;
+
+		n = n * 2 + (cmp < 0);
+	} while (n < t->size);
+
+	inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra); /* last node visited */
+
+	/*
+	 * n would have been the node we recursed to - the low bit tells us if
+	 * we recursed left or recursed right.
+	 */
+	if (likely(!(n & 1))) {
+		--inorder;
+		if (unlikely(!inorder))
+			return btree_bkey_first(b, t);
+
+		f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)];
+	}
+
+	return cacheline_to_bkey(b, t, inorder, f->key_offset);
+}
+
+static __always_inline __flatten
+struct bkey_packed *__bch2_bset_search(struct btree *b,
+				struct bset_tree *t,
+				struct bpos *search,
+				const struct bkey_packed *lossy_packed_search)
+{
+
+	/*
+	 * First, we search for a cacheline, then lastly we do a linear search
+	 * within that cacheline.
+	 *
+	 * To search for the cacheline, there's three different possibilities:
+	 *  * The set is too small to have a search tree, so we just do a linear
+	 *    search over the whole set.
+	 *  * The set is the one we're currently inserting into; keeping a full
+	 *    auxiliary search tree up to date would be too expensive, so we
+	 *    use a much simpler lookup table to do a binary search -
+	 *    bset_search_write_set().
+	 *  * Or we use the auxiliary search tree we constructed earlier -
+	 *    bset_search_tree()
+	 */
+
+	switch (bset_aux_tree_type(t)) {
+	case BSET_NO_AUX_TREE:
+		return btree_bkey_first(b, t);	/* start position for a search of the whole set */
+	case BSET_RW_AUX_TREE:
+		return bset_search_write_set(b, t, search);
+	case BSET_RO_AUX_TREE:
+		return bset_search_tree(b, t, search, lossy_packed_search);
+	default:
+		BUG();
+	}
+}
+
+static __always_inline __flatten
+struct bkey_packed *bch2_bset_search_linear(struct btree *b,
+				struct bset_tree *t,
+				struct bpos *search,
+				struct bkey_packed *packed_search,
+				const struct bkey_packed *lossy_packed_search,
+				struct bkey_packed *m)
+{	/* Advance @m linearly to the first key in @t that does not compare below the search key */
+	if (lossy_packed_search)	/* first pass: compare against the lossy packed key */
+		while (m != btree_bkey_last(b, t) &&
+		       bkey_iter_cmp_p_or_unp(b, m,
+					lossy_packed_search, search) < 0)
+			m = bkey_p_next(m);
+
+	if (!packed_search)	/* no packed search key: finish against the unpacked pos */
+		while (m != btree_bkey_last(b, t) &&
+		       bkey_iter_pos_cmp(b, m, search) < 0)
+			m = bkey_p_next(m);
+
+	if (bch2_expensive_debug_checks) {
+		struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
+
+		BUG_ON(prev &&	/* the key before the result must compare below the search key */
+		       bkey_iter_cmp_p_or_unp(b, prev,
+					packed_search, search) >= 0);
+	}
+
+	return m;
+}
+
+/* Btree node iterator */
+
+/* Append the key range [@k, @end) to @iter; the slots are not re-sorted. */
+static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
+			      struct btree *b,
+			      const struct bkey_packed *k,
+			      const struct bkey_packed *end)
+{
+	struct btree_node_iter_set *slot;
+
+	if (k == end)
+		return;
+
+	btree_node_iter_for_each(iter, slot)
+		;
+
+	BUG_ON(slot >= iter->data + ARRAY_SIZE(iter->data));
+	*slot = (struct btree_node_iter_set) { __btree_node_key_to_offset(b, k),
+					       __btree_node_key_to_offset(b, end) };
+}
+
+/* Add the key range [@k, @end) to @iter, then re-sort the iterator's slots. */
+void bch2_btree_node_iter_push(struct btree_node_iter *iter, struct btree *b,
+			       const struct bkey_packed *k,
+			       const struct bkey_packed *end)
+{
+	__bch2_btree_node_iter_push(iter, b, k, end);	/* appends unsorted */
+	bch2_btree_node_iter_sort(iter, b);
+}
+
+/* Fallback for when @search could not be packed: step forward from @b's start. */
+noinline __flatten __cold
+static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
+			      struct btree *b, struct bpos *search)
+{
+	struct bkey_packed *cur;
+
+	trace_bkey_pack_pos_fail(search);
+	bch2_btree_node_iter_init_from_start(iter, b);
+	for (cur = bch2_btree_node_iter_peek(iter, b);
+	     cur && bkey_iter_pos_cmp(b, cur, search) < 0;
+	     cur = bch2_btree_node_iter_peek(iter, b))
+		bch2_btree_node_iter_advance(iter, b);
+}
+
+/**
+ * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a
+ * given position
+ *
+ * @iter:	iterator to initialize
+ * @b:		btree node to search
+ * @search:	search key
+ *
+ * Main entry point to the lookup code for individual btree nodes:
+ *
+ * NOTE:
+ *
+ * When you don't filter out deleted keys, btree nodes _do_ contain duplicate
+ * keys. This doesn't matter for most code, but it does matter for lookups.
+ *
+ * Some adjacent keys with a string of equal keys:
+ *	i j k k k k l m
+ *
+ * If you search for k, the lookup code isn't guaranteed to return you any
+ * specific k. The lookup code is conceptually doing a binary search and
+ * iterating backwards is very expensive so if the pivot happens to land at the
+ * last k that's what you'll get.
+ *
+ * This works out ok, but it's something to be aware of:
+ *
+ *  - For non extents, we guarantee that the live key comes last - see
+ *    btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't
+ *    see will only be deleted keys you don't care about.
+ *
+ *  - For extents, deleted keys sort last (see the comment at the top of this
+ *    file). But when you're searching for extents, you actually want the first
+ *    key strictly greater than your search key - an extent that compares equal
+ *    to the search key is going to have 0 sectors after the search key.
+ *
+ *    But this does mean that we can't just search for
+ *    bpos_successor(start_of_range) to get the first extent that overlaps with
+ *    the range we want - if we're unlucky and there's an extent that ends
+ *    exactly where we searched, then there could be a deleted key at the same
+ *    position and we'd get that when we search instead of the preceding extent
+ *    we needed.
+ *
+ *    So we've got to search for start_of_range, then after the lookup iterate
+ *    past any extents that compare equal to the position we searched for.
+ */
+__flatten
+void bch2_btree_node_iter_init(struct btree_node_iter *iter,
+			       struct btree *b, struct bpos *search)
+{
+	struct bkey_packed p, *packed_search = NULL;
+	struct btree_node_iter_set *pos = iter->data;
+	struct bkey_packed *k[MAX_BSETS];
+	unsigned i;
+
+	EBUG_ON(bpos_lt(*search, b->data->min_key));
+	EBUG_ON(bpos_gt(*search, b->data->max_key));
+	bset_aux_tree_verify(b);
+
+	memset(iter, 0, sizeof(*iter));
+	/* p is always passed on as the lossy key; packed_search only if exact: */
+	switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) {
+	case BKEY_PACK_POS_EXACT:
+		packed_search = &p;
+		break;
+	case BKEY_PACK_POS_SMALLER:
+		packed_search = NULL;
+		break;
+	case BKEY_PACK_POS_FAIL:
+		btree_node_iter_init_pack_failed(iter, b, search);
+		return;
+	}
+	/* coarse search of every bset first, issuing a prefetch for each result: */
+	for (i = 0; i < b->nsets; i++) {
+		k[i] = __bch2_bset_search(b, b->set + i, search, &p);
+		prefetch_four_cachelines(k[i]);
+	}
+	/* then the linear scans; a bset with keys left gets an iterator slot: */
+	for (i = 0; i < b->nsets; i++) {
+		struct bset_tree *t = b->set + i;
+		struct bkey_packed *end = btree_bkey_last(b, t);
+
+		k[i] = bch2_bset_search_linear(b, t, search,
+					       packed_search, &p, k[i]);
+		if (k[i] != end)
+			*pos++ = (struct btree_node_iter_set) {
+				__btree_node_key_to_offset(b, k[i]),
+				__btree_node_key_to_offset(b, end)
+			};
+	}
+
+	bch2_btree_node_iter_sort(iter, b);
+}
+
+/* Reset @iter so that it covers every key of every bset in @b. */
+void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
+					  struct btree *b)
+{
+	struct bset_tree *cur;
+
+	memset(iter, 0, sizeof(*iter));
+
+	for_each_bset(b, cur)
+		__bch2_btree_node_iter_push(iter, b, btree_bkey_first(b, cur),
+					    btree_bkey_last(b, cur));
+	bch2_btree_node_iter_sort(iter, b);
+}
+
+struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter,
+						  struct btree *b,
+						  struct bset_tree *t)
+{
+	struct btree_node_iter_set *set;
+
+	btree_node_iter_for_each(iter, set)
+		if (set->end == t->end_offset)	/* the slot that ends where @t ends */
+			return __btree_node_offset_to_key(b, set->k);
+	/* @iter holds no slot for @t: report the end of that bset */
+	return btree_bkey_last(b, t);
+}
+
+/*
+ * Sort slots @first and @first + 1 of @iter; returns true if it swapped them.
+ */
+static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter,
+					    struct btree *b, unsigned first)
+{
+	if (btree_node_iter_cmp(b, iter->data[first], iter->data[first + 1]) <= 0)
+		return false;
+
+	swap(iter->data[first], iter->data[first + 1]);
+	return true;
+}
+
+void bch2_btree_node_iter_sort(struct btree_node_iter *iter,
+			       struct btree *b)
+{
+	/* unrolled bubble sort: */
+	/* first pass, only if slot 2 is in use: order (0,1), then (1,2) */
+	if (!__btree_node_iter_set_end(iter, 2)) {
+		btree_node_iter_sort_two(iter, b, 0);
+		btree_node_iter_sort_two(iter, b, 1);
+	}
+	/* second pass, only if slot 1 is in use: order (0,1) once more */
+	if (!__btree_node_iter_set_end(iter, 1))
+		btree_node_iter_sort_two(iter, b, 0);
+}
+
+/* Remove @set from @iter: shift the later slots down and clear the last one. */
+void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter,
+				   struct btree_node_iter_set *set)
+{
+	struct btree_node_iter_set *tail = &iter->data[ARRAY_SIZE(iter->data) - 1];
+
+	memmove(set, set + 1, (tail - set) * sizeof(*set));
+	*tail = (struct btree_node_iter_set) { 0, 0 };
+}
+
+static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
+						  struct btree *b)
+{
+	iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s;
+
+	EBUG_ON(iter->data->k > iter->data->end);
+	/* slot 0 has run out of keys: shift the other slots down by one */
+	if (unlikely(__btree_node_iter_set_end(iter, 0))) {
+		/* avoid an expensive memmove call: */
+		iter->data[0] = iter->data[1];
+		iter->data[1] = iter->data[2];
+		iter->data[2] = (struct btree_node_iter_set) { 0, 0 };
+		return;
+	}
+	/* otherwise re-sort slot 0 into place, stopping as early as possible: */
+	if (__btree_node_iter_set_end(iter, 1))
+		return;
+
+	if (!btree_node_iter_sort_two(iter, b, 0))
+		return;
+
+	if (__btree_node_iter_set_end(iter, 2))
+		return;
+
+	btree_node_iter_sort_two(iter, b, 1);
+}
+
+/* Step @iter past its current key, verifying first if debug checks are on. */
+void bch2_btree_node_iter_advance(struct btree_node_iter *iter, struct btree *b)
+{
+	if (bch2_expensive_debug_checks) {
+		bch2_btree_node_iter_verify(iter, b);
+		bch2_btree_node_iter_next_check(iter, b);
+	}
+
+	__bch2_btree_node_iter_advance(iter, b);
+}
+
+/*
+ * Expensive:
+ */
+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
+						  struct btree *b)
+{
+	struct bkey_packed *k, *prev = NULL;
+	struct btree_node_iter_set *set;
+	struct bset_tree *t;
+	unsigned end = 0;
+
+	if (bch2_expensive_debug_checks)
+		bch2_btree_node_iter_verify(iter, b);
+	/* of the keys just before @iter's position in each bset, keep the greatest: */
+	for_each_bset(b, t) {
+		k = bch2_bkey_prev_all(b, t,
+			bch2_btree_node_iter_bset_pos(iter, b, t));
+		if (k &&
+		    (!prev || bkey_iter_cmp(b, k, prev) > 0)) {
+			prev = k;
+			end = t->end_offset;
+		}
+	}
+	/* no bset has a key before the iterator's position */
+	if (!prev)
+		return NULL;
+
+	/*
+	 * We're manually memmoving instead of just calling sort() to ensure the
+	 * prev we picked ends up in slot 0 - sort won't necessarily put it
+	 * there because of duplicate deleted keys:
+	 */
+	btree_node_iter_for_each(iter, set)
+		if (set->end == end)
+			goto found;
+
+	BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]);
+found:
+	BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data));
+	/* shift slots [0, set) up one place, freeing slot 0 for prev's bset: */
+	memmove(&iter->data[1],
+		&iter->data[0],
+		(void *) set - (void *) &iter->data[0]);
+
+	iter->data[0].k = __btree_node_key_to_offset(b, prev);
+	iter->data[0].end = end;
+
+	if (bch2_expensive_debug_checks)
+		bch2_btree_node_iter_verify(iter, b);
+	return prev;
+}
+
+/* Step @iter back to the previous non-deleted key; NULL if there is none. */
+struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter,
+					      struct btree *b)
+{
+	struct bkey_packed *k;
+
+	while ((k = bch2_btree_node_iter_prev_all(iter, b)) && bkey_deleted(k))
+		;
+
+	return k;
+}
+
+/* Peek @iter's next non-deleted key, disassembled via @u; null key at end. */
+struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
+						 struct btree *b, struct bkey *u)
+{
+	struct bkey_packed *packed = bch2_btree_node_iter_peek(iter, b);
+
+	return !packed ? bkey_s_c_null : bkey_disassemble(b, packed, u);
+}
+
+/* Statistics and debug output */
+
+/* Add @b's per-aux-tree-type bset counts and sizes, and float totals, to @stats. */
+void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats)
+{
+	const struct bset_tree *t;
+	size_t idx;
+
+	for_each_bset(b, t) {
+		enum bset_aux_tree_type kind = bset_aux_tree_type(t);
+
+		stats->sets[kind].nr += 1;
+		stats->sets[kind].bytes += sizeof(u64) * le16_to_cpu(bset(b, t)->u64s);
+
+		if (kind != BSET_RO_AUX_TREE)
+			continue;
+
+		/* entries 1 .. size - 1 are the floats that get inspected */
+		stats->floats += t->size - 1;
+		for (idx = 1; idx < t->size; idx++)
+			if (bkey_float(b, t, idx)->exponent == BFLOAT_FAILED)
+				stats->failed++;
+	}
+}
+
+/*
+ * Debug output: if @k is the key that an auxiliary-tree float of its bset maps
+ * to, and that float is marked BFLOAT_FAILED, print its depth and position.
+ */
+void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
+			 struct bkey_packed *k)
+{
+	struct bset_tree *t = bch2_bkey_to_bset(b, k);
+	unsigned inorder, idx;
+	struct bkey unpacked;
+
+	if (!bset_has_ro_aux_tree(t))
+		return;
+
+	inorder = bkey_to_cacheline(b, t, k);
+	if (inorder == 0 || inorder >= t->size)
+		return;
+
+	idx = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra);
+	if (tree_to_bkey(b, t, idx) != k)
+		return;
+
+	if (bkey_float(b, t, idx)->exponent != BFLOAT_FAILED)
+		return;
+
+	unpacked = bkey_unpack_key(b, k);
+	prt_printf(out, "    failed unpacked at depth %u\n" "\t", ilog2(idx));
+	bch2_bpos_to_text(out, unpacked.p);
+	prt_printf(out, "\n");
+}
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
new file mode 100644
index 000000000000..632c2b8c5460
--- /dev/null
+++ b/fs/bcachefs/bset.h
@@ -0,0 +1,541 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BSET_H
+#define _BCACHEFS_BSET_H
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include "bcachefs.h"
+#include "bkey.h"
+#include "bkey_methods.h"
+#include "btree_types.h"
+#include "util.h" /* for time_stats */
+#include "vstructs.h"
+
+/*
+ * BKEYS:
+ *
+ * A bkey contains a key, a size field, a variable number of pointers, and some
+ * ancillary flag bits.
+ *
+ * We use two different functions for validating bkeys, bkey_invalid and
+ * bkey_deleted().
+ *
+ * The one exception to the rule that ptr_invalid() filters out invalid keys is
+ * that it also filters out keys of size 0 - these are keys that have been
+ * completely overwritten. It'd be safe to delete these in memory while leaving
+ * them on disk, just unnecessary work - so we filter them out when resorting
+ * instead.
+ *
+ * We can't filter out stale keys when we're resorting, because garbage
+ * collection needs to find them to ensure bucket gens don't wrap around -
+ * unless we're rewriting the btree node those stale keys still exist on disk.
+ *
+ * We also implement functions here for removing some number of sectors from the
+ * front or the back of a bkey - this is mainly used for fixing overlapping
+ * extents, by removing the overlapping sectors from the older key.
+ *
+ * BSETS:
+ *
+ * A bset is an array of bkeys laid out contiguously in memory in sorted order,
+ * along with a header. A btree node is made up of a number of these, written at
+ * different times.
+ *
+ * There could be many of them on disk, but we never allow there to be more than
+ * 4 in memory - we lazily resort as needed.
+ *
+ * We implement code here for creating and maintaining auxiliary search trees
+ * (described below) for searching an individual bset, and on top of that we
+ * implement a btree iterator.
+ *
+ * BTREE ITERATOR:
+ *
+ * Most of the code in bcache doesn't care about an individual bset - it needs
+ * to search entire btree nodes and iterate over them in sorted order.
+ *
+ * The btree iterator code serves both functions; it iterates through the keys
+ * in a btree node in sorted order, starting from either keys after a specific
+ * point (if you pass it a search key) or the start of the btree node.
+ *
+ * AUXILIARY SEARCH TREES:
+ *
+ * Since keys are variable length, we can't use a binary search on a bset - we
+ * wouldn't be able to find the start of the next key. But binary searches are
+ * slow anyways, due to terrible cache behaviour; bcache originally used binary
+ * searches and that code topped out at under 50k lookups/second.
+ *
+ * So we need to construct some sort of lookup table. Since we only insert keys
+ * into the last (unwritten) set, most of the keys within a given btree node are
+ * usually in sets that are mostly constant. We use two different types of
+ * lookup tables to take advantage of this.
+ *
+ * Both lookup tables share in common that they don't index every key in the
+ * set; they index one key every BSET_CACHELINE bytes, and then a linear search
+ * is used for the rest.
+ *
+ * For sets that have been written to disk and are no longer being inserted
+ * into, we construct a binary search tree in an array - traversing a binary
+ * search tree in an array gives excellent locality of reference and is very
+ * fast, since both children of any node are adjacent to each other in memory
+ * (and their grandchildren, and great grandchildren...) - this means
+ * prefetching can be used to great effect.
+ *
+ * It's quite useful performance wise to keep these nodes small - not just
+ * because they're more likely to be in L2, but also because we can prefetch
+ * more nodes on a single cacheline and thus prefetch more iterations in advance
+ * when traversing this tree.
+ *
+ * Nodes in the auxiliary search tree must contain both a key to compare against
+ * (we don't want to fetch the key from the set, that would defeat the purpose),
+ * and a pointer to the key. We use a few tricks to compress both of these.
+ *
+ * To compress the pointer, we take advantage of the fact that one node in the
+ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
+ * a function (to_inorder()) that takes the index of a node in a binary tree and
+ * returns what its index would be in an inorder traversal, so we only have to
+ * store the low bits of the offset.
+ *
+ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
+ * compress that,  we take advantage of the fact that when we're traversing the
+ * search tree at every iteration we know that both our search key and the key
+ * we're looking for lie within some range - bounded by our previous
+ * comparisons. (We special case the start of a search so that this is true even
+ * at the root of the tree).
+ *
+ * So if we know the key we're looking for is between a and b, and a and b
+ * don't differ higher than bit 50, we don't need to check anything higher than
+ * bit 50.
+ *
+ * We don't usually need the rest of the bits, either; we only need enough bits
+ * to partition the key range we're currently checking.  Consider key n - the
+ * key our auxiliary search tree node corresponds to, and key p, the key
+ * immediately preceding n.  The lowest bit we need to store in the auxiliary
+ * search tree is the highest bit that differs between n and p.
+ *
+ * Note that this could be bit 0 - we might sometimes need all 80 bits to do the
+ * comparison. But we'd really like our nodes in the auxiliary search tree to be
+ * of fixed size.
+ *
+ * The solution is to make them fixed size, and when we're constructing a node
+ * check if p and n differed in the bits we needed them to. If they don't we
+ * flag that node, and when doing lookups we fallback to comparing against the
+ * real key. As long as this doesn't happen too often (and it seems to reliably
+ * happen a bit less than 1% of the time), we win - even on failures, that key
+ * is then more likely to be in cache than if we were doing binary searches all
+ * the way, since we're touching so much less memory.
+ *
+ * The keys in the auxiliary search tree are stored in (software) floating
+ * point, with an exponent and a mantissa. The exponent needs to be big enough
+ * to address all the bits in the original key, but the number of bits in the
+ * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
+ *
+ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
+ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
+ * We need one node per 128 bytes in the btree node, which means the auxiliary
+ * search trees take up 3% as much memory as the btree itself.
+ *
+ * Constructing these auxiliary search trees is moderately expensive, and we
+ * don't want to be constantly rebuilding the search tree for the last set
+ * whenever we insert another key into it. For the unwritten set, we use a much
+ * simpler lookup table - it's just a flat array, so index i in the lookup table
+ * corresponds to the i'th range of BSET_CACHELINE bytes in the set. Indexing
+ * within each byte range works the same as with the auxiliary search trees.
+ *
+ * These are much easier to keep up to date when we insert a key - we do it
+ * somewhat lazily; when we shift a key up we usually just increment the pointer
+ * to it, only when it would overflow do we go to the trouble of finding the
+ * first key in that range of bytes again.
+ */
+
+enum bset_aux_tree_type {
+	BSET_NO_AUX_TREE,	/* t->extra == BSET_NO_AUX_TREE_VAL */
+	BSET_RO_AUX_TREE,	/* any other t->extra value */
+	BSET_RW_AUX_TREE,	/* t->extra == BSET_RW_AUX_TREE_VAL */
+};
+
+#define BSET_TREE_NR_TYPES	3
+
+#define BSET_NO_AUX_TREE_VAL	(U16_MAX)
+#define BSET_RW_AUX_TREE_VAL	(U16_MAX - 1)
+
+/* Classify @t's auxiliary search structure from the sentinel in t->extra. */
+static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t)
+{
+	if (t->extra == BSET_NO_AUX_TREE_VAL) {
+		EBUG_ON(t->size);
+		return BSET_NO_AUX_TREE;
+	}
+
+	/* both remaining kinds are expected to have a non-zero size */
+	EBUG_ON(!t->size);
+
+	return t->extra == BSET_RW_AUX_TREE_VAL
+		? BSET_RW_AUX_TREE : BSET_RO_AUX_TREE;
+}
+
+/*
+ * BSET_CACHELINE was originally intended to match the hardware cacheline size -
+ * it used to be 64, but I realized the lookup code would touch slightly less
+ * memory if it was 128.
+ *
+ * It defines the number of bytes (in struct bset) per struct bkey_float in
+ * the auxiliary search tree - when we're done searching the bset_float tree we
+ * have this many bytes left that we do a linear search over.
+ *
+ * Since (after level 5) every level of the bset_tree is on a new cacheline,
+ * we're touching one fewer cacheline in the bset tree in exchange for one more
+ * cacheline in the linear search - but the linear search might stop before it
+ * gets to the second cacheline.
+ */
+
+#define BSET_CACHELINE		256
+
+static inline size_t btree_keys_cachelines(const struct btree *b)
+{
+	return (1U << b->byte_order) / BSET_CACHELINE;	/* 2^byte_order bytes, in cachelines */
+}
+
+static inline size_t btree_aux_data_bytes(const struct btree *b)
+{
+	return btree_keys_cachelines(b) * 8;	/* 8 bytes of aux data per cacheline */
+}
+
+static inline size_t btree_aux_data_u64s(const struct btree *b)
+{
+	return btree_aux_data_bytes(b) / sizeof(u64);	/* same size, in u64 units */
+}
+
+#define for_each_bset(_b, _t)						\
+	for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
+
+#define bset_tree_for_each_key(_b, _t, _k)				\
+	for (_k = btree_bkey_first(_b, _t);				\
+	     _k != btree_bkey_last(_b, _t);				\
+	     _k = bkey_p_next(_k))
+
+static inline bool bset_has_ro_aux_tree(const struct bset_tree *t)
+{
+	return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;	/* @t has a read-only aux tree */
+}
+
+static inline bool bset_has_rw_aux_tree(const struct bset_tree *t)
+{
+	return bset_aux_tree_type(t) == BSET_RW_AUX_TREE;	/* @t has a read-write aux tree */
+}
+
+/* Mark @t and every later bset_tree slot of @b as having no auxiliary tree. */
+static inline void bch2_bset_set_no_aux_tree(struct btree *b, struct bset_tree *t)
+{
+	BUG_ON(t < b->set);
+	while (t < b->set + ARRAY_SIZE(b->set)) {
+		t->aux_data_offset = U16_MAX;
+		t->extra = BSET_NO_AUX_TREE_VAL;
+		t->size = 0;
+		t++;
+	}
+}
+
+/*
+ * Install key format @f on @b and reset all of its bsets to "no aux tree".
+ */
+static inline void btree_node_set_format(struct btree *b, struct bkey_format f)
+{
+	int compiled_len;
+
+	b->format = f;
+	b->nr_key_bits = bkey_format_key_bits(&f);
+
+	compiled_len = bch2_compile_bkey_format(&b->format, b->aux_data);
+	BUG_ON(compiled_len < 0 || compiled_len > U8_MAX);
+	b->unpack_fn_len = compiled_len;	/* range-checked just above */
+	bch2_bset_set_no_aux_tree(b, b->set);
+}
+
+/* Address of @b's last bset plus its size rounded up to @block_bytes. */
+static inline struct bset *bset_next_set(struct btree *b, unsigned block_bytes)
+{
+	struct bset *last = btree_bset_last(b);
+	void *start = last;
+
+	EBUG_ON(!is_power_of_2(block_bytes));
+	return start + round_up(vstruct_bytes(last), block_bytes);
+}
+
+void bch2_btree_keys_init(struct btree *);
+
+void bch2_bset_init_first(struct btree *, struct bset *);
+void bch2_bset_init_next(struct bch_fs *, struct btree *,
+			 struct btree_node_entry *);
+void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
+
+void bch2_bset_insert(struct btree *, struct btree_node_iter *,
+		     struct bkey_packed *, struct bkey_i *, unsigned);
+void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned);
+
+/* Bkey utility code */
+
+/* packed or unpacked */
+static inline int bkey_cmp_p_or_unp(const struct btree *b,
+				    const struct bkey_packed *l,
+				    const struct bkey_packed *r_packed,
+				    const struct bpos *r)
+{
+	EBUG_ON(r_packed && !bkey_packed(r_packed));
+	/* @l is stored unpacked: compare its position with @r directly */
+	if (unlikely(!bkey_packed(l)))
+		return bpos_cmp(packed_to_bkey_c(l)->p, *r);
+	/* @l is packed and a packed right-hand key was supplied */
+	if (likely(r_packed))
+		return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b);
+	/* @l is packed but only the unpacked position @r is available */
+	return __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
+}
+
+static inline struct bset_tree *
+bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k)
+{
+	unsigned offset = __btree_node_key_to_offset(b, k);
+	struct bset_tree *t;
+
+	for_each_bset(b, t)
+		if (offset <= t->end_offset) {	/* first bset ending at or after @k */
+			EBUG_ON(offset < btree_bkey_first_offset(t));
+			return t;
+		}
+	/* @k's offset lies beyond the end of every bset in @b */
+	BUG();
+}
+
+struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *);
+
+struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *,
+					  struct bkey_packed *, unsigned);
+
+static inline struct bkey_packed *
+bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
+{
+	return bch2_bkey_prev_filter(b, t, k, 0);	/* filter 0: presumably none - confirm */
+}
+
+static inline struct bkey_packed *
+bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
+{
+	return bch2_bkey_prev_filter(b, t, k, 1);	/* filter 1: presumably skips deleted - confirm */
+}
+
+/* Btree key iteration */
+
+void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *,
+			      const struct bkey_packed *,
+			      const struct bkey_packed *);
+void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *,
+			       struct bpos *);
+void bch2_btree_node_iter_init_from_start(struct btree_node_iter *,
+					  struct btree *);
+struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *,
+						 struct btree *,
+						 struct bset_tree *);
+
+void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *);
+void bch2_btree_node_iter_set_drop(struct btree_node_iter *,
+				   struct btree_node_iter_set *);
+void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *);
+
+#define btree_node_iter_for_each(_iter, _set)	/* visit the in-use slots */	\
+	for (_set = (_iter)->data;					\
+	     _set < (_iter)->data + ARRAY_SIZE((_iter)->data) &&	\
+	     (_set)->k != (_set)->end;	/* k == end: stop here */	\
+	     _set++)
+
+static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter,
+					     unsigned i)
+{
+	return iter->data[i].k == iter->data[i].end;	/* slot i has no keys left */
+}
+
+static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter)
+{
+	return __btree_node_iter_set_end(iter, 0);	/* only slot 0 is consulted */
+}
+
+/*
+ * When keys compare equal, deleted keys compare first:
+ *
+ * XXX: only need to compare pointers for keys that are both within a
+ * btree_node_iterator - we need to break ties for prev() to work correctly
+ */
+static inline int bkey_iter_cmp(const struct btree *b,
+				const struct bkey_packed *l,
+				const struct bkey_packed *r)
+{
+	return bch2_bkey_cmp_packed(b, l, r)
+		?: (int) bkey_deleted(r) - (int) bkey_deleted(l) /* deleted first */
+		?: cmp_int(l, r);	/* last resort: order by address */
+}
+
+static inline int btree_node_iter_cmp(const struct btree *b,
+				      struct btree_node_iter_set l,
+				      struct btree_node_iter_set r)
+{
+	return bkey_iter_cmp(b,	/* compare the keys at each slot's current offset */
+			__btree_node_offset_to_key(b, l.k),
+			__btree_node_offset_to_key(b, r.k));
+}
+
+/* These assume r (the search key) is not a deleted key: */
+static inline int bkey_iter_pos_cmp(const struct btree *b,
+			const struct bkey_packed *l,
+			const struct bpos *r)
+{
+	return bkey_cmp_left_packed(b, l, r)
+		?: -((int) bkey_deleted(l));	/* on a tie, a deleted @l sorts before @r */
+}
+
+static inline int bkey_iter_cmp_p_or_unp(const struct btree *b,
+				    const struct bkey_packed *l,
+				    const struct bkey_packed *r_packed,
+				    const struct bpos *r)
+{
+	return bkey_cmp_p_or_unp(b, l, r_packed, r)
+		?: -((int) bkey_deleted(l));	/* on a tie, a deleted @l sorts before @r */
+}
+
+static inline struct bkey_packed *
+__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
+				struct btree *b)
+{
+	return __btree_node_offset_to_key(b, iter->data->k);	/* slot 0, no end check */
+}
+
+/* Current key of @iter, deleted keys included; NULL once slot 0 is at its end. */
+static inline struct bkey_packed *
+bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b)
+{
+	return bch2_btree_node_iter_end(iter)
+		? NULL : __btree_node_offset_to_key(b, iter->data->k);
+}
+
+/* Skip deleted keys, then return @iter's current key (NULL at the end). */
+static inline struct bkey_packed *
+bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b)
+{
+	for (;;) {
+		struct bkey_packed *cur = bch2_btree_node_iter_peek_all(iter, b);
+		if (!cur || !bkey_deleted(cur))
+			return cur;
+		bch2_btree_node_iter_advance(iter, b);
+	}
+}
+
+/* Return @iter's current key (deleted or not) and step past it; NULL at end. */
+static inline struct bkey_packed *
+bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b)
+{
+	struct bkey_packed *cur = bch2_btree_node_iter_peek_all(iter, b);
+
+	if (cur != NULL)
+		bch2_btree_node_iter_advance(iter, b);
+	return cur;
+}
+
+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *,
+						  struct btree *);
+struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *,
+					      struct btree *);
+
+struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
+						struct btree *,
+						struct bkey *);
+
+#define for_each_btree_node_key(b, k, iter)	/* non-deleted keys of b */	\
+	for (bch2_btree_node_iter_init_from_start((iter), (b));		\
+	     (k = bch2_btree_node_iter_peek((iter), (b)));		\
+	     bch2_btree_node_iter_advance((iter), (b)))
+
+#define for_each_btree_node_key_unpack(b, k, iter, unpacked)		\
+	for (bch2_btree_node_iter_init_from_start((iter), (b));		\
+	     (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
+	     bch2_btree_node_iter_advance((iter), (b)))	/* k.k == NULL ends it */
+
+/* Accounting: */
+
+/* Apply key @k to the counters in @n, scaled by @sign (callers pass 1 or -1). */
+static inline void btree_keys_account_key(struct btree_nr_keys *n, unsigned bset,
+					  struct bkey_packed *k, int sign)
+{
+	int delta = k->u64s * sign;
+
+	n->live_u64s += delta;
+	n->bset_u64s[bset] += delta;
+	if (!bkey_packed(k))
+		n->unpacked_keys += sign;
+	else
+		n->packed_keys += sign;
+}
+
+/* Adjust @b's live and per-bset u64 counts by @delta for the bset holding @k. */
+static inline void btree_keys_account_val_delta(struct btree *b,
+						struct bkey_packed *k, int delta)
+{
+	ptrdiff_t bset_idx = bch2_bkey_to_bset(b, k) - b->set;
+
+	b->nr.bset_u64s[bset_idx] += delta;
+	b->nr.live_u64s += delta;
+}
+
+#define btree_keys_account_key_add(_nr, _bset_idx, _k)		\
+	btree_keys_account_key(_nr, _bset_idx, _k, 1)
+#define btree_keys_account_key_drop(_nr, _bset_idx, _k)	\
+	btree_keys_account_key(_nr, _bset_idx, _k, -1)
+
+#define btree_account_key_add(_b, _k)				\
+	btree_keys_account_key(&(_b)->nr,			\
+		bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1)
+#define btree_account_key_drop(_b, _k)				\
+	btree_keys_account_key(&(_b)->nr,			\
+		bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1)
+
+struct bset_stats {
+	struct {
+		size_t nr, bytes;	/* bsets counted; sum of their u64s * 8 */
+	} sets[BSET_TREE_NR_TYPES];	/* indexed by enum bset_aux_tree_type */
+
+	size_t floats;	/* sum of (size - 1) over bsets with a read-only aux tree */
+	size_t failed;	/* of those entries, how many are BFLOAT_FAILED */
+};
+
+void bch2_btree_keys_stats(const struct btree *, struct bset_stats *);
+void bch2_bfloat_to_text(struct printbuf *, struct btree *,
+			 struct bkey_packed *);
+
+/* Debug stuff */
+
+void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned);
+void bch2_dump_btree_node(struct bch_fs *, struct btree *);
+void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *);
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+void __bch2_verify_btree_nr_keys(struct btree *);
+void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
+void bch2_verify_insert_pos(struct btree *, struct bkey_packed *,
+			    struct bkey_packed *, unsigned);
+
+#else
+
+static inline void __bch2_verify_btree_nr_keys(struct btree *b) {}	/* !DEBUG: no-op */
+static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
+					      struct btree *b) {}	/* !DEBUG: no-op */
+static inline void bch2_verify_insert_pos(struct btree *b,
+					  struct bkey_packed *where,
+					  struct bkey_packed *insert,
+					  unsigned clobber_u64s) {}	/* !DEBUG: no-op */
+#endif
+
+static inline void bch2_verify_btree_nr_keys(struct btree *b)
+{
+	if (bch2_debug_check_btree_accounting)	/* skipped entirely when unset */
+		__bch2_verify_btree_nr_keys(b);
+}
+
+#endif /* _BCACHEFS_BSET_H */
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
new file mode 100644
index 000000000000..82cf243aa288
--- /dev/null
+++ b/fs/bcachefs/btree_cache.c
@@ -0,0 +1,1202 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "debug.h"
+#include "errcode.h"
+#include "error.h"
+#include "trace.h"
+
+#include <linux/prefetch.h>
+#include <linux/sched/mm.h>
+
+const char * const bch2_btree_node_flags[] = {
+#define x(f)	#f,
+	BTREE_FLAGS()
+#undef x
+	NULL
+};
+
+void bch2_recalc_btree_reserve(struct bch_fs *c)
+{
+	unsigned i, reserve = 16;
+
+	if (!c->btree_roots_known[0].b)
+		reserve += 8;
+
+	for (i = 0; i < btree_id_nr_alive(c); i++) {
+		struct btree_root *r = bch2_btree_id_root(c, i);
+
+		if (r->b)
+			reserve += min_t(unsigned, 1, r->b->c.level) * 8;
+	}
+
+	c->btree_cache.reserve = reserve;
+}
+
+static inline unsigned btree_cache_can_free(struct btree_cache *bc)
+{
+	return max_t(int, 0, bc->used - bc->reserve);
+}
+
+static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
+{
+	if (b->c.lock.readers)
+		list_move(&b->list, &bc->freed_pcpu);
+	else
+		list_move(&b->list, &bc->freed_nonpcpu);
+}
+
+static void btree_node_data_free(struct bch_fs *c, struct btree *b)
+{
+	struct btree_cache *bc = &c->btree_cache;
+
+	EBUG_ON(btree_node_write_in_flight(b));
+
+	clear_btree_node_just_written(b);
+
+	kvpfree(b->data, btree_bytes(c));
+	b->data = NULL;
+#ifdef __KERNEL__
+	kvfree(b->aux_data);
+#else
+	munmap(b->aux_data, btree_aux_data_bytes(b));
+#endif
+	b->aux_data = NULL;
+
+	bc->used--;
+
+	btree_node_to_freedlist(bc, b);
+}
+
+static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
+				   const void *obj)
+{
+	const struct btree *b = obj;
+	const u64 *v = arg->key;
+
+	return b->hash_val == *v ? 0 : 1;
+}
+
+static const struct rhashtable_params bch_btree_cache_params = {
+	.head_offset	= offsetof(struct btree, hash),
+	.key_offset	= offsetof(struct btree, hash_val),
+	.key_len	= sizeof(u64),
+	.obj_cmpfn	= bch2_btree_cache_cmp_fn,
+};
+
+static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
+{
+	BUG_ON(b->data || b->aux_data);
+
+	b->data = kvpmalloc(btree_bytes(c), gfp);
+	if (!b->data)
+		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
+#ifdef __KERNEL__
+	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
+#else
+	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
+			   PROT_READ|PROT_WRITE|PROT_EXEC,
+			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
+	if (b->aux_data == MAP_FAILED)
+		b->aux_data = NULL;
+#endif
+	if (!b->aux_data) {
+		kvpfree(b->data, btree_bytes(c));
+		b->data = NULL;
+		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
+	}
+
+	return 0;
+}
+
+static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
+{
+	struct btree *b;
+
+	b = kzalloc(sizeof(struct btree), gfp);
+	if (!b)
+		return NULL;
+
+	bkey_btree_ptr_init(&b->key);
+	INIT_LIST_HEAD(&b->list);
+	INIT_LIST_HEAD(&b->write_blocked);
+	b->byte_order = ilog2(btree_bytes(c));
+	return b;
+}
+
+struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
+{
+	struct btree_cache *bc = &c->btree_cache;
+	struct btree *b;
+
+	b = __btree_node_mem_alloc(c, GFP_KERNEL);
+	if (!b)
+		return NULL;
+
+	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
+		kfree(b);
+		return NULL;
+	}
+
+	bch2_btree_lock_init(&b->c, 0);
+
+	bc->used++;
+	list_add(&b->list, &bc->freeable);
+	return b;
+}
+
+/* Btree in memory cache - hash table */
+
+void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
+{
+	int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
+
+	BUG_ON(ret);
+
+	/* Cause future lookups for this node to fail: */
+	b->hash_val = 0;
+}
+
+int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
+{
+	BUG_ON(b->hash_val);
+	b->hash_val = btree_ptr_hash_val(&b->key);
+
+	return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
+					     bch_btree_cache_params);
+}
+
+int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
+				unsigned level, enum btree_id id)
+{
+	int ret;
+
+	b->c.level	= level;
+	b->c.btree_id	= id;
+
+	mutex_lock(&bc->lock);
+	ret = __bch2_btree_node_hash_insert(bc, b);
+	if (!ret)
+		list_add_tail(&b->list, &bc->live);
+	mutex_unlock(&bc->lock);
+
+	return ret;
+}
+
+__flatten
+static inline struct btree *btree_cache_find(struct btree_cache *bc,
+				     const struct bkey_i *k)
+{
+	u64 v = btree_ptr_hash_val(k);
+
+	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
+}
+
+/*
+ * Try to lock @b (intent + write) for reclaim; returns 0 with both locks held.
+ * With @flush, dirty nodes are written out and in-flight IO is waited on.
+ */
+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
+{
+	struct btree_cache *bc = &c->btree_cache;
+	int ret = 0;
+
+	lockdep_assert_held(&bc->lock);
+wait_on_io:
+	if (b->flags & ((1U << BTREE_NODE_dirty)|
+			(1U << BTREE_NODE_read_in_flight)|
+			(1U << BTREE_NODE_write_in_flight))) {
+		if (!flush)
+			return -BCH_ERR_ENOMEM_btree_node_reclaim;
+
+		/* XXX: waiting on IO with btree cache lock held */
+		bch2_btree_node_wait_on_read(b);
+		bch2_btree_node_wait_on_write(b);
+	}
+
+	if (!six_trylock_intent(&b->c.lock))
+		return -BCH_ERR_ENOMEM_btree_node_reclaim;
+
+	if (!six_trylock_write(&b->c.lock))
+		goto out_unlock_intent;
+
+	/* recheck under lock */
+	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
+			(1U << BTREE_NODE_write_in_flight))) {
+		if (!flush)
+			goto out_unlock;
+		six_unlock_write(&b->c.lock);
+		six_unlock_intent(&b->c.lock);
+		goto wait_on_io;
+	}
+
+	if (btree_node_noevict(b) ||
+	    btree_node_write_blocked(b) ||
+	    btree_node_will_make_reachable(b))
+		goto out_unlock;
+
+	if (btree_node_dirty(b)) {
+		if (!flush)
+			goto out_unlock;
+		/*
+		 * Using the underscore version because we don't want to compact
+		 * bsets after the write, since this node is about to be evicted
+		 * - unless btree verify mode is enabled, since it runs out of
+		 * the post write cleanup:
+		 */
+		if (bch2_verify_btree_ondisk)
+			bch2_btree_node_write(c, b, SIX_LOCK_intent,
+					      BTREE_WRITE_cache_reclaim);
+		else
+			__bch2_btree_node_write(c, b,
+						BTREE_WRITE_cache_reclaim);
+
+		six_unlock_write(&b->c.lock);
+		six_unlock_intent(&b->c.lock);
+		goto wait_on_io;
+	}
+out:
+	if (b->hash_val && !ret)
+		trace_and_count(c, btree_cache_reap, c, b);
+	return ret;
+out_unlock:
+	six_unlock_write(&b->c.lock);
+out_unlock_intent:
+	six_unlock_intent(&b->c.lock);
+	ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
+	goto out;
+}
+
+static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
+{
+	return __btree_node_reclaim(c, b, false);
+}
+
+static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
+{
+	return __btree_node_reclaim(c, b, true);
+}
+
+static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
+					   struct shrink_control *sc)
+{
+	struct bch_fs *c = container_of(shrink, struct bch_fs,
+					btree_cache.shrink);
+	struct btree_cache *bc = &c->btree_cache;
+	struct btree *b, *t;
+	unsigned long nr = sc->nr_to_scan;
+	unsigned long can_free = 0;
+	unsigned long freed = 0;
+	unsigned long touched = 0;
+	unsigned i, flags;
+	unsigned long ret = SHRINK_STOP;
+	bool trigger_writes = atomic_read(&bc->dirty) + nr >=
+		bc->used * 3 / 4;
+
+	if (bch2_btree_shrinker_disabled)
+		return SHRINK_STOP;
+
+	mutex_lock(&bc->lock);
+	flags = memalloc_nofs_save();
+
+	/*
+	 * It's _really_ critical that we don't free too many btree nodes - we
+	 * have to always leave ourselves a reserve. The reserve is how we
+	 * guarantee that allocating memory for a new btree node can always
+	 * succeed, so that inserting keys into the btree can always succeed and
+	 * IO can always make forward progress:
+	 */
+	can_free = btree_cache_can_free(bc);
+	nr = min_t(unsigned long, nr, can_free);
+
+	i = 0;
+	list_for_each_entry_safe(b, t, &bc->freeable, list) {
+		/*
+		 * Leave a few nodes on the freeable list, so that a btree split
+		 * won't have to hit the system allocator:
+		 */
+		if (++i <= 3)
+			continue;
+
+		touched++;
+
+		if (touched >= nr)
+			goto out;
+
+		if (!btree_node_reclaim(c, b)) {
+			btree_node_data_free(c, b);
+			six_unlock_write(&b->c.lock);
+			six_unlock_intent(&b->c.lock);
+			freed++;
+		}
+	}
+restart:
+	list_for_each_entry_safe(b, t, &bc->live, list) {
+		touched++;
+
+		if (btree_node_accessed(b)) {
+			clear_btree_node_accessed(b);
+		} else if (!btree_node_reclaim(c, b)) {
+			freed++;
+			btree_node_data_free(c, b);
+
+			bch2_btree_node_hash_remove(bc, b);
+			six_unlock_write(&b->c.lock);
+			six_unlock_intent(&b->c.lock);
+
+			if (freed == nr)
+				goto out_rotate;
+		} else if (trigger_writes &&
+			   btree_node_dirty(b) &&
+			   !btree_node_will_make_reachable(b) &&
+			   !btree_node_write_blocked(b) &&
+			   six_trylock_read(&b->c.lock)) {
+			list_move(&bc->live, &b->list);
+			mutex_unlock(&bc->lock);
+			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
+			six_unlock_read(&b->c.lock);
+			if (touched >= nr)
+				goto out_nounlock;
+			mutex_lock(&bc->lock);
+			goto restart;
+		}
+
+		if (touched >= nr)
+			break;
+	}
+out_rotate:
+	if (&t->list != &bc->live)
+		list_move_tail(&bc->live, &t->list);
+out:
+	mutex_unlock(&bc->lock);
+out_nounlock:
+	ret = freed;
+	memalloc_nofs_restore(flags);
+	trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
+	return ret;
+}
+
+static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
+					    struct shrink_control *sc)
+{
+	struct bch_fs *c = container_of(shrink, struct bch_fs,
+					btree_cache.shrink);
+	struct btree_cache *bc = &c->btree_cache;
+
+	if (bch2_btree_shrinker_disabled)
+		return 0;
+
+	return btree_cache_can_free(bc);
+}
+
+void bch2_fs_btree_cache_exit(struct bch_fs *c)
+{
+	struct btree_cache *bc = &c->btree_cache;
+	struct btree *b;
+	unsigned i, flags;
+
+	unregister_shrinker(&bc->shrink);
+
+	/* vfree() can allocate memory: */
+	flags = memalloc_nofs_save();
+	mutex_lock(&bc->lock);
+
+	if (c->verify_data)
+		list_move(&c->verify_data->list, &bc->live);
+
+	kvpfree(c->verify_ondisk, btree_bytes(c));
+
+	for (i = 0; i < btree_id_nr_alive(c); i++) {
+		struct btree_root *r = bch2_btree_id_root(c, i);
+
+		if (r->b)
+			list_add(&r->b->list, &bc->live);
+	}
+
+	list_splice(&bc->freeable, &bc->live);
+
+	while (!list_empty(&bc->live)) {
+		b = list_first_entry(&bc->live, struct btree, list);
+
+		BUG_ON(btree_node_read_in_flight(b) ||
+		       btree_node_write_in_flight(b));
+
+		if (btree_node_dirty(b))
+			bch2_btree_complete_write(c, b, btree_current_write(b));
+		clear_btree_node_dirty_acct(c, b);
+
+		btree_node_data_free(c, b);
+	}
+
+	BUG_ON(atomic_read(&c->btree_cache.dirty));
+
+	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
+
+	while (!list_empty(&bc->freed_nonpcpu)) {
+		b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
+		list_del(&b->list);
+		six_lock_exit(&b->c.lock);
+		kfree(b);
+	}
+
+	mutex_unlock(&bc->lock);
+	memalloc_nofs_restore(flags);
+
+	if (bc->table_init_done)
+		rhashtable_destroy(&bc->table);
+}
+
+int bch2_fs_btree_cache_init(struct bch_fs *c)
+{
+	struct btree_cache *bc = &c->btree_cache;
+	unsigned i;
+	int ret = 0;
+
+	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
+	if (ret)
+		goto err;
+
+	bc->table_init_done = true;
+
+	bch2_recalc_btree_reserve(c);
+
+	for (i = 0; i < bc->reserve; i++)
+		if (!__bch2_btree_node_mem_alloc(c))
+			goto err;
+
+	list_splice_init(&bc->live, &bc->freeable);
+
+	mutex_init(&c->verify_lock);
+
+	bc->shrink.count_objects	= bch2_btree_cache_count;
+	bc->shrink.scan_objects		= bch2_btree_cache_scan;
+	bc->shrink.seeks		= 4;
+	ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
+	if (ret)
+		goto err;
+
+	return 0;
+err:
+	return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+}
+
+void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
+{
+	mutex_init(&bc->lock);
+	INIT_LIST_HEAD(&bc->live);
+	INIT_LIST_HEAD(&bc->freeable);
+	INIT_LIST_HEAD(&bc->freed_pcpu);
+	INIT_LIST_HEAD(&bc->freed_nonpcpu);
+}
+
+/*
+ * We can only have one thread cannibalizing other cached btree nodes at a time,
+ * or we'll deadlock. We use an open coded mutex to ensure that, which
+ * bch2_btree_cache_cannibalize_lock() takes. This means every time we unlock
+ * the root of the btree, we need to release this lock if we have it held.
+ */
+void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
+{
+	struct btree_cache *bc = &c->btree_cache;
+
+	if (bc->alloc_lock == current) {
+		trace_and_count(c, btree_cache_cannibalize_unlock, c);
+		bc->alloc_lock = NULL;
+		closure_wake_up(&bc->alloc_wait);
+	}
+}
+
+int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
+{
+	struct btree_cache *bc = &c->btree_cache;
+	struct task_struct *old;
+
+	old = cmpxchg(&bc->alloc_lock, NULL, current);
+	if (old == NULL || old == current)
+		goto success;
+
+	if (!cl) {
+		trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
+		return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
+	}
+
+	closure_wait(&bc->alloc_wait, cl);
+
+	/* Try again, after adding ourselves to waitlist */
+	old = cmpxchg(&bc->alloc_lock, NULL, current);
+	if (old == NULL || old == current) {
+		/* We raced */
+		closure_wake_up(&bc->alloc_wait);
+		goto success;
+	}
+
+	trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
+	return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
+
+success:
+	trace_and_count(c, btree_cache_cannibalize_lock, c);
+	return 0;
+}
+
+static struct btree *btree_node_cannibalize(struct bch_fs *c)
+{
+	struct btree_cache *bc = &c->btree_cache;
+	struct btree *b;
+
+	list_for_each_entry_reverse(b, &bc->live, list)
+		if (!btree_node_reclaim(c, b))
+			return b;
+
+	while (1) {
+		list_for_each_entry_reverse(b, &bc->live, list)
+			if (!btree_node_write_and_reclaim(c, b))
+				return b;
+
+		/*
+		 * Rare case: all nodes were intent-locked.
+		 * Just busy-wait.
+		 */
+		WARN_ONCE(1, "btree cache cannibalize failed\n");
+		cond_resched();
+	}
+}
+
+struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_cache *bc = &c->btree_cache;
+	struct list_head *freed = pcpu_read_locks
+		? &bc->freed_pcpu
+		: &bc->freed_nonpcpu;
+	struct btree *b, *b2;
+	u64 start_time = local_clock();
+	unsigned flags;
+
+	flags = memalloc_nofs_save();
+	mutex_lock(&bc->lock);
+
+	/*
+	 * We never free struct btree itself, just the memory that holds the on
+	 * disk node. Check the freed list before allocating a new one:
+	 */
+	list_for_each_entry(b, freed, list)
+		if (!btree_node_reclaim(c, b)) {
+			list_del_init(&b->list);
+			goto got_node;
+		}
+
+	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
+	if (!b) {
+		mutex_unlock(&bc->lock);
+		bch2_trans_unlock(trans);
+		b = __btree_node_mem_alloc(c, GFP_KERNEL);
+		if (!b)
+			goto err;
+		mutex_lock(&bc->lock);
+	}
+
+	bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);
+
+	BUG_ON(!six_trylock_intent(&b->c.lock));
+	BUG_ON(!six_trylock_write(&b->c.lock));
+got_node:
+
+	/*
+	 * btree_free() doesn't free memory; it sticks the node on the end of
+	 * the list. Check if there are any freed nodes there:
+	 */
+	list_for_each_entry(b2, &bc->freeable, list)
+		if (!btree_node_reclaim(c, b2)) {
+			swap(b->data, b2->data);
+			swap(b->aux_data, b2->aux_data);
+			btree_node_to_freedlist(bc, b2);
+			six_unlock_write(&b2->c.lock);
+			six_unlock_intent(&b2->c.lock);
+			goto got_mem;
+		}
+
+	mutex_unlock(&bc->lock);
+
+	if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
+		bch2_trans_unlock(trans);
+		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
+			goto err;
+	}
+
+	mutex_lock(&bc->lock);
+	bc->used++;
+got_mem:
+	mutex_unlock(&bc->lock);
+
+	BUG_ON(btree_node_hashed(b));
+	BUG_ON(btree_node_dirty(b));
+	BUG_ON(btree_node_write_in_flight(b));
+out:
+	b->flags		= 0;
+	b->written		= 0;
+	b->nsets		= 0;
+	b->sib_u64s[0]		= 0;
+	b->sib_u64s[1]		= 0;
+	b->whiteout_u64s	= 0;
+	bch2_btree_keys_init(b);
+	set_btree_node_accessed(b);
+
+	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
+			       start_time);
+
+	memalloc_nofs_restore(flags);
+	return b;
+err:
+	mutex_lock(&bc->lock);
+
+	/* Try to cannibalize another cached btree node: */
+	if (bc->alloc_lock == current) {
+		b2 = btree_node_cannibalize(c);
+		clear_btree_node_just_written(b2);
+		bch2_btree_node_hash_remove(bc, b2);
+
+		if (b) {
+			swap(b->data, b2->data);
+			swap(b->aux_data, b2->aux_data);
+			btree_node_to_freedlist(bc, b2);
+			six_unlock_write(&b2->c.lock);
+			six_unlock_intent(&b2->c.lock);
+		} else {
+			b = b2;
+			list_del_init(&b->list);
+		}
+
+		mutex_unlock(&bc->lock);
+
+		trace_and_count(c, btree_cache_cannibalize, c);
+		goto out;
+	}
+
+	mutex_unlock(&bc->lock);
+	memalloc_nofs_restore(flags);
+	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
+}
+
+/* Slowpath, don't want it inlined into btree_iter_traverse() */
+static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
+				struct btree_path *path,
+				const struct bkey_i *k,
+				enum btree_id btree_id,
+				unsigned level,
+				enum six_lock_type lock_type,
+				bool sync)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_cache *bc = &c->btree_cache;
+	struct btree *b;
+	u32 seq;
+
+	BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
+	/*
+	 * Parent node must be locked, else we could read in a btree node that's
+	 * been freed:
+	 */
+	if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
+		trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
+		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
+	}
+
+	b = bch2_btree_node_mem_alloc(trans, level != 0);
+
+	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
+		trans->memory_allocation_failure = true;
+		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
+		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
+	}
+
+	if (IS_ERR(b))
+		return b;
+
+	/*
+	 * Btree nodes read in from disk should not have the accessed bit set
+	 * initially, so that linear scans don't thrash the cache:
+	 */
+	clear_btree_node_accessed(b);
+
+	bkey_copy(&b->key, k);
+	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
+		/* raced with another fill: */
+
+		/* mark as unhashed... */
+		b->hash_val = 0;
+
+		mutex_lock(&bc->lock);
+		list_add(&b->list, &bc->freeable);
+		mutex_unlock(&bc->lock);
+
+		six_unlock_write(&b->c.lock);
+		six_unlock_intent(&b->c.lock);
+		return NULL;
+	}
+
+	set_btree_node_read_in_flight(b);
+
+	six_unlock_write(&b->c.lock);
+	seq = six_lock_seq(&b->c.lock);
+	six_unlock_intent(&b->c.lock);
+
+	/* Unlock before doing IO: */
+	if (path && sync)
+		bch2_trans_unlock_noassert(trans);
+
+	bch2_btree_node_read(c, b, sync);
+
+	if (!sync)
+		return NULL;
+
+	if (path) {
+		int ret = bch2_trans_relock(trans) ?:
+			bch2_btree_path_relock_intent(trans, path);
+		if (ret) {
+			BUG_ON(!trans->restarted);
+			return ERR_PTR(ret);
+		}
+	}
+
+	if (!six_relock_type(&b->c.lock, lock_type, seq)) {
+		if (path)
+			trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
+		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
+	}
+
+	return b;
+}
+
+static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
+{
+	struct printbuf buf = PRINTBUF;
+
+	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
+		return;
+
+	prt_printf(&buf,
+	       "btree node header doesn't match ptr\n"
+	       "btree %s level %u\n"
+	       "ptr: ",
+	       bch2_btree_ids[b->c.btree_id], b->c.level);
+	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+	prt_printf(&buf, "\nheader: btree %s level %llu\n"
+	       "min ",
+	       bch2_btree_ids[BTREE_NODE_ID(b->data)],
+	       BTREE_NODE_LEVEL(b->data));
+	bch2_bpos_to_text(&buf, b->data->min_key);
+
+	prt_printf(&buf, "\nmax ");
+	bch2_bpos_to_text(&buf, b->data->max_key);
+
+	bch2_fs_inconsistent(c, "%s", buf.buf);
+	printbuf_exit(&buf);
+}
+
+static inline void btree_check_header(struct bch_fs *c, struct btree *b)
+{
+	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
+	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
+	    !bpos_eq(b->data->max_key, b->key.k.p) ||
+	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+	     !bpos_eq(b->data->min_key,
+		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
+		btree_bad_header(c, b);
+}
+
+static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
+					   const struct bkey_i *k, unsigned level,
+					   enum six_lock_type lock_type,
+					   unsigned long trace_ip)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_cache *bc = &c->btree_cache;
+	struct btree *b;
+	struct bset_tree *t;
+	bool need_relock = false;
+	int ret;
+
+	EBUG_ON(level >= BTREE_MAX_DEPTH);
+retry:
+	b = btree_cache_find(bc, k);
+	if (unlikely(!b)) {
+		/*
+		 * We must have the parent locked to call bch2_btree_node_fill(),
+		 * else we could read in a btree node from disk that's been
+		 * freed:
+		 */
+		b = bch2_btree_node_fill(trans, path, k, path->btree_id,
+					 level, lock_type, true);
+		need_relock = true;
+
+		/* We raced and found the btree node in the cache */
+		if (!b)
+			goto retry;
+
+		if (IS_ERR(b))
+			return b;
+	} else {
+		if (btree_node_read_locked(path, level + 1))
+			btree_node_unlock(trans, path, level + 1);
+
+		ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			return ERR_PTR(ret);
+
+		BUG_ON(ret);
+
+		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
+			     b->c.level != level ||
+			     race_fault())) {
+			six_unlock_type(&b->c.lock, lock_type);
+			if (bch2_btree_node_relock(trans, path, level + 1))
+				goto retry;
+
+			trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
+			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
+		}
+
+		/* avoid atomic set bit if it's not needed: */
+		if (!btree_node_accessed(b))
+			set_btree_node_accessed(b);
+	}
+
+	if (unlikely(btree_node_read_in_flight(b))) {
+		u32 seq = six_lock_seq(&b->c.lock);
+
+		six_unlock_type(&b->c.lock, lock_type);
+		bch2_trans_unlock(trans);
+		need_relock = true;
+
+		bch2_btree_node_wait_on_read(b);
+
+		/*
+		 * should_be_locked is not set on this path yet, so we need to
+		 * relock it specifically:
+		 */
+		if (!six_relock_type(&b->c.lock, lock_type, seq))
+			goto retry;
+	}
+
+	if (unlikely(need_relock)) {
+		ret = bch2_trans_relock(trans) ?:
+			bch2_btree_path_relock_intent(trans, path);
+		if (ret) {
+			six_unlock_type(&b->c.lock, lock_type);
+			return ERR_PTR(ret);
+		}
+	}
+
+	prefetch(b->aux_data);
+
+	for_each_bset(b, t) {
+		void *p = (u64 *) b->aux_data + t->aux_data_offset;
+
+		prefetch(p + L1_CACHE_BYTES * 0);
+		prefetch(p + L1_CACHE_BYTES * 1);
+		prefetch(p + L1_CACHE_BYTES * 2);
+	}
+
+	if (unlikely(btree_node_read_error(b))) {
+		six_unlock_type(&b->c.lock, lock_type);
+		return ERR_PTR(-EIO);
+	}
+
+	EBUG_ON(b->c.btree_id != path->btree_id);
+	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+	btree_check_header(c, b);
+
+	return b;
+}
+
+/**
+ * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
+ * in from disk if necessary.
+ *
+ * @trans:	btree transaction object
+ * @path:	btree_path being traversed
+ * @k:		pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
+ * @level:	level of btree node being looked up (0 == leaf node)
+ * @lock_type:	SIX_LOCK_read or SIX_LOCK_intent
+ * @trace_ip:	ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
+ *
+ * The btree node will have either a read or an intent lock held, depending on
+ * @lock_type.
+ *
+ * Returns: btree node or ERR_PTR()
+ */
+struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
+				  const struct bkey_i *k, unsigned level,
+				  enum six_lock_type lock_type,
+				  unsigned long trace_ip)
+{
+	struct bch_fs *c = trans->c;
+	struct btree *b;
+	struct bset_tree *t;
+	int ret;
+
+	EBUG_ON(level >= BTREE_MAX_DEPTH);
+
+	b = btree_node_mem_ptr(k);
+
+	/*
+	 * Check b->hash_val _before_ calling btree_node_lock() - this might not
+	 * be the node we want anymore, and trying to lock the wrong node could
+	 * cause an unnecessary transaction restart:
+	 */
+	if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
+		     !b ||
+		     b->hash_val != btree_ptr_hash_val(k)))
+		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
+
+	if (btree_node_read_locked(path, level + 1))
+		btree_node_unlock(trans, path, level + 1);
+
+	ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		return ERR_PTR(ret);
+
+	BUG_ON(ret);
+
+	if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
+		     b->c.level != level ||
+		     race_fault())) {
+		six_unlock_type(&b->c.lock, lock_type);
+		if (bch2_btree_node_relock(trans, path, level + 1))
+			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
+
+		trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
+		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
+	}
+
+	if (unlikely(btree_node_read_in_flight(b))) {
+		six_unlock_type(&b->c.lock, lock_type);
+		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
+	}
+
+	prefetch(b->aux_data);
+
+	for_each_bset(b, t) {
+		void *p = (u64 *) b->aux_data + t->aux_data_offset;
+
+		prefetch(p + L1_CACHE_BYTES * 0);
+		prefetch(p + L1_CACHE_BYTES * 1);
+		prefetch(p + L1_CACHE_BYTES * 2);
+	}
+
+	/* avoid atomic set bit if it's not needed: */
+	if (!btree_node_accessed(b))
+		set_btree_node_accessed(b);
+
+	if (unlikely(btree_node_read_error(b))) {
+		six_unlock_type(&b->c.lock, lock_type);
+		return ERR_PTR(-EIO);
+	}
+
+	EBUG_ON(b->c.btree_id != path->btree_id);
+	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+	btree_check_header(c, b);
+
+	return b;
+}
+
+struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
+					 const struct bkey_i *k,
+					 enum btree_id btree_id,
+					 unsigned level,
+					 bool nofill)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_cache *bc = &c->btree_cache;
+	struct btree *b;
+	struct bset_tree *t;
+	int ret;
+
+	EBUG_ON(level >= BTREE_MAX_DEPTH);
+
+	if (c->opts.btree_node_mem_ptr_optimization) {
+		b = btree_node_mem_ptr(k);
+		if (b)
+			goto lock_node;
+	}
+retry:
+	b = btree_cache_find(bc, k);
+	if (unlikely(!b)) {
+		if (nofill)
+			goto out;
+
+		b = bch2_btree_node_fill(trans, NULL, k, btree_id,
+					 level, SIX_LOCK_read, true);
+
+		/* We raced and found the btree node in the cache */
+		if (!b)
+			goto retry;
+
+		if (IS_ERR(b) &&
+		    !bch2_btree_cache_cannibalize_lock(c, NULL))
+			goto retry;
+
+		if (IS_ERR(b))
+			goto out;
+	} else {
+lock_node:
+		ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			return ERR_PTR(ret);
+
+		BUG_ON(ret);
+
+		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
+			     b->c.btree_id != btree_id ||
+			     b->c.level != level)) {
+			six_unlock_read(&b->c.lock);
+			goto retry;
+		}
+	}
+
+	/* XXX: waiting on IO with btree locks held: */
+	__bch2_btree_node_wait_on_read(b);
+
+	prefetch(b->aux_data);
+
+	for_each_bset(b, t) {
+		void *p = (u64 *) b->aux_data + t->aux_data_offset;
+
+		prefetch(p + L1_CACHE_BYTES * 0);
+		prefetch(p + L1_CACHE_BYTES * 1);
+		prefetch(p + L1_CACHE_BYTES * 2);
+	}
+
+	/* avoid atomic set bit if it's not needed: */
+	if (!btree_node_accessed(b))
+		set_btree_node_accessed(b);
+
+	if (unlikely(btree_node_read_error(b))) {
+		six_unlock_read(&b->c.lock);
+		b = ERR_PTR(-EIO);
+		goto out;
+	}
+
+	EBUG_ON(b->c.btree_id != btree_id);
+	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+	btree_check_header(c, b);
+out:
+	bch2_btree_cache_cannibalize_unlock(c);
+	return b;
+}
+
+int bch2_btree_node_prefetch(struct btree_trans *trans,
+			     struct btree_path *path,
+			     const struct bkey_i *k,
+			     enum btree_id btree_id, unsigned level)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_cache *bc = &c->btree_cache;
+	struct btree *b;
+
+	BUG_ON(trans && !btree_node_locked(path, level + 1));
+	BUG_ON(level >= BTREE_MAX_DEPTH);
+
+	b = btree_cache_find(bc, k);
+	if (b)
+		return 0;
+
+	b = bch2_btree_node_fill(trans, path, k, btree_id,
+				 level, SIX_LOCK_read, false);
+	return PTR_ERR_OR_ZERO(b);
+}
+
+void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_cache *bc = &c->btree_cache;
+	struct btree *b;
+
+	b = btree_cache_find(bc, k);
+	if (!b)
+		return;
+wait_on_io:
+	/* not allowed to wait on io with btree locks held: */
+
+	/* XXX we're called from btree_gc which will be holding other btree
+	 * nodes locked
+	 */
+	__bch2_btree_node_wait_on_read(b);
+	__bch2_btree_node_wait_on_write(b);
+
+	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
+
+	if (btree_node_dirty(b)) {
+		__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
+		six_unlock_write(&b->c.lock);
+		six_unlock_intent(&b->c.lock);
+		goto wait_on_io;
+	}
+
+	BUG_ON(btree_node_dirty(b));
+
+	mutex_lock(&bc->lock);
+	btree_node_data_free(c, b);
+	bch2_btree_node_hash_remove(bc, b);
+	mutex_unlock(&bc->lock);
+
+	six_unlock_write(&b->c.lock);
+	six_unlock_intent(&b->c.lock);
+}
+
+void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
+			     const struct btree *b)
+{
+	struct bset_stats stats;
+
+	memset(&stats, 0, sizeof(stats));
+
+	bch2_btree_keys_stats(b, &stats);
+
+	prt_printf(out, "l %u ", b->c.level);
+	bch2_bpos_to_text(out, b->data->min_key);
+	prt_printf(out, " - ");
+	bch2_bpos_to_text(out, b->data->max_key);
+	prt_printf(out, ":\n"
+	       "    ptrs: ");
+	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+	prt_newline(out);
+
+	prt_printf(out,
+	       "    format: ");
+	bch2_bkey_format_to_text(out, &b->format);
+
+	prt_printf(out,
+	       "    unpack fn len: %u\n"
+	       "    bytes used %zu/%zu (%zu%% full)\n"
+	       "    sib u64s: %u, %u (merge threshold %u)\n"
+	       "    nr packed keys %u\n"
+	       "    nr unpacked keys %u\n"
+	       "    floats %zu\n"
+	       "    failed unpacked %zu\n",
+	       b->unpack_fn_len,
+	       b->nr.live_u64s * sizeof(u64),
+	       btree_bytes(c) - sizeof(struct btree_node),
+	       b->nr.live_u64s * 100 / btree_max_u64s(c),
+	       b->sib_u64s[0],
+	       b->sib_u64s[1],
+	       c->btree_foreground_merge_threshold,
+	       b->nr.packed_keys,
+	       b->nr.unpacked_keys,
+	       stats.floats,
+	       stats.failed);
+}
+
+void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c)
+{
+	prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
+	prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
+	prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
+}
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
new file mode 100644
index 000000000000..1e562b6efa62
--- /dev/null
+++ b/fs/bcachefs/btree_cache.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_CACHE_H
+#define _BCACHEFS_BTREE_CACHE_H
+
+#include "bcachefs.h"
+#include "btree_types.h"
+#include "bkey_methods.h"
+
+extern const char * const bch2_btree_node_flags[];
+
+struct btree_iter;
+
+void bch2_recalc_btree_reserve(struct bch_fs *);
+
+void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
+int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
+int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
+				unsigned, enum btree_id);
+
+void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
+int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
+
+struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
+struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool);
+
+struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
+				  const struct bkey_i *, unsigned,
+				  enum six_lock_type, unsigned long);
+
+struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *,
+					 enum btree_id, unsigned, bool);
+
+int bch2_btree_node_prefetch(struct btree_trans *, struct btree_path *,
+			     const struct bkey_i *, enum btree_id, unsigned);
+
+void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *);
+
+void bch2_fs_btree_cache_exit(struct bch_fs *);
+int bch2_fs_btree_cache_init(struct bch_fs *);
+void bch2_fs_btree_cache_init_early(struct btree_cache *);
+
+static inline u64 btree_ptr_hash_val(const struct bkey_i *k)
+{
+	/*
+	 * Value used to hash a btree pointer key: the first u64 at v.start
+	 * for btree_ptr, v.seq for btree_ptr_v2 (the cast/deref there only
+	 * avoids sparse endianness warnings). Any other key type yields 0.
+	 */
+	if (k->k.type == KEY_TYPE_btree_ptr)
+		return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start);
+
+	if (k->k.type == KEY_TYPE_btree_ptr_v2)
+		return *((u64 *) &bkey_i_to_btree_ptr_v2_c(k)->v.seq);
+
+	return 0;
+}
+
+static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k)
+{
+	if (k->k.type != KEY_TYPE_btree_ptr_v2)	/* only v2 keys carry mem_ptr */
+		return NULL;
+	return (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr;
+}
+
+/* Is the btree node in the hash table? True when its hash_val is nonzero. */
+static inline bool btree_node_hashed(struct btree *b)
+{
+	return !!b->hash_val;
+}
+
+#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos)		\
+	for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl,	\
+					  &(_c)->btree_cache.table),	\
+	     _iter = 0;	_iter < (_tbl)->size; _iter++)			\
+		rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
+
+static inline size_t btree_bytes(struct bch_fs *c)
+{
+	return c->opts.btree_node_size;	/* btree node size option, used as a byte count */
+}
+
+static inline size_t btree_max_u64s(struct bch_fs *c)
+{
+	return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64);	/* bytes past struct btree_node, in u64s */
+}
+
+static inline size_t btree_pages(struct bch_fs *c)
+{
+	return btree_bytes(c) / PAGE_SIZE;	/* node size in units of PAGE_SIZE (integer division) */
+}
+
+static inline unsigned btree_blocks(struct bch_fs *c)
+{
+	return btree_sectors(c) >> c->block_bits;	/* btree_sectors() scaled down by block_bits */
+}
+
+#define BTREE_SPLIT_THRESHOLD(c)		(btree_max_u64s(c) * 2 / 3)
+
+#define BTREE_FOREGROUND_MERGE_THRESHOLD(c)	(btree_max_u64s(c) * 1 / 3)
+#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c)			\
+	(BTREE_FOREGROUND_MERGE_THRESHOLD(c) +			\
+	 (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2))
+
+static inline unsigned btree_id_nr_alive(struct bch_fs *c)
+{
+	return BTREE_ID_NR + c->btree_roots_extra.nr;	/* upper bound on ids accepted by bch2_btree_id_root() */
+}
+
+static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned id)
+{
+	unsigned extra_idx = id - BTREE_ID_NR;
+
+	/* ids below BTREE_ID_NR index btree_roots_known, the rest btree_roots_extra */
+	if (likely(id < BTREE_ID_NR))
+		return &c->btree_roots_known[id];
+
+	EBUG_ON(extra_idx >= c->btree_roots_extra.nr);
+	return &c->btree_roots_extra.data[extra_idx];
+}
+
+static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b)
+{
+	return bch2_btree_id_root(c, b->c.btree_id)->b;	/* root node recorded for b's btree id */
+}
+
+void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *,
+			     const struct btree *);
+void bch2_btree_cache_to_text(struct printbuf *, const struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_CACHE_H */
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
new file mode 100644
index 000000000000..693ed067b1a7
--- /dev/null
+++ b/fs/bcachefs/btree_gc.c
@@ -0,0 +1,2111 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright (C) 2014 Datera Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "bkey_methods.h"
+#include "bkey_buf.h"
+#include "btree_journal_iter.h"
+#include "btree_key_cache.h"
+#include "btree_locking.h"
+#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "clock.h"
+#include "debug.h"
+#include "ec.h"
+#include "error.h"
+#include "extents.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "recovery.h"
+#include "reflink.h"
+#include "replicas.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate.h>
+#include <linux/sched/task.h>
+
+#define DROP_THIS_NODE		10
+#define DROP_PREV_NODE		11
+
+static bool should_restart_for_topology_repair(struct bch_fs *c)
+{
+	return !(c->opts.fix_errors == FSCK_FIX_no ||	/* fixing off, or pass already complete */
+		 (c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
+}
+
+static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
+{
+	preempt_disable();	/* the seqcount write section below runs with preemption off */
+	write_seqcount_begin(&c->gc_pos_lock);
+	c->gc_pos = new_pos;	/* updated inside gc_pos_lock's write section */
+	write_seqcount_end(&c->gc_pos_lock);
+	preempt_enable();
+}
+
+static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
+{
+	BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);	/* the gc position must strictly advance */
+	__gc_pos_set(c, new_pos);
+}
+
+/*
+ * Verify child key @cur lines up with @prev and with @b's min/max keys.
+ * Missing: empty interior nodes need handling too - perhaps just kill them
+ */
+static int bch2_gc_check_topology(struct bch_fs *c,
+				  struct btree *b,
+				  struct bkey_buf *prev,
+				  struct bkey_buf cur,
+				  bool is_last)
+{
+	struct bpos node_start	= b->data->min_key;
+	struct bpos node_end	= b->data->max_key;
+	struct bpos expected_start = bkey_deleted(&prev->k->k)
+		? node_start
+		: bpos_successor(prev->k->k.p);
+	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+	int ret = 0;
+
+	if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
+		struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);
+
+		if (!bpos_eq(expected_start, bp->v.min_key)) {
+			bch2_topology_error(c);
+			/* buf1 describes what precedes cur: prev, or the start of the node */
+			if (bkey_deleted(&prev->k->k)) {
+				prt_printf(&buf1, "start of node: ");
+				bch2_bpos_to_text(&buf1, node_start);
+			} else {
+				bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k));
+			}
+			bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k));
+			/* NOTE(review): nonzero __fsck_err() presumably means "fix it" - confirm */
+			if (__fsck_err(c,
+				  FSCK_CAN_FIX|
+				  FSCK_CAN_IGNORE|
+				  FSCK_NO_RATELIMIT,
+				  "btree node with incorrect min_key at btree %s level %u:\n"
+				  "  prev %s\n"
+				  "  cur %s",
+				  bch2_btree_ids[b->c.btree_id], b->c.level,
+				  buf1.buf, buf2.buf) &&
+			    should_restart_for_topology_repair(c)) {
+				bch_info(c, "Halting mark and sweep to start topology repair pass");
+				ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
+				goto err;
+			} else {
+				set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+			}
+		}
+	}
+	/* When @is_last is set, cur must end exactly at the node's max_key */
+	if (is_last && !bpos_eq(cur.k->k.p, node_end)) {
+		bch2_topology_error(c);
+
+		printbuf_reset(&buf1);
+		printbuf_reset(&buf2);
+
+		bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k));
+		bch2_bpos_to_text(&buf2, node_end);
+
+		if (__fsck_err(c,
+			  FSCK_CAN_FIX|
+			  FSCK_CAN_IGNORE|
+			  FSCK_NO_RATELIMIT,
+			  "btree node with incorrect max_key at btree %s level %u:\n"
+			  "  %s\n"
+			  "  expected %s",
+			  bch2_btree_ids[b->c.btree_id], b->c.level,
+			  buf1.buf, buf2.buf) &&
+		    should_restart_for_topology_repair(c)) {
+			bch_info(c, "Halting mark and sweep to start topology repair pass");
+			ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
+			goto err;
+		} else {
+			set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+		}
+	}
+	/* No repair pass was requested: cur becomes prev for the next call */
+	bch2_bkey_buf_copy(prev, c, cur.k);
+err:
+fsck_err:
+	printbuf_exit(&buf2);
+	printbuf_exit(&buf1);
+	return ret;
+}
+
+static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
+{
+	struct bkey_i_btree_ptr *src;
+
+	if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+		/* Already in v2 form: copy the node's key as-is */
+		bkey_copy(&dst->k_i, &b->key);
+		return;
+	}
+
+	/* Anything other than a v1 btree pointer is a bug */
+	BUG_ON(b->key.k.type != KEY_TYPE_btree_ptr);
+	src = bkey_i_to_btree_ptr(&b->key);
+	/* v1 -> v2: the v2-only fields are taken from b->data or zeroed */
+	dst->k.p		= src->k.p;
+	dst->v.mem_ptr		= 0;
+	dst->v.seq		= b->data->keys.seq;
+	dst->v.sectors_written	= 0;
+	dst->v.flags		= 0;
+	dst->v.min_key		= b->data->min_key;
+	set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
+	memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
+}
+
+static void bch2_btree_node_update_key_early(struct btree_trans *trans,
+					     enum btree_id btree, unsigned level,
+					     struct bkey_s_c old, struct bkey_i *new)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_buf old_key;
+	struct btree *b;
+	int ret;
+
+	bch2_bkey_buf_init(&old_key);
+	bch2_bkey_buf_reassemble(&old_key, c, old);
+
+	/* Get the node via its old key; if that fails there is nothing to update */
+	b = bch2_btree_node_get_noiter(trans, old_key.k, btree, level, true);
+	if (IS_ERR_OR_NULL(b))
+		goto out;
+
+	/* Under the cache lock: unhash, switch to the new key, hash again */
+	mutex_lock(&c->btree_cache.lock);
+	bch2_btree_node_hash_remove(&c->btree_cache, b);
+	bkey_copy(&b->key, new);
+	ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+	BUG_ON(ret);
+	mutex_unlock(&c->btree_cache.lock);
+	six_unlock_read(&b->c.lock);
+out:
+	bch2_bkey_buf_exit(&old_key, c);
+}
+
+static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
+{
+	struct bkey_i_btree_ptr_v2 *new;
+	int ret;
+	/* Move @b's start to @new_min: in the node data, in a new v2 key, and in b->key */
+	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
+	if (!new)
+		return -BCH_ERR_ENOMEM_gc_repair_key;
+
+	btree_ptr_to_v2(b, new);
+	b->data->min_key	= new_min;
+	new->v.min_key		= new_min;
+	SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
+	/* The new key is inserted one level above @b */
+	ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
+	if (ret) {
+		kfree(new);	/* freed here only when the insert failed */
+		return ret;
+	}
+	/* NOTE(review): new is still read below, so the insert presumably keeps it alive */
+	bch2_btree_node_drop_keys_outside_node(b);
+	bkey_copy(&b->key, &new->k_i);
+	return 0;
+}
+
+static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
+{
+	struct bkey_i_btree_ptr_v2 *new;
+	int ret;
+	/* Move @b's end to @new_max; the key at the old end position is deleted first */
+	ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
+	if (ret)
+		return ret;
+
+	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
+	if (!new)
+		return -BCH_ERR_ENOMEM_gc_repair_key;
+
+	btree_ptr_to_v2(b, new);
+	b->data->max_key	= new_max;
+	new->k.p		= new_max;
+	SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
+	/* The replacement key is inserted one level above @b */
+	ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
+	if (ret) {
+		kfree(new);	/* freed here only when the insert failed */
+		return ret;
+	}
+
+	bch2_btree_node_drop_keys_outside_node(b);
+	/* b->key changes, so the node is unhashed and rehashed under the cache lock */
+	mutex_lock(&c->btree_cache.lock);
+	bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+	bkey_copy(&b->key, &new->k_i);
+	ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+	BUG_ON(ret);
+	mutex_unlock(&c->btree_cache.lock);
+	return 0;
+}
+
+static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
+					struct btree *prev, struct btree *cur)
+{
+	struct bpos expected_start = !prev
+		? b->data->min_key
+		: bpos_successor(prev->key.k.p);
+	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+	int ret = 0;
+
+	if (!prev) {
+		prt_printf(&buf1, "start of node: ");
+		bch2_bpos_to_text(&buf1, b->data->min_key);
+	} else {
+		bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key));
+	}
+	/* Returns 0, DROP_PREV_NODE, DROP_THIS_NODE, or whatever set_node_{min,max}() returned */
+	bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key));
+	/* Overlap with a cur whose BTREE_NODE_SEQ is higher is resolved in cur's favour */
+	if (prev &&
+	    bpos_gt(expected_start, cur->data->min_key) &&
+	    BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {
+		/* cur overwrites prev: */
+		/* prev starts at or after cur's start: ask the caller to drop prev */
+		if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key,
+						cur->data->min_key), c,
+				"btree node overwritten by next node at btree %s level %u:\n"
+				"  node %s\n"
+				"  next %s",
+				bch2_btree_ids[b->c.btree_id], b->c.level,
+				buf1.buf, buf2.buf)) {
+			ret = DROP_PREV_NODE;
+			goto out;
+		}
+		/* Otherwise trim prev so that it ends just before cur starts */
+		if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p,
+						 bpos_predecessor(cur->data->min_key)), c,
+				"btree node with incorrect max_key at btree %s level %u:\n"
+				"  node %s\n"
+				"  next %s",
+				bch2_btree_ids[b->c.btree_id], b->c.level,
+				buf1.buf, buf2.buf))
+			ret = set_node_max(c, prev,
+					   bpos_predecessor(cur->data->min_key));
+	} else {
+		/* prev overwrites cur: */
+		/* cur ends at or before expected_start: ask the caller to drop cur */
+		if (mustfix_fsck_err_on(bpos_ge(expected_start,
+						cur->data->max_key), c,
+				"btree node overwritten by prev node at btree %s level %u:\n"
+				"  prev %s\n"
+				"  node %s",
+				bch2_btree_ids[b->c.btree_id], b->c.level,
+				buf1.buf, buf2.buf)) {
+			ret = DROP_THIS_NODE;
+			goto out;
+		}
+		/* Otherwise make cur start exactly at expected_start */
+		if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c,
+				"btree node with incorrect min_key at btree %s level %u:\n"
+				"  prev %s\n"
+				"  node %s",
+				bch2_btree_ids[b->c.btree_id], b->c.level,
+				buf1.buf, buf2.buf))
+			ret = set_node_min(c, cur, expected_start);
+	}
+out:
+fsck_err:
+	printbuf_exit(&buf2);
+	printbuf_exit(&buf1);
+	return ret;
+}
+
+static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
+				 struct btree *child)
+{
+	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+	int ret = 0;
+
+	bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key));
+	bch2_bpos_to_text(&buf2, b->key.k.p);
+	/* @child must end exactly where @b's key ends; extend or trim it if not */
+	if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c,
+			"btree node with incorrect max_key at btree %s level %u:\n"
+			"  %s\n"
+			"  expected %s",
+			bch2_btree_ids[b->c.btree_id], b->c.level,
+			buf1.buf, buf2.buf)) {
+		ret = set_node_max(c, child, b->key.k.p);
+		if (ret)
+			goto err;
+	}
+err:
+fsck_err:
+	printbuf_exit(&buf2);
+	printbuf_exit(&buf1);
+	return ret;
+}
+
+static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_and_journal_iter iter;
+	struct bkey_s_c k;
+	struct bkey_buf prev_k, cur_k;
+	struct btree *prev = NULL, *cur = NULL;
+	bool have_child, dropped_children = false;
+	struct printbuf buf = PRINTBUF;
+	int ret = 0;
+	/* Repair the child boundaries of @b, then recurse; level 0 returns at once */
+	if (!b->c.level)
+		return 0;
+again:
+	prev = NULL;
+	have_child = dropped_children = false;
+	bch2_bkey_buf_init(&prev_k);
+	bch2_bkey_buf_init(&cur_k);
+	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+
+	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
+		BUG_ON(bpos_gt(k.k->p, b->data->max_key));
+		/* Advance first, so that "continue" below moves on to the next key */
+		bch2_btree_and_journal_iter_advance(&iter);
+		bch2_bkey_buf_reassemble(&cur_k, c, k);
+
+		cur = bch2_btree_node_get_noiter(trans, cur_k.k,
+					b->c.btree_id, b->c.level - 1,
+					false);
+		ret = PTR_ERR_OR_ZERO(cur);
+
+		printbuf_reset(&buf);
+		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
+		/* Unreadable child (-EIO): evict it and delete its key in @b */
+		if (mustfix_fsck_err_on(ret == -EIO, c,
+				"Topology repair: unreadable btree node at btree %s level %u:\n"
+				"  %s",
+				bch2_btree_ids[b->c.btree_id],
+				b->c.level - 1,
+				buf.buf)) {
+			bch2_btree_node_evict(trans, cur_k.k);
+			ret = bch2_journal_key_delete(c, b->c.btree_id,
+						      b->c.level, cur_k.k->k.p);
+			cur = NULL;
+			if (ret)
+				break;
+			continue;
+		}
+
+		if (ret) {
+			bch_err_msg(c, ret, "getting btree node");
+			break;
+		}
+		/* May adjust cur/prev, or ask for one of them to be dropped */
+		ret = btree_repair_node_boundaries(c, b, prev, cur);
+
+		if (ret == DROP_THIS_NODE) {
+			six_unlock_read(&cur->c.lock);
+			bch2_btree_node_evict(trans, cur_k.k);
+			ret = bch2_journal_key_delete(c, b->c.btree_id,
+						      b->c.level, cur_k.k->k.p);
+			cur = NULL;
+			if (ret)
+				break;
+			continue;
+		}
+
+		if (prev)
+			six_unlock_read(&prev->c.lock);
+		prev = NULL;
+
+		if (ret == DROP_PREV_NODE) {
+			bch2_btree_node_evict(trans, prev_k.k);
+			ret = bch2_journal_key_delete(c, b->c.btree_id,
+						      b->c.level, prev_k.k->k.p);
+			if (ret)
+				break;
+			/* prev was dropped: restart the scan of @b from scratch */
+			bch2_btree_and_journal_iter_exit(&iter);
+			bch2_bkey_buf_exit(&prev_k, c);
+			bch2_bkey_buf_exit(&cur_k, c);
+			goto again;
+		} else if (ret)
+			break;
+		/* cur is kept: it becomes prev (still read-locked) for the next key */
+		prev = cur;
+		cur = NULL;
+		bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
+	}
+	/* prev is now the last child kept: check it against the end of @b */
+	if (!ret && !IS_ERR_OR_NULL(prev)) {
+		BUG_ON(cur);
+		ret = btree_repair_node_end(c, b, prev);
+	}
+
+	if (!IS_ERR_OR_NULL(prev))
+		six_unlock_read(&prev->c.lock);
+	prev = NULL;
+	if (!IS_ERR_OR_NULL(cur))
+		six_unlock_read(&cur->c.lock);
+	cur = NULL;
+
+	if (ret)
+		goto err;
+	/* Second pass: recurse into each child; those reported empty get dropped */
+	bch2_btree_and_journal_iter_exit(&iter);
+	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+
+	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+		bch2_bkey_buf_reassemble(&cur_k, c, k);
+		bch2_btree_and_journal_iter_advance(&iter);
+
+		cur = bch2_btree_node_get_noiter(trans, cur_k.k,
+					b->c.btree_id, b->c.level - 1,
+					false);
+		ret = PTR_ERR_OR_ZERO(cur);
+
+		if (ret) {
+			bch_err_msg(c, ret, "getting btree node");
+			goto err;
+		}
+
+		ret = bch2_btree_repair_topology_recurse(trans, cur);
+		six_unlock_read(&cur->c.lock);
+		cur = NULL;
+
+		if (ret == DROP_THIS_NODE) {
+			bch2_btree_node_evict(trans, cur_k.k);
+			ret = bch2_journal_key_delete(c, b->c.btree_id,
+						      b->c.level, cur_k.k->k.p);
+			dropped_children = true;
+		}
+
+		if (ret)
+			goto err;
+
+		have_child = true;
+	}
+
+	printbuf_reset(&buf);
+	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+	/* No child seen in the second pass: tell the caller to drop @b itself */
+	if (mustfix_fsck_err_on(!have_child, c,
+			"empty interior btree node at btree %s level %u\n"
+			"  %s",
+			bch2_btree_ids[b->c.btree_id],
+			b->c.level, buf.buf))
+		ret = DROP_THIS_NODE;
+err:
+fsck_err:
+	if (!IS_ERR_OR_NULL(prev))
+		six_unlock_read(&prev->c.lock);
+	if (!IS_ERR_OR_NULL(cur))
+		six_unlock_read(&cur->c.lock);
+
+	bch2_btree_and_journal_iter_exit(&iter);
+	bch2_bkey_buf_exit(&prev_k, c);
+	bch2_bkey_buf_exit(&cur_k, c);
+	/* Children were dropped in the second pass: redo both passes */
+	if (!ret && dropped_children)
+		goto again;
+
+	printbuf_exit(&buf);
+	return ret;
+}
+
+int bch2_check_topology(struct bch_fs *c)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	unsigned id;
+	int ret = 0;
+
+	/* Walk every btree root slot, stopping at the first nonzero result */
+	for (id = 0; id < btree_id_nr_alive(c) && !ret; id++) {
+		struct btree_root *r = bch2_btree_id_root(c, id);
+		struct btree *root;
+
+		/* Roots that are not alive, or whose node is fake, are skipped */
+		if (!r->alive || btree_node_fake(r->b))
+			continue;
+
+		root = r->b;
+		btree_node_lock_nopath_nofail(trans, &root->c, SIX_LOCK_read);
+		ret = bch2_btree_repair_topology_recurse(trans, root);
+		six_unlock_read(&root->c.lock);
+
+		/* Dropping an empty root is not implemented: report an error */
+		if (ret == DROP_THIS_NODE) {
+			bch_err(c, "empty btree root - repair unimplemented");
+			ret = -BCH_ERR_fsck_repair_unimplemented;
+		}
+	}
+
+	bch2_trans_put(trans);
+
+	return ret;
+}
+
+static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id,
+			       unsigned level, bool is_root,
+			       struct bkey_s_c *k)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(*k);
+	const union bch_extent_entry *entry_c;
+	struct extent_ptr_decoded p = { 0 };
+	bool do_update = false;
+	struct printbuf buf = PRINTBUF;
+	int ret = 0;
+	/* Check each pointer in *@k against GC bucket/stripe state; returns 0 or an error */
+	/*
+	 * XXX
+	 * use check_bucket_ref here
+	 */
+	bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) {
+		struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+		struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+		enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr);
+		/* Bucket has no valid gen: adopt a dirty pointer's gen, else rewrite the key */
+		if (!g->gen_valid &&
+		    (c->opts.reconstruct_alloc ||
+		     fsck_err(c, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
+			      "while marking %s",
+			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+			      bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+			      p.ptr.gen,
+			      (printbuf_reset(&buf),
+			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
+			if (!p.ptr.cached) {
+				g->gen_valid		= true;
+				g->gen			= p.ptr.gen;
+			} else {
+				do_update = true;
+			}
+		}
+		/* Pointer gen is ahead of the bucket: a dirty pointer resets the bucket state */
+		if (gen_cmp(p.ptr.gen, g->gen) > 0 &&
+		    (c->opts.reconstruct_alloc ||
+		     fsck_err(c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
+			      "while marking %s",
+			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+			      bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+			      p.ptr.gen, g->gen,
+			      (printbuf_reset(&buf),
+			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
+			if (!p.ptr.cached) {
+				g->gen_valid		= true;
+				g->gen			= p.ptr.gen;
+				g->data_type		= 0;
+				g->dirty_sectors	= 0;
+				g->cached_sectors	= 0;
+				set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+			} else {
+				do_update = true;
+			}
+		}
+		/* Pointer is more than BUCKET_GC_GEN_MAX gens behind the bucket */
+		if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX &&
+		    (c->opts.reconstruct_alloc ||
+		     fsck_err(c, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+			      "while marking %s",
+			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
+			      bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+			      p.ptr.gen,
+			      (printbuf_reset(&buf),
+			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
+			do_update = true;
+		/* A dirty (non-cached) pointer must not be behind the bucket gen at all */
+		if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 &&
+		    (c->opts.reconstruct_alloc ||
+		     fsck_err(c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
+			      "while marking %s",
+			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+			      bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+			      p.ptr.gen, g->gen,
+			      (printbuf_reset(&buf),
+			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
+			do_update = true;
+		/* Remaining checks skip non-btree pointers whose gen differs from the bucket's */
+		if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
+			continue;
+
+		if (fsck_err_on(bucket_data_type(g->data_type) &&
+				bucket_data_type(g->data_type) != data_type, c,
+				"bucket %u:%zu different types of data in same bucket: %s, %s\n"
+				"while marking %s",
+				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+				bch2_data_types[g->data_type],
+				bch2_data_types[data_type],
+				(printbuf_reset(&buf),
+				 bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
+			if (data_type == BCH_DATA_btree) {
+				g->data_type	= data_type;
+				set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+			} else {
+				do_update = true;
+			}
+		}
+		/* Erasure-coded pointer: the stripe must exist, be alive, and match */
+		if (p.has_ec) {
+			struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
+
+			if (fsck_err_on(!m || !m->alive, c,
+					"pointer to nonexistent stripe %llu\n"
+					"while marking %s",
+					(u64) p.ec.idx,
+					(printbuf_reset(&buf),
+					 bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
+				do_update = true;
+
+			if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c,
+					"pointer does not match stripe %llu\n"
+					"while marking %s",
+					(u64) p.ec.idx,
+					(printbuf_reset(&buf),
+					 bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
+				do_update = true;
+		}
+	}
+	/* At least one check asked for the key itself to be rewritten */
+	if (do_update) {
+		struct bkey_ptrs ptrs;
+		union bch_extent_entry *entry;
+		struct bch_extent_ptr *ptr;
+		struct bkey_i *new;
+
+		if (is_root) {
+			bch_err(c, "cannot update btree roots yet");
+			ret = -EINVAL;
+			goto err;
+		}
+		/* Repairs are made on a heap copy of the key */
+		new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
+		if (!new) {
+			ret = -BCH_ERR_ENOMEM_gc_repair_key;	/* set before logging so the message reports it */
+			bch_err_msg(c, ret, "allocating new key");
+			goto err;
+		}
+
+		bkey_reassemble(new, *k);
+
+		if (level) {
+			/*
+			 * We don't want to drop btree node pointers - if the
+			 * btree node isn't there anymore, the read path will
+			 * sort it out:
+			 */
+			ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+			bkey_for_each_ptr(ptrs, ptr) {
+				struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+				struct bucket *g = PTR_GC_BUCKET(ca, ptr);
+
+				ptr->gen = g->gen;
+			}
+		} else {
+			bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
+				struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+				struct bucket *g = PTR_GC_BUCKET(ca, ptr);
+				enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr);
+
+				(ptr->cached &&
+				 (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) ||
+				(!ptr->cached &&
+				 gen_cmp(ptr->gen, g->gen) < 0) ||
+				gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
+				(g->data_type &&
+				 g->data_type != data_type);
+			}));
+again:
+			ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+			bkey_extent_entry_for_each(ptrs, entry) {
+				if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
+					struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
+									entry->stripe_ptr.idx);
+					union bch_extent_entry *next_ptr;
+
+					bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
+						if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
+							goto found;
+					next_ptr = NULL;
+found:
+					if (!next_ptr) {
+						bch_err(c, "aieee, found stripe ptr with no data ptr");
+						continue;
+					}
+
+					if (!m || !m->alive ||
+					    !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
+								       &next_ptr->ptr,
+								       m->sectors)) {
+						bch2_bkey_extent_entry_drop(new, entry);
+						goto again;
+					}
+				}
+			}
+		}
+		/* Insert the repaired key into the journal keys; freed here only on failure */
+		ret = bch2_journal_key_insert_take(c, btree_id, level, new);
+		if (ret) {
+			kfree(new);
+			goto err;
+		}
+
+		if (level)
+			bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new);
+
+		if (0) {
+			printbuf_reset(&buf);
+			bch2_bkey_val_to_text(&buf, c, *k);
+			bch_info(c, "updated %s", buf.buf);
+
+			printbuf_reset(&buf);
+			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
+			bch_info(c, "new key %s", buf.buf);
+		}
+		/* Hand the repaired key back to the caller through @k */
+		*k = bkey_i_to_s_c(new);
+	}
+err:
+fsck_err:
+	printbuf_exit(&buf);
+	return ret;
+}
+
+/* marking of btree keys/nodes: */
+
+static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
+			    unsigned level, bool is_root,
+			    struct bkey_s_c *k,
+			    bool initial)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey deleted = KEY(0, 0, 0);
+	struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
+	unsigned flags =
+		BTREE_TRIGGER_GC|
+		(initial ? BTREE_TRIGGER_NOATOMIC : 0);
+	int ret = 0;
+	/* *@k is marked as replacing a deleted key at the same position */
+	deleted.p = k->k->p;
+
+	if (initial) {
+		BUG_ON(bch2_journal_seq_verify &&
+		       k->k->version.lo > atomic64_read(&c->journal.seq));
+		/* Initial GC: check/repair the pointers first; this may replace *@k */
+		ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
+		if (ret)
+			goto err;
+		/* Raise the recorded key_version if this key's version is higher */
+		if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c,
+				"key version number higher than recorded: %llu > %llu",
+				k->k->version.lo,
+				atomic64_read(&c->key_version)))
+			atomic64_set(&c->key_version, k->k->version.lo);
+	}
+
+	ret = commit_do(trans, NULL, NULL, 0,
+			bch2_mark_key(trans, btree_id, level, old, *k, flags));
+fsck_err:
+err:
+	if (ret)
+		bch_err_fn(c, ret);
+	return ret;
+}
+
+static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_node_iter iter;
+	struct bkey unpacked;
+	struct bkey_s_c k;
+	struct bkey_buf prev, cur;
+	int ret = 0;
+	/* Mark every key in @b; above level 0 also check each key's topology */
+	if (!btree_node_type_needs_gc(btree_node_type(b)))
+		return 0;
+
+	bch2_btree_node_iter_init_from_start(&iter, b);
+	bch2_bkey_buf_init(&prev);
+	bch2_bkey_buf_init(&cur);
+	bkey_init(&prev.k->k);	/* NOTE(review): presumably leaves prev "deleted" (no previous key) - confirm */
+
+	while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
+		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
+				       &k, initial);
+		if (ret)
+			break;
+		/* Advance before the check so iter_end() below reports whether k was last */
+		bch2_btree_node_iter_advance(&iter, b);
+
+		if (b->c.level) {
+			bch2_bkey_buf_reassemble(&cur, c, k);
+
+			ret = bch2_gc_check_topology(c, b, &prev, cur,
+					bch2_btree_node_iter_end(&iter));
+			if (ret)
+				break;
+		}
+	}
+
+	bch2_bkey_buf_exit(&cur, c);
+	bch2_bkey_buf_exit(&prev, c);
+	return ret;
+}
+
+static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
+			 bool initial, bool metadata_only)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct btree *b;
+	unsigned depth = metadata_only ? 1 : 0;
+	int ret = 0;
+	/* GC pass over one btree: mark its nodes, then the key pointing to the root */
+	gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
+
+	__for_each_btree_node(trans, iter, btree_id, POS_MIN,
+			      0, depth, BTREE_ITER_PREFETCH, b, ret) {
+		bch2_verify_btree_nr_keys(b);
+		/* The gc position is moved to this node before its keys are marked */
+		gc_pos_set(c, gc_pos_btree_node(b));
+
+		ret = btree_gc_mark_node(trans, b, initial);
+		if (ret)
+			break;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
+	if (ret)
+		return ret;
+	/* The root's key is marked one level above the root, with is_root set */
+	mutex_lock(&c->btree_root_lock);
+	b = bch2_btree_id_root(c, btree_id)->b;
+	if (!btree_node_fake(b)) {
+		struct bkey_s_c k = bkey_i_to_s_c(&b->key);
+
+		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1,
+				       true, &k, initial);
+	}
+	gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
+	mutex_unlock(&c->btree_root_lock);
+
+	return ret;
+}
+
+static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b,
+				      unsigned target_depth)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_and_journal_iter iter;
+	struct bkey_s_c k;
+	struct bkey_buf cur, prev;
+	struct printbuf buf = PRINTBUF;
+	int ret = 0;
+	/* Initial GC of @b: mark its keys, then descend while level > @target_depth */
+	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+	bch2_bkey_buf_init(&prev);
+	bch2_bkey_buf_init(&cur);
+	bkey_init(&prev.k->k);
+
+	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
+		BUG_ON(bpos_gt(k.k->p, b->data->max_key));
+
+		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level,
+				       false, &k, true);
+		if (ret)
+			goto fsck_err;
+
+		if (b->c.level) {
+			bch2_bkey_buf_reassemble(&cur, c, k);
+			k = bkey_i_to_s_c(cur.k);
+			/* cur holds its own copy of the key, so advancing here is safe */
+			bch2_btree_and_journal_iter_advance(&iter);
+			/* is_last: true when no further key follows in this node */
+			ret = bch2_gc_check_topology(c, b,
+					&prev, cur,
+					!bch2_btree_and_journal_iter_peek(&iter).k);
+			if (ret)
+				goto fsck_err;
+		} else {
+			bch2_btree_and_journal_iter_advance(&iter);
+		}
+	}
+	/* Second pass over the same keys, this time descending into the children */
+	if (b->c.level > target_depth) {
+		bch2_btree_and_journal_iter_exit(&iter);
+		bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+
+		while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+			struct btree *child;
+
+			bch2_bkey_buf_reassemble(&cur, c, k);
+			bch2_btree_and_journal_iter_advance(&iter);
+
+			child = bch2_btree_node_get_noiter(trans, cur.k,
+						b->c.btree_id, b->c.level - 1,
+						false);
+			ret = PTR_ERR_OR_ZERO(child);
+			/* Unreadable child: report it, then start topology repair or skip it */
+			if (ret == -EIO) {
+				bch2_topology_error(c);
+
+				if (__fsck_err(c,
+					  FSCK_CAN_FIX|
+					  FSCK_CAN_IGNORE|
+					  FSCK_NO_RATELIMIT,
+					  "Unreadable btree node at btree %s level %u:\n"
+					  "  %s",
+					  bch2_btree_ids[b->c.btree_id],
+					  b->c.level - 1,
+					  (printbuf_reset(&buf),
+					   bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) &&
+				    should_restart_for_topology_repair(c)) {
+					bch_info(c, "Halting mark and sweep to start topology repair pass");
+					ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
+					goto fsck_err;
+				} else {
+					/* Continue marking when opted to not
+					 * fix the error: */
+					ret = 0;
+					set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+					continue;
+				}
+			} else if (ret) {
+				bch_err_msg(c, ret, "getting btree node");
+				break;
+			}
+
+			ret = bch2_gc_btree_init_recurse(trans, child,
+							 target_depth);
+			six_unlock_read(&child->c.lock);
+
+			if (ret)
+				break;
+		}
+	}
+fsck_err:
+	bch2_bkey_buf_exit(&cur, c);
+	bch2_bkey_buf_exit(&prev, c);
+	bch2_btree_and_journal_iter_exit(&iter);
+	printbuf_exit(&buf);
+	return ret;
+}
+
+static int bch2_gc_btree_init(struct btree_trans *trans,
+			      enum btree_id btree_id,
+			      bool metadata_only)
+{
+	struct bch_fs *c = trans->c;
+	struct btree *b;
+	unsigned target_depth = metadata_only ? 1 : 0;
+	struct printbuf buf = PRINTBUF;
+	int ret = 0;
+	/* Initial GC for one btree, starting from its root node */
+	b = bch2_btree_id_root(c, btree_id)->b;
+	/* Fake roots are skipped, before any lock is taken */
+	if (btree_node_fake(b))
+		return 0;
+	/* A root must span POS_MIN..SPOS_MAX; repairing anything else is unimplemented */
+	six_lock_read(&b->c.lock, NULL, NULL);
+	printbuf_reset(&buf);
+	bch2_bpos_to_text(&buf, b->data->min_key);
+	if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c,
+			"btree root with incorrect min_key: %s", buf.buf)) {
+		bch_err(c, "repair unimplemented");
+		ret = -BCH_ERR_fsck_repair_unimplemented;
+		goto fsck_err;
+	}
+
+	printbuf_reset(&buf);
+	bch2_bpos_to_text(&buf, b->data->max_key);
+	if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c,
+			"btree root with incorrect max_key: %s", buf.buf)) {
+		bch_err(c, "repair unimplemented");
+		ret = -BCH_ERR_fsck_repair_unimplemented;
+		goto fsck_err;
+	}
+
+	if (b->c.level >= target_depth)
+		ret = bch2_gc_btree_init_recurse(trans, b, target_depth);
+	/* Finally mark the root's own key, one level above the root */
+	if (!ret) {
+		struct bkey_s_c k = bkey_i_to_s_c(&b->key);
+
+		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, true,
+				       &k, true);
+	}
+fsck_err:
+	six_unlock_read(&b->c.lock);
+
+	if (ret < 0)
+		bch_err_fn(c, ret);
+	printbuf_exit(&buf);
+	return ret;
+}
+
+/* Sort comparator: orders btree ids by the gc phase each one maps to. */
+static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
+{
+	return (int) btree_id_to_gc_phase(l) - (int) btree_id_to_gc_phase(r);
+}
+
+/* Run gc on every btree: the known ids sorted by gc phase first, then any
+ * additional roots beyond BTREE_ID_NR that are marked alive. */
+static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	enum btree_id ids[BTREE_ID_NR];
+	unsigned i;
+	int ret = 0;
+
+	for (i = 0; i < BTREE_ID_NR; i++)
+		ids[i] = i;
+	bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
+
+	for (i = 0; i < BTREE_ID_NR && !ret; i++)
+		ret = initial
+			? bch2_gc_btree_init(trans, ids[i], metadata_only)
+			: bch2_gc_btree(trans, ids[i], initial, metadata_only);
+
+	/* roots past the static id range are only walked when alive: */
+	for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++)
+		if (bch2_btree_id_root(c, i)->alive)
+			ret = initial
+				? bch2_gc_btree_init(trans, i, metadata_only)
+				: bch2_gc_btree(trans, i, initial, metadata_only);
+
+	if (ret < 0)
+		bch_err_fn(c, ret);
+
+	bch2_trans_put(trans);
+	return ret;
+}
+
+/* Mark sectors [start, end) of @ca as @type for gc, one bucket at a time. */
+static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
+				  u64 start, u64 end,
+				  enum bch_data_type type, unsigned flags)
+{
+	u64 bucket = sector_to_bucket(ca, start);
+
+	do {
+		u64 bucket_end = bucket_to_sector(ca, bucket + 1);
+		unsigned sectors = min_t(u64, bucket_end, end) - start;
+
+		bch2_mark_metadata_bucket(c, ca, bucket, type, sectors,
+					  gc_phase(GC_PHASE_SB), flags);
+		bucket++;
+		start += sectors;
+	} while (start < end);
+}
+
+/* Mark every superblock copy listed in @ca's layout, and every journal bucket
+ * on @ca, as metadata for gc. */
+static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
+				     unsigned flags)
+{
+	struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+	unsigned sb_idx, j_idx;
+
+	for (sb_idx = 0; sb_idx < layout->nr_superblocks; sb_idx++) {
+		u64 sb_start = le64_to_cpu(layout->sb_offset[sb_idx]);
+		u64 sb_end = sb_start + (1 << layout->sb_max_size_bits);
+
+		/* a superblock at BCH_SB_SECTOR: also mark the sectors before it */
+		if (sb_start == BCH_SB_SECTOR)
+			mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
+					      BCH_DATA_sb, flags);
+
+		mark_metadata_sectors(c, ca, sb_start, sb_end,
+				      BCH_DATA_sb, flags);
+	}
+
+	for (j_idx = 0; j_idx < ca->journal.nr; j_idx++)
+		bch2_mark_metadata_bucket(c, ca, ca->journal.buckets[j_idx],
+					  BCH_DATA_journal, ca->mi.bucket_size,
+					  gc_phase(GC_PHASE_SB), flags);
+}
+
+/* Enter the superblock gc phase and mark each online member, under sb_lock. */
+static void bch2_mark_superblocks(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned dev_idx;
+
+	mutex_lock(&c->sb_lock);
+	gc_pos_set(c, gc_phase(GC_PHASE_SB));
+	for_each_online_member(ca, c, dev_idx)
+		bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC);
+	mutex_unlock(&c->sb_lock);
+}
+
+#if 0
+/* Compiled out. Also see bch2_pending_btree_node_free_insert_done() */
+static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
+{
+	struct btree_update *as;
+	struct pending_btree_node_free *d;
+
+	mutex_lock(&c->btree_interior_update_lock);
+	gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));
+	/* only frees whose index update has completed are marked: */
+	for_each_pending_btree_node_free(c, as, d)
+		if (d->index_update_done)
+			bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC);
+
+	mutex_unlock(&c->btree_interior_update_lock);
+}
+#endif
+
+/* Release all gc-only state: reflink/stripe tables, gc buckets, gc usage. */
+static void bch2_gc_free(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned dev_idx;
+
+	genradix_free(&c->reflink_gc_table);
+	genradix_free(&c->gc_stripes);
+	for_each_member_device(ca, c, dev_idx) {
+		size_t bytes = sizeof(struct bucket_array) +
+			ca->mi.nbuckets * sizeof(struct bucket);
+
+		kvpfree(rcu_dereference_protected(ca->buckets_gc, 1), bytes);
+		ca->buckets_gc = NULL;
+		free_percpu(ca->usage_gc);
+		ca->usage_gc = NULL;
+	}
+
+	free_percpu(c->usage_gc);
+	c->usage_gc = NULL;
+}
+
+static int bch2_gc_done(struct bch_fs *c,
+			bool initial, bool metadata_only)
+{
+	struct bch_dev *ca = NULL;
+	struct printbuf buf = PRINTBUF;
+	bool verify = !metadata_only &&
+		!c->opts.reconstruct_alloc &&
+		(!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
+	unsigned i, dev;
+	int ret = 0;
+	/* Reconcile the live usage counters (dst) with what gc just computed (src): */
+	percpu_down_write(&c->mark_lock);
+	/* dst->_f = src->_f on mismatch; with verify, only when fsck_err() returns true: */
+#define copy_field(_f, _msg, ...)					\
+	if (dst->_f != src->_f &&					\
+	    (!verify ||							\
+	     fsck_err(c, _msg ": got %llu, should be %llu"		\
+		      , ##__VA_ARGS__, dst->_f, src->_f)))		\
+		dst->_f = src->_f
+#define copy_dev_field(_f, _msg, ...)					\
+	copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
+#define copy_fs_field(_f, _msg, ...)					\
+	copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
+
+	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+		bch2_fs_usage_acc_to_base(c, i);
+	/* per-device totals: ca->usage_gc -> ca->usage_base */
+	for_each_member_device(ca, c, dev) {
+		struct bch_dev_usage *dst = ca->usage_base;
+		struct bch_dev_usage *src = (void *)
+			bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc,
+					     dev_usage_u64s());
+
+		copy_dev_field(buckets_ec,		"buckets_ec");
+
+		for (i = 0; i < BCH_DATA_NR; i++) {
+			copy_dev_field(d[i].buckets,	"%s buckets", bch2_data_types[i]);
+			copy_dev_field(d[i].sectors,	"%s sectors", bch2_data_types[i]);
+			copy_dev_field(d[i].fragmented,	"%s fragmented", bch2_data_types[i]);
+		}
+	}
+	/* filesystem-wide totals: c->usage_gc -> c->usage_base */
+	{
+		unsigned nr = fs_usage_u64s(c);
+		struct bch_fs_usage *dst = c->usage_base;
+		struct bch_fs_usage *src = (void *)
+			bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);
+
+		copy_fs_field(hidden,		"hidden");
+		copy_fs_field(btree,		"btree");
+
+		if (!metadata_only) {
+			copy_fs_field(data,	"data");
+			copy_fs_field(cached,	"cached");
+			copy_fs_field(reserved,	"reserved");
+			copy_fs_field(nr_inodes,"nr_inodes");
+
+			for (i = 0; i < BCH_REPLICAS_MAX; i++)
+				copy_fs_field(persistent_reserved[i],
+					      "persistent_reserved[%i]", i);
+		}
+
+		for (i = 0; i < c->replicas.nr; i++) {
+			struct bch_replicas_entry *e =
+				cpu_replicas_entry(&c->replicas, i);
+			/* metadata-only runs leave user and cached replicas entries alone: */
+			if (metadata_only &&
+			    (e->data_type == BCH_DATA_user ||
+			     e->data_type == BCH_DATA_cached))
+				continue;
+
+			printbuf_reset(&buf);
+			bch2_replicas_entry_to_text(&buf, e);
+
+			copy_fs_field(replicas[i], "%s", buf.buf);
+		}
+	}
+
+#undef copy_fs_field
+#undef copy_dev_field
+#undef copy_stripe_field
+#undef copy_field
+fsck_err:
+	if (ca)
+		percpu_ref_put(&ca->ref);
+	if (ret)
+		bch_err_fn(c, ret);
+
+	percpu_up_write(&c->mark_lock);
+	printbuf_exit(&buf);
+	return ret;
+}
+
+/* Allocate percpu gc usage counters for the fs and each member device; all
+ * buckets from first_bucket up start out counted as BCH_DATA_free. */
+static int bch2_gc_start(struct bch_fs *c)
+{
+	struct bch_dev *ca = NULL;
+	unsigned dev_idx;
+
+	BUG_ON(c->usage_gc);
+	c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
+					 sizeof(u64), GFP_KERNEL);
+	if (!c->usage_gc) {
+		bch_err(c, "error allocating c->usage_gc");
+		return -BCH_ERR_ENOMEM_gc_start;
+	}
+
+	for_each_member_device(ca, c, dev_idx) {
+		BUG_ON(ca->usage_gc);
+		ca->usage_gc = alloc_percpu(struct bch_dev_usage);
+		if (!ca->usage_gc) {
+			bch_err(c, "error allocating ca->usage_gc");
+			percpu_ref_put(&ca->ref);
+			return -BCH_ERR_ENOMEM_gc_start;
+		}
+
+		this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets,
+			       ca->mi.nbuckets - ca->mi.first_bucket);
+	}
+
+	return 0;
+}
+
+/* Drop the gc usage counters and allocate fresh ones via bch2_gc_start(). */
+static int bch2_gc_reset(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned dev_idx;
+
+	for_each_member_device(ca, c, dev_idx) {
+		free_percpu(ca->usage_gc);
+		ca->usage_gc = NULL;
+	}
+
+	free_percpu(c->usage_gc);
+	c->usage_gc = NULL;
+	return bch2_gc_start(c);
+}
+
+/* Compares a fixed set of alloc fields; returns true if any of them differ. */
+static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
+				     struct bch_alloc_v4 r)
+{
+	return !(l.gen			== r.gen		&&
+		 l.oldest_gen		== r.oldest_gen		&&
+		 l.data_type		== r.data_type		&&
+		 l.dirty_sectors	== r.dirty_sectors	&&
+		 l.cached_sectors	== r.cached_sectors	&&
+		 l.stripe_redundancy	== r.stripe_redundancy	&&
+		 l.stripe		== r.stripe);
+}
+
+static int bch2_alloc_write_key(struct btree_trans *trans,
+				struct btree_iter *iter,
+				struct bkey_s_c k,
+				bool metadata_only)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
+	struct bucket gc, *b;
+	struct bkey_i_alloc_v4 *a;
+	struct bch_alloc_v4 old_convert, new;
+	const struct bch_alloc_v4 *old;
+	enum bch_data_type type;
+	int ret;
+	/* Reconcile one alloc key with gc's view of its bucket, rewriting it if they differ: */
+	if (bkey_ge(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets)))
+		return 1;	/* position is at or past the device's last bucket */
+
+	old = bch2_alloc_to_v4(k, &old_convert);
+	new = *old;
+
+	percpu_down_read(&c->mark_lock);
+	b = gc_bucket(ca, iter->pos.offset);
+
+	/*
+	 * b->data_type doesn't yet include need_discard & need_gc_gen states -
+	 * fix that here:
+	 */
+	type = __alloc_data_type(b->dirty_sectors,
+				 b->cached_sectors,
+				 b->stripe,
+				 *old,
+				 b->data_type);
+	if (b->data_type != type) {
+		struct bch_dev_usage *u;
+
+		preempt_disable();
+		u = this_cpu_ptr(ca->usage_gc);
+		u->d[b->data_type].buckets--;
+		b->data_type = type;
+		u->d[b->data_type].buckets++;
+		preempt_enable();
+	}
+
+	gc = *b;
+	percpu_up_read(&c->mark_lock);
+	/* metadata-only runs only touch sb, journal and btree buckets: */
+	if (metadata_only &&
+	    gc.data_type != BCH_DATA_sb &&
+	    gc.data_type != BCH_DATA_journal &&
+	    gc.data_type != BCH_DATA_btree)
+		return 0;
+	/* leave the key alone if its gen is newer than the one gc saw: */
+	if (gen_after(old->gen, gc.gen))
+		return 0;
+
+	if (c->opts.reconstruct_alloc ||
+	    fsck_err_on(new.data_type != gc.data_type, c,
+			"bucket %llu:%llu gen %u has wrong data_type"
+			": got %s, should be %s",
+			iter->pos.inode, iter->pos.offset,
+			gc.gen,
+			bch2_data_types[new.data_type],
+			bch2_data_types[gc.data_type]))
+		new.data_type = gc.data_type;
+
+#define copy_bucket_field(_f)						\
+	if (c->opts.reconstruct_alloc ||				\
+	    fsck_err_on(new._f != gc._f, c,				\
+			"bucket %llu:%llu gen %u data type %s has wrong " #_f	\
+			": got %u, should be %u",			\
+			iter->pos.inode, iter->pos.offset,		\
+			gc.gen,						\
+			bch2_data_types[gc.data_type],			\
+			new._f, gc._f))					\
+		new._f = gc._f;						\
+
+	copy_bucket_field(gen);
+	copy_bucket_field(dirty_sectors);
+	copy_bucket_field(cached_sectors);
+	copy_bucket_field(stripe_redundancy);
+	copy_bucket_field(stripe);
+#undef copy_bucket_field
+	/* nothing differs from the existing key - no update needed: */
+	if (!bch2_alloc_v4_cmp(*old, new))
+		return 0;
+
+	a = bch2_alloc_to_v4_mut(trans, k);
+	ret = PTR_ERR_OR_ZERO(a);
+	if (ret)
+		return ret;
+
+	a->v = new;
+
+	/*
+	 * The trigger normally makes sure this is set, but we're not running
+	 * triggers:
+	 */
+	if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
+		a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+
+	ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN);
+fsck_err:
+	return ret;
+}
+
+/* Write gc's per-bucket state back to the alloc btree, device by device. */
+static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bch_dev *ca;
+	unsigned dev_idx;
+	int ret = 0;
+
+	for_each_member_device(ca, c, dev_idx) {
+		ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
+				POS(ca->dev_idx, ca->mi.first_bucket),
+				BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
+				NULL, NULL, BTREE_INSERT_LAZY_RW,
+			bch2_alloc_write_key(trans, &iter, k, metadata_only));
+		if (ret >= 0)
+			continue;
+		bch_err_fn(c, ret);
+		percpu_ref_put(&ca->ref);
+		break;
+	}
+
+	bch2_trans_put(trans);
+	return ret < 0 ? ret : 0;
+}
+
+/* Allocate a zeroed gc bucket array per device and seed bucket gens from the
+ * alloc btree; metadata-only runs also keep user/cached/parity bucket counts. */
+static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bch_dev *ca;
+	unsigned dev_idx;
+	int ret;
+
+	for_each_member_device(ca, c, dev_idx) {
+		struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
+				ca->mi.nbuckets * sizeof(struct bucket),
+				GFP_KERNEL|__GFP_ZERO);
+		if (!buckets) {
+			percpu_ref_put(&ca->ref);
+			bch_err(c, "error allocating ca->buckets[gc]");
+			ret = -BCH_ERR_ENOMEM_gc_alloc_start;
+			goto err;
+		}
+
+		buckets->first_bucket	= ca->mi.first_bucket;
+		buckets->nbuckets	= ca->mi.nbuckets;
+		rcu_assign_pointer(ca->buckets_gc, buckets);
+	}
+
+	for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
+			   BTREE_ITER_PREFETCH, k, ret) {
+		struct bch_alloc_v4 a_convert;
+		const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
+		struct bucket *g;
+
+		ca = bch_dev_bkey_exists(c, k.k->p.inode);
+		g = gc_bucket(ca, k.k->p.offset);
+		g->gen_valid	= 1;
+		g->gen		= a->gen;
+
+		if (!metadata_only ||
+		    (a->data_type != BCH_DATA_user &&
+		     a->data_type != BCH_DATA_cached &&
+		     a->data_type != BCH_DATA_parity))
+			continue;
+		g->data_type		= a->data_type;
+		g->dirty_sectors	= a->dirty_sectors;
+		g->cached_sectors	= a->cached_sectors;
+		g->stripe		= a->stripe;
+		g->stripe_redundancy	= a->stripe_redundancy;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+err:
+	bch2_trans_put(trans);
+	if (ret)
+		bch_err_fn(c, ret);
+	return ret;
+}
+
+/* Zero gc bucket counts for a rerun; metadata-only keeps user/cached/parity. */
+static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
+{
+	struct bch_dev *ca;
+	unsigned dev_idx;
+
+	for_each_member_device(ca, c, dev_idx) {
+		struct bucket_array *buckets = gc_bucket_array(ca);
+		struct bucket *g;
+
+		for_each_bucket(g, buckets)
+			if (!metadata_only ||
+			    (g->data_type != BCH_DATA_user &&
+			     g->data_type != BCH_DATA_cached &&
+			     g->data_type != BCH_DATA_parity)) {
+				g->data_type = 0;
+				g->dirty_sectors = 0;
+				g->cached_sectors = 0;
+			}
+	}
+}
+
+/* Check one reflink key's refcount against the gc-computed value and repair it
+ * if fsck allows (a computed count of zero deletes the key).  @idx is a cursor
+ * into c->reflink_gc_table that only moves forward. */
+static int bch2_gc_write_reflink_key(struct btree_trans *trans,
+				     struct btree_iter *iter,
+				     struct bkey_s_c k, size_t *idx)
+{
+	struct bch_fs *c = trans->c;
+	const __le64 *refcount = bkey_refcount_c(k);
+	struct printbuf buf = PRINTBUF;
+	struct reflink_gc *r;
+	int ret = 0;
+
+	if (!refcount)
+		return 0;
+
+	while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
+	       r->offset < k.k->p.offset)
+		++*idx;
+
+	if (!r || r->offset != k.k->p.offset || r->size != k.k->size) {
+		bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
+		return -EINVAL;
+	}
+
+	if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
+			"reflink key has wrong refcount:\n"
+			"  %s\n"
+			"  should be %u",
+			(bch2_bkey_val_to_text(&buf, c, k), buf.buf),
+			r->refcount)) {
+		struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0);
+
+		/* buf was filled while reporting: leave via fsck_err so it is freed */
+		ret = PTR_ERR_OR_ZERO(new);
+		if (ret)
+			goto fsck_err;
+		if (!r->refcount)
+			new->k.type = KEY_TYPE_deleted;
+		else
+			*bkey_refcount(new) = cpu_to_le64(r->refcount);
+	}
+fsck_err:
+	printbuf_exit(&buf);
+	return ret;
+}
+
+/* Write gc-computed refcounts back to the reflink btree (skipped for
+ * metadata-only runs), then reset the reflink gc table's entry count. */
+static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
+{
+	struct btree_trans *trans;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	size_t table_idx = 0;
+	int ret = 0;
+
+	if (metadata_only)
+		return 0;
+
+	trans = bch2_trans_get(c);
+	ret = for_each_btree_key_commit(trans, iter,
+			BTREE_ID_reflink, POS_MIN,
+			BTREE_ITER_PREFETCH, k,
+			NULL, NULL, BTREE_INSERT_NOFAIL,
+		bch2_gc_write_reflink_key(trans, &iter, k, &table_idx));
+	c->reflink_gc_nr = 0;
+	bch2_trans_put(trans);
+	return ret;
+}
+
+/* Build c->reflink_gc_table: one zero-refcount (offset, size) entry per reflink
+ * btree key that carries a refcount.  Does nothing when metadata_only. */
+static int bch2_gc_reflink_start(struct bch_fs *c, bool metadata_only)
+{
+	struct btree_trans *trans;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	if (metadata_only)
+		return 0;
+
+	trans = bch2_trans_get(c);
+	c->reflink_gc_nr = 0;
+
+	for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
+			   BTREE_ITER_PREFETCH, k, ret) {
+		struct reflink_gc *entry;
+
+		if (!bkey_refcount_c(k))
+			continue;
+
+		entry = genradix_ptr_alloc(&c->reflink_gc_table,
+					   c->reflink_gc_nr++, GFP_KERNEL);
+		if (!entry) {
+			ret = -BCH_ERR_ENOMEM_gc_reflink_start;
+			break;
+		}
+
+		entry->offset	= k.k->p.offset;
+		entry->size	= k.k->size;
+		entry->refcount	= 0;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
+	bch2_trans_put(trans);
+	return ret;
+}
+
+static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only)
+{
+	struct genradix_iter pos;
+	struct reflink_gc *entry;
+	/* Zero every gc refcount for the next pass; metadata_only is unused. */
+	genradix_for_each(&c->reflink_gc_table, pos, entry)
+		entry->refcount = 0;
+}
+
+/* Check a stripe key's per-block sector counts against gc's (zero when gc has
+ * no entry for the stripe) and rewrite the key if fsck allows the repair. */
+static int bch2_gc_write_stripes_key(struct btree_trans *trans,
+				     struct btree_iter *iter, struct bkey_s_c k)
+{
+	struct bch_fs *c = trans->c;
+	struct printbuf buf = PRINTBUF;
+	const struct bch_stripe *s;
+	struct gc_stripe *m;
+	bool bad = false;
+	unsigned i;
+	int ret = 0;
+
+	if (k.k->type != KEY_TYPE_stripe)
+		return 0;
+
+	s = bkey_s_c_to_stripe(k).v;
+	m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
+
+	for (i = 0; i < s->nr_blocks; i++) {
+		u32 old = stripe_blockcount_get(s, i);
+		u32 new = (m ? m->block_sectors[i] : 0);
+
+		if (old != new) {
+			prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n",
+				   i, old, new);
+			bad = true;
+		}
+	}
+
+	if (bad)
+		bch2_bkey_val_to_text(&buf, c, k);
+
+	if (fsck_err_on(bad, c, "%s", buf.buf)) {
+		struct bkey_i_stripe *new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+
+		/* buf holds the report text: leave via fsck_err so it is freed */
+		ret = PTR_ERR_OR_ZERO(new);
+		if (ret)
+			goto fsck_err;
+
+		bkey_reassemble(&new->k_i, k);
+		for (i = 0; i < new->v.nr_blocks; i++)
+			stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
+
+		ret = bch2_trans_update(trans, iter, &new->k_i, 0);
+	}
+fsck_err:
+	printbuf_exit(&buf);
+	return ret;
+}
+
+/* Write gc's stripe block counts back to the stripes btree; skipped entirely
+ * for metadata-only runs. */
+static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
+{
+	struct btree_trans *trans;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	if (!metadata_only) {
+		trans = bch2_trans_get(c);
+		ret = for_each_btree_key_commit(trans, iter,
+				BTREE_ID_stripes, POS_MIN,
+				BTREE_ITER_PREFETCH, k,
+				NULL, NULL, BTREE_INSERT_NOFAIL,
+			bch2_gc_write_stripes_key(trans, &iter, k));
+		bch2_trans_put(trans);
+	}
+
+	return ret;
+}
+
+static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
+{
+	genradix_free(&c->gc_stripes);	/* metadata_only is not consulted */
+}
+
+/**
+ * bch2_gc - walk _all_ references to buckets, and recompute them:
+ *
+ * @c:			filesystem object
+ * @initial:		are we in recovery?
+ * @metadata_only:	are we just checking metadata references, or everything?
+ *
+ * Returns: 0 on success, or standard errcode on failure
+ *
+ * Order matters here:
+ *  - Concurrent GC relies on the fact that we have a total ordering for
+ *    everything that GC walks - see gc_will_visit_node(),
+ *    gc_will_visit_root()
+ *
+ *  - also, references move around in the course of index updates and
+ *    various other operations: everything needs to agree on the ordering
+ *    references are allowed to move around in - e.g., we're allowed to
+ *    start with a reference owned by an open_bucket (the allocator) and
+ *    move it to the btree, but not the reverse.
+ *
+ *    This is necessary to ensure that gc doesn't miss references that
+ *    move around - if references move backwards in the ordering GC
+ *    uses, GC could skip past them
+ */
+int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
+{
+	unsigned iter = 0;
+	int ret;
+
+	lockdep_assert_held(&c->state_lock);
+
+	down_write(&c->gc_lock);
+
+	bch2_btree_interior_updates_flush(c);
+	/* allocate gc-side usage counters, bucket arrays and the reflink table: */
+	ret   = bch2_gc_start(c) ?:
+		bch2_gc_alloc_start(c, metadata_only) ?:
+		bch2_gc_reflink_start(c, metadata_only);
+	if (ret)
+		goto out;
+again:
+	gc_pos_set(c, gc_phase(GC_PHASE_START));
+
+	bch2_mark_superblocks(c);
+
+	ret = bch2_gc_btrees(c, initial, metadata_only);
+
+	if (ret)
+		goto out;
+
+#if 0
+	bch2_mark_pending_btree_node_frees(c);
+#endif
+	c->gc_count++;
+	/* rerun the marking pass if one was requested; give up after a few restarts: */
+	if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
+	    (!iter && bch2_test_restart_gc)) {
+		if (iter++ > 2) {
+			bch_info(c, "Unable to fix bucket gens, looping");
+			ret = -EINVAL;
+			goto out;
+		}
+
+		/*
+		 * XXX: make sure gens we fixed got saved
+		 */
+		bch_info(c, "Second GC pass needed, restarting:");
+		clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+		__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
+
+		bch2_gc_stripes_reset(c, metadata_only);
+		bch2_gc_alloc_reset(c, metadata_only);
+		bch2_gc_reflink_reset(c, metadata_only);
+		ret = bch2_gc_reset(c);
+		if (ret)
+			goto out;
+
+		/* flush fsck errors, reset counters */
+		bch2_flush_fsck_errs(c);
+		goto again;
+	}
+out:
+	if (!ret) {
+		bch2_journal_block(&c->journal);
+		/* write gc's results back with the journal blocked; stops at first error: */
+		ret   = bch2_gc_stripes_done(c, metadata_only) ?:
+			bch2_gc_reflink_done(c, metadata_only) ?:
+			bch2_gc_alloc_done(c, metadata_only) ?:
+			bch2_gc_done(c, initial, metadata_only);
+
+		bch2_journal_unblock(&c->journal);
+	}
+
+	percpu_down_write(&c->mark_lock);
+	/* Indicates that gc is no longer in progress: */
+	__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
+
+	bch2_gc_free(c);
+	percpu_up_write(&c->mark_lock);
+
+	up_write(&c->gc_lock);
+
+	/*
+	 * At startup, allocations can happen directly instead of via the
+	 * allocator thread - issue wakeup in case they blocked on gc_lock:
+	 */
+	closure_wake_up(&c->freelist_wait);
+
+	if (ret)
+		bch_err_fn(c, ret);
+	return ret;
+}
+
+/* Per-key step of bch2_gc_gens(): rewrite keys holding a pointer more than 16
+ * gens stale; else lower each pointed-to bucket's oldest_gen where it is newer. */
+static int gc_btree_gens_key(struct btree_trans *trans,
+			     struct btree_iter *iter, struct bkey_s_c k)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *ptr;
+	struct bkey_i *mut;
+	int ret;
+
+	percpu_down_read(&c->mark_lock);
+	bkey_for_each_ptr(ptrs, ptr) {
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+		if (ptr_stale(ca, ptr) > 16) {
+			percpu_up_read(&c->mark_lock);
+			goto update;
+		}
+	}
+
+	bkey_for_each_ptr(ptrs, ptr) {
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+		u8 *oldest = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
+
+		if (gen_after(*oldest, ptr->gen))
+			*oldest = ptr->gen;
+	}
+	percpu_up_read(&c->mark_lock);
+	return 0;
+update:
+	mut = bch2_bkey_make_mut(trans, iter, &k, 0);
+	ret = PTR_ERR_OR_ZERO(mut);
+	if (ret)
+		return ret;
+	bch2_extent_normalize(c, bkey_i_to_s(mut));
+	return 0;
+}
+
+/* Persist the computed oldest_gen for one bucket into its alloc key, if changed. */
+static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter,
+				       struct bkey_s_c k)
+{
+	struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode);
+	struct bch_alloc_v4 a_convert;
+	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
+	u8 oldest = ca->oldest_gen[iter->pos.offset];
+	struct bkey_i_alloc_v4 *a_mut;
+
+	if (a->oldest_gen == oldest)
+		return 0;
+
+	a_mut = bch2_alloc_to_v4_mut(trans, k);
+	if (IS_ERR(a_mut))
+		return PTR_ERR(a_mut);
+
+	a_mut->v.oldest_gen = oldest;
+	a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type);
+
+	return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
+}
+
+int bch2_gc_gens(struct bch_fs *c)
+{
+	struct btree_trans *trans;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bch_dev *ca;
+	u64 b, start_time = local_clock();
+	unsigned i;
+	int ret;
+
+	/*
+	 * Ideally we would be using state_lock and not gc_lock here, but that
+	 * introduces a deadlock in the RO path - we currently take the state
+	 * lock at the start of going RO, thus the gc thread may get stuck:
+	 */
+	if (!mutex_trylock(&c->gc_gens_lock))
+		return 0;	/* gc_gens_lock is already held: skip this run */
+	/* Recompute each bucket's oldest_gen in scratch arrays, then persist it: */
+	trace_and_count(c, gc_gens_start, c);
+	down_read(&c->gc_lock);
+	trans = bch2_trans_get(c);
+	/* seed a per-device oldest_gen[] scratch array from the current bucket gens: */
+	for_each_member_device(ca, c, i) {
+		struct bucket_gens *gens;
+
+		BUG_ON(ca->oldest_gen);
+
+		ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL);
+		if (!ca->oldest_gen) {
+			percpu_ref_put(&ca->ref);
+			ret = -BCH_ERR_ENOMEM_gc_gens;
+			goto err;
+		}
+
+		gens = bucket_gens(ca);
+
+		for (b = gens->first_bucket;
+		     b < gens->nbuckets; b++)
+			ca->oldest_gen[b] = gens->b[b];
+	}
+	/* walk every btree for which btree_type_has_ptrs() is true: */
+	for (i = 0; i < BTREE_ID_NR; i++)
+		if (btree_type_has_ptrs(i)) {
+			c->gc_gens_btree = i;
+			c->gc_gens_pos = POS_MIN;
+
+			ret = for_each_btree_key_commit(trans, iter, i,
+					POS_MIN,
+					BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+					k,
+					NULL, NULL,
+					BTREE_INSERT_NOFAIL,
+				gc_btree_gens_key(trans, &iter, k));
+			if (ret && !bch2_err_matches(ret, EROFS))
+				bch_err_fn(c, ret);
+			if (ret)
+				goto err;
+		}
+	/* write the resulting oldest_gen values into the alloc btree: */
+	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
+			POS_MIN,
+			BTREE_ITER_PREFETCH,
+			k,
+			NULL, NULL,
+			BTREE_INSERT_NOFAIL,
+		bch2_alloc_write_oldest_gen(trans, &iter, k));
+	if (ret && !bch2_err_matches(ret, EROFS))
+		bch_err_fn(c, ret);
+	if (ret)
+		goto err;
+
+	c->gc_gens_btree	= 0;
+	c->gc_gens_pos		= POS_MIN;
+
+	c->gc_count++;
+
+	bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
+	trace_and_count(c, gc_gens_end, c);
+err:
+	for_each_member_device(ca, c, i) {
+		kvfree(ca->oldest_gen);
+		ca->oldest_gen = NULL;
+	}
+
+	bch2_trans_put(trans);
+	up_read(&c->gc_lock);
+	mutex_unlock(&c->gc_gens_lock);
+	return ret;
+}
+
+static int bch2_gc_thread(void *arg)
+{
+	struct bch_fs *c = arg;
+	struct io_clock *clock = &c->io_clock[WRITE];
+	unsigned long last = atomic64_read(&clock->now);
+	unsigned last_kick = atomic_read(&c->kick_gc);
+	int ret;
+	/* Sleep until kicked or (in periodic mode) the io clock deadline, then run gc_gens: */
+	set_freezable();
+
+	while (1) {
+		while (1) {
+			set_current_state(TASK_INTERRUPTIBLE);
+
+			if (kthread_should_stop()) {
+				__set_current_state(TASK_RUNNING);
+				return 0;
+			}
+			/* kicked since the last run? */
+			if (atomic_read(&c->kick_gc) != last_kick)
+				break;
+			/* periodic: run once the write io clock has advanced capacity / 16 */
+			if (c->btree_gc_periodic) {
+				unsigned long next = last + c->capacity / 16;
+
+				if (atomic64_read(&clock->now) >= next)
+					break;
+
+				bch2_io_clock_schedule_timeout(clock, next);
+			} else {
+				schedule();
+			}
+
+			try_to_freeze();
+		}
+		__set_current_state(TASK_RUNNING);
+
+		last = atomic64_read(&clock->now);
+		last_kick = atomic_read(&c->kick_gc);
+
+		/*
+		 * Full gc is currently incompatible with btree key cache:
+		 */
+#if 0
+		ret = bch2_gc(c, false, false);
+#else
+		ret = bch2_gc_gens(c);
+#endif
+		if (ret < 0)
+			bch_err_fn(c, ret);
+
+		debug_check_no_locks_held();
+	}
+	/* not reached: the loop above only exits via return */
+	return 0;
+}
+
+/* Detach and stop the gc thread, if any, dropping the reference held on it. */
+void bch2_gc_thread_stop(struct bch_fs *c)
+{
+	struct task_struct *thread = c->gc_thread;
+
+	c->gc_thread = NULL;
+	if (!thread)
+		return;
+
+	kthread_stop(thread);
+	put_task_struct(thread);
+}
+
+/* Create and wake the gc thread unless one exists; returns 0 or -errno. */
+int bch2_gc_thread_start(struct bch_fs *c)
+{
+	struct task_struct *thread;
+
+	if (c->gc_thread)
+		return 0;
+
+	thread = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
+	if (IS_ERR(thread)) {
+		bch_err_fn(c, PTR_ERR(thread));
+		return PTR_ERR(thread);
+	}
+	get_task_struct(thread);
+	c->gc_thread = thread;
+	wake_up_process(thread);
+	return 0;
+}
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
new file mode 100644
index 000000000000..607575f83a00
--- /dev/null
+++ b/fs/bcachefs/btree_gc.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_GC_H
+#define _BCACHEFS_BTREE_GC_H
+
+#include "bkey.h"
+#include "btree_types.h"
+
+int bch2_check_topology(struct bch_fs *);
+int bch2_gc(struct bch_fs *, bool, bool);
+int bch2_gc_gens(struct bch_fs *);
+void bch2_gc_thread_stop(struct bch_fs *);
+int bch2_gc_thread_start(struct bch_fs *);
+
+/*
+ * For concurrent mark and sweep (with other index updates), we define a total
+ * ordering of _all_ references GC walks:
+ *
+ * Note that some references will have the same GC position as others - e.g.
+ * everything within the same btree node; in those cases we're relying on
+ * whatever locking exists for where those references live, i.e. the write lock
+ * on a btree node.
+ *
+ * That locking is also required to ensure GC doesn't pass the updater in
+ * between the updater adding/removing the reference and updating the GC marks;
+ * without that, we would at best double count sometimes.
+ *
+ * That part is important - whenever calling bch2_mark_pointers(), a lock _must_
+ * be held that prevents GC from passing the position the updater is at.
+ *
+ * (What about the start of gc, when we're clearing all the marks? GC clears the
+ * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc
+ * position inside its cmpxchg loop, so crap magically works).
+ */
+
+/* Position of (the start of) a gc phase: */
+static inline struct gc_pos gc_phase(enum gc_phase phase)
+{
+	struct gc_pos start = {
+		.phase = phase, .pos = POS_MIN, .level = 0,
+	};
+
+	return start;
+}
+
+/* Total order on gc positions: by phase, then btree position, then level. */
+static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
+{
+	return cmp_int(l.phase, r.phase) ?:
+		(bpos_cmp(l.pos, r.pos) ?: cmp_int(l.level, r.level));
+}
+
+static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id)
+{
+	switch (id) {	/* one case per entry of BCH_BTREE_IDS(), via the x-macro */
+#define x(name, v, ...) case BTREE_ID_##name: return GC_PHASE_BTREE_##name;
+	BCH_BTREE_IDS()
+#undef x
+	default:
+		BUG();	/* id has no BCH_BTREE_IDS() entry */
+	}
+}
+
+/* The gc position of (@pos, @level) inside the phase that btree @id maps to. */
+static inline struct gc_pos gc_pos_btree(enum btree_id id,
+					 struct bpos pos, unsigned level)
+{
+	struct gc_pos p = { .pos = pos, .level = level };
+
+	p.phase = btree_id_to_gc_phase(id);
+	return p;
+}
+
+/*
+ * GC position of the pointers stored inside btree node @b.  This is _not_ the
+ * position of &b->key itself, which lives in the parent node.
+ */
+static inline struct gc_pos gc_pos_btree_node(struct btree *b)
+{
+	return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level);
+}
+
+/*
+ * GC position of the pointer to a btree root, pinned at (SPOS_MAX,
+ * BTREE_MAX_DEPTH).  We don't use gc_pos_pointer_to_btree_node() here, to
+ * avoid a race with btree_split() increasing the tree depth: the new root
+ * would have level > the old root and thus a greater gc position, which would
+ * be incorrect since once gc has marked the root it's not coming back.
+ */
+static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
+{
+	return gc_pos_btree(id, SPOS_MAX, BTREE_MAX_DEPTH);
+}
+
+/* True once gc's position has reached or passed @pos (seqcount read). */
+static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
+{
+	unsigned seq;
+	bool visited;
+
+	do {
+		seq = read_seqcount_begin(&c->gc_pos_lock);
+		visited = gc_pos_cmp(pos, c->gc_pos) <= 0;
+	} while (read_seqcount_retry(&c->gc_pos_lock, seq));
+	return visited;
+}
+
+static inline void bch2_do_gc_gens(struct bch_fs *c)
+{
+	atomic_inc(&c->kick_gc);	/* bump the kick counter the gc thread polls */
+	if (c->gc_thread)
+		wake_up_process(c->gc_thread);
+}
+
+#endif /* _BCACHEFS_BTREE_GC_H */
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
new file mode 100644
index 000000000000..a869cf6ac7c6
--- /dev/null
+++ b/fs/bcachefs/btree_io.c
@@ -0,0 +1,2223 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "bkey_sort.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "io_write.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+#include "recovery.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/sched/mm.h>
+
+void bch2_btree_node_io_unlock(struct btree *b)
+{
+	EBUG_ON(!btree_node_write_in_flight(b));
+	/* Clear both write-in-flight flags, then wake waiters on the bit: */
+	clear_btree_node_write_in_flight_inner(b);
+	clear_btree_node_write_in_flight(b);
+	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
+}
+
+void bch2_btree_node_io_lock(struct btree *b)
+{
+	bch2_assert_btree_nodes_not_locked();
+	/* Sleep until the write_in_flight bit is clear, then take it ourselves: */
+	wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
+			    TASK_UNINTERRUPTIBLE);
+}
+
+void __bch2_btree_node_wait_on_read(struct btree *b)
+{
+	/* Block, uninterruptibly, until the read_in_flight bit is clear. */
+	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, TASK_UNINTERRUPTIBLE);
+}
+
+void __bch2_btree_node_wait_on_write(struct btree *b)
+{
+	/* Block, uninterruptibly, until the write_in_flight bit is clear. */
+	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, TASK_UNINTERRUPTIBLE);
+}
+
+void bch2_btree_node_wait_on_read(struct btree *b)
+{
+	/* Checked variant: run the not-locked assertion, then do the plain wait. */
+	bch2_assert_btree_nodes_not_locked();
+
+	__bch2_btree_node_wait_on_read(b);
+}
+
+void bch2_btree_node_wait_on_write(struct btree *b)
+{
+	/* Checked variant: run the not-locked assertion, then do the plain wait. */
+	bch2_assert_btree_nodes_not_locked();
+
+	__bch2_btree_node_wait_on_write(b);
+}
+
+static void verify_no_dups(struct btree *b,
+			   struct bkey_packed *start,
+			   struct bkey_packed *end)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+	struct bkey_packed *prev = start, *cur;
+
+	if (start == end)
+		return;
+
+	/* Debug only: each key must sort strictly before its successor's start */
+	while ((cur = bkey_p_next(prev)) != end) {
+		struct bkey l = bkey_unpack_key(b, prev);
+		struct bkey r = bkey_unpack_key(b, cur);
+
+		BUG_ON(bpos_ge(l.p, bkey_start_pos(&r)));
+		prev = cur;
+	}
+#endif
+}
+
+static void set_needs_whiteout(struct bset *i, int v)
+{
+	struct bkey_packed *k = i->start;
+
+	for (; k != vstruct_last(i); k = bkey_p_next(k))
+		k->needs_whiteout = v;	/* stamp every key in the bset with @v */
+}
+
+static void btree_bounce_free(struct bch_fs *c, size_t size, bool used_mempool, void *p)
+{
+	/* Hand @p back to whichever allocator @used_mempool says it came from. */
+	if (!used_mempool)
+		vpfree(p, size);
+	else
+		mempool_free(p, &c->btree_bounce_pool);
+}
+
+static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
+				bool *used_mempool)
+{
+	unsigned nofs_flags = memalloc_nofs_save();
+	void *buf;
+
+	BUG_ON(size > btree_bytes(c));
+
+	/* Try a GFP_NOWAIT allocation first; fall back to the bounce mempool. */
+	buf = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
+	*used_mempool = !buf;
+	if (*used_mempool)
+		buf = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
+
+	memalloc_nofs_restore(nofs_flags);
+	return buf;
+}
+
+static void sort_bkey_ptrs(const struct btree *bt,
+			   struct bkey_packed **ptrs, unsigned nr)
+{
+	unsigned n = nr, a = nr / 2, b, c, d;
+
+	if (!a)
+		return;	/* nr < 2: nothing to sort */
+
+	/* Heap sort: see lib/sort.c: */
+	while (1) {
+		if (a)			/* still building the heap */
+			a--;
+		else if (--n)		/* heap built: move the root to the end */
+			swap(ptrs[0], ptrs[n]);
+		else
+			break;
+		/* Walk from a down to a leaf, always taking the larger child: */
+		for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
+			b = bch2_bkey_cmp_packed(bt,
+					    ptrs[c],
+					    ptrs[d]) >= 0 ? c : d;
+		if (d == n)		/* last leaf has no sibling */
+			b = c;
+		/* Backtrack from that leaf to the slot where ptrs[a] belongs: */
+		while (b != a &&
+		       bch2_bkey_cmp_packed(bt,
+				       ptrs[a],
+				       ptrs[b]) >= 0)
+			b = (b - 1) / 2;
+		c = b;			/* destination for ptrs[a] */
+		while (b != a) {	/* rotate it into place along the path */
+			b = (b - 1) / 2;
+			swap(ptrs[b], ptrs[c]);
+		}
+	}
+}
+
+static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
+{
+	struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k;
+	bool used_mempool = false;
+	size_t bytes = b->whiteout_u64s * sizeof(u64);
+
+	if (!b->whiteout_u64s)
+		return;
+
+	new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool);
+	/* The pointer array is built downwards from the end of the same buffer: */
+	ptrs = ptrs_end = ((void *) new_whiteouts + bytes);
+
+	for (k = unwritten_whiteouts_start(c, b);
+	     k != unwritten_whiteouts_end(c, b);
+	     k = bkey_p_next(k))
+		*--ptrs = k;
+
+	sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs);
+
+	k = new_whiteouts;
+	/* Copy the keys out in sorted order, filling the buffer from its start: */
+	while (ptrs != ptrs_end) {
+		bkey_copy(k, *ptrs);
+		k = bkey_p_next(k);
+		ptrs++;
+	}
+
+	verify_no_dups(b, new_whiteouts,
+		       (void *) ((u64 *) new_whiteouts + b->whiteout_u64s));
+	/* Write the sorted copy back over the unwritten whiteouts in the node: */
+	memcpy_u64s(unwritten_whiteouts_start(c, b),
+		    new_whiteouts, b->whiteout_u64s);
+
+	btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
+}
+
+static bool should_compact_bset(struct btree *b, struct bset_tree *t,
+				bool compacting, enum compact_mode mode)
+{
+	/* Nothing to reclaim if the bset has no dead u64s */
+	if (!bset_dead_u64s(b, t))
+		return false;
+
+	if (mode == COMPACT_ALL)
+		return true;
+	/* Any mode other than the two known ones is a bug */
+	BUG_ON(mode != COMPACT_LAZY);
+
+	if (should_compact_bset_lazy(b, t))
+		return true;
+	return compacting && !bset_written(b, bset(b, t));
+}
+
+static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
+{
+	struct bset_tree *t;
+	bool ret = false;
+
+	for_each_bset(b, t) {
+		struct bset *i = bset(b, t);
+		struct bkey_packed *k, *n, *out, *start, *end;
+		struct btree_node_entry *src = NULL, *dst = NULL;
+		/* An unwritten, non-first bset may have to move down to dst: */
+		if (t != b->set && !bset_written(b, i)) {
+			src = container_of(i, struct btree_node_entry, keys);
+			dst = max(write_block(b),
+				  (void *) btree_bkey_last(b, t - 1));
+		}
+
+		if (src != dst)
+			ret = true;
+		/* Not compacting this bset: at most relocate it wholesale */
+		if (!should_compact_bset(b, t, ret, mode)) {
+			if (src != dst) {
+				memmove(dst, src, sizeof(*src) +
+					le16_to_cpu(src->keys.u64s) *
+					sizeof(u64));
+				i = &dst->keys;
+				set_btree_bset(b, t, i);
+			}
+			continue;
+		}
+
+		start	= btree_bkey_first(b, t);
+		end	= btree_bkey_last(b, t);
+		/* Compacting: move only the header, keys are copied one by one below */
+		if (src != dst) {
+			memmove(dst, src, sizeof(*src));
+			i = &dst->keys;
+			set_btree_bset(b, t, i);
+		}
+
+		out = i->start;
+		/* Copy the non-deleted keys down, squeezing out the deleted ones: */
+		for (k = start; k != end; k = n) {
+			n = bkey_p_next(k);
+
+			if (!bkey_deleted(k)) {
+				bkey_copy(out, k);
+				out = bkey_p_next(out);
+			} else {
+				BUG_ON(k->needs_whiteout);
+			}
+		}
+
+		i->u64s = cpu_to_le16((u64 *) out - i->_data);
+		set_btree_bset_end(b, t);
+		bch2_bset_set_no_aux_tree(b, t);
+		ret = true;
+	}
+
+	bch2_verify_btree_nr_keys(b);
+
+	bch2_btree_build_aux_trees(b);
+
+	return ret;	/* true if any bset was moved or compacted */
+}
+
+bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
+			    enum compact_mode mode)
+{
+	return bch2_drop_whiteouts(b, mode);	/* thin wrapper; @c is not used */
+}
+
+static void btree_node_sort(struct bch_fs *c, struct btree *b,
+			    unsigned start_idx,
+			    unsigned end_idx,
+			    bool filter_whiteouts)
+{
+	struct btree_node *out;
+	struct sort_iter_stack sort_iter;
+	struct bset_tree *t;
+	struct bset *start_bset = bset(b, &b->set[start_idx]);
+	bool used_mempool = false;
+	u64 start_time, seq = 0;
+	unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1;
+	bool sorting_entire_node = start_idx == 0 &&
+		end_idx == b->nsets;
+
+	sort_iter_stack_init(&sort_iter, b);
+	/* Feed every bset in [start_idx, end_idx) to the sort iterator: */
+	for (t = b->set + start_idx;
+	     t < b->set + end_idx;
+	     t++) {
+		u64s += le16_to_cpu(bset(b, t)->u64s);
+		sort_iter_add(&sort_iter.iter,
+			      btree_bkey_first(b, t),
+			      btree_bkey_last(b, t));
+	}
+	/* A whole-node sort gets a node-sized buffer so it can be swapped in below */
+	bytes = sorting_entire_node
+		? btree_bytes(c)
+		: __vstruct_bytes(struct btree_node, u64s);
+
+	out = btree_bounce_alloc(c, bytes, &used_mempool);
+
+	start_time = local_clock();
+
+	u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter, filter_whiteouts);
+
+	out->keys.u64s = cpu_to_le16(u64s);
+
+	BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);
+
+	if (sorting_entire_node)
+		bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
+				       start_time);
+
+	/* Make sure we preserve bset journal_seq: */
+	for (t = b->set + start_idx; t < b->set + end_idx; t++)
+		seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
+	start_bset->journal_seq = cpu_to_le64(seq);
+
+	if (sorting_entire_node) {
+		u64s = le16_to_cpu(out->keys.u64s);
+
+		BUG_ON(bytes != btree_bytes(c));
+
+		/*
+		 * Our temporary buffer is the same size as the btree node's
+		 * buffer, we can just swap buffers instead of doing a big
+		 * memcpy()
+		 */
+		*out = *b->data;
+		out->keys.u64s = cpu_to_le16(u64s);
+		swap(out, b->data);
+		set_btree_bset(b, b->set, &b->data->keys);
+	} else {
+		start_bset->u64s = out->keys.u64s;
+		memcpy_u64s(start_bset->start,
+			    out->keys.start,
+			    le16_to_cpu(out->keys.u64s));
+	}
+	/* Fold the merged bsets' u64s counts into start_idx, then close the gap: */
+	for (i = start_idx + 1; i < end_idx; i++)
+		b->nr.bset_u64s[start_idx] +=
+			b->nr.bset_u64s[i];
+
+	b->nsets -= shift;
+
+	for (i = start_idx + 1; i < b->nsets; i++) {
+		b->nr.bset_u64s[i]	= b->nr.bset_u64s[i + shift];
+		b->set[i]		= b->set[i + shift];
+	}
+
+	for (i = b->nsets; i < MAX_BSETS; i++)
+		b->nr.bset_u64s[i] = 0;
+
+	set_btree_bset_end(b, &b->set[start_idx]);
+	bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);
+
+	btree_bounce_free(c, bytes, used_mempool, out);
+
+	bch2_verify_btree_nr_keys(b);
+}
+
+void bch2_btree_sort_into(struct bch_fs *c,
+			 struct btree *dst,
+			 struct btree *src)
+{
+	struct btree_nr_keys nr;
+	struct btree_node_iter src_iter;
+	u64 start_time = local_clock();
+
+	BUG_ON(dst->nsets != 1);
+
+	bch2_bset_set_no_aux_tree(dst, dst->set);
+	/* Iterate @src from its start and repack into dst's first bset: */
+	bch2_btree_node_iter_init_from_start(&src_iter, src);
+
+	nr = bch2_sort_repack(btree_bset_first(dst),
+			src, &src_iter,
+			&dst->format,
+			true);
+
+	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
+			       start_time);
+
+	set_btree_bset_end(dst, dst->set);
+	/* Add the key counts returned by the repack to dst's totals: */
+	dst->nr.live_u64s	+= nr.live_u64s;
+	dst->nr.bset_u64s[0]	+= nr.bset_u64s[0];
+	dst->nr.packed_keys	+= nr.packed_keys;
+	dst->nr.unpacked_keys	+= nr.unpacked_keys;
+
+	bch2_verify_btree_nr_keys(dst);
+}
+
+/*
+ * We're about to add another bset to the btree node, so if there are currently
+ * too many bsets - sort some of them together:
+ */
+static bool btree_node_compact(struct bch_fs *c, struct btree *b)
+{
+	unsigned nr_written = 0;
+	bool sorted = false;
+
+	/* Count the leading run of bsets that are already written out */
+	while (nr_written < b->nsets &&
+	       bset_written(b, bset(b, &b->set[nr_written])))
+		nr_written++;
+
+	/* Merge the unwritten tail first, then the written prefix */
+	if (b->nsets - nr_written > 1) {
+		btree_node_sort(c, b, nr_written,
+				b->nsets, false);
+		sorted = true;
+	}
+
+	if (nr_written > 1) {
+		btree_node_sort(c, b, 0, nr_written, false);
+		sorted = true;
+	}
+
+	return sorted;
+}
+
+void bch2_btree_build_aux_trees(struct btree *b)
+{
+	struct bset_tree *t;
+	/* The flag passed is true only for the last bset, and only if unwritten */
+	for_each_bset(b, t)
+		bch2_bset_build_aux_tree(b, t,
+				!bset_written(b, bset(b, t)) &&
+				t == bset_tree_last(b));
+}
+
+/*
+ * If we have MAX_BSETS (3) bsets, should we sort them all down to just one?
+ *
+ * The first bset is going to be of similar order to the size of the node, the
+ * last bset is bounded by btree_write_set_buffer(), which is set to keep the
+ * memmove on insert from being too expensive: the middle bset should, ideally,
+ * be the geometric mean of the first and the last (computed here in log2 bits).
+ *
+ * Returns true if the middle bset is greater than that geometric mean:
+ */
+static inline bool should_compact_all(struct bch_fs *c, struct btree *b)
+{
+	int node_bits = ilog2(btree_max_u64s(c));
+	unsigned mid_bits = (node_bits + BTREE_WRITE_SET_U64s_BITS) / 2;
+
+	return bset_u64s(&b->set[1]) > 1U << mid_bits;
+}
+
+/*
+ * bch2_btree_init_next - initialize a new (unwritten) bset that can then be
+ * inserted into
+ *
+ * Safe to call if there already is an unwritten bset - will only add a new bset
+ * if @b doesn't already have one.
+ *
+ * If we wrote or sorted @b, iterators pointing to it are reinitialized here.
+ */
+void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_node_entry *bne;
+	bool reinit_iter = false;
+
+	EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]);
+	BUG_ON(bset_written(b, bset(b, &b->set[1])));
+	BUG_ON(btree_node_just_written(b));
+
+	if (b->nsets == MAX_BSETS &&
+	    !btree_node_write_in_flight(b) &&
+	    should_compact_all(c, b)) {
+		bch2_btree_node_write(c, b, SIX_LOCK_write,
+				      BTREE_WRITE_init_next_bset);
+		reinit_iter = true;
+	}
+
+	if (b->nsets == MAX_BSETS &&
+	    btree_node_compact(c, b))
+		reinit_iter = true;
+
+	BUG_ON(b->nsets >= MAX_BSETS);
+
+	bne = want_new_bset(c, b);
+	if (bne)
+		bch2_bset_init_next(c, b, bne);
+
+	bch2_btree_build_aux_trees(b);
+
+	if (reinit_iter)
+		bch2_trans_node_reinit_iter(trans, b);
+}
+
+static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, struct btree *b)
+{
+	unsigned id = b->c.btree_id;
+
+	/* "<btree name> level <node level>/<root level>", then the node's key */
+	prt_printf(out, "%s level %u/%u\n  ", bch2_btree_ids[id],
+		   b->c.level, bch2_btree_id_root(c, id)->level);
+	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+}
+
+static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
+			  struct bch_dev *ca, struct btree *b,
+			  struct bset *i, unsigned offset, int write)
+{
+	/* Common prefix of every btree node validation message; @ca, @i optional */
+	prt_printf(out, bch2_log_msg(c, "%s"),
+		   write == READ
+		   ? "error validating btree node "
+		   : "corrupt btree node before write ");
+	if (ca)
+		prt_printf(out, "on %s ", ca->name);
+	prt_printf(out, "at btree ");
+	btree_pos_to_text(out, c, b);
+
+	prt_printf(out, "\n  node offset %u", offset);
+	if (i)
+		prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
+	prt_str(out, ": ");
+}
+
+__printf(8, 9)
+static int __btree_err(int ret,
+		       struct bch_fs *c,
+		       struct bch_dev *ca,
+		       struct btree *b,
+		       struct bset *i,
+		       int write,
+		       bool have_retry,
+		       const char *fmt, ...)
+{
+	struct printbuf out = PRINTBUF;
+	va_list args;
+
+	btree_err_msg(&out, c, ca, b, i, b->written, write);
+
+	va_start(args, fmt);
+	prt_vprintf(&out, fmt, args);
+	va_end(args);
+	/* Before a write: just log, and let the errors= option pick the result */
+	if (write == WRITE) {
+		bch2_print_string_as_lines(KERN_ERR, out.buf);
+		ret = c->opts.errors == BCH_ON_ERROR_continue
+			? 0
+			: -BCH_ERR_fsck_errors_not_fixed;
+		goto out;
+	}
+	/* With no retry available, the two retry-class errors are remapped: */
+	if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
+		ret = -BCH_ERR_btree_node_read_err_fixable;
+	if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
+		ret = -BCH_ERR_btree_node_read_err_bad_node;
+
+	switch (ret) {
+	case -BCH_ERR_btree_node_read_err_fixable:
+		mustfix_fsck_err(c, "%s", out.buf);
+		ret = -BCH_ERR_fsck_fix;
+		break;
+	case -BCH_ERR_btree_node_read_err_want_retry:
+	case -BCH_ERR_btree_node_read_err_must_retry:
+		bch2_print_string_as_lines(KERN_ERR, out.buf);
+		break;
+	case -BCH_ERR_btree_node_read_err_bad_node:
+		bch2_print_string_as_lines(KERN_ERR, out.buf);
+		bch2_topology_error(c);
+		ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO;
+		break;
+	case -BCH_ERR_btree_node_read_err_incompatible:
+		bch2_print_string_as_lines(KERN_ERR, out.buf);
+		ret = -BCH_ERR_fsck_errors_not_fixed;
+		break;
+	default:	/* callers must pass one of the error classes above */
+		BUG();
+	}
+out:
+fsck_err:
+	printbuf_exit(&out);
+	return ret;
+}
+
+#define btree_err(type, c, ca, b, i, msg, ...)				\
+({									\
+	int _ret = __btree_err(type, c, ca, b, i, write, have_retry, msg, ##__VA_ARGS__);\
+	/* uses the caller's write, have_retry, ret, saw_error and fsck_err: */	\
+	if (_ret != -BCH_ERR_fsck_fix) {				\
+		ret = _ret;						\
+		goto fsck_err;						\
+	}								\
+									\
+	*saw_error = true;						\
+})
+/* Yields true if @cond held and the error was fixed; false if @cond was false */
+#define btree_err_on(cond, ...)	((cond) ? btree_err(__VA_ARGS__) : false)
+
+/*
+ * When btree topology repair changes the start or end of a node, that might
+ * mean we have to drop keys that are no longer inside the node:
+ */
+__cold
+void bch2_btree_node_drop_keys_outside_node(struct btree *b)
+{
+	struct bset_tree *t;
+
+	for_each_bset(b, t) {
+		struct bset *i = bset(b, t);
+		struct bkey_packed *k;
+		/* Find the first key not before min_key; drop everything ahead of it */
+		for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
+			if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0)
+				break;
+
+		if (k != i->start) {
+			unsigned shift = (u64 *) k - (u64 *) i->start;
+
+			memmove_u64s_down(i->start, k,
+					  (u64 *) vstruct_end(i) - (u64 *) k);
+			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift);
+			set_btree_bset_end(b, t);
+		}
+		/* Find the first key after max_key; truncate the bset there */
+		for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
+			if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0)
+				break;
+
+		if (k != vstruct_last(i)) {
+			i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start);
+			set_btree_bset_end(b, t);
+		}
+	}
+
+	/*
+	 * Always rebuild search trees: eytzinger search tree nodes directly
+	 * depend on the values of min/max key:
+	 */
+	bch2_bset_set_no_aux_tree(b, b->set);
+	bch2_btree_build_aux_trees(b);
+	/* Sanity check: every remaining key lies within [min_key, max_key] */
+	struct bkey_s_c k;
+	struct bkey unpacked;
+	struct btree_node_iter iter;
+	for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
+		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
+		BUG_ON(bpos_gt(k.k->p, b->data->max_key));
+	}
+}
+
+static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
+			 struct btree *b, struct bset *i,
+			 unsigned offset, unsigned sectors,
+			 int write, bool have_retry, bool *saw_error)
+{
+	unsigned version = le16_to_cpu(i->version);
+	struct printbuf buf1 = PRINTBUF;
+	struct printbuf buf2 = PRINTBUF;
+	int ret = 0;
+	/* Header checks for bset @i located @offset sectors into node @b: */
+	btree_err_on(!bch2_version_compatible(version),
+		     -BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i,
+		     "unsupported bset version %u.%u",
+		     BCH_VERSION_MAJOR(version),
+		     BCH_VERSION_MINOR(version));
+
+	if (btree_err_on(version < c->sb.version_min,
+			 -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i,
+			 "bset version %u older than superblock version_min %u",
+			 version, c->sb.version_min)) {
+		mutex_lock(&c->sb_lock);
+		c->disk_sb.sb->version_min = cpu_to_le16(version);
+		bch2_write_super(c);
+		mutex_unlock(&c->sb_lock);
+	}
+
+	if (btree_err_on(BCH_VERSION_MAJOR(version) >
+			 BCH_VERSION_MAJOR(c->sb.version),
+			 -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i,
+			 "bset version %u newer than superblock version %u",
+			 version, c->sb.version)) {
+		mutex_lock(&c->sb_lock);
+		c->disk_sb.sb->version = cpu_to_le16(version);
+		bch2_write_super(c);
+		mutex_unlock(&c->sb_lock);
+	}
+
+	btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
+		     -BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i,
+		     "BSET_SEPARATE_WHITEOUTS no longer supported");
+
+	if (btree_err_on(offset + sectors > btree_sectors(c),
+			 -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i,
+			 "bset past end of btree node")) {
+		i->u64s = 0;	/* fixed by treating the bset as empty */
+		ret = 0;
+		goto out;
+	}
+
+	btree_err_on(offset && !i->u64s,
+		     -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i,
+		     "empty bset");
+
+	btree_err_on(BSET_OFFSET(i) &&
+		     BSET_OFFSET(i) != offset,
+		     -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
+		     "bset at wrong sector offset");
+	/* The remaining checks apply to the first bset, embedded in the node header */
+	if (!offset) {
+		struct btree_node *bn =
+			container_of(i, struct btree_node, keys);
+		/* These indicate that we read the wrong btree node: */
+
+		if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+			struct bch_btree_ptr_v2 *bp =
+				&bkey_i_to_btree_ptr_v2(&b->key)->v;
+
+			/* XXX endianness */
+			btree_err_on(bp->seq != bn->keys.seq,
+				     -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
+				     "incorrect sequence number (wrong btree node)");
+		}
+
+		btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
+			     -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i,
+			     "incorrect btree id");
+
+		btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
+			     -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i,
+			     "incorrect level");
+
+		if (!write)	/* on read, compat runs before the key range checks */
+			compat_btree_node(b->c.level, b->c.btree_id, version,
+					  BSET_BIG_ENDIAN(i), write, bn);
+
+		if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+			struct bch_btree_ptr_v2 *bp =
+				&bkey_i_to_btree_ptr_v2(&b->key)->v;
+
+			if (BTREE_PTR_RANGE_UPDATED(bp)) {
+				b->data->min_key = bp->min_key;
+				b->data->max_key = b->key.k.p;
+			}
+
+			btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
+				     -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
+				     "incorrect min_key: got %s should be %s",
+				     (printbuf_reset(&buf1),
+				      bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
+				     (printbuf_reset(&buf2),
+				      bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf));
+		}
+
+		btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
+			     -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i,
+			     "incorrect max key %s",
+			     (printbuf_reset(&buf1),
+			      bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));
+
+		if (write)	/* on write, compat runs after the key range checks */
+			compat_btree_node(b->c.level, b->c.btree_id, version,
+					  BSET_BIG_ENDIAN(i), write, bn);
+
+		btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
+			     -BCH_ERR_btree_node_read_err_bad_node, c, ca, b, i,
+			     "invalid bkey format: %s\n  %s", buf1.buf,
+			     (printbuf_reset(&buf2),
+			      bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
+		printbuf_reset(&buf1);
+
+		compat_bformat(b->c.level, b->c.btree_id, version,
+			       BSET_BIG_ENDIAN(i), write,
+			       &bn->format);
+	}
+out:
+fsck_err:
+	printbuf_exit(&buf2);
+	printbuf_exit(&buf1);
+	return ret;
+}
+
+static int bset_key_invalid(struct bch_fs *c, struct btree *b,
+			    struct bkey_s_c k, bool updated_range, int rw,
+			    struct printbuf *err)
+{
+	/* NOTE(review): helpers get READ even when @rw is WRITE - confirm intended */
+	return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?:
+		(!updated_range ? bch2_bkey_in_btree_node(b, k, err) : 0) ?:
+		(rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
+}
+
+static int validate_bset_keys(struct bch_fs *c, struct btree *b,
+			 struct bset *i, int write,
+			 bool have_retry, bool *saw_error)
+{
+	unsigned version = le16_to_cpu(i->version);
+	struct bkey_packed *k, *prev = NULL;
+	struct printbuf buf = PRINTBUF;
+	bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+		BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
+	int ret = 0;
+
+	for (k = i->start;
+	     k != vstruct_last(i);) {
+		struct bkey_s u;
+		struct bkey tmp;
+
+		if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
+				 -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i,
+				 "key extends past end of bset")) {
+			i->u64s = cpu_to_le16((u64 *) k - i->_data);
+			break;
+		}
+
+		if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
+				 -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i,
+				 "invalid bkey format %u", k->format)) {
+			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+			memmove_u64s_down(k, bkey_p_next(k),
+					  (u64 *) vstruct_end(i) - (u64 *) k);
+			continue;
+		}
+
+		/* XXX: validate k->u64s */
+		if (!write)
+			bch2_bkey_compat(b->c.level, b->c.btree_id, version,
+				    BSET_BIG_ENDIAN(i), write,
+				    &b->format, k);
+
+		u = __bkey_disassemble(b, k, &tmp);
+
+		printbuf_reset(&buf);
+		if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) {
+			printbuf_reset(&buf);
+			prt_printf(&buf, "invalid bkey:  ");
+			bset_key_invalid(c, b, u.s_c, updated_range, write, &buf);
+			prt_printf(&buf, "\n  ");
+			bch2_bkey_val_to_text(&buf, c, u.s_c);
+
+			btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf);
+
+			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+			memmove_u64s_down(k, bkey_p_next(k),
+					  (u64 *) vstruct_end(i) - (u64 *) k);
+			continue;
+		}
+
+		if (write)
+			bch2_bkey_compat(b->c.level, b->c.btree_id, version,
+				    BSET_BIG_ENDIAN(i), write,
+				    &b->format, k);
+
+		if (prev && bkey_iter_cmp(b, prev, k) > 0) {
+			struct bkey up = bkey_unpack_key(b, prev);
+
+			printbuf_reset(&buf);
+			prt_printf(&buf, "keys out of order: ");
+			bch2_bkey_to_text(&buf, &up);
+			prt_printf(&buf, " > ");
+			bch2_bkey_to_text(&buf, u.k);
+
+			bch2_dump_bset(c, b, i, 0);
+
+			if (btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf)) {
+				i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+				memmove_u64s_down(k, bkey_p_next(k),
+						  (u64 *) vstruct_end(i) - (u64 *) k);
+				continue;
+			}
+		}
+
+		prev = k;
+		k = bkey_p_next(k);
+	}
+fsck_err:
+	printbuf_exit(&buf);
+	return ret;
+}
+
+int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
+			      struct btree *b, bool have_retry, bool *saw_error)
+{
+	struct btree_node_entry *bne;
+	struct sort_iter *iter;
+	struct btree_node *sorted;
+	struct bkey_packed *k;
+	struct bch_extent_ptr *ptr;
+	struct bset *i;
+	bool used_mempool, blacklisted;
+	bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+		BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
+	unsigned u64s;
+	unsigned ptr_written = btree_ptr_sectors_written(&b->key);
+	struct printbuf buf = PRINTBUF;
+	int ret = 0, retry_read = 0, write = READ;
+
+	b->version_ondisk = U16_MAX;
+	/* We might get called multiple times on read retry: */
+	b->written = 0;
+
+	iter = mempool_alloc(&c->fill_iter, GFP_NOFS);
+	sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2);
+
+	if (bch2_meta_read_fault("btree"))
+		btree_err(-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
+			  "dynamic fault");
+
+	btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
+		     -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
+		     "bad magic: want %llx, got %llx",
+		     bset_magic(c), le64_to_cpu(b->data->magic));
+
+	btree_err_on(!b->data->keys.seq,
+		     -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
+		     "bad btree header: seq 0");
+
+	if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+		struct bch_btree_ptr_v2 *bp =
+			&bkey_i_to_btree_ptr_v2(&b->key)->v;
+
+		btree_err_on(b->data->keys.seq != bp->seq,
+			     -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
+			     "got wrong btree node (seq %llx want %llx)",
+			     b->data->keys.seq, bp->seq);
+	}
+	/* Walk each bset in the node: checksum, decrypt, validate, queue for sort */
+	while (b->written < (ptr_written ?: btree_sectors(c))) {
+		unsigned sectors;
+		struct nonce nonce;
+		struct bch_csum csum;
+		bool first = !b->written;
+
+		if (!b->written) {
+			i = &b->data->keys;
+
+			btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
+				     -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
+				     "unknown checksum type %llu",
+				     BSET_CSUM_TYPE(i));
+
+			nonce = btree_nonce(i, b->written << 9);
+			csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
+
+			btree_err_on(bch2_crc_cmp(csum, b->data->csum),
+				     -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
+				     "invalid checksum");
+
+			ret = bset_encrypt(c, i, b->written << 9);
+			if (bch2_fs_fatal_err_on(ret, c,
+					"error decrypting btree node: %i", ret))
+				goto fsck_err;
+
+			btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
+				     !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
+				     -BCH_ERR_btree_node_read_err_incompatible, c, NULL, b, NULL,
+				     "btree node does not have NEW_EXTENT_OVERWRITE set");
+
+			sectors = vstruct_sectors(b->data, c->block_bits);
+		} else {
+			bne = write_block(b);
+			i = &bne->keys;
+
+			if (i->seq != b->data->keys.seq)	/* not a bset of this node: stop */
+				break;
+
+			btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
+				     -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
+				     "unknown checksum type %llu",
+				     BSET_CSUM_TYPE(i));
+
+			nonce = btree_nonce(i, b->written << 9);
+			csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+
+			btree_err_on(bch2_crc_cmp(csum, bne->csum),
+				     -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
+				     "invalid checksum");
+
+			ret = bset_encrypt(c, i, b->written << 9);
+			if (bch2_fs_fatal_err_on(ret, c,
+					"error decrypting btree node: %i\n", ret))
+				goto fsck_err;
+
+			sectors = vstruct_sectors(bne, c->block_bits);
+		}
+
+		b->version_ondisk = min(b->version_ondisk,
+					le16_to_cpu(i->version));
+
+		ret = validate_bset(c, ca, b, i, b->written, sectors,
+				    READ, have_retry, saw_error);
+		if (ret)
+			goto fsck_err;
+
+		if (!b->written)
+			btree_node_set_format(b, b->data->format);
+
+		ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error);
+		if (ret)
+			goto fsck_err;
+
+		SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+
+		blacklisted = bch2_journal_seq_is_blacklisted(c,
+					le64_to_cpu(i->journal_seq),
+					true);
+
+		btree_err_on(blacklisted && first,
+			     -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i,
+			     "first btree node bset has blacklisted journal seq (%llu)",
+			     le64_to_cpu(i->journal_seq));
+
+		btree_err_on(blacklisted && ptr_written,
+			     -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i,
+			     "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
+			     le64_to_cpu(i->journal_seq),
+			     b->written, b->written + sectors, ptr_written);
+
+		b->written += sectors;
+
+		if (blacklisted && !first)	/* skip it: its keys are not sorted in */
+			continue;
+
+		sort_iter_add(iter,
+			      vstruct_idx(i, 0),
+			      vstruct_last(i));
+	}
+	/* Cross-check what we read against the key's sectors_written, if it has one */
+	if (ptr_written) {
+		btree_err_on(b->written < ptr_written,
+			     -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL,
+			     "btree node data missing: expected %u sectors, found %u",
+			     ptr_written, b->written);
+	} else {
+		for (bne = write_block(b);
+		     bset_byte_offset(b, bne) < btree_bytes(c);
+		     bne = (void *) bne + block_bytes(c))
+			btree_err_on(bne->keys.seq == b->data->keys.seq &&
+				     !bch2_journal_seq_is_blacklisted(c,
+								      le64_to_cpu(bne->keys.journal_seq),
+								      true),
+				     -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL,
+				     "found bset signature after last bset");
+	}
+	/* Merge all queued bsets into one sorted bset, built in a bounce buffer: */
+	sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
+	sorted->keys.u64s = 0;
+
+	set_btree_bset(b, b->set, &b->data->keys);
+
+	b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
+	/* Copy the node header over, then swap buffers rather than copy keys back */
+	u64s = le16_to_cpu(sorted->keys.u64s);
+	*sorted = *b->data;
+	sorted->keys.u64s = cpu_to_le16(u64s);
+	swap(sorted, b->data);
+	set_btree_bset(b, b->set, &b->data->keys);
+	b->nsets = 1;
+
+	BUG_ON(b->nr.live_u64s != u64s);
+
+	btree_bounce_free(c, btree_bytes(c), used_mempool, sorted);
+
+	if (updated_range)
+		bch2_btree_node_drop_keys_outside_node(b);
+	/* Second pass over the merged bset: drop keys whose values are invalid */
+	i = &b->data->keys;
+	for (k = i->start; k != vstruct_last(i);) {
+		struct bkey tmp;
+		struct bkey_s u = __bkey_disassemble(b, k, &tmp);
+
+		printbuf_reset(&buf);
+
+		if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) ||
+		    (bch2_inject_invalid_keys &&
+		     !bversion_cmp(u.k->version, MAX_VERSION))) {
+			printbuf_reset(&buf);
+
+			prt_printf(&buf, "invalid bkey: ");
+			bch2_bkey_val_invalid(c, u.s_c, READ, &buf);
+			prt_printf(&buf, "\n  ");
+			bch2_bkey_val_to_text(&buf, c, u.s_c);
+
+			btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf);
+
+			btree_keys_account_key_drop(&b->nr, 0, k);
+
+			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+			memmove_u64s_down(k, bkey_p_next(k),
+					  (u64 *) vstruct_end(i) - (u64 *) k);
+			set_btree_bset_end(b, b->set);
+			continue;
+		}
+
+		if (u.k->type == KEY_TYPE_btree_ptr_v2) {
+			struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u);
+
+			bp.v->mem_ptr = 0;	/* clear the mem_ptr value read from disk */
+		}
+
+		k = bkey_p_next(k);
+	}
+
+	bch2_bset_build_aux_tree(b, b->set, false);
+
+	set_needs_whiteout(btree_bset_first(b), true);
+
+	btree_node_reset_sib_u64s(b);
+	/* Flag the node for rewrite if any pointer's device is not in the rw state */
+	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
+		struct bch_dev *ca2 = bch_dev_bkey_exists(c, ptr->dev);
+
+		if (ca2->mi.state != BCH_MEMBER_STATE_rw)
+			set_btree_node_need_rewrite(b);
+	}
+
+	if (!ptr_written)
+		set_btree_node_need_rewrite(b);
+out:
+	mempool_free(iter, &c->fill_iter);
+	printbuf_exit(&buf);
+	return retry_read;	/* 1 only for the two retry-class errors, else 0 */
+fsck_err:
+	if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
+	    ret == -BCH_ERR_btree_node_read_err_must_retry)
+		retry_read = 1;
+	else
+		set_btree_node_read_error(b);
+	goto out;
+}
+
+static void btree_node_read_work(struct work_struct *work)
+{
+	struct btree_read_bio *rb =
+		container_of(work, struct btree_read_bio, work);
+	struct bch_fs *c	= rb->c;
+	struct btree *b		= rb->b;
+	struct bch_dev *ca	= bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+	struct bio *bio		= &rb->bio;
+	struct bch_io_failures failed = { .nr = 0 };
+	struct printbuf buf = PRINTBUF;
+	bool saw_error = false;
+	bool retry = false;
+	bool can_retry;
+	/* First pass: skip the resubmit and go straight to checking the bio */
+	goto start;
+	while (1) {
+		retry = true;
+		bch_info(c, "retrying read");
+		ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+		rb->have_ioref		= bch2_dev_get_ioref(ca, READ);
+		bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
+		bio->bi_iter.bi_sector	= rb->pick.ptr.offset;
+		bio->bi_iter.bi_size	= btree_bytes(c);
+
+		if (rb->have_ioref) {
+			bio_set_dev(bio, ca->disk_sb.bdev);
+			submit_bio_wait(bio);
+		} else {
+			bio->bi_status = BLK_STS_REMOVED;
+		}
+start:
+		printbuf_reset(&buf);
+		btree_pos_to_text(&buf, c, b);
+		bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s",
+				   bch2_blk_status_to_str(bio->bi_status), buf.buf);
+		if (rb->have_ioref)
+			percpu_ref_put(&ca->io_ref);
+		rb->have_ioref = false;
+		/* Mark the pick just used as failed so the next pick avoids it: */
+		bch2_mark_io_failure(&failed, &rb->pick);
+
+		can_retry = bch2_bkey_pick_read_device(c,
+				bkey_i_to_s_c(&b->key),
+				&failed, &rb->pick) > 0;
+
+		if (!bio->bi_status &&
+		    !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) {
+			if (retry)
+				bch_info(c, "retry success");
+			break;
+		}
+
+		saw_error = true;
+
+		if (!can_retry) {
+			set_btree_node_read_error(b);
+			break;
+		}
+	}
+
+	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
+			       rb->start_time);
+	bio_put(&rb->bio);
+	/* Any error seen on a node that still read successfully: queue a rewrite */
+	if (saw_error && !btree_node_read_error(b)) {
+		printbuf_reset(&buf);
+		bch2_bpos_to_text(&buf, b->key.k.p);
+		bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
+			 __func__, bch2_btree_ids[b->c.btree_id], b->c.level, buf.buf);
+
+		bch2_btree_node_rewrite_async(c, b);
+	}
+
+	printbuf_exit(&buf);
+	clear_btree_node_read_in_flight(b);
+	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+}
+
+static void btree_node_read_endio(struct bio *bio)
+{
+	struct btree_read_bio *read_bio = container_of(bio, struct btree_read_bio, bio);
+	struct bch_fs *fs = read_bio->c;
+
+	/* Account latency only if an io ref is held on the device; the rest runs from the workqueue: */
+	if (read_bio->have_ioref) {
+		struct bch_dev *dev = bch_dev_bkey_exists(fs, read_bio->pick.ptr.dev);
+
+		bch2_latency_acct(dev, read_bio->start_time, READ);
+	}
+
+	queue_work(fs->io_complete_wq, &read_bio->work);
+}
+
+struct btree_node_read_all {
+	struct closure		cl;	/* one ref is taken per submitted replica read */
+	struct bch_fs		*c;
+	struct btree		*b;	/* node being read */
+	unsigned		nr;	/* number of slots in use in the arrays below */
+	void			*buf[BCH_REPLICAS_MAX];	/* per-replica buffer from btree_bounce_pool */
+	struct bio		*bio[BCH_REPLICAS_MAX];	/* per-replica read bio */
+	blk_status_t		err[BCH_REPLICAS_MAX];	/* bi_status of each read, or BLK_STS_REMOVED if not submitted */
+};
+
+static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
+{
+	struct btree_node *first = data;
+	unsigned sector = 0;
+
+	/* A buffer without this filesystem's bset magic counts as nothing written: */
+	if (le64_to_cpu(first->magic) != bset_magic(c))
+		return 0;
+
+	/* Step from bset to bset until one no longer carries the first bset's seq: */
+	while (sector < btree_sectors(c)) {
+		struct btree_node_entry *entry = data + (sector << 9);
+
+		if (sector && entry->keys.seq != first->keys.seq)
+			break;
+		sector += sector
+			? vstruct_sectors(entry, c->block_bits)
+			: vstruct_sectors(first, c->block_bits);
+	}
+
+	return sector;
+}
+
+static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
+{
+	struct btree_node *bn = data;
+	struct btree_node_entry *bne;
+
+	/* An @offset of 0 is never scanned (the caller passes a sectors-written count here): */
+	if (!offset)
+		return false;
+	/* From @offset on, look sector by sector for an entry carrying the first bset's seq: */
+	while (offset < btree_sectors(c)) {
+		bne = data + (offset << 9);
+		if (bne->keys.seq == bn->keys.seq)
+			return true;
+		offset++;
+	}
+
+	return false;
+}
+
+static void btree_node_read_all_replicas_done(struct closure *cl)
+{
+	struct btree_node_read_all *ra =
+		container_of(cl, struct btree_node_read_all, cl);
+	struct bch_fs *c = ra->c;
+	struct btree *b = ra->b;
+	struct printbuf buf = PRINTBUF;
+	bool dump_bset_maps = false;
+	bool have_retry = false;
+	int ret = 0, best = -1, write = READ;
+	unsigned i, written = 0, written2 = 0;
+	__le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
+		? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
+	bool _saw_error = false, *saw_error = &_saw_error;
+	/* Cross-check the replicas that were read, then read the node in from the best one. */
+	for (i = 0; i < ra->nr; i++) {
+		struct btree_node *bn = ra->buf[i];
+
+		if (ra->err[i])
+			continue;
+		/* Ignore replicas with the wrong magic, or a seq that differs from the key's: */
+		if (le64_to_cpu(bn->magic) != bset_magic(c) ||
+		    (seq && seq != bn->keys.seq))
+			continue;
+		/* The first usable replica becomes the initial candidate: */
+		if (best < 0) {
+			best = i;
+			written = btree_node_sectors_written(c, bn);
+			continue;
+		}
+		/* Every further usable replica is compared against the current best: */
+		written2 = btree_node_sectors_written(c, ra->buf[i]);
+		if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL,
+				 "btree node sectors written mismatch: %u != %u",
+				 written, written2) ||
+		    btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
+				 -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL,
+				 "found bset signature after last bset") ||
+		    btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
+				 -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL,
+				 "btree node replicas content mismatch"))
+			dump_bset_maps = true;
+		/* Prefer whichever replica has more sectors written: */
+		if (written2 > written) {
+			written = written2;
+			best = i;
+		}
+	}
+fsck_err:
+	if (dump_bset_maps) {
+		for (i = 0; i < ra->nr; i++) {
+			struct btree_node *bn = ra->buf[i];
+			struct btree_node_entry *bne = NULL;
+			unsigned offset = 0, sectors;
+			bool gap = false;
+
+			if (ra->err[i])
+				continue;
+
+			printbuf_reset(&buf);
+
+			while (offset < btree_sectors(c)) {
+				if (!offset) {
+					sectors = vstruct_sectors(bn, c->block_bits);
+				} else {
+					bne = ra->buf[i] + (offset << 9);
+					if (bne->keys.seq != bn->keys.seq)
+						break;
+					sectors = vstruct_sectors(bne, c->block_bits);
+				}
+
+				prt_printf(&buf, " %u-%u", offset, offset + sectors);
+				if (bne && bch2_journal_seq_is_blacklisted(c,
+							le64_to_cpu(bne->keys.journal_seq), false))
+					prt_printf(&buf, "*");
+				offset += sectors;
+			}
+			/* Past the contiguous bsets: also list any later entry with a matching seq, after " GAP": */
+			while (offset < btree_sectors(c)) {
+				bne = ra->buf[i] + (offset << 9);
+				if (bne->keys.seq == bn->keys.seq) {
+					if (!gap)
+						prt_printf(&buf, " GAP");
+					gap = true;
+
+					sectors = vstruct_sectors(bne, c->block_bits);
+					prt_printf(&buf, " %u-%u", offset, offset + sectors);
+					if (bch2_journal_seq_is_blacklisted(c,
+							le64_to_cpu(bne->keys.journal_seq), false))
+						prt_printf(&buf, "*");
+				}
+				offset++;
+			}
+
+			bch_err(c, "replica %u:%s", i, buf.buf);
+		}
+	}
+	/* Copy the chosen replica into the node and run the normal read-done path on it: */
+	if (best >= 0) {
+		memcpy(b->data, ra->buf[best], btree_bytes(c));
+		ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
+	} else {
+		ret = -1;
+	}
+
+	if (ret)
+		set_btree_node_read_error(b);
+	else if (*saw_error)
+		bch2_btree_node_rewrite_async(c, b);
+
+	for (i = 0; i < ra->nr; i++) {
+		mempool_free(ra->buf[i], &c->btree_bounce_pool);
+		bio_put(ra->bio[i]);
+	}
+
+	closure_debug_destroy(&ra->cl);
+	kfree(ra);
+	printbuf_exit(&buf);
+
+	clear_btree_node_read_in_flight(b);
+	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+}
+
+static void btree_node_read_all_replicas_endio(struct bio *bio)
+{
+	struct btree_read_bio *rb = container_of(bio, struct btree_read_bio, bio);
+	struct bch_fs *c	= rb->c;
+	struct btree_node_read_all *ra = rb->ra;
+	/* Per-replica completion: account latency, drop the device io ref taken at submit, record the status. */
+	if (rb->have_ioref) {
+		struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+
+		bch2_latency_acct(ca, rb->start_time, READ);
+		percpu_ref_put(&ca->io_ref);
+	}
+
+	ra->err[rb->idx] = bio->bi_status;
+	closure_put(&ra->cl);
+}
+
+/*
+ * XXX This allocates multiple times from the same mempools, and can deadlock
+ * under sufficient memory pressure (but is only a debug path)
+ */
+static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync)
+{
+	struct bkey_s_c k = bkey_i_to_s_c(&b->key);
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded pick;
+	struct btree_node_read_all *ra;
+	unsigned i;
+	/* Read @b from every pointer in its key; returns nonzero only if @ra cannot be allocated. */
+	ra = kzalloc(sizeof(*ra), GFP_NOFS);
+	if (!ra)
+		return -BCH_ERR_ENOMEM_btree_node_read_all_replicas;
+
+	closure_init(&ra->cl, NULL);
+	ra->c	= c;
+	ra->b	= b;
+	ra->nr	= bch2_bkey_nr_ptrs(k);
+	/* One bounce buffer and one bio per pointer: */
+	for (i = 0; i < ra->nr; i++) {
+		ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
+		ra->bio[i] = bio_alloc_bioset(NULL,
+					      buf_pages(ra->buf[i], btree_bytes(c)),
+					      REQ_OP_READ|REQ_SYNC|REQ_META,
+					      GFP_NOFS,
+					      &c->btree_bio);
+	}
+
+	i = 0;
+	bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
+		struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+		struct btree_read_bio *rb =
+			container_of(ra->bio[i], struct btree_read_bio, bio);
+		rb->c			= c;
+		rb->b			= b;
+		rb->ra			= ra;
+		rb->start_time		= local_clock();
+		rb->have_ioref		= bch2_dev_get_ioref(ca, READ);
+		rb->idx			= i;
+		rb->pick		= pick;
+		rb->bio.bi_iter.bi_sector = pick.ptr.offset;
+		rb->bio.bi_end_io	= btree_node_read_all_replicas_endio;
+		bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c));
+		/* Without an io ref nothing is submitted; the slot is just marked as failed: */
+		if (rb->have_ioref) {
+			this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
+				     bio_sectors(&rb->bio));
+			bio_set_dev(&rb->bio, ca->disk_sb.bdev);
+
+			closure_get(&ra->cl);
+			submit_bio(&rb->bio);
+		} else {
+			ra->err[i] = BLK_STS_REMOVED;
+		}
+
+		i++;
+	}
+	/* Either wait here and finish inline, or let the closure finish on io_complete_wq: */
+	if (sync) {
+		closure_sync(&ra->cl);
+		btree_node_read_all_replicas_done(&ra->cl);
+	} else {
+		continue_at(&ra->cl, btree_node_read_all_replicas_done,
+			    c->io_complete_wq);
+	}
+
+	return 0;
+}
+
+void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
+			  bool sync)
+{
+	struct extent_ptr_decoded pick;
+	struct btree_read_bio *rb;
+	struct bch_dev *ca;
+	struct bio *bio;
+	int ret;
+	/* Start reading @b from one replica; with @sync the completion work also runs before returning. */
+	trace_and_count(c, btree_node_read, c, b);
+	/* Optionally read all replicas instead; if that returns nonzero, fall through to a normal read: */
+	if (bch2_verify_all_btree_replicas &&
+	    !btree_node_read_all_replicas(c, b, sync))
+		return;
+
+	ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
+					 NULL, &pick);
+	/* Nothing to read from: log it, mark the node with a read error and wake any waiters: */
+	if (ret <= 0) {
+		struct printbuf buf = PRINTBUF;
+
+		prt_str(&buf, "btree node read error: no device to read from\n at ");
+		btree_pos_to_text(&buf, c, b);
+		bch_err(c, "%s", buf.buf);
+
+		if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
+		    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
+			bch2_fatal_error(c);
+
+		set_btree_node_read_error(b);
+		clear_btree_node_read_in_flight(b);
+		wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+		printbuf_exit(&buf);
+		return;
+	}
+
+	ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
+	bio = bio_alloc_bioset(NULL,
+			       buf_pages(b->data, btree_bytes(c)),
+			       REQ_OP_READ|REQ_SYNC|REQ_META,
+			       GFP_NOFS,
+			       &c->btree_bio);
+	rb = container_of(bio, struct btree_read_bio, bio);
+	rb->c			= c;
+	rb->b			= b;
+	rb->ra			= NULL;
+	rb->start_time		= local_clock();
+	rb->have_ioref		= bch2_dev_get_ioref(ca, READ);
+	rb->pick		= pick;
+	INIT_WORK(&rb->work, btree_node_read_work);
+	bio->bi_iter.bi_sector	= pick.ptr.offset;
+	bio->bi_end_io		= btree_node_read_endio;
+	bch2_bio_map(bio, b->data, btree_bytes(c));
+
+	if (rb->have_ioref) {
+		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
+			     bio_sectors(bio));
+		bio_set_dev(bio, ca->disk_sb.bdev);
+
+		if (sync) {
+			submit_bio_wait(bio);
+
+			btree_node_read_work(&rb->work);
+		} else {
+			submit_bio(bio);
+		}
+	} else {
+		bio->bi_status = BLK_STS_REMOVED;
+		/* Nothing was submitted, so no endio will run: start the completion work ourselves: */
+		if (sync)
+			btree_node_read_work(&rb->work);
+		else
+			queue_work(c->io_complete_wq, &rb->work);
+	}
+}
+
+static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
+				  const struct bkey_i *k, unsigned level)
+{
+	struct bch_fs *c = trans->c;
+	struct closure cl;
+	struct btree *b;
+	int ret;
+	/* Allocate an in-memory node for the root described by @k and read it in synchronously. */
+	closure_init_stack(&cl);
+	/* Keep retrying until bch2_btree_cache_cannibalize_lock() returns 0: */
+	do {
+		ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+		closure_sync(&cl);
+	} while (ret);
+
+	b = bch2_btree_node_mem_alloc(trans, level != 0);
+	bch2_btree_cache_cannibalize_unlock(c);
+
+	BUG_ON(IS_ERR(b));
+
+	bkey_copy(&b->key, k);
+	BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
+
+	set_btree_node_read_in_flight(b);
+
+	bch2_btree_node_read(c, b, true);
+	/* On a read error, unhash the node, move it to the freeable list and return -EIO: */
+	if (btree_node_read_error(b)) {
+		bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+		mutex_lock(&c->btree_cache.lock);
+		list_move(&b->list, &c->btree_cache.freeable);
+		mutex_unlock(&c->btree_cache.lock);
+
+		ret = -EIO;
+		goto err;
+	}
+
+	bch2_btree_set_root_for_read(c, b);
+err:
+	six_unlock_write(&b->c.lock);
+	six_unlock_intent(&b->c.lock);
+
+	return ret;
+}
+
+int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
+			const struct bkey_i *k, unsigned level)
+{
+	return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level)); /* trans is not declared here: presumably provided by bch2_trans_run() */
+}
+
+void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
+			      struct btree_write *w)
+{
+	unsigned long old, new, v = READ_ONCE(b->will_make_reachable);
+	/* will_make_reachable: bit 0 is a flag, the remaining bits a struct btree_update pointer (see cast below). */
+	do {
+		old = new = v;
+		if (!(old & 1))
+			break;
+
+		new &= ~1UL;
+	} while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old);
+	/* Bit 0 was set and our cmpxchg cleared it: drop a ref on that update's closure: */
+	if (old & 1)
+		closure_put(&((struct btree_update *) new)->cl);
+
+	bch2_journal_pin_drop(&c->journal, &w->journal);
+}
+
+static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
+{
+	struct btree_write *w = btree_prev_write(b);
+	unsigned long old, new, v;
+	unsigned type = 0;
+	/* A write of @b has finished: start the next one if one is wanted, else clear the in-flight bits. */
+	bch2_btree_complete_write(c, b, w);
+
+	v = READ_ONCE(b->flags);
+	do {
+		old = new = v;
+		/* Dirty, flagged need_write, and none of the blocking flags set: claim the next write: */
+		if ((old & (1U << BTREE_NODE_dirty)) &&
+		    (old & (1U << BTREE_NODE_need_write)) &&
+		    !(old & (1U << BTREE_NODE_never_write)) &&
+		    !(old & (1U << BTREE_NODE_write_blocked)) &&
+		    !(old & (1U << BTREE_NODE_will_make_reachable))) {
+			new &= ~(1U << BTREE_NODE_dirty);
+			new &= ~(1U << BTREE_NODE_need_write);
+			new |=  (1U << BTREE_NODE_write_in_flight);
+			new |=  (1U << BTREE_NODE_write_in_flight_inner);
+			new |=  (1U << BTREE_NODE_just_written);
+			new ^=  (1U << BTREE_NODE_write_idx);
+			/* Take the write type bits out of the flags; they are passed on explicitly below: */
+			type = new & BTREE_WRITE_TYPE_MASK;
+			new &= ~BTREE_WRITE_TYPE_MASK;
+		} else {
+			new &= ~(1U << BTREE_NODE_write_in_flight);
+			new &= ~(1U << BTREE_NODE_write_in_flight_inner);
+		}
+	} while ((v = cmpxchg(&b->flags, old, new)) != old);
+
+	if (new & (1U << BTREE_NODE_write_in_flight))
+		__bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type);
+	else
+		wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
+}
+
+static void btree_node_write_done(struct bch_fs *c, struct btree *b)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	/* Run __btree_node_write_done() with a read lock held on the node: */
+	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+	__btree_node_write_done(c, b);
+	six_unlock_read(&b->c.lock);
+
+	bch2_trans_put(trans);
+}
+
+static void btree_node_write_work(struct work_struct *work)
+{
+	struct btree_write_bio *wbio =
+		container_of(work, struct btree_write_bio, work);
+	struct bch_fs *c	= wbio->wbio.c;
+	struct btree *b		= wbio->wbio.bio.bi_private;
+	struct bch_extent_ptr *ptr;
+	int ret = 0;
+	/* Workqueue half of btree write completion, queued by btree_node_write_endio(). */
+	btree_bounce_free(c,
+		wbio->data_bytes,
+		wbio->wbio.used_mempool,
+		wbio->data);
+
+	bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
+		bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
+
+	if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key)))
+		goto err;
+
+	/*
+	 * For every write but a node's first, hand wbio->key (minus the pointers
+	 * to failed devices dropped above) to the key update; nothing further is done for a first write.
+	 */
+	if (!wbio->wbio.first_btree_write) {
+		ret = bch2_trans_do(c, NULL, NULL, 0,
+			bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
+					BCH_WATERMARK_reclaim|
+					BTREE_INSERT_JOURNAL_RECLAIM|
+					BTREE_INSERT_NOFAIL|
+					BTREE_INSERT_NOCHECK_RW,
+					!wbio->wbio.failed.nr));
+		if (ret)
+			goto err;
+	}
+out:
+	bio_put(&wbio->wbio.bio);
+	btree_node_write_done(c, b);
+	return;
+err:
+	set_btree_node_noevict(b);
+	if (!bch2_err_matches(ret, EROFS))
+		bch2_fs_fatal_error(c, "fatal error writing btree node: %s", bch2_err_str(ret));
+	goto out;
+}
+
+static void btree_node_write_endio(struct bio *bio)
+{
+	struct bch_write_bio *wbio	= to_wbio(bio);
+	struct bch_write_bio *parent	= wbio->split ? wbio->parent : NULL;
+	struct bch_write_bio *orig	= parent ?: wbio;
+	struct btree_write_bio *wb	= container_of(orig, struct btree_write_bio, wbio);
+	struct bch_fs *c		= wbio->c;
+	struct btree *b			= wbio->bio.bi_private;
+	struct bch_dev *ca		= bch_dev_bkey_exists(c, wbio->dev);
+	unsigned long flags;
+	/* Completion of one btree write bio; a non-split bio ends by queueing btree_node_write_work(). */
+	if (wbio->have_ioref)
+		bch2_latency_acct(ca, wbio->submit_time, WRITE);
+	/* On an I/O error, or when bch2_meta_write_fault() fires, record the device in orig->failed: */
+	if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s",
+			       bch2_blk_status_to_str(bio->bi_status)) ||
+	    bch2_meta_write_fault("btree")) {
+		spin_lock_irqsave(&c->btree_write_error_lock, flags);
+		bch2_dev_list_add_dev(&orig->failed, wbio->dev);
+		spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+	}
+
+	if (wbio->have_ioref)
+		percpu_ref_put(&ca->io_ref);
+	/* A split bio only releases itself and completes its parent: */
+	if (parent) {
+		bio_put(bio);
+		bio_endio(&parent->bio);
+		return;
+	}
+
+	clear_btree_node_write_in_flight_inner(b);
+	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
+	INIT_WORK(&wb->work, btree_node_write_work);
+	queue_work(c->btree_io_complete_wq, &wb->work);
+}
+
+static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
+				   struct bset *i, unsigned sectors)
+{
+	struct printbuf buf = PRINTBUF;
+	bool saw_error;	/* only passed by address below; never read in this function */
+	int ret;
+	/* Check the node's own key first, then the keys and the header of bset @i; returns nonzero on failure. */
+	ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key),
+				BKEY_TYPE_btree, WRITE, &buf);
+
+	if (ret)
+		bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf);
+	printbuf_exit(&buf);
+	if (ret)
+		return ret;
+
+	ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?:
+		validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error);
+	if (ret) {
+		bch2_inconsistent_error(c);
+		dump_stack();
+	}
+
+	return ret;
+}
+
+static void btree_write_submit(struct work_struct *work)
+{
+	struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
+	struct bch_extent_ptr *ptr;
+	BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+	/* Adjust a stack copy of the key, so the offsets in wbio->key itself are left as they were: */
+	bkey_copy(&tmp.k, &wbio->key);
+	/* Advance every pointer by this write's sector offset within the node: */
+	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr)
+		ptr->offset += wbio->sector_offset;
+
+	bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree,
+				  &tmp.k, false);
+}
+
+void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
+{
+	struct btree_write_bio *wbio;
+	struct bset_tree *t;
+	struct bset *i;
+	struct btree_node *bn = NULL;
+	struct btree_node_entry *bne = NULL;
+	struct sort_iter_stack sort_iter;
+	struct nonce nonce;
+	unsigned bytes_to_write, sectors_to_write, bytes, u64s;
+	u64 seq = 0;
+	bool used_mempool;
+	unsigned long old, new;
+	bool validate_before_checksum = false;
+	enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
+	void *data;
+	int ret;
+	/* BTREE_WRITE_ALREADY_STARTED: the caller has already updated b->flags for this write: */
+	if (flags & BTREE_WRITE_ALREADY_STARTED)
+		goto do_write;
+
+	/*
+	 * We may only have a read lock on the btree node - the dirty bit is our
+	 * "lock" against racing with other threads that may be trying to start
+	 * a write, we do a write iff we clear the dirty bit. Since setting the
+	 * dirty bit requires a write lock, we can't race with other threads
+	 * redirtying it:
+	 */
+	do {
+		old = new = READ_ONCE(b->flags);
+
+		if (!(old & (1 << BTREE_NODE_dirty)))
+			return;
+
+		if ((flags & BTREE_WRITE_ONLY_IF_NEED) &&
+		    !(old & (1 << BTREE_NODE_need_write)))
+			return;
+
+		if (old &
+		    ((1 << BTREE_NODE_never_write)|
+		     (1 << BTREE_NODE_write_blocked)))
+			return;
+
+		if (b->written &&
+		    (old & (1 << BTREE_NODE_will_make_reachable)))
+			return;
+
+		if (old & (1 << BTREE_NODE_write_in_flight))
+			return;
+
+		if (flags & BTREE_WRITE_ONLY_IF_NEED)
+			type = new & BTREE_WRITE_TYPE_MASK;
+		new &= ~BTREE_WRITE_TYPE_MASK;
+
+		new &= ~(1 << BTREE_NODE_dirty);
+		new &= ~(1 << BTREE_NODE_need_write);
+		new |=  (1 << BTREE_NODE_write_in_flight);
+		new |=  (1 << BTREE_NODE_write_in_flight_inner);
+		new |=  (1 << BTREE_NODE_just_written);
+		new ^=  (1 << BTREE_NODE_write_idx);
+	} while (cmpxchg_acquire(&b->flags, old, new) != old);
+	/* NOTE(review): need_write was cleared in @new just above, so this check looks like it can never fire: */
+	if (new & (1U << BTREE_NODE_need_write))
+		return;
+do_write:
+	BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));
+
+	atomic_dec(&c->btree_cache.dirty);
+
+	BUG_ON(btree_node_fake(b));
+	BUG_ON((b->will_make_reachable != 0) != !b->written);
+
+	BUG_ON(b->written >= btree_sectors(c));
+	BUG_ON(b->written & (block_sectors(c) - 1));
+	BUG_ON(bset_written(b, btree_bset_last(b)));
+	BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
+	BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
+
+	bch2_sort_whiteouts(c, b);
+	/* Size the bounce buffer: header, whiteouts, and every bset that has not been written yet: */
+	sort_iter_stack_init(&sort_iter, b);
+
+	bytes = !b->written
+		? sizeof(struct btree_node)
+		: sizeof(struct btree_node_entry);
+
+	bytes += b->whiteout_u64s * sizeof(u64);
+
+	for_each_bset(b, t) {
+		i = bset(b, t);
+
+		if (bset_written(b, i))
+			continue;
+
+		bytes += le16_to_cpu(i->u64s) * sizeof(u64);
+		sort_iter_add(&sort_iter.iter,
+			      btree_bkey_first(b, t),
+			      btree_bkey_last(b, t));
+		seq = max(seq, le64_to_cpu(i->journal_seq));
+	}
+
+	BUG_ON(b->written && !seq);
+
+	/* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
+	bytes += 8;
+
+	/* buffer must be a multiple of the block size */
+	bytes = round_up(bytes, block_bytes(c));
+
+	data = btree_bounce_alloc(c, bytes, &used_mempool);
+	/* A node's first write starts with a full struct btree_node, later ones with a btree_node_entry: */
+	if (!b->written) {
+		bn = data;
+		*bn = *b->data;
+		i = &bn->keys;
+	} else {
+		bne = data;
+		bne->keys = b->data->keys;
+		i = &bne->keys;
+	}
+
+	i->journal_seq	= cpu_to_le64(seq);
+	i->u64s		= 0;
+
+	sort_iter_add(&sort_iter.iter,
+		      unwritten_whiteouts_start(c, b),
+		      unwritten_whiteouts_end(c, b));
+	SET_BSET_SEPARATE_WHITEOUTS(i, false);
+
+	b->whiteout_u64s = 0;
+
+	u64s = bch2_sort_keys(i->start, &sort_iter.iter, false);
+	le16_add_cpu(&i->u64s, u64s);
+
+	BUG_ON(!b->written && i->u64s != b->data->keys.u64s);
+
+	set_needs_whiteout(i, false);
+
+	/* do we have data to write? */
+	if (b->written && !i->u64s)
+		goto nowrite;
+
+	bytes_to_write = vstruct_end(i) - data;
+	sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
+
+	if (!b->written &&
+	    b->key.k.type == KEY_TYPE_btree_ptr_v2)
+		BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write);
+
+	memset(data + bytes_to_write, 0,
+	       (sectors_to_write << 9) - bytes_to_write);
+
+	BUG_ON(b->written + sectors_to_write > btree_sectors(c));
+	BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
+	BUG_ON(i->seq != b->data->keys.seq);
+
+	i->version = cpu_to_le16(c->sb.version);
+	SET_BSET_OFFSET(i, b->written);
+	SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));
+
+	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
+		validate_before_checksum = true;
+
+	/* validate_bset will be modifying: */
+	if (le16_to_cpu(i->version) < bcachefs_metadata_version_current)
+		validate_before_checksum = true;
+
+	/* if we're going to be encrypting, check metadata validity first: */
+	if (validate_before_checksum &&
+	    validate_bset_for_write(c, b, i, sectors_to_write))
+		goto err;
+
+	ret = bset_encrypt(c, i, b->written << 9);
+	if (bch2_fs_fatal_err_on(ret, c,
+			"error encrypting btree node: %i\n", ret))
+		goto err;
+
+	nonce = btree_nonce(i, b->written << 9);
+
+	if (bn)
+		bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
+	else
+		bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+
+	/* if we're not encrypting, check metadata after checksumming: */
+	if (!validate_before_checksum &&
+	    validate_bset_for_write(c, b, i, sectors_to_write))
+		goto err;
+
+	/*
+	 * We handle btree write errors by immediately halting the journal -
+	 * after we've done that, we can't issue any subsequent btree writes
+	 * because they might have pointers to new nodes that failed to write.
+	 *
+	 * Furthermore, there's no point in doing any more btree writes because
+	 * with the journal stopped, we're never going to update the journal to
+	 * reflect that those writes were done and the data flushed from the
+	 * journal:
+	 *
+	 * Also on journal error, the pending write may have updates that were
+	 * never journalled (interior nodes, see btree_update_nodes_written()) -
+	 * it's critical that we don't do the write in that case otherwise we
+	 * will have updates visible that weren't in the journal:
+	 *
+	 * Make sure to update b->written so bch2_btree_init_next() doesn't
+	 * break:
+	 */
+	if (bch2_journal_error(&c->journal) ||
+	    c->opts.nochanges)
+		goto err;
+
+	trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write);
+
+	wbio = container_of(bio_alloc_bioset(NULL,
+				buf_pages(data, sectors_to_write << 9),
+				REQ_OP_WRITE|REQ_META,
+				GFP_NOFS,
+				&c->btree_bio),
+			    struct btree_write_bio, wbio.bio);
+	wbio_init(&wbio->wbio.bio);
+	wbio->data			= data;
+	wbio->data_bytes		= bytes;
+	wbio->sector_offset		= b->written;
+	wbio->wbio.c			= c;
+	wbio->wbio.used_mempool		= used_mempool;
+	wbio->wbio.first_btree_write	= !b->written;
+	wbio->wbio.bio.bi_end_io	= btree_node_write_endio;
+	wbio->wbio.bio.bi_private	= b;
+
+	bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
+
+	bkey_copy(&wbio->key, &b->key);
+
+	b->written += sectors_to_write;
+
+	if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
+		bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
+			cpu_to_le16(b->written);
+
+	atomic64_inc(&c->btree_write_stats[type].nr);
+	atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);
+	/* The bio itself is submitted from the workqueue, by btree_write_submit(): */
+	INIT_WORK(&wbio->work, btree_write_submit);
+	queue_work(c->io_complete_wq, &wbio->work);
+	return;
+err:
+	set_btree_node_noevict(b);
+	b->written += sectors_to_write;
+nowrite:
+	btree_bounce_free(c, bytes, used_mempool, data);
+	__btree_node_write_done(c, b);
+}
+
+/*
+ * Work that must be done with write lock held:
+ */
+bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
+{
+	bool invalidated_iter = false;
+	struct btree_node_entry *bne;
+	struct bset_tree *t;
+	/* Only needed while the just_written flag is set; returns whether iterators were invalidated. */
+	if (!btree_node_just_written(b))
+		return false;
+
+	BUG_ON(b->whiteout_u64s);
+
+	clear_btree_node_just_written(b);
+
+	/*
+	 * Note: immediately after write, bset_written() doesn't work - the
+	 * amount of data we had to write after compaction might have been
+	 * smaller than the offset of the last bset.
+	 *
+	 * However, we know that all bsets have been written here, as long as
+	 * we're still holding the write lock:
+	 */
+
+	/*
+	 * XXX: decide if we really want to unconditionally sort down to a
+	 * single bset:
+	 */
+	if (b->nsets > 1) {
+		btree_node_sort(c, b, 0, b->nsets, true);
+		invalidated_iter = true;
+	} else {
+		invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL);
+	}
+
+	for_each_bset(b, t)
+		set_needs_whiteout(bset(b, t), true);
+
+	bch2_btree_verify(c, b);
+
+	/*
+	 * If later we don't unconditionally sort down to a single bset, we have
+	 * to ensure this is still true:
+	 */
+	BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));
+	/* Initialize a new bset if want_new_bset() hands back a place for one: */
+	bne = want_new_bset(c, b);
+	if (bne)
+		bch2_bset_init_next(c, b, bne);
+
+	bch2_btree_build_aux_trees(b);
+
+	return invalidated_iter;
+}
+
+/*
+ * Write out @b; @lock_type_held is the lock the caller holds on it (read, intent or write):
+ */
+void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
+			   enum six_lock_type lock_type_held,
+			   unsigned flags)
+{
+	if (lock_type_held == SIX_LOCK_intent ||
+	    (lock_type_held == SIX_LOCK_read &&
+	     six_lock_tryupgrade(&b->c.lock))) {	/* a read lock upgraded here is downgraded again below */
+		__bch2_btree_node_write(c, b, flags);
+
+		/* don't cycle lock unnecessarily: */
+		if (btree_node_just_written(b) &&
+		    six_trylock_write(&b->c.lock)) {
+			bch2_btree_post_write_cleanup(c, b);
+			six_unlock_write(&b->c.lock);
+		}
+
+		if (lock_type_held == SIX_LOCK_read)
+			six_lock_downgrade(&b->c.lock);
+	} else {
+		__bch2_btree_node_write(c, b, flags);
+		if (lock_type_held == SIX_LOCK_write &&
+		    btree_node_just_written(b))
+			bch2_btree_post_write_cleanup(c, b);
+	}
+}
+
+static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
+{
+	struct bucket_table *tbl;
+	struct rhash_head *pos;
+	struct btree *b;
+	unsigned i;
+	bool ret = false;	/* becomes true once we have had to wait on any node */
+restart:
+	rcu_read_lock();
+	for_each_cached_btree(b, c, tbl, i, pos)
+		if (test_bit(flag, &b->flags)) {
+			rcu_read_unlock();
+			wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
+			ret = true;
+			goto restart;	/* the rcu read lock was dropped to wait: scan again from the start */
+		}
+	rcu_read_unlock();
+	/* A full scan found no cached node with @flag set: */
+	return ret;
+}
+
+bool bch2_btree_flush_all_reads(struct bch_fs *c)
+{
+	return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);	/* true if any read had to be waited for */
+}
+
+bool bch2_btree_flush_all_writes(struct bch_fs *c)
+{
+	return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);	/* true if any write had to be waited for */
+}
+
+static const char * const bch2_btree_write_types[] = {	/* name strings, one per x() entry of BCH_BTREE_WRITE_TYPES() */
+#define x(t, n) [n] = #t,
+	BCH_BTREE_WRITE_TYPES()
+	NULL
+};
+
+void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c)
+{
+	printbuf_tabstop_push(out, 20);
+	printbuf_tabstop_push(out, 10);
+	/* Header row: first column left empty, then "nr" and "size"; one row per write type follows: */
+	prt_tab(out);
+	prt_str(out, "nr");
+	prt_tab(out);
+	prt_str(out, "size");
+	prt_newline(out);
+	for (unsigned type = 0; type < BTREE_WRITE_TYPE_NR; type++) {
+		u64 count	= atomic64_read(&c->btree_write_stats[type].nr);
+		u64 total	= atomic64_read(&c->btree_write_stats[type].bytes);
+		u64 mean	= count ? div64_u64(total, count) : 0;
+
+		prt_printf(out, "%s:", bch2_btree_write_types[type]);
+		prt_tab(out);
+		prt_u64(out, count);
+		prt_tab(out);
+		prt_human_readable_u64(out, mean);
+		prt_newline(out);
+	}
+}
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
new file mode 100644
index 000000000000..7e03dd76fb38
--- /dev/null
+++ b/fs/bcachefs/btree_io.h
@@ -0,0 +1,228 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_IO_H
+#define _BCACHEFS_BTREE_IO_H
+
+#include "bkey_methods.h"
+#include "bset.h"
+#include "btree_locking.h"
+#include "checksum.h"
+#include "extents.h"
+#include "io_write_types.h"
+
+struct bch_fs;
+struct btree_write;
+struct btree;
+struct btree_iter;
+struct btree_node_read_all;
+
+/*
+ * Set BTREE_NODE_dirty on @b; if the bit was previously clear, also bump the
+ * btree cache's count of dirty nodes.
+ */
+static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
+{
+	bool was_dirty = test_and_set_bit(BTREE_NODE_dirty, &b->flags);
+
+	if (was_dirty)
+		return;
+
+	atomic_inc(&c->btree_cache.dirty);
+}
+
+/*
+ * Clear BTREE_NODE_dirty on @b; if the bit was previously set, also drop the
+ * btree cache's count of dirty nodes.
+ */
+static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
+{
+	bool was_dirty = test_and_clear_bit(BTREE_NODE_dirty, &b->flags);
+
+	if (!was_dirty)
+		return;
+
+	atomic_dec(&c->btree_cache.dirty);
+}
+
+/*
+ * Return the sectors_written field of a KEY_TYPE_btree_ptr_v2 key, converted
+ * to CPU byte order; any other key type yields 0.
+ */
+static inline unsigned btree_ptr_sectors_written(struct bkey_i *k)
+{
+	if (k->k.type != KEY_TYPE_btree_ptr_v2)
+		return 0;
+
+	return le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written);
+}
+
+/*
+ * State carried with a btree node read; the struct bio itself is embedded as
+ * the last member.
+ *
+ * NOTE(review): the meaning of the individual fields (e.g. what @idx indexes,
+ * when @have_ioref is set) is defined by the users of this struct, not here -
+ * confirm against the read path.
+ */
+struct btree_read_bio {
+	struct bch_fs		*c;
+	struct btree		*b;
+	struct btree_node_read_all *ra;
+	u64			start_time;
+	unsigned		have_ioref:1;
+	unsigned		idx:7;
+	struct extent_ptr_decoded	pick;
+	struct work_struct	work;
+	struct bio		bio;
+};
+
+/*
+ * State carried with a btree node write; the struct bch_write_bio is embedded
+ * as the last member.
+ *
+ * @key is padded storage declared with BKEY_BTREE_PTR_VAL_U64s_MAX.
+ *
+ * NOTE(review): presumably @data/@data_bytes describe the buffer being written
+ * and @sector_offset its offset within the node - confirm against the write
+ * path.
+ */
+struct btree_write_bio {
+	struct work_struct	work;
+	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+	void			*data;
+	unsigned		data_bytes;
+	unsigned		sector_offset;
+	struct bch_write_bio	wbio;
+};
+
+void bch2_btree_node_io_unlock(struct btree *);
+void bch2_btree_node_io_lock(struct btree *);
+void __bch2_btree_node_wait_on_read(struct btree *);
+void __bch2_btree_node_wait_on_write(struct btree *);
+void bch2_btree_node_wait_on_read(struct btree *);
+void bch2_btree_node_wait_on_write(struct btree *);
+
+/*
+ * Mode argument for bch2_compact_whiteouts().
+ *
+ * NOTE(review): the exact difference between the modes is implemented in
+ * bch2_compact_whiteouts(); COMPACT_LAZY is what
+ * bch2_maybe_compact_whiteouts() passes.
+ */
+enum compact_mode {
+	COMPACT_LAZY,
+	COMPACT_ALL,
+};
+
+bool bch2_compact_whiteouts(struct bch_fs *, struct btree *,
+			    enum compact_mode);
+
+/*
+ * Lazy-compaction heuristic for bset @t of node @b: true when the bset has
+ * more than 64 dead u64s and they make up more than a third of its u64s.
+ */
+static inline bool should_compact_bset_lazy(struct btree *b,
+					    struct bset_tree *t)
+{
+	unsigned total = bset_u64s(t);
+	unsigned dead = bset_dead_u64s(b, t);
+
+	if (dead <= 64)
+		return false;
+
+	return dead * 3 > total;
+}
+
+/*
+ * Run bch2_compact_whiteouts() in COMPACT_LAZY mode on @b if any of its
+ * bsets passes should_compact_bset_lazy().  Returns the result of the
+ * compaction call, or false if no bset qualified.
+ */
+static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b)
+{
+	struct bset_tree *t;
+
+	for_each_bset(b, t) {
+		if (!should_compact_bset_lazy(b, t))
+			continue;
+
+		/* first qualifying bset triggers compaction of the node */
+		return bch2_compact_whiteouts(c, b, COMPACT_LAZY);
+	}
+
+	return false;
+}
+
+/*
+ * Build the nonce used for bset @i at @offset: word 0 is @offset
+ * (little endian), words 1 and 2 are the two 32 bit halves of i->seq, and
+ * word 3 is the first 32 bits of i->journal_seq xored with BCH_NONCE_BTREE.
+ */
+static inline struct nonce btree_nonce(struct bset *i, unsigned offset)
+{
+	return (struct nonce) {{
+		[0] = cpu_to_le32(offset),
+		[1] = ((__le32 *) &i->seq)[0],
+		[2] = ((__le32 *) &i->seq)[1],
+		[3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE,
+	}};
+}
+
+/*
+ * Run bch2_encrypt() over bset @i, using the type from BSET_CSUM_TYPE(i) and
+ * the nonce from btree_nonce(i, offset).
+ *
+ * When @offset is 0, @i is taken to be the 'keys' member of a struct
+ * btree_node: the header bytes from bn->flags up to bn->keys are processed
+ * first, and the nonce is advanced by that length rounded up to
+ * CHACHA_BLOCK_SIZE before the bset data is processed.
+ *
+ * Returns 0 on success or the error returned by bch2_encrypt().
+ */
+static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
+{
+	struct nonce nonce = btree_nonce(i, offset);
+	int ret;
+
+	if (!offset) {
+		struct btree_node *bn = container_of(i, struct btree_node, keys);
+		unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
+
+		ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
+				   &bn->flags, bytes);
+		if (ret)
+			return ret;
+
+		nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
+	}
+
+	/* the bset payload: from i->_data to the end of the bset */
+	return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
+			    vstruct_end(i) - (void *) i->_data);
+}
+
+void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
+
+void bch2_btree_node_drop_keys_outside_node(struct btree *);
+
+void bch2_btree_build_aux_trees(struct btree *);
+void bch2_btree_init_next(struct btree_trans *, struct btree *);
+
+int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
+			      struct btree *, bool, bool *);
+void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
+int bch2_btree_root_read(struct bch_fs *, enum btree_id,
+			 const struct bkey_i *, unsigned);
+
+void bch2_btree_complete_write(struct bch_fs *, struct btree *,
+			      struct btree_write *);
+
+bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
+
+/*
+ * Flag bits for the 'flags' argument of bch2_btree_node_write() and
+ * __bch2_btree_node_write().  Bit numbers start at BTREE_WRITE_TYPE_BITS, so
+ * the bits below that are left for the write type.
+ */
+enum btree_write_flags {
+	__BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS,
+	__BTREE_WRITE_ALREADY_STARTED,
+};
+/* Mask forms of the bit numbers above: */
+#define BTREE_WRITE_ONLY_IF_NEED	BIT(__BTREE_WRITE_ONLY_IF_NEED)
+#define BTREE_WRITE_ALREADY_STARTED	BIT(__BTREE_WRITE_ALREADY_STARTED)
+
+void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
+void bch2_btree_node_write(struct bch_fs *, struct btree *,
+			   enum six_lock_type, unsigned);
+
+/*
+ * Call bch2_btree_node_write() on @b with BTREE_WRITE_ONLY_IF_NEED set;
+ * @lock_held is the type of six lock the caller holds on the node.
+ */
+static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
+					    enum six_lock_type lock_held)
+{
+	bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
+}
+
+bool bch2_btree_flush_all_reads(struct bch_fs *);
+bool bch2_btree_flush_all_writes(struct bch_fs *);
+
+/*
+ * Adjust bkey format @f for metadata written by older versions.
+ *
+ * - Before bcachefs_metadata_version_inode_btree_change, in the inodes
+ *   btree: the INODE and OFFSET fields swap their bit widths and field
+ *   offsets.
+ * - Before bcachefs_metadata_version_snapshot, for interior levels or btrees
+ *   with snapshots: the SNAPSHOT field offset is set to 0 when @write is
+ *   set, otherwise to U32_MAX minus the largest value that fits in the
+ *   field's bit width.
+ *
+ * @big_endian is not used here.
+ */
+static inline void compat_bformat(unsigned level, enum btree_id btree_id,
+				  unsigned version, unsigned big_endian,
+				  int write, struct bkey_format *f)
+{
+	if (version < bcachefs_metadata_version_inode_btree_change &&
+	    btree_id == BTREE_ID_inodes) {
+		swap(f->bits_per_field[BKEY_FIELD_INODE],
+		     f->bits_per_field[BKEY_FIELD_OFFSET]);
+		swap(f->field_offset[BKEY_FIELD_INODE],
+		     f->field_offset[BKEY_FIELD_OFFSET]);
+	}
+
+	if (version < bcachefs_metadata_version_snapshot &&
+	    (level || btree_type_has_snapshots(btree_id))) {
+		u64 max_packed =
+			~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
+
+		f->field_offset[BKEY_FIELD_SNAPSHOT] = write
+			? 0
+			: cpu_to_le64(U32_MAX - max_packed);
+	}
+}
+
+/*
+ * Convert position @p between the current in-memory form and an older or
+ * foreign-endian form: byte swap it if @big_endian differs from the CPU's
+ * endianness, then swap inode and offset for the inodes btree on versions
+ * before bcachefs_metadata_version_inode_btree_change.
+ *
+ * @level and @write are not used here.
+ */
+static inline void compat_bpos(unsigned level, enum btree_id btree_id,
+			       unsigned version, unsigned big_endian,
+			       int write, struct bpos *p)
+{
+	if (big_endian != CPU_BIG_ENDIAN)
+		bch2_bpos_swab(p);
+
+	if (version < bcachefs_metadata_version_inode_btree_change &&
+	    btree_id == BTREE_ID_inodes)
+		swap(p->inode, p->offset);
+}
+
+/*
+ * Apply version/endianness compatibility fixups to the min_key and max_key
+ * of btree node header @bn.
+ *
+ * The @write adjustments are made before compat_bpos() converts the two
+ * positions, and the !@write adjustments after it:
+ *
+ * - Before bcachefs_metadata_version_inode_btree_change, for extents btrees
+ *   with min_key != POS_MIN: min_key is moved to its nosnap predecessor when
+ *   writing and to its nosnap successor when reading.
+ * - Before bcachefs_metadata_version_snapshot: max_key.snapshot is set to 0
+ *   when writing and to U32_MAX when reading.
+ */
+static inline void compat_btree_node(unsigned level, enum btree_id btree_id,
+				     unsigned version, unsigned big_endian,
+				     int write,
+				     struct btree_node *bn)
+{
+	if (version < bcachefs_metadata_version_inode_btree_change &&
+	    btree_id_is_extents(btree_id) &&
+	    !bpos_eq(bn->min_key, POS_MIN) &&
+	    write)
+		bn->min_key = bpos_nosnap_predecessor(bn->min_key);
+
+	if (version < bcachefs_metadata_version_snapshot &&
+	    write)
+		bn->max_key.snapshot = 0;
+
+	compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key);
+	compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key);
+
+	if (version < bcachefs_metadata_version_snapshot &&
+	    !write)
+		bn->max_key.snapshot = U32_MAX;
+
+	if (version < bcachefs_metadata_version_inode_btree_change &&
+	    btree_id_is_extents(btree_id) &&
+	    !bpos_eq(bn->min_key, POS_MIN) &&
+	    !write)
+		bn->min_key = bpos_nosnap_successor(bn->min_key);
+}
+
+void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_IO_H */
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
new file mode 100644
index 000000000000..1d79514754d7
--- /dev/null
+++ b/fs/bcachefs/btree_iter.c
@@ -0,0 +1,3215 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "bkey_buf.h"
+#include "btree_cache.h"
+#include "btree_iter.h"
+#include "btree_journal_iter.h"
+#include "btree_key_cache.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "journal.h"
+#include "replicas.h"
+#include "snapshot.h"
+#include "trace.h"
+
+#include <linux/random.h>
+#include <linux/prefetch.h>
+
+static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
+static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
+				       struct btree_path *);
+
+/*
+ * Return iter->ip_allocated when TRACK_PATH_ALLOCATED is defined, and 0
+ * otherwise.
+ */
+static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
+{
+#ifdef TRACK_PATH_ALLOCATED
+	return iter->ip_allocated;
+#else
+	return 0;
+#endif
+}
+
+static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *);
+
+/*
+ * Three-way compare of path @l against the tuple (btree id, cached, pos,
+ * level).  Fields are compared in that order; the level comparison is
+ * negated, so a higher level sorts first.
+ */
+static inline int __btree_path_cmp(const struct btree_path *l,
+				   enum btree_id	r_btree_id,
+				   bool			r_cached,
+				   struct bpos		r_pos,
+				   unsigned		r_level)
+{
+	int cmp;
+
+	/*
+	 * Must match lock ordering as defined by __bch2_btree_node_lock:
+	 */
+	cmp = cmp_int(l->btree_id, r_btree_id);
+	if (cmp)
+		return cmp;
+
+	cmp = cmp_int((int) l->cached, (int) r_cached);
+	if (cmp)
+		return cmp;
+
+	cmp = bpos_cmp(l->pos, r_pos);
+	if (cmp)
+		return cmp;
+
+	return -cmp_int(l->level, r_level);
+}
+
+/* Three-way compare of two paths; see __btree_path_cmp() for the ordering. */
+static inline int btree_path_cmp(const struct btree_path *l,
+				 const struct btree_path *r)
+{
+	return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
+}
+
+/*
+ * Return the position following @p from @iter's point of view: the plain
+ * successor when iterating over all snapshots, otherwise the successor
+ * ignoring the snapshot field, with snapshot set to iter->snapshot.
+ */
+static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
+{
+	/* Are we iterating over keys in all snapshots? */
+	if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)
+		return bpos_successor(p);
+
+	p = bpos_nosnap_successor(p);
+	p.snapshot = iter->snapshot;
+	return p;
+}
+
+/*
+ * Return the position preceding @p from @iter's point of view: the plain
+ * predecessor when iterating over all snapshots, otherwise the predecessor
+ * ignoring the snapshot field, with snapshot set to iter->snapshot.
+ */
+static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
+{
+	/* Are we iterating over keys in all snapshots? */
+	if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)
+		return bpos_predecessor(p);
+
+	p = bpos_nosnap_predecessor(p);
+	p.snapshot = iter->snapshot;
+	return p;
+}
+
+/*
+ * Position to search the btree for: iter->pos, except that extent iterators
+ * not already at POS_MAX search from the successor of iter->pos.
+ */
+static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
+{
+	if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
+		return iter->pos;
+
+	if (bkey_eq(iter->pos, POS_MAX))
+		return iter->pos;
+
+	return bkey_successor(iter, iter->pos);
+}
+
+/* True if path->pos sorts before the node's min_key. */
+static inline bool btree_path_pos_before_node(struct btree_path *path,
+					      struct btree *b)
+{
+	return bpos_lt(path->pos, b->data->min_key);
+}
+
+/* True if path->pos sorts after the position of the node's key (b->key.k.p). */
+static inline bool btree_path_pos_after_node(struct btree_path *path,
+					     struct btree *b)
+{
+	return bpos_gt(path->pos, b->key.k.p);
+}
+
+/*
+ * True if @b belongs to the path's btree and path->pos lies within the
+ * node, i.e. neither before its min_key nor after its key's position.
+ */
+static inline bool btree_path_pos_in_node(struct btree_path *path,
+					  struct btree *b)
+{
+	if (path->btree_id != b->c.btree_id)
+		return false;
+
+	if (btree_path_pos_before_node(path, b))
+		return false;
+
+	return !btree_path_pos_after_node(path, b);
+}
+
+/* Btree iterator: */
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+/*
+ * Debug check for a cached path: path->l[0].b is interpreted as a struct
+ * bkey_cached, whose btree id and position must match the path's.
+ *
+ * Nothing is checked if level 0 can't be relocked.  If level 0 was not
+ * locked on entry, it is unlocked again before returning.
+ */
+static void bch2_btree_path_verify_cached(struct btree_trans *trans,
+					  struct btree_path *path)
+{
+	struct bkey_cached *ck;
+	bool locked = btree_node_locked(path, 0);
+
+	if (!bch2_btree_node_relock(trans, path, 0))
+		return;
+
+	ck = (void *) path->l[0].b;
+	BUG_ON(ck->key.btree_id != path->btree_id ||
+	       !bkey_eq(ck->key.pos, path->pos));
+
+	if (!locked)
+		btree_node_unlock(trans, path, 0);
+}
+
+/*
+ * Debug check that the node iterator at @level of @path agrees with
+ * path->pos: the key before the iterator must compare before path->pos, and
+ * the key the iterator points at must not compare before it.  On violation,
+ * panics with the path position and both keys.
+ *
+ * Does nothing unless bch2_debug_check_iterators is set.  Cached paths are
+ * handed to bch2_btree_path_verify_cached() (level 0 only).  If the level
+ * has no node or can't be relocked, nothing is checked.  If the level was
+ * not locked on entry, it is unlocked again before returning.
+ */
+static void bch2_btree_path_verify_level(struct btree_trans *trans,
+				struct btree_path *path, unsigned level)
+{
+	struct btree_path_level *l;
+	struct btree_node_iter tmp;
+	bool locked;
+	struct bkey_packed *p, *k;
+	struct printbuf buf1 = PRINTBUF;
+	struct printbuf buf2 = PRINTBUF;
+	struct printbuf buf3 = PRINTBUF;
+	const char *msg;
+
+	if (!bch2_debug_check_iterators)
+		return;
+
+	l	= &path->l[level];
+	tmp	= l->iter;
+	locked	= btree_node_locked(path, level);
+
+	if (path->cached) {
+		if (!level)
+			bch2_btree_path_verify_cached(trans, path);
+		return;
+	}
+
+	if (!btree_path_node(path, level))
+		return;
+
+	if (!bch2_btree_node_relock_notrace(trans, path, level))
+		return;
+
+	BUG_ON(!btree_path_pos_in_node(path, l->b));
+
+	bch2_btree_node_iter_verify(&l->iter, l->b);
+
+	/*
+	 * For interior nodes, the iterator will have skipped past deleted keys:
+	 */
+	/* p is found by stepping the copy 'tmp' back; l->iter is only peeked */
+	p = level
+		? bch2_btree_node_iter_prev(&tmp, l->b)
+		: bch2_btree_node_iter_prev_all(&tmp, l->b);
+	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
+
+	if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
+		msg = "before";
+		goto err;
+	}
+
+	if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
+		msg = "after";
+		goto err;
+	}
+
+	if (!locked)
+		btree_node_unlock(trans, path, level);
+	return;
+err:
+	/* format the path position and both keys for the panic message */
+	bch2_bpos_to_text(&buf1, path->pos);
+
+	if (p) {
+		struct bkey uk = bkey_unpack_key(l->b, p);
+
+		bch2_bkey_to_text(&buf2, &uk);
+	} else {
+		prt_printf(&buf2, "(none)");
+	}
+
+	if (k) {
+		struct bkey uk = bkey_unpack_key(l->b, k);
+
+		bch2_bkey_to_text(&buf3, &uk);
+	} else {
+		prt_printf(&buf3, "(none)");
+	}
+
+	panic("path should be %s key at level %u:\n"
+	      "path pos %s\n"
+	      "prev key %s\n"
+	      "cur  key %s\n",
+	      msg, level, buf1.buf, buf2.buf, buf3.buf);
+}
+
+/*
+ * Debug check of every populated level of @path (only level 0 for cached
+ * paths) via bch2_btree_path_verify_level(), followed by
+ * bch2_btree_path_verify_locks().
+ *
+ * The walk stops at the first level with no node; for a non-cached path it
+ * is a bug if the btree root's level is above that level.
+ */
+static void bch2_btree_path_verify(struct btree_trans *trans,
+				   struct btree_path *path)
+{
+	struct bch_fs *c = trans->c;
+	unsigned i;
+
+	EBUG_ON(path->btree_id >= BTREE_ID_NR);
+
+	for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
+		if (!path->l[i].b) {
+			BUG_ON(!path->cached &&
+			       bch2_btree_id_root(c, path->btree_id)->b->c.level > i);
+			break;
+		}
+
+		bch2_btree_path_verify_level(trans, path, i);
+	}
+
+	bch2_btree_path_verify_locks(path);
+}
+
+/* Run bch2_btree_path_verify() on every path in @trans. */
+void bch2_trans_verify_paths(struct btree_trans *trans)
+{
+	struct btree_path *path;
+
+	trans_for_each_path(trans, path)
+		bch2_btree_path_verify(trans, path);
+}
+
+/*
+ * Debug check of @iter's invariants, followed by verification of its
+ * path(s):
+ *
+ * - btree_id is in range;
+ * - BTREE_ITER_CACHED matches path->cached;
+ * - BTREE_ITER_IS_EXTENTS and BTREE_ITER_ALL_SNAPSHOTS are not both set;
+ * - BTREE_ITER_ALL_SNAPSHOTS without __BTREE_ITER_ALL_SNAPSHOTS is only
+ *   allowed on btrees that have snapshots.
+ */
+static void bch2_btree_iter_verify(struct btree_iter *iter)
+{
+	struct btree_trans *trans = iter->trans;
+
+	BUG_ON(iter->btree_id >= BTREE_ID_NR);
+
+	BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);
+
+	BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+	       (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+
+	BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
+	       (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+	       !btree_type_has_snapshots(iter->btree_id));
+
+	if (iter->update_path)
+		bch2_btree_path_verify(trans, iter->update_path);
+	bch2_btree_path_verify(trans, iter->path);
+}
+
+/*
+ * Debug check of iterator position invariants:
+ *
+ * - with BTREE_ITER_FILTER_SNAPSHOTS, iter->pos.snapshot must be nonzero;
+ * - without BTREE_ITER_ALL_SNAPSHOTS, iter->pos.snapshot must equal
+ *   iter->snapshot;
+ * - iter->pos must lie within [bkey_start_pos(&iter->k), iter->k.p].
+ */
+static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
+{
+	BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+	       !iter->pos.snapshot);
+
+	BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+	       iter->pos.snapshot != iter->snapshot);
+
+	BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
+	       bkey_gt(iter->pos, iter->k.p));
+}
+
+/*
+ * Debug check of key @k about to be returned from a
+ * BTREE_ITER_FILTER_SNAPSHOTS iterator.
+ *
+ * Asserts bch2_snapshot_is_ancestor(c, iter->snapshot, k.k->p.snapshot),
+ * then uses a second iterator over all snapshots to look at the key before
+ * iter->pos.  Panics if that key is at the same position as @k (bkey_eq) and
+ * its snapshot also passes the ancestor check against iter->snapshot.
+ *
+ * Does nothing unless bch2_debug_check_iterators is set, or when @k is an
+ * error or NULL.  Returns 0, or the error from fetching the previous key.
+ */
+static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
+{
+	struct btree_trans *trans = iter->trans;
+	struct btree_iter copy;
+	struct bkey_s_c prev;
+	int ret = 0;
+
+	if (!bch2_debug_check_iterators)
+		return 0;
+
+	if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
+		return 0;
+
+	if (bkey_err(k) || !k.k)
+		return 0;
+
+	BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
+					  iter->snapshot,
+					  k.k->p.snapshot));
+
+	bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
+			     BTREE_ITER_NOPRESERVE|
+			     BTREE_ITER_ALL_SNAPSHOTS);
+	prev = bch2_btree_iter_prev(&copy);
+	if (!prev.k)
+		goto out;
+
+	ret = bkey_err(prev);
+	if (ret)
+		goto out;
+
+	if (bkey_eq(prev.k->p, k.k->p) &&
+	    bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
+				      prev.k->p.snapshot) > 0) {
+		struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+
+		bch2_bkey_to_text(&buf1, k.k);
+		bch2_bkey_to_text(&buf2, prev.k);
+
+		panic("iter snap %u\n"
+		      "k    %s\n"
+		      "prev %s\n",
+		      iter->snapshot,
+		      buf1.buf, buf2.buf);
+	}
+out:
+	bch2_trans_iter_exit(trans, &copy);
+	return ret;
+}
+
+/*
+ * Panic unless @trans has a path for btree @id (with path->cached equal to
+ * @key_cache) that is locked at level 0, has should_be_locked set, and
+ * covers @pos:
+ *
+ * - non-cached: @pos lies between the leaf's min_key and its key's position;
+ * - cached: @pos equals path->pos.
+ *
+ * Paths are walked in sorted order, so the search stops at the first path
+ * that sorts after (@id, @key_cache).  On failure the transaction's paths
+ * and updates are dumped before the panic.
+ */
+void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
+			    struct bpos pos, bool key_cache)
+{
+	struct btree_path *path;
+	unsigned idx;
+	struct printbuf buf = PRINTBUF;
+
+	btree_trans_sort_paths(trans);
+
+	trans_for_each_path_inorder(trans, path, idx) {
+		int cmp = cmp_int(path->btree_id, id) ?:
+			cmp_int(path->cached, key_cache);
+
+		if (cmp > 0)
+			break;
+		if (cmp < 0)
+			continue;
+
+		if (!btree_node_locked(path, 0) ||
+		    !path->should_be_locked)
+			continue;
+
+		if (!key_cache) {
+			if (bkey_ge(pos, path->l[0].b->data->min_key) &&
+			    bkey_le(pos, path->l[0].b->key.k.p))
+				return;
+		} else {
+			if (bkey_eq(pos, path->pos))
+				return;
+		}
+	}
+
+	bch2_dump_trans_paths_updates(trans);
+	bch2_bpos_to_text(&buf, pos);
+
+	panic("not locked: %s %s%s\n",
+	      bch2_btree_ids[id], buf.buf,
+	      key_cache ? " cached" : "");
+}
+
+#else
+
+/* Without CONFIG_BCACHEFS_DEBUG the verification hooks are empty stubs: */
+static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
+						struct btree_path *path, unsigned l) {}
+static inline void bch2_btree_path_verify(struct btree_trans *trans,
+					  struct btree_path *path) {}
+static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
+static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
+static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
+
+#endif
+
+/* Btree path: fixups after btree updates */
+
+/*
+ * Point the node iterator's entry for bset @t at key @k.
+ *
+ * If the iterator already has an entry for @t (matched by end offset), its
+ * position is updated and the iterator re-sorted; otherwise a new entry
+ * running from @k to the end of @t is pushed.
+ */
+static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
+					struct btree *b,
+					struct bset_tree *t,
+					struct bkey_packed *k)
+{
+	struct btree_node_iter_set *set;
+
+	btree_node_iter_for_each(iter, set)
+		if (set->end == t->end_offset) {
+			set->k = __btree_node_key_to_offset(b, k);
+			bch2_btree_node_iter_sort(iter, b);
+			return;
+		}
+
+	bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
+}
+
+/*
+ * Fix up @path after the key at @where in node @b was modified in place: if
+ * @where is the key the path's node iterator currently points at and it now
+ * compares before path->pos, advance the iterator past it.
+ */
+static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
+					       struct btree *b,
+					       struct bkey_packed *where)
+{
+	struct btree_path_level *l = &path->l[b->c.level];
+
+	if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
+		return;
+
+	if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
+		bch2_btree_node_iter_advance(&l->iter, l->b);
+}
+
+/*
+ * Apply __bch2_btree_path_fix_key_modified() to every path in @trans that
+ * points at node @b, verifying each path's level afterwards.
+ */
+void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
+				      struct btree *b,
+				      struct bkey_packed *where)
+{
+	struct btree_path *path;
+
+	trans_for_each_path_with_node(trans, b, path) {
+		__bch2_btree_path_fix_key_modified(path, b, where);
+		bch2_btree_path_verify_level(trans, path, b->c.level);
+	}
+}
+
+/*
+ * Fix up @node_iter after bset @t of node @b was modified at @where:
+ * @clobber_u64s u64s starting at @where were replaced by @new_u64s u64s, so
+ * everything after them in the bset moved by (new_u64s - clobber_u64s).
+ *
+ * @path supplies the position (path->pos) the iterator is meant to be at.
+ */
+static void __bch2_btree_node_iter_fix(struct btree_path *path,
+				       struct btree *b,
+				       struct btree_node_iter *node_iter,
+				       struct bset_tree *t,
+				       struct bkey_packed *where,
+				       unsigned clobber_u64s,
+				       unsigned new_u64s)
+{
+	const struct bkey_packed *end = btree_bkey_last(b, t);
+	struct btree_node_iter_set *set;
+	unsigned offset = __btree_node_key_to_offset(b, where);
+	int shift = new_u64s - clobber_u64s;
+	/* end offset of @t before the modification; used to find its entry */
+	unsigned old_end = t->end_offset - shift;
+	unsigned orig_iter_pos = node_iter->data[0].k;
+	bool iter_current_key_modified =
+		orig_iter_pos >= offset &&
+		orig_iter_pos <= offset + clobber_u64s;
+
+	btree_node_iter_for_each(node_iter, set)
+		if (set->end == old_end)
+			goto found;
+
+	/* didn't find the bset in the iterator - might have to readd it: */
+	if (new_u64s &&
+	    bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
+		bch2_btree_node_iter_push(node_iter, b, where, end);
+		goto fixup_done;
+	} else {
+		/* Iterator is after key that changed */
+		return;
+	}
+found:
+	set->end = t->end_offset;
+
+	/* Iterator hasn't gotten to the key that changed yet: */
+	if (set->k < offset)
+		return;
+
+	if (new_u64s &&
+	    bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
+		/* new key is at or after path->pos: point at it */
+		set->k = offset;
+	} else if (set->k < offset + clobber_u64s) {
+		/* was inside the clobbered range: move just past the new key */
+		set->k = offset + new_u64s;
+		if (set->k == set->end)
+			bch2_btree_node_iter_set_drop(node_iter, set);
+	} else {
+		/* Iterator is after key that changed */
+		set->k = (int) set->k + shift;
+		return;
+	}
+
+	bch2_btree_node_iter_sort(node_iter, b);
+fixup_done:
+	if (node_iter->data[0].k != orig_iter_pos)
+		iter_current_key_modified = true;
+
+	/*
+	 * When a new key is added, and the node iterator now points to that
+	 * key, the iterator might have skipped past deleted keys that should
+	 * come after the key the iterator now points to. We have to rewind to
+	 * before those deleted keys - otherwise
+	 * bch2_btree_node_iter_prev_all() breaks:
+	 */
+	if (!bch2_btree_node_iter_end(node_iter) &&
+	    iter_current_key_modified &&
+	    b->c.level) {
+		struct bkey_packed *k, *k2, *p;
+
+		k = bch2_btree_node_iter_peek_all(node_iter, b);
+
+		for_each_bset(b, t) {
+			bool set_pos = false;
+
+			/* skip the bset that holds the current key */
+			if (node_iter->data[0].end == t->end_offset)
+				continue;
+
+			k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
+
+			/* walk back over keys in this bset that sort after k */
+			while ((p = bch2_bkey_prev_all(b, t, k2)) &&
+			       bkey_iter_cmp(b, k, p) < 0) {
+				k2 = p;
+				set_pos = true;
+			}
+
+			if (set_pos)
+				btree_node_iter_set_set_pos(node_iter,
+							    b, t, k2);
+		}
+	}
+}
+
+/*
+ * Fix up node iterators after the key at @where in node @b was changed
+ * (@clobber_u64s u64s replaced by @new_u64s u64s).
+ *
+ * @node_iter is fixed first if it is not @path's own iterator for b's level;
+ * then the iterator of every path in @trans that points at @b is fixed.
+ */
+void bch2_btree_node_iter_fix(struct btree_trans *trans,
+			      struct btree_path *path,
+			      struct btree *b,
+			      struct btree_node_iter *node_iter,
+			      struct bkey_packed *where,
+			      unsigned clobber_u64s,
+			      unsigned new_u64s)
+{
+	struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
+	struct btree_path *linked;
+
+	if (node_iter != &path->l[b->c.level].iter) {
+		__bch2_btree_node_iter_fix(path, b, node_iter, t,
+					   where, clobber_u64s, new_u64s);
+
+		if (bch2_debug_check_iterators)
+			bch2_btree_node_iter_verify(node_iter, b);
+	}
+
+	trans_for_each_path_with_node(trans, b, linked) {
+		__bch2_btree_node_iter_fix(linked, b,
+					   &linked->l[b->c.level].iter, t,
+					   where, clobber_u64s, new_u64s);
+		bch2_btree_path_verify_level(trans, linked, b->c.level);
+	}
+}
+
+/* Btree path level: pointer to a particular btree node and node iter */
+
+/*
+ * Unpack packed key @k from the node at level @l into @u and return it as a
+ * bkey_s_c.  A NULL @k yields bkey_s_c_null, with u->type set to
+ * KEY_TYPE_deleted.
+ */
+static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
+						  struct btree_path_level *l,
+						  struct bkey *u,
+						  struct bkey_packed *k)
+{
+	if (likely(k))
+		return bkey_disassemble(l->b, k, u);
+
+	/*
+	 * signal to bch2_btree_iter_peek_slot() that we're currently at
+	 * a hole
+	 */
+	u->type = KEY_TYPE_deleted;
+	return bkey_s_c_null;
+}
+
+/*
+ * Unpack the key the level's node iterator currently points at, as found by
+ * bch2_btree_node_iter_peek_all().
+ */
+static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
+							struct btree_path_level *l,
+							struct bkey *u)
+{
+	struct bkey_packed *packed = bch2_btree_node_iter_peek_all(&l->iter, l->b);
+
+	return __btree_iter_unpack(c, l, u, packed);
+}
+
+/*
+ * Peek the key at level @l via bch2_btree_node_iter_peek() and move
+ * path->pos to it; if there is no key, path->pos becomes the position of the
+ * node's key (l->b->key.k.p).
+ *
+ * Marks the transaction's paths as unsorted, since path->pos changed.
+ */
+static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
+						    struct btree_path *path,
+						    struct btree_path_level *l,
+						    struct bkey *u)
+{
+	struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
+			bch2_btree_node_iter_peek(&l->iter, l->b));
+
+	path->pos = k.k ? k.k->p : l->b->key.k.p;
+	trans->paths_sorted = false;
+	bch2_btree_path_verify_level(trans, path, l - path->l);
+	return k;
+}
+
+/*
+ * Step the node iterator at level @l back with bch2_btree_node_iter_prev()
+ * and move path->pos to the key found; if there is no key, path->pos becomes
+ * the node's min_key.
+ *
+ * Marks the transaction's paths as unsorted, since path->pos changed.
+ */
+static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
+						    struct btree_path *path,
+						    struct btree_path_level *l,
+						    struct bkey *u)
+{
+	struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
+			bch2_btree_node_iter_prev(&l->iter, l->b));
+
+	path->pos = k.k ? k.k->p : l->b->data->min_key;
+	trans->paths_sorted = false;
+	bch2_btree_path_verify_level(trans, path, l - path->l);
+	return k;
+}
+
+/*
+ * Advance the node iterator at level @l until it no longer points at a key
+ * that compares before path->pos.
+ *
+ * If @max_advance is positive it bounds the number of steps: returns false
+ * if the bound is hit while still short of path->pos.  Returns true once the
+ * iterator is in position (or has run out of keys).
+ */
+static inline bool btree_path_advance_to_pos(struct btree_path *path,
+					     struct btree_path_level *l,
+					     int max_advance)
+{
+	int steps = 0;
+
+	for (;;) {
+		struct bkey_packed *cur =
+			bch2_btree_node_iter_peek_all(&l->iter, l->b);
+
+		if (!cur || bkey_iter_pos_cmp(l->b, cur, &path->pos) >= 0)
+			break;
+
+		if (max_advance > 0 && steps >= max_advance)
+			return false;
+
+		bch2_btree_node_iter_advance(&l->iter, l->b);
+		steps++;
+	}
+
+	return true;
+}
+
+/*
+ * (Re)initialize the node iterator at @level of @path to path->pos, using the
+ * node already stored in path->l[level].b.
+ */
+static inline void __btree_path_level_init(struct btree_path *path,
+					   unsigned level)
+{
+	struct btree_path_level *l = &path->l[level];
+
+	bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
+
+	/*
+	 * Iterators to interior nodes should always be pointed at the first non
+	 * whiteout:
+	 */
+	if (level)
+		bch2_btree_node_iter_peek(&l->iter, l->b);
+}
+
+/*
+ * Point @path at node @b for b's level: record the node's current lock
+ * sequence number and the node pointer, then initialize the node iterator.
+ *
+ * Must not be called on a cached path; path->pos is expected to lie within
+ * @b (checked with EBUG_ON).
+ */
+void bch2_btree_path_level_init(struct btree_trans *trans,
+				struct btree_path *path,
+				struct btree *b)
+{
+	BUG_ON(path->cached);
+
+	EBUG_ON(!btree_path_pos_in_node(path, b));
+
+	path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
+	path->l[b->c.level].b = b;
+	__btree_path_level_init(path, b->c.level);
+}
+
+/* Btree path: fixups after btree node updates: */
+
+/*
+ * Refresh the cached "old" key/value of every pending update in @trans that
+ * falls inside node @b: non-cached updates at b's level and btree whose
+ * position lies within [min_key, max_key] of the node.
+ *
+ * The old value is re-read through the update's path; while journal replay
+ * has not finished, a key found in the journal keys at that position takes
+ * precedence.
+ */
+static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_insert_entry *i;
+
+	trans_for_each_update(trans, i)
+		if (!i->cached &&
+		    i->level	== b->c.level &&
+		    i->btree_id	== b->c.btree_id &&
+		    bpos_cmp(i->k->k.p, b->data->min_key) >= 0 &&
+		    bpos_cmp(i->k->k.p, b->data->max_key) <= 0) {
+			i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v;
+
+			if (unlikely(trans->journal_replay_not_finished)) {
+				struct bkey_i *j_k =
+					bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
+								    i->k->k.p);
+
+				if (j_k) {
+					i->old_k = j_k->k;
+					i->old_v = &j_k->v;
+				}
+			}
+		}
+}
+
+/*
+ * A btree node is being replaced - update the iterator to point to the new
+ * node:
+ *
+ * Every up-to-date, non-cached path whose position lies inside @b is
+ * switched over.  If the path wants a lock at b's level, whatever it holds
+ * at that level is dropped and a reference of the wanted type is taken on
+ * b's lock with six_lock_increment().  The level is then re-initialized to
+ * point at @b.  Finally, pending updates that fall inside @b are refreshed.
+ */
+void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
+{
+	struct btree_path *path;
+
+	trans_for_each_path(trans, path)
+		if (path->uptodate == BTREE_ITER_UPTODATE &&
+		    !path->cached &&
+		    btree_path_pos_in_node(path, b)) {
+			enum btree_node_locked_type t =
+				btree_lock_want(path, b->c.level);
+
+			if (t != BTREE_NODE_UNLOCKED) {
+				btree_node_unlock(trans, path, b->c.level);
+				six_lock_increment(&b->c.lock, (enum six_lock_type) t);
+				mark_btree_node_locked(trans, path, b->c.level, t);
+			}
+
+			bch2_btree_path_level_init(trans, path, b);
+		}
+
+	bch2_trans_revalidate_updates_in_node(trans, b);
+}
+
+/*
+ * A btree node has been modified in such a way as to invalidate iterators - fix
+ * them:
+ *
+ * The node iterator of every path pointing at @b is re-initialized to the
+ * path's position, and pending updates that fall inside @b are refreshed.
+ */
+void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
+{
+	struct btree_path *path;
+
+	trans_for_each_path_with_node(trans, b, path)
+		__btree_path_level_init(path, b->c.level);
+
+	bch2_trans_revalidate_updates_in_node(trans, b);
+}
+
+/* Btree path: traverse, set_pos: */
+
+/*
+ * Lock the root node of the path's btree and make it the path's top level.
+ *
+ * The root pointer is read without a lock, so after locking we recheck that
+ * the node is still the root and still at the level we read; if not, the
+ * lock is dropped and the whole sequence retried.
+ *
+ * Returns:
+ *   0  - root locked; path->level is the root's level and that level of the
+ *        path is initialized (levels below are set to an error pointer,
+ *        levels above to NULL)
+ *   1  - the root is below @depth_want; path->level is set to @depth_want
+ *        and levels from there up are set to NULL
+ *   <0 - a transaction restart error from taking the lock
+ */
+static inline int btree_path_lock_root(struct btree_trans *trans,
+				       struct btree_path *path,
+				       unsigned depth_want,
+				       unsigned long trace_ip)
+{
+	struct bch_fs *c = trans->c;
+	struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b;
+	enum six_lock_type lock_type;
+	unsigned i;
+	int ret;
+
+	EBUG_ON(path->nodes_locked);
+
+	while (1) {
+		b = READ_ONCE(*rootp);
+		path->level = READ_ONCE(b->c.level);
+
+		if (unlikely(path->level < depth_want)) {
+			/*
+			 * the root is at a lower depth than the depth we want:
+			 * got to the end of the btree, or we're walking nodes
+			 * greater than some depth and there are no nodes >=
+			 * that depth
+			 */
+			path->level = depth_want;
+			for (i = path->level; i < BTREE_MAX_DEPTH; i++)
+				path->l[i].b = NULL;
+			return 1;
+		}
+
+		lock_type = __btree_lock_want(path, path->level);
+		ret = btree_node_lock(trans, path, &b->c,
+				      path->level, lock_type, trace_ip);
+		if (unlikely(ret)) {
+			/* root changed while locking: start over with the new root */
+			if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
+				continue;
+			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+				return ret;
+			BUG();
+		}
+
+		/* recheck under the lock that b is still the root we sampled */
+		if (likely(b == READ_ONCE(*rootp) &&
+			   b->c.level == path->level &&
+			   !race_fault())) {
+			for (i = 0; i < path->level; i++)
+				path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
+			path->l[path->level].b = b;
+			for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
+				path->l[i].b = NULL;
+
+			mark_btree_node_locked(trans, path, path->level,
+					       (enum btree_node_locked_type) lock_type);
+			bch2_btree_path_level_init(trans, path, b);
+			return 0;
+		}
+
+		six_unlock_type(&b->c.lock, lock_type);
+	}
+}
+
+/*
+ * Prefetch the child nodes that follow the path's current position in the
+ * node at path->level.
+ *
+ * The number of nodes prefetched depends on whether BCH_FS_STARTED is set
+ * and on the level: once started, 2 (or 0 when path->level > 1); before
+ * that, 16 (or 1 when path->level > 1).
+ *
+ * Works on a copy of the node iterator, so the path's own iterator is left
+ * alone.  Stops early if the node can't be relocked, the node runs out of
+ * keys, or a prefetch fails.  If the level was not locked on entry it is
+ * unlocked again before returning.
+ *
+ * Returns 0 or the error from bch2_btree_node_prefetch().
+ */
+noinline
+static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_path_level *l = path_l(path);
+	struct btree_node_iter node_iter = l->iter;
+	struct bkey_packed *k;
+	struct bkey_buf tmp;
+	unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
+		? (path->level > 1 ? 0 :  2)
+		: (path->level > 1 ? 1 : 16);
+	bool was_locked = btree_node_locked(path, path->level);
+	int ret = 0;
+
+	bch2_bkey_buf_init(&tmp);
+
+	while (nr-- && !ret) {
+		if (!bch2_btree_node_relock(trans, path, path->level))
+			break;
+
+		bch2_btree_node_iter_advance(&node_iter, l->b);
+		k = bch2_btree_node_iter_peek(&node_iter, l->b);
+		if (!k)
+			break;
+
+		bch2_bkey_buf_unpack(&tmp, c, l->b, k);
+		ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
+					       path->level - 1);
+	}
+
+	if (!was_locked)
+		btree_node_unlock(trans, path, path->level);
+
+	bch2_bkey_buf_exit(&tmp, c);
+	return ret;
+}
+
+/*
+ * Variant of btree_path_prefetch() that takes the keys to prefetch from the
+ * combined btree-and-journal iterator @jiter, which is advanced as a side
+ * effect.
+ *
+ * The number of nodes prefetched, the early-exit conditions, the lock
+ * handling and the return value are the same as in btree_path_prefetch().
+ */
+static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
+				 struct btree_and_journal_iter *jiter)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_s_c k;
+	struct bkey_buf tmp;
+	unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
+		? (path->level > 1 ? 0 :  2)
+		: (path->level > 1 ? 1 : 16);
+	bool was_locked = btree_node_locked(path, path->level);
+	int ret = 0;
+
+	bch2_bkey_buf_init(&tmp);
+
+	while (nr-- && !ret) {
+		if (!bch2_btree_node_relock(trans, path, path->level))
+			break;
+
+		bch2_btree_and_journal_iter_advance(jiter);
+		k = bch2_btree_and_journal_iter_peek(jiter);
+		if (!k.k)
+			break;
+
+		bch2_bkey_buf_reassemble(&tmp, c, k);
+		ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
+					       path->level - 1);
+	}
+
+	if (!was_locked)
+		btree_node_unlock(trans, path, path->level);
+
+	bch2_bkey_buf_exit(&tmp, c);
+	return ret;
+}
+
+/*
+ * Store the in-memory address of node @b in the mem_ptr field of the
+ * btree_ptr_v2 key that the path's iterator at level @plevel points at.
+ *
+ * Nothing is done if level @plevel can't be relocked.  If it was not locked
+ * on entry, it is unlocked again before returning.
+ */
+static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
+					    struct btree_path *path,
+					    unsigned plevel, struct btree *b)
+{
+	struct btree_path_level *l = &path->l[plevel];
+	bool locked = btree_node_locked(path, plevel);
+	struct bkey_packed *k;
+	struct bch_btree_ptr_v2 *bp;
+
+	if (!bch2_btree_node_relock(trans, path, plevel))
+		return;
+
+	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
+	BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
+
+	bp = (void *) bkeyp_val(&l->b->format, k);
+	bp->mem_ptr = (unsigned long)b;
+
+	if (!locked)
+		btree_node_unlock(trans, path, plevel);
+}
+
+/*
+ * Peek the key at the path's current position through a combined
+ * btree-and-journal iterator and copy it into @out.
+ *
+ * If BTREE_ITER_PREFETCH is set in @flags, the following nodes are also
+ * prefetched via btree_path_prefetch_j().
+ *
+ * Returns 0, or the error from prefetching.
+ */
+static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
+						     struct btree_path *path,
+						     unsigned flags,
+						     struct bkey_buf *out)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_path_level *l = path_l(path);
+	struct btree_and_journal_iter jiter;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	__bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
+
+	k = bch2_btree_and_journal_iter_peek(&jiter);
+
+	bch2_bkey_buf_reassemble(out, c, k);
+
+	if (flags & BTREE_ITER_PREFETCH)
+		ret = btree_path_prefetch_j(trans, path, &jiter);
+
+	bch2_btree_and_journal_iter_exit(&jiter);
+	return ret;
+}
+
+/*
+ * Descend @path one level: look up the child pointer at the current
+ * position, get and lock the child node, and make it the path's new level.
+ *
+ * While journal replay has not finished, the child pointer is read through
+ * the combined btree-and-journal iterator; otherwise straight from the node
+ * iterator.  With BTREE_ITER_PREFETCH set in @flags, sibling nodes are
+ * prefetched as well.
+ *
+ * After the child is obtained, a read lock held on the parent level is
+ * dropped.
+ *
+ * The caller must hold a lock on the current level.  Returns 0 on success,
+ * or the error from prefetching or from bch2_btree_node_get(), in which
+ * case the path's level is left unchanged.
+ */
+static __always_inline int btree_path_down(struct btree_trans *trans,
+					   struct btree_path *path,
+					   unsigned flags,
+					   unsigned long trace_ip)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_path_level *l = path_l(path);
+	struct btree *b;
+	unsigned level = path->level - 1;
+	enum six_lock_type lock_type = __btree_lock_want(path, level);
+	struct bkey_buf tmp;
+	int ret;
+
+	EBUG_ON(!btree_node_locked(path, path->level));
+
+	bch2_bkey_buf_init(&tmp);
+
+	if (unlikely(trans->journal_replay_not_finished)) {
+		ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
+		if (ret)
+			goto err;
+	} else {
+		bch2_bkey_buf_unpack(&tmp, c, l->b,
+				 bch2_btree_node_iter_peek(&l->iter, l->b));
+
+		if (flags & BTREE_ITER_PREFETCH) {
+			ret = btree_path_prefetch(trans, path);
+			if (ret)
+				goto err;
+		}
+	}
+
+	b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
+	ret = PTR_ERR_OR_ZERO(b);
+	if (unlikely(ret))
+		goto err;
+
+	/* record b's address in the parent's key if it isn't already there */
+	if (likely(!trans->journal_replay_not_finished &&
+		   tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
+	    unlikely(b != btree_node_mem_ptr(tmp.k)))
+		btree_node_mem_ptr_set(trans, path, level + 1, b);
+
+	if (btree_node_read_locked(path, level + 1))
+		btree_node_unlock(trans, path, level + 1);
+
+	mark_btree_node_locked(trans, path, level,
+			       (enum btree_node_locked_type) lock_type);
+	path->level = level;
+	bch2_btree_path_level_init(trans, path, b);
+
+	bch2_btree_path_verify_locks(path);
+err:
+	bch2_bkey_buf_exit(&tmp, c);
+	return ret;
+}
+
+
+/*
+ * Unlock the whole transaction and re-traverse, in sorted order, every path
+ * whose uptodate field is nonzero.
+ *
+ * Returns -BCH_ERR_transaction_restart_in_traverse_all if called while
+ * trans->in_traverse_all is already set, 0 on success, or the first traversal
+ * error that is neither a transaction restart nor ENOMEM (those two restart
+ * the whole pass from retry_all).
+ */
+static int bch2_btree_path_traverse_all(struct btree_trans *trans)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_path *path;
+	unsigned long trace_ip = _RET_IP_;
+	int i, ret = 0;
+
+	if (trans->in_traverse_all)
+		return -BCH_ERR_transaction_restart_in_traverse_all;
+
+	trans->in_traverse_all = true;
+retry_all:
+	/* Each pass starts with the restart state cleared: */
+	trans->restarted = 0;
+	trans->last_restarted_ip = 0;
+
+	trans_for_each_path(trans, path)
+		path->should_be_locked = false;
+
+	btree_trans_sort_paths(trans);
+
+	bch2_trans_unlock(trans);
+	cond_resched();
+
+	if (unlikely(trans->memory_allocation_failure)) {
+		struct closure cl;
+
+		closure_init_stack(&cl);
+
+		/*
+		 * Keep trying until bch2_btree_cache_cannibalize_lock()
+		 * returns 0, waiting on the closure after each attempt:
+		 */
+		do {
+			ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+			closure_sync(&cl);
+		} while (ret);
+	}
+
+	/* Now, redo traversals in correct order: */
+	i = 0;
+	while (i < trans->nr_sorted) {
+		path = trans->paths + trans->sorted[i];
+
+		/*
+		 * Traversing a path can cause another path to be added at about
+		 * the same position:
+		 */
+		if (path->uptodate) {
+			/* Hold a reference on the path across the traverse: */
+			__btree_path_get(path, false);
+			ret = bch2_btree_path_traverse_one(trans, path, 0, _THIS_IP_);
+			__btree_path_put(path, false);
+
+			if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+			    bch2_err_matches(ret, ENOMEM))
+				goto retry_all;
+			if (ret)
+				goto err;
+		} else {
+			/* Only move on once sorted[i] no longer needs traversing */
+			i++;
+		}
+	}
+
+	/*
+	 * We used to assert that all paths had been traversed here
+	 * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since
+	 * path->should_be_locked is not set yet, we might have unlocked and
+	 * then failed to relock a path - that's fine.
+	 */
+err:
+	bch2_btree_cache_cannibalize_unlock(c);
+
+	trans->in_traverse_all = false;
+
+	trace_and_count(c, trans_traverse_all, trans, trace_ip);
+	return ret;
+}
+
+/*
+ * Check path->pos against the node at level @l in the direction selected by
+ * @check_pos: negative checks that pos is not before the node, positive that
+ * it is not after the node, zero performs no check and returns true.
+ */
+static inline bool btree_path_check_pos_in_node(struct btree_path *path,
+						unsigned l, int check_pos)
+{
+	if (check_pos < 0)
+		return !btree_path_pos_before_node(path, path->l[l].b);
+
+	if (check_pos > 0)
+		return !btree_path_pos_after_node(path, path->l[l].b);
+
+	return true;
+}
+
+/*
+ * A level is "good" when it holds a btree node, that node can be relocked,
+ * and path->pos passes btree_path_check_pos_in_node() for @check_pos.  The
+ * checks run in that order and stop at the first failure.
+ */
+static inline bool btree_path_good_node(struct btree_trans *trans,
+					struct btree_path *path,
+					unsigned l, int check_pos)
+{
+	if (!is_btree_node(path, l))
+		return false;
+
+	if (!bch2_btree_node_relock(trans, path, l))
+		return false;
+
+	return btree_path_check_pos_in_node(path, l, check_pos);
+}
+
+/*
+ * Lower path->level to @new_level, unlock every level above it for which
+ * btree_lock_want() reports BTREE_NODE_UNLOCKED, and mark the path as needing
+ * traversal.
+ */
+static void btree_path_set_level_down(struct btree_trans *trans,
+				      struct btree_path *path,
+				      unsigned new_level)
+{
+	unsigned above;
+
+	path->level = new_level;
+
+	for (above = new_level + 1; above < BTREE_MAX_DEPTH; above++) {
+		if (btree_lock_want(path, above) != BTREE_NODE_UNLOCKED)
+			continue;
+
+		btree_node_unlock(trans, path, above);
+	}
+
+	btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+	bch2_btree_path_verify(trans, path);
+}
+
+/*
+ * Slow path of btree_path_up_until_good_node(): starting at path->level, call
+ * __btree_path_set_level_up() for every level that has a node but fails
+ * btree_path_good_node(), then try to relock the levels above, up to
+ * path->locks_want.  If one of those relocks fails, every level up to and
+ * including the failing one is walked up as well and the search starts over
+ * from the level above it.
+ *
+ * Returns the level reached.
+ */
+static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans,
+							 struct btree_path *path,
+							 int check_pos)
+{
+	unsigned i, l = path->level;
+again:
+	while (btree_path_node(path, l) &&
+	       !btree_path_good_node(trans, path, l, check_pos))
+		__btree_path_set_level_up(trans, path, l++);
+
+	/* If we need intent locks, take them too: */
+	for (i = l + 1;
+	     i < path->locks_want && btree_path_node(path, i);
+	     i++)
+		if (!bch2_btree_node_relock(trans, path, i)) {
+			/* Relock failed at level i: give up levels l..i, retry above */
+			while (l <= i)
+				__btree_path_set_level_up(trans, path, l++);
+			goto again;
+		}
+
+	return l;
+}
+
+/*
+ * Fast path: if the current level is locked and path->pos passes the
+ * @check_pos test against its node, the current level is returned as is;
+ * otherwise defer to __btree_path_up_until_good_node().
+ */
+static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
+						     struct btree_path *path,
+						     int check_pos)
+{
+	if (likely(btree_node_locked(path, path->level) &&
+		   btree_path_check_pos_in_node(path, path->level, check_pos)))
+		return path->level;
+
+	return __btree_path_up_until_good_node(trans, path, check_pos);
+}
+
+/*
+ * This is the main state machine for walking down the btree - walks down to a
+ * specified depth
+ *
+ * Returns 0 on success (including when the end of the btree is reached before
+ * the wanted depth), or a nonzero error: -trans->restarted if the transaction
+ * is already restarting, otherwise whatever the relock, cached-traverse or
+ * descent helper returned.
+ *
+ * When descending fails, the path is unlocked and the error is stashed as an
+ * ERR_PTR in path->l[depth_want].b.
+ */
+int bch2_btree_path_traverse_one(struct btree_trans *trans,
+				 struct btree_path *path,
+				 unsigned flags,
+				 unsigned long trace_ip)
+{
+	unsigned depth_want = path->level;
+	/* A transaction that is already restarting fails straight away: */
+	int ret = -((int) trans->restarted);
+
+	if (unlikely(ret))
+		goto out;
+
+	/*
+	 * Ensure we obey path->should_be_locked: if it's set, we can't unlock
+	 * and re-traverse the path without a transaction restart:
+	 */
+	if (path->should_be_locked) {
+		ret = bch2_btree_path_relock(trans, path, trace_ip);
+		goto out;
+	}
+
+	if (path->cached) {
+		ret = bch2_btree_path_traverse_cached(trans, path, flags);
+		goto out;
+	}
+
+	/* Level out of range: return 0 (ret is still 0) without touching the path */
+	if (unlikely(path->level >= BTREE_MAX_DEPTH))
+		goto out;
+
+	path->level = btree_path_up_until_good_node(trans, path, 0);
+
+	EBUG_ON(btree_path_node(path, path->level) &&
+		!btree_node_locked(path, path->level));
+
+	/*
+	 * Note: path->nodes[path->level] may be temporarily NULL here - that
+	 * would indicate to other code that we got to the end of the btree,
+	 * here it indicates that relocking the root failed - it's critical that
+	 * btree_path_lock_root() comes next and that it can't fail
+	 */
+	while (path->level > depth_want) {
+		ret = btree_path_node(path, path->level)
+			? btree_path_down(trans, path, flags, trace_ip)
+			: btree_path_lock_root(trans, path, depth_want, trace_ip);
+		if (unlikely(ret)) {
+			if (ret == 1) {
+				/*
+				 * No nodes at this level - got to the end of
+				 * the btree:
+				 */
+				ret = 0;
+				goto out;
+			}
+
+			__bch2_btree_path_unlock(trans, path);
+			path->level = depth_want;
+			path->l[path->level].b = ERR_PTR(ret);
+			goto out;
+		}
+	}
+
+	path->uptodate = BTREE_ITER_UPTODATE;
+out:
+	/*
+	 * ret must be a transaction restart error if and only if
+	 * trans->restarted is set; anything else is a bug:
+	 */
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
+		panic("ret %s (%i) trans->restarted %s (%i)\n",
+		      bch2_err_str(ret), ret,
+		      bch2_err_str(trans->restarted), trans->restarted);
+	bch2_btree_path_verify(trans, path);
+	return ret;
+}
+
+/*
+ * Copy everything from the pos member onwards from @src into @dst (members
+ * laid out before pos are left alone), then call six_lock_increment() for
+ * every level the copy records as locked.
+ */
+static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
+			    struct btree_path *src)
+{
+	const unsigned skip = offsetof(struct btree_path, pos);
+	unsigned lvl;
+
+	memcpy((void *) dst + skip, (void *) src + skip, sizeof(*dst) - skip);
+
+	for (lvl = 0; lvl < BTREE_MAX_DEPTH; lvl++) {
+		unsigned held = btree_node_locked_type(dst, lvl);
+
+		if (held == BTREE_NODE_UNLOCKED)
+			continue;
+
+		six_lock_increment(&dst->l[lvl].b->c.lock, held);
+	}
+}
+
+/*
+ * Allocate a new path (using @src as the list position passed to
+ * btree_path_alloc()), copy @src into it and take one reference on the copy.
+ */
+static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
+					   bool intent)
+{
+	struct btree_path *copy;
+
+	copy = btree_path_alloc(trans, src);
+	btree_path_copy(trans, copy, src);
+	__btree_path_get(copy, intent);
+
+	return copy;
+}
+
+/*
+ * Drop the caller's reference on @path and hand back a clone of it, holding
+ * one reference, with preserve cleared.  @ip is not used here.
+ */
+__flatten
+struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans,
+			 struct btree_path *path, bool intent,
+			 unsigned long ip)
+{
+	struct btree_path *mut;
+
+	__btree_path_put(path, intent);
+
+	mut = btree_path_clone(trans, path, intent);
+	mut->preserve = false;
+
+	return mut;
+}
+
+/*
+ * Move @path to @new_pos.
+ *
+ * @cmp is passed through as the check_pos argument of
+ * btree_path_up_until_good_node(); a negative value also forces the node
+ * iterator to be reinitialized rather than advanced.
+ *
+ * The path is first passed through bch2_btree_path_make_mut(), and the result
+ * of that call is what gets modified and returned - callers must use the
+ * returned pointer in place of @path.
+ */
+struct btree_path * __must_check
+__bch2_btree_path_set_pos(struct btree_trans *trans,
+		   struct btree_path *path, struct bpos new_pos,
+		   bool intent, unsigned long ip, int cmp)
+{
+	unsigned level = path->level;
+
+	bch2_trans_verify_not_in_restart(trans);
+	EBUG_ON(!path->ref);
+
+	path = bch2_btree_path_make_mut(trans, path, intent, ip);
+
+	path->pos		= new_pos;
+	trans->paths_sorted	= false;
+
+	/* Cached path: drop level 0 and force a re-traverse */
+	if (unlikely(path->cached)) {
+		btree_node_unlock(trans, path, 0);
+		path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
+		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+		goto out;
+	}
+
+	level = btree_path_up_until_good_node(trans, path, cmp);
+
+	if (btree_path_node(path, level)) {
+		struct btree_path_level *l = &path->l[level];
+
+		BUG_ON(!btree_node_locked(path, level));
+		/*
+		 * We might have to skip over many keys, or just a few: try
+		 * advancing the node iterator, and if we have to skip over too
+		 * many keys just reinit it (or if we're rewinding, since that
+		 * is expensive).
+		 */
+		if (cmp < 0 ||
+		    !btree_path_advance_to_pos(path, l, 8))
+			bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
+
+		/*
+		 * Iterators to interior nodes should always be pointed at the first non
+		 * whiteout:
+		 */
+		if (unlikely(level))
+			bch2_btree_node_iter_peek(&l->iter, l->b);
+	}
+
+	/* Had to walk up: mark the path for re-traversal and drop its locks */
+	if (unlikely(level != path->level)) {
+		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+		__bch2_btree_path_unlock(trans, path);
+	}
+out:
+	bch2_btree_path_verify(trans, path);
+	return path;
+}
+
+/* Btree path: main interface: */
+
+/*
+ * Return a neighbour of @path in the sorted path list (previous one checked
+ * first, then the next one) that compares equal to it under btree_path_cmp(),
+ * or NULL if neither does.
+ */
+static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
+{
+	struct btree_path *neighbour;
+
+	neighbour = prev_btree_path(trans, path);
+	if (!neighbour || btree_path_cmp(neighbour, path)) {
+		neighbour = next_btree_path(trans, path);
+		if (neighbour && btree_path_cmp(neighbour, path))
+			neighbour = NULL;
+	}
+
+	return neighbour;
+}
+
+/*
+ * Return a neighbour of @path in the sorted path list (previous one checked
+ * first, then the next one) that is at the same level and points at the same
+ * node at that level, or NULL if neither does.
+ */
+static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
+{
+	struct btree_path *cand = prev_btree_path(trans, path);
+
+	if (cand &&
+	    cand->level == path->level &&
+	    path_l(cand)->b == path_l(path)->b)
+		return cand;
+
+	cand = next_btree_path(trans, path);
+	if (!cand)
+		return NULL;
+	if (cand->level != path->level)
+		return NULL;
+
+	return path_l(cand)->b == path_l(path)->b ? cand : NULL;
+}
+
+/*
+ * Unlock @path, remove it from the transaction's path list and clear its bit
+ * in trans->paths_allocated.
+ */
+static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path)
+{
+	__bch2_btree_path_unlock(trans, path);
+	btree_path_list_remove(trans, path);
+
+	trans->paths_allocated = trans->paths_allocated & ~(1ULL << path->idx);
+}
+
+/*
+ * Drop a reference on @path and, if __btree_path_put() returns nonzero
+ * (presumably: that was the last reference - confirm against its definition),
+ * free the path unless it has to be kept around.
+ *
+ * A duplicate is looked up among the path's neighbours: one at the same
+ * position if the path is marked preserve, otherwise one pointing at the same
+ * node.  The path is kept when:
+ *  - there is no duplicate and the path is either marked preserve or still
+ *    points at a btree node, or
+ *  - the path is should_be_locked, the transaction is not restarting, and
+ *    there is no duplicate that can be relocked.
+ *
+ * Otherwise preserve and should_be_locked are folded into the duplicate (if
+ * any) and the path is freed.
+ */
+void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
+{
+	struct btree_path *dup;
+
+	EBUG_ON(trans->paths + path->idx != path);
+	EBUG_ON(!path->ref);
+
+	if (!__btree_path_put(path, intent))
+		return;
+
+	dup = path->preserve
+		? have_path_at_pos(trans, path)
+		: have_node_at_pos(trans, path);
+
+	/* i.e. !dup && (path->preserve || is_btree_node(path, path->level)) */
+	if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
+		return;
+
+	if (path->should_be_locked &&
+	    !trans->restarted &&
+	    (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_)))
+		return;
+
+	if (dup) {
+		dup->preserve		|= path->preserve;
+		dup->should_be_locked	|= path->should_be_locked;
+	}
+
+	__bch2_path_free(trans, path);
+}
+
+/*
+ * Like bch2_path_put(), but without any of the "keep the path around" logic:
+ * the path is freed whenever __btree_path_put() returns nonzero.
+ */
+static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path,
+				 bool intent)
+{
+	EBUG_ON(trans->paths + path->idx != path);
+	EBUG_ON(!path->ref);
+
+	if (__btree_path_put(path, intent))
+		__bch2_path_free(trans, path);
+}
+
+/*
+ * Fatal: panics reporting trans->restart_count against the expected
+ * @restart_count, with trans->last_begin_ip printed as the symbolized address.
+ */
+void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
+{
+	void *begin_ip = (void *) trans->last_begin_ip;
+
+	panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
+	      trans->restart_count, restart_count, begin_ip);
+}
+
+/*
+ * Fatal: panics reporting the pending restart reason (trans->restarted) and
+ * the address recorded in trans->last_restarted_ip.
+ */
+void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
+{
+	void *restarted_ip = (void *) trans->last_restarted_ip;
+	const char *reason = bch2_err_str(trans->restarted);
+
+	panic("in transaction restart: %s, last restarted by %pS\n",
+	      reason, restarted_ip);
+}
+
+/*
+ * Print every pending update of @trans to @buf: a header line naming the
+ * transaction function and journal sequence number, then, indented by two,
+ * each btree update (btree, cached flag, allocating caller, old key, new key)
+ * followed by each write buffer update (btree and new key).
+ */
+noinline __cold
+void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
+{
+	struct btree_insert_entry *i;
+	struct btree_write_buffered_key *wb;
+
+	prt_printf(buf, "transaction updates for %s journal seq %llu",
+	       trans->fn, trans->journal_res.seq);
+	prt_newline(buf);
+	printbuf_indent_add(buf, 2);
+
+	trans_for_each_update(trans, i) {
+		struct bkey_s_c old = { &i->old_k, i->old_v };
+
+		prt_printf(buf, "update: btree=%s cached=%u %pS",
+		       bch2_btree_ids[i->btree_id],
+		       i->cached,
+		       (void *) i->ip_allocated);
+		prt_newline(buf);
+
+		prt_printf(buf, "  old ");
+		bch2_bkey_val_to_text(buf, trans->c, old);
+		prt_newline(buf);
+
+		prt_printf(buf, "  new ");
+		bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
+		prt_newline(buf);
+	}
+
+	trans_for_each_wb_update(trans, wb) {
+		/*
+		 * Write buffer entries are printed without a caller address:
+		 * the only ip_allocated in scope belongs to the btree update
+		 * cursor @i, which has already run off the end of the update
+		 * list by the time this loop starts and must not be
+		 * dereferenced here.
+		 */
+		prt_printf(buf, "update: btree=%s wb=1",
+		       bch2_btree_ids[wb->btree]);
+		prt_newline(buf);
+
+		prt_printf(buf, "  new ");
+		bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k));
+		prt_newline(buf);
+	}
+
+	printbuf_indent_sub(buf, 2);
+}
+
+/* Format the transaction's pending updates and print them at KERN_ERR. */
+noinline __cold
+void bch2_dump_trans_updates(struct btree_trans *trans)
+{
+	struct printbuf text = PRINTBUF;
+
+	bch2_trans_updates_to_text(&text, trans);
+	bch2_print_string_as_lines(KERN_ERR, text.buf);
+
+	printbuf_exit(&text);
+}
+
+/*
+ * Print a one-line summary of @path: index, reference counts, a 'P' if it is
+ * marked preserve, an 'S' if should_be_locked, btree, level, position, the
+ * nodes_locked value, and (with TRACK_PATH_ALLOCATED) where it was allocated.
+ */
+noinline __cold
+void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
+{
+	char preserve_mark = path->preserve ? 'P' : ' ';
+	char locked_mark = path->should_be_locked ? 'S' : ' ';
+
+	prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
+		   path->idx, path->ref, path->intent_ref,
+		   preserve_mark, locked_mark,
+		   bch2_btree_ids[path->btree_id],
+		   path->level);
+	bch2_bpos_to_text(out, path->pos);
+
+	prt_printf(out, " locks %u", path->nodes_locked);
+#ifdef TRACK_PATH_ALLOCATED
+	prt_printf(out, " %pS", (void *) path->ip_allocated);
+#endif
+	prt_newline(out);
+}
+
+/*
+ * Print every path of @trans, in sorted-list order, to @out.  The paths are
+ * sorted first unless @nosort is set.
+ */
+static noinline __cold
+void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
+				bool nosort)
+{
+	struct btree_path *cur;
+	unsigned sorted_idx;
+
+	if (!nosort) {
+		btree_trans_sort_paths(trans);
+	}
+
+	trans_for_each_path_inorder(trans, cur, sorted_idx) {
+		bch2_btree_path_to_text(out, cur);
+	}
+}
+
+/* Sort the transaction's paths, then print all of them to @out. */
+noinline __cold
+void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
+{
+	const bool nosort = false;
+
+	__bch2_trans_paths_to_text(out, trans, nosort);
+}
+
+/*
+ * Format the transaction's paths followed by its pending updates and print the
+ * result at KERN_ERR.  @nosort is forwarded to the path printer.
+ */
+static noinline __cold
+void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
+{
+	struct printbuf text = PRINTBUF;
+
+	__bch2_trans_paths_to_text(&text, trans, nosort);
+	bch2_trans_updates_to_text(&text, trans);
+	bch2_print_string_as_lines(KERN_ERR, text.buf);
+
+	printbuf_exit(&text);
+}
+
+/* Dump the transaction's (sorted) paths and pending updates at KERN_ERR. */
+noinline __cold
+void bch2_dump_trans_paths_updates(struct btree_trans *trans)
+{
+	const bool nosort = false;
+
+	__bch2_dump_trans_paths_updates(trans, nosort);
+}
+
+/*
+ * Record a new high-water mark for the number of allocated paths.
+ *
+ * Does nothing if the transaction has no stats.  Otherwise the current paths
+ * are rendered to text and, if rendering did not hit an allocation failure and
+ * the count exceeds the stats' nr_max_paths, the stats' count and saved text
+ * are replaced under the stats lock.  trans->nr_max_paths is updated to the
+ * current count in either case.
+ */
+noinline __cold
+static void bch2_trans_update_max_paths(struct btree_trans *trans)
+{
+	struct btree_transaction_stats *stats = btree_trans_stats(trans);
+	struct printbuf text = PRINTBUF;
+	unsigned nr_paths;
+
+	if (!stats)
+		return;
+
+	bch2_trans_paths_to_text(&text, trans);
+
+	/* paths_allocated is not modified below, so count it once */
+	nr_paths = hweight64(trans->paths_allocated);
+
+	if (!text.allocation_failure) {
+		mutex_lock(&stats->lock);
+		if (stats->nr_max_paths < nr_paths) {
+			trans->nr_max_paths = nr_paths;
+			stats->nr_max_paths = nr_paths;
+			swap(stats->max_paths_text, text.buf);
+		}
+		mutex_unlock(&stats->lock);
+	}
+
+	printbuf_exit(&text);
+
+	trans->nr_max_paths = nr_paths;
+}
+
+/*
+ * Called by btree_path_alloc() when every path slot is in use: dump the
+ * transaction's paths and updates, then panic.
+ */
+static noinline void btree_path_overflow(struct btree_trans *trans)
+{
+	bch2_dump_trans_paths_updates(trans);
+
+	panic("trans path overflow\n");
+}
+
+/*
+ * Claim the lowest free slot in trans->paths, reset its index, reference
+ * counts and nodes_locked, and add it to the path list with @pos as the
+ * position argument.  All other members are left for the caller to fill in.
+ *
+ * Calls btree_path_overflow() (which panics) if all BTREE_ITER_MAX slots are
+ * taken.
+ */
+static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
+						  struct btree_path *pos)
+{
+	struct btree_path *slot;
+	unsigned free_idx;
+
+	if (unlikely(trans->paths_allocated ==
+		     ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
+		btree_path_overflow(trans);
+
+	free_idx = __ffs64(~trans->paths_allocated);
+
+	/*
+	 * The new slot is not initialized yet, so the max-paths bookkeeping
+	 * (which prints all allocated paths) has to run before the slot is
+	 * marked allocated:
+	 */
+	if (unlikely(free_idx > trans->nr_max_paths))
+		bch2_trans_update_max_paths(trans);
+
+	trans->paths_allocated |= 1ULL << free_idx;
+
+	slot = trans->paths + free_idx;
+	slot->idx		= free_idx;
+	slot->ref		= 0;
+	slot->intent_ref	= 0;
+	slot->nodes_locked	= 0;
+
+	btree_path_list_add(trans, pos, slot);
+	trans->paths_sorted = false;
+
+	return slot;
+}
+
+/*
+ * Get a referenced path for (@btree_id, @pos, @level), reusing an existing
+ * path when possible.
+ *
+ * The sorted path list is scanned for the last path that does not compare
+ * greater than the requested position.  If that path matches in cached-ness,
+ * btree and level, it is referenced and repositioned with
+ * bch2_btree_path_set_pos(); otherwise a new path is allocated (with that path
+ * as the list position argument) and initialized as needing traversal.
+ *
+ * Unless BTREE_ITER_NOPRESERVE is set, the path is marked preserve.
+ * locks_want is only ever raised here (see the comment below) and is capped
+ * at BTREE_MAX_DEPTH.
+ */
+struct btree_path *bch2_path_get(struct btree_trans *trans,
+				 enum btree_id btree_id, struct bpos pos,
+				 unsigned locks_want, unsigned level,
+				 unsigned flags, unsigned long ip)
+{
+	struct btree_path *path, *path_pos = NULL;
+	bool cached = flags & BTREE_ITER_CACHED;
+	bool intent = flags & BTREE_ITER_INTENT;
+	int i;
+
+	bch2_trans_verify_not_in_restart(trans);
+	bch2_trans_verify_locks(trans);
+
+	btree_trans_sort_paths(trans);
+
+	/* path_pos ends up as the last path not sorting after the request: */
+	trans_for_each_path_inorder(trans, path, i) {
+		if (__btree_path_cmp(path,
+				     btree_id,
+				     cached,
+				     pos,
+				     level) > 0)
+			break;
+
+		path_pos = path;
+	}
+
+	if (path_pos &&
+	    path_pos->cached	== cached &&
+	    path_pos->btree_id	== btree_id &&
+	    path_pos->level	== level) {
+		__btree_path_get(path_pos, intent);
+		path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
+	} else {
+		path = btree_path_alloc(trans, path_pos);
+		path_pos = NULL;
+
+		__btree_path_get(path, intent);
+		path->pos			= pos;
+		path->btree_id			= btree_id;
+		path->cached			= cached;
+		path->uptodate			= BTREE_ITER_NEED_TRAVERSE;
+		path->should_be_locked		= false;
+		path->level			= level;
+		path->locks_want		= locks_want;
+		path->nodes_locked		= 0;
+		for (i = 0; i < ARRAY_SIZE(path->l); i++)
+			path->l[i].b		= ERR_PTR(-BCH_ERR_no_btree_node_init);
+#ifdef TRACK_PATH_ALLOCATED
+		path->ip_allocated		= ip;
+#endif
+		trans->paths_sorted		= false;
+	}
+
+	if (!(flags & BTREE_ITER_NOPRESERVE))
+		path->preserve = true;
+
+	/* With an intent reference, locks_want must cover at least @level: */
+	if (path->intent_ref)
+		locks_want = max(locks_want, level + 1);
+
+	/*
+	 * If the path has locks_want greater than requested, we don't downgrade
+	 * it here - on transaction restart because btree node split needs to
+	 * upgrade locks, we might be putting/getting the iterator again.
+	 * Downgrading iterators only happens via bch2_trans_downgrade(), after
+	 * a successful transaction commit.
+	 */
+
+	locks_want = min(locks_want, BTREE_MAX_DEPTH);
+	if (locks_want > path->locks_want)
+		bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);
+
+	return path;
+}
+
+/*
+ * Return the key at exactly path->pos; @u is caller-provided storage for the
+ * key header.
+ *
+ * Returns bkey_s_c_null if the path has no node at its level, or - for a
+ * cached path - if the cache entry is missing or not valid.  For a non-cached
+ * path with no key at path->pos, @u is initialized as an empty key at
+ * path->pos and returned with a NULL value.
+ */
+struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
+{
+
+	struct btree_path_level *l = path_l(path);
+	struct bkey_packed *_k;
+	struct bkey_s_c k;
+
+	if (unlikely(!l->b))
+		return bkey_s_c_null;
+
+	EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
+	EBUG_ON(!btree_node_locked(path, path->level));
+
+	if (!path->cached) {
+		_k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
+		k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
+
+		EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos));
+
+		/* Nothing at the iterator, or a key at some other position: */
+		if (!k.k || !bpos_eq(path->pos, k.k->p))
+			goto hole;
+	} else {
+		/* For a cached path, l[0].b holds the key cache entry: */
+		struct bkey_cached *ck = (void *) path->l[0].b;
+
+		EBUG_ON(ck &&
+			(path->btree_id != ck->key.btree_id ||
+			 !bkey_eq(path->pos, ck->key.pos)));
+		if (!ck || !ck->valid)
+			return bkey_s_c_null;
+
+		*u = ck->k->k;
+		k = bkey_i_to_s_c(ck->k);
+	}
+
+	return k;
+hole:
+	bkey_init(u);
+	u->p = path->pos;
+	return (struct bkey_s_c) { u, NULL };
+}
+
+/* Btree iterators: */
+
+/*
+ * Traverse the iterator's path as it currently stands (no repositioning);
+ * returns the result of bch2_btree_path_traverse().
+ */
+int __must_check
+__bch2_btree_iter_traverse(struct btree_iter *iter)
+{
+	struct btree_trans *trans = iter->trans;
+
+	return bch2_btree_path_traverse(trans, iter->path, iter->flags);
+}
+
+/*
+ * Move the iterator's path to the iterator's search key and traverse it.
+ *
+ * Returns 0 on success, in which case the path is marked should_be_locked, or
+ * the error from bch2_btree_path_traverse().
+ */
+int __must_check
+bch2_btree_iter_traverse(struct btree_iter *iter)
+{
+	struct btree_trans *trans = iter->trans;
+	int err;
+
+	iter->path = bch2_btree_path_set_pos(trans, iter->path,
+					btree_iter_search_key(iter),
+					iter->flags & BTREE_ITER_INTENT,
+					btree_iter_ip_allocated(iter));
+
+	err = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+	if (!err)
+		btree_path_set_should_be_locked(iter->path);
+
+	return err;
+}
+
+/* Iterate across nodes (leaf and interior nodes) */
+
+/*
+ * Traverse the iterator's path and return the btree node at the path's level.
+ *
+ * Returns the node, NULL if there is no node at that level, or an ERR_PTR if
+ * traversal failed.  On success iter->pos and iter->k.p are set to the node's
+ * key position, the path is moved there and marked should_be_locked.
+ */
+struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
+{
+	struct btree_trans *trans = iter->trans;
+	struct btree *b = NULL;
+	int ret;
+
+	EBUG_ON(iter->path->cached);
+	bch2_btree_iter_verify(iter);
+
+	ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+	if (ret)
+		goto err;
+
+	b = btree_path_node(iter->path, iter->path->level);
+	if (!b)
+		goto out;
+
+	BUG_ON(bpos_lt(b->key.k.p, iter->pos));
+
+	/* The iterator's key becomes an empty key at the node's position: */
+	bkey_init(&iter->k);
+	iter->k.p = iter->pos = b->key.k.p;
+
+	iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
+					iter->flags & BTREE_ITER_INTENT,
+					btree_iter_ip_allocated(iter));
+	btree_path_set_should_be_locked(iter->path);
+out:
+	bch2_btree_iter_verify_entry_exit(iter);
+	bch2_btree_iter_verify(iter);
+
+	return b;
+err:
+	b = ERR_PTR(ret);
+	goto out;
+}
+
+/*
+ * bch2_btree_iter_peek_node(), retried for as long as it fails with a
+ * transaction restart error; bch2_trans_begin() is called before each retry.
+ * Any other result (node, NULL or other ERR_PTR) is returned as is.
+ */
+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter)
+{
+	struct btree *node;
+
+	for (;;) {
+		node = bch2_btree_iter_peek_node(iter);
+		if (!bch2_err_matches(PTR_ERR_OR_ZERO(node), BCH_ERR_transaction_restart))
+			break;
+
+		bch2_trans_begin(iter->trans);
+	}
+
+	return node;
+}
+
+/*
+ * Advance the iterator to the next btree node.
+ *
+ * Returns the next node, NULL when there are no more nodes, or an ERR_PTR on
+ * failure - including a transaction restart when the parent node can't be
+ * relocked.  On success iter->pos and iter->k.p are set to the returned
+ * node's key position and the path is marked should_be_locked.
+ */
+struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
+{
+	struct btree_trans *trans = iter->trans;
+	struct btree_path *path = iter->path;
+	struct btree *b = NULL;
+	int ret;
+
+	bch2_trans_verify_not_in_restart(trans);
+	EBUG_ON(iter->path->cached);
+	bch2_btree_iter_verify(iter);
+
+	/* already at end? */
+	if (!btree_path_node(path, path->level))
+		return NULL;
+
+	/* got to end? */
+	if (!btree_path_node(path, path->level + 1)) {
+		btree_path_set_level_up(trans, path);
+		return NULL;
+	}
+
+	/* The parent must be locked to look at it; failing that, restart: */
+	if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
+		__bch2_btree_path_unlock(trans, path);
+		path->l[path->level].b		= ERR_PTR(-BCH_ERR_no_btree_node_relock);
+		path->l[path->level + 1].b	= ERR_PTR(-BCH_ERR_no_btree_node_relock);
+		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+		trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
+		ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+		goto err;
+	}
+
+	b = btree_path_node(path, path->level + 1);
+
+	if (bpos_eq(iter->pos, b->key.k.p)) {
+		/*
+		 * iter->pos is the parent's own key position: the parent is
+		 * the next node, so just move the path up one level
+		 */
+		__btree_path_set_level_up(trans, path, path->level++);
+	} else {
+		/*
+		 * Haven't gotten to the end of the parent node: go back down to
+		 * the next child node
+		 */
+		path = iter->path =
+			bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos),
+					   iter->flags & BTREE_ITER_INTENT,
+					   btree_iter_ip_allocated(iter));
+
+		btree_path_set_level_down(trans, path, iter->min_depth);
+
+		ret = bch2_btree_path_traverse(trans, path, iter->flags);
+		if (ret)
+			goto err;
+
+		b = path->l[path->level].b;
+	}
+
+	bkey_init(&iter->k);
+	iter->k.p = iter->pos = b->key.k.p;
+
+	iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
+					iter->flags & BTREE_ITER_INTENT,
+					btree_iter_ip_allocated(iter));
+	btree_path_set_should_be_locked(iter->path);
+	BUG_ON(iter->path->uptodate);
+out:
+	bch2_btree_iter_verify_entry_exit(iter);
+	bch2_btree_iter_verify(iter);
+
+	return b;
+err:
+	b = ERR_PTR(ret);
+	goto out;
+}
+
+/* Iterate across keys (in leaf nodes only) */
+
+/*
+ * Advance the iterator past the current key (iter->k).
+ *
+ * Without BTREE_ITER_ALL_LEVELS: the new position is iter->k.p, or its
+ * successor for non-extent iterators; returns false (and does not take the
+ * successor) if iter->k.p is already SPOS_MAX.
+ *
+ * With BTREE_ITER_ALL_LEVELS: returns true if the path has no node at its
+ * level, otherwise sets iter->advanced and returns false.
+ */
+inline bool bch2_btree_iter_advance(struct btree_iter *iter)
+{
+	struct bpos next;
+	bool at_max;
+
+	if (unlikely(iter->flags & BTREE_ITER_ALL_LEVELS)) {
+		if (!btree_path_node(iter->path, iter->path->level))
+			return true;
+
+		iter->advanced = true;
+		return false;
+	}
+
+	next = iter->k.p;
+
+	if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)
+		at_max = bpos_eq(next, SPOS_MAX);
+	else
+		at_max = bkey_eq(next, SPOS_MAX);
+
+	if (!at_max && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+		next = bkey_successor(iter, next);
+
+	bch2_btree_iter_set_pos(iter, next);
+	return !at_max;
+}
+
+/*
+ * Move the iterator back to the start of the current key (iter->k), or to the
+ * predecessor of that position for non-extent iterators.
+ *
+ * Returns false (and does not take the predecessor) if the start of the
+ * current key is already POS_MIN.
+ */
+inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
+{
+	struct bpos prev = bkey_start_pos(&iter->k);
+	bool at_min;
+
+	if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)
+		at_min = bpos_eq(prev, POS_MIN);
+	else
+		at_min = bkey_eq(prev, POS_MIN);
+
+	if (!at_min && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+		prev = bkey_predecessor(iter, prev);
+
+	bch2_btree_iter_set_pos(iter, prev);
+	return !at_min;
+}
+
+/*
+ * Among the transaction's pending updates for the iterator's btree that are
+ * at or after iter->path->pos (ignoring entries with
+ * key_cache_already_flushed set), return the key with the lowest position, or
+ * NULL if there is none.  The scan stops at the first update for a higher
+ * btree id.
+ */
+static noinline
+struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter)
+{
+	struct btree_insert_entry *upd;
+	struct bkey_i *lowest = NULL;
+
+	trans_for_each_update(iter->trans, upd) {
+		if (upd->btree_id > iter->btree_id)
+			break;
+
+		if (upd->btree_id < iter->btree_id ||
+		    bpos_lt(upd->k->k.p, iter->path->pos) ||
+		    upd->key_cache_already_flushed)
+			continue;
+
+		if (!lowest || bpos_lt(upd->k->k.p, lowest->k.p))
+			lowest = upd->k;
+	}
+
+	return lowest;
+}
+
+/*
+ * Pending transaction updates are only consulted for iterators created with
+ * BTREE_ITER_WITH_UPDATES; everyone else gets NULL.
+ */
+static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter)
+{
+	if (!(iter->flags & BTREE_ITER_WITH_UPDATES))
+		return NULL;
+
+	return __bch2_btree_trans_peek_updates(iter);
+}
+
+/*
+ * Look up the next journal key for the iterator's btree and level in the
+ * range from iter->path->pos up to @end_pos.
+ *
+ * iter->journal_idx is passed to the lookup by reference; it is reset to 0
+ * first when the path position is before iter->journal_pos.  Afterwards
+ * iter->journal_pos is set to the position of the key found, or to @end_pos
+ * if none was.  Returns the key, or NULL.
+ */
+static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
+					      struct btree_iter *iter,
+					      struct bpos end_pos)
+{
+	struct btree_path *path = iter->path;
+	struct bkey_i *found;
+
+	if (bpos_lt(path->pos, iter->journal_pos))
+		iter->journal_idx = 0;
+
+	found = bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
+					    path->level,
+					    path->pos,
+					    end_pos,
+					    &iter->journal_idx);
+
+	if (found)
+		iter->journal_pos = found->k.p;
+	else
+		iter->journal_pos = end_pos;
+
+	return found;
+}
+
+/*
+ * Look for a journal key at exactly iter->path->pos (the search range starts
+ * and ends there).  If one exists, its header is copied into iter->k and the
+ * key is returned; otherwise returns bkey_s_c_null.
+ */
+static noinline
+struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
+					      struct btree_iter *iter)
+{
+	struct bkey_i *jkey = bch2_btree_journal_peek(trans, iter, iter->path->pos);
+
+	if (!jkey)
+		return bkey_s_c_null;
+
+	iter->k = jkey->k;
+	return bkey_i_to_s_c(jkey);
+}
+
+/*
+ * Overlay journal keys on a btree lookup result: search the journal up to the
+ * position of @k, or up to the end of the path's current node if @k is null.
+ * If a journal key is found, its header is copied into iter->k and it is
+ * returned in place of @k; otherwise @k is returned unchanged.
+ */
+static noinline
+struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
+					 struct btree_iter *iter,
+					 struct bkey_s_c k)
+{
+	struct bkey_i *jkey;
+	struct bpos limit;
+
+	if (k.k)
+		limit = k.k->p;
+	else
+		limit = path_l(iter->path)->b->key.k.p;
+
+	jkey = bch2_btree_journal_peek(trans, iter, limit);
+	if (!jkey)
+		return k;
+
+	iter->k = jkey->k;
+	return bkey_i_to_s_c(jkey);
+}
+
+/*
+ * Checks btree key cache for key at @pos and returns it if present, or
+ * bkey_s_c_null:
+ *
+ * Also returns bkey_s_c_null without looking when the iterator has
+ * BTREE_ITER_KEY_CACHE_FILL set and @pos is iter->pos.  A failure to traverse
+ * the key cache path, or to relock iter->path afterwards, is returned as an
+ * error key (bkey_s_c_err()).
+ */
+static noinline
+struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
+{
+	struct btree_trans *trans = iter->trans;
+	struct bch_fs *c = trans->c;
+	struct bkey u;
+	struct bkey_s_c k;
+	int ret;
+
+	if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) &&
+	    bpos_eq(iter->pos, pos))
+		return bkey_s_c_null;
+
+	if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
+		return bkey_s_c_null;
+
+	/* The key cache path is created on first use and then kept on the iter: */
+	if (!iter->key_cache_path)
+		iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
+						     iter->flags & BTREE_ITER_INTENT, 0,
+						     iter->flags|BTREE_ITER_CACHED|
+						     BTREE_ITER_CACHED_NOFILL,
+						     _THIS_IP_);
+
+	iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
+					iter->flags & BTREE_ITER_INTENT,
+					btree_iter_ip_allocated(iter));
+
+	/* Traverse the cache path, then make sure iter->path is still locked: */
+	ret =   bch2_btree_path_traverse(trans, iter->key_cache_path,
+					 iter->flags|BTREE_ITER_CACHED) ?:
+		bch2_btree_path_relock(trans, iter->path, _THIS_IP_);
+	if (unlikely(ret))
+		return bkey_s_c_err(ret);
+
+	btree_path_set_should_be_locked(iter->key_cache_path);
+
+	k = bch2_btree_path_peek_slot(iter->key_cache_path, &u);
+	if (k.k && !bkey_err(k)) {
+		/* u is on the stack: return the header via iter->k instead */
+		iter->k = u;
+		k.k = &iter->k;
+	}
+	return k;
+}
+
+/*
+ * Core of peek: starting from @search_key, find the first key that is not
+ * deleted, overlaying the key cache, journal keys and pending transaction
+ * updates as selected by iter->flags, and moving on to the next leaf node
+ * whenever the current one is exhausted.
+ *
+ * Returns the key; bkey_s_c_null if there are no nodes at the requested level
+ * or the end of the btree was reached (the iterator position is then set to
+ * SPOS_MAX); or an error key if traversing the path or the key cache lookup
+ * failed.
+ */
+static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
+{
+	struct btree_trans *trans = iter->trans;
+	struct bkey_i *next_update;
+	struct bkey_s_c k, k2;
+	int ret;
+
+	EBUG_ON(iter->path->cached);
+	bch2_btree_iter_verify(iter);
+
+	while (1) {
+		struct btree_path_level *l;
+
+		iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
+					iter->flags & BTREE_ITER_INTENT,
+					btree_iter_ip_allocated(iter));
+
+		ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+		if (unlikely(ret)) {
+			/* ensure that iter->k is consistent with iter->pos: */
+			bch2_btree_iter_set_pos(iter, iter->pos);
+			k = bkey_s_c_err(ret);
+			goto out;
+		}
+
+		l = path_l(iter->path);
+
+		if (unlikely(!l->b)) {
+			/* No btree nodes at requested level: */
+			bch2_btree_iter_set_pos(iter, SPOS_MAX);
+			k = bkey_s_c_null;
+			goto out;
+		}
+
+		btree_path_set_should_be_locked(iter->path);
+
+		k = btree_path_level_peek_all(trans->c, l, &iter->k);
+
+		/* A key cache entry at the same position replaces the btree key: */
+		if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+		    k.k &&
+		    (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
+			k = k2;
+			ret = bkey_err(k);
+			if (ret) {
+				bch2_btree_iter_set_pos(iter, iter->pos);
+				goto out;
+			}
+		}
+
+		if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
+			k = btree_trans_peek_journal(trans, iter, k);
+
+		next_update = btree_trans_peek_updates(iter);
+
+		/*
+		 * A pending update wins if it is at or before the key found so
+		 * far (or, with no key, at or before the end of this node):
+		 */
+		if (next_update &&
+		    bpos_le(next_update->k.p,
+			    k.k ? k.k->p : l->b->key.k.p)) {
+			iter->k = next_update->k;
+			k = bkey_i_to_s_c(next_update);
+		}
+
+		if (k.k && bkey_deleted(k.k)) {
+			/*
+			 * If we've got a whiteout, and it's after the search
+			 * key, advance the search key to the whiteout instead
+			 * of just after the whiteout - it might be a btree
+			 * whiteout, with a real key at the same position, since
+			 * in the btree deleted keys sort before non deleted.
+			 */
+			search_key = !bpos_eq(search_key, k.k->p)
+				? k.k->p
+				: bpos_successor(k.k->p);
+			continue;
+		}
+
+		if (likely(k.k)) {
+			break;
+		} else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) {
+			/* Advance to next leaf node: */
+			search_key = bpos_successor(l->b->key.k.p);
+		} else {
+			/* End of btree: */
+			bch2_btree_iter_set_pos(iter, SPOS_MAX);
+			k = bkey_s_c_null;
+			goto out;
+		}
+	}
+out:
+	bch2_btree_iter_verify(iter);
+
+	return k;
+}
+
+/**
+ * bch2_btree_iter_peek_upto() - returns first key greater than or equal to
+ * iterator's current position
+ * @iter:	iterator to peek from
+ * @end:	search limit: returns keys less than or equal to @end
+ *
+ * Returns:	key if found, or an error extractable with bkey_err().
+ */
+struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
+{
+	struct btree_trans *trans = iter->trans;
+	struct bpos search_key = btree_iter_search_key(iter);
+	struct bkey_s_c k;
+	struct bpos iter_pos;
+	int ret;
+
+	EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
+	EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX));
+
+	/* Any update path left over from a previous peek is dropped first: */
+	if (iter->update_path) {
+		bch2_path_put_nokeep(trans, iter->update_path,
+				     iter->flags & BTREE_ITER_INTENT);
+		iter->update_path = NULL;
+	}
+
+	bch2_btree_iter_verify_entry_exit(iter);
+
+	while (1) {
+		k = __bch2_btree_iter_peek(iter, search_key);
+		if (unlikely(!k.k))
+			goto end;
+		if (unlikely(bkey_err(k)))
+			goto out_no_locked;
+
+		/*
+		 * iter->pos should be monotonically increasing, and always be
+		 * equal to the key we just returned - except extents can
+		 * straddle iter->pos:
+		 */
+		if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
+			iter_pos = k.k->p;
+		else
+			iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
+
+		/* Past the search limit (@end itself is excluded for extents): */
+		if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS)
+			     ? bkey_gt(iter_pos, end)
+			     : bkey_ge(iter_pos, end)))
+			goto end;
+
+		if (iter->update_path &&
+		    !bkey_eq(iter->update_path->pos, k.k->p)) {
+			bch2_path_put_nokeep(trans, iter->update_path,
+					     iter->flags & BTREE_ITER_INTENT);
+			iter->update_path = NULL;
+		}
+
+		if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+		    (iter->flags & BTREE_ITER_INTENT) &&
+		    !(iter->flags & BTREE_ITER_IS_EXTENTS) &&
+		    !iter->update_path) {
+			struct bpos pos = k.k->p;
+
+			if (pos.snapshot < iter->snapshot) {
+				search_key = bpos_successor(k.k->p);
+				continue;
+			}
+
+			pos.snapshot = iter->snapshot;
+
+			/*
+			 * advance, same as on exit for iter->path, but only up
+			 * to snapshot
+			 */
+			__btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
+			iter->update_path = iter->path;
+
+			iter->update_path = bch2_btree_path_set_pos(trans,
+						iter->update_path, pos,
+						iter->flags & BTREE_ITER_INTENT,
+						_THIS_IP_);
+			ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
+			if (unlikely(ret)) {
+				k = bkey_s_c_err(ret);
+				goto out_no_locked;
+			}
+		}
+
+		/*
+		 * We can never have a key in a leaf node at POS_MAX, so
+		 * we don't have to check these successor() calls:
+		 */
+		if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+		    !bch2_snapshot_is_ancestor(trans->c,
+					       iter->snapshot,
+					       k.k->p.snapshot)) {
+			search_key = bpos_successor(k.k->p);
+			continue;
+		}
+
+		if (bkey_whiteout(k.k) &&
+		    !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+			search_key = bkey_successor(iter, k.k->p);
+			continue;
+		}
+
+		break;
+	}
+
+	iter->pos = iter_pos;
+
+	iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
+				iter->flags & BTREE_ITER_INTENT,
+				btree_iter_ip_allocated(iter));
+
+	btree_path_set_should_be_locked(iter->path);
+out_no_locked:
+	/* Common exit: reached on success, on error and via the end label */
+	if (iter->update_path) {
+		ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_);
+		if (unlikely(ret))
+			k = bkey_s_c_err(ret);
+		else
+			btree_path_set_should_be_locked(iter->update_path);
+	}
+
+	if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+		iter->pos.snapshot = iter->snapshot;
+
+	ret = bch2_btree_iter_verify_ret(iter, k);
+	if (unlikely(ret)) {
+		bch2_btree_iter_set_pos(iter, iter->pos);
+		k = bkey_s_c_err(ret);
+	}
+
+	bch2_btree_iter_verify_entry_exit(iter);
+
+	return k;
+end:
+	/* No key within range: park the iterator at @end and return null */
+	bch2_btree_iter_set_pos(iter, end);
+	k = bkey_s_c_null;
+	goto out_no_locked;
+}
+
+/**
+ * bch2_btree_iter_peek_all_levels() - returns the first key greater than or
+ * equal to iterator's current position, returning keys from every level of the
+ * btree. For keys at different levels of the btree that compare equal, the key
+ * from the lower level (leaf) is returned first.
+ * @iter:	iterator to peek from (expected to have BTREE_ITER_ALL_LEVELS set)
+ *
+ * Returns:	key if found, null key at end of btree, or an error (see bkey_err()).
+ */
+struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
+{
+	struct btree_trans *trans = iter->trans;
+	struct bkey_s_c k;
+	int ret;
+
+	EBUG_ON(iter->path->cached);
+	bch2_btree_iter_verify(iter);
+	BUG_ON(iter->path->level < iter->min_depth);	/* never below the iterator's floor */
+	BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));	/* caller must ask for all snapshots */
+	EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS));
+
+	while (1) {	/* every pass re-traverses iter->path to the current iter->pos */
+		iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos,
+					iter->flags & BTREE_ITER_INTENT,
+					btree_iter_ip_allocated(iter));
+
+		ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+		if (unlikely(ret)) {
+			/* ensure that iter->k is consistent with iter->pos: */
+			bch2_btree_iter_set_pos(iter, iter->pos);
+			k = bkey_s_c_err(ret);
+			goto out_no_locked;
+		}
+
+		/* Already at end? (no node at the path's current level) */
+		if (!btree_path_node(iter->path, iter->path->level)) {
+			k = bkey_s_c_null;
+			goto out_no_locked;
+		}
+
+		k = btree_path_level_peek_all(trans->c,
+				&iter->path->l[iter->path->level], &iter->k);
+
+		/* Check if we should go up to the parent node: */
+		if (!k.k ||
+		    (iter->advanced &&
+		     bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) {
+			iter->pos = path_l(iter->path)->b->key.k.p;	/* resume from this node's own key */
+			btree_path_set_level_up(trans, iter->path);
+			iter->advanced = false;
+			continue;
+		}
+
+		/*
+		 * Check if we should go back down to a leaf:
+		 * If we're not in a leaf node, we only return the current key
+		 * if it exactly matches iter->pos - otherwise we first have to
+		 * go back to the leaf:
+		 */
+		if (iter->path->level != iter->min_depth &&
+		    (iter->advanced ||
+		     !k.k ||
+		     !bpos_eq(iter->pos, k.k->p))) {
+			btree_path_set_level_down(trans, iter->path, iter->min_depth);
+			iter->pos = bpos_successor(iter->pos);
+			iter->advanced = false;
+			continue;
+		}
+
+		/* Check if we should go to the next key: */
+		if (iter->path->level == iter->min_depth &&
+		    iter->advanced &&
+		    k.k &&
+		    bpos_eq(iter->pos, k.k->p)) {
+			iter->pos = bpos_successor(iter->pos);
+			iter->advanced = false;
+			continue;
+		}
+
+		if (iter->advanced &&
+		    iter->path->level == iter->min_depth &&
+		    !bpos_eq(k.k->p, iter->pos))	/* k is not the key at iter->pos: nothing to skip */
+			iter->advanced = false;
+
+		BUG_ON(iter->advanced);
+		BUG_ON(!k.k);
+		break;
+	}
+
+	iter->pos = k.k->p;	/* the iterator now sits on the key being returned */
+	btree_path_set_should_be_locked(iter->path);
+out_no_locked:
+	bch2_btree_iter_verify(iter);
+
+	return k;
+}
+
+/**
+ * bch2_btree_iter_next() - advance the iterator, then peek at the new position
+ * @iter:	iterator to step forward
+ *
+ * Returns:	whatever bch2_btree_iter_peek() finds after the advance (a key, or an
+ *		error extractable with bkey_err()); null key if the advance failed.
+ */
+struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
+{
+	/* Nowhere left to go when the advance fails: */
+	return bch2_btree_iter_advance(iter)
+		? bch2_btree_iter_peek(iter)
+		: bkey_s_c_null;
+}
+
+/**
+ * bch2_btree_iter_peek_prev() - returns first key less than or equal to
+ * iterator's current position
+ * @iter:	iterator to peek from
+ *
+ * Returns:	key if found, null key at start of btree, or an error (see bkey_err()).
+ */
+struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
+{
+	struct btree_trans *trans = iter->trans;
+	struct bpos search_key = iter->pos;
+	struct btree_path *saved_path = NULL;	/* path of the best ancestor-snapshot candidate so far */
+	struct bkey_s_c k;
+	struct bkey saved_k;
+	const struct bch_val *saved_v;
+	int ret;
+
+	EBUG_ON(iter->path->cached || iter->path->level);
+	EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
+
+	if (iter->flags & BTREE_ITER_WITH_JOURNAL)	/* journal-overlay iterators are rejected here */
+		return bkey_s_c_err(-EIO);
+
+	bch2_btree_iter_verify(iter);
+	bch2_btree_iter_verify_entry_exit(iter);
+
+	if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+		search_key.snapshot = U32_MAX;	/* begin at the largest snapshot field for this position */
+
+	while (1) {
+		iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
+						iter->flags & BTREE_ITER_INTENT,
+						btree_iter_ip_allocated(iter));
+
+		ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+		if (unlikely(ret)) {
+			/* ensure that iter->k is consistent with iter->pos: */
+			bch2_btree_iter_set_pos(iter, iter->pos);
+			k = bkey_s_c_err(ret);
+			goto out_no_locked;
+		}
+
+		k = btree_path_level_peek(trans, iter->path,
+					  &iter->path->l[0], &iter->k);
+		if (!k.k ||	/* nothing here, or the key found lies past search_key: step back one */
+		    ((iter->flags & BTREE_ITER_IS_EXTENTS)
+		     ? bpos_ge(bkey_start_pos(k.k), search_key)
+		     : bpos_gt(k.k->p, search_key)))
+			k = btree_path_level_prev(trans, iter->path,
+						  &iter->path->l[0], &iter->k);
+
+		if (likely(k.k)) {
+			if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
+				if (k.k->p.snapshot == iter->snapshot)	/* exact snapshot match: take it */
+					goto got_key;
+
+				/*
+				 * If we have a saved candidate, and we're no
+				 * longer at the same _key_ (not pos), return
+				 * that candidate
+				 */
+				if (saved_path && !bkey_eq(k.k->p, saved_k.p)) {
+					bch2_path_put_nokeep(trans, iter->path,
+						      iter->flags & BTREE_ITER_INTENT);
+					iter->path = saved_path;
+					saved_path = NULL;
+					iter->k	= saved_k;	/* NOTE(review): presumably k.k aliases &iter->k - confirm */
+					k.v	= saved_v;
+					goto got_key;
+				}
+
+				if (bch2_snapshot_is_ancestor(iter->trans->c,
+							      iter->snapshot,
+							      k.k->p.snapshot)) {
+					if (saved_path)	/* a newer candidate replaces the old one */
+						bch2_path_put_nokeep(trans, saved_path,
+						      iter->flags & BTREE_ITER_INTENT);
+					saved_path = btree_path_clone(trans, iter->path,
+								iter->flags & BTREE_ITER_INTENT);
+					saved_k = *k.k;
+					saved_v = k.v;
+				}
+
+				search_key = bpos_predecessor(k.k->p);	/* keep scanning backwards */
+				continue;
+			}
+got_key:
+			if (bkey_whiteout(k.k) &&
+			    !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+				search_key = bkey_predecessor(iter, k.k->p);	/* whiteouts are skipped */
+				if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+					search_key.snapshot = U32_MAX;
+				continue;
+			}
+
+			break;
+		} else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) {
+			/* Advance to previous leaf node: */
+			search_key = bpos_predecessor(iter->path->l[0].b->data->min_key);
+		} else {
+			/* Start of btree: */
+			bch2_btree_iter_set_pos(iter, POS_MIN);
+			k = bkey_s_c_null;
+			goto out_no_locked;
+		}
+	}
+
+	EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos));
+
+	/* Extents can straddle iter->pos: */
+	if (bkey_lt(k.k->p, iter->pos))
+		iter->pos = k.k->p;
+
+	if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+		iter->pos.snapshot = iter->snapshot;
+
+	btree_path_set_should_be_locked(iter->path);
+out_no_locked:
+	if (saved_path)	/* drop an unused candidate on every exit path */
+		bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
+
+	bch2_btree_iter_verify_entry_exit(iter);
+	bch2_btree_iter_verify(iter);
+
+	return k;
+}
+
+/**
+ * bch2_btree_iter_prev() - rewind the iterator, then peek backwards from there
+ * @iter:	iterator to step backward
+ *
+ * Returns:	whatever bch2_btree_iter_peek_prev() finds after the rewind (a key, or
+ *		an error extractable with bkey_err()); null key if the rewind failed.
+ */
+struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
+{
+	/* Nothing lies before us when the rewind fails: */
+	return bch2_btree_iter_rewind(iter)
+		? bch2_btree_iter_peek_prev(iter)
+		: bkey_s_c_null;
+}
+/* Look up the slot at iter->pos; if no key covers it, an empty key built in iter->k may be returned. */
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
+{
+	struct btree_trans *trans = iter->trans;
+	struct bpos search_key;
+	struct bkey_s_c k;
+	int ret;
+
+	bch2_btree_iter_verify(iter);
+	bch2_btree_iter_verify_entry_exit(iter);
+	EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
+	EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
+
+	/* extents can't span inode numbers: */
+	if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+	    unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
+		if (iter->pos.inode == KEY_INODE_MAX)	/* very last position: nothing can follow */
+			return bkey_s_c_null;
+
+		bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
+	}
+
+	search_key = btree_iter_search_key(iter);
+	iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
+					iter->flags & BTREE_ITER_INTENT,
+					btree_iter_ip_allocated(iter));
+
+	ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+	if (unlikely(ret)) {
+		k = bkey_s_c_err(ret);
+		goto out_no_locked;
+	}
+
+	if ((iter->flags & BTREE_ITER_CACHED) ||
+	    !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
+		struct bkey_i *next_update;
+
+		if ((next_update = btree_trans_peek_updates(iter)) &&
+		    bpos_eq(next_update->k.p, iter->pos)) {	/* a pending update at exactly this position wins */
+			iter->k = next_update->k;
+			k = bkey_i_to_s_c(next_update);
+			goto out;
+		}
+
+		if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
+		    (k = btree_trans_peek_slot_journal(trans, iter)).k)
+			goto out;
+
+		if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+		    (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
+			if (!bkey_err(k))
+				iter->k = *k.k;
+			/* We're not returning a key from iter->path: */
+			goto out_no_locked;
+		}
+
+		k = bch2_btree_path_peek_slot(iter->path, &iter->k);
+		if (unlikely(!k.k))
+			goto out_no_locked;
+	} else {	/* extents and/or snapshot filtering: peek forward, then fill in a hole if needed */
+		struct bpos next;
+		struct bpos end = iter->pos;
+
+		if (iter->flags & BTREE_ITER_IS_EXTENTS)
+			end.offset = U64_MAX;
+
+		EBUG_ON(iter->path->level);
+
+		if (iter->flags & BTREE_ITER_INTENT) {	/* peek with a throwaway copy of the iterator */
+			struct btree_iter iter2;
+
+			bch2_trans_copy_iter(&iter2, iter);
+			k = bch2_btree_iter_peek_upto(&iter2, end);
+
+			if (k.k && !bkey_err(k)) {
+				iter->k = iter2.k;	/* keep the key header alive after iter2 is gone */
+				k.k = &iter->k;
+			}
+			bch2_trans_iter_exit(trans, &iter2);
+		} else {
+			struct bpos pos = iter->pos;
+
+			k = bch2_btree_iter_peek_upto(iter, end);
+			if (unlikely(bkey_err(k)))
+				bch2_btree_iter_set_pos(iter, pos);
+			else
+				iter->pos = pos;	/* put the position back where the caller had it */
+		}
+
+		if (unlikely(bkey_err(k)))
+			goto out_no_locked;
+
+		next = k.k ? bkey_start_pos(k.k) : POS_MAX;
+
+		if (bkey_lt(iter->pos, next)) {	/* the key found starts after iter->pos: build an empty key */
+			bkey_init(&iter->k);
+			iter->k.p = iter->pos;
+
+			if (iter->flags & BTREE_ITER_IS_EXTENTS) {
+				bch2_key_resize(&iter->k,
+						min_t(u64, KEY_SIZE_MAX,
+						      (next.inode == iter->pos.inode
+						       ? next.offset
+						       : KEY_OFFSET_MAX) -
+						      iter->pos.offset));
+				EBUG_ON(!iter->k.size);
+			}
+
+			k = (struct bkey_s_c) { &iter->k, NULL };	/* header only, no value */
+		}
+	}
+out:
+	btree_path_set_should_be_locked(iter->path);
+out_no_locked:
+	bch2_btree_iter_verify_entry_exit(iter);
+	bch2_btree_iter_verify(iter);
+	ret = bch2_btree_iter_verify_ret(iter, k);
+	if (unlikely(ret))
+		return bkey_s_c_err(ret);
+
+	return k;
+}
+
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
+{
+	/* Step forward one position, then look at that slot; null key if we can't: */
+	return bch2_btree_iter_advance(iter)
+		? bch2_btree_iter_peek_slot(iter)
+		: bkey_s_c_null;
+}
+
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
+{
+	/* Step back one position, then look at that slot; null key if we can't: */
+	return bch2_btree_iter_rewind(iter)
+		? bch2_btree_iter_peek_slot(iter)
+		: bkey_s_c_null;
+}
+/* Peek using the iterator's own flags, calling bch2_trans_begin() and retrying on every restart. */
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
+{
+	struct bkey_s_c k;
+
+	while (btree_trans_too_many_iters(iter->trans) ||	/* checked first: short-circuits the peek */
+	       (k = bch2_btree_iter_peek_type(iter, iter->flags),
+		bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
+		bch2_trans_begin(iter->trans);
+
+	return k;	/* a key, a null key, or a non-restart error */
+}
+
+/* new transactional stuff: */
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
+{
+	struct btree_path *p;
+	unsigned slot;
+
+	/* sorted[] must hold exactly one entry per allocated path: */
+	BUG_ON(hweight64(trans->paths_allocated) != trans->nr_sorted);
+	/* Forward check: each path's sorted_idx names a slot that points back at it: */
+	trans_for_each_path(trans, p) {
+		BUG_ON(!(p->sorted_idx < trans->nr_sorted));
+		BUG_ON(p->idx != trans->sorted[p->sorted_idx]);
+	}
+
+	/* Reverse check: each slot names an allocated path that points back at it: */
+	for (slot = 0; slot < trans->nr_sorted; slot++) {
+		EBUG_ON(!(trans->paths_allocated & (1ULL << trans->sorted[slot])));
+		BUG_ON(slot != trans->paths[trans->sorted[slot]].sorted_idx);
+	}
+}
+
+static void btree_trans_verify_sorted(struct btree_trans *trans)
+{
+	struct btree_path *cur, *before = NULL;
+	unsigned pos;
+
+	if (!bch2_debug_check_iterators)
+		return;
+	/* Walk the sort order; a path comparing below its predecessor is fatal: */
+	trans_for_each_path_inorder(trans, cur, pos) {
+		if (before && btree_path_cmp(before, cur) > 0) {
+			__bch2_dump_trans_paths_updates(trans, true);
+			panic("trans paths out of order!\n");
+		}
+		before = cur;
+	}
+}
+#else
+static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {}
+static inline void btree_trans_verify_sorted(struct btree_trans *trans) {}
+#endif
+/* Bring trans->sorted[] into btree_path_cmp() order; no sorting is done if paths_sorted is already set. */
+void __bch2_btree_trans_sort_paths(struct btree_trans *trans)
+{
+	int i, l = 0, r = trans->nr_sorted, inc = 1;	/* [l, r) is the window still being swept; inc is the direction */
+	bool swapped;
+
+	btree_trans_verify_sorted_refs(trans);
+
+	if (trans->paths_sorted)
+		goto out;
+
+	/*
+	 * Cocktail shaker sort: this is efficient because iterators will be
+	 * mostly sorted.
+	 */
+	do {
+		swapped = false;
+
+		for (i = inc > 0 ? l : r - 2;
+		     i + 1 < r && i >= l;
+		     i += inc) {
+			if (btree_path_cmp(trans->paths + trans->sorted[i],
+					   trans->paths + trans->sorted[i + 1]) > 0) {
+				swap(trans->sorted[i], trans->sorted[i + 1]);
+				trans->paths[trans->sorted[i]].sorted_idx = i;	/* keep the back references in step */
+				trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1;
+				swapped = true;
+			}
+		}
+
+		if (inc > 0)	/* shrink the window at the end this sweep finished on */
+			--r;
+		else
+			l++;
+		inc = -inc;	/* and sweep the other way next time */
+	} while (swapped);
+
+	trans->paths_sorted = true;
+out:
+	btree_trans_verify_sorted(trans);
+}
+/* Remove @path's entry from trans->sorted[] and renumber the entries that followed it. */
+static inline void btree_path_list_remove(struct btree_trans *trans,
+					  struct btree_path *path)
+{
+	unsigned i;
+
+	EBUG_ON(path->sorted_idx >= trans->nr_sorted);
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	trans->nr_sorted--;
+	memmove_u64s_down_small(trans->sorted + path->sorted_idx,	/* count is in u64 words */
+				trans->sorted + path->sorted_idx + 1,
+				DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));	/* assumes 1-byte entries - TODO confirm */
+#else
+	array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
+#endif
+	for (i = path->sorted_idx; i < trans->nr_sorted; i++)	/* everything after the gap moved down one slot */
+		trans->paths[trans->sorted[i]].sorted_idx = i;
+
+	path->sorted_idx = U8_MAX;	/* NOTE(review): presumably a "not in sorted[]" sentinel - confirm */
+}
+/* Insert @path into trans->sorted[] right after @pos, or at the end when @pos is NULL. */
+static inline void btree_path_list_add(struct btree_trans *trans,
+				       struct btree_path *pos,
+				       struct btree_path *path)
+{
+	unsigned i;
+
+	path->sorted_idx = pos ? pos->sorted_idx + 1 : trans->nr_sorted;
+
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1,	/* count is in u64 words */
+			      trans->sorted + path->sorted_idx,
+			      DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));	/* assumes 1-byte entries - TODO confirm */
+	trans->nr_sorted++;
+	trans->sorted[path->sorted_idx] = path->idx;
+#else
+	array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
+#endif
+
+	for (i = path->sorted_idx; i < trans->nr_sorted; i++)	/* refresh back references from the new slot onwards */
+		trans->paths[trans->sorted[i]].sorted_idx = i;
+
+	btree_trans_verify_sorted_refs(trans);
+}
+
+void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
+{
+	const unsigned intent = iter->flags & BTREE_ITER_INTENT;
+
+	/* Release each path reference this iterator still holds: */
+	if (iter->update_path)
+		bch2_path_put_nokeep(trans, iter->update_path, intent);
+	if (iter->path)
+		bch2_path_put(trans, iter->path, intent);
+	if (iter->key_cache_path)
+		bch2_path_put(trans, iter->key_cache_path, intent);
+
+	/* Clear the pointers, so exiting the same iterator again does nothing: */
+	iter->path = iter->update_path = iter->key_cache_path = NULL;
+}
+
+/* Out-of-line iterator setup: run @flags through bch2_btree_iter_flags(), then do the common init. */
+void bch2_trans_iter_init_outlined(struct btree_trans *trans, struct btree_iter *iter,
+				   enum btree_id btree_id, struct bpos pos, unsigned flags)
+{
+	flags = bch2_btree_iter_flags(trans, btree_id, flags);
+
+	bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
+				    flags, _RET_IP_);
+}
+
+void bch2_trans_node_iter_init(struct btree_trans *trans,
+			       struct btree_iter *iter,
+			       enum btree_id btree_id,
+			       struct bpos pos,
+			       unsigned locks_want,
+			       unsigned depth,
+			       unsigned flags)
+{
+	/* Flags that every node iterator gets, whatever the caller passed: */
+	flags |= BTREE_ITER_NOT_EXTENTS|
+		 __BTREE_ITER_ALL_SNAPSHOTS|
+		 BTREE_ITER_ALL_SNAPSHOTS;
+	flags = __bch2_btree_iter_flags(trans, btree_id, flags);
+
+	bch2_trans_iter_init_common(trans, iter, btree_id, pos,
+				    locks_want, depth, flags, _RET_IP_);
+	iter->min_depth	= depth;	/* the iterator never descends below @depth */
+
+	BUG_ON(min(locks_want, BTREE_MAX_DEPTH) > iter->path->locks_want);
+	BUG_ON(depth != iter->path->level);
+	BUG_ON(depth != iter->min_depth);
+}
+
+void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
+{
+	*dst = *src;	/* copy by value, then take the copy's own path references */
+	dst->key_cache_path = NULL;	/* the key cache path is not shared with the copy */
+	if (dst->path)
+		__btree_path_get(dst->path, dst->flags & BTREE_ITER_INTENT);
+	if (dst->update_path)
+		__btree_path_get(dst->update_path, dst->flags & BTREE_ITER_INTENT);
+}
+/* Resize trans->mem to fit @size more bytes, then hand out that many zeroed bytes from its top. */
+void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
+{
+	unsigned new_top = trans->mem_top + size;
+	size_t old_bytes = trans->mem_bytes;
+	size_t new_bytes = roundup_pow_of_two(new_top);
+	int ret;
+	void *new_mem;
+	void *p;
+
+	trans->mem_max = max(trans->mem_max, new_top);	/* high-water mark, read back in bch2_trans_put() */
+
+	WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
+
+	new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);	/* first attempt: non-blocking */
+	if (unlikely(!new_mem)) {
+		bch2_trans_unlock(trans);	/* drop btree locks before the GFP_KERNEL attempt */
+
+		new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
+		if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
+			new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
+			new_bytes = BTREE_TRANS_MEM_MAX;
+			kfree(trans->mem);	/* old contents are not copied over; see the restart below */
+		}
+
+		if (!new_mem)
+			return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
+
+		trans->mem = new_mem;
+		trans->mem_bytes = new_bytes;
+
+		ret = bch2_trans_relock(trans);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+
+	trans->mem = new_mem;
+	trans->mem_bytes = new_bytes;
+
+	if (old_bytes) {	/* a buffer already existed: report a restart instead of returning memory */
+		trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
+		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
+	}
+
+	p = trans->mem + trans->mem_top;
+	trans->mem_top += size;
+	memset(p, 0, size);
+	return p;
+}
+
+static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans)
+{
+	struct srcu_struct *barrier = &trans->c->btree_trans_barrier;
+	struct btree_path *path;
+
+	/* Poison unlocked cached paths, then leave and re-enter the SRCU read section: */
+	trans_for_each_path(trans, path)
+		if (path->cached && !btree_node_locked(path, 0))
+			path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
+	srcu_read_unlock(barrier, trans->srcu_idx);
+	trans->srcu_idx = srcu_read_lock(barrier);
+	trans->srcu_lock_time	= jiffies;
+}
+
+/**
+ * bch2_trans_begin() - reset a transaction after an interrupted attempt
+ * @trans: transaction to reset
+ *
+ * Returns:	current restart counter, to be used with trans_was_restarted()
+ *
+ * While iterating over nodes or updating nodes an attempt to lock a btree node
+ * may return BCH_ERR_transaction_restart when the trylock fails. When this
+ * occurs bch2_trans_begin() should be called and the transaction retried.
+ */
+u32 bch2_trans_begin(struct btree_trans *trans)
+{
+	struct btree_path *path;
+	u64 now;
+
+	bch2_trans_reset_updates(trans);
+
+	trans->restart_count++;
+	trans->mem_top			= 0;	/* everything handed out from trans->mem is discarded */
+
+	trans_for_each_path(trans, path) {
+		path->should_be_locked = false;
+
+		/*
+		 * If the transaction wasn't restarted, we're presuming to be
+		 * doing something new: don't keep iterators except the ones that
+		 * are in use - except for the subvolumes btree:
+		 */
+		if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
+			path->preserve = false;
+
+		/*
+		 * XXX: we probably shouldn't be doing this if the transaction
+		 * was restarted, but currently we still overflow transaction
+		 * iterators if we do that
+		 */
+		if (!path->ref && !path->preserve)
+			__bch2_path_free(trans, path);
+		else
+			path->preserve = false;
+	}
+
+	now = local_clock();
+	if (!trans->restarted &&
+	    (need_resched() ||
+	     now - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
+		drop_locks_do(trans, (cond_resched(), 0));
+		now = local_clock();	/* re-read the clock after possibly rescheduling */
+	}
+	trans->last_begin_time = now;
+
+	if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))	/* SRCU lock held for over 10ms */
+		bch2_trans_reset_srcu_lock(trans);
+
+	trans->last_begin_ip = _RET_IP_;
+	if (trans->restarted) {
+		bch2_btree_path_traverse_all(trans);
+		trans->notrace_relock_fail = false;
+	}
+
+	return trans->restart_count;
+}
+
+static struct btree_trans *bch2_trans_alloc(struct bch_fs *c)
+{
+	struct btree_trans *trans = NULL;
+
+	/* Try this CPU's cached btree_trans first (kernel builds only): */
+	if (IS_ENABLED(__KERNEL__))
+		trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
+	if (trans)
+		return trans;
+
+	trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
+	/*
+	 * paths need to be zeroed, bch2_check_for_deadlock looks at
+	 * paths in other threads
+	 */
+	memset(&trans->paths, 0, sizeof(trans->paths));
+	return trans;
+}
+
+const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
+/* Find or claim the slot for @fn in bch2_btree_transaction_fns[]; names are matched by pointer. */
+unsigned bch2_trans_get_fn_idx(const char *fn)
+{
+	unsigned i;
+
+	for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
+		if (!bch2_btree_transaction_fns[i] ||	/* first free slot, or ... */
+		    bch2_btree_transaction_fns[i] == fn) {	/* ... the slot @fn already owns */
+			bch2_btree_transaction_fns[i] = fn;
+			return i;
+		}
+	/* Table full: @i equals ARRAY_SIZE() here, i.e. an index past the end of the table. */
+	pr_warn_once("BCH_TRANSACTIONS_NR not big enough!\n");
+	return i;
+}
+/* Get a zeroed, initialised btree_trans for @c; takes the btree_trans_barrier SRCU read lock. */
+struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
+	__acquires(&c->btree_trans_barrier)
+{
+	struct btree_trans *trans;
+	struct btree_transaction_stats *s;
+
+	trans = bch2_trans_alloc(c);
+
+	memset(trans, 0, sizeof(*trans));
+	trans->c		= c;
+	trans->fn		= fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)	/* out-of-range index: no name */
+		? bch2_btree_transaction_fns[fn_idx] : NULL;
+	trans->last_begin_time	= local_clock();
+	trans->fn_idx		= fn_idx;
+	trans->locking_wait.task = current;
+	trans->journal_replay_not_finished =
+		!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
+	closure_init_stack(&trans->ref);
+
+	s = btree_trans_stats(trans);
+	if (s && s->max_mem) {	/* preallocate what this transaction type needed at most before */
+		unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
+
+		trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
+
+		if (unlikely(!trans->mem)) {	/* kmalloc failed: fall back to the fixed-size pool buffer */
+			trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
+			trans->mem_bytes = BTREE_TRANS_MEM_MAX;
+		} else {
+			trans->mem_bytes = expected_mem_bytes;
+		}
+	}
+
+	if (s) {
+		trans->nr_max_paths = s->nr_max_paths;
+		trans->wb_updates_size = s->wb_updates_size;
+	}
+
+	trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+	trans->srcu_lock_time	= jiffies;
+
+	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {	/* keep c->btree_trans_list ordered by pid */
+		struct btree_trans *pos;
+
+		seqmutex_lock(&c->btree_trans_lock);
+		list_for_each_entry(pos, &c->btree_trans_list, list) {
+			/*
+			 * We'd much prefer to be stricter here and completely
+			 * disallow multiple btree_trans in the same thread -
+			 * but the data move path calls bch2_write when we
+			 * already have a btree_trans initialized.
+			 */
+			BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid &&
+			       bch2_trans_locked(pos));
+
+			if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
+				list_add_tail(&trans->list, &pos->list);	/* insert just before @pos */
+				goto list_add_done;
+			}
+		}
+		list_add_tail(&trans->list, &c->btree_trans_list);
+list_add_done:
+		seqmutex_unlock(&c->btree_trans_lock);
+	}
+
+	return trans;
+}
+
+static void check_btree_paths_leaked(struct btree_trans *trans)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+	struct bch_fs *c = trans->c;
+	struct btree_path *path;
+	bool leaked = false;
+
+	trans_for_each_path(trans, path)
+		leaked |= !!path->ref;	/* any reference still held counts as a leak */
+	if (!leaked)
+		return;
+	bch_err(c, "btree paths leaked from %s!", trans->fn);
+	trans_for_each_path(trans, path)	/* name each offender and where it was allocated */
+		if (path->ref)
+			printk(KERN_ERR "  btree %s %pS\n",
+			       bch2_btree_ids[path->btree_id],
+			       (void *) path->ip_allocated);
+	/* Be noisy about this: */
+	bch2_fatal_error(c);
+#endif
+}
+/* Tear down @trans: drop locks and references, free its buffers, then cache or free the struct. */
+void bch2_trans_put(struct btree_trans *trans)
+	__releases(&c->btree_trans_barrier)
+{
+	struct btree_insert_entry *i;
+	struct bch_fs *c = trans->c;
+	struct btree_transaction_stats *s = btree_trans_stats(trans);
+
+	bch2_trans_unlock(trans);
+
+	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
+		seqmutex_lock(&c->btree_trans_lock);
+		list_del(&trans->list);
+		seqmutex_unlock(&c->btree_trans_lock);
+	}
+
+	closure_sync(&trans->ref);
+
+	if (s)	/* remember the high-water mark for the next __bch2_trans_get() of this type */
+		s->max_mem = max(s->max_mem, trans->mem_max);
+
+	trans_for_each_update(trans, i)
+		__btree_path_put(i->path, true);
+	trans->nr_updates		= 0;
+
+	check_btree_paths_leaked(trans);
+
+	srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+
+	bch2_journal_preres_put(&c->journal, &trans->journal_preres);
+
+	kfree(trans->extra_journal_entries.data);
+	/* NOTE(review): assumes ->size excludes the list header, so header + size == MAX means pool memory */
+	if (trans->fs_usage_deltas) {
+		if (trans->fs_usage_deltas->size + sizeof(*trans->fs_usage_deltas) ==
+		    REPLICAS_DELTA_LIST_MAX)
+			mempool_free(trans->fs_usage_deltas,
+				     &c->replicas_delta_pool);
+		else
+			kfree(trans->fs_usage_deltas);
+	}
+
+	if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)	/* this size marks a buffer taken from the mempool */
+		mempool_free(trans->mem, &c->btree_trans_mem_pool);
+	else
+		kfree(trans->mem);
+
+	/* Userspace doesn't have a real percpu implementation: */
+	if (IS_ENABLED(__KERNEL__))
+		trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);	/* park ours, get back any displaced one */
+	if (trans)
+		mempool_free(trans, &c->btree_trans_pool);
+}
+/* Print one lockable object @b: address, cached ('c') or not ('b'), level, btree, position, lock state. */
+static void __maybe_unused
+bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
+				      struct btree_bkey_cached_common *b)
+{
+	struct six_lock_count c = six_lock_counts(&b->lock);
+	struct task_struct *owner;
+	pid_t pid;
+
+	rcu_read_lock();	/* the owner task is only dereferenced inside this RCU read section */
+	owner = READ_ONCE(b->lock.owner);
+	pid = owner ? owner->pid : 0;	/* 0 is printed when there is no owner */
+	rcu_read_unlock();
+
+	prt_tab(out);
+	prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
+		   b->level, bch2_btree_ids[b->btree_id]);
+	bch2_bpos_to_text(out, btree_node_pos(b));
+
+	prt_tab(out);
+	prt_printf(out, " locks %u:%u:%u held by pid %u",
+		   c.n[0], c.n[1], c.n[2], pid);
+}
+/* Print @trans (pid and fn), every path holding node locks with those locks, and any lock it waits on. */
+void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
+{
+	struct btree_path *path;
+	struct btree_bkey_cached_common *b;
+	static char lock_types[] = { 'r', 'i', 'w' };	/* indexed by lock type below */
+	unsigned l, idx;
+
+	if (!out->nr_tabstops) {	/* set up tabstops only if the caller has none */
+		printbuf_tabstop_push(out, 16);
+		printbuf_tabstop_push(out, 32);
+	}
+
+	prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
+
+	trans_for_each_path_safe(trans, path, idx) {
+		if (!path->nodes_locked)	/* paths holding no locks are not listed */
+			continue;
+
+		prt_printf(out, "  path %u %c l=%u %s:",
+		       path->idx,
+		       path->cached ? 'c' : 'b',
+		       path->level,
+		       bch2_btree_ids[path->btree_id]);
+		bch2_bpos_to_text(out, path->pos);
+		prt_newline(out);
+
+		for (l = 0; l < BTREE_MAX_DEPTH; l++) {
+			if (btree_node_locked(path, l) &&
+			    !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {	/* skip NULL / error-pointer nodes */
+				prt_printf(out, "    %c l=%u ",
+					   lock_types[btree_node_locked_type(path, l)], l);
+				bch2_btree_bkey_cached_common_to_text(out, b);
+				prt_newline(out);
+			}
+		}
+	}
+
+	b = READ_ONCE(trans->locking);	/* non-NULL while the transaction is waiting for a lock */
+	if (b) {
+		prt_printf(out, "  blocked for %lluus on",
+			   div_u64(local_clock() - trans->locking_wait.start_time,
+				   1000));	/* local_clock() delta divided by 1000 for the "us" label */
+		prt_newline(out);
+		prt_printf(out, "    %c", lock_types[trans->locking_wait.lock_want]);
+		bch2_btree_bkey_cached_common_to_text(out, b);
+		prt_newline(out);
+	}
+}
+
+void bch2_fs_btree_iter_exit(struct bch_fs *c)
+{
+	struct btree_trans *trans;
+	unsigned i;
+	int cpu;
+
+	trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
+	if (trans)	/* anything still on the list was never put */
+		panic("%s leaked btree_trans\n", trans->fn);
+
+	if (c->btree_trans_bufs)
+		for_each_possible_cpu(cpu)
+			kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans);
+	free_percpu(c->btree_trans_bufs);	/* cached structs first, then the per-cpu slots themselves */
+
+	for (i = 0; i < ARRAY_SIZE(c->btree_transaction_stats); i++) {
+		struct btree_transaction_stats *s = &c->btree_transaction_stats[i];
+
+		kfree(s->max_paths_text);
+		bch2_time_stats_exit(&s->lock_hold_times);
+	}
+
+	if (c->btree_trans_barrier_initialized)
+		cleanup_srcu_struct(&c->btree_trans_barrier);
+	mempool_exit(&c->btree_trans_mem_pool);
+	mempool_exit(&c->btree_trans_pool);
+}
+
+int bch2_fs_btree_iter_init(struct bch_fs *c)
+{
+	unsigned i;
+	int ret;
+
+	for (i = 0; i < ARRAY_SIZE(c->btree_transaction_stats); i++) {
+		struct btree_transaction_stats *s = &c->btree_transaction_stats[i];
+
+		bch2_time_stats_init(&s->lock_hold_times);
+		mutex_init(&s->lock);
+	}
+
+	INIT_LIST_HEAD(&c->btree_trans_list);
+	seqmutex_init(&c->btree_trans_lock);
+
+	c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf);	/* one cached btree_trans slot per CPU */
+	if (!c->btree_trans_bufs)
+		return -ENOMEM;
+
+	ret = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1, sizeof(struct btree_trans));
+	if (!ret)
+		ret = mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, BTREE_TRANS_MEM_MAX);
+	if (!ret)
+		ret = init_srcu_struct(&c->btree_trans_barrier);
+	if (!ret)	/* remembered so the exit path knows the SRCU struct needs cleaning up */
+		c->btree_trans_barrier_initialized = true;
+	return ret;
+}
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
new file mode 100644
index 000000000000..fbe273453db3
--- /dev/null
+++ b/fs/bcachefs/btree_iter.h
@@ -0,0 +1,939 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_ITER_H
+#define _BCACHEFS_BTREE_ITER_H
+
+#include "bset.h"
+#include "btree_types.h"
+#include "trace.h"
+
+static inline int __bkey_err(const struct bkey *k)
+{
+	return PTR_ERR_OR_ZERO(k);
+}
+
+#define bkey_err(_k)	__bkey_err((_k).k)
+
+static inline void __btree_path_get(struct btree_path *path, bool intent)
+{
+	path->ref++;
+	path->intent_ref += intent;
+}
+
+static inline bool __btree_path_put(struct btree_path *path, bool intent)
+{
+	EBUG_ON(!path->ref);
+	EBUG_ON(!path->intent_ref && intent);
+	path->intent_ref -= intent;
+	return --path->ref == 0;
+}
+
+static inline void btree_path_set_dirty(struct btree_path *path,
+					enum btree_path_uptodate u)
+{
+	path->uptodate = max_t(unsigned, path->uptodate, u);
+}
+
+static inline struct btree *btree_path_node(struct btree_path *path,
+					    unsigned level)
+{
+	return level < BTREE_MAX_DEPTH ? path->l[level].b : NULL;
+}
+
+static inline bool btree_node_lock_seq_matches(const struct btree_path *path,
+					const struct btree *b, unsigned level)
+{
+	return path->l[level].lock_seq == six_lock_seq(&b->c.lock);
+}
+
+static inline struct btree *btree_node_parent(struct btree_path *path,
+					      struct btree *b)
+{
+	return btree_path_node(path, b->c.level + 1);
+}
+
+/* Iterate over paths within a transaction: */
+
+void __bch2_btree_trans_sort_paths(struct btree_trans *);
+
+static inline void btree_trans_sort_paths(struct btree_trans *trans)
+{
+	if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+	    trans->paths_sorted)
+		return;
+	__bch2_btree_trans_sort_paths(trans);
+}
+
+static inline struct btree_path *
+__trans_next_path(struct btree_trans *trans, unsigned idx)
+{
+	u64 l;
+
+	if (idx == BTREE_ITER_MAX)
+		return NULL;
+
+	l = trans->paths_allocated >> idx;
+	if (!l)
+		return NULL;
+
+	idx += __ffs64(l);
+	EBUG_ON(idx >= BTREE_ITER_MAX);
+	EBUG_ON(trans->paths[idx].idx != idx);
+	return &trans->paths[idx];
+}
+
+#define trans_for_each_path_from(_trans, _path, _start)			\
+	for (_path = __trans_next_path((_trans), _start);		\
+	     (_path);							\
+	     _path = __trans_next_path((_trans), (_path)->idx + 1))
+
+#define trans_for_each_path(_trans, _path)				\
+	trans_for_each_path_from(_trans, _path, 0)
+
+static inline struct btree_path *
+__trans_next_path_safe(struct btree_trans *trans, unsigned *idx)
+{
+	u64 l;
+
+	if (*idx == BTREE_ITER_MAX)
+		return NULL;
+
+	l = trans->paths_allocated >> *idx;
+	if (!l)
+		return NULL;
+
+	*idx += __ffs64(l);
+	EBUG_ON(*idx >= BTREE_ITER_MAX);
+	return &trans->paths[*idx];
+}
+
+/*
+ * This version is intended to be safe for use on a btree_trans that is owned by
+ * another thread, for bch2_btree_trans_to_text();
+ */
+#define trans_for_each_path_safe_from(_trans, _path, _idx, _start)	\
+	for (_idx = _start;						\
+	     (_path = __trans_next_path_safe((_trans), &_idx));		\
+	     _idx++)
+
+#define trans_for_each_path_safe(_trans, _path, _idx)			\
+	trans_for_each_path_safe_from(_trans, _path, _idx, 0)
+
+static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
+{
+	unsigned idx = path ? path->sorted_idx + 1 : 0;
+
+	EBUG_ON(idx > trans->nr_sorted);
+
+	return idx < trans->nr_sorted
+		? trans->paths + trans->sorted[idx]
+		: NULL;
+}
+
+static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path)
+{
+	unsigned idx = path ? path->sorted_idx : trans->nr_sorted;
+
+	return idx
+		? trans->paths + trans->sorted[idx - 1]
+		: NULL;
+}
+
+#define trans_for_each_path_inorder(_trans, _path, _i)			\
+	for (_i = 0;							\
+	     ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\
+	     _i++)
+
+#define trans_for_each_path_inorder_reverse(_trans, _path, _i)		\
+	for (_i = trans->nr_sorted - 1;					\
+	     ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) >= 0;\
+	     --_i)
+
+static inline bool __path_has_node(const struct btree_path *path,
+				   const struct btree *b)
+{
+	return path->l[b->c.level].b == b &&
+		btree_node_lock_seq_matches(path, b, b->c.level);
+}
+
+static inline struct btree_path *
+__trans_next_path_with_node(struct btree_trans *trans, struct btree *b,
+			    unsigned idx)
+{
+	struct btree_path *path = __trans_next_path(trans, idx);
+
+	while (path && !__path_has_node(path, b))
+		path = __trans_next_path(trans, path->idx + 1);
+
+	return path;
+}
+
+#define trans_for_each_path_with_node(_trans, _b, _path)		\
+	for (_path = __trans_next_path_with_node((_trans), (_b), 0);	\
+	     (_path);							\
+	     _path = __trans_next_path_with_node((_trans), (_b),	\
+						 (_path)->idx + 1))
+
+struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *,
+			 bool, unsigned long);
+
+static inline struct btree_path * __must_check
+bch2_btree_path_make_mut(struct btree_trans *trans,
+			 struct btree_path *path, bool intent,
+			 unsigned long ip)
+{
+	if (path->ref > 1 || path->preserve)
+		path = __bch2_btree_path_make_mut(trans, path, intent, ip);
+	path->should_be_locked = false;
+	return path;
+}
+
+struct btree_path * __must_check
+__bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *,
+			struct bpos, bool, unsigned long, int);
+
+static inline struct btree_path * __must_check
+bch2_btree_path_set_pos(struct btree_trans *trans,
+		   struct btree_path *path, struct bpos new_pos,
+		   bool intent, unsigned long ip)
+{
+	int cmp = bpos_cmp(new_pos, path->pos);
+
+	return cmp
+		? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip, cmp)
+		: path;
+}
+
+int __must_check bch2_btree_path_traverse_one(struct btree_trans *, struct btree_path *,
+					      unsigned, unsigned long);
+
+static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
+					  struct btree_path *path, unsigned flags)
+{
+	if (path->uptodate < BTREE_ITER_NEED_RELOCK)
+		return 0;
+
+	return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_);
+}
+
+int __must_check bch2_btree_path_traverse(struct btree_trans *,
+					  struct btree_path *, unsigned);
+struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
+				 unsigned, unsigned, unsigned, unsigned long);
+struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
+
+/*
+ * bch2_btree_path_peek_slot() for a cached iterator might return a key in a
+ * different snapshot:
+ */
+static inline struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u)
+{
+	struct bkey_s_c k = bch2_btree_path_peek_slot(path, u);
+
+	if (k.k && bpos_eq(path->pos, k.k->p))
+		return k;
+
+	bkey_init(u);
+	u->p = path->pos;
+	return (struct bkey_s_c) { u, NULL };
+}
+
+struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *,
+					struct btree_iter *, struct bpos);
+
+void bch2_btree_path_level_init(struct btree_trans *, struct btree_path *, struct btree *);
+
+int __bch2_trans_mutex_lock(struct btree_trans *, struct mutex *);
+
+static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock)
+{
+	return mutex_trylock(lock)
+		? 0
+		: __bch2_trans_mutex_lock(trans, lock);
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_trans_verify_paths(struct btree_trans *);
+void bch2_assert_pos_locked(struct btree_trans *, enum btree_id,
+			    struct bpos, bool);
+#else
+static inline void bch2_trans_verify_paths(struct btree_trans *trans) {}
+static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
+					  struct bpos pos, bool key_cache) {}
+#endif
+
+void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
+				      struct btree *, struct bkey_packed *);
+void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *,
+			      struct btree *, struct btree_node_iter *,
+			      struct bkey_packed *, unsigned, unsigned);
+
+int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
+
+void bch2_path_put(struct btree_trans *, struct btree_path *, bool);
+
+int bch2_trans_relock(struct btree_trans *);
+int bch2_trans_relock_notrace(struct btree_trans *);
+void bch2_trans_unlock(struct btree_trans *);
+bool bch2_trans_locked(struct btree_trans *);
+
+static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count)
+{
+	return restart_count != trans->restart_count
+		? -BCH_ERR_transaction_restart_nested
+		: 0;
+}
+
+void __noreturn bch2_trans_restart_error(struct btree_trans *, u32);
+
+static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans,
+						   u32 restart_count)
+{
+	if (trans_was_restarted(trans, restart_count))
+		bch2_trans_restart_error(trans, restart_count);
+}
+
+void __noreturn bch2_trans_in_restart_error(struct btree_trans *);
+
+static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans)
+{
+	if (trans->restarted)
+		bch2_trans_in_restart_error(trans);
+}
+
+__always_inline
+static int btree_trans_restart_nounlock(struct btree_trans *trans, int err)
+{
+	BUG_ON(err <= 0);
+	BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart));
+
+	trans->restarted = err;
+	trans->last_restarted_ip = _THIS_IP_;
+	return -err;
+}
+
+__always_inline
+static int btree_trans_restart(struct btree_trans *trans, int err)
+{
+	btree_trans_restart_nounlock(trans, err);
+	return -err;
+}
+
+bool bch2_btree_node_upgrade(struct btree_trans *,
+			     struct btree_path *, unsigned);
+
+void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned);
+
+static inline void bch2_btree_path_downgrade(struct btree_trans *trans,
+					     struct btree_path *path)
+{
+	unsigned new_locks_want = path->level + !!path->intent_ref;
+
+	if (path->locks_want > new_locks_want)
+		__bch2_btree_path_downgrade(trans, path, new_locks_want);
+}
+
+void bch2_trans_downgrade(struct btree_trans *);
+
+void bch2_trans_node_add(struct btree_trans *trans, struct btree *);
+void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
+
+int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter);
+int __must_check bch2_btree_iter_traverse(struct btree_iter *);
+
+struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *);
+struct btree *bch2_btree_iter_next_node(struct btree_iter *);
+
+struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos);
+struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
+
+struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *);
+
+static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+{
+	return bch2_btree_iter_peek_upto(iter, SPOS_MAX);
+}
+
+struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
+
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *);
+
+bool bch2_btree_iter_advance(struct btree_iter *);
+bool bch2_btree_iter_rewind(struct btree_iter *);
+
+static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+{
+	iter->k.type = KEY_TYPE_deleted;
+	iter->k.p.inode		= iter->pos.inode	= new_pos.inode;
+	iter->k.p.offset	= iter->pos.offset	= new_pos.offset;
+	iter->k.p.snapshot	= iter->pos.snapshot	= new_pos.snapshot;
+	iter->k.size = 0;
+}
+
+static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+{
+	if (unlikely(iter->update_path))
+		bch2_path_put(iter->trans, iter->update_path,
+			      iter->flags & BTREE_ITER_INTENT);
+	iter->update_path = NULL;
+
+	if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+		new_pos.snapshot = iter->snapshot;
+
+	__bch2_btree_iter_set_pos(iter, new_pos);
+}
+
+static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter)
+{
+	BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS));
+	iter->pos = bkey_start_pos(&iter->k);
+}
+
+static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot)
+{
+	struct bpos pos = iter->pos;
+
+	iter->snapshot = snapshot;
+	pos.snapshot = snapshot;
+	bch2_btree_iter_set_pos(iter, pos);
+}
+
+void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *);
+
+static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
+					       unsigned btree_id,
+					       unsigned flags)
+{
+	if (flags & BTREE_ITER_ALL_LEVELS)
+		flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS;
+
+	if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
+	    btree_node_type_is_extents(btree_id))
+		flags |= BTREE_ITER_IS_EXTENTS;
+
+	if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
+	    !btree_type_has_snapshots(btree_id))
+		flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+
+	if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+	    btree_type_has_snapshots(btree_id))
+		flags |= BTREE_ITER_FILTER_SNAPSHOTS;
+
+	if (trans->journal_replay_not_finished)
+		flags |= BTREE_ITER_WITH_JOURNAL;
+
+	return flags;
+}
+
+static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans,
+					     unsigned btree_id,
+					     unsigned flags)
+{
+	if (!btree_id_cached(trans->c, btree_id)) {
+		flags &= ~BTREE_ITER_CACHED;
+		flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+	} else if (!(flags & BTREE_ITER_CACHED))
+		flags |= BTREE_ITER_WITH_KEY_CACHE;
+
+	return __bch2_btree_iter_flags(trans, btree_id, flags);
+}
+
+static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
+					  struct btree_iter *iter,
+					  unsigned btree_id, struct bpos pos,
+					  unsigned locks_want,
+					  unsigned depth,
+					  unsigned flags,
+					  unsigned long ip)
+{
+	memset(iter, 0, sizeof(*iter));
+	iter->trans	= trans;
+	iter->btree_id	= btree_id;
+	iter->flags	= flags;
+	iter->snapshot	= pos.snapshot;
+	iter->pos	= pos;
+	iter->k.p	= pos;
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+	iter->ip_allocated = ip;
+#endif
+	iter->path = bch2_path_get(trans, btree_id, iter->pos,
+				   locks_want, depth, flags, ip);
+}
+
+void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *,
+			  enum btree_id, struct bpos, unsigned);
+
+static inline void bch2_trans_iter_init(struct btree_trans *trans,
+			  struct btree_iter *iter,
+			  unsigned btree_id, struct bpos pos,
+			  unsigned flags)
+{
+	if (__builtin_constant_p(btree_id) &&
+	    __builtin_constant_p(flags))
+		bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
+				bch2_btree_iter_flags(trans, btree_id, flags),
+				_THIS_IP_);
+	else
+		bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags);
+}
+
+void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *,
+			       enum btree_id, struct bpos,
+			       unsigned, unsigned, unsigned);
+void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *);
+
+static inline void set_btree_iter_dontneed(struct btree_iter *iter)
+{
+	if (!iter->trans->restarted)
+		iter->path->preserve = false;
+}
+
+void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
+
+static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
+{
+	size = roundup(size, 8);
+
+	if (likely(trans->mem_top + size <= trans->mem_bytes)) {
+		void *p = trans->mem + trans->mem_top;
+
+		trans->mem_top += size;
+		memset(p, 0, size);
+		return p;
+	} else {
+		return __bch2_trans_kmalloc(trans, size);
+	}
+}
+
+static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size)
+{
+	size = roundup(size, 8);
+
+	if (likely(trans->mem_top + size <= trans->mem_bytes)) {
+		void *p = trans->mem + trans->mem_top;
+
+		trans->mem_top += size;
+		return p;
+	} else {
+		return __bch2_trans_kmalloc(trans, size);
+	}
+}
+
+static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans,
+				struct btree_iter *iter,
+				unsigned btree_id, struct bpos pos,
+				unsigned flags, unsigned type)
+{
+	struct bkey_s_c k;
+
+	bch2_trans_iter_init(trans, iter, btree_id, pos, flags);
+	k = bch2_btree_iter_peek_slot(iter);
+
+	if (!bkey_err(k) && type && k.k->type != type)
+		k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch);
+	if (unlikely(bkey_err(k)))
+		bch2_trans_iter_exit(trans, iter);
+	return k;
+}
+
+static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans,
+				struct btree_iter *iter,
+				unsigned btree_id, struct bpos pos,
+				unsigned flags)
+{
+	return __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags, 0);
+}
+
+#define bch2_bkey_get_iter_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\
+	bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter,			\
+				       _btree_id, _pos, _flags, KEY_TYPE_##_type))
+
+/*
+ * Copy the value of the key at @pos into @val, zero-padded to @val_size bytes.
+ * Returns 0, or the lookup error (which includes a key type mismatch).
+ */
+static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
+				unsigned btree_id, struct bpos pos, unsigned flags,
+				unsigned type, unsigned val_size, void *val)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type);
+	int ret = bkey_err(k);
+
+	if (!ret) {
+		unsigned b = min_t(unsigned, bkey_val_bytes(k.k), val_size);
+
+		memcpy(val, k.v, b);
+		if (unlikely(b < val_size))
+			memset(val + b, 0, val_size - b);
+		bch2_trans_iter_exit(trans, &iter);
+	}
+	return ret;
+}
+
+#define bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, _type, _val)\
+	__bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags,	\
+				  KEY_TYPE_##_type, sizeof(*_val), _val)
+
+u32 bch2_trans_begin(struct btree_trans *);
+
+/*
+ * XXX
+ * this does not handle transaction restarts from bch2_btree_iter_next_node()
+ * correctly
+ */
+#define __for_each_btree_node(_trans, _iter, _btree_id, _start,		\
+			      _locks_want, _depth, _flags, _b, _ret)	\
+	for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id),	\
+				_start, _locks_want, _depth, _flags);	\
+	     (_b) = bch2_btree_iter_peek_node_and_restart(&(_iter)),	\
+	     !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b);			\
+	     (_b) = bch2_btree_iter_next_node(&(_iter)))
+
+#define for_each_btree_node(_trans, _iter, _btree_id, _start,		\
+			    _flags, _b, _ret)				\
+	__for_each_btree_node(_trans, _iter, _btree_id, _start,		\
+			      0, 0, _flags, _b, _ret)
+
+static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter,
+							     unsigned flags)
+{
+	BUG_ON(flags & BTREE_ITER_ALL_LEVELS);
+
+	return  flags & BTREE_ITER_SLOTS      ? bch2_btree_iter_peek_slot(iter) :
+						bch2_btree_iter_peek_prev(iter);
+}
+
+static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
+							unsigned flags)
+{
+	return  flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) :
+		flags & BTREE_ITER_SLOTS      ? bch2_btree_iter_peek_slot(iter) :
+						bch2_btree_iter_peek(iter);
+}
+
+static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter,
+							     struct bpos end,
+							     unsigned flags)
+{
+	if (!(flags & BTREE_ITER_SLOTS))
+		return bch2_btree_iter_peek_upto(iter, end);
+
+	if (bkey_gt(iter->pos, end))
+		return bkey_s_c_null;
+
+	return bch2_btree_iter_peek_slot(iter);
+}
+
+static inline int btree_trans_too_many_iters(struct btree_trans *trans)
+{
+	if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8) {
+		trace_and_count(trans->c, trans_restart_too_many_iters, trans, _THIS_IP_);
+		return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
+	}
+
+	return 0;
+}
+
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
+
+static inline struct bkey_s_c
+__bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
+				   struct btree_iter *iter, unsigned flags)
+{
+	struct bkey_s_c k;
+
+	while (btree_trans_too_many_iters(trans) ||
+	       (k = bch2_btree_iter_peek_type(iter, flags),
+		bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
+		bch2_trans_begin(trans);
+
+	return k;
+}
+
+static inline struct bkey_s_c
+__bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
+					struct btree_iter *iter,
+					struct bpos end,
+					unsigned flags)
+{
+	struct bkey_s_c k;
+
+	while (btree_trans_too_many_iters(trans) ||
+	       (k = bch2_btree_iter_peek_upto_type(iter, end, flags),
+		bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
+		bch2_trans_begin(trans);
+
+	return k;
+}
+
+#define lockrestart_do(_trans, _do)					\
+({									\
+	u32 _restart_count;						\
+	int _ret2;							\
+									\
+	do {								\
+		_restart_count = bch2_trans_begin(_trans);		\
+		_ret2 = (_do);						\
+	} while (bch2_err_matches(_ret2, BCH_ERR_transaction_restart));	\
+									\
+	if (!_ret2)							\
+		bch2_trans_verify_not_restarted(_trans, _restart_count);\
+									\
+	_ret2;								\
+})
+
+/*
+ * nested_lockrestart_do(), nested_commit_do():
+ *
+ * These are like lockrestart_do() and commit_do(), with two differences:
+ *
+ *  - We don't call bch2_trans_begin() unless we had a transaction restart
+ *  - We return -BCH_ERR_transaction_restart_nested if we succeeded after a
+ *  transaction restart
+ */
+#define nested_lockrestart_do(_trans, _do)				\
+({									\
+	u32 _restart_count, _orig_restart_count;			\
+	int _ret2;							\
+									\
+	_restart_count = _orig_restart_count = (_trans)->restart_count;	\
+									\
+	while (bch2_err_matches(_ret2 = (_do), BCH_ERR_transaction_restart))\
+		_restart_count = bch2_trans_begin(_trans);		\
+									\
+	if (!_ret2)							\
+		bch2_trans_verify_not_restarted(_trans, _restart_count);\
+	/* Compare with the count on entry so restarts handled here are reported: */\
+	_ret2 ?: trans_was_restarted(_trans, _orig_restart_count);	\
+})
+
+#define for_each_btree_key2(_trans, _iter, _btree_id,			\
+			    _start, _flags, _k, _do)			\
+({									\
+	int _ret3 = 0;							\
+									\
+	bch2_trans_iter_init((_trans), &(_iter), (_btree_id),		\
+			     (_start), (_flags));			\
+									\
+	while (1) {							\
+		u32 _restart_count = bch2_trans_begin(_trans);		\
+									\
+		_ret3 = 0;						\
+		(_k) = bch2_btree_iter_peek_type(&(_iter), (_flags));	\
+		if (!(_k).k)						\
+			break;						\
+									\
+		_ret3 = bkey_err(_k) ?: (_do);				\
+		if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
+			continue;					\
+		if (_ret3)						\
+			break;						\
+		bch2_trans_verify_not_restarted(_trans, _restart_count);\
+		if (!bch2_btree_iter_advance(&(_iter)))			\
+			break;						\
+	}								\
+									\
+	bch2_trans_iter_exit((_trans), &(_iter));			\
+	_ret3;								\
+})
+
+#define for_each_btree_key2_upto(_trans, _iter, _btree_id,		\
+			    _start, _end, _flags, _k, _do)		\
+({									\
+	int _ret3 = 0;							\
+									\
+	bch2_trans_iter_init((_trans), &(_iter), (_btree_id),		\
+			     (_start), (_flags));			\
+									\
+	while (1) {							\
+		u32 _restart_count = bch2_trans_begin(_trans);		\
+									\
+		_ret3 = 0;						\
+		(_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, (_flags));\
+		if (!(_k).k)						\
+			break;						\
+									\
+		_ret3 = bkey_err(_k) ?: (_do);				\
+		if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
+			continue;					\
+		if (_ret3)						\
+			break;						\
+		bch2_trans_verify_not_restarted(_trans, _restart_count);\
+		if (!bch2_btree_iter_advance(&(_iter)))			\
+			break;						\
+	}								\
+									\
+	bch2_trans_iter_exit((_trans), &(_iter));			\
+	_ret3;								\
+})
+
+#define for_each_btree_key_reverse(_trans, _iter, _btree_id,		\
+				   _start, _flags, _k, _do)		\
+({									\
+	int _ret3 = 0;							\
+									\
+	bch2_trans_iter_init((_trans), &(_iter), (_btree_id),		\
+			     (_start), (_flags));			\
+									\
+	while (1) {							\
+		u32 _restart_count = bch2_trans_begin(_trans);		\
+		(_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\
+		if (!(_k).k) {						\
+			_ret3 = 0;					\
+			break;						\
+		}							\
+									\
+		_ret3 = bkey_err(_k) ?: (_do);				\
+		if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
+			continue;					\
+		if (_ret3)						\
+			break;						\
+		bch2_trans_verify_not_restarted(_trans, _restart_count);\
+		if (!bch2_btree_iter_rewind(&(_iter)))			\
+			break;						\
+	}								\
+									\
+	bch2_trans_iter_exit((_trans), &(_iter));			\
+	_ret3;								\
+})
+
+#define for_each_btree_key_commit(_trans, _iter, _btree_id,		\
+				  _start, _iter_flags, _k,		\
+				  _disk_res, _journal_seq, _commit_flags,\
+				  _do)					\
+	for_each_btree_key2(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
+			    (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
+					(_journal_seq), (_commit_flags)))
+
+#define for_each_btree_key_reverse_commit(_trans, _iter, _btree_id,	\
+				  _start, _iter_flags, _k,		\
+				  _disk_res, _journal_seq, _commit_flags,\
+				  _do)					\
+	for_each_btree_key_reverse(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
+			    (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
+					(_journal_seq), (_commit_flags)))
+
+#define for_each_btree_key_upto_commit(_trans, _iter, _btree_id,	\
+				  _start, _end, _iter_flags, _k,	\
+				  _disk_res, _journal_seq, _commit_flags,\
+				  _do)					\
+	for_each_btree_key2_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\
+			    (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
+					(_journal_seq), (_commit_flags)))
+
+#define for_each_btree_key(_trans, _iter, _btree_id,			\
+			   _start, _flags, _k, _ret)			\
+	for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id),	\
+				  (_start), (_flags));			\
+	     (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
+	     !((_ret) = bkey_err(_k)) && (_k).k;			\
+	     bch2_btree_iter_advance(&(_iter)))
+
+#define for_each_btree_key_upto(_trans, _iter, _btree_id,		\
+				_start, _end, _flags, _k, _ret)		\
+	for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id),	\
+				  (_start), (_flags));			\
+	     (_k) = __bch2_btree_iter_peek_upto_and_restart((_trans),	\
+						&(_iter), _end, _flags),\
+	     !((_ret) = bkey_err(_k)) && (_k).k;			\
+	     bch2_btree_iter_advance(&(_iter)))
+
+#define for_each_btree_key_norestart(_trans, _iter, _btree_id,		\
+			   _start, _flags, _k, _ret)			\
+	for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id),	\
+				  (_start), (_flags));			\
+	     (_k) = bch2_btree_iter_peek_type(&(_iter), _flags),	\
+	     !((_ret) = bkey_err(_k)) && (_k).k;			\
+	     bch2_btree_iter_advance(&(_iter)))
+
+#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id,	\
+			   _start, _end, _flags, _k, _ret)		\
+	for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id),	\
+				  (_start), (_flags));			\
+	     (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\
+	     !((_ret) = bkey_err(_k)) && (_k).k;			\
+	     bch2_btree_iter_advance(&(_iter)))
+
+#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret)	\
+	for (;								\
+	     (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
+	     !((_ret) = bkey_err(_k)) && (_k).k;			\
+	     bch2_btree_iter_advance(&(_iter)))
+
+#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret)	\
+	for (;								\
+	     (_k) = bch2_btree_iter_peek_type(&(_iter), _flags),	\
+	     !((_ret) = bkey_err(_k)) && (_k).k;			\
+	     bch2_btree_iter_advance(&(_iter)))
+
+#define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\
+	for (;									\
+	     (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),	\
+	     !((_ret) = bkey_err(_k)) && (_k).k;				\
+	     bch2_btree_iter_advance(&(_iter)))
+
+#define drop_locks_do(_trans, _do)					\
+({									\
+	bch2_trans_unlock(_trans);	/* _do is evaluated unlocked */	\
+	(_do) ?: bch2_trans_relock(_trans); /* relock only if _do gave 0 */\
+})
+
+#define allocate_dropping_locks_errcode(_trans, _do)			\
+({									\
+	gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN;				\
+	int _ret = (_do);						\
+	/* _do is expected to use _gfp: on ENOMEM, retry with locks dropped */\
+	if (bch2_err_matches(_ret, ENOMEM)) {				\
+		_gfp = GFP_KERNEL;					\
+		_ret = drop_locks_do(_trans, _do);			\
+	}								\
+	_ret;								\
+})
+
+#define allocate_dropping_locks(_trans, _ret, _do)			\
+({									\
+	gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN;				\
+	typeof(_do) _p = (_do);						\
+	/* _do is expected to use _gfp: on failure, retry with locks dropped */\
+	(_ret) = 0;							\
+	if (unlikely(!_p)) {						\
+		_gfp = GFP_KERNEL;					\
+		(_ret) = drop_locks_do(_trans, ((_p = (_do)), 0));	\
+	}								\
+	_p;								\
+})
+
+/* new multiple iterator interface: */
+
+void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
+void bch2_btree_path_to_text(struct printbuf *, struct btree_path *);
+void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
+void bch2_dump_trans_updates(struct btree_trans *);
+void bch2_dump_trans_paths_updates(struct btree_trans *);
+
+struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
+void bch2_trans_put(struct btree_trans *);
+
+extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
+unsigned bch2_trans_get_fn_idx(const char *);
+
+#define bch2_trans_get(_c)						\
+({									\
+	static unsigned trans_fn_idx;					\
+									\
+	if (unlikely(!trans_fn_idx))					\
+		trans_fn_idx = bch2_trans_get_fn_idx(__func__);		\
+	__bch2_trans_get(_c, trans_fn_idx);				\
+})
+
+void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *);
+
+void bch2_fs_btree_iter_exit(struct bch_fs *);
+int bch2_fs_btree_iter_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_ITER_H */
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
new file mode 100644
index 000000000000..58a981bcf3aa
--- /dev/null
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -0,0 +1,531 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bset.h"
+#include "btree_journal_iter.h"
+#include "journal_io.h"
+
+#include <linux/sort.h>
+
+/*
+ * For managing keys we read from the journal: until journal replay works,
+ * normal btree lookups need to be able to find and return keys from the
+ * journal where they overwrite what's in the btree, so we have a special
+ * iterator and operations for the regular btree iter code to use:
+ */
+
+/* Three-way compare of (btree id, level, pos) against @r, in that order of priority. */
+static int __journal_key_cmp(enum btree_id l_btree_id, unsigned l_level,
+			     struct bpos l_pos, const struct journal_key *r)
+{
+	int cmp = cmp_int(l_btree_id, r->btree_id) ?: cmp_int(l_level, r->level);
+
+	/* Position only decides between keys of the same btree and level: */
+	return cmp ?: bpos_cmp(l_pos, r->k->k.p);
+}
+
+static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
+{
+	return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);	/* @l as (btree, level, pos) */
+}
+
+/* Map a logical key index to its slot in the gap buffer keys->d[]. */
+static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
+{
+	/* Indices at or past the gap are shifted by the number of unused slots: */
+	size_t shift = idx < keys->gap ? 0 : keys->size - keys->nr;
+
+	return idx + shift;
+}
+
+static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
+{
+	return &keys->d[idx_to_pos(keys, idx)];	/* logical index -> key, skipping the gap */
+}
+
+/* Binary search: logical index of the first key >= (id, level, pos), or keys->nr. */
+static size_t __bch2_journal_key_search(struct journal_keys *keys,
+					enum btree_id id, unsigned level,
+					struct bpos pos)
+{
+	size_t lo = 0, hi = keys->nr, mid;
+
+	while (lo < hi) {
+		mid = lo + ((hi - lo) >> 1);
+		if (__journal_key_cmp(id, level, pos, idx_to_key(keys, mid)) <= 0)
+			hi = mid;
+		else
+			lo = mid + 1;
+	}
+
+	/* The result must not sort before @pos, and its predecessor must: */
+	BUG_ON(lo < keys->nr &&
+	       __journal_key_cmp(id, level, pos, idx_to_key(keys, lo)) > 0);
+	BUG_ON(lo &&
+	       __journal_key_cmp(id, level, pos, idx_to_key(keys, lo - 1)) <= 0);
+	return lo;
+}
+
+/* As __bch2_journal_key_search(), but converted to a slot in keys->d[]. */
+static size_t bch2_journal_key_search(struct journal_keys *keys, enum btree_id id,
+				      unsigned level, struct bpos pos)
+{
+	return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
+}
+
+/* First non-overwritten key in [pos, end_pos], or NULL; *idx is a cached hint, 0 = search. */
+struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
+					   unsigned level, struct bpos pos,
+					   struct bpos end_pos, size_t *idx)
+{
+	struct journal_keys *keys = &c->journal_keys;
+	unsigned iters = 0;
+	struct journal_key *k;
+search:
+	if (!*idx)
+		*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
+
+	while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
+		if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
+			return NULL;
+
+		if (__journal_key_cmp(btree_id, level, pos, k) <= 0) {
+			if (!k->overwritten)
+				return k->k;
+		} else if (++iters == 10) {
+			/* Stale hint well before @pos: search again. Overwritten keys never count, or we would loop forever: */
+			*idx = 0;
+			goto search;
+		}
+		(*idx)++;
+	}
+
+	return NULL;
+}
+
+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
+					   unsigned level, struct bpos pos)
+{
+	size_t hint = 0;	/* 0: no cached index, always search */
+
+	return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &hint);
+}
+
+static void journal_iters_fix(struct bch_fs *c)
+{
+	struct journal_keys *keys = &c->journal_keys;
+	/* The key we just inserted is immediately before the gap: */
+	size_t inserted = keys->gap - 1;
+	size_t after_gap = keys->gap + (keys->size - keys->nr);
+	struct journal_iter *iter;
+
+	/*
+	 * An iterator sitting on the first slot after the gap is moved back
+	 * onto the key we just inserted - if moving it back was unnecessary,
+	 * bch2_btree_and_journal_iter_peek() will handle that:
+	 */
+	list_for_each_entry(iter, &c->journal_iters, list)
+		if (iter->idx == after_gap)
+			iter->idx = inserted;
+}
+
+/* Re-base every registered iterator's slot when the gap moves from @old_gap to @new_gap. */
+static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
+{
+	struct journal_keys *keys = &c->journal_keys;
+	size_t unused = keys->size - keys->nr;
+	struct journal_iter *it;
+	size_t i;
+
+	list_for_each_entry(it, &c->journal_iters, list) {
+		i = it->idx > old_gap ? it->idx - unused : it->idx;	/* undo the old gap offset */
+		it->idx = i >= new_gap ? i + unused : i;		/* apply the new one */
+	}
+}
+
+int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
+				 unsigned level, struct bkey_i *k)	/* on success @k is owned by journal_keys */
+{
+	struct journal_key n = {
+		.btree_id	= id,
+		.level		= level,
+		.k		= k,
+		.allocated	= true,	/* @k is kfree()d when replaced or on teardown */
+		/*
+		 * Ensure these keys are done last by journal replay, to unblock
+		 * journal reclaim:
+		 */
+		.journal_seq	= U32_MAX,	/* NOTE(review): sort path stores a 64-bit seq here - is U32_MAX large enough? */
+	};
+	struct journal_keys *keys = &c->journal_keys;
+	size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);	/* slot in keys->d[] */
+
+	BUG_ON(test_bit(BCH_FS_RW, &c->flags));	/* must not run once BCH_FS_RW is set */
+
+	if (idx < keys->size &&
+	    journal_key_cmp(&n, &keys->d[idx]) == 0) {	/* same btree/level/pos: replace in place */
+		if (keys->d[idx].allocated)
+			kfree(keys->d[idx].k);
+		keys->d[idx] = n;
+		return 0;
+	}
+
+	if (idx > keys->gap)	/* slot -> logical index: subtract the gap size */
+		idx -= keys->size - keys->nr;
+
+	if (keys->nr == keys->size) {	/* no free slots left: grow the array */
+		struct journal_keys new_keys = {
+			.nr			= keys->nr,
+			.size			= max_t(size_t, keys->size, 8) * 2,
+		};
+
+		new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL);
+		if (!new_keys.d) {
+			bch_err(c, "%s: error allocating new key array (size %zu)",
+				__func__, new_keys.size);
+			return -BCH_ERR_ENOMEM_journal_key_insert;	/* @k not consumed; caller still owns it */
+		}
+
+		/* Since @keys was full, there was no gap: */
+		memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
+		kvfree(keys->d);
+		*keys = new_keys;
+
+		/* And now the gap is at the end: */
+		keys->gap = keys->nr;
+	}
+
+	journal_iters_move_gap(c, keys->gap, idx);	/* fix up iterators before the array itself moves */
+
+	move_gap(keys->d, keys->nr, keys->size, keys->gap, idx);
+	keys->gap = idx;
+
+	keys->nr++;
+	keys->d[keys->gap++] = n;	/* new key fills the first gap slot; gap shrinks by one */
+
+	journal_iters_fix(c);
+
+	return 0;
+}
+
+/*
+ * Can only be used from the recovery thread while we're still RO - can't be
+ * used once we've got RW, as journal_keys is at that point used by multiple
+ * threads:
+ */
+int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
+			    unsigned level, struct bkey_i *k)
+{
+	struct bkey_i *copy = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
+	int err;
+
+	if (!copy)
+		return -BCH_ERR_ENOMEM_journal_key_insert;
+
+	/* Insert a private copy of @k; it is ours to free only if the insert fails: */
+	bkey_copy(copy, k);
+	err = bch2_journal_key_insert_take(c, id, level, copy);
+	if (err)
+		kfree(copy);
+	return err;
+}
+
+/* Delete the key at @pos by inserting a freshly bkey_init()ed key at that position. */
+int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
+			    unsigned level, struct bpos pos)
+{
+	struct bkey_i tombstone;
+
+	bkey_init(&tombstone.k);
+	tombstone.k.p = pos;
+	return bch2_journal_key_insert(c, id, level, &tombstone);
+}
+
+/* Flag the journal key at exactly (btree, level, pos), if there is one, as overwritten. */
+void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
+				  unsigned level, struct bpos pos)
+{
+	struct journal_keys *keys = &c->journal_keys;
+	size_t slot = bch2_journal_key_search(keys, btree, level, pos);
+	struct journal_key *k = keys->d + slot;
+
+	if (slot < keys->size && k->btree_id == btree &&
+	    k->level == level && bpos_eq(k->k->k.p, pos))
+		k->overwritten = true;
+}
+
+/* Step to the next slot of keys->d[], hopping over the gap; no-op once idx reaches size. */
+static void bch2_journal_iter_advance(struct journal_iter *iter)
+{
+	struct journal_keys *keys = iter->keys;
+
+	if (iter->idx < keys->size && ++iter->idx == keys->gap)
+		iter->idx += keys->size - keys->nr;
+}
+
+/* Advance past overwritten keys; return the next key of this btree/level, or bkey_s_c_null. */
+static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
+{
+	struct journal_keys *keys = iter->keys;
+	struct journal_key *k;
+
+	for (k = keys->d + iter->idx;
+	     k < keys->d + keys->size &&
+	     k->btree_id == iter->btree_id &&
+	     k->level == iter->level;
+	     bch2_journal_iter_advance(iter), k = keys->d + iter->idx)
+		if (!k->overwritten)
+			return bkey_i_to_s_c(k->k);
+
+	return bkey_s_c_null;
+}
+
+static void bch2_journal_iter_exit(struct journal_iter *iter)
+{
+	list_del(&iter->list);	/* unlink from whatever list the iterator is on */
+}
+
+/* Point @iter at the first journal key >= (id, level, pos); iter->list is left to the caller. */
+static void bch2_journal_iter_init(struct bch_fs *c, struct journal_iter *iter,
+				   enum btree_id id, unsigned level,
+				   struct bpos pos)
+{
+	iter->keys	= &c->journal_keys;
+	iter->idx	= bch2_journal_key_search(iter->keys, id, level, pos);
+	iter->btree_id	= id;
+	iter->level	= level;
+}
+
+static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
+{
+	return bch2_btree_node_iter_peek_unpack(&iter->node_iter,	/* btree side only; journal keys not consulted */
+						iter->b, &iter->unpacked);
+}
+
+static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
+{
+	bch2_btree_node_iter_advance(&iter->node_iter, iter->b);	/* steps the btree node iterator only */
+}
+
+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
+{
+	if (!bpos_eq(iter->pos, SPOS_MAX))
+		iter->pos = bpos_successor(iter->pos);
+	else
+		iter->at_end = true;	/* already at SPOS_MAX: flag the end instead of stepping */
+}
+
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
+{
+	struct bkey_s_c btree_k, journal_k, ret;	/* merged view: btree node keys overlaid with journal keys */
+again:
+	if (iter->at_end)
+		return bkey_s_c_null;
+
+	while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
+	       bpos_lt(btree_k.k->p, iter->pos))	/* skip btree keys before iter->pos */
+		bch2_journal_iter_advance_btree(iter);
+
+	while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
+	       bpos_lt(journal_k.k->p, iter->pos))	/* likewise for journal keys */
+		bch2_journal_iter_advance(&iter->journal);
+
+	ret = journal_k.k &&
+		(!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))	/* on a tie the journal key wins */
+		? journal_k
+		: btree_k;
+
+	if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))	/* beyond this node's max_key */
+		ret = bkey_s_c_null;
+
+	if (ret.k) {
+		iter->pos = ret.k->p;
+		if (bkey_deleted(ret.k)) {	/* deleted keys are stepped over, never returned */
+			bch2_btree_and_journal_iter_advance(iter);
+			goto again;
+		}
+	} else {
+		iter->pos = SPOS_MAX;	/* exhausted: later calls return null via at_end */
+		iter->at_end = true;
+	}
+
+	return ret;
+}
+
+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
+{
+	bch2_journal_iter_exit(&iter->journal);	/* only the journal half needs teardown (list unlink) */
+}
+
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+						  struct bch_fs *c,
+						  struct btree *b,
+						  struct btree_node_iter node_iter,
+						  struct bpos pos)
+{
+	memset(iter, 0, sizeof(*iter));
+
+	iter->b = b;
+	iter->node_iter = node_iter;	/* taken by value: the caller's iterator is copied */
+	bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);	/* @pos seeds the journal search */
+	INIT_LIST_HEAD(&iter->journal.list);	/* self-linked, so _exit()'s list_del() works even if never registered */
+	iter->pos = b->data->min_key;	/* NOTE(review): starts at the node's min_key, not @pos - confirm intended */
+	iter->at_end = false;
+}
+
+/*
+ * this version is used by btree_gc before filesystem has gone RW and
+ * multithreaded, so uses the journal_iters list:
+ */
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+						struct bch_fs *c,
+						struct btree *b)
+{
+	struct btree_node_iter from_start;
+
+	bch2_btree_node_iter_init_from_start(&from_start, b);
+	__bch2_btree_and_journal_iter_init_node_iter(iter, c, b, from_start, b->data->min_key);
+	list_add(&iter->journal.list, &c->journal_iters);	/* registered so key inserts can fix up journal.idx */
+}
+
+/* sort and dedup all keys in the journal: */
+
+/* Free every journal entry held in c->journal_entries, then the radix tree itself. */
+void bch2_journal_entries_free(struct bch_fs *c)
+{
+	struct genradix_iter iter;
+	struct journal_replay **slot;
+
+	genradix_for_each(&c->journal_entries, iter, slot)
+		if (*slot)
+			kvpfree(*slot, offsetof(struct journal_replay, j) + vstruct_bytes(&(*slot)->j));
+	genradix_free(&c->journal_entries);
+}
+
+/*
+ * When keys compare equal, oldest compares first:
+ */
+static int journal_sort_key_cmp(const void *_l, const void *_r)
+{
+	const struct journal_key *a = _l, *b = _r;
+	int cmp = journal_key_cmp(a, b);
+
+	if (!cmp)	/* same position: order by journal sequence, then offset */
+		cmp = cmp_int(a->journal_seq, b->journal_seq);
+	return cmp ?: cmp_int(a->journal_offset, b->journal_offset);
+}
+
+void bch2_journal_keys_free(struct journal_keys *keys)
+{
+	size_t i;
+
+	/* Push the gap to the end so the live keys are contiguous in d[0..nr): */
+	move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+	keys->gap = keys->nr;
+
+	for (i = 0; i < keys->nr; i++)
+		if (keys->d[i].allocated)
+			kfree(keys->d[i].k);
+	kvfree(keys->d);
+	keys->d = NULL;
+	keys->nr = keys->gap = keys->size = 0;
+}
+
+/* Sort keys->d[0..nr), then keep only the last (newest) key of each btree/level/pos run. */
+static void __journal_keys_sort(struct journal_keys *keys)
+{
+	struct journal_key *in, *out = keys->d, *end;
+
+	sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL);
+
+	end = keys->d + keys->nr;
+	for (in = keys->d; in < end; in++) {
+		if (in + 1 < end &&
+		    in[0].btree_id == in[1].btree_id &&
+		    in[0].level == in[1].level &&
+		    bpos_eq(in[0].k->k.p, in[1].k->k.p))
+			continue;	/* superseded by the next key */
+		*out++ = *in;
+	}
+
+	keys->nr = out - keys->d;
+}
+
+int bch2_journal_keys_sort(struct bch_fs *c)
+{
+	struct genradix_iter iter;
+	struct journal_replay *i, **_i;
+	struct jset_entry *entry;
+	struct bkey_i *k;
+	struct journal_keys *keys = &c->journal_keys;
+	size_t nr_keys = 0, nr_read = 0;
+
+	genradix_for_each(&c->journal_entries, iter, _i) {	/* pass 1: count keys to size the array */
+		i = *_i;
+
+		if (!i || i->ignore)
+			continue;
+
+		for_each_jset_key(k, entry, &i->j)
+			nr_keys++;
+	}
+
+	if (!nr_keys)
+		return 0;
+
+	keys->size = roundup_pow_of_two(nr_keys);
+
+	keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
+	if (!keys->d) {
+		bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath",
+			nr_keys);
+
+		do {	/* slowpath: halve the buffer until it fits or gets too small */
+			keys->size >>= 1;
+			keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
+		} while (!keys->d && keys->size > nr_keys / 8);
+
+		if (!keys->d || !keys->size) {	/* size may hit 0; a zero-sized buffer cannot hold a key */
+			bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting",
+				keys->size);
+			return -BCH_ERR_ENOMEM_journal_keys_sort;
+		}
+	}
+
+	genradix_for_each(&c->journal_entries, iter, _i) {	/* pass 2: collect the keys */
+		i = *_i;
+
+		if (!i || i->ignore)
+			continue;
+
+		cond_resched();
+
+		for_each_jset_key(k, entry, &i->j) {
+			if (keys->nr == keys->size) {	/* only reachable on the slowpath: compact in place */
+				__journal_keys_sort(keys);
+
+				if (keys->nr > keys->size * 7 / 8) {
+					bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu",
+						keys->nr, keys->size, nr_read, nr_keys);
+					return -BCH_ERR_ENOMEM_journal_keys_sort;
+				}
+			}
+
+			keys->d[keys->nr++] = (struct journal_key) {
+				.btree_id	= entry->btree_id,
+				.level		= entry->level,
+				.k		= k,	/* points into the journal entry; .allocated stays false */
+				.journal_seq	= le64_to_cpu(i->j.seq),
+				.journal_offset	= k->_data - i->j._data,
+			};
+
+			nr_read++;
+		}
+	}
+
+	__journal_keys_sort(keys);
+	keys->gap = keys->nr;	/* all free space sits after the last key */
+
+	bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr);
+	return 0;
+}
diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h
new file mode 100644
index 000000000000..5d64e7e22f26
--- /dev/null
+++ b/fs/bcachefs/btree_journal_iter.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H
+#define _BCACHEFS_BTREE_JOURNAL_ITER_H
+
+struct journal_iter {
+	struct list_head	list;		/* link in bch_fs.journal_iters when registered */
+	enum btree_id		btree_id;	/* iteration stops at the first key of another btree... */
+	unsigned		level;		/* ...or of another level */
+	size_t			idx;		/* slot in keys->d[] (gap included), not a logical index */
+	struct journal_keys	*keys;
+};
+
+/*
+ * Iterate over keys in the btree, with keys from the journal overlaid on top:
+ */
+
+struct btree_and_journal_iter {
+	struct btree		*b;		/* node being iterated; results are capped at its max_key */
+	struct btree_node_iter	node_iter;	/* iterator over @b's own keys */
+	struct bkey		unpacked;	/* scratch space for the btree key peeked last */
+
+	struct journal_iter	journal;	/* iterator over the overlaid journal keys */
+	struct bpos		pos;		/* peek skips keys before this position */
+	bool			at_end;		/* set once exhausted; peek then returns null */
+};
+
+struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
+				unsigned, struct bpos, struct bpos, size_t *);
+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
+					   unsigned, struct bpos);
+
+int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
+				 unsigned, struct bkey_i *);
+int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
+			    unsigned, struct bkey_i *);
+int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
+			    unsigned, struct bpos);
+void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id,
+				  unsigned, struct bpos);
+
+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
+
+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
+				struct bch_fs *, struct btree *,
+				struct btree_node_iter, struct bpos);
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
+						struct bch_fs *,
+						struct btree *);
+
+void bch2_journal_keys_free(struct journal_keys *);
+void bch2_journal_entries_free(struct bch_fs *);
+
+int bch2_journal_keys_sort(struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
new file mode 100644
index 000000000000..29a0b566a4fe
--- /dev/null
+++ b/fs/bcachefs/btree_key_cache.c
@@ -0,0 +1,1072 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_cache.h"
+#include "btree_iter.h"
+#include "btree_key_cache.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+#include "errcode.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+#include "trace.h"
+
+#include <linux/sched/mm.h>
+
+static inline bool btree_uses_pcpu_readers(enum btree_id id)
+{
+	return id == BTREE_ID_subvolumes;	/* only this btree's entries get SIX_LOCK_INIT_PCPU locks */
+}
+
+static struct kmem_cache *bch2_key_cache;
+
+/* rhashtable obj_cmpfn: 0 when @obj's key equals arg->key, nonzero otherwise. */
+static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, const void *obj)
+{
+	const struct bkey_cached_key *want = arg->key;
+	const struct bkey_cached *ck = obj;
+
+	return !(ck->key.btree_id == want->btree_id &&
+		 bpos_eq(ck->key.pos, want->pos));
+}
+
+static const struct rhashtable_params bch2_btree_key_cache_params = {
+	.head_offset	= offsetof(struct bkey_cached, hash),
+	.key_offset	= offsetof(struct bkey_cached, key),
+	.key_len	= sizeof(struct bkey_cached_key),
+	.obj_cmpfn	= bch2_btree_key_cache_cmp_fn,	/* compares btree_id and pos explicitly */
+};
+
+/* Look up the key cache entry for (btree_id, pos) in the hash table. */
+__flatten
+inline struct bkey_cached *
+bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
+{
+	struct bkey_cached_key search = {
+		.btree_id = btree_id, .pos = pos,
+	};
+
+	return rhashtable_lookup_fast(&c->btree_key_cache.table, &search,
+				      bch2_btree_key_cache_params);
+}
+
+/* Try to take intent and write locks on a clean @ck using trylocks only. */
+static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
+{
+	if (!six_trylock_intent(&ck->c.lock))
+		return false;
+
+	/*
+	 * Dirty entries are refused; back out of the intent lock on any failure:
+	 */
+	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags) ||
+	    !six_trylock_write(&ck->c.lock)) {
+		six_unlock_intent(&ck->c.lock);
+		return false;
+	}
+
+	return true;
+}
+
+static void bkey_cached_evict(struct btree_key_cache *c,
+			      struct bkey_cached *ck)
+{
+	BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,	/* the removal itself lives inside BUG_ON() */
+				      bch2_btree_key_cache_params));
+	memset(&ck->key, ~0, sizeof(ck->key));	/* poison the key so the post-lock key recheck in traverse fails */
+
+	atomic_long_dec(&c->nr_keys);
+}
+
+static void bkey_cached_free(struct btree_key_cache *bc,
+			     struct bkey_cached *ck)	/* drops the write and intent locks held on @ck */
+{
+	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+
+	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
+
+	ck->btree_trans_barrier_seq =
+		start_poll_synchronize_srcu(&c->btree_trans_barrier);	/* SRCU cookie recorded at free time */
+
+	if (ck->c.lock.readers)	/* entries with lock.readers set go on the pcpu freed list */
+		list_move_tail(&ck->list, &bc->freed_pcpu);
+	else
+		list_move_tail(&ck->list, &bc->freed_nonpcpu);
+	atomic_long_inc(&bc->nr_freed);
+
+	kfree(ck->k);	/* the key buffer goes now; the bkey_cached itself is only put on a freed list */
+	ck->k		= NULL;
+	ck->u64s	= 0;
+
+	six_unlock_write(&ck->c.lock);
+	six_unlock_intent(&ck->c.lock);
+}
+
+#ifdef __KERNEL__
+/* Move @ck onto freed_nonpcpu, keeping the list ordered by btree_trans_barrier_seq. */
+static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
+						   struct bkey_cached *ck)
+{
+	struct list_head *after = &bc->freed_nonpcpu;
+	struct bkey_cached *cur;
+
+	/* Scan from the tail for the last entry whose seq is not after ours: */
+	list_for_each_entry_reverse(cur, &bc->freed_nonpcpu, list)
+		if (ULONG_CMP_GE(ck->btree_trans_barrier_seq, cur->btree_trans_barrier_seq)) {
+			after = &cur->list;
+			break;
+		}
+	list_move(&ck->list, after);
+}
+#endif
+
+static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
+					 struct bkey_cached *ck)
+{
+	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
+
+	if (!ck->c.lock.readers) {
+#ifdef __KERNEL__
+		struct btree_key_cache_freelist *f;
+		bool freed = false;
+
+		preempt_disable();	/* stay on this CPU while touching its freelist */
+		f = this_cpu_ptr(bc->pcpu_freed);
+
+		if (f->nr < ARRAY_SIZE(f->objs)) {	/* fast path: room in this CPU's array */
+			f->objs[f->nr++] = ck;
+			freed = true;
+		}
+		preempt_enable();
+
+		if (!freed) {	/* per-CPU array full: spill to the shared list under bc->lock */
+			mutex_lock(&bc->lock);
+			preempt_disable();
+			f = this_cpu_ptr(bc->pcpu_freed);	/* re-read after preemption was re-enabled */
+
+			while (f->nr > ARRAY_SIZE(f->objs) / 2) {	/* drain down to half full */
+				struct bkey_cached *ck2 = f->objs[--f->nr];
+
+				__bkey_cached_move_to_freelist_ordered(bc, ck2);
+			}
+			preempt_enable();
+
+			__bkey_cached_move_to_freelist_ordered(bc, ck);
+			mutex_unlock(&bc->lock);
+		}
+#else
+		mutex_lock(&bc->lock);
+		list_move_tail(&ck->list, &bc->freed_nonpcpu);	/* non-kernel build: shared list only */
+		mutex_unlock(&bc->lock);
+#endif
+	} else {
+		mutex_lock(&bc->lock);
+		list_move_tail(&ck->list, &bc->freed_pcpu);	/* entries with lock.readers set use their own list */
+		mutex_unlock(&bc->lock);
+	}
+}
+
+static void bkey_cached_free_fast(struct btree_key_cache *bc,
+				  struct bkey_cached *ck)	/* drops the write and intent locks held on @ck */
+{
+	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+
+	ck->btree_trans_barrier_seq =
+		start_poll_synchronize_srcu(&c->btree_trans_barrier);	/* SRCU cookie recorded at free time */
+
+	list_del_init(&ck->list);
+	atomic_long_inc(&bc->nr_freed);
+
+	kfree(ck->k);
+	ck->k		= NULL;
+	ck->u64s	= 0;
+
+	bkey_cached_move_to_freelist(bc, ck);	/* per-CPU freelist when possible, else a shared list */
+
+	six_unlock_write(&ck->c.lock);
+	six_unlock_intent(&ck->c.lock);
+}
+
+static struct bkey_cached *
+bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
+		  bool *was_new)	/* set to true only when a brand new object was allocated */
+{
+	struct bch_fs *c = trans->c;
+	struct btree_key_cache *bc = &c->btree_key_cache;
+	struct bkey_cached *ck = NULL;
+	bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
+	int ret;
+
+	if (!pcpu_readers) {
+#ifdef __KERNEL__
+		struct btree_key_cache_freelist *f;
+
+		preempt_disable();
+		f = this_cpu_ptr(bc->pcpu_freed);
+		if (f->nr)	/* fast path: pop from this CPU's freelist */
+			ck = f->objs[--f->nr];
+		preempt_enable();
+
+		if (!ck) {	/* refill this CPU's freelist from the shared list, up to half full */
+			mutex_lock(&bc->lock);
+			preempt_disable();
+			f = this_cpu_ptr(bc->pcpu_freed);
+
+			while (!list_empty(&bc->freed_nonpcpu) &&
+			       f->nr < ARRAY_SIZE(f->objs) / 2) {
+				ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
+				list_del_init(&ck->list);
+				f->objs[f->nr++] = ck;
+			}
+
+			ck = f->nr ? f->objs[--f->nr] : NULL;	/* take one for ourselves, if any */
+			preempt_enable();
+			mutex_unlock(&bc->lock);
+		}
+#else
+		mutex_lock(&bc->lock);
+		if (!list_empty(&bc->freed_nonpcpu)) {
+			ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
+			list_del_init(&ck->list);
+		}
+		mutex_unlock(&bc->lock);
+#endif
+	} else {
+		mutex_lock(&bc->lock);
+		if (!list_empty(&bc->freed_pcpu)) {
+			ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
+			list_del_init(&ck->list);
+		}
+		mutex_unlock(&bc->lock);
+	}
+
+	if (ck) {	/* recycled object: take intent, then write, through the transaction */
+		ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
+		if (unlikely(ret)) {
+			bkey_cached_move_to_freelist(bc, ck);	/* could not lock: hand it back */
+			return ERR_PTR(ret);
+		}
+
+		path->l[0].b = (void *) ck;
+		path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
+		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
+
+		ret = bch2_btree_node_lock_write(trans, path, &ck->c);
+		if (unlikely(ret)) {
+			btree_node_unlock(trans, path, 0);
+			bkey_cached_move_to_freelist(bc, ck);
+			return ERR_PTR(ret);
+		}
+
+		return ck;	/* *was_new left untouched */
+	}
+
+	ck = allocate_dropping_locks(trans, ret,
+			kmem_cache_zalloc(bch2_key_cache, _gfp));	/* _gfp is supplied by the macro */
+	if (ret) {
+		kmem_cache_free(bch2_key_cache, ck);	/* NOTE(review): ck may be NULL here - confirm kmem_cache_free() accepts that */
+		return ERR_PTR(ret);
+	}
+
+	if (!ck)
+		return NULL;	/* allocation failed without error: caller falls back to reuse */
+
+	INIT_LIST_HEAD(&ck->list);
+	bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
+
+	ck->c.cached = true;
+	BUG_ON(!six_trylock_intent(&ck->c.lock));	/* fresh, unpublished object: trylocks cannot fail */
+	BUG_ON(!six_trylock_write(&ck->c.lock));
+	*was_new = true;
+	return ck;
+}
+
+static struct bkey_cached *
+bkey_cached_reuse(struct btree_key_cache *c)
+{
+	struct bucket_table *tbl;
+	struct rhash_head *pos;
+	struct bkey_cached *ck;
+	unsigned i;
+
+	mutex_lock(&c->lock);
+	rcu_read_lock();
+	tbl = rht_dereference_rcu(c->table.tbl, &c->table);
+	for (i = 0; i < tbl->size; i++)	/* walk every hash bucket */
+		rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+			if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+			    bkey_cached_lock_for_evict(ck)) {	/* first clean entry we can lock wins */
+				bkey_cached_evict(c, ck);
+				goto out;	/* returned with intent and write locks held */
+			}
+		}
+	ck = NULL;	/* nothing evictable */
+out:
+	rcu_read_unlock();
+	mutex_unlock(&c->lock);
+	return ck;
+}
+
+static struct bkey_cached *
+btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_key_cache *bc = &c->btree_key_cache;
+	struct bkey_cached *ck;
+	bool was_new = false;
+
+	ck = bkey_cached_alloc(trans, path, &was_new);
+	if (IS_ERR(ck))
+		return ck;
+
+	if (unlikely(!ck)) {
+		ck = bkey_cached_reuse(bc);
+		if (unlikely(!ck)) {
+			bch_err(c, "error allocating memory for key cache item, btree %s",
+				bch2_btree_ids[path->btree_id]);
+			return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
+		}
+
+		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
+	}
+
+	ck->c.level		= 0;
+	ck->c.btree_id		= path->btree_id;
+	ck->key.btree_id	= path->btree_id;
+	ck->key.pos		= path->pos;
+	ck->valid		= false;
+	ck->flags		= 1U << BKEY_CACHED_ACCESSED;
+
+	if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
+					  &ck->hash,
+					  bch2_btree_key_cache_params))) {
+		/* We raced with another fill: */
+
+		if (likely(was_new)) {
+			six_unlock_write(&ck->c.lock);
+			six_unlock_intent(&ck->c.lock);
+			kfree(ck);
+		} else {
+			bkey_cached_free_fast(bc, ck);
+		}
+
+		mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
+		return NULL;
+	}
+
+	atomic_long_inc(&bc->nr_keys);
+
+	six_unlock_write(&ck->c.lock);
+
+	return ck;
+}
+
+static int btree_key_cache_fill(struct btree_trans *trans,
+				struct btree_path *ck_path,
+				struct bkey_cached *ck)	/* copies the btree's key at ck->key.pos into @ck */
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	unsigned new_u64s = 0;
+	struct bkey_i *new_k = NULL;
+	int ret;
+
+	k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos,
+			       BTREE_ITER_KEY_CACHE_FILL|
+			       BTREE_ITER_CACHED_NOFILL);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	if (!bch2_btree_node_relock(trans, ck_path, 0)) {	/* lost our lock on the cache entry: restart */
+		trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
+		ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
+		goto err;
+	}
+
+	/*
+	 * bch2_varint_decode can read past the end of the buffer by at
+	 * most 7 bytes (it won't be used):
+	 */
+	new_u64s = k.k->u64s + 1;
+
+	/*
+	 * Allocate some extra space so that the transaction commit path is less
+	 * likely to have to reallocate, since that requires a transaction
+	 * restart:
+	 */
+	new_u64s = min(256U, (new_u64s * 3) / 2);
+
+	if (new_u64s > ck->u64s) {	/* existing buffer too small */
+		new_u64s = roundup_pow_of_two(new_u64s);
+		new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);	/* first try without blocking */
+		if (!new_k) {
+			bch2_trans_unlock(trans);	/* drop locks before the blocking allocation */
+
+			new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
+			if (!new_k) {
+				bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
+					bch2_btree_ids[ck->key.btree_id], new_u64s);
+				ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
+				goto err;
+			}
+
+			if (!bch2_btree_node_relock(trans, ck_path, 0)) {
+				kfree(new_k);
+				trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
+				ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
+				goto err;
+			}
+
+			ret = bch2_trans_relock(trans);
+			if (ret) {
+				kfree(new_k);
+				goto err;
+			}
+		}
+	}
+
+	ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
+	if (ret) {
+		kfree(new_k);
+		goto err;
+	}
+
+	if (new_k) {	/* swap in the larger buffer under the write lock */
+		kfree(ck->k);
+		ck->u64s = new_u64s;
+		ck->k = new_k;
+	}
+
+	bkey_reassemble(ck->k, k);
+	ck->valid = true;
+	bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);
+
+	/* We're not likely to need this iterator again: */
+	set_btree_iter_dontneed(&iter);
+err:
+	bch2_trans_iter_exit(trans, &iter);	/* reached on success and on failure */
+	return ret;
+}
+
+static noinline int
+bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path,
+					 unsigned flags)	/* handles entry creation and filling */
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_cached *ck;
+	int ret = 0;
+
+	BUG_ON(path->level);
+
+	path->l[1].b = NULL;
+
+	if (bch2_btree_node_relock_notrace(trans, path, 0)) {	/* still have the entry from a previous traverse */
+		ck = (void *) path->l[0].b;
+		goto fill;
+	}
+retry:
+	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
+	if (!ck) {
+		ck = btree_key_cache_create(trans, path);
+		ret = PTR_ERR_OR_ZERO(ck);
+		if (ret)
+			goto err;
+		if (!ck)	/* NULL: lost a race with another fill, look it up again */
+			goto retry;
+
+		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
+		path->locks_want = 1;
+	} else {
+		enum six_lock_type lock_want = __btree_lock_want(path, 0);
+
+		ret = btree_node_lock(trans, path, (void *) ck, 0,
+				      lock_want, _THIS_IP_);
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			goto err;
+
+		BUG_ON(ret);	/* any other error is unexpected here */
+
+		if (ck->key.btree_id != path->btree_id ||
+		    !bpos_eq(ck->key.pos, path->pos)) {	/* entry was evicted/reused before we got the lock */
+			six_unlock_type(&ck->c.lock, lock_want);
+			goto retry;
+		}
+
+		mark_btree_node_locked(trans, path, 0,
+				       (enum btree_node_locked_type) lock_want);
+	}
+
+	path->l[0].lock_seq	= six_lock_seq(&ck->c.lock);
+	path->l[0].b		= (void *) ck;
+fill:
+	path->uptodate = BTREE_ITER_UPTODATE;
+
+	if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
+		/*
+		 * Using the underscore version because we haven't set
+		 * path->uptodate yet (NOTE(review): it is set just above - stale?):
+		 */
+		if (!path->locks_want &&
+		    !__bch2_btree_path_upgrade(trans, path, 1)) {
+			trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
+			ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
+			goto err;
+		}
+
+		ret = btree_key_cache_fill(trans, path, ck);
+		if (ret)
+			goto err;
+
+		ret = bch2_btree_path_relock(trans, path, _THIS_IP_);
+		if (ret)
+			goto err;
+
+		path->uptodate = BTREE_ITER_UPTODATE;
+	}
+
+	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))	/* test first, so the bit is only written when clear */
+		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+
+	BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
+	BUG_ON(path->uptodate);
+
+	return ret;
+err:
+	path->uptodate = BTREE_ITER_NEED_TRAVERSE;
+	if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {	/* hard error: drop the lock, record it in the path */
+		btree_node_unlock(trans, path, 0);
+		path->l[0].b = ERR_PTR(ret);
+	}
+	return ret;
+}
+
+int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
+				    unsigned flags)	/* fast path; misses and invalid entries go to the slowpath */
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_cached *ck;
+	int ret = 0;
+
+	EBUG_ON(path->level);
+
+	path->l[1].b = NULL;
+
+	if (bch2_btree_node_relock_notrace(trans, path, 0)) {	/* still have the entry from a previous traverse */
+		ck = (void *) path->l[0].b;
+		goto fill;
+	}
+retry:
+	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
+	if (!ck) {
+		return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);	/* not cached yet */
+	} else {
+		enum six_lock_type lock_want = __btree_lock_want(path, 0);
+
+		ret = btree_node_lock(trans, path, (void *) ck, 0,
+				      lock_want, _THIS_IP_);
+		EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart));
+
+		if (ret)
+			return ret;
+
+		if (ck->key.btree_id != path->btree_id ||
+		    !bpos_eq(ck->key.pos, path->pos)) {	/* entry was evicted/reused before we got the lock */
+			six_unlock_type(&ck->c.lock, lock_want);
+			goto retry;
+		}
+
+		mark_btree_node_locked(trans, path, 0,
+				       (enum btree_node_locked_type) lock_want);
+	}
+
+	path->l[0].lock_seq	= six_lock_seq(&ck->c.lock);
+	path->l[0].b		= (void *) ck;
+fill:
+	if (!ck->valid)	/* taken regardless of BTREE_ITER_CACHED_NOFILL; the slowpath checks that flag */
+		return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
+
+	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))	/* test first, so the bit is only written when clear */
+		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+
+	path->uptodate = BTREE_ITER_UPTODATE;
+	EBUG_ON(!ck->valid);
+	EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
+
+	return ret;
+}
+
+/*
+ * Write the key cached at @key back to the btree and, if @evict is set, remove
+ * the entry from the key cache.
+ *
+ * @journal_seq:  when nonzero, the flush is skipped unless the entry's journal
+ *		  pin (ck->journal.seq) is still at this sequence number
+ * @commit_flags: extra flags OR'd into the flags passed to bch2_trans_commit()
+ * @evict:	  evict the entry afterwards; an entry that is not dirty is
+ *		  evicted without any btree update
+ *
+ * Returns 0 on success or when there was nothing to do, otherwise the error
+ * from traversing, updating or committing.
+ */
+static int btree_key_cache_flush_pos(struct btree_trans *trans,
+				     struct bkey_cached_key key,
+				     u64 journal_seq,
+				     unsigned commit_flags,
+				     bool evict)
+{
+	struct bch_fs *c = trans->c;
+	struct journal *j = &c->journal;
+	struct btree_iter c_iter, b_iter;
+	struct bkey_cached *ck = NULL;
+	int ret;
+
+	/* b_iter points at the btree, c_iter at the key cache entry: */
+	bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
+			     BTREE_ITER_SLOTS|
+			     BTREE_ITER_INTENT|
+			     BTREE_ITER_ALL_SNAPSHOTS);
+	bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
+			     BTREE_ITER_CACHED|
+			     BTREE_ITER_INTENT);
+	b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+
+	ret = bch2_btree_iter_traverse(&c_iter);
+	if (ret)
+		goto out;
+
+	ck = (void *) c_iter.path->l[0].b;
+	if (!ck)
+		goto out;
+
+	/* Clean entry: nothing to write back, but it may still need evicting */
+	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+		if (evict)
+			goto evict;
+		goto out;
+	}
+
+	BUG_ON(!ck->valid);
+
+	/* Caller asked for a specific journal seq and the pin has moved on */
+	if (journal_seq && ck->journal.seq != journal_seq)
+		goto out;
+
+	/*
+	 * Since journal reclaim depends on us making progress here, and the
+	 * allocator/copygc depend on journal reclaim making progress, we need
+	 * to be using alloc reserves:
+	 */
+	ret   = bch2_btree_iter_traverse(&b_iter) ?:
+		bch2_trans_update(trans, &b_iter, ck->k,
+				  BTREE_UPDATE_KEY_CACHE_RECLAIM|
+				  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+				  BTREE_TRIGGER_NORUN) ?:
+		bch2_trans_commit(trans, NULL, NULL,
+				  BTREE_INSERT_NOCHECK_RW|
+				  BTREE_INSERT_NOFAIL|
+				  (ck->journal.seq == journal_last_seq(j)
+				   ? BCH_WATERMARK_reclaim
+				   : 0)|
+				  commit_flags);
+
+	/*
+	 * Transaction restarts, journal_reclaim_would_deadlock and errors seen
+	 * while the journal itself is in an error state are not fatal:
+	 */
+	bch2_fs_fatal_err_on(ret &&
+			     !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
+			     !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
+			     !bch2_journal_error(j), c,
+			     "error flushing key cache: %s", bch2_err_str(ret));
+	if (ret)
+		goto out;
+
+	/* The key is in the btree now: release the pin and the preres */
+	bch2_journal_pin_drop(j, &ck->journal);
+	bch2_journal_preres_put(j, &ck->res);
+
+	BUG_ON(!btree_node_locked(c_iter.path, 0));
+
+	if (!evict) {
+		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+			atomic_long_dec(&c->btree_key_cache.nr_dirty);
+		}
+	} else {
+		struct btree_path *path2;
+evict:
+		/* Unlock every other path before taking the write lock on ck */
+		trans_for_each_path(trans, path2)
+			if (path2 != c_iter.path)
+				__bch2_btree_path_unlock(trans, path2);
+
+		bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c);
+
+		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+			atomic_long_dec(&c->btree_key_cache.nr_dirty);
+		}
+
+		/* Mark level 0 unlocked in the path before ck is evicted and freed */
+		mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED);
+		bkey_cached_evict(&c->btree_key_cache, ck);
+		bkey_cached_free_fast(&c->btree_key_cache, ck);
+	}
+out:
+	bch2_trans_iter_exit(trans, &b_iter);
+	bch2_trans_iter_exit(trans, &c_iter);
+	return ret;
+}
+
+/*
+ * Journal pin flush callback for key cache entries; it is the callback
+ * registered with bch2_journal_pin_add()/bch2_journal_pin_update() in this
+ * file.
+ *
+ * @pin is embedded in a struct bkey_cached. If that entry is still dirty and
+ * pinned at @seq, it is either re-pinned at ck->seq (when ck->seq differs from
+ * @seq) or flushed to the btree with btree_key_cache_flush_pos().
+ *
+ * Returns 0 when nothing needed flushing, otherwise the result of the flush.
+ */
+int bch2_btree_key_cache_journal_flush(struct journal *j,
+				struct journal_entry_pin *pin, u64 seq)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct bkey_cached *ck =
+		container_of(pin, struct bkey_cached, journal);
+	struct bkey_cached_key key;
+	struct btree_trans *trans = bch2_trans_get(c);
+	int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+	int ret = 0;
+
+	/* Take a read lock so key/seq/flags can be sampled consistently */
+	btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
+	key = ck->key;
+
+	/* Pin has moved or the entry is no longer dirty: nothing to do */
+	if (ck->journal.seq != seq ||
+	    !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+		six_unlock_read(&ck->c.lock);
+		goto unlock;
+	}
+
+	/* Deferred pin update: move the pin to ck->seq instead of flushing */
+	if (ck->seq != seq) {
+		bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
+					bch2_btree_key_cache_journal_flush);
+		six_unlock_read(&ck->c.lock);
+		goto unlock;
+	}
+	six_unlock_read(&ck->c.lock);
+
+	/* Flush without evicting, using the copy of the key taken above */
+	ret = commit_do(trans, NULL, NULL, 0,
+		btree_key_cache_flush_pos(trans, key, seq,
+				BTREE_INSERT_JOURNAL_RECLAIM, false));
+unlock:
+	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+
+	bch2_trans_put(trans);
+	return ret;
+}
+
+/*
+ * Flush the key cached at (@id, @pos) to the btree, then evict it from the key
+ * cache. Returns 0 if no such entry is cached.
+ */
+int bch2_btree_key_cache_flush(struct btree_trans *trans,
+			       enum btree_id id, struct bpos pos)
+{
+	struct bkey_cached_key key = { .btree_id = id, .pos = pos };
+
+	/* Only take the slow path when the lookup actually finds an entry */
+	if (bch2_btree_key_cache_find(trans->c, id, pos))
+		return btree_key_cache_flush_pos(trans, key, 0, 0, true);
+
+	return 0;
+}
+
+/*
+ * Apply an update to the key cache entry that @insert_entry's path points at:
+ * copy the new key into the entry, mark it valid and dirty, and pin the
+ * journal at the transaction's journal sequence number.
+ *
+ * @flags: commit flags; BTREE_INSERT_JOURNAL_REPLAY skips the journal
+ *	   pre-reservation accounting
+ *
+ * Always returns true.
+ */
+bool bch2_btree_insert_key_cached(struct btree_trans *trans,
+				  unsigned flags,
+				  struct btree_insert_entry *insert_entry)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_cached *ck = (void *) insert_entry->path->l[0].b;
+	struct bkey_i *insert = insert_entry->k;
+	bool kick_reclaim = false;
+
+	/* The new key must fit in the entry's buffer (ck->u64s) */
+	BUG_ON(insert->k.u64s > ck->u64s);
+
+	if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+		int difference;
+
+		BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s);
+
+		/*
+		 * If the entry's own reservation is smaller than this key
+		 * needs, move the shortfall over from the transaction's:
+		 */
+		difference = jset_u64s(insert->k.u64s) - ck->res.u64s;
+		if (difference > 0) {
+			trans->journal_preres.u64s	-= difference;
+			ck->res.u64s			+= difference;
+		}
+	}
+
+	bkey_copy(ck->k, insert);
+	ck->valid = true;
+
+	/* First time this entry goes dirty: account for it */
+	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+		EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
+		set_bit(BKEY_CACHED_DIRTY, &ck->flags);
+		atomic_long_inc(&c->btree_key_cache.nr_dirty);
+
+		if (bch2_nr_btree_keys_need_flush(c))
+			kick_reclaim = true;
+	}
+
+	/*
+	 * To minimize lock contention, we only add the journal pin here and
+	 * defer pin updates to the flush callback via ->seq. Be careful not to
+	 * update ->seq on nojournal commits because we don't want to update the
+	 * pin to a seq that doesn't include journal updates on disk. Otherwise
+	 * we risk losing the update after a crash.
+	 *
+	 * The only exception is if the pin is not active in the first place. We
+	 * have to add the pin because journal reclaim drives key cache
+	 * flushing. The flush callback will not proceed unless ->seq matches
+	 * the latest pin, so make sure it starts with a consistent value.
+	 */
+	if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) ||
+	    !journal_pin_active(&ck->journal)) {
+		ck->seq = trans->journal_res.seq;
+	}
+	bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
+			     &ck->journal, bch2_btree_key_cache_journal_flush);
+
+	if (kick_reclaim)
+		journal_reclaim_kick(&c->journal);
+	return true;
+}
+
+/*
+ * Invalidate the key cache entry that @path points at, discarding any dirty
+ * state it carries.
+ */
+void bch2_btree_key_cache_drop(struct btree_trans *trans,
+			       struct btree_path *path)
+{
+	struct bkey_cached *cached = (void *) path->l[0].b;
+	struct bch_fs *fs = trans->c;
+
+	BUG_ON(!cached->valid);
+
+	/*
+	 * The btree was just updated behind the key cache's back, so this
+	 * entry is stale: it has to go even if it is dirty, which means
+	 * undoing the dirty accounting and releasing its journal pin.
+	 */
+	if (test_bit(BKEY_CACHED_DIRTY, &cached->flags)) {
+		clear_bit(BKEY_CACHED_DIRTY, &cached->flags);
+		atomic_long_dec(&fs->btree_key_cache.nr_dirty);
+		bch2_journal_pin_drop(&fs->journal, &cached->journal);
+	}
+
+	cached->valid = false;
+}
+
+/*
+ * Shrinker ->scan_objects callback for the btree key cache.
+ *
+ * Works in three stages, stopping once sc->nr_to_scan objects were scanned:
+ *  1. free entries on bc->freed_nonpcpu whose SRCU grace period has elapsed,
+ *  2. the same for bc->freed_pcpu,
+ *  3. walk the hash table starting at bc->shrink_iter: dirty entries are
+ *     skipped, entries with BKEY_CACHED_ACCESSED set get the bit cleared, and
+ *     the rest are evicted if they can be locked for eviction.
+ *
+ * Returns the number of entries released in stages 1 and 2; entries evicted
+ * in stage 3 are handed to bkey_cached_free() and are not counted here.
+ */
+static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
+					   struct shrink_control *sc)
+{
+	struct bch_fs *c = container_of(shrink, struct bch_fs,
+					btree_key_cache.shrink);
+	struct btree_key_cache *bc = &c->btree_key_cache;
+	struct bucket_table *tbl;
+	struct bkey_cached *ck, *t;
+	size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
+	unsigned start, flags;
+	int srcu_idx;
+
+	mutex_lock(&bc->lock);
+	srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+	flags = memalloc_nofs_save();
+
+	/*
+	 * Newest freed entries are at the end of the list - once we hit one
+	 * that's too new to be freed, we can bail out:
+	 */
+	list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
+		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
+						 ck->btree_trans_barrier_seq))
+			break;
+
+		list_del(&ck->list);
+		six_lock_exit(&ck->c.lock);
+		kmem_cache_free(bch2_key_cache, ck);
+		atomic_long_dec(&bc->nr_freed);
+		scanned++;
+		freed++;
+	}
+
+	if (scanned >= nr)
+		goto out;
+
+	/* Same again for the other freed list: */
+	list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
+		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
+						 ck->btree_trans_barrier_seq))
+			break;
+
+		list_del(&ck->list);
+		six_lock_exit(&ck->c.lock);
+		kmem_cache_free(bch2_key_cache, ck);
+		atomic_long_dec(&bc->nr_freed);
+		scanned++;
+		freed++;
+	}
+
+	if (scanned >= nr)
+		goto out;
+
+	rcu_read_lock();
+	tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
+	/* The saved position may be past the end of the current table */
+	if (bc->shrink_iter >= tbl->size)
+		bc->shrink_iter = 0;
+	start = bc->shrink_iter;
+
+	/* Walk buckets until enough was scanned or we are back at @start */
+	do {
+		struct rhash_head *pos, *next;
+
+		pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));
+
+		while (!rht_is_a_nulls(pos)) {
+			/* Fetch the successor first: ck may be evicted below */
+			next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
+			ck = container_of(pos, struct bkey_cached, hash);
+
+			/* Dirty entries are skipped and do not count as scanned */
+			if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
+				goto next;
+
+			if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+				clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+			else if (bkey_cached_lock_for_evict(ck)) {
+				bkey_cached_evict(bc, ck);
+				bkey_cached_free(bc, ck);
+			}
+
+			scanned++;
+			if (scanned >= nr)
+				break;
+next:
+			pos = next;
+		}
+
+		bc->shrink_iter++;
+		if (bc->shrink_iter >= tbl->size)
+			bc->shrink_iter = 0;
+	} while (scanned < nr && bc->shrink_iter != start);
+
+	rcu_read_unlock();
+out:
+	memalloc_nofs_restore(flags);
+	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+	mutex_unlock(&bc->lock);
+
+	return freed;
+}
+
+/*
+ * Shrinker ->count_objects callback: number of key cache entries that are not
+ * dirty, clamped at zero.
+ */
+static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
+					    struct shrink_control *sc)
+{
+	struct bch_fs *c = container_of(shrink, struct bch_fs,
+					btree_key_cache.shrink);
+	struct btree_key_cache *bc = &c->btree_key_cache;
+	long nr_keys	= atomic_long_read(&bc->nr_keys);
+	long nr_dirty	= atomic_long_read(&bc->nr_dirty);
+	long nr_clean	= nr_keys - nr_dirty;
+
+	/* The two counters are read separately, so the difference can dip below 0 */
+	return nr_clean > 0 ? nr_clean : 0;
+}
+
+/*
+ * Tear down a filesystem's btree key cache: unregister the shrinker, evict
+ * every entry from the hash table, gather entries sitting on the per-cpu and
+ * global freed lists, and free them all.
+ *
+ * Safe to call after a partially failed bch2_fs_btree_key_cache_init(): the
+ * hash table is only walked/destroyed if it was set up, and the per-cpu
+ * freelists are only walked if they were allocated.
+ *
+ * Panics if nr_keys is still nonzero afterwards, or if nr_dirty is nonzero on
+ * a filesystem that was read-write and has no journal error.
+ */
+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
+{
+	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+	struct bucket_table *tbl;
+	struct bkey_cached *ck, *n;
+	struct rhash_head *pos;
+	LIST_HEAD(items);
+	unsigned i;
+#ifdef __KERNEL__
+	int cpu;
+#endif
+
+	unregister_shrinker(&bc->shrink);
+
+	mutex_lock(&bc->lock);
+
+	/*
+	 * The loop is needed to guard against racing with rehash:
+	 */
+	while (atomic_long_read(&bc->nr_keys)) {
+		rcu_read_lock();
+		tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
+		if (tbl)
+			for (i = 0; i < tbl->size; i++)
+				rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+					bkey_cached_evict(bc, ck);
+					list_add(&ck->list, &items);
+				}
+		rcu_read_unlock();
+	}
+
+#ifdef __KERNEL__
+	/*
+	 * pcpu_freed is NULL if alloc_percpu() failed in
+	 * bch2_fs_btree_key_cache_init() or init was never reached;
+	 * per_cpu_ptr() on a NULL base must not be dereferenced.
+	 */
+	if (bc->pcpu_freed) {
+		for_each_possible_cpu(cpu) {
+			struct btree_key_cache_freelist *f =
+				per_cpu_ptr(bc->pcpu_freed, cpu);
+
+			for (i = 0; i < f->nr; i++) {
+				ck = f->objs[i];
+				list_add(&ck->list, &items);
+			}
+		}
+	}
+#endif
+
+	list_splice(&bc->freed_pcpu,	&items);
+	list_splice(&bc->freed_nonpcpu,	&items);
+
+	mutex_unlock(&bc->lock);
+
+	/* Everything is on the private list now; free it outside bc->lock */
+	list_for_each_entry_safe(ck, n, &items, list) {
+		cond_resched();
+
+		bch2_journal_pin_drop(&c->journal, &ck->journal);
+		bch2_journal_preres_put(&c->journal, &ck->res);
+
+		list_del(&ck->list);
+		kfree(ck->k);
+		six_lock_exit(&ck->c.lock);
+		kmem_cache_free(bch2_key_cache, ck);
+	}
+
+	if (atomic_long_read(&bc->nr_dirty) &&
+	    !bch2_journal_error(&c->journal) &&
+	    test_bit(BCH_FS_WAS_RW, &c->flags))
+		panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
+		      atomic_long_read(&bc->nr_dirty));
+
+	if (atomic_long_read(&bc->nr_keys))
+		panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
+		      atomic_long_read(&bc->nr_keys));
+
+	if (bc->table_init_done)
+		rhashtable_destroy(&bc->table);
+
+	/* free_percpu(NULL) is a no-op */
+	free_percpu(bc->pcpu_freed);
+}
+
+/*
+ * Early, allocation-free initialization of the key cache: the two freed lists
+ * and the mutex protecting them.
+ */
+void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
+{
+	INIT_LIST_HEAD(&c->freed_nonpcpu);
+	INIT_LIST_HEAD(&c->freed_pcpu);
+	mutex_init(&c->lock);
+}
+
+/*
+ * Allocate the per-cpu freelists (kernel builds only), set up the hash table
+ * and register the shrinker for @bc.
+ *
+ * Returns 0 on success; every failure is reported as
+ * -BCH_ERR_ENOMEM_fs_btree_cache_init.
+ */
+int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
+{
+	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+	int ret;
+
+#ifdef __KERNEL__
+	bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
+	if (!bc->pcpu_freed)
+		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+#endif
+
+	ret = rhashtable_init(&bc->table, &bch2_btree_key_cache_params);
+	if (ret)
+		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+
+	/* Lets the exit path know the table needs destroying */
+	bc->table_init_done = true;
+
+	bc->shrink.count_objects	= bch2_btree_key_cache_count;
+	bc->shrink.scan_objects		= bch2_btree_key_cache_scan;
+	bc->shrink.seeks		= 0;
+
+	ret = register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name);
+
+	return ret ? -BCH_ERR_ENOMEM_fs_btree_cache_init : 0;
+}
+
+/*
+ * Print the key cache counters (nr_freed, nr_keys, nr_dirty) to @out, one per
+ * line.
+ *
+ * atomic_long_read() returns a signed long, so the counters are printed with
+ * %li, matching the panic() messages in bch2_fs_btree_key_cache_exit(); a
+ * counter that underflowed shows up as a negative number rather than as a
+ * huge unsigned one.
+ */
+void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
+{
+	prt_printf(out, "nr_freed:\t%li",	atomic_long_read(&c->nr_freed));
+	prt_newline(out);
+	prt_printf(out, "nr_keys:\t%li",	atomic_long_read(&c->nr_keys));
+	prt_newline(out);
+	prt_printf(out, "nr_dirty:\t%li",	atomic_long_read(&c->nr_dirty));
+	prt_newline(out);
+}
+
+/* Destroy the bch2_key_cache slab cache created by bch2_btree_key_cache_init(). */
+void bch2_btree_key_cache_exit(void)
+{
+	kmem_cache_destroy(bch2_key_cache);
+}
+
+/*
+ * Create the slab cache that struct bkey_cached objects are allocated from.
+ * Returns 0 on success, -ENOMEM if the cache could not be created.
+ */
+int __init bch2_btree_key_cache_init(void)
+{
+	bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT);
+
+	return bch2_key_cache ? 0 : -ENOMEM;
+}
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
new file mode 100644
index 000000000000..be3acde2caa0
--- /dev/null
+++ b/fs/bcachefs/btree_key_cache.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_KEY_CACHE_H
+#define _BCACHEFS_BTREE_KEY_CACHE_H
+
+/*
+ * Number of dirty key cache entries in excess of the soft limit
+ * (1024 + half of all cached keys); 0 when under the limit.
+ */
+static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
+{
+	size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
+	size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
+	size_t max_dirty = 1024 + nr_keys  / 2;
+	/* Signed so that "under the limit" shows up as a negative value */
+	ssize_t excess = (ssize_t) (nr_dirty - max_dirty);
+
+	return excess > 0 ? excess : 0;
+}
+
+/*
+ * True when the number of dirty key cache entries exceeds the hard limit
+ * (4096 + three quarters of all cached keys).
+ */
+static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
+{
+	size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
+	size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
+
+	return nr_dirty > 4096 + (nr_keys * 3) / 4;
+}
+
+int bch2_btree_key_cache_journal_flush(struct journal *,
+				struct journal_entry_pin *, u64);
+
+struct bkey_cached *
+bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
+
+int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *,
+				    unsigned);
+
+bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned,
+			struct btree_insert_entry *);
+int bch2_btree_key_cache_flush(struct btree_trans *,
+			       enum btree_id, struct bpos);
+void bch2_btree_key_cache_drop(struct btree_trans *,
+			       struct btree_path *);
+
+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *);
+void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *);
+int bch2_fs_btree_key_cache_init(struct btree_key_cache *);
+
+void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *);
+
+void bch2_btree_key_cache_exit(void);
+int __init bch2_btree_key_cache_init(void);
+
+#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
new file mode 100644
index 000000000000..40c8ed8f7bf1
--- /dev/null
+++ b/fs/bcachefs/btree_locking.c
@@ -0,0 +1,791 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_locking.h"
+#include "btree_types.h"
+
+static struct lock_class_key bch2_btree_node_lock_key;
+
+/*
+ * Initialize the six lock embedded in @b. All locks initialized here share
+ * the bch2_btree_node_lock_key lockdep class, and lockdep validation is then
+ * switched off for the lock via lockdep_set_novalidate_class().
+ */
+void bch2_btree_lock_init(struct btree_bkey_cached_common *b,
+			  enum six_lock_init_flags flags)
+{
+	__six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags);
+	lockdep_set_novalidate_class(&b->lock);
+}
+
+#ifdef CONFIG_LOCKDEP
+/*
+ * Intended to assert that no lock of the btree node lock class is held.
+ * Currently a no-op: the check is compiled out with #if 0.
+ */
+void bch2_assert_btree_nodes_not_locked(void)
+{
+#if 0
+	//Re-enable when lock_class_is_held() is merged:
+	BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
+#endif
+}
+#endif
+
+/* Btree node locking: */
+
+/*
+ * Count, per lock type, how many of @trans's paths other than @skip hold a
+ * lock on @b at @level. Returns all zeroes when @b is NULL or an error
+ * pointer.
+ */
+struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
+						  struct btree_path *skip,
+						  struct btree_bkey_cached_common *b,
+						  unsigned level)
+{
+	struct six_lock_count counts;
+	struct btree_path *p;
+	int held;
+
+	memset(&counts, 0, sizeof(counts));
+
+	if (IS_ERR_OR_NULL(b))
+		return counts;
+
+	trans_for_each_path(trans, p) {
+		if (p == skip || &p->l[level].b->c != b)
+			continue;
+
+		held = btree_node_locked_type(p, level);
+		if (held == BTREE_NODE_UNLOCKED)
+			continue;
+
+		counts.n[held]++;
+	}
+
+	return counts;
+}
+
+/* unlock */
+
+/* Out-of-line wrapper around bch2_btree_node_unlock_write_inlined(). */
+void bch2_btree_node_unlock_write(struct btree_trans *trans,
+			struct btree_path *path, struct btree *b)
+{
+	bch2_btree_node_unlock_write_inlined(trans, path, b);
+}
+
+/* lock */
+
+/*
+ * One entry in the lock graph: @trans wants to lock @node_want with type
+ * @lock_want
+ */
+struct trans_waiting_for_lock {
+	struct btree_trans		*trans;
+	/* snapshot of trans->locking when the entry was pushed */
+	struct btree_bkey_cached_common	*node_want;
+	/* snapshot of trans->locking_wait.lock_want when the entry was pushed */
+	enum six_lock_type		lock_want;
+
+	/* for iterating over held locks :*/
+	u8				path_idx;
+	u8				level;
+	/*
+	 * start_time of the last waiter examined on the lock currently being
+	 * walked; waiters that are not newer than this are skipped
+	 */
+	u64				lock_start_time;
+};
+
+/*
+ * Stack of transactions used by the deadlock detector; g[0] is the
+ * transaction the search started from, g[nr - 1] the one being examined.
+ */
+struct lock_graph {
+	struct trans_waiting_for_lock	g[8];	/* fixed depth limit of the search */
+	unsigned			nr;	/* number of entries of g[] in use */
+};
+
+/* Describe every transaction in the lock cycle held in @g to @out. */
+static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
+{
+	unsigned idx;
+
+	prt_printf(out, "Found lock cycle (%u entries):", g->nr);
+	prt_newline(out);
+
+	for (idx = 0; idx < g->nr; idx++)
+		bch2_btree_trans_to_text(out, g->g[idx].trans);
+}
+
+/*
+ * Print the pids of the waiting tasks in @g on one line, separated by "<- ".
+ */
+static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
+{
+	unsigned idx;
+
+	for (idx = 0; idx < g->nr; idx++) {
+		struct btree_trans *waiter = g->g[idx].trans;
+
+		if (idx)
+			prt_str(out, "<- ");
+		prt_printf(out, "%u ", waiter->locking_wait.task->pid);
+	}
+	prt_newline(out);
+}
+
+/* Pop the top entry off @g and drop the ref held on its transaction. */
+static void lock_graph_up(struct lock_graph *g)
+{
+	struct trans_waiting_for_lock *top = &g->g[--g->nr];
+
+	closure_put(&top->trans->ref);
+}
+
+/* Empty @g, dropping the ref held on each transaction from the top down. */
+static noinline void lock_graph_pop_all(struct lock_graph *g)
+{
+	while (g->nr)
+		closure_put(&g->g[--g->nr].trans->ref);
+}
+
+/*
+ * Push @trans onto @g, recording the lock it is currently waiting for. The
+ * caller must already hold a ref on @trans; iteration state starts zeroed.
+ */
+static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
+{
+	struct trans_waiting_for_lock *slot = &g->g[g->nr++];
+
+	*slot = (struct trans_waiting_for_lock) {
+		.trans		= trans,
+		.node_want	= trans->locking,
+		.lock_want	= trans->locking_wait.lock_want,
+	};
+}
+
+/* Take a ref on @trans, then push it onto @g. */
+static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
+{
+	closure_get(&trans->ref);
+	__lock_graph_down(g, trans);
+}
+
+/*
+ * Find the first entry above the root whose transaction is no longer waiting
+ * on the lock recorded for it (it now waits on a different node, or its wait
+ * start time no longer matches what the entry below it recorded), and pop
+ * that entry together with everything above it.
+ *
+ * Returns true if anything was popped.
+ */
+static bool lock_graph_remove_non_waiters(struct lock_graph *g)
+{
+	unsigned idx;
+
+	for (idx = 1; idx < g->nr; idx++) {
+		struct trans_waiting_for_lock *cur = &g->g[idx];
+		struct trans_waiting_for_lock *below = cur - 1;
+
+		if (cur->trans->locking == cur->node_want &&
+		    cur->trans->locking_wait.start_time == below->lock_start_time)
+			continue;
+
+		while (g->nr > idx)
+			lock_graph_up(g);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Abort the lock attempt of the transaction in entry @i. For the root of the
+ * graph (the transaction running the check) this restarts it and returns the
+ * restart error; any other transaction is flagged with lock_must_abort and
+ * its waiting task is woken, and 0 is returned.
+ */
+static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
+{
+	struct btree_trans *victim = i->trans;
+
+	if (i != g->g) {
+		victim->lock_must_abort = true;
+		wake_up_process(victim->locking_wait.task);
+		return 0;
+	}
+
+	trace_and_count(victim->c, trans_restart_would_deadlock, victim, _RET_IP_);
+	return btree_trans_restart(victim, BCH_ERR_transaction_restart_would_deadlock);
+}
+
+/*
+ * Rank @trans as a candidate for being aborted to break a cycle; higher is
+ * preferred, 0 means it must not be aborted.
+ */
+static int btree_trans_abort_preference(struct btree_trans *trans)
+{
+	if (trans->lock_may_not_fail)
+		return 0;
+
+	if (trans->locking_wait.lock_want == SIX_LOCK_write)
+		return 1;
+
+	return trans->in_traverse_all ? 3 : 2;
+}
+
+/*
+ * Called when the transactions in @g form a cycle.
+ *
+ * If some entries turn out not to be waiting any more they are popped and 0
+ * is returned so the search can continue. Otherwise, with @cycle set (debugfs
+ * check) the cycle is printed and -1 returned; without it, the transaction
+ * with the highest abort preference is aborted. A cycle consisting only of
+ * transactions that may not fail is a BUG().
+ *
+ * On a nonzero return the graph has been emptied.
+ */
+static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
+{
+	struct trans_waiting_for_lock *i, *abort = NULL;
+	unsigned best = 0, pref;
+	int ret;
+
+	if (lock_graph_remove_non_waiters(g))
+		return 0;
+
+	/* Only checking, for debugfs: */
+	if (cycle) {
+		print_cycle(cycle, g);
+		ret = -1;
+		goto out;
+	}
+
+	/* Pick the entry with the highest preference; ties go to the earliest */
+	for (i = g->g; i < g->g + g->nr; i++) {
+		pref = btree_trans_abort_preference(i->trans);
+		if (pref > best) {
+			abort = i;
+			best = pref;
+		}
+	}
+
+	/* best == 0: every transaction in the cycle has lock_may_not_fail set */
+	if (unlikely(!best)) {
+		struct printbuf buf = PRINTBUF;
+
+		prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
+
+		for (i = g->g; i < g->g + g->nr; i++) {
+			struct btree_trans *trans = i->trans;
+
+			bch2_btree_trans_to_text(&buf, trans);
+
+			prt_printf(&buf, "backtrace:");
+			prt_newline(&buf);
+			printbuf_indent_add(&buf, 2);
+			bch2_prt_task_backtrace(&buf, trans->locking_wait.task);
+			printbuf_indent_sub(&buf, 2);
+			prt_newline(&buf);
+		}
+
+		bch2_print_string_as_lines(KERN_ERR, buf.buf);
+		printbuf_exit(&buf);
+		BUG();
+	}
+
+	ret = abort_lock(g, abort);
+out:
+	if (ret)
+		while (g->nr)
+			lock_graph_up(g);
+	return ret;
+}
+
+/*
+ * Descend into @trans, a transaction found waiting on a lock held by the
+ * transaction at the top of @g.
+ *
+ * The caller has already taken a ref on @trans. That ref is either handed to
+ * the graph (when @trans is pushed) or dropped here.
+ *
+ *  - @trans already in the graph: a cycle was found, see break_cycle().
+ *  - graph full: give up. Unless the original transaction may not fail or we
+ *    are only checking (@cycle), the original transaction is restarted.
+ *  - otherwise @trans is pushed and the search continues.
+ *
+ * Returns 0 if the caller should keep searching, nonzero otherwise.
+ */
+static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
+			      struct printbuf *cycle)
+{
+	struct btree_trans *orig_trans = g->g->trans;
+	struct trans_waiting_for_lock *i;
+
+	for (i = g->g; i < g->g + g->nr; i++)
+		if (i->trans == trans) {
+			/* The graph entry still holds its own ref on @trans */
+			closure_put(&trans->ref);
+			return break_cycle(g, cycle);
+		}
+
+	if (g->nr == ARRAY_SIZE(g->g)) {
+		/* @trans must not be touched after this: it was our only ref */
+		closure_put(&trans->ref);
+
+		if (orig_trans->lock_may_not_fail)
+			return 0;
+
+		while (g->nr)
+			lock_graph_up(g);
+
+		if (cycle)
+			return 0;
+
+		/*
+		 * It is orig_trans that gets restarted, so that is the
+		 * transaction to report - and, unlike @trans, it is still
+		 * safe to dereference here.
+		 */
+		trace_and_count(orig_trans->c, trans_restart_would_deadlock_recursion_limit, orig_trans, _RET_IP_);
+		return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
+	}
+
+	__lock_graph_down(g, trans);
+	return 0;
+}
+
+/*
+ * Whether holding a lock of type @t1 blocks a request for type @t2: true when
+ * the two enum values sum to more than 1.
+ * NOTE(review): presumably relies on SIX_LOCK_read == 0, intent == 1 and
+ * write == 2, so that only read/read and read/intent are compatible - confirm
+ * against six.h.
+ */
+static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2)
+{
+	return t1 + t2 > 1;
+}
+
+/*
+ * Deadlock detection: starting from @trans, walk depth-first through the
+ * transactions that are waiting on locks held by the transaction at the top
+ * of the graph, looking for a cycle that leads back to a transaction already
+ * in the graph.
+ *
+ * @cycle: when non-NULL we are only checking (debugfs): cycles and chains are
+ *	   printed to it and nothing is aborted.
+ *
+ * Returns 0 if no cycle involving @trans needs breaking by the caller, -1
+ * when checking and a cycle was found (or @trans was already told to abort),
+ * or the transaction restart error when @trans itself has to be restarted.
+ */
+int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
+{
+	struct lock_graph g;
+	struct trans_waiting_for_lock *top;
+	struct btree_bkey_cached_common *b;
+	struct btree_path *path;
+	unsigned path_idx;
+	int ret;
+
+	/* Another thread's cycle detection already picked us as the victim */
+	if (trans->lock_must_abort) {
+		if (cycle)
+			return -1;
+
+		trace_and_count(trans->c, trans_restart_would_deadlock, trans, _RET_IP_);
+		return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
+	}
+
+	g.nr = 0;
+	lock_graph_down(&g, trans);
+next:
+	/* Graph empty: every transaction was examined without finding a cycle */
+	if (!g.nr)
+		return 0;
+
+	top = &g.g[g.nr - 1];
+
+	/* Resume iterating top's held locks from where we left off */
+	trans_for_each_path_safe_from(top->trans, path, path_idx, top->path_idx) {
+		if (!path->nodes_locked)
+			continue;
+
+		/* Moved on to a new path: restart the per-path iteration state */
+		if (path_idx != top->path_idx) {
+			top->path_idx		= path_idx;
+			top->level		= 0;
+			top->lock_start_time	= 0;
+		}
+
+		for (;
+		     top->level < BTREE_MAX_DEPTH;
+		     top->level++, top->lock_start_time = 0) {
+			int lock_held = btree_node_locked_type(path, top->level);
+
+			if (lock_held == BTREE_NODE_UNLOCKED)
+				continue;
+
+			b = &READ_ONCE(path->l[top->level].b)->c;
+
+			if (IS_ERR_OR_NULL(b)) {
+				/*
+				 * If we get here, it means we raced with the
+				 * other thread updating its btree_path
+				 * structures - which means it can't be blocked
+				 * waiting on a lock:
+				 */
+				if (!lock_graph_remove_non_waiters(&g)) {
+					/*
+					 * If lock_graph_remove_non_waiters()
+					 * didn't do anything, it must be
+					 * because we're being called by debugfs
+					 * checking for lock cycles, which
+					 * invokes us on btree_transactions that
+					 * aren't actually waiting on anything.
+					 * Just bail out:
+					 */
+					lock_graph_pop_all(&g);
+				}
+
+				goto next;
+			}
+
+			/* Cheap unlocked check: nobody is waiting on this lock */
+			if (list_empty_careful(&b->lock.wait_list))
+				continue;
+
+			raw_spin_lock(&b->lock.wait_lock);
+			/* From here on @trans is reused as the waiter iterator */
+			list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) {
+				BUG_ON(b != trans->locking);
+
+				/* Skip waiters already handled on an earlier pass */
+				if (top->lock_start_time &&
+				    time_after_eq64(top->lock_start_time, trans->locking_wait.start_time))
+					continue;
+
+				top->lock_start_time = trans->locking_wait.start_time;
+
+				/* Don't check for self deadlock: */
+				if (trans == top->trans ||
+				    !lock_type_conflicts(lock_held, trans->locking_wait.lock_want))
+					continue;
+
+				/* Pin the waiter before dropping wait_lock */
+				closure_get(&trans->ref);
+				raw_spin_unlock(&b->lock.wait_lock);
+
+				ret = lock_graph_descend(&g, trans, cycle);
+				if (ret)
+					return ret;
+				goto next;
+
+			}
+			raw_spin_unlock(&b->lock.wait_lock);
+		}
+	}
+
+	/* Done with top: report the chain when checking, then backtrack */
+	if (g.nr > 1 && cycle)
+		print_chain(cycle, &g);
+	lock_graph_up(&g);
+	goto next;
+}
+
+/*
+ * Adapter with a (six_lock *, void *) signature: @p is the btree_trans, @lock
+ * is unused. Runs the deadlock check in aborting (non-debugfs) mode.
+ */
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
+{
+	struct btree_trans *trans = p;
+
+	return bch2_check_for_deadlock(trans, NULL);
+}
+
+/*
+ * Take a write lock on @b. Read locks this transaction holds on @b through
+ * any of its paths are temporarily subtracted from the lock's reader count
+ * while the write lock is being acquired.
+ *
+ * Returns the result of __btree_node_lock_nopath(); on failure the path's
+ * lock state for b->level is set to intent locked.
+ */
+int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path,
+				 struct btree_bkey_cached_common *b,
+				 bool lock_may_not_fail)
+{
+	int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
+	int ret;
+
+	/*
+	 * Must drop our read locks before calling six_lock_write() -
+	 * six_unlock() won't do wakeups until the reader count
+	 * goes to 0, and it's safe because we have the node intent
+	 * locked:
+	 */
+	six_lock_readers_add(&b->lock, -readers);
+	ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write,
+				       lock_may_not_fail, _RET_IP_);
+	six_lock_readers_add(&b->lock, readers);
+
+	if (ret)
+		mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_INTENT_LOCKED);
+
+	return ret;
+}
+
+/*
+ * Take a write lock on @b that is not allowed to fail: every read lock held
+ * by any path of @trans is dropped first (those paths are marked
+ * BTREE_ITER_NEED_RELOCK), then the write lock is taken with
+ * lock_may_not_fail set. BUG()s if taking the lock still fails.
+ */
+void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
+				       struct btree_path *path,
+				       struct btree_bkey_cached_common *b)
+{
+	struct btree_path *linked;
+	unsigned i;
+	int ret;
+
+	/*
+	 * XXX BIG FAT NOTICE
+	 *
+	 * Drop all read locks before taking a write lock:
+	 *
+	 * This is a hack, because bch2_btree_node_lock_write_nofail() is a
+	 * hack - but by dropping read locks first, this should never fail, and
+	 * we only use this in code paths where whatever read locks we've
+	 * already taken are no longer needed:
+	 */
+
+	trans_for_each_path(trans, linked) {
+		if (!linked->nodes_locked)
+			continue;
+
+		for (i = 0; i < BTREE_MAX_DEPTH; i++)
+			if (btree_node_read_locked(linked, i)) {
+				btree_node_unlock(trans, linked, i);
+				btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK);
+			}
+	}
+
+	ret = __btree_node_lock_write(trans, path, b, true);
+	BUG_ON(ret);
+}
+
+/* relock */
+
+/*
+ * Relock (or, if @upgrade, upgrade) every node of @path from path->level up
+ * to, but not including, path->locks_want, stopping early at the first level
+ * that has no node.
+ *
+ * If any level fails, the whole path is unlocked, marked as needing a
+ * traverse, and the node pointers from the highest failed level down to
+ * level 0 are replaced with an error pointer.
+ *
+ * Returns true if the path ended up uptodate.
+ */
+static inline bool btree_path_get_locks(struct btree_trans *trans,
+					struct btree_path *path,
+					bool upgrade)
+{
+	unsigned l = path->level;
+	int fail_idx = -1;
+
+	do {
+		if (!btree_path_node(path, l))
+			break;
+
+		/* Keep going after a failure: fail_idx ends up at the highest one */
+		if (!(upgrade
+		      ? bch2_btree_node_upgrade(trans, path, l)
+		      : bch2_btree_node_relock(trans, path, l)))
+			fail_idx = l;
+
+		l++;
+	} while (l < path->locks_want);
+
+	/*
+	 * When we fail to get a lock, we have to ensure that any child nodes
+	 * can't be relocked so bch2_btree_path_traverse has to walk back up to
+	 * the node that we failed to relock:
+	 */
+	if (fail_idx >= 0) {
+		__bch2_btree_path_unlock(trans, path);
+		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+
+		do {
+			path->l[fail_idx].b = upgrade
+				? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
+				: ERR_PTR(-BCH_ERR_no_btree_node_relock);
+			--fail_idx;
+		} while (fail_idx >= 0);
+	}
+
+	/* Everything was relocked: the path no longer needs a relock */
+	if (path->uptodate == BTREE_ITER_NEED_RELOCK)
+		path->uptodate = BTREE_ITER_UPTODATE;
+
+	bch2_trans_verify_locks(trans);
+
+	return path->uptodate < BTREE_ITER_NEED_RELOCK;
+}
+
+/*
+ * Try to retake the lock @path wants on its node at @level, without blocking.
+ *
+ * Succeeds if six_relock_type() succeeds against the sequence number saved in
+ * the path, or if the sequence number still matches and
+ * btree_node_lock_increment() succeeds. race_fault() can force the failure
+ * path.
+ *
+ * @trace: emit the relock_fail tracepoint/counter on failure (unless
+ *	   trans->notrace_relock_fail is set)
+ *
+ * Returns true if the node is now marked locked in @path.
+ */
+bool __bch2_btree_node_relock(struct btree_trans *trans,
+			      struct btree_path *path, unsigned level,
+			      bool trace)
+{
+	struct btree *b = btree_path_node(path, level);
+	int want = __btree_lock_want(path, level);
+
+	if (race_fault())
+		goto fail;
+
+	if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
+	    (btree_node_lock_seq_matches(path, b, level) &&
+	     btree_node_lock_increment(trans, &b->c, level, want))) {
+		mark_btree_node_locked(trans, path, level, want);
+		return true;
+	}
+fail:
+	if (trace && !trans->notrace_relock_fail)
+		trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level);
+	return false;
+}
+
+/* upgrade */
+
+/*
+ * Bring the lock @path holds on its node at @level up to what the path wants.
+ *
+ * If the path wants no lock or a read lock there, this degenerates to a check
+ * or a plain relock. If it wants an intent lock, the existing lock is
+ * upgraded in place, or the node is relocked as intent using the saved
+ * sequence number, or - failing both - an intent lock already held through
+ * another path is reused.
+ *
+ * Returns false if there is no node at @level or the intent lock could not
+ * be obtained.
+ */
+bool bch2_btree_node_upgrade(struct btree_trans *trans,
+			     struct btree_path *path, unsigned level)
+{
+	struct btree *b = path->l[level].b;
+	/* Locks on b held via the other paths of this transaction */
+	struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level);
+
+	if (!is_btree_node(path, level))
+		return false;
+
+	switch (btree_lock_want(path, level)) {
+	case BTREE_NODE_UNLOCKED:
+		BUG_ON(btree_node_locked(path, level));
+		return true;
+	case BTREE_NODE_READ_LOCKED:
+		BUG_ON(btree_node_intent_locked(path, level));
+		return bch2_btree_node_relock(trans, path, level);
+	case BTREE_NODE_INTENT_LOCKED:
+		break;
+	case BTREE_NODE_WRITE_LOCKED:
+		BUG();
+	}
+
+	if (btree_node_intent_locked(path, level))
+		return true;
+
+	if (race_fault())
+		return false;
+
+	if (btree_node_locked(path, level)) {
+		bool ret;
+
+		/*
+		 * Subtract the read locks held via other paths from the reader
+		 * count for the duration of the upgrade attempt:
+		 */
+		six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]);
+		ret = six_lock_tryupgrade(&b->c.lock);
+		six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]);
+
+		if (ret)
+			goto success;
+	} else {
+		if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
+			goto success;
+	}
+
+	/*
+	 * Do we already have an intent lock via another path? If so, just bump
+	 * lock count:
+	 */
+	if (btree_node_lock_seq_matches(path, b, level) &&
+	    btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) {
+		btree_node_unlock(trans, path, level);
+		goto success;
+	}
+
+	trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level);
+	return false;
+success:
+	mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
+	return true;
+}
+
+/* Btree path locking: */
+
+/*
+ * Only for btree_cache.c - only relocks intent locks
+ *
+ * Relocks the levels from path->level up to locks_want that have a node. On
+ * the first failure the path is unlocked and marked as needing a traverse,
+ * and the transaction is restarted; returns 0 if everything was relocked.
+ */
+int bch2_btree_path_relock_intent(struct btree_trans *trans,
+				  struct btree_path *path)
+{
+	unsigned l = path->level;
+
+	while (l < path->locks_want && btree_path_node(path, l)) {
+		if (!bch2_btree_node_relock(trans, path, l)) {
+			__bch2_btree_path_unlock(trans, path);
+			btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+			trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path);
+			return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent);
+		}
+
+		l++;
+	}
+
+	return 0;
+}
+
+/*
+ * Relock every level @path wants locked, without restarting the transaction
+ * on failure. @trace_ip is not used here. Returns true on success.
+ */
+__flatten
+bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
+			struct btree_path *path, unsigned long trace_ip)
+{
+	return btree_path_get_locks(trans, path, false);
+}
+
+/*
+ * Relock @path; if that fails, record the event against @trace_ip and restart
+ * the transaction. Returns 0 on success, the restart error otherwise.
+ */
+int __bch2_btree_path_relock(struct btree_trans *trans,
+			struct btree_path *path, unsigned long trace_ip)
+{
+	if (bch2_btree_path_relock_norestart(trans, path, trace_ip))
+		return 0;
+
+	trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
+	return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
+}
+
+/*
+ * Raise @path's locks_want to @new_locks_want and try to take the additional
+ * locks, leaving the other paths of the transaction alone. Returns true if
+ * the path ended up with all the locks it wants.
+ */
+bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
+			       struct btree_path *path,
+			       unsigned new_locks_want)
+{
+	/* Only meant for raising locks_want */
+	EBUG_ON(path->locks_want >= new_locks_want);
+
+	path->locks_want = new_locks_want;
+
+	return btree_path_get_locks(trans, path, true);
+}
+
+/*
+ * Upgrade @path to @new_locks_want. If that fails, also bump locks_want on
+ * the other non-cached paths of the same btree so they pick up the locks the
+ * next time around. Returns true only if @path itself was upgraded.
+ */
+bool __bch2_btree_path_upgrade(struct btree_trans *trans,
+			       struct btree_path *path,
+			       unsigned new_locks_want)
+{
+	struct btree_path *sib;
+
+	if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want))
+		return true;
+
+	/*
+	 * XXX: this is ugly - we'd prefer to not be mucking with other
+	 * iterators in the btree_trans here.
+	 *
+	 * On failure to upgrade the iterator, setting iter->locks_want and
+	 * calling get_locks() is sufficient to make bch2_btree_path_traverse()
+	 * get the locks we want on transaction restart.
+	 *
+	 * But if this iterator was a clone, on transaction restart what we did
+	 * to this iterator isn't going to be preserved.
+	 *
+	 * Possibly we could add an iterator field for the parent iterator when
+	 * an iterator is a copy - for now, we'll just upgrade any other
+	 * iterators with the same btree id.
+	 *
+	 * The code below used to be needed to ensure ancestor nodes get locked
+	 * before interior nodes - now that's handled by
+	 * bch2_btree_path_traverse_all().
+	 */
+	if (path->cached || trans->in_traverse_all)
+		return false;
+
+	trans_for_each_path(trans, sib) {
+		if (sib == path ||
+		    sib->cached != path->cached ||
+		    sib->btree_id != path->btree_id ||
+		    sib->locks_want >= new_locks_want)
+			continue;
+
+		sib->locks_want = new_locks_want;
+		btree_path_get_locks(trans, sib, true);
+	}
+
+	return false;
+}
+
+/*
+ * Lower @path's locks_want to @new_locks_want and release what is no longer
+ * wanted: locks on levels above path->level that are at or beyond the new
+ * locks_want are dropped, and an intent lock at or below path->level is
+ * downgraded to a read lock.
+ */
+void __bch2_btree_path_downgrade(struct btree_trans *trans,
+				 struct btree_path *path,
+				 unsigned new_locks_want)
+{
+	unsigned l;
+
+	EBUG_ON(path->locks_want < new_locks_want);
+
+	path->locks_want = new_locks_want;
+
+	/* Work downwards from the highest locked level */
+	while (path->nodes_locked &&
+	       (l = btree_path_highest_level_locked(path)) >= path->locks_want) {
+		if (l > path->level) {
+			btree_node_unlock(trans, path, l);
+		} else {
+			if (btree_node_intent_locked(path, l)) {
+				six_lock_downgrade(&path->l[l].b->c.lock);
+				mark_btree_node_locked_noreset(path, l, BTREE_NODE_READ_LOCKED);
+			}
+			break;
+		}
+	}
+
+	bch2_btree_path_verify_locks(path);
+}
+
+/* Btree transaction locking: */
+
+/* Call bch2_btree_path_downgrade() on every path of @trans. */
+void bch2_trans_downgrade(struct btree_trans *trans)
+{
+	struct btree_path *path;
+
+	trans_for_each_path(trans, path)
+		bch2_btree_path_downgrade(trans, path);
+}
+
+/*
+ * Relock every path of @trans that is flagged should_be_locked.
+ *
+ * Returns 0 on success. If the transaction was already restarted, returns the
+ * negated restart code; if a path cannot be relocked, traces the failure and
+ * restarts the transaction.
+ */
+int bch2_trans_relock(struct btree_trans *trans)
+{
+	struct btree_path *path;
+
+	if (unlikely(trans->restarted))
+		return -((int) trans->restarted);
+
+	trans_for_each_path(trans, path) {
+		if (!path->should_be_locked)
+			continue;
+
+		if (bch2_btree_path_relock_norestart(trans, path, _RET_IP_))
+			continue;
+
+		trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path);
+		return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+	}
+
+	return 0;
+}
+
+/*
+ * Same as bch2_trans_relock(), minus the trans_restart_relock trace/count on
+ * failure.
+ */
+int bch2_trans_relock_notrace(struct btree_trans *trans)
+{
+	struct btree_path *path;
+
+	if (unlikely(trans->restarted))
+		return -((int) trans->restarted);
+
+	trans_for_each_path(trans, path) {
+		if (!path->should_be_locked)
+			continue;
+
+		if (bch2_btree_path_relock_norestart(trans, path, _RET_IP_))
+			continue;
+
+		return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+	}
+
+	return 0;
+}
+
+/*
+ * Drop every lock held by every path of @trans. In this file the body is
+ * identical to bch2_trans_unlock().
+ */
+void bch2_trans_unlock_noassert(struct btree_trans *trans)
+{
+	struct btree_path *path;
+
+	trans_for_each_path(trans, path)
+		__bch2_btree_path_unlock(trans, path);
+}
+
+/* Drop every lock held by every path of @trans. */
+void bch2_trans_unlock(struct btree_trans *trans)
+{
+	struct btree_path *path;
+
+	trans_for_each_path(trans, path)
+		__bch2_btree_path_unlock(trans, path);
+}
+
+/* Returns true if any path of @trans currently holds at least one node lock. */
+bool bch2_trans_locked(struct btree_trans *trans)
+{
+	struct btree_path *path;
+
+	trans_for_each_path(trans, path)
+		if (path->nodes_locked)
+			return true;
+	return false;
+}
+
+/*
+ * Take @lock via drop_locks_do(), which wraps the mutex_lock() call.
+ *
+ * Returns 0 with @lock held. If drop_locks_do() returns an error, @lock is
+ * released again and the error returned, so the mutex is held only on
+ * success.
+ */
+int __bch2_trans_mutex_lock(struct btree_trans *trans,
+			    struct mutex *lock)
+{
+	int ret = drop_locks_do(trans, (mutex_lock(lock), 0));
+
+	if (ret)
+		mutex_unlock(lock);
+	return ret;
+}
+
+/* Debug */
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+/*
+ * Debug check of @path's lock state:
+ *  - a path with no locks held must not be both uptodate and pointing at a
+ *    node at path->level,
+ *  - a level with no node must not be marked locked,
+ *  - at a level with a node, the lock held must match the lock wanted, except
+ *    that a write lock is accepted wherever some lock is wanted.
+ */
+void bch2_btree_path_verify_locks(struct btree_path *path)
+{
+	unsigned l;
+
+	if (!path->nodes_locked) {
+		BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
+		       btree_path_node(path, path->level));
+		return;
+	}
+
+	for (l = 0; l < BTREE_MAX_DEPTH; l++) {
+		int want = btree_lock_want(path, l);
+		int have = btree_node_locked_type(path, l);
+
+		BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED);
+
+		BUG_ON(is_btree_node(path, l) &&
+		       (want == BTREE_NODE_UNLOCKED ||
+			have != BTREE_NODE_WRITE_LOCKED) &&
+		       want != have);
+	}
+}
+
+/* Run bch2_btree_path_verify_locks() on every path of @trans. */
+void bch2_trans_verify_locks(struct btree_trans *trans)
+{
+	struct btree_path *path;
+
+	trans_for_each_path(trans, path)
+		bch2_btree_path_verify_locks(path);
+}
+
+#endif
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
new file mode 100644
index 000000000000..6231e9ffc5d7
--- /dev/null
+++ b/fs/bcachefs/btree_locking.h
@@ -0,0 +1,423 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_LOCKING_H
+#define _BCACHEFS_BTREE_LOCKING_H
+
+/*
+ * Only for internal btree use:
+ *
+ * The btree iterator tracks what locks it wants to take, and what locks it
+ * currently has - here we have wrappers for locking/unlocking btree nodes and
+ * updating the iterator state
+ */
+
+#include "btree_iter.h"
+#include "six.h"
+
+void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags);
+
+#ifdef CONFIG_LOCKDEP
+void bch2_assert_btree_nodes_not_locked(void);
+#else
+static inline void bch2_assert_btree_nodes_not_locked(void) {}
+#endif
+
+void bch2_trans_unlock_noassert(struct btree_trans *);
+
+static inline bool is_btree_node(struct btree_path *path, unsigned l)
+{	/* True if @l is in range and path->l[l].b is neither NULL nor an error pointer. */
+	return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b);
+}
+
+static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans)
+{	/* Stats slot selected by trans->fn_idx, or NULL when the index is past the array. */
+	return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats)
+		? &trans->c->btree_transaction_stats[trans->fn_idx]
+		: NULL;
+}
+
+/* lock state of one path level: locked values match six lock types, -1 is unlocked */
+enum btree_node_locked_type {
+	BTREE_NODE_UNLOCKED		= -1,
+	BTREE_NODE_READ_LOCKED		= SIX_LOCK_read,
+	BTREE_NODE_INTENT_LOCKED	= SIX_LOCK_intent,
+	BTREE_NODE_WRITE_LOCKED		= SIX_LOCK_write,
+};
+
+static inline int btree_node_locked_type(struct btree_path *path,
+					 unsigned level)
+{	/* nodes_locked holds 2 bits per level, stored as type + 1, so a 0 field decodes to UNLOCKED. */
+	return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3);
+}
+
+static inline bool btree_node_write_locked(struct btree_path *path, unsigned l)
+{	/* True if @path records a write lock at level @l. */
+	return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED;
+}
+
+static inline bool btree_node_intent_locked(struct btree_path *path, unsigned l)
+{	/* True if @path records exactly an intent lock at level @l (a write lock does not count). */
+	return btree_node_locked_type(path, l) == BTREE_NODE_INTENT_LOCKED;
+}
+
+static inline bool btree_node_read_locked(struct btree_path *path, unsigned l)
+{	/* True if @path records a read lock at level @l. */
+	return btree_node_locked_type(path, l) == BTREE_NODE_READ_LOCKED;
+}
+
+static inline bool btree_node_locked(struct btree_path *path, unsigned level)
+{	/* True if @path records any lock type (read, intent or write) at @level. */
+	return btree_node_locked_type(path, level) != BTREE_NODE_UNLOCKED;
+}
+
+static inline void mark_btree_node_locked_noreset(struct btree_path *path,
+						  unsigned level,
+						  enum btree_node_locked_type type)
+{	/* Overwrite @level's 2-bit field in nodes_locked with @type; no other state is touched. */
+	/* relying on this to avoid a branch */
+	BUILD_BUG_ON(SIX_LOCK_read   != 0);
+	BUILD_BUG_ON(SIX_LOCK_intent != 1);
+
+	path->nodes_locked &= ~(3U << (level << 1));	/* clear the old field */
+	path->nodes_locked |= (type + 1) << (level << 1);	/* UNLOCKED (-1) is stored as 0 */
+}
+
+static inline void mark_btree_node_unlocked(struct btree_path *path,
+					    unsigned level)
+{	/* Clear @level's lock bits; EBUG_ON asserts the level was not write locked. */
+	EBUG_ON(btree_node_write_locked(path, level));
+	mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED);
+}
+
+static inline void mark_btree_node_locked(struct btree_trans *trans,
+					  struct btree_path *path,
+					  unsigned level,
+					  enum btree_node_locked_type type)
+{	/* Record @type for @level in @path; @trans is not used by this body. */
+	mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type);
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+	path->l[level].lock_taken_time = local_clock();	/* start of the hold-time interval */
+#endif
+}
+
+static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)
+{	/* Six lock type for @level: intent below path->locks_want, read otherwise. */
+	return level < path->locks_want
+		? SIX_LOCK_intent
+		: SIX_LOCK_read;
+}
+
+static inline enum btree_node_locked_type
+btree_lock_want(struct btree_path *path, int level)
+{	/* Lock state @path should have at @level; checks are ordered, first match wins. */
+	if (level < path->level)	/* below the path's own level: nothing */
+		return BTREE_NODE_UNLOCKED;
+	if (level < path->locks_want)
+		return BTREE_NODE_INTENT_LOCKED;
+	if (level == path->level)
+		return BTREE_NODE_READ_LOCKED;
+	return BTREE_NODE_UNLOCKED;	/* above the path's level and not covered by locks_want */
+}
+
+static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
+					      struct btree_path *path, unsigned level)
+{	/* With lock time stats configured, account time since lock_taken_time; otherwise a no-op. */
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+	struct btree_transaction_stats *s = btree_trans_stats(trans);
+
+	if (s)	/* NULL when trans->fn_idx has no stats slot */
+		__bch2_time_stats_update(&s->lock_hold_times,
+					 path->l[level].lock_taken_time,
+					 local_clock());
+#endif
+}
+
+/* unlock: */
+
+static inline void btree_node_unlock(struct btree_trans *trans,
+				     struct btree_path *path, unsigned level)
+{	/* Release whatever lock @path records at @level (if any) and clear its lock bits. */
+	int lock_type = btree_node_locked_type(path, level);
+
+	EBUG_ON(level >= BTREE_MAX_DEPTH);
+
+	if (lock_type != BTREE_NODE_UNLOCKED) {
+		six_unlock_type(&path->l[level].b->c.lock, lock_type);
+		btree_trans_lock_hold_time_update(trans, path, level);
+	}
+	mark_btree_node_unlocked(path, level);	/* asserts (EBUG_ON) the level was not write locked */
+}
+
+static inline int btree_path_lowest_level_locked(struct btree_path *path)
+{	/* Lowest set bit / 2 is the level; kernel __ffs() is undefined for 0, so nodes_locked must be nonzero. */
+	return __ffs(path->nodes_locked) >> 1;
+}
+
+static inline int btree_path_highest_level_locked(struct btree_path *path)
+{	/* Highest set bit / 2 is the level; kernel __fls() is undefined for 0, so nodes_locked must be nonzero. */
+	return __fls(path->nodes_locked) >> 1;
+}
+
+static inline void __bch2_btree_path_unlock(struct btree_trans *trans,
+					    struct btree_path *path)
+{	/* Flag @path as needing relock, then drop its locks one level at a time, lowest first. */
+	btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK);
+
+	while (path->nodes_locked)	/* each unlock clears that level's bits, so this terminates */
+		btree_node_unlock(trans, path, btree_path_lowest_level_locked(path));
+}
+
+/*
+ * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
+ * succeed:
+ */
+static inline void
+bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
+				     struct btree *b)
+{	/* Re-mark @path's write lock on @b as intent, bump saved lock_seq, release the six write lock. */
+	struct btree_path *linked;
+
+	EBUG_ON(path->l[b->c.level].b != b);
+	EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock));
+	EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write);
+
+	mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
+
+	trans_for_each_path_with_node(trans, b, linked)
+		linked->l[b->c.level].lock_seq++;	/* presumably mirrors the seq bump done by the unlock - confirm */
+
+	six_unlock_write(&b->c.lock);
+}
+
+void bch2_btree_node_unlock_write(struct btree_trans *,
+			struct btree_path *, struct btree *);
+
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
+
+/* lock: */
+
+static inline int __btree_node_lock_nopath(struct btree_trans *trans,
+					 struct btree_bkey_cached_common *b,
+					 enum six_lock_type type,
+					 bool lock_may_not_fail,
+					 unsigned long ip)
+{	/* Take a six lock of @type on @b with bch2_six_check_for_deadlock as callback; returns its result. */
+	int ret;
+
+	trans->lock_may_not_fail = lock_may_not_fail;
+	trans->lock_must_abort	= false;
+	trans->locking		= b;	/* lock being acquired, recorded for the duration of the call */
+
+	ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
+				 bch2_six_check_for_deadlock, trans, ip);
+	WRITE_ONCE(trans->locking, NULL);
+	WRITE_ONCE(trans->locking_wait.start_time, 0);
+	return ret;
+}
+
+static inline int __must_check
+btree_node_lock_nopath(struct btree_trans *trans,
+		       struct btree_bkey_cached_common *b,
+		       enum six_lock_type type,
+		       unsigned long ip)
+{	/* Fallible variant: lock_may_not_fail = false, so the caller must check the result. */
+	return __btree_node_lock_nopath(trans, b, type, false, ip);
+}
+
+static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans,
+					 struct btree_bkey_cached_common *b,
+					 enum six_lock_type type)
+{	/* Must-succeed variant: passes lock_may_not_fail = true and BUGs on any error. */
+	int ret = __btree_node_lock_nopath(trans, b, type, true, _THIS_IP_);
+
+	BUG_ON(ret);
+}
+
+/*
+ * Lock a btree node if we already have it locked on one of our linked
+ * iterators:
+ */
+static inline bool btree_node_lock_increment(struct btree_trans *trans,
+					     struct btree_bkey_cached_common *b,
+					     unsigned level,
+					     enum btree_node_locked_type want)
+{	/* True if some path of @trans holds @b at @level with lock type >= @want; the lock is then incremented. */
+	struct btree_path *path;
+
+	trans_for_each_path(trans, path)
+		if (&path->l[level].b->c == b &&
+		    btree_node_locked_type(path, level) >= want) {
+			six_lock_increment(&b->lock, (enum six_lock_type) want);
+			return true;
+		}
+
+	return false;	/* no path holds a sufficient lock; nothing was taken */
+}
+
+static inline int btree_node_lock(struct btree_trans *trans,
+			struct btree_path *path,
+			struct btree_bkey_cached_common *b,
+			unsigned level,
+			enum six_lock_type type,
+			unsigned long ip)
+{	/* Lock @b: trylock, else reuse a lock another path holds, else block. Returns 0 or the blocking lock's error. */
+	int ret = 0;
+
+	EBUG_ON(level >= BTREE_MAX_DEPTH);
+	EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
+
+	if (likely(six_trylock_type(&b->lock, type)) ||
+	    btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) ||
+	    !(ret = btree_node_lock_nopath(trans, b, type, btree_path_ip_allocated(path)))) {
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+		path->l[b->level].lock_taken_time = local_clock();	/* lock acquired: start hold-time interval */
+#endif
+	}
+
+	return ret;	/* note: @ip is not referenced in this body */
+}
+
+int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_path *,
+				 struct btree_bkey_cached_common *b, bool);
+
+static inline int __btree_node_lock_write(struct btree_trans *trans,
+					  struct btree_path *path,
+					  struct btree_bkey_cached_common *b,
+					  bool lock_may_not_fail)
+{	/* Upgrade @path's intent lock on @b to write; the EBUG_ONs assert the intent lock and saved seq. */
+	EBUG_ON(&path->l[b->level].b->c != b);
+	EBUG_ON(path->l[b->level].lock_seq != six_lock_seq(&b->lock));
+	EBUG_ON(!btree_node_intent_locked(path, b->level));
+
+	/*
+	 * six locks are unfair, and read locks block while a thread wants a
+	 * write lock: thus, we need to tell the cycle detector we have a write
+	 * lock _before_ taking the lock:
+	 */
+	mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_WRITE_LOCKED);
+
+	return likely(six_trylock_write(&b->lock))	/* fast path: uncontended */
+		? 0
+		: __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail);
+}
+
+static inline int __must_check
+bch2_btree_node_lock_write(struct btree_trans *trans,
+			   struct btree_path *path,
+			   struct btree_bkey_cached_common *b)
+{	/* Fallible write lock: lock_may_not_fail = false, so the result must be checked. */
+	return __btree_node_lock_write(trans, path, b, false);
+}
+
+void bch2_btree_node_lock_write_nofail(struct btree_trans *,
+				       struct btree_path *,
+				       struct btree_bkey_cached_common *);
+
+/* relock: */
+
+bool bch2_btree_path_relock_norestart(struct btree_trans *,
+				      struct btree_path *, unsigned long);
+int __bch2_btree_path_relock(struct btree_trans *,
+			     struct btree_path *, unsigned long);
+
+static inline int bch2_btree_path_relock(struct btree_trans *trans,
+				struct btree_path *path, unsigned long trace_ip)
+{	/* Returns 0 immediately if path->level is already locked, else defers to the slow path. */
+	return btree_node_locked(path, path->level)
+		? 0
+		: __bch2_btree_path_relock(trans, path, trace_ip);
+}
+
+bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace);
+
+static inline bool bch2_btree_node_relock(struct btree_trans *trans,
+					  struct btree_path *path, unsigned level)
+{	/* True if @level is locked already or could be relocked; passes trace = true to the slow path. */
+	EBUG_ON(btree_node_locked(path, level) &&
+		!btree_node_write_locked(path, level) &&
+		btree_node_locked_type(path, level) != __btree_lock_want(path, level));
+
+	return likely(btree_node_locked(path, level)) ||
+		(!IS_ERR_OR_NULL(path->l[level].b) &&	/* no valid node at this level: cannot relock */
+		 __bch2_btree_node_relock(trans, path, level, true));
+}
+
+static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
+						  struct btree_path *path, unsigned level)
+{	/* Same as bch2_btree_node_relock() except that trace = false is passed to the slow path. */
+	EBUG_ON(btree_node_locked(path, level) &&
+		!btree_node_write_locked(path, level) &&
+		btree_node_locked_type(path, level) != __btree_lock_want(path, level));
+
+	return likely(btree_node_locked(path, level)) ||
+		(!IS_ERR_OR_NULL(path->l[level].b) &&	/* no valid node at this level: cannot relock */
+		 __bch2_btree_node_relock(trans, path, level, false));
+}
+
+/* upgrade */
+
+bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
+			       struct btree_path *, unsigned);
+bool __bch2_btree_path_upgrade(struct btree_trans *,
+			       struct btree_path *, unsigned);
+
+static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
+					  struct btree_path *path,
+					  unsigned new_locks_want)
+{	/* Raise @path's locks_want to @new_locks_want; returns 0 or a transaction restart error. */
+	unsigned old_locks_want = path->locks_want;
+
+	new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
+
+	if (path->locks_want < new_locks_want	/* upgrade needed: succeed iff the upgrade does */
+	    ? __bch2_btree_path_upgrade(trans, path, new_locks_want)
+	    : path->uptodate == BTREE_ITER_UPTODATE)	/* no upgrade needed: succeed iff path is uptodate */
+		return 0;
+
+	trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path,
+			old_locks_want, new_locks_want);
+	return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
+}
+
+/* misc: */
+
+static inline void btree_path_set_should_be_locked(struct btree_path *path)
+{	/* Set should_be_locked; EBUG_ONs assert the path is locked at its level and uptodate is 0. */
+	EBUG_ON(!btree_node_locked(path, path->level));
+	EBUG_ON(path->uptodate);
+
+	path->should_be_locked = true;
+}
+
+static inline void __btree_path_set_level_up(struct btree_trans *trans,
+				      struct btree_path *path,
+				      unsigned l)
+{	/* Unlock level @l and replace its node pointer with the no_btree_node_up error pointer. */
+	btree_node_unlock(trans, path, l);
+	path->l[l].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
+}
+
+static inline void btree_path_set_level_up(struct btree_trans *trans,
+				    struct btree_path *path)
+{	/* Move @path one level up and flag it as needing traversal. */
+	__btree_path_set_level_up(trans, path, path->level++);	/* post-increment: the old level is the one dropped */
+	btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+}
+
+/* debug */
+
+struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *,
+				struct btree_path *,
+				struct btree_bkey_cached_common *b,
+				unsigned);
+
+int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *);
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_btree_path_verify_locks(struct btree_path *);
+void bch2_trans_verify_locks(struct btree_trans *);
+#else
+static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
+static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
+#endif
+
+#endif /* _BCACHEFS_BTREE_LOCKING_H */
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
new file mode 100644
index 000000000000..04c1f4610972
--- /dev/null
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -0,0 +1,1150 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_gc.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_journal_iter.h"
+#include "btree_key_cache.h"
+#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
+#include "buckets.h"
+#include "errcode.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+#include "replicas.h"
+#include "snapshot.h"
+
+#include <linux/prefetch.h>
+
+static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i)
+{	/* Debug-only: BUG unless i->old_k / i->old_v still match the key found at the update's path. */
+#ifdef CONFIG_BCACHEFS_DEBUG
+	struct bch_fs *c = trans->c;
+	struct bkey u;
+	struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u);
+
+	if (unlikely(trans->journal_replay_not_finished)) {	/* a key in the journal keys overrides the btree key */
+		struct bkey_i *j_k =
+			bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p);
+
+		if (j_k)
+			k = bkey_i_to_s_c(j_k);
+	}
+
+	u = *k.k;
+	u.needs_whiteout = i->old_k.needs_whiteout;	/* takes needs_whiteout out of the comparison */
+
+	BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey)));
+	BUG_ON(i->old_v != k.v);
+#endif
+}
+
+static inline struct btree_path_level *insert_l(struct btree_insert_entry *i)
+{	/* Path level entry that update @i targets, i.e. &i->path->l[i->level]. */
+	return i->path->l + i->level;
+}
+
+static inline bool same_leaf_as_prev(struct btree_trans *trans,
+				     struct btree_insert_entry *i)
+{	/* True if @i is not the first update and targets the same node as the update before it. */
+	return i != trans->updates &&
+		insert_l(&i[0])->b == insert_l(&i[-1])->b;
+}
+
+static inline bool same_leaf_as_next(struct btree_trans *trans,
+				     struct btree_insert_entry *i)
+{	/* True if @i is not the last update and targets the same node as the update after it. */
+	return i + 1 < trans->updates + trans->nr_updates &&
+		insert_l(&i[0])->b == insert_l(&i[1])->b;
+}
+
+inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
+					   struct btree_path *path,
+					   struct btree *b)
+{	/* Get @b ready for inserts: post-write cleanup and, if wanted, a new bset. @path is unused here. */
+	struct bch_fs *c = trans->c;
+
+	if (unlikely(btree_node_just_written(b)) &&
+	    bch2_btree_post_write_cleanup(c, b))
+		bch2_trans_node_reinit_iter(trans, b);	/* only when the cleanup returned nonzero */
+
+	/*
+	 * If the last bset has been written, or if it's gotten too big - start
+	 * a new bset to insert into:
+	 */
+	if (want_new_bset(c, b))
+		bch2_btree_init_next(trans, b);
+}
+
+/* Inserting into a given leaf node (last stage of insert): */
+
+/* Handle overwrites and do insert, for non extents; false only if deleting an absent key: */
+bool bch2_btree_bset_insert_key(struct btree_trans *trans,
+				struct btree_path *path,
+				struct btree *b,
+				struct btree_node_iter *node_iter,
+				struct bkey_i *insert)
+{
+	struct bkey_packed *k;
+	unsigned clobber_u64s = 0, new_u64s = 0;
+
+	EBUG_ON(btree_node_just_written(b));
+	EBUG_ON(bset_written(b, btree_bset_last(b)));
+	EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
+	EBUG_ON(bpos_lt(insert->k.p, b->data->min_key));
+	EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
+	EBUG_ON(insert->k.u64s >
+		bch_btree_keys_u64s_remaining(trans->c, b));
+	EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));
+
+	k = bch2_btree_node_iter_peek_all(node_iter, b);
+	if (k && bkey_cmp_left_packed(b, k, &insert->k.p))	/* key at the iterator is at another position */
+		k = NULL;
+
+	/* @k is the key being overwritten/deleted, if any: */
+	EBUG_ON(k && bkey_deleted(k));
+
+	/* Deleting, but not found? nothing to do: */
+	if (bkey_deleted(&insert->k) && !k)
+		return false;
+
+	if (bkey_deleted(&insert->k)) {
+		/* Deleting: */
+		btree_account_key_drop(b, k);
+		k->type = KEY_TYPE_deleted;
+
+		if (k->needs_whiteout)
+			push_whiteout(trans->c, b, insert->k.p);
+		k->needs_whiteout = false;
+
+		if (k >= btree_bset_last(b)->start) {	/* @k is in the last bset: remove it in place */
+			clobber_u64s = k->u64s;
+			bch2_bset_delete(b, k, clobber_u64s);
+			goto fix_iter;
+		} else {
+			bch2_btree_path_fix_key_modified(trans, b, k);
+		}
+
+		return true;
+	}
+
+	if (k) {
+		/* Overwriting: */
+		btree_account_key_drop(b, k);
+		k->type = KEY_TYPE_deleted;
+
+		insert->k.needs_whiteout = k->needs_whiteout;	/* the new key inherits the flag */
+		k->needs_whiteout = false;
+
+		if (k >= btree_bset_last(b)->start) {	/* @k is in the last bset: overwrite it in place */
+			clobber_u64s = k->u64s;
+			goto overwrite;
+		} else {
+			bch2_btree_path_fix_key_modified(trans, b, k);
+		}
+	}
+
+	k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
+overwrite:
+	bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
+	new_u64s = k->u64s;
+fix_iter:
+	if (clobber_u64s != new_u64s)	/* size at @k changed: iterators need fixing up */
+		bch2_btree_node_iter_fix(trans, path, b, node_iter, k,
+					 clobber_u64s, new_u64s);
+	return true;
+}
+
+static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
+			       unsigned i, u64 seq)
+{	/* Shared body of the flush0/flush1 callbacks; @i is the index of @pin's btree_write in b->writes. */
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct btree_write *w = container_of(pin, struct btree_write, journal);
+	struct btree *b = container_of(w, struct btree, writes[i]);
+	struct btree_trans *trans = bch2_trans_get(c);
+	unsigned long old, new, v;
+	unsigned idx = w - b->writes;
+
+	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+	v = READ_ONCE(b->flags);
+
+	do {	/* cmpxchg loop: flag the node for a journal_reclaim write, unless a check below bails out */
+		old = new = v;
+
+		if (!(old & (1 << BTREE_NODE_dirty)) ||
+		    !!(old & (1 << BTREE_NODE_write_idx)) != idx ||
+		    w->journal.seq != seq)
+			break;
+
+		new &= ~BTREE_WRITE_TYPE_MASK;
+		new |= BTREE_WRITE_journal_reclaim;
+		new |= 1 << BTREE_NODE_need_write;
+	} while ((v = cmpxchg(&b->flags, old, new)) != old);
+
+	btree_node_write_if_need(c, b, SIX_LOCK_read);
+	six_unlock_read(&b->c.lock);
+
+	bch2_trans_put(trans);
+	return 0;	/* always reports success */
+}
+
+int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+{	/* Journal pin flush callback for a pin embedded in b->writes[0]. */
+	return __btree_node_flush(j, pin, 0, seq);
+}
+
+int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+{	/* Journal pin flush callback for a pin embedded in b->writes[1]. */
+	return __btree_node_flush(j, pin, 1, seq);
+}
+
+inline void bch2_btree_add_journal_pin(struct bch_fs *c,
+				       struct btree *b, u64 seq)
+{	/* Pin journal sequence @seq with @b's current btree_write. */
+	struct btree_write *w = btree_current_write(b);
+
+	bch2_journal_pin_add(&c->journal, seq, &w->journal,
+			     btree_node_write_idx(b) == 0	/* callback must match the write's index */
+			     ? bch2_btree_node_flush0
+			     : bch2_btree_node_flush1);
+}
+
+/**
+ * bch2_btree_insert_key_leaf() - insert one key into a leaf node
+ * @trans:		btree transaction object
+ * @path:		path pointing to @insert's pos
+ * @insert:		key to insert
+ * @journal_seq:	sequence number of journal reservation
+ */
+inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
+				       struct btree_path *path,
+				       struct bkey_i *insert,
+				       u64 journal_seq)
+{
+	struct bch_fs *c = trans->c;
+	struct btree *b = path_l(path)->b;
+	struct bset_tree *t = bset_tree_last(b);
+	struct bset *i = bset(b, t);
+	int old_u64s = bset_u64s(t);
+	int old_live_u64s = b->nr.live_u64s;
+	int live_u64s_added, u64s_added;
+
+	if (unlikely(!bch2_btree_bset_insert_key(trans, path, b,
+					&path_l(path)->iter, insert)))
+		return;	/* nothing was changed, so no journal pin or accounting is needed */
+
+	i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq)));
+
+	bch2_btree_add_journal_pin(c, b, journal_seq);
+
+	if (unlikely(!btree_node_dirty(b))) {
+		EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
+		set_btree_node_dirty_acct(c, b);
+	}
+
+	live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
+	u64s_added = (int) bset_u64s(t) - old_u64s;
+
+	if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)	/* shrink only, clamped at 0; U16_MAX is left alone */
+		b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
+	if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
+		b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
+
+	if (u64s_added > live_u64s_added &&	/* the bset grew by more than the live key count did */
+	    bch2_maybe_compact_whiteouts(c, b))
+		bch2_trans_node_reinit_iter(trans, b);
+}
+
+/* Cached btree updates: */
+
+/* Normal update interface: */
+
+static inline void btree_insert_entry_checks(struct btree_trans *trans,
+					     struct btree_insert_entry *i)
+{	/* Assert update @i agrees with its path (pos, cached, level, btree_id); no state is changed. */
+	BUG_ON(!bpos_eq(i->k->k.p, i->path->pos));
+	BUG_ON(i->cached	!= i->path->cached);
+	BUG_ON(i->level		!= i->path->level);
+	BUG_ON(i->btree_id	!= i->path->btree_id);
+	EBUG_ON(!i->level &&	/* leaf update in an internal snapshot node without the explicit flag */
+		!(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
+		test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
+		i->k->k.p.snapshot &&
+		bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot));
+}
+
+static noinline int
+bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags,
+				   unsigned long trace_ip)
+{	/* Get the journal pre-reservation via drop_locks_do(); @trace_ip is unused in this body. */
+	return drop_locks_do(trans,
+		bch2_journal_preres_get(&trans->c->journal,
+			&trans->journal_preres,
+			trans->journal_preres_u64s,
+			(flags & BCH_WATERMARK_MASK)));	/* only the watermark bits of @flags are passed on */
+}
+
+static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
+						      unsigned flags)
+{	/* Reserve trans->journal_u64s in the journal into trans->journal_res; returns the callee's result. */
+	return bch2_journal_res_get(&trans->c->journal, &trans->journal_res,
+				    trans->journal_u64s, flags);
+}
+
+#define JSET_ENTRY_LOG_U64s		4
+
+static noinline void journal_transaction_name(struct btree_trans *trans)
+{	/* Add a BCH_JSET_ENTRY_log entry of JSET_ENTRY_LOG_U64s u64s holding trans->fn. */
+	struct bch_fs *c = trans->c;
+	struct journal *j = &c->journal;
+	struct jset_entry *entry =
+		bch2_journal_add_entry(j, &trans->journal_res,
+				       BCH_JSET_ENTRY_log, 0, 0,
+				       JSET_ENTRY_LOG_U64s);
+	struct jset_entry_log *l =
+		container_of(entry, struct jset_entry_log, entry);
+
+	/* NOTE(review): strncpy() zero-pads but leaves l->d unterminated if trans->fn fills it - confirm readers cope */
+	strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64));
+}
+
+static inline int btree_key_can_insert(struct btree_trans *trans,
+				       struct btree *b, unsigned u64s)
+{	/* 0 if @u64s more u64s fit in @b, else -BCH_ERR_btree_insert_btree_node_full. */
+	struct bch_fs *c = trans->c;
+
+	if (!bch2_btree_node_insert_fits(c, b, u64s))
+		return -BCH_ERR_btree_insert_btree_node_full;
+
+	return 0;
+}
+
+static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
+				       struct btree_path *path, unsigned u64s)
+{	/* Make sure the cached key buffer holds @u64s (+1) u64s, growing it if needed; 0 or -BCH_ERR_*. */
+	struct bch_fs *c = trans->c;
+	struct bkey_cached *ck = (void *) path->l[0].b;
+	struct btree_insert_entry *i;
+	unsigned new_u64s;
+	struct bkey_i *new_k;
+
+	EBUG_ON(path->level);
+
+	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&	/* would dirty a clean key while the cache must wait */
+	    bch2_btree_key_cache_must_wait(c) &&
+	    !(flags & BTREE_INSERT_JOURNAL_RECLAIM))
+		return -BCH_ERR_btree_insert_need_journal_reclaim;
+
+	/*
+	 * bch2_varint_decode can read past the end of the buffer by at most 7
+	 * bytes (it won't be used):
+	 */
+	u64s += 1;
+
+	if (u64s <= ck->u64s)	/* the existing buffer is already large enough */
+		return 0;
+
+	new_u64s	= roundup_pow_of_two(u64s);
+	new_k		= krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS);
+	if (!new_k) {
+		bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
+			bch2_btree_ids[path->btree_id], new_u64s);
+		return -BCH_ERR_ENOMEM_btree_key_cache_insert;
+	}
+
+	trans_for_each_update(trans, i)	/* repoint old_v of updates that referenced the old buffer */
+		if (i->old_v == &ck->k->v)
+			i->old_v = &new_k->v;
+
+	ck->u64s	= new_u64s;
+	ck->k		= new_k;
+	return 0;
+}
+
+/* Triggers: */
+
+static int run_one_mem_trigger(struct btree_trans *trans,
+			       struct btree_insert_entry *i,
+			       unsigned flags)
+{	/* Run bch2_mark_key() for update @i: one combined call, or an insert call then an overwrite call. */
+	struct bkey_s_c old = { &i->old_k, i->old_v };
+	struct bkey_i *new = i->k;
+	const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
+	const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
+	int ret;
+
+	verify_update_old_key(trans, i);
+
+	if (unlikely(flags & BTREE_TRIGGER_NORUN))
+		return 0;
+
+	if (!btree_node_type_needs_gc((enum btree_node_type) i->btree_id))
+		return 0;
+
+	if (old_ops->atomic_trigger == new_ops->atomic_trigger &&	/* same trigger, type wants old and new */
+	    ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
+		ret   = bch2_mark_key(trans, i->btree_id, i->level,
+				old, bkey_i_to_s_c(new),
+				BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
+	} else {
+		struct bkey		_deleted = KEY(0, 0, 0);
+		struct bkey_s_c		deleted = (struct bkey_s_c) { &_deleted, NULL };
+
+		_deleted.p = i->path->pos;	/* placeholder key at the update position, standing in for the other side */
+
+		ret   = bch2_mark_key(trans, i->btree_id, i->level,
+				deleted, bkey_i_to_s_c(new),
+				BTREE_TRIGGER_INSERT|flags) ?:
+			bch2_mark_key(trans, i->btree_id, i->level,
+				old, deleted,
+				BTREE_TRIGGER_OVERWRITE|flags);
+	}
+
+	return ret;
+}
+
+static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
+				 bool overwrite)
+{	/* Returns <0 on error, 1 if a trigger ran (the "?: 1" below), 0 if there was nothing to run. */
+	/*
+	 * Transactional triggers create new btree_insert_entries, so we can't
+	 * pass them a pointer to a btree_insert_entry, that memory is going to
+	 * move:
+	 */
+	struct bkey old_k = i->old_k;
+	struct bkey_s_c old = { &old_k, i->old_v };
+	const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
+	const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
+
+	verify_update_old_key(trans, i);
+
+	if ((i->flags & BTREE_TRIGGER_NORUN) ||
+	    !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
+		return 0;
+
+	if (!i->insert_trigger_run &&	/* neither half has run yet and one combined call is possible */
+	    !i->overwrite_trigger_run &&
+	    old_ops->trans_trigger == new_ops->trans_trigger &&
+	    ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
+		i->overwrite_trigger_run = true;
+		i->insert_trigger_run = true;
+		return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k,
+					   BTREE_TRIGGER_INSERT|
+					   BTREE_TRIGGER_OVERWRITE|
+					   i->flags) ?: 1;
+	} else if (overwrite && !i->overwrite_trigger_run) {
+		i->overwrite_trigger_run = true;
+		return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1;
+	} else if (!overwrite && !i->insert_trigger_run) {
+		i->insert_trigger_run = true;
+		return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1;
+	} else {
+		return 0;
+	}
+}
+
+static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
+			      struct btree_insert_entry *btree_id_start)
+{	/* Run transactional triggers for all updates in @btree_id until a full pass runs none. */
+	struct btree_insert_entry *i;
+	bool trans_trigger_run;
+	int ret, overwrite;
+
+	for (overwrite = 1; overwrite >= 0; --overwrite) {	/* overwrite pass (1) first, then insert pass (0) */
+
+		/*
+		 * Running triggers will append more updates to the list of updates as
+		 * we're walking it:
+		 */
+		do {
+			trans_trigger_run = false;
+
+			for (i = btree_id_start;
+			     i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
+			     i++) {
+				if (i->btree_id != btree_id)
+					continue;
+
+				ret = run_one_trans_trigger(trans, i, overwrite);
+				if (ret < 0)
+					return ret;
+				if (ret)	/* a trigger ran, so rescan from btree_id_start */
+					trans_trigger_run = true;
+			}
+		} while (trans_trigger_run);
+	}
+
+	return 0;
+}
+
+static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
+{	/* Run transactional triggers btree by btree, with BTREE_ID_alloc handled last. */
+	struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
+	unsigned btree_id = 0;
+	int ret = 0;
+
+	/*
+	 * NOTE(review): run_btree_triggers() loops overwrite = 1, then 0, i.e.
+	 * overwrite triggers first - confirm against the original note: "insert
+	 * triggers run before overwrite triggers, so that when extents are moved
+	 * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
+	 * they are re-added."
+	 */
+	for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
+		if (btree_id == BTREE_ID_alloc)	/* deferred to the loop below */
+			continue;
+
+		while (btree_id_start < trans->updates + trans->nr_updates &&
+		       btree_id_start->btree_id < btree_id)
+			btree_id_start++;
+
+		ret = run_btree_triggers(trans, btree_id, btree_id_start);
+		if (ret)
+			return ret;
+	}
+
+	trans_for_each_update(trans, i) {	/* find the first alloc update, if any, and run alloc triggers from it */
+		if (i->btree_id > BTREE_ID_alloc)
+			break;
+		if (i->btree_id == BTREE_ID_alloc) {
+			ret = run_btree_triggers(trans, BTREE_ID_alloc, i);
+			if (ret)
+				return ret;
+			break;
+		}
+	}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+	trans_for_each_update(trans, i)	/* every eligible update must have had both triggers run */
+		BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
+		       (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
+		       (!i->insert_trigger_run || !i->overwrite_trigger_run));
+#endif
+	return 0;
+}
+
+static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
+{	/* Re-run in-memory triggers with BTREE_TRIGGER_GC for updates whose node gc_visited() reports. */
+	struct bch_fs *c = trans->c;
+	struct btree_insert_entry *i;
+	int ret = 0;
+
+	trans_for_each_update(trans, i) {
+		/*
+		 * XXX: synchronization of cached update triggers with gc
+		 * XXX: synchronization of interior node updates with gc
+		 */
+		BUG_ON(i->cached || i->level);
+
+		if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) {
+			ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
+			if (ret)	/* stop at the first failing trigger */
+				break;
+		}
+	}
+
+	return ret;
+}
+
+static inline int
+bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
+			       struct btree_insert_entry **stopped_at,
+			       unsigned long trace_ip)
+{	/* Check space, reserve journal, run triggers, journal and apply all updates; 0 or an error code. */
+	struct bch_fs *c = trans->c;
+	struct btree_insert_entry *i;
+	struct btree_write_buffered_key *wb;
+	struct btree_trans_commit_hook *h;
+	unsigned u64s = 0;
+	int ret;
+
+	if (race_fault()) {
+		trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
+		return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject);
+	}
+
+	/*
+	 * Check if the insert will fit in the leaf node with the write lock
+	 * held, otherwise another thread could write the node changing the
+	 * amount of space available:
+	 */
+
+	prefetch(&trans->c->journal.flags);
+
+	trans_for_each_update(trans, i) {
+		/* Multiple inserts might go to same leaf: */
+		if (!same_leaf_as_prev(trans, i))
+			u64s = 0;
+
+		u64s += i->k->k.u64s;	/* running total for the current leaf */
+		ret = !i->cached
+			? btree_key_can_insert(trans, insert_l(i)->b, u64s)
+			: btree_key_can_insert_cached(trans, flags, i->path, u64s);
+		if (ret) {
+			*stopped_at = i;	/* tells the caller which update did not fit */
+			return ret;
+		}
+	}
+
+	if (trans->nr_wb_updates &&
+	    trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size)
+		return -BCH_ERR_btree_insert_need_flush_buffer;
+
+	/*
+	 * Don't get journal reservation until after we know insert will
+	 * succeed:
+	 */
+	if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+		ret = bch2_trans_journal_res_get(trans,
+				(flags & BCH_WATERMARK_MASK)|
+				JOURNAL_RES_GET_NONBLOCK);
+		if (ret)
+			return ret;
+
+		if (unlikely(trans->journal_transaction_names))
+			journal_transaction_name(trans);
+	} else {
+		trans->journal_res.seq = c->journal.replay_journal_seq;
+	}
+
+	/*
+	 * Not allowed to fail after we've gotten our journal reservation - we
+	 * have to use it:
+	 */
+
+	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+	    !(flags & BTREE_INSERT_JOURNAL_REPLAY)) {
+		if (bch2_journal_seq_verify)
+			trans_for_each_update(trans, i)
+				i->k->k.version.lo = trans->journal_res.seq;
+		else if (bch2_inject_invalid_keys)
+			trans_for_each_update(trans, i)
+				i->k->k.version = MAX_VERSION;
+	}
+
+	if (trans->fs_usage_deltas &&	/* NOTE(review): an error return after the reservation, despite the note above - confirm */
+	    bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
+		return -BCH_ERR_btree_insert_need_mark_replicas;
+
+	if (trans->nr_wb_updates) {
+		EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY);
+
+		ret = bch2_btree_insert_keys_write_buffer(trans);
+		if (ret)
+			goto revert_fs_usage;
+	}
+
+	h = trans->hooks;
+	while (h) {	/* run commit hooks in list order; the first failure aborts */
+		ret = h->fn(trans, h);
+		if (ret)
+			goto revert_fs_usage;
+		h = h->next;
+	}
+
+	trans_for_each_update(trans, i)
+		if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
+			ret = run_one_mem_trigger(trans, i, i->flags);
+			if (ret)
+				goto fatal_err;
+		}
+
+	if (unlikely(c->gc_pos.phase)) {
+		ret = bch2_trans_commit_run_gc_triggers(trans);
+		if  (ret)
+			goto fatal_err;
+	}
+
+	if (unlikely(trans->extra_journal_entries.nr)) {
+		memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
+				  trans->extra_journal_entries.data,
+				  trans->extra_journal_entries.nr);
+
+		trans->journal_res.offset	+= trans->extra_journal_entries.nr;	/* consume the copied u64s */
+		trans->journal_res.u64s		-= trans->extra_journal_entries.nr;
+	}
+
+	if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+		struct journal *j = &c->journal;
+		struct jset_entry *entry;
+
+		trans_for_each_update(trans, i) {
+			if (i->key_cache_already_flushed)
+				continue;
+
+			if (i->flags & BTREE_UPDATE_NOJOURNAL)
+				continue;
+
+			verify_update_old_key(trans, i);
+
+			if (trans->journal_transaction_names) {	/* also journal the key being overwritten */
+				entry = bch2_journal_add_entry(j, &trans->journal_res,
+						       BCH_JSET_ENTRY_overwrite,
+						       i->btree_id, i->level,
+						       i->old_k.u64s);
+				bkey_reassemble(&entry->start[0],
+						(struct bkey_s_c) { &i->old_k, i->old_v });
+			}
+
+			entry = bch2_journal_add_entry(j, &trans->journal_res,
+					       BCH_JSET_ENTRY_btree_keys,
+					       i->btree_id, i->level,
+					       i->k->k.u64s);
+			bkey_copy(&entry->start[0], i->k);
+		}
+
+		trans_for_each_wb_update(trans, wb) {
+			entry = bch2_journal_add_entry(j, &trans->journal_res,
+					       BCH_JSET_ENTRY_btree_keys,
+					       wb->btree, 0,
+					       wb->k.k.u64s);
+			bkey_copy(&entry->start[0], &wb->k);
+		}
+
+		if (trans->journal_seq)	/* optional out-pointer for the commit's journal sequence */
+			*trans->journal_seq = trans->journal_res.seq;
+	}
+
+	trans_for_each_update(trans, i) {
+		i->k->k.needs_whiteout = false;
+
+		if (!i->cached) {
+			u64 seq = trans->journal_res.seq;
+
+			if (i->flags & BTREE_UPDATE_PREJOURNAL)	/* use the sequence recorded on the update instead */
+				seq = i->seq;
+
+			bch2_btree_insert_key_leaf(trans, i->path, i->k, seq);
+		} else if (!i->key_cache_already_flushed)
+			bch2_btree_insert_key_cached(trans, flags, i);
+		else {
+			bch2_btree_key_cache_drop(trans, i->path);
+			btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE);
+		}
+	}
+
+	return 0;
+fatal_err:
+	bch2_fatal_error(c);	/* falls through to revert_fs_usage */
+revert_fs_usage:
+	if (trans->fs_usage_deltas)
+		bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas);
+	return ret;
+}
+
+static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
+{
+	while (--i >= trans->updates) {	/* pre-decrement: @i itself is never unlocked here */
+		if (same_leaf_as_prev(trans, i))
+			continue;	/* same skip rule as the locking loop in trans_lock_write() */
+
+		bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
+	}
+
+	trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
+	return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
+}
+
+static inline int trans_lock_write(struct btree_trans *trans)
+{
+	struct btree_insert_entry *i;
+
+	trans_for_each_update(trans, i) {
+		if (same_leaf_as_prev(trans, i))
+			continue;	/* presumably this leaf was handled with the previous entry - confirm */
+
+		if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
+			return trans_lock_write_fail(trans, i);	/* unlocks the entries before @i */
+
+		if (!i->cached)
+			bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
+	}
+
+	return 0;	/* no lock attempt reported failure */
+}
+
+static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
+{
+	struct btree_insert_entry *i;
+	struct btree_write_buffered_key *wb;
+
+	trans_for_each_update(trans, i)	/* regular updates: btree, level and position of each key */
+		bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
+
+	trans_for_each_wb_update(trans, wb)	/* write-buffered updates are reported at level 0 */
+		bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p);
+}
+
+static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags,
+						   struct btree_insert_entry *i,
+						   struct printbuf *err)
+{
+	struct bch_fs *c = trans->c;
+	int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE;	/* READ only for journal replay */
+
+	printbuf_reset(err);	/* start the report from an empty buffer */
+	prt_printf(err, "invalid bkey on insert from %s -> %ps",
+		   trans->fn, (void *) i->ip_allocated);
+	prt_newline(err);
+	printbuf_indent_add(err, 2);
+
+	bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k));
+	prt_newline(err);
+
+	bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
+			  i->bkey_type, rw, err);	/* presumably appends the reason to @err - confirm */
+	bch2_print_string_as_lines(KERN_ERR, err->buf);
+
+	bch2_inconsistent_error(c);
+	bch2_dump_trans_updates(trans);
+
+	return -EINVAL;	/* unconditional: the caller fails the commit with this */
+}
+
+/*
+ * Get journal reservation, take write locks, and attempt to do btree update(s):
+ */
+static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags,
+				       struct btree_insert_entry **stopped_at,
+				       unsigned long trace_ip)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_insert_entry *i;
+	int ret = 0, u64s_delta = 0;
+
+	trans_for_each_update(trans, i) {
+		if (i->cached)
+			continue;
+
+		u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;	/* size added by the new key */
+		u64s_delta -= i->old_btree_u64s;	/* size of the key it overwrites in the btree */
+
+		if (!same_leaf_as_next(trans, i)) {
+			if (u64s_delta <= 0) {	/* no net growth tallied: try a merge */
+				ret = bch2_foreground_maybe_merge(trans, i->path,
+							i->level, flags);
+				if (unlikely(ret))
+					return ret;
+			}
+
+			u64s_delta = 0;	/* restart the tally for the following entries */
+		}
+	}
+
+	ret = bch2_journal_preres_get(&c->journal,
+			&trans->journal_preres, trans->journal_preres_u64s,
+			(flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK);
+	if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked))	/* nonblocking attempt was refused */
+		ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip);
+	if (unlikely(ret))
+		return ret;
+
+	ret = trans_lock_write(trans);
+	if (unlikely(ret))
+		return ret;
+
+	ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip);
+
+	if (!ret && unlikely(trans->journal_replay_not_finished))
+		bch2_drop_overwrites_from_journal(trans);
+
+	trans_for_each_update(trans, i)	/* runs on failure too; same skip rule as trans_lock_write() */
+		if (!same_leaf_as_prev(trans, i))
+			bch2_btree_node_unlock_write_inlined(trans, i->path,
+							insert_l(i)->b);
+
+	if (!ret && trans->journal_pin)
+		bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
+				     trans->journal_pin, NULL);
+
+	/*
+	 * Drop journal reservation after dropping write locks, since dropping
+	 * the journal reservation may kick off a journal write:
+	 */
+	bch2_journal_res_put(&c->journal, &trans->journal_res);
+
+	if (unlikely(ret))
+		return ret;
+
+	bch2_trans_downgrade(trans);
+
+	return 0;
+}
+
+static int journal_reclaim_wait_done(struct bch_fs *c)
+{
+	int ret = bch2_journal_error(&c->journal) ?:	/* a nonzero journal error wins */
+		!bch2_btree_key_cache_must_wait(c);	/* else 1 once the key cache no longer must wait */
+
+	if (!ret)	/* 0: still have to wait */
+		journal_reclaim_kick(&c->journal);
+	return ret;	/* used as the wait_event_freezable() condition in bch2_trans_commit_error() */
+}
+
+static noinline
+int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
+			    struct btree_insert_entry *i,
+			    int ret, unsigned long trace_ip)
+{
+	struct bch_fs *c = trans->c;
+
+	switch (ret) {	/* each case replaces @ret with the outcome of its recovery step */
+	case -BCH_ERR_btree_insert_btree_node_full:
+		ret = bch2_btree_split_leaf(trans, i->path, flags);
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path);
+		break;
+	case -BCH_ERR_btree_insert_need_mark_replicas:
+		ret = drop_locks_do(trans,
+			bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas));
+		break;
+	case -BCH_ERR_journal_res_get_blocked:
+		/*
+		 * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
+		 * flag
+		 */
+		if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+		    (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) {
+			ret = -BCH_ERR_journal_reclaim_would_deadlock;
+			break;
+		}
+
+		ret = drop_locks_do(trans,
+			bch2_trans_journal_res_get(trans,
+					(flags & BCH_WATERMARK_MASK)|
+					JOURNAL_RES_GET_CHECK));
+		break;
+	case -BCH_ERR_btree_insert_need_journal_reclaim:
+		bch2_trans_unlock(trans);
+
+		trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip);
+
+		wait_event_freezable(c->journal.reclaim_wait,
+				     (ret = journal_reclaim_wait_done(c)));
+		if (ret < 0)	/* negative value handed back by journal_reclaim_wait_done() */
+			break;
+
+		ret = bch2_trans_relock(trans);
+		break;
+	case -BCH_ERR_btree_insert_need_flush_buffer: {
+		struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+		ret = 0;	/* result when the buffer is not above the 3/4 threshold */
+
+		if (wb->state.nr > wb->size * 3 / 4) {
+			bch2_trans_unlock(trans);
+			mutex_lock(&wb->flush_lock);
+
+			if (wb->state.nr > wb->size * 3 / 4) {	/* re-check now that flush_lock is held */
+				bch2_trans_begin(trans);
+				ret = __bch2_btree_write_buffer_flush(trans,	/* NOTE(review): flush_lock not released on this path here; presumably done by this call - confirm */
+						flags|BTREE_INSERT_NOCHECK_RW, true);
+				if (!ret) {
+					trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
+					ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
+				}
+			} else {
+				mutex_unlock(&wb->flush_lock);
+				ret = bch2_trans_relock(trans);
+			}
+		}
+		break;
+	}
+	default:
+		BUG_ON(ret >= 0);	/* anything not handled above must be a negative error code */
+		break;
+	}
+
+	BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);	/* restart errors and trans->restarted must agree */
+
+	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
+				!(flags & BTREE_INSERT_NOWAIT) &&
+				(flags & BTREE_INSERT_NOFAIL), c,
+		"%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
+
+	return ret;	/* 0 makes __bch2_trans_commit() retry the commit */
+}
+
+static noinline int
+bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	int ret;
+
+	if (likely(!(flags & BTREE_INSERT_LAZY_RW)) ||
+	    test_bit(BCH_FS_STARTED, &c->flags))
+		return -BCH_ERR_erofs_trans_commit;	/* LAZY_RW not requested, or BCH_FS_STARTED already set */
+
+	ret = drop_locks_do(trans, bch2_fs_read_write_early(c));	/* presumably switches the fs to read-write - confirm */
+	if (ret)
+		return ret;
+
+	bch2_write_ref_get(c, BCH_WRITE_REF_trans);	/* the ref the caller's bch2_write_ref_tryget() failed to get */
+	return 0;
+}
+
+/*
+ * This is for updates done in the early part of fsck - btree_gc - before we've
+ * gone RW. We only add the new key to the list of keys for journal replay to
+ * do.
+ */
+static noinline int
+do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_insert_entry *i;
+	int ret = 0;	/* stays 0 when there are no updates or all inserts succeed */
+
+	trans_for_each_update(trans, i) {
+		ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
+		if (ret)
+			break;	/* first failure stops the loop and is returned */
+	}
+
+	return ret;
+}
+
+int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_insert_entry *i = NULL;
+	struct btree_write_buffered_key *wb;
+	unsigned u64s;
+	int ret = 0;
+
+	if (!trans->nr_updates &&
+	    !trans->nr_wb_updates &&
+	    !trans->extra_journal_entries.nr)
+		goto out_reset;	/* nothing queued: return 0 after the reset */
+
+	if (flags & BTREE_INSERT_GC_LOCK_HELD)
+		lockdep_assert_held(&c->gc_lock);
+
+	ret = bch2_trans_commit_run_triggers(trans);
+	if (ret)
+		goto out_reset;
+
+	trans_for_each_update(trans, i) {
+		struct printbuf buf = PRINTBUF;
+		enum bkey_invalid_flags invalid_flags = 0;
+
+		if (!(flags & BTREE_INSERT_JOURNAL_REPLAY))
+			invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
+
+		if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
+					       i->bkey_type, invalid_flags, &buf)))
+			ret = bch2_trans_commit_bkey_invalid(trans, flags, i, &buf);
+		btree_insert_entry_checks(trans, i);
+		printbuf_exit(&buf);
+
+		if (ret)
+			return ret;	/* NOTE(review): unlike the other early exits, skips out_reset - confirm intended */
+	}
+
+	if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {	/* only queue the keys for journal replay */
+		ret = do_bch2_trans_commit_to_journal_replay(trans);
+		goto out_reset;
+	}
+
+	if (!(flags & BTREE_INSERT_NOCHECK_RW) &&
+	    unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
+		ret = bch2_trans_commit_get_rw_cold(trans, flags);	/* takes the write ref itself when it returns 0 */
+		if (ret)
+			goto out_reset;
+	}
+
+	if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 &&	/* write buffer more than half full */
+	    mutex_trylock(&c->btree_write_buffer.flush_lock)) {
+		bch2_trans_begin(trans);
+		bch2_trans_unlock(trans);
+
+		ret = __bch2_btree_write_buffer_flush(trans,	/* NOTE(review): flush_lock is not released in this function; presumably done by this call - confirm */
+					flags|BTREE_INSERT_NOCHECK_RW, true);
+		if (!ret) {
+			trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
+			ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
+		}
+		goto out;	/* this path never falls through to the commit below */
+	}
+
+	EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
+
+	memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
+
+	trans->journal_u64s		= trans->extra_journal_entries.nr;
+	trans->journal_preres_u64s	= 0;
+
+	trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
+
+	if (trans->journal_transaction_names)
+		trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
+
+	trans_for_each_update(trans, i) {
+		EBUG_ON(!i->path->should_be_locked);
+
+		ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1);
+		if (unlikely(ret))
+			goto out;
+
+		EBUG_ON(!btree_node_intent_locked(i->path, i->level));
+
+		if (i->key_cache_already_flushed)
+			continue;
+
+		/* we're going to journal the key being updated: */
+		u64s = jset_u64s(i->k->k.u64s);
+		if (i->cached &&
+		    likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY)))
+			trans->journal_preres_u64s += u64s;	/* pre-reservation is sized from cached entries only */
+
+		if (i->flags & BTREE_UPDATE_NOJOURNAL)
+			continue;
+
+		trans->journal_u64s += u64s;
+
+		/* and we're also going to log the overwrite: */
+		if (trans->journal_transaction_names)
+			trans->journal_u64s += jset_u64s(i->old_k.u64s);
+	}
+
+	trans_for_each_wb_update(trans, wb)
+		trans->journal_u64s += jset_u64s(wb->k.k.u64s);
+
+	if (trans->extra_journal_res) {
+		ret = bch2_disk_reservation_add(c, trans->disk_res,
+				trans->extra_journal_res,
+				(flags & BTREE_INSERT_NOFAIL)
+				? BCH_DISK_RESERVATION_NOFAIL : 0);
+		if (ret)
+			goto err;
+	}
+retry:
+	bch2_trans_verify_not_in_restart(trans);
+	memset(&trans->journal_res, 0, sizeof(trans->journal_res));	/* fresh journal reservation state per attempt */
+
+	ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_);
+
+	/* make sure we didn't drop or screw up locks: */
+	bch2_trans_verify_locks(trans);
+
+	if (ret)
+		goto err;
+
+	trace_and_count(c, transaction_commit, trans, _RET_IP_);
+out:
+	bch2_journal_preres_put(&c->journal, &trans->journal_preres);
+
+	if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
+		bch2_write_ref_put(c, BCH_WRITE_REF_trans);	/* pairs with the write ref obtained above */
+out_reset:
+	bch2_trans_reset_updates(trans);
+
+	return ret;
+err:
+	ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_);
+	if (ret)
+		goto out;	/* still an error (possibly a restart): stop */
+
+	goto retry;	/* error was dealt with: attempt the commit again */
+}
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
new file mode 100644
index 000000000000..c9a38e254949
--- /dev/null
+++ b/fs/bcachefs/btree_types.h
@@ -0,0 +1,739 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_TYPES_H
+#define _BCACHEFS_BTREE_TYPES_H
+
+#include <linux/list.h>
+#include <linux/rhashtable.h>
+
+//#include "bkey_methods.h"
+#include "buckets_types.h"
+#include "darray.h"
+#include "errcode.h"
+#include "journal_types.h"
+#include "replicas_types.h"
+#include "six.h"
+
+struct open_bucket;
+struct btree_update;
+struct btree_trans;
+
+#define MAX_BSETS		3U
+
+struct btree_nr_keys {
+
+	/*
+	 * Amount of live metadata (i.e. size of node after a compaction) in
+	 * units of u64s
+	 */
+	u16			live_u64s;
+	u16			bset_u64s[MAX_BSETS];
+
+	/* live keys only: */
+	u16			packed_keys;
+	u16			unpacked_keys;
+};
+
+struct bset_tree {
+	/*
+	 * We construct a binary tree in an array as if the array
+	 * started at 1, so that things line up on the same cachelines
+	 * better: see comments in bset.c at cacheline_to_bkey() for
+	 * details
+	 */
+
+	/* size of the binary tree and prev array */
+	u16			size;
+
+	/* function of size - precalculated for to_inorder() */
+	u16			extra;
+
+	u16			data_offset;
+	u16			aux_data_offset;
+	u16			end_offset;
+};
+
+struct btree_write {
+	struct journal_entry_pin	journal;
+};
+
+struct btree_alloc {
+	struct open_buckets	ob;
+	__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX);
+};
+
+struct btree_bkey_cached_common {
+	struct six_lock		lock;
+	u8			level;
+	u8			btree_id;
+	bool			cached;
+};
+
+struct btree {
+	struct btree_bkey_cached_common c;
+
+	struct rhash_head	hash;
+	u64			hash_val;
+
+	unsigned long		flags;
+	u16			written;
+	u8			nsets;
+	u8			nr_key_bits;
+	u16			version_ondisk;
+
+	struct bkey_format	format;
+
+	struct btree_node	*data;
+	void			*aux_data;
+
+	/*
+	 * Sets of sorted keys - the real btree node - plus a binary search tree
+	 *
+	 * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
+	 * to the memory we have allocated for this btree node. Additionally,
+	 * set[0]->data points to the entire btree node as it exists on disk.
+	 */
+	struct bset_tree	set[MAX_BSETS];
+
+	struct btree_nr_keys	nr;
+	u16			sib_u64s[2];
+	u16			whiteout_u64s;
+	u8			byte_order;
+	u8			unpack_fn_len;
+
+	struct btree_write	writes[2];
+
+	/* Key/pointer for this btree node */
+	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+
+	/*
+	 * XXX: add a delete sequence number, so when bch2_btree_node_relock()
+	 * fails because the lock sequence number has changed - i.e. the
+	 * contents were modified - we can still relock the node if it's still
+	 * the one we want, without redoing the traversal
+	 */
+
+	/*
+	 * For asynchronous splits/interior node updates:
+	 * When we do a split, we allocate new child nodes and update the parent
+	 * node to point to them: we update the parent in memory immediately,
+	 * but then we must wait until the children have been written out before
+	 * the update to the parent can be written - this is a list of the
+	 * btree_updates that are blocking this node from being
+	 * written:
+	 */
+	struct list_head	write_blocked;
+
+	/*
+	 * Also for asynchronous splits/interior node updates:
+	 * If a btree node isn't reachable yet, we don't want to kick off
+	 * another write - because that write also won't yet be reachable and
+	 * marking it as completed before it's reachable would be incorrect:
+	 */
+	unsigned long		will_make_reachable;
+
+	struct open_buckets	ob;
+
+	/* lru list */
+	struct list_head	list;
+};
+
+struct btree_cache {
+	struct rhashtable	table;
+	bool			table_init_done;
+	/*
+	 * We never free a struct btree, except on shutdown - we just put it on
+	 * the btree_cache_freed list and reuse it later. This simplifies the
+	 * code, and it doesn't cost us much memory as the memory usage is
+	 * dominated by buffers that hold the actual btree node data and those
+	 * can be freed - and the number of struct btrees allocated is
+	 * effectively bounded.
+	 *
+	 * btree_cache_freeable effectively is a small cache - we use it because
+	 * high order page allocations can be rather expensive, and it's quite
+	 * common to delete and allocate btree nodes in quick succession. It
+	 * should never grow past ~2-3 nodes in practice.
+	 */
+	struct mutex		lock;
+	struct list_head	live;
+	struct list_head	freeable;
+	struct list_head	freed_pcpu;
+	struct list_head	freed_nonpcpu;
+
+	/* Number of elements in live + freeable lists */
+	unsigned		used;
+	unsigned		reserve;
+	atomic_t		dirty;
+	struct shrinker		shrink;
+
+	/*
+	 * If we need to allocate memory for a new btree node and that
+	 * allocation fails, we can cannibalize another node in the btree cache
+	 * to satisfy the allocation - lock to guarantee only one thread does
+	 * this at a time:
+	 */
+	struct task_struct	*alloc_lock;
+	struct closure_waitlist	alloc_wait;
+};
+
+struct btree_node_iter {
+	struct btree_node_iter_set {
+		u16	k, end;
+	} data[MAX_BSETS];
+};
+
+/*
+ * Iterate over all possible positions, synthesizing deleted keys for holes:
+ */
+static const __maybe_unused u16 BTREE_ITER_SLOTS		= 1 << 0;
+static const __maybe_unused u16 BTREE_ITER_ALL_LEVELS		= 1 << 1;
+/*
+ * Indicates that intent locks should be taken on leaf nodes, because we expect
+ * to be doing updates:
+ */
+static const __maybe_unused u16 BTREE_ITER_INTENT		= 1 << 2;
+/*
+ * Causes the btree iterator code to prefetch additional btree nodes from disk:
+ */
+static const __maybe_unused u16 BTREE_ITER_PREFETCH		= 1 << 3;
+/*
+ * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
+ * @pos or the first key strictly greater than @pos
+ */
+static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS		= 1 << 4;
+static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS		= 1 << 5;
+static const __maybe_unused u16 BTREE_ITER_CACHED		= 1 << 6;
+static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE	= 1 << 7;
+static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES		= 1 << 8;
+static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL		= 1 << 9;
+static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS	= 1 << 10;
+static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS	= 1 << 11;
+static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS	= 1 << 12;
+static const __maybe_unused u16 BTREE_ITER_NOPRESERVE		= 1 << 13;
+static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL	= 1 << 14;
+static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL	= 1 << 15;
+#define __BTREE_ITER_FLAGS_END					       16
+
+enum btree_path_uptodate {
+	BTREE_ITER_UPTODATE		= 0,
+	BTREE_ITER_NEED_RELOCK		= 1,
+	BTREE_ITER_NEED_TRAVERSE	= 2,
+};
+
+#if defined(CONFIG_BCACHEFS_LOCK_TIME_STATS) || defined(CONFIG_BCACHEFS_DEBUG)
+#define TRACK_PATH_ALLOCATED
+#endif
+
+struct btree_path {
+	u8			idx;
+	u8			sorted_idx;
+	u8			ref;
+	u8			intent_ref;
+
+	/* btree_iter_copy starts here: */
+	struct bpos		pos;
+
+	enum btree_id		btree_id:5;
+	bool			cached:1;
+	bool			preserve:1;
+	enum btree_path_uptodate uptodate:2;
+	/*
+	 * When true, failing to relock this path will cause the transaction to
+	 * restart:
+	 */
+	bool			should_be_locked:1;
+	unsigned		level:3,
+				locks_want:3;
+	u8			nodes_locked;
+
+	struct btree_path_level {
+		struct btree	*b;
+		struct btree_node_iter iter;
+		u32		lock_seq;
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+		u64             lock_taken_time;
+#endif
+	}			l[BTREE_MAX_DEPTH];
+#ifdef TRACK_PATH_ALLOCATED
+	unsigned long		ip_allocated;
+#endif
+};
+
+static inline struct btree_path_level *path_l(struct btree_path *path)
+{
+	return &path->l[path->level];	/* the l[] entry selected by path->level */
+}
+
+static inline unsigned long btree_path_ip_allocated(struct btree_path *path)
+{
+#ifdef TRACK_PATH_ALLOCATED
+	return path->ip_allocated;
+#else
+	return _THIS_IP_;	/* ip_allocated is compiled out of struct btree_path in this configuration */
+#endif
+}
+
+/*
+ * @pos			- iterator's current position
+ * NOTE(review): not btree_iter members (cf. struct btree_path), likely stale:
+ * @level - current btree depth; @locks_want - btree level below which we start
+ * taking intent locks; @nodes_locked - bitmask indicating which nodes in @nodes
+ * are locked; @nodes_intent_locked - bitmask indicating which are intent locks
+ */
+struct btree_iter {
+	struct btree_trans	*trans;
+	struct btree_path	*path;
+	struct btree_path	*update_path;
+	struct btree_path	*key_cache_path;
+
+	enum btree_id		btree_id:8;
+	unsigned		min_depth:3;
+	unsigned		advanced:1;
+
+	/* btree_iter_copy starts here: */
+	u16			flags;
+
+	/* When we're filtering by snapshot, the snapshot ID we're looking for: */
+	unsigned		snapshot;
+
+	struct bpos		pos;
+	/*
+	 * Current unpacked key - so that bch2_btree_iter_next()/
+	 * bch2_btree_iter_next_slot() can correctly advance pos.
+	 */
+	struct bkey		k;
+
+	/* BTREE_ITER_WITH_JOURNAL: */
+	size_t			journal_idx;
+	struct bpos		journal_pos;
+#ifdef TRACK_PATH_ALLOCATED
+	unsigned long		ip_allocated;
+#endif
+};
+
+struct btree_key_cache_freelist {
+	struct bkey_cached	*objs[16];
+	unsigned		nr;
+};
+
+struct btree_key_cache {
+	struct mutex		lock;
+	struct rhashtable	table;
+	bool			table_init_done;
+	struct list_head	freed_pcpu;
+	struct list_head	freed_nonpcpu;
+	struct shrinker		shrink;
+	unsigned		shrink_iter;
+	struct btree_key_cache_freelist __percpu *pcpu_freed;
+
+	atomic_long_t		nr_freed;
+	atomic_long_t		nr_keys;
+	atomic_long_t		nr_dirty;
+};
+
+struct bkey_cached_key {
+	u32			btree_id;
+	struct bpos		pos;
+} __packed __aligned(4);
+
+#define BKEY_CACHED_ACCESSED		0
+#define BKEY_CACHED_DIRTY		1
+
+struct bkey_cached {
+	struct btree_bkey_cached_common c;
+
+	unsigned long		flags;
+	u16			u64s;
+	bool			valid;
+	u32			btree_trans_barrier_seq;
+	struct bkey_cached_key	key;
+
+	struct rhash_head	hash;
+	struct list_head	list;
+
+	struct journal_preres	res;
+	struct journal_entry_pin journal;
+	u64			seq;
+
+	struct bkey_i		*k;
+};
+
+static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
+{
+	return !b->cached
+		? container_of(b, struct btree, c)->key.k.p	/* @b is the ->c member of a struct btree */
+		: container_of(b, struct bkey_cached, c)->key.pos;	/* @b is the ->c member of a struct bkey_cached */
+}
+
+struct btree_insert_entry {
+	unsigned		flags;
+	u8			bkey_type;
+	enum btree_id		btree_id:8;
+	u8			level:4;
+	bool			cached:1;
+	bool			insert_trigger_run:1;
+	bool			overwrite_trigger_run:1;
+	bool			key_cache_already_flushed:1;
+	/*
+	 * @old_k may be a key from the journal; @old_btree_u64s always refers
+	 * to the size of the key being overwritten in the btree:
+	 */
+	u8			old_btree_u64s;
+	struct bkey_i		*k;
+	struct btree_path	*path;
+	u64			seq;
+	/* key being overwritten: */
+	struct bkey		old_k;
+	const struct bch_val	*old_v;
+	unsigned long		ip_allocated;
+};
+
+#ifndef CONFIG_LOCKDEP
+#define BTREE_ITER_MAX		64
+#else
+#define BTREE_ITER_MAX		32
+#endif
+
+struct btree_trans_commit_hook;
+typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
+
+struct btree_trans_commit_hook {
+	btree_trans_commit_hook_fn	*fn;
+	struct btree_trans_commit_hook	*next;
+};
+
+#define BTREE_TRANS_MEM_MAX	(1U << 16)
+
+#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS	10000
+
+struct btree_trans {
+	struct bch_fs		*c;
+	const char		*fn;
+	struct closure		ref;
+	struct list_head	list;
+	u64			last_begin_time;
+
+	u8			lock_may_not_fail;
+	u8			lock_must_abort;
+	struct btree_bkey_cached_common *locking;
+	struct six_lock_waiter	locking_wait;
+
+	int			srcu_idx;
+
+	u8			fn_idx;
+	u8			nr_sorted;
+	u8			nr_updates;
+	u8			nr_wb_updates;
+	u8			wb_updates_size;
+	bool			used_mempool:1;
+	bool			in_traverse_all:1;
+	bool			paths_sorted:1;
+	bool			memory_allocation_failure:1;
+	bool			journal_transaction_names:1;
+	bool			journal_replay_not_finished:1;
+	bool			notrace_relock_fail:1;
+	enum bch_errcode	restarted:16;
+	u32			restart_count;
+	unsigned long		last_begin_ip;
+	unsigned long		last_restarted_ip;
+	unsigned long		srcu_lock_time;
+
+	/*
+	 * For when bch2_trans_update notices we'll be splitting a compressed
+	 * extent:
+	 */
+	unsigned		extra_journal_res;
+	unsigned		nr_max_paths;
+
+	u64			paths_allocated;
+
+	unsigned		mem_top;
+	unsigned		mem_max;
+	unsigned		mem_bytes;
+	void			*mem;
+
+	u8			sorted[BTREE_ITER_MAX + 8];
+	struct btree_path	paths[BTREE_ITER_MAX];
+	struct btree_insert_entry updates[BTREE_ITER_MAX];
+	struct btree_write_buffered_key *wb_updates;
+
+	/* update path: */
+	struct btree_trans_commit_hook *hooks;
+	darray_u64		extra_journal_entries;
+	struct journal_entry_pin *journal_pin;
+
+	struct journal_res	journal_res;
+	struct journal_preres	journal_preres;
+	u64			*journal_seq;
+	struct disk_reservation *disk_res;
+	unsigned		journal_u64s;
+	unsigned		journal_preres_u64s;
+	struct replicas_delta_list *fs_usage_deltas;
+};
+
+#define BCH_BTREE_WRITE_TYPES()						\
+	x(initial,		0)					\
+	x(init_next_bset,	1)					\
+	x(cache_reclaim,	2)					\
+	x(journal_reclaim,	3)					\
+	x(interior,		4)
+
+enum btree_write_type {
+#define x(t, n) BTREE_WRITE_##t,
+	BCH_BTREE_WRITE_TYPES()
+#undef x
+	BTREE_WRITE_TYPE_NR,
+};
+
+#define BTREE_WRITE_TYPE_MASK	(roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1)
+#define BTREE_WRITE_TYPE_BITS	ilog2(roundup_pow_of_two(BTREE_WRITE_TYPE_NR))
+
+#define BTREE_FLAGS()							\
+	x(read_in_flight)						\
+	x(read_error)							\
+	x(dirty)							\
+	x(need_write)							\
+	x(write_blocked)						\
+	x(will_make_reachable)						\
+	x(noevict)							\
+	x(write_idx)							\
+	x(accessed)							\
+	x(write_in_flight)						\
+	x(write_in_flight_inner)					\
+	x(just_written)							\
+	x(dying)							\
+	x(fake)								\
+	x(need_rewrite)							\
+	x(never_write)
+
+enum btree_flags {
+	/* First bits for btree node write type */
+	BTREE_NODE_FLAGS_START = BTREE_WRITE_TYPE_BITS - 1,	/* so the first flag below equals BTREE_WRITE_TYPE_BITS */
+#define x(flag)	BTREE_NODE_##flag,
+	BTREE_FLAGS()
+#undef x
+};
+
+#define x(flag)								\
+static inline bool btree_node_ ## flag(struct btree *b)			\
+{	return test_bit(BTREE_NODE_ ## flag, &b->flags); }		\
+									\
+static inline void set_btree_node_ ## flag(struct btree *b)		\
+{	set_bit(BTREE_NODE_ ## flag, &b->flags); }			\
+									\
+static inline void clear_btree_node_ ## flag(struct btree *b)		\
+{	clear_bit(BTREE_NODE_ ## flag, &b->flags); }
+
+BTREE_FLAGS()
+#undef x
+
+static inline struct btree_write *btree_current_write(struct btree *b)
+{
+	return &b->writes[btree_node_write_idx(b)];	/* slot selected by the write_idx flag */
+}
+
+static inline struct btree_write *btree_prev_write(struct btree *b)
+{
+	return &b->writes[btree_node_write_idx(b) ^ 1];	/* the slot btree_current_write() does not pick */
+}
+
+static inline struct bset_tree *bset_tree_last(struct btree *b)
+{
+	EBUG_ON(!b->nsets);	/* index below would be -1 with no sets */
+	return &b->set[b->nsets - 1];
+}
+
+static inline void *
+__btree_node_offset_to_ptr(const struct btree *b, u16 offset)
+{
+	return (void *) ((u64 *) b->data + 1 + offset);	/* @offset counts u64s, starting one u64 past b->data */
+}
+
+static inline u16
+__btree_node_ptr_to_offset(const struct btree *b, const void *p)
+{
+	u16 ret = (u64 *) p - 1 - (u64 *) b->data;	/* inverse of __btree_node_offset_to_ptr(), narrowed to u16 */
+
+	EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p);	/* fires when the offset does not round-trip */
+	return ret;
+}
+
+static inline struct bset *bset(const struct btree *b,
+				const struct bset_tree *t)
+{
+	return __btree_node_offset_to_ptr(b, t->data_offset);
+}
+
+static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
+{
+	t->end_offset =
+		__btree_node_ptr_to_offset(b, vstruct_last(bset(b, t)));
+}
+
+static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
+				  const struct bset *i)
+{
+	t->data_offset = __btree_node_ptr_to_offset(b, i);
+	set_btree_bset_end(b, t);
+}
+
+static inline struct bset *btree_bset_first(struct btree *b)
+{
+	return bset(b, b->set);
+}
+
+static inline struct bset *btree_bset_last(struct btree *b)
+{
+	return bset(b, bset_tree_last(b));
+}
+
+static inline u16
+__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
+{
+	return __btree_node_ptr_to_offset(b, k);
+}
+
+static inline struct bkey_packed *
+__btree_node_offset_to_key(const struct btree *b, u16 k)
+{
+	return __btree_node_offset_to_ptr(b, k);
+}
+
+static inline unsigned btree_bkey_first_offset(const struct bset_tree *t)
+{
+	return t->data_offset + offsetof(struct bset, _data) / sizeof(u64);
+}
+
+#define btree_bkey_first(_b, _t)					\
+({									\
+	EBUG_ON(bset(_b, _t)->start !=					\
+		__btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\
+									\
+	bset(_b, _t)->start;						\
+})
+
+#define btree_bkey_last(_b, _t)						\
+({									\
+	EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) !=	\
+		vstruct_last(bset(_b, _t)));				\
+									\
+	__btree_node_offset_to_key(_b, (_t)->end_offset);		\
+})
+
+static inline unsigned bset_u64s(struct bset_tree *t)
+{
+	return t->end_offset - t->data_offset -	/* span of the set, in u64 offsets */
+		sizeof(struct bset) / sizeof(u64);	/* minus sizeof(struct bset) expressed in u64s */
+}
+
+static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t)
+{
+	return bset_u64s(t) - b->nr.bset_u64s[t - b->set];	/* t - b->set is @t's index within b->set[] */
+}
+
+static inline unsigned bset_byte_offset(struct btree *b, void *i)
+{
+	return i - (void *) b->data;	/* distance of @i from b->data, via void * arithmetic */
+}
+
+enum btree_node_type {
+#define x(kwd, val, ...) BKEY_TYPE_##kwd = val,
+	BCH_BTREE_IDS()
+#undef x
+	BKEY_TYPE_btree,
+};
+
+/* Type of a key in btree @id at level @level: */
+static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id)
+{
+	return level ? BKEY_TYPE_btree : (enum btree_node_type) id;
+}
+
+/* Type of keys @b contains: */
+static inline enum btree_node_type btree_node_type(struct btree *b)
+{
+	return __btree_node_type(b->c.level, b->c.btree_id);
+}
+
+#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS		\
+	(BIT(BKEY_TYPE_extents)|			\
+	 BIT(BKEY_TYPE_alloc)|				\
+	 BIT(BKEY_TYPE_inodes)|				\
+	 BIT(BKEY_TYPE_stripes)|			\
+	 BIT(BKEY_TYPE_reflink)|			\
+	 BIT(BKEY_TYPE_btree))
+
+#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS		\
+	(BIT(BKEY_TYPE_alloc)|				\
+	 BIT(BKEY_TYPE_inodes)|				\
+	 BIT(BKEY_TYPE_stripes)|			\
+	 BIT(BKEY_TYPE_snapshots))
+
+#define BTREE_NODE_TYPE_HAS_TRIGGERS			\
+	(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS|		\
+	 BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
+
+static inline bool btree_node_type_needs_gc(enum btree_node_type type)
+{
+	return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type);	/* mask is the union of the TRANS and MEM trigger masks */
+}
+
+static inline bool btree_node_type_is_extents(enum btree_node_type type)
+{
+	const unsigned mask = 0	/* bit nr is set for every BCH_BTREE_IDS() entry whose flags include BTREE_ID_EXTENTS */
+#define x(name, nr, flags, ...)	|((!!((flags) & BTREE_ID_EXTENTS)) << nr)
+	BCH_BTREE_IDS()
+#undef x
+	;
+
+	return (1U << type) & mask;	/* true iff @type's bit is in the mask */
+}
+
+static inline bool btree_id_is_extents(enum btree_id btree)
+{
+	return btree_node_type_is_extents((enum btree_node_type) btree);
+}
+
+static inline bool btree_type_has_snapshots(enum btree_id id)
+{
+	const unsigned mask = 0	/* bit nr is set for every BCH_BTREE_IDS() entry whose flags include BTREE_ID_SNAPSHOTS */
+#define x(name, nr, flags, ...)	|((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr)
+	BCH_BTREE_IDS()
+#undef x
+	;
+
+	return (1U << id) & mask;	/* true iff @id's bit is in the mask */
+}
+
+static inline bool btree_type_has_ptrs(enum btree_id id)
+{
+	const unsigned mask = 0	/* bit nr is set for every BCH_BTREE_IDS() entry whose flags include BTREE_ID_DATA */
+#define x(name, nr, flags, ...)	|((!!((flags) & BTREE_ID_DATA)) << nr)
+	BCH_BTREE_IDS()
+#undef x
+	;
+
+	return (1U << id) & mask;	/* true iff @id's bit is in the mask */
+}
+
+struct btree_root {
+	struct btree		*b;
+
+	/* On disk root - see async splits: */
+	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+	u8			level;
+	u8			alive;
+	s8			error;
+};
+
+enum btree_gc_coalesce_fail_reason {
+	BTREE_GC_COALESCE_FAIL_RESERVE_GET,
+	BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
+	BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
+};
+
+enum btree_node_sibling {
+	btree_prev_sib,
+	btree_next_sib,
+};
+
+#endif /* _BCACHEFS_BTREE_TYPES_H */
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
new file mode 100644
index 000000000000..324767c0ddcc
--- /dev/null
+++ b/fs/bcachefs/btree_update.c
@@ -0,0 +1,933 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "btree_iter.h"
+#include "btree_journal_iter.h"
+#include "btree_locking.h"
+#include "buckets.h"
+#include "debug.h"
+#include "errcode.h"
+#include "error.h"
+#include "extents.h"
+#include "keylist.h"
+#include "snapshot.h"
+#include "trace.h"
+
+/*
+ * Three-way comparison of two insert entries: by btree id, then by the
+ * cached flag, then by level (descending), then by key position.
+ */
+static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
+					 const struct btree_insert_entry *r)
+{
+	int order;
+
+	order = cmp_int(l->btree_id, r->btree_id);
+	if (order)
+		return order;
+
+	order = cmp_int(l->cached, r->cached);
+	if (order)
+		return order;
+
+	/* Negated, so that higher levels sort first */
+	order = -cmp_int(l->level, r->level);
+	if (order)
+		return order;
+
+	return bpos_cmp(l->k->k.p, r->k->k.p);
+}
+
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
+			  struct bkey_i *, enum btree_update_flags,
+			  unsigned long ip);
+
+/*
+ * Try to merge the existing key @k with the key being inserted.
+ *
+ * A mutable copy of @k is merged with *@insert via bch2_bkey_merge(). If the
+ * merge is refused, or bch2_key_has_snapshot_overwrites() reports overwrites
+ * at either key's position, nothing is changed. Otherwise @k is deleted at
+ * @iter and *@insert is redirected to the merged copy.
+ *
+ * Returns 0 (whether or not a merge happened) or a negative error.
+ */
+static noinline int extent_front_merge(struct btree_trans *trans,
+				       struct btree_iter *iter,
+				       struct bkey_s_c k,
+				       struct bkey_i **insert,
+				       enum btree_update_flags flags)
+{
+	struct bkey_i *merged = bch2_bkey_make_mut_noupdate(trans, k);
+	int ret;
+
+	if (IS_ERR(merged))
+		return PTR_ERR(merged);
+
+	if (!bch2_bkey_merge(trans->c, bkey_i_to_s(merged), bkey_i_to_s_c(*insert)))
+		return 0;
+
+	ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p);
+	if (!ret)
+		ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id,
+						       (*insert)->k.p);
+	/* Negative: error. Positive: overwrites exist, so leave both keys alone. */
+	if (ret)
+		return ret < 0 ? ret : 0;
+
+	ret = bch2_btree_delete_at(trans, iter, flags);
+	if (!ret)
+		*insert = merged;
+
+	return ret;
+}
+
+/*
+ * Try to merge the existing key @k into @insert in place.
+ *
+ * The merge is skipped when bch2_key_has_snapshot_overwrites() reports
+ * overwrites at either key's position. The result of bch2_bkey_merge() itself
+ * is not checked.
+ *
+ * Returns 0, or a negative error from the snapshot-overwrite checks.
+ */
+static noinline int extent_back_merge(struct btree_trans *trans,
+				      struct btree_iter *iter,
+				      struct bkey_i *insert,
+				      struct bkey_s_c k)
+{
+	int ret;
+
+	ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p);
+	if (!ret)
+		ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p);
+
+	if (ret < 0)
+		return ret;
+
+	if (!ret)
+		bch2_bkey_merge(trans->c, bkey_i_to_s(insert), k);
+
+	return 0;
+}
+
+/*
+ * When deleting, decide whether a whiteout has to be emitted (because we're
+ * overwriting something in an ancestor snapshot).
+ *
+ * Returns 0 straight away when bch2_snapshot_parent() reports no parent for
+ * @pos.snapshot. Otherwise keys are scanned from @pos with the snapshot field
+ * bumped by one, across all snapshots, for as long as bkey_eq() still matches
+ * @pos; the first key accepted by bch2_snapshot_is_ancestor() decides.
+ *
+ * Returns 1 if that key is not itself a whiteout, 0 if no whiteout is needed,
+ * or a negative error from the iteration.
+ */
+static int need_whiteout_for_snapshot(struct btree_trans *trans,
+				      enum btree_id btree_id, struct bpos pos)
+{
+	const u32 start_snapshot = pos.snapshot;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	if (!bch2_snapshot_parent(trans->c, start_snapshot))
+		return 0;
+
+	/* Begin the scan just past our own snapshot id */
+	pos.snapshot = start_snapshot + 1;
+
+	for_each_btree_key_norestart(trans, iter, btree_id, pos,
+			   BTREE_ITER_ALL_SNAPSHOTS|
+			   BTREE_ITER_NOPRESERVE, k, ret) {
+		if (!bkey_eq(k.k->p, pos))
+			break;
+
+		if (bch2_snapshot_is_ancestor(trans->c, start_snapshot,
+					      k.k->p.snapshot)) {
+			ret = bkey_whiteout(k.k) ? 0 : 1;
+			break;
+		}
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
+	return ret;
+}
+
+/*
+ * Slow path of bch2_insert_snapshot_whiteouts().
+ *
+ * Returns 0 immediately if @old_pos.snapshot has no children. Otherwise walks
+ * backwards from @old_pos over the keys that still compare equal to it (all
+ * snapshots). A key is skipped when bch2_snapshot_is_ancestor() rejects its
+ * snapshot relative to @old_pos.snapshot, or when the list of snapshots
+ * already handled contains an ancestor of it. For the remaining keys, the
+ * slot at @new_pos in the key's snapshot is looked up and, if it is empty
+ * (KEY_TYPE_deleted), a KEY_TYPE_whiteout is queued there. The key's snapshot
+ * is then recorded as handled.
+ *
+ * Returns 0 on success or a negative error; the first failure ends the walk.
+ */
+int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
+				   enum btree_id id,
+				   struct bpos old_pos,
+				   struct bpos new_pos)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter old_iter, new_iter = { NULL };
+	struct bkey_s_c old_k, new_k;
+	snapshot_id_list s;
+	struct bkey_i *update;
+	int ret = 0;
+
+	if (!bch2_snapshot_has_children(c, old_pos.snapshot))
+		return 0;
+
+	darray_init(&s);
+
+	bch2_trans_iter_init(trans, &old_iter, id, old_pos,
+			     BTREE_ITER_NOT_EXTENTS|
+			     BTREE_ITER_ALL_SNAPSHOTS);
+	while ((old_k = bch2_btree_iter_prev(&old_iter)).k &&
+	       !(ret = bkey_err(old_k)) &&
+	       bkey_eq(old_pos, old_k.k->p)) {
+		struct bpos whiteout_pos =
+			SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
+
+		if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
+		    snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
+			continue;
+
+		new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos,
+					   BTREE_ITER_NOT_EXTENTS|
+					   BTREE_ITER_INTENT);
+		ret = bkey_err(new_k);
+		if (ret)
+			break;
+
+		if (new_k.k->type == KEY_TYPE_deleted) {
+			update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+			ret = PTR_ERR_OR_ZERO(update);
+			if (ret)
+				break;
+
+			bkey_init(&update->k);
+			update->k.p		= whiteout_pos;
+			update->k.type		= KEY_TYPE_whiteout;
+
+			ret = bch2_trans_update(trans, &new_iter, update,
+						BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+		}
+		bch2_trans_iter_exit(trans, &new_iter);
+
+		/*
+		 * Check the update result before ret is reused below, so a
+		 * failed whiteout insert is not silently dropped:
+		 */
+		if (ret)
+			break;
+
+		ret = snapshot_list_add(c, &s, old_k.k->p.snapshot);
+		if (ret)
+			break;
+	}
+	/* Covers the early-break paths that skip the exit inside the loop */
+	bch2_trans_iter_exit(trans, &new_iter);
+	bch2_trans_iter_exit(trans, &old_iter);
+	darray_exit(&s);
+
+	return ret;
+}
+
+/*
+ * Queue the updates needed for extent @new to overwrite (part of) the
+ * existing extent @old, which @iter points at.
+ *
+ * Depending on how the two extents overlap, up to four updates are queued:
+ *  - front split: @old starts before @new, so the leading part of @old is
+ *    re-inserted, cut back to the start of @new;
+ *  - middle split: the snapshots differ and @old is being split, so the part
+ *    of @old covered by @new is re-inserted;
+ *  - @old ends at or before the end of @new: a deleted key or whiteout is
+ *    inserted at @old's position in @new's snapshot;
+ *  - back split: @old extends past @new, so the trailing part of @old is
+ *    queued through @iter's path.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
+				       struct btree_iter *iter,
+				       enum btree_update_flags flags,
+				       struct bkey_s_c old,
+				       struct bkey_s_c new)
+{
+	enum btree_id btree_id = iter->btree_id;
+	struct bkey_i *update;
+	struct bpos new_start = bkey_start_pos(new.k);
+	bool front_split = bkey_lt(bkey_start_pos(old.k), new_start);
+	bool back_split  = bkey_gt(old.k->p, new.k->p);
+	int ret = 0, compressed_sectors;
+
+	/*
+	 * If we're going to be splitting a compressed extent, note it
+	 * so that __bch2_trans_commit() can increase our disk
+	 * reservation:
+	 */
+	if (((front_split && back_split) ||
+	     ((front_split || back_split) && old.k->p.snapshot != new.k->p.snapshot)) &&
+	    (compressed_sectors = bch2_bkey_sectors_compressed(old)))
+		trans->extra_journal_res += compressed_sectors;
+
+	if (front_split) {
+		/* Keep the part of @old that lies in front of @new */
+		update = bch2_bkey_make_mut_noupdate(trans, old);
+		if ((ret = PTR_ERR_OR_ZERO(update)))
+			return ret;
+
+		bch2_cut_back(new_start, update);
+
+		ret =   bch2_insert_snapshot_whiteouts(trans, btree_id,
+					old.k->p, update->k.p) ?:
+			bch2_btree_insert_nonextent(trans, btree_id, update,
+					BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+		if (ret)
+			return ret;
+	}
+
+	/* If we're overwriting in a different snapshot - middle split: */
+	if (old.k->p.snapshot != new.k->p.snapshot &&
+	    (front_split || back_split)) {
+		update = bch2_bkey_make_mut_noupdate(trans, old);
+		if ((ret = PTR_ERR_OR_ZERO(update)))
+			return ret;
+
+		/* Trim the copy down to exactly the range covered by @new */
+		bch2_cut_front(new_start, update);
+		bch2_cut_back(new.k->p, update);
+
+		ret =   bch2_insert_snapshot_whiteouts(trans, btree_id,
+					old.k->p, update->k.p) ?:
+			bch2_btree_insert_nonextent(trans, btree_id, update,
+					  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+		if (ret)
+			return ret;
+	}
+
+	if (bkey_le(old.k->p, new.k->p)) {
+		/*
+		 * @old ends inside @new: replace it at its own position, in
+		 * @new's snapshot, with a deleted key or a whiteout.
+		 */
+		update = bch2_trans_kmalloc(trans, sizeof(*update));
+		if ((ret = PTR_ERR_OR_ZERO(update)))
+			return ret;
+
+		bkey_init(&update->k);
+		update->k.p = old.k->p;
+		update->k.p.snapshot = new.k->p.snapshot;
+
+		if (new.k->p.snapshot != old.k->p.snapshot) {
+			update->k.type = KEY_TYPE_whiteout;
+		} else if (btree_type_has_snapshots(btree_id)) {
+			ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p);
+			if (ret < 0)
+				return ret;
+			if (ret)
+				update->k.type = KEY_TYPE_whiteout;
+		}
+
+		ret = bch2_btree_insert_nonextent(trans, btree_id, update,
+					  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+		if (ret)
+			return ret;
+	}
+
+	if (back_split) {
+		/* Keep the part of @old that lies behind @new */
+		update = bch2_bkey_make_mut_noupdate(trans, old);
+		if ((ret = PTR_ERR_OR_ZERO(update)))
+			return ret;
+
+		bch2_cut_front(new.k->p, update);
+
+		ret = bch2_trans_update_by_path(trans, iter->path, update,
+					  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+					  flags, _RET_IP_);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Queue the insertion of extent @insert into the btree of @orig_iter.
+ *
+ * A private iterator walks the existing keys from the start of @insert up to
+ * the end of its inode. A key ending exactly where @insert starts is offered
+ * to extent_front_merge(); every key that @insert overlaps goes through
+ * bch2_trans_update_extent_overwrite(); the first key past @insert is offered
+ * to extent_back_merge(). Finally @insert itself is queued, unless it is a
+ * deleted key.
+ *
+ * Returns 0 on success or a negative error.
+ */
+static int bch2_trans_update_extent(struct btree_trans *trans,
+				    struct btree_iter *orig_iter,
+				    struct bkey_i *insert,
+				    enum btree_update_flags flags)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	enum btree_id btree_id = orig_iter->btree_id;
+	int ret = 0;
+
+	bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k),
+			     BTREE_ITER_INTENT|
+			     BTREE_ITER_WITH_UPDATES|
+			     BTREE_ITER_NOT_EXTENTS);
+	k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
+	if ((ret = bkey_err(k)))
+		goto err;
+	if (!k.k)
+		goto out;
+
+	/* First key ends exactly where @insert begins: front merge candidate */
+	if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) {
+		if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
+			ret = extent_front_merge(trans, &iter, k, &insert, flags);
+			if (ret)
+				goto err;
+		}
+
+		/* Jumps into the loop body below to step past this key */
+		goto next;
+	}
+
+	while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) {
+		/* @k extends past the end of @insert, so it is the last overlap */
+		bool done = bkey_lt(insert->k.p, k.k->p);
+
+		ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
+		if (ret)
+			goto err;
+
+		if (done)
+			goto out;
+next:
+		bch2_btree_iter_advance(&iter);
+		k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
+		if ((ret = bkey_err(k)))
+			goto err;
+		if (!k.k)
+			goto out;
+	}
+
+	/* @k now starts at or after the end of @insert: back merge candidate */
+	if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
+		ret = extent_back_merge(trans, &iter, insert, k);
+		if (ret)
+			goto err;
+	}
+out:
+	if (!bkey_deleted(&insert->k))
+		ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags);
+err:
+	bch2_trans_iter_exit(trans, &iter);
+
+	return ret;
+}
+
+/*
+ * Mirror a freshly queued key cache update @i into the btree proper.
+ *
+ * A second path is taken at the same btree and position as @path and
+ * traversed. If the slot found there is not deleted, nothing more is done:
+ * the old key in the insert entry might actually refer to an existing key in
+ * the btree that has been deleted from cache and not yet flushed, and we must
+ * not run triggers against a stale key. Otherwise @i is marked as already
+ * flushed with triggers disabled, and the same key is queued again through
+ * the new path.
+ *
+ * Returns 0 on success or a negative error.
+ */
+static noinline int flush_new_cached_update(struct btree_trans *trans,
+					    struct btree_path *path,
+					    struct btree_insert_entry *i,
+					    enum btree_update_flags flags,
+					    unsigned long ip)
+{
+	struct btree_path *flush_path;
+	struct bkey slot;
+	int ret;
+
+	flush_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+				   BTREE_ITER_INTENT, _THIS_IP_);
+
+	ret = bch2_btree_path_traverse(trans, flush_path, 0);
+	if (!ret) {
+		bch2_btree_path_peek_slot_exact(flush_path, &slot);
+
+		if (bkey_deleted(&slot)) {
+			i->key_cache_already_flushed = true;
+			i->flags |= BTREE_TRIGGER_NORUN;
+
+			btree_path_set_should_be_locked(flush_path);
+			ret = bch2_trans_update_by_path(trans, flush_path, i->k,
+							flags, ip);
+		}
+	}
+
+	bch2_path_put(trans, flush_path, true);
+	return ret;
+}
+
+/*
+ * Queue key @k as a pending update of @trans at @path.
+ *
+ * @path must be marked should_be_locked and positioned at @k's position.
+ * Pending updates are kept sorted by btree_insert_entry_cmp(). An existing
+ * entry that compares equal is overwritten in place, which drops its path
+ * reference; otherwise a new entry is inserted and the key currently in the
+ * slot is recorded as its old key. While journal replay has not finished, a
+ * key found in the journal keys takes precedence as the old key.
+ *
+ * With BTREE_UPDATE_PREJOURNAL set, the sequence number is taken from
+ * trans->journal_res.seq.
+ *
+ * Returns 0, or the result of flush_new_cached_update() when @path is a key
+ * cache path and the old key is deleted.
+ */
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
+			  struct bkey_i *k, enum btree_update_flags flags,
+			  unsigned long ip)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_insert_entry *i, n;
+	u64 seq = 0;
+	/*
+	 * Initialised because the search loop below does not execute at all
+	 * when there are no pending updates yet:
+	 */
+	int cmp = 1;
+
+	EBUG_ON(!path->should_be_locked);
+	EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
+	EBUG_ON(!bpos_eq(k->k.p, path->pos));
+
+	/*
+	 * The transaction journal res hasn't been allocated at this point.
+	 * That occurs at commit time. Reuse the seq field to pass in the seq
+	 * of a prejournaled key.
+	 */
+	if (flags & BTREE_UPDATE_PREJOURNAL)
+		seq = trans->journal_res.seq;
+
+	n = (struct btree_insert_entry) {
+		.flags		= flags,
+		.bkey_type	= __btree_node_type(path->level, path->btree_id),
+		.btree_id	= path->btree_id,
+		.level		= path->level,
+		.cached		= path->cached,
+		.path		= path,
+		.k		= k,
+		.seq		= seq,
+		.ip_allocated	= ip,
+	};
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+	/* Verify that the existing updates are strictly sorted */
+	trans_for_each_update(trans, i)
+		BUG_ON(i != trans->updates &&
+		       btree_insert_entry_cmp(i - 1, i) >= 0);
+#endif
+
+	/*
+	 * Pending updates are kept sorted: first, find position of new update,
+	 * then delete/trim any updates the new update overwrites:
+	 */
+	trans_for_each_update(trans, i) {
+		cmp = btree_insert_entry_cmp(&n, i);
+		if (cmp <= 0)
+			break;
+	}
+
+	/* Bounds first: cmp is only meaningful for an entry inside the array */
+	if (i < trans->updates + trans->nr_updates && !cmp) {
+		EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
+
+		bch2_path_put(trans, i->path, true);
+		i->flags	= n.flags;
+		i->cached	= n.cached;
+		i->k		= n.k;
+		i->path		= n.path;
+		i->seq		= n.seq;
+		i->ip_allocated	= n.ip_allocated;
+	} else {
+		array_insert_item(trans->updates, trans->nr_updates,
+				  i - trans->updates, n);
+
+		i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v;
+		i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0;
+
+		if (unlikely(trans->journal_replay_not_finished)) {
+			struct bkey_i *j_k =
+				bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p);
+
+			if (j_k) {
+				i->old_k = j_k->k;
+				i->old_v = &j_k->v;
+			}
+		}
+	}
+
+	__btree_path_get(i->path, true);
+
+	/*
+	 * If a key is present in the key cache, it must also exist in the
+	 * btree - this is necessary for cache coherency. When iterating over
+	 * a btree that's cached in the key cache, the btree iter code checks
+	 * the key cache - but the key has to exist in the btree for that to
+	 * work:
+	 */
+	if (path->cached && bkey_deleted(&i->old_k))
+		return flush_new_cached_update(trans, path, i, flags, ip);
+
+	return 0;
+}
+
+/*
+ * Make sure @iter has a usable key cache path at @path's position.
+ *
+ * Nothing is done when iter->key_cache_path already exists, is marked
+ * should_be_locked and sits at iter->pos. Otherwise the path is created if
+ * missing, repositioned and traversed. If the cached entry turns out to be
+ * dirty the transaction is restarted with
+ * BCH_ERR_transaction_restart_key_cache_raced.
+ *
+ * Returns 0 on success, or a negative error / restart code.
+ */
+static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
+						    struct btree_iter *iter,
+						    struct btree_path *path)
+{
+	struct bkey_cached *ck;
+	int ret;
+
+	if (iter->key_cache_path &&
+	    iter->key_cache_path->should_be_locked &&
+	    bpos_eq(iter->key_cache_path->pos, iter->pos))
+		return 0;
+
+	if (!iter->key_cache_path)
+		iter->key_cache_path =
+			bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+				      BTREE_ITER_INTENT|
+				      BTREE_ITER_CACHED, _THIS_IP_);
+
+	iter->key_cache_path =
+		bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
+					iter->flags & BTREE_ITER_INTENT,
+					_THIS_IP_);
+
+	ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
+				       BTREE_ITER_CACHED);
+	if (unlikely(ret))
+		return ret;
+
+	ck = (void *) iter->key_cache_path->l[0].b;
+
+	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+		trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
+		return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
+	}
+
+	btree_path_set_should_be_locked(iter->key_cache_path);
+	return 0;
+}
+
+/*
+ * Queue key @k as an update at @iter's position.
+ *
+ * Extent iterators are handed to bch2_trans_update_extent(). Otherwise:
+ *  - a deleted key on a snapshot-filtering iterator is turned into a
+ *    whiteout when need_whiteout_for_snapshot() says so;
+ *  - for a leaf-level, non-cached path on a btree that uses the key cache,
+ *    the update is redirected to the iterator's key cache path.
+ * Both steps are skipped when BTREE_UPDATE_KEY_CACHE_RECLAIM is set.
+ *
+ * Returns 0 on success or a negative error / transaction restart.
+ */
+int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
+				   struct bkey_i *k, enum btree_update_flags flags)
+{
+	struct btree_path *path = iter->update_path ?: iter->path;
+	const bool reclaim = flags & BTREE_UPDATE_KEY_CACHE_RECLAIM;
+	int ret;
+
+	if (iter->flags & BTREE_ITER_IS_EXTENTS)
+		return bch2_trans_update_extent(trans, iter, k, flags);
+
+	if (!reclaim &&
+	    bkey_deleted(&k->k) &&
+	    (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
+		ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
+		if (unlikely(ret < 0))
+			return ret;
+
+		if (ret > 0)
+			k->k.type = KEY_TYPE_whiteout;
+	}
+
+	/*
+	 * Ensure that updates to cached btrees go to the key cache:
+	 */
+	if (!reclaim &&
+	    !path->cached &&
+	    !path->level &&
+	    btree_id_cached(trans->c, path->btree_id)) {
+		ret = bch2_trans_update_get_key_cache(trans, iter, path);
+		if (ret)
+			return ret;
+
+		path = iter->key_cache_path;
+	}
+
+	return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_);
+}
+
+/*
+ * Add a transaction update for a key that has already been journaled.
+ *
+ * @seq is stored in trans->journal_res.seq and the update is queued with
+ * BTREE_UPDATE_NOJOURNAL and BTREE_UPDATE_PREJOURNAL added to @flags.
+ */
+int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq,
+				       struct btree_iter *iter, struct bkey_i *k,
+				       enum btree_update_flags flags)
+{
+	flags |= BTREE_UPDATE_NOJOURNAL|BTREE_UPDATE_PREJOURNAL;
+
+	trans->journal_res.seq = seq;
+	return bch2_trans_update(trans, iter, k, flags);
+}
+
+/*
+ * Queue key @k for @btree in the transaction's write-buffered update array.
+ *
+ * If an update for the same btree and position is already queued, it is
+ * overwritten in place. Otherwise the key is appended; the array is
+ * (re)allocated from transaction memory when it does not exist yet or is
+ * full, doubling its capacity in the latter case.
+ *
+ * Returns 0 on success or a negative error from the allocation.
+ */
+int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
+					    enum btree_id btree,
+					    struct bkey_i *k)
+{
+	struct btree_write_buffered_key *i;
+	int ret;
+
+	EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size);
+	EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
+
+	/* Same btree and position already queued: replace that key */
+	trans_for_each_wb_update(trans, i) {
+		if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) {
+			bkey_copy(&i->k, k);
+			return 0;
+		}
+	}
+
+	if (!trans->wb_updates ||
+	    trans->nr_wb_updates == trans->wb_updates_size) {
+		struct btree_write_buffered_key *u;
+
+		if (trans->nr_wb_updates == trans->wb_updates_size) {
+			struct btree_transaction_stats *s = btree_trans_stats(trans);
+
+			/* Doubling must not push the size past U8_MAX */
+			BUG_ON(trans->wb_updates_size > U8_MAX / 2);
+			trans->wb_updates_size = max(1, trans->wb_updates_size * 2);
+			if (s)
+				s->wb_updates_size = trans->wb_updates_size;
+		}
+
+		u = bch2_trans_kmalloc_nomemzero(trans,
+					trans->wb_updates_size *
+					sizeof(struct btree_write_buffered_key));
+		ret = PTR_ERR_OR_ZERO(u);
+		if (ret)
+			return ret;
+
+		/* Carry the already queued keys over to the new array */
+		if (trans->nr_wb_updates)
+			memcpy(u, trans->wb_updates, trans->nr_wb_updates *
+			       sizeof(struct btree_write_buffered_key));
+		trans->wb_updates = u;
+	}
+
+	trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) {
+		.btree	= btree,
+	};
+
+	bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k);
+	trans->nr_wb_updates++;
+
+	return 0;
+}
+
+/*
+ * Point @iter at the slot following the last key in @btree.
+ *
+ * The iterator is initialised at POS_MAX, stepped back to the previous key
+ * and then advanced by one; the slot found there must be empty (BUG
+ * otherwise).
+ *
+ * Returns 0 with @iter left initialised for the caller,
+ * -BCH_ERR_ENOSPC_btree_slot if the slot lies beyond @end, or another
+ * negative error. On any failure @iter has already been exited.
+ */
+int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
+			     enum btree_id btree, struct bpos end)
+{
+	struct bkey_s_c k;
+	int ret;
+
+	bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT);
+
+	k = bch2_btree_iter_prev(iter);
+	ret = bkey_err(k);
+
+	if (!ret) {
+		bch2_btree_iter_advance(iter);
+		k = bch2_btree_iter_peek_slot(iter);
+		ret = bkey_err(k);
+	}
+
+	if (!ret) {
+		BUG_ON(k.k->type != KEY_TYPE_deleted);
+
+		if (bkey_gt(k.k->p, end))
+			ret = -BCH_ERR_ENOSPC_btree_slot;
+	}
+
+	if (ret)
+		bch2_trans_iter_exit(trans, iter);
+	return ret;
+}
+
+/* Push @h onto the front of the transaction's singly linked list of hooks. */
+void bch2_trans_commit_hook(struct btree_trans *trans,
+			    struct btree_trans_commit_hook *h)
+{
+	struct btree_trans_commit_hook *old_head = trans->hooks;
+
+	h->next = old_head;
+	trans->hooks = h;
+}
+
+/*
+ * Queue @k for insertion into @btree at position k->k.p, using a temporary
+ * iterator with BTREE_ITER_NOT_EXTENTS set.
+ *
+ * Returns 0 on success or a negative error from the traverse or the update.
+ */
+int bch2_btree_insert_nonextent(struct btree_trans *trans,
+				enum btree_id btree, struct bkey_i *k,
+				enum btree_update_flags flags)
+{
+	struct btree_iter iter;
+	int ret;
+
+	bch2_trans_iter_init(trans, &iter, btree, k->k.p,
+			     BTREE_ITER_CACHED|
+			     BTREE_ITER_NOT_EXTENTS|
+			     BTREE_ITER_INTENT);
+
+	ret = bch2_btree_iter_traverse(&iter);
+	if (!ret)
+		ret = bch2_trans_update(trans, &iter, k, flags);
+
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+/*
+ * Queue @k for insertion into btree @id within an existing transaction,
+ * using a temporary iterator positioned at the start of @k.
+ *
+ * Returns 0 on success or a negative error from the traverse or the update.
+ */
+int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
+			    struct bkey_i *k, enum btree_update_flags flags)
+{
+	struct btree_iter iter;
+	int ret;
+
+	bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
+			     BTREE_ITER_CACHED|
+			     BTREE_ITER_INTENT);
+
+	ret = bch2_btree_iter_traverse(&iter);
+	if (!ret)
+		ret = bch2_trans_update(trans, &iter, k, flags);
+
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+/**
+ * bch2_btree_insert - insert a key into a btree in its own transaction
+ * @c:			pointer to struct bch_fs
+ * @id:			btree to insert into
+ * @k:			key to insert
+ * @disk_res:		must be non-NULL whenever inserting or potentially
+ *			splitting data extents
+ * @flags:		transaction commit flags
+ *
+ * Runs bch2_btree_insert_trans() with no update flags and commits the
+ * result via bch2_trans_do().
+ *
+ * Returns:		0 on success, error code on failure
+ */
+int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k,
+		      struct disk_reservation *disk_res, int flags)
+{
+	int ret;
+
+	ret = bch2_trans_do(c, disk_res, NULL, flags,
+			    bch2_btree_insert_trans(trans, id, k, 0));
+	return ret;
+}
+
+/*
+ * Queue a deletion at @iter's current position: a freshly initialised key is
+ * placed at iter->pos, resized to @len and passed to bch2_trans_update().
+ *
+ * Returns 0 on success or a negative error.
+ */
+int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter,
+				unsigned len, unsigned update_flags)
+{
+	struct bkey_i *del = bch2_trans_kmalloc(trans, sizeof(*del));
+	int ret = PTR_ERR_OR_ZERO(del);
+
+	if (ret)
+		return ret;
+
+	bkey_init(&del->k);
+	del->k.p = iter->pos;
+	bch2_key_resize(&del->k, len);
+
+	return bch2_trans_update(trans, iter, del, update_flags);
+}
+
+/* Queue a zero-length deletion at @iter's current position. */
+int bch2_btree_delete_at(struct btree_trans *trans,
+			 struct btree_iter *iter, unsigned update_flags)
+{
+	const unsigned len = 0;
+
+	return bch2_btree_delete_extent_at(trans, iter, len, update_flags);
+}
+
+/*
+ * Queue a deletion of @pos in @btree as a write-buffered update.
+ *
+ * Returns 0 on success or a negative error.
+ */
+int bch2_btree_delete_at_buffered(struct btree_trans *trans,
+				  enum btree_id btree, struct bpos pos)
+{
+	struct bkey_i *del = bch2_trans_kmalloc(trans, sizeof(*del));
+	int ret = PTR_ERR_OR_ZERO(del);
+
+	if (ret)
+		return ret;
+
+	bkey_init(&del->k);
+	del->k.p = pos;
+
+	return bch2_trans_update_buffered(trans, btree, del);
+}
+
+/*
+ * Queue a deletion of the key at @pos in @btree, using a temporary iterator.
+ *
+ * Returns 0 on success or a negative error from the traverse or the delete.
+ */
+int bch2_btree_delete(struct btree_trans *trans,
+		      enum btree_id btree, struct bpos pos,
+		      unsigned update_flags)
+{
+	struct btree_iter iter;
+	int ret;
+
+	bch2_trans_iter_init(trans, &iter, btree, pos,
+			     BTREE_ITER_CACHED|
+			     BTREE_ITER_INTENT);
+
+	ret = bch2_btree_iter_traverse(&iter);
+	if (!ret)
+		ret = bch2_btree_delete_at(trans, &iter, update_flags);
+
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+/*
+ * Delete the keys of btree @id found between @start and @end, one commit per
+ * key.
+ *
+ * Each loop iteration queues one deletion, commits it with
+ * BTREE_INSERT_NOFAIL and begins the transaction again. Transaction restarts
+ * inside the loop are swallowed and the loop simply retries; any other error
+ * ends the loop.
+ *
+ * Returns the first hard error, otherwise the result of
+ * trans_was_restarted() against the restart count sampled on entry.
+ */
+int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
+				  struct bpos start, struct bpos end,
+				  unsigned update_flags,
+				  u64 *journal_seq)
+{
+	u32 restart_count = trans->restart_count;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);
+	while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) {
+		struct disk_reservation disk_res =
+			bch2_disk_reservation_init(trans->c, 0);
+		struct bkey_i delete;
+
+		ret = bkey_err(k);
+		if (ret)
+			goto err;
+
+		bkey_init(&delete.k);
+
+		/*
+		 * This could probably be more efficient for extents:
+		 */
+
+		/*
+		 * For extents, iter.pos won't necessarily be the same as
+		 * bkey_start_pos(k.k) (for non extents they always will be the
+		 * same). It's important that we delete starting from iter.pos
+		 * because the range we want to delete could start in the middle
+		 * of k.
+		 *
+		 * (bch2_btree_iter_peek() does guarantee that iter.pos >=
+		 * bkey_start_pos(k.k)).
+		 */
+		delete.k.p = iter.pos;
+
+		/* Clamp the deletion to the end of the range or of this key */
+		if (iter.flags & BTREE_ITER_IS_EXTENTS)
+			bch2_key_resize(&delete.k,
+					bpos_min(end, k.k->p).offset -
+					iter.pos.offset);
+
+		ret   = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
+			bch2_trans_commit(trans, &disk_res, journal_seq,
+					  BTREE_INSERT_NOFAIL);
+		bch2_disk_reservation_put(trans->c, &disk_res);
+err:
+		/*
+		 * the bch2_trans_begin() call is in a weird place because we
+		 * need to call it after every transaction commit, to avoid path
+		 * overflow, but don't want to call it if the delete operation
+		 * is a no-op and we have no work to do:
+		 */
+		bch2_trans_begin(trans);
+
+		/* A restart is not a failure here: go round the loop again */
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			ret = 0;
+		if (ret)
+			break;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
+	return ret ?: trans_was_restarted(trans, restart_count);
+}
+
+/*
+ * bch2_btree_delete_range - delete everything within a given range
+ *
+ * Range is a half open interval - [start, end)
+ *
+ * Runs bch2_btree_delete_range_trans() in a transaction of its own; a
+ * -BCH_ERR_transaction_restart_nested result is reported as success.
+ */
+int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
+			    struct bpos start, struct bpos end,
+			    unsigned update_flags,
+			    u64 *journal_seq)
+{
+	int ret;
+
+	ret = bch2_trans_run(c,
+			bch2_btree_delete_range_trans(trans, id, start, end,
+						      update_flags, journal_seq));
+
+	return ret == -BCH_ERR_transaction_restart_nested ? 0 : ret;
+}
+
+/*
+ * Set or clear the "bit" at @pos in @btree through a write-buffered update:
+ * a KEY_TYPE_set key is queued when @set is true, a KEY_TYPE_deleted key
+ * otherwise.
+ *
+ * Returns 0 on success or a negative error.
+ */
+int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
+		       struct bpos pos, bool set)
+{
+	struct bkey_i *bit_key;
+
+	bit_key = bch2_trans_kmalloc_nomemzero(trans, sizeof(*bit_key));
+	if (unlikely(IS_ERR(bit_key)))
+		return PTR_ERR(bit_key);
+
+	bkey_init(&bit_key->k);
+	if (set)
+		bit_key->k.type = KEY_TYPE_set;
+	else
+		bit_key->k.type = KEY_TYPE_deleted;
+	bit_key->k.p = pos;
+
+	return bch2_trans_update_buffered(trans, btree, bit_key);
+}
+
+/*
+ * Format a message and append it to @entries as a BCH_JSET_ENTRY_log journal
+ * entry.
+ *
+ * The text is formatted into a temporary printbuf, room is made at the top
+ * of @entries, the entry header is filled in and the text is copied into the
+ * payload, zero-padded up to a multiple of 8 bytes.
+ *
+ * Returns 0 on success, -BCH_ERR_ENOMEM_trans_log_msg if formatting ran out
+ * of memory, or the error from darray_make_room().
+ */
+__printf(2, 0)
+static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args)
+{
+	struct printbuf buf = PRINTBUF;
+	struct jset_entry_log *l;
+	unsigned u64s;
+	int ret;
+
+	prt_vprintf(&buf, fmt, args);
+	ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
+	if (ret)
+		goto err;
+
+	/* Payload size in 64-bit words, rounded up */
+	u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
+
+	ret = darray_make_room(entries, jset_u64s(u64s));
+	if (ret)
+		goto err;
+
+	l = (void *) &darray_top(*entries);
+	l->entry.u64s		= cpu_to_le16(u64s);
+	l->entry.btree_id	= 0;
+	l->entry.level		= 1;
+	l->entry.type		= BCH_JSET_ENTRY_log;
+	l->entry.pad[0]		= 0;
+	l->entry.pad[1]		= 0;
+	l->entry.pad[2]		= 0;
+	memcpy(l->d, buf.buf, buf.pos);
+	/* Zero-fill the tail of the last 64-bit word */
+	while (buf.pos & 7)
+		l->d[buf.pos++] = '\0';
+
+	entries->nr += jset_u64s(u64s);
+err:
+	printbuf_exit(&buf);
+	return ret;
+}
+
+/*
+ * Record a formatted log message in the journal.
+ *
+ * Before JOURNAL_STARTED is set the entry is appended to the journal's
+ * early_journal_entries. Afterwards it is added to a transaction's
+ * extra_journal_entries and committed with BTREE_INSERT_LAZY_RW plus
+ * @commit_flags.
+ *
+ * NOTE(review): in the second case @args is handed to a commit macro that may
+ * evaluate its expression more than once; this relies on prt_vprintf() not
+ * consuming the caller's va_list - confirm.
+ *
+ * Returns 0 on success or a negative error.
+ */
+__printf(3, 0)
+static int
+__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
+		  va_list args)
+{
+	if (!test_bit(JOURNAL_STARTED, &c->journal.flags))
+		return __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args);
+
+	return bch2_trans_do(c, NULL, NULL,
+		BTREE_INSERT_LAZY_RW|commit_flags,
+		__bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args));
+}
+
+/*
+ * printf-style front end to __bch2_fs_log_msg() with no extra commit flags.
+ *
+ * Returns 0 on success or a negative error.
+ */
+__printf(2, 3)
+int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...)
+{
+	va_list ap;
+	int err;
+
+	va_start(ap, fmt);
+	err = __bch2_fs_log_msg(c, 0, fmt, ap);
+	va_end(ap);
+
+	return err;
+}
+
+/*
+ * Use for logging messages during recovery to enable reserved space and avoid
+ * blocking.
+ *
+ * Same as bch2_fs_log_msg(), except that BCH_WATERMARK_reclaim is passed as
+ * the commit flags.
+ */
+__printf(2, 3)
+int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...)
+{
+	va_list ap;
+	int err;
+
+	va_start(ap, fmt);
+	err = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, ap);
+	va_end(ap);
+
+	return err;
+}
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
new file mode 100644
index 000000000000..9816d2286540
--- /dev/null
+++ b/fs/bcachefs/btree_update.h
@@ -0,0 +1,340 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_UPDATE_H
+#define _BCACHEFS_BTREE_UPDATE_H
+
+#include "btree_iter.h"
+#include "journal.h"
+
+struct bch_fs;
+struct btree;
+
+void bch2_btree_node_prep_for_write(struct btree_trans *,
+				    struct btree_path *, struct btree *);
+bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *,
+				struct btree *, struct btree_node_iter *,
+				struct bkey_i *);
+
+int bch2_btree_node_flush0(struct journal *, struct journal_entry_pin *, u64);
+int bch2_btree_node_flush1(struct journal *, struct journal_entry_pin *, u64);
+void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
+
+void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
+				struct bkey_i *, u64);
+
+/*
+ * Bit numbers for the BTREE_INSERT_* and BCH_HASH_SET_* flag masks.
+ * Numbering starts at BCH_WATERMARK_BITS, which leaves the low bits of a
+ * flags word free for a bch_watermark value.
+ */
+enum btree_insert_flags {
+	/* First bits for bch_watermark: */
+	__BTREE_INSERT_NOFAIL = BCH_WATERMARK_BITS,
+	__BTREE_INSERT_NOCHECK_RW,
+	__BTREE_INSERT_LAZY_RW,
+	__BTREE_INSERT_JOURNAL_REPLAY,
+	__BTREE_INSERT_JOURNAL_RECLAIM,
+	__BTREE_INSERT_NOWAIT,
+	__BTREE_INSERT_GC_LOCK_HELD,
+	__BCH_HASH_SET_MUST_CREATE,
+	__BCH_HASH_SET_MUST_REPLACE,
+};
+
+/* Don't check for -ENOSPC: */
+#define BTREE_INSERT_NOFAIL		BIT(__BTREE_INSERT_NOFAIL)
+
+#define BTREE_INSERT_NOCHECK_RW		BIT(__BTREE_INSERT_NOCHECK_RW)
+#define BTREE_INSERT_LAZY_RW		BIT(__BTREE_INSERT_LAZY_RW)
+
+/* Insert is for journal replay - don't get journal reservations: */
+#define BTREE_INSERT_JOURNAL_REPLAY	BIT(__BTREE_INSERT_JOURNAL_REPLAY)
+
+/* Insert is being called from journal reclaim path: */
+#define BTREE_INSERT_JOURNAL_RECLAIM	BIT(__BTREE_INSERT_JOURNAL_RECLAIM)
+
+/* Don't block on allocation failure (for new btree nodes): */
+#define BTREE_INSERT_NOWAIT		BIT(__BTREE_INSERT_NOWAIT)
+#define BTREE_INSERT_GC_LOCK_HELD	BIT(__BTREE_INSERT_GC_LOCK_HELD)
+
+/* Share the flag bit numbering of enum btree_insert_flags: */
+#define BCH_HASH_SET_MUST_CREATE	BIT(__BCH_HASH_SET_MUST_CREATE)
+#define BCH_HASH_SET_MUST_REPLACE	BIT(__BCH_HASH_SET_MUST_REPLACE)
+
+int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
+				unsigned, unsigned);
+int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
+int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos);
+int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned);
+
+int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,
+				struct bkey_i *, enum btree_update_flags);
+
+int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *,
+			enum btree_update_flags);
+int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
+		     struct disk_reservation *, int flags);
+
+int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
+				  struct bpos, struct bpos, unsigned, u64 *);
+int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
+			    struct bpos, struct bpos, unsigned, u64 *);
+
+int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool);
+
+int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id,
+				     struct bpos, struct bpos);
+
+/*
+ * For use when splitting extents in existing snapshots:
+ *
+ * If @old_pos is an interior snapshot node, iterate over descendant snapshot
+ * nodes: for every descendant snapshot in which @old_pos is overwritten and
+ * not visible, emit a whiteout at @new_pos.
+ *
+ * This wrapper filters out the cases with nothing to do - a btree without
+ * snapshots, or @old_pos equal to @new_pos - and returns 0 for them.
+ */
+static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
+						 enum btree_id btree,
+						 struct bpos old_pos,
+						 struct bpos new_pos)
+{
+	if (btree_type_has_snapshots(btree) &&
+	    !bkey_eq(old_pos, new_pos))
+		return __bch2_insert_snapshot_whiteouts(trans, btree,
+							old_pos, new_pos);
+
+	return 0;
+}
+
+int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *,
+				       enum btree_update_flags,
+				       struct bkey_s_c, struct bkey_s_c);
+
+int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *,
+			     enum btree_id, struct bpos);
+
+int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
+				   struct bkey_i *, enum btree_update_flags);
+int __must_check bch2_trans_update_seq(struct btree_trans *, u64, struct btree_iter *,
+				       struct bkey_i *, enum btree_update_flags);
+int __must_check bch2_trans_update_buffered(struct btree_trans *,
+					    enum btree_id, struct bkey_i *);
+
+void bch2_trans_commit_hook(struct btree_trans *,
+			    struct btree_trans_commit_hook *);
+int __bch2_trans_commit(struct btree_trans *, unsigned);
+
+__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
+__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
+
+/**
+ * bch2_trans_commit - insert keys at given iterator positions
+ * @trans:		transaction whose pending updates are committed
+ * @disk_res:		disk reservation, stored in @trans for the commit
+ * @journal_seq:	journal sequence pointer, stored in @trans for the commit
+ * @flags:		commit flags, passed through to __bch2_trans_commit()
+ *
+ * This is main entry point for btree updates.
+ *
+ * Return values:
+ * -EROFS: filesystem read only
+ * -EIO: journal or btree node IO error
+ */
+static inline int bch2_trans_commit(struct btree_trans *trans,
+				    struct disk_reservation *disk_res,
+				    u64 *journal_seq,
+				    unsigned flags)
+{
+	trans->disk_res		= disk_res;
+	trans->journal_seq	= journal_seq;
+
+	return __bch2_trans_commit(trans, flags);
+}
+
+/*
+ * Evaluate @_do and, only if it yields 0, commit the transaction; the whole
+ * expression is wrapped in lockrestart_do().
+ */
+#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do)	\
+	lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
+					(_journal_seq), (_flags)))
+
+/* Same as commit_do(), but wrapped in nested_lockrestart_do(). */
+#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do)	\
+	nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
+					(_journal_seq), (_flags)))
+
+/*
+ * Get a transaction for @_c, evaluate @_do with it in scope as the local
+ * variable `trans`, put the transaction again and yield the value of @_do.
+ */
+#define bch2_trans_run(_c, _do)						\
+({									\
+	struct btree_trans *trans = bch2_trans_get(_c);			\
+	int _ret = (_do);						\
+	bch2_trans_put(trans);						\
+	_ret;								\
+})
+
+/* bch2_trans_run() combined with commit_do(): run @_do, then commit. */
+#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do)		\
+	bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do))
+
+/* Iterate @_i over the nr_updates entries of @_trans's updates array. */
+#define trans_for_each_update(_trans, _i)				\
+	for ((_i) = (_trans)->updates;					\
+	     (_i) < (_trans)->updates + (_trans)->nr_updates;		\
+	     (_i)++)
+
+/* Iterate @_i over the nr_wb_updates entries of @_trans's wb_updates array. */
+#define trans_for_each_wb_update(_trans, _i)				\
+	for ((_i) = (_trans)->wb_updates;				\
+	     (_i) < (_trans)->wb_updates + (_trans)->nr_wb_updates;	\
+	     (_i)++)
+
+/*
+ * Drop everything queued on @trans: the path reference of every pending
+ * update is put, the update, write-buffer, hook and extra-journal-entry
+ * state is cleared, and the memset_start..memset_end region of
+ * fs_usage_deltas (if allocated) is zeroed.
+ */
+static inline void bch2_trans_reset_updates(struct btree_trans *trans)
+{
+	struct btree_insert_entry *i;
+	void *zero_from, *zero_to;
+
+	trans_for_each_update(trans, i)
+		bch2_path_put(trans, i->path, true);
+
+	trans->extra_journal_res	= 0;
+	trans->nr_updates		= 0;
+	trans->nr_wb_updates		= 0;
+	trans->wb_updates		= NULL;
+	trans->hooks			= NULL;
+	trans->extra_journal_entries.nr	= 0;
+
+	if (!trans->fs_usage_deltas)
+		return;
+
+	trans->fs_usage_deltas->used = 0;
+
+	zero_from = (void *) trans->fs_usage_deltas +
+		offsetof(struct replicas_delta_list, memset_start);
+	zero_to   = (void *) &trans->fs_usage_deltas->memset_end;
+
+	memset(zero_from, 0,
+	       zero_to - (void *) &trans->fs_usage_deltas->memset_start);
+}
+/* Copy @k into a trans allocation of max(@min_bytes, key size) bytes; -ENOENT on @type mismatch. */
+static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k,
+						  unsigned type, unsigned min_bytes)
+{
+	unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k));
+	struct bkey_i *mut;
+
+	if (type && k.k->type != type)
+		return ERR_PTR(-ENOENT);
+
+	mut = bch2_trans_kmalloc_nomemzero(trans, bytes);
+	if (!IS_ERR(mut)) {
+		bkey_reassemble(mut, k);
+
+		if (unlikely(bytes > bkey_bytes(k.k))) {
+			memset((void *) mut + bkey_bytes(k.k), 0,
+			       bytes - bkey_bytes(k.k));
+			mut->k.u64s = DIV_ROUND_UP(bytes, sizeof(u64));
+		}
+	}
+	return mut;
+}
+/* As __bch2_bkey_make_mut_noupdate() with no type check and no minimum size. */
+static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k)
+{
+	return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0);
+}
+/* Typed variant: checks for KEY_TYPE_##_type, sizes for struct bkey_i_##_type. */
+#define bch2_bkey_make_mut_noupdate_typed(_trans, _k, _type)		\
+	bkey_i_to_##_type(__bch2_bkey_make_mut_noupdate(_trans, _k,	\
+				KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
+
+/* Make a mutable copy of *@k, queue it with bch2_trans_update(), and repoint *@k at the copy. */
+static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter,
+					struct bkey_s_c *k, unsigned flags,
+					unsigned type, unsigned min_bytes)
+{
+	struct bkey_i *copy = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes);
+
+	if (!IS_ERR(copy)) {
+		int err = bch2_trans_update(trans, iter, copy, flags);
+
+		if (err)
+			return ERR_PTR(err);
+		*k = bkey_i_to_s_c(copy);
+	}
+
+	return copy;
+}
+/* As __bch2_bkey_make_mut() with no type check and no minimum size. */
+static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter,
+						struct bkey_s_c *k, unsigned flags)
+{
+	return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0);
+}
+/* Typed variant: checks for KEY_TYPE_##_type, sizes for struct bkey_i_##_type. */
+#define bch2_bkey_make_mut_typed(_trans, _iter, _k, _flags, _type)	\
+	bkey_i_to_##_type(__bch2_bkey_make_mut(_trans, _iter, _k, _flags,\
+				KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
+/* Look up @pos with BTREE_ITER_INTENT and return a mutable copy of the key; exits @iter on error. */
+static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
+					 struct btree_iter *iter,
+					 unsigned btree_id, struct bpos pos,
+					 unsigned flags, unsigned type, unsigned min_bytes)
+{
+	struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter,
+				btree_id, pos, flags|BTREE_ITER_INTENT, type);
+	struct bkey_i *ret = IS_ERR(k.k)
+		? ERR_CAST(k.k)
+		: __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes);
+	if (IS_ERR(ret))
+		bch2_trans_iter_exit(trans, iter);
+	return ret;
+}
+/* As __bch2_bkey_get_mut_noupdate() with no type check and no minimum size. */
+static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
+					       struct btree_iter *iter,
+					       unsigned btree_id, struct bpos pos,
+					       unsigned flags)
+{
+	return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0);
+}
+/* Get a mutable copy of the key at @pos and queue it for update; exits @iter on error. */
+static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans,
+					 struct btree_iter *iter,
+					 unsigned btree_id, struct bpos pos,
+					 unsigned flags, unsigned type, unsigned min_bytes)
+{
+	struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter,
+				btree_id, pos, flags|BTREE_ITER_INTENT, type, min_bytes);
+	int ret;
+
+	if (IS_ERR(mut))
+		return mut;
+
+	ret = bch2_trans_update(trans, iter, mut, flags);
+	if (ret) {
+		bch2_trans_iter_exit(trans, iter);
+		return ERR_PTR(ret);
+	}
+
+	return mut;
+}
+/* As __bch2_bkey_get_mut() with no type check; the copy is at least @min_bytes. */
+static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans,
+						       struct btree_iter *iter,
+						       unsigned btree_id, struct bpos pos,
+						       unsigned flags, unsigned min_bytes)
+{
+	return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes);
+}
+/* As __bch2_bkey_get_mut() with no type check and no minimum size. */
+static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans,
+					       struct btree_iter *iter,
+					       unsigned btree_id, struct bpos pos,
+					       unsigned flags)
+{
+	return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0);
+}
+/* Typed variant: checks for KEY_TYPE_##_type, sizes for struct bkey_i_##_type. */
+#define bch2_bkey_get_mut_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\
+	bkey_i_to_##_type(__bch2_bkey_get_mut(_trans, _iter,		\
+			_btree_id, _pos, _flags,			\
+			KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
+/* Allocate a new key of @type with @val_size value bytes at @iter->pos and queue it for update. */
+static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter,
+					       unsigned flags, unsigned type, unsigned val_size)
+{
+	struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size);
+	int ret;
+
+	if (IS_ERR(k))
+		return k;
+
+	bkey_init(&k->k);
+	k->k.p = iter->pos;
+	k->k.type = type;
+	set_bkey_val_bytes(&k->k, val_size);
+
+	ret = bch2_trans_update(trans, iter, k, flags);
+	if (unlikely(ret))
+		return ERR_PTR(ret);
+	return k;
+}
+/* Typed variant of __bch2_bkey_alloc(): KEY_TYPE_##_type with a struct bch_##_type value. */
+#define bch2_bkey_alloc(_trans, _iter, _flags, _type)			\
+	bkey_i_to_##_type(__bch2_bkey_alloc(_trans, _iter, _flags,	\
+				KEY_TYPE_##_type, sizeof(struct bch_##_type)))
+
+#endif /* _BCACHEFS_BTREE_UPDATE_H */
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
new file mode 100644
index 000000000000..7dbf6b6c7f34
--- /dev/null
+++ b/fs/bcachefs/btree_update_interior.c
@@ -0,0 +1,2480 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+#include "btree_gc.h"
+#include "btree_journal_iter.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "buckets.h"
+#include "clock.h"
+#include "error.h"
+#include "extents.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+#include "keylist.h"
+#include "replicas.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/random.h>
+
+static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
+				  struct btree_path *, struct btree *,
+				  struct keylist *, unsigned);
+static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
+/* Get a mutable path to @pos at @level in @btree_id, returned downgraded and unlocked. */
+static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans,
+						enum btree_id btree_id,
+						unsigned level,
+						struct bpos pos)
+{
+	struct btree_path *path;
+
+	path = bch2_path_get(trans, btree_id, pos, level + 1, level,
+			     BTREE_ITER_NOPRESERVE|
+			     BTREE_ITER_INTENT, _RET_IP_);
+	path = bch2_btree_path_make_mut(trans, path, true, _RET_IP_);
+	bch2_btree_path_downgrade(trans, path);
+	__bch2_btree_path_unlock(trans, path);
+	return path;
+}
+
+/* Debug code: */
+
+/*
+ * Verify that child nodes correctly span parent node's range, panic()ing if not:
+ */
+static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+	struct bpos next_node = b->data->min_key;
+	struct btree_node_iter iter;
+	struct bkey_s_c k;
+	struct bkey_s_c_btree_ptr_v2 bp;
+	struct bkey unpacked;
+	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+
+	BUG_ON(!b->c.level);
+
+	if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+		return;	/* nothing is checked until JOURNAL_REPLAY_DONE is set */
+
+	bch2_btree_node_iter_init_from_start(&iter, b);
+
+	while (1) {
+		k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked);
+		if (k.k->type != KEY_TYPE_btree_ptr_v2)
+			break;
+		bp = bkey_s_c_to_btree_ptr_v2(k);
+
+		if (!bpos_eq(next_node, bp.v->min_key)) {
+			bch2_dump_btree_node(c, b);
+			bch2_bpos_to_text(&buf1, next_node);
+			bch2_bpos_to_text(&buf2, bp.v->min_key);
+			panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf);
+		}
+
+		bch2_btree_node_iter_advance(&iter, b);
+
+		if (bch2_btree_node_iter_end(&iter)) {
+			if (!bpos_eq(k.k->p, b->key.k.p)) {
+				bch2_dump_btree_node(c, b);
+				bch2_bpos_to_text(&buf1, b->key.k.p);
+				bch2_bpos_to_text(&buf2, k.k->p);
+				panic("expected end %s got %s\n", buf1.buf, buf2.buf);
+			}
+			break;
+		}
+
+		next_node = bpos_successor(k.k->p);
+	}
+#endif /* CONFIG_BCACHEFS_DEBUG */
+}
+
+/* Calculate ideal packed bkey format for new btree nodes: */
+
+void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
+{
+	struct bkey_packed *k;
+	struct bset_tree *t;
+	struct bkey uk;
+
+	for_each_bset(b, t)
+		bset_tree_for_each_key(b, t, k)
+			if (!bkey_deleted(k)) {	/* deleted keys are left out of the format */
+				uk = bkey_unpack_key(b, k);
+				bch2_bkey_format_add_key(s, &uk);
+			}
+}
+/* Calculate a key format covering @b's min and max positions plus the keys it contains. */
+static struct bkey_format bch2_btree_calc_format(struct btree *b)
+{
+	struct bkey_format_state s;
+
+	bch2_bkey_format_init(&s);
+	bch2_bkey_format_add_pos(&s, b->data->min_key);
+	bch2_bkey_format_add_pos(&s, b->data->max_key);
+	__bch2_btree_calc_format(&s, b);
+
+	return bch2_bkey_format_done(&s);
+}
+/* Return @b's live u64s as they would be if its keys were converted to format @new_f. */
+static size_t btree_node_u64s_with_format(struct btree *b,
+					  struct bkey_format *new_f)
+{
+	struct bkey_format *old_f = &b->format;
+
+	/* stupid integer promotion rules */
+	ssize_t delta =
+	    (((int) new_f->key_u64s - old_f->key_u64s) *
+	     (int) b->nr.packed_keys) +
+	    (((int) new_f->key_u64s - BKEY_U64s) *
+	     (int) b->nr.unpacked_keys);
+
+	BUG_ON(delta + b->nr.live_u64s < 0);
+
+	return b->nr.live_u64s + delta;
+}
+
+/**
+ * bch2_btree_node_format_fits - check if we could rewrite node with a new format
+ * @c:		filesystem handle
+ * @b:		btree node to rewrite
+ * @new_f:	bkey format to translate keys to
+ *
+ * Returns: true if all re-packed keys will be able to fit in a new node, i.e.
+ * the re-packed node would be strictly smaller than btree_bytes(@c).
+ *
+ * Assumes all keys will successfully pack with the new format.
+ */
+bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
+				 struct bkey_format *new_f)
+{
+	size_t u64s = btree_node_u64s_with_format(b, new_f);
+
+	return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
+}
+
+/* Btree node freeing/allocation: */
+/* Put @b on the btree cache's freeable list; it must be clean, unblocked and not a btree root. */
+static void __btree_node_free(struct bch_fs *c, struct btree *b)
+{
+	trace_and_count(c, btree_node_free, c, b);
+
+	BUG_ON(btree_node_write_blocked(b));
+	BUG_ON(btree_node_dirty(b));
+	BUG_ON(btree_node_need_write(b));
+	BUG_ON(b == btree_node_root(c, b));
+	BUG_ON(b->ob.nr);
+	BUG_ON(!list_empty(&b->write_blocked));
+	BUG_ON(b->will_make_reachable);
+
+	clear_btree_node_noevict(b);
+
+	mutex_lock(&c->btree_cache.lock);
+	list_move(&b->list, &c->btree_cache.freeable);
+	mutex_unlock(&c->btree_cache.lock);
+}
+/* Unhash and free @b under its write lock, then invalidate every path level that points at @b. */
+static void bch2_btree_node_free_inmem(struct btree_trans *trans,
+				       struct btree_path *path,
+				       struct btree *b)
+{
+	struct bch_fs *c = trans->c;
+	unsigned level = b->c.level;
+
+	bch2_btree_node_lock_write_nofail(trans, path, &b->c);
+	bch2_btree_node_hash_remove(&c->btree_cache, b);
+	__btree_node_free(c, b);
+	six_unlock_write(&b->c.lock);
+	mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
+
+	trans_for_each_path(trans, path)
+		if (path->l[level].b == b) {
+			btree_node_unlock(trans, path, level);
+			path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+		}
+}
+/* @b was handed out by @as but never used: clear its state and put it back in @as->prealloc_nodes. */
+static void bch2_btree_node_free_never_used(struct btree_update *as,
+					    struct btree_trans *trans,
+					    struct btree *b)
+{
+	struct bch_fs *c = as->c;
+	struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
+	struct btree_path *path;
+	unsigned level = b->c.level;
+
+	BUG_ON(!list_empty(&b->write_blocked));
+	BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));
+
+	b->will_make_reachable = 0;
+	closure_put(&as->cl);
+
+	clear_btree_node_will_make_reachable(b);
+	clear_btree_node_accessed(b);
+	clear_btree_node_dirty_acct(c, b);
+	clear_btree_node_need_write(b);
+
+	mutex_lock(&c->btree_cache.lock);
+	list_del_init(&b->list);
+	bch2_btree_node_hash_remove(&c->btree_cache, b);
+	mutex_unlock(&c->btree_cache.lock);
+
+	BUG_ON(p->nr >= ARRAY_SIZE(p->b));
+	p->b[p->nr++] = b;
+
+	six_unlock_intent(&b->c.lock);
+
+	trans_for_each_path(trans, path)
+		if (path->l[level].b == b) {
+			btree_node_unlock(trans, path, level);
+			path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+		}
+}
+/* Get disk space (cached reservation or fresh allocation) plus an in-memory node, returned unlocked. */
+static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
+					     struct disk_reservation *res,
+					     struct closure *cl,
+					     bool interior_node,
+					     unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	struct write_point *wp;
+	struct btree *b;
+	BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+	struct open_buckets obs = { .nr = 0 };
+	struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
+	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
+	unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim
+		? BTREE_NODE_RESERVE
+		: 0;
+	int ret;
+
+	mutex_lock(&c->btree_reserve_cache_lock);
+	if (c->btree_reserve_cache_nr > nr_reserve) {
+		struct btree_alloc *a =
+			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];
+
+		obs = a->ob;
+		bkey_copy(&tmp.k, &a->k);
+		mutex_unlock(&c->btree_reserve_cache_lock);
+		goto mem_alloc;
+	}
+	mutex_unlock(&c->btree_reserve_cache_lock);
+
+retry:
+	ret = bch2_alloc_sectors_start_trans(trans,
+				      c->opts.metadata_target ?:
+				      c->opts.foreground_target,
+				      0,
+				      writepoint_ptr(&c->btree_write_point),
+				      &devs_have,
+				      res->nr_replicas,
+				      c->opts.metadata_replicas_required,
+				      watermark, 0, cl, &wp);
+	if (unlikely(ret))
+		return ERR_PTR(ret);
+
+	if (wp->sectors_free < btree_sectors(c)) {
+		struct open_bucket *ob;
+		unsigned i;
+
+		open_bucket_for_each(c, &wp->ptrs, ob, i)
+			if (ob->sectors_free < btree_sectors(c))
+				ob->sectors_free = 0;	/* too small to hold a btree node */
+
+		bch2_alloc_sectors_done(c, wp);
+		goto retry;
+	}
+
+	bkey_btree_ptr_v2_init(&tmp.k);
+	bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);
+
+	bch2_open_bucket_get(c, wp, &obs);
+	bch2_alloc_sectors_done(c, wp);
+mem_alloc:
+	b = bch2_btree_node_mem_alloc(trans, interior_node);
+
+	/* we hold cannibalize_lock; check before dereferencing @b: */
+	BUG_ON(IS_ERR(b));
+	six_unlock_write(&b->c.lock);
+	six_unlock_intent(&b->c.lock);
+	BUG_ON(b->ob.nr);
+
+	bkey_copy(&b->key, &tmp.k);
+	b->ob = obs;
+
+	return b;
+}
+/* Take a node from @as->prealloc_nodes and set it up for @level; returned intent+write locked. */
+static struct btree *bch2_btree_node_alloc(struct btree_update *as,
+					   struct btree_trans *trans,
+					   unsigned level)
+{
+	struct bch_fs *c = as->c;
+	struct btree *b;
+	struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
+	int ret;
+
+	BUG_ON(level >= BTREE_MAX_DEPTH);
+	BUG_ON(!p->nr);
+
+	b = p->b[--p->nr];
+
+	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
+
+	set_btree_node_accessed(b);
+	set_btree_node_dirty_acct(c, b);
+	set_btree_node_need_write(b);
+
+	bch2_bset_init_first(b, &b->data->keys);
+	b->c.level	= level;
+	b->c.btree_id	= as->btree_id;
+	b->version_ondisk = c->sb.version;
+
+	memset(&b->nr, 0, sizeof(b->nr));
+	b->data->magic = cpu_to_le64(bset_magic(c));
+	memset(&b->data->_ptr, 0, sizeof(b->data->_ptr));
+	b->data->flags = 0;
+	SET_BTREE_NODE_ID(b->data, as->btree_id);
+	SET_BTREE_NODE_LEVEL(b->data, level);
+
+	if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+		struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
+
+		bp->v.mem_ptr		= 0;
+		bp->v.seq		= b->data->keys.seq;
+		bp->v.sectors_written	= 0;
+	}
+
+	SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
+
+	bch2_btree_build_aux_trees(b);
+
+	ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
+	BUG_ON(ret);
+
+	trace_and_count(c, btree_node_alloc, c, b);
+	bch2_increment_clock(c, btree_sectors(c), WRITE);
+	return b;
+}
+/* Set @b's minimum key to @pos in @b->data and, for btree_ptr_v2 keys, in @b->key as well. */
+static void btree_set_min(struct btree *b, struct bpos pos)
+{
+	if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
+		bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
+	b->data->min_key = pos;
+}
+/* Set @b's maximum key to @pos in both @b->key and @b->data. */
+static void btree_set_max(struct btree *b, struct bpos pos)
+{
+	b->key.k.p = pos;
+	b->data->max_key = pos;
+}
+/* Build a replacement for @b: same level and key range, seq + 1, keys sorted in from @b. */
+static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
+						       struct btree_trans *trans,
+						       struct btree *b)
+{
+	struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level);
+	struct bkey_format format = bch2_btree_calc_format(b);
+
+	/*
+	 * The keys might expand with the new format - if they wouldn't fit in
+	 * the btree node anymore, use the old format for now:
+	 */
+	if (!bch2_btree_node_format_fits(as->c, b, &format))
+		format = b->format;
+
+	SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
+
+	btree_set_min(n, b->data->min_key);
+	btree_set_max(n, b->data->max_key);
+
+	n->data->format		= format;
+	btree_node_set_format(n, format);
+
+	bch2_btree_sort_into(as->c, n, b);
+
+	btree_node_reset_sib_u64s(n);
+	return n;
+}
+/* Take a node from @as for @level covering POS_MIN..SPOS_MAX, with a freshly calculated format. */
+static struct btree *__btree_root_alloc(struct btree_update *as,
+				struct btree_trans *trans, unsigned level)
+{
+	struct btree *b = bch2_btree_node_alloc(as, trans, level);
+
+	btree_set_min(b, POS_MIN);
+	btree_set_max(b, SPOS_MAX);
+	b->data->format = bch2_btree_calc_format(b);
+
+	btree_node_set_format(b, b->data->format);
+	bch2_btree_build_aux_trees(b);
+
+	return b;
+}
+/* Free @as's unused preallocated nodes, caching their open buckets and keys when there is room. */
+static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans)
+{
+	struct bch_fs *c = as->c;
+	struct prealloc_nodes *p;
+
+	for (p = as->prealloc_nodes;
+	     p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
+	     p++) {
+		while (p->nr) {
+			struct btree *b = p->b[--p->nr];
+
+			mutex_lock(&c->btree_reserve_cache_lock);
+
+			if (c->btree_reserve_cache_nr <
+			    ARRAY_SIZE(c->btree_reserve_cache)) {
+				struct btree_alloc *a =
+					&c->btree_reserve_cache[c->btree_reserve_cache_nr++];
+
+				a->ob = b->ob;
+				b->ob.nr = 0;
+				bkey_copy(&a->k, &b->key);
+			} else {
+				bch2_open_buckets_put(c, &b->ob);
+			}
+
+			mutex_unlock(&c->btree_reserve_cache_lock);
+
+			btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+			btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
+			__btree_node_free(c, b);
+			six_unlock_write(&b->c.lock);
+			six_unlock_intent(&b->c.lock);
+		}
+	}
+}
+/* Fill @as->prealloc_nodes with nr_nodes[0] leaf and nr_nodes[1] interior nodes. */
+static int bch2_btree_reserve_get(struct btree_trans *trans,
+				  struct btree_update *as,
+				  unsigned nr_nodes[2],
+				  unsigned flags,
+				  struct closure *cl)
+{
+	struct bch_fs *c = as->c;
+	struct btree *b;
+	unsigned interior;
+	int ret = 0;
+
+	BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);
+
+	/*
+	 * Protects reaping from the btree node cache and using the btree node
+	 * open bucket reserve:
+	 *
+	 * BTREE_INSERT_NOWAIT only applies to btree node allocation, not
+	 * blocking on this lock:
+	 */
+	ret = bch2_btree_cache_cannibalize_lock(c, cl);
+	if (ret)
+		return ret;
+
+	for (interior = 0; interior < 2; interior++) {
+		struct prealloc_nodes *p = as->prealloc_nodes + interior;
+
+		while (p->nr < nr_nodes[interior]) {
+			b = __bch2_btree_node_alloc(trans, &as->disk_res,
+					flags & BTREE_INSERT_NOWAIT ? NULL : cl,
+					interior, flags);
+			if (IS_ERR(b)) {
+				ret = PTR_ERR(b);
+				goto err;
+			}
+
+			p->b[p->nr++] = b;
+		}
+	}
+err:
+	bch2_btree_cache_cannibalize_unlock(c);
+	return ret;
+}
+
+/* Asynchronous interior node update machinery */
+/* Release what @as still holds, unlink it from the update lists and return it to the mempool. */
+static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans)
+{
+	struct bch_fs *c = as->c;
+
+	if (as->took_gc_lock)
+		up_read(&c->gc_lock);
+	as->took_gc_lock = false;
+
+	bch2_journal_preres_put(&c->journal, &as->journal_preres);
+
+	bch2_journal_pin_drop(&c->journal, &as->journal);
+	bch2_journal_pin_flush(&c->journal, &as->journal);
+	bch2_disk_reservation_put(c, &as->disk_res);
+	bch2_btree_reserve_put(as, trans);
+
+	bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
+			       as->start_time);
+
+	mutex_lock(&c->btree_interior_update_lock);
+	list_del(&as->unwritten_list);
+	list_del(&as->list);
+
+	closure_debug_destroy(&as->cl);
+	mempool_free(as, &c->btree_interior_update_pool);
+
+	/*
+	 * Have to do the wakeup with btree_interior_update_lock still held,
+	 * since being on btree_interior_update_list is our ref on @c:
+	 */
+	closure_wake_up(&c->btree_interior_update_wait);
+
+	mutex_unlock(&c->btree_interior_update_lock);
+}
+/* Append a copy of @b's key to @keys, with b->c.level + 1 stored in the copy's mem_ptr field. */
+static void btree_update_add_key(struct btree_update *as,
+				 struct keylist *keys, struct btree *b)
+{
+	struct bkey_i *k = &b->key;
+
+	BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s >
+	       ARRAY_SIZE(as->_old_keys));
+
+	bkey_copy(keys->top, k);
+	bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1;
+
+	bch2_keylist_push(keys);
+}
+
+/*
+ * The transactional part of an interior btree node update, where we journal the
+ * update we did to the interior node and update alloc info:
+ */
+static int btree_update_nodes_written_trans(struct btree_trans *trans,
+					    struct btree_update *as)
+{
+	struct bkey_i *k;
+	int ret;
+
+	ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s);
+	if (ret)
+		return ret;
+
+	memcpy(&darray_top(trans->extra_journal_entries),
+	       as->journal_entries,
+	       as->journal_u64s * sizeof(u64));
+	trans->extra_journal_entries.nr += as->journal_u64s;
+
+	trans->journal_pin = &as->journal;
+
+	for_each_keylist_key(&as->old_keys, k) {
+		unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; /* node level + 1 */
+
+		ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0);
+		if (ret)
+			return ret;
+	}
+
+	for_each_keylist_key(&as->new_keys, k) {
+		unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; /* node level + 1 */
+
+		ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+/* Finish @as once its new nodes are written: journal it, unblock as->b, release and free @as. */
+static void btree_update_nodes_written(struct btree_update *as)
+{
+	struct bch_fs *c = as->c;
+	struct btree *b;
+	struct btree_trans *trans = bch2_trans_get(c);
+	u64 journal_seq = 0;
+	unsigned i;
+	int ret;
+
+	/*
+	 * If we're already in an error state, it might be because a btree node
+	 * was never written, and we might be trying to free that same btree
+	 * node here, but it won't have been marked as allocated and we'll see
+	 * spurious disk usage inconsistencies in the transactional part below
+	 * if we don't skip it:
+	 */
+	ret = bch2_journal_error(&c->journal);
+	if (ret)
+		goto err;
+
+	/*
+	 * Wait for any in flight writes to finish before we free the old nodes
+	 * on disk:
+	 */
+	for (i = 0; i < as->nr_old_nodes; i++) {
+		__le64 seq;
+
+		b = as->old_nodes[i];
+
+		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+		seq = b->data ? b->data->keys.seq : 0;
+		six_unlock_read(&b->c.lock);
+
+		if (seq == as->old_nodes_seq[i])
+			wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
+				       TASK_UNINTERRUPTIBLE);
+	}
+
+	/*
+	 * We did an update to a parent node where the pointers we added pointed
+	 * to child nodes that weren't written yet: now, the child nodes have
+	 * been written so we can write out the update to the interior node.
+	 */
+
+	/*
+	 * We can't call into journal reclaim here: we'd block on the journal
+	 * reclaim lock, but we may need to release the open buckets we have
+	 * pinned in order for other btree updates to make forward progress, and
+	 * journal reclaim does btree updates when flushing bkey_cached entries,
+	 * which may require allocations as well.
+	 */
+	ret = commit_do(trans, &as->disk_res, &journal_seq,
+			BCH_WATERMARK_reclaim|
+			BTREE_INSERT_NOFAIL|
+			BTREE_INSERT_NOCHECK_RW|
+			BTREE_INSERT_JOURNAL_RECLAIM,
+			btree_update_nodes_written_trans(trans, as));
+	bch2_trans_unlock(trans);
+
+	bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
+			     "%s(): error %s", __func__, bch2_err_str(ret));
+err:
+	if (as->b) {
+		struct btree_path *path;
+
+		b = as->b;
+		path = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p);
+		/*
+		 * @b is the node we did the final insert into:
+		 *
+		 * On failure to get a journal reservation, we still have to
+		 * unblock the write and allow most of the write path to happen
+		 * so that shutdown works, but the i->journal_seq mechanism
+		 * won't work to prevent the btree write from being visible (we
+		 * didn't get a journal sequence number) - instead
+		 * __bch2_btree_node_write() doesn't do the actual write if
+		 * we're in journal error state:
+		 */
+
+		/*
+		 * Ensure transaction is unlocked before using
+		 * btree_node_lock_nopath() (the use of which is always suspect,
+		 * we need to work on removing this in the future)
+		 *
+		 * It should be, but get_unlocked_mut_path() -> bch2_path_get()
+		 * calls bch2_path_upgrade(), before we call path_make_mut(), so
+		 * we may rarely end up with a locked path besides the one we
+		 * have here:
+		 */
+		bch2_trans_unlock(trans);
+		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+		mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED);
+		path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
+		path->l[b->c.level].b = b;
+
+		bch2_btree_node_lock_write_nofail(trans, path, &b->c);
+
+		mutex_lock(&c->btree_interior_update_lock);
+
+		list_del(&as->write_blocked_list);
+		if (list_empty(&b->write_blocked))
+			clear_btree_node_write_blocked(b);
+
+		/*
+		 * Node might have been freed, recheck under
+		 * btree_interior_update_lock:
+		 */
+		if (as->b == b) {
+			BUG_ON(!b->c.level);
+			BUG_ON(!btree_node_dirty(b));
+
+			if (!ret) {
+				struct bset *last = btree_bset_last(b);
+
+				last->journal_seq = cpu_to_le64(
+							     max(journal_seq,
+								 le64_to_cpu(last->journal_seq)));
+
+				bch2_btree_add_journal_pin(c, b, journal_seq);
+			} else {
+				/*
+				 * If we didn't get a journal sequence number we
+				 * can't write this btree node, because recovery
+				 * won't know to ignore this write:
+				 */
+				set_btree_node_never_write(b);
+			}
+		}
+
+		mutex_unlock(&c->btree_interior_update_lock);
+
+		mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
+		six_unlock_write(&b->c.lock);
+
+		btree_node_write_if_need(c, b, SIX_LOCK_intent);
+		btree_node_unlock(trans, path, b->c.level);
+		bch2_path_put(trans, path, true);
+	}
+
+	bch2_journal_pin_drop(&c->journal, &as->journal);
+
+	bch2_journal_preres_put(&c->journal, &as->journal_preres);
+
+	mutex_lock(&c->btree_interior_update_lock);
+	for (i = 0; i < as->nr_new_nodes; i++) {
+		b = as->new_nodes[i];
+
+		BUG_ON(b->will_make_reachable != (unsigned long) as);
+		b->will_make_reachable = 0;
+		clear_btree_node_will_make_reachable(b);
+	}
+	mutex_unlock(&c->btree_interior_update_lock);
+
+	for (i = 0; i < as->nr_new_nodes; i++) {
+		b = as->new_nodes[i];
+
+		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+		btree_node_write_if_need(c, b, SIX_LOCK_read);
+		six_unlock_read(&b->c.lock);
+	}
+
+	for (i = 0; i < as->nr_open_buckets; i++)
+		bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
+
+	bch2_btree_update_free(as, trans);
+	bch2_trans_put(trans);
+}
+
+/*
+ * Work item: complete queued updates in order until one isn't fully written.
+ */
+static void btree_interior_update_work(struct work_struct *work)
+{
+	struct bch_fs *c = container_of(work, struct bch_fs, btree_interior_update_work);
+	struct btree_update *pending;
+	bool ready;
+
+	for (;;) {
+		mutex_lock(&c->btree_interior_update_lock);
+		pending = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
+						   struct btree_update, unwritten_list);
+		ready = pending && pending->nodes_written;
+		mutex_unlock(&c->btree_interior_update_lock);
+		if (!ready)
+			return;
+		btree_update_nodes_written(pending);
+	}
+}
+/* Flag the btree_update embedding @cl as having its nodes written and queue the update work. */
+static void btree_update_set_nodes_written(struct closure *cl)
+{
+	struct btree_update *as = container_of(cl, struct btree_update, cl);
+	struct bch_fs *c = as->c;
+
+	mutex_lock(&c->btree_interior_update_lock);
+	as->nodes_written = true;
+	mutex_unlock(&c->btree_interior_update_lock);
+
+	queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
+}
+
+/*
+ * We're updating @b with pointers to nodes that haven't finished writing yet:
+ * queue @as as unwritten and block @b from being written until @as completes
+ */
+static void btree_update_updated_node(struct btree_update *as, struct btree *b)
+{
+	struct bch_fs *c = as->c;
+
+	mutex_lock(&c->btree_interior_update_lock);
+	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
+
+	BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+	BUG_ON(!btree_node_dirty(b));
+	BUG_ON(!b->c.level);
+
+	as->mode	= BTREE_INTERIOR_UPDATING_NODE;
+	as->b		= b;
+
+	set_btree_node_write_blocked(b);
+	list_add(&as->write_blocked_list, &b->write_blocked);
+
+	mutex_unlock(&c->btree_interior_update_lock);
+}
+/* Switch @child to BTREE_INTERIOR_UPDATING_AS with no node, and copy its journal pin to @as. */
+static void btree_update_reparent(struct btree_update *as,
+				  struct btree_update *child)
+{
+	struct bch_fs *c = as->c;
+
+	lockdep_assert_held(&c->btree_interior_update_lock);
+
+	child->b = NULL;
+	child->mode = BTREE_INTERIOR_UPDATING_AS;
+
+	bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
+}
+/* Record @b's key as a btree_root journal entry in @as, then queue @as as unwritten. */
+static void btree_update_updated_root(struct btree_update *as, struct btree *b)
+{
+	struct bkey_i *insert = &b->key;
+	struct bch_fs *c = as->c;
+
+	BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+
+	BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
+	       ARRAY_SIZE(as->journal_entries));
+
+	as->journal_u64s +=
+		journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
+				  BCH_JSET_ENTRY_btree_root,
+				  b->c.btree_id, b->c.level,
+				  insert, insert->k.u64s);
+
+	mutex_lock(&c->btree_interior_update_lock);
+	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
+
+	as->mode	= BTREE_INTERIOR_UPDATING_ROOT;
+	mutex_unlock(&c->btree_interior_update_lock);
+}
+
+/*
+ * bch2_btree_update_add_new_node:
+ *
+ * This causes @as to wait on @b to be written, before it gets to
+ * btree_update_nodes_written()
+ *
+ * Additionally, it sets b->will_make_reachable to prevent any additional writes
+ * to @b from happening besides the first until @b is reachable on disk
+ *
+ * And it adds @b to the list of @as's new nodes, so that we can update sector
+ * counts in btree_update_nodes_written():
+ */
+static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
+{
+	struct bch_fs *c = as->c;
+
+	closure_get(&as->cl);
+
+	mutex_lock(&c->btree_interior_update_lock);
+	BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
+	BUG_ON(b->will_make_reachable);
+
+	as->new_nodes[as->nr_new_nodes++] = b;
+	b->will_make_reachable = 1UL|(unsigned long) as;
+	set_btree_node_will_make_reachable(b);
+
+	mutex_unlock(&c->btree_interior_update_lock);
+
+	btree_update_add_key(as, &as->new_keys, b);
+
+	if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+		unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data;
+		unsigned sectors = round_up(bytes, block_bytes(c)) >> 9;
+
+		bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
+			cpu_to_le16(sectors);
+	}
+}
+
+/*
+ * If @b is still a new node of some btree_update, remove it from that update
+ */
+static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
+{
+	struct btree_update *as;
+	unsigned long v;
+	unsigned i;
+
+	mutex_lock(&c->btree_interior_update_lock);
+	/*
+	 * When b->will_make_reachable != 0, it owns a ref on as->cl that's
+	 * dropped when it gets written by bch2_btree_complete_write - the
+	 * xchg() is for synchronization with bch2_btree_complete_write:
+	 */
+	v = xchg(&b->will_make_reachable, 0);
+	clear_btree_node_will_make_reachable(b);
+	as = (struct btree_update *) (v & ~1UL);
+
+	if (!as) {
+		mutex_unlock(&c->btree_interior_update_lock);
+		return;
+	}
+
+	for (i = 0; i < as->nr_new_nodes; i++)
+		if (as->new_nodes[i] == b)
+			goto found;
+
+	BUG();
+found:
+	array_remove_item(as->new_nodes, as->nr_new_nodes, i);
+	mutex_unlock(&c->btree_interior_update_lock);
+
+	if (v & 1)
+		closure_put(&as->cl);
+}
+
+/* Move every entry of @b->ob onto the end of @as->open_buckets, last entry first. */
+static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
+{
+	for (; b->ob.nr; as->nr_open_buckets++)
+		as->open_buckets[as->nr_open_buckets] = b->ob.v[--b->ob.nr];
+}
+
+/*
+ * @b is being split/rewritten: it may have pointers to not-yet-written btree
+ * nodes and thus outstanding btree_updates - redirect @b's
+ * btree_updates to point to this btree_update:
+ *
+ * Besides reparenting those updates, this marks @b dying, clears its
+ * dirty/need_write/write_blocked state, copies its journal pins to
+ * as->journal, drops it from any update that was making it reachable and
+ * records it in as->old_keys and as->old_nodes[]. Fake nodes are only marked
+ * dying.
+ */
+static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
+						      struct btree *b)
+{
+	struct bch_fs *c = as->c;
+	struct btree_update *p, *n;
+	struct btree_write *w;
+
+	set_btree_node_dying(b);
+
+	if (btree_node_fake(b))
+		return;
+
+	mutex_lock(&c->btree_interior_update_lock);
+
+	/*
+	 * Does this node have any btree_update operations preventing
+	 * it from being written?
+	 *
+	 * If so, redirect them to point to this btree_update: we can
+	 * write out our new nodes, but we won't make them visible until those
+	 * operations complete
+	 */
+	list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
+		list_del_init(&p->write_blocked_list);
+		btree_update_reparent(as, p);
+
+		/*
+		 * for flush_held_btree_writes() waiting on updates to flush or
+		 * nodes to be writeable:
+		 */
+		closure_wake_up(&c->btree_interior_update_wait);
+	}
+
+	clear_btree_node_dirty_acct(c, b);
+	clear_btree_node_need_write(b);
+	clear_btree_node_write_blocked(b);
+
+	/*
+	 * Does this node have unwritten data that has a pin on the journal?
+	 *
+	 * If so, transfer that pin to the btree_update operation -
+	 * note that if we're freeing multiple nodes, we only need to keep the
+	 * oldest pin of any of the nodes we're freeing. We'll release the pin
+	 * when the new nodes are persistent and reachable on disk:
+	 *
+	 * Both the current and the previous btree_write of @b are handled.
+	 */
+	w = btree_current_write(b);
+	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
+	bch2_journal_pin_drop(&c->journal, &w->journal);
+
+	w = btree_prev_write(b);
+	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
+	bch2_journal_pin_drop(&c->journal, &w->journal);
+
+	mutex_unlock(&c->btree_interior_update_lock);
+
+	/*
+	 * Is this a node that isn't reachable on disk yet?
+	 *
+	 * Nodes that aren't reachable yet have writes blocked until they're
+	 * reachable - now that we've cancelled any pending writes and moved
+	 * things waiting on that write to wait on this update, we can drop this
+	 * node from the list of nodes that the other update is making
+	 * reachable, prior to freeing it:
+	 */
+	btree_update_drop_new_node(c, b);
+
+	btree_update_add_key(as, &as->old_keys, b);
+
+	as->old_nodes[as->nr_old_nodes] = b;
+	as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
+	as->nr_old_nodes++;
+}
+
+/*
+ * Finish the foreground part of an interior update: release gc_lock if this
+ * update took it, call bch2_btree_reserve_put(), and hand @as off to
+ * btree_update_set_nodes_written() on the interior update workqueue.
+ */
+static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans)
+{
+	struct bch_fs *c = as->c;
+	/* sampled up front: @as is not dereferenced after continue_at() below */
+	u64 started = as->start_time;
+
+	BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
+
+	if (as->took_gc_lock)
+		up_read(&c->gc_lock);
+	as->took_gc_lock = false;
+
+	bch2_btree_reserve_put(as, trans);
+
+	continue_at(&as->cl, btree_update_set_nodes_written,
+		    c->btree_interior_update_worker);
+
+	bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
+			       started);
+}
+
+static struct btree_update *
+bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
+			unsigned level, bool split, unsigned flags)
+{	/*
+	 * Allocate and initialise a btree_update for an interior update starting
+	 * at @level, taking gc_lock (unless the caller already holds it), a
+	 * journal pre-reservation, a disk reservation and a reserve of btree
+	 * nodes. Returns the update, or an ERR_PTR() on failure.
+	 */
+	struct bch_fs *c = trans->c;
+	struct btree_update *as;
+	u64 start_time = local_clock();
+	int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
+		? BCH_DISK_RESERVATION_NOFAIL : 0;
+	unsigned nr_nodes[2] = { 0, 0 };
+	unsigned update_level = level;
+	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
+	unsigned journal_flags = 0;
+	int ret = 0;
+	u32 restart_count = trans->restart_count;
+
+	BUG_ON(!path->should_be_locked);
+
+	if (watermark == BCH_WATERMARK_copygc)
+		watermark = BCH_WATERMARK_btree_copygc;
+	if (watermark < BCH_WATERMARK_btree)
+		watermark = BCH_WATERMARK_btree;	/* never below the btree watermark */
+
+	flags &= ~BCH_WATERMARK_MASK;
+	flags |= watermark;
+
+	if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
+		journal_flags |= JOURNAL_RES_GET_NONBLOCK;
+	journal_flags |= watermark;
+
+	while (1) {	/* walk up the tree, counting the nodes the update may need */
+		nr_nodes[!!update_level] += 1 + split;	/* [0]: level 0, [1]: levels above */
+		update_level++;
+
+		ret = bch2_btree_path_upgrade(trans, path, update_level + 1);
+		if (ret)
+			return ERR_PTR(ret);
+
+		if (!btree_path_node(path, update_level)) {
+			/* Allocating new root? */
+			nr_nodes[1] += split;
+			update_level = BTREE_MAX_DEPTH;
+			break;
+		}
+
+		if (bch2_btree_node_insert_fits(c, path->l[update_level].b,
+					BKEY_BTREE_PTR_U64s_MAX * (1 + split)))
+			break;
+
+		split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
+	}
+
+	if (flags & BTREE_INSERT_GC_LOCK_HELD)
+		lockdep_assert_held(&c->gc_lock);
+	else if (!down_read_trylock(&c->gc_lock)) {
+		ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
+		if (ret) {	/* the expression above yields 0 and took gc_lock: release it */
+			up_read(&c->gc_lock);
+			return ERR_PTR(ret);
+		}
+	}
+
+	as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
+	memset(as, 0, sizeof(*as));
+	closure_init(&as->cl, NULL);
+	as->c		= c;
+	as->start_time	= start_time;
+	as->mode	= BTREE_INTERIOR_NO_UPDATE;
+	as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
+	as->btree_id	= path->btree_id;
+	as->update_level = update_level;
+	INIT_LIST_HEAD(&as->list);
+	INIT_LIST_HEAD(&as->unwritten_list);
+	INIT_LIST_HEAD(&as->write_blocked_list);
+	bch2_keylist_init(&as->old_keys, as->_old_keys);
+	bch2_keylist_init(&as->new_keys, as->_new_keys);
+	bch2_keylist_init(&as->parent_keys, as->inline_keys);
+
+	mutex_lock(&c->btree_interior_update_lock);
+	list_add_tail(&as->list, &c->btree_interior_update_list);
+	mutex_unlock(&c->btree_interior_update_lock);
+
+	/*
+	 * We don't want to allocate if we're in an error state, that can cause
+	 * deadlock on emergency shutdown due to open buckets getting stuck in
+	 * the btree_reserve_cache after allocator shutdown has cleared it out.
+	 * This check needs to come after adding us to the btree_interior_update
+	 * list but before calling bch2_btree_reserve_get, to synchronize with
+	 * __bch2_fs_read_only().
+	 */
+	ret = bch2_journal_error(&c->journal);
+	if (ret)
+		goto err;
+
+	ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
+				      BTREE_UPDATE_JOURNAL_RES,
+				      journal_flags|JOURNAL_RES_GET_NONBLOCK);	/* first attempt: nonblocking */
+	if (ret) {
+		if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
+			ret = -BCH_ERR_journal_reclaim_would_deadlock;
+			goto err;
+		}
+
+		ret = drop_locks_do(trans,	/* retry without the forced NONBLOCK flag */
+			bch2_journal_preres_get(&c->journal, &as->journal_preres,
+					      BTREE_UPDATE_JOURNAL_RES,
+					      journal_flags));
+		if (ret == -BCH_ERR_journal_preres_get_blocked) {
+			trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags);
+			ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get);
+		}
+		if (ret)
+			goto err;
+	}
+
+	ret = bch2_disk_reservation_get(c, &as->disk_res,
+			(nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
+			c->opts.metadata_replicas,
+			disk_res_flags);
+	if (ret)
+		goto err;
+
+	ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
+	if (bch2_err_matches(ret, ENOSPC) ||
+	    bch2_err_matches(ret, ENOMEM)) {
+		struct closure cl;
+
+		/*
+		 * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
+		 * flag
+		 */
+		if (bch2_err_matches(ret, ENOSPC) &&
+		    (flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+		    watermark != BCH_WATERMARK_reclaim) {
+			ret = -BCH_ERR_journal_reclaim_would_deadlock;
+			goto err;
+		}
+
+		closure_init_stack(&cl);
+
+		do {	/* retry with a closure, waiting on it for as long as the attempt reports "blocked" */
+			ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);
+
+			bch2_trans_unlock(trans);
+			closure_sync(&cl);
+		} while (bch2_err_matches(ret, BCH_ERR_operation_blocked));
+	}
+
+	if (ret) {
+		trace_and_count(c, btree_reserve_get_fail, trans->fn,
+				_RET_IP_, nr_nodes[0] + nr_nodes[1], ret);
+		goto err;
+	}
+
+	ret = bch2_trans_relock(trans);	/* locks may have been dropped in the retry loop above */
+	if (ret)
+		goto err;
+
+	bch2_trans_verify_not_restarted(trans, restart_count);
+	return as;
+err:
+	bch2_btree_update_free(as, trans);
+	return ERR_PTR(ret);
+}
+
+/* Btree root updates: */
+
+/*
+ * Make @b the in-memory root of its btree: take it off the btree cache list,
+ * publish it under btree_root_lock, then recalculate the btree reserve.
+ */
+static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
+{
+	struct btree *cur_root;
+
+	/* Root nodes cannot be reaped */
+	mutex_lock(&c->btree_cache.lock);
+	list_del_init(&b->list);
+	mutex_unlock(&c->btree_cache.lock);
+
+	mutex_lock(&c->btree_root_lock);
+
+	/* An existing root must already be dying and must not be above @b's level */
+	cur_root = btree_node_root(c, b);
+	BUG_ON(cur_root &&
+	       (b->c.level < cur_root->c.level ||
+		!btree_node_dying(cur_root)));
+
+	bch2_btree_id_root(c, b->c.btree_id)->b = b;
+	mutex_unlock(&c->btree_root_lock);
+
+	bch2_recalc_btree_reserve(c);
+}
+
+/*
+ * Install @b as the new root on behalf of @as: the old root is write-locked
+ * across the switch so that nobody can be using it while the root changes.
+ */
+static void bch2_btree_set_root(struct btree_update *as,
+				struct btree_trans *trans,
+				struct btree_path *path,
+				struct btree *b)
+{
+	struct bch_fs *c = as->c;
+	struct btree *prev_root;
+
+	trace_and_count(c, btree_node_set_root, c, b);
+
+	prev_root = btree_node_root(c, b);
+
+	/* Keep everyone off the old root for the duration of the switch: */
+	bch2_btree_node_lock_write_nofail(trans, path, &prev_root->c);
+
+	bch2_btree_set_root_inmem(c, b);
+	btree_update_updated_root(as, b);
+
+	/*
+	 * Only unlock the old root once the new one is visible.
+	 *
+	 * The new root isn't persistent yet, but that's fine: we still hold
+	 * an intent lock on it, and any update that would depend on the new
+	 * root would have to update the new root.
+	 */
+	bch2_btree_node_unlock_write(trans, path, prev_root);
+}
+
+/* Interior node updates: */
+
+static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
+					struct btree_trans *trans,
+					struct btree_path *path,
+					struct btree *b,
+					struct btree_node_iter *node_iter,
+					struct bkey_i *insert)
+{	/*
+	 * Insert one key into node @b on behalf of @as: check it, append it to
+	 * as->journal_entries, advance @node_iter to the insert position,
+	 * insert it, and flag @b dirty and in need of an interior-type write.
+	 */
+	struct bch_fs *c = as->c;
+	struct bkey_packed *k;
+	struct printbuf buf = PRINTBUF;
+	unsigned long old, new, v;
+
+	BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
+	       !btree_ptr_sectors_written(insert));
+
+	if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
+		bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
+
+	if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
+			      btree_node_type(b), WRITE, &buf) ?:
+	    bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf)) {
+		printbuf_reset(&buf);	/* rebuild the message with the key printed first */
+		prt_printf(&buf, "inserting invalid bkey\n  ");
+		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+		prt_printf(&buf, "\n  ");
+		bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
+				  btree_node_type(b), WRITE, &buf);
+		bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf);
+
+		bch2_fs_inconsistent(c, "%s", buf.buf);
+		dump_stack();	/* reported only: there is no early return, the insert still proceeds */
+	}
+
+	BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
+	       ARRAY_SIZE(as->journal_entries));
+
+	as->journal_u64s +=
+		journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
+				  BCH_JSET_ENTRY_btree_keys,
+				  b->c.btree_id, b->c.level,
+				  insert, insert->k.u64s);
+
+	while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
+	       bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
+		bch2_btree_node_iter_advance(node_iter, b);	/* skip keys positioned before @insert */
+
+	bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
+	set_btree_node_dirty_acct(c, b);
+
+	v = READ_ONCE(b->flags);
+	do {
+		old = new = v;
+
+		new &= ~BTREE_WRITE_TYPE_MASK;
+		new |= BTREE_WRITE_interior;
+		new |= 1 << BTREE_NODE_need_write;
+	} while ((v = cmpxchg(&b->flags, old, new)) != old);	/* retry if b->flags changed underneath us */
+
+	printbuf_exit(&buf);
+}
+
+static void
+__bch2_btree_insert_keys_interior(struct btree_update *as,
+				  struct btree_trans *trans,
+				  struct btree_path *path,
+				  struct btree *b,
+				  struct btree_node_iter node_iter,
+				  struct keylist *keys)
+{	/*
+	 * Insert keys from the front of @keys into @b, stopping at the first
+	 * key positioned past b->key.k.p; each inserted key is popped from
+	 * @keys. @node_iter is taken by value, so the caller's copy is not
+	 * modified.
+	 */
+	struct bkey_i *insert = bch2_keylist_front(keys);
+	struct bkey_packed *k;
+
+	BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);
+
+	while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
+	       (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
+		;	/* step the iterator back past keys at or after the first insert position */
+
+	while (!bch2_keylist_empty(keys)) {
+		insert = bch2_keylist_front(keys);
+
+		if (bpos_gt(insert->k.p, b->key.k.p))
+			break;	/* remaining keys are left on @keys for the caller */
+
+		bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
+		bch2_keylist_pop_front(keys);
+	}
+}
+
+/*
+ * Distribute the non-deleted keys of @b between two new nodes: n[0] (the lower
+ * node) receives keys until about 3/5 of @b's live u64s have been consumed,
+ * n[1] (the higher node) receives the rest.
+ *
+ * The first pass over @b picks the pivot position and accumulates a key format
+ * for each half; the min/max keys and formats of both nodes are then set, and
+ * a second pass repacks every key into its destination node.
+ */
+static void __btree_split_node(struct btree_update *as,
+			       struct btree_trans *trans,
+			       struct btree *b,
+			       struct btree *n[2])
+{
+	struct bkey_packed *k;
+	struct bpos n1_pos = POS_MIN;
+	struct btree_node_iter iter;
+	struct bset *bsets[2];
+	struct bkey_format_state format[2];
+	struct bkey_packed *out[2];
+	struct bkey uk;
+	unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5;
+	int i;
+
+	for (i = 0; i < 2; i++) {
+		BUG_ON(n[i]->nsets != 1);
+
+		bsets[i] = btree_bset_first(n[i]);
+		out[i] = bsets[i]->start;
+
+		SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1);
+		bch2_bkey_format_init(&format[i]);
+	}
+
+	u64s = 0;
+	for_each_btree_node_key(b, k, &iter) {
+		if (bkey_deleted(k))
+			continue;
+
+		i = u64s >= n1_u64s;	/* 0: lower node, 1: higher node */
+		u64s += k->u64s;
+		uk = bkey_unpack_key(b, k);
+		if (!i)
+			n1_pos = uk.p;	/* ends up as the last position assigned to n[0] */
+		bch2_bkey_format_add_key(&format[i], &uk);
+	}
+
+	btree_set_min(n[0], b->data->min_key);
+	btree_set_max(n[0], n1_pos);
+	btree_set_min(n[1], bpos_successor(n1_pos));
+	btree_set_max(n[1], b->data->max_key);
+
+	for (i = 0; i < 2; i++) {
+		bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key);
+		bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key);
+
+		n[i]->data->format = bch2_bkey_format_done(&format[i]);
+		btree_node_set_format(n[i], n[i]->data->format);
+	}
+
+	u64s = 0;
+	for_each_btree_node_key(b, k, &iter) {
+		if (bkey_deleted(k))
+			continue;
+
+		i = u64s >= n1_u64s;	/* same partition rule as the first pass */
+		u64s += k->u64s;
+
+		if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k)
+					? &b->format: &bch2_bkey_format_current, k))
+			out[i]->format = KEY_FORMAT_LOCAL_BTREE;
+		else	/* transform returned false: store the key unpacked instead */
+			bch2_bkey_unpack(b, (void *) out[i], k);
+
+		out[i]->needs_whiteout = false;
+
+		btree_keys_account_key_add(&n[i]->nr, 0, out[i]);
+		out[i] = bkey_p_next(out[i]);
+	}
+
+	for (i = 0; i < 2; i++) {
+		bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data);
+
+		BUG_ON(!bsets[i]->u64s);	/* neither half may end up empty */
+
+		set_btree_bset_end(n[i], n[i]->set);
+
+		btree_node_reset_sib_u64s(n[i]);
+
+		bch2_verify_btree_nr_keys(n[i]);
+
+		if (b->c.level)
+			btree_node_interior_verify(as->c, n[i]);
+	}
+}
+
+/*
+ * Insert the keys from @keys that belong in @b (those at or before
+ * b->data->max_key); does nothing if @keys is empty or its first key lies past
+ * @b's max key.
+ *
+ * For interior node updates the insert has to happen before the split: what
+ * we're inserting must go in atomically, and after a split the keys might
+ * belong in different nodes, so the update would no longer be atomic.
+ *
+ * Worse, when the insert comes from btree node coalescing, doing it after the
+ * split (and after picking the pivot) could leave the pivot between nodes that
+ * were coalesced - i.e. in the middle of a child node post coalescing.
+ */
+static void btree_split_insert_keys(struct btree_update *as,
+				    struct btree_trans *trans,
+				    struct btree_path *path,
+				    struct btree *b,
+				    struct keylist *keys)
+{
+	struct btree_node_iter node_iter;
+
+	if (bch2_keylist_empty(keys))
+		return;
+
+	if (!bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key))
+		return;
+
+	bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
+
+	__bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
+
+	btree_node_interior_verify(as->c, b);
+}
+
+static int btree_split(struct btree_update *as, struct btree_trans *trans,
+		       struct btree_path *path, struct btree *b,
+		       struct keylist *keys, unsigned flags)
+{	/*
+	 * Replace @b with newly allocated node(s): two nodes (plus a new root
+	 * when @b has no parent) if @b is over BTREE_SPLIT_THRESHOLD, otherwise
+	 * a single replacement node. @keys, if non-NULL, are inserted into the
+	 * new node(s) first. Returns 0, or the error from updating the parent.
+	 */
+	struct bch_fs *c = as->c;
+	struct btree *parent = btree_node_parent(path, b);
+	struct btree *n1, *n2 = NULL, *n3 = NULL;
+	struct btree_path *path1 = NULL, *path2 = NULL;
+	u64 start_time = local_clock();
+	int ret = 0;
+
+	BUG_ON(!parent && (b != btree_node_root(c, b)));
+	BUG_ON(parent && !btree_node_intent_locked(path, b->c.level + 1));
+
+	bch2_btree_interior_update_will_free_node(as, b);
+
+	if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
+		struct btree *n[2];
+
+		trace_and_count(c, btree_node_split, c, b);
+
+		n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
+		n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);
+
+		__btree_split_node(as, trans, b, n);
+
+		if (keys) {
+			btree_split_insert_keys(as, trans, path, n1, keys);
+			btree_split_insert_keys(as, trans, path, n2, keys);
+			BUG_ON(!bch2_keylist_empty(keys));	/* every key must land in n1 or n2 */
+		}
+
+		bch2_btree_build_aux_trees(n2);
+		bch2_btree_build_aux_trees(n1);
+
+		bch2_btree_update_add_new_node(as, n1);
+		bch2_btree_update_add_new_node(as, n2);
+		six_unlock_write(&n2->c.lock);
+		six_unlock_write(&n1->c.lock);
+
+		path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
+		six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
+		mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
+		bch2_btree_path_level_init(trans, path1, n1);
+
+		path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p);
+		six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
+		mark_btree_node_locked(trans, path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
+		bch2_btree_path_level_init(trans, path2, n2);
+
+		/*
+		 * Note that on recursive parent_keys == keys, so we
+		 * can't start adding new keys to parent_keys before emptying it
+		 * out (which we did with btree_split_insert_keys() above)
+		 */
+		bch2_keylist_add(&as->parent_keys, &n1->key);
+		bch2_keylist_add(&as->parent_keys, &n2->key);
+
+		if (!parent) {
+			/* Depth increases, make a new root */
+			n3 = __btree_root_alloc(as, trans, b->c.level + 1);
+
+			bch2_btree_update_add_new_node(as, n3);
+			six_unlock_write(&n3->c.lock);
+
+			path2->locks_want++;
+			BUG_ON(btree_node_locked(path2, n3->c.level));
+			six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
+			mark_btree_node_locked(trans, path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
+			bch2_btree_path_level_init(trans, path2, n3);
+
+			n3->sib_u64s[0] = U16_MAX;
+			n3->sib_u64s[1] = U16_MAX;
+
+			btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
+		}
+	} else {
+		trace_and_count(c, btree_node_compact, c, b);
+
+		n1 = bch2_btree_node_alloc_replacement(as, trans, b);
+
+		if (keys) {
+			btree_split_insert_keys(as, trans, path, n1, keys);
+			BUG_ON(!bch2_keylist_empty(keys));
+		}
+
+		bch2_btree_build_aux_trees(n1);
+		bch2_btree_update_add_new_node(as, n1);
+		six_unlock_write(&n1->c.lock);
+
+		path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
+		six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
+		mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
+		bch2_btree_path_level_init(trans, path1, n1);
+
+		if (parent)
+			bch2_keylist_add(&as->parent_keys, &n1->key);
+	}
+
+	/*
+	 * New nodes are set up in memory, now make them visible (the node
+	 * writes themselves are issued further down):
+	 */
+
+	if (parent) {
+		/* Split a non root node */
+		ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+		if (ret)
+			goto err;
+	} else if (n3) {
+		bch2_btree_set_root(as, trans, path, n3);
+	} else {
+		/* Root filled up but didn't need to be split */
+		bch2_btree_set_root(as, trans, path, n1);
+	}
+
+	if (n3) {
+		bch2_btree_update_get_open_buckets(as, n3);
+		bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
+	}
+	if (n2) {
+		bch2_btree_update_get_open_buckets(as, n2);
+		bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0);
+	}
+	bch2_btree_update_get_open_buckets(as, n1);
+	bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
+
+	/*
+	 * The old node must be freed (in memory) _before_ unlocking the new
+	 * nodes - else another thread could re-acquire a read lock on the old
+	 * node after another thread has locked and updated the new node, thus
+	 * seeing stale data:
+	 */
+	bch2_btree_node_free_inmem(trans, path, b);
+
+	if (n3)
+		bch2_trans_node_add(trans, n3);
+	if (n2)
+		bch2_trans_node_add(trans, n2);
+	bch2_trans_node_add(trans, n1);
+
+	if (n3)
+		six_unlock_intent(&n3->c.lock);
+	if (n2)
+		six_unlock_intent(&n2->c.lock);
+	six_unlock_intent(&n1->c.lock);
+out:
+	if (path2) {
+		__bch2_btree_path_unlock(trans, path2);
+		bch2_path_put(trans, path2, true);
+	}
+	if (path1) {
+		__bch2_btree_path_unlock(trans, path1);
+		bch2_path_put(trans, path1, true);
+	}
+
+	bch2_trans_verify_locks(trans);
+
+	bch2_time_stats_update(&c->times[n2	/* n2 is only set on the split path */
+			       ? BCH_TIME_btree_node_split
+			       : BCH_TIME_btree_node_compact],
+			       start_time);
+	return ret;
+err:	/* inserting into @parent failed: release the new nodes, then run the common cleanup */
+	if (n3)
+		bch2_btree_node_free_never_used(as, trans, n3);
+	if (n2)
+		bch2_btree_node_free_never_used(as, trans, n2);
+	bch2_btree_node_free_never_used(as, trans, n1);
+	goto out;
+}
+
+/*
+ * Insert @keys into @b using @path's node iterator for @b's level, notify @as
+ * that @b was updated, then re-peek the node iterator of every path in @trans
+ * that has @b.
+ */
+static void
+bch2_btree_insert_keys_interior(struct btree_update *as,
+				struct btree_trans *trans,
+				struct btree_path *path,
+				struct btree *b,
+				struct keylist *keys)
+{
+	struct btree_path *other;
+
+	/* The iterator is handed over by value; @path's own copy is left as-is */
+	__bch2_btree_insert_keys_interior(as, trans, path, b,
+					  path->l[b->c.level].iter, keys);
+	btree_update_updated_node(as, b);
+
+	trans_for_each_path_with_node(trans, b, other) {
+		bch2_btree_node_iter_peek(&other->l[b->c.level].iter, b);
+	}
+
+	bch2_trans_verify_paths(trans);
+}
+
+/**
+ * bch2_btree_insert_node - insert bkeys into a given btree node
+ *
+ * @as:			btree_update object
+ * @trans:		btree_trans object
+ * @path:		path that points to current node
+ * @b:			node to insert keys into
+ * @keys:		list of keys to insert
+ * @flags:		transaction commit flags
+ *
+ * Returns: 0 on success, typically transaction restart error on failure
+ *
+ * @b must be an interior node (level 0 trips a BUG_ON); the caller must hold
+ * gc_lock and an intent lock on @b. If @keys fit, they are inserted under @b's
+ * write lock. If they do not fit, @b is split via btree_split() when its level
+ * is below as->update_level; otherwise the transaction is restarted with
+ * BCH_ERR_transaction_restart_split_race.
+ */
+static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
+				  struct btree_path *path, struct btree *b,
+				  struct keylist *keys, unsigned flags)
+{
+	struct bch_fs *c = as->c;
+	int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
+	int old_live_u64s = b->nr.live_u64s;
+	int live_u64s_added, u64s_added;
+	int ret;
+
+	lockdep_assert_held(&c->gc_lock);
+	BUG_ON(!btree_node_intent_locked(path, b->c.level));
+	BUG_ON(!b->c.level);
+	BUG_ON(!as || as->b);
+	bch2_verify_keylist_sorted(keys);
+
+	ret = bch2_btree_node_lock_write(trans, path, &b->c);
+	if (ret)
+		return ret;
+
+	bch2_btree_node_prep_for_write(trans, path, b);
+
+	if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
+		bch2_btree_node_unlock_write(trans, path, b);
+		goto split;
+	}
+
+	btree_node_interior_verify(c, b);
+
+	bch2_btree_insert_keys_interior(as, trans, path, b, keys);
+
+	live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
+	u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
+
+	/* only a net decrease in live u64s lowers the cached sibling sizes */
+	if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
+		b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
+	if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
+		b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
+
+	if (u64s_added > live_u64s_added &&
+	    bch2_maybe_compact_whiteouts(c, b))
+		bch2_trans_node_reinit_iter(trans, b);
+
+	bch2_btree_node_unlock_write(trans, path, b);
+
+	btree_node_interior_verify(c, b);
+	return 0;
+split:
+	/*
+	 * We could attempt to avoid the transaction restart, by calling
+	 * bch2_btree_path_upgrade() and allocating more nodes:
+	 */
+	if (b->c.level >= as->update_level) {
+		trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
+		return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
+	}
+
+	return btree_split(as, trans, path, b, keys, flags);
+}
+
+/*
+ * Run btree_split() on the node at @path's current level, then try a
+ * foreground merge at each higher level for as long as @path holds an intent
+ * lock there and no error has occurred.
+ *
+ * Returns 0 on success, or the error from starting the update, from the split
+ * itself, or from a merge attempt.
+ */
+int bch2_btree_split_leaf(struct btree_trans *trans,
+			  struct btree_path *path,
+			  unsigned flags)
+{
+	struct btree *leaf = path_l(path)->b;
+	struct btree_update *as;
+	unsigned level;
+	int ret;
+
+	as = bch2_btree_update_start(trans, path, path->level, true, flags);
+	if (IS_ERR(as))
+		return PTR_ERR(as);
+
+	ret = btree_split(as, trans, path, leaf, NULL, flags);
+	if (ret) {
+		bch2_btree_update_free(as, trans);
+		return ret;
+	}
+
+	bch2_btree_update_done(as, trans);
+
+	level = path->level + 1;
+	while (btree_node_intent_locked(path, level) && !ret) {
+		ret = bch2_foreground_maybe_merge(trans, path, level, flags);
+		level++;
+	}
+
+	return ret;
+}
+
+int __bch2_foreground_maybe_merge(struct btree_trans *trans,
+				  struct btree_path *path,
+				  unsigned level,
+				  unsigned flags,
+				  enum btree_node_sibling sib)
+{	/*
+	 * Try to merge the node at @level in @path with its sibling on side
+	 * @sib. A size estimate for the merged node is cached in
+	 * b->sib_u64s[@sib]; the merge only proceeds when both nodes share a
+	 * parent and that estimate does not exceed
+	 * c->btree_foreground_merge_threshold. Returns 0 both when a merge was
+	 * done and when it was skipped, otherwise an error.
+	 */
+	struct bch_fs *c = trans->c;
+	struct btree_path *sib_path = NULL, *new_path = NULL;
+	struct btree_update *as;
+	struct bkey_format_state new_s;
+	struct bkey_format new_f;
+	struct bkey_i delete;
+	struct btree *b, *m, *n, *prev, *next, *parent;
+	struct bpos sib_pos;
+	size_t sib_u64s;
+	u64 start_time = local_clock();
+	int ret = 0;
+
+	BUG_ON(!path->should_be_locked);
+	BUG_ON(!btree_node_locked(path, level));
+
+	b = path->l[level].b;
+
+	if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
+	    (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) {
+		b->sib_u64s[sib] = U16_MAX;	/* @b already reaches the end of the keyspace on this side */
+		return 0;
+	}
+
+	sib_pos = sib == btree_prev_sib
+		? bpos_predecessor(b->data->min_key)
+		: bpos_successor(b->data->max_key);
+
+	sib_path = bch2_path_get(trans, path->btree_id, sib_pos,
+				 U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_);
+	ret = bch2_btree_path_traverse(trans, sib_path, false);
+	if (ret)
+		goto err;
+
+	btree_path_set_should_be_locked(sib_path);
+
+	m = sib_path->l[level].b;	/* the sibling node */
+
+	if (btree_node_parent(path, b) !=
+	    btree_node_parent(sib_path, m)) {
+		b->sib_u64s[sib] = U16_MAX;	/* different parents: not merged */
+		goto out;
+	}
+
+	if (sib == btree_prev_sib) {
+		prev = m;
+		next = b;
+	} else {
+		prev = b;
+		next = m;
+	}
+
+	if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) {
+		struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+
+		bch2_bpos_to_text(&buf1, prev->data->max_key);
+		bch2_bpos_to_text(&buf2, next->data->min_key);
+		bch_err(c,
+			"%s(): btree topology error:\n"
+			"  prev ends at   %s\n"
+			"  next starts at %s",
+			__func__, buf1.buf, buf2.buf);
+		printbuf_exit(&buf1);
+		printbuf_exit(&buf2);
+		bch2_topology_error(c);
+		ret = -EIO;
+		goto err;
+	}
+
+	/* key format covering both nodes and the merged key range */
+	bch2_bkey_format_init(&new_s);
+	bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
+	__bch2_btree_calc_format(&new_s, prev);
+	__bch2_btree_calc_format(&new_s, next);
+	bch2_bkey_format_add_pos(&new_s, next->data->max_key);
+	new_f = bch2_bkey_format_done(&new_s);
+
+	sib_u64s = btree_node_u64s_with_format(b, &new_f) +
+		btree_node_u64s_with_format(m, &new_f);
+
+	if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {	/* halve the part above the hysteresis value */
+		sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
+		sib_u64s /= 2;
+		sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
+	}
+
+	sib_u64s = min(sib_u64s, btree_max_u64s(c));
+	sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);	/* stays below U16_MAX, which is stored above when no merge is possible */
+	b->sib_u64s[sib] = sib_u64s;
+
+	if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
+		goto out;
+
+	parent = btree_node_parent(path, b);
+	as = bch2_btree_update_start(trans, path, level, false,
+				     BTREE_INSERT_NOFAIL|flags);
+	ret = PTR_ERR_OR_ZERO(as);
+	if (ret)
+		goto err;
+
+	trace_and_count(c, btree_node_merge, c, b);
+
+	bch2_btree_interior_update_will_free_node(as, b);
+	bch2_btree_interior_update_will_free_node(as, m);
+
+	n = bch2_btree_node_alloc(as, trans, b->c.level);
+
+	SET_BTREE_NODE_SEQ(n->data,
+			   max(BTREE_NODE_SEQ(b->data),
+			       BTREE_NODE_SEQ(m->data)) + 1);
+
+	btree_set_min(n, prev->data->min_key);
+	btree_set_max(n, next->data->max_key);
+
+	n->data->format	 = new_f;
+	btree_node_set_format(n, new_f);
+
+	bch2_btree_sort_into(c, n, prev);
+	bch2_btree_sort_into(c, n, next);
+
+	bch2_btree_build_aux_trees(n);
+	bch2_btree_update_add_new_node(as, n);
+	six_unlock_write(&n->c.lock);
+
+	new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p);
+	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
+	mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
+	bch2_btree_path_level_init(trans, new_path, n);
+
+	bkey_init(&delete.k);
+	delete.k.p = prev->key.k.p;	/* presumably a whiteout for prev's old pointer in @parent - confirm */
+	bch2_keylist_add(&as->parent_keys, &delete);
+	bch2_keylist_add(&as->parent_keys, &n->key);
+
+	bch2_trans_verify_paths(trans);
+
+	ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+	if (ret)
+		goto err_free_update;
+
+	bch2_trans_verify_paths(trans);
+
+	bch2_btree_update_get_open_buckets(as, n);
+	bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+
+	bch2_btree_node_free_inmem(trans, path, b);
+	bch2_btree_node_free_inmem(trans, sib_path, m);
+
+	bch2_trans_node_add(trans, n);
+
+	bch2_trans_verify_paths(trans);
+
+	six_unlock_intent(&n->c.lock);
+
+	bch2_btree_update_done(as, trans);
+
+	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
+out:
+err:	/* success, skip and error paths all share this cleanup */
+	if (new_path)
+		bch2_path_put(trans, new_path, true);
+	bch2_path_put(trans, sib_path, true);
+	bch2_trans_verify_locks(trans);
+	return ret;
+err_free_update:	/* the parent update failed: release the new node and the update first */
+	bch2_btree_node_free_never_used(as, trans, n);
+	bch2_btree_update_free(as, trans);
+	goto out;
+}
+
+int bch2_btree_node_rewrite(struct btree_trans *trans,
+			    struct btree_iter *iter,
+			    struct btree *b,
+			    unsigned flags)
+{	/*
+	 * Replace @b with a newly allocated replacement node: the new node's
+	 * key is inserted into @b's parent, or the new node becomes the root
+	 * when @b has no parent. BTREE_INSERT_NOFAIL is always added to @flags.
+	 * Returns 0 on success or an error; iter->path is downgraded on every
+	 * exit path.
+	 */
+	struct bch_fs *c = trans->c;
+	struct btree_path *new_path = NULL;
+	struct btree *n, *parent;
+	struct btree_update *as;
+	int ret;
+
+	flags |= BTREE_INSERT_NOFAIL;
+
+	parent = btree_node_parent(iter->path, b);
+	as = bch2_btree_update_start(trans, iter->path, b->c.level,
+				     false, flags);
+	ret = PTR_ERR_OR_ZERO(as);
+	if (ret)
+		goto out;
+
+	bch2_btree_interior_update_will_free_node(as, b);
+
+	n = bch2_btree_node_alloc_replacement(as, trans, b);
+
+	bch2_btree_build_aux_trees(n);
+	bch2_btree_update_add_new_node(as, n);
+	six_unlock_write(&n->c.lock);
+
+	new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p);
+	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
+	mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
+	bch2_btree_path_level_init(trans, new_path, n);
+
+	trace_and_count(c, btree_node_rewrite, c, b);
+
+	if (parent) {
+		bch2_keylist_add(&as->parent_keys, &n->key);
+		ret = bch2_btree_insert_node(as, trans, iter->path, parent,
+					     &as->parent_keys, flags);
+		if (ret)
+			goto err;
+	} else {
+		bch2_btree_set_root(as, trans, iter->path, n);
+	}
+
+	bch2_btree_update_get_open_buckets(as, n);
+	bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+
+	bch2_btree_node_free_inmem(trans, iter->path, b);
+
+	bch2_trans_node_add(trans, n);
+	six_unlock_intent(&n->c.lock);
+
+	bch2_btree_update_done(as, trans);
+out:
+	if (new_path)
+		bch2_path_put(trans, new_path, true);
+	bch2_btree_path_downgrade(trans, iter->path);
+	return ret;
+err:	/* the parent update failed: release the unused node and the update, then clean up */
+	bch2_btree_node_free_never_used(as, trans, n);
+	bch2_btree_update_free(as, trans);
+	goto out;
+}
+
+struct async_btree_rewrite {	/* one queued request to rewrite a btree node asynchronously */
+	struct bch_fs		*c;
+	struct work_struct	work;		/* runs async_btree_node_rewrite_work() */
+	struct list_head	list;		/* entry on c->pending_node_rewrites */
+	enum btree_id		btree_id;	/* from b->c.btree_id */
+	unsigned		level;		/* from b->c.level */
+	struct bpos		pos;		/* from b->key.k.p */
+	__le64			seq;		/* from b->data->keys.seq; rechecked before rewriting */
+};
+
+/*
+ * Transaction body for an asynchronous node rewrite: look up the node described
+ * by @a and rewrite it if it is still the same node (matching sequence number).
+ *
+ * A missing node, or one whose seq no longer matches, is logged and treated as
+ * success (returns 0). Otherwise returns the error from the node lookup or the
+ * result of bch2_btree_node_rewrite().
+ */
+static int async_btree_node_rewrite_trans(struct btree_trans *trans,
+					  struct async_btree_rewrite *a)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct btree *b;
+	int ret;
+
+	bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos,
+				  BTREE_MAX_DEPTH, a->level, 0);
+	b = bch2_btree_iter_peek_node(&iter);
+	ret = PTR_ERR_OR_ZERO(b);
+	if (ret)
+		goto out;
+
+	if (!b || b->data->keys.seq != a->seq) {
+		struct printbuf buf = PRINTBUF;
+
+		if (b)
+			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+		else
+			prt_str(&buf, "(null)");
+		/* a->seq is a little-endian on-disk value: convert it before printing */
+		bch_info(c, "%s: node to rewrite not found:, searching for seq %llu, got\n%s",
+			 __func__, le64_to_cpu(a->seq), buf.buf);
+		printbuf_exit(&buf);
+		goto out;
+	}
+
+	ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
+out:
+	bch2_trans_iter_exit(trans, &iter);
+
+	return ret;
+}
+
+/*
+ * Workqueue callback for a queued node rewrite: runs the rewrite in a btree
+ * transaction, logs any error, then drops the BCH_WRITE_REF_node_rewrite ref
+ * and frees the request.
+ */
+static void async_btree_node_rewrite_work(struct work_struct *work)
+{
+	struct async_btree_rewrite *rewrite =
+		container_of(work, struct async_btree_rewrite, work);
+	struct bch_fs *c = rewrite->c;
+	int err = bch2_trans_do(c, NULL, NULL, 0,
+				async_btree_node_rewrite_trans(trans, rewrite));
+
+	if (err)
+		bch_err_fn(c, err);
+
+	bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
+	kfree(rewrite);
+}
+
+void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
+{	/*
+	 * Queue an asynchronous rewrite of @b, identified by its btree id,
+	 * level, position and sequence number. While BCH_FS_MAY_GO_RW is not
+	 * set the request is parked on c->pending_node_rewrites; otherwise it
+	 * is queued on the interior update workqueue with a
+	 * BCH_WRITE_REF_node_rewrite ref held. On failure an error is logged
+	 * and the request is dropped.
+	 */
+	struct async_btree_rewrite *a;
+	int ret;
+
+	a = kmalloc(sizeof(*a), GFP_NOFS);
+	if (!a) {
+		bch_err(c, "%s: error allocating memory", __func__);
+		return;
+	}
+
+	a->c		= c;
+	a->btree_id	= b->c.btree_id;
+	a->level	= b->c.level;
+	a->pos		= b->key.k.p;
+	a->seq		= b->data->keys.seq;
+	INIT_WORK(&a->work, async_btree_node_rewrite_work);
+
+	if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {
+		mutex_lock(&c->pending_node_rewrites_lock);
+		list_add(&a->list, &c->pending_node_rewrites);
+		mutex_unlock(&c->pending_node_rewrites_lock);
+		return;	/* no write ref is taken on this path */
+	}
+
+	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
+		if (test_bit(BCH_FS_STARTED, &c->flags)) {
+			bch_err(c, "%s: error getting c->writes ref", __func__);
+			kfree(a);
+			return;
+		}
+
+		ret = bch2_fs_read_write_early(c);	/* filesystem not started yet: try going read-write */
+		if (ret) {
+			bch_err_msg(c, ret, "going read-write");
+			kfree(a);
+			return;
+		}
+
+		bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
+	}
+
+	queue_work(c->btree_interior_update_worker, &a->work);	/* the work function drops the ref and frees @a */
+}
+
+void bch2_do_pending_node_rewrites(struct bch_fs *c)
+{
+	struct list_head *pending = &c->pending_node_rewrites;
+	struct async_btree_rewrite *a;
+
+	mutex_lock(&c->pending_node_rewrites_lock);
+	while ((a = list_first_entry_or_null(pending, struct async_btree_rewrite, list))) {
+		list_del(&a->list);
+		bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);	/* put by the work fn */
+		queue_work(c->btree_interior_update_worker, &a->work);
+	}
+	mutex_unlock(&c->pending_node_rewrites_lock);
+}
+
+void bch2_free_pending_node_rewrites(struct bch_fs *c)
+{
+	struct list_head *pending = &c->pending_node_rewrites;
+	struct async_btree_rewrite *a;
+
+	mutex_lock(&c->pending_node_rewrites_lock);
+	while ((a = list_first_entry_or_null(pending, struct async_btree_rewrite, list))) {
+		list_del(&a->list);
+		kfree(a);	/* parked requests hold no write ref, so none is dropped */
+	}
+	mutex_unlock(&c->pending_node_rewrites_lock);
+}
+
+static int __bch2_btree_node_update_key(struct btree_trans *trans,
+					struct btree_iter *iter,
+					struct btree *b, struct btree *new_hash,
+					struct bkey_i *new_key,
+					unsigned commit_flags,
+					bool skip_triggers)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter2 = { NULL };
+	struct btree *parent;
+	int ret;
+
+	if (!skip_triggers) {
+		ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1,
+					  bkey_i_to_s_c(&b->key), 0);
+		if (ret)
+			return ret;
+
+		ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1,
+					  new_key, 0);
+		if (ret)
+			return ret;
+	}
+
+	if (new_hash) {	/* hash a placeholder node under @new_key until @b is re-hashed */
+		bkey_copy(&new_hash->key, new_key);
+		ret = bch2_btree_node_hash_insert(&c->btree_cache,
+				new_hash, b->c.level, b->c.btree_id);
+		BUG_ON(ret);
+	}
+
+	parent = btree_node_parent(iter->path, b);
+	if (parent) {
+		bch2_trans_copy_iter(&iter2, iter);
+
+		iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
+				iter2.flags & BTREE_ITER_INTENT,
+				_THIS_IP_);
+
+		BUG_ON(iter2.path->level != b->c.level);
+		BUG_ON(!bpos_eq(iter2.path->pos, new_key->k.p));
+
+		btree_path_set_level_up(trans, iter2.path);
+		trans->paths_sorted = false;
+
+		ret   = bch2_btree_iter_traverse(&iter2) ?:
+			bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN);
+		if (ret)
+			goto err;
+	} else {
+		BUG_ON(btree_node_root(c, b) != b);
+
+		ret = darray_make_room(&trans->extra_journal_entries,
+				       jset_u64s(new_key->k.u64s));
+		if (ret)
+			goto err;	/* @new_hash may already be hashed: unwind it */
+
+		journal_entry_set((void *) &darray_top(trans->extra_journal_entries),
+				  BCH_JSET_ENTRY_btree_root,
+				  b->c.btree_id, b->c.level,
+				  new_key, new_key->k.u64s);
+		trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s);
+	}
+
+	ret = bch2_trans_commit(trans, NULL, NULL, commit_flags);
+	if (ret)
+		goto err;
+
+	/* Committed: switch the in-memory node over to @new_key under its write lock */
+	bch2_btree_node_lock_write_nofail(trans, iter->path, &b->c);
+	if (new_hash) {
+		mutex_lock(&c->btree_cache.lock);
+		bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
+		bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+		bkey_copy(&b->key, new_key);
+		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+		BUG_ON(ret);
+		mutex_unlock(&c->btree_cache.lock);
+	} else {
+		bkey_copy(&b->key, new_key);
+	}
+
+	bch2_btree_node_unlock_write(trans, iter->path, b);
+out:
+	bch2_trans_iter_exit(trans, &iter2);
+	return ret;
+err:
+	/* Undo the placeholder insertion above; @b itself must stay hashed under its old key */
+	if (new_hash) {
+		mutex_lock(&c->btree_cache.lock);
+		bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
+		mutex_unlock(&c->btree_cache.lock);
+	}
+	goto out;
+}
+
+int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter,
+			       struct btree *b, struct bkey_i *new_key,
+			       unsigned commit_flags, bool skip_triggers)
+{
+	struct bch_fs *c = trans->c;
+	struct btree *new_hash = NULL;	/* placeholder node; only allocated if the hash value changes */
+	struct btree_path *path = iter->path;
+	struct closure cl;
+	int ret = 0;
+
+	ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1);
+	if (ret)
+		return ret;
+
+	closure_init_stack(&cl);
+
+	/*
+	 * check btree_ptr_hash_val() after @b is locked by
+	 * btree_iter_traverse():
+	 */
+	if (btree_ptr_hash_val(new_key) != b->hash_val) {
+		ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+		if (ret) {
+			ret = drop_locks_do(trans, (closure_sync(&cl), 0));	/* wait on @cl with btree locks dropped */
+			if (ret)
+				return ret;
+		}
+
+		new_hash = bch2_btree_node_mem_alloc(trans, false);	/* NOTE(review): used unchecked - confirm this cannot fail */
+	}
+
+	path->intent_ref++;
+	ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key,
+					   commit_flags, skip_triggers);
+	--path->intent_ref;
+
+	if (new_hash) {	/* placeholder is done with, success or failure: return it to the freeable list */
+		mutex_lock(&c->btree_cache.lock);
+		list_move(&new_hash->list, &c->btree_cache.freeable);
+		mutex_unlock(&c->btree_cache.lock);
+
+		six_unlock_write(&new_hash->c.lock);
+		six_unlock_intent(&new_hash->c.lock);
+	}
+	closure_sync(&cl);
+	bch2_btree_cache_cannibalize_unlock(c);
+	return ret;
+}
+
+int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
+					struct btree *b, struct bkey_i *new_key,
+					unsigned commit_flags, bool skip_triggers)
+{
+	struct btree_iter iter;
+	int ret;
+
+	bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
+				  BTREE_MAX_DEPTH, b->c.level,
+				  BTREE_ITER_INTENT);
+	ret = bch2_btree_iter_traverse(&iter);
+	if (ret)
+		goto out;
+
+	/* Did the traversal land on a different node at @b's level and position? */
+	if (iter.path->l[b->c.level].b != b) {
+		/* Then @b must be marked dying; skip the update and return 0: */
+		BUG_ON(!btree_node_dying(b));
+		goto out;
+	}
+
+	BUG_ON(!btree_node_hashed(b));
+
+	ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
+					 commit_flags, skip_triggers);
+out:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+/* Init code: */
+
+/*
+ * Only for filesystem bringup, when first reading the btree roots or allocating
+ * btree roots when initializing a new filesystem:
+ */
+void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
+{
+	BUG_ON(btree_node_root(c, b));
+
+	bch2_btree_set_root_inmem(c, b);
+}
+
+static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
+{
+	struct bch_fs *c = trans->c;
+	struct closure cl;
+	struct btree *b;
+	int ret;
+
+	closure_init_stack(&cl);
+
+	do {
+		ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+		closure_sync(&cl);
+	} while (ret);
+
+	b = bch2_btree_node_mem_alloc(trans, false);
+	bch2_btree_cache_cannibalize_unlock(c);
+
+	set_btree_node_fake(b);
+	set_btree_node_need_rewrite(b);
+	b->c.level	= 0;
+	b->c.btree_id	= id;
+
+	bkey_btree_ptr_init(&b->key);
+	b->key.k.p = SPOS_MAX;
+	*((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;
+
+	bch2_bset_init_first(b, &b->data->keys);
+	bch2_btree_build_aux_trees(b);
+
+	b->data->flags = 0;
+	btree_set_min(b, POS_MIN);
+	btree_set_max(b, SPOS_MAX);
+	b->data->format = bch2_btree_calc_format(b);
+	btree_node_set_format(b, b->data->format);
+
+	ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
+					  b->c.level, b->c.btree_id);
+	BUG_ON(ret);
+
+	bch2_btree_set_root_inmem(c, b);
+
+	six_unlock_write(&b->c.lock);
+	six_unlock_intent(&b->c.lock);
+	return 0;
+}
+
+void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
+{
+	bch2_trans_run(c, __bch2_btree_root_alloc(trans, id));
+}
+
+void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
+{
+	struct btree_update *as;
+
+	mutex_lock(&c->btree_interior_update_lock);
+	list_for_each_entry(as, &c->btree_interior_update_list, list)
+		prt_printf(out, "%p m %u w %u r %u j %llu\n",
+		       as,
+		       as->mode,
+		       as->nodes_written,
+		       closure_nr_remaining(&as->cl),
+		       as->journal.seq);
+	mutex_unlock(&c->btree_interior_update_lock);
+}
+
+static bool bch2_btree_interior_updates_pending(struct bch_fs *c)
+{
+	bool pending;
+
+	/* Sample the update list under its lock; true if any update is listed: */
+	mutex_lock(&c->btree_interior_update_lock);
+	pending = !list_empty(&c->btree_interior_update_list);
+	mutex_unlock(&c->btree_interior_update_lock);
+	return pending;
+}
+
+bool bch2_btree_interior_updates_flush(struct bch_fs *c)
+{
+	/* Nothing pending: report that no waiting was needed. */
+	if (!bch2_btree_interior_updates_pending(c))
+		return false;
+	closure_wait_event(&c->btree_interior_update_wait,
+			   !bch2_btree_interior_updates_pending(c));
+	return true;
+}
+
+void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry)
+{
+	struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);
+
+	mutex_lock(&c->btree_root_lock);
+
+	r->level = entry->level;
+	r->alive = true;
+	bkey_copy(&r->key, &entry->start[0]);
+
+	mutex_unlock(&c->btree_root_lock);
+}
+
+struct jset_entry *
+bch2_btree_roots_to_journal_entries(struct bch_fs *c,
+				    struct jset_entry *start,
+				    struct jset_entry *end)
+{
+	struct jset_entry *entry;
+	unsigned long have = 0;	/* bitmap of btree ids that already have a root entry in [start, end) */
+	unsigned i;
+
+	for (entry = start; entry < end; entry = vstruct_next(entry))
+		if (entry->type == BCH_JSET_ENTRY_btree_root)
+			__set_bit(entry->btree_id, &have);
+
+	mutex_lock(&c->btree_root_lock);
+
+	for (i = 0; i < btree_id_nr_alive(c); i++) {
+		struct btree_root *r = bch2_btree_id_root(c, i);
+
+		if (r->alive && !test_bit(i, &have)) {	/* append an entry for each live root not already present */
+			journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
+					  i, r->level, &r->key, r->key.k.u64s);
+			end = vstruct_next(end);
+		}
+	}
+
+	mutex_unlock(&c->btree_root_lock);
+
+	return end;	/* new end of the entry list, past anything appended here */
+}
+
+void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
+{
+	if (c->btree_interior_update_worker)
+		destroy_workqueue(c->btree_interior_update_worker);
+	mempool_exit(&c->btree_interior_update_pool);
+}
+
+void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
+{
+	mutex_init(&c->btree_reserve_cache_lock);
+	INIT_LIST_HEAD(&c->btree_interior_update_list);
+	INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
+	mutex_init(&c->btree_interior_update_lock);
+	INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);
+
+	INIT_LIST_HEAD(&c->pending_node_rewrites);
+	mutex_init(&c->pending_node_rewrites_lock);
+}
+
+int bch2_fs_btree_interior_update_init(struct bch_fs *c)
+{
+	c->btree_interior_update_worker =
+		alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
+	if (!c->btree_interior_update_worker)
+		return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+
+	if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
+				      sizeof(struct btree_update)))
+		return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;
+
+	return 0;
+}
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
new file mode 100644
index 000000000000..5e0a467fe905
--- /dev/null
+++ b/fs/bcachefs/btree_update_interior.h
@@ -0,0 +1,337 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H
+#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H
+
+#include "btree_cache.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+
+void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *);
+bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *,
+				struct bkey_format *);
+
+#define BTREE_UPDATE_NODES_MAX		((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES)
+
+#define BTREE_UPDATE_JOURNAL_RES	(BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))
+
+/*
+ * Tracks an in progress split/rewrite of a btree node and the update to the
+ * parent node:
+ *
+ * When we split/rewrite a node, we do all the updates in memory without
+ * waiting for any writes to complete - we allocate the new node(s) and update
+ * the parent node, possibly recursively up to the root.
+ *
+ * The end result is that we have one or more new nodes being written -
+ * possibly several, if there were multiple splits - and then a write (updating
+ * an interior node) which will make all these new nodes visible.
+ *
+ * Additionally, as we split/rewrite nodes we free the old nodes - but the old
+ * nodes can't be freed (their space on disk can't be reclaimed) until the
+ * update to the interior node that makes the new node visible completes -
+ * until then, the old nodes are still reachable on disk.
+ *
+ */
+struct btree_update {
+	struct closure			cl;
+	struct bch_fs			*c;
+	u64				start_time;
+
+	struct list_head		list;
+	struct list_head		unwritten_list;
+
+	/* What kind of update are we doing? */
+	enum {
+		BTREE_INTERIOR_NO_UPDATE,
+		BTREE_INTERIOR_UPDATING_NODE,
+		BTREE_INTERIOR_UPDATING_ROOT,
+		BTREE_INTERIOR_UPDATING_AS,
+	} mode;
+
+	unsigned			nodes_written:1;
+	unsigned			took_gc_lock:1;
+
+	enum btree_id			btree_id;
+	unsigned			update_level;
+
+	struct disk_reservation		disk_res;
+	struct journal_preres		journal_preres;
+
+	/*
+	 * BTREE_INTERIOR_UPDATING_NODE:
+	 * The update that made the new nodes visible was a regular update to an
+	 * existing interior node - @b. We can't write out the update to @b
+	 * until the new nodes we created are finished writing, so we block @b
+	 * from writing by putting this btree_interior update on the
+	 * @b->write_blocked list with @write_blocked_list:
+	 */
+	struct btree			*b;
+	struct list_head		write_blocked_list;
+
+	/*
+	 * We may be freeing nodes that were dirty, and thus had journal entries
+	 * pinned: we need to transfer the oldest of those pins to the
+	 * btree_update operation, and release it when the new node(s)
+	 * are all persistent and reachable:
+	 */
+	struct journal_entry_pin	journal;
+
+	/* Preallocated nodes we reserve when we start the update: */
+	struct prealloc_nodes {
+		struct btree		*b[BTREE_UPDATE_NODES_MAX];
+		unsigned		nr;
+	}				prealloc_nodes[2];
+
+	/* Nodes being freed: */
+	struct keylist			old_keys;
+	u64				_old_keys[BTREE_UPDATE_NODES_MAX *
+						  BKEY_BTREE_PTR_U64s_MAX];
+
+	/* Nodes being added: */
+	struct keylist			new_keys;
+	u64				_new_keys[BTREE_UPDATE_NODES_MAX *
+						  BKEY_BTREE_PTR_U64s_MAX];
+
+	/* New nodes, that will be made reachable by this update: */
+	struct btree			*new_nodes[BTREE_UPDATE_NODES_MAX];
+	unsigned			nr_new_nodes;
+
+	struct btree			*old_nodes[BTREE_UPDATE_NODES_MAX];
+	__le64				old_nodes_seq[BTREE_UPDATE_NODES_MAX];
+	unsigned			nr_old_nodes;
+
+	open_bucket_idx_t		open_buckets[BTREE_UPDATE_NODES_MAX *
+						     BCH_REPLICAS_MAX];
+	open_bucket_idx_t		nr_open_buckets;
+
+	unsigned			journal_u64s;
+	u64				journal_entries[BTREE_UPDATE_JOURNAL_RES];
+
+	/* Only here to reduce stack usage on recursive splits: */
+	struct keylist			parent_keys;
+	/*
+	 * Enough room for btree_split's keys without realloc - btree node
+	 * pointers never have crc/compression info, so we only need to account
+	 * for the pointers for three keys
+	 */
+	u64				inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
+};
+
+struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
+						  struct btree_trans *,
+						  struct btree *,
+						  struct bkey_format);
+
+int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned);
+
+int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *,
+				  unsigned, unsigned, enum btree_node_sibling);
+
+static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
+					struct btree_path *path,
+					unsigned level, unsigned flags,
+					enum btree_node_sibling sib)
+{
+	struct btree *b;
+
+	EBUG_ON(!btree_node_locked(path, level));
+
+	b = path->l[level].b;
+	if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
+		return 0;
+
+	return __bch2_foreground_maybe_merge(trans, path, level, flags, sib);
+}
+
+static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
+					      struct btree_path *path,
+					      unsigned level,
+					      unsigned flags)
+{
+	return  bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
+						    btree_prev_sib) ?:
+		bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
+						    btree_next_sib);
+}
+
+int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
+			    struct btree *, unsigned);
+void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
+int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
+			       struct btree *, struct bkey_i *,
+			       unsigned, bool);
+int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *,
+					struct bkey_i *, unsigned, bool);
+
+void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
+void bch2_btree_root_alloc(struct bch_fs *, enum btree_id);
+
+static inline unsigned btree_update_reserve_required(struct bch_fs *c,
+						     struct btree *b)
+{
+	unsigned depth = btree_node_root(c, b)->c.level + 1;
+	unsigned per_level = (depth - b->c.level) * 2;
+
+	/*
+	 * Worst case split: two nodes for every level from @b up to the root,
+	 * plus one for a new root. At BTREE_MAX_DEPTH no new root can be
+	 * added, and the reserve is one less than the per-level total:
+	 */
+	return depth < BTREE_MAX_DEPTH
+		? per_level + 1
+		: per_level - 1;
+}
+
+static inline void btree_node_reset_sib_u64s(struct btree *b)
+{
+	b->sib_u64s[0] = b->nr.live_u64s;
+	b->sib_u64s[1] = b->nr.live_u64s;
+}
+
+static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
+{
+	return (void *) b->data + btree_bytes(c);
+}
+
+static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
+							    struct btree *b)
+{
+	return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
+}
+
+static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
+							  struct btree *b)
+{
+	return btree_data_end(c, b);
+}
+
+static inline void *write_block(struct btree *b)
+{
+	return (void *) b->data + (b->written << 9);
+}
+
+static inline bool __btree_addr_written(struct btree *b, void *p)
+{
+	return p < write_block(b);
+}
+
+static inline bool bset_written(struct btree *b, struct bset *i)
+{
+	return __btree_addr_written(b, i);
+}
+
+static inline bool bkey_written(struct btree *b, struct bkey_packed *k)
+{
+	return __btree_addr_written(b, k);
+}
+
+static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
+						 struct btree *b,
+						 void *end)
+{
+	ssize_t capacity = c->opts.btree_node_size >> 3;
+	ssize_t in_use = bset_byte_offset(b, end) / sizeof(u64) +
+		b->whiteout_u64s;
+
+	/*
+	 * One extra u64 is always left over for bch2_varint_decode:
+	 */
+	return capacity - (in_use + 1);
+}
+
+static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
+						   struct btree *b)
+{
+	ssize_t remaining = __bch_btree_u64s_remaining(c, b,
+				btree_bkey_last(b, bset_tree_last(b)));
+
+	BUG_ON(remaining < 0);
+
+	if (bset_written(b, btree_bset_last(b)))
+		return 0;
+
+	return remaining;
+}
+
+#define BTREE_WRITE_SET_U64s_BITS	9
+
+static inline unsigned btree_write_set_buffer(struct btree *b)
+{
+	/*
+	 * Could buffer up larger amounts of keys for btrees with larger keys,
+	 * pending benchmarking:
+	 */
+	return 8 << BTREE_WRITE_SET_U64s_BITS;
+}
+
+static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
+						     struct btree *b)
+{
+	struct bset_tree *t = bset_tree_last(b);
+	struct btree_node_entry *bne = max(write_block(b),
+			(void *) btree_bkey_last(b, bset_tree_last(b)));
+	ssize_t remaining_space =
+		__bch_btree_u64s_remaining(c, b, &bne->keys.start[0]);
+
+	if (unlikely(bset_written(b, bset(b, t)))) {
+		if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
+			return bne;
+	} else {
+		if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) &&
+		    remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
+			return bne;
+	}
+
+	return NULL;
+}
+
+static inline void push_whiteout(struct bch_fs *c, struct btree *b,
+				 struct bpos pos)
+{
+	struct bkey_packed k;
+
+	BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s);
+	EBUG_ON(btree_node_just_written(b));
+
+	if (!bkey_pack_pos(&k, pos, b)) {
+		struct bkey *u = (void *) &k;
+
+		bkey_init(u);
+		u->p = pos;
+	}
+
+	k.needs_whiteout = true;
+
+	b->whiteout_u64s += k.u64s;
+	bkey_copy(unwritten_whiteouts_start(c, b), &k);
+}
+
+/*
+ * write lock must be held on @b (else the dirty bset that we were going to
+ * insert into could be written out from under us)
+ */
+static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
+					       struct btree *b, unsigned u64s)
+{
+	if (unlikely(btree_node_need_rewrite(b)))
+		return false;
+
+	return u64s <= bch_btree_keys_u64s_remaining(c, b);
+}
+
+void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
+
+bool bch2_btree_interior_updates_flush(struct bch_fs *);
+
+void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
+struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
+					struct jset_entry *, struct jset_entry *);
+
+void bch2_do_pending_node_rewrites(struct bch_fs *);
+void bch2_free_pending_node_rewrites(struct bch_fs *);
+
+void bch2_fs_btree_interior_update_exit(struct bch_fs *);
+void bch2_fs_btree_interior_update_init_early(struct bch_fs *);
+int bch2_fs_btree_interior_update_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
new file mode 100644
index 000000000000..4e6241db518b
--- /dev/null
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -0,0 +1,375 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+
+#include <linux/sort.h>
+
+static int btree_write_buffered_key_cmp(const void *_l, const void *_r)
+{
+	const struct btree_write_buffered_key *l = _l;
+	const struct btree_write_buffered_key *r = _r;
+
+	return  cmp_int(l->btree, r->btree) ?:
+		bpos_cmp(l->k.k.p, r->k.k.p) ?:
+		cmp_int(l->journal_seq, r->journal_seq) ?:
+		cmp_int(l->journal_offset, r->journal_offset);
+}
+
+static int btree_write_buffered_journal_cmp(const void *_l, const void *_r)
+{
+	const struct btree_write_buffered_key *l = _l;
+	const struct btree_write_buffered_key *r = _r;
+
+	return  cmp_int(l->journal_seq, r->journal_seq);
+}
+
+static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
+					     struct btree_iter *iter,
+					     struct btree_write_buffered_key *wb,
+					     unsigned commit_flags,
+					     bool *write_locked,
+					     size_t *fast)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_path *path;
+	int ret;
+
+	ret = bch2_btree_iter_traverse(iter);
+	if (ret)
+		return ret;
+
+	path = iter->path;
+
+	if (!*write_locked) {	/* lock state is reported back so the caller can keep it across keys */
+		ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c);
+		if (ret)
+			return ret;
+
+		bch2_btree_node_prep_for_write(trans, path, path->l[0].b);
+		*write_locked = true;
+	}
+
+	if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) {	/* no room: use a full commit instead */
+		bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+		*write_locked = false;
+		goto trans_commit;
+	}
+
+	bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
+	(*fast)++;	/* counts keys inserted directly into the leaf */
+
+	if (path->ref > 1) {
+		/*
+		 * We can't clone a path that has write locks: if the path is
+		 * shared, unlock before set_pos(), traverse():
+		 */
+		bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+		*write_locked = false;
+	}
+	return 0;
+trans_commit:
+	return  bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k,
+				      BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+		bch2_trans_commit(trans, NULL, NULL,
+				  commit_flags|
+				  BTREE_INSERT_NOCHECK_RW|
+				  BTREE_INSERT_NOFAIL|
+				  BTREE_INSERT_JOURNAL_RECLAIM);
+}
+
+static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb)
+{
+	union btree_write_buffer_state old, new;
+	u64 v = READ_ONCE(wb->state.v);
+
+	do {
+		old.v = new.v = v;
+
+		new.nr = 0;
+		new.idx++;
+	} while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
+
+	while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1)
+		cpu_relax();
+
+	smp_mb();
+
+	return old;
+}
+
+/*
+ * Update a btree with a write buffered key using the journal seq of the
+ * original write buffer insert.
+ *
+ * It is not safe to rejournal the key once it has been inserted into the write
+ * buffer because that may break recovery ordering. For example, the key may
+ * have already been modified in the active write buffer in a seq that comes
+ * before the current transaction. If we were to journal this key again and
+ * crash, recovery would process updates in the wrong order.
+ */
+static int
+btree_write_buffered_insert(struct btree_trans *trans,
+			  struct btree_write_buffered_key *wb)
+{
+	struct btree_iter iter;
+	int ret;
+
+	bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
+			     BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+
+	ret   = bch2_btree_iter_traverse(&iter) ?:
+		bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k,
+				      BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags,
+				    bool locked)
+{
+	struct bch_fs *c = trans->c;
+	struct journal *j = &c->journal;
+	struct btree_write_buffer *wb = &c->btree_write_buffer;
+	struct journal_entry_pin pin;
+	struct btree_write_buffered_key *i, *keys;
+	struct btree_iter iter = { NULL };
+	size_t nr = 0, skipped = 0, fast = 0, slowpath = 0;
+	bool write_locked = false;
+	union btree_write_buffer_state s;
+	int ret = 0;
+
+	memset(&pin, 0, sizeof(pin));
+
+	if (!locked && !mutex_trylock(&wb->flush_lock))	/* flush_lock is held elsewhere: do nothing */
+		return 0;
+
+	bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL);
+	bch2_journal_pin_drop(j, &wb->journal_pin);
+
+	s = btree_write_buffer_switch(wb);
+	keys = wb->keys[s.idx];
+	nr = s.nr;
+	i = keys;	/* the slowpath trace reads i - keys even if the loop below never runs */
+
+	if (race_fault())
+		goto slowpath;
+
+	/*
+	 * We first sort so that we can detect and skip redundant updates, and
+	 * then we attempt to flush in sorted btree order, as this is most
+	 * efficient.
+	 *
+	 * However, since we're not flushing in the order they appear in the
+	 * journal we won't be able to drop our journal pin until everything is
+	 * flushed - which means this could deadlock the journal if we weren't
+	 * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail
+	 * if it would block taking a journal reservation.
+	 *
+	 * If that happens, simply skip the key so we can optimistically insert
+	 * as many keys as possible in the fast path.
+	 */
+	sort(keys, nr, sizeof(keys[0]),
+	     btree_write_buffered_key_cmp, NULL);
+
+	for (i = keys; i < keys + nr; i++) {
+		if (i + 1 < keys + nr &&
+		    i[0].btree == i[1].btree &&
+		    bpos_eq(i[0].k.k.p, i[1].k.k.p)) {	/* overwritten by the next key: drop it */
+			skipped++;
+			i->journal_seq = 0;
+			continue;
+		}
+
+		if (write_locked &&
+		    (iter.path->btree_id != i->btree ||
+		     bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) {
+			bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
+			write_locked = false;
+		}
+
+		if (!iter.path || iter.path->btree_id != i->btree) {
+			bch2_trans_iter_exit(trans, &iter);
+			bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p,
+					     BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS);
+		}
+
+		bch2_btree_iter_set_pos(&iter, i->k.k.p);
+		iter.path->preserve = false;
+
+		do {
+			ret = bch2_btree_write_buffer_flush_one(trans, &iter, i,
+						commit_flags, &write_locked, &fast);
+			if (!write_locked)
+				bch2_trans_begin(trans);
+		} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
+
+		if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {	/* leave seq set for the slowpath */
+			slowpath++;
+			continue;
+		}
+		if (ret)
+			break;
+
+		i->journal_seq = 0;	/* flushed: the slowpath skips keys with a zero seq */
+	}
+
+	if (write_locked)
+		bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
+	bch2_trans_iter_exit(trans, &iter);
+
+	trace_write_buffer_flush(trans, nr, skipped, fast, wb->size);
+
+	if (slowpath)
+		goto slowpath;
+
+	bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
+out:
+	bch2_journal_pin_drop(j, &pin);
+	mutex_unlock(&wb->flush_lock);
+	return ret;
+slowpath:
+	trace_write_buffer_flush_slowpath(trans, i - keys, nr);
+
+	/*
+	 * Now sort the rest by journal seq and bump the journal pin as we go.
+	 * The fast path zapped the seq of keys that were successfully flushed
+	 * (or skipped as redundant) so we can skip those here.
+	 */
+	sort(keys, nr, sizeof(keys[0]),
+	     btree_write_buffered_journal_cmp, NULL);
+
+	commit_flags &= ~BCH_WATERMARK_MASK;
+	commit_flags |= BCH_WATERMARK_reclaim;
+
+	for (i = keys; i < keys + nr; i++) {
+		if (!i->journal_seq)
+			continue;
+
+		if (i->journal_seq > pin.seq) {
+			struct journal_entry_pin pin2;
+
+			memset(&pin2, 0, sizeof(pin2));
+
+			bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL);
+			bch2_journal_pin_drop(j, &pin);
+			bch2_journal_pin_copy(j, &pin, &pin2, NULL);
+			bch2_journal_pin_drop(j, &pin2);
+		}
+
+		ret = commit_do(trans, NULL, NULL,
+				commit_flags|
+				BTREE_INSERT_NOFAIL|
+				BTREE_INSERT_JOURNAL_RECLAIM,
+				btree_write_buffered_insert(trans, i));
+		if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
+			break;
+	}
+
+	goto out;
+}
+
+int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
+{
+	bch2_trans_unlock(trans);	/* unlock the transaction before sleeping on flush_lock */
+	mutex_lock(&trans->c->btree_write_buffer.flush_lock);
+	return __bch2_btree_write_buffer_flush(trans, 0, true);	/* locked=true: callee releases flush_lock */
+}
+
+int bch2_btree_write_buffer_flush(struct btree_trans *trans)
+{
+	return __bch2_btree_write_buffer_flush(trans, 0, false);
+}
+
+static int bch2_btree_write_buffer_journal_flush(struct journal *j,
+				struct journal_entry_pin *_pin, u64 seq)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+	mutex_lock(&wb->flush_lock);	/* released by __bch2_btree_write_buffer_flush() */
+
+	return bch2_trans_run(c,
+			__bch2_btree_write_buffer_flush(trans, BTREE_INSERT_NOCHECK_RW, true));
+}
+
+static inline u64 btree_write_buffer_ref(int idx)
+{
+	return ((union btree_write_buffer_state) {
+		.ref0 = idx == 0,
+		.ref1 = idx == 1,
+	}).v;
+}
+
+int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_write_buffer *wb = &c->btree_write_buffer;
+	struct btree_write_buffered_key *i;
+	union btree_write_buffer_state old, new;
+	int ret = 0;
+	u64 v;
+
+	trans_for_each_wb_update(trans, i) {
+		EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
+
+		i->journal_seq		= trans->journal_res.seq;
+		i->journal_offset	= trans->journal_res.offset;
+	}
+
+	preempt_disable();
+	v = READ_ONCE(wb->state.v);
+	do {
+		old.v = new.v = v;
+
+		new.v += btree_write_buffer_ref(new.idx);	/* take a ref on the buffer we are about to copy into */
+		new.nr += trans->nr_wb_updates;
+		if (new.nr > wb->size) {	/* would overflow the buffer: nothing is reserved or copied */
+			ret = -BCH_ERR_btree_insert_need_flush_buffer;
+			goto out;
+		}
+	} while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
+
+	memcpy(wb->keys[new.idx] + old.nr,	/* our slots start at the old count */
+	       trans->wb_updates,
+	       sizeof(trans->wb_updates[0]) * trans->nr_wb_updates);
+
+	bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin,
+			     bch2_btree_write_buffer_journal_flush);
+
+	atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter);	/* drop the ref; btree_write_buffer_switch() spins until it is zero */
+out:
+	preempt_enable();
+	return ret;
+}
+
+void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
+{
+	struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+	BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal));
+
+	kvfree(wb->keys[1]);
+	kvfree(wb->keys[0]);
+}
+
+int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
+{
+	struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+	mutex_init(&wb->flush_lock);
+	wb->size = c->opts.btree_write_buffer_size;
+
+	wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL);
+	wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL);
+	if (!wb->keys[0] || !wb->keys[1])
+		return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init;
+
+	return 0;
+}
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
new file mode 100644
index 000000000000..322df1c8304e
--- /dev/null
+++ b/fs/bcachefs/btree_write_buffer.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H
+#define _BCACHEFS_BTREE_WRITE_BUFFER_H
+
+int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned, bool);
+int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
+int bch2_btree_write_buffer_flush(struct btree_trans *);
+
+int bch2_btree_insert_keys_write_buffer(struct btree_trans *);
+
+void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
+int bch2_fs_btree_write_buffer_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */
diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h
new file mode 100644
index 000000000000..99993ba77aea
--- /dev/null
+++ b/fs/bcachefs/btree_write_buffer_types.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
+#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
+
+#include "journal_types.h"
+
+#define BTREE_WRITE_BUFERED_VAL_U64s_MAX	4
+#define BTREE_WRITE_BUFERED_U64s_MAX	(BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX)
+
+struct btree_write_buffered_key {
+	u64			journal_seq;
+	unsigned		journal_offset;
+	enum btree_id		btree;
+	__BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
+};
+
+union btree_write_buffer_state {
+	struct {
+		atomic64_t	counter;
+	};
+
+	struct {
+		u64		v;
+	};
+
+	struct {
+		u64			nr:23;
+		u64			idx:1;
+		u64			ref0:20;
+		u64			ref1:20;
+	};
+};
+
+struct btree_write_buffer {
+	struct mutex			flush_lock;
+	struct journal_entry_pin	journal_pin;
+
+	union btree_write_buffer_state	state;
+	size_t				size;
+
+	struct btree_write_buffered_key	*keys[2];
+};
+
+#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
new file mode 100644
index 000000000000..a1a4b5feadaa
--- /dev/null
+++ b/fs/bcachefs/buckets.c
@@ -0,0 +1,2106 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Code for manipulating bucket marks for garbage collection.
+ *
+ * Copyright 2014 Datera, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "backpointers.h"
+#include "bset.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "buckets_waiting_for_journal.h"
+#include "ec.h"
+#include "error.h"
+#include "inode.h"
+#include "movinggc.h"
+#include "recovery.h"
+#include "reflink.h"
+#include "replicas.h"
+#include "subvolume.h"
+#include "trace.h"
+
+#include <linux/preempt.h>
+
+static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
+					      enum bch_data_type data_type,
+					      s64 sectors)
+{
+	switch (data_type) {
+	case BCH_DATA_btree:
+		fs_usage->btree		+= sectors;
+		break;
+	case BCH_DATA_user:
+	case BCH_DATA_parity:
+		fs_usage->data		+= sectors;
+		break;
+	case BCH_DATA_cached:
+		fs_usage->cached	+= sectors;
+		break;
+	default:
+		break;
+	}
+}
+
+void bch2_fs_usage_initialize(struct bch_fs *c)
+{
+	struct bch_fs_usage *usage;
+	struct bch_dev *ca;
+	unsigned i;
+
+	percpu_down_write(&c->mark_lock);
+	usage = c->usage_base;
+
+	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+		bch2_fs_usage_acc_to_base(c, i);
+
+	for (i = 0; i < BCH_REPLICAS_MAX; i++)
+		usage->reserved += usage->persistent_reserved[i];
+
+	for (i = 0; i < c->replicas.nr; i++) {
+		struct bch_replicas_entry *e =
+			cpu_replicas_entry(&c->replicas, i);
+
+		fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
+	}
+
+	for_each_member_device(ca, c, i) {
+		struct bch_dev_usage dev = bch2_dev_usage_read(ca);
+
+		usage->hidden += (dev.d[BCH_DATA_sb].buckets +
+				  dev.d[BCH_DATA_journal].buckets) *
+			ca->mi.bucket_size;
+	}
+
+	percpu_up_write(&c->mark_lock);
+}
+
+static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
+						  unsigned journal_seq,
+						  bool gc)
+{
+	BUG_ON(!gc && !journal_seq);
+
+	return this_cpu_ptr(gc
+			    ? ca->usage_gc
+			    : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
+}
+
+void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
+{
+	struct bch_fs *c = ca->fs;
+	unsigned seq, i, u64s = dev_usage_u64s();
+
+	do {
+		seq = read_seqcount_begin(&c->usage_lock);
+		memcpy(usage, ca->usage_base, u64s * sizeof(u64));
+		for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+			acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s);
+	} while (read_seqcount_retry(&c->usage_lock, seq));
+}
+
+u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
+{
+	ssize_t offset = v - (u64 *) c->usage_base;
+	unsigned i, seq;
+	u64 ret;
+
+	BUG_ON(offset < 0 || offset >= fs_usage_u64s(c));
+	percpu_rwsem_assert_held(&c->mark_lock);
+
+	do {
+		seq = read_seqcount_begin(&c->usage_lock);
+		ret = *v;
+
+		for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+			ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset);
+	} while (read_seqcount_retry(&c->usage_lock, seq));
+
+	return ret;
+}
+
+struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
+{
+	struct bch_fs_usage_online *ret;
+	unsigned nr_replicas = READ_ONCE(c->replicas.nr);
+	unsigned seq, i;
+retry:
+	ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_KERNEL);
+	if (unlikely(!ret))
+		return NULL;	/* allocation failed; c->mark_lock has not been taken on this path */
+
+	percpu_down_read(&c->mark_lock);
+
+	if (nr_replicas != c->replicas.nr) {	/* count changed since the buffer was sized: resize and retry */
+		nr_replicas = c->replicas.nr;
+		percpu_up_read(&c->mark_lock);
+		kfree(ret);
+		goto retry;
+	}
+
+	ret->online_reserved = percpu_u64_get(c->online_reserved);
+
+	do {
+		seq = read_seqcount_begin(&c->usage_lock);
+		unsafe_memcpy(&ret->u, c->usage_base,
+			      __fs_usage_u64s(nr_replicas) * sizeof(u64),
+			      "embedded variable length struct");
+		for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+			acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i],
+					__fs_usage_u64s(nr_replicas));
+	} while (read_seqcount_retry(&c->usage_lock, seq));
+
+	return ret;	/* NB: no percpu_up_read() here - c->mark_lock is still read-locked on success */
+}
+
+void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
+{
+	struct bch_dev *ca;
+	unsigned i, u64s = fs_usage_u64s(c);
+
+	BUG_ON(idx >= ARRAY_SIZE(c->usage));
+
+	preempt_disable();
+	write_seqcount_begin(&c->usage_lock);
+
+	acc_u64s_percpu((u64 *) c->usage_base,
+			(u64 __percpu *) c->usage[idx], u64s);
+	percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
+
+	rcu_read_lock();
+	for_each_member_device_rcu(ca, c, i, NULL) {
+		u64s = dev_usage_u64s();
+
+		acc_u64s_percpu((u64 *) ca->usage_base,
+				(u64 __percpu *) ca->usage[idx], u64s);
+		percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
+	}
+	rcu_read_unlock();
+
+	write_seqcount_end(&c->usage_lock);
+	preempt_enable();
+}
+
+void bch2_fs_usage_to_text(struct printbuf *out,
+			   struct bch_fs *c,
+			   struct bch_fs_usage_online *fs_usage)
+{
+	unsigned i;
+
+	prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity);
+
+	prt_printf(out, "hidden:\t\t\t\t%llu\n",
+	       fs_usage->u.hidden);
+	prt_printf(out, "data:\t\t\t\t%llu\n",
+	       fs_usage->u.data);
+	prt_printf(out, "cached:\t\t\t\t%llu\n",
+	       fs_usage->u.cached);
+	prt_printf(out, "reserved:\t\t\t%llu\n",
+	       fs_usage->u.reserved);
+	prt_printf(out, "nr_inodes:\t\t\t%llu\n",
+	       fs_usage->u.nr_inodes);
+	prt_printf(out, "online reserved:\t\t%llu\n",
+	       fs_usage->online_reserved);
+
+	for (i = 0;
+	     i < ARRAY_SIZE(fs_usage->u.persistent_reserved);
+	     i++) {
+		prt_printf(out, "%u replicas:\n", i + 1);
+		prt_printf(out, "\treserved:\t\t%llu\n",
+		       fs_usage->u.persistent_reserved[i]);
+	}
+
+	for (i = 0; i < c->replicas.nr; i++) {
+		struct bch_replicas_entry *e =
+			cpu_replicas_entry(&c->replicas, i);
+
+		prt_printf(out, "\t");
+		bch2_replicas_entry_to_text(out, e);
+		prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]);
+	}
+}
+
+static u64 reserve_factor(u64 r)
+{
+	return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
+}
+
+u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
+{
+	return min(fs_usage->u.hidden +
+		   fs_usage->u.btree +
+		   fs_usage->u.data +
+		   reserve_factor(fs_usage->u.reserved +
+				  fs_usage->online_reserved),
+		   c->capacity);
+}
+
+static struct bch_fs_usage_short
+__bch2_fs_usage_read_short(struct bch_fs *c)
+{
+	struct bch_fs_usage_short ret;
+	u64 data, reserved;
+
+	ret.capacity = c->capacity -
+		bch2_fs_usage_read_one(c, &c->usage_base->hidden);
+
+	data		= bch2_fs_usage_read_one(c, &c->usage_base->data) +
+		bch2_fs_usage_read_one(c, &c->usage_base->btree);
+	reserved	= bch2_fs_usage_read_one(c, &c->usage_base->reserved) +
+		percpu_u64_get(c->online_reserved);
+
+	ret.used	= min(ret.capacity, data + reserve_factor(reserved));
+	ret.free	= ret.capacity - ret.used;
+
+	ret.nr_inodes	= bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes);
+
+	return ret;
+}
+
+struct bch_fs_usage_short
+bch2_fs_usage_read_short(struct bch_fs *c)
+{
+	struct bch_fs_usage_short ret;
+
+	percpu_down_read(&c->mark_lock);
+	ret = __bch2_fs_usage_read_short(c);
+	percpu_up_read(&c->mark_lock);
+
+	return ret;
+}
+
+void bch2_dev_usage_init(struct bch_dev *ca)
+{
+	ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket;
+}
+
+static inline int bucket_sectors_fragmented(struct bch_dev *ca,
+					    struct bch_alloc_v4 a)
+{
+	return a.dirty_sectors
+		? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors)
+		: 0;
+}
+
+static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
+				  struct bch_alloc_v4 old,
+				  struct bch_alloc_v4 new,
+				  u64 journal_seq, bool gc)
+{
+	struct bch_fs_usage *fs_usage;
+	struct bch_dev_usage *u;
+
+	preempt_disable();
+	fs_usage = fs_usage_ptr(c, journal_seq, gc);
+
+	if (data_type_is_hidden(old.data_type))
+		fs_usage->hidden -= ca->mi.bucket_size;
+	if (data_type_is_hidden(new.data_type))
+		fs_usage->hidden += ca->mi.bucket_size;
+
+	u = dev_usage_ptr(ca, journal_seq, gc);
+
+	u->d[old.data_type].buckets--;
+	u->d[new.data_type].buckets++;
+
+	u->buckets_ec -= (int) !!old.stripe;
+	u->buckets_ec += (int) !!new.stripe;
+
+	u->d[old.data_type].sectors -= old.dirty_sectors;
+	u->d[new.data_type].sectors += new.dirty_sectors;
+
+	u->d[BCH_DATA_cached].sectors += new.cached_sectors;
+	u->d[BCH_DATA_cached].sectors -= old.cached_sectors;
+
+	u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old);
+	u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
+
+	preempt_enable();
+}
+
+static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
+				    struct bucket old, struct bucket new,
+				    u64 journal_seq, bool gc)
+{
+	struct bch_alloc_v4 old_a = {
+		.gen		= old.gen,
+		.data_type	= old.data_type,
+		.dirty_sectors	= old.dirty_sectors,
+		.cached_sectors	= old.cached_sectors,
+		.stripe		= old.stripe,
+	};
+	struct bch_alloc_v4 new_a = {
+		.gen		= new.gen,
+		.data_type	= new.data_type,
+		.dirty_sectors	= new.dirty_sectors,
+		.cached_sectors	= new.cached_sectors,
+		.stripe		= new.stripe,
+	};
+
+	bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc);
+}
+
+static inline int __update_replicas(struct bch_fs *c,
+				    struct bch_fs_usage *fs_usage,
+				    struct bch_replicas_entry *r,
+				    s64 sectors)
+{
+	int idx = bch2_replicas_entry_idx(c, r);
+
+	if (idx < 0)
+		return -1;
+
+	fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
+	fs_usage->replicas[idx]		+= sectors;
+	return 0;
+}
+
+static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
+			struct bch_replicas_entry *r, s64 sectors,
+			unsigned journal_seq, bool gc)
+{
+	struct bch_fs_usage *fs_usage;
+	int idx, ret = 0;
+	struct printbuf buf = PRINTBUF;
+
+	percpu_down_read(&c->mark_lock);
+
+	idx = bch2_replicas_entry_idx(c, r);
+	if (idx < 0 &&
+	    fsck_err(c, "no replicas entry\n"
+		     "  while marking %s",
+		     (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+		percpu_up_read(&c->mark_lock);
+		ret = bch2_mark_replicas(c, r);
+		percpu_down_read(&c->mark_lock);
+
+		if (ret)
+			goto err;
+		idx = bch2_replicas_entry_idx(c, r);
+	}
+	if (idx < 0) {
+		ret = -1;
+		goto err;
+	}
+
+	preempt_disable();
+	fs_usage = fs_usage_ptr(c, journal_seq, gc);
+	fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
+	fs_usage->replicas[idx]		+= sectors;
+	preempt_enable();
+err:
+fsck_err:
+	percpu_up_read(&c->mark_lock);
+	printbuf_exit(&buf);
+	return ret;
+}
+
+static inline int update_cached_sectors(struct bch_fs *c,
+			struct bkey_s_c k,
+			unsigned dev, s64 sectors,
+			unsigned journal_seq, bool gc)
+{
+	struct bch_replicas_padded r;
+
+	bch2_replicas_entry_cached(&r.e, dev);
+
+	return update_replicas(c, k, &r.e, sectors, journal_seq, gc);
+}
+
+static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more,
+				     gfp_t gfp)
+{
+	struct replicas_delta_list *d = trans->fs_usage_deltas;
+	unsigned new_size = d ? (d->size + more) * 2 : 128;
+	unsigned alloc_size = sizeof(*d) + new_size;
+
+	WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX);
+
+	if (!d || d->used + more > d->size) {
+		d = krealloc(d, alloc_size, gfp|__GFP_ZERO);
+
+		if (unlikely(!d)) {
+			if (alloc_size > REPLICAS_DELTA_LIST_MAX)
+				return -ENOMEM;
+
+			d = mempool_alloc(&trans->c->replicas_delta_pool, gfp);
+			if (!d)
+				return -ENOMEM;
+
+			memset(d, 0, REPLICAS_DELTA_LIST_MAX);
+
+			if (trans->fs_usage_deltas)
+				memcpy(d, trans->fs_usage_deltas,
+				       trans->fs_usage_deltas->size + sizeof(*d));
+
+			new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d);
+			kfree(trans->fs_usage_deltas);
+		}
+
+		d->size = new_size;
+		trans->fs_usage_deltas = d;
+	}
+
+	return 0;
+}
+
+int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
+{
+	return allocate_dropping_locks_errcode(trans,
+				__replicas_deltas_realloc(trans, more, _gfp));
+}
+
+static inline int update_replicas_list(struct btree_trans *trans,
+					struct bch_replicas_entry *r,
+					s64 sectors)
+{
+	struct replicas_delta_list *d;
+	struct replicas_delta *n;
+	unsigned b;
+	int ret;
+
+	if (!sectors)
+		return 0;
+
+	b = replicas_entry_bytes(r) + 8;
+	ret = bch2_replicas_deltas_realloc(trans, b);
+	if (ret)
+		return ret;
+
+	d = trans->fs_usage_deltas;
+	n = (void *) d->d + d->used;
+	n->delta = sectors;
+	unsafe_memcpy((void *) n + offsetof(struct replicas_delta, r),
+		      r, replicas_entry_bytes(r),
+		      "flexible array member embedded in strcuct with padding");
+	bch2_replicas_entry_sort(&n->r);
+	d->used += b;
+	return 0;
+}
+
+static inline int update_cached_sectors_list(struct btree_trans *trans,
+					      unsigned dev, s64 sectors)
+{
+	struct bch_replicas_padded r;
+
+	bch2_replicas_entry_cached(&r.e, dev);
+
+	return update_replicas_list(trans, &r.e, sectors);
+}
+
+int bch2_mark_alloc(struct btree_trans *trans,
+		    enum btree_id btree, unsigned level,
+		    struct bkey_s_c old, struct bkey_s_c new,
+		    unsigned flags)
+{
+	bool gc = flags & BTREE_TRIGGER_GC;
+	u64 journal_seq = trans->journal_res.seq;
+	u64 bucket_journal_seq;
+	struct bch_fs *c = trans->c;
+	struct bch_alloc_v4 old_a_convert, new_a_convert;
+	const struct bch_alloc_v4 *old_a, *new_a;
+	struct bch_dev *ca;
+	int ret = 0;
+
+	/*
+	 * alloc btree is read in by bch2_alloc_read, not gc:
+	 */
+	if ((flags & BTREE_TRIGGER_GC) &&
+	    !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
+		return 0;
+
+	if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans,
+				       "alloc key for invalid device or bucket"))
+		return -EIO;
+
+	ca = bch_dev_bkey_exists(c, new.k->p.inode);
+
+	old_a = bch2_alloc_to_v4(old, &old_a_convert);
+	new_a = bch2_alloc_to_v4(new, &new_a_convert);
+
+	bucket_journal_seq = new_a->journal_seq;
+
+	if ((flags & BTREE_TRIGGER_INSERT) &&
+	    data_type_is_empty(old_a->data_type) !=
+	    data_type_is_empty(new_a->data_type) &&
+	    new.k->type == KEY_TYPE_alloc_v4) {
+		struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v;
+
+		EBUG_ON(!journal_seq);
+
+		/*
+		 * If the btree updates referring to a bucket weren't flushed
+		 * before the bucket became empty again, then we don't have
+		 * to wait on a journal flush before we can reuse the bucket:
+		 */
+		v->journal_seq = bucket_journal_seq =
+			data_type_is_empty(new_a->data_type) &&
+			(journal_seq == v->journal_seq ||
+			 bch2_journal_noflush_seq(&c->journal, v->journal_seq))
+			? 0 : journal_seq;
+	}
+
+	if (!data_type_is_empty(old_a->data_type) &&
+	    data_type_is_empty(new_a->data_type) &&
+	    bucket_journal_seq) {
+		ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+				c->journal.flushed_seq_ondisk,
+				new.k->p.inode, new.k->p.offset,
+				bucket_journal_seq);
+		if (ret) {
+			bch2_fs_fatal_error(c,
+				"error setting bucket_needs_journal_commit: %i", ret);
+			return ret;
+		}
+	}
+
+	percpu_down_read(&c->mark_lock);
+	if (!gc && new_a->gen != old_a->gen)
+		*bucket_gen(ca, new.k->p.offset) = new_a->gen;
+
+	bch2_dev_usage_update(c, ca, *old_a, *new_a, journal_seq, gc);
+
+	if (gc) {
+		struct bucket *g = gc_bucket(ca, new.k->p.offset);
+
+		bucket_lock(g);
+
+		g->gen_valid		= 1;
+		g->gen			= new_a->gen;
+		g->data_type		= new_a->data_type;
+		g->stripe		= new_a->stripe;
+		g->stripe_redundancy	= new_a->stripe_redundancy;
+		g->dirty_sectors	= new_a->dirty_sectors;
+		g->cached_sectors	= new_a->cached_sectors;
+
+		bucket_unlock(g);
+	}
+	percpu_up_read(&c->mark_lock);
+
+	/*
+	 * need to know if we're getting called from the invalidate path or
+	 * not:
+	 */
+
+	if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
+	    old_a->cached_sectors) {
+		ret = update_cached_sectors(c, new, ca->dev_idx,
+					    -((s64) old_a->cached_sectors),
+					    journal_seq, gc);
+		if (ret) {
+			bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors",
+					    __func__);
+			return ret;
+		}
+	}
+
+	if (new_a->data_type == BCH_DATA_free &&
+	    (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk))
+		closure_wake_up(&c->freelist_wait);
+
+	if (new_a->data_type == BCH_DATA_need_discard &&
+	    (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk))
+		bch2_do_discards(c);
+
+	if (old_a->data_type != BCH_DATA_cached &&
+	    new_a->data_type == BCH_DATA_cached &&
+	    should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
+		bch2_do_invalidates(c);
+
+	if (new_a->data_type == BCH_DATA_need_gc_gens)
+		bch2_do_gc_gens(c);
+
+	return 0;
+}
+
+int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+			      size_t b, enum bch_data_type data_type,
+			      unsigned sectors, struct gc_pos pos,
+			      unsigned flags)
+{
+	struct bucket old, new, *g;
+	int ret = 0;
+
+	BUG_ON(!(flags & BTREE_TRIGGER_GC));
+	BUG_ON(data_type != BCH_DATA_sb &&
+	       data_type != BCH_DATA_journal);
+
+	/*
+	 * Backup superblock might be past the end of our normal usable space:
+	 */
+	if (b >= ca->mi.nbuckets)
+		return 0;
+
+	percpu_down_read(&c->mark_lock);
+	g = gc_bucket(ca, b);
+
+	bucket_lock(g);
+	old = *g;
+
+	if (bch2_fs_inconsistent_on(g->data_type &&
+			g->data_type != data_type, c,
+			"different types of data in same bucket: %s, %s",
+			bch2_data_types[g->data_type],
+			bch2_data_types[data_type])) {
+		ret = -EIO;
+		goto err;
+	}
+
+	if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
+			"bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size",
+			ca->dev_idx, b, g->gen,
+			bch2_data_types[g->data_type ?: data_type],
+			g->dirty_sectors, sectors)) {
+		ret = -EIO;
+		goto err;
+	}
+
+
+	g->data_type = data_type;
+	g->dirty_sectors += sectors;
+	new = *g;
+err:
+	bucket_unlock(g);
+	if (!ret)
+		bch2_dev_usage_update_m(c, ca, old, new, 0, true);
+	percpu_up_read(&c->mark_lock);
+	return ret;
+}
+
+static int check_bucket_ref(struct btree_trans *trans,
+			    struct bkey_s_c k,
+			    const struct bch_extent_ptr *ptr,
+			    s64 sectors, enum bch_data_type ptr_data_type,
+			    u8 b_gen, u8 bucket_data_type,
+			    u32 dirty_sectors, u32 cached_sectors)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+	size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
+	u32 bucket_sectors = !ptr->cached
+		? dirty_sectors
+		: cached_sectors;
+	struct printbuf buf = PRINTBUF;
+	int ret = 0;
+
+	if (bucket_data_type == BCH_DATA_cached)
+		bucket_data_type = BCH_DATA_user;
+
+	if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) ||
+	    (bucket_data_type == BCH_DATA_user   && ptr_data_type == BCH_DATA_stripe))
+		bucket_data_type = ptr_data_type = BCH_DATA_stripe;
+
+	if (gen_after(ptr->gen, b_gen)) {
+		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+			"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
+			"while marking %s",
+			ptr->dev, bucket_nr, b_gen,
+			bch2_data_types[bucket_data_type ?: ptr_data_type],
+			ptr->gen,
+			(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+		ret = -EIO;
+		goto err;
+	}
+
+	if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
+		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+			"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+			"while marking %s",
+			ptr->dev, bucket_nr, b_gen,
+			bch2_data_types[bucket_data_type ?: ptr_data_type],
+			ptr->gen,
+			(printbuf_reset(&buf),
+			 bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+		ret = -EIO;
+		goto err;
+	}
+
+	if (b_gen != ptr->gen && !ptr->cached) {
+		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+			"bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
+			"while marking %s",
+			ptr->dev, bucket_nr, b_gen,
+			*bucket_gen(ca, bucket_nr),
+			bch2_data_types[bucket_data_type ?: ptr_data_type],
+			ptr->gen,
+			(printbuf_reset(&buf),
+			 bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+		ret = -EIO;
+		goto err;
+	}
+
+	if (b_gen != ptr->gen) {
+		ret = 1;
+		goto out;
+	}
+
+	if (!data_type_is_empty(bucket_data_type) &&
+	    ptr_data_type &&
+	    bucket_data_type != ptr_data_type) {
+		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+			"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
+			"while marking %s",
+			ptr->dev, bucket_nr, b_gen,
+			bch2_data_types[bucket_data_type],
+			bch2_data_types[ptr_data_type],
+			(printbuf_reset(&buf),
+			 bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+		ret = -EIO;
+		goto err;
+	}
+
+	if ((u64) bucket_sectors + sectors > U32_MAX) {
+		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+			"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
+			"while marking %s",
+			ptr->dev, bucket_nr, b_gen,
+			bch2_data_types[bucket_data_type ?: ptr_data_type],
+			bucket_sectors, sectors,
+			(printbuf_reset(&buf),
+			 bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+		ret = -EIO;
+		goto err;
+	}
+out:
+	printbuf_exit(&buf);
+	return ret;
+err:
+	bch2_dump_trans_updates(trans);
+	goto out;
+}
+
+/*
+ * GC-only: mark the gc bucket backing block @ptr_idx of stripe key @k.
+ */
+static int mark_stripe_bucket(struct btree_trans *trans, struct bkey_s_c k,
+			      unsigned ptr_idx, unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	u64 journal_seq = trans->journal_res.seq;
+	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+	unsigned nr_data = s->nr_blocks - s->nr_redundant;
+	bool parity = ptr_idx >= nr_data;
+	enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
+	s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
+	const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+	struct bucket old, new, *g;
+	struct printbuf buf = PRINTBUF;
+	int ret = 0;
+
+	BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+	/* XXX doesn't handle deletion */
+
+	percpu_down_read(&c->mark_lock);
+	g = PTR_GC_BUCKET(ca, ptr);
+	/* Lock before any goto err: the err path unlocks @g unconditionally */
+	bucket_lock(g);
+	if (g->dirty_sectors ||
+	    (g->stripe && g->stripe != k.k->p.offset)) {
+		bch2_fs_inconsistent(c,
+			      "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
+			      ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen,
+			      (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+		ret = -EINVAL;
+		goto err;
+	}
+
+	old = *g;
+
+	ret = check_bucket_ref(trans, k, ptr, sectors, data_type,
+			       g->gen, g->data_type,
+			       g->dirty_sectors, g->cached_sectors);
+	if (ret)
+		goto err;
+
+	g->data_type = data_type;
+	g->dirty_sectors += sectors;
+	g->stripe		= k.k->p.offset;
+	g->stripe_redundancy	= s->nr_redundant;
+	new = *g;
+err:
+	bucket_unlock(g);
+	if (!ret)
+		bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
+	percpu_up_read(&c->mark_lock);
+	printbuf_exit(&buf);
+	return ret;
+}
+
+static int __mark_pointer(struct btree_trans *trans,
+			  struct bkey_s_c k,
+			  const struct bch_extent_ptr *ptr,
+			  s64 sectors, enum bch_data_type ptr_data_type,
+			  u8 bucket_gen, u8 *bucket_data_type,
+			  u32 *dirty_sectors, u32 *cached_sectors)
+{
+	u32 *dst_sectors = !ptr->cached
+		? dirty_sectors
+		: cached_sectors;
+	int ret = check_bucket_ref(trans, k, ptr, sectors, ptr_data_type,
+				   bucket_gen, *bucket_data_type,
+				   *dirty_sectors, *cached_sectors);
+
+	if (ret)
+		return ret;
+
+	*dst_sectors += sectors;
+	*bucket_data_type = *dirty_sectors || *cached_sectors
+		? ptr_data_type : 0;
+	return 0;
+}
+
+static int bch2_mark_pointer(struct btree_trans *trans,
+			     enum btree_id btree_id, unsigned level,
+			     struct bkey_s_c k,
+			     struct extent_ptr_decoded p,
+			     s64 sectors,
+			     unsigned flags)
+{
+	u64 journal_seq = trans->journal_res.seq;
+	struct bch_fs *c = trans->c;
+	struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+	struct bucket old, new, *g;
+	enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
+	u8 bucket_data_type;
+	int ret = 0;
+
+	BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+	percpu_down_read(&c->mark_lock);
+	g = PTR_GC_BUCKET(ca, &p.ptr);
+	bucket_lock(g);
+	old = *g;
+
+	bucket_data_type = g->data_type;
+	ret = __mark_pointer(trans, k, &p.ptr, sectors,
+			     data_type, g->gen,
+			     &bucket_data_type,
+			     &g->dirty_sectors,
+			     &g->cached_sectors);
+	if (!ret)
+		g->data_type = bucket_data_type;
+
+	new = *g;
+	bucket_unlock(g);
+	if (!ret)
+		bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
+	percpu_up_read(&c->mark_lock);
+
+	return ret;
+}
+
+/*
+ * GC-only: credit @sectors to block @p.block of gc stripe @p.idx and to its replicas entry.
+ */
+static int bch2_mark_stripe_ptr(struct btree_trans *trans, struct bkey_s_c k,
+				struct bch_extent_stripe_ptr p,
+				enum bch_data_type data_type, s64 sectors, unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_replicas_padded r;
+	struct gc_stripe *m;
+
+	BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+	m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL);
+	if (!m) {
+		bch_err(c, "error allocating memory for gc_stripes, idx %llu",
+			(u64) p.idx);
+		return -BCH_ERR_ENOMEM_mark_stripe_ptr;
+	}
+
+	mutex_lock(&c->ec_stripes_heap_lock);
+
+	if (!m->alive) {
+		mutex_unlock(&c->ec_stripes_heap_lock);
+		bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", (u64) p.idx);
+		bch2_inconsistent_error(c);
+		return -EIO;
+	}
+
+	m->block_sectors[p.block] += sectors;
+
+	r = m->r;
+	mutex_unlock(&c->ec_stripes_heap_lock);
+
+	r.e.data_type = data_type;
+	/* NOTE(review): update_replicas() result is dropped here - confirm intentional */
+	update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true);
+
+	return 0;
+}
+
+int bch2_mark_extent(struct btree_trans *trans,
+		     enum btree_id btree_id, unsigned level,
+		     struct bkey_s_c old, struct bkey_s_c new,
+		     unsigned flags)
+{
+	u64 journal_seq = trans->journal_res.seq;
+	struct bch_fs *c = trans->c;
+	struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	struct bch_replicas_padded r;
+	enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
+		? BCH_DATA_btree
+		: BCH_DATA_user;
+	s64 sectors = bkey_is_btree_ptr(k.k)
+		? btree_sectors(c)
+		: k.k->size;
+	s64 dirty_sectors = 0;
+	bool stale;
+	int ret;
+
+	BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+	r.e.data_type	= data_type;
+	r.e.nr_devs	= 0;
+	r.e.nr_required	= 1;
+
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		s64 disk_sectors = ptr_disk_sectors(sectors, p);
+
+		if (flags & BTREE_TRIGGER_OVERWRITE)
+			disk_sectors = -disk_sectors;
+
+		ret = bch2_mark_pointer(trans, btree_id, level, k, p, disk_sectors, flags);
+		if (ret < 0)
+			return ret;
+
+		stale = ret > 0;
+
+		if (p.ptr.cached) {
+			if (!stale) {
+				ret = update_cached_sectors(c, k, p.ptr.dev,
+						disk_sectors, journal_seq, true);
+				if (ret) {
+					bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors",
+							    __func__);
+					return ret;
+				}
+			}
+		} else if (!p.has_ec) {
+			dirty_sectors	       += disk_sectors;
+			r.e.devs[r.e.nr_devs++]	= p.ptr.dev;
+		} else {
+			ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type,
+					disk_sectors, flags);
+			if (ret)
+				return ret;
+
+			/*
+			 * There may be other dirty pointers in this extent, but
+			 * if so they're not required for mounting if we have an
+			 * erasure coded pointer in this extent:
+			 */
+			r.e.nr_required = 0;
+		}
+	}
+
+	if (r.e.nr_devs) {
+		ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true);
+		if (ret) {
+			struct printbuf buf = PRINTBUF;
+
+			bch2_bkey_val_to_text(&buf, c, k);
+			bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf);
+			printbuf_exit(&buf);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+int bch2_mark_stripe(struct btree_trans *trans,
+		     enum btree_id btree_id, unsigned level,
+		     struct bkey_s_c old, struct bkey_s_c new,
+		     unsigned flags)
+{
+	bool gc = flags & BTREE_TRIGGER_GC;
+	u64 journal_seq = trans->journal_res.seq;
+	struct bch_fs *c = trans->c;
+	u64 idx = new.k->p.offset;
+	const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
+		? bkey_s_c_to_stripe(old).v : NULL;
+	const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
+		? bkey_s_c_to_stripe(new).v : NULL;
+	unsigned i;
+	int ret;
+
+	BUG_ON(gc && old_s);
+
+	if (!gc) {
+		struct stripe *m = genradix_ptr(&c->stripes, idx);
+
+		if (!m) {
+			struct printbuf buf1 = PRINTBUF;
+			struct printbuf buf2 = PRINTBUF;
+
+			bch2_bkey_val_to_text(&buf1, c, old);
+			bch2_bkey_val_to_text(&buf2, c, new);
+			bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
+					    "old %s\n"
+					    "new %s", idx, buf1.buf, buf2.buf);
+			printbuf_exit(&buf2);
+			printbuf_exit(&buf1);
+			bch2_inconsistent_error(c);
+			return -1;
+		}
+
+		if (!new_s) {
+			bch2_stripes_heap_del(c, m, idx);
+
+			memset(m, 0, sizeof(*m));
+		} else {
+			m->sectors	= le16_to_cpu(new_s->sectors);
+			m->algorithm	= new_s->algorithm;
+			m->nr_blocks	= new_s->nr_blocks;
+			m->nr_redundant	= new_s->nr_redundant;
+			m->blocks_nonempty = 0;
+
+			for (i = 0; i < new_s->nr_blocks; i++)
+				m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);
+
+			if (!old_s)
+				bch2_stripes_heap_insert(c, m, idx);
+			else
+				bch2_stripes_heap_update(c, m, idx);
+		}
+	} else {
+		struct gc_stripe *m =
+			genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
+
+		if (!m) {
+			bch_err(c, "error allocating memory for gc_stripes, idx %llu",
+				idx);
+			return -BCH_ERR_ENOMEM_mark_stripe;
+		}
+		/*
+		 * This will be wrong when we bring back runtime gc: we should
+		 * be unmarking the old key and then marking the new key
+		 */
+		m->alive	= true;
+		m->sectors	= le16_to_cpu(new_s->sectors);
+		m->nr_blocks	= new_s->nr_blocks;
+		m->nr_redundant	= new_s->nr_redundant;
+
+		for (i = 0; i < new_s->nr_blocks; i++)
+			m->ptrs[i] = new_s->ptrs[i];
+
+		bch2_bkey_to_replicas(&m->r.e, new);
+
+		/*
+		 * gc recalculates this field from stripe ptr
+		 * references:
+		 */
+		memset(m->block_sectors, 0, sizeof(m->block_sectors));
+
+		for (i = 0; i < new_s->nr_blocks; i++) {
+			ret = mark_stripe_bucket(trans, new, i, flags);
+			if (ret)
+				return ret;
+		}
+
+		ret = update_replicas(c, new, &m->r.e,
+				      ((s64) m->sectors * m->nr_redundant),
+				      journal_seq, gc);
+		if (ret) {
+			struct printbuf buf = PRINTBUF;
+
+			bch2_bkey_val_to_text(&buf, c, new);
+			bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
+			printbuf_exit(&buf);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+int bch2_mark_reservation(struct btree_trans *trans,
+			  enum btree_id btree_id, unsigned level,
+			  struct bkey_s_c old, struct bkey_s_c new,
+			  unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	bool overwrite = flags & BTREE_TRIGGER_OVERWRITE;
+	struct bkey_s_c k = overwrite ? old : new;
+	unsigned nr_replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
+	s64 delta = (s64) k.k->size;
+	struct bch_fs_usage *usage;
+	unsigned slot;
+
+	/* This trigger must be invoked with BTREE_TRIGGER_GC set. */
+	BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+	/* Charged once per replica; an overwrite credits the sectors back. */
+	delta = (overwrite ? -delta : delta) * nr_replicas;
+
+	percpu_down_read(&c->mark_lock);
+	preempt_disable();
+
+	usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC);
+	slot = clamp_t(unsigned, nr_replicas, 1,
+		       ARRAY_SIZE(usage->persistent_reserved)) - 1;
+	usage->reserved			+= delta;
+	usage->persistent_reserved[slot]	+= delta;
+
+	preempt_enable();
+	percpu_up_read(&c->mark_lock);
+	return 0;
+}
+
+static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
+				 struct bkey_s_c_reflink_p p,
+				 u64 start, u64 end,
+				 u64 *idx, unsigned flags, size_t r_idx)
+{	/* One step of GC reflink marking: adjust the refcount of table entry r_idx, or report a missing range */
+	struct bch_fs *c = trans->c;
+	struct reflink_gc *r;
+	int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;	/* +1 unless this is an overwrite, then -1 */
+	u64 next_idx = end;	/* end of the range reported as missing, if any */
+	s64 ret = 0;
+	struct printbuf buf = PRINTBUF;
+
+	if (r_idx >= c->reflink_gc_nr)	/* past the last entry of the gc table */
+		goto not_found;
+
+	r = genradix_ptr(&c->reflink_gc_table, r_idx);
+	next_idx = min(next_idx, r->offset - r->size);	/* presumably the start of entry r - TODO confirm offset is its end */
+	if (*idx < next_idx)	/* *idx lies before this entry: there is a gap */
+		goto not_found;
+
+	BUG_ON((s64) r->refcount + add < 0);	/* the refcount must not go negative */
+
+	r->refcount += add;
+	*idx = r->offset;	/* caller continues from r->offset */
+	return 0;
+not_found:
+	if (fsck_err(c, "pointer to missing indirect extent\n"
+		     "  %s\n"
+		     "  missing range %llu-%llu",
+		     (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
+		     *idx, next_idx)) {
+		struct bkey_i_error *new;	/* replacement key of type KEY_TYPE_error for the missing range */
+
+		new = bch2_trans_kmalloc(trans, sizeof(*new));
+		ret = PTR_ERR_OR_ZERO(new);
+		if (ret)
+			goto err;
+
+		bkey_init(&new->k);
+		new->k.type	= KEY_TYPE_error;
+		new->k.p		= bkey_start_pos(p.k);
+		new->k.p.offset += *idx - start;	/* shift by how far *idx is from @start */
+		bch2_key_resize(&new->k, next_idx - *idx);	/* size = length of the missing range */
+		ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i,
+					  BTREE_TRIGGER_NORUN);
+	}
+
+	*idx = next_idx;	/* skip the missing range */
+err:
+fsck_err:	/* NOTE(review): presumably the label fsck_err() jumps to - confirm against the macro */
+	printbuf_exit(&buf);
+	return ret;
+}
+
+int bch2_mark_reflink_p(struct btree_trans *trans,
+			enum btree_id btree_id, unsigned level,
+			struct bkey_s_c old, struct bkey_s_c new,
+			unsigned flags)
+{	/* GC trigger for a reflink pointer: walks the reflink_gc table entries covering [idx, end) */
+	struct bch_fs *c = trans->c;
+	struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;	/* overwrite marks @old, otherwise @new */
+	struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+	struct reflink_gc *ref;
+	size_t l, r, m;	/* binary search bounds and midpoint */
+	u64 idx = le64_to_cpu(p.v->idx), start = idx;	/* start keeps the unpadded index */
+	u64 end = le64_to_cpu(p.v->idx) + p.k->size;
+	int ret = 0;
+
+	BUG_ON(!(flags & BTREE_TRIGGER_GC));	/* must be called with the GC flag set */
+
+	if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_reflink_p_fix) {
+		idx -= le32_to_cpu(p.v->front_pad);	/* widen the range by the pointer's padding */
+		end += le32_to_cpu(p.v->back_pad);
+	}
+
+	l = 0;
+	r = c->reflink_gc_nr;
+	while (l < r) {	/* on exit l is the first table index whose offset is > idx */
+		m = l + (r - l) / 2;
+
+		ref = genradix_ptr(&c->reflink_gc_table, m);
+		if (ref->offset <= idx)
+			l = m + 1;
+		else
+			r = m;
+	}
+
+	while (idx < end && !ret)	/* the helper advances idx; its s64 result is narrowed to int here */
+		ret = __bch2_mark_reflink_p(trans, p, start, end,
+					    &idx, flags, l++);
+
+	return ret;
+}
+
+void bch2_trans_fs_usage_revert(struct btree_trans *trans,
+				struct replicas_delta_list *deltas)
+{	/* Subtract @deltas from the filesystem usage counters again */
+	struct bch_fs *c = trans->c;
+	struct bch_fs_usage *dst;
+	struct replicas_delta *d, *top = (void *) deltas->d + deltas->used;	/* top: end of the used part of the list */
+	s64 added = 0;	/* sectors to hand back to trans->disk_res, if positive */
+	unsigned i;
+
+	percpu_down_read(&c->mark_lock);
+	preempt_disable();
+	dst = fs_usage_ptr(c, trans->journal_res.seq, false);
+
+	/* revert changes: */
+	for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
+		switch (d->r.data_type) {	/* only these three data types contribute to @added */
+		case BCH_DATA_btree:
+		case BCH_DATA_user:
+		case BCH_DATA_parity:
+			added += d->delta;
+		}
+		BUG_ON(__update_replicas(c, dst, &d->r, -d->delta));	/* failure to undo is treated as fatal */
+	}
+
+	dst->nr_inodes -= deltas->nr_inodes;
+
+	for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+		added				-= deltas->persistent_reserved[i];	/* NOTE(review): bch2_trans_fs_usage_apply() adds this term to its total - confirm the sign here */
+		dst->reserved			-= deltas->persistent_reserved[i];
+		dst->persistent_reserved[i]	-= deltas->persistent_reserved[i];
+	}
+
+	if (added > 0) {
+		trans->disk_res->sectors += added;	/* NOTE(review): disk_res is dereferenced without a NULL check - confirm callers */
+		this_cpu_add(*c->online_reserved, added);
+	}
+
+	preempt_enable();
+	percpu_up_read(&c->mark_lock);
+}
+
+int bch2_trans_fs_usage_apply(struct btree_trans *trans,
+			      struct replicas_delta_list *deltas)
+{	/* Fold @deltas into the fs usage counters; returns -1 (after undoing) if a replicas update fails */
+	struct bch_fs *c = trans->c;
+	static int warned_disk_usage = 0;	/* makes the inconsistency report below fire only once */
+	bool warn = false;
+	unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;	/* 0 when there is no reservation */
+	struct replicas_delta *d, *d2;
+	struct replicas_delta *top = (void *) deltas->d + deltas->used;	/* end of the used part of the list */
+	struct bch_fs_usage *dst;
+	s64 added = 0, should_not_have_added;	/* sectors consumed, and the excess over the reservation */
+	unsigned i;
+
+	percpu_down_read(&c->mark_lock);
+	preempt_disable();
+	dst = fs_usage_ptr(c, trans->journal_res.seq, false);
+
+	for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
+		switch (d->r.data_type) {	/* only these three data types contribute to @added */
+		case BCH_DATA_btree:
+		case BCH_DATA_user:
+		case BCH_DATA_parity:
+			added += d->delta;
+		}
+
+		if (__update_replicas(c, dst, &d->r, d->delta))	/* on failure, undo the entries already applied */
+			goto need_mark;
+	}
+
+	dst->nr_inodes += deltas->nr_inodes;
+
+	for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+		added				+= deltas->persistent_reserved[i];
+		dst->reserved			+= deltas->persistent_reserved[i];
+		dst->persistent_reserved[i]	+= deltas->persistent_reserved[i];
+	}
+
+	/*
+	 * Not allowed to reduce sectors_available except by getting a
+	 * reservation:
+	 */
+	should_not_have_added = added - (s64) disk_res_sectors;
+	if (unlikely(should_not_have_added > 0)) {
+		u64 old, new, v = atomic64_read(&c->sectors_available);
+
+		do {	/* cmpxchg loop: take the excess out of sectors_available, not going below 0 */
+			old = v;
+			new = max_t(s64, 0, old - should_not_have_added);
+		} while ((v = atomic64_cmpxchg(&c->sectors_available,
+					       old, new)) != old);
+
+		added -= should_not_have_added;	/* leaves added == disk_res_sectors */
+		warn = true;
+	}
+
+	if (added > 0) {	/* charge the consumed sectors to the reservation */
+		trans->disk_res->sectors -= added;
+		this_cpu_sub(*c->online_reserved, added);
+	}
+
+	preempt_enable();
+	percpu_up_read(&c->mark_lock);
+
+	if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
+		bch2_trans_inconsistent(trans,
+					"disk usage increased %lli more than %u sectors reserved)",	/* NOTE(review): message ends with an unmatched ')' */
+					should_not_have_added, disk_res_sectors);
+	return 0;
+need_mark:
+	/* revert changes: */
+	for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2))	/* only the entries before the failing one */
+		BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta));
+
+	preempt_enable();
+	percpu_up_read(&c->mark_lock);
+	return -1;
+}
+
+/* trans_mark: */
+
+static inline int bch2_trans_mark_pointer(struct btree_trans *trans,
+				   enum btree_id btree_id, unsigned level,
+				   struct bkey_s_c k, struct extent_ptr_decoded p,
+				   unsigned flags)
+{	/* Transactional marking of one extent pointer: updates its bucket's alloc key and, if not cached, its backpointer */
+	bool insert = !(flags & BTREE_TRIGGER_OVERWRITE);
+	struct btree_iter iter;
+	struct bkey_i_alloc_v4 *a;
+	struct bpos bucket;
+	struct bch_backpointer bp;
+	s64 sectors;
+	int ret;
+
+	bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp);	/* fills in bucket and bp */
+	sectors = bp.bucket_len;
+	if (!insert)	/* an overwrite removes the sectors */
+		sectors = -sectors;
+
+	a = bch2_trans_start_alloc_update(trans, &iter, bucket);
+	if (IS_ERR(a))
+		return PTR_ERR(a);
+
+	ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type,
+			     a->v.gen, &a->v.data_type,
+			     &a->v.dirty_sectors, &a->v.cached_sectors) ?:
+		bch2_trans_update(trans, &iter, &a->k_i, 0);	/* the update is queued only if __mark_pointer() returned 0 */
+	bch2_trans_iter_exit(trans, &iter);
+
+	if (ret)	/* any nonzero result is passed straight to the caller */
+		return ret;
+
+	if (!p.ptr.cached) {	/* backpointers are only modified for non-cached pointers */
+		ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
+			struct extent_ptr_decoded p,
+			s64 sectors, enum bch_data_type data_type)
+{	/* Account @sectors of an erasure-coded pointer against stripe p.ec.idx */
+	struct btree_iter iter;
+	struct bkey_i_stripe *s;
+	struct bch_replicas_padded r;
+	int ret = 0;
+
+	s = bch2_bkey_get_mut_typed(trans, &iter,
+			BTREE_ID_stripes, POS(0, p.ec.idx),
+			BTREE_ITER_WITH_UPDATES, stripe);	/* mutable copy of the stripe key */
+	ret = PTR_ERR_OR_ZERO(s);
+	if (unlikely(ret)) {
+		bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans,
+			"pointer to nonexistent stripe %llu",
+			(u64) p.ec.idx);	/* only an ENOENT-class error is reported as an inconsistency */
+		goto err;
+	}
+
+	if (!bch2_ptr_matches_stripe(&s->v, p)) {
+		bch2_trans_inconsistent(trans,
+			"stripe pointer doesn't match stripe %llu",
+			(u64) p.ec.idx);
+		ret = -EIO;
+		goto err;
+	}
+
+	stripe_blockcount_set(&s->v, p.ec.block,
+		stripe_blockcount_get(&s->v, p.ec.block) +
+		sectors);	/* block count += sectors (negative when called with a negative delta) */
+
+	bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
+	r.e.data_type = data_type;	/* use the caller's data type for the replicas entry */
+	ret = update_replicas_list(trans, &r.e, sectors);
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+int bch2_trans_mark_extent(struct btree_trans *trans,
+			   enum btree_id btree_id, unsigned level,
+			   struct bkey_s_c old, struct bkey_i *new,
+			   unsigned flags)
+{	/* Transactional trigger for extents and btree pointers: marks each pointer and the replicas usage */
+	struct bch_fs *c = trans->c;
+	struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
+		? old
+		: bkey_i_to_s_c(new);
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	struct bch_replicas_padded r;	/* replicas entry built from the dirty, non-EC pointers */
+	enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
+		? BCH_DATA_btree
+		: BCH_DATA_user;
+	s64 sectors = bkey_is_btree_ptr(k.k)
+		? btree_sectors(c)
+		: k.k->size;
+	s64 dirty_sectors = 0;
+	bool stale;
+	int ret = 0;
+
+	r.e.data_type	= data_type;
+	r.e.nr_devs	= 0;
+	r.e.nr_required	= 1;
+
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		s64 disk_sectors = ptr_disk_sectors(sectors, p);	/* scaled by the compression ratio if compressed */
+
+		if (flags & BTREE_TRIGGER_OVERWRITE)
+			disk_sectors = -disk_sectors;
+
+		ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags);
+		if (ret < 0)
+			return ret;
+
+		stale = ret > 0;	/* a positive result is treated as a stale pointer */
+
+		if (p.ptr.cached) {
+			if (!stale) {	/* stale cached pointers are not accounted */
+				ret = update_cached_sectors_list(trans, p.ptr.dev,
+								 disk_sectors);
+				if (ret)
+					return ret;
+			}
+		} else if (!p.has_ec) {	/* plain dirty pointer: collected into r */
+			dirty_sectors	       += disk_sectors;
+			r.e.devs[r.e.nr_devs++]	= p.ptr.dev;
+		} else {	/* erasure-coded pointer: accounted against its stripe */
+			ret = bch2_trans_mark_stripe_ptr(trans, p,
+					disk_sectors, data_type);
+			if (ret)
+				return ret;
+
+			r.e.nr_required = 0;
+		}
+	}
+
+	if (r.e.nr_devs)
+		ret = update_replicas_list(trans, &r.e, dirty_sectors);
+
+	return ret;	/* NOTE(review): if the last pointer was stale (ret > 0) and nr_devs == 0, that positive value is returned - confirm */
+}
+
+static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
+					 struct bkey_s_c_stripe s,
+					 unsigned idx, bool deleting)
+{	/* Attach block @idx of stripe @s to its bucket's alloc key, or detach it when @deleting */
+	struct bch_fs *c = trans->c;
+	const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
+	struct btree_iter iter;
+	struct bkey_i_alloc_v4 *a;
+	enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant
+		? BCH_DATA_parity : 0;	/* the last nr_redundant blocks are parity; 0 for the others */
+	s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0;	/* only parity blocks carry a sector count here */
+	int ret = 0;
+
+	if (deleting)
+		sectors = -sectors;
+
+	a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr));
+	if (IS_ERR(a))
+		return PTR_ERR(a);
+
+	ret = check_bucket_ref(trans, s.s_c, ptr, sectors, data_type,
+			       a->v.gen, a->v.data_type,
+			       a->v.dirty_sectors, a->v.cached_sectors);
+	if (ret)
+		goto err;
+
+	if (!deleting) {
+		if (bch2_trans_inconsistent_on(a->v.stripe ||
+					       a->v.stripe_redundancy, trans,
+				"bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
+				iter.pos.inode, iter.pos.offset, a->v.gen,
+				bch2_data_types[a->v.data_type],
+				a->v.dirty_sectors,
+				a->v.stripe, s.k->p.offset)) {	/* bucket already belongs to a stripe */
+			ret = -EIO;
+			goto err;
+		}
+
+		if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
+				"bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
+				iter.pos.inode, iter.pos.offset, a->v.gen,
+				bch2_data_types[a->v.data_type],
+				a->v.dirty_sectors,
+				s.k->p.offset)) {	/* a parity bucket must not already hold dirty sectors */
+			ret = -EIO;
+			goto err;
+		}
+
+		a->v.stripe		= s.k->p.offset;
+		a->v.stripe_redundancy	= s.v->nr_redundant;
+		a->v.data_type		= BCH_DATA_stripe;
+	} else {
+		if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset ||
+					       a->v.stripe_redundancy != s.v->nr_redundant, trans,
+				"bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
+				iter.pos.inode, iter.pos.offset, a->v.gen,
+				s.k->p.offset, a->v.stripe)) {	/* bucket is not marked as belonging to this stripe */
+			ret = -EIO;
+			goto err;
+		}
+
+		a->v.stripe		= 0;
+		a->v.stripe_redundancy	= 0;
+		a->v.data_type		= alloc_data_type(a->v, BCH_DATA_user);
+	}
+
+	a->v.dirty_sectors += sectors;
+	if (data_type)	/* parity blocks override the data type chosen above */
+		a->v.data_type = !deleting ? data_type : 0;
+
+	ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+	if (ret)
+		goto err;
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+int bch2_trans_mark_stripe(struct btree_trans *trans,
+			   enum btree_id btree_id, unsigned level,
+			   struct bkey_s_c old, struct bkey_i *new,
+			   unsigned flags)
+{	/* Transactional trigger for stripe keys: updates replicas usage and the buckets of changed blocks */
+	const struct bch_stripe *old_s = NULL;	/* stays NULL unless @old is a stripe key */
+	struct bch_stripe *new_s = NULL;	/* stays NULL unless @new is a stripe key */
+	struct bch_replicas_padded r;
+	unsigned i, nr_blocks;
+	int ret = 0;
+
+	if (old.k->type == KEY_TYPE_stripe)
+		old_s = bkey_s_c_to_stripe(old).v;
+	if (new->k.type == KEY_TYPE_stripe)
+		new_s = &bkey_i_to_stripe(new)->v;
+
+	/*
+	 * If the pointers aren't changing, we don't need to do anything:
+	 */
+	if (new_s && old_s &&
+	    new_s->nr_blocks	== old_s->nr_blocks &&
+	    new_s->nr_redundant	== old_s->nr_redundant &&
+	    !memcmp(old_s->ptrs, new_s->ptrs,
+		    new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
+		return 0;
+
+	BUG_ON(new_s && old_s &&
+	       (new_s->nr_blocks	!= old_s->nr_blocks ||
+		new_s->nr_redundant	!= old_s->nr_redundant));	/* an update may not change the stripe geometry */
+
+	nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;	/* assumes at least one of the two is a stripe - TODO confirm */
+
+	if (new_s) {	/* add the new stripe's parity sectors to replicas usage */
+		s64 sectors = le16_to_cpu(new_s->sectors);
+
+		bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new));
+		ret = update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
+		if (ret)
+			return ret;
+	}
+
+	if (old_s) {	/* subtract the old stripe's parity sectors */
+		s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
+
+		bch2_bkey_to_replicas(&r.e, old);
+		ret = update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
+		if (ret)
+			return ret;
+	}
+
+	for (i = 0; i < nr_blocks; i++) {
+		if (new_s && old_s &&
+		    !memcmp(&new_s->ptrs[i],
+			    &old_s->ptrs[i],
+			    sizeof(new_s->ptrs[i])))	/* this block's pointer is unchanged */
+			continue;
+
+		if (new_s) {
+			ret = bch2_trans_mark_stripe_bucket(trans,
+					bkey_i_to_s_c_stripe(new), i, false);
+			if (ret)
+				break;
+		}
+
+		if (old_s) {
+			ret = bch2_trans_mark_stripe_bucket(trans,
+					bkey_s_c_to_stripe(old), i, true);
+			if (ret)
+				break;
+		}
+	}
+
+	return ret;
+}
+
+int bch2_trans_mark_reservation(struct btree_trans *trans,
+				enum btree_id btree_id, unsigned level,
+				struct bkey_s_c old,
+				struct bkey_i *new,
+				unsigned flags)
+{
+	bool overwrite = flags & BTREE_TRIGGER_OVERWRITE;
+	struct bkey_s_c k = overwrite ? old : bkey_i_to_s_c(new);
+	unsigned nr_replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
+	s64 delta = (s64) k.k->size;
+	struct replicas_delta_list *deltas;
+	unsigned slot;
+	int ret;
+
+	/* Charged once per replica; an overwrite credits the sectors back. */
+	delta = (overwrite ? -delta : delta) * nr_replicas;
+
+	/* Presumably makes sure trans->fs_usage_deltas is allocated - confirm. */
+	ret = bch2_replicas_deltas_realloc(trans, 0);
+	if (ret)
+		return ret;
+
+	deltas = trans->fs_usage_deltas;
+	slot = clamp_t(unsigned, nr_replicas, 1,
+		       ARRAY_SIZE(deltas->persistent_reserved)) - 1;
+
+	deltas->persistent_reserved[slot] += delta;
+	return 0;
+}
+
+static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
+			struct bkey_s_c_reflink_p p,
+			u64 *idx, unsigned flags)
+{	/* Adjust the refcount of the indirect extent at *idx and advance *idx past it */
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct bkey_i *k;
+	__le64 *refcount;
+	int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;	/* +1 unless this is an overwrite, then -1 */
+	struct printbuf buf = PRINTBUF;
+	int ret;
+
+	k = bch2_bkey_get_mut_noupdate(trans, &iter,
+			BTREE_ID_reflink, POS(0, *idx),
+			BTREE_ITER_WITH_UPDATES);
+	ret = PTR_ERR_OR_ZERO(k);
+	if (ret)
+		goto err;
+
+	refcount = bkey_refcount(k);
+	if (!refcount) {	/* key found has no refcount: reported as a nonexistent indirect extent */
+		bch2_bkey_val_to_text(&buf, c, p.s_c);
+		bch2_trans_inconsistent(trans,
+			"nonexistent indirect extent at %llu while marking\n  %s",
+			*idx, buf.buf);
+		ret = -EIO;
+		goto err;
+	}
+
+	if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {	/* dropping a reference from a zero refcount */
+		bch2_bkey_val_to_text(&buf, c, p.s_c);
+		bch2_trans_inconsistent(trans,
+			"indirect extent refcount underflow at %llu while marking\n  %s",
+			*idx, buf.buf);
+		ret = -EIO;
+		goto err;
+	}
+
+	if (flags & BTREE_TRIGGER_INSERT) {	/* on insert, grow the pointer's pads from this extent's bounds */
+		struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;	/* casts away const to write the pads */
+		u64 pad;
+
+		pad = max_t(s64, le32_to_cpu(v->front_pad),
+			    le64_to_cpu(v->idx) - bkey_start_offset(&k->k));
+		BUG_ON(pad > U32_MAX);	/* must fit the 32-bit on-disk field */
+		v->front_pad = cpu_to_le32(pad);
+
+		pad = max_t(s64, le32_to_cpu(v->back_pad),
+			    k->k.p.offset - p.k->size - le64_to_cpu(v->idx));
+		BUG_ON(pad > U32_MAX);
+		v->back_pad = cpu_to_le32(pad);
+	}
+
+	le64_add_cpu(refcount, add);
+
+	bch2_btree_iter_set_pos_to_extent_start(&iter);
+	ret = bch2_trans_update(trans, &iter, k, 0);
+	if (ret)
+		goto err;
+
+	*idx = k->k.p.offset;	/* continue after this indirect extent */
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	printbuf_exit(&buf);
+	return ret;
+}
+
+int bch2_trans_mark_reflink_p(struct btree_trans *trans,
+			      enum btree_id btree_id, unsigned level,
+			      struct bkey_s_c old,
+			      struct bkey_i *new,
+			      unsigned flags)
+{	/* Transactional trigger for a reflink pointer: adjusts every indirect extent in its padded range */
+	struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
+		? old
+		: bkey_i_to_s_c(new);
+	struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+	u64 idx, end_idx;
+	int ret = 0;
+
+	if (flags & BTREE_TRIGGER_INSERT) {
+		struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;	/* casts away const to write the pads */
+
+		v->front_pad = v->back_pad = 0;	/* pads restart at zero; the helper may grow them */
+	}
+
+	idx	= le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
+	end_idx = le64_to_cpu(p.v->idx) + p.k->size +
+		le32_to_cpu(p.v->back_pad);
+
+	while (idx < end_idx && !ret)	/* the helper advances idx on success */
+		ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags);
+
+	return ret;
+}
+
+static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
+				    struct bch_dev *ca, size_t b,
+				    enum bch_data_type type,
+				    unsigned sectors)
+{	/* Set bucket @b of @ca to hold @sectors of metadata of @type */
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct bkey_i_alloc_v4 *a;
+	int ret = 0;
+
+	/*
+	 * Backup superblock might be past the end of our normal usable space:
+	 */
+	if (b >= ca->mi.nbuckets)
+		return 0;
+
+	a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b));
+	if (IS_ERR(a))
+		return PTR_ERR(a);
+
+	if (a->v.data_type && type && a->v.data_type != type) {	/* bucket already holds a different data type */
+		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+			"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
+			"while marking %s",
+			iter.pos.inode, iter.pos.offset, a->v.gen,
+			bch2_data_types[a->v.data_type],
+			bch2_data_types[type],
+			bch2_data_types[type]);	/* NOTE(review): same value as the previous argument - confirm intended */
+		ret = -EIO;
+		goto out;
+	}
+
+	a->v.data_type		= type;
+	a->v.dirty_sectors	= sectors;	/* assigned, not accumulated */
+
+	ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+	if (ret)
+		goto out;
+out:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
+				    struct bch_dev *ca, size_t b,
+				    enum bch_data_type type,
+				    unsigned sectors)
+{	/* Runs __bch2_trans_mark_metadata_bucket() under commit_do() */
+	return commit_do(trans, NULL, NULL, 0,
+			__bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
+}
+
+static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
+					    struct bch_dev *ca,
+					    u64 start, u64 end,
+					    enum bch_data_type type,
+					    u64 *bucket, unsigned *bucket_sectors)
+{	/* Walk sectors [start, end) bucket by bucket, accumulating per-bucket counts in *bucket / *bucket_sectors */
+	do {
+		u64 b = sector_to_bucket(ca, start);
+		unsigned sectors =
+			min_t(u64, bucket_to_sector(ca, b + 1), end) - start;	/* up to the end of bucket b or @end */
+
+		if (b != *bucket && *bucket_sectors) {	/* moved to a new bucket: write out the previous one */
+			int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket,
+								  type, *bucket_sectors);
+			if (ret)
+				return ret;
+
+			*bucket_sectors = 0;
+		}
+
+		*bucket		= b;
+		*bucket_sectors	+= sectors;
+		start += sectors;
+	} while (start < end);
+
+	return 0;	/* the last bucket's count stays in *bucket_sectors and is not written here */
+}
+
+static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
+				    struct bch_dev *ca)
+{	/* Mark the buckets holding @ca's superblocks (BCH_DATA_sb) and journal (BCH_DATA_journal) */
+	struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+	u64 bucket = 0;
+	unsigned i, bucket_sectors = 0;	/* bucket/bucket_sectors: running state shared across the calls below */
+	int ret;
+
+	for (i = 0; i < layout->nr_superblocks; i++) {
+		u64 offset = le64_to_cpu(layout->sb_offset[i]);
+
+		if (offset == BCH_SB_SECTOR) {	/* also mark sectors [0, BCH_SB_SECTOR) ahead of this superblock */
+			ret = bch2_trans_mark_metadata_sectors(trans, ca,
+						0, BCH_SB_SECTOR,
+						BCH_DATA_sb, &bucket, &bucket_sectors);
+			if (ret)
+				return ret;
+		}
+
+		ret = bch2_trans_mark_metadata_sectors(trans, ca, offset,
+				      offset + (1 << layout->sb_max_size_bits),
+				      BCH_DATA_sb, &bucket, &bucket_sectors);
+		if (ret)
+			return ret;
+	}
+
+	if (bucket_sectors) {	/* write out the count still pending for the last bucket */
+		ret = bch2_trans_mark_metadata_bucket(trans, ca,
+				bucket, BCH_DATA_sb, bucket_sectors);
+		if (ret)
+			return ret;
+	}
+
+	for (i = 0; i < ca->journal.nr; i++) {	/* each journal bucket is marked with a full bucket_size */
+		ret = bch2_trans_mark_metadata_bucket(trans, ca,
+				ca->journal.buckets[i],
+				BCH_DATA_journal, ca->mi.bucket_size);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
+{	/* Runs __bch2_trans_mark_dev_sb() via bch2_trans_run() and logs any error */
+	int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca));	/* trans is presumably supplied by the macro - confirm */
+
+	if (ret)
+		bch_err_fn(c, ret);
+	return ret;
+}
+
+/* Disk reservations: */
+
+#define SECTORS_CACHE	1024	/* extra sectors moved into the per-cpu pool on each refill */
+
+int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
+			      u64 sectors, int flags)
+{	/* Reserve @sectors for @res: per-cpu pool first, then the global counter, then a full recount */
+	struct bch_fs_pcpu *pcpu;
+	u64 old, v, get;
+	s64 sectors_available;
+	int ret;
+
+	percpu_down_read(&c->mark_lock);
+	preempt_disable();
+	pcpu = this_cpu_ptr(c->pcpu);
+
+	if (sectors <= pcpu->sectors_available)	/* fast path: this CPU's pool covers the request */
+		goto out;
+
+	v = atomic64_read(&c->sectors_available);
+	do {	/* cmpxchg loop: take sectors + SECTORS_CACHE, or all that is left, from the global counter */
+		old = v;
+		get = min((u64) sectors + SECTORS_CACHE, old);
+
+		if (get < sectors) {	/* the global counter cannot cover the request */
+			preempt_enable();
+			goto recalculate;
+		}
+	} while ((v = atomic64_cmpxchg(&c->sectors_available,
+				       old, old - get)) != old);
+
+	pcpu->sectors_available		+= get;
+
+out:
+	pcpu->sectors_available		-= sectors;
+	this_cpu_add(*c->online_reserved, sectors);
+	res->sectors			+= sectors;
+
+	preempt_enable();
+	percpu_up_read(&c->mark_lock);
+	return 0;
+
+recalculate:	/* reached with preemption re-enabled and mark_lock still held for read */
+	mutex_lock(&c->sectors_available_lock);
+
+	percpu_u64_set(&c->pcpu->sectors_available, 0);	/* per-cpu pools are zeroed before the recount */
+	sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
+
+	if (sectors <= sectors_available ||
+	    (flags & BCH_DISK_RESERVATION_NOFAIL)) {	/* NOFAIL requests are granted even when short */
+		atomic64_set(&c->sectors_available,
+			     max_t(s64, 0, sectors_available - sectors));
+		this_cpu_add(*c->online_reserved, sectors);
+		res->sectors			+= sectors;
+		ret = 0;
+	} else {
+		atomic64_set(&c->sectors_available, sectors_available);
+		ret = -BCH_ERR_ENOSPC_disk_reservation;
+	}
+
+	mutex_unlock(&c->sectors_available_lock);
+	percpu_up_read(&c->mark_lock);
+
+	return ret;
+}
+
+/* Startup/shutdown: */
+
+static void bucket_gens_free_rcu(struct rcu_head *rcu)
+{	/* RCU callback: frees the bucket_gens that embeds @rcu */
+	struct bucket_gens *buckets =
+		container_of(rcu, struct bucket_gens, rcu);
+
+	kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets);	/* size is derived from ->nbuckets, so that field must be valid */
+}
+
+int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
+{	/* Install a bucket_gens array (and optional nouse bitmap) sized for @nbuckets; returns 0 or a -BCH_ERR_ENOMEM_* code */
+	struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
+	unsigned long *buckets_nouse = NULL;
+	bool resize = ca->bucket_gens != NULL;	/* false when the device has no bucket_gens yet */
+	int ret;
+
+	if (!(bucket_gens	= kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
+					    GFP_KERNEL|__GFP_ZERO))) {
+		ret = -BCH_ERR_ENOMEM_bucket_gens;
+		goto err;
+	}
+
+	bucket_gens->first_bucket = ca->mi.first_bucket;
+	bucket_gens->nbuckets	= nbuckets;	/* set before any later "goto err": bucket_gens_free_rcu() sizes its free from this */
+
+	if ((c->opts.buckets_nouse &&
+	     !(buckets_nouse	= kvpmalloc(BITS_TO_LONGS(nbuckets) *
+					    sizeof(unsigned long),
+					    GFP_KERNEL|__GFP_ZERO)))) {
+		ret = -BCH_ERR_ENOMEM_buckets_nouse;
+		goto err;
+	}
+
+	bch2_copygc_stop(c);
+
+	if (resize) {	/* locks are only taken when replacing an existing array */
+		down_write(&c->gc_lock);
+		down_write(&ca->bucket_lock);
+		percpu_down_write(&c->mark_lock);
+	}
+
+	old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
+
+	if (resize) {
+		size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets);	/* entries common to old and new */
+
+		memcpy(bucket_gens->b,
+		       old_bucket_gens->b,
+		       n);
+		if (buckets_nouse)
+			memcpy(buckets_nouse,
+			       ca->buckets_nouse,
+			       BITS_TO_LONGS(n) * sizeof(unsigned long));
+	}
+
+	rcu_assign_pointer(ca->bucket_gens, bucket_gens);
+	bucket_gens	= old_bucket_gens;	/* from here on this names the array to be freed */
+
+	swap(ca->buckets_nouse, buckets_nouse);	/* buckets_nouse now holds the old bitmap, if any */
+
+	nbuckets = ca->mi.nbuckets;	/* size used below to free the old bitmap */
+
+	if (resize) {
+		percpu_up_write(&c->mark_lock);
+		up_write(&ca->bucket_lock);
+		up_write(&c->gc_lock);
+	}
+
+	ret = 0;
+err:	/* shared exit: on failure frees the unpublished allocations, on success the replaced ones */
+	kvpfree(buckets_nouse,
+		BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
+	if (bucket_gens)
+		call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
+
+	return ret;
+}
+
+void bch2_dev_buckets_free(struct bch_dev *ca)
+{	/* Free @ca's nouse bitmap, bucket_gens array and usage counters */
+	unsigned i;
+
+	kvpfree(ca->buckets_nouse,
+		BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));	/* sized from ca->mi.nbuckets */
+	kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
+		sizeof(struct bucket_gens) + ca->mi.nbuckets);	/* freed directly, not via call_rcu() */
+
+	for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+		free_percpu(ca->usage[i]);
+	kfree(ca->usage_base);
+}
+
+int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
+{	/* Allocate @ca's usage counters, then its bucket arrays via bch2_dev_buckets_resize() */
+	unsigned i;
+
+	ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
+	if (!ca->usage_base)
+		return -BCH_ERR_ENOMEM_usage_init;
+
+	for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
+		ca->usage[i] = alloc_percpu(struct bch_dev_usage);
+		if (!ca->usage[i])
+			return -BCH_ERR_ENOMEM_usage_init;	/* NOTE(review): earlier allocations are not freed here - presumably the caller cleans up, confirm */
+	}
+
+	return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
+}
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
new file mode 100644
index 000000000000..bf8d7f407e9c
--- /dev/null
+++ b/fs/bcachefs/buckets.h
@@ -0,0 +1,443 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Code for manipulating bucket marks for garbage collection.
+ *
+ * Copyright 2014 Datera, Inc.
+ */
+
+#ifndef _BUCKETS_H
+#define _BUCKETS_H
+
+#include "buckets_types.h"
+#include "extents.h"
+#include "sb-members.h"
+
+static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s)
+{	/* sector number divided by the device's bucket size */
+	return div_u64(s, ca->mi.bucket_size);
+}
+
+static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
+{	/* bucket number times bucket size, computed in sector_t */
+	return ((sector_t) b) * ca->mi.bucket_size;
+}
+
+static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
+{
+	u32 offset_in_bucket;	/* filled in by div_u64_rem(); its quotient is discarded */
+
+	div_u64_rem(s, ca->mi.bucket_size, &offset_in_bucket);
+	return offset_in_bucket;
+}
+
+static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s,
+						 u32 *offset)
+{	/* returns s / bucket_size and stores the remainder in *offset */
+	return div_u64_rem(s, ca->mi.bucket_size, offset);
+}
+
+#define for_each_bucket(_b, _buckets)	/* _b walks (_buckets)->b from index first_bucket up to nbuckets */	\
+	for (_b = (_buckets)->b + (_buckets)->first_bucket;	\
+	     _b < (_buckets)->b + (_buckets)->nbuckets; _b++)
+
+/*
+ * Ugly hack alert:
+ *
+ * We need to cram a spinlock in a single byte, because that's what we have left
+ * in struct bucket, and we care about the size of these - during fsck, we need
+ * in memory state for every single bucket on every device.
+ *
+ * We used to do
+ *   while (xchg(&b->lock, 1)) cpu_relax();
+ * but, it turns out not all architectures support xchg on a single byte.
+ *
+ * So now we use bit_spin_lock(), with fun games since we can't burn a whole
+ * ulong for this - we just need to make sure the lock bit always ends up in the
+ * first byte.
+ */
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define BUCKET_LOCK_BITNR	0	/* little endian: bit 0 of the long */
+#else
+#define BUCKET_LOCK_BITNR	(BITS_PER_LONG - 1)	/* otherwise: the highest bit of the long */
+#endif
+
+union ulong_byte_assert {	/* overlays a ulong with a u8; used by the BUILD_BUG_ON() in bucket_unlock() */
+	ulong	ulong;
+	u8	byte;
+};
+
+static inline void bucket_unlock(struct bucket *b)
+{	/* release the bucket's bit lock and wake waiters on that bit */
+	BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);	/* build-time check: the lock bit must be visible through the u8 member */
+
+	clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock);
+	wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR);
+}
+
+static inline void bucket_lock(struct bucket *b)
+{	/* take the bucket's bit lock, waiting in TASK_UNINTERRUPTIBLE */
+	wait_on_bit_lock((void *) &b->lock, BUCKET_LOCK_BITNR,
+			 TASK_UNINTERRUPTIBLE);
+}
+
+static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
+{	/* RCU-dereference ca->buckets_gc; the conditions below are passed as the rcu_dereference_check() condition */
+	return rcu_dereference_check(ca->buckets_gc,
+				     !ca->fs ||
+				     percpu_rwsem_is_held(&ca->fs->mark_lock) ||
+				     lockdep_is_held(&ca->fs->gc_lock) ||
+				     lockdep_is_held(&ca->bucket_lock));
+}
+
+static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
+{	/* pointer to entry @b of the gc bucket array */
+	struct bucket_array *buckets = gc_bucket_array(ca);
+
+	BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);	/* @b must lie in [first_bucket, nbuckets) */
+	return buckets->b + b;
+}
+
+static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
+{	/* RCU-dereference ca->bucket_gens; the conditions below are passed as the rcu_dereference_check() condition */
+	return rcu_dereference_check(ca->bucket_gens,
+				     !ca->fs ||
+				     percpu_rwsem_is_held(&ca->fs->mark_lock) ||
+				     lockdep_is_held(&ca->fs->gc_lock) ||
+				     lockdep_is_held(&ca->bucket_lock));
+}
+
+static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
+{	/* pointer to the generation byte of bucket @b */
+	struct bucket_gens *gens = bucket_gens(ca);
+
+	BUG_ON(b < gens->first_bucket || b >= gens->nbuckets);	/* @b must lie in [first_bucket, nbuckets) */
+	return gens->b + b;
+}
+
+static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
+				   const struct bch_extent_ptr *ptr)
+{	/* bucket number containing ptr->offset on @ca */
+	return sector_to_bucket(ca, ptr->offset);
+}
+
+static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c,
+				   const struct bch_extent_ptr *ptr)
+{	/* POS(device, bucket number) for the bucket @ptr points into */
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+	return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
+}
+
+static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_fs *c,
+						const struct bch_extent_ptr *ptr,
+						u32 *bucket_offset)
+{	/* like PTR_BUCKET_POS(), and also stores the sector offset within the bucket in *bucket_offset */
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+	return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset));
+}
+
+static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca,
+					   const struct bch_extent_ptr *ptr)
+{	/* gc bucket entry for the bucket @ptr points into */
+	return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr));
+}
+
+static inline enum bch_data_type ptr_data_type(const struct bkey *k,
+					       const struct bch_extent_ptr *ptr)
+{
+	if (!bkey_is_btree_ptr(k))	/* not a btree pointer: classified by the cached bit */
+		return ptr->cached ? BCH_DATA_cached : BCH_DATA_user;
+
+	return BCH_DATA_btree;
+}
+
+static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
+{	/* @sectors scaled by compressed_size / uncompressed_size (rounded up) when p.crc is compressed, else unchanged */
+	EBUG_ON(sectors < 0);
+
+	return crc_is_compressed(p.crc)
+		? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
+				   p.crc.uncompressed_size)
+		: sectors;
+}
+
+static inline int gen_cmp(u8 a, u8 b)
+{	/* difference a - b reinterpreted as a signed 8-bit value */
+	return (s8) (a - b);
+}
+
+static inline int gen_after(u8 a, u8 b)
+{
+	int ahead = gen_cmp(a, b);	/* signed 8-bit difference a - b */
+
+	return ahead <= 0 ? 0 : ahead;
+}
+
+/**
+ * ptr_stale() - check if a pointer points into a bucket that has been
+ * invalidated. Returns 0 if not, else gen_after(bucket gen, ptr->gen).
+ */
+static inline u8 ptr_stale(struct bch_dev *ca,
+			   const struct bch_extent_ptr *ptr)
+{
+	u8 ret;
+
+	rcu_read_lock();	/* the bucket gen is read inside an RCU read-side section */
+	ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen);
+	rcu_read_unlock();
+
+	return ret;
+}
+
+/* Device usage: */
+
+void bch2_dev_usage_read_fast(struct bch_dev *, struct bch_dev_usage *);
+static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
+{	/* by-value wrapper around bch2_dev_usage_read_fast() */
+	struct bch_dev_usage ret;
+
+	bch2_dev_usage_read_fast(ca, &ret);
+	return ret;
+}
+
+void bch2_dev_usage_init(struct bch_dev *);
+
+static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark)
+{	/* buckets held back at @watermark; each case falls through, so earlier cases include all later terms */
+	s64 reserved = 0;
+
+	switch (watermark) {
+	case BCH_WATERMARK_NR:	/* not a valid watermark */
+		BUG();
+	case BCH_WATERMARK_stripe:
+		reserved += ca->mi.nbuckets >> 6;
+		fallthrough;
+	case BCH_WATERMARK_normal:
+		reserved += ca->mi.nbuckets >> 6;
+		fallthrough;
+	case BCH_WATERMARK_copygc:
+		reserved += ca->nr_btree_reserve;
+		fallthrough;
+	case BCH_WATERMARK_btree:
+		reserved += ca->nr_btree_reserve;
+		fallthrough;
+	case BCH_WATERMARK_btree_copygc:
+	case BCH_WATERMARK_reclaim:	/* these two reserve nothing */
+		break;
+	}
+
+	return reserved;
+}
+
+static inline u64 dev_buckets_free(struct bch_dev *ca,
+				   struct bch_dev_usage usage,
+				   enum bch_watermark watermark)
+{	/* free buckets minus open buckets minus the watermark reserve, not below 0 */
+	return max_t(s64, 0,
+		     usage.d[BCH_DATA_free].buckets -
+		     ca->nr_open_buckets -
+		     bch2_dev_buckets_reserved(ca, watermark));
+}
+
+static inline u64 __dev_buckets_available(struct bch_dev *ca,
+					  struct bch_dev_usage usage,
+					  enum bch_watermark watermark)
+{
+	s64 nr = usage.d[BCH_DATA_free].buckets
+	       + usage.d[BCH_DATA_cached].buckets
+	       + usage.d[BCH_DATA_need_gc_gens].buckets
+	       + usage.d[BCH_DATA_need_discard].buckets
+	       - ca->nr_open_buckets - bch2_dev_buckets_reserved(ca, watermark);
+
+	return max_t(s64, 0, nr);	/* clamp at zero */
+}
+
+static inline u64 dev_buckets_available(struct bch_dev *ca,
+					enum bch_watermark watermark)
+{
+	return __dev_buckets_available(ca, bch2_dev_usage_read(ca), watermark);
+}
+
+/* Filesystem usage: */
+
+static inline unsigned __fs_usage_u64s(unsigned nr_replicas)
+{
+	return sizeof(struct bch_fs_usage) / sizeof(u64) + nr_replicas;	/* header + replicas[] */
+}
+
+static inline unsigned fs_usage_u64s(struct bch_fs *c)
+{
+	return __fs_usage_u64s(READ_ONCE(c->replicas.nr));
+}
+
+static inline unsigned __fs_usage_online_u64s(unsigned nr_replicas)
+{
+	return sizeof(struct bch_fs_usage_online) / sizeof(u64) + nr_replicas;	/* in u64s */
+}
+
+static inline unsigned fs_usage_online_u64s(struct bch_fs *c)
+{
+	return __fs_usage_online_u64s(READ_ONCE(c->replicas.nr));
+}
+
+static inline unsigned dev_usage_u64s(void)
+{
+	return sizeof(struct bch_dev_usage) / sizeof(u64);
+}
+
+u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *);
+
+struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *);
+
+void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned);
+
+void bch2_fs_usage_to_text(struct printbuf *,
+			   struct bch_fs *, struct bch_fs_usage_online *);
+
+u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *);
+
+struct bch_fs_usage_short
+bch2_fs_usage_read_short(struct bch_fs *);
+
+/* key/bucket marking: */
+
+static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
+						unsigned journal_seq,
+						bool gc)
+{
+	percpu_rwsem_assert_held(&c->mark_lock);
+	BUG_ON(!gc && !journal_seq);
+
+	if (gc)		/* gc has its own counters */
+		return this_cpu_ptr(c->usage_gc);
+	return this_cpu_ptr(c->usage[journal_seq & JOURNAL_BUF_MASK]);
+}
+
+int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned);
+
+void bch2_fs_usage_initialize(struct bch_fs *);
+
+int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
+			      size_t, enum bch_data_type, unsigned,
+			      struct gc_pos, unsigned);
+
+int bch2_mark_alloc(struct btree_trans *, enum btree_id, unsigned,
+		    struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_extent(struct btree_trans *, enum btree_id, unsigned,
+		     struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_stripe(struct btree_trans *, enum btree_id, unsigned,
+		     struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_reservation(struct btree_trans *, enum btree_id, unsigned,
+			  struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned,
+			struct bkey_s_c, struct bkey_s_c, unsigned);
+
+int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+
+void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
+int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
+
+int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
+				    size_t, enum bch_data_type, unsigned);
+int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *);
+
+static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
+{
+	struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+	u64 b_offset	= bucket_to_sector(ca, b);
+	u64 b_end	= bucket_to_sector(ca, b + 1);
+	unsigned i;
+
+	if (!b)		/* bucket 0 is always reported as a superblock bucket */
+		return true;
+
+	for (i = 0; i < layout->nr_superblocks; i++) {
+		u64 offset = le64_to_cpu(layout->sb_offset[i]);
+		u64 end = offset + (1ULL << layout->sb_max_size_bits);	/* shift in 64 bits, not int */
+
+		if (offset < b_end && end > b_offset)	/* sector ranges overlap */
+			return true;
+	}
+
+	return false;
+}
+
+/* disk reservations: */
+
+static inline void bch2_disk_reservation_put(struct bch_fs *c,
+					     struct disk_reservation *res)
+{
+	if (!res->sectors)	/* nothing held: leave the counter alone */
+		return;
+	this_cpu_sub(*c->online_reserved, res->sectors);
+	res->sectors = 0;
+}
+
+#define BCH_DISK_RESERVATION_NOFAIL		(1 << 0)
+
+int __bch2_disk_reservation_add(struct bch_fs *,
+				struct disk_reservation *,
+				u64, int);
+
+static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
+					    u64 sectors, int flags)
+{
+#ifdef __KERNEL__
+	u64 old, new;
+
+	do {	/* fast path: take @sectors from this CPU's sectors_available */
+		old = this_cpu_read(c->pcpu->sectors_available);
+		if (sectors > old)	/* not enough here: defer to the out-of-line path */
+			return __bch2_disk_reservation_add(c, res, sectors, flags);
+
+		new = old - sectors;	/* loop retries if the counter changed meanwhile */
+	} while (this_cpu_cmpxchg(c->pcpu->sectors_available, old, new) != old);
+
+	this_cpu_add(*c->online_reserved, sectors);
+	res->sectors			+= sectors;
+	return 0;
+#else
+	return __bch2_disk_reservation_add(c, res, sectors, flags);
+#endif
+}
+
+static inline struct disk_reservation
+bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
+{
+	return (struct disk_reservation) {
+		.sectors	= 0,
+#if 0
+		/* not used yet: */
+		.gen		= c->capacity_gen,
+#endif
+		.nr_replicas	= nr_replicas,
+	};
+}
+
+static inline int bch2_disk_reservation_get(struct bch_fs *c,
+					    struct disk_reservation *res,
+					    u64 sectors, unsigned nr_replicas,
+					    int flags)
+{
+	*res = bch2_disk_reservation_init(c, nr_replicas);
+	/* @res is reset first; @sectors is per replica, so reserve sectors * nr_replicas */
+	return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags);
+}
+
+#define RESERVE_FACTOR	6
+
+static inline u64 avail_factor(u64 r)
+{
+	return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1);	/* r * 64/65 */
+}
+
+int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64);
+void bch2_dev_buckets_free(struct bch_dev *);
+int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *);
+
+#endif /* _BUCKETS_H */
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
new file mode 100644
index 000000000000..2a9dab9006ef
--- /dev/null
+++ b/fs/bcachefs/buckets_types.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_TYPES_H
+#define _BUCKETS_TYPES_H
+
+#include "bcachefs_format.h"
+#include "util.h"
+
+#define BUCKET_JOURNAL_SEQ_BITS		16
+
+struct bucket {
+	u8			lock;
+	u8			gen_valid:1;
+	u8			data_type:7;
+	u8			gen;
+	u8			stripe_redundancy;
+	u32			stripe;
+	u32			dirty_sectors;
+	u32			cached_sectors;
+};
+
+struct bucket_array {
+	struct rcu_head		rcu;
+	u16			first_bucket;
+	size_t			nbuckets;
+	struct bucket		b[];
+};
+
+struct bucket_gens {
+	struct rcu_head		rcu;
+	u16			first_bucket;
+	size_t			nbuckets;
+	u8			b[];
+};
+
+struct bch_dev_usage {
+	u64			buckets_ec;
+
+	struct {
+		u64		buckets;
+		u64		sectors; /* _compressed_ sectors: */
+		/*
+		 * XXX
+		 * Why do we have this? Isn't it just buckets * bucket_size -
+		 * sectors?
+		 */
+		u64		fragmented;
+	}			d[BCH_DATA_NR];
+};
+
+struct bch_fs_usage {
+	/* all fields are in units of 512 byte sectors: */
+	u64			hidden;
+	u64			btree;
+	u64			data;
+	u64			cached;
+	u64			reserved;
+	u64			nr_inodes;
+
+	/* XXX: add stats for compression ratio */
+#if 0
+	u64			uncompressed;
+	u64			compressed;
+#endif
+
+	/* broken out: */
+
+	u64			persistent_reserved[BCH_REPLICAS_MAX];
+	u64			replicas[];
+};
+
+struct bch_fs_usage_online {
+	u64			online_reserved;
+	struct bch_fs_usage	u;
+};
+
+struct bch_fs_usage_short {
+	u64			capacity;
+	u64			used;
+	u64			free;
+	u64			nr_inodes;
+};
+
+/*
+ * A reservation for space on disk:
+ */
+struct disk_reservation {
+	u64			sectors;	/* sectors currently held; zeroed by _put() */
+	u32			gen;		/* not used yet */
+	unsigned		nr_replicas;
+};
+
+#endif /* _BUCKETS_TYPES_H */
diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c
new file mode 100644
index 000000000000..ec1b636ef78d
--- /dev/null
+++ b/fs/bcachefs/buckets_waiting_for_journal.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "buckets_waiting_for_journal.h"
+#include <linux/hash.h>
+#include <linux/random.h>
+
+static inline struct bucket_hashed *
+bucket_hash(struct buckets_waiting_for_journal_table *t,
+	    unsigned hash_seed_idx, u64 dev_bucket)
+{
+	return &t->d[hash_64(t->hash_seeds[hash_seed_idx] ^ dev_bucket, t->bits)];
+}
+
+static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t bits)
+{
+	unsigned i;
+
+	t->bits = bits;
+	for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++)	/* fresh random seed per hash */
+		get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i]));
+	memset(t->d, 0, sizeof(t->d[0]) << t->bits);	/* zero all 1 << bits slots */
+}
+
+bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
+				      u64 flushed_seq,
+				      unsigned dev, u64 bucket)
+{
+	const u64 key = (u64) dev << 56 | bucket;
+	struct buckets_waiting_for_journal_table *tbl;
+	bool waiting = false;
+	unsigned seed;
+
+	mutex_lock(&b->lock);
+	tbl = b->t;
+
+	/* Probe the one candidate slot per hash seed; the first key match decides. */
+	for (seed = 0; seed < ARRAY_SIZE(tbl->hash_seeds); seed++) {
+		struct bucket_hashed *slot = bucket_hash(tbl, seed, key);
+
+		if (slot->dev_bucket != key)
+			continue;
+		waiting = slot->journal_seq > flushed_seq;
+		break;
+	}
+	mutex_unlock(&b->lock);
+
+	return waiting;
+}
+
+static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t,
+				struct bucket_hashed *new,
+				u64 flushed_seq)
+{
+	struct bucket_hashed *last_evicted = NULL;
+	unsigned tries, i;
+
+	for (tries = 0; tries < 10; tries++) {	/* at most 10 eviction rounds */
+		struct bucket_hashed *old, *victim = NULL;
+
+		for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
+			old = bucket_hash(t, i, new->dev_bucket);
+			/* Reuse a slot holding the same key or an already-flushed entry: */
+			if (old->dev_bucket == new->dev_bucket ||
+			    old->journal_seq <= flushed_seq) {
+				*old = *new;
+				return true;
+			}
+			/* Otherwise note a slot to evict, but never the one just written: */
+			if (last_evicted != old)
+				victim = old;
+		}
+
+		/* every seed hashed to the slot we evicted from last round: */
+		if (!victim)
+			break;
+
+		/* No reusable slot: store *new in the victim's slot, retry with the evicted entry */
+		swap(*new, *victim);
+		last_evicted = victim;
+	}
+
+	return false;	/* *new now holds whichever entry was left without a slot */
+}
+
+int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
+					 u64 flushed_seq,
+					 unsigned dev, u64 bucket,
+					 u64 journal_seq)
+{
+	struct buckets_waiting_for_journal_table *t, *n;
+	struct bucket_hashed tmp, new = {
+		.dev_bucket	= (u64) dev << 56 | bucket,
+		.journal_seq	= journal_seq,
+	};
+	size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0;
+	int ret = 0;
+
+	mutex_lock(&b->lock);
+
+	if (likely(bucket_table_insert(b->t, &new, flushed_seq)))
+		goto out;
+	/* No slot found: count unflushed entries (plus the new one) to size a rebuild. */
+	t = b->t;
+	size = 1UL << t->bits;
+	for (i = 0; i < size; i++)
+		nr_elements += t->d[i].journal_seq > flushed_seq;
+	/* Grow by one bit if more than a third of the slots would be in use: */
+	new_bits = t->bits + (nr_elements * 3 > size);
+
+	n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL);
+	if (!n) {
+		ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set;
+		goto out;
+	}
+
+retry_rehash:
+	nr_rehashes++;
+	bucket_table_init(n, new_bits);
+	/* @new may hold an entry displaced by the failed insert above; it goes in first: */
+	tmp = new;
+	BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq));
+
+	for (i = 0; i < 1UL << t->bits; i++) {
+		if (t->d[i].journal_seq <= flushed_seq)	/* already flushed: drop it */
+			continue;
+
+		tmp = t->d[i];
+		if (!bucket_table_insert(n, &tmp, flushed_seq))
+			goto retry_rehash;	/* reseed the new table and start over */
+	}
+
+	b->t = n;
+	kvfree(t);
+
+	pr_debug("took %zu rehashes, table at %zu/%lu elements",
+		 nr_rehashes, nr_elements, 1UL << b->t->bits);
+out:
+	mutex_unlock(&b->lock);
+
+	return ret;
+}
+
+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c)
+{
+	struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+
+	kvfree(b->t);	/* table from _init() or from a later rehash */
+}
+
+#define INITIAL_TABLE_BITS		3
+
+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c)
+{
+	struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+
+	mutex_init(&b->lock);
+	/* header plus 1 << INITIAL_TABLE_BITS slots; the table is regrown on demand */
+	b->t = kvmalloc(sizeof(*b->t) +
+			(sizeof(b->t->d[0]) << INITIAL_TABLE_BITS), GFP_KERNEL);
+	if (!b->t)
+		return -BCH_ERR_ENOMEM_buckets_waiting_for_journal_init;
+
+	bucket_table_init(b->t, INITIAL_TABLE_BITS);
+	return 0;
+}
diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h
new file mode 100644
index 000000000000..d2ae19cbe18c
--- /dev/null
+++ b/fs/bcachefs/buckets_waiting_for_journal.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H
+#define _BUCKETS_WAITING_FOR_JOURNAL_H
+
+#include "buckets_waiting_for_journal_types.h"
+
+bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
+				      u64, unsigned, u64);
+int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
+					 u64, unsigned, u64, u64);
+
+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *);
+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *);
+
+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */
diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h
new file mode 100644
index 000000000000..e593db061d81
--- /dev/null
+++ b/fs/bcachefs/buckets_waiting_for_journal_types.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
+#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
+
+#include <linux/siphash.h>
+
+struct bucket_hashed {
+	u64			dev_bucket;	/* key: dev << 56 | bucket */
+	u64			journal_seq;	/* slot is reusable once <= flushed_seq */
+};
+
+struct buckets_waiting_for_journal_table {
+	unsigned		bits;		/* table has 1 << bits slots */
+	u64			hash_seeds[3];	/* one random seed per hash function */
+	struct bucket_hashed	d[];
+};
+
+struct buckets_waiting_for_journal {
+	struct mutex		lock;	/* held by lookups and inserts on @t */
+	struct buckets_waiting_for_journal_table *t;
+};
+
+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
new file mode 100644
index 000000000000..f69e15dc699c
--- /dev/null
+++ b/fs/bcachefs/chardev.c
@@ -0,0 +1,784 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_CHARDEV
+
+#include "bcachefs.h"
+#include "bcachefs_ioctl.h"
+#include "buckets.h"
+#include "chardev.h"
+#include "journal.h"
+#include "move.h"
+#include "replicas.h"
+#include "super.h"
+#include "super-io.h"
+
+#include <linux/anon_inodes.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/ioctl.h>
+#include <linux/kthread.h>
+#include <linux/major.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+/* returns with ref on ca->ref; @dev is an index or a user path pointer; callers IS_ERR() the result */
+static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
+					  unsigned flags)
+{
+	struct bch_dev *ca;
+
+	if (flags & BCH_BY_INDEX) {
+		if (dev >= c->sb.nr_devices)
+			return ERR_PTR(-EINVAL);
+
+		rcu_read_lock();
+		ca = rcu_dereference(c->devs[dev]);
+		if (ca)
+			percpu_ref_get(&ca->ref);
+		rcu_read_unlock();
+
+		if (!ca)	/* no device in that slot */
+			return ERR_PTR(-EINVAL);
+	} else {
+		char *path;
+
+		path = strndup_user((const char __user *)
+				    (unsigned long) dev, PATH_MAX);
+		if (IS_ERR(path))
+			return ERR_CAST(path);
+
+		ca = bch2_dev_lookup(c, path);	/* presumably takes the ref too - confirm */
+		kfree(path);
+	}
+
+	return ca;
+}
+
+#if 0
+static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
+{
+	struct bch_ioctl_assemble arg;
+	struct bch_fs *c;
+	u64 *user_devs = NULL;
+	char **devs = NULL;
+	unsigned i;
+	int ret = -EFAULT;
+
+	if (copy_from_user(&arg, user_arg, sizeof(arg)))
+		return -EFAULT;
+
+	if (arg.flags || arg.pad)
+		return -EINVAL;
+
+	user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL);
+	if (!user_devs)
+		return -ENOMEM;
+
+	devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
+
+	if (copy_from_user(user_devs, user_arg->devs,
+			   sizeof(u64) * arg.nr_devs))
+		goto err;
+
+	for (i = 0; i < arg.nr_devs; i++) {
+		devs[i] = strndup_user((const char __user *)(unsigned long)
+				       user_devs[i],
+				       PATH_MAX);
+		ret= PTR_ERR_OR_ZERO(devs[i]);
+		if (ret)
+			goto err;
+	}
+
+	c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty());
+	ret = PTR_ERR_OR_ZERO(c);
+	if (!ret)
+		closure_put(&c->cl);
+err:
+	if (devs)
+		for (i = 0; i < arg.nr_devs; i++)
+			kfree(devs[i]);
+	kfree(devs);
+	return ret;
+}
+
+static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg)
+{
+	struct bch_ioctl_incremental arg;
+	const char *err;
+	char *path;
+
+	if (copy_from_user(&arg, user_arg, sizeof(arg)))
+		return -EFAULT;
+
+	if (arg.flags || arg.pad)
+		return -EINVAL;
+
+	path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+	ret = PTR_ERR_OR_ZERO(path);
+	if (ret)
+		return ret;
+
+	err = bch2_fs_open_incremental(path);
+	kfree(path);
+
+	if (err) {
+		pr_err("Could not register bcachefs devices: %s", err);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+#endif
+
+static long bch2_global_ioctl(unsigned cmd, void __user *arg)
+{
+	switch (cmd) {
+#if 0
+	case BCH_IOCTL_ASSEMBLE:
+		return bch2_ioctl_assemble(arg);
+	case BCH_IOCTL_INCREMENTAL:
+		return bch2_ioctl_incremental(arg);
+#endif
+	default:	/* with the cases above compiled out, every command lands here */
+		return -ENOTTY;
+	}
+}
+
+/* BCH_IOCTL_QUERY_UUID: copy c->sb.user_uuid out to @user_arg->uuid. */
+static long bch2_ioctl_query_uuid(struct bch_fs *c,
+			struct bch_ioctl_query_uuid __user *user_arg)
+{
+	return copy_to_user(&user_arg->uuid, &c->sb.user_uuid,
+			    sizeof(c->sb.user_uuid))
+		? -EFAULT : 0;
+}
+
+#if 0
+static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (arg.flags || arg.pad)
+		return -EINVAL;
+
+	return bch2_fs_start(c);
+}
+
+static long bch2_ioctl_stop(struct bch_fs *c)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	bch2_fs_stop(c);
+	return 0;
+}
+#endif
+
+static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+	const char __user *upath = (const char __user *)(unsigned long) arg.dev;
+	char *path;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (arg.flags || arg.pad)
+		return -EINVAL;
+
+	/* arg.dev carries a userspace pointer to the device path */
+	path = strndup_user(upath, PATH_MAX);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+
+	ret = bch2_dev_add(c, path);
+	kfree(path);
+
+	return ret;
+}
+
+static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+	struct bch_dev *ca;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+			   BCH_FORCE_IF_METADATA_LOST|
+			   BCH_FORCE_IF_DEGRADED|
+			   BCH_BY_INDEX)) ||
+	    arg.pad)
+		return -EINVAL;
+
+	ca = bch2_device_lookup(c, arg.dev, arg.flags);
+	if (IS_ERR(ca))
+		return PTR_ERR(ca);
+	/* NOTE(review): no percpu_ref_put() here - presumably bch2_dev_remove() consumes the ref */
+	return bch2_dev_remove(c, ca, arg.flags);
+}
+
+static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+	const char __user *upath = (const char __user *)(unsigned long) arg.dev;
+	char *path;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (arg.flags || arg.pad)
+		return -EINVAL;
+
+	/* arg.dev carries a userspace pointer to the device path */
+	path = strndup_user(upath, PATH_MAX);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+
+	ret = bch2_dev_online(c, path);
+	kfree(path);
+	return ret;
+}
+
+static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+	struct bch_dev *ca;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+			   BCH_FORCE_IF_METADATA_LOST|
+			   BCH_FORCE_IF_DEGRADED|
+			   BCH_BY_INDEX)) ||
+	    arg.pad)
+		return -EINVAL;
+
+	ca = bch2_device_lookup(c, arg.dev, arg.flags);
+	if (IS_ERR(ca))
+		return PTR_ERR(ca);
+	/* the lookup returned with a ref on ca->ref; drop it once we are done with @ca */
+	ret = bch2_dev_offline(c, ca, arg.flags);
+	percpu_ref_put(&ca->ref);
+	return ret;
+}
+
+static long bch2_ioctl_disk_set_state(struct bch_fs *c,
+			struct bch_ioctl_disk_set_state arg)
+{
+	struct bch_dev *ca;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+			   BCH_FORCE_IF_METADATA_LOST|
+			   BCH_FORCE_IF_DEGRADED|
+			   BCH_BY_INDEX)) ||
+	    arg.pad[0] || arg.pad[1] || arg.pad[2] ||
+	    arg.new_state >= BCH_MEMBER_STATE_NR)	/* reject unknown flags, padding, states */
+		return -EINVAL;
+
+	ca = bch2_device_lookup(c, arg.dev, arg.flags);
+	if (IS_ERR(ca))
+		return PTR_ERR(ca);
+
+	ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags);
+	if (ret)
+		bch_err(c, "Error setting device state: %s", bch2_err_str(ret));
+
+	percpu_ref_put(&ca->ref);	/* ref from bch2_device_lookup() */
+	return ret;
+}
+
+struct bch_data_ctx {
+	struct bch_fs			*c;
+	struct bch_ioctl_data		arg;	/* copy of the ioctl argument */
+	struct bch_move_stats		stats;	/* progress, reported via read() */
+
+	int				ret;	/* result of bch2_data_job() */
+
+	struct task_struct		*thread;	/* kthread running bch2_data_thread() */
+};
+
+static int bch2_data_thread(void *arg)
+{
+	struct bch_data_ctx *ctx = arg;
+
+	ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
+	/* NOTE(review): presumably tells readers of stats that the job is finished - confirm */
+	ctx->stats.data_type = U8_MAX;
+	return 0;
+}
+
+static int bch2_data_job_release(struct inode *inode, struct file *file)
+{
+	struct bch_data_ctx *ctx = file->private_data;
+
+	kthread_stop(ctx->thread);
+	put_task_struct(ctx->thread);	/* ref taken in bch2_ioctl_data() */
+	kfree(ctx);
+	return 0;
+}
+
+static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
+				  size_t len, loff_t *ppos)
+{
+	struct bch_data_ctx *ctx = file->private_data;
+	struct bch_fs *c = ctx->c;
+	struct bch_ioctl_data_event e = {	/* snapshot of the job's current stats */
+		.type			= BCH_DATA_EVENT_PROGRESS,
+		.p.data_type		= ctx->stats.data_type,
+		.p.btree_id		= ctx->stats.btree_id,
+		.p.pos			= ctx->stats.pos,
+		.p.sectors_done		= atomic64_read(&ctx->stats.sectors_seen),
+		.p.sectors_total	= bch2_fs_usage_read_short(c).used,
+	};
+
+	if (len < sizeof(e))	/* only whole events are returned */
+		return -EINVAL;
+
+	if (copy_to_user(buf, &e, sizeof(e)))
+		return -EFAULT;
+
+	return sizeof(e);	/* @ppos is not used: every read reports one event */
+}
+
+static const struct file_operations bcachefs_data_ops = {
+	.release	= bch2_data_job_release,
+	.read		= bch2_data_job_read,
+	.llseek		= no_llseek,
+};
+
+static long bch2_ioctl_data(struct bch_fs *c,
+			    struct bch_ioctl_data arg)
+{
+	struct bch_data_ctx *ctx = NULL;
+	struct file *file = NULL;
+	unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK;
+	int ret, fd = -1;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (arg.op >= BCH_DATA_OP_NR || arg.flags)
+		return -EINVAL;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->c = c;
+	ctx->arg = arg;
+	ctx->thread = kthread_create(bch2_data_thread, ctx,
+				     "bch-data/%s", c->name);
+	if (IS_ERR(ctx->thread)) {
+		ret = PTR_ERR(ctx->thread);
+		goto err;
+	}
+
+	ret = get_unused_fd_flags(flags);
+	if (ret < 0)
+		goto err;
+	fd = ret;
+	file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		goto err;
+	}
+
+	/*
+	 * fd_install() publishes the file: userspace may close it at once and
+	 * ->release would free ctx, so take our ref and wake the thread first.
+	 */
+	get_task_struct(ctx->thread);
+	wake_up_process(ctx->thread);
+	fd_install(fd, file);
+	return fd;
+err:
+	if (fd >= 0)
+		put_unused_fd(fd);
+	if (!IS_ERR_OR_NULL(ctx->thread))
+		kthread_stop(ctx->thread);
+	kfree(ctx);
+	return ret;
+}
+
+static long bch2_ioctl_fs_usage(struct bch_fs *c,
+				struct bch_ioctl_fs_usage __user *user_arg)
+{
+	struct bch_ioctl_fs_usage *arg = NULL;
+	struct bch_replicas_usage *dst_e, *dst_end;
+	struct bch_fs_usage_online *src;
+	u32 replica_entries_bytes;
+	unsigned i;
+	int ret = 0;
+
+	if (!test_bit(BCH_FS_STARTED, &c->flags))
+		return -EINVAL;
+	/* userspace passes in how many bytes of replicas entries its buffer can hold */
+	if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes))
+		return -EFAULT;
+
+	arg = kzalloc(size_add(sizeof(*arg), replica_entries_bytes), GFP_KERNEL);
+	if (!arg)
+		return -ENOMEM;
+
+	src = bch2_fs_usage_read(c);
+	if (!src) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	arg->capacity		= c->capacity;
+	arg->used		= bch2_fs_sectors_used(c, src);
+	arg->online_reserved	= src->online_reserved;
+
+	for (i = 0; i < BCH_REPLICAS_MAX; i++)
+		arg->persistent_reserved[i] = src->u.persistent_reserved[i];
+
+	dst_e	= arg->replicas;
+	dst_end = (void *) arg->replicas + replica_entries_bytes;
+
+	for (i = 0; i < c->replicas.nr; i++) {
+		struct bch_replicas_entry *src_e =
+			cpu_replicas_entry(&c->replicas, i);
+
+		/* check that we have enough space for one replicas entry */
+		if (dst_e + 1 > dst_end) {
+			ret = -ERANGE;
+			break;
+		}
+
+		dst_e->sectors		= src->u.replicas[i];
+		dst_e->r		= *src_e;
+
+		/* recheck after setting nr_devs: */
+		if (replicas_usage_next(dst_e) > dst_end) {
+			ret = -ERANGE;
+			break;
+		}
+
+		memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs);
+
+		dst_e = replicas_usage_next(dst_e);
+	}
+	/* report how many bytes of entries were actually filled in: */
+	arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas;
+	/* NOTE(review): presumably pairs with a read lock taken in bch2_fs_usage_read() - confirm */
+	percpu_up_read(&c->mark_lock);
+	kfree(src);
+
+	if (ret)
+		goto err;
+	if (copy_to_user(user_arg, arg,
+			 sizeof(*arg) + arg->replica_entries_bytes))
+		ret = -EFAULT;
+err:
+	kfree(arg);
+	return ret;
+}
+
+static long bch2_ioctl_dev_usage(struct bch_fs *c,
+				 struct bch_ioctl_dev_usage __user *user_arg)
+{
+	struct bch_ioctl_dev_usage arg;
+	struct bch_dev_usage src;
+	struct bch_dev *ca;
+	unsigned i;
+
+	if (!test_bit(BCH_FS_STARTED, &c->flags))
+		return -EINVAL;
+
+	if (copy_from_user(&arg, user_arg, sizeof(arg)))
+		return -EFAULT;
+
+	if ((arg.flags & ~BCH_BY_INDEX) ||
+	    arg.pad[0] ||
+	    arg.pad[1] ||
+	    arg.pad[2])
+		return -EINVAL;
+
+	ca = bch2_device_lookup(c, arg.dev, arg.flags);
+	if (IS_ERR(ca))
+		return PTR_ERR(ca);
+
+	src = bch2_dev_usage_read(ca);
+	/* fill in the output fields of the same struct that carried the request: */
+	arg.state		= ca->mi.state;
+	arg.bucket_size		= ca->mi.bucket_size;
+	arg.nr_buckets		= ca->mi.nbuckets - ca->mi.first_bucket;
+	arg.buckets_ec		= src.buckets_ec;
+
+	for (i = 0; i < BCH_DATA_NR; i++) {
+		arg.d[i].buckets	= src.d[i].buckets;
+		arg.d[i].sectors	= src.d[i].sectors;
+		arg.d[i].fragmented	= src.d[i].fragmented;
+	}
+
+	percpu_ref_put(&ca->ref);	/* ref from bch2_device_lookup() */
+
+	if (copy_to_user(user_arg, &arg, sizeof(arg)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static long bch2_ioctl_read_super(struct bch_fs *c,
+				  struct bch_ioctl_read_super arg)
+{
+	struct bch_dev *ca = NULL;
+	struct bch_sb *sb;
+	int ret = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) ||
+	    arg.pad)
+		return -EINVAL;
+
+	mutex_lock(&c->sb_lock);
+
+	if (arg.flags & BCH_READ_DEV) {	/* one member device's superblock */
+		ca = bch2_device_lookup(c, arg.dev, arg.flags);
+
+		if (IS_ERR(ca)) {
+			ret = PTR_ERR(ca);
+			goto err;
+		}
+
+		sb = ca->disk_sb.sb;
+	} else {			/* the filesystem's own copy */
+		sb = c->disk_sb.sb;
+	}
+
+	if (vstruct_bytes(sb) > arg.size) {	/* user buffer too small */
+		ret = -ERANGE;
+		goto err;
+	}
+
+	if (copy_to_user((void __user *)(unsigned long)arg.sb, sb,
+			 vstruct_bytes(sb)))
+		ret = -EFAULT;
+err:
+	if (!IS_ERR_OR_NULL(ca))	/* only if a device was actually looked up */
+		percpu_ref_put(&ca->ref);
+	mutex_unlock(&c->sb_lock);
+	return ret;
+}
+
+static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
+				    struct bch_ioctl_disk_get_idx arg)
+{
+	dev_t dev = huge_decode_dev(arg.dev);
+	struct bch_dev *ca;
+	unsigned i;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!dev)
+		return -EINVAL;
+
+	for_each_online_member(ca, c, i)
+		if (ca->dev == dev) {
+			percpu_ref_put(&ca->io_ref);	/* presumably the iterator's ref - confirm */
+			return i;	/* member index of the matching device */
+		}
+
+	return -BCH_ERR_ENOENT_dev_idx_not_found;
+}
+
+static long bch2_ioctl_disk_resize(struct bch_fs *c,
+				   struct bch_ioctl_disk_resize arg)
+{
+	struct bch_dev *ca;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if ((arg.flags & ~BCH_BY_INDEX) ||
+	    arg.pad)
+		return -EINVAL;
+
+	ca = bch2_device_lookup(c, arg.dev, arg.flags);
+	if (IS_ERR(ca))
+		return PTR_ERR(ca);
+
+	ret = bch2_dev_resize(c, ca, arg.nbuckets);
+	/* drop the ref that bch2_device_lookup() returned with: */
+	percpu_ref_put(&ca->ref);
+	return ret;
+}
+
+static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
+				   struct bch_ioctl_disk_resize_journal arg)
+{
+	struct bch_dev *ca;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if ((arg.flags & ~BCH_BY_INDEX) ||
+	    arg.pad)
+		return -EINVAL;
+
+	if (arg.nbuckets > U32_MAX)	/* checked before the lookup, so no ref to drop */
+		return -EINVAL;
+
+	ca = bch2_device_lookup(c, arg.dev, arg.flags);
+	if (IS_ERR(ca))
+		return PTR_ERR(ca);
+
+	ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets);
+	/* drop the ref that bch2_device_lookup() returned with: */
+	percpu_ref_put(&ca->ref);
+	return ret;
+}
+
+#define BCH_IOCTL(_name, _argtype)					\
+do {									\
+	_argtype i;							\
+									\
+	if (copy_from_user(&i, arg, sizeof(i)))				\
+		return -EFAULT;						\
+	ret = bch2_ioctl_##_name(c, i);					\
+	goto out;							\
+} while (0)
+
+long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
+{
+	long ret;
+
+	switch (cmd) {	/* commands dispatched without the BCH_FS_STARTED check below */
+	case BCH_IOCTL_QUERY_UUID:
+		return bch2_ioctl_query_uuid(c, arg);	/* direct return: skips bch2_err_class() */
+	case BCH_IOCTL_FS_USAGE:
+		return bch2_ioctl_fs_usage(c, arg);
+	case BCH_IOCTL_DEV_USAGE:
+		return bch2_ioctl_dev_usage(c, arg);
+#if 0
+	case BCH_IOCTL_START:
+		BCH_IOCTL(start, struct bch_ioctl_start);
+	case BCH_IOCTL_STOP:
+		return bch2_ioctl_stop(c);
+#endif
+	case BCH_IOCTL_READ_SUPER:	/* BCH_IOCTL() copies the arg in, calls the handler, jumps to out */
+		BCH_IOCTL(read_super, struct bch_ioctl_read_super);
+	case BCH_IOCTL_DISK_GET_IDX:
+		BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
+	}
+	/* Everything below requires a started filesystem: */
+	if (!test_bit(BCH_FS_STARTED, &c->flags))
+		return -EINVAL;
+
+	switch (cmd) {
+	case BCH_IOCTL_DISK_ADD:
+		BCH_IOCTL(disk_add, struct bch_ioctl_disk);
+	case BCH_IOCTL_DISK_REMOVE:
+		BCH_IOCTL(disk_remove, struct bch_ioctl_disk);
+	case BCH_IOCTL_DISK_ONLINE:
+		BCH_IOCTL(disk_online, struct bch_ioctl_disk);
+	case BCH_IOCTL_DISK_OFFLINE:
+		BCH_IOCTL(disk_offline, struct bch_ioctl_disk);
+	case BCH_IOCTL_DISK_SET_STATE:
+		BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state);
+	case BCH_IOCTL_DATA:
+		BCH_IOCTL(data, struct bch_ioctl_data);
+	case BCH_IOCTL_DISK_RESIZE:
+		BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
+	case BCH_IOCTL_DISK_RESIZE_JOURNAL:
+		BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal);
+
+	default:
+		return -ENOTTY;
+	}
+out:
+	if (ret < 0)	/* negative results are passed through bch2_err_class() first */
+		ret = bch2_err_class(ret);
+	return ret;
+}
+
+static DEFINE_IDR(bch_chardev_minor);
+
+static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v)
+{
+	unsigned minor = iminor(file_inode(filp));
+	struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL;
+	void __user *arg = (void __user *) v;
+	/* No filesystem for this minor (e.g. the global U8_MAX node): global ioctls */
+	return c
+		? bch2_fs_ioctl(c, cmd, arg)
+		: bch2_global_ioctl(cmd, arg);
+}
+
+static const struct file_operations bch_chardev_fops = {
+	.owner		= THIS_MODULE,
+	.unlocked_ioctl = bch2_chardev_ioctl,
+	.open		= nonseekable_open,
+};
+
+static int bch_chardev_major;
+static struct class *bch_chardev_class;
+static struct device *bch_chardev;
+
+void bch2_fs_chardev_exit(struct bch_fs *c)
+{
+	if (!IS_ERR_OR_NULL(c->chardev))	/* device_create() may have failed or never run */
+		device_unregister(c->chardev);
+	if (c->minor >= 0)	/* NOTE(review): assumes minor is negative until allocated - confirm */
+		idr_remove(&bch_chardev_minor, c->minor);
+}
+
+int bch2_fs_chardev_init(struct bch_fs *c)
+{
+	/* Minors [0, U8_MAX): U8_MAX itself is the global control node */
+	c->minor = idr_alloc(&bch_chardev_minor, c, 0, U8_MAX, GFP_KERNEL);
+	if (c->minor < 0)
+		return c->minor;
+
+	c->chardev = device_create(bch_chardev_class, NULL,
+				   MKDEV(bch_chardev_major, c->minor), c,
+				   "bcachefs%u-ctl", c->minor);
+	if (IS_ERR(c->chardev))		/* minor is released by bch2_fs_chardev_exit() */
+		return PTR_ERR(c->chardev);
+	return 0;
+}
+
+void bch2_chardev_exit(void)
+{
+	if (!IS_ERR_OR_NULL(bch_chardev_class))	/* tolerate a partially failed init */
+		device_destroy(bch_chardev_class,
+			       MKDEV(bch_chardev_major, U8_MAX));
+	if (!IS_ERR_OR_NULL(bch_chardev_class))
+		class_destroy(bch_chardev_class);
+	if (bch_chardev_major > 0)	/* same name as passed to register_chrdev() */
+		unregister_chrdev(bch_chardev_major, "bcachefs-ctl");
+}
+
+int __init bch2_chardev_init(void)
+{
+	bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops);
+	if (bch_chardev_major < 0)
+		return bch_chardev_major;
+	/* NOTE(review): no unwinding below - presumably the caller runs bch2_chardev_exit() */
+	bch_chardev_class = class_create("bcachefs");
+	if (IS_ERR(bch_chardev_class))
+		return PTR_ERR(bch_chardev_class);
+	/* global control node, on the minor that bch2_chardev_ioctl() never looks up: */
+	bch_chardev = device_create(bch_chardev_class, NULL,
+				    MKDEV(bch_chardev_major, U8_MAX),
+				    NULL, "bcachefs-ctl");
+	if (IS_ERR(bch_chardev))
+		return PTR_ERR(bch_chardev);
+
+	return 0;
+}
+
+#endif /* NO_BCACHEFS_CHARDEV */
diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h
new file mode 100644
index 000000000000..0f563ca53c36
--- /dev/null
+++ b/fs/bcachefs/chardev.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_CHARDEV_H
+#define _BCACHEFS_CHARDEV_H
+
+#ifndef NO_BCACHEFS_FS
+
+long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *);
+
+void bch2_fs_chardev_exit(struct bch_fs *);
+int bch2_fs_chardev_init(struct bch_fs *);
+
+void bch2_chardev_exit(void);
+int __init bch2_chardev_init(void);
+
+#else
+
+static inline long bch2_fs_ioctl(struct bch_fs *c,
+				unsigned cmd, void __user * arg)
+{
+	return -ENOTTY;	/* NO_BCACHEFS_FS build: no ioctl is handled */
+}
+
+static inline void bch2_fs_chardev_exit(struct bch_fs *c) {}	/* no-op stub */
+static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; }	/* stub: always succeeds */
+
+static inline void bch2_chardev_exit(void) {}	/* no-op stub */
+static inline int __init bch2_chardev_init(void) { return 0; }	/* stub: always succeeds */
+
+#endif /* NO_BCACHEFS_FS */
+
+#endif /* _BCACHEFS_CHARDEV_H */
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
new file mode 100644
index 000000000000..3c761ad6b1c8
--- /dev/null
+++ b/fs/bcachefs/checksum.c
@@ -0,0 +1,804 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "checksum.h"
+#include "errcode.h"
+#include "super.h"
+#include "super-io.h"
+
+#include <linux/crc32c.h>
+#include <linux/crypto.h>
+#include <linux/xxhash.h>
+#include <linux/key.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <crypto/algapi.h>
+#include <crypto/chacha.h>
+#include <crypto/hash.h>
+#include <crypto/poly1305.h>
+#include <crypto/skcipher.h>
+#include <keys/user-type.h>
+
+/*
+ * bch2_checksum_state is an abstraction of the checksum state calculated over different pages.
+ * It allows pages to be merged without the checksum algorithm losing its state.
+ * For native checksum algorithms (like crc), a default seed value will do.
+ * For hash-like algorithms, a state needs to be stored.
+ */
+
+struct bch2_checksum_state {
+	union {
+		u64 seed;			/* running value for the crc types */
+		struct xxh64_state h64state;	/* streaming state for BCH_CSUM_xxhash */
+	};
+	unsigned int type;			/* BCH_CSUM_* selector; callers set it before init */
+};
+
+static void bch2_checksum_init(struct bch2_checksum_state *state)
+{
+	switch (state->type) {
+	case BCH_CSUM_xxhash:
+		xxh64_reset(&state->h64state, 0);
+		return;
+	case BCH_CSUM_crc64_nonzero:
+		state->seed = U64_MAX;
+		return;
+	case BCH_CSUM_crc32c_nonzero:
+		state->seed = U32_MAX;
+		return;
+	case BCH_CSUM_crc64:
+	case BCH_CSUM_crc32c:
+	case BCH_CSUM_none:
+		state->seed = 0;
+		return;
+	default:
+		BUG();	/* type has no streaming state handled here */
+	}
+}
+
+static u64 bch2_checksum_final(const struct bch2_checksum_state *state)
+{
+	switch (state->type) {
+	case BCH_CSUM_xxhash:
+		return xxh64_digest(&state->h64state);
+	case BCH_CSUM_crc64_nonzero:
+		return state->seed ^ U64_MAX;
+	case BCH_CSUM_crc32c_nonzero:
+		return state->seed ^ U32_MAX;
+	case BCH_CSUM_crc64:
+	case BCH_CSUM_crc32c:
+	case BCH_CSUM_none:
+		return state->seed;	/* zero-seeded types need no final xor */
+	default:
+		BUG();
+	}
+}
+
+static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len)
+{
+	switch (state->type) {
+	case BCH_CSUM_xxhash:
+		xxh64_update(&state->h64state, data, len);
+		return;
+	case BCH_CSUM_crc64:
+	case BCH_CSUM_crc64_nonzero:
+		state->seed = crc64_be(state->seed, data, len);
+		return;
+	case BCH_CSUM_crc32c:
+	case BCH_CSUM_crc32c_nonzero:
+		state->seed = crc32c(state->seed, data, len);
+		return;
+	case BCH_CSUM_none:
+		return;		/* nothing to accumulate */
+	default:
+		BUG();
+	}
+}
+
+static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm,
+				struct nonce nonce,
+				struct scatterlist *sg, size_t len)
+{
+	SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
+	int ret;
+
+	skcipher_request_set_sync_tfm(req, tfm);
+	skcipher_request_set_crypt(req, sg, sg, len, nonce.d);	/* src == dst: in place, nonce as IV */
+
+	ret = crypto_skcipher_encrypt(req);
+	if (ret)
+		pr_err("got error %i from crypto_skcipher_encrypt()", ret);
+
+	return ret;	/* 0 on success, otherwise the crypto API error */
+}
+
+/*
+ * Encrypt @len bytes at @buf in place, building the scatterlist the skcipher API needs.
+ */
+static inline int do_encrypt(struct crypto_sync_skcipher *tfm,
+			      struct nonce nonce,
+			      void *buf, size_t len)
+{
+	struct scatterlist *sgl, single;
+	unsigned nr_pages, idx;
+	size_t remaining = len;
+	int ret;
+
+	/* Not a vmalloc address: one scatterlist entry covers the whole buffer. */
+	if (!is_vmalloc_addr(buf)) {
+		sg_init_table(&single, 1);
+		sg_set_page(&single, virt_to_page(buf), len, offset_in_page(buf));
+		return do_encrypt_sg(tfm, nonce, &single, len);
+	}
+
+	/* vmalloc address: look each page up separately, one entry per page. */
+	nr_pages = buf_pages(buf, len);
+	sgl = kmalloc_array(nr_pages, sizeof(*sgl), GFP_KERNEL);
+	if (!sgl)
+		return -BCH_ERR_ENOMEM_do_encrypt;
+
+	sg_init_table(sgl, nr_pages);
+
+	for (idx = 0; idx < nr_pages; idx++) {
+		unsigned offset = offset_in_page(buf);
+		unsigned pg_len = min_t(size_t, remaining, PAGE_SIZE - offset);
+
+		sg_set_page(sgl + idx, vmalloc_to_page(buf), pg_len, offset);
+		buf += pg_len;
+		remaining -= pg_len;
+	}
+
+	ret = do_encrypt_sg(tfm, nonce, sgl, len);
+	kfree(sgl);
+	return ret;
+}
+
+/*
+ * Encrypt @len bytes at @buf in place with a throwaway chacha20 transform keyed with @key.
+ */
+int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
+			    void *buf, size_t len)
+{
+	struct crypto_sync_skcipher *tfm =
+		crypto_alloc_sync_skcipher("chacha20", 0, 0);
+	int ret = PTR_ERR_OR_ZERO(tfm);
+
+	if (ret) {
+		pr_err("error requesting chacha20 cipher: %s", bch2_err_str(ret));
+		return ret;
+	}
+
+	ret = crypto_skcipher_setkey(&tfm->base,
+				     (void *) key, sizeof(*key));
+	if (ret)
+		pr_err("error from crypto_skcipher_setkey(): %s", bch2_err_str(ret));
+	else
+		ret = do_encrypt(tfm, nonce, buf, len);
+
+	crypto_free_sync_skcipher(tfm);
+	return ret;
+}
+
+static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
+			struct nonce nonce)
+{
+	u8 key[POLY1305_KEY_SIZE] = { 0 };
+	int ret;
+
+	nonce.d[3] ^= BCH_NONCE_POLY;
+
+	/* The one-time poly1305 key is the chacha20 keystream for this nonce. */
+	ret = do_encrypt(c->chacha20, nonce, key, sizeof(key));
+	if (!ret) {
+		desc->tfm = c->poly1305;
+		ret = crypto_shash_init(desc) ?:
+		      crypto_shash_update(desc, key, sizeof(key));
+	}
+	memzero_explicit(key, sizeof(key));	/* don't leave MAC key material on the stack */
+	return ret;
+}
+
+struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
+			      struct nonce nonce, const void *data, size_t len)
+{
+	switch (type) {
+	case BCH_CSUM_none:
+	case BCH_CSUM_crc32c_nonzero:
+	case BCH_CSUM_crc64_nonzero:
+	case BCH_CSUM_crc32c:
+	case BCH_CSUM_xxhash:
+	case BCH_CSUM_crc64: {	/* unkeyed types: one init/update/final pass; nonce unused */
+		struct bch2_checksum_state state;
+
+		state.type = type;
+
+		bch2_checksum_init(&state);
+		bch2_checksum_update(&state, data, len);
+
+		return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
+	}
+
+	case BCH_CSUM_chacha20_poly1305_80:
+	case BCH_CSUM_chacha20_poly1305_128: {	/* keyed MAC: poly1305 keyed from the nonce */
+		SHASH_DESC_ON_STACK(desc, c->poly1305);
+		u8 digest[POLY1305_DIGEST_SIZE];
+		struct bch_csum ret = { 0 };
+
+		gen_poly_key(c, desc, nonce);	/* NOTE(review): return value is not checked */
+
+		crypto_shash_update(desc, data, len);
+		crypto_shash_final(desc, digest);
+
+		memcpy(&ret, digest, bch_crc_bytes[type]);	/* copy bch_crc_bytes[type] digest bytes; the rest of ret stays 0 */
+		return ret;
+	}
+	default:
+		BUG();
+	}
+}
+
+int bch2_encrypt(struct bch_fs *c, unsigned type,
+		  struct nonce nonce, void *data, size_t len)
+{
+	if (!bch2_csum_type_is_encryption(type))
+		return 0;	/* non-encrypting type: data is left untouched */
+
+	return do_encrypt(c->chacha20, nonce, data, len);	/* in place, with the filesystem cipher */
+}
+
+static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
+					   struct nonce nonce, struct bio *bio,
+					   struct bvec_iter *iter)
+{
+	struct bio_vec bv;
+
+	switch (type) {
+	case BCH_CSUM_none:
+		return (struct bch_csum) { 0 };	/* NOTE: *iter is not advanced on this path */
+	case BCH_CSUM_crc32c_nonzero:
+	case BCH_CSUM_crc64_nonzero:
+	case BCH_CSUM_crc32c:
+	case BCH_CSUM_xxhash:
+	case BCH_CSUM_crc64: {
+		struct bch2_checksum_state state;
+
+		state.type = type;
+		bch2_checksum_init(&state);
+
+#ifdef CONFIG_HIGHMEM
+		__bio_for_each_segment(bv, bio, *iter, *iter) {	/* walks the caller's iterator in place */
+			void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;	/* map each page only while it is checksummed */
+
+			bch2_checksum_update(&state, p, bv.bv_len);
+			kunmap_local(p);
+		}
+#else
+		__bio_for_each_bvec(bv, bio, *iter, *iter)	/* walks the caller's iterator in place */
+			bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset,
+				bv.bv_len);
+#endif
+		return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
+	}
+
+	case BCH_CSUM_chacha20_poly1305_80:
+	case BCH_CSUM_chacha20_poly1305_128: {
+		SHASH_DESC_ON_STACK(desc, c->poly1305);
+		u8 digest[POLY1305_DIGEST_SIZE];
+		struct bch_csum ret = { 0 };
+
+		gen_poly_key(c, desc, nonce);	/* NOTE(review): return value is not checked */
+
+#ifdef CONFIG_HIGHMEM
+		__bio_for_each_segment(bv, bio, *iter, *iter) {
+			void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
+
+			crypto_shash_update(desc, p, bv.bv_len);
+			kunmap_local(p);
+		}
+#else
+		__bio_for_each_bvec(bv, bio, *iter, *iter)
+			crypto_shash_update(desc,
+				page_address(bv.bv_page) + bv.bv_offset,
+				bv.bv_len);
+#endif
+		crypto_shash_final(desc, digest);
+
+		memcpy(&ret, digest, bch_crc_bytes[type]);	/* copy bch_crc_bytes[type] digest bytes; the rest of ret stays 0 */
+		return ret;
+	}
+	default:
+		BUG();
+	}
+}
+
+struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
+				  struct nonce nonce, struct bio *bio)
+{
+	struct bvec_iter iter = bio->bi_iter;	/* local copy: the bio's own iterator is not advanced */
+
+	return __bch2_checksum_bio(c, type, nonce, bio, &iter);
+}
+
+int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
+		     struct nonce nonce, struct bio *bio)
+{
+	struct bio_vec bv;
+	struct bvec_iter iter;
+	struct scatterlist sgl[16], *sg = sgl;	/* encrypt in batches of up to 16 segments */
+	size_t bytes = 0;			/* bytes queued in the current batch */
+	int ret = 0;
+
+	if (!bch2_csum_type_is_encryption(type))
+		return 0;
+
+	sg_init_table(sgl, ARRAY_SIZE(sgl));
+
+	bio_for_each_segment(bv, bio, iter) {
+		if (sg == sgl + ARRAY_SIZE(sgl)) {	/* batch full: flush it before queueing more */
+			sg_mark_end(sg - 1);
+
+			ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+			if (ret)
+				return ret;
+
+			nonce = nonce_add(nonce, bytes);	/* next batch continues where this one ended */
+			bytes = 0;
+
+			sg_init_table(sgl, ARRAY_SIZE(sgl));
+			sg = sgl;
+		}
+
+		sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset);
+		bytes += bv.bv_len;
+	}
+
+	sg_mark_end(sg - 1);	/* NOTE(review): assumes the bio had at least one segment */
+	return do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+}
+
+struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
+				    struct bch_csum b, size_t b_len)
+{
+	struct bch2_checksum_state state;
+
+	state.type = type;
+	bch2_checksum_init(&state);
+	state.seed = le64_to_cpu(a.lo);
+
+	BUG_ON(!bch2_checksum_mergeable(type));
+	/* Extend @a over @b_len zero bytes, then xor in @b. */
+	while (b_len) {
+		/* min over size_t: truncating b_len to unsigned could give a zero step */
+		unsigned page_len = min_t(size_t, b_len, PAGE_SIZE);
+
+		bch2_checksum_update(&state, page_address(ZERO_PAGE(0)), page_len);
+		b_len -= page_len;
+	}
+	a.lo = cpu_to_le64(bch2_checksum_final(&state));
+	a.lo ^= b.lo;
+	a.hi ^= b.hi;
+	return a;
+}
+
+int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
+			struct bversion version,
+			struct bch_extent_crc_unpacked crc_old,
+			struct bch_extent_crc_unpacked *crc_a,
+			struct bch_extent_crc_unpacked *crc_b,
+			unsigned len_a, unsigned len_b,
+			unsigned new_csum_type)
+{
+	struct bvec_iter iter = bio->bi_iter;
+	struct nonce nonce = extent_nonce(version, crc_old);
+	struct bch_csum merged = { 0 };
+	struct crc_split {
+		struct bch_extent_crc_unpacked	*crc;
+		unsigned			len;
+		unsigned			csum_type;
+		struct bch_csum			csum;
+	} splits[3] = {	/* the bio as three consecutive pieces: a, b, remainder */
+		{ crc_a, len_a, new_csum_type, { 0 }},
+		{ crc_b, len_b, new_csum_type, { 0 } },
+		{ NULL,	 bio_sectors(bio) - len_a - len_b, new_csum_type, { 0 } },
+	}, *i;
+	bool mergeable = crc_old.csum_type == new_csum_type &&
+		bch2_checksum_mergeable(new_csum_type);
+	unsigned crc_nonce = crc_old.nonce;
+
+	BUG_ON(len_a + len_b > bio_sectors(bio));
+	BUG_ON(crc_old.uncompressed_size != bio_sectors(bio));
+	BUG_ON(crc_is_compressed(crc_old));
+	BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) !=
+	       bch2_csum_type_is_encryption(new_csum_type));
+
+	for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
+		iter.bi_size = i->len << 9;	/* limit the iterator to this piece (len is in 512-byte sectors) */
+		if (mergeable || i->crc)
+			i->csum = __bch2_checksum_bio(c, i->csum_type,
+						      nonce, bio, &iter);
+		else
+			bio_advance_iter(bio, &iter, i->len << 9);	/* checksum not needed: just step over the piece */
+		nonce = nonce_add(nonce, i->len << 9);
+	}
+
+	if (mergeable)	/* rebuild the whole-bio checksum from the per-piece ones */
+		for (i = splits; i < splits + ARRAY_SIZE(splits); i++)
+			merged = bch2_checksum_merge(new_csum_type, merged,
+						     i->csum, i->len << 9);
+	else		/* otherwise recompute it over the whole bio with the old type */
+		merged = bch2_checksum_bio(c, crc_old.csum_type,
+				extent_nonce(version, crc_old), bio);
+
+	if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) {	/* must match the checksum we were given */
+		bch_err(c, "checksum error in %s() (memory corruption or bug?)\n"
+			"expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)",
+			__func__,
+			crc_old.csum.hi,
+			crc_old.csum.lo,
+			merged.hi,
+			merged.lo,
+			bch2_csum_types[crc_old.csum_type],
+			bch2_csum_types[new_csum_type]);
+		return -EIO;
+	}
+
+	for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
+		if (i->crc)	/* only pieces the caller asked for are written back */
+			*i->crc = (struct bch_extent_crc_unpacked) {
+				.csum_type		= i->csum_type,
+				.compression_type	= crc_old.compression_type,
+				.compressed_size	= i->len,
+				.uncompressed_size	= i->len,
+				.offset			= 0,
+				.live_size		= i->len,
+				.nonce			= crc_nonce,
+				.csum			= i->csum,
+			};
+
+		if (bch2_csum_type_is_encryption(new_csum_type))
+			crc_nonce += i->len;	/* each piece's nonce is offset by the sectors before it */
+	}
+
+	return 0;
+}
+
+/* BCH_SB_FIELD_crypt: */
+
+static int bch2_sb_crypt_validate(struct bch_sb *sb,
+				  struct bch_sb_field *f,
+				  struct printbuf *err)
+{
+	struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
+
+	if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {	/* field too small to hold a crypt struct */
+		prt_printf(err, "wrong size (got %zu should be %zu)",
+		       vstruct_bytes(&crypt->field), sizeof(*crypt));
+		return -BCH_ERR_invalid_sb_crypt;
+	}
+
+	if (BCH_CRYPT_KDF_TYPE(crypt)) {	/* only KDF type 0 is accepted */
+		prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
+		return -BCH_ERR_invalid_sb_crypt;
+	}
+
+	return 0;
+}
+
+static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
+				  struct bch_sb_field *f)
+{
+	struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
+
+	prt_printf(out, "KDF:               %llu", BCH_CRYPT_KDF_TYPE(crypt));	/* label fixed: was "KFD" */
+	prt_newline(out);
+	prt_printf(out, "scrypt n:          %llu", BCH_KDF_SCRYPT_N(crypt));
+	prt_newline(out);
+	prt_printf(out, "scrypt r:          %llu", BCH_KDF_SCRYPT_R(crypt));
+	prt_newline(out);
+	prt_printf(out, "scrypt p:          %llu", BCH_KDF_SCRYPT_P(crypt));
+	prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
+	.validate	= bch2_sb_crypt_validate,	/* size and KDF-type checks */
+	.to_text	= bch2_sb_crypt_to_text,	/* prints the KDF parameters */
+};
+
+#ifdef __KERNEL__
+static int __bch2_request_key(char *key_description, struct bch_key *key)
+{
+	struct key *keyring_key;
+	const struct user_key_payload *ukp;
+	int ret = -EINVAL;
+
+	keyring_key = request_key(&key_type_user, key_description, NULL);
+	if (IS_ERR(keyring_key))
+		return PTR_ERR(keyring_key);
+
+	down_read(&keyring_key->sem);
+	ukp = dereference_key_locked(keyring_key);
+	if (!ukp) {
+		/* Key was revoked before we took its semaphore. */
+		ret = -EKEYREVOKED;
+	} else if (ukp->datalen == sizeof(*key)) {
+		memcpy(key, ukp->data, ukp->datalen);
+		ret = 0;
+	}
+	up_read(&keyring_key->sem);
+	key_put(keyring_key);
+	return ret;
+}
+#else
+#include <keyutils.h>
+
+static int __bch2_request_key(char *key_description, struct bch_key *key)
+{
+	key_serial_t key_id;
+
+	key_id = request_key("user", key_description, NULL,
+			     KEY_SPEC_SESSION_KEYRING);	/* first choice: session keyring */
+	if (key_id >= 0)
+		goto got_key;
+
+	key_id = request_key("user", key_description, NULL,
+			     KEY_SPEC_USER_KEYRING);	/* then the user keyring */
+	if (key_id >= 0)
+		goto got_key;
+
+	key_id = request_key("user", key_description, NULL,
+			     KEY_SPEC_USER_SESSION_KEYRING);	/* finally the user-session keyring */
+	if (key_id >= 0)
+		goto got_key;
+
+	return -errno;	/* errno of the last failed lookup */
+got_key:
+
+	if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key))
+		return -1;	/* payload is not exactly one struct bch_key (or the read failed) */
+
+	return 0;
+}
+
+#include "../crypto.h"
+#endif
+
+int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
+{
+	struct printbuf key_description = PRINTBUF;
+	int ret;
+
+	prt_printf(&key_description, "bcachefs:");	/* description: "bcachefs:" + pr_uuid() of sb->user_uuid */
+	pr_uuid(&key_description, sb->user_uuid.b);
+
+	ret = __bch2_request_key(key_description.buf, key);
+	printbuf_exit(&key_description);
+
+#ifndef __KERNEL__
+	if (ret) {	/* userspace only: fall back to prompting for a passphrase */
+		char *passphrase = read_passphrase("Enter passphrase: ");
+		struct bch_encrypted_key sb_key;
+
+		bch2_passphrase_check(sb, passphrase,
+				      key, &sb_key);
+		ret = 0;	/* NOTE(review): any result of bch2_passphrase_check() is not examined here */
+	}
+#endif
+
+	/* stash with memfd, pass memfd fd to mount */
+
+	return ret;
+}
+
+#ifndef __KERNEL__
+int bch2_revoke_key(struct bch_sb *sb)
+{
+	key_serial_t key_id;
+	struct printbuf key_description = PRINTBUF;
+
+	prt_printf(&key_description, "bcachefs:");	/* same description format as bch2_request_key() */
+	pr_uuid(&key_description, sb->user_uuid.b);
+
+	key_id = request_key("user", key_description.buf, NULL, KEY_SPEC_USER_KEYRING);
+	printbuf_exit(&key_description);
+	if (key_id < 0)
+		return errno;	/* NOTE(review): positive errno, unlike the -errno returned by __bch2_request_key() */
+
+	keyctl_revoke(key_id);	/* NOTE(review): result of the revoke is not checked */
+
+	return 0;
+}
+#endif
+
+int bch2_decrypt_sb_key(struct bch_fs *c,
+			struct bch_sb_field_crypt *crypt,
+			struct bch_key *key)
+{
+	struct bch_encrypted_key sb_key = crypt->key;	/* work on a copy; the superblock field is not modified */
+	struct bch_key user_key;
+	int ret = 0;
+
+	/* is key encrypted? */
+	if (!bch2_key_is_encrypted(&sb_key))
+		goto out;	/* stored in the clear: hand it back as is */
+
+	ret = bch2_request_key(c->disk_sb.sb, &user_key);
+	if (ret) {
+		bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
+		goto err;
+	}
+
+	/* decrypt real key: */
+	ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
+				      &sb_key, sizeof(sb_key));
+	if (ret)
+		goto err;
+
+	if (bch2_key_is_encrypted(&sb_key)) {	/* magic still wrong after decrypting: wrong user key */
+		bch_err(c, "incorrect encryption key");
+		ret = -EINVAL;
+		goto err;
+	}
+out:
+	*key = sb_key.key;
+err:
+	memzero_explicit(&sb_key, sizeof(sb_key));	/* wipe key material on every exit path */
+	memzero_explicit(&user_key, sizeof(user_key));
+	return ret;
+}
+
+static int bch2_alloc_ciphers(struct bch_fs *c)
+{
+	int ret;
+
+	if (!c->chacha20)	/* allocate only once; later calls reuse it */
+		c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
+	ret = PTR_ERR_OR_ZERO(c->chacha20);	/* on failure the ERR_PTR value stays in c->chacha20 */
+
+	if (ret) {
+		bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret));
+		return ret;
+	}
+
+	if (!c->poly1305)	/* allocate only once; later calls reuse it */
+		c->poly1305 = crypto_alloc_shash("poly1305", 0, 0);
+	ret = PTR_ERR_OR_ZERO(c->poly1305);	/* on failure the ERR_PTR value stays in c->poly1305 */
+
+	if (ret) {
+		bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret));
+		return ret;
+	}
+
+	return 0;
+}
+
+int bch2_disable_encryption(struct bch_fs *c)
+{
+	struct bch_sb_field_crypt *crypt;
+	struct bch_key key;
+	int ret = -EINVAL;
+
+	mutex_lock(&c->sb_lock);
+
+	crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
+	if (!crypt)
+		goto out;
+
+	/* is key encrypted? */
+	ret = 0;
+	if (bch2_key_is_encrypted(&crypt->key))
+		goto out;
+
+	ret = bch2_decrypt_sb_key(c, crypt, &key);
+	if (ret)
+		goto out;
+
+	crypt->key.magic	= cpu_to_le64(BCH_KEY_MAGIC);
+	crypt->key.key		= key;
+
+	SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
+	bch2_write_super(c);
+out:
+	mutex_unlock(&c->sb_lock);
+	memzero_explicit(&key, sizeof(key));	/* never leave the filesystem key on the stack */
+	return ret;
+}
+
+int bch2_enable_encryption(struct bch_fs *c, bool keyed)
+{
+	struct bch_encrypted_key key;
+	struct bch_key user_key;
+	struct bch_sb_field_crypt *crypt;
+	int ret = -EINVAL;
+
+	mutex_lock(&c->sb_lock);
+
+	/* Do we already have an encryption key? */
+	if (bch2_sb_field_get(c->disk_sb.sb, crypt))
+		goto err;
+
+	ret = bch2_alloc_ciphers(c);
+	if (ret)
+		goto err;
+
+	key.magic = cpu_to_le64(BCH_KEY_MAGIC);
+	get_random_bytes(&key.key, sizeof(key.key));
+
+	/* Key the cipher while @key is still plaintext: it is wrapped in place below. */
+	ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key));
+	if (ret)
+		goto err;
+
+	if (keyed) {
+		ret = bch2_request_key(c->disk_sb.sb, &user_key);
+		if (ret) {
+			bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
+			goto err;
+		}
+
+		ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
+					      &key, sizeof(key));
+		if (ret)
+			goto err;
+	}
+
+	crypt = bch2_sb_field_resize(&c->disk_sb, crypt,
+				     sizeof(*crypt) / sizeof(u64));
+	if (!crypt) {
+		ret = -BCH_ERR_ENOSPC_sb_crypt;
+		goto err;
+	}
+
+	crypt->key = key;
+
+	/* write superblock */
+	SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1);
+	bch2_write_super(c);
+err:
+	mutex_unlock(&c->sb_lock);
+	memzero_explicit(&user_key, sizeof(user_key));
+	memzero_explicit(&key, sizeof(key));
+	return ret;
+}
+
+void bch2_fs_encryption_exit(struct bch_fs *c)
+{
+	if (!IS_ERR_OR_NULL(c->poly1305))	/* each handle may be NULL or hold an ERR_PTR from a failed alloc */
+		crypto_free_shash(c->poly1305);
+	if (!IS_ERR_OR_NULL(c->chacha20))
+		crypto_free_sync_skcipher(c->chacha20);
+	if (!IS_ERR_OR_NULL(c->sha256))
+		crypto_free_shash(c->sha256);
+}
+
+int bch2_fs_encryption_init(struct bch_fs *c)
+{
+	struct bch_sb_field_crypt *crypt;
+	struct bch_key key;
+	int ret = 0;
+
+	c->sha256 = crypto_alloc_shash("sha256", 0, 0);	/* allocated whether or not a crypt field exists */
+	ret = PTR_ERR_OR_ZERO(c->sha256);
+	if (ret) {
+		bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret));
+		goto out;
+	}
+
+	crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
+	if (!crypt)
+		goto out;	/* no crypt field in the superblock: nothing more to set up */
+
+	ret = bch2_alloc_ciphers(c);
+	if (ret)
+		goto out;
+
+	ret = bch2_decrypt_sb_key(c, crypt, &key);
+	if (ret)
+		goto out;
+
+	ret = crypto_skcipher_setkey(&c->chacha20->base,
+			(void *) &key.key, sizeof(key.key));	/* key the filesystem cipher with the decrypted key */
+	if (ret)
+		goto out;
+out:
+	memzero_explicit(&key, sizeof(key));	/* runs on every path, including ones where key was never filled */
+	return ret;
+}
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
new file mode 100644
index 000000000000..13998388c545
--- /dev/null
+++ b/fs/bcachefs/checksum.h
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_CHECKSUM_H
+#define _BCACHEFS_CHECKSUM_H
+
+#include "bcachefs.h"
+#include "extents_types.h"
+#include "super-io.h"
+
+#include <linux/crc64.h>
+#include <crypto/chacha.h>
+
+static inline bool bch2_checksum_mergeable(unsigned type)
+{
+	/* Only "none" and the plain crc32c/crc64 types can be merged. */
+	switch (type) {
+	case BCH_CSUM_crc64:
+	case BCH_CSUM_crc32c:
+	case BCH_CSUM_none:
+		return true;
+	default:
+		return false;
+	}
+}
+
+struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum,
+				    struct bch_csum, size_t);
+
+#define BCH_NONCE_EXTENT	cpu_to_le32(1 << 28)
+#define BCH_NONCE_BTREE		cpu_to_le32(2 << 28)
+#define BCH_NONCE_JOURNAL	cpu_to_le32(3 << 28)
+#define BCH_NONCE_PRIO		cpu_to_le32(4 << 28)
+#define BCH_NONCE_POLY		cpu_to_le32(1 << 31)
+
+struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
+			     const void *, size_t);
+
+/*
+ * This is used for various on disk data structures - bch_sb, prio_set, bset,
+ * jset: The checksum is _always_ the first field of these structs
+ */
+#define csum_vstruct(_c, _type, _nonce, _i)				\
+({									\
+	const void *_start = ((const void *) (_i)) + sizeof((_i)->csum);\
+									\
+	bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\
+})
+
+int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
+int bch2_request_key(struct bch_sb *, struct bch_key *);
+#ifndef __KERNEL__
+int bch2_revoke_key(struct bch_sb *);
+#endif
+
+int bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
+		 void *data, size_t);
+
+struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned,
+				  struct nonce, struct bio *);
+
+int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
+			struct bch_extent_crc_unpacked,
+			struct bch_extent_crc_unpacked *,
+			struct bch_extent_crc_unpacked *,
+			unsigned, unsigned, unsigned);
+
+int __bch2_encrypt_bio(struct bch_fs *, unsigned,
+		       struct nonce, struct bio *);
+
+static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type,
+				   struct nonce nonce, struct bio *bio)
+{
+	if (!bch2_csum_type_is_encryption(type))
+		return 0;	/* inline fast path: nothing to encrypt for this type */
+	return __bch2_encrypt_bio(c, type, nonce, bio);
+}
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_crypt;
+
+int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
+			struct bch_key *);
+
+int bch2_disable_encryption(struct bch_fs *);
+int bch2_enable_encryption(struct bch_fs *, bool);
+
+void bch2_fs_encryption_exit(struct bch_fs *);
+int bch2_fs_encryption_init(struct bch_fs *);
+
+static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
+						       bool data)
+{
+	switch (type) {
+	case BCH_CSUM_OPT_xxhash:
+		return BCH_CSUM_xxhash;
+	case BCH_CSUM_OPT_crc64:	/* data gets the plain crc, metadata the _nonzero variant */
+		return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero;
+	case BCH_CSUM_OPT_crc32c:
+		return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero;
+	case BCH_CSUM_OPT_none:
+		return BCH_CSUM_none;
+	default:
+		BUG();
+	}
+}
+
+static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
+							 struct bch_io_opts opts)
+{
+	if (opts.nocow)
+		return 0;	/* nocow: returns type 0 (presumably BCH_CSUM_none - confirm) before even the encryption check */
+
+	if (c->sb.encryption_type)	/* encrypted fs: always an authenticated type; wide_macs picks the 128-bit one */
+		return c->opts.wide_macs
+			? BCH_CSUM_chacha20_poly1305_128
+			: BCH_CSUM_chacha20_poly1305_80;
+
+	return bch2_csum_opt_to_type(opts.data_checksum, true);
+}
+
+static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
+{
+	if (c->sb.encryption_type)	/* encrypted fs: metadata always uses the 128-bit MAC */
+		return BCH_CSUM_chacha20_poly1305_128;
+
+	return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);	/* false: select the metadata variants */
+}
+
+static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
+					   unsigned type)
+{
+	/* An encrypting type is only valid when c->chacha20 is non-NULL. */
+	bool known = type < BCH_CSUM_NR;
+	bool have_cipher = c->chacha20 != NULL;
+
+	if (!known)
+		return false;
+	return have_cipher || !bch2_csum_type_is_encryption(type);
+}
+
+/* Compares two checksums; returns true if they are NOT equal. */
+static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
+{
+	/*
+	 * XXX: need some way of preventing the compiler from optimizing this
+	 * into a form that isn't constant time..
+	 */
+	return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0;	/* xor both halves and or the results: no early exit in the source */
+}
+
+/* For skipping ahead and encrypting/decrypting at a byte offset (a multiple of CHACHA_BLOCK_SIZE): */
+static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
+{
+	EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
+
+	le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);	/* d[0] advances by whole cipher blocks */
+	return nonce;	/* modified copy; the caller's nonce was passed by value */
+}
+
+static inline struct nonce null_nonce(void)
+{
+	struct nonce ret;
+
+	memset(&ret, 0, sizeof(ret));	/* all-zero nonce */
+	return ret;
+}
+
+static inline struct nonce extent_nonce(struct bversion version,
+					struct bch_extent_crc_unpacked crc)
+{
+	unsigned compression_type = crc_is_compressed(crc)
+		? crc.compression_type
+		: 0;
+	unsigned size = compression_type ? crc.uncompressed_size : 0;	/* size only enters the nonce for compressed extents */
+	struct nonce nonce = (struct nonce) {{
+		[0] = cpu_to_le32(size << 22),
+		[1] = cpu_to_le32(version.lo),		/* low 32 bits of version.lo */
+		[2] = cpu_to_le32(version.lo >> 32),	/* high 32 bits of version.lo */
+		[3] = cpu_to_le32(version.hi|
+				  (compression_type << 24))^BCH_NONCE_EXTENT,
+	}};
+
+	return nonce_add(nonce, crc.nonce << 9);	/* crc.nonce is shifted to a byte offset (<< 9) */
+}
+
+static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key)
+{
+	return le64_to_cpu(key->magic) != BCH_KEY_MAGIC;	/* a readable magic means the key is stored in the clear */
+}
+
+static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb)
+{
+	__le64 magic = __bch2_sb_magic(sb);
+
+	return (struct nonce) {{	/* nonce words 2 and 3 are the two 32-bit halves of the sb magic */
+		[0] = 0,
+		[1] = 0,
+		[2] = ((__le32 *) &magic)[0],
+		[3] = ((__le32 *) &magic)[1],
+	}};
+}
+
+static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c)
+{
+	__le64 magic = bch2_sb_magic(c);
+
+	return (struct nonce) {{	/* nonce words 2 and 3 are the two 32-bit halves of the sb magic */
+		[0] = 0,
+		[1] = 0,
+		[2] = ((__le32 *) &magic)[0],
+		[3] = ((__le32 *) &magic)[1],
+	}};
+}
+
+#endif /* _BCACHEFS_CHECKSUM_H */
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
new file mode 100644
index 000000000000..f41889093a2c
--- /dev/null
+++ b/fs/bcachefs/clock.c
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "clock.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/preempt.h>
+
+static inline long io_timer_cmp(io_timer_heap *h,
+				struct io_timer *l,
+				struct io_timer *r)
+{
+	return l->expire - r->expire;	/* unsigned difference returned as signed long */
+}
+
+void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
+{
+	size_t i;
+
+	spin_lock(&clock->timer_lock);
+
+	if (time_after_eq((unsigned long) atomic64_read(&clock->now),
+			  timer->expire)) {
+		spin_unlock(&clock->timer_lock);
+		timer->fn(timer);	/* already due: run it now, after dropping the lock */
+		return;
+	}
+
+	for (i = 0; i < clock->timers.used; i++)
+		if (clock->timers.data[i] == timer)
+			goto out;	/* already queued: adding twice is a no-op */
+
+	BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL));	/* failure (presumably a full heap) is fatal */
+out:
+	spin_unlock(&clock->timer_lock);
+}
+
+void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
+{
+	size_t i;
+
+	spin_lock(&clock->timer_lock);
+
+	for (i = 0; i < clock->timers.used; i++)	/* linear search by pointer */
+		if (clock->timers.data[i] == timer) {
+			heap_del(&clock->timers, i, io_timer_cmp, NULL);
+			break;
+		}
+
+	spin_unlock(&clock->timer_lock);	/* deleting a timer that is not queued is a no-op */
+}
+
+struct io_clock_wait {
+	struct io_timer		io_timer;	/* fires when the IO clock reaches the target */
+	struct timer_list	cpu_timer;	/* fires on the jiffies-based timeout */
+	struct task_struct	*task;		/* task woken by either callback */
+	int			expired;	/* set to 1 by either callback */
+};
+
+static void io_clock_wait_fn(struct io_timer *timer)
+{
+	struct io_clock_wait *wait = container_of(timer,
+				struct io_clock_wait, io_timer);
+
+	wait->expired = 1;		/* IO-clock expiry: mark the wait done ... */
+	wake_up_process(wait->task);	/* ... and wake the waiting task */
+}
+
+static void io_clock_cpu_timeout(struct timer_list *timer)
+{
+	struct io_clock_wait *wait = container_of(timer,
+				struct io_clock_wait, cpu_timer);
+
+	wait->expired = 1;		/* timer_list expiry: mark the wait done ... */
+	wake_up_process(wait->task);	/* ... and wake the waiting task */
+}
+
+void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until)
+{
+	struct io_clock_wait wait;
+
+	/* XXX: calculate sleep time rigorously */
+	wait.io_timer.expire	= until;
+	wait.io_timer.fn	= io_clock_wait_fn;
+	wait.task		= current;
+	wait.expired		= 0;
+	bch2_io_timer_add(clock, &wait.io_timer);	/* runs the callback immediately if @until has passed */
+
+	schedule();	/* NOTE(review): task state is not set here; presumably the caller sets it - confirm */
+
+	bch2_io_timer_del(clock, &wait.io_timer);	/* wait lives on this stack: dequeue before returning */
+}
+
+void bch2_kthread_io_clock_wait(struct io_clock *clock,
+				unsigned long io_until,
+				unsigned long cpu_timeout)
+{
+	bool kthread = (current->flags & PF_KTHREAD) != 0;
+	struct io_clock_wait wait;
+
+	wait.io_timer.expire	= io_until;
+	wait.io_timer.fn	= io_clock_wait_fn;
+	wait.task		= current;
+	wait.expired		= 0;
+	bch2_io_timer_add(clock, &wait.io_timer);
+
+	timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0);
+
+	if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)	/* MAX_SCHEDULE_TIMEOUT: wait on the IO clock only */
+		mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
+
+	while (1) {
+		set_current_state(TASK_INTERRUPTIBLE);	/* set before the checks below */
+		if (kthread && kthread_should_stop())
+			break;
+
+		if (wait.expired)	/* either the IO timer or the cpu timer fired */
+			break;
+
+		schedule();
+		try_to_freeze();
+	}
+
+	__set_current_state(TASK_RUNNING);
+	del_timer_sync(&wait.cpu_timer);	/* both timers reference this stack frame: tear them down before returning */
+	destroy_timer_on_stack(&wait.cpu_timer);
+	bch2_io_timer_del(clock, &wait.io_timer);
+}
+
+static struct io_timer *get_expired_timer(struct io_clock *clock,
+					  unsigned long now)
+{
+	struct io_timer *ret = NULL;
+
+	spin_lock(&clock->timer_lock);
+
+	if (clock->timers.used &&
+	    time_after_eq(now, clock->timers.data[0]->expire))	/* data[0]: heap top, presumably the earliest expiry */
+		heap_pop(&clock->timers, ret, io_timer_cmp, NULL);
+
+	spin_unlock(&clock->timer_lock);
+
+	return ret;	/* NULL when nothing is due */
+}
+
+void __bch2_increment_clock(struct io_clock *clock, unsigned sectors)
+{
+	struct io_timer *timer;
+	unsigned long now = atomic64_add_return(sectors, &clock->now);	/* advance the clock and read the new time */
+
+	while ((timer = get_expired_timer(clock, now)))
+		timer->fn(timer);	/* called after get_expired_timer() has dropped timer_lock */
+}
+
+void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
+{
+	unsigned long now;
+	unsigned i;
+
+	out->atomic++;	/* presumably tells printbuf not to sleep while the spinlock is held - confirm */
+	spin_lock(&clock->timer_lock);
+	now = atomic64_read(&clock->now);
+
+	for (i = 0; i < clock->timers.used; i++)	/* one line per timer: callback symbol and time remaining */
+		prt_printf(out, "%ps:\t%li\n",
+		       clock->timers.data[i]->fn,
+		       clock->timers.data[i]->expire - now);
+	spin_unlock(&clock->timer_lock);
+	--out->atomic;
+}
+
+void bch2_io_clock_exit(struct io_clock *clock)
+{
+	free_heap(&clock->timers);	/* releases what bch2_io_clock_init() set up */
+	free_percpu(clock->pcpu_buf);
+}
+
+int bch2_io_clock_init(struct io_clock *clock)
+{
+	atomic64_set(&clock->now, 0);
+	spin_lock_init(&clock->timer_lock);
+
+	clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();	/* most that can sit unflushed in the percpu buffers */
+
+	clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf);
+	if (!clock->pcpu_buf)
+		return -BCH_ERR_ENOMEM_io_clock_init;
+
+	if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL))
+		return -BCH_ERR_ENOMEM_io_clock_init;	/* NOTE(review): pcpu_buf is not freed here; presumably the caller runs bch2_io_clock_exit() - confirm */
+
+	return 0;
+}
diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h
new file mode 100644
index 000000000000..70a0f7436c84
--- /dev/null
+++ b/fs/bcachefs/clock.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_CLOCK_H
+#define _BCACHEFS_CLOCK_H
+
+void bch2_io_timer_add(struct io_clock *, struct io_timer *);
+void bch2_io_timer_del(struct io_clock *, struct io_timer *);
+void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long,
+				unsigned long);
+
+void __bch2_increment_clock(struct io_clock *, unsigned);
+
+static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors,
+					int rw)
+{
+	struct io_clock *clock = &c->io_clock[rw];	/* one clock per IO direction */
+
+	if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >=
+		   IO_CLOCK_PCPU_SECTORS))	/* batch per cpu; flush once the threshold is reached */
+		__bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0));
+}
+
+void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
+
+#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\
+({									\
+	long __ret = timeout;						\
+	might_sleep();							\
+	if (!___wait_cond_timeout(condition))				\
+		__ret = __wait_event_timeout(wq, condition, timeout);	\
+	__ret;								\
+})
+
+void bch2_io_timers_to_text(struct printbuf *, struct io_clock *);
+
+void bch2_io_clock_exit(struct io_clock *);
+int bch2_io_clock_init(struct io_clock *);
+
+#endif /* _BCACHEFS_CLOCK_H */
diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h
new file mode 100644
index 000000000000..5fae0012d808
--- /dev/null
+++ b/fs/bcachefs/clock_types.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_CLOCK_TYPES_H
+#define _BCACHEFS_CLOCK_TYPES_H
+
+#include "util.h"
+
+#define NR_IO_TIMERS		(BCH_SB_MEMBERS_MAX * 3)
+
+/*
+ * Clocks/timers in units of sectors of IO:
+ *
+ * Note - they use percpu batching, so they're only approximate.
+ */
+
+struct io_timer;
+typedef void (*io_timer_fn)(struct io_timer *);
+
+struct io_timer {
+	io_timer_fn		fn;
+	unsigned long		expire;
+};
+
+/* Amount to buffer up on a percpu counter */
+#define IO_CLOCK_PCPU_SECTORS	128
+
+typedef HEAP(struct io_timer *)	io_timer_heap;
+
+struct io_clock {
+	atomic64_t		now;
+	u16 __percpu		*pcpu_buf;
+	unsigned		max_slop;
+
+	spinlock_t		timer_lock;
+	io_timer_heap		timers;
+};
+
+#endif /* _BCACHEFS_CLOCK_TYPES_H */
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
new file mode 100644
index 000000000000..1480b64547b0
--- /dev/null
+++ b/fs/bcachefs/compress.c
@@ -0,0 +1,710 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "checksum.h"
+#include "compress.h"
+#include "extents.h"
+#include "super-io.h"
+
+#include <linux/lz4.h>
+#include <linux/zlib.h>
+#include <linux/zstd.h>
+
+/* Bounce buffer: a linear view of a bio's data, plus how to release it */
+struct bbuf {
+	void		*b;	/* start of the linear buffer */
+	enum {
+		BB_NONE,	/* points straight into the bio's pages; nothing to release */
+		BB_VMAP,	/* vmap() of the bio's pages; released with vunmap() */
+		BB_KMALLOC,	/* kmalloc()ed buffer; released with kfree() */
+		BB_MEMPOOL,	/* from c->compression_bounce[rw]; released with mempool_free() */
+	}		type;
+	int		rw;	/* index into c->compression_bounce[] (READ or WRITE) */
+};
+
+static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw)
+{
+	void *b;
+
+	BUG_ON(size > c->opts.encoded_extent_max);
+
+	b = kmalloc(size, GFP_NOFS|__GFP_NOWARN);
+	if (b)
+		return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw };
+
+	b = mempool_alloc(&c->compression_bounce[rw], GFP_NOFS);
+	if (b)
+		return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
+
+	BUG();
+}
+
+static bool bio_phys_contig(struct bio *bio, struct bvec_iter start)
+{
+	struct bio_vec bv;
+	struct bvec_iter iter;
+	void *expected_start = NULL;
+
+	__bio_for_each_bvec(bv, bio, iter, start) {
+		if (expected_start &&
+		    expected_start != page_address(bv.bv_page) + bv.bv_offset)
+			return false;
+
+		expected_start = page_address(bv.bv_page) +
+			bv.bv_offset + bv.bv_len;
+	}
+
+	return true;
+}
+
+static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
+				       struct bvec_iter start, int rw)
+{
+	struct bbuf ret;
+	struct bio_vec bv;
+	struct bvec_iter iter;
+	unsigned nr_pages = 0;
+	struct page *stack_pages[16];
+	struct page **pages = NULL;
+	void *data;
+
+	BUG_ON(start.bi_size > c->opts.encoded_extent_max);
+
+	if (!PageHighMem(bio_iter_page(bio, start)) &&
+	    bio_phys_contig(bio, start))
+		return (struct bbuf) {
+			.b = page_address(bio_iter_page(bio, start)) +
+				bio_iter_offset(bio, start),
+			.type = BB_NONE, .rw = rw
+		};
+
+	/* check if we can map the pages contiguously: */
+	__bio_for_each_segment(bv, bio, iter, start) {
+		if (iter.bi_size != start.bi_size &&
+		    bv.bv_offset)
+			goto bounce;
+
+		if (bv.bv_len < iter.bi_size &&
+		    bv.bv_offset + bv.bv_len < PAGE_SIZE)
+			goto bounce;
+
+		nr_pages++;
+	}
+
+	BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages);
+
+	pages = nr_pages > ARRAY_SIZE(stack_pages)
+		? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS)
+		: stack_pages;
+	if (!pages)
+		goto bounce;
+
+	nr_pages = 0;
+	__bio_for_each_segment(bv, bio, iter, start)
+		pages[nr_pages++] = bv.bv_page;
+
+	data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+	if (pages != stack_pages)
+		kfree(pages);
+
+	if (data)
+		return (struct bbuf) {
+			.b = data + bio_iter_offset(bio, start),
+			.type = BB_VMAP, .rw = rw
+		};
+bounce:
+	ret = __bounce_alloc(c, start.bi_size, rw);
+
+	if (rw == READ)
+		memcpy_from_bio(ret.b, bio, start);
+
+	return ret;
+}
+
+static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw)
+{
+	return __bio_map_or_bounce(c, bio, bio->bi_iter, rw);
+}
+
+static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf)
+{
+	switch (buf.type) {
+	case BB_NONE:
+		break;
+	case BB_VMAP:
+		vunmap((void *) ((unsigned long) buf.b & PAGE_MASK));
+		break;
+	case BB_KMALLOC:
+		kfree(buf.b);
+		break;
+	case BB_MEMPOOL:
+		mempool_free(buf.b, &c->compression_bounce[buf.rw]);
+		break;
+	}
+}
+
+static inline void zlib_set_workspace(z_stream *strm, void *workspace)
+{
+#ifdef __KERNEL__
+	strm->workspace = workspace;
+#endif
+}
+
+static int __bio_uncompress(struct bch_fs *c, struct bio *src,
+			    void *dst_data, struct bch_extent_crc_unpacked crc)
+{
+	struct bbuf src_data = { NULL };
+	size_t src_len = src->bi_iter.bi_size;
+	size_t dst_len = crc.uncompressed_size << 9;
+	void *workspace;
+	int ret;
+
+	src_data = bio_map_or_bounce(c, src, READ);
+
+	switch (crc.compression_type) {
+	case BCH_COMPRESSION_TYPE_lz4_old:
+	case BCH_COMPRESSION_TYPE_lz4:
+		ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
+						  src_len, dst_len, dst_len);
+		if (ret != dst_len)
+			goto err;
+		break;
+	case BCH_COMPRESSION_TYPE_gzip: {
+		z_stream strm = {
+			.next_in	= src_data.b,
+			.avail_in	= src_len,
+			.next_out	= dst_data,
+			.avail_out	= dst_len,
+		};
+
+		workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS);
+
+		zlib_set_workspace(&strm, workspace);
+		zlib_inflateInit2(&strm, -MAX_WBITS);
+		ret = zlib_inflate(&strm, Z_FINISH);
+
+		mempool_free(workspace, &c->decompress_workspace);
+
+		if (ret != Z_STREAM_END)
+			goto err;
+		break;
+	}
+	case BCH_COMPRESSION_TYPE_zstd: {
+		ZSTD_DCtx *ctx;
+		size_t real_src_len = le32_to_cpup(src_data.b);
+
+		if (real_src_len > src_len - 4)
+			goto err;
+
+		workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS);
+		ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
+
+		ret = zstd_decompress_dctx(ctx,
+				dst_data,	dst_len,
+				src_data.b + 4, real_src_len);
+
+		mempool_free(workspace, &c->decompress_workspace);
+
+		if (ret != dst_len)
+			goto err;
+		break;
+	}
+	default:
+		BUG();
+	}
+	ret = 0;
+out:
+	bio_unmap_or_unbounce(c, src_data);
+	return ret;
+err:
+	ret = -EIO;
+	goto out;
+}
+
+int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
+				struct bch_extent_crc_unpacked *crc)
+{
+	struct bbuf data = { NULL };
+	size_t dst_len = crc->uncompressed_size << 9;
+
+	/* bio must own its pages: */
+	BUG_ON(!bio->bi_vcnt);
+	BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
+
+	if (crc->uncompressed_size << 9	> c->opts.encoded_extent_max ||
+	    crc->compressed_size << 9	> c->opts.encoded_extent_max) {
+		bch_err(c, "error rewriting existing data: extent too big");
+		return -EIO;
+	}
+
+	data = __bounce_alloc(c, dst_len, WRITE);
+
+	if (__bio_uncompress(c, bio, data.b, *crc)) {
+		if (!c->opts.no_data_io)
+			bch_err(c, "error rewriting existing data: decompression error");
+		bio_unmap_or_unbounce(c, data);
+		return -EIO;
+	}
+
+	/*
+	 * XXX: don't have a good way to assert that the bio was allocated with
+	 * enough space, we depend on bch2_move_extent doing the right thing
+	 */
+	bio->bi_iter.bi_size = crc->live_size << 9;
+
+	memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9));
+
+	crc->csum_type		= 0;
+	crc->compression_type	= 0;
+	crc->compressed_size	= crc->live_size;
+	crc->uncompressed_size	= crc->live_size;
+	crc->offset		= 0;
+	crc->csum		= (struct bch_csum) { 0, 0 };
+
+	bio_unmap_or_unbounce(c, data);
+	return 0;
+}
+
+int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
+		       struct bio *dst, struct bvec_iter dst_iter,
+		       struct bch_extent_crc_unpacked crc)
+{
+	struct bbuf dst_data = { NULL };
+	size_t dst_len = crc.uncompressed_size << 9;
+	int ret;
+
+	if (crc.uncompressed_size << 9	> c->opts.encoded_extent_max ||
+	    crc.compressed_size << 9	> c->opts.encoded_extent_max)
+		return -EIO;
+
+	dst_data = dst_len == dst_iter.bi_size
+		? __bio_map_or_bounce(c, dst, dst_iter, WRITE)
+		: __bounce_alloc(c, dst_len, WRITE);
+
+	ret = __bio_uncompress(c, src, dst_data.b, crc);
+	if (ret)
+		goto err;
+
+	if (dst_data.type != BB_NONE &&
+	    dst_data.type != BB_VMAP)
+		memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9));
+err:
+	bio_unmap_or_unbounce(c, dst_data);
+	return ret;
+}
+
+static int attempt_compress(struct bch_fs *c,
+			    void *workspace,
+			    void *dst, size_t dst_len,
+			    void *src, size_t src_len,
+			    struct bch_compression_opt compression)
+{
+	enum bch_compression_type compression_type =
+		__bch2_compression_opt_to_type[compression.type];
+	/* > 0: bytes written to dst; <= 0: failed (caller treats a negative value as a size hint) */
+	switch (compression_type) {
+	case BCH_COMPRESSION_TYPE_lz4:
+		if (compression.level < LZ4HC_MIN_CLEVEL) {
+			int len = src_len;
+			int ret = LZ4_compress_destSize(
+					src,		dst,
+					&len,		dst_len,
+					workspace);
+			if (len < src_len)
+				return -len;
+
+			return ret;
+		} else {
+			int ret = LZ4_compress_HC(
+					src,		dst,
+					src_len,	dst_len,
+					compression.level,
+					workspace);
+
+			return ret ?: -1;
+		}
+	case BCH_COMPRESSION_TYPE_gzip: {
+		z_stream strm = {
+			.next_in	= src,
+			.avail_in	= src_len,
+			.next_out	= dst,
+			.avail_out	= dst_len,
+		};
+
+		zlib_set_workspace(&strm, workspace);
+		zlib_deflateInit2(&strm,
+				  compression.level
+				  ? clamp_t(unsigned, compression.level,
+					    Z_BEST_SPEED, Z_BEST_COMPRESSION)
+				  : Z_DEFAULT_COMPRESSION,
+				  Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
+				  Z_DEFAULT_STRATEGY);
+
+		if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END)
+			return 0;
+
+		if (zlib_deflateEnd(&strm) != Z_OK)
+			return 0;
+
+		return strm.total_out;
+	}
+	case BCH_COMPRESSION_TYPE_zstd: {
+		/*
+		 * rescale:
+		 * zstd max compression level is 22, our max level is 15
+		 */
+		unsigned level = min((compression.level * 3) / 2, zstd_max_clevel());
+		ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max);
+		ZSTD_CCtx *ctx = zstd_init_cctx(workspace,
+			zstd_cctx_workspace_bound(&params.cParams));
+
+		/*
+		 * ZSTD requires that when we decompress we pass in the exact
+		 * compressed size - rounding it up to the nearest sector
+		 * doesn't work, so we use the first 4 bytes of the buffer for
+		 * that.
+		 *
+		 * Additionally, the ZSTD code seems to have a bug where it will
+		 * write just past the end of the buffer - so subtract a fudge
+		 * factor (7 bytes) from the dst buffer size to account for
+		 * that.
+		 */
+		size_t len = zstd_compress_cctx(ctx,
+				dst + 4,	dst_len - 4 - 7,
+				src,		src_len,
+				&params);	/* use the requested level, matching the ctx sizing above */
+		if (zstd_is_error(len))
+			return 0;
+
+		*((__le32 *) dst) = cpu_to_le32(len);
+		return len + 4;
+	}
+	default:
+		BUG();
+	}
+}
+
+static unsigned __bio_compress(struct bch_fs *c,
+			       struct bio *dst, size_t *dst_len,
+			       struct bio *src, size_t *src_len,
+			       struct bch_compression_opt compression)
+{
+	struct bbuf src_data = { NULL }, dst_data = { NULL };
+	void *workspace;
+	enum bch_compression_type compression_type =
+		__bch2_compression_opt_to_type[compression.type];
+	unsigned pad;
+	int ret = 0;
+
+	BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR);
+	BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type]));
+
+	/* If it's only one block, don't bother trying to compress: */
+	if (src->bi_iter.bi_size <= c->opts.block_size)
+		return BCH_COMPRESSION_TYPE_incompressible;
+
+	dst_data = bio_map_or_bounce(c, dst, WRITE);
+	src_data = bio_map_or_bounce(c, src, READ);
+
+	workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOFS);
+
+	*src_len = src->bi_iter.bi_size;
+	*dst_len = dst->bi_iter.bi_size;
+
+	/*
+	 * XXX: this algorithm sucks when the compression code doesn't tell us
+	 * how much would fit, like LZ4 does:
+	 */
+	while (1) {
+		if (*src_len <= block_bytes(c)) {
+			ret = -1;
+			break;
+		}
+
+		ret = attempt_compress(c, workspace,
+				       dst_data.b,	*dst_len,
+				       src_data.b,	*src_len,
+				       compression);
+		if (ret > 0) {
+			*dst_len = ret;
+			ret = 0;
+			break;
+		}
+
+		/* Didn't fit: should we retry with a smaller amount?  */
+		if (*src_len <= *dst_len) {
+			ret = -1;
+			break;
+		}
+
+		/*
+		 * If ret is negative, it's a hint as to how much data would fit
+		 */
+		BUG_ON(-ret >= *src_len);
+
+		if (ret < 0)
+			*src_len = -ret;
+		else
+			*src_len -= (*src_len - *dst_len) / 2;
+		*src_len = round_down(*src_len, block_bytes(c));
+	}
+
+	mempool_free(workspace, &c->compress_workspace[compression_type]);
+
+	if (ret)
+		goto err;
+
+	/* Didn't get smaller: */
+	if (round_up(*dst_len, block_bytes(c)) >= *src_len)
+		goto err;
+
+	pad = round_up(*dst_len, block_bytes(c)) - *dst_len;
+
+	memset(dst_data.b + *dst_len, 0, pad);
+	*dst_len += pad;
+
+	if (dst_data.type != BB_NONE &&
+	    dst_data.type != BB_VMAP)
+		memcpy_to_bio(dst, dst->bi_iter, dst_data.b);
+
+	BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
+	BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
+	BUG_ON(*dst_len & (block_bytes(c) - 1));
+	BUG_ON(*src_len & (block_bytes(c) - 1));
+	ret = compression_type;
+out:
+	bio_unmap_or_unbounce(c, src_data);
+	bio_unmap_or_unbounce(c, dst_data);
+	return ret;
+err:
+	ret = BCH_COMPRESSION_TYPE_incompressible;
+	goto out;
+}
+
+unsigned bch2_bio_compress(struct bch_fs *c,
+			   struct bio *dst, size_t *dst_len,
+			   struct bio *src, size_t *src_len,
+			   unsigned compression_opt)
+{
+	unsigned orig_dst = dst->bi_iter.bi_size;
+	unsigned orig_src = src->bi_iter.bi_size;
+	unsigned compression_type;
+
+	/* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
+	src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size,
+				     c->opts.encoded_extent_max);
+	/* Don't generate a bigger output than input: */
+	dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+
+	compression_type =
+		__bio_compress(c, dst, dst_len, src, src_len,
+			       bch2_compression_decode(compression_opt));
+
+	dst->bi_iter.bi_size = orig_dst;
+	src->bi_iter.bi_size = orig_src;
+	return compression_type;
+}
+
+static int __bch2_fs_compress_init(struct bch_fs *, u64);
+
+#define BCH_FEATURE_none	0
+
+static const unsigned bch2_compression_opt_to_feature[] = {
+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t,
+	BCH_COMPRESSION_OPTS()
+#undef x
+};
+
+#undef BCH_FEATURE_none
+
+static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
+{
+	int ret = 0;
+
+	if ((c->sb.features & f) == f)
+		return 0;
+
+	mutex_lock(&c->sb_lock);
+
+	if ((c->sb.features & f) == f) {
+		mutex_unlock(&c->sb_lock);
+		return 0;
+	}
+
+	ret = __bch2_fs_compress_init(c, c->sb.features|f);
+	if (ret) {
+		mutex_unlock(&c->sb_lock);
+		return ret;
+	}
+
+	c->disk_sb.sb->features[0] |= cpu_to_le64(f);
+	bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	return 0;
+}
+
+int bch2_check_set_has_compressed_data(struct bch_fs *c,
+				       unsigned compression_opt)
+{
+	unsigned compression_type = bch2_compression_decode(compression_opt).type;
+
+	BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature));
+
+	return compression_type
+		? __bch2_check_set_has_compressed_data(c,
+				1ULL << bch2_compression_opt_to_feature[compression_type])
+		: 0;
+}
+
+void bch2_fs_compress_exit(struct bch_fs *c)
+{
+	unsigned i;
+
+	mempool_exit(&c->decompress_workspace);
+	for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++)
+		mempool_exit(&c->compress_workspace[i]);
+	mempool_exit(&c->compression_bounce[WRITE]);
+	mempool_exit(&c->compression_bounce[READ]);
+}
+
+static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
+{
+	size_t decompress_workspace_size = 0;
+	ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
+						 c->opts.encoded_extent_max);
+	struct {
+		unsigned			feature;
+		enum bch_compression_type	type;
+		size_t				compress_workspace;
+		size_t				decompress_workspace;
+	} compression_types[] = {
+		{ BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4,
+			max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS),
+			0 },
+		{ BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip,
+			zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
+			zlib_inflate_workspacesize(), },
+		{ BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd,
+			zstd_cctx_workspace_bound(&params.cParams),
+			zstd_dctx_workspace_bound() },
+	}, *i;
+	bool have_compressed = false;
+
+	c->zstd_params = params;
+	/* Nothing to allocate unless at least one compression feature bit is set in @features: */
+	for (i = compression_types;
+	     i < compression_types + ARRAY_SIZE(compression_types);
+	     i++)
+		have_compressed |= (features & BIT_ULL(i->feature)) != 0;
+
+	if (!have_compressed)
+		return 0;
+
+	if (!mempool_initialized(&c->compression_bounce[READ]) &&
+	    mempool_init_kvpmalloc_pool(&c->compression_bounce[READ],
+					1, c->opts.encoded_extent_max))
+		return -BCH_ERR_ENOMEM_compression_bounce_read_init;
+
+	if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
+	    mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE],
+					1, c->opts.encoded_extent_max))
+		return -BCH_ERR_ENOMEM_compression_bounce_write_init;
+
+	for (i = compression_types;
+	     i < compression_types + ARRAY_SIZE(compression_types);
+	     i++) {
+		decompress_workspace_size =
+			max(decompress_workspace_size, i->decompress_workspace);
+
+		if (!(features & BIT_ULL(i->feature)))	/* u64 mask: avoid an int-width shift */
+			continue;
+
+		if (mempool_initialized(&c->compress_workspace[i->type]))
+			continue;
+
+		if (mempool_init_kvpmalloc_pool(
+				&c->compress_workspace[i->type],
+				1, i->compress_workspace))
+			return -BCH_ERR_ENOMEM_compression_workspace_init;
+	}
+
+	if (!mempool_initialized(&c->decompress_workspace) &&
+	    mempool_init_kvpmalloc_pool(&c->decompress_workspace,
+					1, decompress_workspace_size))
+		return -BCH_ERR_ENOMEM_decompression_workspace_init;
+
+	return 0;
+}
+
+static u64 compression_opt_to_feature(unsigned v)
+{
+	unsigned type = bch2_compression_decode(v).type;
+
+	return BIT_ULL(bch2_compression_opt_to_feature[type]);
+}
+
+int bch2_fs_compress_init(struct bch_fs *c)
+{
+	u64 f = c->sb.features;
+
+	f |= compression_opt_to_feature(c->opts.compression);
+	f |= compression_opt_to_feature(c->opts.background_compression);
+
+	return __bch2_fs_compress_init(c, f);
+}
+
+int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res,
+			       struct printbuf *err)
+{
+	char *val = kstrdup(_val, GFP_KERNEL);
+	char *p = val, *type_str, *level_str;
+	struct bch_compression_opt opt = { 0 };
+	int ret;
+
+	if (!val)
+		return -ENOMEM;
+
+	type_str = strsep(&p, ":");
+	level_str = p;
+
+	ret = match_string(bch2_compression_opts, -1, type_str);
+	if (ret < 0 && err)
+		prt_str(err, "invalid compression type");
+	if (ret < 0)
+		goto err;
+
+	opt.type = ret;
+	ret = 0;	/* ret held the type index; success must be 0 with or without a level */
+	if (level_str) {
+		unsigned level;
+
+		ret = kstrtouint(level_str, 10, &level);
+		if (!ret && !opt.type && level)
+			ret = -EINVAL;
+		if (!ret && level > 15)
+			ret = -EINVAL;
+		if (ret < 0 && err)
+			prt_str(err, "invalid compression level");
+		if (ret < 0)
+			goto err;
+
+		opt.level = level;
+	}
+
+	*res = bch2_compression_encode(opt);
+err:
+	kfree(val);
+	return ret;
+}
+
+void bch2_opt_compression_to_text(struct printbuf *out,
+				  struct bch_fs *c,
+				  struct bch_sb *sb,
+				  u64 v)
+{
+	struct bch_compression_opt opt = bch2_compression_decode(v);
+
+	prt_str(out, bch2_compression_opts[opt.type]);
+	if (opt.level)
+		prt_printf(out, ":%u", opt.level);
+}
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
new file mode 100644
index 000000000000..052ea303241f
--- /dev/null
+++ b/fs/bcachefs/compress.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_COMPRESS_H
+#define _BCACHEFS_COMPRESS_H
+
+#include "extents_types.h"
+
+struct bch_compression_opt {
+	u8		type:4,
+			level:4;
+};
+
+static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
+{
+	return (struct bch_compression_opt) {
+		.type	= v & 15,
+		.level	= v >> 4,
+	};
+}
+
+static inline unsigned bch2_compression_encode(struct bch_compression_opt opt)
+{
+	return opt.type|(opt.level << 4);
+}
+
+static const unsigned __bch2_compression_opt_to_type[] = {
+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
+	BCH_COMPRESSION_OPTS()
+#undef x
+};
+
+static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
+{
+	return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
+}
+
+int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
+				struct bch_extent_crc_unpacked *);
+int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
+		       struct bvec_iter, struct bch_extent_crc_unpacked);
+unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
+			   struct bio *, size_t *, unsigned);
+
+int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
+void bch2_fs_compress_exit(struct bch_fs *);
+int bch2_fs_compress_init(struct bch_fs *);
+
+int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
+void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
+
+#define bch2_opt_compression (struct bch_opt_fn) {		\
+	.parse		= bch2_opt_compression_parse,	\
+	.to_text	= bch2_opt_compression_to_text,	\
+}
+
+#endif /* _BCACHEFS_COMPRESS_H */
diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/counters.c
new file mode 100644
index 000000000000..02a996e06a64
--- /dev/null
+++ b/fs/bcachefs/counters.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "super-io.h"
+#include "counters.h"
+
+/* BCH_SB_FIELD_counters */
+
+static const char * const bch2_counter_names[] = {
+#define x(t, n, ...) (#t),
+	BCH_PERSISTENT_COUNTERS()
+#undef x
+	NULL
+};
+
+static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs)
+{
+	if (!ctrs)
+		return 0;
+	/* Number of __le64 slots between d[0] and the end of the field: */
+	return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0];
+}
+
+static int bch2_sb_counters_validate(struct bch_sb *sb,
+				     struct bch_sb_field *f,
+				     struct printbuf *err)
+{
+	return 0;	/* no checks are performed: every counters field is accepted */
+}
+
+static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
+			      struct bch_sb_field *f)
+{
+	struct bch_sb_field_counters *ctrs = field_to_type(f, counters);
+	unsigned int i;
+	unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+	/* One line per stored counter; indices >= BCH_COUNTER_NR have no name in bch2_counter_names */
+	for (i = 0; i < nr; i++) {
+		if (i < BCH_COUNTER_NR)
+			prt_printf(out, "%s ", bch2_counter_names[i]);
+		else
+			prt_printf(out, "(unknown)");
+
+		prt_tab(out);
+		prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i]));
+		prt_newline(out);
+	}
+}
+
+int bch2_sb_counters_to_cpu(struct bch_fs *c)
+{
+	struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
+	unsigned int i;
+	unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+	u64 val = 0;
+
+	for (i = 0; i < BCH_COUNTER_NR; i++)
+		c->counters_on_mount[i] = 0;
+	/* Load at most BCH_COUNTER_NR counters, and no more than the superblock field holds: */
+	for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) {
+		val = le64_to_cpu(ctrs->d[i]);
+		percpu_u64_set(&c->counters[i], val);
+		c->counters_on_mount[i] = val;
+	}
+	return 0;
+}
+
+int bch2_sb_counters_from_cpu(struct bch_fs *c)
+{
+	struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
+	struct bch_sb_field_counters *ret;
+	unsigned int i;
+	unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+
+	if (nr < BCH_COUNTER_NR) {
+		ret = bch2_sb_field_resize(&c->disk_sb, counters,
+					       sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
+
+		if (ret) {
+			ctrs = ret;
+			nr = bch2_sb_counter_nr_entries(ctrs);
+		}
+	}
+
+	/* A failed resize is not reported: only the first nr counters (possibly none) are stored */
+	for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++)
+		ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i]));
+	return 0;
+}
+
+void bch2_fs_counters_exit(struct bch_fs *c)
+{
+	free_percpu(c->counters);
+}
+
+int bch2_fs_counters_init(struct bch_fs *c)
+{
+	c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64));
+	if (!c->counters)
+		return -BCH_ERR_ENOMEM_fs_counters_init;
+
+	return bch2_sb_counters_to_cpu(c);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_counters = {
+	.validate	= bch2_sb_counters_validate,
+	.to_text	= bch2_sb_counters_to_text,
+};
diff --git a/fs/bcachefs/counters.h b/fs/bcachefs/counters.h
new file mode 100644
index 000000000000..4778aa19bf34
--- /dev/null
+++ b/fs/bcachefs/counters.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_COUNTERS_H
+#define _BCACHEFS_COUNTERS_H
+
+#include "bcachefs.h"
+#include "super-io.h"
+
+
+int bch2_sb_counters_to_cpu(struct bch_fs *);
+int bch2_sb_counters_from_cpu(struct bch_fs *);
+
+void bch2_fs_counters_exit(struct bch_fs *);
+int bch2_fs_counters_init(struct bch_fs *);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_counters;
+
+#endif // _BCACHEFS_COUNTERS_H
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
new file mode 100644
index 000000000000..114f86b45fd5
--- /dev/null
+++ b/fs/bcachefs/darray.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DARRAY_H
+#define _BCACHEFS_DARRAY_H
+
+/*
+ * Dynamic arrays:
+ *
+ * Inspired by CCAN's darray
+ */
+
+#include "util.h"
+#include <linux/slab.h>
+
+#define DARRAY(type)							\
+struct {								\
+	size_t nr, size;						\
+	type *data;							\
+}
+
+typedef DARRAY(void) darray_void;
+
+static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp)
+{
+	if (d->nr + more > d->size) {
+		size_t new_size = roundup_pow_of_two(d->nr + more);
+		void *data = krealloc_array(d->data, new_size, t_size, gfp);
+
+		if (!data)
+			return -ENOMEM;
+
+		d->data	= data;
+		d->size = new_size;
+	}
+
+	return 0;
+}
+
+#define darray_make_room_gfp(_d, _more, _gfp)				\
+	__darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more), _gfp)
+
+#define darray_make_room(_d, _more)					\
+	darray_make_room_gfp(_d, _more, GFP_KERNEL)
+
+#define darray_top(_d)		((_d).data[(_d).nr])
+
+#define darray_push_gfp(_d, _item, _gfp)				\
+({									\
+	int _ret = darray_make_room_gfp((_d), 1, _gfp);			\
+									\
+	if (!_ret)							\
+		(_d)->data[(_d)->nr++] = (_item);			\
+	_ret;								\
+})
+
+#define darray_push(_d, _item)	darray_push_gfp(_d, _item, GFP_KERNEL)
+
+#define darray_pop(_d)		((_d)->data[--(_d)->nr])
+
+#define darray_first(_d)	((_d).data[0])
+#define darray_last(_d)		((_d).data[(_d).nr - 1])
+
+#define darray_insert_item(_d, pos, _item)				\
+({									\
+	size_t _pos = (pos);						\
+	int _ret = darray_make_room((_d), 1);				\
+									\
+	if (!_ret)							\
+		array_insert_item((_d)->data, (_d)->nr, _pos, (_item));	\
+	_ret;								\
+})
+
+#define darray_for_each(_d, _i)						\
+	for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++)
+
+#define darray_init(_d)							\
+do {									\
+	(_d)->data = NULL;						\
+	(_d)->nr = (_d)->size = 0;					\
+} while (0)
+
+#define darray_exit(_d)							\
+do {									\
+	kfree((_d)->data);						\
+	darray_init(_d);						\
+} while (0)
+
+#endif /* _BCACHEFS_DARRAY_H */
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
new file mode 100644
index 000000000000..899ff46de8e0
--- /dev/null
+++ b/fs/bcachefs/data_update.c
@@ -0,0 +1,558 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "data_update.h"
+#include "ec.h"
+#include "error.h"
+#include "extents.h"
+#include "io_write.h"
+#include "keylist.h"
+#include "move.h"
+#include "nocow_locking.h"
+#include "subvolume.h"
+#include "trace.h"
+
+/*
+ * Emit the move_extent_finish tracepoint with @k rendered as text.
+ * Does nothing when the tracepoint is disabled.
+ */
+static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k)
+{
+	struct printbuf text = PRINTBUF;
+
+	if (!trace_move_extent_finish_enabled())
+		return;
+
+	bch2_bkey_val_to_text(&text, c, k);
+	trace_move_extent_finish(c, text.buf);
+	printbuf_exit(&text);
+}
+
+/*
+ * Emit the move_extent_fail tracepoint. The message is @msg followed by the
+ * low four bits of the rewrite_ptrs mask, which of the requested pointers
+ * were still present (and not cached) in @insert, and the old, new, wrote
+ * and - if non-NULL - insert keys rendered as text.
+ *
+ * Does nothing when the tracepoint is disabled.
+ */
+static void trace_move_extent_fail2(struct data_update *m,
+			 struct bkey_s_c new,
+			 struct bkey_s_c wrote,
+			 struct bkey_i *insert,
+			 const char *msg)
+{
+	struct bch_fs *c = m->op.c;
+	struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
+	unsigned wanted = m->data_opts.rewrite_ptrs;
+	unsigned found = 0;
+	struct printbuf out = PRINTBUF;
+
+	if (!trace_move_extent_fail_enabled())
+		return;
+
+	prt_str(&out, msg);
+
+	if (insert) {
+		const union bch_extent_entry *entry;
+		struct extent_ptr_decoded p;
+		unsigned idx = 0;
+
+		/* Which of the pointers we meant to rewrite still exist in @insert? */
+		bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
+			unsigned bit = 1U << idx++;
+			struct bch_extent_ptr *ptr;
+
+			if (!(wanted & bit))
+				continue;
+
+			ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert));
+			if (ptr && !ptr->cached)
+				found |= bit;
+		}
+	}
+
+	prt_printf(&out, "\nrewrite ptrs:   %u%u%u%u",
+		   (wanted & (1 << 0)) != 0,
+		   (wanted & (1 << 1)) != 0,
+		   (wanted & (1 << 2)) != 0,
+		   (wanted & (1 << 3)) != 0);
+
+	prt_printf(&out, "\nrewrites found: %u%u%u%u",
+		   (found & (1 << 0)) != 0,
+		   (found & (1 << 1)) != 0,
+		   (found & (1 << 2)) != 0,
+		   (found & (1 << 3)) != 0);
+
+	prt_str(&out, "\nold:    ");
+	bch2_bkey_val_to_text(&out, c, old);
+
+	prt_str(&out, "\nnew:    ");
+	bch2_bkey_val_to_text(&out, c, new);
+
+	prt_str(&out, "\nwrote:  ");
+	bch2_bkey_val_to_text(&out, c, wrote);
+
+	if (insert) {
+		prt_str(&out, "\ninsert: ");
+		bch2_bkey_val_to_text(&out, c, bkey_i_to_s_c(insert));
+	}
+
+	trace_move_extent_fail(c, out.buf);
+	printbuf_exit(&out);
+}
+
+/*
+ * Index update for a data move: for each key in op->insert_keys, look up the
+ * extent currently in the btree and, if it still matches the extent we read
+ * from (m->k), build an updated key that drops the pointers being rewritten,
+ * adds the pointers we just wrote, and commit it.
+ *
+ * Extents that no longer match, or for which there is nothing left to do, are
+ * counted as move_extent_fail and skipped. Transaction restarts are retried
+ * internally; any other error ends the loop and is returned.
+ */
+static int __bch2_data_update_index_update(struct btree_trans *trans,
+					   struct bch_write_op *op)
+{
+	struct bch_fs *c = op->c;
+	struct btree_iter iter;
+	struct data_update *m =
+		container_of(op, struct data_update, op);
+	struct keylist *keys = &op->insert_keys;
+	struct bkey_buf _new, _insert;
+	int ret = 0;
+
+	bch2_bkey_buf_init(&_new);
+	bch2_bkey_buf_init(&_insert);
+	bch2_bkey_buf_realloc(&_insert, c, U8_MAX);
+
+	bch2_trans_iter_init(trans, &iter, m->btree_id,
+			     bkey_start_pos(&bch2_keylist_front(keys)->k),
+			     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+	while (1) {
+		struct bkey_s_c k;
+		struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
+		struct bkey_i *insert = NULL;
+		struct bkey_i_extent *new;
+		const union bch_extent_entry *entry_c;
+		union bch_extent_entry *entry;
+		struct extent_ptr_decoded p;
+		struct bch_extent_ptr *ptr;
+		const struct bch_extent_ptr *ptr_c;
+		struct bpos next_pos;
+		bool should_check_enospc;
+		s64 i_sectors_delta = 0, disk_sectors_delta = 0;
+		unsigned rewrites_found = 0, durability, i;
+
+		bch2_trans_begin(trans);
+
+		k = bch2_btree_iter_peek_slot(&iter);
+		ret = bkey_err(k);
+		if (ret)
+			goto err;
+
+		new = bkey_i_to_extent(bch2_keylist_front(keys));
+
+		/* The extent changed underneath us: nothing to update here */
+		if (!bch2_extents_match(k, old)) {
+			trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i),
+						NULL, "no match:");
+			goto nowork;
+		}
+
+		bkey_reassemble(_insert.k, k);
+		insert = _insert.k;
+
+		bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
+		new = bkey_i_to_extent(_new.k);
+		bch2_cut_front(iter.pos, &new->k_i);
+
+		/* Trim @insert and @new so they cover exactly the same range: */
+		bch2_cut_front(iter.pos,	insert);
+		bch2_cut_back(new->k.p,		insert);
+		bch2_cut_back(insert->k.p,	&new->k_i);
+
+		/*
+		 * @old: extent that we read from
+		 * @insert: key that we're going to update, initialized from
+		 * extent currently in btree - same as @old unless we raced with
+		 * other updates
+		 * @new: extent with new pointers that we'll be adding to @insert
+		 *
+		 * First, drop rewrite_ptrs from @insert:
+		 */
+		i = 0;
+		bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) {
+			if (((1U << i) & m->data_opts.rewrite_ptrs) &&
+			    (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
+			    !ptr->cached) {
+				bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
+				/*
+				 * See comment below:
+				bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr);
+				*/
+				rewrites_found |= 1U << i;
+			}
+			i++;
+		}
+
+		/*
+		 * None of the pointers we were asked to rewrite are still
+		 * there, and the extent is already durable enough:
+		 */
+		if (m->data_opts.rewrite_ptrs &&
+		    !rewrites_found &&
+		    bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) {
+			trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
+			goto nowork;
+		}
+
+		/*
+		 * A replica that we just wrote might conflict with a replica
+		 * that we want to keep, due to racing with another move:
+		 */
+restart_drop_conflicting_replicas:
+		extent_for_each_ptr(extent_i_to_s(new), ptr)
+			if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) &&
+			    !ptr_c->cached) {
+				bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr);
+				/* dropping a pointer invalidates the iteration: start over */
+				goto restart_drop_conflicting_replicas;
+			}
+
+		if (!bkey_val_u64s(&new->k)) {
+			trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
+			goto nowork;
+		}
+
+		/* Now, drop pointers that conflict with what we just wrote: */
+		extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
+			if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev)))
+				bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
+
+		durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) +
+			bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));
+
+		/* Now, drop excess replicas: */
+restart_drop_extra_replicas:
+		bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
+			unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
+
+			if (!p.ptr.cached &&
+			    durability - ptr_durability >= m->op.opts.data_replicas) {
+				durability -= ptr_durability;
+				bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), &entry->ptr);
+				/*
+				 * Currently, we're dropping unneeded replicas
+				 * instead of marking them as cached, since
+				 * cached data in stripe buckets prevents them
+				 * from being reused:
+				bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr);
+				*/
+				goto restart_drop_extra_replicas;
+			}
+		}
+
+		/* Finally, add the pointers we just wrote: */
+		extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
+			bch2_extent_ptr_decoded_append(insert, &p);
+
+		bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
+		bch2_extent_normalize(c, bkey_i_to_s(insert));
+
+		ret = bch2_sum_sector_overwrites(trans, &iter, insert,
+						 &should_check_enospc,
+						 &i_sectors_delta,
+						 &disk_sectors_delta);
+		if (ret)
+			goto err;
+
+		/* Top up the disk reservation if this update needs more than we hold: */
+		if (disk_sectors_delta > (s64) op->res.sectors) {
+			ret = bch2_disk_reservation_add(c, &op->res,
+						disk_sectors_delta - op->res.sectors,
+						!should_check_enospc
+						? BCH_DISK_RESERVATION_NOFAIL : 0);
+			if (ret)
+				goto out;
+		}
+
+		/* Saved now; used to advance the iterator after a successful commit */
+		next_pos = insert->k.p;
+
+		ret =   bch2_insert_snapshot_whiteouts(trans, m->btree_id,
+						k.k->p, bkey_start_pos(&insert->k)) ?:
+			bch2_insert_snapshot_whiteouts(trans, m->btree_id,
+						k.k->p, insert->k.p);
+		if (ret)
+			goto err;
+
+		ret   = bch2_trans_update(trans, &iter, insert,
+				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+			bch2_trans_commit(trans, &op->res,
+				NULL,
+				BTREE_INSERT_NOCHECK_RW|
+				BTREE_INSERT_NOFAIL|
+				m->data_opts.btree_insert_flags);
+		if (!ret) {
+			bch2_btree_iter_set_pos(&iter, next_pos);
+
+			this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size);
+			trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i));
+		}
+err:
+		/* A transaction restart just means retry from the current position: */
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			ret = 0;
+		if (ret)
+			break;
+next:
+		/* Pop keys the iterator has moved past; done when the keylist is empty */
+		while (bkey_ge(iter.pos, bch2_keylist_front(keys)->k.p)) {
+			bch2_keylist_pop_front(keys);
+			if (bch2_keylist_empty(keys))
+				goto out;
+		}
+		continue;
+nowork:
+		/* Account the skipped range as raced, if the caller is collecting stats: */
+		if (m->ctxt && m->ctxt->stats) {
+			BUG_ON(k.k->p.offset <= iter.pos.offset);
+			atomic64_inc(&m->ctxt->stats->keys_raced);
+			atomic64_add(k.k->p.offset - iter.pos.offset,
+				     &m->ctxt->stats->sectors_raced);
+		}
+
+		this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]);
+
+		bch2_btree_iter_advance(&iter);
+		goto next;
+	}
+out:
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_bkey_buf_exit(&_insert, c);
+	bch2_bkey_buf_exit(&_new, c);
+	BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
+	return ret;
+}
+
+/*
+ * Write-op entry point for the index update of a data move: runs
+ * __bch2_data_update_index_update() inside a btree transaction obtained via
+ * bch2_trans_run() and returns its result.
+ */
+int bch2_data_update_index_update(struct bch_write_op *op)
+{
+	struct bch_fs *c = op->c;
+	int ret = bch2_trans_run(c, __bch2_data_update_index_update(trans, op));
+
+	return ret;
+}
+
+/*
+ * Called once the read side of a data update has completed: records @crc in
+ * the write op, sizes the write bio to the compressed size (in 512-byte
+ * sectors) and kicks off bch2_write() on the op's closure.
+ */
+void bch2_data_update_read_done(struct data_update *m,
+				struct bch_extent_crc_unpacked crc)
+{
+	struct bch_write_op *op = &m->op;
+	struct bio *wbio = &op->wbio.bio;
+
+	/* write bio must own pages: */
+	BUG_ON(!wbio->bi_vcnt);
+
+	op->crc = crc;
+	wbio->bi_iter.bi_size = crc.compressed_size << 9;
+
+	closure_call(&op->cl, bch2_write, NULL, NULL);
+}
+
+/*
+ * Tear down a data_update: for every pointer in the saved extent, release the
+ * nocow bucket lock (when nocow is enabled) and drop the device ref, then
+ * free the key buffer, the disk reservation and the write bio's pages.
+ */
+void bch2_data_update_exit(struct data_update *update)
+{
+	struct bch_write_op *op = &update->op;
+	struct bch_fs *c = op->c;
+	struct bkey_ptrs_c ptrs =
+		bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k));
+	const struct bch_extent_ptr *ptr;
+
+	bkey_for_each_ptr(ptrs, ptr) {
+		struct bch_dev *ca;
+
+		if (c->opts.nocow_enabled)
+			bch2_bucket_nocow_unlock(&c->nocow_locks,
+						 PTR_BUCKET_POS(c, ptr), 0);
+
+		ca = bch_dev_bkey_exists(c, ptr->dev);
+		percpu_ref_put(&ca->ref);
+	}
+
+	bch2_bkey_buf_exit(&update->k, c);
+	bch2_disk_reservation_put(c, &op->res);
+	bch2_bio_free_pages_pool(c, &op->wbio.bio);
+}
+
+/*
+ * Data update for an unwritten extent: no data is read or written. Instead,
+ * while the update's bio still has sectors left, allocate new space, build an
+ * extent whose pointers are all marked unwritten, and run the index update
+ * for it.
+ *
+ * Stops early if the extent in the btree no longer matches update->k, or on
+ * any error. Errors are not reported to the caller (the function is void).
+ */
+void bch2_update_unwritten_extent(struct btree_trans *trans,
+				  struct data_update *update)
+{
+	struct bch_fs *c = update->op.c;
+	struct bio *bio = &update->op.wbio.bio;
+	struct bkey_i_extent *e;
+	struct write_point *wp;
+	struct bch_extent_ptr *ptr;
+	struct closure cl;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	closure_init_stack(&cl);
+	bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys);
+
+	while (bio_sectors(bio)) {
+		unsigned sectors = bio_sectors(bio);
+
+		/* Re-check that the extent we're updating is still there: */
+		bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos,
+				     BTREE_ITER_SLOTS);
+		ret = lockrestart_do(trans, ({
+			k = bch2_btree_iter_peek_slot(&iter);
+			bkey_err(k);
+		}));
+		bch2_trans_iter_exit(trans, &iter);
+
+		if (ret || !bch2_extents_match(k, bkey_i_to_s_c(update->k.k)))
+			break;
+
+		e = bkey_extent_init(update->op.insert_keys.top);
+		e->k.p = update->op.pos;
+
+		ret = bch2_alloc_sectors_start_trans(trans,
+				update->op.target,
+				false,
+				update->op.write_point,
+				&update->op.devs_have,
+				update->op.nr_replicas,
+				update->op.nr_replicas,
+				update->op.watermark,
+				0, &cl, &wp);
+		if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) {
+			/* Allocator is blocked: wait on the closure, then retry */
+			bch2_trans_unlock(trans);
+			closure_sync(&cl);
+			continue;
+		}
+
+		/*
+		 * Don't return directly on error: @cl lives on our stack and
+		 * may still be on a wait list, so leave via the closure_sync()
+		 * check after the loop.
+		 */
+		if (ret)
+			break;
+
+		sectors = min(sectors, wp->sectors_free);
+
+		bch2_key_resize(&e->k, sectors);
+
+		bch2_open_bucket_get(c, wp, &update->op.open_buckets);
+		bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
+		bch2_alloc_sectors_done(c, wp);
+
+		bio_advance(bio, sectors << 9);
+		update->op.pos.offset += sectors;
+
+		extent_for_each_ptr(extent_i_to_s(e), ptr)
+			ptr->unwritten = true;
+		bch2_keylist_push(&update->op.insert_keys);
+
+		ret = __bch2_data_update_index_update(trans, &update->op);
+
+		bch2_open_buckets_put(c, &update->op.open_buckets);
+
+		if (ret)
+			break;
+	}
+
+	/* Make sure nothing still references the on-stack closure before returning */
+	if (closure_nr_remaining(&cl) != 1) {
+		bch2_trans_unlock(trans);
+		closure_sync(&cl);
+	}
+}
+
+/*
+ * Set up @m to rewrite extent @k in btree @btree_id.
+ *
+ * Takes a copy of @k, initializes the write op from @io_opts/@data_opts,
+ * takes a ref on the device of every pointer in @k and, if nocow is enabled,
+ * a nocow lock on every pointer's bucket. With a @ctxt the nocow lock is
+ * waited for; without one only a trylock is attempted.
+ *
+ * Note that m->ctxt is not assigned here.
+ *
+ * Returns 0 on success, or -BCH_ERR_unwritten_extent_update if @k is an
+ * unwritten extent (refs and locks are still held in that case). On any other
+ * error, the locks and refs taken here are released again and the key buffer
+ * and bio pages are freed.
+ */
+int bch2_data_update_init(struct btree_trans *trans,
+			  struct moving_context *ctxt,
+			  struct data_update *m,
+			  struct write_point_specifier wp,
+			  struct bch_io_opts io_opts,
+			  struct data_update_opts data_opts,
+			  enum btree_id btree_id,
+			  struct bkey_s_c k)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	const struct bch_extent_ptr *ptr;
+	unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
+	unsigned ptrs_locked = 0;
+	int ret;
+
+	bch2_bkey_buf_init(&m->k);
+	bch2_bkey_buf_reassemble(&m->k, c, k);
+	m->btree_id	= btree_id;
+	m->data_opts	= data_opts;
+
+	bch2_write_op_init(&m->op, c, io_opts);
+	m->op.pos	= bkey_start_pos(k.k);
+	m->op.version	= k.k->version;
+	m->op.target	= data_opts.target;
+	m->op.write_point = wp;
+	m->op.nr_replicas = 0;
+	m->op.flags	|= BCH_WRITE_PAGES_STABLE|
+		BCH_WRITE_PAGES_OWNED|
+		BCH_WRITE_DATA_ENCODED|
+		BCH_WRITE_MOVE|
+		m->data_opts.write_flags;
+	m->op.compression_opt	= io_opts.background_compression ?: io_opts.compression;
+	m->op.watermark		= m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
+
+	/* Pin every device the extent points to; dropped in the error path below */
+	bkey_for_each_ptr(ptrs, ptr)
+		percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref);
+
+	/* @i is the pointer's index, matching the bits of rewrite_ptrs/ptrs_locked */
+	i = 0;
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		bool locked;
+
+		if (((1U << i) & m->data_opts.rewrite_ptrs)) {
+			BUG_ON(p.ptr.cached);
+
+			if (crc_is_compressed(p.crc))
+				reserve_sectors += k.k->size;
+
+			m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
+		} else if (!p.ptr.cached) {
+			bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
+		}
+
+		/*
+		 * op->csum_type is normally initialized from the fs/file's
+		 * current options - but if an extent is encrypted, we require
+		 * that it stays encrypted:
+		 */
+		if (bch2_csum_type_is_encryption(p.crc.csum_type)) {
+			m->op.nonce	= p.crc.nonce + p.crc.offset;
+			m->op.csum_type = p.crc.csum_type;
+		}
+
+		if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
+			m->op.incompressible = true;
+
+		if (c->opts.nocow_enabled) {
+			if (ctxt) {
+				/*
+				 * Keep trying the lock until we get it or the
+				 * context has no reads in flight; if we still
+				 * don't have it, take it unconditionally:
+				 */
+				move_ctxt_wait_event(ctxt, trans,
+						(locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
+									  PTR_BUCKET_POS(c, &p.ptr), 0)) ||
+						!atomic_read(&ctxt->read_sectors));
+
+				if (!locked)
+					bch2_bucket_nocow_lock(&c->nocow_locks,
+							       PTR_BUCKET_POS(c, &p.ptr), 0);
+			} else {
+				if (!bch2_bucket_nocow_trylock(&c->nocow_locks,
+							       PTR_BUCKET_POS(c, &p.ptr), 0)) {
+					ret = -BCH_ERR_nocow_lock_blocked;
+					goto err;
+				}
+			}
+			/* Remember which pointers we hold locks for, for the error path */
+			ptrs_locked |= (1U << i);
+		}
+
+		i++;
+	}
+
+	if (reserve_sectors) {
+		ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
+				m->data_opts.extra_replicas
+				? 0
+				: BCH_DISK_RESERVATION_NOFAIL);
+		if (ret)
+			goto err;
+	}
+
+	m->op.nr_replicas += m->data_opts.extra_replicas;
+	m->op.nr_replicas_required = m->op.nr_replicas;
+
+	BUG_ON(!m->op.nr_replicas);
+
+	/* Special handling required: */
+	if (bkey_extent_is_unwritten(k))
+		return -BCH_ERR_unwritten_extent_update;
+	return 0;
+err:
+	/* Undo: unlock only the buckets we locked, but put every device ref */
+	i = 0;
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		if ((1U << i) & ptrs_locked)
+			bch2_bucket_nocow_unlock(&c->nocow_locks,
+						 PTR_BUCKET_POS(c, &p.ptr), 0);
+		percpu_ref_put(&bch_dev_bkey_exists(c, p.ptr.dev)->ref);
+		i++;
+	}
+
+	bch2_bkey_buf_exit(&m->k, c);
+	bch2_bio_free_pages_pool(c, &m->op.wbio.bio);
+	return ret;
+}
+
+/*
+ * Fix up @opts against extent @k: any pointer that is selected in
+ * rewrite_ptrs but is a cached pointer is moved from rewrite_ptrs to
+ * kill_ptrs. Bit N of each mask refers to the Nth pointer of @k.
+ */
+void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *ptr;
+	unsigned idx = 0;
+
+	bkey_for_each_ptr(ptrs, ptr) {
+		unsigned mask = 1U << idx++;
+
+		if (!ptr->cached || !(opts->rewrite_ptrs & mask))
+			continue;
+
+		opts->kill_ptrs |= mask;
+		opts->rewrite_ptrs ^= mask;
+	}
+}
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
new file mode 100644
index 000000000000..7ca1f98d7e94
--- /dev/null
+++ b/fs/bcachefs/data_update.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _BCACHEFS_DATA_UPDATE_H
+#define _BCACHEFS_DATA_UPDATE_H
+
+#include "bkey_buf.h"
+#include "io_write_types.h"
+
+struct moving_context;
+
+/* Options describing what a data update should do to an extent */
+struct data_update_opts {
+	/* bitmask, by pointer index within the extent, of pointers to rewrite */
+	unsigned	rewrite_ptrs;
+	/* bitmask, by pointer index, of pointers to drop */
+	unsigned	kill_ptrs;
+	/* copied to the write op's target */
+	u16		target;
+	/* additional replicas to write, on top of those being rewritten */
+	u8		extra_replicas;
+	/* OR'd into the commit flags; also supplies the write op's watermark */
+	unsigned	btree_insert_flags;
+	/* OR'd into the write op's flags */
+	unsigned	write_flags;
+};
+
+/* State for one in-flight data update (move/rewrite) of a single extent */
+struct data_update {
+	/* extent being updated: */
+	enum btree_id		btree_id;
+	struct bkey_buf		k;
+	struct data_update_opts	data_opts;
+	/* may be NULL; when set, its stats are updated by the index update */
+	struct moving_context	*ctxt;
+	/* embedded write op; the index update recovers the data_update via container_of() */
+	struct bch_write_op	op;
+};
+
+/* Index update step of a data update, run in its own btree transaction */
+int bch2_data_update_index_update(struct bch_write_op *);
+
+/* Read side finished: record the crc and start the write */
+void bch2_data_update_read_done(struct data_update *,
+				struct bch_extent_crc_unpacked);
+
+/* Release device refs, nocow locks, reservation and buffers held by an update */
+void bch2_data_update_exit(struct data_update *);
+/* Handle an update of an unwritten extent (no data IO) */
+void bch2_update_unwritten_extent(struct btree_trans *, struct data_update *);
+/* Initialize an update for the given extent; see definition for return values */
+int bch2_data_update_init(struct btree_trans *, struct moving_context *,
+			  struct data_update *,
+			  struct write_point_specifier,
+			  struct bch_io_opts, struct data_update_opts,
+			  enum btree_id, struct bkey_s_c);
+/* Move cached pointers from rewrite_ptrs to kill_ptrs */
+void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);
+
+#endif /* _BCACHEFS_DATA_UPDATE_H */
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
new file mode 100644
index 000000000000..75a3dc7cbd47
--- /dev/null
+++ b/fs/bcachefs/debug.c
@@ -0,0 +1,954 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Assorted bcachefs debug code
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "fsck.h"
+#include "inode.h"
+#include "super.h"
+
+#include <linux/console.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+
+/* NOTE(review): presumably the top-level bcachefs debugfs directory - set outside this chunk */
+static struct dentry *bch_debug;
+
+/*
+ * Read the replica of btree node @b described by @pick back from disk into
+ * the filesystem's verify buffers, run it through the normal read path, and
+ * compare the result with the in-memory node.
+ *
+ * Returns true if a mismatch was found (after dumping both versions to the
+ * console); returns false if they match, or if the replica could not be read
+ * or parsed at all.
+ */
+static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
+				      struct extent_ptr_decoded pick)
+{
+	struct btree *v = c->verify_data;
+	struct btree_node *n_ondisk = c->verify_ondisk;
+	struct btree_node *n_sorted = c->verify_data->data;
+	struct bset *sorted, *inmemory = &b->data->keys;
+	struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+	struct bio *bio;
+	bool failed = false, saw_error = false;
+
+	if (!bch2_dev_get_ioref(ca, READ))
+		return false;
+
+	bio = bio_alloc_bioset(ca->disk_sb.bdev,
+			       buf_pages(n_sorted, btree_bytes(c)),
+			       REQ_OP_READ|REQ_META,
+			       GFP_NOFS,
+			       &c->btree_bio);
+	bio->bi_iter.bi_sector	= pick.ptr.offset;
+	bch2_bio_map(bio, n_sorted, btree_bytes(c));
+
+	/* NOTE(review): the result of the read is not checked here */
+	submit_bio_wait(bio);
+
+	bio_put(bio);
+	percpu_ref_put(&ca->io_ref);
+
+	/* Keep an untouched copy of what was read, for the dump below */
+	memcpy(n_ondisk, n_sorted, btree_bytes(c));
+
+	v->written = 0;
+	if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error)
+		return false;
+
+	/* Reloaded after read_done - presumably it may replace v's data buffer */
+	n_sorted = c->verify_data->data;
+	sorted = &n_sorted->keys;
+
+	if (inmemory->u64s != sorted->u64s ||
+	    memcmp(inmemory->start,
+		   sorted->start,
+		   vstruct_end(inmemory) - (void *) inmemory->start)) {
+		unsigned offset = 0, sectors;
+		struct bset *i;
+		unsigned j;
+
+		console_lock();
+
+		printk(KERN_ERR "*** in memory:\n");
+		bch2_dump_bset(c, b, inmemory, 0);
+
+		printk(KERN_ERR "*** read back in:\n");
+		bch2_dump_bset(c, v, sorted, 0);
+
+		/* Walk and dump each bset in the raw on-disk copy: */
+		while (offset < v->written) {
+			if (!offset) {
+				i = &n_ondisk->keys;
+				sectors = vstruct_blocks(n_ondisk, c->block_bits) <<
+					c->block_bits;
+			} else {
+				struct btree_node_entry *bne =
+					(void *) n_ondisk + (offset << 9);
+				i = &bne->keys;
+
+				sectors = vstruct_blocks(bne, c->block_bits) <<
+					c->block_bits;
+			}
+
+			printk(KERN_ERR "*** on disk block %u:\n", offset);
+			bch2_dump_bset(c, b, i, offset);
+
+			offset += sectors;
+		}
+
+		/* Find the first u64 that differs, for the error message: */
+		for (j = 0; j < le16_to_cpu(inmemory->u64s); j++)
+			if (inmemory->_data[j] != sorted->_data[j])
+				break;
+
+		console_unlock();
+		bch_err(c, "verify failed at key %u", j);
+
+		failed = true;
+	}
+
+	if (v->written != b->written) {
+		bch_err(c, "written wrong: expected %u, got %u",
+			b->written, v->written);
+		failed = true;
+	}
+
+	return failed;
+}
+
+/*
+ * Verify btree node @b against every on-disk replica named in its key, using
+ * the per-filesystem verify buffers (allocated on first use, serialized by
+ * c->verify_lock). Raises a fatal filesystem error if any replica mismatches.
+ *
+ * Does nothing when the nochanges option is set, and silently gives up if the
+ * verify buffers cannot be allocated.
+ */
+void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
+{
+	struct bkey_ptrs_c ptrs;
+	struct extent_ptr_decoded p;
+	const union bch_extent_entry *entry;
+	struct btree *v;
+	struct bset *inmemory = &b->data->keys;
+	struct bkey_packed *k;
+	bool failed = false;
+
+	if (c->opts.nochanges)
+		return;
+
+	bch2_btree_node_io_lock(b);
+	mutex_lock(&c->verify_lock);
+
+	if (!c->verify_ondisk) {
+		c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
+		if (!c->verify_ondisk)
+			goto out;
+	}
+
+	if (!c->verify_data) {
+		c->verify_data = __bch2_btree_node_mem_alloc(c);
+		if (!c->verify_data)
+			goto out;
+
+		list_del_init(&c->verify_data->list);
+	}
+
+	BUG_ON(b->nsets != 1);
+
+	/*
+	 * Zero mem_ptr in btree_ptr_v2 keys before comparing.
+	 * NOTE(review): presumably because it is an in-memory-only field that
+	 * won't match what's on disk - confirm.
+	 */
+	for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k))
+		if (k->type == KEY_TYPE_btree_ptr_v2)
+			((struct bch_btree_ptr_v2 *) bkeyp_val(&b->format, k))->mem_ptr = 0;
+
+	/* Make the scratch node look like @b so the read path accepts it: */
+	v = c->verify_data;
+	bkey_copy(&v->key, &b->key);
+	v->c.level	= b->c.level;
+	v->c.btree_id	= b->c.btree_id;
+	bch2_btree_keys_init(v);
+
+	ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key));
+	bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry)
+		failed |= bch2_btree_verify_replica(c, b, p);
+
+	if (failed) {
+		struct printbuf buf = PRINTBUF;
+
+		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+		bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf);
+		printbuf_exit(&buf);
+	}
+out:
+	mutex_unlock(&c->verify_lock);
+	bch2_btree_node_io_unlock(b);
+}
+
+/*
+ * Read btree node @b from disk and print its on-disk contents to @out: for
+ * each bset found, a header line (offset, version, journal seq) followed by
+ * every key in it.
+ *
+ * Each bset's checksum type and checksum are validated before it is printed.
+ * Any failure (no readable device, allocation failure, IO error, bad
+ * checksum) is reported as a line of text in @out and ends the dump.
+ */
+void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
+				    const struct btree *b)
+{
+	struct btree_node *n_ondisk = NULL;
+	struct extent_ptr_decoded pick;
+	struct bch_dev *ca;
+	struct bio *bio = NULL;
+	unsigned offset = 0;
+	int ret;
+
+	if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) {
+		prt_printf(out, "error getting device to read from: invalid device\n");
+		return;
+	}
+
+	ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+	if (!bch2_dev_get_ioref(ca, READ)) {
+		prt_printf(out, "error getting device to read from: not online\n");
+		return;
+	}
+
+	/* From here on we hold an io ref on @ca: exit via "out" */
+	n_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
+	if (!n_ondisk) {
+		prt_printf(out, "memory allocation failure\n");
+		goto out;
+	}
+
+	bio = bio_alloc_bioset(ca->disk_sb.bdev,
+			       buf_pages(n_ondisk, btree_bytes(c)),
+			       REQ_OP_READ|REQ_META,
+			       GFP_NOFS,
+			       &c->btree_bio);
+	bio->bi_iter.bi_sector	= pick.ptr.offset;
+	bch2_bio_map(bio, n_ondisk, btree_bytes(c));
+
+	ret = submit_bio_wait(bio);
+	if (ret) {
+		prt_printf(out, "IO error reading btree node: %s\n", bch2_err_str(ret));
+		goto out;
+	}
+
+	/* @offset is in 512-byte sectors from the start of the node */
+	while (offset < btree_sectors(c)) {
+		struct bset *i;
+		struct nonce nonce;
+		struct bch_csum csum;
+		struct bkey_packed *k;
+		unsigned sectors;
+
+		if (!offset) {
+			/* First bset lives in the btree_node header itself */
+			i = &n_ondisk->keys;
+
+			if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) {
+				prt_printf(out, "unknown checksum type at offset %u: %llu\n",
+					   offset, BSET_CSUM_TYPE(i));
+				goto out;
+			}
+
+			nonce = btree_nonce(i, offset << 9);
+			csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, n_ondisk);
+
+			if (bch2_crc_cmp(csum, n_ondisk->csum)) {
+				prt_printf(out, "invalid checksum\n");
+				goto out;
+			}
+
+			bset_encrypt(c, i, offset << 9);
+
+			sectors = vstruct_sectors(n_ondisk, c->block_bits);
+		} else {
+			/* Subsequent bsets are btree_node_entries */
+			struct btree_node_entry *bne = (void *) n_ondisk + (offset << 9);
+
+			i = &bne->keys;
+
+			/* A different seq means we've run past this node's bsets */
+			if (i->seq != n_ondisk->keys.seq)
+				break;
+
+			if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) {
+				prt_printf(out, "unknown checksum type at offset %u: %llu\n",
+					   offset, BSET_CSUM_TYPE(i));
+				goto out;
+			}
+
+			nonce = btree_nonce(i, offset << 9);
+			csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+
+			if (bch2_crc_cmp(csum, bne->csum)) {
+				/* newline-terminated, like every other message here */
+				prt_printf(out, "invalid checksum\n");
+				goto out;
+			}
+
+			bset_encrypt(c, i, offset << 9);
+
+			sectors = vstruct_sectors(bne, c->block_bits);
+		}
+
+		prt_printf(out, "  offset %u version %u, journal seq %llu\n",
+			   offset,
+			   le16_to_cpu(i->version),
+			   le64_to_cpu(i->journal_seq));
+		offset += sectors;
+
+		printbuf_indent_add(out, 4);
+
+		for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) {
+			struct bkey u;
+
+			bch2_bkey_val_to_text(out, c, bkey_disassemble(b, k, &u));
+			prt_newline(out);
+		}
+
+		printbuf_indent_sub(out, 4);
+	}
+out:
+	if (bio)
+		bio_put(bio);
+	kvpfree(n_ondisk, btree_bytes(c));
+	percpu_ref_put(&ca->io_ref);
+}
+
+#ifdef CONFIG_DEBUG_FS
+
+/* XXX: bch_fs refcounting */
+
+/*
+ * Per-open-file state for the debugfs dump files: where the dump has got to,
+ * text generated but not yet copied out, and the user buffer of the read(2)
+ * currently being serviced.
+ */
+struct dump_iter {
+	struct bch_fs		*c;
+	enum btree_id		id;
+	struct bpos		from;		/* btree position to resume from */
+	struct bpos		prev_node;	/* last node printed by the bfloat dump */
+	u64			iter;		/* non-btree dumps: resume cursor */
+
+	struct printbuf		buf;		/* generated text awaiting copy-out */
+
+	char __user		*ubuf;	/* destination user buffer */
+	size_t			size;	/* size of requested read */
+	ssize_t			ret;	/* bytes read so far */
+};
+
+/*
+ * Copy as much pending text as fits from i->buf to the user buffer, and shift
+ * whatever is left to the front of i->buf.
+ *
+ * Returns -EFAULT if the copy to userspace came up short, the number of bytes
+ * read so far once the user buffer is full, and 0 if there is still room.
+ */
+static ssize_t flush_buf(struct dump_iter *i)
+{
+	size_t want;
+	int done;
+
+	if (!i->buf.pos)
+		goto out;
+
+	want = min_t(size_t, i->buf.pos, i->size);
+	done = want - copy_to_user(i->ubuf, i->buf.buf, want);
+
+	/* Drop what was copied from the front of the printbuf: */
+	i->buf.pos -= done;
+	memmove(i->buf.buf, i->buf.buf + done, i->buf.pos);
+
+	/* ... and advance the user-side cursor by the same amount: */
+	i->ubuf	+= done;
+	i->size	-= done;
+	i->ret	+= done;
+
+	if (done != want)
+		return -EFAULT;
+out:
+	if (i->size)
+		return 0;
+	return i->ret;
+}
+
+/*
+ * open() for the debugfs dump files: allocate a dump_iter for this file,
+ * positioned at the start of the btree named by the inode's btree_debug.
+ * Returns 0, or -ENOMEM if the iterator cannot be allocated.
+ */
+static int bch2_dump_open(struct inode *inode, struct file *file)
+{
+	struct btree_debug *bd = inode->i_private;
+	struct dump_iter *di = kzalloc(sizeof(*di), GFP_KERNEL);
+
+	if (!di)
+		return -ENOMEM;
+
+	/* @bd is element bd->id of the filesystem's btree_debug[] array */
+	di->c		= container_of(bd, struct bch_fs, btree_debug[bd->id]);
+	di->id		= bd->id;
+	di->from	= POS_MIN;
+	di->iter	= 0;
+	di->buf		= PRINTBUF;
+
+	file->private_data = di;
+	return 0;
+}
+
+/* release() for the debugfs dump files: free the printbuf and the iterator */
+static int bch2_dump_release(struct inode *inode, struct file *file)
+{
+	struct dump_iter *di = file->private_data;
+
+	printbuf_exit(&di->buf);
+	kfree(di);
+
+	return 0;
+}
+
+/*
+ * read() for the per-btree "keys" dump: prints every key (in all snapshots)
+ * of btree i->id as text, one per line, resuming from i->from.
+ *
+ * Text left over from a previous read is flushed first. Returns the number of
+ * bytes copied to @buf, or a negative error.
+ */
+static ssize_t bch2_read_btree(struct file *file, char __user *buf,
+			       size_t size, loff_t *ppos)
+{
+	struct dump_iter *i = file->private_data;
+	struct btree_trans *trans;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	ssize_t ret;
+
+	i->ubuf = buf;
+	i->size	= size;
+	i->ret	= 0;
+
+	ret = flush_buf(i);
+	if (ret)
+		return ret;
+
+	trans = bch2_trans_get(i->c);
+	/*
+	 * NOTE(review): relies on for_each_btree_key2() stopping as soon as
+	 * the body evaluates nonzero (user buffer full, or -EFAULT) - confirm
+	 * against the macro's definition.
+	 */
+	ret = for_each_btree_key2(trans, iter, i->id, i->from,
+				  BTREE_ITER_PREFETCH|
+				  BTREE_ITER_ALL_SNAPSHOTS, k, ({
+		bch2_bkey_val_to_text(&i->buf, i->c, k);
+		prt_newline(&i->buf);
+		drop_locks_do(trans, flush_buf(i));
+	}));
+	/* Remember where we stopped, for the next read() */
+	i->from = iter.pos;
+
+	bch2_trans_put(trans);
+
+	if (!ret)
+		ret = flush_buf(i);
+
+	return ret ?: i->ret;
+}
+
+/* File operations for the dump served by bch2_read_btree() */
+static const struct file_operations btree_debug_ops = {
+	.owner		= THIS_MODULE,
+	.open		= bch2_dump_open,
+	.release	= bch2_dump_release,
+	.read		= bch2_read_btree,
+};
+
+/*
+ * read() for the per-btree "formats" dump: prints a description of each node
+ * of btree i->id via bch2_btree_node_to_text(), resuming after the last node
+ * printed by a previous read.
+ *
+ * Returns the number of bytes copied to @buf, or a negative error.
+ */
+static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
+				       size_t size, loff_t *ppos)
+{
+	struct dump_iter *i = file->private_data;
+	struct btree_trans *trans;
+	struct btree_iter iter;
+	struct btree *b;
+	ssize_t ret;
+
+	i->ubuf = buf;
+	i->size	= size;
+	i->ret	= 0;
+
+	ret = flush_buf(i);
+	if (ret)
+		return ret;
+
+	/* i->from == SPOS_MAX marks "already printed the last node" */
+	if (bpos_eq(SPOS_MAX, i->from))
+		return i->ret;
+
+	trans = bch2_trans_get(i->c);
+retry:
+	bch2_trans_begin(trans);
+
+	for_each_btree_node(trans, iter, i->id, i->from, 0, b, ret) {
+		bch2_btree_node_to_text(&i->buf, i->c, b);
+		/* Resume just past this node, unless it is the last one */
+		i->from = !bpos_eq(SPOS_MAX, b->key.k.p)
+			? bpos_successor(b->key.k.p)
+			: b->key.k.p;
+
+		ret = drop_locks_do(trans, flush_buf(i));
+		if (ret)
+			break;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		goto retry;
+
+	bch2_trans_put(trans);
+
+	if (!ret)
+		ret = flush_buf(i);
+
+	return ret ?: i->ret;
+}
+
+/* File operations for the dump served by bch2_read_btree_formats() */
+static const struct file_operations btree_format_debug_ops = {
+	.owner		= THIS_MODULE,
+	.open		= bch2_dump_open,
+	.release	= bch2_dump_release,
+	.read		= bch2_read_btree_formats,
+};
+
+/*
+ * read() for the per-btree "bfloat-failed" dump: walks every key (in all
+ * snapshots) of btree i->id and prints bch2_bfloat_to_text() output for the
+ * key at the iterator's leaf position, preceded by a description of the leaf
+ * node the first time a new node is reached.
+ *
+ * Returns the number of bytes copied to @buf, or a negative error.
+ */
+static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
+				       size_t size, loff_t *ppos)
+{
+	struct dump_iter *i = file->private_data;
+	struct btree_trans *trans;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	ssize_t ret;
+
+	i->ubuf = buf;
+	i->size	= size;
+	i->ret	= 0;
+
+	ret = flush_buf(i);
+	if (ret)
+		return ret;
+
+	trans = bch2_trans_get(i->c);
+
+	ret = for_each_btree_key2(trans, iter, i->id, i->from,
+				  BTREE_ITER_PREFETCH|
+				  BTREE_ITER_ALL_SNAPSHOTS, k, ({
+		struct btree_path_level *l = &iter.path->l[0];
+		struct bkey_packed *_k =
+			bch2_btree_node_iter_peek(&l->iter, l->b);
+
+		/* Print each leaf node's header only once, when we first enter it */
+		if (bpos_gt(l->b->key.k.p, i->prev_node)) {
+			bch2_btree_node_to_text(&i->buf, i->c, l->b);
+			i->prev_node = l->b->key.k.p;
+		}
+
+		bch2_bfloat_to_text(&i->buf, l->b, _k);
+		drop_locks_do(trans, flush_buf(i));
+	}));
+	/* Remember where we stopped, for the next read() */
+	i->from = iter.pos;
+
+	bch2_trans_put(trans);
+
+	if (!ret)
+		ret = flush_buf(i);
+
+	return ret ?: i->ret;
+}
+
+/* File operations for the dump served by bch2_read_bfloat_failed() */
+static const struct file_operations bfloat_failed_debug_ops = {
+	.owner		= THIS_MODULE,
+	.open		= bch2_dump_open,
+	.release	= bch2_dump_release,
+	.read		= bch2_read_bfloat_failed,
+};
+
+/*
+ * Print a multi-line description of cached btree node @b to @out: its address,
+ * btree and level, then (indented) its key, flags, whether it has per-cpu
+ * read locks, sectors written, whether writes are blocked, will_make_reachable
+ * and the journal pin sequence of both write slots.
+ *
+ * Note that raw kernel pointers are printed (%px).
+ */
+static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
+					   struct btree *b)
+{
+	/* Set up a tabstop for the "label: value" lines, once per printbuf */
+	if (!out->nr_tabstops)
+		printbuf_tabstop_push(out, 32);
+
+	prt_printf(out, "%px btree=%s l=%u ",
+	       b,
+	       bch2_btree_ids[b->c.btree_id],
+	       b->c.level);
+	prt_newline(out);
+
+	printbuf_indent_add(out, 2);
+
+	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+	prt_newline(out);
+
+	prt_printf(out, "flags: ");
+	prt_tab(out);
+	prt_bitflags(out, bch2_btree_node_flags, b->flags);
+	prt_newline(out);
+
+	prt_printf(out, "pcpu read locks: ");
+	prt_tab(out);
+	prt_printf(out, "%u", b->c.lock.readers != NULL);
+	prt_newline(out);
+
+	prt_printf(out, "written:");
+	prt_tab(out);
+	prt_printf(out, "%u", b->written);
+	prt_newline(out);
+
+	prt_printf(out, "writes blocked:");
+	prt_tab(out);
+	prt_printf(out, "%u", !list_empty_careful(&b->write_blocked));
+	prt_newline(out);
+
+	prt_printf(out, "will make reachable:");
+	prt_tab(out);
+	prt_printf(out, "%lx", b->will_make_reachable);
+	prt_newline(out);
+
+	prt_printf(out, "journal pin %px:", &b->writes[0].journal);
+	prt_tab(out);
+	prt_printf(out, "%llu", b->writes[0].journal.seq);
+	prt_newline(out);
+
+	prt_printf(out, "journal pin %px:", &b->writes[1].journal);
+	prt_tab(out);
+	prt_printf(out, "%llu", b->writes[1].journal.seq);
+	prt_newline(out);
+
+	printbuf_indent_sub(out, 2);
+}
+
+/*
+ * read() for the "cached_btree_nodes" dump: walks the btree node cache hash
+ * table one bucket per iteration (i->iter is the bucket index, preserved
+ * across reads) and prints every node found.
+ *
+ * Returns the number of bytes copied to @buf, -ENOMEM if the printbuf failed
+ * to allocate, or another negative error from flush_buf().
+ */
+static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf,
+					    size_t size, loff_t *ppos)
+{
+	struct dump_iter *i = file->private_data;
+	struct bch_fs *c = i->c;
+	bool done = false;
+	ssize_t ret = 0;
+
+	i->ubuf = buf;
+	i->size	= size;
+	i->ret	= 0;
+
+	do {
+		struct bucket_table *tbl;
+		struct rhash_head *pos;
+		struct btree *b;
+
+		/* Flush outside the RCU section, since copying to userspace can fault */
+		ret = flush_buf(i);
+		if (ret)
+			return ret;
+
+		rcu_read_lock();
+		/*
+		 * NOTE(review): buf.atomic presumably tells printbuf not to
+		 * sleep when allocating while we're under rcu_read_lock() -
+		 * confirm against the printbuf implementation.
+		 */
+		i->buf.atomic++;
+		tbl = rht_dereference_rcu(c->btree_cache.table.tbl,
+					  &c->btree_cache.table);
+		if (i->iter < tbl->size) {
+			rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash)
+				bch2_cached_btree_node_to_text(&i->buf, c, b);
+			i->iter++;
+		} else {
+			done = true;
+		}
+		--i->buf.atomic;
+		rcu_read_unlock();
+	} while (!done);
+
+	if (i->buf.allocation_failure)
+		ret = -ENOMEM;
+
+	if (!ret)
+		ret = flush_buf(i);
+
+	return ret ?: i->ret;
+}
+
+/* File operations for the dump served by bch2_cached_btree_nodes_read() */
+static const struct file_operations cached_btree_nodes_ops = {
+	.owner		= THIS_MODULE,
+	.open		= bch2_dump_open,
+	.release	= bch2_dump_release,
+	.read		= bch2_cached_btree_nodes_read,
+};
+
+#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
+static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
+					    size_t size, loff_t *ppos)
+{
+	struct dump_iter *i = file->private_data;
+	struct bch_fs *c = i->c;
+	struct btree_trans *trans;
+	ssize_t ret = 0;
+	u32 seq;
+	/* debugfs read handler: prints each btree_trans on c->btree_trans_list plus a backtrace */
+	i->ubuf = buf;
+	i->size	= size;
+	i->ret	= 0;
+restart:
+	seqmutex_lock(&c->btree_trans_lock);
+	list_for_each_entry(trans, &c->btree_trans_list, list) {
+		if (trans->locking_wait.task->pid <= i->iter)
+			continue;
+		/* take a ref on trans, then drop the list lock while its text is generated */
+		closure_get(&trans->ref);
+		seq = seqmutex_seq(&c->btree_trans_lock);
+		seqmutex_unlock(&c->btree_trans_lock);
+
+		ret = flush_buf(i);
+		if (ret) {
+			closure_put(&trans->ref);
+			goto unlocked;
+		}
+
+		bch2_btree_trans_to_text(&i->buf, trans);
+
+		prt_printf(&i->buf, "backtrace:");
+		prt_newline(&i->buf);
+		printbuf_indent_add(&i->buf, 2);
+		bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task);
+		printbuf_indent_sub(&i->buf, 2);
+		prt_newline(&i->buf);
+		/* record the pid so entries at or below it are skipped after a restart */
+		i->iter = trans->locking_wait.task->pid;
+
+		closure_put(&trans->ref);
+
+		if (!seqmutex_relock(&c->btree_trans_lock, seq))
+			goto restart;
+	}
+	seqmutex_unlock(&c->btree_trans_lock);
+unlocked:
+	if (i->buf.allocation_failure)
+		ret = -ENOMEM;
+
+	if (!ret)
+		ret = flush_buf(i);
+
+	return ret ?: i->ret;
+}
+
+static const struct file_operations btree_transactions_ops = {
+	.read		= bch2_btree_transactions_read,
+	.open		= bch2_dump_open,
+	.release	= bch2_dump_release,
+	.owner		= THIS_MODULE,	/* backs the "btree_transactions" debugfs file */
+};
+#endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */
+
+/* debugfs read handler: streams bch2_journal_seq_pins_to_text() output to the reader. */
+static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
+				      size_t size, loff_t *ppos)
+{
+	struct dump_iter *dump = file->private_data;
+	struct bch_fs *fs = dump->c;
+	bool finished;
+	int err;
+
+	dump->ubuf = buf;
+	dump->size = size;
+	dump->ret  = 0;
+
+	for (;;) {
+		err = flush_buf(dump);
+		if (err)
+			return err;
+		if (!dump->size)
+			break;
+		finished = bch2_journal_seq_pins_to_text(&dump->buf, &fs->journal, &dump->iter);
+		dump->iter++;	/* advanced on every pass, the final one included */
+		if (finished)
+			break;
+	}
+
+	if (dump->buf.allocation_failure)
+		return -ENOMEM;
+	return dump->ret;
+}
+
+static const struct file_operations journal_pins_ops = {
+	.read		= bch2_journal_pins_read,
+	.open		= bch2_dump_open,
+	.release	= bch2_dump_release,
+	.owner		= THIS_MODULE,	/* backs the "journal_pins" debugfs file */
+};
+
+/* Allocate per-open dump state for the "btree_transaction_stats" debugfs file. */
+static int lock_held_stats_open(struct inode *inode, struct file *file)
+{
+	struct dump_iter *state = kzalloc(sizeof(*state), GFP_KERNEL);
+
+	if (!state)
+		return -ENOMEM;
+
+	/* kzalloc() already zeroed the structure; iter is still set explicitly. */
+	state->iter = 0;
+	state->c    = inode->i_private;
+	state->buf  = PRINTBUF;
+
+	/* Picked up again by the read handler and freed by the release handler. */
+	file->private_data = state;
+	return 0;
+}
+
+/* Free the printbuf and the dump state that file->private_data points at. */
+static int lock_held_stats_release(struct inode *inode, struct file *file)
+{
+	struct dump_iter *state = file->private_data;
+
+	printbuf_exit(&state->buf);
+	kfree(state);
+	return 0;
+}
+
+static ssize_t lock_held_stats_read(struct file *file, char __user *buf,
+				      size_t size, loff_t *ppos)
+{
+	struct dump_iter        *i = file->private_data;
+	struct bch_fs *c = i->c;
+	int err;
+	/* debugfs read handler: prints one c->btree_transaction_stats[] entry per loop pass */
+	i->ubuf = buf;
+	i->size = size;
+	i->ret  = 0;
+
+	while (1) {
+		struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter];
+
+		err = flush_buf(i);
+		if (err)
+			return err;
+
+		if (!i->size)
+			break;
+		/* stop at the end of the name table or at its first NULL entry */
+		if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) ||
+		    !bch2_btree_transaction_fns[i->iter])
+			break;
+
+		prt_printf(&i->buf, "%s: ", bch2_btree_transaction_fns[i->iter]);
+		prt_newline(&i->buf);
+		printbuf_indent_add(&i->buf, 2);
+		/* s->lock is held while this entry's fields are read and formatted */
+		mutex_lock(&s->lock);
+
+		prt_printf(&i->buf, "Max mem used: %u", s->max_mem);
+		prt_newline(&i->buf);
+
+		if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) {
+			prt_printf(&i->buf, "Lock hold times:");
+			prt_newline(&i->buf);
+
+			printbuf_indent_add(&i->buf, 2);
+			bch2_time_stats_to_text(&i->buf, &s->lock_hold_times);
+			printbuf_indent_sub(&i->buf, 2);
+		}
+
+		if (s->max_paths_text) {
+			prt_printf(&i->buf, "Maximum allocated btree paths (%u):", s->nr_max_paths);
+			prt_newline(&i->buf);
+
+			printbuf_indent_add(&i->buf, 2);
+			prt_str_indented(&i->buf, s->max_paths_text);
+			printbuf_indent_sub(&i->buf, 2);
+		}
+
+		mutex_unlock(&s->lock);
+
+		printbuf_indent_sub(&i->buf, 2);
+		prt_newline(&i->buf);
+		i->iter++;
+	}
+
+	if (i->buf.allocation_failure)
+		return -ENOMEM;
+
+	return i->ret;
+}
+
+static const struct file_operations lock_held_stats_op = {
+	.read = lock_held_stats_read,
+	.open = lock_held_stats_open,
+	.release = lock_held_stats_release,
+	.owner = THIS_MODULE,	/* backs the "btree_transaction_stats" debugfs file */
+};
+
+static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
+					    size_t size, loff_t *ppos)
+{
+	struct dump_iter *i = file->private_data;
+	struct bch_fs *c = i->c;
+	struct btree_trans *trans;
+	ssize_t ret = 0;
+	u32 seq;
+	/* debugfs read handler: runs bch2_check_for_deadlock() on each listed btree_trans */
+	i->ubuf = buf;
+	i->size	= size;
+	i->ret	= 0;
+	/* non-zero i->iter: an earlier read already processed entries, so only flush */
+	if (i->iter)
+		goto out;
+restart:
+	seqmutex_lock(&c->btree_trans_lock);
+	list_for_each_entry(trans, &c->btree_trans_list, list) {
+		if (trans->locking_wait.task->pid <= i->iter)
+			continue;
+
+		closure_get(&trans->ref);
+		seq = seqmutex_seq(&c->btree_trans_lock);
+		seqmutex_unlock(&c->btree_trans_lock);
+
+		ret = flush_buf(i);
+		if (ret) {
+			closure_put(&trans->ref);
+			goto out;
+		}
+
+		bch2_check_for_deadlock(trans, &i->buf);
+
+		i->iter = trans->locking_wait.task->pid;
+
+		closure_put(&trans->ref);
+
+		if (!seqmutex_relock(&c->btree_trans_lock, seq))
+			goto restart;
+	}
+	seqmutex_unlock(&c->btree_trans_lock);
+out:
+	if (i->buf.allocation_failure)
+		ret = -ENOMEM;
+
+	if (!ret)
+		ret = flush_buf(i);
+
+	return ret ?: i->ret;
+}
+
+static const struct file_operations btree_deadlock_ops = {
+	.read		= bch2_btree_deadlock_read,
+	.open		= bch2_dump_open,
+	.release	= bch2_dump_release,
+	.owner		= THIS_MODULE,	/* backs the "btree_deadlock" debugfs file */
+};
+
+void bch2_fs_debug_exit(struct bch_fs *c)
+{
+	if (c->fs_debug_dir && !IS_ERR(c->fs_debug_dir)) /* only if a directory exists */
+		debugfs_remove_recursive(c->fs_debug_dir);
+}
+
+/* Create this filesystem's debugfs directory (named by user UUID) and its files. */
+void bch2_fs_debug_init(struct bch_fs *c)
+{
+	char name[100];
+	unsigned idx;
+
+	/* Without a bcachefs debugfs root there is nothing to attach to. */
+	if (IS_ERR_OR_NULL(bch_debug))
+		return;
+
+	snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
+	c->fs_debug_dir = debugfs_create_dir(name, bch_debug);
+	if (IS_ERR_OR_NULL(c->fs_debug_dir))
+		return;
+
+	debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir,
+			    c->btree_debug, &cached_btree_nodes_ops);
+
+#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
+	debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir,
+			    c->btree_debug, &btree_transactions_ops);
+#endif
+
+	debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
+			    c->btree_debug, &journal_pins_ops);
+	debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
+			    c, &lock_held_stats_op);
+	debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir,
+			    c->btree_debug, &btree_deadlock_ops);
+
+	/* Per-btree files go into a "btrees" subdirectory. */
+	c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir);
+	if (IS_ERR_OR_NULL(c->btree_debug_dir))
+		return;
+
+	for (idx = 0; idx < ARRAY_SIZE(c->btree_debug); idx++) {
+		struct btree_debug *bd = &c->btree_debug[idx];
+		const char *btree_name;
+
+		bd->id = idx;
+		btree_name = bch2_btree_ids[bd->id];
+
+		/* Three files per btree: <name>, <name>-formats, <name>-bfloat-failed. */
+		debugfs_create_file(btree_name, 0400, c->btree_debug_dir, bd,
+				    &btree_debug_ops);
+
+		snprintf(name, sizeof(name), "%s-formats", btree_name);
+		debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
+				    &btree_format_debug_ops);
+
+		snprintf(name, sizeof(name), "%s-bfloat-failed", btree_name);
+		debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
+				    &bfloat_failed_debug_ops);
+	}
+}
+
+#endif
+
+void bch2_debug_exit(void)
+{
+	if (bch_debug && !IS_ERR(bch_debug)) /* only if the root directory exists */
+		debugfs_remove_recursive(bch_debug);
+}
+
+/* Create the top-level "bcachefs" debugfs directory; always returns 0. */
+int __init bch2_debug_init(void)
+{
+	/* The result is stored unchecked; users test it with IS_ERR_OR_NULL(). */
+	bch_debug = debugfs_create_dir("bcachefs", NULL);
+	return 0;
+}
diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h
new file mode 100644
index 000000000000..2c37143b5fd1
--- /dev/null
+++ b/fs/bcachefs/debug.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DEBUG_H
+#define _BCACHEFS_DEBUG_H
+
+#include "bcachefs.h"
+
+struct bio;
+struct btree;
+struct bch_fs;
+
+void __bch2_btree_verify(struct bch_fs *, struct btree *);
+void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *,
+				    const struct btree *);
+
+static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
+{
+	if (bch2_verify_btree_ondisk)	/* the out-of-line verifier runs only when this is set */
+		__bch2_btree_verify(c, b);
+}
+
+#ifdef CONFIG_DEBUG_FS
+void bch2_fs_debug_exit(struct bch_fs *);
+void bch2_fs_debug_init(struct bch_fs *);
+#else /* !CONFIG_DEBUG_FS: per-filesystem debugfs setup and teardown are empty stubs */
+static inline void bch2_fs_debug_exit(struct bch_fs *c) {}
+static inline void bch2_fs_debug_init(struct bch_fs *c) {}
+#endif /* CONFIG_DEBUG_FS */
+
+void bch2_debug_exit(void);
+int bch2_debug_init(void);
+
+#endif /* _BCACHEFS_DEBUG_H */
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
new file mode 100644
index 000000000000..6c6c8d57d72b
--- /dev/null
+++ b/fs/bcachefs/dirent.c
@@ -0,0 +1,587 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "bkey_methods.h"
+#include "btree_update.h"
+#include "extents.h"
+#include "dirent.h"
+#include "fs.h"
+#include "keylist.h"
+#include "str_hash.h"
+#include "subvolume.h"
+
+#include <linux/dcache.h>
+
+/* Name length in bytes: value size minus the fixed header minus trailing NUL padding. */
+static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
+{
+	unsigned val_u64s = bkey_val_u64s(d.k);
+	u64 tail = ((u64*)d.v)[val_u64s - 1];
+	unsigned pad = sizeof(u64);	/* an all-zero last word is entirely padding */
+	/* Padding ends the word in memory order, so which end to count depends on endianness. */
+	if (tail)
+#if CPU_BIG_ENDIAN
+		pad = __builtin_ctzll(tail) / 8;
+#else
+		pad = __builtin_clzll(tail) / 8;
+#endif
+	return val_u64s * sizeof(u64) - offsetof(struct bch_dirent, d_name) - pad;
+}
+
+struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d)
+{
+	return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); /* points into the key, no copy */
+}
+
+/* Hash a dirent name into its key offset; results below 2 are raised to 2. */
+static u64 bch2_dirent_hash(const struct bch_hash_info *info, const struct qstr *name)
+{
+	struct bch_str_hash_ctx state;
+	u64 hash;
+
+	bch2_str_hash_init(&state, info);
+	bch2_str_hash_update(&state, info, name->name, name->len);
+	hash = bch2_str_hash_end(&state, info);
+	return hash < 2 ? 2 : hash;	/* [0,2) reserved for dots */
+}
+
+static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
+{
+	return bch2_dirent_hash(info, key);	/* key is handed on as a const struct qstr * */
+}
+
+/* Hash callback for stored keys: hashes the name embedded in dirent @k. */
+static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
+{
+	struct qstr stored = bch2_dirent_get_name(bkey_s_c_to_dirent(k));
+
+	return bch2_dirent_hash(info, &stored);
+}
+
+/* Returns true when the name stored in @_l DIFFERS from the qstr @_r (false == match). */
+static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
+{
+	const struct qstr stored = bch2_dirent_get_name(bkey_s_c_to_dirent(_l));
+	const struct qstr *wanted = _r;
+
+	return stored.len != wanted->len || memcmp(stored.name, wanted->name, stored.len);
+}
+
+/* Returns true when the two dirents' names DIFFER (false == same name). */
+static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
+{
+	const struct qstr lhs = bch2_dirent_get_name(bkey_s_c_to_dirent(_l));
+	const struct qstr rhs = bch2_dirent_get_name(bkey_s_c_to_dirent(_r));
+
+	/* memcmp() is only reached once the lengths are known to be equal. */
+	return lhs.len != rhs.len || memcmp(lhs.name, rhs.name, lhs.len);
+}
+
+/* Subvolume dirents are visible only from their parent subvolume; others always are. */
+static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k)
+{
+	struct bkey_s_c_dirent ent = bkey_s_c_to_dirent(k);
+
+	return ent.v->d_type != DT_SUBVOL ||
+	       le32_to_cpu(ent.v->d_parent_subvol) == inum.subvol;
+}
+
+const struct bch_hash_desc bch2_dirent_hash_desc = {	/* hash-table callbacks for dirents */
+	.key_type	= KEY_TYPE_dirent,
+	.btree_id	= BTREE_ID_dirents,
+	.hash_bkey	= dirent_hash_bkey,
+	.hash_key	= dirent_hash_key,
+	.cmp_bkey	= dirent_cmp_bkey,
+	.cmp_key	= dirent_cmp_key,
+	.is_visible	= dirent_is_visible,
+};
+
+/* Validate a dirent key: 0 if fine, else -BCH_ERR_invalid_bkey with the reason in @err. */
+int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k,
+			enum bkey_invalid_flags flags,
+			struct printbuf *err)
+{
+	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+	struct qstr d_name = bch2_dirent_get_name(d);
+
+	if (!d_name.len) {
+		prt_printf(err, "empty name");
+		goto invalid;
+	}
+
+	/* The value may not be larger than a name of this length requires. */
+	if (bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len)) {
+		prt_printf(err, "value too big (%zu > %u)",
+		       bkey_val_u64s(k.k), dirent_val_u64s(d_name.len));
+		goto invalid;
+	}
+
+	/*
+	 * BCH_NAME_MAX is only enforced for keys being committed; keys that
+	 * already exist may be longer.
+	 */
+	if ((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX) {
+		prt_printf(err, "dirent name too big (%u > %u)",
+		       d_name.len, BCH_NAME_MAX);
+		goto invalid;
+	}
+
+	/* A NUL inside the name means there is data after the real end of it. */
+	if (d_name.len != strnlen(d_name.name, d_name.len)) {
+		prt_printf(err, "dirent has stray data after name's NUL");
+		goto invalid;
+	}
+
+	/* ".", ".." and names containing '/' are never valid dirent names. */
+	if ((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) ||
+	    (d_name.len == 2 && !memcmp(d_name.name, "..", 2)) ||
+	    memchr(d_name.name, '/', d_name.len)) {
+		prt_printf(err, "invalid name");
+		goto invalid;
+	}
+
+	/* A non-subvolume dirent must not point at the directory containing it. */
+	if (d.v->d_type != DT_SUBVOL &&
+	    le64_to_cpu(d.v->d_inum) == d.k->p.inode) {
+		prt_printf(err, "dirent points to own directory");
+		goto invalid;
+	}
+
+	return 0;
+
+invalid:
+	return -BCH_ERR_invalid_bkey;
+}
+
+/* Print "<name> -> <target> type <type>"; the target is the child subvol for DT_SUBVOL. */
+void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+	struct qstr d_name = bch2_dirent_get_name(d);
+	u64 target;
+
+	if (d.v->d_type != DT_SUBVOL)
+		target = le64_to_cpu(d.v->d_inum);
+	else
+		target = le32_to_cpu(d.v->d_child_subvol);
+	prt_printf(out, "%.*s -> %llu type %s", d_name.len, d_name.name,
+	       target, bch2_d_type_str(d.v->d_type));
+}
+
+/* Build a dirent key (position not set here) in transaction memory; ERR_PTR on failure. */
+static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
+				subvol_inum dir, u8 type,
+				const struct qstr *name, u64 dst)
+{
+	unsigned key_u64s = BKEY_U64s + dirent_val_u64s(name->len);
+	struct bkey_i_dirent *key;
+	size_t pad;
+
+	if (name->len > BCH_NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	BUG_ON(key_u64s > U8_MAX);	/* the size is stored in k.u64s below */
+
+	key = bch2_trans_kmalloc(trans, key_u64s * sizeof(u64));
+	if (IS_ERR(key))
+		return key;
+
+	bkey_dirent_init(&key->k_i);
+	key->k.u64s = key_u64s;
+
+	/* Regular dirents name an inode; subvolume dirents a (parent, child) subvol pair. */
+	if (type == DT_SUBVOL) {
+		key->v.d_parent_subvol = cpu_to_le32(dir.subvol);
+		key->v.d_child_subvol = cpu_to_le32(dst);
+	} else {
+		key->v.d_inum = cpu_to_le64(dst);
+	}
+	key->v.d_type = type;
+
+	/* Copy the name, then zero everything from its end to the end of the value. */
+	pad = bkey_val_bytes(&key->k) - offsetof(struct bch_dirent, d_name) - name->len;
+	memcpy(key->v.d_name, name->name, name->len);
+	memset(key->v.d_name + name->len, 0, pad);
+
+	EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(key)) != name->len);
+	return key;
+}
+
+/* Create a dirent for @name in @dir; *dir_offset receives the new key's offset. */
+int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
+		       const struct bch_hash_info *hash_info,
+		       u8 type, const struct qstr *name, u64 dst_inum,
+		       u64 *dir_offset, int flags)
+{
+	struct bkey_i_dirent *new_key = dirent_create_key(trans, dir, type, name, dst_inum);
+	int ret;
+
+	if (IS_ERR(new_key))
+		return PTR_ERR(new_key);
+
+	ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
+			    dir, &new_key->k_i, flags);
+
+	/* Stored unconditionally, i.e. also when bch2_hash_set() returned an error. */
+	*dir_offset = new_key->k.p.offset;
+	return ret;
+}
+
+/* Copy what the dirent points at (d_inum and d_type) from @src; the name is untouched. */
+static void dirent_copy_target(struct bkey_i_dirent *dst, struct bkey_s_c_dirent src)
+{
+	dst->v.d_type = src.v->d_type;
+	dst->v.d_inum = src.v->d_inum;
+}
+
+/* Resolve @d to *target: 0 on success, 1 if a subvol dirent of another parent, <0 on error. */
+int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
+			    struct bkey_s_c_dirent d, subvol_inum *target)
+{
+	struct bch_subvolume s;
+	int ret;
+
+	if (likely(d.v->d_type != DT_SUBVOL)) {
+		target->subvol	= dir.subvol;
+		target->inum	= le64_to_cpu(d.v->d_inum);
+		return 0;
+	}
+
+	/* A subvolume dirent only resolves when looked at from its parent subvolume. */
+	if (le32_to_cpu(d.v->d_parent_subvol) != dir.subvol)
+		return 1;
+
+	target->subvol = le32_to_cpu(d.v->d_child_subvol);
+	ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_CACHED, &s);
+	if (!ret)	/* on failure the on-stack @s is left unread and ->inum untouched */
+		target->inum = le64_to_cpu(s.inode);
+	return ret;
+}
+
+/*
+ * Rename @src_name in @src_dir to @dst_name in @dst_dir; @mode selects plain
+ * rename, overwrite or exchange.  The targets of the old dirents are returned
+ * in *src_inum / *dst_inum (zeroed when absent).  Returns 0 on success.
+ */
+int bch2_dirent_rename(struct btree_trans *trans,
+		subvol_inum src_dir, struct bch_hash_info *src_hash,
+		subvol_inum dst_dir, struct bch_hash_info *dst_hash,
+		const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset,
+		const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
+		enum bch_rename_mode mode)
+{
+	struct btree_iter src_iter = { NULL };
+	struct btree_iter dst_iter = { NULL };
+	struct bkey_s_c old_src, old_dst = bkey_s_c_null;
+	struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
+	struct bpos dst_pos = POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
+	unsigned src_type = 0, dst_type = 0, src_update_flags = 0;
+	int ret = 0;
+
+	if (src_dir.subvol != dst_dir.subvol)
+		return -EXDEV;
+
+	memset(src_inum, 0, sizeof(*src_inum));
+	memset(dst_inum, 0, sizeof(*dst_inum));
+
+	/* Lookup src: */
+	ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
+			       src_hash, src_dir, src_name,
+			       BTREE_ITER_INTENT);
+	if (ret)
+		goto out;
+
+	old_src = bch2_btree_iter_peek_slot(&src_iter);
+	ret = bkey_err(old_src);
+	if (ret)
+		goto out;
+
+	ret = bch2_dirent_read_target(trans, src_dir,
+			bkey_s_c_to_dirent(old_src), src_inum);
+	if (ret)
+		goto out;
+
+	src_type = bkey_s_c_to_dirent(old_src).v->d_type;
+	/* src_iter is in use by now: leave via out so that it gets exited. */
+	if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	/* Lookup dst: */
+	if (mode == BCH_RENAME) {
+		/*
+		 * Note that we're _not_ checking if the target already exists -
+		 * we're relying on the VFS to do that check for us for correctness:
+		 */
+		ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc,
+				     dst_hash, dst_dir, dst_name);
+		if (ret)
+			goto out;
+	} else {
+		ret = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
+				       dst_hash, dst_dir, dst_name,
+				       BTREE_ITER_INTENT);
+		if (ret)
+			goto out;
+
+		old_dst = bch2_btree_iter_peek_slot(&dst_iter);
+		ret = bkey_err(old_dst);
+		if (ret)
+			goto out;
+
+		ret = bch2_dirent_read_target(trans, dst_dir,
+				bkey_s_c_to_dirent(old_dst), dst_inum);
+		if (ret)
+			goto out;
+
+		dst_type = bkey_s_c_to_dirent(old_dst).v->d_type;
+		/* Both iterators are in use by now: leave via out, not a bare return. */
+		if (dst_type == DT_SUBVOL) {
+			ret = -EOPNOTSUPP;
+			goto out;
+		}
+	}
+
+	if (mode != BCH_RENAME_EXCHANGE)
+		*src_offset = dst_iter.pos.offset;
+
+	/* Create new dst key: */
+	new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0);
+	ret = PTR_ERR_OR_ZERO(new_dst);
+	if (ret)
+		goto out;
+
+	dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
+	new_dst->k.p = dst_iter.pos;
+
+	/* Create new src key: */
+	if (mode == BCH_RENAME_EXCHANGE) {
+		new_src = dirent_create_key(trans, src_dir, 0, src_name, 0);
+		ret = PTR_ERR_OR_ZERO(new_src);
+		if (ret)
+			goto out;
+
+		dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
+		new_src->k.p = src_iter.pos;
+	} else {
+		new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+		ret = PTR_ERR_OR_ZERO(new_src);
+		if (ret)
+			goto out;
+
+		bkey_init(&new_src->k);
+		new_src->k.p = src_iter.pos;
+
+		if (bkey_le(dst_pos, src_iter.pos) &&
+		    bkey_lt(src_iter.pos, dst_iter.pos)) {
+			/*
+			 * Hash collision for the new dst key: new_src - the key
+			 * we're deleting - is between new_dst's hashed slot and
+			 * the slot we're going to be inserting it into - oops.
+			 * This will break the hash table if we don't deal with it:
+			 */
+			if (mode == BCH_RENAME) {
+				/* Not overwriting: just insert new_dst at the src position. */
+				new_src = new_dst;
+				new_src->k.p = src_iter.pos;
+				goto out_set_src;
+			} else {
+				/*
+				 * Overwriting: new_dst has to replace old_dst in place,
+				 * so make sure src is deleted with a whiteout:
+				 */
+				new_src->k.type = KEY_TYPE_hash_whiteout;
+			}
+		} else {
+			/* Check if we need a whiteout to delete src: */
+			ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
+						       src_hash, &src_iter);
+			if (ret < 0)
+				goto out;
+
+			if (ret)
+				new_src->k.type = KEY_TYPE_hash_whiteout;
+		}
+	}
+
+	ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
+	if (ret)
+		goto out;
+out_set_src:
+	/*
+	 * If we're deleting a subvolume, we need to really delete the dirent,
+	 * not just emit a whiteout in the current snapshot:
+	 */
+	if (src_type == DT_SUBVOL) {
+		bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
+		ret = bch2_btree_iter_traverse(&src_iter);
+		if (ret)
+			goto out;
+
+		new_src->k.p = src_iter.pos;
+		src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE;
+	}
+
+	ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
+	if (ret)
+		goto out;
+
+	if (mode == BCH_RENAME_EXCHANGE)
+		*src_offset = new_src->k.p.offset;
+	*dst_offset = new_dst->k.p.offset;
+out:
+	bch2_trans_iter_exit(trans, &src_iter);
+	bch2_trans_iter_exit(trans, &dst_iter);
+	return ret;
+}
+
+/*
+ * Look up @name in @dir and resolve it into *inum.  Returns 0 with @iter still
+ * in use (the caller exits it); @iter is exited here if a later step fails.
+ */
+int __bch2_dirent_lookup_trans(struct btree_trans *trans,
+			       struct btree_iter *iter,
+			       subvol_inum dir,
+			       const struct bch_hash_info *hash_info,
+			       const struct qstr *name, subvol_inum *inum,
+			       unsigned flags)
+{
+	struct bkey_s_c slot;
+	u32 snapshot;
+	int ret;
+
+	/* The snapshot ID is not used afterwards; only the call's result matters here. */
+	ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
+	if (ret)
+		return ret;
+
+	ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
+			       hash_info, dir, name, flags);
+	if (ret)
+		return ret;
+
+	slot = bch2_btree_iter_peek_slot(iter);
+	ret = bkey_err(slot);
+	if (!ret) {
+		ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(slot), inum);
+		/* Positive: subvolume dirent of another parent subvolume, treat as absent. */
+		if (ret > 0)
+			ret = -ENOENT;
+	}
+	if (ret)
+		bch2_trans_iter_exit(trans, iter);
+	return ret;
+}
+
+/* Standalone lookup; NOTE(review): the int result (0 or error) is returned as u64. */
+u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
+		       const struct bch_hash_info *hash_info,
+		       const struct qstr *name, subvol_inum *inum)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	int ret;
+
+	do {
+		bch2_trans_begin(trans);
+		ret = __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info,
+						  name, inum, 0);
+	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
+	if (!ret)
+		bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
+	return ret;
+}
+
+int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	u32 snapshot;
+	int ret;
+	/* Returns -ENOTEMPTY as soon as one KEY_TYPE_dirent key is found for @dir */
+	ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
+	if (ret)
+		return ret;
+	/* walk dirents keys of inode dir.inum, offsets 0..U64_MAX, in that snapshot */
+	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
+			   SPOS(dir.inum, 0, snapshot),
+			   POS(dir.inum, U64_MAX), 0, k, ret)
+		if (k.k->type == KEY_TYPE_dirent) {
+			ret = -ENOTEMPTY;
+			break;
+		}
+	bch2_trans_iter_exit(trans, &iter);
+
+	return ret;	/* otherwise whatever the iteration macro left in ret */
+}
+
+int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bkey_s_c_dirent dirent;
+	subvol_inum target;
+	u32 snapshot;
+	struct bkey_buf sk;
+	struct qstr name;
+	int ret;
+	/* Emit the dirents of @inum through dir_emit(), starting at offset ctx->pos */
+	bch2_bkey_buf_init(&sk);
+retry:
+	bch2_trans_begin(trans);
+
+	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+	if (ret)
+		goto err;
+
+	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
+			   SPOS(inum.inum, ctx->pos, snapshot),
+			   POS(inum.inum, U64_MAX), 0, k, ret) {
+		if (k.k->type != KEY_TYPE_dirent)
+			continue;
+
+		dirent = bkey_s_c_to_dirent(k);
+		/* > 0: a subvolume dirent that belongs to another parent subvolume, skip it */
+		ret = bch2_dirent_read_target(trans, inum, dirent, &target);
+		if (ret < 0)
+			break;
+		if (ret)
+			continue;
+
+		/* dir_emit() can fault and block: */
+		bch2_bkey_buf_reassemble(&sk, c, k);
+		dirent = bkey_i_to_s_c_dirent(sk.k);
+		bch2_trans_unlock(trans);
+
+		name = bch2_dirent_get_name(dirent);
+		/* if dir_emit() fails pos stays at this entry; on success it moves past it */
+		ctx->pos = dirent.k->p.offset;
+		if (!dir_emit(ctx, name.name,
+			      name.len,
+			      target.inum,
+			      vfs_d_type(dirent.v->d_type)))
+			break;
+		ctx->pos = dirent.k->p.offset + 1;
+
+		/*
+		 * read_target looks up subvolumes, we can overflow paths if the
+		 * directory has many subvolumes in it
+		 */
+		ret = btree_trans_too_many_iters(trans);
+		if (ret)
+			break;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+err:
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		goto retry;
+
+	bch2_trans_put(trans);
+	bch2_bkey_buf_exit(&sk, c);
+
+	return ret;
+}
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
new file mode 100644
index 000000000000..e9fa1df38232
--- /dev/null
+++ b/fs/bcachefs/dirent.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DIRENT_H
+#define _BCACHEFS_DIRENT_H
+
+#include "str_hash.h"
+
+enum bkey_invalid_flags;
+extern const struct bch_hash_desc bch2_dirent_hash_desc;
+
+int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c,
+			enum bkey_invalid_flags, struct printbuf *);
+void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_dirent ((struct bkey_ops) {	\
+	.key_invalid	= bch2_dirent_invalid,		\
+	.val_to_text	= bch2_dirent_to_text,		\
+	.min_val_size	= 16,				\
+})	/* struct bkey_ops initializer wiring up the dirent validate and to_text hooks */
+
+struct qstr;
+struct file;
+struct dir_context;
+struct bch_fs;
+struct bch_hash_info;
+struct bch_inode_info;
+
+struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d);
+
+/* Number of u64s a dirent value needs to hold a name of @len bytes (rounded up). */
+static inline unsigned dirent_val_u64s(unsigned len)
+{
+	return (offsetof(struct bch_dirent, d_name) + len + sizeof(u64) - 1) / sizeof(u64);
+}
+
+int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
+			    struct bkey_s_c_dirent, subvol_inum *);
+
+int bch2_dirent_create(struct btree_trans *, subvol_inum,
+		       const struct bch_hash_info *, u8,
+		       const struct qstr *, u64, u64 *, int);
+
+static inline unsigned vfs_d_type(unsigned type)
+{
+	return type != DT_SUBVOL ? type : DT_DIR;	/* DT_SUBVOL is mapped to DT_DIR */
+}
+
+enum bch_rename_mode {
+	BCH_RENAME,		/* dst goes into a free hash slot; no existing dst is looked up */
+	BCH_RENAME_OVERWRITE,	/* an existing dst dirent is looked up and replaced */
+	BCH_RENAME_EXCHANGE,	/* src and dst dirents swap their targets */
+};
+
+int bch2_dirent_rename(struct btree_trans *,
+		       subvol_inum, struct bch_hash_info *,
+		       subvol_inum, struct bch_hash_info *,
+		       const struct qstr *, subvol_inum *, u64 *,
+		       const struct qstr *, subvol_inum *, u64 *,
+		       enum bch_rename_mode);
+
+int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
+			       subvol_inum, const struct bch_hash_info *,
+			       const struct qstr *, subvol_inum *, unsigned);
+u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
+		       const struct bch_hash_info *,
+		       const struct qstr *, subvol_inum *);
+
+int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
+int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
+
+#endif /* _BCACHEFS_DIRENT_H */
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
new file mode 100644
index 000000000000..e00133b6ea51
--- /dev/null
+++ b/fs/bcachefs/disk_groups.c
@@ -0,0 +1,550 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "disk_groups.h"
+#include "sb-members.h"
+#include "super-io.h"
+
+#include <linux/sort.h>
+
+static int group_cmp(const void *_l, const void *_r)
+{
+	const struct bch_disk_group *l = _l, *r = _r;
+	/* sort key: deleted flag first, then parent, then the label bytes */
+	int by_deleted = (BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) -
+			 (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r));
+	int by_parent = (BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) -
+			(BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r));
+
+	return by_deleted ?: by_parent ?: strncmp(l->label, r->label, sizeof(l->label));
+}
+
+static int bch2_sb_disk_groups_validate(struct bch_sb *sb,
+					struct bch_sb_field *f,
+					struct printbuf *err)
+{
+	struct bch_sb_field_disk_groups *groups =
+		field_to_type(f, disk_groups);
+	struct bch_disk_group *g, *sorted = NULL;
+	unsigned nr_groups = disk_groups_nr(groups);
+	unsigned i, len;
+	int ret = 0;
+
+	for (i = 0; i < sb->nr_devices; i++) {
+		struct bch_member m = bch2_sb_member_get(sb, i);
+		unsigned group_id;
+
+		if (!BCH_MEMBER_GROUP(&m))
+			continue;
+
+		group_id = BCH_MEMBER_GROUP(&m) - 1;
+
+		if (group_id >= nr_groups) {
+			prt_printf(err, "disk %u has invalid label %u (have %u)",
+				   i, group_id, nr_groups);
+			return -BCH_ERR_invalid_sb_disk_groups;
+		}
+
+		if (BCH_GROUP_DELETED(&groups->entries[group_id])) {
+			prt_printf(err, "disk %u has deleted label %u", i, group_id);
+			return -BCH_ERR_invalid_sb_disk_groups;
+		}
+	}
+
+	if (!nr_groups)
+		return 0;
+
+	for (i = 0; i < nr_groups; i++) {
+		g = groups->entries + i;
+
+		if (BCH_GROUP_DELETED(g))
+			continue;
+
+		len = strnlen(g->label, sizeof(g->label));
+		if (!len) {
+			prt_printf(err, "label %u empty", i);
+			return -BCH_ERR_invalid_sb_disk_groups;
+		}
+	}
+
+	sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL);
+	if (!sorted)
+		return -BCH_ERR_ENOMEM_disk_groups_validate;
+
+	memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted));
+	sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL);
+
+	for (g = sorted; g + 1 < sorted + nr_groups; g++)
+		if (!BCH_GROUP_DELETED(g) &&
+		    !group_cmp(&g[0], &g[1])) {
+			prt_printf(err, "duplicate label %llu.%.*s",
+			       BCH_GROUP_PARENT(g),
+			       (int) sizeof(g->label), g->label);
+			ret = -BCH_ERR_invalid_sb_disk_groups;
+			goto err;
+		}
+err:
+	kfree(sorted);
+	return ret;
+}
+
+void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
+{
+	struct bch_disk_groups_cpu *g;
+	struct bch_dev *ca;
+	int i;
+	unsigned iter;
+
+	out->atomic++;
+	rcu_read_lock();
+
+	g = rcu_dereference(c->disk_groups);
+	if (!g)
+		goto out;
+
+	for (i = 0; i < g->nr; i++) {
+		if (i)
+			prt_printf(out, " ");
+
+		if (g->entries[i].deleted) {
+			prt_printf(out, "[deleted]");
+			continue;
+		}
+
+		prt_printf(out, "[parent %d devs", g->entries[i].parent);
+		for_each_member_device_rcu(ca, c, iter, &g->entries[i].devs)
+			prt_printf(out, " %s", ca->name);
+		prt_printf(out, "]");
+	}
+
+out:
+	rcu_read_unlock();
+	out->atomic--;
+}
+
+static void bch2_sb_disk_groups_to_text(struct printbuf *out,
+					struct bch_sb *sb,
+					struct bch_sb_field *f)
+{
+	struct bch_sb_field_disk_groups *groups =
+		field_to_type(f, disk_groups);
+	struct bch_disk_group *g;
+	unsigned nr_groups = disk_groups_nr(groups);
+
+	for (g = groups->entries; g < groups->entries + nr_groups; g++) {
+		if (g != groups->entries)
+			prt_printf(out, " ");
+
+		/* labels fill a fixed-size field and may lack a trailing NUL */
+		if (BCH_GROUP_DELETED(g))
+			prt_printf(out, "[deleted]");
+		else
+			prt_printf(out, "[parent %llu name %.*s]",
+			       BCH_GROUP_PARENT(g),
+			       (int) sizeof(g->label), g->label);
+	}
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = {
+	.validate	= bch2_sb_disk_groups_validate,
+	.to_text	= bch2_sb_disk_groups_to_text
+};
+
+int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
+{
+	struct bch_sb_field_disk_groups *groups;
+	struct bch_disk_groups_cpu *cpu_g, *old_g;
+	unsigned i, g, nr_groups;
+
+	lockdep_assert_held(&c->sb_lock);
+
+	groups		= bch2_sb_field_get(c->disk_sb.sb, disk_groups);
+	nr_groups	= disk_groups_nr(groups);
+
+	if (!groups)
+		return 0;
+
+	cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL);
+	if (!cpu_g)
+		return -BCH_ERR_ENOMEM_disk_groups_to_cpu;
+
+	cpu_g->nr = nr_groups;
+
+	for (i = 0; i < nr_groups; i++) {
+		struct bch_disk_group *src	= &groups->entries[i];
+		struct bch_disk_group_cpu *dst	= &cpu_g->entries[i];
+
+		dst->deleted	= BCH_GROUP_DELETED(src);
+		dst->parent	= BCH_GROUP_PARENT(src);
+	}
+
+	for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
+		struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i);
+		struct bch_disk_group_cpu *dst;
+
+		if (!bch2_member_exists(&m))
+			continue;
+
+		g = BCH_MEMBER_GROUP(&m);
+		while (g) {
+			dst = &cpu_g->entries[g - 1];
+			__set_bit(i, dst->devs.d);
+			g = dst->parent;
+		}
+	}
+
+	old_g = rcu_dereference_protected(c->disk_groups,
+				lockdep_is_held(&c->sb_lock));
+	rcu_assign_pointer(c->disk_groups, cpu_g);
+	if (old_g)
+		kfree_rcu(old_g, rcu);
+
+	return 0;
+}
+
+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
+{
+	struct target t = target_decode(target);
+	struct bch_devs_mask *devs;
+
+	rcu_read_lock();
+	/* decode target and pick the matching device mask; NULL if none applies */
+	switch (t.type) {
+	case TARGET_NULL:
+		devs = NULL;
+		break;
+	case TARGET_DEV: {
+		struct bch_dev *ca = t.dev < c->sb.nr_devices
+			? rcu_dereference(c->devs[t.dev])
+			: NULL;
+		devs = ca ? &ca->self : NULL;
+		break;
+	}
+	case TARGET_GROUP: {
+		struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
+
+		devs = g && t.group < g->nr && !g->entries[t.group].deleted
+			? &g->entries[t.group].devs
+			: NULL;
+		break;
+	}
+	default:
+		BUG();
+	}
+
+	rcu_read_unlock();
+	/* NOTE(review): devs may point into RCU-read data yet is used after unlock -- confirm callers */
+	return devs;
+}
+
+bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
+{
+	struct target t = target_decode(target);
+
+	switch (t.type) {
+	case TARGET_NULL:
+		return false;
+	case TARGET_DEV:
+		return dev == t.dev;
+	case TARGET_GROUP: {
+		struct bch_disk_groups_cpu *g;
+		const struct bch_devs_mask *m;
+		bool ret;
+
+		rcu_read_lock();
+		g = rcu_dereference(c->disk_groups);
+		m = g && t.group < g->nr && !g->entries[t.group].deleted
+			? &g->entries[t.group].devs
+			: NULL;
+
+		ret = m ? test_bit(dev, m->d) : false;
+		rcu_read_unlock();
+
+		return ret;
+	}
+	default:
+		BUG();
+	}
+}
+
+static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
+				  unsigned parent,
+				  const char *name, unsigned namelen)
+{
+	unsigned i, nr_groups = disk_groups_nr(groups);
+
+	if (!namelen || namelen > BCH_SB_LABEL_SIZE)
+		return -EINVAL;
+
+	for (i = 0; i < nr_groups; i++) {
+		struct bch_disk_group *g = groups->entries + i;
+
+		if (BCH_GROUP_DELETED(g))
+			continue;
+
+		/* match on parent plus exact label length and label bytes */
+		if (BCH_GROUP_PARENT(g) == parent &&
+		    strnlen(g->label, sizeof(g->label)) == namelen &&
+		    !memcmp(name, g->label, namelen))
+			return i;
+	}
+
+	return -1;
+}
+
+static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
+				 const char *name, unsigned namelen)
+{
+	struct bch_sb_field_disk_groups *groups =
+		bch2_sb_field_get(sb->sb, disk_groups);
+	unsigned i, nr_groups = disk_groups_nr(groups);
+	struct bch_disk_group *g;
+
+	if (!namelen || namelen > BCH_SB_LABEL_SIZE)
+		return -EINVAL;
+
+	for (i = 0;
+	     i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]);
+	     i++)
+		;
+
+	if (i == nr_groups) {
+		unsigned u64s =
+			(sizeof(struct bch_sb_field_disk_groups) +
+			 sizeof(struct bch_disk_group) * (nr_groups + 1)) /
+			sizeof(u64);
+
+		groups = bch2_sb_field_resize(sb, disk_groups, u64s);
+		if (!groups)
+			return -BCH_ERR_ENOSPC_disk_label_add;
+
+		nr_groups = disk_groups_nr(groups);
+	}
+
+	BUG_ON(i >= nr_groups);
+
+	g = &groups->entries[i];
+
+	memcpy(g->label, name, namelen);
+	if (namelen < sizeof(g->label))
+		g->label[namelen] = '\0';
+	SET_BCH_GROUP_DELETED(g, 0);
+	SET_BCH_GROUP_PARENT(g, parent);
+	SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
+
+	return i;
+}
+
+int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name)
+{
+	struct bch_sb_field_disk_groups *groups =
+		bch2_sb_field_get(sb->sb, disk_groups);
+	int v = -1;
+
+	do {
+		const char *next = strchrnul(name, '.');
+		unsigned len = next - name;
+
+		if (*next == '.')
+			next++;
+
+		v = __bch2_disk_group_find(groups, v + 1, name, len);
+		name = next;
+	} while (*name && v >= 0);
+
+	return v;
+}
+
+int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
+{
+	struct bch_sb_field_disk_groups *groups;
+	unsigned parent = 0;
+	int v = -1;
+
+	do {
+		const char *next = strchrnul(name, '.');
+		unsigned len = next - name;
+
+		if (*next == '.')
+			next++;
+
+		groups = bch2_sb_field_get(sb->sb, disk_groups);
+
+		v = __bch2_disk_group_find(groups, parent, name, len);
+		if (v < 0)
+			v = __bch2_disk_group_add(sb, parent, name, len);
+		if (v < 0)
+			return v;
+
+		parent = v + 1;
+		name = next;
+	} while (*name && v >= 0);
+
+	return v;
+}
+
+void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v)
+{
+	struct bch_sb_field_disk_groups *groups =
+		bch2_sb_field_get(sb, disk_groups);
+	struct bch_disk_group *g;
+	unsigned nr = 0;
+	u16 path[32];
+
+	while (1) {
+		if (nr == ARRAY_SIZE(path))
+			goto inval;
+
+		if (v >= disk_groups_nr(groups))
+			goto inval;
+
+		g = groups->entries + v;
+
+		if (BCH_GROUP_DELETED(g))
+			goto inval;
+
+		path[nr++] = v;
+
+		if (!BCH_GROUP_PARENT(g))
+			break;
+
+		v = BCH_GROUP_PARENT(g) - 1;
+	}
+
+	while (nr) {
+		v = path[--nr];
+		g = groups->entries + v;
+
+		prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
+		if (nr)
+			prt_printf(out, ".");
+	}
+	return;
+inval:
+	prt_printf(out, "invalid label %u", v);
+}
+
+int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
+{
+	struct bch_member *mi;
+	int ret, v = -1;
+
+	if (!strlen(name) || !strcmp(name, "none"))
+		return 0;
+
+	v = bch2_disk_path_find_or_create(&c->disk_sb, name);
+	if (v < 0)
+		return v;
+
+	ret = bch2_sb_disk_groups_to_cpu(c);
+	if (ret)
+		return ret;
+
+	mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+	SET_BCH_MEMBER_GROUP(mi, v + 1);
+	return 0;
+}
+
+int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
+{
+	int err;
+
+	mutex_lock(&c->sb_lock);
+	err = __bch2_dev_group_set(c, ca, name);
+	if (!err)	/* write the superblock out unless the update failed */
+		err = bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+	return err;
+}
+
+int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res,
+			  struct printbuf *err)
+{
+	struct bch_dev *ca;
+	int g;
+
+	if (!val)
+		return -EINVAL;
+
+	if (!c)
+		return 0;
+
+	if (!strlen(val) || !strcmp(val, "none")) {
+		*res = 0;
+		return 0;
+	}
+
+	/* Is it a device? */
+	ca = bch2_dev_lookup(c, val);
+	if (!IS_ERR(ca)) {
+		*res = dev_to_target(ca->dev_idx);
+		percpu_ref_put(&ca->ref);
+		return 0;
+	}
+
+	mutex_lock(&c->sb_lock);
+	g = bch2_disk_path_find(&c->disk_sb, val);
+	mutex_unlock(&c->sb_lock);
+
+	if (g >= 0) {
+		*res = group_to_target(g);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+void bch2_opt_target_to_text(struct printbuf *out,
+			     struct bch_fs *c,
+			     struct bch_sb *sb,
+			     u64 v)
+{
+	struct target t = target_decode(v);
+
+	switch (t.type) {
+	case TARGET_NULL:
+		prt_printf(out, "none");
+		break;
+	case TARGET_DEV:
+		if (c) {
+			struct bch_dev *ca;
+
+			rcu_read_lock();
+			ca = t.dev < c->sb.nr_devices
+				? rcu_dereference(c->devs[t.dev])
+				: NULL;
+
+			if (ca && percpu_ref_tryget(&ca->io_ref)) {
+				prt_printf(out, "/dev/%pg", ca->disk_sb.bdev);
+				percpu_ref_put(&ca->io_ref);
+			} else if (ca) {
+				prt_printf(out, "offline device %u", t.dev);
+			} else {
+				prt_printf(out, "invalid device %u", t.dev);
+			}
+
+			rcu_read_unlock();
+		} else if (bch2_dev_exists(sb, t.dev)) {
+			/* fetch the member entry only after t.dev is known to exist */
+			struct bch_member m = bch2_sb_member_get(sb, t.dev);
+
+			prt_printf(out, "Device ");
+			pr_uuid(out, m.uuid.b);
+			prt_printf(out, " (%u)", t.dev);
+		} else {
+			prt_printf(out, "Bad device %u", t.dev);
+		}
+		break;
+	case TARGET_GROUP:
+		/* with a live filesystem, read its superblock under sb_lock */
+		if (c) {
+			mutex_lock(&c->sb_lock);
+			bch2_disk_path_to_text(out, c->disk_sb.sb, t.group);
+			mutex_unlock(&c->sb_lock);
+		} else {
+			bch2_disk_path_to_text(out, sb, t.group);
+		}
+		break;
+	default:
+		BUG();
+	}
+}
diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h
new file mode 100644
index 000000000000..bd7711767fd4
--- /dev/null
+++ b/fs/bcachefs/disk_groups.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_GROUPS_H
+#define _BCACHEFS_DISK_GROUPS_H
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
+
+static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
+{
+	if (!groups)	/* a missing field holds no groups */
+		return 0;
+	return (vstruct_end(&groups->field) - (void *) &groups->entries[0]) /
+		sizeof(struct bch_disk_group);
+}
+
+struct target {
+	enum {
+		TARGET_NULL,
+		TARGET_DEV,
+		TARGET_GROUP,
+	}			type;
+	union {
+		unsigned	dev;
+		unsigned	group;
+	};
+};
+
+#define TARGET_DEV_START	1
+#define TARGET_GROUP_START	(256 + TARGET_DEV_START)
+
+static inline u16 dev_to_target(unsigned dev)
+{
+	return TARGET_DEV_START + dev;
+}
+
+static inline u16 group_to_target(unsigned group)
+{
+	return TARGET_GROUP_START + group;
+}
+
+static inline struct target target_decode(unsigned target)
+{
+	/* group targets sit above the 256 values reserved for device targets */
+	if (target >= TARGET_GROUP_START)
+		return (struct target) {
+			.type	= TARGET_GROUP,
+			.group	= target - TARGET_GROUP_START
+		};
+	if (target >= TARGET_DEV_START)
+		return (struct target) {
+			.type	= TARGET_DEV,
+			.dev	= target - TARGET_DEV_START
+		};
+
+	return (struct target) { .type = TARGET_NULL };
+}
+
+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
+
+static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c,
+						  enum bch_data_type data_type,
+						  u16 target)
+{
+	struct bch_devs_mask devs = c->rw_devs[data_type];
+	const struct bch_devs_mask *t = bch2_target_to_mask(c, target);
+
+	if (t)
+		bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
+	return devs;
+}
+
+static inline bool bch2_target_accepts_data(struct bch_fs *c,
+					    enum bch_data_type data_type,
+					    u16 target)
+{
+	struct bch_devs_mask rw_devs = target_rw_devs(c, data_type, target);
+	return !bitmap_empty(rw_devs.d, BCH_SB_MEMBERS_MAX);
+}
+
+bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
+
+int bch2_disk_path_find(struct bch_sb_handle *, const char *);
+
+/* Exported for userspace bcachefs-tools: */
+int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
+
+void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned);
+
+int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
+void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
+
+#define bch2_opt_target (struct bch_opt_fn) {		\
+	.parse		= bch2_opt_target_parse,	\
+	.to_text	= bch2_opt_target_to_text,	\
+}
+
+int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
+
+int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
+int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
+
+const char *bch2_sb_validate_disk_groups(struct bch_sb *,
+					 struct bch_sb_field *);
+
+void bch2_disk_groups_to_text(struct printbuf *, struct bch_fs *);
+
+#endif /* _BCACHEFS_DISK_GROUPS_H */
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
new file mode 100644
index 000000000000..8646856e4539
--- /dev/null
+++ b/fs/bcachefs/ec.c
@@ -0,0 +1,1966 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* erasure coding */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "backpointers.h"
+#include "bkey_buf.h"
+#include "bset.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "io_read.h"
+#include "keylist.h"
+#include "recovery.h"
+#include "replicas.h"
+#include "super-io.h"
+#include "util.h"
+
+#include <linux/sort.h>
+
+#ifdef __KERNEL__
+
+#include <linux/raid/pq.h>
+#include <linux/raid/xor.h>
+
+static void raid5_recov(unsigned disks, unsigned failed_idx,
+			size_t size, void **data)
+{
+	unsigned i = 2, nr;
+
+	BUG_ON(failed_idx >= disks);
+
+	swap(data[0], data[failed_idx]);
+	memcpy(data[0], data[1], size);
+
+	while (i < disks) {
+		nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
+		xor_blocks(nr, size, data[0], data + i);
+		i += nr;
+	}
+
+	swap(data[0], data[failed_idx]);
+}
+
+static void raid_gen(int nd, int np, size_t size, void **v)
+{
+	BUG_ON(np > 2);	/* only P (XOR) and Q (syndrome) parity are handled */
+	if (np >= 1)	/* P spans the nd data blocks only, never the Q slot */
+		raid5_recov(nd + 1, nd, size, v);
+	if (np >= 2)
+		raid6_call.gen_syndrome(nd + np, size, v);
+}
+
+static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
+{
+	switch (nr) {
+	case 0:
+		break;
+	case 1:
+		if (ir[0] < nd + 1)
+			raid5_recov(nd + 1, ir[0], size, v);
+		else
+			raid6_call.gen_syndrome(nd + np, size, v);
+		break;
+	case 2:
+		if (ir[1] < nd) {
+			/* data+data failure. */
+			raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
+		} else if (ir[0] < nd) {
+			/* data + p/q failure */
+
+			if (ir[1] == nd) /* data + p failure */
+				raid6_datap_recov(nd + np, size, ir[0], v);
+			else { /* data + q failure */
+				raid5_recov(nd + 1, ir[0], size, v);
+				raid6_call.gen_syndrome(nd + np, size, v);
+			}
+		} else {
+			raid_gen(nd, np, size, v);
+		}
+		break;
+	default:
+		BUG();
+	}
+}
+
+#else
+
+#include <raid/raid.h>
+
+#endif
+
+struct ec_bio {
+	struct bch_dev		*ca;
+	struct ec_stripe_buf	*buf;
+	size_t			idx;
+	struct bio		bio;
+};
+
+/* Stripes btree keys: */
+
+int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k,
+			enum bkey_invalid_flags flags,
+			struct printbuf *err)
+{
+	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+
+	if (bkey_eq(k.k->p, POS_MIN)) {
+		prt_printf(err, "stripe at POS_MIN");
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	if (k.k->p.inode) {
+		prt_printf(err, "nonzero inode field");
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	if (bkey_val_u64s(k.k) < stripe_val_u64s(s)) {
+		prt_printf(err, "incorrect value size (%zu < %u)",
+		       bkey_val_u64s(k.k), stripe_val_u64s(s));
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	return bch2_bkey_ptrs_invalid(c, k, flags, err);
+}
+
+void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
+			 struct bkey_s_c k)
+{
+	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+	unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
+
+	prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
+	       s->algorithm,
+	       le16_to_cpu(s->sectors),
+	       nr_data,
+	       s->nr_redundant,
+	       s->csum_type,
+	       1U << s->csum_granularity_bits);
+
+	for (i = 0; i < s->nr_blocks; i++) {
+		const struct bch_extent_ptr *ptr = s->ptrs + i;
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+		u32 offset;
+		u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+
+		prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset);
+		if (i < nr_data)
+			prt_printf(out, "#%u", stripe_blockcount_get(s, i));
+		if (ptr_stale(ca, ptr))
+			prt_printf(out, " stale");
+	}
+}
+
+/* returns the extent pointer that matched; its block number in the stripe is stored in *block: */
+static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
+						struct bkey_s_c k, unsigned *block)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *ptr;
+	unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
+
+	bkey_for_each_ptr(ptrs, ptr)
+		for (i = 0; i < nr_data; i++)
+			if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr,
+						      le16_to_cpu(s->sectors))) {
+				*block = i;
+				return ptr;
+			}
+
+	return NULL;
+}
+
+static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
+{
+	const union bch_extent_entry *entry;
+	struct bkey_s_c_extent e;
+
+	/* only extent keys are searched; any other key type has no match */
+	if (k.k->type != KEY_TYPE_extent)
+		return false;
+
+	e = bkey_s_c_to_extent(k);
+
+	/* look for a stripe_ptr entry that refers to stripe idx */
+	extent_for_each_entry(e, entry) {
+		if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr &&
+		    entry->stripe_ptr.idx == idx)
+			return true;
+	}
+	return false;
+}
+
+/* Stripe bufs: */
+
+static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
+{
+	if (buf->key.k.type == KEY_TYPE_stripe) {
+		struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key);
+		unsigned i;
+
+		for (i = 0; i < s->v.nr_blocks; i++) {
+			kvpfree(buf->data[i], buf->size << 9);
+			buf->data[i] = NULL;
+		}
+	}
+}
+
+/* XXX: this is a non-mempoolified memory allocation: */
+static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
+			      unsigned offset, unsigned size)
+{
+	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+	unsigned csum_granularity = 1U << v->csum_granularity_bits;
+	unsigned end = offset + size;
+	unsigned i;
+
+	BUG_ON(end > le16_to_cpu(v->sectors));
+
+	offset	= round_down(offset, csum_granularity);
+	end	= min_t(unsigned, le16_to_cpu(v->sectors),
+			round_up(end, csum_granularity));
+
+	buf->offset	= offset;
+	buf->size	= end - offset;
+
+	memset(buf->valid, 0xFF, sizeof(buf->valid));
+
+	for (i = 0; i < v->nr_blocks; i++) {
+		buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL);
+		if (!buf->data[i])
+			goto err;
+	}
+
+	return 0;
+err:
+	ec_stripe_buf_exit(buf);
+	return -BCH_ERR_ENOMEM_stripe_buf;
+}
+
+/* Checksumming: */
+
+static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
+					 unsigned block, unsigned offset)
+{
+	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+	unsigned csum_granularity = 1U << v->csum_granularity_bits;
+	unsigned end = buf->offset + buf->size;
+	unsigned len = min(csum_granularity, end - offset);
+
+	BUG_ON(offset >= end);
+	BUG_ON(offset <  buf->offset);
+	BUG_ON(offset & (csum_granularity - 1));
+	BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
+	       (len & (csum_granularity - 1)));
+
+	/* offset and len count 512-byte units; << 9 turns them into bytes */
+	return bch2_checksum(NULL, v->csum_type, null_nonce(),
+			     buf->data[block] + ((offset - buf->offset) << 9),
+			     len << 9);
+}
+
+static void ec_generate_checksums(struct ec_stripe_buf *buf)
+{
+	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+	unsigned i, j, csums_per_device = stripe_csums_per_device(v);
+
+	if (!v->csum_type)
+		return;
+
+	BUG_ON(buf->offset);
+	BUG_ON(buf->size != le16_to_cpu(v->sectors));
+
+	for (i = 0; i < v->nr_blocks; i++)
+		for (j = 0; j < csums_per_device; j++)
+			stripe_csum_set(v, i, j,
+				ec_block_checksum(buf, i, j << v->csum_granularity_bits));
+}
+
+static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
+{
+	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+	unsigned csum_granularity = 1U << v->csum_granularity_bits;
+	unsigned i;
+
+	if (!v->csum_type)
+		return;	/* stripe has no checksum type set: nothing to verify */
+
+	for (i = 0; i < v->nr_blocks; i++) {
+		unsigned offset = buf->offset;
+		unsigned end = buf->offset + buf->size;
+
+		if (!test_bit(i, buf->valid))
+			continue;	/* block is already marked invalid */
+
+		while (offset < end) {
+			unsigned j = offset >> v->csum_granularity_bits;
+			unsigned len = min(csum_granularity, end - offset);
+			struct bch_csum want = stripe_csum_get(v, i, j);
+			struct bch_csum got = ec_block_checksum(buf, i, offset);
+
+			if (bch2_crc_cmp(want, got)) {
+				struct printbuf buf2 = PRINTBUF;
+
+				bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key));
+
+				bch_err_ratelimited(c,
+					"stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s",
+					(void *) _RET_IP_, i, j, v->csum_type,
+					want.lo, got.lo, buf2.buf);
+				printbuf_exit(&buf2);
+				clear_bit(i, buf->valid);
+				break;	/* first mismatch invalidates the whole block */
+			}
+
+			offset += len;
+		}
+	}
+}
+
+/* Erasure coding: */
+
+static void ec_generate_ec(struct ec_stripe_buf *buf)
+{
+	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+	unsigned nr_data = v->nr_blocks - v->nr_redundant;
+	unsigned bytes = le16_to_cpu(v->sectors) << 9;
+
+	raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
+}
+
+static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
+{
+	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+
+	return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks);
+}
+
+static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
+{
+	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+	unsigned nr_data = v->nr_blocks - v->nr_redundant;
+	unsigned i, nr_failed = 0, bytes = buf->size << 9;
+	int failed[BCH_BKEY_PTRS_MAX];	/* raid_rec() takes int block indices */
+
+	if (ec_nr_failed(buf) > v->nr_redundant) {
+		bch_err_ratelimited(c,
+			"error doing reconstruct read: unable to read enough blocks");
+		return -EIO;	/* more blocks invalid than nr_redundant allows */
+	}
+
+	for (i = 0; i < nr_data; i++)
+		if (!test_bit(i, buf->valid))
+			failed[nr_failed++] = i;
+
+	raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
+	return 0;
+}
+
+/* IO: */
+
+static void ec_block_endio(struct bio *bio)
+{
+	struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
+	struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v;
+	struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
+	struct bch_dev *ca = ec_bio->ca;
+	struct closure *cl = bio->bi_private;
+
+	if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s",
+			       bio_data_dir(bio) ? "write" : "read",
+			       bch2_blk_status_to_str(bio->bi_status)))
+		clear_bit(ec_bio->idx, ec_bio->buf->valid);
+
+	if (ptr_stale(ca, ptr)) {
+		bch_err_ratelimited(ca->fs,
+				    "error %s stripe: stale pointer after io",
+				    bio_data_dir(bio) == READ ? "reading from" : "writing to");
+		clear_bit(ec_bio->idx, ec_bio->buf->valid);
+	}
+
+	bio_put(&ec_bio->bio);
+	percpu_ref_put(&ca->io_ref);
+	closure_put(cl);
+}
+
+static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
+			blk_opf_t opf, unsigned idx, struct closure *cl)
+{
+	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+	unsigned offset = 0, bytes = buf->size << 9;
+	struct bch_extent_ptr *ptr = &v->ptrs[idx];
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+	enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant
+		? BCH_DATA_user
+		: BCH_DATA_parity;
+	int rw = op_is_write(opf);
+
+	if (ptr_stale(ca, ptr)) {
+		bch_err_ratelimited(c,
+				    "error %s stripe: stale pointer",
+				    rw == READ ? "reading from" : "writing to");
+		clear_bit(idx, buf->valid);
+		return;
+	}
+
+	if (!bch2_dev_get_ioref(ca, rw)) {
+		clear_bit(idx, buf->valid);
+		return;
+	}
+
+	this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);
+
+	while (offset < bytes) {
+		unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
+					   DIV_ROUND_UP(bytes, PAGE_SIZE));
+		unsigned b = min_t(size_t, bytes - offset,
+				   nr_iovecs << PAGE_SHIFT);
+		struct ec_bio *ec_bio;
+
+		ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
+						       nr_iovecs,
+						       opf,
+						       GFP_KERNEL,
+						       &c->ec_bioset),
+				      struct ec_bio, bio);
+
+		ec_bio->ca			= ca;
+		ec_bio->buf			= buf;
+		ec_bio->idx			= idx;
+
+		ec_bio->bio.bi_iter.bi_sector	= ptr->offset + buf->offset + (offset >> 9);
+		ec_bio->bio.bi_end_io		= ec_block_endio;
+		ec_bio->bio.bi_private		= cl;
+
+		bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);
+
+		closure_get(cl);
+		percpu_ref_get(&ca->io_ref);
+
+		submit_bio(&ec_bio->bio);
+
+		offset += b;
+	}
+
+	percpu_ref_put(&ca->io_ref);
+}
+
+static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
+				struct ec_stripe_buf *stripe)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
+			       POS(0, idx), BTREE_ITER_SLOTS);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+	if (k.k->type != KEY_TYPE_stripe) {
+		ret = -ENOENT;
+		goto err;
+	}
+	bkey_reassemble(&stripe->key, k);
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
+{
+	return bch2_trans_run(c, get_stripe_key_trans(trans, idx, stripe));
+}
+
+/* recovery read path: */
+int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
+{
+	struct ec_stripe_buf *buf;
+	struct closure cl;
+	struct bch_stripe *v;
+	unsigned i, offset;
+	int ret = 0;
+
+	closure_init_stack(&cl);
+
+	BUG_ON(!rbio->pick.has_ec);
+
+	buf = kzalloc(sizeof(*buf), GFP_NOFS);
+	if (!buf)
+		return -BCH_ERR_ENOMEM_ec_read_extent;
+
+	ret = get_stripe_key(c, rbio->pick.ec.idx, buf);
+	if (ret) {
+		bch_err_ratelimited(c,
+			"error doing reconstruct read: error %i looking up stripe", ret);
+		kfree(buf);
+		return -EIO;
+	}
+
+	v = &bkey_i_to_stripe(&buf->key)->v;
+
+	if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
+		bch_err_ratelimited(c,
+			"error doing reconstruct read: pointer doesn't match stripe");
+		ret = -EIO;
+		goto err;
+	}
+
+	offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
+	if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
+		bch_err_ratelimited(c,
+			"error doing reconstruct read: read is bigger than stripe");
+		ret = -EIO;
+		goto err;
+	}
+
+	ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
+	if (ret)
+		goto err;
+
+	for (i = 0; i < v->nr_blocks; i++)
+		ec_block_io(c, buf, REQ_OP_READ, i, &cl);
+
+	closure_sync(&cl);
+
+	if (ec_nr_failed(buf) > v->nr_redundant) {
+		bch_err_ratelimited(c,
+			"error doing reconstruct read: unable to read enough blocks");
+		ret = -EIO;
+		goto err;
+	}
+
+	ec_validate_checksums(c, buf);
+
+	ret = ec_do_recov(c, buf);
+	if (ret)
+		goto err;
+
+	memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
+		      buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
+err:
+	ec_stripe_buf_exit(buf);
+	kfree(buf);
+	return ret;
+}
+
+/* stripe bucket accounting: */
+
+static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
+{
+	ec_stripes_heap n, *h = &c->ec_stripes_heap;
+
+	if (idx >= h->size) {
+		if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
+			return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
+
+		mutex_lock(&c->ec_stripes_heap_lock);
+		if (n.size > h->size) {
+			memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
+			n.used = h->used;
+			swap(*h, n);
+		}
+		mutex_unlock(&c->ec_stripes_heap_lock);
+
+		free_heap(&n);
+	}
+
+	if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
+		return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
+
+	if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
+	    !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
+		return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
+
+	return 0;
+}
+
+static int ec_stripe_mem_alloc(struct btree_trans *trans,
+			       struct btree_iter *iter)
+{
+	return allocate_dropping_locks_errcode(trans,
+			__ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
+}
+
+/*
+ * Hash table of open stripes:
+ * Stripes that are being created or modified are kept in a hash table, so that
+ * stripe deletion can skip them.
+ */
+
+static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
+{
+	unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
+	struct ec_stripe_new *s;
+
+	hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash)
+		if (s->idx == idx)
+			return true;
+	return false;
+}
+
+static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx)
+{
+	bool is_open;
+
+	/* look the index up under the same lock the insert/remove paths take */
+	spin_lock(&c->ec_stripes_new_lock);
+	is_open = __bch2_stripe_is_open(c, idx);
+	spin_unlock(&c->ec_stripes_new_lock);
+	return is_open;
+}
+
+static bool bch2_try_open_stripe(struct bch_fs *c,
+				 struct ec_stripe_new *s,
+				 u64 idx)
+{
+	bool ret;
+
+	spin_lock(&c->ec_stripes_new_lock);
+	ret = !__bch2_stripe_is_open(c, idx);
+	if (ret) {
+		unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
+
+		s->idx = idx;
+		hlist_add_head(&s->hash, &c->ec_stripes_new[hash]);
+	}
+	spin_unlock(&c->ec_stripes_new_lock);
+
+	return ret;
+}
+
+/* Unhash @s from the open-stripes table and clear its index; @s->idx must be set. */
+static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
+{
+	BUG_ON(!s->idx);
+
+	spin_lock(&c->ec_stripes_new_lock);
+	hlist_del_init(&s->hash);
+	spin_unlock(&c->ec_stripes_new_lock);
+	s->idx = 0;
+}
+
+/* Heap of all existing stripes, ordered by blocks_nonempty */
+
+/* Index of the heap's top stripe if it is empty and not open, otherwise 0. */
+static u64 stripe_idx_to_delete(struct bch_fs *c)
+{
+	ec_stripes_heap *heap = &c->ec_stripes_heap;
+
+	lockdep_assert_held(&c->ec_stripes_heap_lock);
+
+	if (!heap->used || heap->data[0].blocks_nonempty != 0)
+		return 0;
+	if (bch2_stripe_is_open(c, heap->data[0].idx))
+		return 0;
+	return heap->data[0].idx;
+}
+
+/* Three-way compare of heap entries by blocks_nonempty: -1, 0 or 1. */
+static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, struct ec_stripe_heap_entry l,
+				      struct ec_stripe_heap_entry r)
+{
+	return l.blocks_nonempty < r.blocks_nonempty ? -1 :
+	       l.blocks_nonempty > r.blocks_nonempty;
+}
+
+static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, size_t i)
+{
+	struct bch_fs *fs = container_of(h, struct bch_fs, ec_stripes_heap);
+	struct stripe *m = genradix_ptr(&fs->stripes, h->data[i].idx);
+
+	m->heap_idx = i;	/* heap callback: record the entry's current slot */
+}
+
+/* Assert that stripe @idx and its heap slot refer to each other. */
+static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
+{
+	struct stripe *m = genradix_ptr(&c->stripes, idx);
+
+	BUG_ON(m->heap_idx >= c->ec_stripes_heap.used);
+	BUG_ON(c->ec_stripes_heap.data[m->heap_idx].idx != idx);
+}
+
+/* Remove stripe @idx (in-memory state @m) from the stripes heap. */
+void bch2_stripes_heap_del(struct bch_fs *c, struct stripe *m, size_t idx)
+{
+	mutex_lock(&c->ec_stripes_heap_lock);
+	heap_verify_backpointer(c, idx);
+
+	heap_del(&c->ec_stripes_heap, m->heap_idx,
+		 ec_stripes_heap_cmp,
+		 ec_stripes_heap_set_backpointer);
+	mutex_unlock(&c->ec_stripes_heap_lock);
+}
+
+/* Add stripe @idx to the stripes heap, keyed by @m->blocks_nonempty. */
+void bch2_stripes_heap_insert(struct bch_fs *c, struct stripe *m, size_t idx)
+{
+	mutex_lock(&c->ec_stripes_heap_lock);
+	BUG_ON(heap_full(&c->ec_stripes_heap));
+
+	heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
+			.idx = idx,
+			.blocks_nonempty = m->blocks_nonempty,
+		}),
+		 ec_stripes_heap_cmp,
+		 ec_stripes_heap_set_backpointer);
+
+	heap_verify_backpointer(c, idx);
+	mutex_unlock(&c->ec_stripes_heap_lock);
+}
+
+/* Refresh stripe @idx's heap key from @m and restore heap order. */
+void bch2_stripes_heap_update(struct bch_fs *c, struct stripe *m, size_t idx)
+{
+	ec_stripes_heap *heap = &c->ec_stripes_heap;
+	bool have_deletable;
+	size_t slot;
+
+	mutex_lock(&c->ec_stripes_heap_lock);
+	heap_verify_backpointer(c, idx);
+
+	slot = m->heap_idx;
+	heap->data[slot].blocks_nonempty = m->blocks_nonempty;
+
+	/* sift in both directions, starting from the entry's current slot: */
+	heap_sift_up(heap, slot, ec_stripes_heap_cmp, ec_stripes_heap_set_backpointer);
+	heap_sift_down(heap, slot, ec_stripes_heap_cmp, ec_stripes_heap_set_backpointer);
+
+	heap_verify_backpointer(c, idx);
+
+	have_deletable = stripe_idx_to_delete(c) != 0;
+	mutex_unlock(&c->ec_stripes_heap_lock);
+
+	/* the deletion work is kicked only after the heap lock is dropped */
+	if (have_deletable)
+		bch2_do_stripe_deletes(c);
+}
+
+/* stripe deletion */
+
+/* Delete the btree key of stripe @idx; refused unless every block count in it is zero. */
+static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bkey_s_c_stripe s;
+	int ret;
+
+	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_INTENT);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	if (k.k->type != KEY_TYPE_stripe) {
+		bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
+		ret = -EINVAL;
+		goto err;
+	}
+
+	s = bkey_s_c_to_stripe(k);
+	for (unsigned i = 0; i < s.v->nr_blocks; i++)
+		if (stripe_blockcount_get(s.v, i)) {
+			struct printbuf buf = PRINTBUF;
+
+			bch2_bkey_val_to_text(&buf, c, k);
+			bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
+			printbuf_exit(&buf);
+			ret = -EINVAL;
+			goto err;
+		}
+
+	ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+/* Worker: delete empty stripes one at a time until none is left or a commit fails. */
+static void ec_stripe_delete_work(struct work_struct *work)
+{
+	struct bch_fs *c = container_of(work, struct bch_fs, ec_stripe_delete_work);
+	struct btree_trans *trans = bch2_trans_get(c);
+	int ret;
+	u64 idx;
+
+	for (;;) {
+		mutex_lock(&c->ec_stripes_heap_lock);
+		idx = stripe_idx_to_delete(c);
+		mutex_unlock(&c->ec_stripes_heap_lock);
+
+		if (!idx)
+			break;
+
+		ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+				ec_stripe_delete(trans, idx));
+		if (ret) {
+			bch_err_fn(c, ret);
+			break;
+		}
+	}
+
+	bch2_trans_put(trans);
+
+	bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
+}
+
+void bch2_do_stripe_deletes(struct bch_fs *c)
+{
+	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete))
+		return;
+	if (!queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))	/* already queued */
+		bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
+}
+
+/* stripe creation: */
+
+/* Write stripe key @new: @create expects an empty slot, otherwise an existing stripe. */
+static int ec_stripe_key_update(struct btree_trans *trans, struct bkey_i_stripe *new,
+				bool create)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, new->k.p, BTREE_ITER_INTENT);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) {
+		bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s",
+				     create ? "creating" : "updating",
+				     bch2_bkey_types[k.k->type]);
+		ret = -EINVAL;
+		goto err;
+	}
+
+	if (k.k->type == KEY_TYPE_stripe) {
+		const struct bch_stripe *old = bkey_s_c_to_stripe(k).v;
+		unsigned i;
+
+		if (old->nr_blocks != new->v.nr_blocks) {
+			bch_err(c, "error updating stripe: nr_blocks does not match");
+			ret = -EINVAL;
+			goto err;
+		}
+
+		/* carry the existing per-block counts over into @new: */
+		for (i = 0; i < new->v.nr_blocks; i++) {
+			unsigned v = stripe_blockcount_get(old, i);
+
+			BUG_ON(v &&
+			       (old->ptrs[i].dev != new->v.ptrs[i].dev ||
+				old->ptrs[i].gen != new->v.ptrs[i].gen ||
+				old->ptrs[i].offset != new->v.ptrs[i].offset));
+
+			stripe_blockcount_set(&new->v, i, v);
+		}
+	}
+
+	ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+/* Handle one backpointer of @bucket: add a pointer to stripe @s to the extent it names. */
+static int ec_stripe_update_extent(struct btree_trans *trans,
+				   struct bpos bucket, u8 gen,
+				   struct ec_stripe_buf *s,
+				   struct bpos *bp_pos)
+{
+	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
+	struct bch_fs *c = trans->c;
+	struct bch_backpointer bp;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	const struct bch_extent_ptr *ptr_c;
+	struct bch_extent_ptr *ptr, *ec_ptr = NULL;
+	struct bch_extent_stripe_ptr stripe_ptr;
+	struct bkey_i *n;
+	int ret, dev, block;
+
+	ret = bch2_get_next_backpointer(trans, bucket, gen, bp_pos, &bp, BTREE_ITER_CACHED);
+	if (ret)
+		return ret;
+	if (bpos_eq(*bp_pos, SPOS_MAX))
+		return 0;
+
+	if (bp.level) {
+		struct printbuf buf = PRINTBUF;
+		struct btree_iter node_iter;
+		struct btree *b;
+
+		b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp);
+		bch2_trans_iter_exit(trans, &node_iter);
+		/* NOTE(review): only NULL is checked; confirm b can never be an ERR_PTR here */
+		if (!b)
+			return 0;
+
+		prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
+		bch2_backpointer_to_text(&buf, &bp);
+
+		bch2_fs_inconsistent(c, "%s", buf.buf);
+		printbuf_exit(&buf);
+		return -EIO;
+	}
+
+	k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT);
+	ret = bkey_err(k);
+	if (ret)
+		return ret;
+	if (!k.k) {
+		/*
+		 * extent no longer exists - we could flush the btree
+		 * write buffer and retry to verify, but no need:
+		 */
+		return 0;
+	}
+
+	if (extent_has_stripe_ptr(k, s->key.k.p.offset))
+		goto out;
+
+	ptr_c = bkey_matches_stripe(v, k, &block);
+	/*
+	 * It doesn't generally make sense to erasure code cached ptrs:
+	 * XXX: should we be incrementing a counter?
+	 */
+	if (!ptr_c || ptr_c->cached)
+		goto out;
+
+	dev = v->ptrs[block].dev;
+
+	n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr));
+	ret = PTR_ERR_OR_ZERO(n);
+	if (ret)
+		goto out;
+
+	bkey_reassemble(n, k);
+
+	bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
+	ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
+	BUG_ON(!ec_ptr);
+
+	stripe_ptr = (struct bch_extent_stripe_ptr) {
+		.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
+		.block		= block,
+		.redundancy	= v->nr_redundant,
+		.idx		= s->key.k.p.offset,
+	};
+
+	__extent_entry_insert(n,
+			(union bch_extent_entry *) ec_ptr,
+			(union bch_extent_entry *) &stripe_ptr);
+
+	ret = bch2_trans_update(trans, &iter, n, 0);
+out:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+/*
+ * Walk the backpointers of stripe block @block's bucket, one commit per step.
+ */
+static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
+				   unsigned block)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
+	struct bch_extent_ptr bucket = v->ptrs[block];
+	struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket);
+	struct bpos bp_pos = POS_MIN;
+	int ret;
+
+	do {
+		ret = commit_do(trans, NULL, NULL,
+				BTREE_INSERT_NOCHECK_RW|BTREE_INSERT_NOFAIL,
+			ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
+						s, &bp_pos));
+		if (ret || bkey_eq(bp_pos, POS_MAX))
+			break;
+
+		bp_pos = bpos_nosnap_successor(bp_pos);
+	} while (1);
+
+	return ret;
+}
+
+/*
+ * Add stripe pointers to the extents in every data block's bucket of @s.
+ */
+static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
+	unsigned nr_data = v->nr_blocks - v->nr_redundant;
+	unsigned block = 0;
+	int ret;
+
+	ret = bch2_btree_write_buffer_flush(trans);
+
+	/* stop at the first error, whether from the flush or from a bucket */
+	while (!ret && block < nr_data)
+		ret = ec_stripe_update_bucket(trans, s, block++);
+
+	bch2_trans_put(trans);
+
+	return ret;
+}
+
+/*
+ * Zero the unwritten tail of data bucket @ob, both in the in-memory stripe
+ * buffer and on the device; a failure is recorded in @s->err.
+ */
+static void zero_out_rest_of_ec_bucket(struct bch_fs *c, struct ec_stripe_new *s,
+				       unsigned block, struct open_bucket *ob)
+{
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+	unsigned offset = ca->mi.bucket_size - ob->sectors_free;
+	int ret;
+
+	if (!bch2_dev_get_ioref(ca, WRITE)) {
+		s->err = -BCH_ERR_erofs_no_writes;
+		return;
+	}
+
+	memset(s->new_stripe.data[block] + (offset << 9), 0, ob->sectors_free << 9);
+
+	ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
+			ob->bucket * ca->mi.bucket_size + offset,
+			ob->sectors_free,
+			GFP_KERNEL, 0);
+
+	percpu_ref_put(&ca->io_ref);
+
+	if (ret)
+		s->err = ret;
+}
+
+void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
+{
+	if (s->idx)	/* nonzero idx: still hashed in the open-stripes table */
+		bch2_stripe_close(c, s);
+	kfree(s);
+}
+
+/*
+ * Data buckets of the new stripe are all written: write parity, then the key.
+ */
+static void ec_stripe_create(struct ec_stripe_new *s)
+{
+	struct bch_fs *c = s->c;
+	struct open_bucket *ob;
+	struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
+	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
+	int ret;
+
+	BUG_ON(s->h->s == s);
+
+	closure_sync(&s->iodone);
+
+	if (!s->err) {	/* zero-fill the unused tail of each data bucket */
+		for (i = 0; i < nr_data; i++)
+			if (s->blocks[i]) {
+				ob = c->open_buckets + s->blocks[i];
+
+				if (ob->sectors_free)
+					zero_out_rest_of_ec_bucket(c, s, i, ob);
+			}
+	}
+
+	if (s->err) {
+		if (!bch2_err_matches(s->err, EROFS))
+			bch_err(c, "error creating stripe: error writing data buckets");
+		goto err;
+	}
+
+	if (s->have_existing_stripe) {
+		ec_validate_checksums(c, &s->existing_stripe);
+
+		if (ec_do_recov(c, &s->existing_stripe)) {
+			bch_err(c, "error creating stripe: error reading existing stripe");
+			goto err;
+		}
+
+		/* blocks with a nonzero count take their data from the existing stripe: */
+		for (i = 0; i < nr_data; i++)
+			if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i))
+				swap(s->new_stripe.data[i],
+				     s->existing_stripe.data[i]);
+
+		ec_stripe_buf_exit(&s->existing_stripe);
+	}
+
+	BUG_ON(!s->allocated);
+	BUG_ON(!s->idx);
+
+	ec_generate_ec(&s->new_stripe);
+
+	ec_generate_checksums(&s->new_stripe);
+
+	/* write p/q: */
+	for (i = nr_data; i < v->nr_blocks; i++)
+		ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
+	closure_sync(&s->iodone);
+
+	if (ec_nr_failed(&s->new_stripe)) {
+		bch_err(c, "error creating stripe: error writing redundancy buckets");
+		goto err;
+	}
+
+	ret = bch2_trans_do(c, &s->res, NULL,
+			    BTREE_INSERT_NOCHECK_RW|
+			    BTREE_INSERT_NOFAIL,
+			    ec_stripe_key_update(trans,
+					bkey_i_to_stripe(&s->new_stripe.key),
+					!s->have_existing_stripe));
+	if (ret) {
+		bch_err(c, "error creating stripe: error creating stripe key");
+		goto err;
+	}
+
+	ret = ec_stripe_update_extents(c, &s->new_stripe);
+	if (ret) {
+		bch_err_msg(c, ret, "creating stripe: error updating pointers");
+		goto err;
+	}
+err:	/* the success path falls through to this common cleanup as well */
+	bch2_disk_reservation_put(c, &s->res);
+
+	for (i = 0; i < v->nr_blocks; i++)
+		if (s->blocks[i]) {
+			ob = c->open_buckets + s->blocks[i];
+
+			if (i < nr_data) {
+				ob->ec = NULL;
+				__bch2_open_bucket_put(c, ob);
+			} else {
+				bch2_open_bucket_put(c, ob);
+			}
+		}
+
+	mutex_lock(&c->ec_stripe_new_lock);
+	list_del(&s->list);
+	mutex_unlock(&c->ec_stripe_new_lock);
+	wake_up(&c->ec_stripe_new_wait);	/* bch2_fs_ec_flush() waits on this queue */
+
+	ec_stripe_buf_exit(&s->existing_stripe);
+	ec_stripe_buf_exit(&s->new_stripe);
+	closure_debug_destroy(&s->iodone);
+
+	ec_stripe_new_put(c, s, STRIPE_REF_stripe);
+}
+
+/* First stripe on the new-stripe list whose STRIPE_REF_io count is zero, or NULL. */
+static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
+{
+	struct ec_stripe_new *pos, *found = NULL;
+
+	mutex_lock(&c->ec_stripe_new_lock);
+	list_for_each_entry(pos, &c->ec_stripe_new_list, list)
+		if (!atomic_read(&pos->ref[STRIPE_REF_io])) {
+			found = pos;
+			break;
+		}
+	mutex_unlock(&c->ec_stripe_new_lock);
+	return found;
+}
+
+/* Worker: create every pending stripe, then drop the stripe_create write ref. */
+static void ec_stripe_create_work(struct work_struct *work)
+{
+	struct bch_fs *c = container_of(work, struct bch_fs, ec_stripe_create_work);
+	struct ec_stripe_new *s;
+
+	for (s = get_pending_stripe(c); s; s = get_pending_stripe(c))
+		ec_stripe_create(s);
+
+	bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
+}
+
+void bch2_ec_do_stripe_creates(struct bch_fs *c)
+{
+	bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create);	/* put by ec_stripe_create_work() */
+
+	if (!queue_work(system_long_wq, &c->ec_stripe_create_work))	/* already queued */
+		bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
+}
+
+static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
+{
+	struct ec_stripe_new *s = h->s;
+
+	BUG_ON(!s->allocated && !s->err);
+
+	h->s		= NULL;	/* detach the stripe from its head */
+	s->pending	= true;
+
+	mutex_lock(&c->ec_stripe_new_lock);
+	list_add(&s->list, &c->ec_stripe_new_list);	/* get_pending_stripe() scans this list */
+	mutex_unlock(&c->ec_stripe_new_lock);
+
+	ec_stripe_new_put(c, s, STRIPE_REF_io);
+}
+
+void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
+{
+	struct ec_stripe_new *s = ob->ec;	/* dereferenced unchecked: @ob must have a stripe */
+
+	s->err = -EIO;	/* ec_stripe_create() bails out when s->err is set */
+}
+
+/* Buffer address for the write point's erasure-coded bucket, or NULL if it has none. */
+void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
+{
+	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
+	unsigned sectors_used;
+
+	if (!ob)
+		return NULL;
+
+	BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);
+
+	/* bucket size minus the free sectors gives the offset into the block buffer: */
+	sectors_used = bch_dev_bkey_exists(c, ob->dev)->mi.bucket_size - ob->sectors_free;
+
+	return ob->ec->new_stripe.data[ob->ec_idx] + (sectors_used << 9);
+}
+
+/* sort() comparator for arrays of unsigned */
+static int unsigned_cmp(const void *_l, const void *_r)
+{
+	const unsigned *l = _l, *r = _r;
+
+	return cmp_int(*l, *r);
+}
+
+/*
+ * Return the bucket size shared by the most devices in @devs (0 if there are
+ * none); on a tie the smallest such size wins.
+ * The only caller in view holds rcu_read_lock() around this call.
+ */
+static unsigned pick_blocksize(struct bch_fs *c,
+			       struct bch_devs_mask *devs)
+{
+	struct bch_dev *ca;
+	unsigned i, nr = 0;
+	unsigned sizes[BCH_SB_MEMBERS_MAX];
+	unsigned run_start;
+	unsigned best_nr = 0, best_size = 0;
+
+	for_each_member_device_rcu(ca, c, i, devs)
+		sizes[nr++] = ca->mi.bucket_size;
+
+	sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);
+
+	/* equal sizes are now adjacent: measure each run, keep the longest */
+	for (run_start = 0; run_start < nr; run_start = i) {
+		for (i = run_start + 1; i < nr && sizes[i] == sizes[run_start]; i++)
+			;
+
+		if (i - run_start > best_nr) {
+			best_nr = i - run_start;
+			best_size = sizes[run_start];
+		}
+	}
+
+	return best_size;
+}
+
+static bool may_create_new_stripe(struct bch_fs *c)
+{
+	return false;	/* hard-coded, so get_existing_stripe() never returns early on this check */
+}
+
+/*
+ * Initialise @k as a stripe key; csum granularity is coarsened until the value fits.
+ */
+static void ec_stripe_key_init(struct bch_fs *c, struct bkey_i *k, unsigned nr_data,
+			       unsigned nr_parity, unsigned stripe_size)
+{
+	struct bkey_i_stripe *s = bkey_stripe_init(k);
+	unsigned u64s;
+
+	s->v.sectors			= cpu_to_le16(stripe_size);
+	s->v.algorithm			= 0;
+	s->v.nr_blocks			= nr_data + nr_parity;
+	s->v.nr_redundant		= nr_parity;
+	s->v.csum_granularity_bits	= ilog2(c->opts.encoded_extent_max >> 9);
+	s->v.csum_type			= BCH_CSUM_crc32c;
+	s->v.pad			= 0;
+
+	while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
+		BUG_ON(1 << s->v.csum_granularity_bits >=
+		       le16_to_cpu(s->v.sectors) ||
+		       s->v.csum_granularity_bits == U8_MAX);
+		s->v.csum_granularity_bits++;
+	}
+
+	set_bkey_val_u64s(&s->k, u64s);
+}
+
+/* Allocate and initialise a new in-progress stripe and attach it to @h as h->s. */
+static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
+{
+	struct ec_stripe_new *s;
+
+	lockdep_assert_held(&h->lock);
+
+	s = kzalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
+
+	mutex_init(&s->lock);
+	closure_init(&s->iodone, NULL);
+	atomic_set(&s->ref[STRIPE_REF_stripe], 1);
+	atomic_set(&s->ref[STRIPE_REF_io], 1);
+	s->c		= c;
+	s->h		= h;
+	/* total width is capped at BCH_BKEY_PTRS_MAX; the parity blocks come out of it */
+	s->nr_data	= min_t(unsigned, h->nr_active_devs, BCH_BKEY_PTRS_MAX) - h->redundancy;
+	s->nr_parity	= h->redundancy;
+
+	ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data, s->nr_parity, h->blocksize);
+
+	h->s = s;
+	return 0;
+}
+
+/* Allocate a stripe head for these parameters and add it to the list; returned locked. */
+static struct ec_stripe_head *
+ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, unsigned algo,
+			 unsigned redundancy, enum bch_watermark watermark)
+{
+	struct ec_stripe_head *h;
+	struct bch_dev *ca;
+	unsigned i;
+
+	h = kzalloc(sizeof(*h), GFP_KERNEL);
+	if (!h)
+		return NULL;
+
+	mutex_init(&h->lock);
+	BUG_ON(!mutex_trylock(&h->lock));
+
+	h->target	= target;
+	h->algo		= algo;
+	h->redundancy	= redundancy;
+	h->watermark	= watermark;
+
+	rcu_read_lock();
+	h->devs = target_rw_devs(c, BCH_DATA_user, target);
+
+	for_each_member_device_rcu(ca, c, i, &h->devs)
+		if (!ca->mi.durability)
+			__clear_bit(i, h->devs.d);
+
+	h->blocksize = pick_blocksize(c, &h->devs);
+
+	for_each_member_device_rcu(ca, c, i, &h->devs)
+		if (ca->mi.bucket_size == h->blocksize)
+			h->nr_active_devs++;
+
+	rcu_read_unlock();
+	list_add(&h->list, &c->ec_stripe_head_list);
+	return h;
+}
+
+/* Unlock @h; a stripe whose data blocks are all allocated is first moved to pending. */
+void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
+{
+	struct ec_stripe_new *s = h->s;
+
+	if (s && s->allocated &&
+	    bitmap_weight(s->blocks_allocated, s->nr_data) == s->nr_data)
+		ec_stripe_set_pending(c, h);
+	mutex_unlock(&h->lock);
+}
+
+/*
+ * Find or create the matching stripe head; it is returned with h->lock held.
+ */
+static struct ec_stripe_head *
+__bch2_ec_stripe_head_get(struct btree_trans *trans, unsigned target, unsigned algo,
+			  unsigned redundancy, enum bch_watermark watermark)
+{
+	struct bch_fs *c = trans->c;
+	struct ec_stripe_head *h;
+	int ret;
+
+	if (!redundancy)
+		return NULL;
+
+	ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (test_bit(BCH_FS_GOING_RO, &c->flags)) {
+		h = ERR_PTR(-BCH_ERR_erofs_no_writes);
+		goto found;
+	}
+
+	list_for_each_entry(h, &c->ec_stripe_head_list, list)
+		if (h->target		== target &&
+		    h->algo		== algo &&
+		    h->redundancy	== redundancy &&
+		    h->watermark	== watermark) {
+			ret = bch2_trans_mutex_lock(trans, &h->lock);
+			if (ret)
+				h = ERR_PTR(ret);
+			goto found;
+		}
+
+	h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
+found:
+	mutex_unlock(&c->ec_stripe_head_lock);
+	return h;
+}
+
+static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h,
+				    enum bch_watermark watermark, struct closure *cl)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_devs_mask devs = h->devs;
+	struct open_bucket *ob;
+	struct open_buckets buckets;
+	struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
+	unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
+	bool have_cache = true;
+	int ret = 0;
+
+	BUG_ON(v->nr_blocks	!= h->s->nr_data + h->s->nr_parity);
+	BUG_ON(v->nr_redundant	!= h->s->nr_parity);
+
+	for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
+		__clear_bit(v->ptrs[i].dev, devs.d);	/* exclude devices the stripe already uses */
+		if (i < h->s->nr_data)
+			nr_have_data++;
+		else
+			nr_have_parity++;
+	}
+
+	BUG_ON(nr_have_data	> h->s->nr_data);
+	BUG_ON(nr_have_parity	> h->s->nr_parity);
+
+	buckets.nr = 0;
+	if (nr_have_parity < h->s->nr_parity) {
+		ret = bch2_bucket_alloc_set_trans(trans, &buckets,
+					    &h->parity_stripe,
+					    &devs,
+					    h->s->nr_parity,
+					    &nr_have_parity,
+					    &have_cache, 0,
+					    BCH_DATA_parity,
+					    watermark,
+					    cl);
+
+		open_bucket_for_each(c, &buckets, ob, i) {
+			j = find_next_zero_bit(h->s->blocks_gotten,	/* parity slots start at nr_data */
+					       h->s->nr_data + h->s->nr_parity,
+					       h->s->nr_data);
+			BUG_ON(j >= h->s->nr_data + h->s->nr_parity);
+
+			h->s->blocks[j] = buckets.v[i];
+			v->ptrs[j] = bch2_ob_ptr(c, ob);
+			__set_bit(j, h->s->blocks_gotten);
+		}
+		/* buckets listed in @buckets were recorded above even when ret is nonzero */
+		if (ret)
+			return ret;
+	}
+
+	buckets.nr = 0;
+	if (nr_have_data < h->s->nr_data) {
+		ret = bch2_bucket_alloc_set_trans(trans, &buckets,
+					    &h->block_stripe,
+					    &devs,
+					    h->s->nr_data,
+					    &nr_have_data,
+					    &have_cache, 0,
+					    BCH_DATA_user,
+					    watermark,
+					    cl);
+
+		open_bucket_for_each(c, &buckets, ob, i) {
+			j = find_next_zero_bit(h->s->blocks_gotten,	/* data slots: [0, nr_data) */
+					       h->s->nr_data, 0);
+			BUG_ON(j >= h->s->nr_data);
+
+			h->s->blocks[j] = buckets.v[i];
+			v->ptrs[j] = bch2_ob_ptr(c, ob);
+			__set_bit(j, h->s->blocks_gotten);
+		}
+		/* as above: buckets listed in @buckets are kept before the error is returned */
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/* XXX: doesn't obey target: */
+static s64 get_existing_stripe(struct bch_fs *c,
+			       struct ec_stripe_head *head)
+{
+	ec_stripes_heap *h = &c->ec_stripes_heap;
+	struct stripe *m;
+	size_t heap_idx;
+	u64 stripe_idx;
+	s64 ret = -1;	/* -1: no reusable stripe found */
+
+	if (may_create_new_stripe(c))
+		return -1;
+
+	mutex_lock(&c->ec_stripes_heap_lock);
+	for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
+		/* No blocks worth reusing, stripe will just be deleted: */
+		if (!h->data[heap_idx].blocks_nonempty)
+			continue;
+
+		stripe_idx = h->data[heap_idx].idx;
+
+		m = genradix_ptr(&c->stripes, stripe_idx);
+		/* same geometry as @head, a free data block, and not open elsewhere: */
+		if (m->algorithm	== head->algo &&
+		    m->nr_redundant	== head->redundancy &&
+		    m->sectors		== head->blocksize &&
+		    m->blocks_nonempty	< m->nr_blocks - m->nr_redundant &&
+		    bch2_try_open_stripe(c, head->s, stripe_idx)) {
+			ret = stripe_idx;
+			break;
+		}
+	}
+	mutex_unlock(&c->ec_stripes_heap_lock);
+	return ret;
+}
+
+static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
+	struct bch_stripe *existing_v;
+	unsigned i;
+	s64 idx;
+	int ret;
+
+	/*
+	 * If we can't allocate a new stripe, and there are no stripes with
+	 * empty blocks for us to reuse, that means we have to wait on copygc:
+	 */
+	idx = get_existing_stripe(c, h);	/* on success h->s has opened stripe idx */
+	if (idx < 0)
+		return -BCH_ERR_stripe_alloc_blocked;
+
+	ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
+	if (ret) {
+		bch2_stripe_close(c, h->s);
+		if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			bch2_fs_fatal_error(c, "error reading stripe key: %s", bch2_err_str(ret));
+		return ret;
+	}
+
+	existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v;
+
+	BUG_ON(existing_v->nr_redundant != h->s->nr_parity);
+	h->s->nr_data = existing_v->nr_blocks -
+		existing_v->nr_redundant;
+
+	ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize);
+	if (ret) {
+		bch2_stripe_close(c, h->s);
+		return ret;
+	}
+
+	BUG_ON(h->s->existing_stripe.size != h->blocksize);
+	BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
+
+	/*
+	 * Free buckets we initially allocated - they might conflict with
+	 * blocks from the stripe we're reusing:
+	 */
+	for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) {
+		bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]);
+		h->s->blocks[i] = 0;
+	}
+	memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten));
+	memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated));
+
+	for (i = 0; i < existing_v->nr_blocks; i++) {
+		if (stripe_blockcount_get(existing_v, i)) {	/* block still holds data: keep it */
+			__set_bit(i, h->s->blocks_gotten);
+			__set_bit(i, h->s->blocks_allocated);
+		}
+
+		ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);	/* issued for every block */
+	}
+
+	bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key);
+	h->s->have_existing_stripe = true;
+
+	return 0;
+}
+
+/* Reserve parity space, then claim a free stripes-btree slot (and its memory) for h->s. */
+static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bpos min_pos = POS(0, 1);
+	struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
+	int ret;
+
+	if (!h->s->res.sectors) {
+		ret = bch2_disk_reservation_get(c, &h->s->res, h->blocksize, h->s->nr_parity,
+						BCH_DISK_RESERVATION_NOFAIL);
+		if (ret)
+			return ret;
+	}
+
+	for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
+			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+		if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
+			/* wrap once only: min_pos.offset is 1, so test against min_pos, not 0 */
+			if (!bpos_eq(start_pos, min_pos)) {
+				start_pos = min_pos;
+				bch2_btree_iter_set_pos(&iter, start_pos);
+				continue;
+			}
+
+			ret = -BCH_ERR_ENOSPC_stripe_create;
+			break;
+		}
+
+		if (bkey_deleted(k.k) &&
+		    bch2_try_open_stripe(c, h->s, k.k->p.offset))
+			break;
+	}
+
+	c->ec_stripe_hint = iter.pos.offset;
+
+	if (ret)
+		goto err;
+
+	ret = ec_stripe_mem_alloc(trans, &iter);
+	if (ret) {
+		bch2_stripe_close(c, h->s);
+		goto err;
+	}
+
+	h->s->new_stripe.key.k.p = iter.pos;
+out:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+err:
+	bch2_disk_reservation_put(c, &h->s->res);
+	goto out;
+}
+
+/*
+ * Get a locked stripe head whose new stripe has buckets and buffers allocated.
+ */
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
+					       unsigned target, unsigned algo, unsigned redundancy,
+					       enum bch_watermark watermark, struct closure *cl)
+{
+	struct bch_fs *c = trans->c;
+	struct ec_stripe_head *h;
+	bool waiting = false;
+	int ret;
+
+	h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
+	if (!h)
+		bch_err(c, "no stripe head");
+	if (IS_ERR_OR_NULL(h))
+		return h;
+
+	if (!h->s) {
+		ret = ec_new_stripe_alloc(c, h);
+		if (ret) {
+			bch_err(c, "failed to allocate new stripe");
+			goto err;
+		}
+	}
+
+	if (h->s->allocated)
+		goto allocated;
+
+	if (h->s->have_existing_stripe)
+		goto alloc_existing;
+
+	/* First, try to allocate a full stripe: */
+	ret =   new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?:
+		__bch2_ec_stripe_head_reserve(trans, h);
+	if (!ret)
+		goto allocate_buf;
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+	    bch2_err_matches(ret, ENOMEM))
+		goto err;
+
+	/*
+	 * Not enough buckets available for a full stripe: we must reuse an
+	 * existing stripe:
+	 */
+	while (1) {
+		ret = __bch2_ec_stripe_head_reuse(trans, h);
+		if (!ret)
+			break;
+		if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
+			goto err;
+
+		if (watermark == BCH_WATERMARK_copygc) {
+			ret =   new_stripe_alloc_buckets(trans, h, watermark, NULL) ?:
+				__bch2_ec_stripe_head_reserve(trans, h);
+			if (ret)
+				goto err;
+			goto allocate_buf;
+		}
+
+		/* XXX freelist_wait? */
+		closure_wait(&c->freelist_wait, cl);
+		waiting = true;
+	}
+
+	if (waiting)
+		closure_wake_up(&c->freelist_wait);
+alloc_existing:
+	/*
+	 * Retry allocating buckets, with the watermark for this
+	 * particular write:
+	 */
+	ret = new_stripe_alloc_buckets(trans, h, watermark, cl);
+	if (ret)
+		goto err;
+
+allocate_buf:
+	ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize);
+	if (ret)
+		goto err;
+
+	h->s->allocated = true;
+allocated:
+	BUG_ON(!h->s->idx);
+	BUG_ON(!h->s->new_stripe.data[0]);
+	BUG_ON(trans->restarted);
+	return h;
+err:
+	bch2_ec_stripe_head_put(c, h);
+	return ERR_PTR(ret);
+}
+
+static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct ec_stripe_head *h;
+	struct open_bucket *ob;
+	unsigned i;
+
+	mutex_lock(&c->ec_stripe_head_lock);
+	list_for_each_entry(h, &c->ec_stripe_head_list, list) {
+		mutex_lock(&h->lock);
+		if (!h->s)
+			goto unlock;
+
+		if (!ca)	/* NULL @ca: stop the stripe of every head */
+			goto found;
+
+		for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) {
+			if (!h->s->blocks[i])
+				continue;
+
+			ob = c->open_buckets + h->s->blocks[i];
+			if (ob->dev == ca->dev_idx)	/* stripe has an open bucket on @ca */
+				goto found;
+		}
+		goto unlock;
+found:
+		h->s->err = -BCH_ERR_erofs_no_writes;
+		ec_stripe_set_pending(c, h);	/* clears h->s and puts the stripe on the new-stripe list */
+unlock:
+		mutex_unlock(&h->lock);
+	}
+	mutex_unlock(&c->ec_stripe_head_lock);
+}
+
+void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
+{
+	__bch2_ec_stop(c, ca);	/* only stripes with an open bucket on @ca */
+}
+
+void bch2_fs_ec_stop(struct bch_fs *c)
+{
+	__bch2_ec_stop(c, NULL);	/* NULL device: every stripe head */
+}
+
+/* True once the list of new (in-flight) stripes is empty. */
+static bool bch2_fs_ec_flush_done(struct bch_fs *c)
+{
+	bool drained;
+
+	mutex_lock(&c->ec_stripe_new_lock);
+	drained = list_empty(&c->ec_stripe_new_list);
+	mutex_unlock(&c->ec_stripe_new_lock);
+	return drained;
+}
+
+void bch2_fs_ec_flush(struct bch_fs *c)
+{	/* block until no new stripes are in flight; ec_stripe_create() does the wake_up */
+	wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
+}
+
+/* Load every stripe key into the in-memory stripes radix tree and the stripes heap. */
+int bch2_stripes_read(struct bch_fs *c)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	const struct bch_stripe *s;
+	struct stripe *m;
+	unsigned i;
+	int ret;
+
+	for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, BTREE_ITER_PREFETCH, k, ret) {
+		if (k.k->type != KEY_TYPE_stripe)
+			continue;
+
+		ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
+		if (ret)
+			break;
+
+		s = bkey_s_c_to_stripe(k).v;
+
+		m = genradix_ptr(&c->stripes, k.k->p.offset);
+		m->sectors	= le16_to_cpu(s->sectors);
+		m->algorithm	= s->algorithm;
+		m->nr_blocks	= s->nr_blocks;
+		m->nr_redundant	= s->nr_redundant;
+		m->blocks_nonempty = 0;
+
+		for (i = 0; i < s->nr_blocks; i++)
+			m->blocks_nonempty += !!stripe_blockcount_get(s, i);
+
+		bch2_stripes_heap_insert(c, m, k.k->p.offset);
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
+	bch2_trans_put(trans);
+
+	if (ret)
+		bch_err_fn(c, ret);
+
+	return ret;
+}
+
+/* Print up to the first 50 heap entries: index, nonempty/data+parity, open flag. */
+void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
+{
+	ec_stripes_heap *h = &c->ec_stripes_heap;
+	struct stripe *m;
+	size_t i, nr;
+
+	mutex_lock(&c->ec_stripes_heap_lock);
+	nr = min_t(size_t, h->used, 50);
+	for (i = 0; i < nr; i++) {
+		m = genradix_ptr(&c->stripes, h->data[i].idx);
+
+		prt_printf(out, "%zu %u/%u+%u", h->data[i].idx, h->data[i].blocks_nonempty,
+			   m->nr_blocks - m->nr_redundant, m->nr_redundant);
+		if (bch2_stripe_is_open(c, h->data[i].idx))
+			prt_str(out, " open");
+		prt_newline(out);
+	}
+	mutex_unlock(&c->ec_stripes_heap_lock);
+}
+
+/* Print each stripe head with its current stripe, then the in-flight new stripes. */
+void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
+{
+	struct ec_stripe_head *h;
+	struct ec_stripe_new *s;
+
+	mutex_lock(&c->ec_stripe_head_lock);
+	list_for_each_entry(h, &c->ec_stripe_head_list, list) {
+		prt_printf(out, "target %u algo %u redundancy %u %s:\n",
+		       h->target, h->algo, h->redundancy, bch2_watermarks[h->watermark]);
+
+		if (h->s)
+			prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n",
+			       h->s->idx, h->s->nr_data, h->s->nr_parity,
+			       bitmap_weight(h->s->blocks_allocated,
+					     h->s->nr_data));
+	}
+	mutex_unlock(&c->ec_stripe_head_lock);
+
+	prt_printf(out, "in flight:\n");
+
+	mutex_lock(&c->ec_stripe_new_lock);
+	list_for_each_entry(s, &c->ec_stripe_new_list, list) {
+		prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n",
+			   s->idx, s->nr_data, s->nr_parity,
+			   atomic_read(&s->ref[STRIPE_REF_io]),
+			   atomic_read(&s->ref[STRIPE_REF_stripe]),
+			   bch2_watermarks[s->h->watermark]);
+	}
+	mutex_unlock(&c->ec_stripe_new_lock);
+}
+
+/* Free all stripe heads (asserting they hold no open buckets) and the EC data structures. */
+void bch2_fs_ec_exit(struct bch_fs *c)
+{
+	struct ec_stripe_head *h;
+	unsigned i;
+
+	while (1) {
+		mutex_lock(&c->ec_stripe_head_lock);
+		h = list_first_entry_or_null(&c->ec_stripe_head_list, struct ec_stripe_head, list);
+		if (h)
+			list_del(&h->list);
+		mutex_unlock(&c->ec_stripe_head_lock);
+		if (!h)
+			break;
+
+		if (h->s) {
+			for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++)
+				BUG_ON(h->s->blocks[i]);
+
+			kfree(h->s);
+		}
+		kfree(h);
+	}
+
+	BUG_ON(!list_empty(&c->ec_stripe_new_list));
+
+	free_heap(&c->ec_stripes_heap);
+	genradix_free(&c->stripes);
+	bioset_exit(&c->ec_bioset);
+}
+
+void bch2_fs_ec_init_early(struct bch_fs *c)
+{
+	spin_lock_init(&c->ec_stripes_new_lock);	/* spinlock: the c->ec_stripes_new hash table */
+	mutex_init(&c->ec_stripes_heap_lock);		/* mutex: c->ec_stripes_heap */
+
+	INIT_LIST_HEAD(&c->ec_stripe_head_list);
+	mutex_init(&c->ec_stripe_head_lock);
+
+	INIT_LIST_HEAD(&c->ec_stripe_new_list);
+	mutex_init(&c->ec_stripe_new_lock);	/* mutex for ec_stripe_new_list; not ec_stripes_new_lock */
+	init_waitqueue_head(&c->ec_stripe_new_wait);
+
+	INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
+	INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
+}
+
+int bch2_fs_ec_init(struct bch_fs *c)
+{
+	return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
+			   BIOSET_NEED_BVECS);	/* released by bioset_exit() in bch2_fs_ec_exit() */
+}
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
new file mode 100644
index 000000000000..966d165a3b66
--- /dev/null
+++ b/fs/bcachefs/ec.h
@@ -0,0 +1,260 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EC_H
+#define _BCACHEFS_EC_H
+
+#include "ec_types.h"
+#include "buckets_types.h"
+#include "extents_types.h"
+
+enum bkey_invalid_flags;
+
+int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c,
+			enum bkey_invalid_flags, struct printbuf *);
+void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
+			 struct bkey_s_c);
+
+#define bch2_bkey_ops_stripe ((struct bkey_ops) {	\
+	.key_invalid	= bch2_stripe_invalid,		\
+	.val_to_text	= bch2_stripe_to_text,		\
+	.swab		= bch2_ptr_swab,		\
+	.trans_trigger	= bch2_trans_mark_stripe,	\
+	.atomic_trigger	= bch2_mark_stripe,		\
+	.min_val_size	= 8,				\
+})
+
+static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
+{
+	return DIV_ROUND_UP(le16_to_cpu(s->sectors),
+			    1 << s->csum_granularity_bits);
+}
+
+static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
+					  unsigned dev, unsigned csum_idx)
+{
+	unsigned csum_bytes = bch_crc_bytes[s->csum_type];
+
+	return sizeof(struct bch_stripe) +
+		sizeof(struct bch_extent_ptr) * s->nr_blocks +
+		(dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
+}
+
+static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s,
+						unsigned idx)
+{
+	return stripe_csum_offset(s, s->nr_blocks, 0) +
+		sizeof(u16) * idx;
+}
+
+static inline unsigned stripe_blockcount_get(const struct bch_stripe *s,
+					     unsigned idx)
+{
+	return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx));
+}
+
+static inline void stripe_blockcount_set(struct bch_stripe *s,
+					 unsigned idx, unsigned v)
+{
+	__le16 *p = (void *) s + stripe_blockcount_offset(s, idx);
+
+	*p = cpu_to_le16(v);
+}
+
+static inline unsigned stripe_val_u64s(const struct bch_stripe *s)
+{
+	return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks),
+			    sizeof(u64));
+}
+
+static inline void *stripe_csum(struct bch_stripe *s,
+				unsigned block, unsigned csum_idx)
+{
+	EBUG_ON(block >= s->nr_blocks);
+	EBUG_ON(csum_idx >= stripe_csums_per_device(s));
+
+	return (void *) s + stripe_csum_offset(s, block, csum_idx);
+}
+
+static inline struct bch_csum stripe_csum_get(struct bch_stripe *s,
+				   unsigned block, unsigned csum_idx)
+{
+	struct bch_csum csum = { 0 };
+
+	memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]);
+	return csum;
+}
+
+static inline void stripe_csum_set(struct bch_stripe *s,
+				   unsigned block, unsigned csum_idx,
+				   struct bch_csum csum)
+{
+	memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]);
+}
+
+static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr,
+					     const struct bch_extent_ptr *data_ptr,
+					     unsigned sectors)
+{
+	return  data_ptr->dev    == stripe_ptr->dev &&
+		data_ptr->gen    == stripe_ptr->gen &&
+		data_ptr->offset >= stripe_ptr->offset &&
+		data_ptr->offset  < stripe_ptr->offset + sectors;
+}
+
+/* Does the decoded EC pointer @p fall within one of @s's data blocks? */
+static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s,
+					   struct extent_ptr_decoded p)
+{
+	unsigned nr_data = s->nr_blocks - s->nr_redundant;
+
+	BUG_ON(!p.has_ec);
+
+	/* Blocks at or past nr_data never match: */
+	return p.ec.block < nr_data &&
+		__bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr,
+					  le16_to_cpu(s->sectors));
+}
+
+/* As bch2_ptr_matches_stripe(), but checked against a struct gc_stripe: */
+static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m,
+					     struct extent_ptr_decoded p)
+{
+	unsigned nr_data = m->nr_blocks - m->nr_redundant;
+
+	BUG_ON(!p.has_ec);
+
+	/* Blocks at or past nr_data never match: */
+	return p.ec.block < nr_data &&
+		__bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr,
+					  m->sectors);
+}
+
+struct bch_read_bio;
+
+struct ec_stripe_buf {
+	/* might not be buffering the entire stripe: */
+	unsigned		offset;
+	unsigned		size;
+	unsigned long		valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
+
+	void			*data[BCH_BKEY_PTRS_MAX];
+
+	__BKEY_PADDED(key, 255);
+};
+
+struct ec_stripe_head;
+
+enum ec_stripe_ref {
+	STRIPE_REF_io,
+	STRIPE_REF_stripe,
+	STRIPE_REF_NR
+};
+
+struct ec_stripe_new {
+	struct bch_fs		*c;
+	struct ec_stripe_head	*h;
+	struct mutex		lock;
+	struct list_head	list;
+
+	struct hlist_node	hash;
+	u64			idx;
+
+	struct closure		iodone;
+
+	atomic_t		ref[STRIPE_REF_NR];
+
+	int			err;
+
+	u8			nr_data;
+	u8			nr_parity;
+	bool			allocated;
+	bool			pending;
+	bool			have_existing_stripe;
+
+	unsigned long		blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
+	unsigned long		blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
+	open_bucket_idx_t	blocks[BCH_BKEY_PTRS_MAX];
+	struct disk_reservation	res;
+
+	struct ec_stripe_buf	new_stripe;
+	struct ec_stripe_buf	existing_stripe;
+};
+
+struct ec_stripe_head {
+	struct list_head	list;
+	struct mutex		lock;
+
+	unsigned		target;
+	unsigned		algo;
+	unsigned		redundancy;
+	enum bch_watermark	watermark;
+
+	struct bch_devs_mask	devs;
+	unsigned		nr_active_devs;
+
+	unsigned		blocksize;
+
+	struct dev_stripe_state	block_stripe;
+	struct dev_stripe_state	parity_stripe;
+
+	struct ec_stripe_new	*s;
+};
+
+int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *);
+
+void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
+
+void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
+
+int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
+
+void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *);
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
+			unsigned, unsigned, unsigned,
+			enum bch_watermark, struct closure *);
+
+void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t);
+void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t);
+void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t);
+
+void bch2_do_stripe_deletes(struct bch_fs *);
+void bch2_ec_do_stripe_creates(struct bch_fs *);
+void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *);
+
+static inline void ec_stripe_new_get(struct ec_stripe_new *s,
+				     enum ec_stripe_ref ref)
+{
+	atomic_inc(&s->ref[ref]);
+}
+
+/* Drop a @ref reference on @s, acting on the final put of that ref type: */
+static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
+				     enum ec_stripe_ref ref)
+{
+	BUG_ON(atomic_read(&s->ref[ref]) <= 0);
+
+	if (!atomic_dec_and_test(&s->ref[ref]))
+		return;
+
+	/* Final put: free @s (stripe ref) or run stripe creates (io ref): */
+	if (ref == STRIPE_REF_stripe)
+		bch2_ec_stripe_new_free(c, s);
+	else if (ref == STRIPE_REF_io)
+		bch2_ec_do_stripe_creates(c);
+	else
+		BUG();
+}
+
+void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
+void bch2_fs_ec_stop(struct bch_fs *);
+void bch2_fs_ec_flush(struct bch_fs *);
+
+int bch2_stripes_read(struct bch_fs *);
+
+void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *);
+void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
+
+void bch2_fs_ec_exit(struct bch_fs *);
+void bch2_fs_ec_init_early(struct bch_fs *);
+int bch2_fs_ec_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_EC_H */
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
new file mode 100644
index 000000000000..e2b02a82de32
--- /dev/null
+++ b/fs/bcachefs/ec_types.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EC_TYPES_H
+#define _BCACHEFS_EC_TYPES_H
+
+#include "bcachefs_format.h"
+
+struct bch_replicas_padded {
+	struct bch_replicas_entry	e;
+	u8				pad[BCH_BKEY_PTRS_MAX];
+};
+
+struct stripe {
+	size_t			heap_idx;
+	u16			sectors;
+	u8			algorithm;
+	u8			nr_blocks;
+	u8			nr_redundant;
+	u8			blocks_nonempty;
+};
+
+struct gc_stripe {
+	u16			sectors;
+
+	u8			nr_blocks;
+	u8			nr_redundant;
+
+	unsigned		alive:1; /* does a corresponding key exist in stripes btree? */
+	u16			block_sectors[BCH_BKEY_PTRS_MAX];
+	struct bch_extent_ptr	ptrs[BCH_BKEY_PTRS_MAX];
+
+	struct bch_replicas_padded r;
+};
+
+struct ec_stripe_heap_entry {
+	size_t			idx;
+	unsigned		blocks_nonempty;
+};
+
+typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap;
+
+#endif /* _BCACHEFS_EC_TYPES_H */
diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c
new file mode 100644
index 000000000000..d260ff9bbfeb
--- /dev/null
+++ b/fs/bcachefs/errcode.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "errcode.h"
+
+#include <linux/errname.h>
+
+static const char * const bch2_errcode_strs[] = {
+#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err,
+	BCH_ERRCODES()
+#undef x
+	NULL
+};
+
+static unsigned bch2_errcode_parents[] = {
+#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
+	BCH_ERRCODES()
+#undef x
+};
+
+/* Name of @err (sign ignored): a private BCH_ERR_* code or a plain errno: */
+const char *bch2_err_str(int err)
+{
+	const char *name = "(No error)";
+	int code = abs(err);
+
+	BUG_ON(code >= BCH_ERR_MAX);
+
+	if (code >= BCH_ERR_START)
+		name = bch2_errcode_strs[code - BCH_ERR_START];
+	else if (code)
+		name = errname(code);
+	if (!name)
+		name = "(Invalid error)";
+	return name;
+}
+
+/* True if @err is @class or has @class among its parent error classes: */
+bool __bch2_err_matches(int err, int class)
+{
+	err = abs(err);
+	class = abs(class);
+	BUG_ON(err >= BCH_ERR_MAX);
+	BUG_ON(class >= BCH_ERR_MAX);
+
+	for (; err != class; err = bch2_errcode_parents[err - BCH_ERR_START])
+		if (err < BCH_ERR_START)
+			return false;
+	return true;
+}
+
+/* Replace negative @err by its topmost nonzero parent class (negated): */
+int __bch2_err_class(int err)
+{
+	int code = -err;
+
+	BUG_ON((unsigned) code >= BCH_ERR_MAX);
+	while (code >= BCH_ERR_START && bch2_errcode_parents[code - BCH_ERR_START])
+		code = bch2_errcode_parents[code - BCH_ERR_START];
+	return -code;
+}
+
+/* blk_status_to_str(), extended to also name BLK_STS_REMOVED: */
+const char *bch2_blk_status_to_str(blk_status_t status)
+{
+	return status == BLK_STS_REMOVED
+		? "device removed" : blk_status_to_str(status);
+}
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
new file mode 100644
index 000000000000..7cc083776a2e
--- /dev/null
+++ b/fs/bcachefs/errcode.h
@@ -0,0 +1,265 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ERRCODE_H
+#define _BCACHEFS_ERRCODE_H
+
+#define BCH_ERRCODES()								\
+	x(ENOMEM,			ENOMEM_stripe_buf)			\
+	x(ENOMEM,			ENOMEM_replicas_table)			\
+	x(ENOMEM,			ENOMEM_cpu_replicas)			\
+	x(ENOMEM,			ENOMEM_replicas_gc)			\
+	x(ENOMEM,			ENOMEM_disk_groups_validate)		\
+	x(ENOMEM,			ENOMEM_disk_groups_to_cpu)		\
+	x(ENOMEM,			ENOMEM_mark_snapshot)			\
+	x(ENOMEM,			ENOMEM_mark_stripe)			\
+	x(ENOMEM,			ENOMEM_mark_stripe_ptr)			\
+	x(ENOMEM,			ENOMEM_btree_key_cache_create)		\
+	x(ENOMEM,			ENOMEM_btree_key_cache_fill)		\
+	x(ENOMEM,			ENOMEM_btree_key_cache_insert)		\
+	x(ENOMEM,			ENOMEM_trans_kmalloc)			\
+	x(ENOMEM,			ENOMEM_trans_log_msg)			\
+	x(ENOMEM,			ENOMEM_do_encrypt)			\
+	x(ENOMEM,			ENOMEM_ec_read_extent)			\
+	x(ENOMEM,			ENOMEM_ec_stripe_mem_alloc)		\
+	x(ENOMEM,			ENOMEM_ec_new_stripe_alloc)		\
+	x(ENOMEM,			ENOMEM_fs_btree_cache_init)		\
+	x(ENOMEM,			ENOMEM_fs_btree_key_cache_init)		\
+	x(ENOMEM,			ENOMEM_fs_counters_init)		\
+	x(ENOMEM,			ENOMEM_fs_btree_write_buffer_init)	\
+	x(ENOMEM,			ENOMEM_io_clock_init)			\
+	x(ENOMEM,			ENOMEM_blacklist_table_init)		\
+	x(ENOMEM,			ENOMEM_sb_realloc_injected)		\
+	x(ENOMEM,			ENOMEM_sb_bio_realloc)			\
+	x(ENOMEM,			ENOMEM_sb_buf_realloc)			\
+	x(ENOMEM,			ENOMEM_sb_journal_validate)		\
+	x(ENOMEM,			ENOMEM_sb_journal_v2_validate)		\
+	x(ENOMEM,			ENOMEM_journal_entry_add)		\
+	x(ENOMEM,			ENOMEM_journal_read_buf_realloc)	\
+	x(ENOMEM,			ENOMEM_btree_interior_update_worker_init)\
+	x(ENOMEM,			ENOMEM_btree_interior_update_pool_init)	\
+	x(ENOMEM,			ENOMEM_bio_read_init)			\
+	x(ENOMEM,			ENOMEM_bio_read_split_init)		\
+	x(ENOMEM,			ENOMEM_bio_write_init)			\
+	x(ENOMEM,			ENOMEM_bio_bounce_pages_init)		\
+	x(ENOMEM,			ENOMEM_writepage_bioset_init)		\
+	x(ENOMEM,			ENOMEM_dio_read_bioset_init)		\
+	x(ENOMEM,			ENOMEM_dio_write_bioset_init)		\
+	x(ENOMEM,			ENOMEM_nocow_flush_bioset_init)		\
+	x(ENOMEM,			ENOMEM_promote_table_init)		\
+	x(ENOMEM,			ENOMEM_compression_bounce_read_init)	\
+	x(ENOMEM,			ENOMEM_compression_bounce_write_init)	\
+	x(ENOMEM,			ENOMEM_compression_workspace_init)	\
+	x(ENOMEM,			ENOMEM_decompression_workspace_init)	\
+	x(ENOMEM,			ENOMEM_bucket_gens)			\
+	x(ENOMEM,			ENOMEM_buckets_nouse)			\
+	x(ENOMEM,			ENOMEM_usage_init)			\
+	x(ENOMEM,			ENOMEM_btree_node_read_all_replicas)	\
+	x(ENOMEM,			ENOMEM_btree_node_reclaim)		\
+	x(ENOMEM,			ENOMEM_btree_node_mem_alloc)		\
+	x(ENOMEM,			ENOMEM_btree_cache_cannibalize_lock)	\
+	x(ENOMEM,			ENOMEM_buckets_waiting_for_journal_init)\
+	x(ENOMEM,			ENOMEM_buckets_waiting_for_journal_set)	\
+	x(ENOMEM,			ENOMEM_set_nr_journal_buckets)		\
+	x(ENOMEM,			ENOMEM_dev_journal_init)		\
+	x(ENOMEM,			ENOMEM_journal_pin_fifo)		\
+	x(ENOMEM,			ENOMEM_journal_buf)			\
+	x(ENOMEM,			ENOMEM_gc_start)			\
+	x(ENOMEM,			ENOMEM_gc_alloc_start)			\
+	x(ENOMEM,			ENOMEM_gc_reflink_start)		\
+	x(ENOMEM,			ENOMEM_gc_gens)				\
+	x(ENOMEM,			ENOMEM_gc_repair_key)			\
+	x(ENOMEM,			ENOMEM_fsck_extent_ends_at)		\
+	x(ENOMEM,			ENOMEM_fsck_add_nlink)			\
+	x(ENOMEM,			ENOMEM_journal_key_insert)		\
+	x(ENOMEM,			ENOMEM_journal_keys_sort)		\
+	x(ENOMEM,			ENOMEM_journal_replay)			\
+	x(ENOMEM,			ENOMEM_read_superblock_clean)		\
+	x(ENOMEM,			ENOMEM_fs_alloc)			\
+	x(ENOMEM,			ENOMEM_fs_name_alloc)			\
+	x(ENOMEM,			ENOMEM_fs_other_alloc)			\
+	x(ENOMEM,			ENOMEM_dev_alloc)			\
+	x(ENOSPC,			ENOSPC_disk_reservation)		\
+	x(ENOSPC,			ENOSPC_bucket_alloc)			\
+	x(ENOSPC,			ENOSPC_disk_label_add)			\
+	x(ENOSPC,			ENOSPC_stripe_create)			\
+	x(ENOSPC,			ENOSPC_inode_create)			\
+	x(ENOSPC,			ENOSPC_str_hash_create)			\
+	x(ENOSPC,			ENOSPC_snapshot_create)			\
+	x(ENOSPC,			ENOSPC_subvolume_create)		\
+	x(ENOSPC,			ENOSPC_sb)				\
+	x(ENOSPC,			ENOSPC_sb_journal)			\
+	x(ENOSPC,			ENOSPC_sb_journal_seq_blacklist)	\
+	x(ENOSPC,			ENOSPC_sb_quota)			\
+	x(ENOSPC,			ENOSPC_sb_replicas)			\
+	x(ENOSPC,			ENOSPC_sb_members)			\
+	x(ENOSPC,			ENOSPC_sb_members_v2)			\
+	x(ENOSPC,			ENOSPC_sb_crypt)			\
+	x(ENOSPC,			ENOSPC_btree_slot)			\
+	x(ENOSPC,			ENOSPC_snapshot_tree)			\
+	x(ENOENT,			ENOENT_bkey_type_mismatch)		\
+	x(ENOENT,			ENOENT_str_hash_lookup)			\
+	x(ENOENT,			ENOENT_str_hash_set_must_replace)	\
+	x(ENOENT,			ENOENT_inode)				\
+	x(ENOENT,			ENOENT_not_subvol)			\
+	x(ENOENT,			ENOENT_not_directory)			\
+	x(ENOENT,			ENOENT_directory_dead)			\
+	x(ENOENT,			ENOENT_subvolume)			\
+	x(ENOENT,			ENOENT_snapshot_tree)			\
+	x(ENOENT,			ENOENT_dirent_doesnt_match_inode)	\
+	x(ENOENT,			ENOENT_dev_not_found)			\
+	x(ENOENT,			ENOENT_dev_idx_not_found)		\
+	x(0,				open_buckets_empty)			\
+	x(0,				freelist_empty)				\
+	x(BCH_ERR_freelist_empty,	no_buckets_found)			\
+	x(0,				transaction_restart)			\
+	x(BCH_ERR_transaction_restart,	transaction_restart_fault_inject)	\
+	x(BCH_ERR_transaction_restart,	transaction_restart_relock)		\
+	x(BCH_ERR_transaction_restart,	transaction_restart_relock_path)	\
+	x(BCH_ERR_transaction_restart,	transaction_restart_relock_path_intent)	\
+	x(BCH_ERR_transaction_restart,	transaction_restart_relock_after_fill)	\
+	x(BCH_ERR_transaction_restart,	transaction_restart_too_many_iters)	\
+	x(BCH_ERR_transaction_restart,	transaction_restart_lock_node_reused)	\
+	x(BCH_ERR_transaction_restart,	transaction_restart_fill_relock)	\
+	x(BCH_ERR_transaction_restart,	transaction_restart_fill_mem_alloc_fail)\
+	x(BCH_ERR_transaction_restart,	transaction_restart_mem_realloced)	\
+	x(BCH_ERR_transaction_restart,	transaction_restart_in_traverse_all)	\
+	x(BCH_ERR_transaction_restart,	transaction_restart_would_deadlock)	\
+	x(BCH_ERR_transaction_restart,	transaction_restart_would_deadlock_write)\
+	x(BCH_ERR_transaction_restart,	transaction_restart_deadlock_recursion_limit)\
+	x(BCH_ERR_transaction_restart,	transaction_restart_upgrade)		\
+	x(BCH_ERR_transaction_restart,	transaction_restart_key_cache_upgrade)	\
+	x(BCH_ERR_transaction_restart,	transaction_restart_key_cache_fill)	\
+	x(BCH_ERR_transaction_restart,	transaction_restart_key_cache_raced)	\
+	x(BCH_ERR_transaction_restart,	transaction_restart_key_cache_realloced)\
+	x(BCH_ERR_transaction_restart,	transaction_restart_journal_preres_get)	\
+	x(BCH_ERR_transaction_restart,	transaction_restart_split_race)		\
+	x(BCH_ERR_transaction_restart,	transaction_restart_write_buffer_flush)	\
+	x(BCH_ERR_transaction_restart,	transaction_restart_nested)		\
+	x(0,				no_btree_node)				\
+	x(BCH_ERR_no_btree_node,	no_btree_node_relock)			\
+	x(BCH_ERR_no_btree_node,	no_btree_node_upgrade)			\
+	x(BCH_ERR_no_btree_node,	no_btree_node_drop)			\
+	x(BCH_ERR_no_btree_node,	no_btree_node_lock_root)		\
+	x(BCH_ERR_no_btree_node,	no_btree_node_up)			\
+	x(BCH_ERR_no_btree_node,	no_btree_node_down)			\
+	x(BCH_ERR_no_btree_node,	no_btree_node_init)			\
+	x(BCH_ERR_no_btree_node,	no_btree_node_cached)			\
+	x(BCH_ERR_no_btree_node,	no_btree_node_srcu_reset)		\
+	x(0,				btree_insert_fail)			\
+	x(BCH_ERR_btree_insert_fail,	btree_insert_btree_node_full)		\
+	x(BCH_ERR_btree_insert_fail,	btree_insert_need_mark_replicas)	\
+	x(BCH_ERR_btree_insert_fail,	btree_insert_need_journal_res)		\
+	x(BCH_ERR_btree_insert_fail,	btree_insert_need_journal_reclaim)	\
+	x(BCH_ERR_btree_insert_fail,	btree_insert_need_flush_buffer)		\
+	x(0,				backpointer_to_overwritten_btree_node)	\
+	x(0,				lock_fail_root_changed)			\
+	x(0,				journal_reclaim_would_deadlock)		\
+	x(EINVAL,			fsck)					\
+	x(BCH_ERR_fsck,			fsck_fix)				\
+	x(BCH_ERR_fsck,			fsck_ignore)				\
+	x(BCH_ERR_fsck,			fsck_errors_not_fixed)			\
+	x(BCH_ERR_fsck,			fsck_repair_unimplemented)		\
+	x(BCH_ERR_fsck,			fsck_repair_impossible)			\
+	x(0,				restart_recovery)			\
+	x(0,				unwritten_extent_update)		\
+	x(EINVAL,			device_state_not_allowed)		\
+	x(EINVAL,			member_info_missing)			\
+	x(EINVAL,			mismatched_block_size)			\
+	x(EINVAL,			block_size_too_small)			\
+	x(EINVAL,			bucket_size_too_small)			\
+	x(EINVAL,			device_size_too_small)			\
+	x(EINVAL,			device_not_a_member_of_filesystem)	\
+	x(EINVAL,			device_has_been_removed)		\
+	x(EINVAL,			device_already_online)			\
+	x(EINVAL,			insufficient_devices_to_start)		\
+	x(EINVAL,			invalid)				\
+	x(EINVAL,			internal_fsck_err)			\
+	x(EROFS,			erofs_trans_commit)			\
+	x(EROFS,			erofs_no_writes)			\
+	x(EROFS,			erofs_journal_err)			\
+	x(EROFS,			erofs_sb_err)				\
+	x(EROFS,			erofs_unfixed_errors)			\
+	x(EROFS,			erofs_norecovery)			\
+	x(EROFS,			erofs_nochanges)			\
+	x(EROFS,			insufficient_devices)			\
+	x(0,				operation_blocked)			\
+	x(BCH_ERR_operation_blocked,	btree_cache_cannibalize_lock_blocked)	\
+	x(BCH_ERR_operation_blocked,	journal_res_get_blocked)		\
+	x(BCH_ERR_operation_blocked,	journal_preres_get_blocked)		\
+	x(BCH_ERR_operation_blocked,	bucket_alloc_blocked)			\
+	x(BCH_ERR_operation_blocked,	stripe_alloc_blocked)			\
+	x(BCH_ERR_invalid,		invalid_sb)				\
+	x(BCH_ERR_invalid_sb,		invalid_sb_magic)			\
+	x(BCH_ERR_invalid_sb,		invalid_sb_version)			\
+	x(BCH_ERR_invalid_sb,		invalid_sb_features)			\
+	x(BCH_ERR_invalid_sb,		invalid_sb_too_big)			\
+	x(BCH_ERR_invalid_sb,		invalid_sb_csum_type)			\
+	x(BCH_ERR_invalid_sb,		invalid_sb_csum)			\
+	x(BCH_ERR_invalid_sb,		invalid_sb_block_size)			\
+	x(BCH_ERR_invalid_sb,		invalid_sb_uuid)			\
+	x(BCH_ERR_invalid_sb,		invalid_sb_too_many_members)		\
+	x(BCH_ERR_invalid_sb,		invalid_sb_dev_idx)			\
+	x(BCH_ERR_invalid_sb,		invalid_sb_time_precision)		\
+	x(BCH_ERR_invalid_sb,		invalid_sb_field_size)			\
+	x(BCH_ERR_invalid_sb,		invalid_sb_layout)			\
+	x(BCH_ERR_invalid_sb_layout,	invalid_sb_layout_type)			\
+	x(BCH_ERR_invalid_sb_layout,	invalid_sb_layout_nr_superblocks)	\
+	x(BCH_ERR_invalid_sb_layout,	invalid_sb_layout_superblocks_overlap)	\
+	x(BCH_ERR_invalid_sb,		invalid_sb_members_missing)		\
+	x(BCH_ERR_invalid_sb,		invalid_sb_members)			\
+	x(BCH_ERR_invalid_sb,		invalid_sb_disk_groups)			\
+	x(BCH_ERR_invalid_sb,		invalid_sb_replicas)			\
+	x(BCH_ERR_invalid_sb,		invalid_sb_journal)			\
+	x(BCH_ERR_invalid_sb,		invalid_sb_journal_seq_blacklist)	\
+	x(BCH_ERR_invalid_sb,		invalid_sb_crypt)			\
+	x(BCH_ERR_invalid_sb,		invalid_sb_clean)			\
+	x(BCH_ERR_invalid_sb,		invalid_sb_quota)			\
+	x(BCH_ERR_invalid,		invalid_bkey)				\
+	x(BCH_ERR_operation_blocked,    nocow_lock_blocked)			\
+	x(EIO,				btree_node_read_err)			\
+	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_fixable)		\
+	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_want_retry)		\
+	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_must_retry)		\
+	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_bad_node)		\
+	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_incompatible)	\
+	x(0,				nopromote)				\
+	x(BCH_ERR_nopromote,		nopromote_may_not)			\
+	x(BCH_ERR_nopromote,		nopromote_already_promoted)		\
+	x(BCH_ERR_nopromote,		nopromote_unwritten)			\
+	x(BCH_ERR_nopromote,		nopromote_congested)			\
+	x(BCH_ERR_nopromote,		nopromote_in_flight)			\
+	x(BCH_ERR_nopromote,		nopromote_enomem)
+
+enum bch_errcode {
+	BCH_ERR_START		= 2048,
+#define x(class, err) BCH_ERR_##err,
+	BCH_ERRCODES()
+#undef x
+	BCH_ERR_MAX
+};
+
+const char *bch2_err_str(int);
+bool __bch2_err_matches(int, int);
+
+static inline bool _bch2_err_matches(int err, int class)
+{
+	return err < 0 && __bch2_err_matches(err, class);
+}
+
+#define bch2_err_matches(_err, _class)			\
+({							\
+	BUILD_BUG_ON(!__builtin_constant_p(_class));	\
+	unlikely(_bch2_err_matches(_err, _class));	\
+})
+
+int __bch2_err_class(int);
+
+static inline long bch2_err_class(long err)
+{
+	return err < 0 ? __bch2_err_class(err) : err;
+}
+
+#define BLK_STS_REMOVED		((__force blk_status_t)128)
+
+const char *bch2_blk_status_to_str(blk_status_t);
+
+#endif /* _BCACHEFS_ERRCODE_H */
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
new file mode 100644
index 000000000000..2a5af8872613
--- /dev/null
+++ b/fs/bcachefs/error.c
@@ -0,0 +1,293 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "error.h"
+#include "super.h"
+
+#define FSCK_ERR_RATELIMIT_NR	10
+
+bool bch2_inconsistent_error(struct bch_fs *c)
+{
+	set_bit(BCH_FS_ERROR, &c->flags);
+
+	switch (c->opts.errors) {
+	case BCH_ON_ERROR_continue:
+		return false;
+	case BCH_ON_ERROR_ro:
+		if (bch2_fs_emergency_read_only(c))
+			bch_err(c, "inconsistency detected - emergency read only");
+		return true;
+	case BCH_ON_ERROR_panic:
+		panic(bch2_fmt(c, "panic after error"));
+		return true;
+	default:
+		BUG();
+	}
+}
+
+void bch2_topology_error(struct bch_fs *c)
+{
+	set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags);
+	if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
+		bch2_inconsistent_error(c);
+}
+
+void bch2_fatal_error(struct bch_fs *c)
+{
+	if (bch2_fs_emergency_read_only(c))
+		bch_err(c, "fatal error - emergency read only");
+}
+
+void bch2_io_error_work(struct work_struct *work)
+{
+	struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
+	struct bch_fs *c = ca->fs;
+	bool dev;
+
+	down_write(&c->state_lock);
+	dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro,
+				    BCH_FORCE_IF_DEGRADED);
+	if (dev
+	    ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
+				  BCH_FORCE_IF_DEGRADED)
+	    : bch2_fs_emergency_read_only(c))
+		bch_err(ca,
+			"too many IO errors, setting %s RO",
+			dev ? "device" : "filesystem");
+	up_write(&c->state_lock);
+}
+
+void bch2_io_error(struct bch_dev *ca)
+{
+	//queue_work(system_long_wq, &ca->io_error_work);
+}
+
+enum ask_yn {
+	YN_NO,
+	YN_YES,
+	YN_ALLNO,
+	YN_ALLYES,
+};
+
+#ifdef __KERNEL__
+#define bch2_fsck_ask_yn()	YN_NO
+#else
+
+#include "tools-util.h"
+
+/*
+ * Prompt on stdout until a single-character y/n/Y/N answer is read from stdin.
+ */
+enum ask_yn bch2_fsck_ask_yn(void)
+{
+	/* Accepted answers; a character's index is its enum ask_yn value: */
+	static const char answers[] = "nyNY";
+	char *buf = NULL;
+	size_t buflen = 0;
+	const char *p;
+
+	while (true) {
+		fputs(" (y,n, or Y,N for all errors of this type) ", stdout);
+		fflush(stdout);
+
+		if (getline(&buf, &buflen, stdin) < 0)
+			die("error reading from standard input");
+
+		strim(buf);
+		if (strlen(buf) != 1)
+			continue;
+
+		/* strlen() == 1, so this can't match answers' terminating nul: */
+		p = strchr(answers, buf[0]);
+		if (p)
+			break;
+	}
+
+	/* getline() allocated buf; free it now that we have an answer: */
+	free(buf);
+	return (enum ask_yn) (p - answers);
+}
+
+#endif
+
+/* Look up (by format pointer) or allocate the state tracking one fsck error: */
+static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
+{
+	struct fsck_err_state *state;
+
+	if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
+		return NULL;
+
+	list_for_each_entry(state, &c->fsck_errors, list) {
+		if (state->fmt != fmt)
+			continue;
+		/* Repeated fsck errors are common: keep hits at the head */
+		list_move(&state->list, &c->fsck_errors);
+		return state;
+	}
+
+	/* Not seen before: start tracking this format string */
+	state = kzalloc(sizeof(*state), GFP_NOFS);
+	if (state) {
+		INIT_LIST_HEAD(&state->list);
+		state->fmt = fmt;
+		list_add(&state->list, &c->fsck_errors);
+	} else {
+		/* Complain about the failed allocation only once: */
+		if (!c->fsck_alloc_err)
+			bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
+		c->fsck_alloc_err = true;
+	}
+	return state;
+}
+
+/*
+ * Report one fsck error and decide what to do about it. Returns
+ * -BCH_ERR_fsck_fix, -BCH_ERR_fsck_ignore or -BCH_ERR_fsck_errors_not_fixed;
+ * @flags (FSCK_CAN_FIX etc.) say which of those outcomes are permitted.
+ */
+int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
+{
+	struct fsck_err_state *s = NULL;
+	va_list args;
+	bool print = true, suppressing = false, inconsistent = false;
+	struct printbuf buf = PRINTBUF, *out = &buf;
+	int ret = -BCH_ERR_fsck_ignore;
+
+	va_start(args, fmt);
+	prt_vprintf(out, fmt, args);
+	va_end(args);
+
+	mutex_lock(&c->fsck_error_lock);
+	s = fsck_err_get(c, fmt);
+	if (s) {
+		/*
+		 * We may be called multiple times for the same error on
+		 * transaction restart - this memoizes instead of asking the user
+		 * multiple times for the same error:
+		 */
+		if (s->last_msg && !strcmp(buf.buf, s->last_msg)) {
+			ret = s->ret;
+			mutex_unlock(&c->fsck_error_lock);
+			printbuf_exit(&buf);
+			return ret;
+		}
+
+		kfree(s->last_msg);
+		s->last_msg = kstrdup(buf.buf, GFP_KERNEL);
+
+		if (c->opts.ratelimit_errors &&
+		    !(flags & FSCK_NO_RATELIMIT) &&
+		    s->nr >= FSCK_ERR_RATELIMIT_NR) {
+			/* Lets bch2_flush_fsck_errs() print its summary: */
+			s->ratelimited = true;
+			if (s->nr == FSCK_ERR_RATELIMIT_NR)
+				suppressing = true;
+			else
+				print = false;
+		}
+
+		s->nr++;
+	}
+
+#ifdef BCACHEFS_LOG_PREFIX
+	if (!strncmp(fmt, "bcachefs:", 9))
+		prt_printf(out, bch2_log_msg(c, ""));
+#endif
+
+	if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) {
+		if (c->opts.errors != BCH_ON_ERROR_continue ||
+		    !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
+			prt_str(out, ", shutting down");
+			inconsistent = true;
+			ret = -BCH_ERR_fsck_errors_not_fixed;
+		} else if (flags & FSCK_CAN_FIX) {
+			prt_str(out, ", fixing");
+			ret = -BCH_ERR_fsck_fix;
+		} else {
+			prt_str(out, ", continuing");
+			ret = -BCH_ERR_fsck_ignore;
+		}
+	} else if (c->opts.fix_errors == FSCK_FIX_exit) {
+		prt_str(out, ", exiting");
+		ret = -BCH_ERR_fsck_errors_not_fixed;
+	} else if (flags & FSCK_CAN_FIX) {
+		int fix = s && s->fix ? s->fix : c->opts.fix_errors;
+
+		if (fix == FSCK_FIX_ask) {
+			int ask;
+
+			prt_str(out, ": fix?");
+			bch2_print_string_as_lines(KERN_ERR, out->buf);
+			print = false;
+
+			ask = bch2_fsck_ask_yn();
+
+			if (ask >= YN_ALLNO && s)
+				s->fix = ask == YN_ALLNO ? FSCK_FIX_no : FSCK_FIX_yes;
+
+			ret = ask & 1 ? -BCH_ERR_fsck_fix : -BCH_ERR_fsck_ignore;
+		} else if (fix == FSCK_FIX_yes ||
+			   (c->opts.nochanges &&
+			    !(flags & FSCK_CAN_IGNORE))) {
+			prt_str(out, ", fixing");
+			ret = -BCH_ERR_fsck_fix;
+		} else {
+			prt_str(out, ", not fixing");
+		}
+	} else if (flags & FSCK_NEED_FSCK) {
+		prt_str(out, " (run fsck to correct)");
+	} else {
+		prt_str(out, " (repair unimplemented)");
+	}
+
+	if (ret == -BCH_ERR_fsck_ignore &&
+	    (c->opts.fix_errors == FSCK_FIX_exit ||
+	     !(flags & FSCK_CAN_IGNORE)))
+		ret = -BCH_ERR_fsck_errors_not_fixed;
+
+	if (print)
+		bch2_print_string_as_lines(KERN_ERR, out->buf);
+
+	if (!test_bit(BCH_FS_FSCK_DONE, &c->flags) &&
+	    ret != -BCH_ERR_fsck_fix && ret != -BCH_ERR_fsck_ignore)
+		bch_err(c, "Unable to continue, halting");
+	else if (suppressing)
+		bch_err(c, "Ratelimiting new instances of previous error");
+
+	if (s)
+		s->ret = ret;
+
+	mutex_unlock(&c->fsck_error_lock);
+
+	printbuf_exit(&buf);
+
+	if (inconsistent)
+		bch2_inconsistent_error(c);
+
+	if (ret == -BCH_ERR_fsck_fix) {
+		set_bit(BCH_FS_ERRORS_FIXED, &c->flags);
+	} else {
+		set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags);
+		set_bit(BCH_FS_ERROR, &c->flags);
+	}
+
+	return ret;
+}
+
+/* Free all fsck error state, summarizing each error that was ratelimited: */
+void bch2_flush_fsck_errs(struct bch_fs *c)
+{
+	struct fsck_err_state *state;
+
+	mutex_lock(&c->fsck_error_lock);
+	while (!list_empty(&c->fsck_errors)) {
+		state = list_first_entry(&c->fsck_errors,
+					 struct fsck_err_state, list);
+		if (state->ratelimited && state->last_msg)
+			bch_err(c, "Saw %llu errors like:\n    %s", state->nr, state->last_msg);
+		list_del(&state->list);
+		kfree(state->last_msg);
+		kfree(state);
+	}
+	mutex_unlock(&c->fsck_error_lock);
+}
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
new file mode 100644
index 000000000000..7ce9540052e5
--- /dev/null
+++ b/fs/bcachefs/error.h
@@ -0,0 +1,206 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ERROR_H
+#define _BCACHEFS_ERROR_H
+
+#include <linux/list.h>
+#include <linux/printk.h>
+
+struct bch_dev;
+struct bch_fs;
+struct work_struct;
+
+/*
+ * XXX: separate out errors that indicate on disk data is inconsistent, and flag
+ * superblock as such
+ */
+
+/* Error messages: */
+
+/*
+ * Inconsistency errors: The on disk data is inconsistent. If these occur during
+ * initial recovery, they don't indicate a bug in the running code - we walk all
+ * the metadata before modifying anything. If they occur at runtime, they
+ * indicate either a bug in the running code or (less likely) data is being
+ * silently corrupted under us.
+ *
+ * XXX: audit all inconsistent errors and make sure they're all recoverable, in
+ * BCH_ON_ERROR_continue mode
+ */
+
+bool bch2_inconsistent_error(struct bch_fs *);
+
+void bch2_topology_error(struct bch_fs *);
+
+#define bch2_fs_inconsistent(c, ...)					\
+({									\
+	bch_err(c, __VA_ARGS__);					\
+	bch2_inconsistent_error(c);					\
+})
+
+#define bch2_fs_inconsistent_on(cond, c, ...)				\
+({									\
+	bool _ret = unlikely(!!(cond));					\
+									\
+	if (_ret)							\
+		bch2_fs_inconsistent(c, __VA_ARGS__);			\
+	_ret;								\
+})
+
+/*
+ * Later we might want to mark only the particular device inconsistent, not the
+ * entire filesystem:
+ */
+
+#define bch2_dev_inconsistent(ca, ...)					\
+do {									\
+	bch_err(ca, __VA_ARGS__);					\
+	bch2_inconsistent_error((ca)->fs);				\
+} while (0)
+
+#define bch2_dev_inconsistent_on(cond, ca, ...)				\
+({									\
+	bool _ret = unlikely(!!(cond));					\
+									\
+	if (_ret)							\
+		bch2_dev_inconsistent(ca, __VA_ARGS__);			\
+	_ret;								\
+})
+
+/*
+ * When a transaction update discovers or is causing a fs inconsistency, it's
+ * helpful to also dump the pending updates:
+ */
+#define bch2_trans_inconsistent(trans, ...)				\
+({									\
+	bch_err(trans->c, __VA_ARGS__);					\
+	bch2_dump_trans_updates(trans);					\
+	bch2_inconsistent_error(trans->c);				\
+})
+
+#define bch2_trans_inconsistent_on(cond, trans, ...)			\
+({									\
+	bool _ret = unlikely(!!(cond));					\
+									\
+	if (_ret)							\
+		bch2_trans_inconsistent(trans, __VA_ARGS__);		\
+	_ret;								\
+})
+
+/*
+ * Fsck errors: inconsistency errors we detect at mount time, and should ideally
+ * be able to repair:
+ */
+
+struct fsck_err_state {
+	struct list_head	list;
+	const char		*fmt;
+	u64			nr;
+	bool			ratelimited;
+	int			ret;
+	int			fix;
+	char			*last_msg;
+};
+
+#define FSCK_CAN_FIX		(1 << 0)
+#define FSCK_CAN_IGNORE		(1 << 1)
+#define FSCK_NEED_FSCK		(1 << 2)
+#define FSCK_NO_RATELIMIT	(1 << 3)
+
+__printf(3, 4) __cold
+int bch2_fsck_err(struct bch_fs *, unsigned, const char *, ...);
+void bch2_flush_fsck_errs(struct bch_fs *);
+
+#define __fsck_err(c, _flags, msg, ...)					\
+({									\
+	int _ret = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);	\
+									\
+	if (_ret != -BCH_ERR_fsck_fix &&				\
+	    _ret != -BCH_ERR_fsck_ignore) {				\
+		ret = _ret;						\
+		goto fsck_err;						\
+	}								\
+									\
+	_ret == -BCH_ERR_fsck_fix;					\
+})
+
+/* These macros return true if error should be fixed: */
+
+/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
+
+#define __fsck_err_on(cond, c, _flags, ...)				\
+	(unlikely(cond) ? __fsck_err(c, _flags,	##__VA_ARGS__) : false)
+
+#define need_fsck_err_on(cond, c, ...)					\
+	__fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
+
+#define need_fsck_err(c, ...)						\
+	__fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
+
+#define mustfix_fsck_err(c, ...)					\
+	__fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__)
+
+#define mustfix_fsck_err_on(cond, c, ...)				\
+	__fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__)
+
+#define fsck_err(c, ...)						\
+	__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
+
+#define fsck_err_on(cond, c, ...)					\
+	__fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
+
+/*
+ * Fatal errors: these don't indicate a bug, but we can't continue running in RW
+ * mode - pretty much just due to metadata IO errors:
+ */
+
+void bch2_fatal_error(struct bch_fs *);
+
+#define bch2_fs_fatal_error(c, ...)					\
+do {									\
+	bch_err(c, __VA_ARGS__);					\
+	bch2_fatal_error(c);						\
+} while (0)
+
+#define bch2_fs_fatal_err_on(cond, c, ...)				\
+({									\
+	bool _ret = unlikely(!!(cond));					\
+									\
+	if (_ret)							\
+		bch2_fs_fatal_error(c, __VA_ARGS__);			\
+	_ret;								\
+})
+
+/*
+ * IO errors: either recoverable metadata IO (because we have replicas), or data
+ * IO - we need to log it and print out a message, but we don't (necessarily)
+ * want to shut down the fs:
+ */
+
+void bch2_io_error_work(struct work_struct *);
+
+/* Does the error handling without logging a message */
+void bch2_io_error(struct bch_dev *);
+
+#define bch2_dev_io_err_on(cond, ca, ...)				\
+({									\
+	bool _ret = (cond);						\
+									\
+	if (_ret) {							\
+		bch_err_dev_ratelimited(ca, __VA_ARGS__);		\
+		bch2_io_error(ca);					\
+	}								\
+	_ret;								\
+})
+
+#define bch2_dev_inum_io_err_on(cond, ca, ...)				\
+({									\
+	bool _ret = (cond);						\
+									\
+	if (_ret) {							\
+		bch_err_inum_offset_ratelimited(ca, __VA_ARGS__);	\
+		bch2_io_error(ca);					\
+	}								\
+	_ret;								\
+})
+
+#endif /* _BCACHEFS_ERROR_H */
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
new file mode 100644
index 000000000000..21af6fb8cecf
--- /dev/null
+++ b/fs/bcachefs/extent_update.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "debug.h"
+#include "extents.h"
+#include "extent_update.h"
+
+/*
+ * This counts the number of iterators to the alloc & ec btrees we'll need
+ * inserting/removing this extent:
+ */
+static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *e;
+	unsigned nr_ptrs = 0, nr_cached = 0;
+
+	bkey_extent_entry_for_each(ptrs, e) {
+		unsigned type = __extent_entry_type(e);
+
+		if (type == BCH_EXTENT_ENTRY_ptr ||
+		    type == BCH_EXTENT_ENTRY_stripe_ptr) {
+			/* A cached data pointer might also be updating the LRU btree */
+			if (type == BCH_EXTENT_ENTRY_ptr && e->ptr.cached)
+				nr_cached++;
+
+			nr_ptrs++;
+		}
+	}
+
+	/*
+	 * Updating keys in the alloc btree may also update keys in the
+	 * freespace or discard btrees, so each counted entry weighs double:
+	 */
+	return nr_cached + nr_ptrs * 2;
+}
+
+static int count_iters_for_insert(struct btree_trans *trans,
+				  struct bkey_s_c k,
+				  unsigned offset,
+				  struct bpos *end,
+				  unsigned *nr_iters,
+				  unsigned max_iters)
+{
+	int ret = 0, ret2 = 0; /* ret: 1 once max_iters is reached; ret2: btree iteration error */
+
+	if (*nr_iters >= max_iters) {
+		*end = bpos_min(*end, k.k->p);
+		ret = 1;
+	}
+
+	switch (k.k->type) {
+	case KEY_TYPE_extent:
+	case KEY_TYPE_reflink_v:
+		*nr_iters += bch2_bkey_nr_alloc_ptrs(k);
+
+		if (*nr_iters >= max_iters) {
+			*end = bpos_min(*end, k.k->p);
+			ret = 1;
+		}
+
+		break;
+	case KEY_TYPE_reflink_p: {
+		struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+		u64 idx = le64_to_cpu(p.v->idx);
+		unsigned sectors = bpos_min(*end, p.k->p).offset -
+			bkey_start_offset(p.k);
+		struct btree_iter iter;
+		struct bkey_s_c r_k;
+
+		/* Walk the reflink btree keys in [idx + offset, idx + sectors): */
+		for_each_btree_key_norestart(trans, iter,
+				   BTREE_ID_reflink, POS(0, idx + offset),
+				   BTREE_ITER_SLOTS, r_k, ret2) {
+			if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors)))
+				break;
+
+			/* extent_update_to_keys(), for the reflink_v update */
+			*nr_iters += 1;
+
+			*nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k);
+
+			if (*nr_iters >= max_iters) {
+				struct bpos pos = bkey_start_pos(k.k);
+				pos.offset += min_t(u64, k.k->size,
+						    r_k.k->p.offset - idx);
+
+				*end = bpos_min(*end, pos);
+				ret = 1;
+				break;
+			}
+		}
+		bch2_trans_iter_exit(trans, &iter);
+		break;
+	}
+	}
+
+	return ret2 ?: ret; /* an iteration error takes precedence over the limit flag */
+}
+
+#define EXTENT_ITERS_MAX	(BTREE_ITER_MAX / 3) /* max_iters limit passed to count_iters_for_insert() */
+
+int bch2_extent_atomic_end(struct btree_trans *trans,
+			   struct btree_iter *iter,
+			   struct bkey_i *insert,
+			   struct bpos *end)
+{
+	struct btree_iter copy;
+	struct bkey_s_c k;
+	unsigned nr_iters = 0;
+	int ret;
+
+	ret = bch2_btree_iter_traverse(iter);
+	if (ret)
+		return ret;
+
+	*end = insert->k.p; /* upper bound; the calls below only lower it via bpos_min() */
+
+	/* extent_update_to_keys(): */
+	nr_iters += 1;
+
+	ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end,
+				     &nr_iters, EXTENT_ITERS_MAX / 2); /* insert itself: half the limit */
+	if (ret < 0)
+		return ret;
+
+	bch2_trans_copy_iter(&copy, iter); /* iterate on a copy; released below */
+
+	for_each_btree_key_upto_continue_norestart(copy, insert->k.p, 0, k, ret) {
+		unsigned offset = 0;
+
+		if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k)))
+			offset = bkey_start_offset(&insert->k) -
+				bkey_start_offset(k.k);
+
+		/* extent_handle_overwrites(): */
+		switch (bch2_extent_overlap(&insert->k, k.k)) {
+		case BCH_EXTENT_OVERLAP_ALL:
+		case BCH_EXTENT_OVERLAP_FRONT:
+			nr_iters += 1;
+			break;
+		case BCH_EXTENT_OVERLAP_BACK:
+		case BCH_EXTENT_OVERLAP_MIDDLE:
+			nr_iters += 2;
+			break;
+		}
+
+		ret = count_iters_for_insert(trans, k, offset, end,
+					&nr_iters, EXTENT_ITERS_MAX);
+		if (ret) /* 1: limit reached, *end already clamped; < 0: error */
+			break;
+	}
+
+	bch2_trans_iter_exit(trans, &copy);
+	return ret < 0 ? ret : 0; /* ret == 1 (limit reached) is reported as success */
+}
+
+int bch2_extent_trim_atomic(struct btree_trans *trans,
+			    struct btree_iter *iter,
+			    struct bkey_i *k)
+{
+	struct bpos atomic_end;
+	int err;
+
+	err = bch2_extent_atomic_end(trans, iter, k, &atomic_end);
+	/* Only trim @k when the atomic end position was computed successfully: */
+	if (!err)
+		bch2_cut_back(atomic_end, k);
+
+	return err;
+}
diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h
new file mode 100644
index 000000000000..6f5cf449361a
--- /dev/null
+++ b/fs/bcachefs/extent_update.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EXTENT_UPDATE_H
+#define _BCACHEFS_EXTENT_UPDATE_H
+
+#include "bcachefs.h"
+
+int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *,
+			   struct bkey_i *, struct bpos *);
+int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *,
+			    struct bkey_i *);
+
+#endif /* _BCACHEFS_EXTENT_UPDATE_H */
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
new file mode 100644
index 000000000000..1b25f84e4b9c
--- /dev/null
+++ b/fs/bcachefs/extents.c
@@ -0,0 +1,1403 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
+ *
+ * Code for managing the extent btree and dynamically updating the writeback
+ * dirty sector count.
+ */
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "btree_gc.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "debug.h"
+#include "disk_groups.h"
+#include "error.h"
+#include "extents.h"
+#include "inode.h"
+#include "journal.h"
+#include "replicas.h"
+#include "super.h"
+#include "super-io.h"
+#include "trace.h"
+#include "util.h"
+
+static const unsigned bch2_crc_field_size_max[] = { /* read-only; indexed by crc entry type */
+	[BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
+	[BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
+	[BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX,
+};
+
+static void bch2_extent_crc_pack(union bch_extent_crc *,
+				 struct bch_extent_crc_unpacked,
+				 enum bch_extent_entry_type);
+
+static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f,
+						   unsigned dev)
+{
+	unsigned slot;
+
+	/* Linear search of the f->nr entries recorded so far: */
+	for (slot = 0; slot < f->nr; slot++)
+		if (f->devs[slot].dev == dev)
+			return &f->devs[slot];
+	return NULL;
+}
+
+void bch2_mark_io_failure(struct bch_io_failures *failed,
+			  struct extent_ptr_decoded *p)
+{
+	struct bch_dev_io_failures *slot = dev_io_failures(failed, p->ptr.dev);
+
+	/* Known device failing again at the same idx: just count it */
+	if (slot && slot->idx == p->idx) {
+		slot->nr_failed++;
+		return;
+	}
+
+	if (!slot) { /* first failure on this device: claim a new entry */
+		BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
+		slot = &failed->devs[failed->nr++];
+		slot->dev = p->ptr.dev;
+	}
+
+	slot->idx	 = p->idx;
+	slot->nr_failed	 = 1;
+	slot->nr_retries = 0;
+}
+
+/*
+ * returns true if p1 is better than p2:
+ */
+static inline bool ptr_better(struct bch_fs *c,
+			      const struct extent_ptr_decoded p1,
+			      const struct extent_ptr_decoded p2)
+{
+	struct bch_dev *dev1, *dev2;
+	u64 l1, l2;
+
+	/* Either pointer has a nonzero idx: compare by idx, not by latency */
+	if (unlikely(p1.idx || p2.idx))
+		return bch2_force_reconstruct_read
+			? p1.idx > p2.idx
+			: p1.idx < p2.idx;
+
+	dev1 = bch_dev_bkey_exists(c, p1.ptr.dev);
+	dev2 = bch_dev_bkey_exists(c, p2.ptr.dev);
+	l1 = atomic64_read(&dev1->cur_latency[READ]);
+	l2 = atomic64_read(&dev2->cur_latency[READ]);
+
+	/* Pick at random, biased in favor of the faster device: */
+	return bch2_rand_range(l1 + l2) > l1;
+}
+
+/*
+ * Picks the pointer to read from into @pick; @failed (may be NULL) records
+ * earlier failures. Returns 1 if @pick was set, 0 if there is nothing to read
+ * (e.g. unwritten), -EIO for KEY_TYPE_error or if no dirty pointer is usable.
+ */
+int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
+			       struct bch_io_failures *failed,
+			       struct extent_ptr_decoded *pick)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	struct bch_dev_io_failures *f;
+	struct bch_dev *ca;
+	int ret = 0;
+
+	if (k.k->type == KEY_TYPE_error)
+		return -EIO;
+
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		/*
+		 * Unwritten extent: no need to actually read, treat it as a
+		 * hole and return 0s:
+		 */
+		if (p.ptr.unwritten)
+			return 0;
+
+		ca = bch_dev_bkey_exists(c, p.ptr.dev);
+
+		/*
+		 * If there are any dirty pointers it's an error if we can't
+		 * read:
+		 */
+		if (!ret && !p.ptr.cached)
+			ret = -EIO;
+
+		if (p.ptr.cached && ptr_stale(ca, &p.ptr))
+			continue;
+
+		f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL;
+		if (f) /* resume at the recorded idx, or move past it once retries are used up */
+			p.idx = f->nr_failed < f->nr_retries
+				? f->idx
+				: f->idx + 1;
+
+		if (!p.idx &&
+		    !bch2_dev_is_readable(ca))
+			p.idx++;
+
+		if (bch2_force_reconstruct_read &&
+		    !p.idx && p.has_ec)
+			p.idx++;
+
+		if (p.idx >= (unsigned) p.has_ec + 1) /* idx 1 is only valid when has_ec is set */
+			continue;
+
+		if (ret > 0 && !ptr_better(c, p, *pick)) /* keep the earlier pick */
+			continue;
+
+		*pick = p;
+		ret = 1;
+	}
+
+	return ret;
+}
+
+/* KEY_TYPE_btree_ptr: */
+
+int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k,
+			   enum bkey_invalid_flags flags,
+			   struct printbuf *err)
+{
+	/* Value size is acceptable: defer to the generic pointer validation */
+	if (bkey_val_u64s(k.k) <= BCH_REPLICAS_MAX)
+		return bch2_bkey_ptrs_invalid(c, k, flags, err);
+
+	prt_printf(err, "value too big (%zu > %u)",
+		   bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
+	return -BCH_ERR_invalid_bkey;
+}
+
+void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
+			    struct bkey_s_c k)
+{
+	bch2_bkey_ptrs_to_text(out, c, k); /* delegates entirely to the generic pointer printer */
+}
+
+int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
+			      enum bkey_invalid_flags flags,
+			      struct printbuf *err)
+{
+	/* Value size is acceptable: defer to the generic pointer validation */
+	if (bkey_val_u64s(k.k) <= BKEY_BTREE_PTR_VAL_U64s_MAX)
+		return bch2_bkey_ptrs_invalid(c, k, flags, err);
+
+	prt_printf(err, "value too big (%zu > %zu)",
+		   bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
+	return -BCH_ERR_invalid_bkey;
+}
+
+void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
+			       struct bkey_s_c k)
+{
+	struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
+	/* "R " is printed ahead of min_key when BTREE_PTR_RANGE_UPDATED is set */
+	const char *range_updated = BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : "";
+
+	prt_printf(out, "seq %llx written %u min_key %s",
+		   le64_to_cpu(bp.v->seq), le16_to_cpu(bp.v->sectors_written),
+		   range_updated);
+	bch2_bpos_to_text(out, bp.v->min_key);
+	prt_printf(out, " ");
+	bch2_bkey_ptrs_to_text(out, c, k);
+}
+
+void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
+			      unsigned big_endian, int write,
+			      struct bkey_s k)
+{
+	struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k);
+
+	compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key);
+
+	/* Only pre-inode_btree_change extents btrees need min_key adjusted: */
+	if (version >= bcachefs_metadata_version_inode_btree_change ||
+	    !btree_id_is_extents(btree_id) || bkey_eq(bp.v->min_key, POS_MIN))
+		return;
+	bp.v->min_key = write ? bpos_nosnap_predecessor(bp.v->min_key)
+			      : bpos_nosnap_successor(bp.v->min_key);
+}
+
+/* KEY_TYPE_extent: */
+
+bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
+{
+	struct bkey_ptrs   l_ptrs = bch2_bkey_ptrs(l);
+	struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r);
+	union bch_extent_entry *en_l;
+	const union bch_extent_entry *en_r;
+	struct extent_ptr_decoded lp, rp;
+	bool use_right_ptr;
+	struct bch_dev *ca;
+
+	en_l = l_ptrs.start;
+	en_r = r_ptrs.start;
+	while (en_l < l_ptrs.end && en_r < r_ptrs.end) { /* pass 1: entry types must match pairwise */
+		if (extent_entry_type(en_l) != extent_entry_type(en_r))
+			return false;
+
+		en_l = extent_entry_next(en_l);
+		en_r = extent_entry_next(en_r);
+	}
+
+	if (en_l < l_ptrs.end || en_r < r_ptrs.end) /* one side has entries left over */
+		return false;
+
+	en_l = l_ptrs.start;
+	en_r = r_ptrs.start;
+	lp.crc = bch2_extent_crc_unpack(l.k, NULL);
+	rp.crc = bch2_extent_crc_unpack(r.k, NULL);
+
+	/* pass 2: check every decoded pointer pair; nothing is modified yet */
+	while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) &&
+	       __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) {
+		if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size !=
+		    rp.ptr.offset + rp.crc.offset ||
+		    lp.ptr.dev			!= rp.ptr.dev ||
+		    lp.ptr.gen			!= rp.ptr.gen ||
+		    lp.ptr.unwritten		!= rp.ptr.unwritten ||
+		    lp.has_ec			!= rp.has_ec)
+			return false;
+
+		/* Extents may not straddle buckets: */
+		ca = bch_dev_bkey_exists(c, lp.ptr.dev);
+		if (PTR_BUCKET_NR(ca, &lp.ptr) != PTR_BUCKET_NR(ca, &rp.ptr))
+			return false;
+
+		if (lp.has_ec			!= rp.has_ec || /* already checked above */
+		    (lp.has_ec &&
+		     (lp.ec.block		!= rp.ec.block ||
+		      lp.ec.redundancy		!= rp.ec.redundancy ||
+		      lp.ec.idx			!= rp.ec.idx)))
+			return false;
+
+		if (lp.crc.compression_type	!= rp.crc.compression_type ||
+		    lp.crc.nonce		!= rp.crc.nonce)
+			return false;
+
+		if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <=
+		    lp.crc.uncompressed_size) {
+			/* can use left extent's crc entry */
+		} else if (lp.crc.live_size <= rp.crc.offset) {
+			/* can use right extent's crc entry */
+		} else {
+			/* check if checksums can be merged: */
+			if (lp.crc.csum_type		!= rp.crc.csum_type ||
+			    lp.crc.nonce		!= rp.crc.nonce ||
+			    crc_is_compressed(lp.crc) ||
+			    !bch2_checksum_mergeable(lp.crc.csum_type))
+				return false;
+
+			if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size ||
+			    rp.crc.offset)
+				return false;
+
+			if (lp.crc.csum_type &&
+			    lp.crc.uncompressed_size +
+			    rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
+				return false;
+		}
+
+		en_l = extent_entry_next(en_l);
+		en_r = extent_entry_next(en_r);
+	}
+
+	en_l = l_ptrs.start;
+	en_r = r_ptrs.start;
+	while (en_l < l_ptrs.end && en_r < r_ptrs.end) { /* pass 3: summed sizes vs. per-type maximum */
+		if (extent_entry_is_crc(en_l)) {
+			struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
+			struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
+
+			if (crc_l.uncompressed_size + crc_r.uncompressed_size >
+			    bch2_crc_field_size_max[extent_entry_type(en_l)])
+				return false;
+		}
+
+		en_l = extent_entry_next(en_l);
+		en_r = extent_entry_next(en_r);
+	}
+
+	use_right_ptr = false;
+	en_l = l_ptrs.start;
+	en_r = r_ptrs.start;
+	while (en_l < l_ptrs.end) { /* pass 4: all checks passed, now modify l in place */
+		if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr &&
+		    use_right_ptr)
+			en_l->ptr = en_r->ptr;
+
+		if (extent_entry_is_crc(en_l)) {
+			struct bch_extent_crc_unpacked crc_l =
+				bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
+			struct bch_extent_crc_unpacked crc_r =
+				bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
+
+			use_right_ptr = false; /* re-decided for the pointers following this crc entry */
+
+			if (crc_l.offset + crc_l.live_size + crc_r.live_size <=
+			    crc_l.uncompressed_size) {
+				/* can use left extent's crc entry */
+			} else if (crc_l.live_size <= crc_r.offset) {
+				/* can use right extent's crc entry */
+				crc_r.offset -= crc_l.live_size;
+				bch2_extent_crc_pack(entry_to_crc(en_l), crc_r,
+						     extent_entry_type(en_l));
+				use_right_ptr = true;
+			} else {
+				crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
+								 crc_l.csum,
+								 crc_r.csum,
+								 crc_r.uncompressed_size << 9);
+
+				crc_l.uncompressed_size	+= crc_r.uncompressed_size;
+				crc_l.compressed_size	+= crc_r.compressed_size;
+				bch2_extent_crc_pack(entry_to_crc(en_l), crc_l,
+						     extent_entry_type(en_l));
+			}
+		}
+
+		en_l = extent_entry_next(en_l);
+		en_r = extent_entry_next(en_r);
+	}
+
+	bch2_key_resize(l.k, l.k->size + r.k->size); /* l now also covers r's size */
+	return true;
+}
+
+/* KEY_TYPE_reservation: */
+
+int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k,
+			     enum bkey_invalid_flags flags,
+			     struct printbuf *err)
+{
+	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+	bool ok = r.v->nr_replicas && r.v->nr_replicas <= BCH_REPLICAS_MAX;
+
+	/* Valid range for nr_replicas is 1..BCH_REPLICAS_MAX: */
+	if (ok)
+		return 0;
+
+	prt_printf(err, "invalid nr_replicas (%u)", r.v->nr_replicas);
+	return -BCH_ERR_invalid_bkey;
+}
+
+void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
+			      struct bkey_s_c k)
+{
+	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+	u32 generation = le32_to_cpu(r.v->generation);
+
+	/* generation is a little-endian field; nr_replicas is printed as stored */
+	prt_printf(out, "generation %u replicas %u", generation, r.v->nr_replicas);
+}
+
+bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
+{
+	struct bkey_s_reservation l = bkey_s_to_reservation(_l);
+	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r);
+	bool same = l.v->generation == r.v->generation &&
+		    l.v->nr_replicas == r.v->nr_replicas;
+
+	/* Reservations merge only when generation and replica count agree */
+	if (same)
+		bch2_key_resize(l.k, l.k->size + r.k->size);
+	return same;
+}
+
+/* Extent checksum entries: */
+
+/* returns true if any field, or the checksum itself, differs */
+static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
+					 struct bch_extent_crc_unpacked r)
+{
+	return !(l.csum_type		== r.csum_type &&
+		 l.compression_type	== r.compression_type &&
+		 l.compressed_size	== r.compressed_size &&
+		 l.uncompressed_size	== r.uncompressed_size &&
+		 l.offset		== r.offset &&
+		 l.live_size		== r.live_size &&
+		 l.nonce		== r.nonce &&
+		 !(bch2_crc_cmp(l.csum, r.csum)));
+}
+
+static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
+				  struct bch_extent_crc_unpacked n)
+{
+	/* @u must be uncompressed, checksummed, and larger than its live part: */
+	if (crc_is_compressed(u) || !u.csum_type || u.uncompressed_size <= u.live_size)
+		return false;
+	return bch2_csum_type_is_encryption(u.csum_type) ==
+		bch2_csum_type_is_encryption(n.csum_type);
+}
+
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c k,
+				 struct bch_extent_crc_unpacked n)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *pos;
+	struct bch_extent_crc_unpacked cur;
+
+	/* With no checksum type in @n there is nothing to narrow to: */
+	if (!n.csum_type)
+		return false;
+
+	bkey_for_each_crc(k.k, ptrs, cur, pos)
+		if (can_narrow_crc(cur, n))
+			return true;
+	return false;
+}
+
+/*
+ * We're writing another replica for this extent, so while we've got the data in
+ * memory we'll be computing a new checksum for the currently live data.
+ *
+ * If there are other replicas we aren't moving, and they are checksummed but
+ * not compressed, we can modify them to point to only the data that is
+ * currently live (so that readers won't have to bounce) while we've got the
+ * checksum we need:
+ */
+bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
+{
+	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+	struct bch_extent_crc_unpacked u;
+	struct extent_ptr_decoded p;
+	union bch_extent_entry *i;
+	bool ret = false; /* true once at least one pointer was narrowed */
+
+	/* Find a checksum entry that covers only live data: */
+	if (!n.csum_type) {
+		bkey_for_each_crc(&k->k, ptrs, u, i)
+			if (!crc_is_compressed(u) &&
+			    u.csum_type &&
+			    u.live_size == u.uncompressed_size) {
+				n = u;
+				goto found;
+			}
+		return false;
+	}
+found:
+	BUG_ON(crc_is_compressed(n));
+	BUG_ON(n.offset);
+	BUG_ON(n.live_size != k->k.size);
+
+restart_narrow_pointers:
+	ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); /* recomputed on every restart */
+
+	bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
+		if (can_narrow_crc(p.crc, n)) {
+			bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr);
+			p.ptr.offset += p.crc.offset; /* fold the old crc offset in; n.offset is 0 (BUG_ON above) */
+			p.crc = n;
+			bch2_extent_ptr_decoded_append(k, &p);
+			ret = true;
+			goto restart_narrow_pointers; /* the key was modified: rescan from the start */
+		}
+
+	return ret;
+}
+
+static void bch2_extent_crc_pack(union bch_extent_crc *dst,
+				 struct bch_extent_crc_unpacked src,
+				 enum bch_extent_entry_type type)
+{
+#define set_common_fields(_dst, _src)					\
+		_dst.type		= 1 << type;			\
+		_dst.csum_type		= _src.csum_type,		\
+		_dst.compression_type	= _src.compression_type,	\
+		_dst._compressed_size	= _src.compressed_size - 1,	\
+		_dst._uncompressed_size	= _src.uncompressed_size - 1,	\
+		_dst.offset		= _src.offset /* both sizes are stored minus one */
+
+	switch (type) {
+	case BCH_EXTENT_ENTRY_crc32:
+		set_common_fields(dst->crc32, src);
+		dst->crc32.csum		= (u32 __force) *((__le32 *) &src.csum.lo);
+		break;
+	case BCH_EXTENT_ENTRY_crc64:
+		set_common_fields(dst->crc64, src);
+		dst->crc64.nonce	= src.nonce;
+		dst->crc64.csum_lo	= (u64 __force) src.csum.lo;
+		dst->crc64.csum_hi	= (u64 __force) *((__le16 *) &src.csum.hi);
+		break;
+	case BCH_EXTENT_ENTRY_crc128:
+		set_common_fields(dst->crc128, src);
+		dst->crc128.nonce	= src.nonce;
+		dst->crc128.csum	= src.csum;
+		break;
+	default: /* @type is not one of the three crc entry types */
+		BUG();
+	}
+#undef set_common_fields /* NOTE(review): expands to two statements (';' after .type) */
+}
+
+void bch2_extent_crc_append(struct bkey_i *k,
+			    struct bch_extent_crc_unpacked new)
+{
+	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+	union bch_extent_crc *crc = (void *) ptrs.end; /* new entry is packed at the current end */
+	enum bch_extent_entry_type type;
+
+	/* First of crc32/crc64/crc128 whose checksum width, size and nonce limits fit: */
+	if (bch_crc_bytes[new.csum_type]	<= 4 &&
+	    new.uncompressed_size		<= CRC32_SIZE_MAX &&
+	    new.nonce				<= CRC32_NONCE_MAX)
+		type = BCH_EXTENT_ENTRY_crc32;
+	else if (bch_crc_bytes[new.csum_type]	<= 10 &&
+		   new.uncompressed_size	<= CRC64_SIZE_MAX &&
+		   new.nonce			<= CRC64_NONCE_MAX)
+		type = BCH_EXTENT_ENTRY_crc64;
+	else if (bch_crc_bytes[new.csum_type]	<= 16 &&
+		   new.uncompressed_size	<= CRC128_SIZE_MAX &&
+		   new.nonce			<= CRC128_NONCE_MAX)
+		type = BCH_EXTENT_ENTRY_crc128;
+	else
+		BUG();
+	bch2_extent_crc_pack(crc, new, type);
+
+	k->k.u64s += extent_entry_u64s(ptrs.end); /* ptrs.end is the entry just packed */
+
+	EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
+}
+
+/* Generic code for keys with pointers: */
+
+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
+{
+	return bch2_bkey_devs(k).nr; /* .nr of whatever bch2_bkey_devs() returns for @k */
+}
+
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
+{
+	if (k.k->type == KEY_TYPE_reservation)
+		return bkey_s_c_to_reservation(k).v->nr_replicas;
+	return bch2_bkey_dirty_devs(k).nr; /* otherwise: .nr of bch2_bkey_dirty_devs() */
+}
+
+unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k)
+{
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	struct bkey_ptrs_c ptrs;
+	unsigned nr = 0;
+
+	if (k.k->type == KEY_TYPE_reservation)
+		return bkey_s_c_to_reservation(k).v->nr_replicas;
+
+	/* Count the non-cached pointers whose data is not compressed: */
+	ptrs = bch2_bkey_ptrs_c(k);
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+		if (!p.ptr.cached && !crc_is_compressed(p.crc))
+			nr++;
+	return nr;
+}
+
+unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	unsigned sectors = 0;
+
+	/* Sum compressed_size over the non-cached pointers with compressed data: */
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+		if (!p.ptr.cached && crc_is_compressed(p.crc))
+			sectors += p.crc.compressed_size;
+	return sectors;
+}
+
+bool bch2_bkey_is_incompressible(struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	struct bch_extent_crc_unpacked cur;
+	const union bch_extent_entry *pos;
+
+	bkey_for_each_crc(k.k, ptrs, cur, pos)
+		if (cur.compression_type == BCH_COMPRESSION_TYPE_incompressible)
+			return true; /* one such crc entry is enough */
+	return false;
+}
+
+unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p = { 0 };
+	unsigned nr = 0;
+
+	/*
+	 * Every non-cached pointer counts once; one with has_ec set counts
+	 * its ec.redundancy on top of that:
+	 */
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		unsigned extra = p.has_ec ? p.ec.redundancy : 0;
+
+		if (!p.ptr.cached)
+			nr += 1 + extra;
+	}
+
+	return nr;
+}
+
+unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
+{
+	unsigned durability;
+
+	/* Cached pointers contribute no durability: */
+	if (p->ptr.cached)
+		return 0;
+
+	durability = bch_dev_bkey_exists(c, p->ptr.dev)->mi.durability;
+	if (p->has_ec)
+		durability += p->ec.redundancy;
+
+	return durability;
+}
+
+unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
+{
+	struct bch_dev *ca;
+	unsigned durability;
+
+	if (p->ptr.cached)
+		return 0;
+
+	ca = bch_dev_bkey_exists(c, p->ptr.dev);
+	/* A device in the failed state contributes no durability: */
+	if (ca->mi.state == BCH_MEMBER_STATE_failed)
+		return 0;
+	durability = ca->mi.durability;
+	if (p->has_ec)
+		durability += p->ec.redundancy;
+	return durability;
+}
+
+unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	unsigned total = 0;
+
+	/* Sum of the per-pointer durability over every pointer in @k: */
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+		total += bch2_extent_ptr_durability(c, &p);
+	return total;
+}
+
+static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	unsigned total = 0;
+
+	/* Only count pointers whose device index is in range and populated: */
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+		if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
+			total += bch2_extent_ptr_durability(c, &p);
+	return total;
+}
+
+void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
+{
+	union bch_extent_entry *next = extent_entry_next(entry);
+
+	/* Drop @entry from @k; sized via @next since the move overwrites @entry */
+	memmove_u64s(entry, next, (u64 *) bkey_val_end(bkey_i_to_s(k)) - (u64 *) next);
+	k->k.u64s -= (u64 *) next - (u64 *) entry;
+}
+
+void bch2_extent_ptr_decoded_append(struct bkey_i *k,
+				    struct extent_ptr_decoded *p)
+{
+	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+	struct bch_extent_crc_unpacked crc =
+		bch2_extent_crc_unpack(&k->k, NULL);
+	union bch_extent_entry *pos;
+
+	if (!bch2_crc_unpacked_cmp(crc, p->crc)) { /* equals the crc unpacked with no entry: insert at the front */
+		pos = ptrs.start;
+		goto found;
+	}
+
+	bkey_for_each_crc(&k->k, ptrs, crc, pos)
+		if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
+			pos = extent_entry_next(pos); /* insert right after the matching crc entry */
+			goto found;
+		}
+
+	bch2_extent_crc_append(k, p->crc); /* no matching crc entry yet: add one at the end */
+	pos = bkey_val_end(bkey_i_to_s(k));
+found:
+	p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
+	__extent_entry_insert(k, pos, to_entry(&p->ptr));
+
+	if (p->has_ec) {
+		p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
+		__extent_entry_insert(k, pos, to_entry(&p->ec)); /* same pos; presumably lands before the ptr - confirm */
+	}
+}
+
+static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
+					  union bch_extent_entry *entry)
+{
+	union bch_extent_entry *prev = NULL;
+	union bch_extent_entry *cur;
+
+	/* Walk forward from the start, remembering the entry before @entry: */
+	for (cur = ptrs.start; cur != entry; cur = extent_entry_next(cur))
+		prev = cur;
+
+	return prev;
+}
+
+static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
+{
+	u64 *dst = (u64 *) entry;
+	u64 *src = (u64 *) extent_entry_next(entry);
+
+	/* stripes have ptrs, but their layout doesn't work with this code */
+	BUG_ON(k.k->type == KEY_TYPE_stripe);
+
+	memmove_u64s_down(dst, src, (u64 *) bkey_val_end(k) - src);
+	k.k->u64s -= src - dst;
+}
+
+/*
+ * Returns pointer to the next entry after the one being dropped:
+ */
+union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k,
+						   struct bch_extent_ptr *ptr)
+{
+	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+	union bch_extent_entry *entry = to_entry(ptr), *next;
+	union bch_extent_entry *ret = entry;
+	bool drop_crc = true; /* cleared when another ptr follows before the next crc entry */
+
+	EBUG_ON(ptr < &ptrs.start->ptr ||
+		ptr >= &ptrs.end->ptr);
+	EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
+
+	for (next = extent_entry_next(entry);
+	     next != ptrs.end;
+	     next = extent_entry_next(next)) {
+		if (extent_entry_is_crc(next)) {
+			break;
+		} else if (extent_entry_is_ptr(next)) {
+			drop_crc = false;
+			break;
+		}
+	}
+
+	extent_entry_drop(k, entry);
+
+	while ((entry = extent_entry_prev(ptrs, entry))) { /* walk backwards from the dropped ptr */
+		if (extent_entry_is_ptr(entry))
+			break;
+
+		if ((extent_entry_is_crc(entry) && drop_crc) ||
+		    extent_entry_is_stripe_ptr(entry)) {
+			ret = (void *) ret - extent_entry_bytes(entry); /* an entry in front of ret goes away */
+			extent_entry_drop(k, entry);
+		}
+	}
+
+	return ret;
+}
+
+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
+					   struct bch_extent_ptr *ptr)
+{
+	bool had_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
+	union bch_extent_entry *next = bch2_bkey_drop_ptr_noerror(k, ptr);
+	bool lost_dirty;
+
+	/*
+	 * If we deleted all the dirty pointers and there's still cached
+	 * pointers, we could set the cached pointers to dirty if they're not
+	 * stale - but to do that correctly we'd need to grab an open_bucket
+	 * reference so that we don't race with bucket reuse:
+	 */
+	lost_dirty = had_dirty && !bch2_bkey_dirty_devs(k.s_c).nr;
+
+	/* Nothing more to do while pointers remain and no dirty data was lost */
+	if (!lost_dirty && bch2_bkey_nr_ptrs(k.s_c))
+		return next;
+
+	/* Last dirty pointer gone: error key. No pointers left at all: deleted */
+	/* key. Either way the value is emptied and there is no next entry. */
+	k.k->type = lost_dirty ? KEY_TYPE_error : KEY_TYPE_deleted;
+	set_bkey_val_u64s(k.k, 0);
+
+	return NULL;
+}
+
+void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
+{
+	struct bch_extent_ptr *ptr; /* never initialised here: the macro below assigns it */
+
+	bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
+}
+
+void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
+{
+	struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev); /* may be NULL */
+
+	if (ptr) /* drops only the one pointer returned above */
+		bch2_bkey_drop_ptr_noerror(k, ptr);
+}
+
+const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *cur;
+
+	/* First pointer on @dev wins; NULL when @k has none there */
+	bkey_for_each_ptr(ptrs, cur)
+		if (cur->dev == dev)
+			return cur;
+	return NULL;
+}
+
+bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *ptr;
+
+	bkey_for_each_ptr(ptrs, ptr)
+		if (bch2_dev_in_target(c, ptr->dev, target)) {
+			/* A cached pointer only counts while it isn't stale: */
+			if (!ptr->cached || !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))
+				return true;
+		}
+	return false;
+}
+
+bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
+			   struct bch_extent_ptr m, u64 offset)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	/* Loop-invariant right-hand side of the offset comparison below: */
+	u64 want = (s64) m.offset - offset;
+
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+		if (p.ptr.dev	== m.dev &&
+		    p.ptr.gen	== m.gen &&
+		    (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == want)
+			return true;
+	return false;
+}
+
+/*
+ * Returns true if two extents refer to the same data:
+ */
+bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
+{
+	const union bch_extent_entry *entry1, *entry2;
+	struct extent_ptr_decoded p1, p2;
+	struct bkey_ptrs_c ptrs1, ptrs2;
+
+	if (k1.k->type != k2.k->type)
+		return false;
+
+	/* KEY_TYPE_deleted, etc.: equal types are all there is to compare */
+	if (!bkey_extent_is_direct_data(k1.k))
+		return true;
+
+	ptrs1 = bch2_bkey_ptrs_c(k1);
+	ptrs2 = bch2_bkey_ptrs_c(k2);
+	if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2))
+		return false;
+
+	bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
+		bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+			if (p1.ptr.dev		== p2.ptr.dev &&
+			    p1.ptr.gen		== p2.ptr.gen &&
+			    (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+			    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+				return true;
+
+	return false;
+}
+
+struct bch_extent_ptr *
+bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2)
+{
+	struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2);
+	union bch_extent_entry *entry2;
+	struct extent_ptr_decoded p2;
+	/* @p1's side of the offset comparison does not change per iteration: */
+	u64 want = (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k);
+
+	bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+		if (p1.ptr.dev == p2.ptr.dev &&
+		    p1.ptr.gen == p2.ptr.gen &&
+		    want == (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+			return &entry2->ptr;
+	return NULL;
+}
+
+void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr)
+{
+	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+	union bch_extent_entry *entry;
+	union bch_extent_entry *ec = NULL; /* last stripe ptr seen since the previous ptr entry */
+
+	bkey_extent_entry_for_each(ptrs, entry) {
+		if (&entry->ptr == ptr) {
+			ptr->cached = true;
+			if (ec) /* a stripe ptr directly preceded @ptr: remove it */
+				extent_entry_drop(k, ec);
+			return;
+		}
+
+		if (extent_entry_is_stripe_ptr(entry))
+			ec = entry;
+		else if (extent_entry_is_ptr(entry))
+			ec = NULL;
+	}
+
+	BUG(); /* @ptr was not found among @k's entries */
+}
+
+/*
+ * bch2_extent_normalize - clean up an extent, dropping stale cached pointers
+ *
+ * Returns true if @k should be dropped entirely
+ *
+ * For existing keys, only called when btree nodes are being rewritten, not when
+ * they're merely being compacted/resorted in memory.
+ */
+bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
+{
+	struct bch_extent_ptr *ptr; /* assigned by the macro below */
+
+	bch2_bkey_drop_ptrs(k, ptr,
+		ptr->cached &&
+		ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr));
+
+	return bkey_deleted(k.k);
+}
+
+void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+			    struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct bch_extent_crc_unpacked crc;
+	const struct bch_extent_ptr *ptr;
+	const struct bch_extent_stripe_ptr *ec;
+	struct bch_dev *ca;
+	bool first = true;
+
+	if (c) /* @c may be NULL; then durability and per-device details are left out */
+		prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k));
+
+	bkey_extent_entry_for_each(ptrs, entry) {
+		if (!first)
+			prt_printf(out, " ");
+
+		switch (__extent_entry_type(entry)) {
+		case BCH_EXTENT_ENTRY_ptr:
+			ptr = entry_to_ptr(entry);
+			ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+				? bch_dev_bkey_exists(c, ptr->dev)
+				: NULL;
+
+			if (!ca) { /* unknown device: print the raw sector offset */
+				prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
+				       (u64) ptr->offset, ptr->gen,
+				       ptr->cached ? " cached" : "");
+			} else {
+				u32 offset;
+				u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+
+				prt_printf(out, "ptr: %u:%llu:%u gen %u",
+					   ptr->dev, b, offset, ptr->gen);
+				if (ptr->cached)
+					prt_str(out, " cached");
+				if (ptr->unwritten)
+					prt_str(out, " unwritten");
+				if (ca && ptr_stale(ca, ptr)) /* ca is always non-NULL in this branch */
+					prt_printf(out, " stale");
+			}
+			break;
+		case BCH_EXTENT_ENTRY_crc32:
+		case BCH_EXTENT_ENTRY_crc64:
+		case BCH_EXTENT_ENTRY_crc128:
+			crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
+
+			prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
+			       crc.compressed_size,
+			       crc.uncompressed_size,
+			       crc.offset, crc.nonce,
+			       bch2_csum_types[crc.csum_type],
+			       bch2_compression_types[crc.compression_type]);
+			break;
+		case BCH_EXTENT_ENTRY_stripe_ptr:
+			ec = &entry->stripe_ptr;
+
+			prt_printf(out, "ec: idx %llu block %u",
+			       (u64) ec->idx, ec->block);
+			break;
+		default:
+			prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
+			return; /* give up at the first unrecognised entry */
+		}
+
+		first = false;
+	}
+}
+
+static int extent_ptr_invalid(const struct bch_fs *c,
+			      struct bkey_s_c k,
+			      enum bkey_invalid_flags flags,
+			      const struct bch_extent_ptr *ptr,
+			      unsigned size_ondisk,
+			      bool metadata,	/* not referenced by any check below */
+			      struct printbuf *err)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *ptr2;
+	u64 bucket;
+	u32 bucket_offset;
+	struct bch_dev *ca;
+
+	if (!bch2_dev_exists2(c, ptr->dev)) {
+		/*
+		 * If we're in the write path this key might have already been
+		 * overwritten, and we could be seeing a device that doesn't
+		 * exist anymore due to racing with device removal:
+		 */
+		if (flags & BKEY_INVALID_WRITE)
+			return 0;
+
+		prt_printf(err, "pointer to invalid device (%u)", ptr->dev);
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	ca = bch_dev_bkey_exists(c, ptr->dev);
+	bkey_for_each_ptr(ptrs, ptr2)	/* reject a second pointer in @k to the same device */
+		if (ptr != ptr2 && ptr->dev == ptr2->dev) {
+			prt_printf(err, "multiple pointers to same device (%u)", ptr->dev);
+			return -BCH_ERR_invalid_bkey;
+		}
+
+	bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
+
+	if (bucket >= ca->mi.nbuckets) {	/* message operator matches this check */
+		prt_printf(err, "pointer past last bucket (%llu >= %llu)",
+		       bucket, ca->mi.nbuckets);
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) {
+		prt_printf(err, "pointer before first bucket (%llu < %u)",
+		       bucket, ca->mi.first_bucket);
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	if (bucket_offset + size_ondisk > ca->mi.bucket_size) {	/* data must fit inside one bucket */
+		prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)",
+		       bucket_offset, size_ondisk, ca->mi.bucket_size);
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	return 0;	/* @ptr passed every check; failures above describe the reason in @err */
+}
+
+int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
+			   enum bkey_invalid_flags flags,
+			   struct printbuf *err)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct bch_extent_crc_unpacked crc;
+	unsigned size_ondisk = k.k->size;
+	unsigned nonce = UINT_MAX;
+	unsigned nr_ptrs = 0;
+	bool unwritten = false, have_ec = false, crc_since_last_ptr = false;
+	int ret;
+
+	if (bkey_is_btree_ptr(k.k))
+		size_ondisk = btree_sectors(c);
+
+	bkey_extent_entry_for_each(ptrs, entry) {
+		if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) {
+			prt_printf(err, "invalid extent entry type (got %u, max %u)",
+			       __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
+			return -BCH_ERR_invalid_bkey;
+		}
+
+		if (bkey_is_btree_ptr(k.k) &&
+		    !extent_entry_is_ptr(entry)) {
+			prt_printf(err, "has non ptr field");
+			return -BCH_ERR_invalid_bkey;
+		}
+
+		switch (extent_entry_type(entry)) {
+		case BCH_EXTENT_ENTRY_ptr:
+			ret = extent_ptr_invalid(c, k, flags, &entry->ptr,
+						 size_ondisk, false, err);
+			if (ret)
+				return ret;
+
+			if (nr_ptrs && unwritten != entry->ptr.unwritten) {
+				prt_printf(err, "extent with unwritten and written ptrs");
+				return -BCH_ERR_invalid_bkey;
+			}
+
+			if (k.k->type != KEY_TYPE_extent && entry->ptr.unwritten) {
+				prt_printf(err, "has unwritten ptrs");
+				return -BCH_ERR_invalid_bkey;
+			}
+
+			if (entry->ptr.cached && have_ec) {
+				prt_printf(err, "cached, erasure coded ptr");
+				return -BCH_ERR_invalid_bkey;
+			}
+
+			unwritten = entry->ptr.unwritten;
+			have_ec = false;
+			crc_since_last_ptr = false;
+			nr_ptrs++;
+			break;
+		case BCH_EXTENT_ENTRY_crc32:
+		case BCH_EXTENT_ENTRY_crc64:
+		case BCH_EXTENT_ENTRY_crc128:
+			crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
+
+			if (crc.offset + crc.live_size >
+			    crc.uncompressed_size) {
+				prt_printf(err, "checksum offset + key size > uncompressed size");
+				return -BCH_ERR_invalid_bkey;
+			}
+
+			size_ondisk = crc.compressed_size;
+
+			if (!bch2_checksum_type_valid(c, crc.csum_type)) {
+				prt_printf(err, "invalid checksum type");
+				return -BCH_ERR_invalid_bkey;
+			}
+
+			if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) {
+				prt_printf(err, "invalid compression type");
+				return -BCH_ERR_invalid_bkey;
+			}
+
+			if (bch2_csum_type_is_encryption(crc.csum_type)) {
+				if (nonce == UINT_MAX)
+					nonce = crc.offset + crc.nonce;
+				else if (nonce != crc.offset + crc.nonce) {
+					prt_printf(err, "incorrect nonce");
+					return -BCH_ERR_invalid_bkey;
+				}
+			}
+
+			if (crc_since_last_ptr) {
+				prt_printf(err, "redundant crc entry");
+				return -BCH_ERR_invalid_bkey;
+			}
+			crc_since_last_ptr = true;
+			break;
+		case BCH_EXTENT_ENTRY_stripe_ptr:
+			if (have_ec) {
+				prt_printf(err, "redundant stripe entry");
+				return -BCH_ERR_invalid_bkey;
+			}
+			have_ec = true;
+			break;
+		case BCH_EXTENT_ENTRY_rebalance:
+			break;
+		}
+	}
+
+	if (!nr_ptrs) {
+		prt_str(err, "no ptrs");
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	if (nr_ptrs >= BCH_BKEY_PTRS_MAX) {
+		prt_str(err, "too many ptrs");
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	if (crc_since_last_ptr) {
+		prt_printf(err, "redundant crc entry");
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	if (have_ec) {
+		prt_printf(err, "redundant stripe entry");
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	return 0;
+}
+
+void bch2_ptr_swab(struct bkey_s k)
+{
+	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+	union bch_extent_entry *entry;
+	u64 *d;
+
+	for (d =  (u64 *) ptrs.start;
+	     d != (u64 *) ptrs.end;
+	     d++)
+		*d = swab64(*d);
+
+	for (entry = ptrs.start;
+	     entry < ptrs.end;
+	     entry = extent_entry_next(entry)) {
+		switch (extent_entry_type(entry)) {
+		case BCH_EXTENT_ENTRY_ptr:
+			break;
+		case BCH_EXTENT_ENTRY_crc32:
+			entry->crc32.csum = swab32(entry->crc32.csum);
+			break;
+		case BCH_EXTENT_ENTRY_crc64:
+			entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
+			entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
+			break;
+		case BCH_EXTENT_ENTRY_crc128:
+			entry->crc128.csum.hi = (__force __le64)
+				swab64((__force u64) entry->crc128.csum.hi);
+			entry->crc128.csum.lo = (__force __le64)
+				swab64((__force u64) entry->crc128.csum.lo);
+			break;
+		case BCH_EXTENT_ENTRY_stripe_ptr:
+			break;
+		case BCH_EXTENT_ENTRY_rebalance:
+			break;
+		}
+	}
+}
+
+/* Generic extent code: */
+
+int bch2_cut_front_s(struct bpos where, struct bkey_s k)
+{
+	unsigned new_val_u64s = bkey_val_u64s(k.k);
+	int val_u64s_delta;
+	u64 sub;
+
+	if (bkey_le(where, bkey_start_pos(k.k)))
+		return 0;	/* @where is at or before the start of @k: nothing to cut */
+
+	EBUG_ON(bkey_gt(where, k.k->p));
+
+	sub = where.offset - bkey_start_offset(k.k);	/* distance from the key's start to @where */
+
+	k.k->size -= sub;
+
+	if (!k.k->size) {	/* everything was cut: turn @k into a deleted key with no value */
+		k.k->type = KEY_TYPE_deleted;
+		new_val_u64s = 0;
+	}
+
+	switch (k.k->type) {
+	case KEY_TYPE_extent:
+	case KEY_TYPE_reflink_v: {
+		struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+		union bch_extent_entry *entry;
+		bool seen_crc = false;
+
+		bkey_extent_entry_for_each(ptrs, entry) {
+			switch (extent_entry_type(entry)) {
+			case BCH_EXTENT_ENTRY_ptr:
+				if (!seen_crc)	/* only ptrs that come before any crc entry are advanced */
+					entry->ptr.offset += sub;
+				break;
+			case BCH_EXTENT_ENTRY_crc32:
+				entry->crc32.offset += sub;	/* crc entries always have their offset advanced */
+				break;
+			case BCH_EXTENT_ENTRY_crc64:
+				entry->crc64.offset += sub;
+				break;
+			case BCH_EXTENT_ENTRY_crc128:
+				entry->crc128.offset += sub;
+				break;
+			case BCH_EXTENT_ENTRY_stripe_ptr:
+				break;
+			case BCH_EXTENT_ENTRY_rebalance:
+				break;
+			}
+
+			if (extent_entry_is_crc(entry))
+				seen_crc = true;	/* ptrs after this point are left unchanged */
+		}
+
+		break;
+	}
+	case KEY_TYPE_reflink_p: {
+		struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);
+
+		le64_add_cpu(&p.v->idx, sub);	/* advance the stored idx by the amount cut */
+		break;
+	}
+	case KEY_TYPE_inline_data:
+	case KEY_TYPE_indirect_inline_data: {
+		void *p = bkey_inline_data_p(k);
+		unsigned bytes = bkey_inline_data_bytes(k.k);
+
+		sub = min_t(u64, sub << 9, bytes);	/* x512 (presumably sectors to bytes), capped at the payload size */
+
+		memmove(p, p + sub, bytes - sub);	/* shift the remaining payload to the front */
+
+		new_val_u64s -= sub >> 3;	/* bytes removed, expressed in u64s */
+		break;
+	}
+	}
+
+	val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
+	BUG_ON(val_u64s_delta < 0);	/* the value may only shrink */
+
+	set_bkey_val_u64s(k.k, new_val_u64s);
+	memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));	/* zero the u64s trimmed off the value */
+	return -val_u64s_delta;	/* <= 0: change in value size, in u64s */
+}
+
+int bch2_cut_back_s(struct bpos where, struct bkey_s k)
+{
+	unsigned new_val_u64s = bkey_val_u64s(k.k);
+	int val_u64s_delta;
+	u64 len = 0;
+
+	if (bkey_ge(where, k.k->p))
+		return 0;	/* @where is at or past the end of @k: nothing to cut */
+
+	EBUG_ON(bkey_lt(where, bkey_start_pos(k.k)));
+
+	len = where.offset - bkey_start_offset(k.k);	/* length that remains in front of @where */
+
+	k.k->p.offset = where.offset;	/* @k now ends at @where */
+	k.k->size = len;
+
+	if (!len) {	/* nothing remains: turn @k into a deleted key with no value */
+		k.k->type = KEY_TYPE_deleted;
+		new_val_u64s = 0;
+	}
+
+	switch (k.k->type) {
+	case KEY_TYPE_inline_data:
+	case KEY_TYPE_indirect_inline_data:
+		new_val_u64s = (bkey_inline_data_offset(k.k) +	/* header plus at most size << 9 payload bytes, in u64s */
+				min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3;
+		break;
+	}
+
+	val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
+	BUG_ON(val_u64s_delta < 0);	/* the value may only shrink */
+
+	set_bkey_val_u64s(k.k, new_val_u64s);
+	memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));	/* zero the u64s trimmed off the value */
+	return -val_u64s_delta;	/* <= 0: change in value size, in u64s */
+}
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
new file mode 100644
index 000000000000..879e7d218b6a
--- /dev/null
+++ b/fs/bcachefs/extents.h
@@ -0,0 +1,758 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EXTENTS_H
+#define _BCACHEFS_EXTENTS_H
+
+#include "bcachefs.h"
+#include "bkey.h"
+#include "extents_types.h"
+
+struct bch_fs;
+struct btree_trans;
+enum bkey_invalid_flags;
+
+/* extent entries: */
+
+#define extent_entry_last(_e)						\
+	((typeof(&(_e).v->start[0])) bkey_val_end(_e))
+
+#define entry_to_ptr(_entry)						\
+({									\
+	EBUG_ON((_entry) && !extent_entry_is_ptr(_entry));		\
+									\
+	__builtin_choose_expr(						\
+		type_is_exact(_entry, const union bch_extent_entry *),	\
+		(const struct bch_extent_ptr *) (_entry),		\
+		(struct bch_extent_ptr *) (_entry));			\
+})
+
+/* downcast, preserves const */
+#define to_entry(_entry)						\
+({									\
+	BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) &&	\
+		     !type_is(_entry, struct bch_extent_ptr *) &&	\
+		     !type_is(_entry, struct bch_extent_stripe_ptr *));	\
+									\
+	__builtin_choose_expr(						\
+		(type_is_exact(_entry, const union bch_extent_crc *) ||	\
+		 type_is_exact(_entry, const struct bch_extent_ptr *) ||\
+		 type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\
+		(const union bch_extent_entry *) (_entry),		\
+		(union bch_extent_entry *) (_entry));			\
+})
+
+#define extent_entry_next(_entry)					\
+	((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
+
+static inline unsigned
+__extent_entry_type(const union bch_extent_entry *e)
+{
+	return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX;	/* a zero type field is reported as BCH_EXTENT_ENTRY_MAX */
+}
+
+static inline enum bch_extent_entry_type
+extent_entry_type(const union bch_extent_entry *e)
+{
+	int ret = __ffs(e->type);	/* unlike __extent_entry_type(), e->type == 0 is not checked first */
+
+	EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX);
+
+	return ret;
+}
+
+static inline size_t extent_entry_bytes(const union bch_extent_entry *entry)
+{
+	switch (extent_entry_type(entry)) {
+#define x(f, n)						\
+	case BCH_EXTENT_ENTRY_##f:			\
+		return sizeof(struct bch_extent_##f);
+	BCH_EXTENT_ENTRY_TYPES()
+#undef x
+	default:
+		BUG();
+	}
+}
+
+static inline size_t extent_entry_u64s(const union bch_extent_entry *entry)
+{
+	return extent_entry_bytes(entry) / sizeof(u64);
+}
+
+static inline void __extent_entry_insert(struct bkey_i *k,
+					 union bch_extent_entry *dst,
+					 union bch_extent_entry *new)
+{
+	union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
+
+	memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new),
+			      dst, (u64 *) end - (u64 *) dst);
+	k->k.u64s += extent_entry_u64s(new);
+	memcpy_u64s_small(dst, new, extent_entry_u64s(new));
+}
+
+static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
+{
+	return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
+}
+
+static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e)
+{
+	return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
+}
+
+static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
+{
+	switch (extent_entry_type(e)) {
+	case BCH_EXTENT_ENTRY_crc32:
+	case BCH_EXTENT_ENTRY_crc64:
+	case BCH_EXTENT_ENTRY_crc128:
+		return true;
+	default:
+		return false;
+	}
+}
+
+union bch_extent_crc {
+	u8				type;
+	struct bch_extent_crc32		crc32;
+	struct bch_extent_crc64		crc64;
+	struct bch_extent_crc128	crc128;
+};
+
+#define __entry_to_crc(_entry)						\
+	__builtin_choose_expr(						\
+		type_is_exact(_entry, const union bch_extent_entry *),	\
+		(const union bch_extent_crc *) (_entry),		\
+		(union bch_extent_crc *) (_entry))
+
+#define entry_to_crc(_entry)						\
+({									\
+	EBUG_ON((_entry) && !extent_entry_is_crc(_entry));		\
+									\
+	__entry_to_crc(_entry);						\
+})
+
+static inline struct bch_extent_crc_unpacked
+bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
+{
+#define common_fields(_crc)						\
+		.csum_type		= _crc.csum_type,		\
+		.compression_type	= _crc.compression_type,	\
+		.compressed_size	= _crc._compressed_size + 1,	\
+		.uncompressed_size	= _crc._uncompressed_size + 1,	\
+		.offset			= _crc.offset,			\
+		.live_size		= k->size
+
+	if (!crc)
+		return (struct bch_extent_crc_unpacked) {
+			.compressed_size	= k->size,
+			.uncompressed_size	= k->size,
+			.live_size		= k->size,
+		};
+
+	switch (extent_entry_type(to_entry(crc))) {
+	case BCH_EXTENT_ENTRY_crc32: {
+		struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
+			common_fields(crc->crc32),
+		};
+
+		*((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum;
+		return ret;
+	}
+	case BCH_EXTENT_ENTRY_crc64: {
+		struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
+			common_fields(crc->crc64),
+			.nonce			= crc->crc64.nonce,
+			.csum.lo		= (__force __le64) crc->crc64.csum_lo,
+		};
+
+		*((__le16 *) &ret.csum.hi) = (__le16 __force) crc->crc64.csum_hi;
+
+		return ret;
+	}
+	case BCH_EXTENT_ENTRY_crc128: {
+		struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
+			common_fields(crc->crc128),
+			.nonce			= crc->crc128.nonce,
+			.csum			= crc->crc128.csum,
+		};
+
+		return ret;
+	}
+	default:
+		BUG();
+	}
+#undef common_fields
+}
+
+static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc)
+{
+	return (crc.compression_type != BCH_COMPRESSION_TYPE_none &&
+		crc.compression_type != BCH_COMPRESSION_TYPE_incompressible);
+}
+
+/* bkey_ptrs: operate generically over any key type that has ptrs */
+
+struct bkey_ptrs_c {
+	const union bch_extent_entry	*start;
+	const union bch_extent_entry	*end;
+};
+
+struct bkey_ptrs {
+	union bch_extent_entry	*start;
+	union bch_extent_entry	*end;
+};
+
+static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
+{
+	switch (k.k->type) {
+	case KEY_TYPE_btree_ptr: {
+		struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k);
+
+		return (struct bkey_ptrs_c) {
+			to_entry(&e.v->start[0]),
+			to_entry(extent_entry_last(e))
+		};
+	}
+	case KEY_TYPE_extent: {
+		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+
+		return (struct bkey_ptrs_c) {
+			e.v->start,
+			extent_entry_last(e)
+		};
+	}
+	case KEY_TYPE_stripe: {
+		struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+
+		return (struct bkey_ptrs_c) {
+			to_entry(&s.v->ptrs[0]),
+			to_entry(&s.v->ptrs[s.v->nr_blocks]),
+		};
+	}
+	case KEY_TYPE_reflink_v: {
+		struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
+
+		return (struct bkey_ptrs_c) {
+			r.v->start,
+			bkey_val_end(r),
+		};
+	}
+	case KEY_TYPE_btree_ptr_v2: {
+		struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k);
+
+		return (struct bkey_ptrs_c) {
+			to_entry(&e.v->start[0]),
+			to_entry(extent_entry_last(e))
+		};
+	}
+	default:
+		return (struct bkey_ptrs_c) { NULL, NULL };
+	}
+}
+
+static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
+{
+	struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c);
+
+	return (struct bkey_ptrs) {
+		(void *) p.start,
+		(void *) p.end
+	};
+}
+
+#define __bkey_extent_entry_for_each_from(_start, _end, _entry)		\
+	for ((_entry) = (_start);					\
+	     (_entry) < (_end);						\
+	     (_entry) = extent_entry_next(_entry))
+
+#define __bkey_ptr_next(_ptr, _end)					\
+({									\
+	typeof(_end) _entry;						\
+									\
+	__bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry)	\
+		if (extent_entry_is_ptr(_entry))			\
+			break;						\
+									\
+	_entry < (_end) ? entry_to_ptr(_entry) : NULL;			\
+})
+
+#define bkey_extent_entry_for_each_from(_p, _entry, _start)		\
+	__bkey_extent_entry_for_each_from(_start, (_p).end, _entry)
+
+#define bkey_extent_entry_for_each(_p, _entry)	/* every entry in _p */	\
+	bkey_extent_entry_for_each_from(_p, _entry, (_p).start)
+
+#define __bkey_for_each_ptr(_start, _end, _ptr)				\
+	for ((_ptr) = (_start);						\
+	     ((_ptr) = __bkey_ptr_next(_ptr, _end));			\
+	     (_ptr)++)
+
+#define bkey_ptr_next(_p, _ptr)						\
+	__bkey_ptr_next(_ptr, (_p).end)
+
+#define bkey_for_each_ptr(_p, _ptr)					\
+	__bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr)
+
+#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry)			\
+({									\
+	__label__ out;							\
+									\
+	(_ptr).idx	= 0;						\
+	(_ptr).has_ec	= false;					\
+									\
+	__bkey_extent_entry_for_each_from(_entry, _end, _entry)		\
+		switch (extent_entry_type(_entry)) {			\
+		case BCH_EXTENT_ENTRY_ptr:				\
+			(_ptr).ptr		= _entry->ptr;		\
+			goto out;					\
+		case BCH_EXTENT_ENTRY_crc32:				\
+		case BCH_EXTENT_ENTRY_crc64:				\
+		case BCH_EXTENT_ENTRY_crc128:				\
+			(_ptr).crc = bch2_extent_crc_unpack(_k,		\
+					entry_to_crc(_entry));		\
+			break;						\
+		case BCH_EXTENT_ENTRY_stripe_ptr:			\
+			(_ptr).ec = _entry->stripe_ptr;			\
+			(_ptr).has_ec	= true;				\
+			break;						\
+		default:						\
+			/* nothing */					\
+			break;						\
+		}							\
+out:									\
+	_entry < (_end);						\
+})
+
+#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry)	\
+	for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL),		\
+	     (_entry) = _start;						\
+	     __bkey_ptr_next_decode(_k, _end, _ptr, _entry);		\
+	     (_entry) = extent_entry_next(_entry))
+
+#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry)			\
+	__bkey_for_each_ptr_decode(_k, (_p).start, (_p).end,		\
+				   _ptr, _entry)
+
+#define bkey_crc_next(_k, _start, _end, _crc, _iter)			\
+({									\
+	__bkey_extent_entry_for_each_from(_iter, _end, _iter)		\
+		if (extent_entry_is_crc(_iter)) {			\
+			(_crc) = bch2_extent_crc_unpack(_k,		\
+						entry_to_crc(_iter));	\
+			break;						\
+		}							\
+									\
+	(_iter) < (_end);						\
+})
+
+#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter)		\
+	for ((_crc) = bch2_extent_crc_unpack(_k, NULL),			\
+	     (_iter) = (_start);					\
+	     bkey_crc_next(_k, _start, _end, _crc, _iter);		\
+	     (_iter) = extent_entry_next(_iter))
+
+#define bkey_for_each_crc(_k, _p, _crc, _iter)				\
+	__bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter)
+
+/* Iterate over pointers in KEY_TYPE_extent: */
+
+#define extent_for_each_entry_from(_e, _entry, _start)			\
+	__bkey_extent_entry_for_each_from(_start,			\
+				extent_entry_last(_e), _entry)
+
+#define extent_for_each_entry(_e, _entry)				\
+	extent_for_each_entry_from(_e, _entry, (_e).v->start)
+
+#define extent_ptr_next(_e, _ptr)					\
+	__bkey_ptr_next(_ptr, extent_entry_last(_e))
+
+#define extent_for_each_ptr(_e, _ptr)					\
+	__bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr)
+
+#define extent_for_each_ptr_decode(_e, _ptr, _entry)			\
+	__bkey_for_each_ptr_decode((_e).k, (_e).v->start,		\
+				   extent_entry_last(_e), _ptr, _entry)
+
+/* utility code common to all keys with pointers: */
+
+void bch2_mark_io_failure(struct bch_io_failures *,
+			  struct extent_ptr_decoded *);
+int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
+			       struct bch_io_failures *,
+			       struct extent_ptr_decoded *);
+
+/* KEY_TYPE_btree_ptr: */
+
+int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c,
+			   enum bkey_invalid_flags, struct printbuf *);
+void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
+			    struct bkey_s_c);
+
+int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c,
+			      enum bkey_invalid_flags, struct printbuf *);
+void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
+			      int, struct bkey_s);
+
+#define bch2_bkey_ops_btree_ptr ((struct bkey_ops) {		\
+	.key_invalid	= bch2_btree_ptr_invalid,		\
+	.val_to_text	= bch2_btree_ptr_to_text,		\
+	.swab		= bch2_ptr_swab,			\
+	.trans_trigger	= bch2_trans_mark_extent,		\
+	.atomic_trigger	= bch2_mark_extent,			\
+})
+
+#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) {		\
+	.key_invalid	= bch2_btree_ptr_v2_invalid,		\
+	.val_to_text	= bch2_btree_ptr_v2_to_text,		\
+	.swab		= bch2_ptr_swab,			\
+	.compat		= bch2_btree_ptr_v2_compat,		\
+	.trans_trigger	= bch2_trans_mark_extent,		\
+	.atomic_trigger	= bch2_mark_extent,			\
+	.min_val_size	= 40,					\
+})
+
+/* KEY_TYPE_extent: */
+
+bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+
+#define bch2_bkey_ops_extent ((struct bkey_ops) {		\
+	.key_invalid	= bch2_bkey_ptrs_invalid,		\
+	.val_to_text	= bch2_bkey_ptrs_to_text,		\
+	.swab		= bch2_ptr_swab,			\
+	.key_normalize	= bch2_extent_normalize,		\
+	.key_merge	= bch2_extent_merge,			\
+	.trans_trigger	= bch2_trans_mark_extent,		\
+	.atomic_trigger	= bch2_mark_extent,			\
+})
+
+/* KEY_TYPE_reservation: */
+
+int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c,
+			     enum bkey_invalid_flags, struct printbuf *);
+void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+
+#define bch2_bkey_ops_reservation ((struct bkey_ops) {		\
+	.key_invalid	= bch2_reservation_invalid,		\
+	.val_to_text	= bch2_reservation_to_text,		\
+	.key_merge	= bch2_reservation_merge,		\
+	.trans_trigger	= bch2_trans_mark_reservation,		\
+	.atomic_trigger	= bch2_mark_reservation,		\
+	.min_val_size	= 8,					\
+})
+
+/* Extent checksum entries: */
+
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c,
+				 struct bch_extent_crc_unpacked);
+bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked);
+void bch2_extent_crc_append(struct bkey_i *,
+			    struct bch_extent_crc_unpacked);
+
+/* Generic code for keys with pointers: */
+
+static inline bool bkey_is_btree_ptr(const struct bkey *k)
+{
+	switch (k->type) {
+	case KEY_TYPE_btree_ptr:
+	case KEY_TYPE_btree_ptr_v2:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static inline bool bkey_extent_is_direct_data(const struct bkey *k)
+{
+	switch (k->type) {
+	case KEY_TYPE_btree_ptr:
+	case KEY_TYPE_btree_ptr_v2:
+	case KEY_TYPE_extent:
+	case KEY_TYPE_reflink_v:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static inline bool bkey_extent_is_inline_data(const struct bkey *k)
+{
+	return  k->type == KEY_TYPE_inline_data ||
+		k->type == KEY_TYPE_indirect_inline_data;
+}
+
+static inline unsigned bkey_inline_data_offset(const struct bkey *k)
+{
+	switch (k->type) {
+	case KEY_TYPE_inline_data:
+		return sizeof(struct bch_inline_data);
+	case KEY_TYPE_indirect_inline_data:
+		return sizeof(struct bch_indirect_inline_data);
+	default:
+		BUG();
+	}
+}
+
+static inline unsigned bkey_inline_data_bytes(const struct bkey *k)
+{
+	return bkey_val_bytes(k) - bkey_inline_data_offset(k);
+}
+
+#define bkey_inline_data_p(_k)	(((void *) (_k).v) + bkey_inline_data_offset((_k).k))
+
+static inline bool bkey_extent_is_data(const struct bkey *k)
+{
+	return  bkey_extent_is_direct_data(k) ||
+		bkey_extent_is_inline_data(k) ||
+		k->type == KEY_TYPE_reflink_p;
+}
+
+/*
+ * Should extent be counted under inode->i_sectors?
+ */
+static inline bool bkey_extent_is_allocation(const struct bkey *k)
+{
+	switch (k->type) {
+	case KEY_TYPE_extent:
+	case KEY_TYPE_reservation:
+	case KEY_TYPE_reflink_p:
+	case KEY_TYPE_reflink_v:
+	case KEY_TYPE_inline_data:
+	case KEY_TYPE_indirect_inline_data:
+	case KEY_TYPE_error:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static inline bool bkey_extent_is_unwritten(struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *ptr;
+
+	bkey_for_each_ptr(ptrs, ptr)
+		if (ptr->unwritten)
+			return true;
+	return false;
+}
+
+static inline bool bkey_extent_is_reservation(struct bkey_s_c k)
+{
+	return k.k->type == KEY_TYPE_reservation ||
+		bkey_extent_is_unwritten(k);
+}
+
+static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
+{
+	struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+	struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *ptr;
+
+	bkey_for_each_ptr(p, ptr)
+		ret.devs[ret.nr++] = ptr->dev;
+
+	return ret;
+}
+
+static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
+{
+	struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+	struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *ptr;
+
+	bkey_for_each_ptr(p, ptr)
+		if (!ptr->cached)
+			ret.devs[ret.nr++] = ptr->dev;
+
+	return ret;
+}
+
+static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
+{
+	struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+	struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *ptr;
+
+	bkey_for_each_ptr(p, ptr)
+		if (ptr->cached)
+			ret.devs[ret.nr++] = ptr->dev;
+
+	return ret;
+}
+
+static inline unsigned bch2_bkey_ptr_data_type(struct bkey_s_c k, const struct bch_extent_ptr *ptr)
+{
+	switch (k.k->type) {
+	case KEY_TYPE_btree_ptr:
+	case KEY_TYPE_btree_ptr_v2:
+		return BCH_DATA_btree;
+	case KEY_TYPE_extent:
+	case KEY_TYPE_reflink_v:
+		return BCH_DATA_user;
+	case KEY_TYPE_stripe: {
+		struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+
+		BUG_ON(ptr < s.v->ptrs ||
+		       ptr >= s.v->ptrs + s.v->nr_blocks);
+
+		return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant
+			? BCH_DATA_parity
+			: BCH_DATA_user;
+	}
+	default:
+		BUG();
+	}
+}
+
+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
+unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
+bool bch2_bkey_is_incompressible(struct bkey_s_c);
+unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
+
+unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
+unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_decoded *);
+unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *);
+unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
+
+void bch2_bkey_drop_device(struct bkey_s, unsigned);
+void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
+
+const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned);
+
+static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev)
+{
+	return (void *) bch2_bkey_has_device_c(k.s_c, dev);
+}
+
+bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
+
+void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
+
+static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr)
+{
+	struct bch_extent_ptr *dest;
+
+	EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev));
+
+	switch (k->k.type) {
+	case KEY_TYPE_btree_ptr:
+	case KEY_TYPE_btree_ptr_v2:
+	case KEY_TYPE_extent:
+		EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
+
+		ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
+		dest = (struct bch_extent_ptr *)((void *) &k->v + bkey_val_bytes(&k->k));
+		*dest = ptr;
+		k->k.u64s++;
+		break;
+	default:
+		BUG();
+	}
+}
+
+void bch2_extent_ptr_decoded_append(struct bkey_i *,
+				    struct extent_ptr_decoded *);
+union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s,
+						   struct bch_extent_ptr *);
+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
+					   struct bch_extent_ptr *);
+
+#define bch2_bkey_drop_ptrs(_k, _ptr, _cond)				\
+do {									\
+	struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k);			\
+									\
+	_ptr = &_ptrs.start->ptr;					\
+									\
+	while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) {			\
+		if (_cond) {						\
+			_ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr);	\
+			_ptrs = bch2_bkey_ptrs(_k);			\
+			continue;					\
+		}							\
+									\
+		(_ptr)++;						\
+	}								\
+} while (0)
+
+bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
+			   struct bch_extent_ptr, u64);
+bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c);
+struct bch_extent_ptr *
+bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
+
+void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *);
+
+bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
+void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
+			    struct bkey_s_c);
+int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c,
+			   enum bkey_invalid_flags, struct printbuf *);
+
+void bch2_ptr_swab(struct bkey_s);
+
+/* Generic extent code: */
+
+enum bch_extent_overlap {
+	BCH_EXTENT_OVERLAP_ALL		= 0,
+	BCH_EXTENT_OVERLAP_BACK		= 1,
+	BCH_EXTENT_OVERLAP_FRONT	= 2,
+	BCH_EXTENT_OVERLAP_MIDDLE	= 3,
+};
+
+/* Returns how k overlaps with m */
+static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
+							  const struct bkey *m)
+{
+	int cmp1 = bkey_lt(k->p, m->p);	/* k ends before m ends */
+	int cmp2 = bkey_gt(bkey_start_pos(k), bkey_start_pos(m));	/* k starts after m starts */
+
+	return (cmp1 << 1) + cmp2;	/* bit 1 = cmp1, bit 0 = cmp2: 0..3 as in enum bch_extent_overlap */
+}
+
+int bch2_cut_front_s(struct bpos, struct bkey_s);
+int bch2_cut_back_s(struct bpos, struct bkey_s);
+
+static inline void bch2_cut_front(struct bpos where, struct bkey_i *k)
+{
+	bch2_cut_front_s(where, bkey_i_to_s(k));
+}
+
+static inline void bch2_cut_back(struct bpos where, struct bkey_i *k)
+{
+	bch2_cut_back_s(where, bkey_i_to_s(k));
+}
+
+/**
+ * bch2_key_resize - adjust size of @k
+ *
+ * bkey_start_offset(k) will be preserved, modifies where the extent ends
+ */
+static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
+{
+	k->p.offset -= k->size;	/* step back from the end position by the old size */
+	k->p.offset += new_size;	/* new end position = start + new_size */
+	k->size = new_size;
+}
+
+/*
+ * In extent_sort_fix_overlapping(), insert_fixup_extent(),
+ * extent_merge_inline() - we're modifying keys in place that are packed. To do
+ * that we have to unpack the key, modify the unpacked key - then this
+ * copies/repacks the unpacked to the original as necessary.
+ */
+static inline void extent_save(struct btree *b, struct bkey_packed *dst,
+			       struct bkey *src)
+{
+	struct bkey_format *f = &b->format;
+	struct bkey_i *dst_unpacked;
+
+	if ((dst_unpacked = packed_to_bkey(dst)))
+		dst_unpacked->k = *src;
+	else
+		BUG_ON(!bch2_bkey_pack_key(dst, src, f));
+}
+
+#endif /* _BCACHEFS_EXTENTS_H */
diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h
new file mode 100644
index 000000000000..43d6c341ecca
--- /dev/null
+++ b/fs/bcachefs/extents_types.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EXTENTS_TYPES_H
+#define _BCACHEFS_EXTENTS_TYPES_H
+
+#include "bcachefs_format.h"
+
+struct bch_extent_crc_unpacked {
+	u32			compressed_size;
+	u32			uncompressed_size;
+	u32			live_size;
+
+	u8			csum_type;
+	u8			compression_type;
+
+	u16			offset;
+
+	u16			nonce;
+
+	struct bch_csum		csum;
+};
+
+struct extent_ptr_decoded {
+	unsigned			idx;
+	bool				has_ec;
+	struct bch_extent_crc_unpacked	crc;
+	struct bch_extent_ptr		ptr;
+	struct bch_extent_stripe_ptr	ec;
+};
+
+struct bch_io_failures {
+	u8			nr;
+	struct bch_dev_io_failures {
+		u8		dev;
+		u8		idx;
+		u8		nr_failed;
+		u8		nr_retries;
+	}			devs[BCH_REPLICAS_MAX];
+};
+
+#endif /* _BCACHEFS_EXTENTS_TYPES_H */
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
new file mode 100644
index 000000000000..05429c9631cd
--- /dev/null
+++ b/fs/bcachefs/eytzinger.h
@@ -0,0 +1,281 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _EYTZINGER_H
+#define _EYTZINGER_H
+
+#include <linux/bitops.h>
+#include <linux/log2.h>
+
+#include "util.h"
+
+/*
+ * Traversal for trees in eytzinger layout - a full binary tree laid out in an
+ * array
+ */
+
+/*
+ * One based indexing version:
+ *
+ * With one based indexing each level of the tree starts at a power of two -
+ * good for cacheline alignment:
+ */
+
+static inline unsigned eytzinger1_child(unsigned i, unsigned child)
+{
+	EBUG_ON(child > 1);
+
+	return (i << 1) + child;
+}
+
+static inline unsigned eytzinger1_left_child(unsigned i)
+{
+	return eytzinger1_child(i, 0);
+}
+
+static inline unsigned eytzinger1_right_child(unsigned i)
+{
+	return eytzinger1_child(i, 1);
+}
+
+static inline unsigned eytzinger1_first(unsigned size)
+{
+	return rounddown_pow_of_two(size);
+}
+
+static inline unsigned eytzinger1_last(unsigned size)
+{
+	return rounddown_pow_of_two(size + 1) - 1;
+}
+
+/*
+ * eytzinger1_next() and eytzinger1_prev() have the nice properties that
+ *
+ * eytzinger1_next(0) == eytzinger1_first()
+ * eytzinger1_prev(0) == eytzinger1_last()
+ *
+ * eytzinger1_prev(eytzinger1_first()) == 0
+ * eytzinger1_next(eytzinger1_last()) == 0
+ */
+
+static inline unsigned eytzinger1_next(unsigned i, unsigned size)
+{
+	EBUG_ON(i > size);
+
+	if (eytzinger1_right_child(i) <= size) {
+		i = eytzinger1_right_child(i);
+		/* take left children (append 0 bits) down to the level of index size + 1: */
+		i <<= __fls(size + 1) - __fls(i);
+		i >>= i > size;		/* landed past @size: step back up one level */
+	} else {
+		i >>= ffz(i) + 1;	/* drop trailing 1 bits (right-child steps) plus one 0 bit */
+	}
+
+	return i;
+}
+
+static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
+{
+	EBUG_ON(i > size);
+
+	if (eytzinger1_left_child(i) <= size) {
+		i = eytzinger1_left_child(i) + 1;
+		/* ((left child + 1) << n) - 1 is the left child followed by n right-child steps: */
+		i <<= __fls(size + 1) - __fls(i);
+		i -= 1;
+		i >>= i > size;		/* landed past @size: step back up one level */
+	} else {
+		i >>= __ffs(i) + 1;	/* drop trailing 0 bits (left-child steps) plus one 1 bit */
+	}
+
+	return i;
+}
+
+static inline unsigned eytzinger1_extra(unsigned size)
+{
+	return (size + 1 - rounddown_pow_of_two(size)) << 1;
+}
+
+static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
+					      unsigned extra)
+{
+	unsigned b = __fls(i);
+	unsigned shift = __fls(size) - b;
+	int s;
+
+	EBUG_ON(!i || i > size);
+
+	i  ^= 1U << b;
+	i <<= 1;
+	i  |= 1;
+	i <<= shift;
+
+	/*
+	 * sign bit trick:
+	 *
+	 * if (i > extra)
+	 *	i -= (i - extra) >> 1;
+	 */
+	s = extra - i;
+	i += (s >> 1) & (s >> 31);
+
+	return i;
+}
+
+static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
+					       unsigned extra)
+{
+	unsigned shift;
+	int s;
+
+	EBUG_ON(!i || i > size);
+
+	/*
+	 * sign bit trick:
+	 *
+	 * if (i > extra)
+	 *	i += i - extra;
+	 */
+	s = extra - i;
+	i -= s & (s >> 31);
+
+	shift = __ffs(i);
+
+	i >>= shift + 1;
+	i  |= 1U << (__fls(size) - shift);
+
+	return i;
+}
+
+static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size)
+{
+	return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size));
+}
+
+static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
+{
+	return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size));
+}
+
+#define eytzinger1_for_each(_i, _size)			\
+	for ((_i) = eytzinger1_first((_size));		\
+	     (_i) != 0;					\
+	     (_i) = eytzinger1_next((_i), (_size)))
+
+/* Zero based indexing version: */
+
+static inline unsigned eytzinger0_child(unsigned i, unsigned child)
+{
+	EBUG_ON(child > 1);
+
+	return (i << 1) + 1 + child;
+}
+
+static inline unsigned eytzinger0_left_child(unsigned i)
+{
+	return eytzinger0_child(i, 0);
+}
+
+static inline unsigned eytzinger0_right_child(unsigned i)
+{
+	return eytzinger0_child(i, 1);
+}
+
+static inline unsigned eytzinger0_first(unsigned size)
+{
+	return eytzinger1_first(size) - 1;
+}
+
+static inline unsigned eytzinger0_last(unsigned size)
+{
+	return eytzinger1_last(size) - 1;
+}
+
+static inline unsigned eytzinger0_next(unsigned i, unsigned size)
+{
+	return eytzinger1_next(i + 1, size) - 1;
+}
+
+static inline unsigned eytzinger0_prev(unsigned i, unsigned size)
+{
+	return eytzinger1_prev(i + 1, size) - 1;
+}
+
+static inline unsigned eytzinger0_extra(unsigned size)
+{
+	return eytzinger1_extra(size);
+}
+
+static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size,
+					       unsigned extra)
+{
+	return __eytzinger1_to_inorder(i + 1, size, extra) - 1;
+}
+
+static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size,
+					       unsigned extra)
+{
+	return __inorder_to_eytzinger1(i + 1, size, extra) - 1;
+}
+
+static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size)
+{
+	return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size));
+}
+
+static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
+{
+	return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size));
+}
+
+#define eytzinger0_for_each(_i, _size)			\
+	for ((_i) = eytzinger0_first((_size));		\
+	     (_i) != -1;				\
+	     (_i) = eytzinger0_next((_i), (_size)))
+
+typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
+
+/* return greatest node <= @search, or -1 if not found */
+static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
+					 eytzinger_cmp_fn cmp, const void *search)
+{
+	unsigned i, n = 0;
+
+	if (!nr)
+		return -1;
+	/* Walk down from the root: go right when @search >= node, left otherwise: */
+	do {
+		i = n;
+		n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
+	} while (n < nr);
+	/* Left children have odd indices, so an odd @n means the last step went left: */
+	if (n & 1) {
+		/* @i was greater than @search, return previous node: */
+		/* No previous node: eytzinger0_prev() is unsigned, so return -1 explicitly: */
+		if (i == eytzinger0_first(nr))
+			return -1;
+
+		return eytzinger0_prev(i, nr);
+	} else {
+		return i;
+	}
+}
+
+#define eytzinger0_find(base, nr, size, _cmp, search)			\
+({									\
+	void *_base	= (base);					\
+	void *_search	= (search);					\
+	size_t _nr	= (nr);						\
+	size_t _size	= (size);					\
+	size_t _i	= 0;						\
+	int _res;							\
+									\
+	while (_i < _nr &&						\
+	       (_res = _cmp(_search, _base + _i * _size, _size)))	\
+		_i = eytzinger0_child(_i, _res > 0);			\
+	_i;								\
+})
+
+void eytzinger0_sort(void *, size_t, size_t,
+		    int (*cmp_func)(const void *, const void *, size_t),
+		    void (*swap_func)(void *, void *, size_t));
+
+#endif /* _EYTZINGER_H */
diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h
new file mode 100644
index 000000000000..66b945be10c2
--- /dev/null
+++ b/fs/bcachefs/fifo.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FIFO_H
+#define _BCACHEFS_FIFO_H
+
+#include "util.h"
+
+#define FIFO(type)							\
+struct {								\
+	size_t front, back, size, mask;					\
+	type *data;							\
+}
+
+#define DECLARE_FIFO(type, name)	FIFO(type) name
+
+#define fifo_buf_size(fifo)						\
+	((fifo)->size							\
+	 ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0])	\
+	 : 0)
+
+#define init_fifo(fifo, _size, _gfp)					\
+({									\
+	(fifo)->front	= (fifo)->back = 0;				\
+	(fifo)->size	= (_size);					\
+	(fifo)->mask	= (fifo)->size					\
+		? roundup_pow_of_two((fifo)->size) - 1			\
+		: 0;							\
+	(fifo)->data	= kvpmalloc(fifo_buf_size(fifo), (_gfp));	\
+})
+
+#define free_fifo(fifo)							\
+do {									\
+	kvpfree((fifo)->data, fifo_buf_size(fifo));			\
+	(fifo)->data = NULL;						\
+} while (0)
+
+#define fifo_swap(l, r)							\
+do {									\
+	swap((l)->front, (r)->front);					\
+	swap((l)->back, (r)->back);					\
+	swap((l)->size, (r)->size);					\
+	swap((l)->mask, (r)->mask);					\
+	swap((l)->data, (r)->data);					\
+} while (0)
+
+#define fifo_move(dest, src)						\
+do {									\
+	typeof(*((dest)->data)) _t;					\
+	while (!fifo_full(dest) &&					\
+	       fifo_pop(src, _t))					\
+		fifo_push(dest, _t);					\
+} while (0)
+
+#define fifo_used(fifo)		(((fifo)->back - (fifo)->front))
+#define fifo_free(fifo)		((fifo)->size - fifo_used(fifo))
+
+#define fifo_empty(fifo)	((fifo)->front == (fifo)->back)
+#define fifo_full(fifo)		(fifo_used(fifo) == (fifo)->size)
+
+#define fifo_peek_front(fifo)	((fifo)->data[(fifo)->front & (fifo)->mask])
+#define fifo_peek_back(fifo)	((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
+
+#define fifo_entry_idx_abs(fifo, p)					\
+	((((p) >= &fifo_peek_front(fifo)				\
+	   ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) +		\
+	   (((p) - (fifo)->data)))
+
+#define fifo_entry_idx(fifo, p)	(((p) - &fifo_peek_front(fifo)) & (fifo)->mask)
+#define fifo_idx_entry(fifo, i)	((fifo)->data[((fifo)->front + (i)) & (fifo)->mask])
+
+#define fifo_push_back_ref(f)						\
+	(fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask])
+
+#define fifo_push_front_ref(f)						\
+	(fifo_full((f)) ? NULL : &(f)->data[--(f)->front & (f)->mask])
+
+#define fifo_push_back(fifo, new)					\
+({									\
+	typeof((fifo)->data) _r = fifo_push_back_ref(fifo);		\
+	if (_r)								\
+		*_r = (new);						\
+	_r != NULL;							\
+})
+
+#define fifo_push_front(fifo, new)					\
+({									\
+	typeof((fifo)->data) _r = fifo_push_front_ref(fifo);		\
+	if (_r)								\
+		*_r = (new);						\
+	_r != NULL;							\
+})
+
+#define fifo_pop_front(fifo, i)						\
+({									\
+	bool _r = !fifo_empty((fifo));					\
+	if (_r)								\
+		(i) = (fifo)->data[(fifo)->front++ & (fifo)->mask];	\
+	_r;								\
+})
+
+#define fifo_pop_back(fifo, i)						\
+({									\
+	bool _r = !fifo_empty((fifo));					\
+	if (_r)								\
+		(i) = (fifo)->data[--(fifo)->back & (fifo)->mask];	\
+	_r;								\
+})
+
+#define fifo_push_ref(fifo)	fifo_push_back_ref(fifo)
+#define fifo_push(fifo, i)	fifo_push_back(fifo, (i))
+#define fifo_pop(fifo, i)	fifo_pop_front(fifo, (i))
+#define fifo_peek(fifo)		fifo_peek_front(fifo)
+/* Walk @_fifo from front to back, copying each element into @_entry; @_iter has the type of ->front */
+#define fifo_for_each_entry(_entry, _fifo, _iter)			\
+	for (typecheck(typeof((_fifo)->front), _iter),			\
+	     (_iter) = (_fifo)->front;					\
+	     (((_iter) != (_fifo)->back) &&				\
+	      ((_entry) = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \
+	     (_iter)++)
+/* Walk @_fifo from front to back, pointing @_ptr at each element in place; @_iter has the type of ->front */
+#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter)			\
+	for (typecheck(typeof((_fifo)->front), _iter),			\
+	     (_iter) = (_fifo)->front;					\
+	     (((_iter) != (_fifo)->back) &&				\
+	      ((_ptr) = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \
+	     (_iter)++)
+
+#endif /* _BCACHEFS_FIFO_H */
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
new file mode 100644
index 000000000000..bb5305441f27
--- /dev/null
+++ b/fs/bcachefs/fs-common.c
@@ -0,0 +1,501 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "acl.h"
+#include "btree_update.h"
+#include "dirent.h"
+#include "fs-common.h"
+#include "inode.h"
+#include "subvolume.h"
+#include "xattr.h"
+
+#include <linux/posix_acl.h>
+
+static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode)
+{
+	return S_ISDIR(inode->bi_mode) && !inode->bi_subvol;	/* a directory with no bi_subvol set */
+}
+
+int bch2_create_trans(struct btree_trans *trans,
+		      subvol_inum dir,
+		      struct bch_inode_unpacked *dir_u,
+		      struct bch_inode_unpacked *new_inode,
+		      const struct qstr *name,
+		      uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+		      struct posix_acl *default_acl,
+		      struct posix_acl *acl,
+		      subvol_inum snapshot_src,
+		      unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter dir_iter = { NULL };
+	struct btree_iter inode_iter = { NULL };
+	subvol_inum new_inum = dir;
+	u64 now = bch2_current_time(c);
+	u64 cpu = raw_smp_processor_id();
+	u64 dir_target;
+	u32 snapshot;
+	unsigned dir_type = mode_to_type(mode);
+	int ret;
+	/* Look up the snapshot of the directory's subvolume, then the directory inode: */
+	ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
+	if (ret)
+		goto err;
+
+	ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
+	if (ret)
+		goto err;
+
+	if (!(flags & BCH_CREATE_SNAPSHOT)) {
+		/* Normal create path - allocate a new inode: */
+		bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
+
+		if (flags & BCH_CREATE_TMPFILE)
+			new_inode->bi_flags |= BCH_INODE_UNLINKED;
+
+		ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu);
+		if (ret)
+			goto err;
+
+		snapshot_src = (subvol_inum) { 0 };
+	} else {
+		/*
+		 * Creating a snapshot - we're not allocating a new inode, but
+		 * we do have to lookup the root inode of the subvolume we're
+		 * snapshotting and update it (in the new snapshot):
+		 */
+
+		if (!snapshot_src.inum) {
+			/* Inode wasn't specified, just snapshot: */
+			struct bch_subvolume s;
+
+			ret = bch2_subvolume_get(trans, snapshot_src.subvol, true,
+						 BTREE_ITER_CACHED, &s);
+			if (ret)
+				goto err;
+
+			snapshot_src.inum = le64_to_cpu(s.inode);
+		}
+
+		ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src,
+				      BTREE_ITER_INTENT);
+		if (ret)
+			goto err;
+
+		if (new_inode->bi_subvol != snapshot_src.subvol) {
+			/* Not a subvolume root: */
+			ret = -EINVAL;
+			goto err;
+		}
+
+		/*
+		 * If we're not root, we have to own the subvolume being
+		 * snapshotted:
+		 */
+		if (uid && new_inode->bi_uid != uid) {
+			ret = -EPERM;
+			goto err;
+		}
+
+		flags |= BCH_CREATE_SUBVOL;
+	}
+	/* Defaults for the new dirent; both are overridden below when a subvolume is created: */
+	new_inum.inum	= new_inode->bi_inum;
+	dir_target	= new_inode->bi_inum;
+
+	if (flags & BCH_CREATE_SUBVOL) {
+		u32 new_subvol, dir_snapshot;
+
+		ret = bch2_subvolume_create(trans, new_inode->bi_inum,
+					    snapshot_src.subvol,
+					    &new_subvol, &snapshot,
+					    (flags & BCH_CREATE_SNAPSHOT_RO) != 0);
+		if (ret)
+			goto err;
+
+		new_inode->bi_parent_subvol	= dir.subvol;
+		new_inode->bi_subvol		= new_subvol;
+		new_inum.subvol			= new_subvol;
+		dir_target			= new_subvol;
+		dir_type			= DT_SUBVOL;
+		/* Point dir_iter at the current snapshot of the directory's subvolume and re-traverse: */
+		ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot);
+		if (ret)
+			goto err;
+
+		bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot);
+		ret = bch2_btree_iter_traverse(&dir_iter);
+		if (ret)
+			goto err;
+	}
+
+	if (!(flags & BCH_CREATE_SNAPSHOT)) {
+		if (default_acl) {
+			ret = bch2_set_acl_trans(trans, new_inum, new_inode,
+						 default_acl, ACL_TYPE_DEFAULT);
+			if (ret)
+				goto err;
+		}
+
+		if (acl) {
+			ret = bch2_set_acl_trans(trans, new_inum, new_inode,
+						 acl, ACL_TYPE_ACCESS);
+			if (ret)
+				goto err;
+		}
+	}
+	/* Tmpfiles get no dirent and leave the parent directory inode untouched: */
+	if (!(flags & BCH_CREATE_TMPFILE)) {
+		struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u);
+		u64 dir_offset;
+
+		if (is_subdir_for_nlink(new_inode))
+			dir_u->bi_nlink++;
+		dir_u->bi_mtime = dir_u->bi_ctime = now;
+
+		ret = bch2_inode_write(trans, &dir_iter, dir_u);
+		if (ret)
+			goto err;
+
+		ret = bch2_dirent_create(trans, dir, &dir_hash,
+					 dir_type,
+					 name,
+					 dir_target,
+					 &dir_offset,
+					 BCH_HASH_SET_MUST_CREATE);
+		if (ret)
+			goto err;
+
+		if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+			new_inode->bi_dir		= dir_u->bi_inum;
+			new_inode->bi_dir_offset	= dir_offset;
+		}
+	}
+	/* Position inode_iter at @snapshot (all-snapshots flag cleared) and write the inode: */
+	inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+	bch2_btree_iter_set_snapshot(&inode_iter, snapshot);
+
+	ret   = bch2_btree_iter_traverse(&inode_iter) ?:
+		bch2_inode_write(trans, &inode_iter, new_inode);
+err:
+	bch2_trans_iter_exit(trans, &inode_iter);
+	bch2_trans_iter_exit(trans, &dir_iter);
+	return ret;
+}
+
+int bch2_link_trans(struct btree_trans *trans,
+		    subvol_inum dir,  struct bch_inode_unpacked *dir_u,
+		    subvol_inum inum, struct bch_inode_unpacked *inode_u,
+		    const struct qstr *name)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter dir_iter = { NULL };
+	struct btree_iter inode_iter = { NULL };
+	struct bch_hash_info dir_hash;
+	u64 now = bch2_current_time(c);
+	u64 dir_offset = 0;
+	int ret;
+	/* Hard links may not cross subvolumes (no iterators are live yet, so plain return): */
+	if (dir.subvol != inum.subvol)
+		return -EXDEV;
+
+	ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
+	if (ret)
+		goto err;
+
+	inode_u->bi_ctime = now;
+	ret = bch2_inode_nlink_inc(inode_u);
+	if (ret)
+		goto err;	/* inode_iter is initialised: it must be released at err */
+
+	ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
+	if (ret)
+		goto err;
+	/* Refuse the link if it would change any option the inode inherits from its directory: */
+	if (bch2_reinherit_attrs(inode_u, dir_u)) {
+		ret = -EXDEV;
+		goto err;
+	}
+
+	dir_u->bi_mtime = dir_u->bi_ctime = now;
+
+	dir_hash = bch2_hash_info_init(c, dir_u);
+
+	ret = bch2_dirent_create(trans, dir, &dir_hash,
+				 mode_to_type(inode_u->bi_mode),
+				 name, inum.inum, &dir_offset,
+				 BCH_HASH_SET_MUST_CREATE);
+	if (ret)
+		goto err;
+
+	if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+		inode_u->bi_dir		= dir.inum;
+		inode_u->bi_dir_offset	= dir_offset;
+	}
+
+	ret =   bch2_inode_write(trans, &dir_iter, dir_u) ?:
+		bch2_inode_write(trans, &inode_iter, inode_u);
+err:
+	bch2_trans_iter_exit(trans, &dir_iter);
+	bch2_trans_iter_exit(trans, &inode_iter);
+	return ret;
+}
+
+int bch2_unlink_trans(struct btree_trans *trans,
+		      subvol_inum dir,
+		      struct bch_inode_unpacked *dir_u,
+		      struct bch_inode_unpacked *inode_u,
+		      const struct qstr *name,
+		      bool deleting_snapshot)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter dir_iter = { NULL };
+	struct btree_iter dirent_iter = { NULL };
+	struct btree_iter inode_iter = { NULL };
+	struct bch_hash_info dir_hash;
+	subvol_inum inum;
+	u64 now = bch2_current_time(c);
+	struct bkey_s_c k;
+	int ret;
+	/* Look up the directory inode, then the dirent for @name, then the inode it names: */
+	ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
+	if (ret)
+		goto err;
+
+	dir_hash = bch2_hash_info_init(c, dir_u);
+
+	ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
+					 name, &inum, BTREE_ITER_INTENT);
+	if (ret)
+		goto err;
+
+	ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum,
+			      BTREE_ITER_INTENT);
+	if (ret)
+		goto err;
+	/* Directories must be empty, except when a snapshot is being deleted: */
+	if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) {
+		ret = bch2_empty_dir_trans(trans, inum);
+		if (ret)
+			goto err;
+	}
+	/* Snapshot deletion only applies to inodes that have bi_subvol set: */
+	if (deleting_snapshot && !inode_u->bi_subvol) {
+		ret = -BCH_ERR_ENOENT_not_subvol;
+		goto err;
+	}
+
+	if (deleting_snapshot || inode_u->bi_subvol) {
+		ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol);
+		if (ret)
+			goto err;
+
+		k = bch2_btree_iter_peek_slot(&dirent_iter);
+		ret = bkey_err(k);
+		if (ret)
+			goto err;
+
+		/*
+		 * If we're deleting a subvolume, we need to really delete the
+		 * dirent, not just emit a whiteout in the current snapshot:
+		 */
+		bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot);
+		ret = bch2_btree_iter_traverse(&dirent_iter);
+		if (ret)
+			goto err;
+	} else {
+		bch2_inode_nlink_dec(trans, inode_u);
+	}
+	/* Clear the inode's bi_dir/bi_dir_offset if they refer to the dirent being removed: */
+	if (inode_u->bi_dir		== dirent_iter.pos.inode &&
+	    inode_u->bi_dir_offset	== dirent_iter.pos.offset) {
+		inode_u->bi_dir		= 0;
+		inode_u->bi_dir_offset	= 0;
+	}
+
+	dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
+	dir_u->bi_nlink -= is_subdir_for_nlink(inode_u);
+
+	ret =   bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+				    &dir_hash, &dirent_iter,
+				    BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+		bch2_inode_write(trans, &dir_iter, dir_u) ?:
+		bch2_inode_write(trans, &inode_iter, inode_u);
+err:
+	bch2_trans_iter_exit(trans, &inode_iter);
+	bch2_trans_iter_exit(trans, &dirent_iter);
+	bch2_trans_iter_exit(trans, &dir_iter);
+	return ret;
+}
+
+bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
+			  struct bch_inode_unpacked *src_u)
+{
+	bool changed = false;
+	unsigned opt;
+
+	/* Re-inherit from @src_u every option that was not explicitly set on @dst_u: */
+	for (opt = 0; opt < Inode_opt_nr; opt++) {
+		u64 inherited, current_val;
+
+		if (dst_u->bi_fields_set & (1 << opt))
+			continue;
+
+		inherited	= bch2_inode_opt_get(src_u, opt);
+		current_val	= bch2_inode_opt_get(dst_u, opt);
+		if (inherited != current_val) {
+			bch2_inode_opt_set(dst_u, opt, inherited);
+			changed = true;
+		}
+	}
+
+	/* True iff at least one option of @dst_u was modified */
+	return changed;
+}
+
+int bch2_rename_trans(struct btree_trans *trans,
+		      subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u,
+		      subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u,
+		      struct bch_inode_unpacked *src_inode_u,
+		      struct bch_inode_unpacked *dst_inode_u,
+		      const struct qstr *src_name,
+		      const struct qstr *dst_name,
+		      enum bch_rename_mode mode)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter src_dir_iter = { NULL };
+	struct btree_iter dst_dir_iter = { NULL };
+	struct btree_iter src_inode_iter = { NULL };
+	struct btree_iter dst_inode_iter = { NULL };
+	struct bch_hash_info src_hash, dst_hash;
+	subvol_inum src_inum, dst_inum;
+	u64 src_offset, dst_offset;
+	u64 now = bch2_current_time(c);
+	int ret;
+	/* Look up the source directory, and the destination directory if it is a different one: */
+	ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir,
+			      BTREE_ITER_INTENT);
+	if (ret)
+		goto err;
+
+	src_hash = bch2_hash_info_init(c, src_dir_u);
+
+	if (dst_dir.inum	!= src_dir.inum ||
+	    dst_dir.subvol	!= src_dir.subvol) {
+		ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir,
+				      BTREE_ITER_INTENT);
+		if (ret)
+			goto err;
+
+		dst_hash = bch2_hash_info_init(c, dst_dir_u);
+	} else {
+		dst_dir_u = src_dir_u;
+		dst_hash = src_hash;
+	}
+	/* Update the dirents; this fills in the src/dst inums and dirent offsets used below: */
+	ret = bch2_dirent_rename(trans,
+				 src_dir, &src_hash,
+				 dst_dir, &dst_hash,
+				 src_name, &src_inum, &src_offset,
+				 dst_name, &dst_inum, &dst_offset,
+				 mode);
+	if (ret)
+		goto err;
+
+	ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum,
+			      BTREE_ITER_INTENT);
+	if (ret)
+		goto err;
+
+	if (dst_inum.inum) {
+		ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum,
+				      BTREE_ITER_INTENT);
+		if (ret)
+			goto err;
+	}
+
+	if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+		src_inode_u->bi_dir		= dst_dir_u->bi_inum;
+		src_inode_u->bi_dir_offset	= dst_offset;
+
+		if (mode == BCH_RENAME_EXCHANGE) {
+			dst_inode_u->bi_dir		= src_dir_u->bi_inum;
+			dst_inode_u->bi_dir_offset	= src_offset;
+		}
+
+		if (mode == BCH_RENAME_OVERWRITE &&
+		    dst_inode_u->bi_dir		== dst_dir_u->bi_inum &&
+		    dst_inode_u->bi_dir_offset	== src_offset) {
+			dst_inode_u->bi_dir		= 0;
+			dst_inode_u->bi_dir_offset	= 0;
+		}
+	}
+	/* Overwrite: both must be (non-)directories alike, and a replaced directory must be empty: */
+	if (mode == BCH_RENAME_OVERWRITE) {
+		if (S_ISDIR(src_inode_u->bi_mode) !=
+		    S_ISDIR(dst_inode_u->bi_mode)) {
+			ret = -ENOTDIR;
+			goto err;
+		}
+
+		if (S_ISDIR(dst_inode_u->bi_mode)) {
+			ret = bch2_empty_dir_trans(trans, dst_inum);
+			if (ret)	/* propagate the real error, as bch2_unlink_trans() does */
+				goto err;
+		}
+	}
+
+	if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) &&
+	    S_ISDIR(src_inode_u->bi_mode)) {
+		ret = -EXDEV;
+		goto err;
+	}
+
+	if (mode == BCH_RENAME_EXCHANGE &&
+	    bch2_reinherit_attrs(dst_inode_u, src_dir_u) &&
+	    S_ISDIR(dst_inode_u->bi_mode)) {
+		ret = -EXDEV;
+		goto err;
+	}
+	/* A moved subdirectory takes its nlink contribution from the old parent to the new one: */
+	if (is_subdir_for_nlink(src_inode_u)) {
+		src_dir_u->bi_nlink--;
+		dst_dir_u->bi_nlink++;
+	}
+
+	if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) {
+		dst_dir_u->bi_nlink--;
+		src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE;
+	}
+
+	if (mode == BCH_RENAME_OVERWRITE)
+		bch2_inode_nlink_dec(trans, dst_inode_u);
+
+	src_dir_u->bi_mtime		= now;
+	src_dir_u->bi_ctime		= now;
+
+	if (src_dir.inum != dst_dir.inum) {
+		dst_dir_u->bi_mtime	= now;
+		dst_dir_u->bi_ctime	= now;
+	}
+
+	src_inode_u->bi_ctime		= now;
+
+	if (dst_inum.inum)
+		dst_inode_u->bi_ctime	= now;
+
+	ret =   bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?:
+		(src_dir.inum != dst_dir.inum
+		 ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u)
+		 : 0) ?:
+		bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?:
+		(dst_inum.inum
+		 ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u)
+		 : 0);
+err:
+	bch2_trans_iter_exit(trans, &dst_inode_iter);
+	bch2_trans_iter_exit(trans, &src_inode_iter);
+	bch2_trans_iter_exit(trans, &dst_dir_iter);
+	bch2_trans_iter_exit(trans, &src_dir_iter);
+	return ret;
+}
diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h
new file mode 100644
index 000000000000..dde237859514
--- /dev/null
+++ b/fs/bcachefs/fs-common.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_COMMON_H
+#define _BCACHEFS_FS_COMMON_H
+
+struct posix_acl;
+
+#define BCH_CREATE_TMPFILE		(1U << 0)
+#define BCH_CREATE_SUBVOL		(1U << 1)
+#define BCH_CREATE_SNAPSHOT		(1U << 2)
+#define BCH_CREATE_SNAPSHOT_RO		(1U << 3)
+
+int bch2_create_trans(struct btree_trans *, subvol_inum,
+		      struct bch_inode_unpacked *,
+		      struct bch_inode_unpacked *,
+		      const struct qstr *,
+		      uid_t, gid_t, umode_t, dev_t,
+		      struct posix_acl *,
+		      struct posix_acl *,
+		      subvol_inum, unsigned);
+
+int bch2_link_trans(struct btree_trans *,
+		    subvol_inum, struct bch_inode_unpacked *,
+		    subvol_inum, struct bch_inode_unpacked *,
+		    const struct qstr *);
+
+int bch2_unlink_trans(struct btree_trans *, subvol_inum,
+		      struct bch_inode_unpacked *,
+		      struct bch_inode_unpacked *,
+		      const struct qstr *, bool);
+
+int bch2_rename_trans(struct btree_trans *,
+		      subvol_inum, struct bch_inode_unpacked *,
+		      subvol_inum, struct bch_inode_unpacked *,
+		      struct bch_inode_unpacked *,
+		      struct bch_inode_unpacked *,
+		      const struct qstr *,
+		      const struct qstr *,
+		      enum bch_rename_mode);
+
+bool bch2_reinherit_attrs(struct bch_inode_unpacked *,
+			  struct bch_inode_unpacked *);
+
+#endif /* _BCACHEFS_FS_COMMON_H */
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
new file mode 100644
index 000000000000..58ccc7b91ac7
--- /dev/null
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -0,0 +1,1093 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "fs-io.h"
+#include "fs-io-buffered.h"
+#include "fs-io-direct.h"
+#include "fs-io-pagecache.h"
+#include "io_read.h"
+#include "io_write.h"
+
+#include <linux/backing-dev.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+
+static inline bool bio_full(struct bio *bio, unsigned len)
+{
+	/* Full when no bvec slot is left, or when adding @len would overflow bi_size: */
+	bool no_vecs_left  = bio->bi_vcnt >= bio->bi_max_vecs;
+	bool size_overflow = bio->bi_iter.bi_size > UINT_MAX - len;
+
+	return no_vecs_left || size_overflow;
+}
+
+/* readpage(s): */
+
+static void bch2_readpages_end_io(struct bio *bio)
+{
+	struct folio_iter fi;
+	/* Mark every folio of the bio up to date or errored per bi_status, then unlock it: */
+	bio_for_each_folio_all(fi, bio) {
+		if (bio->bi_status) {
+			folio_clear_uptodate(fi.folio);
+			folio_set_error(fi.folio);
+		} else {
+			folio_mark_uptodate(fi.folio);
+		}
+		folio_unlock(fi.folio);
+	}
+
+	bio_put(bio);
+}
+
+struct readpages_iter {
+	struct address_space	*mapping;
+	unsigned		idx;
+	folios			folios;
+};
+
+static int readpages_iter_init(struct readpages_iter *iter,
+			       struct readahead_control *ractl)
+{
+	struct folio **fi;
+	int ret;
+
+	memset(iter, 0, sizeof(*iter));
+
+	iter->mapping = ractl->mapping;
+	/* Gather the folios for byte range [_index, _index + _nr_pages) << PAGE_SHIFT into iter->folios: */
+	ret = bch2_filemap_get_contig_folios_d(iter->mapping,
+				ractl->_index << PAGE_SHIFT,
+				(ractl->_index + ractl->_nr_pages) << PAGE_SHIFT,
+				0, mapping_gfp_mask(iter->mapping),
+				&iter->folios);
+	if (ret)
+		return ret;
+	/* Deduct each folio's pages from @ractl and call __bch2_folio_create() on it: */
+	darray_for_each(iter->folios, fi) {
+		ractl->_nr_pages -= 1U << folio_order(*fi);
+		__bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL);
+		folio_put(*fi);
+		folio_put(*fi);	/* NOTE(review): two references dropped per folio - confirm which holders */
+	}
+
+	return 0;
+}
+
+static inline struct folio *readpage_iter_peek(struct readpages_iter *iter)
+{
+	return iter->idx < iter->folios.nr
+		? iter->folios.data[iter->idx]
+		: NULL;	/* every gathered folio has been consumed */
+}
+
+static inline void readpage_iter_advance(struct readpages_iter *iter)
+{
+	iter->idx++;
+}
+
+static bool extent_partial_reads_expensive(struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	struct bch_extent_crc_unpacked crc;
+	const union bch_extent_entry *i;
+	/* True if any crc entry of @k has a checksum type or a compression type set: */
+	bkey_for_each_crc(k.k, ptrs, crc, i)
+		if (crc.csum_type || crc.compression_type)
+			return true;
+	return false;
+}
+
+static int readpage_bio_extend(struct btree_trans *trans,
+			       struct readpages_iter *iter,
+			       struct bio *bio,
+			       unsigned sectors_this_extent,
+			       bool get_more)
+{
+	/* Don't hold btree locks while allocating memory: */
+	bch2_trans_unlock(trans);
+	/* Grow @bio until it covers @sectors_this_extent or has no bvec slots left: */
+	while (bio_sectors(bio) < sectors_this_extent &&
+	       bio->bi_vcnt < bio->bi_max_vecs) {
+		struct folio *folio = readpage_iter_peek(iter);
+		int ret;
+
+		if (folio) {
+			readpage_iter_advance(iter);
+		} else {
+			pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;
+			/* Gathered folios are used up: only add new ones to the page cache if asked to: */
+			if (!get_more)
+				break;
+			/* Stop if the page cache already holds a folio at this index: */
+			folio = xa_load(&iter->mapping->i_pages, folio_offset);
+			if (folio && !xa_is_value(folio))
+				break;
+
+			folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0);
+			if (!folio)
+				break;
+
+			if (!__bch2_folio_create(folio, GFP_KERNEL)) {
+				folio_put(folio);
+				break;
+			}
+
+			ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL);
+			if (ret) {
+				__bch2_folio_release(folio);
+				folio_put(folio);
+				break;
+			}
+
+			folio_put(folio);
+		}
+		/* The folio must start exactly where the bio currently ends: */
+		BUG_ON(folio_sector(folio) != bio_end_sector(bio));
+
+		BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0));
+	}
+
+	return bch2_trans_relock(trans);
+}
+
+static void bchfs_read(struct btree_trans *trans,
+		       struct bch_read_bio *rbio,
+		       subvol_inum inum,
+		       struct readpages_iter *readpages_iter)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct bkey_buf sk;
+	int flags = BCH_READ_RETRY_IF_STALE|
+		BCH_READ_MAY_PROMOTE;
+	u32 snapshot;
+	int ret = 0;
+
+	rbio->c = c;
+	rbio->start_time = local_clock();
+	rbio->subvol = inum.subvol;
+
+	bch2_bkey_buf_init(&sk);
+retry:
+	bch2_trans_begin(trans);
+	iter = (struct btree_iter) { NULL };
+
+	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+	if (ret)
+		goto err;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+			     SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
+			     BTREE_ITER_SLOTS);
+	while (1) {
+		struct bkey_s_c k;
+		unsigned bytes, sectors, offset_into_extent;
+		enum btree_id data_btree = BTREE_ID_extents;
+
+		/*
+		 * read_extent -> io_time_reset may cause a transaction restart
+		 * without returning an error, we need to check for that here:
+		 */
+		ret = bch2_trans_relock(trans);
+		if (ret)
+			break;
+
+		bch2_btree_iter_set_pos(&iter,
+				POS(inum.inum, rbio->bio.bi_iter.bi_sector));
+
+		k = bch2_btree_iter_peek_slot(&iter);
+		ret = bkey_err(k);
+		if (ret)
+			break;
+
+		offset_into_extent = iter.pos.offset -
+			bkey_start_offset(k.k);
+		sectors = k.k->size - offset_into_extent;
+
+		bch2_bkey_buf_reassemble(&sk, c, k);
+
+		ret = bch2_read_indirect_extent(trans, &data_btree,
+					&offset_into_extent, &sk);
+		if (ret)
+			break;
+
+		k = bkey_i_to_s_c(sk.k);
+
+		sectors = min(sectors, k.k->size - offset_into_extent);
+
+		if (readpages_iter) {
+			ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
+						  extent_partial_reads_expensive(k));
+			if (ret)
+				break;
+		}
+		/* Temporarily shrink the bio to the part of it that this extent covers: */
+		bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
+		swap(rbio->bio.bi_iter.bi_size, bytes);
+		/* Nothing was trimmed off, so this extent covers the rest of the bio: */
+		if (rbio->bio.bi_iter.bi_size == bytes)
+			flags |= BCH_READ_LAST_FRAGMENT;
+
+		bch2_bio_page_state_set(&rbio->bio, k);
+
+		bch2_read_extent(trans, rbio, iter.pos,
+				 data_btree, k, offset_into_extent, flags);
+
+		if (flags & BCH_READ_LAST_FRAGMENT)
+			break;
+		/* Restore the full size, then advance past the part just handed to bch2_read_extent(): */
+		swap(rbio->bio.bi_iter.bi_size, bytes);
+		bio_advance(&rbio->bio, bytes);
+
+		ret = btree_trans_too_many_iters(trans);
+		if (ret)
+			break;
+	}
+err:
+	bch2_trans_iter_exit(trans, &iter);
+
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		goto retry;
+	/* Any other error fails the bio with BLK_STS_IOERR: */
+	if (ret) {
+		bch_err_inum_offset_ratelimited(c,
+				iter.pos.inode,
+				iter.pos.offset << 9,
+				"read error %i from btree lookup", ret);
+		rbio->bio.bi_status = BLK_STS_IOERR;
+		bio_endio(&rbio->bio);
+	}
+
+	bch2_bkey_buf_exit(&sk, c);
+}
+
+void bch2_readahead(struct readahead_control *ractl)
+{
+	struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_io_opts opts;
+	struct btree_trans *trans = bch2_trans_get(c);	/* put again at the end */
+	struct folio *folio;
+	struct readpages_iter readpages_iter;
+	int ret;
+
+	bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+	ret = readpages_iter_init(&readpages_iter, ractl);
+	BUG_ON(ret);
+
+	bch2_pagecache_add_get(inode);
+	/* One read bio per iteration, starting at the next folio in the iterator: */
+	while ((folio = readpage_iter_peek(&readpages_iter))) {
+		unsigned n = min_t(unsigned,	/* folios left in the iterator, capped */
+				   readpages_iter.folios.nr -
+				   readpages_iter.idx,
+				   BIO_MAX_VECS);
+		struct bch_read_bio *rbio =
+			rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
+						   GFP_KERNEL, &c->bio_read),
+				  opts);
+		/* @folio is added to the bio below, so step the iterator past it: */
+		readpage_iter_advance(&readpages_iter);
+
+		rbio->bio.bi_iter.bi_sector = folio_sector(folio);
+		rbio->bio.bi_end_io = bch2_readpages_end_io;
+		BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
+		/* bchfs_read() is handed the iterator so it can extend the bio from it: */
+		bchfs_read(trans, rbio, inode_inum(inode),
+			   &readpages_iter);
+		bch2_trans_unlock(trans);
+	}
+
+	bch2_pagecache_add_put(inode);
+
+	bch2_trans_put(trans);
+	darray_exit(&readpages_iter.folios);
+}
+
+static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio,
+			     subvol_inum inum, struct folio *folio)
+{
+	bch2_folio_create(folio, __GFP_NOFAIL);
+	/* Set @rbio up as a REQ_SYNC read covering exactly this one folio: */
+	rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
+	rbio->bio.bi_iter.bi_sector = folio_sector(folio);
+	BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
+	/* NULL readpages_iter: bchfs_read() will not try to extend the bio: */
+	bch2_trans_run(c, (bchfs_read(trans, rbio, inum, NULL), 0));
+}
+
+static void bch2_read_single_folio_end_io(struct bio *bio)
+{
+	complete(bio->bi_private);	/* the completion bch2_read_single_folio() waits on */
+}
+
+int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
+{
+	struct bch_inode_info *inode = to_bch_ei(mapping->host);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_read_bio *rbio;
+	struct bch_io_opts opts;
+	int ret;
+	DECLARE_COMPLETION_ONSTACK(done);
+
+	bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+	rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
+			 opts);
+	rbio->bio.bi_private = &done;
+	rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
+	/* Submit the read, then sleep until the end_io handler completes @done: */
+	__bchfs_readfolio(c, rbio, inode_inum(inode), folio);
+	wait_for_completion(&done);
+	/* Pick up the status before dropping our reference to the bio: */
+	ret = blk_status_to_errno(rbio->bio.bi_status);
+	bio_put(&rbio->bio);
+
+	if (ret < 0)
+		return ret;
+	/* Read succeeded; the folio is left locked for the caller to unlock: */
+	folio_mark_uptodate(folio);
+	return 0;
+}
+
+int bch2_read_folio(struct file *file, struct folio *folio)
+{
+	int ret;	/* @file is not used */
+
+	ret = bch2_read_single_folio(folio, folio->mapping);
+	folio_unlock(folio);	/* unlocked whether or not the read succeeded */
+	return bch2_err_class(ret);
+}
+
+/* writepages: */
+
+struct bch_writepage_io {
+	struct bch_inode_info		*inode;	/* inode whose folios this write carries */
+
+	/* must be last (the bio from c->writepage_bioset is embedded in it): */
+	struct bch_write_op		op;
+};
+
+struct bch_writepage_state {
+	struct bch_writepage_io	*io;		/* write being built up; NULL when none is pending */
+	struct bch_io_opts	opts;		/* filled by bch2_inode_opts_get() */
+	struct bch_folio_sector	*tmp;		/* scratch copy of one folio's per-sector state */
+	unsigned		tmp_sectors;	/* number of entries @tmp has room for */
+};
+
+static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
+								  struct bch_inode_info *inode)
+{
+	struct bch_writepage_state ret = { 0 };
+	/* Everything else stays zeroed: no pending io and no scratch buffer yet: */
+	bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode);
+	return ret;
+}
+
+static void bch2_writepage_io_done(struct bch_write_op *op)
+{
+	struct bch_writepage_io *io =
+		container_of(op, struct bch_writepage_io, op);
+	struct bch_fs *c = io->op.c;
+	struct bio *bio = &io->op.wbio.bio;
+	struct folio_iter fi;
+	unsigned i;
+
+	if (io->op.error) {
+		set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
+		/* Flag every folio in the bio as failed and clear its replica counts: */
+		bio_for_each_folio_all(fi, bio) {
+			struct bch_folio *s;
+
+			folio_set_error(fi.folio);
+			mapping_set_error(fi.folio->mapping, -EIO);
+
+			s = __bch2_folio(fi.folio);
+			spin_lock(&s->lock);
+			for (i = 0; i < folio_sectors(fi.folio); i++)
+				s->s[i].nr_replicas = 0;
+			spin_unlock(&s->lock);
+		}
+	}
+	/* Op reported BCH_WRITE_WROTE_DATA_INLINE: clear the replica counts too: */
+	if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
+		bio_for_each_folio_all(fi, bio) {
+			struct bch_folio *s;
+
+			s = __bch2_folio(fi.folio);
+			spin_lock(&s->lock);
+			for (i = 0; i < folio_sectors(fi.folio); i++)
+				s->s[i].nr_replicas = 0;
+			spin_unlock(&s->lock);
+		}
+	}
+
+	/*
+	 * racing with fallocate can cause us to add fewer sectors than
+	 * expected - but we shouldn't add more sectors than expected:
+	 */
+	WARN_ON_ONCE(io->op.i_sectors_delta > 0);
+
+	/*
+	 * (An error, e.g. from going RO, halfway through a page can throw
+	 * that off slightly.)
+	 * XXX: disabled assertion, kept for reference:
+	 *	BUG_ON(io->op.i_sectors_delta >= PAGE_SECTORS);
+	 */
+
+	/*
+	 * The folios' writeback state is effectively our ref on the inode -
+	 * fix up i_blocks before calling folio_end_writeback():
+	 */
+	bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
+	/* Drop one write_count per folio; whoever reaches zero ends writeback: */
+	bio_for_each_folio_all(fi, bio) {
+		struct bch_folio *s = __bch2_folio(fi.folio);
+
+		if (atomic_dec_and_test(&s->write_count))
+			folio_end_writeback(fi.folio);
+	}
+
+	bio_put(&io->op.wbio.bio);
+}
+
+static void bch2_writepage_do_io(struct bch_writepage_state *w)
+{
+	struct bch_writepage_io *io = w->io;
+
+	w->io = NULL;	/* detach first: the next segment has to allocate a new io */
+	closure_call(&io->op.cl, bch2_write, NULL, NULL);
+}
+
+/*
+ * Allocate a new bch_writepage_io into w->io and set up its write op for
+ * @inode starting at @sector; the caller adds the folio data to the bio:
+ */
+static void bch2_writepage_io_alloc(struct bch_fs *c,
+				    struct writeback_control *wbc,
+				    struct bch_writepage_state *w,
+				    struct bch_inode_info *inode,
+				    u64 sector,
+				    unsigned nr_replicas)
+{
+	struct bch_write_op *op;
+
+	w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
+					      REQ_OP_WRITE,
+					      GFP_KERNEL,
+					      &c->writepage_bioset),
+			     struct bch_writepage_io, op.wbio.bio);
+	/* Configure the op from the writeback state, the inode and @wbc: */
+	w->io->inode		= inode;
+	op			= &w->io->op;
+	bch2_write_op_init(op, c, w->opts);
+	op->target		= w->opts.foreground_target;
+	op->nr_replicas		= nr_replicas;
+	op->res.nr_replicas	= nr_replicas;
+	op->write_point		= writepoint_hashed(inode->ei_last_dirtied);
+	op->subvol		= inode->ei_subvol;
+	op->pos			= POS(inode->v.i_ino, sector);
+	op->end_io		= bch2_writepage_io_done;
+	op->devs_need_flush	= &inode->ei_devs_need_flush;
+	op->wbio.bio.bi_iter.bi_sector = sector;
+	op->wbio.bio.bi_opf	= wbc_to_write_flags(wbc);
+}
+
+/* write_cache_pages() callback: queue @folio's dirty sectors for writeback via @data. */
+static int __bch2_writepage(struct folio *folio,
+			    struct writeback_control *wbc, void *data)
+{
+	struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_writepage_state *w = data;
+	struct bch_folio *s;
+	unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX;
+	loff_t i_size = i_size_read(&inode->v);
+	int ret;
+
+	EBUG_ON(!folio_test_uptodate(folio));
+
+	/* Is the folio fully inside i_size? */
+	if (folio_end_pos(folio) <= i_size)
+		goto do_io;
+
+	/* Is the folio fully outside i_size? (truncate in progress) */
+	if (folio_pos(folio) >= i_size) {
+		folio_unlock(folio);
+		return 0;
+	}
+
+	/*
+	 * The folio straddles i_size.  It must be zeroed out on each and every
+	 * writepage invocation because it may be mmapped.  "A file is mapped
+	 * in multiples of the folio size.  For a file that is not a multiple of
+	 * the  folio size, the remaining memory is zeroed when mapped, and
+	 * writes to that region are not written out to the file."
+	 */
+	folio_zero_segment(folio, i_size - folio_pos(folio),
+			   folio_size(folio));
+do_io:
+	f_sectors = folio_sectors(folio);
+	s = bch2_folio(folio);
+
+	/*
+	 * w->tmp is a scratch copy of the folio's sector state. __GFP_NOFAIL is
+	 * only honoured by a mask that may block, so pair it with GFP_NOFS:
+	 */
+	if (f_sectors > w->tmp_sectors) {
+		kfree(w->tmp);
+		w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), GFP_NOFS|__GFP_NOFAIL);
+		w->tmp_sectors = f_sectors;
+	}
+
+	/* Things get really hairy with errors during writeback: */
+	ret = bch2_get_folio_disk_reservation(c, inode, folio, false);
+	BUG_ON(ret);
+
+	/* Before unlocking the folio, snapshot its per-sector state into w->tmp: */
+	spin_lock(&s->lock);
+	memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors);
+
+	for (i = 0; i < f_sectors; i++) {
+		if (s->s[i].state < SECTOR_dirty)
+			continue;
+
+		nr_replicas_this_write =
+			min_t(unsigned, nr_replicas_this_write,
+			      s->s[i].nr_replicas +
+			      s->s[i].replicas_reserved);
+	}
+
+	for (i = 0; i < f_sectors; i++) {
+		if (s->s[i].state < SECTOR_dirty)
+			continue;
+
+		s->s[i].nr_replicas = w->opts.compression ? 0 : nr_replicas_this_write;
+
+		s->s[i].replicas_reserved = 0;
+		bch2_folio_sector_set(folio, s, i, SECTOR_allocated);
+	}
+	spin_unlock(&s->lock);
+	/* Hold one count ourselves so writeback can't end before we finish queueing: */
+	BUG_ON(atomic_read(&s->write_count));
+	atomic_set(&s->write_count, 1);
+
+	BUG_ON(folio_test_writeback(folio));
+	folio_start_writeback(folio);
+
+	folio_unlock(folio);
+	/* Walk the snapshot, queueing each run of sectors that is at least dirty: */
+	offset = 0;
+	while (1) {
+		unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0;
+		u64 sector;
+
+		while (offset < f_sectors &&
+		       w->tmp[offset].state < SECTOR_dirty)
+			offset++;
+
+		if (offset == f_sectors)
+			break;
+
+		while (offset + sectors < f_sectors &&
+		       w->tmp[offset + sectors].state >= SECTOR_dirty) {
+			reserved_sectors += w->tmp[offset + sectors].replicas_reserved;
+			dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty;
+			sectors++;
+		}
+		BUG_ON(!sectors);
+
+		sector = folio_sector(folio) + offset;
+		/* Submit the pending io if this run can't be appended to it: */
+		if (w->io &&
+		    (w->io->op.res.nr_replicas != nr_replicas_this_write ||
+		     bio_full(&w->io->op.wbio.bio, sectors << 9) ||
+		     w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >=
+		     (BIO_MAX_VECS * PAGE_SIZE) ||
+		     bio_end_sector(&w->io->op.wbio.bio) != sector))
+			bch2_writepage_do_io(w);
+
+		if (!w->io)
+			bch2_writepage_io_alloc(c, wbc, w, inode, sector,
+						nr_replicas_this_write);
+		/* One count per run added to a bio; bch2_writepage_io_done() drops it: */
+		atomic_inc(&s->write_count);
+
+		BUG_ON(inode != w->io->inode);
+		BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
+				     sectors << 9, offset << 9));
+
+		/* Check for writing past i_size: */
+		WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
+			  round_up(i_size, block_bytes(c)) &&
+			  !test_bit(BCH_FS_EMERGENCY_RO, &c->flags),
+			  "writing past i_size: %llu > %llu (unrounded %llu)\n",
+			  bio_end_sector(&w->io->op.wbio.bio) << 9,
+			  round_up(i_size, block_bytes(c)),
+			  i_size);
+
+		w->io->op.res.sectors += reserved_sectors;
+		w->io->op.i_sectors_delta -= dirty_sectors;
+		w->io->op.new_i_size = i_size;
+
+		offset += sectors;
+	}
+	/* Drop our own count; if no run was queued this ends writeback right here: */
+	if (atomic_dec_and_test(&s->write_count))
+		folio_end_writeback(folio);
+
+	return 0;
+}
+
+int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+	struct bch_fs *c = mapping->host->i_sb->s_fs_info;
+	struct bch_writepage_state w =
+		bch_writepage_state_init(c, to_bch_ei(mapping->host));
+	struct blk_plug plug;
+	int ret;
+
+	blk_start_plug(&plug);
+	ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
+	if (w.io)	/* submit whatever the last folio left pending */
+		bch2_writepage_do_io(&w);
+	blk_finish_plug(&plug);
+	kfree(w.tmp);	/* scratch buffer allocated by __bch2_writepage() */
+	return bch2_err_class(ret);
+}
+
+/* buffered writes: */
+
+int bch2_write_begin(struct file *file, struct address_space *mapping,
+		     loff_t pos, unsigned len,
+		     struct page **pagep, void **fsdata)
+{
+	struct bch_inode_info *inode = to_bch_ei(mapping->host);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch2_folio_reservation *res;
+	struct folio *folio;
+	unsigned offset;
+	int ret = -ENOMEM;	/* what a failed folio lookup below returns */
+
+	res = kmalloc(sizeof(*res), GFP_KERNEL);
+	if (!res)
+		return -ENOMEM;
+	/* Passed to bch2_write_end() through *fsdata; freed here on failure: */
+	bch2_folio_reservation_init(c, inode, res);
+	*fsdata = res;
+
+	bch2_pagecache_add_get(inode);
+
+	folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
+				FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE,
+				mapping_gfp_mask(mapping));
+	if (IS_ERR_OR_NULL(folio))
+		goto err_unlock;
+	/* Clamp the write so it ends no later than the end of this folio: */
+	offset = pos - folio_pos(folio);
+	len = min_t(size_t, len, folio_end_pos(folio) - pos);
+
+	if (folio_test_uptodate(folio))
+		goto out;
+
+	/* If we're writing entire folio, don't need to read it in first: */
+	if (!offset && len == folio_size(folio))
+		goto out;
+	/* Starts at the folio and reaches i_size: zero the tail instead of reading: */
+	if (!offset && pos + len >= inode->v.i_size) {
+		folio_zero_segment(folio, len, folio_size(folio));
+		flush_dcache_folio(folio);
+		goto out;
+	}
+	/* Folio lies entirely past i_size: zero everything around the written range: */
+	if (folio_pos(folio) >= inode->v.i_size) {
+		folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio));
+		flush_dcache_folio(folio);
+		goto out;
+	}
+readpage:
+	ret = bch2_read_single_folio(folio, mapping);
+	if (ret)
+		goto err;
+out:
+	ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
+	if (ret)
+		goto err;
+
+	ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len);
+	if (ret) {
+		if (!folio_test_uptodate(folio)) {
+			/*
+			 * If the folio hasn't been read in, we won't know if we
+			 * actually need a reservation - we don't actually need
+			 * to read here, we just need to check if the folio is
+			 * fully backed by uncompressed data:
+			 */
+			goto readpage;
+		}
+
+		goto err;
+	}
+	/* Success: the folio is returned locked and with a reference held: */
+	*pagep = &folio->page;
+	return 0;
+err:
+	folio_unlock(folio);
+	folio_put(folio);
+	*pagep = NULL;
+err_unlock:
+	bch2_pagecache_add_put(inode);
+	kfree(res);
+	*fsdata = NULL;
+	return bch2_err_class(ret);
+}
+
+int bch2_write_end(struct file *file, struct address_space *mapping,
+		   loff_t pos, unsigned len, unsigned copied,
+		   struct page *page, void *fsdata)
+{
+	struct bch_inode_info *inode = to_bch_ei(mapping->host);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch2_folio_reservation *res = fsdata;	/* allocated by bch2_write_begin() */
+	struct folio *folio = page_folio(page);
+	unsigned offset = pos - folio_pos(folio);
+
+	lockdep_assert_held(&inode->v.i_rwsem);
+	BUG_ON(offset + copied > folio_size(folio));
+
+	if (unlikely(copied < len && !folio_test_uptodate(folio))) {
+		/*
+		 * The folio needs to be read in, but that would destroy
+		 * our partial write - simplest thing is to just force
+		 * userspace to redo the write:
+		 */
+		folio_zero_range(folio, 0, folio_size(folio));
+		flush_dcache_folio(folio);
+		copied = 0;
+	}
+	/* Extend i_size if the copied data reaches past it: */
+	spin_lock(&inode->v.i_lock);
+	if (pos + copied > inode->v.i_size)
+		i_size_write(&inode->v, pos + copied);
+	spin_unlock(&inode->v.i_lock);
+
+	if (copied) {
+		if (!folio_test_uptodate(folio))
+			folio_mark_uptodate(folio);
+
+		bch2_set_folio_dirty(c, inode, folio, res, offset, copied);
+		/* The dirtying task is later hashed into the write point at writeback: */
+		inode->ei_last_dirtied = (unsigned long) current;
+	}
+	/* Undo what bch2_write_begin() took: folio lock and ref, pagecache_add: */
+	folio_unlock(folio);
+	folio_put(folio);
+	bch2_pagecache_add_put(inode);
+
+	bch2_folio_reservation_put(c, inode, res);
+	kfree(res);
+
+	return copied;
+}
+
+static noinline void folios_trunc(folios *fs, struct folio **fi)
+{
+	while (fs->data + fs->nr > fi) {	/* pops *fi and every entry after it */
+		struct folio *f = darray_pop(fs);
+
+		folio_unlock(f);
+		folio_put(f);
+	}
+}
+
+static int __bch2_buffered_write(struct bch_inode_info *inode,
+				 struct address_space *mapping,
+				 struct iov_iter *iter,
+				 loff_t pos, unsigned len)
+{
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch2_folio_reservation res;
+	folios fs;
+	struct folio **fi, *f;
+	unsigned copied = 0, f_offset, f_copied;
+	u64 end = pos + len, f_pos, f_len;
+	loff_t last_folio_pos = inode->v.i_size;
+	int ret = 0;
+
+	BUG_ON(!len);
+
+	bch2_folio_reservation_init(c, inode, &res);
+	darray_init(&fs);
+	/* Get the run of folios starting at @pos, locked and with references held: */
+	ret = bch2_filemap_get_contig_folios_d(mapping, pos, end,
+				   FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT,
+				   mapping_gfp_mask(mapping),
+				   &fs);
+	if (ret)
+		goto out;
+
+	BUG_ON(!fs.nr);
+	/* A write starting mid-folio needs the first folio's existing contents: */
+	f = darray_first(fs);
+	if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
+		ret = bch2_read_single_folio(f, mapping);
+		if (ret)
+			goto out;
+	}
+	/* Same for a write ending mid-folio, unless it reaches i_size (then zero): */
+	f = darray_last(fs);
+	end = min(end, folio_end_pos(f));
+	last_folio_pos = folio_pos(f);
+	if (end != folio_end_pos(f) && !folio_test_uptodate(f)) {
+		if (end >= inode->v.i_size) {
+			folio_zero_range(f, 0, folio_size(f));
+		} else {
+			ret = bch2_read_single_folio(f, mapping);
+			if (ret)
+				goto out;
+		}
+	}
+
+	ret = bch2_folio_set(c, inode_inum(inode), fs.data, fs.nr);
+	if (ret)
+		goto out;
+	/* Pass 1: take a reservation for each folio's share of the write: */
+	f_pos = pos;
+	f_offset = pos - folio_pos(darray_first(fs));
+	darray_for_each(fs, fi) {
+		f = *fi;
+		f_len = min(end, folio_end_pos(f)) - f_pos;
+
+		/*
+		 * XXX: per POSIX and fstests generic/275, on -ENOSPC we're
+		 * supposed to write as much as we have disk space for.
+		 *
+		 * On failure here we should still write out a partial page if
+		 * we aren't completely out of disk space - we don't do that
+		 * yet:
+		 */
+		ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len);
+		if (unlikely(ret)) {
+			folios_trunc(&fs, fi);
+			if (!fs.nr)
+				goto out;
+
+			end = min(end, folio_end_pos(darray_last(fs)));
+			break;
+		}
+
+		f_pos = folio_end_pos(f);
+		f_offset = 0;
+	}
+
+	if (mapping_writably_mapped(mapping))
+		darray_for_each(fs, fi)
+			flush_dcache_folio(*fi);
+	/* Pass 2: copy the user data in, stopping at the first short copy: */
+	f_pos = pos;
+	f_offset = pos - folio_pos(darray_first(fs));
+	darray_for_each(fs, fi) {
+		f = *fi;
+		f_len = min(end, folio_end_pos(f)) - f_pos;
+		f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter);
+		if (!f_copied) {
+			folios_trunc(&fs, fi);
+			break;
+		}
+		/* A partly filled !uptodate folio inside i_size can't be kept: undo the copy: */
+		if (!folio_test_uptodate(f) &&
+		    f_copied != folio_size(f) &&
+		    pos + copied + f_copied < inode->v.i_size) {
+			iov_iter_revert(iter, f_copied);
+			folio_zero_range(f, 0, folio_size(f));
+			folios_trunc(&fs, fi);
+			break;
+		}
+
+		flush_dcache_folio(f);
+		copied += f_copied;
+
+		if (f_copied != f_len) {
+			folios_trunc(&fs, fi + 1);
+			break;
+		}
+
+		f_pos = folio_end_pos(f);
+		f_offset = 0;
+	}
+
+	if (!copied)
+		goto out;
+
+	end = pos + copied;
+
+	spin_lock(&inode->v.i_lock);
+	if (end > inode->v.i_size)
+		i_size_write(&inode->v, end);
+	spin_unlock(&inode->v.i_lock);
+	/* Pass 3: mark the folios that received data uptodate and dirty: */
+	f_pos = pos;
+	f_offset = pos - folio_pos(darray_first(fs));
+	darray_for_each(fs, fi) {
+		f = *fi;
+		f_len = min(end, folio_end_pos(f)) - f_pos;
+
+		if (!folio_test_uptodate(f))
+			folio_mark_uptodate(f);
+
+		bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len);
+
+		f_pos = folio_end_pos(f);
+		f_offset = 0;
+	}
+
+	inode->ei_last_dirtied = (unsigned long) current;
+out:
+	darray_for_each(fs, fi) {
+		folio_unlock(*fi);
+		folio_put(*fi);
+	}
+
+	/*
+	 * If the last folio added to the mapping starts beyond current EOF, we
+	 * performed a short write but left around at least one post-EOF folio.
+	 * Clean up the mapping before we return.
+	 */
+	if (last_folio_pos >= inode->v.i_size)
+		truncate_pagecache(&inode->v, inode->v.i_size);
+
+	darray_exit(&fs);
+	bch2_folio_reservation_put(c, inode, &res);
+	/* Bytes copied if there were any, otherwise @ret (0 or a negative error): */
+	return copied ?: ret;
+}
+
+static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct bch_inode_info *inode = file_bch_inode(file);
+	loff_t pos = iocb->ki_pos;
+	ssize_t written = 0;
+	int ret = 0;
+
+	bch2_pagecache_add_get(inode);
+
+	do {
+		unsigned offset = pos & (PAGE_SIZE - 1);	/* only sizes the fallback chunks */
+		unsigned bytes = iov_iter_count(iter);
+again:
+		/*
+		 * Bring in the user page that we will copy from _first_.
+		 * Otherwise there's a nasty deadlock on copying from the
+		 * same page as we're writing to, without it being marked
+		 * up-to-date.
+		 *
+		 * Not only is this an optimisation, but it is also required
+		 * to check that the address is actually valid, when atomic
+		 * usercopies are used, below.
+		 */
+		if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
+			bytes = min_t(unsigned long, iov_iter_count(iter),
+				      PAGE_SIZE - offset);
+
+			if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
+				ret = -EFAULT;
+				break;
+			}
+		}
+
+		if (unlikely(fatal_signal_pending(current))) {
+			ret = -EINTR;
+			break;
+		}
+		/* Returns the bytes copied, 0 if none were, or a negative error: */
+		ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
+		if (unlikely(ret < 0))
+			break;
+
+		cond_resched();
+
+		if (unlikely(ret == 0)) {
+			/*
+			 * If we were unable to copy any data at all, we must
+			 * fall back to a single segment length write.
+			 *
+			 * If we didn't fallback here, we could livelock
+			 * because not all segments in the iov can be copied at
+			 * once without a pagefault.
+			 */
+			bytes = min_t(unsigned long, PAGE_SIZE - offset,
+				      iov_iter_single_seg_count(iter));
+			goto again;
+		}
+		pos += ret;
+		written += ret;
+		ret = 0;
+
+		balance_dirty_pages_ratelimited(mapping);
+	} while (iov_iter_count(iter));
+
+	bch2_pagecache_add_put(inode);
+	/* A partial write reports its byte count; the error is returned only if nothing was written: */
+	return written ? written : ret;
+}
+
+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct bch_inode_info *inode = file_bch_inode(file);
+	ssize_t ret;
+	/* O_DIRECT writes go to bch2_direct_write() and skip everything below: */
+	if (iocb->ki_flags & IOCB_DIRECT) {
+		ret = bch2_direct_write(iocb, from);
+		goto out;
+	}
+
+	inode_lock(&inode->v);
+
+	ret = generic_write_checks(iocb, from);
+	if (ret <= 0)
+		goto unlock;
+
+	ret = file_remove_privs(file);
+	if (ret)
+		goto unlock;
+
+	ret = file_update_time(file);
+	if (ret)
+		goto unlock;
+
+	ret = bch2_buffered_write(iocb, from);
+	if (likely(ret > 0))
+		iocb->ki_pos += ret;
+unlock:
+	inode_unlock(&inode->v);
+	/* generic_write_sync() runs only after the inode lock has been dropped: */
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
+out:
+	return bch2_err_class(ret);
+}
+
+void bch2_fs_fs_io_buffered_exit(struct bch_fs *c)
+{
+	bioset_exit(&c->writepage_bioset);	/* set up by bch2_fs_fs_io_buffered_init() */
+}
+
+int bch2_fs_fs_io_buffered_init(struct bch_fs *c)
+{
+	if (bioset_init(&c->writepage_bioset,	/* bios here sit inside a bch_writepage_io */
+			4, offsetof(struct bch_writepage_io, op.wbio.bio),
+			BIOSET_NEED_BVECS))
+		return -BCH_ERR_ENOMEM_writepage_bioset_init;
+
+	return 0;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h
new file mode 100644
index 000000000000..a6126ff790e6
--- /dev/null
+++ b/fs/bcachefs/fs-io-buffered.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IO_BUFFERED_H
+#define _BCACHEFS_FS_IO_BUFFERED_H
+
+#ifndef NO_BCACHEFS_FS
+
+int bch2_read_single_folio(struct folio *, struct address_space *);
+int bch2_read_folio(struct file *, struct folio *);
+
+int bch2_writepages(struct address_space *, struct writeback_control *);
+void bch2_readahead(struct readahead_control *);
+
+int bch2_write_begin(struct file *, struct address_space *, loff_t,
+		     unsigned, struct page **, void **);
+int bch2_write_end(struct file *, struct address_space *, loff_t,
+		   unsigned, unsigned, struct page *, void *);
+
+ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
+
+void bch2_fs_fs_io_buffered_exit(struct bch_fs *);
+int bch2_fs_fs_io_buffered_init(struct bch_fs *);
+#else
+static inline void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) {}
+static inline int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { return 0; }
+#endif
+
+#endif /* _BCACHEFS_FS_IO_BUFFERED_H */
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
new file mode 100644
index 000000000000..6a9557e7ecab
--- /dev/null
+++ b/fs/bcachefs/fs-io-direct.c
@@ -0,0 +1,679 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "fs.h"
+#include "fs-io.h"
+#include "fs-io-direct.h"
+#include "fs-io-pagecache.h"
+#include "io_read.h"
+#include "io_write.h"
+
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+#include <linux/prefetch.h>
+#include <linux/task_io_accounting_ops.h>
+
+/* O_DIRECT reads */
+
+struct dio_read {
+	struct closure			cl;	/* one ref per bio still in flight */
+	struct kiocb			*req;	/* completed via ki_complete() when async */
+	long				ret;	/* byte count, or errno once a bio has failed */
+	bool				should_dirty;	/* set from iter_is_iovec() */
+	struct bch_read_bio		rbio;	/* first bio; dio is recovered from it via container_of() */
+};
+
+static void bio_check_or_release(struct bio *bio, bool check_dirty)
+{
+	if (check_dirty) {
+		bio_check_pages_dirty(bio);	/* NOTE(review): no bio_put() here - expected to release the bio itself */
+	} else {
+		bio_release_pages(bio, false);
+		bio_put(bio);
+	}
+}
+
+static void bch2_dio_read_complete(struct closure *cl)
+{
+	struct dio_read *dio = container_of(cl, struct dio_read, cl);
+	/* Installed as the closure fn for async reads only: report the result, then release: */
+	dio->req->ki_complete(dio->req, dio->ret);
+	bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
+}
+
+static void bch2_direct_IO_read_endio(struct bio *bio)
+{
+	struct dio_read *dio = bio->bi_private;
+
+	if (bio->bi_status)	/* an error replaces the byte count in dio->ret */
+		dio->ret = blk_status_to_errno(bio->bi_status);
+	/* Drop this bio's ref on the dio closure: */
+	closure_put(&dio->cl);
+}
+
+static void bch2_direct_IO_read_split_endio(struct bio *bio)
+{
+	struct dio_read *dio = bio->bi_private;
+	bool should_dirty = dio->should_dirty;	/* read before the closure_put() in the call below */
+	/* Split bios are not embedded in @dio, so they are released separately here: */
+	bch2_direct_IO_read_endio(bio);
+	bio_check_or_release(bio, should_dirty);
+}
+
+static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
+{
+	struct file *file = req->ki_filp;
+	struct bch_inode_info *inode = file_bch_inode(file);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_io_opts opts;
+	struct dio_read *dio;
+	struct bio *bio;
+	loff_t offset = req->ki_pos;
+	bool sync = is_sync_kiocb(req);
+	size_t shorten;
+	ssize_t ret;
+
+	bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+	/* Both the file offset and the length must be block aligned: */
+	if ((offset|iter->count) & (block_bytes(c) - 1))
+		return -EINVAL;
+	/* Bytes we can return: the request clamped to what lies below i_size: */
+	ret = min_t(loff_t, iter->count,
+		    max_t(loff_t, 0, i_size_read(&inode->v) - offset));
+
+	if (!ret)
+		return ret;
+	/* Trim the iter to that, rounded up to a block; restored after the loop: */
+	shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
+	iter->count -= shorten;
+
+	bio = bio_alloc_bioset(NULL,
+			       bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+			       REQ_OP_READ,
+			       GFP_KERNEL,
+			       &c->dio_read_bioset);
+
+	bio->bi_end_io = bch2_direct_IO_read_endio;
+	/* Bios from dio_read_bioset are embedded in a struct dio_read: */
+	dio = container_of(bio, struct dio_read, rbio.bio);
+	closure_init(&dio->cl, NULL);
+
+	/*
+	 * this is a _really_ horrible hack just to avoid an atomic sub at the
+	 * end:
+	 */
+	if (!sync) {
+		set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
+		atomic_set(&dio->cl.remaining,
+			   CLOSURE_REMAINING_INITIALIZER -
+			   CLOSURE_RUNNING +
+			   CLOSURE_DESTRUCTOR);
+	} else {
+		atomic_set(&dio->cl.remaining,
+			   CLOSURE_REMAINING_INITIALIZER + 1);
+	}
+
+	dio->req	= req;
+	dio->ret	= ret;
+	/*
+	 * This is one of the sketchier things I've encountered: we have to skip
+	 * the dirtying of requests that are internal from the kernel (i.e. from
+	 * loopback), because we'll deadlock on page_lock.
+	 */
+	dio->should_dirty = iter_is_iovec(iter);
+	/* The first pass uses the bio allocated above, so jump past the allocation: */
+	goto start;
+	while (iter->count) {
+		bio = bio_alloc_bioset(NULL,
+				       bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+				       REQ_OP_READ,
+				       GFP_KERNEL,
+				       &c->bio_read);
+		bio->bi_end_io		= bch2_direct_IO_read_split_endio;
+start:
+		bio->bi_opf		= REQ_OP_READ|REQ_SYNC;
+		bio->bi_iter.bi_sector	= offset >> 9;
+		bio->bi_private		= dio;
+
+		ret = bio_iov_iter_get_pages(bio, iter);
+		if (ret < 0) {
+			/* XXX: fault inject this path */
+			bio->bi_status = BLK_STS_RESOURCE;
+			bio_endio(bio);
+			break;
+		}
+
+		offset += bio->bi_iter.bi_size;
+
+		if (dio->should_dirty)
+			bio_set_pages_dirty(bio);
+		/* More data left means another bio, which needs its own closure ref: */
+		if (iter->count)
+			closure_get(&dio->cl);
+
+		bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
+	}
+
+	iter->count += shorten;
+	/* Sync: wait for every bio and return the result; async: completion reports it: */
+	if (sync) {
+		closure_sync(&dio->cl);
+		closure_debug_destroy(&dio->cl);
+		ret = dio->ret;
+		bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
+		return ret;
+	} else {
+		return -EIOCBQUEUED;
+	}
+}
+
+ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct file *file = iocb->ki_filp;
+	struct bch_inode_info *inode = file_bch_inode(file);
+	struct address_space *mapping = file->f_mapping;
+	size_t count = iov_iter_count(iter);
+	ssize_t ret;
+
+	if (!count)
+		return 0; /* skip atime */
+
+	if (iocb->ki_flags & IOCB_DIRECT) {
+		struct blk_plug plug;
+		/* Write back any cached pages in the range before reading around the cache: */
+		if (unlikely(mapping->nrpages)) {
+			ret = filemap_write_and_wait_range(mapping,
+						iocb->ki_pos,
+						iocb->ki_pos + count - 1);
+			if (ret < 0)
+				goto out;
+		}
+
+		file_accessed(file);
+
+		blk_start_plug(&plug);
+		ret = bch2_direct_IO_read(iocb, iter);
+		blk_finish_plug(&plug);
+		/* Non-negative means a byte count: advance the file position by it: */
+		if (ret >= 0)
+			iocb->ki_pos += ret;
+	} else {
+		bch2_pagecache_add_get(inode);
+		ret = generic_file_read_iter(iocb, iter);
+		bch2_pagecache_add_put(inode);
+	}
+out:
+	return bch2_err_class(ret);
+}
+
+/* O_DIRECT writes */
+
+struct dio_write {
+	struct kiocb			*req;	/* completed via ki_complete() when async */
+	struct address_space		*mapping;
+	struct bch_inode_info		*inode;
+	struct mm_struct		*mm;
+	unsigned			loop:1,
+					extending:1,	/* update i_size as the write advances */
+					sync:1,		/* wait and return the result directly */
+					flush:1,	/* run bch2_dio_write_flush() before completing */
+					free_iov:1;	/* iter.__iov was kmalloc'd by us */
+	struct quota_res		quota_res;
+	u64				written;	/* in sectors; shifted by 9 for bytes */
+
+	struct iov_iter			iter;
+	struct iovec			inline_vecs[2];	/* avoids an allocation for short iovecs */
+
+	/* must be last: */
+	struct bch_write_op		op;
+};
+
+static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
+				       u64 offset, u64 size,
+				       unsigned nr_replicas, bool compressed)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	u64 end = offset + size;
+	u32 snapshot;
+	bool ret = true;	/* cleared by the first key that fails a check */
+	int err;
+retry:
+	bch2_trans_begin(trans);
+
+	err = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+	if (err)
+		goto err;
+	/* Walk the extent slots from @offset until one starts at or past @end: */
+	for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
+			   SPOS(inum.inum, offset, snapshot),
+			   BTREE_ITER_SLOTS, k, err) {
+		if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
+			break;
+		/* Other snapshot, too few replicas, or compressed when that isn't allowed: */
+		if (k.k->p.snapshot != snapshot ||
+		    nr_replicas > bch2_bkey_replicas(c, k) ||
+		    (!compressed && bch2_bkey_sectors_compressed(k))) {
+			ret = false;
+			break;
+		}
+	}
+
+	offset = iter.pos.offset;
+	bch2_trans_iter_exit(trans, &iter);
+err:
+	if (bch2_err_matches(err, BCH_ERR_transaction_restart))
+		goto retry;
+	bch2_trans_put(trans);
+	/* Any error other than a transaction restart counts as "not allocated": */
+	return err ? false : ret;
+}
+
+static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
+{
+	struct bch_fs *c = dio->op.c;
+	struct bch_inode_info *inode = dio->inode;
+	struct bio *bio = &dio->op.wbio.bio;
+	/* Check the sectors this bio covers against the op's replica and compression options: */
+	return bch2_check_range_allocated(c, inode_inum(inode),
+				dio->op.pos.offset, bio_sectors(bio),
+				dio->op.opts.data_replicas,
+				dio->op.opts.compression != 0);
+}
+
+static void bch2_dio_write_loop_async(struct bch_write_op *);
+static __always_inline long bch2_dio_write_done(struct dio_write *dio);
+
+/*
+ * We're going to return -EIOCBQUEUED, but we haven't finished consuming the
+ * iov_iter yet, so we need to stash a copy of the iovec: it might be on the
+ * caller's stack, we're not guaranteed that it will live for the duration of
+ * the IO:
+ */
+static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
+{
+	struct iovec *iov = dio->inline_vecs;
+
+	/*
+	 * iov_iter has a single embedded iovec - nothing to do:
+	 */
+	if (iter_is_ubuf(&dio->iter))
+		return 0;
+
+	/*
+	 * We don't currently handle non-iovec iov_iters here - return an error,
+	 * and we'll fall back to doing the IO synchronously:
+	 */
+	if (!iter_is_iovec(&dio->iter))
+		return -1;
+	/* Too many segments for inline_vecs: allocate, and flag it for kfree() later: */
+	if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
+		iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
+				    GFP_KERNEL);
+		if (unlikely(!iov))
+			return -ENOMEM;
+
+		dio->free_iov = true;
+	}
+	/* Copy the segments and repoint the iter at the copy: */
+	memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov));
+	dio->iter.__iov = iov;
+	return 0;
+}
+
+static void bch2_dio_write_flush_done(struct closure *cl)
+{
+	struct dio_write *dio = container_of(cl, struct dio_write, op.cl);
+	struct bch_fs *c = dio->op.c;
+
+	closure_debug_destroy(cl);
+	/* Store the journal error state in the op so bch2_dio_write_done() reports it: */
+	dio->op.error = bch2_journal_error(&c->journal);
+
+	bch2_dio_write_done(dio);
+}
+
+static noinline void bch2_dio_write_flush(struct dio_write *dio)
+{
+	struct bch_fs *c = dio->op.c;
+	struct bch_inode_unpacked inode;
+	int ret;
+
+	dio->flush = 0;	/* so the next bch2_dio_write_done() call doesn't flush again */
+
+	closure_init(&dio->op.cl, NULL);
+	/* Nothing to flush if the write already failed; the inode lookup can fail too: */
+	if (!dio->op.error) {
+		ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode);
+		if (ret) {
+			dio->op.error = ret;
+		} else {
+			bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq,
+						     &dio->op.cl);
+			bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl);
+		}
+	}
+	/* Sync: wait right here. Async: bch2_dio_write_flush_done() picks up afterwards: */
+	if (dio->sync) {
+		closure_sync(&dio->op.cl);
+		closure_debug_destroy(&dio->op.cl);
+	} else {
+		continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL);
+	}
+}
+
+static __always_inline long bch2_dio_write_done(struct dio_write *dio)
+{
+	struct kiocb *req = dio->req;
+	struct bch_inode_info *inode = dio->inode;
+	bool sync = dio->sync;
+	long ret;
+
+	if (unlikely(dio->flush)) {
+		bch2_dio_write_flush(dio);
+		if (!sync)
+			return -EIOCBQUEUED;
+	}
+
+	bch2_pagecache_block_put(inode);
+
+	if (dio->free_iov)
+		kfree(dio->iter.__iov);
+	/* dio->written counts sectors; the value returned is in bytes: */
+	ret = dio->op.error ?: ((long) dio->written << 9);
+	bio_put(&dio->op.wbio.bio);
+	/* NOTE(review): only locals are used from here on - @dio looks to be freed with the bio */
+	/* inode->i_dio_count is our ref on inode and thus bch_fs */
+	inode_dio_end(&inode->v);
+
+	if (ret < 0)
+		ret = bch2_err_class(ret);
+	/* Async: the result goes to the completion callback, not to our caller: */
+	if (!sync) {
+		req->ki_complete(req, ret);
+		ret = -EIOCBQUEUED;
+	}
+	return ret;
+}
+
+static __always_inline void bch2_dio_write_end(struct dio_write *dio)
+{	/* Account one finished write op: file position, i_size, quota, pages */
+	struct bch_fs *c = dio->op.c;
+	struct kiocb *req = dio->req;
+	struct bch_inode_info *inode = dio->inode;
+	struct bio *bio = &dio->op.wbio.bio;
+
+	req->ki_pos	+= (u64) dio->op.written << 9;	/* sectors -> bytes */
+	dio->written	+= dio->op.written;
+
+	if (dio->extending) {	/* write reached past i_size: grow it if needed */
+		spin_lock(&inode->v.i_lock);
+		if (req->ki_pos > inode->v.i_size)
+			i_size_write(&inode->v, req->ki_pos);
+		spin_unlock(&inode->v.i_lock);
+	}
+
+	if (dio->op.i_sectors_delta || dio->quota_res.sectors) {
+		mutex_lock(&inode->ei_quota_lock);
+		__bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta);
+		__bch2_quota_reservation_put(c, inode, &dio->quota_res);
+		mutex_unlock(&inode->ei_quota_lock);
+	}
+
+	bio_release_pages(bio, false);
+
+	if (unlikely(dio->op.error))
+		set_bit(EI_INODE_ERROR, &inode->ei_flags);
+}
+
+static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
+{	/* Issue the dio write one bio at a time until the iov_iter is drained */
+	struct bch_fs *c = dio->op.c;
+	struct kiocb *req = dio->req;
+	struct address_space *mapping = dio->mapping;
+	struct bch_inode_info *inode = dio->inode;
+	struct bch_io_opts opts;
+	struct bio *bio = &dio->op.wbio.bio;
+	unsigned unaligned, iter_count;
+	bool sync = dio->sync, dropped_locks;
+	long ret;
+
+	bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+	while (1) {
+		iter_count = dio->iter.count;	/* bytes remaining before this bio */
+
+		EBUG_ON(current->faults_disabled_mapping);
+		current->faults_disabled_mapping = mapping;
+
+		ret = bio_iov_iter_get_pages(bio, &dio->iter);
+
+		dropped_locks = fdm_dropped_locks();
+
+		current->faults_disabled_mapping = NULL;
+
+		/*
+		 * If the fault handler returned an error but also signalled
+		 * that it dropped & retook ei_pagecache_lock, we just need to
+		 * re-shoot down the page cache and retry:
+		 */
+		if (dropped_locks && ret)
+			ret = 0;
+
+		if (unlikely(ret < 0))
+			goto err;
+
+		if (unlikely(dropped_locks)) {
+			ret = bch2_write_invalidate_inode_pages_range(mapping,
+					req->ki_pos,
+					req->ki_pos + iter_count - 1);
+			if (unlikely(ret))
+				goto err;
+
+			if (!bio->bi_iter.bi_size)	/* got no pages: try again */
+				continue;
+		}
+
+		unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
+		bio->bi_iter.bi_size -= unaligned;	/* write whole blocks only */
+		iov_iter_revert(&dio->iter, unaligned);
+
+		if (!bio->bi_iter.bi_size) {
+			/*
+			 * bio_iov_iter_get_pages was only able to get <
+			 * blocksize worth of pages:
+			 */
+			ret = -EFAULT;
+			goto err;
+		}
+
+		bch2_write_op_init(&dio->op, c, opts);
+		dio->op.end_io		= sync
+			? NULL
+			: bch2_dio_write_loop_async;
+		dio->op.target		= dio->op.opts.foreground_target;
+		dio->op.write_point	= writepoint_hashed((unsigned long) current);
+		dio->op.nr_replicas	= dio->op.opts.data_replicas;
+		dio->op.subvol		= inode->ei_subvol;
+		dio->op.pos		= POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
+		dio->op.devs_need_flush	= &inode->ei_devs_need_flush;
+
+		if (sync)
+			dio->op.flags |= BCH_WRITE_SYNC;
+		dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;
+
+		ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
+						 bio_sectors(bio), true);
+		if (unlikely(ret))
+			goto err;
+
+		ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
+						dio->op.opts.data_replicas, 0);
+		if (unlikely(ret) &&
+		    !bch2_dio_write_check_allocated(dio))
+			goto err;
+
+		task_io_account_write(bio->bi_iter.bi_size);
+
+		if (unlikely(dio->iter.count) &&
+		    !dio->sync &&
+		    !dio->loop &&
+		    bch2_dio_write_copy_iov(dio))
+			dio->sync = sync = true;	/* couldn't keep the iovec: go sync */
+
+		dio->loop = true;
+		closure_call(&dio->op.cl, bch2_write, NULL, NULL);
+
+		if (!sync)
+			return -EIOCBQUEUED;	/* bch2_dio_write_loop_async() continues */
+
+		bch2_dio_write_end(dio);
+
+		if (likely(!dio->iter.count) || dio->op.error)
+			break;
+
+		bio_reset(bio, NULL, REQ_OP_WRITE);	/* reuse the bio for the next pass */
+	}
+out:
+	return bch2_dio_write_done(dio);
+err:
+	dio->op.error = ret;
+
+	bio_release_pages(bio, false);
+
+	bch2_quota_reservation_put(c, inode, &dio->quota_res);
+	goto out;
+}
+
+static noinline __cold void bch2_dio_write_continue(struct dio_write *dio)
+{	/* Async path: more of the iter remains, so run the write loop again */
+	struct mm_struct *mm = dio->mm;	/* NOTE(review): cached; dio may be gone after the loop */
+
+	bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE);
+
+	if (mm)
+		kthread_use_mm(mm);	/* mm recorded from the submitting task */
+	bch2_dio_write_loop(dio);
+	if (mm)
+		kthread_unuse_mm(mm);
+}
+
+static void bch2_dio_write_loop_async(struct bch_write_op *op)
+{
+	/* end_io hook for async dio writes: account this op, then loop or finish */
+	struct dio_write *dio = container_of(op, struct dio_write, op);
+
+	bch2_dio_write_end(dio);
+	if (unlikely(dio->iter.count) && !dio->op.error)
+		bch2_dio_write_continue(dio);
+	else
+		bch2_dio_write_done(dio);
+}
+
+/* O_DIRECT write: returns bytes written, -EIOCBQUEUED if async, or -errno */
+ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
+{
+	struct file *file = req->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct bch_inode_info *inode = file_bch_inode(file);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct dio_write *dio;
+	struct bio *bio;
+	bool locked = true, extending;
+	ssize_t ret;
+
+	prefetch(&c->opts);
+	prefetch((void *) &c->opts + 64);
+	prefetch(&inode->ei_inode);
+	prefetch((void *) &inode->ei_inode + 64);
+
+	inode_lock(&inode->v);
+
+	ret = generic_write_checks(req, iter);
+	if (unlikely(ret <= 0))
+		goto err;
+
+	ret = file_remove_privs(file);
+	if (unlikely(ret))
+		goto err;
+
+	ret = file_update_time(file);
+	if (unlikely(ret))
+		goto err;
+
+	/* Position and length must be block aligned; ret is 0 here, so set it: */
+	if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	inode_dio_begin(&inode->v);
+	bch2_pagecache_block_get(inode);
+
+	extending = req->ki_pos + iter->count > inode->v.i_size;
+	if (!extending) {
+		inode_unlock(&inode->v);
+		locked = false;
+	}
+
+	bio = bio_alloc_bioset(NULL, bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+			       REQ_OP_WRITE, GFP_KERNEL, &c->dio_write_bioset);
+	dio = container_of(bio, struct dio_write, op.wbio.bio);
+	dio->req		= req;
+	dio->mapping		= mapping;
+	dio->inode		= inode;
+	dio->mm			= current->mm;
+	dio->loop		= false;
+	dio->extending		= extending;
+	dio->sync		= is_sync_kiocb(req) || extending;
+	dio->flush		= iocb_is_dsync(req) && !c->opts.journal_flush_disabled;
+	dio->free_iov		= false;
+	dio->quota_res.sectors	= 0;
+	dio->written		= 0;
+	dio->iter		= *iter;
+	dio->op.c		= c;
+
+	if (unlikely(mapping->nrpages)) {
+		ret = bch2_write_invalidate_inode_pages_range(mapping,
+				req->ki_pos, req->ki_pos + iter->count - 1);
+		if (unlikely(ret))
+			goto err_put_bio;
+	}
+
+	ret = bch2_dio_write_loop(dio);
+err:
+	if (locked)
+		inode_unlock(&inode->v);
+	return ret;
+err_put_bio:
+	bch2_pagecache_block_put(inode);
+	bio_put(bio);
+	inode_dio_end(&inode->v);
+	goto err;
+}
+
+void bch2_fs_fs_io_direct_exit(struct bch_fs *c)
+{	/* Tear down the dio biosets set up by bch2_fs_fs_io_direct_init() */
+	bioset_exit(&c->dio_write_bioset);
+	bioset_exit(&c->dio_read_bioset);
+}
+
+int bch2_fs_fs_io_direct_init(struct bch_fs *c)
+{
+	/* front pad = offset of the embedded bio, so container_of() on a bio works */
+	size_t rd_pad = offsetof(struct dio_read, rbio.bio);
+	size_t wr_pad = offsetof(struct dio_write, op.wbio.bio);
+
+	if (bioset_init(&c->dio_read_bioset, 4, rd_pad, BIOSET_NEED_BVECS))
+		return -BCH_ERR_ENOMEM_dio_read_bioset_init;
+
+	if (bioset_init(&c->dio_write_bioset, 4, wr_pad, BIOSET_NEED_BVECS))
+		return -BCH_ERR_ENOMEM_dio_write_bioset_init;
+
+	return 0;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io-direct.h b/fs/bcachefs/fs-io-direct.h
new file mode 100644
index 000000000000..814621ec7f81
--- /dev/null
+++ b/fs/bcachefs/fs-io-direct.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IO_DIRECT_H
+#define _BCACHEFS_FS_IO_DIRECT_H
+
+#ifndef NO_BCACHEFS_FS
+ssize_t bch2_direct_write(struct kiocb *, struct iov_iter *);
+ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
+/* direct IO bioset setup and teardown: */
+void bch2_fs_fs_io_direct_exit(struct bch_fs *);
+int bch2_fs_fs_io_direct_init(struct bch_fs *);
+#else	/* NO_BCACHEFS_FS: no-op stubs */
+static inline void bch2_fs_fs_io_direct_exit(struct bch_fs *c) {}
+static inline int bch2_fs_fs_io_direct_init(struct bch_fs *c) { return 0; }
+#endif	/* NO_BCACHEFS_FS */
+
+#endif /* _BCACHEFS_FS_IO_DIRECT_H */
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
new file mode 100644
index 000000000000..8bd9bcdd27f7
--- /dev/null
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -0,0 +1,791 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "btree_iter.h"
+#include "extents.h"
+#include "fs-io.h"
+#include "fs-io-pagecache.h"
+#include "subvolume.h"
+
+#include <linux/pagevec.h>
+#include <linux/writeback.h>
+
+int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
+				     loff_t start, u64 end,
+				     int fgp_flags, gfp_t gfp,
+				     folios *fs)
+{	/* Push contiguous folios covering byte range [start, end) onto @fs */
+	struct folio *f;
+	u64 pos = start;
+	int ret = 0;
+
+	while (pos < end) {
+		if ((u64) pos >= (u64) start + (1ULL << 20))
+			fgp_flags &= ~FGP_CREAT;	/* past 1MiB: no new folios */
+
+		ret = darray_make_room_gfp(fs, 1, gfp & GFP_KERNEL);
+		if (ret)
+			break;
+
+		f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
+		if (IS_ERR_OR_NULL(f))
+			break;
+
+		BUG_ON(fs->nr && folio_pos(f) != pos);	/* must stay contiguous */
+
+		pos = folio_end_pos(f);
+		darray_push(fs, f);
+	}
+
+	if (!fs->nr && !ret && (fgp_flags & FGP_CREAT))
+		ret = -ENOMEM;
+
+	return fs->nr ? 0 : ret;	/* any folio in @fs counts as success */
+}
+
+/*
+ * Write back, then drop, the pagecache for [start, end] (byte offsets).
+ * pagecache_block must be held.
+ */
+int bch2_write_invalidate_inode_pages_range(struct address_space *mapping,
+					    loff_t start, loff_t end)
+{
+	pgoff_t first = start >> PAGE_SHIFT;
+	pgoff_t last = end >> PAGE_SHIFT;
+	int ret;
+
+	/*
+	 * XXX: as implemented this can spin for as long as some process keeps
+	 * redirtying a page in the range (-EBUSY simply retries)
+	 */
+	for (;;) {
+		if (!mapping->nrpages)
+			return 0;
+
+		ret = filemap_write_and_wait_range(mapping, start, end);
+		if (ret || !mapping->nrpages)
+			return ret;
+
+		ret = invalidate_inode_pages2_range(mapping, first, last);
+		if (ret != -EBUSY)
+			return ret;
+	}
+}
+
+#if 0
+/* Useful for debug tracing: */
+static const char * const bch2_folio_sector_states[] = {
+#define x(n)	#n,
+	BCH_FOLIO_SECTOR_STATE()
+#undef x
+	NULL
+};
+#endif
+
+/* State after dirtying: unallocated -> dirty, reserved -> dirty_reserved. */
+static inline enum bch_folio_sector_state
+folio_sector_dirty(enum bch_folio_sector_state state)
+{
+	if (state == SECTOR_unallocated)
+		return SECTOR_dirty;
+
+	if (state == SECTOR_reserved)
+		return SECTOR_dirty_reserved;
+
+	return state;
+}
+
+/* Inverse of dirtying: dirty -> unallocated, dirty_reserved -> reserved. */
+static inline enum bch_folio_sector_state
+folio_sector_undirty(enum bch_folio_sector_state state)
+{
+	if (state == SECTOR_dirty)
+		return SECTOR_unallocated;
+
+	if (state == SECTOR_dirty_reserved)
+		return SECTOR_reserved;
+
+	return state;
+}
+
+/* Add a reservation: unallocated -> reserved, dirty -> dirty_reserved. */
+static inline enum bch_folio_sector_state
+folio_sector_reserve(enum bch_folio_sector_state state)
+{
+	if (state == SECTOR_unallocated)
+		return SECTOR_reserved;
+
+	if (state == SECTOR_dirty)
+		return SECTOR_dirty_reserved;
+
+	return state;
+}
+
+/*
+ * For newly allocated folios: zeroed state, attached as folio private data.
+ */
+struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp)
+{
+	size_t bytes = sizeof(struct bch_folio) +
+		sizeof(struct bch_folio_sector) * folio_sectors(folio);
+	struct bch_folio *s = kzalloc(bytes, gfp);
+
+	if (s) {
+		spin_lock_init(&s->lock);
+		folio_attach_private(folio, s);
+	}
+	return s;
+}
+
+struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp)
+{	/* get-or-create: existing bch_folio state wins, else allocate it */
+	return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp);
+}
+
+/* Map an extents btree key to the folio sector state it implies. */
+static unsigned bkey_to_sector_state(struct bkey_s_c k)
+{
+	if (bkey_extent_is_reservation(k))
+		return SECTOR_reserved;
+	return bkey_extent_is_allocation(k.k)
+		? SECTOR_allocated : SECTOR_unallocated;
+}
+
+static void __bch2_folio_set(struct folio *folio,
+			     unsigned pg_offset, unsigned pg_len,
+			     unsigned nr_ptrs, unsigned state)
+{	/* Set nr_replicas and state on sectors [pg_offset, pg_offset + pg_len) */
+	struct bch_folio *s = bch2_folio(folio);
+	unsigned i, sectors = folio_sectors(folio);
+
+	BUG_ON(pg_offset >= sectors);
+	BUG_ON(pg_offset + pg_len > sectors);
+
+	spin_lock(&s->lock);
+
+	for (i = pg_offset; i < pg_offset + pg_len; i++) {
+		s->s[i].nr_replicas	= nr_ptrs;
+		bch2_folio_sector_set(folio, s, i, state);
+	}
+
+	if (i == sectors)	/* the range ran to the folio's last sector */
+		s->uptodate = true;
+
+	spin_unlock(&s->lock);
+}
+
+/*
+ * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the
+ * extents btree:
+ */
+int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
+		   struct folio **fs, unsigned nr_folios)
+{
+	struct btree_trans *trans;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bch_folio *s;
+	u64 offset = folio_sector(fs[0]);	/* in sectors */
+	unsigned folio_idx;
+	u32 snapshot;
+	bool need_set = false;
+	int ret;
+
+	for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
+		s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
+		if (!s)
+			return -ENOMEM;
+
+		need_set |= !s->uptodate;
+	}
+
+	if (!need_set)	/* every folio's state is already up to date */
+		return 0;
+
+	folio_idx = 0;
+	trans = bch2_trans_get(c);
+retry:
+	bch2_trans_begin(trans);
+
+	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+	if (ret)
+		goto err;
+
+	for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
+			   SPOS(inum.inum, offset, snapshot),
+			   BTREE_ITER_SLOTS, k, ret) {
+		unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
+		unsigned state = bkey_to_sector_state(k);
+
+		while (folio_idx < nr_folios) {
+			struct folio *folio = fs[folio_idx];
+			u64 folio_start	= folio_sector(folio);
+			u64 folio_end	= folio_end_sector(folio);
+			unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
+				folio_start;
+			unsigned folio_len = min(k.k->p.offset, folio_end) -
+				folio_offset - folio_start;
+
+			BUG_ON(k.k->p.offset < folio_start);
+			BUG_ON(bkey_start_offset(k.k) > folio_end);
+
+			if (!bch2_folio(folio)->uptodate)
+				__bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
+
+			if (k.k->p.offset < folio_end)	/* key ends inside this folio */
+				break;
+			folio_idx++;
+		}
+
+		if (folio_idx == nr_folios)
+			break;
+	}
+
+	offset = iter.pos.offset;	/* where a restarted pass resumes */
+	bch2_trans_iter_exit(trans, &iter);
+err:
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		goto retry;
+	bch2_trans_put(trans);
+
+	return ret;
+}
+
+void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
+{	/* Apply key @k's replica count and sector state to every folio in @bio */
+	struct bvec_iter iter;
+	struct folio_vec fv;
+	unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
+		? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
+	unsigned state = bkey_to_sector_state(k);
+
+	bio_for_each_folio(fv, bio, iter)
+		__bch2_folio_set(fv.fv_folio,
+				 fv.fv_offset >> 9,	/* bytes -> sectors */
+				 fv.fv_len >> 9,
+				 nr_ptrs, state);
+}
+
+void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
+				     u64 start, u64 end)
+{	/* Zero nr_replicas on cached sectors in [start, end) (units: sectors) */
+	pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+	pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
+	struct folio_batch fbatch;
+	unsigned i, j;
+
+	if (end <= start)
+		return;
+
+	folio_batch_init(&fbatch);
+
+	while (filemap_get_folios(inode->v.i_mapping,
+				  &index, end_index, &fbatch)) {
+		for (i = 0; i < folio_batch_count(&fbatch); i++) {
+			struct folio *folio = fbatch.folios[i];
+			u64 folio_start = folio_sector(folio);
+			u64 folio_end = folio_end_sector(folio);
+			unsigned folio_offset = max(start, folio_start) - folio_start;
+			unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
+			struct bch_folio *s;
+
+			BUG_ON(end <= folio_start);
+
+			folio_lock(folio);
+			s = bch2_folio(folio);
+
+			if (s) {	/* folios without bch_folio state are skipped */
+				spin_lock(&s->lock);
+				for (j = folio_offset; j < folio_offset + folio_len; j++)
+					s->s[j].nr_replicas = 0;
+				spin_unlock(&s->lock);
+			}
+
+			folio_unlock(folio);
+		}
+		folio_batch_release(&fbatch);
+		cond_resched();
+	}
+}
+
+void bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
+				  u64 start, u64 end)
+{	/* Move cached sectors in [start, end) to their reserved state (units: sectors) */
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+	pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
+	struct folio_batch fbatch;
+	s64 i_sectors_delta = 0;
+	unsigned i, j;
+
+	if (end <= start)
+		return;
+
+	folio_batch_init(&fbatch);
+
+	while (filemap_get_folios(inode->v.i_mapping,
+				  &index, end_index, &fbatch)) {
+		for (i = 0; i < folio_batch_count(&fbatch); i++) {
+			struct folio *folio = fbatch.folios[i];
+			u64 folio_start = folio_sector(folio);
+			u64 folio_end = folio_end_sector(folio);
+			unsigned folio_offset = max(start, folio_start) - folio_start;
+			unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
+			struct bch_folio *s;
+
+			BUG_ON(end <= folio_start);
+
+			folio_lock(folio);
+			s = bch2_folio(folio);
+
+			if (s) {	/* folios without bch_folio state are skipped */
+				spin_lock(&s->lock);
+				for (j = folio_offset; j < folio_offset + folio_len; j++) {
+					i_sectors_delta -= s->s[j].state == SECTOR_dirty;
+					bch2_folio_sector_set(folio, s, j,
+						folio_sector_reserve(s->s[j].state));
+				}
+				spin_unlock(&s->lock);
+			}
+
+			folio_unlock(folio);
+		}
+		folio_batch_release(&fbatch);
+		cond_resched();
+	}
+
+	bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);	/* -1 per formerly SECTOR_dirty sector */
+}
+
+/* How many more replicas @s needs reserved to reach @nr_replicas (>= 0) */
+static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, unsigned nr_replicas)
+{
+	int need = (int) nr_replicas - s->nr_replicas - s->replicas_reserved;
+
+	return need > 0 ? need : 0;
+}
+
+int bch2_get_folio_disk_reservation(struct bch_fs *c,
+				struct bch_inode_info *inode,
+				struct folio *folio, bool check_enospc)
+{	/* Reserve disk space for each sector of @folio short of the inode's replicas */
+	struct bch_folio *s = bch2_folio_create(folio, 0);
+	unsigned nr_replicas = inode_nr_replicas(c, inode);
+	struct disk_reservation disk_res = { 0 };
+	unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0;
+	int ret;
+
+	if (!s)
+		return -ENOMEM;
+
+	for (i = 0; i < sectors; i++)
+		disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
+
+	if (!disk_res_sectors)	/* already fully covered */
+		return 0;
+
+	ret = bch2_disk_reservation_get(c, &disk_res,
+					disk_res_sectors, 1,
+					!check_enospc
+					? BCH_DISK_RESERVATION_NOFAIL
+					: 0);
+	if (unlikely(ret))
+		return ret;
+
+	for (i = 0; i < sectors; i++)	/* record it in the per-sector counters */
+		s->s[i].replicas_reserved +=
+			sectors_to_reserve(&s->s[i], nr_replicas);
+
+	return 0;
+}
+
+void bch2_folio_reservation_put(struct bch_fs *c,
+			struct bch_inode_info *inode,
+			struct bch2_folio_reservation *res)
+{	/* Release the disk and quota reservations held in @res */
+	bch2_disk_reservation_put(c, &res->disk);
+	bch2_quota_reservation_put(c, inode, &res->quota);
+}
+
+int bch2_folio_reservation_get(struct bch_fs *c,
+			struct bch_inode_info *inode,
+			struct folio *folio,
+			struct bch2_folio_reservation *res,
+			unsigned offset, unsigned len)
+{	/* Add to @res the disk + quota reservation needed to dirty [offset, offset + len) */
+	struct bch_folio *s = bch2_folio_create(folio, 0);
+	unsigned i, disk_sectors = 0, quota_sectors = 0;
+	int ret;
+
+	if (!s)
+		return -ENOMEM;
+
+	BUG_ON(!s->uptodate);
+
+	for (i = round_down(offset, block_bytes(c)) >> 9;	/* block-aligned bytes -> sectors */
+	     i < round_up(offset + len, block_bytes(c)) >> 9;
+	     i++) {
+		disk_sectors += sectors_to_reserve(&s->s[i],
+						res->disk.nr_replicas);
+		quota_sectors += s->s[i].state == SECTOR_unallocated;
+	}
+
+	if (disk_sectors) {
+		ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0);
+		if (unlikely(ret))
+			return ret;
+	}
+
+	if (quota_sectors) {
+		ret = bch2_quota_reservation_add(c, inode, &res->quota,
+						 quota_sectors, true);
+		if (unlikely(ret)) {	/* undo the disk reservation added above */
+			struct disk_reservation tmp = {
+				.sectors = disk_sectors
+			};
+
+			bch2_disk_reservation_put(c, &tmp);
+			res->disk.sectors -= disk_sectors;
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static void bch2_clear_folio_bits(struct folio *folio)
+{	/* Drop reservations and dirty accounting for @folio, then free its state */
+	struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_folio *s = bch2_folio(folio);
+	struct disk_reservation disk_res = { 0 };
+	int i, sectors = folio_sectors(folio), dirty_sectors = 0;
+
+	if (!s)
+		return;
+
+	EBUG_ON(!folio_test_locked(folio));
+	EBUG_ON(folio_test_writeback(folio));
+
+	for (i = 0; i < sectors; i++) {
+		disk_res.sectors += s->s[i].replicas_reserved;	/* collected, released below */
+		s->s[i].replicas_reserved = 0;
+
+		dirty_sectors -= s->s[i].state == SECTOR_dirty;
+		bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state));
+	}
+
+	bch2_disk_reservation_put(c, &disk_res);
+
+	bch2_i_sectors_acct(c, inode, NULL, dirty_sectors);	/* delta is <= 0 */
+
+	bch2_folio_release(folio);
+}
+
+void bch2_set_folio_dirty(struct bch_fs *c,
+			  struct bch_inode_info *inode,
+			  struct folio *folio,
+			  struct bch2_folio_reservation *res,
+			  unsigned offset, unsigned len)
+{	/* Mark [offset, offset + len) of @folio dirty, consuming reservation from @res */
+	struct bch_folio *s = bch2_folio(folio);
+	unsigned i, dirty_sectors = 0;
+
+	WARN_ON((u64) folio_pos(folio) + offset + len >
+		round_up((u64) i_size_read(&inode->v), block_bytes(c)));
+
+	BUG_ON(!s->uptodate);
+
+	spin_lock(&s->lock);
+
+	for (i = round_down(offset, block_bytes(c)) >> 9;	/* block-aligned bytes -> sectors */
+	     i < round_up(offset + len, block_bytes(c)) >> 9;
+	     i++) {
+		unsigned sectors = sectors_to_reserve(&s->s[i],
+						res->disk.nr_replicas);
+
+		/*
+		 * This can happen if we race with the error path in
+		 * bch2_writepage_io_done():
+		 */
+		sectors = min_t(unsigned, sectors, res->disk.sectors);
+
+		s->s[i].replicas_reserved += sectors;	/* moved from @res to the sector */
+		res->disk.sectors -= sectors;
+
+		dirty_sectors += s->s[i].state == SECTOR_unallocated;
+
+		bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state));
+	}
+
+	spin_unlock(&s->lock);
+
+	bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors);
+
+	if (!folio_test_dirty(folio))
+		filemap_dirty_folio(inode->v.i_mapping, folio);
+}
+
+vm_fault_t bch2_page_fault(struct vm_fault *vmf)
+{	/* Page fault handler: filemap_fault() under the pagecache add lock */
+	struct file *file = vmf->vma->vm_file;
+	struct address_space *mapping = file->f_mapping;
+	struct address_space *fdm = faults_disabled_mapping();
+	struct bch_inode_info *inode = file_bch_inode(file);
+	vm_fault_t ret;
+
+	if (fdm == mapping)	/* fault on the mapping this task disabled faults for */
+		return VM_FAULT_SIGBUS;
+
+	/* Lock ordering: */
+	if (fdm > mapping) {
+		struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
+
+		if (bch2_pagecache_add_tryget(inode))
+			goto got_lock;
+
+		bch2_pagecache_block_put(fdm_host);	/* back off: drop, wait, retake */
+
+		bch2_pagecache_add_get(inode);
+		bch2_pagecache_add_put(inode);
+
+		bch2_pagecache_block_get(fdm_host);
+
+		/* Signal that lock has been dropped: */
+		set_fdm_dropped_locks();
+		return VM_FAULT_SIGBUS;
+	}
+
+	bch2_pagecache_add_get(inode);
+got_lock:
+	ret = filemap_fault(vmf);
+	bch2_pagecache_add_put(inode);
+
+	return ret;
+}
+
+vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
+{	/* Write fault on a mapped folio: reserve space for it and mark it dirty */
+	struct folio *folio = page_folio(vmf->page);
+	struct file *file = vmf->vma->vm_file;
+	struct bch_inode_info *inode = file_bch_inode(file);
+	struct address_space *mapping = file->f_mapping;
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch2_folio_reservation res;
+	unsigned len;
+	loff_t isize;
+	vm_fault_t ret;
+
+	bch2_folio_reservation_init(c, inode, &res);
+
+	sb_start_pagefault(inode->v.i_sb);
+	file_update_time(file);
+
+	/*
+	 * Not strictly necessary, but helps avoid dio writes livelocking in
+	 * bch2_write_invalidate_inode_pages_range() - can drop this if/when we get
+	 * a bch2_write_invalidate_inode_pages_range() that works without dropping
+	 * page lock before invalidating page
+	 */
+	bch2_pagecache_add_get(inode);
+
+	folio_lock(folio);
+	isize = i_size_read(&inode->v);
+
+	if (folio->mapping != mapping || folio_pos(folio) >= isize) {
+		folio_unlock(folio);	/* folio left the mapping or lies past i_size */
+		ret = VM_FAULT_NOPAGE;
+		goto out;
+	}
+
+	len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio));	/* stop at i_size */
+
+	if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
+	    bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) {
+		folio_unlock(folio);
+		ret = VM_FAULT_SIGBUS;
+		goto out;
+	}
+
+	bch2_set_folio_dirty(c, inode, folio, &res, 0, len);
+	bch2_folio_reservation_put(c, inode, &res);
+
+	folio_wait_stable(folio);
+	ret = VM_FAULT_LOCKED;	/* folio is returned still locked */
+out:
+	bch2_pagecache_add_put(inode);
+	sb_end_pagefault(inode->v.i_sb);
+
+	return ret;
+}
+
+/* Partial invalidations are ignored; a whole-folio one drops our state. */
+void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
+{
+	/* !(offset || length < size)  <=>  the range covers the entire folio */
+	if (!offset && length >= folio_size(folio))
+		bch2_clear_folio_bits(folio);
+}
+
+bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
+{
+	if (folio_test_dirty(folio) || folio_test_writeback(folio))
+		return false;
+	/* clean and idle: drop reservations and free the bch_folio state */
+	bch2_clear_folio_bits(folio);
+	return true;
+}
+
+/* fseek: */
+
+/* Byte offset in @folio of the first data sector at or after @pos, else -1 */
+static int folio_data_offset(struct folio *folio, loff_t pos, unsigned min_replicas)
+{
+	struct bch_folio *st = bch2_folio(folio);
+	unsigned idx, nr = folio_sectors(folio);
+
+	if (!st)
+		return -1;
+	for (idx = folio_pos_to_s(folio, pos); idx < nr; idx++)
+		if (st->s[idx].state >= SECTOR_dirty &&
+		    st->s[idx].nr_replicas + st->s[idx].replicas_reserved >= min_replicas)
+			return idx << SECTOR_SHIFT;
+	return -1;
+}
+
+loff_t bch2_seek_pagecache_data(struct inode *vinode,
+				loff_t start_offset,
+				loff_t end_offset,
+				unsigned min_replicas,
+				bool nonblock)
+{	/* First byte offset in the range holding pagecache data, else end_offset */
+	struct folio_batch fbatch;
+	pgoff_t start_index	= start_offset >> PAGE_SHIFT;
+	pgoff_t end_index	= end_offset >> PAGE_SHIFT;
+	pgoff_t index		= start_index;
+	unsigned i;
+	loff_t ret;
+	int offset;
+
+	folio_batch_init(&fbatch);
+
+	while (filemap_get_folios(vinode->i_mapping,
+				  &index, end_index, &fbatch)) {
+		for (i = 0; i < folio_batch_count(&fbatch); i++) {
+			struct folio *folio = fbatch.folios[i];
+
+			if (!nonblock) {
+				folio_lock(folio);
+			} else if (!folio_trylock(folio)) {
+				folio_batch_release(&fbatch);
+				return -EAGAIN;	/* nonblock: lock is contended */
+			}
+
+			offset = folio_data_offset(folio,
+					max(folio_pos(folio), start_offset),
+					min_replicas);
+			if (offset >= 0) {	/* found data in this folio */
+				ret = clamp(folio_pos(folio) + offset,
+					    start_offset, end_offset);
+				folio_unlock(folio);
+				folio_batch_release(&fbatch);
+				return ret;
+			}
+			folio_unlock(folio);
+		}
+		folio_batch_release(&fbatch);
+		cond_resched();
+	}
+
+	return end_offset;
+}
+
+/*
+ * Search for a hole in a folio.
+ *
+ * The filemap layer returns -ENOENT if no folio exists, so reuse the same error
+ * code to indicate a pagecache hole exists at the returned offset. Otherwise
+ * return 0 if the folio is filled with data, or an error code. This function
+ * can return -EAGAIN if nonblock is specified.
+ */
+static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
+			      unsigned min_replicas, bool nonblock)
+{
+	struct folio *folio;
+	struct bch_folio *s;
+	unsigned i, sectors;
+	int ret = -ENOENT;
+
+	folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
+				    FGP_LOCK|(nonblock ? FGP_NOWAIT : 0), 0);
+	if (IS_ERR(folio))
+		return PTR_ERR(folio);
+
+	s = bch2_folio(folio);
+	if (!s)		/* no sector state: hole at *offset (-ENOENT) */
+		goto unlock;
+
+	sectors = folio_sectors(folio);
+	for (i = folio_pos_to_s(folio, *offset); i < sectors; i++)
+		if (s->s[i].state < SECTOR_dirty ||
+		    s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) {
+			*offset = max(*offset,
+				      folio_pos(folio) + (i << SECTOR_SHIFT));
+			goto unlock;	/* hole found, ret is still -ENOENT */
+		}
+
+	*offset = folio_end_pos(folio);	/* all data: continue after this folio */
+	ret = 0;
+unlock:
+	folio_unlock(folio);
+	folio_put(folio);
+	return ret;
+}
+
+loff_t bch2_seek_pagecache_hole(struct inode *vinode,
+				loff_t start_offset,
+				loff_t end_offset,
+				unsigned min_replicas,
+				bool nonblock)
+{	/* First byte offset in the range that is a pagecache hole, else end_offset */
+	struct address_space *mapping = vinode->i_mapping;
+	loff_t offset = start_offset;
+	loff_t ret = 0;
+
+	while (!ret && offset < end_offset)	/* ret == 0: folio was all data */
+		ret = folio_hole_offset(mapping, &offset, min_replicas, nonblock);
+
+	if (ret && ret != -ENOENT)	/* -ENOENT means "hole found", not failure */
+		return ret;
+	return min(offset, end_offset);
+}
+
+int bch2_clamp_data_hole(struct inode *inode,
+			 u64 *hole_start,
+			 u64 *hole_end,
+			 unsigned min_replicas,
+			 bool nonblock)
+{	/* Clamp sector range [*hole_start, *hole_end) to its first pagecache hole */
+	loff_t ret;
+
+	ret = bch2_seek_pagecache_hole(inode,
+		*hole_start << 9, *hole_end << 9, min_replicas, nonblock);
+	if (ret < 0)	/* tested before >> 9, which would turn -EAGAIN into -1 */
+		return ret;
+
+	*hole_start = ret >> 9;	/* bytes -> sectors */
+
+	if (*hole_start == *hole_end)
+		return 0;
+
+	ret = bch2_seek_pagecache_data(inode,
+		*hole_start << 9, *hole_end << 9, min_replicas, nonblock);
+	if (ret < 0)
+		return ret;
+
+	*hole_end = ret >> 9;
+	return 0;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h
new file mode 100644
index 000000000000..a2222ad586e9
--- /dev/null
+++ b/fs/bcachefs/fs-io-pagecache.h
@@ -0,0 +1,176 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IO_PAGECACHE_H
+#define _BCACHEFS_FS_IO_PAGECACHE_H
+
+#include <linux/pagemap.h>
+
+typedef DARRAY(struct folio *) folios;
+
+int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t,
+				     u64, int, gfp_t, folios *);
+int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t);
+
+/*
+ * Use u64 for the end pos and sector helpers because if the folio covers the
+ * max supported range of the mapping, the start offset of the next folio
+ * overflows loff_t. This breaks much of the range based processing in the
+ * buffered write path.
+ */
+static inline u64 folio_end_pos(struct folio *folio)
+{	/* Byte offset one past the end of @folio (u64, see comment above) */
+	return folio_pos(folio) + folio_size(folio);
+}
+
+static inline size_t folio_sectors(struct folio *folio)
+{	/* Number of sectors spanned by @folio: PAGE_SECTORS scaled by its order */
+	return PAGE_SECTORS << folio_order(folio);
+}
+
+static inline loff_t folio_sector(struct folio *folio)
+{	/* File offset of @folio's start, in 512-byte sectors */
+	return folio_pos(folio) >> 9;
+}
+
+static inline u64 folio_end_sector(struct folio *folio)
+{	/* File offset one past @folio's end, in 512-byte sectors */
+	return folio_end_pos(folio) >> 9;
+}
+
+#define BCH_FOLIO_SECTOR_STATE()	\
+	x(unallocated)			\
+	x(reserved)			\
+	x(dirty)			\
+	x(dirty_reserved)		\
+	x(allocated)
+
+enum bch_folio_sector_state {	/* order matters: callers compare against SECTOR_dirty */
+#define x(n)	SECTOR_##n,
+	BCH_FOLIO_SECTOR_STATE()
+#undef x
+};
+
+struct bch_folio_sector {	/* one entry of bch_folio.s[], i.e. per-sector state */
+	/* Uncompressed, fully allocated replicas (or on disk reservation): */
+	unsigned		nr_replicas:4;
+
+	/* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */
+	unsigned		replicas_reserved:4;
+
+	/* i_sectors: */
+	enum bch_folio_sector_state state:8;
+};
+
+struct bch_folio {	/* filesystem state attached to a folio as its private data */
+	spinlock_t		lock;	/* taken around updates to s[] and uptodate */
+	atomic_t		write_count;	/* NOTE(review): users not visible here */
+	/*
+	 * Is the sector state up to date with the btree?
+	 * (Not the data itself)
+	 */
+	bool			uptodate;
+	struct bch_folio_sector	s[];	/* folio_sectors(folio) entries */
+};
+
+/* Helper for when we need to add debug instrumentation: */
+static inline void bch2_folio_sector_set(struct folio *folio,
+			     struct bch_folio *s,
+			     unsigned i, unsigned n)
+{
+	s->s[i].state = n;	/* @folio is unused; sector @i takes state @n */
+}
+
+/* file offset (to folio offset) to bch_folio_sector index */
+static inline int folio_pos_to_s(struct folio *folio, loff_t pos)
+{
+	u64 f_offset = pos - folio_pos(folio);	/* byte offset within the folio */
+
+	BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio));	/* @pos must lie inside */
+	return f_offset >> SECTOR_SHIFT;
+}
+
+/* for newly allocated folios: */
+static inline void __bch2_folio_release(struct folio *folio)
+{
+	kfree(folio_detach_private(folio));	/* detach and free the bch_folio state */
+}
+
+static inline void bch2_folio_release(struct folio *folio)
+{
+	EBUG_ON(!folio_test_locked(folio));	/* folio is expected to be locked */
+	__bch2_folio_release(folio);
+}
+
+static inline struct bch_folio *__bch2_folio(struct folio *folio)
+{	/* The bch_folio attached to @folio, or NULL when it has no private data */
+	return folio_has_private(folio)
+		? (struct bch_folio *) folio_get_private(folio)
+		: NULL;
+}
+
+static inline struct bch_folio *bch2_folio(struct folio *folio)
+{	/* As __bch2_folio(), with a debug check that the folio is locked */
+	EBUG_ON(!folio_test_locked(folio));
+
+	return __bch2_folio(folio);
+}
+
+struct bch_folio *__bch2_folio_create(struct folio *, gfp_t);
+struct bch_folio *bch2_folio_create(struct folio *, gfp_t);
+
+struct bch2_folio_reservation {	/* disk + quota reservation taken before dirtying a folio */
+	struct disk_reservation	disk;
+	struct quota_res	quota;
+};
+
+static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
+{
+	/* XXX: this should not be open coded */
+	return inode->ei_inode.bi_data_replicas	/* presumably stored +1, with 0 = unset */
+		? inode->ei_inode.bi_data_replicas - 1
+		: c->opts.data_replicas;
+}
+
+static inline void bch2_folio_reservation_init(struct bch_fs *c,
+			struct bch_inode_info *inode,
+			struct bch2_folio_reservation *res)
+{	/* Zero @res, then set its replica count from the inode's options */
+	memset(res, 0, sizeof(*res));
+
+	res->disk.nr_replicas = inode_nr_replicas(c, inode);
+}
+
+int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned);
+void bch2_bio_page_state_set(struct bio *, struct bkey_s_c);
+
+void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64);
+void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64);
+
+int bch2_get_folio_disk_reservation(struct bch_fs *,
+				struct bch_inode_info *,
+				struct folio *, bool);
+
+void bch2_folio_reservation_put(struct bch_fs *,
+			struct bch_inode_info *,
+			struct bch2_folio_reservation *);
+int bch2_folio_reservation_get(struct bch_fs *,
+			struct bch_inode_info *,
+			struct folio *,
+			struct bch2_folio_reservation *,
+			unsigned, unsigned);
+
+void bch2_set_folio_dirty(struct bch_fs *,
+			  struct bch_inode_info *,
+			  struct folio *,
+			  struct bch2_folio_reservation *,
+			  unsigned, unsigned);
+
+vm_fault_t bch2_page_fault(struct vm_fault *);
+vm_fault_t bch2_page_mkwrite(struct vm_fault *);
+void bch2_invalidate_folio(struct folio *, size_t, size_t);
+bool bch2_release_folio(struct folio *, gfp_t);
+
+loff_t bch2_seek_pagecache_data(struct inode *, loff_t, loff_t, unsigned, bool);
+loff_t bch2_seek_pagecache_hole(struct inode *, loff_t, loff_t, unsigned, bool);
+int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool);
+
+#endif /* _BCACHEFS_FS_IO_PAGECACHE_H */
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
new file mode 100644
index 000000000000..b0e8144ec550
--- /dev/null
+++ b/fs/bcachefs/fs-io.c
@@ -0,0 +1,1072 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "clock.h"
+#include "error.h"
+#include "extents.h"
+#include "extent_update.h"
+#include "fs.h"
+#include "fs-io.h"
+#include "fs-io-buffered.h"
+#include "fs-io-pagecache.h"
+#include "fsck.h"
+#include "inode.h"
+#include "journal.h"
+#include "io_misc.h"
+#include "keylist.h"
+#include "quota.h"
+#include "reflink.h"
+#include "trace.h"
+
+#include <linux/aio.h>
+#include <linux/backing-dev.h>
+#include <linux/falloc.h>
+#include <linux/migrate.h>
+#include <linux/mmu_context.h>
+#include <linux/pagevec.h>
+#include <linux/rmap.h>
+#include <linux/sched/signal.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/uio.h>
+
+#include <trace/events/writeback.h>
+
+struct nocow_flush {
+	struct closure	*cl;
+	struct bch_dev	*ca;
+	struct bio	bio;
+};
+
+static void nocow_flush_endio(struct bio *_bio)
+{
+	/* The bio is embedded in a struct nocow_flush; get back to the container */
+	struct nocow_flush *flush = container_of(_bio, struct nocow_flush, bio);
+
+	closure_put(flush->cl);
+	percpu_ref_put(&flush->ca->io_ref);
+	bio_put(&flush->bio);
+}
+
+void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
+					 struct bch_inode_info *inode,
+					 struct closure *cl)
+{
+	struct bch_devs_mask pending;
+	struct nocow_flush *flush;
+	struct bch_dev *ca;
+	unsigned idx;
+
+	idx = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX);
+	if (idx == BCH_SB_MEMBERS_MAX)
+		return;	/* no device is flagged */
+
+	pending = inode->ei_devs_need_flush;
+	memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
+
+	for_each_set_bit(idx, pending.d, BCH_SB_MEMBERS_MAX) {
+		rcu_read_lock();
+		ca = rcu_dereference(c->devs[idx]);
+		if (ca && !percpu_ref_tryget(&ca->io_ref))
+			ca = NULL;
+		rcu_read_unlock();
+
+		if (!ca)
+			continue;
+
+		/* Zero-vec REQ_OP_FLUSH bio per device; nocow_flush_endio drops io_ref */
+		flush = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
+						      REQ_OP_FLUSH, GFP_KERNEL,
+						      &c->nocow_flush_bioset),
+				     struct nocow_flush, bio);
+		flush->cl		= cl;
+		flush->ca		= ca;
+		flush->bio.bi_end_io	= nocow_flush_endio;
+		closure_bio_submit(&flush->bio, cl);
+	}
+}
+
+static int bch2_inode_flush_nocow_writes(struct bch_fs *c,
+					 struct bch_inode_info *inode)
+{
+	struct closure cl;
+
+	closure_init_stack(&cl);
+	bch2_inode_flush_nocow_writes_async(c, inode, &cl);
+	closure_sync(&cl);
+
+	return 0;
+}
+
+/* i_size updates: */
+
+struct inode_new_size {
+	loff_t		new_size;
+	u64		now;
+	unsigned	fields;
+};
+
+static int inode_set_size(struct btree_trans *trans,
+			  struct bch_inode_info *inode,
+			  struct bch_inode_unpacked *bi,
+			  void *p)
+{
+	const struct inode_new_size *req = p;
+
+	/* Size is always written; each timestamp only if its ATTR_* bit is set */
+	bi->bi_size = req->new_size;
+	if (req->fields & ATTR_ATIME)
+		bi->bi_atime = req->now;
+	if (req->fields & ATTR_MTIME)
+		bi->bi_mtime = req->now;
+	if (req->fields & ATTR_CTIME)
+		bi->bi_ctime = req->now;
+	return 0;
+}
+
+int __must_check bch2_write_inode_size(struct bch_fs *c,
+				       struct bch_inode_info *inode,
+				       loff_t new_size, unsigned fields)
+{
+	struct inode_new_size s = {
+		.new_size	= new_size,
+		.now		= bch2_current_time(c),
+		.fields		= fields,
+	};
+
+	return bch2_write_inode(c, inode, inode_set_size, &s, fields);
+}
+
+void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
+			   struct quota_res *quota_res, s64 sectors)
+{
+	bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
+				"inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
+				inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
+				inode->ei_inode.bi_sectors);
+	inode->v.i_blocks += sectors;
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+	if (quota_res &&
+	    !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) &&
+	    sectors > 0) {
+		BUG_ON(sectors > quota_res->sectors);
+		BUG_ON(sectors > inode->ei_quota_reserved);
+
+		quota_res->sectors -= sectors;
+		inode->ei_quota_reserved -= sectors;
+	} else {
+		bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
+	}
+#endif
+}
+
+/* fsync: */
+
+/*
+ * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an
+ * insert trigger: look up the btree inode instead
+ */
+static int bch2_flush_inode(struct bch_fs *c,
+			    struct bch_inode_info *inode)
+{
+	struct bch_inode_unpacked btree_inode;
+	int ret;
+
+	if (c->opts.journal_flush_disabled)
+		return 0;
+
+	ret = bch2_inode_find_by_inum(c, inode_inum(inode), &btree_inode);
+	if (!ret)
+		ret = bch2_journal_flush_seq(&c->journal, btree_inode.bi_journal_seq);
+	if (!ret)
+		ret = bch2_inode_flush_nocow_writes(c, inode);
+	return ret;
+}
+
+int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	struct bch_inode_info *inode = file_bch_inode(file);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+	/* All three steps always run, in order; the first non-zero result wins */
+	int data_err = file_write_and_wait_range(file, start, end);
+	int meta_err = sync_inode_metadata(&inode->v, 1);
+	int flush_err = bch2_flush_inode(c, inode);
+
+	return bch2_err_class(data_err ?: meta_err ?: flush_err);
+}
+
+/* truncate: */
+
+static inline int range_has_data(struct bch_fs *c, u32 subvol,
+				 struct bpos start,
+				 struct bpos end)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret = 0;
+retry:
+	bch2_trans_begin(trans);
+
+	ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot);
+	if (ret)
+		goto err;
+
+	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret)
+		if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) {
+			ret = 1;
+			break;
+		}
+	start = iter.pos;
+	bch2_trans_iter_exit(trans, &iter);
+err:
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		goto retry;
+
+	bch2_trans_put(trans);
+	return ret;
+}
+
+static int __bch2_truncate_folio(struct bch_inode_info *inode,
+				 pgoff_t index, loff_t start, loff_t end)
+{
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct address_space *mapping = inode->v.i_mapping;
+	struct bch_folio *s;
+	unsigned start_offset;
+	unsigned end_offset;
+	unsigned i;
+	struct folio *folio;
+	s64 i_sectors_delta = 0;
+	int ret = 0;
+	u64 end_pos;
+
+	folio = filemap_lock_folio(mapping, index);
+	if (IS_ERR_OR_NULL(folio)) {
+		/*
+		 * XXX: we're doing two index lookups when we end up reading the
+		 * folio
+		 */
+		ret = range_has_data(c, inode->ei_subvol,
+				POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)),
+				POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS));
+		if (ret <= 0)
+			return ret;
+
+		folio = __filemap_get_folio(mapping, index,
+					    FGP_LOCK|FGP_CREAT, GFP_KERNEL);
+		if (IS_ERR_OR_NULL(folio)) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	BUG_ON(start	>= folio_end_pos(folio));
+	BUG_ON(end	<= folio_pos(folio));
+
+	start_offset	= max(start, folio_pos(folio)) - folio_pos(folio);
+	end_offset	= min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio);
+
+	/* Folio boundary? Nothing to do */
+	if (start_offset == 0 &&
+	    end_offset == folio_size(folio)) {
+		ret = 0;
+		goto unlock;
+	}
+
+	s = bch2_folio_create(folio, 0);
+	if (!s) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	if (!folio_test_uptodate(folio)) {
+		ret = bch2_read_single_folio(folio, mapping);
+		if (ret)
+			goto unlock;
+	}
+
+	ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
+	if (ret)
+		goto unlock;
+
+	for (i = round_up(start_offset, block_bytes(c)) >> 9;
+	     i < round_down(end_offset, block_bytes(c)) >> 9;
+	     i++) {
+		s->s[i].nr_replicas	= 0;
+
+		i_sectors_delta -= s->s[i].state == SECTOR_dirty;
+		bch2_folio_sector_set(folio, s, i, SECTOR_unallocated);
+	}
+
+	bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+
+	/*
+	 * Caller needs to know whether this folio will be written out by
+	 * writeback - doing an i_size update if necessary - or whether it will
+	 * be responsible for the i_size update.
+	 *
+	 * Note that we shouldn't ever see a folio beyond EOF, but check and
+	 * warn if so. This has been observed by failure to clean up folios
+	 * after a short write and there's still a chance reclaim will fix
+	 * things up.
+	 */
+	WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size);
+	end_pos = folio_end_pos(folio);
+	if (inode->v.i_size > folio_pos(folio))
+		end_pos = min_t(u64, inode->v.i_size, end_pos);
+	ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty;
+
+	folio_zero_segment(folio, start_offset, end_offset);
+
+	/*
+	 * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
+	 *
+	 * XXX: because we aren't currently tracking whether the folio has actual
+	 * data in it (vs. just 0s, or only partially written) this is wrong. ick.
+	 */
+	BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false));
+
+	/*
+	 * This removes any writeable userspace mappings; we need to force
+	 * .page_mkwrite to be called again before any mmapped writes, to
+	 * redirty the full page:
+	 */
+	folio_mkclean(folio);
+	filemap_dirty_folio(mapping, folio);
+unlock:
+	folio_unlock(folio);
+	folio_put(folio);
+out:
+	return ret;
+}
+
+static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from)
+{
+	return __bch2_truncate_folio(inode, from >> PAGE_SHIFT,
+				     from, ANYSINT_MAX(loff_t));
+}
+
+static int bch2_truncate_folios(struct bch_inode_info *inode,
+				loff_t start, loff_t end)
+{
+	loff_t first_idx = start >> PAGE_SHIFT;
+	int ret = __bch2_truncate_folio(inode, first_idx, start, end);
+
+	/* Error on the first folio, or @end's page index equals @start's: done */
+	if (ret < 0 || first_idx == end >> PAGE_SHIFT)
+		return ret;
+
+	/* Otherwise also handle the folio holding the last byte, @end - 1 */
+	return __bch2_truncate_folio(inode, (end - 1) >> PAGE_SHIFT, start, end);
+}
+
+static int bch2_extend(struct mnt_idmap *idmap,
+		       struct bch_inode_info *inode,
+		       struct bch_inode_unpacked *inode_u,
+		       struct iattr *iattr)
+{
+	struct address_space *mapping = inode->v.i_mapping;
+	int ret;
+
+	/*
+	 * sync appends:
+	 *
+	 * this has to be done _before_ extending i_size:
+	 */
+	ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX);
+	if (ret)
+		return ret;
+
+	truncate_setsize(&inode->v, iattr->ia_size);
+
+	return bch2_setattr_nonsize(idmap, inode, iattr);
+}
+
+int bchfs_truncate(struct mnt_idmap *idmap,
+		  struct bch_inode_info *inode, struct iattr *iattr)
+{
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct address_space *mapping = inode->v.i_mapping;
+	struct bch_inode_unpacked inode_u;
+	s64 i_sectors_delta = 0;
+	int ret = 0;
+
+	/*
+	 * If the truncate call will change the size of the file, the
+	 * cmtimes should be updated. If the size will not change, we
+	 * do not need to update the cmtimes.
+	 */
+	if (iattr->ia_size != inode->v.i_size) {
+		if (!(iattr->ia_valid & ATTR_MTIME))
+			ktime_get_coarse_real_ts64(&iattr->ia_mtime);
+		if (!(iattr->ia_valid & ATTR_CTIME))
+			ktime_get_coarse_real_ts64(&iattr->ia_ctime);
+		iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME;
+	}
+
+	inode_dio_wait(&inode->v);
+	bch2_pagecache_block_get(inode);
+
+	ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
+	if (ret)
+		goto err;
+
+	/*
+	 * check this before next assertion; on filesystem error our normal
+	 * invariants are a bit broken (truncate has to truncate the page cache
+	 * before the inode).
+	 */
+	ret = bch2_journal_error(&c->journal);
+	if (ret)
+		goto err;
+
+	WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
+		  inode->v.i_size < inode_u.bi_size,
+		  "truncate spotted in mem i_size < btree i_size: %llu < %llu\n",
+		  (u64) inode->v.i_size, inode_u.bi_size);
+
+	if (iattr->ia_size > inode->v.i_size) {
+		ret = bch2_extend(idmap, inode, &inode_u, iattr);
+		goto err;
+	}
+
+	iattr->ia_valid &= ~ATTR_SIZE;
+
+	ret = bch2_truncate_folio(inode, iattr->ia_size);
+	if (unlikely(ret < 0))
+		goto err;
+
+	truncate_setsize(&inode->v, iattr->ia_size);
+
+	/*
+	 * When extending, we're going to write the new i_size to disk
+	 * immediately so we need to flush anything above the current on disk
+	 * i_size first:
+	 *
+	 * Also, when extending we need to flush the page that i_size currently
+	 * straddles - if it's mapped to userspace, we need to ensure that
+	 * userspace has to redirty it and call .mkwrite -> set_page_dirty
+	 * again to allocate the part of the page that was extended.
+	 */
+	if (iattr->ia_size > inode_u.bi_size)
+		ret = filemap_write_and_wait_range(mapping,
+				inode_u.bi_size,
+				iattr->ia_size - 1);
+	else if (iattr->ia_size & (PAGE_SIZE - 1))
+		ret = filemap_write_and_wait_range(mapping,
+				round_down(iattr->ia_size, PAGE_SIZE),
+				iattr->ia_size - 1);
+	if (ret)
+		goto err;
+
+	ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta);
+	bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+
+	if (unlikely(ret)) {
+		/*
+		 * If we error here, VFS caches are now inconsistent with btree
+		 */
+		set_bit(EI_INODE_ERROR, &inode->ei_flags);
+		goto err;
+	}
+
+	bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks &&
+				!bch2_journal_error(&c->journal), c,
+				"inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
+				inode->v.i_ino, (u64) inode->v.i_blocks,
+				inode->ei_inode.bi_sectors);
+
+	ret = bch2_setattr_nonsize(idmap, inode, iattr);
+err:
+	bch2_pagecache_block_put(inode);
+	return bch2_err_class(ret);
+}
+
+/* fallocate: */
+
+static int inode_update_times_fn(struct btree_trans *trans,
+				 struct bch_inode_info *inode,
+				 struct bch_inode_unpacked *bi, void *p)
+{
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+	bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
+	return 0;
+}
+
+static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
+{
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	u64 end		= offset + len;
+	u64 block_start	= round_up(offset, block_bytes(c));
+	u64 block_end	= round_down(end, block_bytes(c));
+	bool truncated_last_page;
+	int ret = 0, punch_ret = 0;
+
+	ret = bch2_truncate_folios(inode, offset, end);
+	if (unlikely(ret < 0))
+		goto err;
+
+	truncated_last_page = ret;
+
+	truncate_pagecache_range(&inode->v, offset, end - 1);
+
+	if (block_start < block_end) {
+		s64 i_sectors_delta = 0;
+
+		punch_ret = bch2_fpunch(c, inode_inum(inode),
+					block_start >> 9, block_end >> 9,
+					&i_sectors_delta);
+		bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+	}
+	/* The inode update below reuses ret; a punch error is kept in punch_ret */
+	mutex_lock(&inode->ei_update_lock);
+	if (end >= inode->v.i_size && !truncated_last_page) {
+		ret = bch2_write_inode_size(c, inode, inode->v.i_size,
+					    ATTR_MTIME|ATTR_CTIME);
+	} else {
+		ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+				       ATTR_MTIME|ATTR_CTIME);
+	}
+	mutex_unlock(&inode->ei_update_lock);
+err:
+	return punch_ret ?: ret;	/* earliest failure wins */
+}
+
+static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
+				   loff_t offset, loff_t len,
+				   bool insert)
+{
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct address_space *mapping = inode->v.i_mapping;
+	s64 i_sectors_delta = 0;
+	int ret = 0;
+
+	if ((offset | len) & (block_bytes(c) - 1))
+		return -EINVAL;	/* both must be block aligned */
+
+	if (insert) {
+		if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len)
+			return -EFBIG;	/* i_size + len would pass s_maxbytes */
+		if (offset >= inode->v.i_size)
+			return -EINVAL;
+	} else if (offset + len >= inode->v.i_size)
+		return -EINVAL;
+
+	ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
+	if (ret)
+		return ret;
+
+	if (insert)
+		i_size_write(&inode->v, inode->v.i_size + len);
+
+	ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9,
+				     insert, &i_sectors_delta);
+	if (!ret && !insert)
+		i_size_write(&inode->v, inode->v.i_size - len);
+	bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+
+	return ret;
+}
+
+static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
+			     u64 start_sector, u64 end_sector)
+{
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bpos end_pos = POS(inode->v.i_ino, end_sector);
+	struct bch_io_opts opts;
+	int ret = 0;
+
+	bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+			POS(inode->v.i_ino, start_sector),
+			BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+	while (!ret && bkey_lt(iter.pos, end_pos)) {
+		s64 i_sectors_delta = 0;
+		struct quota_res quota_res = { 0 };
+		struct bkey_s_c k;
+		unsigned sectors;
+		bool is_allocation;
+		u64 hole_start, hole_end;
+		u32 snapshot;
+
+		bch2_trans_begin(trans);
+
+		ret = bch2_subvolume_get_snapshot(trans,
+					inode->ei_subvol, &snapshot);
+		if (ret)
+			goto bkey_err;
+
+		bch2_btree_iter_set_snapshot(&iter, snapshot);
+
+		k = bch2_btree_iter_peek_slot(&iter);
+		if ((ret = bkey_err(k)))
+			goto bkey_err;
+
+		hole_start	= iter.pos.offset;
+		hole_end	= bpos_min(k.k->p, end_pos).offset;
+		is_allocation	= bkey_extent_is_allocation(k.k);
+
+		/* already reserved */
+		if (bkey_extent_is_reservation(k) &&
+		    bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) {
+			bch2_btree_iter_advance(&iter);
+			continue;
+		}
+
+		if (bkey_extent_is_data(k.k) &&
+		    !(mode & FALLOC_FL_ZERO_RANGE)) {
+			bch2_btree_iter_advance(&iter);
+			continue;
+		}
+
+		if (!(mode & FALLOC_FL_ZERO_RANGE)) {
+			/*
+			 * Lock ordering - can't be holding btree locks while
+			 * blocking on a folio lock:
+			 */
+			if (bch2_clamp_data_hole(&inode->v,
+						 &hole_start,
+						 &hole_end,
+						 opts.data_replicas, true))
+				ret = drop_locks_do(trans,
+					(bch2_clamp_data_hole(&inode->v,
+							      &hole_start,
+							      &hole_end,
+							      opts.data_replicas, false), 0));
+			bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start));
+
+			if (ret)
+				goto bkey_err;
+
+			if (hole_start == hole_end)
+				continue;
+		}
+
+		sectors	= hole_end - hole_start;
+
+		if (!is_allocation) {
+			ret = bch2_quota_reservation_add(c, inode,
+					&quota_res, sectors, true);
+			if (unlikely(ret))
+				goto bkey_err;
+		}
+
+		ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter,
+					    sectors, opts, &i_sectors_delta,
+					    writepoint_hashed((unsigned long) current));
+		if (ret)
+			goto bkey_err;
+
+		bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
+
+		drop_locks_do(trans,
+			(bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0));
+bkey_err:
+		bch2_quota_reservation_put(c, inode, &quota_res);
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			ret = 0;
+	}
+
+	if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) {
+		struct quota_res quota_res = { 0 };
+		s64 i_sectors_delta = 0;
+
+		bch2_fpunch_at(trans, &iter, inode_inum(inode),
+			       end_sector, &i_sectors_delta);
+		bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
+		bch2_quota_reservation_put(c, inode, &quota_res);
+	}
+
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
+	return ret;
+}
+
+static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
+			    loff_t offset, loff_t len)
+{
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	u64 end		= offset + len;
+	u64 block_start	= round_down(offset,	block_bytes(c));
+	u64 block_end	= round_up(end,		block_bytes(c));
+	bool truncated_last_page = false;
+	int ret, ret2 = 0;
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
+		ret = inode_newsize_ok(&inode->v, end);
+		if (ret)
+			return ret;
+	}
+
+	if (mode & FALLOC_FL_ZERO_RANGE) {
+		ret = bch2_truncate_folios(inode, offset, end);
+		if (unlikely(ret < 0))
+			return ret;
+
+		truncated_last_page = ret;
+
+		truncate_pagecache_range(&inode->v, offset, end - 1);
+
+		block_start	= round_up(offset,	block_bytes(c));
+		block_end	= round_down(end,	block_bytes(c));
+	}
+
+	ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9);
+
+	/*
+	 * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update,
+	 * so that the VFS cache i_size is consistent with the btree i_size:
+	 */
+	if (ret &&
+	    !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)))
+		return ret;
+
+	if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size)
+		end = inode->v.i_size;
+
+	if (end >= inode->v.i_size &&
+	    (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) ||
+	     !(mode & FALLOC_FL_KEEP_SIZE))) {
+		spin_lock(&inode->v.i_lock);
+		i_size_write(&inode->v, end);
+		spin_unlock(&inode->v.i_lock);
+
+		mutex_lock(&inode->ei_update_lock);
+		ret2 = bch2_write_inode_size(c, inode, end, 0);
+		mutex_unlock(&inode->ei_update_lock);
+	}
+
+	return ret ?: ret2;
+}
+
+long bch2_fallocate_dispatch(struct file *file, int mode,
+			     loff_t offset, loff_t len)
+{
+	struct bch_inode_info *inode = file_bch_inode(file);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	long ret;
+
+	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate))
+		return -EROFS;
+
+	inode_lock(&inode->v);
+	inode_dio_wait(&inode->v);
+	bch2_pagecache_block_get(inode);
+
+	ret = file_modified(file);
+	if (ret)
+		goto err;
+
+	if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
+		ret = bchfs_fallocate(inode, mode, offset, len);
+	else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
+		ret = bchfs_fpunch(inode, offset, len);
+	else if (mode == FALLOC_FL_INSERT_RANGE)
+		ret = bchfs_fcollapse_finsert(inode, offset, len, true);
+	else if (mode == FALLOC_FL_COLLAPSE_RANGE)
+		ret = bchfs_fcollapse_finsert(inode, offset, len, false);
+	else
+		ret = -EOPNOTSUPP;
+err:
+	bch2_pagecache_block_put(inode);
+	inode_unlock(&inode->v);
+	bch2_write_ref_put(c, BCH_WRITE_REF_fallocate);
+
+	return bch2_err_class(ret);
+}
+
+/*
+ * Take a quota reservation for unallocated blocks in a given file range
+ * Does not check pagecache
+ */
+static int quota_reserve_range(struct bch_inode_info *inode,
+			       struct quota_res *res,
+			       u64 start, u64 end)
+{
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	u32 snapshot;
+	u64 sectors = end - start;
+	u64 pos = start;
+	int ret;
+retry:
+	bch2_trans_begin(trans);
+
+	ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
+	if (ret)
+		goto err;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+			     SPOS(inode->v.i_ino, pos, snapshot), 0);
+
+	while (!(ret = btree_trans_too_many_iters(trans)) &&
+	       (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k &&
+	       !(ret = bkey_err(k))) {
+		if (bkey_extent_is_allocation(k.k)) {
+			u64 s = min(end, k.k->p.offset) -
+				max(start, bkey_start_offset(k.k));
+			BUG_ON(s > sectors);
+			sectors -= s;
+		}
+		bch2_btree_iter_advance(&iter);
+	}
+	pos = iter.pos.offset;
+	bch2_trans_iter_exit(trans, &iter);
+err:
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		goto retry;
+
+	bch2_trans_put(trans);
+
+	return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
+}
+
+loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
+			     struct file *file_dst, loff_t pos_dst,
+			     loff_t len, unsigned remap_flags)
+{
+	struct bch_inode_info *src = file_bch_inode(file_src);
+	struct bch_inode_info *dst = file_bch_inode(file_dst);
+	struct bch_fs *c = src->v.i_sb->s_fs_info;
+	struct quota_res quota_res = { 0 };
+	s64 i_sectors_delta = 0;
+	u64 aligned_len;
+	loff_t ret = 0;
+
+	if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
+		return -EINVAL;
+
+	if (remap_flags & REMAP_FILE_DEDUP)
+		return -EOPNOTSUPP;	/* dedup requests are rejected here */
+
+	if ((pos_src & (block_bytes(c) - 1)) ||
+	    (pos_dst & (block_bytes(c) - 1)))
+		return -EINVAL;	/* both positions must be block aligned */
+
+	if (src == dst &&
+	    abs(pos_src - pos_dst) < len)
+		return -EINVAL;	/* overlapping ranges within one inode */
+
+	bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
+
+	inode_dio_wait(&src->v);
+	inode_dio_wait(&dst->v);
+
+	ret = generic_remap_file_range_prep(file_src, pos_src,
+					    file_dst, pos_dst,
+					    &len, remap_flags);
+	if (ret < 0 || len == 0)
+		goto err;
+
+	aligned_len = round_up((u64) len, block_bytes(c));
+
+	ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping,
+				pos_dst, pos_dst + len - 1);
+	if (ret)
+		goto err;
+
+	ret = quota_reserve_range(dst, &quota_res, pos_dst >> 9,
+				  (pos_dst + aligned_len) >> 9);
+	if (ret)
+		goto err;
+
+	file_update_time(file_dst);
+
+	bch2_mark_pagecache_unallocated(src, pos_src >> 9,
+				   (pos_src + aligned_len) >> 9);
+
+	ret = bch2_remap_range(c,
+			       inode_inum(dst), pos_dst >> 9,
+			       inode_inum(src), pos_src >> 9,
+			       aligned_len >> 9,
+			       pos_dst + len, &i_sectors_delta);
+	if (ret < 0)
+		goto err;
+
+	/*
+	 * due to alignment, we might have remapped slightly more than requested
+	 */
+	ret = min((u64) ret << 9, (u64) len);
+
+	bch2_i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
+
+	spin_lock(&dst->v.i_lock);
+	if (pos_dst + ret > dst->v.i_size)
+		i_size_write(&dst->v, pos_dst + ret);
+	spin_unlock(&dst->v.i_lock);
+	/* A flush error replaces ret; a successful flush keeps the byte count */
+	if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
+	    IS_SYNC(file_inode(file_dst)))
+		ret = bch2_flush_inode(c, dst) ?: ret;
+err:
+	bch2_quota_reservation_put(c, dst, &quota_res);
+	bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
+
+	return bch2_err_class(ret);
+}
+
+/* fseek: */
+
+static loff_t bch2_seek_data(struct file *file, u64 offset)
+{
+	struct bch_inode_info *inode = file_bch_inode(file);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct btree_trans *trans;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	subvol_inum inum = inode_inum(inode);
+	u64 isize, next_data = MAX_LFS_FILESIZE;
+	u32 snapshot;
+	int ret;
+
+	isize = i_size_read(&inode->v);
+	if (offset >= isize)
+		return -ENXIO;	/* starting at or past i_size */
+
+	trans = bch2_trans_get(c);
+retry:
+	bch2_trans_begin(trans);
+
+	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+	if (ret)
+		goto err;
+
+	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents,
+			   SPOS(inode->v.i_ino, offset >> 9, snapshot),
+			   POS(inode->v.i_ino, U64_MAX),
+			   0, k, ret) {
+		if (bkey_extent_is_data(k.k)) {
+			next_data = max(offset, bkey_start_offset(k.k) << 9);
+			break;
+		} else if (k.k->p.offset << 9 > isize)	/* sectors -> bytes */
+			break;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+err:
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		goto retry;
+
+	bch2_trans_put(trans);
+	if (ret)
+		return ret;
+
+	if (next_data > offset)
+		next_data = bch2_seek_pagecache_data(&inode->v,
+					offset, next_data, 0, false);
+
+	if (next_data >= isize)
+		return -ENXIO;	/* nothing found below i_size */
+
+	return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
+}
+
+static loff_t bch2_seek_hole(struct file *file, u64 offset)
+{
+	struct bch_inode_info *inode = file_bch_inode(file);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct btree_trans *trans;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	subvol_inum inum = inode_inum(inode);
+	u64 isize, next_hole = MAX_LFS_FILESIZE;
+	u32 snapshot;
+	int ret;
+
+	isize = i_size_read(&inode->v);
+	if (offset >= isize)
+		return -ENXIO;
+
+	trans = bch2_trans_get(c);
+retry:
+	bch2_trans_begin(trans);
+
+	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+	if (ret)
+		goto err;
+
+	for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
+			   SPOS(inode->v.i_ino, offset >> 9, snapshot),
+			   BTREE_ITER_SLOTS, k, ret) {
+		if (k.k->p.inode != inode->v.i_ino) {
+			next_hole = bch2_seek_pagecache_hole(&inode->v,
+					offset, MAX_LFS_FILESIZE, 0, false);
+			break;
+		} else if (!bkey_extent_is_data(k.k)) {
+			next_hole = bch2_seek_pagecache_hole(&inode->v,
+					max(offset, bkey_start_offset(k.k) << 9),
+					k.k->p.offset << 9, 0, false);
+
+			if (next_hole < k.k->p.offset << 9)
+				break;
+		} else {
+			offset = max(offset, bkey_start_offset(k.k) << 9);
+		}
+	}
+	bch2_trans_iter_exit(trans, &iter);
+err:
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		goto retry;
+
+	bch2_trans_put(trans);
+	if (ret)
+		return ret;
+
+	if (next_hole > isize)
+		next_hole = isize;
+
+	return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE);
+}
+
+/*
+ * llseek entry point.
+ *
+ * SEEK_SET, SEEK_CUR and SEEK_END are passed to generic_file_llseek();
+ * SEEK_DATA and SEEK_HOLE use bch2_seek_data() / bch2_seek_hole(); any
+ * other whence is -EINVAL. The result goes through bch2_err_class().
+ */
+loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
+{
+	loff_t ret;
+
+	if (whence == SEEK_DATA)
+		ret = bch2_seek_data(file, offset);
+	else if (whence == SEEK_HOLE)
+		ret = bch2_seek_hole(file, offset);
+	else if (whence == SEEK_SET || whence == SEEK_CUR || whence == SEEK_END)
+		ret = generic_file_llseek(file, offset, whence);
+	else
+		ret = -EINVAL;
+
+	return bch2_err_class(ret);
+}
+
+void bch2_fs_fsio_exit(struct bch_fs *c)
+{
+	bioset_exit(&c->nocow_flush_bioset);
+}
+
+int bch2_fs_fsio_init(struct bch_fs *c)
+{
+	/* Third argument: offset of the bio inside its struct nocow_flush */
+	int err = bioset_init(&c->nocow_flush_bioset, 1,
+			      offsetof(struct nocow_flush, bio), 0);
+
+	return err ? -BCH_ERR_ENOMEM_nocow_flush_bioset_init : 0;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
new file mode 100644
index 000000000000..ca70346e68dc
--- /dev/null
+++ b/fs/bcachefs/fs-io.h
@@ -0,0 +1,184 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IO_H
+#define _BCACHEFS_FS_IO_H
+
+#ifndef NO_BCACHEFS_FS
+
+#include "buckets.h"
+#include "fs.h"
+#include "io_write_types.h"
+#include "quota.h"
+
+#include <linux/uio.h>
+
+struct folio_vec {
+	struct folio	*fv_folio;
+	size_t		fv_offset;
+	size_t		fv_len;
+};
+
+static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv)
+{
+	struct folio_vec fv;
+
+	/* Offset within the folio: the page's offset in it plus bv_offset */
+	fv.fv_folio	= page_folio(bv.bv_page);
+	fv.fv_offset	= (folio_page_idx(fv.fv_folio, bv.bv_page) << PAGE_SHIFT) +
+			  bv.bv_offset;
+
+	/* Length is capped at what remains of this folio */
+	fv.fv_len	= min_t(size_t, folio_size(fv.fv_folio) - fv.fv_offset,
+				bv.bv_len);
+	return fv;
+}
+
+static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio,
+						    struct bvec_iter iter)
+{
+	return biovec_to_foliovec(bio_iter_iovec(bio, iter));
+}
+
+#define __bio_for_each_folio(bvl, bio, iter, start)			\
+	for (iter = (start);						\
+	     (iter).bi_size &&						\
+		((bvl = bio_iter_iovec_folio((bio), (iter))), 1);	\
+	     bio_advance_iter_single((bio), &(iter), (bvl).fv_len))
+
+/**
+ * bio_for_each_folio - iterate over folios within a bio
+ *
+ * Like other non-_all versions, this iterates over what bio->bi_iter currently
+ * points to. This version is for drivers, where the bio may have previously
+ * been split or cloned.
+ */
+#define bio_for_each_folio(bvl, bio, iter)				\
+	__bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter)
+
+struct quota_res {
+	u64				sectors;
+};
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+
+static inline void __bch2_quota_reservation_put(struct bch_fs *c,
+					 struct bch_inode_info *inode,
+					 struct quota_res *res)
+{
+	BUG_ON(res->sectors > inode->ei_quota_reserved);
+
+	bch2_quota_acct(c, inode->ei_qid, Q_SPC,
+			-((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
+	inode->ei_quota_reserved -= res->sectors;
+	res->sectors = 0;
+}
+
+static inline void bch2_quota_reservation_put(struct bch_fs *c,
+				       struct bch_inode_info *inode,
+				       struct quota_res *res)
+{
+	if (res->sectors) {
+		mutex_lock(&inode->ei_quota_lock);
+		__bch2_quota_reservation_put(c, inode, res);
+		mutex_unlock(&inode->ei_quota_lock);
+	}
+}
+
+static inline int bch2_quota_reservation_add(struct bch_fs *c,
+				      struct bch_inode_info *inode,
+				      struct quota_res *res,
+				      u64 sectors,
+				      bool check_enospc)
+{
+	int ret;
+
+	if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags))
+		return 0;
+
+	mutex_lock(&inode->ei_quota_lock);
+	ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
+			      check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
+	if (likely(!ret)) {
+		inode->ei_quota_reserved += sectors;
+		res->sectors += sectors;
+	}
+	mutex_unlock(&inode->ei_quota_lock);
+
+	return ret;
+}
+
+#else
+
+static inline void __bch2_quota_reservation_put(struct bch_fs *c,
+					 struct bch_inode_info *inode,
+					 struct quota_res *res) {}
+
+static inline void bch2_quota_reservation_put(struct bch_fs *c,
+				       struct bch_inode_info *inode,
+				       struct quota_res *res) {}
+
+static inline int bch2_quota_reservation_add(struct bch_fs *c,
+				      struct bch_inode_info *inode,
+				      struct quota_res *res,
+				      u64 sectors,
+				      bool check_enospc)
+{
+	return 0;	/* !CONFIG_BCACHEFS_QUOTA stub: nothing to reserve */
+}
+
+#endif
+
+void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *,
+			   struct quota_res *, s64);
+
+static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
+				       struct quota_res *quota_res, s64 sectors)
+{
+	if (!sectors)
+		return;	/* zero delta: nothing to account, lock not taken */
+	mutex_lock(&inode->ei_quota_lock);
+	__bch2_i_sectors_acct(c, inode, quota_res, sectors);
+	mutex_unlock(&inode->ei_quota_lock);
+}
+
+static inline struct address_space *faults_disabled_mapping(void)
+{
+	return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
+}
+
+static inline void set_fdm_dropped_locks(void)
+{
+	current->faults_disabled_mapping =
+		(void *) (((unsigned long) current->faults_disabled_mapping)|1);
+}
+
+static inline bool fdm_dropped_locks(void)
+{
+	return ((unsigned long) current->faults_disabled_mapping) & 1;
+}
+
+void bch2_inode_flush_nocow_writes_async(struct bch_fs *,
+			struct bch_inode_info *, struct closure *);
+
+int __must_check bch2_write_inode_size(struct bch_fs *,
+				       struct bch_inode_info *,
+				       loff_t, unsigned);
+
+int bch2_fsync(struct file *, loff_t, loff_t, int);
+
+int bchfs_truncate(struct mnt_idmap *,
+		  struct bch_inode_info *, struct iattr *);
+long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
+
+loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
+			     loff_t, loff_t, unsigned);
+
+loff_t bch2_llseek(struct file *, loff_t, int);
+
+void bch2_fs_fsio_exit(struct bch_fs *);
+int bch2_fs_fsio_init(struct bch_fs *);
+#else
+static inline void bch2_fs_fsio_exit(struct bch_fs *c) {}
+static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; }
+#endif
+
+#endif /* _BCACHEFS_FS_IO_H */
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
new file mode 100644
index 000000000000..6040bd3f0778
--- /dev/null
+++ b/fs/bcachefs/fs-ioctl.c
@@ -0,0 +1,572 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "chardev.h"
+#include "dirent.h"
+#include "fs.h"
+#include "fs-common.h"
+#include "fs-ioctl.h"
+#include "quota.h"
+
+#include <linux/compat.h>
+#include <linux/fsnotify.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+#include <linux/writeback.h>
+
+#define FS_IOC_GOINGDOWN	     _IOR('X', 125, __u32)
+#define FSOP_GOING_FLAGS_DEFAULT	0x0	/* going down */
+#define FSOP_GOING_FLAGS_LOGFLUSH	0x1	/* flush log but not data */
+#define FSOP_GOING_FLAGS_NOLOGFLUSH	0x2	/* don't flush log nor data */
+
+/*
+ * Argument block handed (as the opaque @p pointer) to the inode update
+ * callbacks bch2_inode_flags_set() and fssetxattr_inode_update_fn().
+ */
+struct flags_set {
+	/* bits of bi_flags this update is allowed to replace */
+	unsigned		mask;
+	/* new values for the bits covered by @mask */
+	unsigned		flags;
+
+	/* project id in on-disk form (callers store fsx_projid + 1) */
+	unsigned		projid;
+
+	/* when true, @projinherit is applied to bi_fields_set */
+	bool			set_projinherit;
+	/* desired state of the Inode_opt_project bit in bi_fields_set */
+	bool			projinherit;
+};
+
+/*
+ * Inode update callback: apply the change described by the struct flags_set
+ * in @p to the unpacked btree inode @bi and set its ctime to the current
+ * time.
+ *
+ * Returns -EPERM if the append/immutable bits would change and the caller
+ * lacks CAP_LINUX_IMMUTABLE, -EINVAL if anything other than nodump/noatime
+ * is requested on an inode that is neither a regular file nor a directory,
+ * and 0 on success.
+ */
+static int bch2_inode_flags_set(struct btree_trans *trans,
+				struct bch_inode_info *inode,
+				struct bch_inode_unpacked *bi,
+				void *p)
+{
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct flags_set *s = p;
+	/*
+	 * We're relying on btree locking here for exclusion with other ioctl
+	 * calls - use the flags in the btree (@bi), not inode->i_flags:
+	 */
+	unsigned oldflags = bi->bi_flags & s->mask;
+	unsigned newflags = s->flags;
+	unsigned changed = newflags ^ oldflags;
+	bool file_or_dir = S_ISREG(bi->bi_mode) || S_ISDIR(bi->bi_mode);
+
+	if ((changed & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) &&
+	    !capable(CAP_LINUX_IMMUTABLE))
+		return -EPERM;
+
+	/* special files only accept nodump/noatime */
+	if (!file_or_dir &&
+	    (newflags & ~(BCH_INODE_NODUMP|BCH_INODE_NOATIME)))
+		return -EINVAL;
+
+	if (s->set_projinherit) {
+		if (s->projinherit)
+			bi->bi_fields_set |= 1 << Inode_opt_project;
+		else
+			bi->bi_fields_set &= ~(1 << Inode_opt_project);
+	}
+
+	bi->bi_flags = (bi->bi_flags & ~s->mask) | newflags;
+
+	bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v));
+	return 0;
+}
+
+/*
+ * FS_IOC_GETFLAGS: translate the cached inode's bi_flags through
+ * bch_flags_to_uflags and store the result at @arg.
+ */
+static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg)
+{
+	unsigned uflags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags);
+
+	return put_user(uflags, arg);
+}
+
+/*
+ * FS_IOC_SETFLAGS: read a flag word from @arg, translate it to bcachefs
+ * inode flags and write them to @inode.
+ *
+ * Returns -EFAULT if @arg cannot be read, -EOPNOTSUPP if a requested flag has
+ * no entry in bch_flags_to_uflags, -EACCES if inode_owner_or_capable() fails,
+ * otherwise the result of mnt_want_write_file() / bch2_write_inode().
+ */
+static int bch2_ioc_setflags(struct bch_fs *c,
+			     struct file *file,
+			     struct bch_inode_info *inode,
+			     void __user *arg)
+{
+	struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) };
+	unsigned uflags;
+	int ret;
+
+	if (get_user(uflags, (int __user *) arg))
+		return -EFAULT;
+
+	/* map_flags_rev() strips every bit it translated out of uflags */
+	s.flags = map_flags_rev(bch_flags_to_uflags, uflags);
+	if (uflags != 0)
+		return -EOPNOTSUPP;
+
+	ret = mnt_want_write_file(file);
+	if (ret != 0)
+		return ret;
+
+	inode_lock(&inode->v);
+
+	if (inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
+		mutex_lock(&inode->ei_update_lock);
+		ret = bch2_write_inode(c, inode, bch2_inode_flags_set, &s,
+				       ATTR_CTIME);
+		mutex_unlock(&inode->ei_update_lock);
+	} else {
+		ret = -EACCES;
+	}
+
+	inode_unlock(&inode->v);
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+/*
+ * FS_IOC_FSGETXATTR: fill a zeroed struct fsxattr with the translated inode
+ * flags, the PROJINHERIT flag (taken from bi_fields_set) and the project
+ * quota id, then copy it to @arg.  Returns -EFAULT if the copy fails.
+ */
+static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
+			       struct fsxattr __user *arg)
+{
+	struct fsxattr fa = { 0 };
+	bool projinherit =
+		inode->ei_inode.bi_fields_set & (1 << Inode_opt_project);
+
+	fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
+	fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
+	if (projinherit)
+		fa.fsx_xflags |= FS_XFLAG_PROJINHERIT;
+
+	return copy_to_user(arg, &fa, sizeof(fa)) ? -EFAULT : 0;
+}
+
+/*
+ * Inode update callback for FS_IOC_FSSETXATTR: store the new project id (and
+ * mark the project field as explicitly set) when it differs from the one in
+ * @bi, then apply the flag update via bch2_inode_flags_set().
+ */
+static int fssetxattr_inode_update_fn(struct btree_trans *trans,
+				      struct bch_inode_info *inode,
+				      struct bch_inode_unpacked *bi,
+				      void *p)
+{
+	struct flags_set *s = p;
+
+	if (bi->bi_project != s->projid) {
+		bi->bi_project = s->projid;
+		bi->bi_fields_set |= 1U << Inode_opt_project;
+	}
+
+	return bch2_inode_flags_set(trans, inode, bi, s);
+}
+
+/*
+ * FS_IOC_FSSETXATTR: set inode flags, the project-inherit state and the
+ * project id from the struct fsxattr at @arg.
+ *
+ * Returns -EFAULT if @arg cannot be read, -EOPNOTSUPP if fsx_xflags carries
+ * bits (other than PROJINHERIT) with no entry in bch_flags_to_xflags,
+ * -EINVAL if fsx_projid >= U32_MAX, -EACCES if inode_owner_or_capable()
+ * fails, otherwise the result of mnt_want_write_file(), bch2_set_projid()
+ * or bch2_write_inode().
+ */
+static int bch2_ioc_fssetxattr(struct bch_fs *c,
+			       struct file *file,
+			       struct bch_inode_info *inode,
+			       struct fsxattr __user *arg)
+{
+	struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) };
+	struct fsxattr fa;
+	int ret;
+
+	if (copy_from_user(&fa, arg, sizeof(fa)))
+		return -EFAULT;
+
+	/*
+	 * PROJINHERIT has no slot in bch_flags_to_xflags; it is carried in
+	 * s.projinherit and stripped here so it is not reported as unsupported
+	 */
+	s.set_projinherit = true;
+	s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0;
+	fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT;
+
+	/* map_flags_rev() clears each bit it maps; leftovers are unsupported */
+	s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags);
+	if (fa.fsx_xflags)
+		return -EOPNOTSUPP;
+
+	/* reject ids for which the +1 bias below would not fit */
+	if (fa.fsx_projid >= U32_MAX)
+		return -EINVAL;
+
+	/*
+	 * inode fields accessible via the xattr interface are stored with a +1
+	 * bias, so that 0 means unset:
+	 */
+	s.projid = fa.fsx_projid + 1;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	inode_lock(&inode->v);
+	if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
+		ret = -EACCES;
+		goto err;
+	}
+
+	/* bch2_set_projid() takes the unbiased id; the inode write uses s.projid */
+	mutex_lock(&inode->ei_update_lock);
+	ret = bch2_set_projid(c, inode, fa.fsx_projid);
+	if (ret)
+		goto err_unlock;
+
+	ret = bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
+			       ATTR_CTIME);
+err_unlock:
+	mutex_unlock(&inode->ei_update_lock);
+err:
+	inode_unlock(&inode->v);
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+/*
+ * Inode update callback: run bch2_reinherit_attrs() on @bi using the
+ * directory inode passed in @p.  Returns 0 when that helper reports a
+ * nonzero result and 1 when it reports zero.
+ */
+static int bch2_reinherit_attrs_fn(struct btree_trans *trans,
+				   struct bch_inode_info *inode,
+				   struct bch_inode_unpacked *bi,
+				   void *p)
+{
+	struct bch_inode_info *dir = p;
+	bool changed = bch2_reinherit_attrs(bi, &dir->ei_inode);
+
+	return !changed;
+}
+
+/*
+ * BCHFS_IOC_REINHERIT_ATTRS: look up the entry @name (a userspace string) in
+ * directory @src and run bch2_reinherit_attrs_fn() on that child with @src
+ * as the source of the attributes.
+ *
+ * Returns a negative error, or !ret of the final bch2_write_inode() call.
+ *
+ * NOTE(review): bch2_write_inode() only ever returns 0 or a negative error,
+ * so the "did work" conversion below yields 1 on every success, including
+ * when the callback reported nothing to change - confirm this is intended.
+ */
+static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
+				    struct file *file,
+				    struct bch_inode_info *src,
+				    const char __user *name)
+{
+	struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode);
+	struct bch_inode_info *dst;
+	struct inode *vinode = NULL;
+	char *kname = NULL;
+	struct qstr qstr;
+	int ret = 0;
+	subvol_inum inum;
+
+	kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL);
+	if (!kname)
+		return -ENOMEM;
+
+	/* at most BCH_NAME_MAX bytes are copied; the length comes from ret */
+	ret = strncpy_from_user(kname, name, BCH_NAME_MAX);
+	if (unlikely(ret < 0))
+		goto err1;
+
+	qstr.len	= ret;
+	qstr.name	= kname;
+
+	ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum);
+	if (ret)
+		goto err1;
+
+	/* takes a reference on the child inode, dropped at err2 */
+	vinode = bch2_vfs_inode_get(c, inum);
+	ret = PTR_ERR_OR_ZERO(vinode);
+	if (ret)
+		goto err1;
+
+	dst = to_bch_ei(vinode);
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		goto err2;
+
+	bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst);
+
+	/* move the child's project quota to the directory's ids first */
+	if (inode_attr_changing(src, dst, Inode_opt_project)) {
+		ret = bch2_fs_quota_transfer(c, dst,
+					     src->ei_qid,
+					     1 << QTYP_PRJ,
+					     KEY_TYPE_QUOTA_PREALLOC);
+		if (ret)
+			goto err3;
+	}
+
+	ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0);
+err3:
+	bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst);
+
+	/* return true if we did work */
+	if (ret >= 0)
+		ret = !ret;
+
+	mnt_drop_write_file(file);
+err2:
+	iput(vinode);
+err1:
+	kfree(kname);
+
+	return ret;
+}
+
+/*
+ * FS_IOC_GOINGDOWN: force the filesystem into emergency read-only mode.
+ *
+ * @arg points to a u32 holding one of the FSOP_GOING_FLAGS_* values, which
+ * selects how much is flushed first.  Requires CAP_SYS_ADMIN.
+ *
+ * Returns -EPERM, -EFAULT, -EINVAL for an unknown flag value, the error from
+ * freeze_bdev(), or 0.
+ */
+static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
+{
+	u32 flags;
+	int ret = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (get_user(flags, arg))
+		return -EFAULT;
+
+	/* logged before @flags is validated, so unknown values show up too */
+	bch_notice(c, "shutdown by ioctl type %u", flags);
+
+	down_write(&c->vfs_sb->s_umount);
+
+	switch (flags) {
+	case FSOP_GOING_FLAGS_DEFAULT:
+		/* freeze the block device around the journal flush + shutdown */
+		ret = freeze_bdev(c->vfs_sb->s_bdev);
+		if (ret)
+			goto err;
+
+		bch2_journal_flush(&c->journal);
+		c->vfs_sb->s_flags |= SB_RDONLY;
+		bch2_fs_emergency_read_only(c);
+		thaw_bdev(c->vfs_sb->s_bdev);
+		break;
+
+	case FSOP_GOING_FLAGS_LOGFLUSH:
+		/* flush the journal, then shut down as for NOLOGFLUSH */
+		bch2_journal_flush(&c->journal);
+		fallthrough;
+
+	case FSOP_GOING_FLAGS_NOLOGFLUSH:
+		c->vfs_sb->s_flags |= SB_RDONLY;
+		bch2_fs_emergency_read_only(c);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+err:
+	up_write(&c->vfs_sb->s_umount);
+	return ret;
+}
+
+/*
+ * Create a subvolume, or a snapshot of one, at the path arg.dst_ptr
+ * (resolved relative to arg.dirfd).
+ *
+ * arg.flags may contain BCH_SUBVOL_SNAPSHOT_CREATE and, only together with
+ * it, BCH_SUBVOL_SNAPSHOT_RO.  arg.src_ptr (path of the snapshot source) is
+ * only accepted for snapshots; when a snapshot is requested without a source
+ * path, the subvolume of the destination's parent directory is used.
+ *
+ * The new directory is created via __bch2_create() after the same kind of
+ * checks a mkdir would do (dead dir, id mappings, write+exec permission,
+ * umask, security hook).
+ *
+ * Returns 0 or a negative error (-EINVAL for bad flag combinations, -EXDEV
+ * if a path is on another filesystem, -EEXIST if the destination exists).
+ */
+static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
+					  struct bch_ioctl_subvolume arg)
+{
+	struct inode *dir;
+	struct bch_inode_info *inode;
+	struct user_namespace *s_user_ns;
+	struct dentry *dst_dentry;
+	struct path src_path, dst_path;
+	int how = LOOKUP_FOLLOW;
+	int error;
+	subvol_inum snapshot_src = { 0 };
+	unsigned lookup_flags = 0;
+	unsigned create_flags = BCH_CREATE_SUBVOL;
+
+	if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE|
+			  BCH_SUBVOL_SNAPSHOT_RO))
+		return -EINVAL;
+
+	/* a source path and the RO flag only make sense for snapshots */
+	if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
+	    (arg.src_ptr ||
+	     (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)))
+		return -EINVAL;
+
+	if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+		create_flags |= BCH_CREATE_SNAPSHOT;
+
+	if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)
+		create_flags |= BCH_CREATE_SNAPSHOT_RO;
+
+	/* why do we need this lock? */
+	down_read(&c->vfs_sb->s_umount);
+
+	/* write back dirty inodes before taking the snapshot */
+	if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+		sync_inodes_sb(c->vfs_sb);
+retry:
+	if (arg.src_ptr) {
+		/*
+		 * NOTE(review): on an ESTALE retry only lookup_flags gains
+		 * LOOKUP_REVAL; this source lookup keeps using @how unchanged.
+		 */
+		error = user_path_at(arg.dirfd,
+				(const char __user *)(unsigned long)arg.src_ptr,
+				how, &src_path);
+		if (error)
+			goto err1;
+
+		/* src_path is released here because err1 skips the path_put() */
+		if (src_path.dentry->d_sb->s_fs_info != c) {
+			path_put(&src_path);
+			error = -EXDEV;
+			goto err1;
+		}
+
+		snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode));
+	}
+
+	/* undone by done_path_create() at err3 */
+	dst_dentry = user_path_create(arg.dirfd,
+			(const char __user *)(unsigned long)arg.dst_ptr,
+			&dst_path, lookup_flags);
+	error = PTR_ERR_OR_ZERO(dst_dentry);
+	if (error)
+		goto err2;
+
+	if (dst_dentry->d_sb->s_fs_info != c) {
+		error = -EXDEV;
+		goto err3;
+	}
+
+	if (dst_dentry->d_inode) {
+		error = -EEXIST;
+		goto err3;
+	}
+
+	dir = dst_path.dentry->d_inode;
+	if (IS_DEADDIR(dir)) {
+		error = -BCH_ERR_ENOENT_directory_dead;
+		goto err3;
+	}
+
+	/* the caller's fsuid/fsgid must be representable on this superblock */
+	s_user_ns = dir->i_sb->s_user_ns;
+	if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
+	    !kgid_has_mapping(s_user_ns, current_fsgid())) {
+		error = -EOVERFLOW;
+		goto err3;
+	}
+
+	error = inode_permission(file_mnt_idmap(filp),
+				 dir, MAY_WRITE | MAY_EXEC);
+	if (error)
+		goto err3;
+
+	if (!IS_POSIXACL(dir))
+		arg.mode &= ~current_umask();
+
+	error = security_path_mkdir(&dst_path, dst_dentry, arg.mode);
+	if (error)
+		goto err3;
+
+	/* snapshot without explicit source: snapshot the parent's subvolume */
+	if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
+	    !arg.src_ptr)
+		snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol;
+
+	inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir),
+			      dst_dentry, arg.mode|S_IFDIR,
+			      0, snapshot_src, create_flags);
+	error = PTR_ERR_OR_ZERO(inode);
+	if (error)
+		goto err3;
+
+	d_instantiate(dst_dentry, &inode->v);
+	fsnotify_mkdir(dir, dst_dentry);
+err3:
+	done_path_create(&dst_path, dst_dentry);
+err2:
+	if (arg.src_ptr)
+		path_put(&src_path);
+
+	if (retry_estale(error, lookup_flags)) {
+		lookup_flags |= LOOKUP_REVAL;
+		goto retry;
+	}
+err1:
+	up_read(&c->vfs_sb->s_umount);
+
+	return error;
+}
+
+/*
+ * Run __bch2_ioctl_subvolume_create() with c->snapshot_create_lock held for
+ * write and pass its result through.
+ */
+static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
+					struct bch_ioctl_subvolume arg)
+{
+	long ret;
+
+	down_write(&c->snapshot_create_lock);
+	ret = __bch2_ioctl_subvolume_create(c, filp, arg);
+	up_write(&c->snapshot_create_lock);
+
+	return ret;
+}
+
+/*
+ * Delete the subvolume at the path arg.dst_ptr (resolved relative to
+ * arg.dirfd) by unlinking it from its parent directory with the
+ * deleting_snapshot argument of __bch2_unlink() set.
+ *
+ * No flags are defined: any nonzero arg.flags yields -EINVAL.  Returns
+ * -EXDEV if the path is on another filesystem, otherwise the result of
+ * user_path_at() / __bch2_unlink().
+ *
+ * NOTE(review): unlike the create path, no inode_permission() check or
+ * parent-directory locking is visible in this function - confirm that this
+ * is covered elsewhere.
+ */
+static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
+				struct bch_ioctl_subvolume arg)
+{
+	struct path path;
+	struct inode *dir;
+	int ret = 0;
+
+	if (arg.flags)
+		return -EINVAL;
+
+	ret = user_path_at(arg.dirfd,
+			(const char __user *)(unsigned long)arg.dst_ptr,
+			LOOKUP_FOLLOW, &path);
+	if (ret)
+		return ret;
+
+	if (path.dentry->d_sb->s_fs_info != c) {
+		ret = -EXDEV;
+		goto err;
+	}
+
+	dir = path.dentry->d_parent->d_inode;
+
+	ret = __bch2_unlink(dir, path.dentry, true);
+	if (ret)
+		goto err;
+
+	/* tell the VFS the directory is gone */
+	fsnotify_rmdir(dir, path.dentry);
+	d_delete(path.dentry);
+err:
+	path_put(&path);
+	return ret;
+}
+
+/*
+ * ioctl entry point for files on a bcachefs filesystem.
+ *
+ * Handles the inode flag/xattr ioctls, attribute re-inheritance, shutdown
+ * and subvolume create/destroy here; FS_IOC_GETVERSION/SETVERSION are
+ * answered with -ENOTTY and everything else is forwarded to bch2_fs_ioctl().
+ * The result is passed through bch2_err_class() before being returned.
+ */
+long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
+{
+	struct bch_inode_info *inode = file_bch_inode(file);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	void __user *uarg = (void __user *) arg;
+	struct bch_ioctl_subvolume subvol_arg;
+	long ret;
+
+	switch (cmd) {
+	case FS_IOC_GETFLAGS:
+		ret = bch2_ioc_getflags(inode, uarg);
+		break;
+
+	case FS_IOC_SETFLAGS:
+		ret = bch2_ioc_setflags(c, file, inode, uarg);
+		break;
+
+	case FS_IOC_FSGETXATTR:
+		ret = bch2_ioc_fsgetxattr(inode, uarg);
+		break;
+
+	case FS_IOC_FSSETXATTR:
+		ret = bch2_ioc_fssetxattr(c, file, inode, uarg);
+		break;
+
+	case BCHFS_IOC_REINHERIT_ATTRS:
+		ret = bch2_ioc_reinherit_attrs(c, file, inode, uarg);
+		break;
+
+	case FS_IOC_GETVERSION:
+	case FS_IOC_SETVERSION:
+		ret = -ENOTTY;
+		break;
+
+	case FS_IOC_GOINGDOWN:
+		ret = bch2_ioc_goingdown(c, uarg);
+		break;
+
+	case BCH_IOCTL_SUBVOLUME_CREATE:
+		if (copy_from_user(&subvol_arg, uarg, sizeof(subvol_arg)))
+			ret = -EFAULT;
+		else
+			ret = bch2_ioctl_subvolume_create(c, file, subvol_arg);
+		break;
+
+	case BCH_IOCTL_SUBVOLUME_DESTROY:
+		if (copy_from_user(&subvol_arg, uarg, sizeof(subvol_arg)))
+			ret = -EFAULT;
+		else
+			ret = bch2_ioctl_subvolume_destroy(c, file, subvol_arg);
+		break;
+
+	default:
+		ret = bch2_fs_ioctl(c, cmd, uarg);
+		break;
+	}
+
+	return bch2_err_class(ret);
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * compat ioctl entry point (32-bit userspace on a 64-bit kernel).
+ *
+ * Only the get/set flags ioctls are supported: their compat command numbers
+ * (FS_IOC32_*) are rewritten to the native ones and the call is forwarded to
+ * bch2_fs_file_ioctl() with the argument converted via compat_ptr().  Any
+ * other command returns -ENOIOCTLCMD.
+ */
+long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg)
+{
+	/* These are just misnamed, they actually get/put from/to user an int */
+	switch (cmd) {
+	case FS_IOC32_GETFLAGS:
+		/*
+		 * Must match the compat number: matching the native
+		 * FS_IOC_GETFLAGS here would leave FS_IOC32_GETFLAGS to the
+		 * default case below.
+		 */
+		cmd = FS_IOC_GETFLAGS;
+		break;
+	case FS_IOC32_SETFLAGS:
+		cmd = FS_IOC_SETFLAGS;
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+	return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h
new file mode 100644
index 000000000000..54a9c21a3b83
--- /dev/null
+++ b/fs/bcachefs/fs-ioctl.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IOCTL_H
+#define _BCACHEFS_FS_IOCTL_H
+
+/* Inode flags: */
+
+/* bcachefs inode flags -> vfs inode flags: */
+/*
+ * Indexed by bcachefs inode flag bit number; each entry is the matching VFS
+ * S_* bit.  Used by bch2_inode_flags_to_vfs() through set_flags().
+ */
+static const __maybe_unused unsigned bch_flags_to_vfs[] = {
+	[__BCH_INODE_SYNC]	= S_SYNC,
+	[__BCH_INODE_IMMUTABLE]	= S_IMMUTABLE,
+	[__BCH_INODE_APPEND]	= S_APPEND,
+	[__BCH_INODE_NOATIME]	= S_NOATIME,
+};
+
+/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
+/*
+ * Indexed by bcachefs inode flag bit number; each entry is the matching
+ * FS_*_FL bit.  Meant for the map_flags()/map_flags_rev()/map_defined()
+ * helpers in this header.
+ */
+static const __maybe_unused unsigned bch_flags_to_uflags[] = {
+	[__BCH_INODE_SYNC]	= FS_SYNC_FL,
+	[__BCH_INODE_IMMUTABLE]	= FS_IMMUTABLE_FL,
+	[__BCH_INODE_APPEND]	= FS_APPEND_FL,
+	[__BCH_INODE_NODUMP]	= FS_NODUMP_FL,
+	[__BCH_INODE_NOATIME]	= FS_NOATIME_FL,
+};
+
+/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
+/*
+ * Indexed by bcachefs inode flag bit number; each entry is the matching
+ * FS_XFLAG_* bit.  Meant for the map_flags()/map_flags_rev()/map_defined()
+ * helpers in this header.
+ */
+static const __maybe_unused unsigned bch_flags_to_xflags[] = {
+	[__BCH_INODE_SYNC]	= FS_XFLAG_SYNC,
+	[__BCH_INODE_IMMUTABLE]	= FS_XFLAG_IMMUTABLE,
+	[__BCH_INODE_APPEND]	= FS_XFLAG_APPEND,
+	[__BCH_INODE_NODUMP]	= FS_XFLAG_NODUMP,
+	[__BCH_INODE_NOATIME]	= FS_XFLAG_NOATIME,
+	/* FS_XFLAG_PROJINHERIT deliberately has no entry in this table: */
+	//[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT;
+};
+
+/*
+ * set_flags - translate flag bits through a table, updating @_out in place.
+ *
+ * For every index i of the array @_map: if bit i of @_in is set, the bits
+ * _map[i] are set in @_out, otherwise they are cleared from @_out.
+ *
+ * @_map must be a real array (ARRAY_SIZE is applied to it), @_out must be a
+ * modifiable lvalue, and @_in is evaluated once per table entry.
+ */
+#define set_flags(_map, _in, _out)					\
+do {									\
+	unsigned _i;							\
+									\
+	for (_i = 0; _i < ARRAY_SIZE(_map); _i++)			\
+		if ((_in) & (1 << _i))					\
+			(_out) |= _map[_i];				\
+		else							\
+			(_out) &= ~_map[_i];				\
+} while (0)
+
+/*
+ * map_flags - evaluates to the translation of @_in through @_map, i.e. the
+ * result of set_flags() applied to a zero-initialised unsigned accumulator.
+ */
+#define map_flags(_map, _in)						\
+({									\
+	unsigned _out = 0;						\
+									\
+	set_flags(_map, _in, _out);					\
+	_out;								\
+})
+
+/*
+ * map_flags_rev - reverse translation: evaluates to a mask with bit i set for
+ * every index i of @_map whose entry shares a bit with @_in.
+ *
+ * Side effect: the bits of each matched entry are cleared from @_in, so
+ * @_in must be a modifiable lvalue; afterwards it holds only the bits that no
+ * table entry accounted for (callers test it to detect unsupported flags).
+ */
+#define map_flags_rev(_map, _in)					\
+({									\
+	unsigned _i, _out = 0;						\
+									\
+	for (_i = 0; _i < ARRAY_SIZE(_map); _i++)			\
+		if ((_in) & _map[_i]) {					\
+			(_out) |= 1 << _i;				\
+			(_in) &= ~_map[_i];				\
+		}							\
+	(_out);								\
+})
+
+/*
+ * map_defined - feeds an all-ones input through map_flags_rev(), yielding the
+ * mask of table indices that have a translation in @_map.
+ */
+#define map_defined(_map)						\
+({									\
+	unsigned _in = ~0;						\
+									\
+	map_flags_rev(_map, _in);					\
+})
+
+/* Set VFS inode flags from bcachefs inode: */
+static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
+{
+	/*
+	 * Sets or clears each S_* bit listed in bch_flags_to_vfs in
+	 * inode->v.i_flags according to the cached bi_flags; other i_flags
+	 * bits are left alone.
+	 */
+	set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
+}
+
+long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long);
+long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long);
+
+#endif /* _BCACHEFS_FS_IOCTL_H */
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
new file mode 100644
index 000000000000..6642b88c41a0
--- /dev/null
+++ b/fs/bcachefs/fs.c
@@ -0,0 +1,1980 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "acl.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "chardev.h"
+#include "dirent.h"
+#include "errcode.h"
+#include "extents.h"
+#include "fs.h"
+#include "fs-common.h"
+#include "fs-io.h"
+#include "fs-ioctl.h"
+#include "fs-io-buffered.h"
+#include "fs-io-direct.h"
+#include "fs-io-pagecache.h"
+#include "fsck.h"
+#include "inode.h"
+#include "io_read.h"
+#include "journal.h"
+#include "keylist.h"
+#include "quota.h"
+#include "snapshot.h"
+#include "super.h"
+#include "xattr.h"
+
+#include <linux/aio.h>
+#include <linux/backing-dev.h>
+#include <linux/exportfs.h>
+#include <linux/fiemap.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/posix_acl.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+#include <linux/statfs.h>
+#include <linux/string.h>
+#include <linux/xattr.h>
+
+static struct kmem_cache *bch2_inode_cache;
+
+static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
+				struct bch_inode_info *,
+				struct bch_inode_unpacked *,
+				struct bch_subvolume *);
+
+/*
+ * Propagate a freshly written unpacked inode @bi into the in-memory inode:
+ * link count, uid, gid and mode are always copied to the VFS inode; atime,
+ * mtime and ctime only if the matching ATTR_* bit is set in @fields.  The
+ * cached copy (ei_inode) is replaced by @bi and the VFS i_flags are
+ * refreshed from it.
+ *
+ * BUGs if @bi does not carry the same inode number as @inode, and asserts
+ * that the inode's btree position is locked in @trans.
+ */
+void bch2_inode_update_after_write(struct btree_trans *trans,
+				   struct bch_inode_info *inode,
+				   struct bch_inode_unpacked *bi,
+				   unsigned fields)
+{
+	struct bch_fs *c = trans->c;
+	struct inode *vinode = &inode->v;
+
+	BUG_ON(bi->bi_inum != vinode->i_ino);
+
+	bch2_assert_pos_locked(trans, BTREE_ID_inodes,
+			       POS(0, bi->bi_inum),
+			       c->opts.inodes_use_key_cache);
+
+	set_nlink(vinode, bch2_inode_nlink_get(bi));
+	i_uid_write(vinode, bi->bi_uid);
+	i_gid_write(vinode, bi->bi_gid);
+	vinode->i_mode = bi->bi_mode;
+
+	if (fields & ATTR_ATIME)
+		inode_set_atime_to_ts(vinode,
+				      bch2_time_to_timespec(c, bi->bi_atime));
+	if (fields & ATTR_MTIME)
+		inode_set_mtime_to_ts(vinode,
+				      bch2_time_to_timespec(c, bi->bi_mtime));
+	if (fields & ATTR_CTIME)
+		inode_set_ctime_to_ts(vinode,
+				      bch2_time_to_timespec(c, bi->bi_ctime));
+
+	inode->ei_inode = *bi;
+
+	bch2_inode_flags_to_vfs(inode);
+}
+
+/*
+ * Read-modify-write an inode inside a btree transaction.
+ *
+ * Peeks the inode for @inode, calls @set (if non-NULL) with @p to modify the
+ * unpacked copy, writes it back and commits.  On success the in-memory inode
+ * is refreshed via bch2_inode_update_after_write() with @fields.  The whole
+ * sequence is retried on transaction restart.
+ *
+ * A nonzero return from @set stops the chain before anything is written.
+ * Negative values are returned to the caller; positive values are collapsed
+ * to 0 by the final return statement.
+ *
+ * An ENOENT-class error is reported as a fatal filesystem error.
+ */
+int __must_check bch2_write_inode(struct bch_fs *c,
+				  struct bch_inode_info *inode,
+				  inode_set_fn set,
+				  void *p, unsigned fields)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter = { NULL };
+	struct bch_inode_unpacked inode_u;
+	int ret;
+retry:
+	bch2_trans_begin(trans);
+
+	/* each step runs only if all previous steps returned 0 */
+	ret   = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
+				BTREE_ITER_INTENT) ?:
+		(set ? set(trans, inode, &inode_u, p) : 0) ?:
+		bch2_inode_write(trans, &iter, &inode_u) ?:
+		bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+
+	/*
+	 * the btree node lock protects inode->ei_inode, not ei_update_lock;
+	 * this is important for inode updates via bchfs_write_index_update
+	 */
+	if (!ret)
+		bch2_inode_update_after_write(trans, inode, &inode_u, fields);
+
+	bch2_trans_iter_exit(trans, &iter);
+
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		goto retry;
+
+	bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
+			     "inode %u:%llu not found when updating",
+			     inode_inum(inode).subvol,
+			     inode_inum(inode).inum);
+
+	bch2_trans_put(trans);
+	return ret < 0 ? ret : 0;
+}
+
+/*
+ * Move @inode's quota usage (i_blocks plus ei_quota_reserved) from its
+ * current quota ids to @new_qid for the quota types in @qtypes.
+ *
+ * Types that are not enabled, or whose id is not changing, are dropped from
+ * @qtypes first; if none remain this returns 0 without taking any lock.
+ * On success the inode's cached ids are updated for the transferred types.
+ * Returns the result of bch2_quota_transfer().
+ */
+int bch2_fs_quota_transfer(struct bch_fs *c,
+			   struct bch_inode_info *inode,
+			   struct bch_qid new_qid,
+			   unsigned qtypes,
+			   enum quota_acct_mode mode)
+{
+	unsigned i;
+	int ret;
+
+	qtypes &= enabled_qtypes(c);
+
+	/* nothing to transfer for types whose id stays the same */
+	for (i = 0; i < QTYP_NR; i++) {
+		if (inode->ei_qid.q[i] == new_qid.q[i])
+			qtypes &= ~(1U << i);
+	}
+
+	if (qtypes == 0)
+		return 0;
+
+	mutex_lock(&inode->ei_quota_lock);
+
+	ret = bch2_quota_transfer(c, qtypes, new_qid, inode->ei_qid,
+				  inode->v.i_blocks + inode->ei_quota_reserved,
+				  mode);
+	if (ret == 0) {
+		for (i = 0; i < QTYP_NR; i++) {
+			if (!(qtypes & (1 << i)))
+				continue;
+
+			inode->ei_qid.q[i] = new_qid.q[i];
+		}
+	}
+
+	mutex_unlock(&inode->ei_quota_lock);
+
+	return ret;
+}
+
+/*
+ * Inode-cache match callback: nonzero iff @vinode has the subvolume and
+ * inode number given by the subvol_inum at @p.
+ */
+static int bch2_iget5_test(struct inode *vinode, void *p)
+{
+	const subvol_inum *want = p;
+	struct bch_inode_info *inode = to_bch_ei(vinode);
+
+	if (inode->ei_subvol != want->subvol)
+		return 0;
+
+	return inode->ei_inode.bi_inum == want->inum;
+}
+
+/*
+ * Inode-cache init callback: stamp @vinode with the subvolume and inode
+ * number from the subvol_inum at @p.  Always returns 0.
+ */
+static int bch2_iget5_set(struct inode *vinode, void *p)
+{
+	struct bch_inode_info *inode = to_bch_ei(vinode);
+	const subvol_inum *key = p;
+
+	inode->ei_subvol	= key->subvol;
+	inode->ei_inode.bi_inum	= key->inum;
+	inode->v.i_ino		= key->inum;
+
+	return 0;
+}
+
+/*
+ * Hash value for the inode cache: jhash of the subvolume id and the high and
+ * low 32 bits of the inode number.
+ */
+static unsigned bch2_inode_hash(subvol_inum inum)
+{
+	u32 inum_hi = inum.inum >> 32;
+	u32 inum_lo = inum.inum;
+
+	return jhash_3words(inum.subvol, inum_hi, inum_lo, JHASH_INITVAL);
+}
+
+/*
+ * Get the VFS inode for @inum, reading it from the btree if it is not
+ * already in the inode cache.
+ *
+ * Returns the inode, ERR_PTR(-ENOMEM) if iget5_locked() fails, or an
+ * ERR_PTR carrying the bch2_err_class() of the lookup error.
+ */
+struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
+{
+	struct bch_inode_unpacked inode_u;
+	struct bch_inode_info *inode;
+	struct btree_trans *trans;
+	struct bch_subvolume subvol;
+	int ret;
+
+	inode = to_bch_ei(iget5_locked(c->vfs_sb,
+				       bch2_inode_hash(inum),
+				       bch2_iget5_test,
+				       bch2_iget5_set,
+				       &inum));
+	if (unlikely(!inode))
+		return ERR_PTR(-ENOMEM);
+	/* cache hit: already initialised, nothing more to do */
+	if (!(inode->v.i_state & I_NEW))
+		return &inode->v;
+
+	/* new inode: fetch the subvolume and the inode itself from the btree */
+	trans = bch2_trans_get(c);
+	ret = lockrestart_do(trans,
+		bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
+		bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
+
+	if (!ret)
+		bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
+	bch2_trans_put(trans);
+
+	if (ret) {
+		iget_failed(&inode->v);
+		return ERR_PTR(bch2_err_class(ret));
+	}
+
+	mutex_lock(&c->vfs_inodes_lock);
+	list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
+	mutex_unlock(&c->vfs_inodes_lock);
+
+	unlock_new_inode(&inode->v);
+
+	return &inode->v;
+}
+
+/*
+ * Create a new inode in directory @dir and return its in-memory inode.
+ *
+ * With BCH_CREATE_TMPFILE in @flags no name is passed to
+ * bch2_create_trans() and @dir's ei_update_lock is not taken; otherwise the
+ * entry is named by @dentry and @dir's cached mtime/ctime are refreshed
+ * after the commit.  @snapshot_src and the remaining @flags are passed
+ * through to bch2_create_trans().
+ *
+ * One inode is charged to quota before the commit and given back if the
+ * commit fails.  The transaction is retried on restart.
+ *
+ * Returns the new inode (or the already-cached one if another thread
+ * instantiated it first), or an ERR_PTR.  The caller is responsible for
+ * d_instantiate().
+ */
+struct bch_inode_info *
+__bch2_create(struct mnt_idmap *idmap,
+	      struct bch_inode_info *dir, struct dentry *dentry,
+	      umode_t mode, dev_t rdev, subvol_inum snapshot_src,
+	      unsigned flags)
+{
+	struct bch_fs *c = dir->v.i_sb->s_fs_info;
+	struct btree_trans *trans;
+	struct bch_inode_unpacked dir_u;
+	struct bch_inode_info *inode, *old;
+	struct bch_inode_unpacked inode_u;
+	struct posix_acl *default_acl = NULL, *acl = NULL;
+	subvol_inum inum;
+	struct bch_subvolume subvol;
+	u64 journal_seq = 0;
+	int ret;
+
+	/*
+	 * preallocate acls + vfs inode before btree transaction, so that
+	 * nothing can fail after the transaction succeeds:
+	 */
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+	ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
+	if (ret)
+		return ERR_PTR(ret);
+#endif
+	inode = to_bch_ei(new_inode(c->vfs_sb));
+	if (unlikely(!inode)) {
+		inode = ERR_PTR(-ENOMEM);
+		goto err;
+	}
+
+	bch2_inode_init_early(c, &inode_u);
+
+	if (!(flags & BCH_CREATE_TMPFILE))
+		mutex_lock(&dir->ei_update_lock);
+
+	trans = bch2_trans_get(c);
+retry:
+	bch2_trans_begin(trans);
+
+	/* stage the create, then charge one inode to quota */
+	ret   = bch2_create_trans(trans,
+				  inode_inum(dir), &dir_u, &inode_u,
+				  !(flags & BCH_CREATE_TMPFILE)
+				  ? &dentry->d_name : NULL,
+				  from_kuid(i_user_ns(&dir->v), current_fsuid()),
+				  from_kgid(i_user_ns(&dir->v), current_fsgid()),
+				  mode, rdev,
+				  default_acl, acl, snapshot_src, flags) ?:
+		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
+				KEY_TYPE_QUOTA_PREALLOC);
+	if (unlikely(ret))
+		goto err_before_quota;
+
+	/* bi_subvol is nonzero when a new subvolume was created */
+	inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
+	inum.inum = inode_u.bi_inum;
+
+	ret   = bch2_subvolume_get(trans, inum.subvol, true,
+				   BTREE_ITER_WITH_UPDATES, &subvol) ?:
+		bch2_trans_commit(trans, NULL, &journal_seq, 0);
+	if (unlikely(ret)) {
+		/* undo the quota charge taken above */
+		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
+				KEY_TYPE_QUOTA_WARN);
+err_before_quota:
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			goto retry;
+		goto err_trans;
+	}
+
+	if (!(flags & BCH_CREATE_TMPFILE)) {
+		bch2_inode_update_after_write(trans, dir, &dir_u,
+					      ATTR_MTIME|ATTR_CTIME);
+		mutex_unlock(&dir->ei_update_lock);
+	}
+
+	bch2_iget5_set(&inode->v, &inum);
+	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
+
+	set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
+	set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
+
+	/*
+	 * we must insert the new inode into the inode cache before calling
+	 * bch2_trans_exit() and dropping locks, else we could race with another
+	 * thread pulling the inode in and modifying it:
+	 */
+
+	inode->v.i_state |= I_CREATING;
+
+	old = to_bch_ei(inode_insert5(&inode->v,
+				      bch2_inode_hash(inum),
+				      bch2_iget5_test,
+				      bch2_iget5_set,
+				      &inum));
+	BUG_ON(!old);
+
+	if (unlikely(old != inode)) {
+		/*
+		 * We raced, another process pulled the new inode into cache
+		 * before us:
+		 */
+		make_bad_inode(&inode->v);
+		iput(&inode->v);
+
+		inode = old;
+	} else {
+		mutex_lock(&c->vfs_inodes_lock);
+		list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
+		mutex_unlock(&c->vfs_inodes_lock);
+		/*
+		 * we really don't want insert_inode_locked2() to be setting
+		 * I_NEW...
+		 */
+		unlock_new_inode(&inode->v);
+	}
+
+	bch2_trans_put(trans);
+err:
+	/* common exit for success and failure: drop our acl references */
+	posix_acl_release(default_acl);
+	posix_acl_release(acl);
+	return inode;
+err_trans:
+	if (!(flags & BCH_CREATE_TMPFILE))
+		mutex_unlock(&dir->ei_update_lock);
+
+	bch2_trans_put(trans);
+	/* discard the preallocated VFS inode */
+	make_bad_inode(&inode->v);
+	iput(&inode->v);
+	inode = ERR_PTR(ret);
+	goto err;
+}
+
+/* methods */
+
+/*
+ * Directory lookup: resolve @dentry's name in @vdir and splice the resulting
+ * inode into the dcache.  If the dirent lookup fails for any reason a NULL
+ * inode is passed to d_splice_alias().
+ */
+static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
+				  unsigned int flags)
+{
+	struct bch_inode_info *dir = to_bch_ei(vdir);
+	struct bch_fs *c = vdir->i_sb->s_fs_info;
+	struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
+	subvol_inum inum = { .subvol = 1 };
+	struct inode *vinode = NULL;
+
+	if (!bch2_dirent_lookup(c, inode_inum(dir), &hash,
+				&dentry->d_name, &inum))
+		vinode = bch2_vfs_inode_get(c, inum);
+
+	return d_splice_alias(vinode, dentry);
+}
+
+/*
+ * Create an inode of type/permissions @mode (device number @rdev) named by
+ * @dentry in @vdir and attach it to the dentry.  Returns 0 or the
+ * bch2_err_class() of the __bch2_create() error.
+ */
+static int bch2_mknod(struct mnt_idmap *idmap,
+		      struct inode *vdir, struct dentry *dentry,
+		      umode_t mode, dev_t rdev)
+{
+	struct bch_inode_info *dir = to_bch_ei(vdir);
+	struct bch_inode_info *inode;
+
+	inode = __bch2_create(idmap, dir, dentry, mode, rdev,
+			      (subvol_inum) { 0 }, 0);
+	if (IS_ERR(inode))
+		return bch2_err_class(PTR_ERR(inode));
+
+	d_instantiate(dentry, &inode->v);
+	return 0;
+}
+
+/* Create a regular file: bch2_mknod() with S_IFREG added and no device. */
+static int bch2_create(struct mnt_idmap *idmap,
+		       struct inode *vdir, struct dentry *dentry,
+		       umode_t mode, bool excl)
+{
+	umode_t file_mode = mode|S_IFREG;
+
+	return bch2_mknod(idmap, vdir, dentry, file_mode, 0);
+}
+
+/*
+ * Add a directory entry named by @dentry in @dir that points at @inode.
+ *
+ * Runs bch2_link_trans() in a committed transaction with @inode's
+ * ei_update_lock held and, on success, refreshes the cached state of both
+ * @dir (mtime/ctime) and @inode (ctime).  Returns the commit_do() result.
+ */
+static int __bch2_link(struct bch_fs *c,
+		       struct bch_inode_info *inode,
+		       struct bch_inode_info *dir,
+		       struct dentry *dentry)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct bch_inode_unpacked dir_u, inode_u;
+	int ret;
+
+	mutex_lock(&inode->ei_update_lock);
+
+	ret = commit_do(trans, NULL, NULL, 0,
+			bch2_link_trans(trans,
+					inode_inum(dir),   &dir_u,
+					inode_inum(inode), &inode_u,
+					&dentry->d_name));
+
+	if (likely(!ret)) {
+		bch2_inode_update_after_write(trans, dir, &dir_u,
+					      ATTR_MTIME|ATTR_CTIME);
+		bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
+	}
+
+	bch2_trans_put(trans);
+	mutex_unlock(&inode->ei_update_lock);
+	return ret;
+}
+
+/*
+ * Hard-link the inode behind @old_dentry into @vdir under @dentry's name.
+ * On success an extra inode reference is taken for the new dentry.
+ * Expects the inode's i_rwsem to be held.
+ */
+static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
+		     struct dentry *dentry)
+{
+	struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
+	struct bch_inode_info *dir = to_bch_ei(vdir);
+	struct bch_fs *c = vdir->i_sb->s_fs_info;
+	int ret;
+
+	lockdep_assert_held(&inode->v.i_rwsem);
+
+	ret = __bch2_link(c, inode, dir, dentry);
+	if (likely(!ret)) {
+		ihold(&inode->v);
+		d_instantiate(dentry, &inode->v);
+	}
+
+	return ret;
+}
+
+/*
+ * Remove the entry @dentry from directory @vdir.
+ *
+ * Runs bch2_unlink_trans() in a committed transaction with the update locks
+ * of both the directory and the target inode held, then refreshes their
+ * cached state.  @deleting_snapshot is passed through to
+ * bch2_unlink_trans().  Returns the commit_do() result.
+ */
+int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
+		  bool deleting_snapshot)
+{
+	struct bch_fs *c = vdir->i_sb->s_fs_info;
+	struct bch_inode_info *dir = to_bch_ei(vdir);
+	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+	struct bch_inode_unpacked dir_u, inode_u;
+	struct btree_trans *trans = bch2_trans_get(c);
+	int ret;
+
+	bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
+
+	ret = commit_do(trans, NULL, NULL,
+			BTREE_INSERT_NOFAIL,
+		bch2_unlink_trans(trans,
+				  inode_inum(dir), &dir_u,
+				  &inode_u, &dentry->d_name,
+				  deleting_snapshot));
+	if (unlikely(ret))
+		goto err;
+
+	bch2_inode_update_after_write(trans, dir, &dir_u,
+				      ATTR_MTIME|ATTR_CTIME);
+	bch2_inode_update_after_write(trans, inode, &inode_u,
+				      ATTR_MTIME);
+
+	if (inode_u.bi_subvol) {
+		/*
+		 * Subvolume deletion is asynchronous, but we still want to tell
+		 * the VFS that it's been deleted here:
+		 */
+		set_nlink(&inode->v, 0);
+	}
+err:
+	bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
+	bch2_trans_put(trans);
+
+	return ret;
+}
+
+/* Unlink entry point: __bch2_unlink() with deleting_snapshot == false. */
+static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
+{
+	const bool deleting_snapshot = false;
+
+	return __bch2_unlink(vdir, dentry, deleting_snapshot);
+}
+
+/*
+ * Create a symlink named by @dentry in @vdir with target @symname.
+ *
+ * The inode is first created unlinked (BCH_CREATE_TMPFILE), the target
+ * string (including its terminating NUL) is written through the page cache
+ * and flushed, and only then is the inode linked into the directory.
+ * On any failure after creation the inode reference is dropped.
+ */
+static int bch2_symlink(struct mnt_idmap *idmap,
+			struct inode *vdir, struct dentry *dentry,
+			const char *symname)
+{
+	struct bch_fs *c = vdir->i_sb->s_fs_info;
+	struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
+	int ret;
+
+	inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
+			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
+	if (IS_ERR(inode))
+		return bch2_err_class(PTR_ERR(inode));
+
+	inode_lock(&inode->v);
+	ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
+	inode_unlock(&inode->v);
+
+	if (unlikely(ret))
+		goto err;
+
+	/* make sure the target is written out before the name appears */
+	ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
+	if (unlikely(ret))
+		goto err;
+
+	ret = __bch2_link(c, inode, dir, dentry);
+	if (unlikely(ret))
+		goto err;
+
+	d_instantiate(dentry, &inode->v);
+	return 0;
+err:
+	iput(&inode->v);
+	return ret;
+}
+
+/* Create a directory: bch2_mknod() with S_IFDIR added and no device. */
+static int bch2_mkdir(struct mnt_idmap *idmap,
+		      struct inode *vdir, struct dentry *dentry, umode_t mode)
+{
+	umode_t dir_mode = mode|S_IFDIR;
+
+	return bch2_mknod(idmap, vdir, dentry, dir_mode, 0);
+}
+
+/*
+ * Rename @src_dentry in @src_vdir to @dst_dentry in @dst_vdir.
+ *
+ * Supported @flags are RENAME_NOREPLACE and RENAME_EXCHANGE; anything else
+ * yields -EINVAL.  The mode is EXCHANGE if requested, OVERWRITE if the
+ * destination dentry already has an inode, plain RENAME otherwise.
+ *
+ * Project quota is transferred up front for inodes whose project attribute
+ * will change with the move, the rename itself runs in a committed
+ * transaction with all involved inodes' update locks held, and the cached
+ * inodes are refreshed afterwards.
+ */
+static int bch2_rename2(struct mnt_idmap *idmap,
+			struct inode *src_vdir, struct dentry *src_dentry,
+			struct inode *dst_vdir, struct dentry *dst_dentry,
+			unsigned flags)
+{
+	struct bch_fs *c = src_vdir->i_sb->s_fs_info;
+	struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
+	struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
+	struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
+	struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
+	struct bch_inode_unpacked dst_dir_u, src_dir_u;
+	struct bch_inode_unpacked src_inode_u, dst_inode_u;
+	struct btree_trans *trans;
+	enum bch_rename_mode mode = flags & RENAME_EXCHANGE
+		? BCH_RENAME_EXCHANGE
+		: dst_dentry->d_inode
+		? BCH_RENAME_OVERWRITE : BCH_RENAME;
+	int ret;
+
+	if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
+		return -EINVAL;
+
+	/* flush the source's dirty pages before it replaces the target */
+	if (mode == BCH_RENAME_OVERWRITE) {
+		ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
+						   0, LLONG_MAX);
+		if (ret)
+			return ret;
+	}
+
+	trans = bch2_trans_get(c);
+
+	bch2_lock_inodes(INODE_UPDATE_LOCK,
+			 src_dir,
+			 dst_dir,
+			 src_inode,
+			 dst_inode);
+
+	/* source moves under dst_dir: charge it to dst_dir's project */
+	if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
+		ret = bch2_fs_quota_transfer(c, src_inode,
+					     dst_dir->ei_qid,
+					     1 << QTYP_PRJ,
+					     KEY_TYPE_QUOTA_PREALLOC);
+		if (ret)
+			goto err;
+	}
+
+	/* on exchange the destination also moves, under src_dir */
+	if (mode == BCH_RENAME_EXCHANGE &&
+	    inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
+		ret = bch2_fs_quota_transfer(c, dst_inode,
+					     src_dir->ei_qid,
+					     1 << QTYP_PRJ,
+					     KEY_TYPE_QUOTA_PREALLOC);
+		if (ret)
+			goto err;
+	}
+
+	ret = commit_do(trans, NULL, NULL, 0,
+			bch2_rename_trans(trans,
+					  inode_inum(src_dir), &src_dir_u,
+					  inode_inum(dst_dir), &dst_dir_u,
+					  &src_inode_u,
+					  &dst_inode_u,
+					  &src_dentry->d_name,
+					  &dst_dentry->d_name,
+					  mode));
+	if (unlikely(ret))
+		goto err;
+
+	BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
+	BUG_ON(dst_inode &&
+	       dst_inode->v.i_ino != dst_inode_u.bi_inum);
+
+	bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
+				      ATTR_MTIME|ATTR_CTIME);
+
+	if (src_dir != dst_dir)
+		bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
+					      ATTR_MTIME|ATTR_CTIME);
+
+	bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
+				      ATTR_CTIME);
+
+	if (dst_inode)
+		bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
+					      ATTR_CTIME);
+err:
+	bch2_trans_put(trans);
+
+	/*
+	 * Runs on success and failure alike: bring the quota ids back in line
+	 * with whatever the cached inodes (ei_inode) now say, without
+	 * enforcing limits.
+	 */
+	bch2_fs_quota_transfer(c, src_inode,
+			       bch_qid(&src_inode->ei_inode),
+			       1 << QTYP_PRJ,
+			       KEY_TYPE_QUOTA_NOCHECK);
+	if (dst_inode)
+		bch2_fs_quota_transfer(c, dst_inode,
+				       bch_qid(&dst_inode->ei_inode),
+				       1 << QTYP_PRJ,
+				       KEY_TYPE_QUOTA_NOCHECK);
+
+	bch2_unlock_inodes(INODE_UPDATE_LOCK,
+			   src_dir,
+			   dst_dir,
+			   src_inode,
+			   dst_inode);
+
+	return ret;
+}
+
+/*
+ * Copy the attributes selected by attr->ia_valid from @attr into the
+ * unpacked btree inode @bi. Only @bi is modified here; the VFS inode is
+ * read (for its user namespace and current gid) but not written.
+ */
+static void bch2_setattr_copy(struct mnt_idmap *idmap,
+			      struct bch_inode_info *inode,
+			      struct bch_inode_unpacked *bi,
+			      struct iattr *attr)
+{
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	unsigned int ia_valid = attr->ia_valid;
+
+	if (ia_valid & ATTR_UID)
+		bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
+	if (ia_valid & ATTR_GID)
+		bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
+
+	if (ia_valid & ATTR_SIZE)
+		bi->bi_size = attr->ia_size;
+
+	if (ia_valid & ATTR_ATIME)
+		bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
+	if (ia_valid & ATTR_MTIME)
+		bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
+	if (ia_valid & ATTR_CTIME)
+		bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
+
+	if (ia_valid & ATTR_MODE) {
+		umode_t mode = attr->ia_mode;
+		/* Use the gid being set in this same call, if any: */
+		kgid_t gid = ia_valid & ATTR_GID
+			? attr->ia_gid
+			: inode->v.i_gid;
+
+		/*
+		 * Drop setgid unless the caller is a member of the group or
+		 * holds CAP_FSETID over this inode:
+		 */
+		if (!in_group_p(gid) &&
+		    !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID))
+			mode &= ~S_ISGID;
+		bi->bi_mode = mode;
+	}
+}
+
+/*
+ * Apply a setattr request to @inode; bch2_setattr() routes requests that do
+ * not carry ATTR_SIZE here.
+ *
+ * With ei_update_lock held: quota is first transferred to the qid implied by
+ * the new uid/gid, then the btree inode is read, modified and committed in a
+ * transaction that is retried on transaction restart. On success the VFS
+ * inode (and the cached access ACL, if chmod produced one) is updated.
+ *
+ * Returns 0 or a negative error passed through bch2_err_class().
+ */
+int bch2_setattr_nonsize(struct mnt_idmap *idmap,
+			 struct bch_inode_info *inode,
+			 struct iattr *attr)
+{
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_qid qid;
+	struct btree_trans *trans;
+	struct btree_iter inode_iter = { NULL };
+	struct bch_inode_unpacked inode_u;
+	struct posix_acl *acl = NULL;
+	int ret;
+
+	mutex_lock(&inode->ei_update_lock);
+
+	/* Start from the current qid and override what this call changes: */
+	qid = inode->ei_qid;
+
+	if (attr->ia_valid & ATTR_UID)
+		qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
+
+	if (attr->ia_valid & ATTR_GID)
+		qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
+
+	ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
+				     KEY_TYPE_QUOTA_PREALLOC);
+	if (ret)
+		goto err;
+
+	trans = bch2_trans_get(c);
+retry:
+	bch2_trans_begin(trans);
+	/* Discard any ACL built by a previous, restarted attempt: */
+	kfree(acl);
+	acl = NULL;
+
+	ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
+			      BTREE_ITER_INTENT);
+	if (ret)
+		goto btree_err;
+
+	bch2_setattr_copy(idmap, inode, &inode_u, attr);
+
+	if (attr->ia_valid & ATTR_MODE) {
+		ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
+				     inode_u.bi_mode, &acl);
+		if (ret)
+			goto btree_err;
+	}
+
+	ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+		bch2_trans_commit(trans, NULL, NULL,
+				  BTREE_INSERT_NOFAIL);
+btree_err:
+	bch2_trans_iter_exit(trans, &inode_iter);
+
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		goto retry;
+	if (unlikely(ret))
+		goto err_trans;
+
+	bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);
+
+	if (acl)
+		set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
+err_trans:
+	bch2_trans_put(trans);
+err:
+	mutex_unlock(&inode->ei_update_lock);
+
+	return bch2_err_class(ret);
+}
+
+/*
+ * ->getattr: fill @stat from the in-memory VFS inode. The birth time is
+ * reported (from bi_otime) only when STATX_BTIME was requested; the
+ * immutable/append/nodump attributes are always advertised in
+ * attributes_mask and set according to the inode's bi_flags.
+ *
+ * Always returns 0.
+ */
+static int bch2_getattr(struct mnt_idmap *idmap,
+			const struct path *path, struct kstat *stat,
+			u32 request_mask, unsigned query_flags)
+{
+	struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+	stat->dev	= inode->v.i_sb->s_dev;
+	stat->ino	= inode->v.i_ino;
+	stat->mode	= inode->v.i_mode;
+	stat->nlink	= inode->v.i_nlink;
+	stat->uid	= inode->v.i_uid;
+	stat->gid	= inode->v.i_gid;
+	stat->rdev	= inode->v.i_rdev;
+	stat->size	= i_size_read(&inode->v);
+	stat->atime	= inode_get_atime(&inode->v);
+	stat->mtime	= inode_get_mtime(&inode->v);
+	stat->ctime	= inode_get_ctime(&inode->v);
+	stat->blksize	= block_bytes(c);
+	stat->blocks	= inode->v.i_blocks;
+
+	if (request_mask & STATX_BTIME) {
+		stat->result_mask |= STATX_BTIME;
+		stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
+	}
+
+	if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE)
+		stat->attributes |= STATX_ATTR_IMMUTABLE;
+	stat->attributes_mask	 |= STATX_ATTR_IMMUTABLE;
+
+	if (inode->ei_inode.bi_flags & BCH_INODE_APPEND)
+		stat->attributes |= STATX_ATTR_APPEND;
+	stat->attributes_mask	 |= STATX_ATTR_APPEND;
+
+	if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP)
+		stat->attributes |= STATX_ATTR_NODUMP;
+	stat->attributes_mask	 |= STATX_ATTR_NODUMP;
+
+	return 0;
+}
+
+/*
+ * ->setattr: validate the request with setattr_prepare(), then dispatch to
+ * bchfs_truncate() when the size is changing and to bch2_setattr_nonsize()
+ * otherwise. The caller must hold the inode's i_rwsem.
+ */
+static int bch2_setattr(struct mnt_idmap *idmap,
+			struct dentry *dentry, struct iattr *iattr)
+{
+	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+	int ret;
+
+	lockdep_assert_held(&inode->v.i_rwsem);
+
+	ret = setattr_prepare(idmap, dentry, iattr);
+	if (ret)
+		return ret;
+
+	return iattr->ia_valid & ATTR_SIZE
+		? bchfs_truncate(idmap, inode, iattr)
+		: bch2_setattr_nonsize(idmap, inode, iattr);
+}
+
+/*
+ * ->tmpfile: create an inode via __bch2_create() with BCH_CREATE_TMPFILE,
+ * attach it to the file's dentry and finish opening @file.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int bch2_tmpfile(struct mnt_idmap *idmap,
+			struct inode *vdir, struct file *file, umode_t mode)
+{
+	struct bch_inode_info *inode =
+		__bch2_create(idmap, to_bch_ei(vdir),
+			      file->f_path.dentry, mode, 0,
+			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
+
+	if (IS_ERR(inode))
+		return bch2_err_class(PTR_ERR(inode));
+
+	d_mark_tmpfile(file, &inode->v);
+	d_instantiate(file->f_path.dentry, &inode->v);
+	return finish_open_simple(file, 0);
+}
+
+/*
+ * Report the extent key @k to fiemap via fiemap_fill_next_extent().
+ *
+ * Direct data extents emit one fiemap record per decoded pointer; inline
+ * data and reservations emit a single record with physical offset 0. Any
+ * other key type is a BUG(). @flags is OR'd into every record emitted.
+ *
+ * Offsets and sizes are shifted left by 9 before being handed to fiemap
+ * (presumably 512-byte sectors to bytes - confirm).
+ *
+ * Returns the value of fiemap_fill_next_extent() (nonzero stops the walk),
+ * or 0.
+ */
+static int bch2_fill_extent(struct bch_fs *c,
+			    struct fiemap_extent_info *info,
+			    struct bkey_s_c k, unsigned flags)
+{
+	if (bkey_extent_is_direct_data(k.k)) {
+		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+		const union bch_extent_entry *entry;
+		struct extent_ptr_decoded p;
+		int ret;
+
+		if (k.k->type == KEY_TYPE_reflink_v)
+			flags |= FIEMAP_EXTENT_SHARED;
+
+		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+			int flags2 = 0;
+			u64 offset = p.ptr.offset;
+
+			if (p.ptr.unwritten)
+				flags2 |= FIEMAP_EXTENT_UNWRITTEN;
+
+			/*
+			 * Compressed extents are flagged as encoded; only for
+			 * uncompressed ones is the crc offset added to the
+			 * reported physical offset:
+			 */
+			if (p.crc.compression_type)
+				flags2 |= FIEMAP_EXTENT_ENCODED;
+			else
+				offset += p.crc.offset;
+
+			if ((offset & (block_sectors(c) - 1)) ||
+			    (k.k->size & (block_sectors(c) - 1)))
+				flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
+
+			ret = fiemap_fill_next_extent(info,
+						bkey_start_offset(k.k) << 9,
+						offset << 9,
+						k.k->size << 9, flags|flags2);
+			if (ret)
+				return ret;
+		}
+
+		return 0;
+	} else if (bkey_extent_is_inline_data(k.k)) {
+		return fiemap_fill_next_extent(info,
+					       bkey_start_offset(k.k) << 9,
+					       0, k.k->size << 9,
+					       flags|
+					       FIEMAP_EXTENT_DATA_INLINE);
+	} else if (k.k->type == KEY_TYPE_reservation) {
+		return fiemap_fill_next_extent(info,
+					       bkey_start_offset(k.k) << 9,
+					       0, k.k->size << 9,
+					       flags|
+					       FIEMAP_EXTENT_DELALLOC|
+					       FIEMAP_EXTENT_UNWRITTEN);
+	} else {
+		BUG();
+	}
+}
+
+/*
+ * ->fiemap: walk the extents btree for @vinode over [start, start + len) and
+ * report each data or reservation extent through bch2_fill_extent().
+ *
+ * Reporting runs one extent behind the walk: the current extent is stashed
+ * in @prev and only emitted once the next one is found, so that the final
+ * extent can be emitted with FIEMAP_EXTENT_LAST after the loop. The whole
+ * walk is restarted from the current position on transaction restart.
+ *
+ * Returns 0 on success (positive values from bch2_fill_extent() are folded
+ * to 0) or a negative error code.
+ */
+static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
+		       u64 start, u64 len)
+{
+	struct bch_fs *c = vinode->i_sb->s_fs_info;
+	struct bch_inode_info *ei = to_bch_ei(vinode);
+	struct btree_trans *trans;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bkey_buf cur, prev;
+	/* NOTE(review): computed before fiemap_prep() gets a chance to adjust len */
+	struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
+	unsigned offset_into_extent, sectors;
+	bool have_extent = false;
+	u32 snapshot;
+	int ret = 0;
+
+	ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
+	if (ret)
+		return ret;
+
+	/* Reject ranges that wrap around: */
+	if (start + len < start)
+		return -EINVAL;
+
+	start >>= 9;
+
+	bch2_bkey_buf_init(&cur);
+	bch2_bkey_buf_init(&prev);
+	trans = bch2_trans_get(c);
+retry:
+	bch2_trans_begin(trans);
+
+	ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
+	if (ret)
+		goto err;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+			     SPOS(ei->v.i_ino, start, snapshot), 0);
+
+	while (!(ret = btree_trans_too_many_iters(trans)) &&
+	       (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
+	       !(ret = bkey_err(k))) {
+		enum btree_id data_btree = BTREE_ID_extents;
+
+		/* Skip keys that are neither data nor reservations: */
+		if (!bkey_extent_is_data(k.k) &&
+		    k.k->type != KEY_TYPE_reservation) {
+			bch2_btree_iter_advance(&iter);
+			continue;
+		}
+
+		offset_into_extent	= iter.pos.offset -
+			bkey_start_offset(k.k);
+		sectors			= k.k->size - offset_into_extent;
+
+		bch2_bkey_buf_reassemble(&cur, c, k);
+
+		ret = bch2_read_indirect_extent(trans, &data_btree,
+					&offset_into_extent, &cur);
+		if (ret)
+			break;
+
+		k = bkey_i_to_s_c(cur.k);
+		bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
+
+		sectors = min(sectors, k.k->size - offset_into_extent);
+
+		/*
+		 * Trim cur down to the part starting at the iterator position
+		 * and rewrite its position in terms of this file's offsets:
+		 */
+		bch2_cut_front(POS(k.k->p.inode,
+				   bkey_start_offset(k.k) +
+				   offset_into_extent),
+			       cur.k);
+		bch2_key_resize(&cur.k->k, sectors);
+		cur.k->k.p = iter.pos;
+		cur.k->k.p.offset += cur.k->k.size;
+
+		/* Emit the previously stashed extent, now known not to be last: */
+		if (have_extent) {
+			bch2_trans_unlock(trans);
+			ret = bch2_fill_extent(c, info,
+					bkey_i_to_s_c(prev.k), 0);
+			if (ret)
+				break;
+		}
+
+		bkey_copy(prev.k, cur.k);
+		have_extent = true;
+
+		bch2_btree_iter_set_pos(&iter,
+			POS(iter.pos.inode, iter.pos.offset + sectors));
+	}
+	/* Remember how far we got so that a retry resumes from here: */
+	start = iter.pos.offset;
+	bch2_trans_iter_exit(trans, &iter);
+err:
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		goto retry;
+
+	if (!ret && have_extent) {
+		bch2_trans_unlock(trans);
+		ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
+				       FIEMAP_EXTENT_LAST);
+	}
+
+	bch2_trans_put(trans);
+	bch2_bkey_buf_exit(&cur, c);
+	bch2_bkey_buf_exit(&prev, c);
+	return ret < 0 ? ret : 0;
+}
+
+/* VMA operations installed on mappings by bch2_mmap(). */
+static const struct vm_operations_struct bch_vm_ops = {
+	.fault		= bch2_page_fault,
+	.map_pages	= filemap_map_pages,
+	.page_mkwrite   = bch2_page_mkwrite,
+};
+
+/* ->mmap: record the file access and install bch_vm_ops on the VMA. */
+static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	file_accessed(file);
+
+	vma->vm_ops = &bch_vm_ops;
+	return 0;
+}
+
+/* Directories: */
+
+/*
+ * ->llseek for directories: generic seek with both the maximum offset and
+ * the reported EOF set to S64_MAX.
+ */
+static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+	return generic_file_llseek_size(file, offset, whence,
+					S64_MAX, S64_MAX);
+}
+
+/*
+ * ->iterate_shared: emit "." and "..", then the directory's entries via
+ * bch2_readdir(). Errors from bch2_readdir() are logged and returned through
+ * bch2_err_class().
+ */
+static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct bch_inode_info *inode = file_bch_inode(file);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	int ret;
+
+	/* Nothing more to do if the dot entries could not be emitted: */
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+
+	ret = bch2_readdir(c, inode_inum(inode), ctx);
+	if (ret)
+		bch_err_fn(c, ret);
+
+	return bch2_err_class(ret);
+}
+
+/* File operations for regular files (installed for S_IFREG inodes). */
+static const struct file_operations bch_file_operations = {
+	.llseek		= bch2_llseek,
+	.read_iter	= bch2_read_iter,
+	.write_iter	= bch2_write_iter,
+	.mmap		= bch2_mmap,
+	.open		= generic_file_open,
+	.fsync		= bch2_fsync,
+	.splice_read	= filemap_splice_read,
+	.splice_write	= iter_file_splice_write,
+	.fallocate	= bch2_fallocate_dispatch,
+	.unlocked_ioctl = bch2_fs_file_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= bch2_compat_fs_ioctl,
+#endif
+	.remap_file_range = bch2_remap_file_range,
+};
+
+/* Inode operations for regular files (installed for S_IFREG inodes). */
+static const struct inode_operations bch_file_inode_operations = {
+	.getattr	= bch2_getattr,
+	.setattr	= bch2_setattr,
+	.fiemap		= bch2_fiemap,
+	.listxattr	= bch2_xattr_list,
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+	.get_acl	= bch2_get_acl,
+	.set_acl	= bch2_set_acl,
+#endif
+};
+
+/*
+ * Inode operations for directories (installed for S_IFDIR inodes). Note that
+ * ->unlink and ->rmdir share bch2_unlink().
+ */
+static const struct inode_operations bch_dir_inode_operations = {
+	.lookup		= bch2_lookup,
+	.create		= bch2_create,
+	.link		= bch2_link,
+	.unlink		= bch2_unlink,
+	.symlink	= bch2_symlink,
+	.mkdir		= bch2_mkdir,
+	.rmdir		= bch2_unlink,
+	.mknod		= bch2_mknod,
+	.rename		= bch2_rename2,
+	.getattr	= bch2_getattr,
+	.setattr	= bch2_setattr,
+	.tmpfile	= bch2_tmpfile,
+	.listxattr	= bch2_xattr_list,
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+	.get_acl	= bch2_get_acl,
+	.set_acl	= bch2_set_acl,
+#endif
+};
+
+/* File operations for open directories (installed for S_IFDIR inodes). */
+static const struct file_operations bch_dir_file_operations = {
+	.llseek		= bch2_dir_llseek,
+	.read		= generic_read_dir,
+	.iterate_shared	= bch2_vfs_readdir,
+	.fsync		= bch2_fsync,
+	.unlocked_ioctl = bch2_fs_file_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= bch2_compat_fs_ioctl,
+#endif
+};
+
+/* Inode operations for symlinks (installed for S_IFLNK inodes). */
+static const struct inode_operations bch_symlink_inode_operations = {
+	.get_link	= page_get_link,
+	.getattr	= bch2_getattr,
+	.setattr	= bch2_setattr,
+	.listxattr	= bch2_xattr_list,
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+	.get_acl	= bch2_get_acl,
+	.set_acl	= bch2_set_acl,
+#endif
+};
+
+/*
+ * Inode operations for every inode type that is not a regular file,
+ * directory or symlink.
+ */
+static const struct inode_operations bch_special_inode_operations = {
+	.getattr	= bch2_getattr,
+	.setattr	= bch2_setattr,
+	.listxattr	= bch2_xattr_list,
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+	.get_acl	= bch2_get_acl,
+	.set_acl	= bch2_set_acl,
+#endif
+};
+
+/* Page cache operations, installed on every inode's mapping. */
+static const struct address_space_operations bch_address_space_operations = {
+	.read_folio	= bch2_read_folio,
+	.writepages	= bch2_writepages,
+	.readahead	= bch2_readahead,
+	.dirty_folio	= filemap_dirty_folio,
+	.write_begin	= bch2_write_begin,
+	.write_end	= bch2_write_end,
+	.invalidate_folio = bch2_invalidate_folio,
+	.release_folio	= bch2_release_folio,
+	.direct_IO	= noop_direct_IO,
+#ifdef CONFIG_MIGRATION
+	.migrate_folio	= filemap_migrate_folio,
+#endif
+	.error_remove_page = generic_error_remove_page,
+};
+
+/*
+ * File handle payload identifying one inode: inode number, subvolume and
+ * generation. Packed; handle lengths are counted in u32 units.
+ */
+struct bcachefs_fid {
+	u64		inum;
+	u32		subvol;
+	u32		gen;
+} __packed;
+
+/* File handle payload carrying both the inode (fid) and its directory (dir). */
+struct bcachefs_fid_with_parent {
+	struct bcachefs_fid	fid;
+	struct bcachefs_fid	dir;
+} __packed;
+
+/*
+ * Check that a file handle of type @fh_type has exactly the length (in u32
+ * units) that bcachefs encodes for that type. Unknown types are invalid.
+ */
+static int bcachefs_fid_valid(int fh_len, int fh_type)
+{
+	if (fh_type == FILEID_BCACHEFS_WITHOUT_PARENT)
+		return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
+
+	if (fh_type == FILEID_BCACHEFS_WITH_PARENT)
+		return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
+
+	return false;
+}
+
+/* Build the file handle payload (inum, subvolume, generation) for @inode. */
+static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
+{
+	return (struct bcachefs_fid) {
+		.inum	= inode->ei_inode.bi_inum,
+		.subvol	= inode->ei_subvol,
+		.gen	= inode->ei_inode.bi_generation,
+	};
+}
+
+/*
+ * ->encode_fh: encode a file handle for @vinode into @fh.
+ *
+ * @len is, on entry, the space available in @fh in u32 units; on return it
+ * holds the number of u32s written, or the number required when the buffer
+ * is too small. A handle including the parent is produced when @vdir is
+ * given and @vinode is not a directory; otherwise only the inode itself is
+ * encoded.
+ *
+ * The size check is done per handle type, so a buffer large enough for a
+ * parentless handle is not rejected merely because it could not hold the
+ * with-parent form.
+ *
+ * Returns the FILEID_BCACHEFS_* type written, or FILEID_INVALID if @fh is
+ * too small.
+ */
+static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
+			  struct inode *vdir)
+{
+	struct bch_inode_info *inode	= to_bch_ei(vinode);
+	struct bch_inode_info *dir	= to_bch_ei(vdir);
+	int min_len;
+
+	if (!S_ISDIR(inode->v.i_mode) && dir) {
+		struct bcachefs_fid_with_parent *fid = (void *) fh;
+
+		min_len = sizeof(*fid) / sizeof(u32);
+		if (*len < min_len) {
+			/* Tell the caller how much space is needed: */
+			*len = min_len;
+			return FILEID_INVALID;
+		}
+
+		fid->fid = bch2_inode_to_fid(inode);
+		fid->dir = bch2_inode_to_fid(dir);
+
+		*len = min_len;
+		return FILEID_BCACHEFS_WITH_PARENT;
+	} else {
+		struct bcachefs_fid *fid = (void *) fh;
+
+		min_len = sizeof(*fid) / sizeof(u32);
+		if (*len < min_len) {
+			/* Tell the caller how much space is needed: */
+			*len = min_len;
+			return FILEID_INVALID;
+		}
+
+		*fid = bch2_inode_to_fid(inode);
+
+		*len = min_len;
+		return FILEID_BCACHEFS_WITHOUT_PARENT;
+	}
+}
+
+/*
+ * Look up the inode named by @fid. Returns the inode, an ERR_PTR from
+ * bch2_vfs_inode_get(), or ERR_PTR(-ESTALE) if the inode's generation no
+ * longer matches the one recorded in the handle.
+ */
+static struct inode *bch2_nfs_get_inode(struct super_block *sb,
+					struct bcachefs_fid fid)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
+				    .subvol = fid.subvol,
+				    .inum = fid.inum,
+	});
+	if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
+		iput(vinode);
+		vinode = ERR_PTR(-ESTALE);
+	}
+	return vinode;
+}
+
+/*
+ * ->fh_to_dentry: decode the inode part of a file handle. Returns NULL if
+ * the handle's length/type combination is not one bcachefs produces.
+ */
+static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
+		int fh_len, int fh_type)
+{
+	struct bcachefs_fid *fid = (void *) _fid;
+
+	if (!bcachefs_fid_valid(fh_len, fh_type))
+		return NULL;
+
+	return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
+}
+
+/*
+ * ->fh_to_parent: decode the parent directory part of a file handle. Only
+ * valid FILEID_BCACHEFS_WITH_PARENT handles carry one; anything else yields
+ * NULL.
+ */
+static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
+		int fh_len, int fh_type)
+{
+	struct bcachefs_fid_with_parent *fid = (void *) _fid;
+
+	if (!bcachefs_fid_valid(fh_len, fh_type) ||
+	    fh_type != FILEID_BCACHEFS_WITH_PARENT)
+		return NULL;
+
+	return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
+}
+
+/*
+ * ->get_parent: return a dentry for the directory recorded in the child's
+ * inode (bi_dir). The parent is looked up in bi_parent_subvol when that is
+ * set, otherwise in the child's own subvolume. Returns NULL when no parent
+ * directory is recorded.
+ */
+static struct dentry *bch2_get_parent(struct dentry *child)
+{
+	struct bch_inode_info *inode = to_bch_ei(child->d_inode);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	subvol_inum parent_inum = {
+		.subvol = inode->ei_inode.bi_parent_subvol ?:
+			inode->ei_subvol,
+		.inum = inode->ei_inode.bi_dir,
+	};
+
+	if (!parent_inum.inum)
+		return NULL;
+
+	return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
+}
+
+/*
+ * ->get_name: find the name under which @child is linked in @parent and
+ * copy it, NUL terminated and truncated to NAME_MAX bytes, into @name.
+ *
+ * If the inode's backpointer (bi_dir/bi_dir_offset) refers to @parent, the
+ * dirent at that position is checked directly. Otherwise the directory is
+ * searched linearly for a dirent whose target is the child.
+ *
+ * Errors hit while searching are propagated rather than being reported as
+ * -ENOENT, so that a transaction restart in the linear search is retried
+ * like any other.
+ *
+ * Returns 0 on success, -EINVAL if @parent is not a directory, -ENOENT if no
+ * matching dirent exists, or another negative error code.
+ */
+static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
+{
+	struct bch_inode_info *inode	= to_bch_ei(child->d_inode);
+	struct bch_inode_info *dir	= to_bch_ei(parent->d_inode);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct btree_trans *trans;
+	struct btree_iter iter1;
+	struct btree_iter iter2;
+	struct bkey_s_c k;
+	struct bkey_s_c_dirent d;
+	struct bch_inode_unpacked inode_u;
+	subvol_inum target;
+	u32 snapshot;
+	struct qstr dirent_name;
+	unsigned name_len = 0;
+	int ret;
+
+	if (!S_ISDIR(dir->v.i_mode))
+		return -EINVAL;
+
+	trans = bch2_trans_get(c);
+
+	/* iter1 serves the backpointer lookup, iter2 the linear search: */
+	bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
+			     POS(dir->ei_inode.bi_inum, 0), 0);
+	bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
+			     POS(dir->ei_inode.bi_inum, 0), 0);
+retry:
+	bch2_trans_begin(trans);
+
+	ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
+	if (ret)
+		goto err;
+
+	bch2_btree_iter_set_snapshot(&iter1, snapshot);
+	bch2_btree_iter_set_snapshot(&iter2, snapshot);
+
+	ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
+	if (ret)
+		goto err;
+
+	if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
+		bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
+
+		k = bch2_btree_iter_peek_slot(&iter1);
+		ret = bkey_err(k);
+		if (ret)
+			goto err;
+
+		if (k.k->type != KEY_TYPE_dirent) {
+			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+			goto err;
+		}
+
+		d = bkey_s_c_to_dirent(k);
+		ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
+		if (ret > 0)
+			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+		if (ret)
+			goto err;
+
+		if (target.subvol	== inode->ei_subvol &&
+		    target.inum		== inode->ei_inode.bi_inum)
+			goto found;
+	} else {
+		/*
+		 * File with multiple hardlinks and our backref is to the wrong
+		 * directory - linear search:
+		 */
+		for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
+			if (k.k->p.inode > dir->ei_inode.bi_inum)
+				break;
+
+			if (k.k->type != KEY_TYPE_dirent)
+				continue;
+
+			d = bkey_s_c_to_dirent(k);
+			ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
+			if (ret < 0)
+				break;
+			if (ret)
+				continue;
+
+			if (target.subvol	== inode->ei_subvol &&
+			    target.inum		== inode->ei_inode.bi_inum)
+				goto found;
+		}
+	}
+
+	/*
+	 * Not found. Keep a real error (including transaction restarts) from
+	 * the search above instead of overwriting it with -ENOENT:
+	 */
+	if (ret >= 0)
+		ret = -ENOENT;
+	goto err;
+found:
+	dirent_name = bch2_dirent_get_name(d);
+
+	name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
+	memcpy(name, dirent_name.name, name_len);
+	name[name_len] = '\0';
+err:
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		goto retry;
+
+	bch2_trans_iter_exit(trans, &iter1);
+	bch2_trans_iter_exit(trans, &iter2);
+	bch2_trans_put(trans);
+
+	return ret;
+}
+
+/* File handle export operations, installed as sb->s_export_op at mount. */
+static const struct export_operations bch_export_ops = {
+	.encode_fh	= bch2_encode_fh,
+	.fh_to_dentry	= bch2_fh_to_dentry,
+	.fh_to_parent	= bch2_fh_to_parent,
+	.get_parent	= bch2_get_parent,
+	.get_name	= bch2_get_name,
+};
+
+/*
+ * Initialise the VFS inode @inode from the unpacked btree inode @bi:
+ * copy the on-disk fields across, reset the bcachefs-private state, and
+ * install the operations tables matching the inode's file type.
+ *
+ * @inum supplies the subvolume the inode was looked up in, and @subvol that
+ * subvolume's record, which decides whether EI_INODE_SNAPSHOT is set.
+ */
+static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
+				struct bch_inode_info *inode,
+				struct bch_inode_unpacked *bi,
+				struct bch_subvolume *subvol)
+{
+	bch2_inode_update_after_write(trans, inode, bi, ~0);
+
+	inode->v.i_blocks	= bi->bi_sectors;
+	inode->v.i_ino		= bi->bi_inum;
+	inode->v.i_rdev		= bi->bi_dev;
+	inode->v.i_generation	= bi->bi_generation;
+	inode->v.i_size		= bi->bi_size;
+
+	inode->ei_flags		= 0;
+	inode->ei_quota_reserved = 0;
+	inode->ei_qid		= bch_qid(bi);
+	inode->ei_subvol	= inum.subvol;
+
+	/*
+	 * Must come after ei_flags is zeroed above, or the snapshot bit would
+	 * be wiped again straight away:
+	 */
+	if (BCH_SUBVOLUME_SNAP(subvol))
+		set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
+
+	inode->v.i_mapping->a_ops = &bch_address_space_operations;
+
+	switch (inode->v.i_mode & S_IFMT) {
+	case S_IFREG:
+		inode->v.i_op	= &bch_file_inode_operations;
+		inode->v.i_fop	= &bch_file_operations;
+		break;
+	case S_IFDIR:
+		inode->v.i_op	= &bch_dir_inode_operations;
+		inode->v.i_fop	= &bch_dir_file_operations;
+		break;
+	case S_IFLNK:
+		inode_nohighmem(&inode->v);
+		inode->v.i_op	= &bch_symlink_inode_operations;
+		break;
+	default:
+		init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
+		inode->v.i_op	= &bch_special_inode_operations;
+		break;
+	}
+
+	mapping_set_large_folios(inode->v.i_mapping);
+}
+
+/*
+ * ->alloc_inode: allocate a bch_inode_info from bch2_inode_cache and
+ * initialise the embedded VFS inode plus the bcachefs locks and list head.
+ * Returns the embedded VFS inode, or NULL on allocation failure.
+ */
+static struct inode *bch2_alloc_inode(struct super_block *sb)
+{
+	struct bch_inode_info *inode;
+
+	inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
+	if (!inode)
+		return NULL;
+
+	inode_init_once(&inode->v);
+	mutex_init(&inode->ei_update_lock);
+	two_state_lock_init(&inode->ei_pagecache_lock);
+	INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
+	mutex_init(&inode->ei_quota_lock);
+
+	return &inode->v;
+}
+
+/* RCU callback queued by bch2_destroy_inode(): free the bch_inode_info. */
+static void bch2_i_callback(struct rcu_head *head)
+{
+	struct inode *vinode = container_of(head, struct inode, i_rcu);
+	struct bch_inode_info *inode = to_bch_ei(vinode);
+
+	kmem_cache_free(bch2_inode_cache, inode);
+}
+
+/* ->destroy_inode: defer freeing the inode to an RCU callback. */
+static void bch2_destroy_inode(struct inode *vinode)
+{
+	call_rcu(&vinode->i_rcu, bch2_i_callback);
+}
+
+/*
+ * bch2_write_inode() callback used by bch2_vfs_write_inode(): copy the VFS
+ * inode's atime/mtime/ctime into the unpacked btree inode @bi. @p is unused.
+ * Always returns 0.
+ */
+static int inode_update_times_fn(struct btree_trans *trans,
+				 struct bch_inode_info *inode,
+				 struct bch_inode_unpacked *bi,
+				 void *p)
+{
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+	bi->bi_atime	= timespec_to_bch2_time(c, inode_get_atime(&inode->v));
+	bi->bi_mtime	= timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
+	bi->bi_ctime	= timespec_to_bch2_time(c, inode_get_ctime(&inode->v));
+
+	return 0;
+}
+
+/*
+ * ->write_inode: write the VFS inode's timestamps back to the btree inode,
+ * under ei_update_lock. @wbc is not consulted.
+ */
+static int bch2_vfs_write_inode(struct inode *vinode,
+				struct writeback_control *wbc)
+{
+	struct bch_fs *c = vinode->i_sb->s_fs_info;
+	struct bch_inode_info *inode = to_bch_ei(vinode);
+	int ret;
+
+	mutex_lock(&inode->ei_update_lock);
+	ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+			       ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
+	mutex_unlock(&inode->ei_update_lock);
+
+	return bch2_err_class(ret);
+}
+
+/*
+ * ->evict_inode: drop the inode's page cache and, if it has no links left
+ * (and is not a bad inode), release its quota usage and remove it from the
+ * btree. Finally unlink it from the filesystem's list of VFS inodes.
+ */
+static void bch2_evict_inode(struct inode *vinode)
+{
+	struct bch_fs *c = vinode->i_sb->s_fs_info;
+	struct bch_inode_info *inode = to_bch_ei(vinode);
+
+	truncate_inode_pages_final(&inode->v.i_data);
+
+	clear_inode(&inode->v);
+
+	BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
+
+	if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
+		/* Give back the space and inode quota this inode was charged: */
+		bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
+				KEY_TYPE_QUOTA_WARN);
+		bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
+				KEY_TYPE_QUOTA_WARN);
+		bch2_inode_rm(c, inode_inum(inode));
+	}
+
+	mutex_lock(&c->vfs_inodes_lock);
+	list_del_init(&inode->ei_vfs_inode_list);
+	mutex_unlock(&c->vfs_inodes_lock);
+}
+
+/*
+ * Push every cached VFS inode whose subvolume is in @s out of the inode
+ * cache, returning only after two consecutive passes over the inode list
+ * found nothing left to grab.
+ */
+void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
+{
+	struct bch_inode_info *inode, **i;
+	DARRAY(struct bch_inode_info *) grabbed;
+	bool clean_pass = false, this_pass_clean;
+
+	/*
+	 * Initially, we scan for inodes without I_DONTCACHE, then mark them to
+	 * be pruned with d_mark_dontcache().
+	 *
+	 * Once we've had a clean pass where we didn't find any inodes without
+	 * I_DONTCACHE, we wait for them to be freed:
+	 */
+
+	darray_init(&grabbed);
+	darray_make_room(&grabbed, 1024);
+again:
+	cond_resched();
+	this_pass_clean = true;
+
+	mutex_lock(&c->vfs_inodes_lock);
+	list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
+		if (!snapshot_list_has_id(s, inode->ei_subvol))
+			continue;
+
+		if (!(inode->v.i_state & I_DONTCACHE) &&
+		    !(inode->v.i_state & I_FREEING) &&
+		    igrab(&inode->v)) {
+			this_pass_clean = false;
+
+			/*
+			 * Allocation happens under vfs_inodes_lock, hence
+			 * GFP_ATOMIC; if the array can't grow, drop this
+			 * reference and process what was collected so far:
+			 */
+			if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
+				iput(&inode->v);
+				break;
+			}
+		} else if (clean_pass && this_pass_clean) {
+			/* Sleep on the inode's __I_NEW bit waitqueue, then rescan: */
+			wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
+			DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
+
+			prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+			mutex_unlock(&c->vfs_inodes_lock);
+
+			schedule();
+			finish_wait(wq, &wait.wq_entry);
+			goto again;
+		}
+	}
+	mutex_unlock(&c->vfs_inodes_lock);
+
+	/* Outside the lock: mark, prune and release everything we grabbed: */
+	darray_for_each(grabbed, i) {
+		inode = *i;
+		d_mark_dontcache(&inode->v);
+		d_prune_aliases(&inode->v);
+		iput(&inode->v);
+	}
+	grabbed.nr = 0;
+
+	if (!clean_pass || !this_pass_clean) {
+		clean_pass = this_pass_clean;
+		goto again;
+	}
+
+	darray_exit(&grabbed);
+}
+
+/*
+ * ->statfs: report capacity and free space from the filesystem's short usage
+ * summary, scaled to the superblock's block size, plus an estimate of the
+ * number of inodes that could still be created. The fsid is derived by
+ * XORing the two 64-bit halves of the user UUID. Always returns 0.
+ */
+static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct bch_fs *c = sb->s_fs_info;
+	struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
+	/* usage is shifted right by this to get filesystem blocks: */
+	unsigned shift = sb->s_blocksize_bits - 9;
+	/*
+	 * this assumes inodes take up 64 bytes, which is a decent average
+	 * number:
+	 */
+	u64 avail_inodes = ((usage.capacity - usage.used) << 3);
+	u64 fsid;
+
+	buf->f_type	= BCACHEFS_STATFS_MAGIC;
+	buf->f_bsize	= sb->s_blocksize;
+	buf->f_blocks	= usage.capacity >> shift;
+	buf->f_bfree	= usage.free >> shift;
+	buf->f_bavail	= avail_factor(usage.free) >> shift;
+
+	buf->f_files	= usage.nr_inodes + avail_inodes;
+	buf->f_ffree	= avail_inodes;
+
+	fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
+	       le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
+	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
+	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+	buf->f_namelen	= BCH_NAME_MAX;
+
+	return 0;
+}
+
+/*
+ * ->sync_fs: flush the journal. Does nothing when the journal_flush_disabled
+ * option is set; with @wait == 0 the flush is only kicked off
+ * asynchronously and 0 is returned.
+ */
+static int bch2_sync_fs(struct super_block *sb, int wait)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	int ret;
+
+	if (c->opts.journal_flush_disabled)
+		return 0;
+
+	if (!wait) {
+		bch2_journal_flush_async(&c->journal, NULL);
+		return 0;
+	}
+
+	ret = bch2_journal_flush(&c->journal);
+	return bch2_err_class(ret);
+}
+
+/*
+ * Map a block device path to the bch_fs that bch2_dev_to_fs() reports for
+ * it. Never returns NULL: the result is the filesystem, the error from
+ * lookup_bdev(), or ERR_PTR(-ENOENT) when no filesystem was found.
+ *
+ * The closure ref on c->cl is put before returning (presumably one taken by
+ * bch2_dev_to_fs() - confirm), so the caller gets the pointer without a
+ * reference of its own.
+ */
+static struct bch_fs *bch2_path_to_fs(const char *path)
+{
+	struct bch_fs *c;
+	dev_t dev;
+	int ret;
+
+	ret = lookup_bdev(path, &dev);
+	if (ret)
+		return ERR_PTR(ret);
+
+	c = bch2_dev_to_fs(dev);
+	if (c)
+		closure_put(&c->cl);
+	return c ?: ERR_PTR(-ENOENT);
+}
+
+/*
+ * Split a colon separated device list into an array of strings.
+ *
+ * The input is duplicated and split in place, so every element points into
+ * one allocation whose start is devs[0]; the caller frees devs[0] and then
+ * the array itself. The number of devices is stored in @nr.
+ *
+ * Returns NULL on allocation failure.
+ */
+static char **split_devs(const char *_dev_name, unsigned *nr)
+{
+	char *dev_name = NULL, **devs = NULL, *s;
+	size_t i = 0, nr_devs = 0;
+
+	dev_name = kstrdup(_dev_name, GFP_KERNEL);
+	if (!dev_name)
+		return NULL;
+
+	/* Count the entries: one, plus one per ':' after the first character */
+	for (s = dev_name; s; s = strchr(s + 1, ':'))
+		nr_devs++;
+
+	devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL);
+	if (!devs) {
+		kfree(dev_name);
+		return NULL;
+	}
+
+	/* strsep() advances dev_name; the allocation stays reachable via devs[0] */
+	while ((s = strsep(&dev_name, ":")))
+		devs[i++] = s;
+
+	*nr = nr_devs;
+	return devs;
+}
+
+/*
+ * ->remount_fs: parse the new mount options and apply the ones handled
+ * here: a read-only/read-write transition (performed under state_lock) and
+ * the errors option. A failure to go read-write is logged and reported as
+ * -EINVAL.
+ */
+static int bch2_remount(struct super_block *sb, int *flags, char *data)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	struct bch_opts opts = bch2_opts_empty();
+	int ret;
+
+	opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
+
+	ret = bch2_parse_mount_opts(c, &opts, data);
+	if (ret)
+		goto err;
+
+	if (opts.read_only != c->opts.read_only) {
+		down_write(&c->state_lock);
+
+		if (opts.read_only) {
+			bch2_fs_read_only(c);
+
+			sb->s_flags |= SB_RDONLY;
+		} else {
+			ret = bch2_fs_read_write(c);
+			if (ret) {
+				bch_err(c, "error going rw: %i", ret);
+				up_write(&c->state_lock);
+				ret = -EINVAL;
+				goto err;
+			}
+
+			sb->s_flags &= ~SB_RDONLY;
+		}
+
+		c->opts.read_only = opts.read_only;
+
+		up_write(&c->state_lock);
+	}
+
+	if (opt_defined(opts, errors))
+		c->opts.errors = opts.errors;
+err:
+	return bch2_err_class(ret);
+}
+
+/*
+ * ->show_devname: print the online member devices as a colon separated list
+ * of "/dev/<name>" paths. Always returns 0.
+ */
+static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
+{
+	struct bch_fs *c = root->d_sb->s_fs_info;
+	struct bch_dev *ca;
+	unsigned i;
+	bool first = true;
+
+	for_each_online_member(ca, c, i) {
+		if (!first)
+			seq_putc(seq, ':');
+		first = false;
+		seq_puts(seq, "/dev/");
+		seq_puts(seq, ca->name);
+	}
+
+	return 0;
+}
+
+/*
+ * ->show_options: print, comma prefixed, every mount option (OPT_MOUNT)
+ * whose current value differs from the default. Returns -ENOMEM if the
+ * print buffer hit an allocation failure, otherwise 0.
+ */
+static int bch2_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct bch_fs *c = root->d_sb->s_fs_info;
+	enum bch_opt_id i;
+	struct printbuf buf = PRINTBUF;
+	int ret = 0;
+
+	for (i = 0; i < bch2_opts_nr; i++) {
+		const struct bch_option *opt = &bch2_opt_table[i];
+		u64 v = bch2_opt_get_by_id(&c->opts, i);
+
+		if (!(opt->flags & OPT_MOUNT))
+			continue;
+
+		if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
+			continue;
+
+		printbuf_reset(&buf);
+		bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
+				 OPT_SHOW_MOUNT_STYLE);
+		seq_putc(seq, ',');
+		seq_puts(seq, buf.buf);
+	}
+
+	if (buf.allocation_failure)
+		ret = -ENOMEM;
+	printbuf_exit(&buf);
+	return ret;
+}
+
+/* ->put_super: stop the filesystem via __bch2_fs_stop(). */
+static void bch2_put_super(struct super_block *sb)
+{
+	struct bch_fs *c = sb->s_fs_info;
+
+	__bch2_fs_stop(c);
+}
+
+/*
+ * bcachefs doesn't currently integrate intwrite freeze protection but the
+ * internal write references serve the same purpose. Therefore reuse the
+ * read-only transition code to perform the quiesce. The caveat is that we don't
+ * currently have the ability to block tasks that want a write reference while
+ * the superblock is frozen. This is fine for now, but we should either add
+ * blocking support or find a way to integrate sb_start_intwrite() and friends.
+ */
+/* ->freeze_fs: take the filesystem read-only under state_lock. Returns 0. */
+static int bch2_freeze(struct super_block *sb)
+{
+	struct bch_fs *c = sb->s_fs_info;
+
+	down_write(&c->state_lock);
+	bch2_fs_read_only(c);
+	up_write(&c->state_lock);
+	return 0;
+}
+
+/*
+ * ->unfreeze_fs: take the filesystem read-write again under state_lock.
+ * Returns the result of bch2_fs_read_write().
+ */
+static int bch2_unfreeze(struct super_block *sb)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	int ret;
+
+	down_write(&c->state_lock);
+	ret = bch2_fs_read_write(c);
+	up_write(&c->state_lock);
+	return ret;
+}
+
+/* Superblock operations, installed as sb->s_op at mount. */
+static const struct super_operations bch_super_operations = {
+	.alloc_inode	= bch2_alloc_inode,
+	.destroy_inode	= bch2_destroy_inode,
+	.write_inode	= bch2_vfs_write_inode,
+	.evict_inode	= bch2_evict_inode,
+	.sync_fs	= bch2_sync_fs,
+	.statfs		= bch2_statfs,
+	.show_devname	= bch2_show_devname,
+	.show_options	= bch2_show_options,
+	.remount_fs	= bch2_remount,
+	.put_super	= bch2_put_super,
+	.freeze_fs	= bch2_freeze,
+	.unfreeze_fs	= bch2_unfreeze,
+};
+
+/* sget() "set" callback: attach the bch_fs passed as @data to the new sb. */
+static int bch2_set_super(struct super_block *s, void *data)
+{
+	s->s_fs_info = data;
+	return 0;
+}
+
+/*
+ * sget() "set" callback that always fails with -EBUSY; bch2_mount() uses it
+ * for the sget() call that is only meant to find an existing superblock.
+ */
+static int bch2_noset_super(struct super_block *s, void *data)
+{
+	return -EBUSY;
+}
+
+/*
+ * sget() "test" callback. @data is a NULL terminated array of bch_fs
+ * pointers; the superblock matches only if it has a filesystem attached and
+ * every array entry is that same filesystem.
+ */
+static int bch2_test_super(struct super_block *s, void *data)
+{
+	struct bch_fs *fs = s->s_fs_info;
+	struct bch_fs **wanted;
+
+	if (!fs)
+		return false;
+
+	for (wanted = data; *wanted; wanted++) {
+		if (*wanted != fs)
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * ->mount: mount the filesystem made up of the colon separated devices in
+ * @dev_name.
+ *
+ * An already mounted superblock covering all of the devices is reused if
+ * sget() finds one; otherwise the filesystem is opened, the mount options
+ * are parsed a second time against it, and a new superblock is created and
+ * set up.
+ *
+ * Returns a reference to the root dentry or an ERR_PTR.
+ */
+static struct dentry *bch2_mount(struct file_system_type *fs_type,
+				 int flags, const char *dev_name, void *data)
+{
+	struct bch_fs *c;
+	struct bch_dev *ca;
+	struct super_block *sb;
+	struct inode *vinode;
+	struct bch_opts opts = bch2_opts_empty();
+	char **devs;
+	struct bch_fs **devs_to_fs = NULL;
+	unsigned i, nr_devs;
+	int ret;
+
+	opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
+
+	ret = bch2_parse_mount_opts(NULL, &opts, data);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (!dev_name || strlen(dev_name) == 0)
+		return ERR_PTR(-EINVAL);
+
+	devs = split_devs(dev_name, &nr_devs);
+	if (!devs)
+		return ERR_PTR(-ENOMEM);
+
+	/* NULL terminated (kcalloc'd with one spare slot) for bch2_test_super(): */
+	devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL);
+	if (!devs_to_fs) {
+		sb = ERR_PTR(-ENOMEM);
+		goto got_sb;
+	}
+
+	for (i = 0; i < nr_devs; i++)
+		devs_to_fs[i] = bch2_path_to_fs(devs[i]);
+
+	/*
+	 * Look for an existing superblock only: bch2_noset_super() makes
+	 * sget() fail rather than create a new one.
+	 */
+	sb = sget(fs_type, bch2_test_super, bch2_noset_super,
+		  flags|SB_NOSEC, devs_to_fs);
+	if (!IS_ERR(sb))
+		goto got_sb;
+
+	c = bch2_fs_open(devs, nr_devs, opts);
+	if (IS_ERR(c)) {
+		sb = ERR_CAST(c);
+		goto got_sb;
+	}
+
+	/* Some options can't be parsed until after the fs is started: */
+	ret = bch2_parse_mount_opts(c, &opts, data);
+	if (ret) {
+		bch2_fs_stop(c);
+		sb = ERR_PTR(ret);
+		goto got_sb;
+	}
+
+	bch2_opts_apply(&c->opts, opts);
+
+	sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c);
+	if (IS_ERR(sb))
+		bch2_fs_stop(c);
+got_sb:
+	/* devs[0] is the start of the single string split_devs() allocated: */
+	kfree(devs_to_fs);
+	kfree(devs[0]);
+	kfree(devs);
+
+	if (IS_ERR(sb)) {
+		ret = PTR_ERR(sb);
+		ret = bch2_err_class(ret);
+		return ERR_PTR(ret);
+	}
+
+	c = sb->s_fs_info;
+
+	/* Existing mount: only the read-only state has to agree. */
+	if (sb->s_root) {
+		if ((flags ^ sb->s_flags) & SB_RDONLY) {
+			ret = -EBUSY;
+			goto err_put_super;
+		}
+		goto out;
+	}
+
+	sb->s_blocksize		= block_bytes(c);
+	sb->s_blocksize_bits	= ilog2(block_bytes(c));
+	sb->s_maxbytes		= MAX_LFS_FILESIZE;
+	sb->s_op		= &bch_super_operations;
+	sb->s_export_op		= &bch_export_ops;
+#ifdef CONFIG_BCACHEFS_QUOTA
+	sb->s_qcop		= &bch2_quotactl_operations;
+	sb->s_quota_types	= QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
+#endif
+	sb->s_xattr		= bch2_xattr_handlers;
+	sb->s_magic		= BCACHEFS_STATFS_MAGIC;
+	sb->s_time_gran		= c->sb.nsec_per_time_unit;
+	sb->s_time_min		= div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
+	sb->s_time_max		= div_s64(S64_MAX, c->sb.time_units_per_sec);
+	c->vfs_sb		= sb;
+	strscpy(sb->s_id, c->name, sizeof(sb->s_id));
+
+	ret = super_setup_bdi(sb);
+	if (ret)
+		goto err_put_super;
+
+	sb->s_bdi->ra_pages		= VM_READAHEAD_PAGES;
+
+	/* Use the first online member as the superblock's device: */
+	for_each_online_member(ca, c, i) {
+		struct block_device *bdev = ca->disk_sb.bdev;
+
+		/* XXX: create an anonymous device for multi device filesystems */
+		sb->s_bdev	= bdev;
+		sb->s_dev	= bdev->bd_dev;
+		percpu_ref_put(&ca->io_ref);
+		break;
+	}
+
+	c->dev = sb->s_dev;
+
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+	if (c->opts.acl)
+		sb->s_flags	|= SB_POSIXACL;
+#endif
+
+	sb->s_shrink.seeks = 0;
+
+	vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
+	ret = PTR_ERR_OR_ZERO(vinode);
+	if (ret) {
+		bch_err_msg(c, ret, "mounting: error getting root inode");
+		goto err_put_super;
+	}
+
+	sb->s_root = d_make_root(vinode);
+	if (!sb->s_root) {
+		bch_err(c, "error mounting: error allocating root dentry");
+		ret = -ENOMEM;
+		goto err_put_super;
+	}
+
+	sb->s_flags |= SB_ACTIVE;
+out:
+	return dget(sb->s_root);
+
+err_put_super:
+	/*
+	 * NOTE(review): this label is also reached from the already-mounted
+	 * (sb->s_root) case above; confirm that detaching and stopping the
+	 * filesystem is intended for a superblock that is in use.
+	 */
+	sb->s_fs_info = NULL;
+	c->vfs_sb = NULL;
+	deactivate_locked_super(sb);
+	bch2_fs_stop(c);
+	return ERR_PTR(bch2_err_class(ret));
+}
+
+/*
+ * ->kill_sb: detach the superblock from the filesystem, shut the superblock
+ * down, then free the filesystem. s_fs_info may be NULL here (bch2_mount()
+ * clears it on its error path), in which case only the generic shutdown
+ * runs.
+ */
+static void bch2_kill_sb(struct super_block *sb)
+{
+	struct bch_fs *c = sb->s_fs_info;
+
+	if (c)
+		c->vfs_sb = NULL;
+	generic_shutdown_super(sb);
+	if (c)
+		bch2_fs_free(c);
+}
+
+/* The "bcachefs" filesystem type registered by bch2_vfs_init(). */
+static struct file_system_type bcache_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "bcachefs",
+	.mount		= bch2_mount,
+	.kill_sb	= bch2_kill_sb,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+
+MODULE_ALIAS_FS("bcachefs");
+
+/*
+ * Unregister the filesystem type and destroy the inode cache. Also used as
+ * the error path of bch2_vfs_init().
+ */
+void bch2_vfs_exit(void)
+{
+	unregister_filesystem(&bcache_fs_type);
+	kmem_cache_destroy(bch2_inode_cache);
+}
+
+/*
+ * Create the inode cache and register the filesystem type. On any failure
+ * everything is torn down again through bch2_vfs_exit().
+ *
+ * Returns 0 on success, -ENOMEM if the cache could not be created, or the
+ * error from register_filesystem().
+ */
+int __init bch2_vfs_init(void)
+{
+	int ret = -ENOMEM;
+
+	bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT);
+	if (!bch2_inode_cache)
+		goto err;
+
+	ret = register_filesystem(&bcache_fs_type);
+	if (ret)
+		goto err;
+
+	return 0;
+err:
+	bch2_vfs_exit();
+	return ret;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
new file mode 100644
index 000000000000..5edf1d4b9e6b
--- /dev/null
+++ b/fs/bcachefs/fs.h
@@ -0,0 +1,209 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_H
+#define _BCACHEFS_FS_H
+
+#include "inode.h"
+#include "opts.h"
+#include "str_hash.h"
+#include "quota_types.h"
+#include "two_state_shared_lock.h"
+
+#include <linux/seqlock.h>
+#include <linux/stat.h>
+
+struct bch_inode_info {
+	struct inode		v;	/* embedded VFS inode; to_bch_ei() maps it back to this struct */
+	struct list_head	ei_vfs_inode_list;
+	unsigned long		ei_flags;	/* EI_INODE_* flags */
+
+	struct mutex		ei_update_lock;
+	u64			ei_quota_reserved;
+	unsigned long		ei_last_dirtied;
+	two_state_lock_t	ei_pagecache_lock;	/* taken via the bch2_pagecache_{add,block}_* macros */
+
+	struct mutex		ei_quota_lock;
+	struct bch_qid		ei_qid;
+
+	u32			ei_subvol;
+
+	/*
+	 * When we've been doing nocow writes we'll need to issue flushes to the
+	 * underlying block devices
+	 *
+	 * XXX: a device may have had a flush issued by some other codepath. It
+	 * would be better to keep for each device a sequence number that's
+	 * incremented when we issue a cache flush, and track here the sequence
+	 * number that needs flushing.
+	 */
+	struct bch_devs_mask	ei_devs_need_flush;
+
+	/* copy of inode in btree: */
+	struct bch_inode_unpacked ei_inode;
+};
+
+#define bch2_pagecache_add_put(i)	bch2_two_state_unlock(&(i)->ei_pagecache_lock, 0)
+#define bch2_pagecache_add_tryget(i)	bch2_two_state_trylock(&(i)->ei_pagecache_lock, 0)
+#define bch2_pagecache_add_get(i)	bch2_two_state_lock(&(i)->ei_pagecache_lock, 0)
+/* "add" uses lock state 0, "block" uses state 1; @i is parenthesized so any pointer expression expands correctly */
+#define bch2_pagecache_block_put(i)	bch2_two_state_unlock(&(i)->ei_pagecache_lock, 1)
+#define bch2_pagecache_block_get(i)	bch2_two_state_lock(&(i)->ei_pagecache_lock, 1)
+
+static inline subvol_inum inode_inum(struct bch_inode_info *inode)
+{
+	/* (subvolume, inode number) pair built from the cached inode state */
+	subvol_inum inum = { .subvol = inode->ei_subvol };
+
+	inum.inum = inode->ei_inode.bi_inum;
+	return inum;
+}
+
+/*
+ * Set if we've gotten a btree error for this inode, and thus the vfs inode and
+ * btree inode may be inconsistent:
+ */
+#define EI_INODE_ERROR			0
+
+/*
+ * Set if the inode is in a snapshot subvolume - we don't do quota accounting in
+ * those:
+ */
+#define EI_INODE_SNAPSHOT		1
+
+#define to_bch_ei(_inode)					\
+	container_of_or_null(_inode, struct bch_inode_info, v)
+
+static inline int ptrcmp(void *l, void *r)
+{
+	return cmp_int(l, r);	/* compares the pointer values; sort comparator for the inode lock macros */
+}
+
+enum bch_inode_lock_op {
+	INODE_LOCK		= (1U << 0),
+	INODE_PAGECACHE_BLOCK	= (1U << 1),
+	INODE_UPDATE_LOCK	= (1U << 2),
+};
+
+#define bch2_lock_inodes(_locks, ...)					\
+do {									\
+	struct bch_inode_info *a[] = { NULL, __VA_ARGS__ };		\
+	unsigned i;							\
+	/* a[0] is a NULL sentinel; the rest is sorted by address for a stable lock order */	\
+	bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp);			\
+	/* an entry equal to its predecessor (duplicate, or NULL next to the sentinel) is skipped */	\
+	for (i = 1; i < ARRAY_SIZE(a); i++)				\
+		if (a[i] != a[i - 1]) {					\
+			if ((_locks) & INODE_LOCK)			\
+				down_write_nested(&a[i]->v.i_rwsem, i);	\
+			if ((_locks) & INODE_PAGECACHE_BLOCK)		\
+				bch2_pagecache_block_get(a[i]);\
+			if ((_locks) & INODE_UPDATE_LOCK)			\
+				mutex_lock_nested(&a[i]->ei_update_lock, i);\
+		}							\
+} while (0)
+
+#define bch2_unlock_inodes(_locks, ...)					\
+do {									\
+	struct bch_inode_info *a[] = { NULL, __VA_ARGS__ };		\
+	unsigned i;							\
+	/* same NULL sentinel, address sort and duplicate skipping as bch2_lock_inodes */	\
+	bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp);			\
+									\
+	for (i = 1; i < ARRAY_SIZE(a); i++)				\
+		if (a[i] != a[i - 1]) {					\
+			if ((_locks) & INODE_LOCK)			\
+				up_write(&a[i]->v.i_rwsem);		\
+			if ((_locks) & INODE_PAGECACHE_BLOCK)		\
+				bch2_pagecache_block_put(a[i]);\
+			if ((_locks) & INODE_UPDATE_LOCK)			\
+				mutex_unlock(&a[i]->ei_update_lock);	\
+		}							\
+} while (0)
+
+static inline struct bch_inode_info *file_bch_inode(struct file *file)
+{
+	return to_bch_ei(file_inode(file));
+}
+
+static inline bool inode_attr_changing(struct bch_inode_info *dir,
+				struct bch_inode_info *inode,
+				enum inode_opt_id id)
+{
+	return !(inode->ei_inode.bi_fields_set & (1 << id)) &&	/* bit @id is clear in the inode's bi_fields_set */
+		bch2_inode_opt_get(&dir->ei_inode, id) !=	/* and @dir's value for the option differs */
+		bch2_inode_opt_get(&inode->ei_inode, id);
+}
+
+static inline bool inode_attrs_changing(struct bch_inode_info *dir,
+				 struct bch_inode_info *inode)
+{
+	unsigned opt = 0;
+
+	/* stop at the first option for which inode_attr_changing() is true */
+	while (opt < Inode_opt_nr && !inode_attr_changing(dir, inode, opt))
+		opt++;
+
+	return opt < Inode_opt_nr;
+}
+
+struct bch_inode_unpacked;
+
+#ifndef NO_BCACHEFS_FS
+
+struct bch_inode_info *
+__bch2_create(struct mnt_idmap *, struct bch_inode_info *,
+	      struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
+
+int bch2_fs_quota_transfer(struct bch_fs *,
+			   struct bch_inode_info *,
+			   struct bch_qid,
+			   unsigned,
+			   enum quota_acct_mode);
+
+static inline int bch2_set_projid(struct bch_fs *c,
+				  struct bch_inode_info *inode,
+				  u32 projid)
+{
+	struct bch_qid qid = inode->ei_qid;	/* start from the inode's current ids */
+
+	qid.q[QTYP_PRJ] = projid;	/* only the project id changes */
+
+	return bch2_fs_quota_transfer(c, inode, qid,
+				      1 << QTYP_PRJ,
+				      KEY_TYPE_QUOTA_PREALLOC);
+}
+
+struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum);
+
+/* returns 0 if we want to do the update, or error is passed up */
+typedef int (*inode_set_fn)(struct btree_trans *,
+			    struct bch_inode_info *,
+			    struct bch_inode_unpacked *, void *);
+
+void bch2_inode_update_after_write(struct btree_trans *,
+				   struct bch_inode_info *,
+				   struct bch_inode_unpacked *,
+				   unsigned);
+int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
+				  inode_set_fn, void *, unsigned);
+
+int bch2_setattr_nonsize(struct mnt_idmap *,
+			 struct bch_inode_info *,
+			 struct iattr *);
+int __bch2_unlink(struct inode *, struct dentry *, bool);
+
+void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *);
+
+void bch2_vfs_exit(void);
+int bch2_vfs_init(void);
+
+#else
+
+#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields)	({ do {} while (0); })
+
+static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
+					       snapshot_id_list *s) {}
+static inline void bch2_vfs_exit(void) {}
+static inline int bch2_vfs_init(void) { return 0; }
+
+#endif /* NO_BCACHEFS_FS */
+
+#endif /* _BCACHEFS_FS_H */
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
new file mode 100644
index 000000000000..b8f9e7475dc5
--- /dev/null
+++ b/fs/bcachefs/fsck.c
@@ -0,0 +1,2417 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "darray.h"
+#include "dirent.h"
+#include "error.h"
+#include "fs-common.h"
+#include "fsck.h"
+#include "inode.h"
+#include "keylist.h"
+#include "recovery.h"
+#include "snapshot.h"
+#include "super.h"
+#include "xattr.h"
+
+#include <linux/bsearch.h>
+#include <linux/dcache.h> /* struct qstr */
+
+#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+
+/*
+ * XXX: this is handling transaction restarts without returning
+ * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
+ */
+static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
+				    u32 snapshot)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	u64 sectors = 0;
+	int ret;
+
+	for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+				SPOS(inum, 0, snapshot),
+				POS(inum, U64_MAX),
+				0, k, ret)
+		if (bkey_extent_is_allocation(k.k))
+			sectors += k.k->size;	/* only keys that count as allocations contribute */
+
+	bch2_trans_iter_exit(trans, &iter);
+
+	return ret ?: sectors;	/* iteration error if any, otherwise the summed size */
+}
+
+static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
+				    u32 snapshot)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bkey_s_c_dirent d;
+	u64 subdirs = 0;
+	int ret;
+
+	for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
+				SPOS(inum, 0, snapshot),
+				POS(inum, U64_MAX),
+				0, k, ret) {
+		if (k.k->type != KEY_TYPE_dirent)
+			continue;	/* skip keys in the range that are not dirents */
+
+		d = bkey_s_c_to_dirent(k);
+		if (d.v->d_type == DT_DIR)
+			subdirs++;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
+	return ret ?: subdirs;	/* iteration error if any, otherwise the number of DT_DIR dirents */
+}
+
+static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
+				    u32 *subvol)
+{
+	struct bch_snapshot s;
+	int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots,
+					  POS(0, snapshot), 0,
+					  snapshot, &s);
+	if (!ret)
+		*subvol = le32_to_cpu(s.subvol);	/* @subvol is only written on success */
+	else if (bch2_err_matches(ret, ENOENT))
+		bch_err(trans->c, "snapshot %u not found", snapshot);
+	return ret;
+
+}
+
+static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
+			   u32 *snapshot, u64 *inum)
+{
+	struct bch_subvolume s;
+	int ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
+
+	if (!ret) {	/* @s is uninitialized on failure: don't copy garbage to the outputs */
+		*snapshot = le32_to_cpu(s.snapshot);
+		*inum = le64_to_cpu(s.inode);
+	}
+	return ret;
+}
+
+static int subvol_lookup(struct btree_trans *trans, u32 subvol,
+			 u32 *snapshot, u64 *inum)
+{
+	return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum));	/* NOTE(review): presumably retries on transaction restart - confirm */
+}
+
+static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
+			      struct bch_inode_unpacked *inode)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+			     POS(0, inode_nr),
+			     BTREE_ITER_ALL_SNAPSHOTS);	/* the first key found is used, whatever its snapshot */
+	k = bch2_btree_iter_peek(&iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) {	/* nothing found at this inode number */
+		ret = -BCH_ERR_ENOENT_inode;
+		goto err;
+	}
+
+	ret = bch2_inode_unpack(k, inode);
+err:
+	bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);	/* also reached with ret == 0 */
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
+			  struct bch_inode_unpacked *inode,
+			  u32 *snapshot)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+			       SPOS(0, inode_nr, *snapshot), 0);	/* @snapshot is in/out */
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	ret = bkey_is_inode(k.k)
+		? bch2_inode_unpack(k, inode)
+		: -BCH_ERR_ENOENT_inode;
+	if (!ret)
+		*snapshot = iter.pos.snapshot;	/* report the iterator's snapshot back to the caller */
+err:
+	bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot);
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
+			struct bch_inode_unpacked *inode,
+			u32 *snapshot)
+{
+	return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot));	/* NOTE(review): presumably retries on transaction restart - confirm */
+}
+
+static int __lookup_dirent(struct btree_trans *trans,
+			   struct bch_hash_info hash_info,
+			   subvol_inum dir, struct qstr *name,
+			   u64 *target, unsigned *type)
+{
+	struct btree_iter iter;
+	struct bkey_s_c_dirent d;
+	int ret;
+
+	ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
+			       &hash_info, dir, name, 0);
+	if (ret)
+		return ret;	/* @target and @type are left untouched */
+
+	d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));	/* NOTE(review): peek result is not checked with bkey_err() */
+	*target = le64_to_cpu(d.v->d_inum);
+	*type = d.v->d_type;
+	bch2_trans_iter_exit(trans, &iter);
+	return 0;
+}
+
+static int __write_inode(struct btree_trans *trans,
+			 struct bch_inode_unpacked *inode,
+			 u32 snapshot)
+{
+	struct bkey_inode_buf *inode_p =
+		bch2_trans_kmalloc(trans, sizeof(*inode_p));
+
+	if (IS_ERR(inode_p))
+		return PTR_ERR(inode_p);
+
+	bch2_inode_pack(inode_p, inode);
+	inode_p->inode.k.p.snapshot = snapshot;	/* key goes into the caller-chosen snapshot */
+
+	return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
+				&inode_p->inode.k_i,
+				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);	/* no commit here; see fsck_write_inode() */
+}
+
+static int fsck_write_inode(struct btree_trans *trans,
+			    struct bch_inode_unpacked *inode,
+			    u32 snapshot)
+{
+	int ret = commit_do(trans, NULL, NULL,
+				  BTREE_INSERT_NOFAIL|
+				  BTREE_INSERT_LAZY_RW,
+				  __write_inode(trans, inode, snapshot));	/* __write_inode() wrapped in commit_do() */
+	if (ret)
+		bch_err_fn(trans->c, ret);
+	return ret;
+}
+
+static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct bch_inode_unpacked dir_inode;
+	struct bch_hash_info dir_hash_info;
+	int ret;
+
+	ret = lookup_first_inode(trans, pos.inode, &dir_inode);	/* directory inode supplies the hash parameters */
+	if (ret)
+		goto err;
+
+	dir_hash_info = bch2_hash_info_init(c, &dir_inode);
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
+
+	ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+				  &dir_hash_info, &iter,
+				  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+	bch2_trans_iter_exit(trans, &iter);
+err:
+	bch_err_fn(c, ret);	/* also reached with ret == 0 */
+	return ret;
+}
+
+/* Look up lost+found in the root directory of @subvol, creating it if it doesn't exist: */
+static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
+			    struct bch_inode_unpacked *lostfound)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_inode_unpacked root;
+	struct bch_hash_info root_hash_info;
+	struct qstr lostfound_str = QSTR("lost+found");
+	subvol_inum root_inum = { .subvol = subvol };
+	u64 inum = 0;
+	unsigned d_type = 0;
+	u32 snapshot;
+	int ret;
+
+	ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum);	/* subvolume -> snapshot and root inode number */
+	if (ret)
+		return ret;
+
+	ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot);
+	if (ret)
+		return ret;
+
+	root_hash_info = bch2_hash_info_init(c, &root);
+
+	ret = __lookup_dirent(trans, root_hash_info, root_inum,
+			    &lostfound_str, &inum, &d_type);
+	if (bch2_err_matches(ret, ENOENT)) {
+		bch_notice(c, "creating lost+found");
+		goto create_lostfound;
+	}
+
+	bch_err_fn(c, ret);
+	if (ret)
+		return ret;
+
+	if (d_type != DT_DIR) {
+		bch_err(c, "error looking up lost+found: not a directory");
+		return -BCH_ERR_ENOENT_not_directory;
+	}
+
+	/*
+	 * The bch2_check_dirents pass has already run, dangling dirents
+	 * shouldn't exist here:
+	 */
+	return __lookup_inode(trans, inum, lostfound, &snapshot);
+
+create_lostfound:
+	bch2_inode_init_early(c, lostfound);
+
+	ret = bch2_create_trans(trans, root_inum, &root,
+				lostfound, &lostfound_str,
+				0, 0, S_IFDIR|0700, 0, NULL, NULL,
+				(subvol_inum) { }, 0);	/* directory, mode 0700, created under the subvolume root */
+	bch_err_msg(c, ret, "creating lost+found");
+	return ret;
+}
+
+static int __reattach_inode(struct btree_trans *trans,
+			  struct bch_inode_unpacked *inode,
+			  u32 inode_snapshot)
+{
+	struct bch_hash_info dir_hash;
+	struct bch_inode_unpacked lostfound;
+	char name_buf[21];	/* U64_MAX is 20 decimal digits, plus the terminating NUL */
+	struct qstr name;
+	u64 dir_offset = 0;
+	u32 subvol;
+	int ret;
+
+	ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol);
+	if (ret)
+		return ret;
+
+	ret = lookup_lostfound(trans, subvol, &lostfound);
+	if (ret)
+		return ret;
+
+	if (S_ISDIR(inode->bi_mode)) {
+		lostfound.bi_nlink++;	/* reattaching a directory bumps lost+found's link count */
+
+		ret = __write_inode(trans, &lostfound, U32_MAX);
+		if (ret)
+			return ret;
+	}
+
+	dir_hash = bch2_hash_info_init(trans->c, &lostfound);
+
+	snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);	/* dirent is named after the inode number */
+	name = (struct qstr) QSTR(name_buf);
+
+	ret = bch2_dirent_create(trans,
+				 (subvol_inum) {
+					.subvol = subvol,
+					.inum = lostfound.bi_inum,
+				 },
+				 &dir_hash,
+				 inode_d_type(inode),
+				 &name, inode->bi_inum, &dir_offset,
+				 BCH_HASH_SET_MUST_CREATE);
+	if (ret)
+		return ret;
+
+	inode->bi_dir		= lostfound.bi_inum;	/* point the inode's backpointer at the new dirent */
+	inode->bi_dir_offset	= dir_offset;
+
+	return __write_inode(trans, inode, inode_snapshot);
+}
+
+static int reattach_inode(struct btree_trans *trans,
+			  struct bch_inode_unpacked *inode,
+			  u32 inode_snapshot)
+{
+	int ret = commit_do(trans, NULL, NULL,
+				  BTREE_INSERT_LAZY_RW|
+				  BTREE_INSERT_NOFAIL,
+			__reattach_inode(trans, inode, inode_snapshot));	/* __reattach_inode() wrapped in commit_do() */
+	bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum);
+	return ret;
+}
+
+static int remove_backpointer(struct btree_trans *trans,
+			      struct bch_inode_unpacked *inode)
+{
+	struct btree_iter iter;
+	struct bkey_s_c_dirent d;
+	int ret;
+
+	d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
+				     POS(inode->bi_dir, inode->bi_dir_offset), 0,
+				     dirent);	/* the dirent named by the inode's bi_dir/bi_dir_offset */
+	ret =   bkey_err(d) ?:
+		__remove_dirent(trans, d.k->p);	/* only attempted when the lookup succeeded */
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+struct snapshots_seen_entry {
+	u32				id;	/* snapshot id as found in the key */
+	u32				equiv;	/* bch2_snapshot_equiv() of @id */
+};
+
+struct snapshots_seen {
+	struct bpos			pos;	/* last position passed to snapshots_seen_update(); .snapshot holds the equiv id */
+	DARRAY(struct snapshots_seen_entry) ids;	/* snapshots seen at @pos */
+};
+
+static inline void snapshots_seen_exit(struct snapshots_seen *s)
+{
+	darray_exit(&s->ids);
+}
+
+static inline void snapshots_seen_init(struct snapshots_seen *s)
+{
+	memset(s, 0, sizeof(*s));
+}
+
+static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
+{
+	struct snapshots_seen_entry *i, n = {
+		.id	= id,
+		.equiv	= bch2_snapshot_equiv(c, id),
+	};
+	int ret = 0;
+
+	darray_for_each(s->ids, i) {
+		if (i->id == id)
+			return 0;	/* already present */
+		if (i->id > id)
+			break;	/* insert before the first larger id */
+	}
+
+	ret = darray_insert_item(&s->ids, i - s->ids.data, n);
+	if (ret)
+		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
+			s->ids.size);
+	return ret;
+}
+
+static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
+				 enum btree_id btree_id, struct bpos pos)
+{
+	struct snapshots_seen_entry *i, n = {
+		.id	= pos.snapshot,
+		.equiv	= bch2_snapshot_equiv(c, pos.snapshot),
+	};
+	int ret = 0;
+
+	if (!bkey_eq(s->pos, pos))
+		s->ids.nr = 0;	/* position changed: forget the ids seen at the previous one */
+
+	s->pos = pos;
+	s->pos.snapshot = n.equiv;	/* the stored position carries the equiv id */
+
+	darray_for_each(s->ids, i) {
+		if (i->id == n.id)
+			return 0;	/* this exact snapshot id was already recorded */
+
+		/*
+		 * We currently don't rigorously track for snapshot cleanup
+		 * needing to be run, so it shouldn't be a fsck error yet:
+		 */
+		if (i->equiv == n.equiv) {
+			bch_err(c, "snapshot deletion did not finish:\n"
+				"  duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n",
+				bch2_btree_ids[btree_id],
+				pos.inode, pos.offset,
+				i->id, n.id, n.equiv);
+			return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots);
+		}
+	}
+
+	ret = darray_push(&s->ids, n);
+	if (ret)
+		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
+			s->ids.size);
+	return ret;
+}
+
+/**
+ * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
+ * and @ancestor hasn't been overwritten in @seen
+ *
+ * @c:		filesystem handle
+ * @seen:	list of snapshot ids already seen at current position
+ * @id:		descendent snapshot id
+ * @ancestor:	ancestor snapshot id
+ *
+ * Returns:	whether key in @ancestor snapshot is visible in @id snapshot
+ */
+static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
+				    u32 id, u32 ancestor)
+{
+	ssize_t i;
+
+	EBUG_ON(id > ancestor);
+	EBUG_ON(!bch2_snapshot_is_equiv(c, id));
+	EBUG_ON(!bch2_snapshot_is_equiv(c, ancestor));
+
+	/* @ancestor should be the snapshot most recently added to @seen */
+	EBUG_ON(ancestor != seen->pos.snapshot);
+	EBUG_ON(ancestor != seen->ids.data[seen->ids.nr - 1].equiv);
+
+	if (id == ancestor)
+		return true;
+
+	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
+		return false;
+
+	/*
+	 * We know that @id is a descendant of @ancestor, we're checking if
+	 * we've seen a key that overwrote @ancestor - i.e. also a descendent of
+	 * @ancestor and with @id as a descendent.
+	 *
+	 * But we already know that we're scanning IDs between @id and @ancestor
+	 * numerically, since snapshot ID lists are kept sorted, so if we find
+	 * an id that's an ancestor of @id we're done:
+	 */
+
+	for (i = seen->ids.nr - 2;	/* the last entry is @ancestor itself, so start one before it */
+	     i >= 0 && seen->ids.data[i].equiv >= id;
+	     --i)
+		if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv))
+			return false;
+
+	return true;
+}
+
+/**
+ * ref_visible - given a key with snapshot id @src that points to a key with
+ * snapshot id @dst, test whether there is some snapshot in which @dst is
+ * visible.
+ *
+ * @c:		filesystem handle
+ * @s:		list of snapshot IDs already seen at @src
+ * @src:	snapshot ID of src key
+ * @dst:	snapshot ID of dst key
+ * Returns:	true if there is some snapshot in which @dst is visible
+ *
+ * Assumes we're visiting @src keys in natural key order
+ */
+static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
+			u32 src, u32 dst)
+{
+	if (dst > src)	/* numerically larger @dst: plain ancestor test */
+		return bch2_snapshot_is_ancestor(c, src, dst);
+	return key_visible_in_snapshot(c, s, dst, src);
+}
+
+static int ref_visible2(struct bch_fs *c,
+			u32 src, struct snapshots_seen *src_seen,
+			u32 dst, struct snapshots_seen *dst_seen)
+{
+	src = bch2_snapshot_equiv(c, src);
+	dst = bch2_snapshot_equiv(c, dst);
+
+	if (dst > src) {	/* order the pair so that dst <= src, swapping the seen lists to match */
+		swap(dst, src);
+		swap(dst_seen, src_seen);
+	}
+	return key_visible_in_snapshot(c, src_seen, dst, src);
+}
+
+#define for_each_visible_inode(_c, _s, _w, _snapshot, _i)				\
+	for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&	\
+	     (_i)->snapshot <= (_snapshot); _i++)					\
+		if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
+
+struct inode_walker_entry {
+	struct bch_inode_unpacked inode;
+	u32			snapshot;	/* equiv snapshot id (set in add_inode()) */
+	bool			seen_this_pos;	/* cleared by walk_inode() when the position changes */
+	u64			count;	/* compared against bi_sectors in check_i_sectors() */
+};
+
+struct inode_walker {
+	bool				first_this_inode;
+	bool				recalculate_sums;
+	struct bpos			last_pos;
+
+	DARRAY(struct inode_walker_entry) inodes;
+};
+
+static void inode_walker_exit(struct inode_walker *w)
+{
+	darray_exit(&w->inodes);
+}
+
+static struct inode_walker inode_walker_init(void)
+{
+	return (struct inode_walker) { 0, };
+}
+
+static int add_inode(struct bch_fs *c, struct inode_walker *w,
+		     struct bkey_s_c inode)
+{
+	struct bch_inode_unpacked u;
+
+	BUG_ON(bch2_inode_unpack(inode, &u));	/* an unpack failure is treated as fatal */
+
+	return darray_push(&w->inodes, ((struct inode_walker_entry) {	/* returns darray_push()'s result */
+		.inode		= u,
+		.snapshot	= bch2_snapshot_equiv(c, inode.k->p.snapshot),
+	}));
+}
+
+static int get_inodes_all_snapshots(struct btree_trans *trans,
+				    struct inode_walker *w, u64 inum)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	u32 restart_count = trans->restart_count;
+	int ret;
+
+	w->recalculate_sums = false;
+	w->inodes.nr = 0;	/* rebuild the walker's inode list from scratch */
+
+	for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
+			   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+		if (k.k->p.offset != inum)
+			break;
+		ret = bkey_is_inode(k.k) ? add_inode(c, w, k) : 0;	/* propagate a failed add instead of dropping it */
+		if (ret)
+			break;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
+	if (ret)
+		return ret;
+
+	w->first_this_inode = true;
+
+	return trans_was_restarted(trans, restart_count);
+}
+
+static struct inode_walker_entry *
+lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w,
+			  u32 snapshot, bool is_whiteout)
+{
+	struct inode_walker_entry *i;
+
+	snapshot = bch2_snapshot_equiv(c, snapshot);
+
+	darray_for_each(w->inodes, i)
+		if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot))
+			goto found;
+
+	return NULL;	/* no entry passed the ancestor test for @snapshot */
+found:
+	BUG_ON(snapshot > i->snapshot);
+
+	if (snapshot != i->snapshot && !is_whiteout) {	/* matched a different snapshot: add an entry for @snapshot itself */
+		struct inode_walker_entry new = *i;
+		size_t pos;
+		int ret;
+
+		new.snapshot = snapshot;
+		new.count = 0;	/* the copy starts with a zero count */
+
+		bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u",
+			 w->last_pos.inode, snapshot, i->snapshot);
+
+		while (i > w->inodes.data && i[-1].snapshot > snapshot)	/* step back past entries with a larger snapshot id */
+			--i;
+
+		pos = i - w->inodes.data;
+		ret = darray_insert_item(&w->inodes, pos, new);
+		if (ret)
+			return ERR_PTR(ret);
+
+		i = w->inodes.data + pos;	/* recompute from the index after the insert */
+	}
+
+	return i;
+}
+
+static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
+					     struct inode_walker *w, struct bpos pos,
+					     bool is_whiteout)
+{
+	if (w->last_pos.inode != pos.inode) {	/* new inode number: reload every snapshot's version */
+		int ret = get_inodes_all_snapshots(trans, w, pos.inode);
+		if (ret)
+			return ERR_PTR(ret);
+	} else if (bkey_cmp(w->last_pos, pos)) {	/* same inode, different position: reset the per-position marks */
+		struct inode_walker_entry *i;
+
+		darray_for_each(w->inodes, i)
+			i->seen_this_pos = false;
+
+	}
+
+	w->last_pos = pos;
+
+	return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout);
+}
+
+static int __get_visible_inodes(struct btree_trans *trans,
+				struct inode_walker *w,
+				struct snapshots_seen *s,
+				u64 inum)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	w->inodes.nr = 0;	/* rebuild the walker's inode list from scratch */
+
+	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
+			   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+		u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
+		if (k.k->p.offset != inum)
+			break;
+
+		if (!ref_visible(c, s, s->pos.snapshot, equiv))
+			continue;
+
+		ret = bkey_is_inode(k.k) ? add_inode(c, w, k) : 0;
+
+		/* stop if the add failed, or once we've reached @s's snapshot */
+		if (ret || equiv >= s->pos.snapshot)
+			break;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
+	return ret;
+}
+
+static int check_key_has_snapshot(struct btree_trans *trans,
+				  struct btree_iter *iter,
+				  struct bkey_s_c k)
+{
+	struct bch_fs *c = trans->c;
+	struct printbuf buf = PRINTBUF;
+	int ret = 0;
+
+	if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
+			"key in missing snapshot: %s",
+			(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+		ret = bch2_btree_delete_at(trans, iter,
+					    BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;	/* 1 = key deleted, unless the delete itself failed */
+fsck_err:
+	printbuf_exit(&buf);
+	return ret;
+}
+
+static int hash_redo_key(struct btree_trans *trans,
+			 const struct bch_hash_desc desc,
+			 struct bch_hash_info *hash_info,
+			 struct btree_iter *k_iter, struct bkey_s_c k)
+{
+	struct bkey_i *delete;
+	struct bkey_i *tmp;
+
+	delete = bch2_trans_kmalloc(trans, sizeof(*delete));
+	if (IS_ERR(delete))
+		return PTR_ERR(delete);
+
+	tmp = bch2_bkey_make_mut_noupdate(trans, k);	/* mutable copy of @k to re-insert */
+	if (IS_ERR(tmp))
+		return PTR_ERR(tmp);
+
+	bkey_init(&delete->k);
+	delete->k.p = k_iter->pos;	/* freshly initialized key at the old position */
+	return  bch2_btree_iter_traverse(k_iter) ?:
+		bch2_trans_update(trans, k_iter, delete, 0) ?:
+		bch2_hash_set_snapshot(trans, desc, hash_info,
+				       (subvol_inum) { 0, k.k->p.inode },
+				       k.k->p.snapshot, tmp,
+				       BCH_HASH_SET_MUST_CREATE,
+				       BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+		bch2_trans_commit(trans, NULL, NULL,
+				  BTREE_INSERT_NOFAIL|
+				  BTREE_INSERT_LAZY_RW);	/* both updates go in one commit */
+}
+
+static int hash_check_key(struct btree_trans *trans,
+			  const struct bch_hash_desc desc,
+			  struct bch_hash_info *hash_info,
+			  struct btree_iter *k_iter, struct bkey_s_c hash_k)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter = { NULL };
+	struct printbuf buf = PRINTBUF;
+	struct bkey_s_c k;
+	u64 hash;
+	int ret = 0;
+
+	if (hash_k.k->type != desc.key_type)
+		return 0;
+
+	hash = desc.hash_bkey(hash_info, hash_k);
+
+	if (likely(hash == hash_k.k->p.offset))
+		return 0;
+
+	if (hash_k.k->p.offset < hash)	/* key sits before the offset it hashes to */
+		goto bad_hash;
+
+	for_each_btree_key_norestart(trans, iter, desc.btree_id,
+				     SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
+				     BTREE_ITER_SLOTS, k, ret) {
+		if (bkey_eq(k.k->p, hash_k.k->p))
+			break;
+
+		if (fsck_err_on(k.k->type == desc.key_type &&
+				!desc.cmp_bkey(k, hash_k), c,
+				"duplicate hash table keys:\n%s",
+				(printbuf_reset(&buf),
+				 bch2_bkey_val_to_text(&buf, c, hash_k),
+				 buf.buf))) {
+			ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
+			break;
+		}
+
+		if (bkey_deleted(k.k)) {	/* a deleted slot between the hash and the key */
+			bch2_trans_iter_exit(trans, &iter);
+			goto bad_hash;
+		}
+	}
+out:
+	bch2_trans_iter_exit(trans, &iter);
+	printbuf_exit(&buf);
+	return ret;
+bad_hash:
+	if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
+		     bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash,
+		     (printbuf_reset(&buf),
+		      bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
+		ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
+		bch_err_fn(c, ret);
+		if (ret)
+			goto out;	/* go through out: so the printbuf is freed */
+		ret = -BCH_ERR_transaction_restart_nested;
+	}
+fsck_err:
+	goto out;
+}
+
+static int check_inode(struct btree_trans *trans,
+		       struct btree_iter *iter,
+		       struct bkey_s_c k,
+		       struct bch_inode_unpacked *prev,
+		       struct snapshots_seen *s,
+		       bool full)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_inode_unpacked u;
+	bool do_update = false;
+	int ret;
+
+	ret = check_key_has_snapshot(trans, iter, k);
+	if (ret < 0)
+		goto err;
+	if (ret)	/* positive: the key was deleted, nothing more to check */
+		return 0;
+
+	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
+	if (ret)
+		goto err;
+
+	if (!bkey_is_inode(k.k))
+		return 0;
+
+	BUG_ON(bch2_inode_unpack(k, &u));
+
+	if (!full &&	/* when @full is false, only inodes with one of these flags are examined */
+	    !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|
+			    BCH_INODE_I_SECTORS_DIRTY|
+			    BCH_INODE_UNLINKED)))
+		return 0;
+
+	if (prev->bi_inum != u.bi_inum)
+		*prev = u;
+
+	if (fsck_err_on(prev->bi_hash_seed	!= u.bi_hash_seed ||
+			inode_d_type(prev)	!= inode_d_type(&u), c,
+			"inodes in different snapshots don't match")) {
+		bch_err(c, "repair not implemented yet");
+		return -EINVAL;
+	}
+
+	if ((u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED)) &&
+	    bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
+		struct bpos new_min_pos;
+
+		ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
+		if (ret)
+			goto err;
+
+		u.bi_flags &= ~(BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED);	/* parenthesized: clear both flags, not just the first */
+
+		ret = __write_inode(trans, &u, iter->pos.snapshot);
+		bch_err_msg(c, ret, "in fsck updating inode");
+		if (ret)
+			return ret;
+
+		if (!bpos_eq(new_min_pos, POS_MIN))
+			bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
+		return 0;
+	}
+
+	if (u.bi_flags & BCH_INODE_UNLINKED &&
+	    (!c->sb.clean ||
+	     fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
+		      u.bi_inum))) {
+		bch2_trans_unlock(trans);
+		bch2_fs_lazy_rw(c);
+
+		ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
+		bch_err_msg(c, ret, "in fsck deleting inode");
+		return ret;
+	}
+
+	if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY &&
+	    (!c->sb.clean ||
+	     fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty",
+		      u.bi_inum))) {
+		bch_verbose(c, "truncating inode %llu", u.bi_inum);
+
+		bch2_trans_unlock(trans);
+		bch2_fs_lazy_rw(c);
+
+		/*
+		 * XXX: need to truncate partial blocks too here - or ideally
+		 * just switch units to bytes and that issue goes away
+		 */
+		ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+				SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
+				     iter->pos.snapshot),
+				POS(u.bi_inum, U64_MAX),
+				0, NULL);
+		bch_err_msg(c, ret, "in fsck truncating inode");
+		if (ret)
+			return ret;
+
+		/*
+		 * We truncated without our normal sector accounting hook, just
+		 * make sure we recalculate it:
+		 */
+		u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
+
+		u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
+		do_update = true;
+	}
+
+	if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY &&
+	    (!c->sb.clean ||
+	     fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty",
+		      u.bi_inum))) {
+		s64 sectors;
+
+		bch_verbose(c, "recounting sectors for inode %llu",
+			    u.bi_inum);
+
+		sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
+		if (sectors < 0) {
+			bch_err_msg(c, sectors, "in fsck recounting inode sectors");
+			return sectors;
+		}
+
+		u.bi_sectors = sectors;
+		u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
+		do_update = true;
+	}
+
+	if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) {
+		u.bi_dir = 0;	/* drop the untrusted backpointer */
+		u.bi_dir_offset = 0;
+		u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED;
+		do_update = true;
+	}
+
+	if (do_update) {
+		ret = __write_inode(trans, &u, iter->pos.snapshot);
+		bch_err_msg(c, ret, "in fsck updating inode");
+		if (ret)
+			return ret;
+	}
+err:
+fsck_err:
+	bch_err_fn(c, ret);
+	return ret;
+}
+
+noinline_for_stack
+int bch2_check_inodes(struct bch_fs *c)
+{
+	bool full = c->opts.fsck;	/* when false, check_inode() only examines inodes with dirty/unlinked flags */
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bch_inode_unpacked prev = { 0 };
+	struct snapshots_seen s;
+	struct bkey_s_c k;
+	int ret;
+
+	snapshots_seen_init(&s);
+
+	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
+			POS_MIN,
+			BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+			NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+		check_inode(trans, &iter, k, &prev, &s, full));	/* run for each inodes-btree key, in every snapshot */
+
+	snapshots_seen_exit(&s);
+	bch2_trans_put(trans);
+	bch_err_fn(c, ret);
+	return ret;
+}
+
+static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
+						struct btree_iter *iter,
+						struct bpos pos)
+{
+	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
+}
+
+static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
+				   struct bkey_s_c_dirent d)
+{
+	return  inode->bi_dir		== d.k->p.inode &&
+		inode->bi_dir_offset	== d.k->p.offset;
+}
+
+static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
+				   struct bch_inode_unpacked *inode)
+{
+	if (d.v->d_type != DT_SUBVOL)	/* non-subvolume dirent: compare inode numbers */
+		return le64_to_cpu(d.v->d_inum) == inode->bi_inum;
+	return le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol;
+}
+
+/*
+ * Check whether the dirent @inode's backpointer names exists in @snapshot and
+ * points back at @inode.
+ *
+ * Returns 1 if it does, 0 if it does not (including when the lookup fails
+ * with an ENOENT-class error), or a negative error code.
+ */
+static int inode_backpointer_exists(struct btree_trans *trans,
+				    struct bch_inode_unpacked *inode,
+				    u32 snapshot)
+{
+	struct btree_iter iter;
+	struct bkey_s_c_dirent d =
+		dirent_get_by_pos(trans, &iter,
+			SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
+	int ret = bkey_err(d);
+
+	if (ret) {
+		if (bch2_err_matches(ret, ENOENT))
+			return 0;
+		return ret;
+	}
+
+	ret = dirent_points_to_inode(d, inode);
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+/*
+ * For every inode version tracked by @w, compare the inode's bi_sectors with
+ * the count accumulated while walking extents (i->count), and repair the inode
+ * when they disagree.
+ *
+ * Returns 0 on success, a negative error code on failure, or the result of
+ * trans_was_restarted() when no other error occurred.
+ */
+static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
+{
+	struct bch_fs *c = trans->c;
+	struct inode_walker_entry *i;
+	u32 restart_count = trans->restart_count;
+	int ret = 0;
+	s64 count2;
+
+	darray_for_each(w->inodes, i) {
+		if (i->inode.bi_sectors == i->count)
+			continue;
+
+		/* Mismatch: recount from the btree to cross-check our own count */
+		count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
+		/*
+		 * A negative value is an error code, not a sector count; don't
+		 * report it as a miscount or store it in the inode:
+		 */
+		if (count2 < 0) {
+			ret = count2;
+			break;
+		}
+
+		/* Extents were repaired during the walk, so trust the recount */
+		if (w->recalculate_sums)
+			i->count = count2;
+
+		if (i->count != count2) {
+			bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
+				w->last_pos.inode, i->snapshot, i->count, count2);
+			return -BCH_ERR_internal_fsck_err;
+		}
+
+		/* Only an error if the inode wasn't already flagged as dirty */
+		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c,
+			    "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
+			    w->last_pos.inode, i->snapshot,
+			    i->inode.bi_sectors, i->count)) {
+			i->inode.bi_sectors = i->count;
+			ret = fsck_write_inode(trans, &i->inode, i->snapshot);
+			if (ret)
+				break;
+		}
+	}
+fsck_err:
+	bch_err_fn(c, ret);
+	return ret ?: trans_was_restarted(trans, restart_count);
+}
+
+/*
+ * Position at which the most recently recorded extent in one snapshot ended,
+ * together with a private copy of the snapshots_seen state taken when it was
+ * recorded (see extent_ends_at()).
+ */
+struct extent_end {
+	u32			snapshot;
+	u64			offset;
+	struct snapshots_seen	seen;
+};
+
+/*
+ * Set of extent_end entries for the inode currently being walked;
+ * extent_ends_at() keeps at most one entry per snapshot ID, inserted in
+ * ascending snapshot order.
+ */
+struct extent_ends {
+	/* Position of the last key recorded by check_overlapping_extents() */
+	struct bpos			last_pos;
+	DARRAY(struct extent_end)	e;
+};
+
+/* Release every entry's snapshots_seen copy and empty the list (storage is kept) */
+static void extent_ends_reset(struct extent_ends *extent_ends)
+{
+	struct extent_end *end;
+
+	darray_for_each(extent_ends->e, end)
+		snapshots_seen_exit(&end->seen);
+
+	extent_ends->e.nr = 0;
+}
+
+/* Tear down @extent_ends: drop all entries, then release the array itself */
+static void extent_ends_exit(struct extent_ends *extent_ends)
+{
+	extent_ends_reset(extent_ends);
+	darray_exit(&extent_ends->e);
+}
+
+/* Initialize @extent_ends to the all-zeroes (empty) state */
+static void extent_ends_init(struct extent_ends *extent_ends)
+{
+	memset(extent_ends, 0, sizeof(*extent_ends));
+}
+
+/*
+ * Record in @extent_ends that an extent in snapshot k.k->p.snapshot ends at
+ * k.k->p.offset, along with a copy of @seen.
+ *
+ * An existing entry for the same snapshot is replaced; otherwise the new entry
+ * is inserted in front of the first entry with a larger snapshot ID.
+ *
+ * Returns 0 on success or a negative error code on allocation failure.
+ */
+static int extent_ends_at(struct bch_fs *c,
+			  struct extent_ends *extent_ends,
+			  struct snapshots_seen *seen,
+			  struct bkey_s_c k)
+{
+	struct extent_end *i, n = (struct extent_end) {
+		.offset		= k.k->p.offset,
+		.snapshot	= k.k->p.snapshot,
+		.seen		= *seen,
+	};
+	int ret;
+
+	/* Give @n its own copy of the ID array instead of sharing @seen's buffer */
+	n.seen.ids.data = kmemdup(seen->ids.data,
+			      sizeof(seen->ids.data[0]) * seen->ids.size,
+			      GFP_KERNEL);
+	if (!n.seen.ids.data)
+		return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
+
+	darray_for_each(extent_ends->e, i) {
+		if (i->snapshot == k.k->p.snapshot) {
+			/* Replace in place, releasing the old entry's copy first */
+			snapshots_seen_exit(&i->seen);
+			*i = n;
+			return 0;
+		}
+
+		if (i->snapshot >= k.k->p.snapshot)
+			break;
+	}
+
+	ret = darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
+	/* Nothing took ownership of the copy made above; don't leak it */
+	if (ret)
+		snapshots_seen_exit(&n.seen);
+	return ret;
+}
+
+/*
+ * Report and repair a pair of overlapping extents.
+ *
+ * @pos1 is the end position of the extent seen earlier and @pos1_seen its
+ * snapshots_seen state; @pos2 is the key of the extent that overlaps it. Both
+ * extents are looked up again in @btree; if either cannot be found where
+ * expected this fails with -BCH_ERR_internal_fsck_err.
+ *
+ * When the repair is carried out, *@fixed is set and @extent_end / @pos1_seen
+ * are updated to reflect what was overwritten.
+ *
+ * Returns 0, a negative error code, or -BCH_ERR_transaction_restart_nested
+ * when the second extent was the one overwritten.
+ */
+static int overlapping_extents_found(struct btree_trans *trans,
+				     enum btree_id btree,
+				     struct bpos pos1, struct snapshots_seen *pos1_seen,
+				     struct bkey pos2,
+				     bool *fixed,
+				     struct extent_end *extent_end)
+{
+	struct bch_fs *c = trans->c;
+	struct printbuf buf = PRINTBUF;
+	struct btree_iter iter1, iter2 = { NULL };
+	struct bkey_s_c k1, k2;
+	int ret;
+
+	BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
+
+	/* Find the first extent: the key at exactly @pos1 */
+	bch2_trans_iter_init(trans, &iter1, btree, pos1,
+			     BTREE_ITER_ALL_SNAPSHOTS|
+			     BTREE_ITER_NOT_EXTENTS);
+	k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
+	ret = bkey_err(k1);
+	if (ret)
+		goto err;
+
+	/*
+	 * NOTE(review): k1.k is dereferenced below; confirm the peek cannot
+	 * return an empty key without also reporting an error.
+	 */
+	prt_str(&buf, "\n  ");
+	bch2_bkey_val_to_text(&buf, c, k1);
+
+	if (!bpos_eq(pos1, k1.k->p)) {
+		prt_str(&buf, "\n  wanted\n  ");
+		bch2_bpos_to_text(&buf, pos1);
+		prt_str(&buf, "\n  ");
+		bch2_bkey_to_text(&buf, &pos2);
+
+		bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
+			__func__, buf.buf);
+		ret = -BCH_ERR_internal_fsck_err;
+		goto err;
+	}
+
+	/* Find the second extent: scan forward from the first until we reach pos2.p */
+	bch2_trans_copy_iter(&iter2, &iter1);
+
+	while (1) {
+		bch2_btree_iter_advance(&iter2);
+
+		k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
+		ret = bkey_err(k2);
+		if (ret)
+			goto err;
+
+		/* NOTE(review): same question as for k1 above applies to k2.k */
+		if (bpos_ge(k2.k->p, pos2.p))
+			break;
+	}
+
+	prt_str(&buf, "\n  ");
+	bch2_bkey_val_to_text(&buf, c, k2);
+
+	if (bpos_gt(k2.k->p, pos2.p) ||
+	    pos2.size != k2.k->size) {
+		bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
+			__func__, buf.buf);
+		ret = -BCH_ERR_internal_fsck_err;
+		goto err;
+	}
+
+	prt_printf(&buf, "\n  overwriting %s extent",
+		   pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
+
+	if (fsck_err(c, "overlapping extents%s", buf.buf)) {
+		struct btree_iter *old_iter = &iter1;
+		struct disk_reservation res = { 0 };
+
+		/*
+		 * By default the first extent (k1) is overwritten by the second
+		 * (k2); when the first is in the smaller snapshot ID the roles
+		 * are swapped:
+		 */
+		if (pos1.snapshot < pos2.p.snapshot) {
+			old_iter = &iter2;
+			swap(k1, k2);
+		}
+
+		trans->extra_journal_res += bch2_bkey_sectors_compressed(k2);
+
+		ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
+				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
+				k1, k2) ?:
+			bch2_trans_commit(trans, &res, NULL,
+				BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL);
+		bch2_disk_reservation_put(c, &res);
+
+		if (ret)
+			goto err;
+
+		*fixed = true;
+
+		if (pos1.snapshot == pos2.p.snapshot) {
+			/*
+			 * We overwrote the first extent, and did the overwrite
+			 * in the same snapshot:
+			 */
+			extent_end->offset = bkey_start_offset(&pos2);
+		} else if (pos1.snapshot > pos2.p.snapshot) {
+			/*
+			 * We overwrote the first extent in pos2's snapshot:
+			 */
+			ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
+		} else {
+			/*
+			 * We overwrote the second extent - restart
+			 * check_extent() from the top:
+			 */
+			ret = -BCH_ERR_transaction_restart_nested;
+		}
+	}
+fsck_err:
+err:
+	bch2_trans_iter_exit(trans, &iter2);
+	bch2_trans_iter_exit(trans, &iter1);
+	printbuf_exit(&buf);
+	return ret;
+}
+
+/*
+ * Check extent @k against the recorded ends of earlier extents in the same
+ * inode, calling overlapping_extents_found() for every earlier extent that
+ * ends past the start of @k and is visible to it; then record where @k ends.
+ *
+ * Returns 0 on success or the first non-zero result from a helper.
+ */
+static int check_overlapping_extents(struct btree_trans *trans,
+			      struct snapshots_seen *seen,
+			      struct extent_ends *extent_ends,
+			      struct bkey_s_c k,
+			      u32 equiv,
+			      struct btree_iter *iter,
+			      bool *fixed)
+{
+	struct bch_fs *c = trans->c;
+	struct extent_end *prev;
+	int ret;
+
+	/* transaction restart, running again */
+	if (bpos_eq(extent_ends->last_pos, k.k->p))
+		return 0;
+
+	/* Moved on to a different inode: the recorded ends no longer apply */
+	if (extent_ends->last_pos.inode != k.k->p.inode)
+		extent_ends_reset(extent_ends);
+
+	darray_for_each(extent_ends->e, prev) {
+		if (prev->offset > bkey_start_offset(k.k) &&
+		    ref_visible2(c,
+				 k.k->p.snapshot, seen,
+				 prev->snapshot, &prev->seen)) {
+			ret = overlapping_extents_found(trans, iter->btree_id,
+							SPOS(iter->pos.inode,
+							     prev->offset,
+							     prev->snapshot),
+							&prev->seen,
+							*k.k, fixed, prev);
+			if (ret)
+				return ret;
+		}
+	}
+
+	ret = extent_ends_at(c, extent_ends, seen, k);
+	if (ret)
+		return ret;
+
+	extent_ends->last_pos = k.k->p;
+	return 0;
+}
+
+/*
+ * Check one key from the extents btree:
+ *  - its snapshot must exist (check_key_has_snapshot())
+ *  - on moving to a new inode number, i_sectors of the previous inode's
+ *    versions is checked (check_i_sectors())
+ *  - non-whiteout keys must belong to an existing regular file or symlink,
+ *    otherwise the key is deleted
+ *  - non-whiteout keys are checked for overlaps with earlier extents
+ *  - for each inode version the key is visible in: extents past the end of
+ *    i_size are deleted, and allocated sectors are added to i->count
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
+			struct bkey_s_c k,
+			struct inode_walker *inode,
+			struct snapshots_seen *s,
+			struct extent_ends *extent_ends)
+{
+	struct bch_fs *c = trans->c;
+	struct inode_walker_entry *i;
+	struct printbuf buf = PRINTBUF;
+	struct bpos equiv = k.k->p;
+	int ret = 0;
+
+	equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
+	ret = check_key_has_snapshot(trans, iter, k);
+	if (ret) {
+		/* Positive means "nothing more to do for this key", not an error */
+		ret = ret < 0 ? ret : 0;
+		goto out;
+	}
+
+	if (inode->last_pos.inode != k.k->p.inode) {
+		ret = check_i_sectors(trans, inode);
+		if (ret)
+			goto err;
+	}
+
+	i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout);
+	ret = PTR_ERR_OR_ZERO(i);
+	if (ret)
+		goto err;
+
+	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
+	if (ret)
+		goto err;
+
+	if (k.k->type != KEY_TYPE_whiteout) {
+		if (fsck_err_on(!i, c,
+				"extent in missing inode:\n  %s",
+				(printbuf_reset(&buf),
+				 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+			goto delete;
+
+		if (fsck_err_on(i &&
+				!S_ISREG(i->inode.bi_mode) &&
+				!S_ISLNK(i->inode.bi_mode), c,
+				"extent in non regular inode mode %o:\n  %s",
+				i->inode.bi_mode,
+				(printbuf_reset(&buf),
+				 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+			goto delete;
+
+		ret = check_overlapping_extents(trans, s, extent_ends, k,
+						equiv.snapshot, iter,
+						&inode->recalculate_sums);
+		if (ret)
+			goto err;
+	}
+
+	/*
+	 * Check inodes in reverse order, from oldest snapshots to newest,
+	 * starting from the inode that matches this extent's snapshot. If we
+	 * didn't have one, iterate over all inodes:
+	 */
+	if (!i)
+		i = inode->inodes.data + inode->inodes.nr - 1;
+
+	for (;
+	     inode->inodes.data && i >= inode->inodes.data;
+	     --i) {
+		if (i->snapshot > equiv.snapshot ||
+		    !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot))
+			continue;
+
+		if (k.k->type != KEY_TYPE_whiteout) {
+			/*
+			 * Reset buf before formatting, as every other message
+			 * in this function does; otherwise text from earlier
+			 * messages and loop iterations piles up in it:
+			 */
+			if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+					k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
+					!bkey_extent_is_reservation(k), c,
+					"extent type past end of inode %llu:%u, i_size %llu\n  %s",
+					i->inode.bi_inum, i->snapshot, i->inode.bi_size,
+					(printbuf_reset(&buf),
+					 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+				struct btree_iter iter2;
+
+				/* Delete the extent in this inode version's snapshot */
+				bch2_trans_copy_iter(&iter2, iter);
+				bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
+				ret =   bch2_btree_iter_traverse(&iter2) ?:
+					bch2_btree_delete_at(trans, &iter2,
+						BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+				bch2_trans_iter_exit(trans, &iter2);
+				if (ret)
+					goto err;
+
+				iter->k.type = KEY_TYPE_whiteout;
+			}
+
+			if (bkey_extent_is_allocation(k.k))
+				i->count += k.k->size;
+		}
+
+		i->seen_this_pos = true;
+	}
+out:
+err:
+fsck_err:
+	printbuf_exit(&buf);
+	bch_err_fn(c, ret);
+	return ret;
+delete:
+	ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+	goto out;
+}
+
+/*
+ * Walk extents: verify that extents have a corresponding S_ISREG inode, and
+ * that i_size and i_sectors are consistent
+ *
+ * Returns 0 on success or the first non-zero result from the walk or from the
+ * final check_i_sectors() call.
+ */
+int bch2_check_extents(struct bch_fs *c)
+{
+	struct inode_walker w = inode_walker_init();
+	struct snapshots_seen s;
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct extent_ends extent_ends;
+	struct disk_reservation res = { 0 };
+	int ret = 0;
+
+	snapshots_seen_init(&s);
+	extent_ends_init(&extent_ends);
+
+	/*
+	 * @res is handed to the per-key commit; it is released at the start of
+	 * each iteration and once more after the walk. check_extent() only
+	 * checks i_sectors when the inode number changes, so the last inode
+	 * is checked by the trailing check_i_sectors() call:
+	 */
+	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
+			POS(BCACHEFS_ROOT_INO, 0),
+			BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+			&res, NULL,
+			BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
+		bch2_disk_reservation_put(c, &res);
+		check_extent(trans, &iter, k, &w, &s, &extent_ends);
+	})) ?:
+	check_i_sectors(trans, &w);
+
+	bch2_disk_reservation_put(c, &res);
+	extent_ends_exit(&extent_ends);
+	inode_walker_exit(&w);
+	snapshots_seen_exit(&s);
+	bch2_trans_put(trans);
+
+	bch_err_fn(c, ret);
+	return ret;
+}
+
+/*
+ * For every directory inode version tracked by @w, compare bi_nlink with the
+ * number of subdirectories counted while walking dirents (i->count), and
+ * repair the inode when they disagree.
+ *
+ * Returns 0 on success, a negative error code on failure, or the result of
+ * trans_was_restarted() when no other error occurred.
+ */
+static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+{
+	struct bch_fs *c = trans->c;
+	struct inode_walker_entry *i;
+	u32 restart_count = trans->restart_count;
+	int ret = 0;
+	s64 count2;
+
+	darray_for_each(w->inodes, i) {
+		if (i->inode.bi_nlink == i->count)
+			continue;
+
+		/* Mismatch: recount from the btree; negative is an error code */
+		count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
+		if (count2 < 0)
+			return count2;
+
+		/* Our running count was wrong: log it and use the recount instead */
+		if (i->count != count2) {
+			bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
+				i->count, count2);
+			i->count = count2;
+			if (i->inode.bi_nlink == i->count)
+				continue;
+		}
+
+		if (fsck_err_on(i->inode.bi_nlink != i->count, c,
+				"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
+				w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
+			i->inode.bi_nlink = i->count;
+			ret = fsck_write_inode(trans, &i->inode, i->snapshot);
+			if (ret)
+				break;
+		}
+	}
+fsck_err:
+	bch_err_fn(c, ret);
+	return ret ?: trans_was_restarted(trans, restart_count);
+}
+
+/*
+ * Check that dirent @d and the inode it points to (@target, in
+ * @target_snapshot) agree with each other:
+ *  - an inode with no backpointer gets one pointing at @d
+ *  - an inode whose backpointer names a different dirent is checked for
+ *    illegal multiple links (directories), a zero link count, or a stale
+ *    backpointer
+ *  - the dirent's d_type must match the inode's type
+ *  - for DT_SUBVOL dirents, d_parent_subvol must match the inode
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int check_dirent_target(struct btree_trans *trans,
+			       struct btree_iter *iter,
+			       struct bkey_s_c_dirent d,
+			       struct bch_inode_unpacked *target,
+			       u32 target_snapshot)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_i_dirent *n;
+	bool backpointer_exists = true;
+	struct printbuf buf = PRINTBUF;
+	int ret = 0;
+
+	/* No backpointer set at all: point it at this dirent */
+	if (!target->bi_dir &&
+	    !target->bi_dir_offset) {
+		target->bi_dir		= d.k->p.inode;
+		target->bi_dir_offset	= d.k->p.offset;
+
+		ret = __write_inode(trans, target, target_snapshot);
+		if (ret)
+			goto err;
+	}
+
+	if (!inode_points_to_dirent(target, d)) {
+		/* Does the dirent named by the existing backpointer really exist? */
+		ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
+		if (ret < 0)
+			goto err;
+
+		backpointer_exists = ret;
+		ret = 0;
+
+		/* Two valid dirents for one directory: remove this one */
+		if (fsck_err_on(S_ISDIR(target->bi_mode) &&
+				backpointer_exists, c,
+				"directory %llu with multiple links",
+				target->bi_inum)) {
+			ret = __remove_dirent(trans, d.k->p);
+			goto out;
+		}
+
+		if (fsck_err_on(backpointer_exists &&
+				!target->bi_nlink, c,
+				"inode %llu type %s has multiple links but i_nlink 0",
+				target->bi_inum, bch2_d_types[d.v->d_type])) {
+			target->bi_nlink++;
+			target->bi_flags &= ~BCH_INODE_UNLINKED;
+
+			ret = __write_inode(trans, target, target_snapshot);
+			if (ret)
+				goto err;
+		}
+
+		/* Backpointer names a dirent that doesn't point back: retarget it */
+		if (fsck_err_on(!backpointer_exists, c,
+				"inode %llu:%u has wrong backpointer:\n"
+				"got       %llu:%llu\n"
+				"should be %llu:%llu",
+				target->bi_inum, target_snapshot,
+				target->bi_dir,
+				target->bi_dir_offset,
+				d.k->p.inode,
+				d.k->p.offset)) {
+			target->bi_dir		= d.k->p.inode;
+			target->bi_dir_offset	= d.k->p.offset;
+
+			ret = __write_inode(trans, target, target_snapshot);
+			if (ret)
+				goto err;
+		}
+	}
+
+	if (fsck_err_on(d.v->d_type != inode_d_type(target), c,
+			"incorrect d_type: got %s, should be %s:\n%s",
+			bch2_d_type_str(d.v->d_type),
+			bch2_d_type_str(inode_d_type(target)),
+			(printbuf_reset(&buf),
+			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
+		/* Rewrite the dirent with the corrected d_type */
+		n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
+		ret = PTR_ERR_OR_ZERO(n);
+		if (ret)
+			goto err;
+
+		bkey_reassemble(&n->k_i, d.s_c);
+		n->v.d_type = inode_d_type(target);
+
+		ret = bch2_trans_update(trans, iter, &n->k_i, 0);
+		if (ret)
+			goto err;
+
+		/* Later checks must look at the updated copy */
+		d = dirent_i_to_s_c(n);
+	}
+
+	/*
+	 * On versions older than bcachefs_metadata_version_subvol_dirent the
+	 * field is fixed up without raising an fsck error:
+	 */
+	if (d.v->d_type == DT_SUBVOL &&
+	    target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) &&
+	    (c->sb.version < bcachefs_metadata_version_subvol_dirent ||
+	     fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u",
+		      le32_to_cpu(d.v->d_parent_subvol),
+		      target->bi_parent_subvol))) {
+		n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
+		ret = PTR_ERR_OR_ZERO(n);
+		if (ret)
+			goto err;
+
+		bkey_reassemble(&n->k_i, d.s_c);
+		n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
+
+		ret = bch2_trans_update(trans, iter, &n->k_i, 0);
+		if (ret)
+			goto err;
+
+		d = dirent_i_to_s_c(n);
+	}
+out:
+err:
+fsck_err:
+	printbuf_exit(&buf);
+	bch_err_fn(c, ret);
+	return ret;
+}
+
+/*
+ * Check one key from the dirents btree:
+ *  - its snapshot must exist
+ *  - on moving to a new directory, the previous directory's subdirectory
+ *    count is checked (check_subdir_count())
+ *  - it must live in an existing inode that is a directory, otherwise it is
+ *    deleted
+ *  - its hash must be correct (hash_check_key())
+ *  - its target (subvolume root or inode) must exist and agree with it
+ *    (check_dirent_target())
+ * DT_DIR dirents are counted in i->count of every directory inode version
+ * they are visible in.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
+			struct bkey_s_c k,
+			struct bch_hash_info *hash_info,
+			struct inode_walker *dir,
+			struct inode_walker *target,
+			struct snapshots_seen *s)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_s_c_dirent d;
+	struct inode_walker_entry *i;
+	struct printbuf buf = PRINTBUF;
+	struct bpos equiv;
+	int ret = 0;
+
+	ret = check_key_has_snapshot(trans, iter, k);
+	if (ret) {
+		/* Positive means "nothing more to do for this key", not an error */
+		ret = ret < 0 ? ret : 0;
+		goto out;
+	}
+
+	equiv = k.k->p;
+	equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
+	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
+	if (ret)
+		goto err;
+
+	if (k.k->type == KEY_TYPE_whiteout)
+		goto out;
+
+	if (dir->last_pos.inode != k.k->p.inode) {
+		ret = check_subdir_count(trans, dir);
+		if (ret)
+			goto err;
+	}
+
+	BUG_ON(!iter->path->should_be_locked);
+
+	i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout);
+	ret = PTR_ERR_OR_ZERO(i);
+	if (ret < 0)
+		goto err;
+
+	/* First key in this directory: set up hashing from its first inode version */
+	if (dir->first_this_inode && dir->inodes.nr)
+		*hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
+	dir->first_this_inode = false;
+
+	if (fsck_err_on(!i, c,
+			"dirent in nonexisting directory:\n%s",
+			(printbuf_reset(&buf),
+			 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+		ret = bch2_btree_delete_at(trans, iter,
+				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+		goto out;
+	}
+
+	/* Error was not fixed; nothing further can be checked without an inode */
+	if (!i)
+		goto out;
+
+	if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
+			"dirent in non directory inode type %s:\n%s",
+			bch2_d_type_str(inode_d_type(&i->inode)),
+			(printbuf_reset(&buf),
+			 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+		ret = bch2_btree_delete_at(trans, iter, 0);
+		goto out;
+	}
+
+	ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
+	if (ret < 0)
+		goto err;
+	if (ret) {
+		/* dirent has been deleted */
+		ret = 0;
+		goto out;
+	}
+
+	if (k.k->type != KEY_TYPE_dirent)
+		goto out;
+
+	d = bkey_s_c_to_dirent(k);
+
+	if (d.v->d_type == DT_SUBVOL) {
+		struct bch_inode_unpacked subvol_root;
+		u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
+		u32 target_snapshot;
+		u64 target_inum;
+
+		/* Resolve the subvolume to its snapshot and root inode number */
+		ret = __subvol_lookup(trans, target_subvol,
+				      &target_snapshot, &target_inum);
+		if (ret && !bch2_err_matches(ret, ENOENT))
+			goto err;
+
+		/*
+		 * NOTE(review): if the lookup failed and this repair is
+		 * declined, target_inum/target_snapshot look unset when used
+		 * below; the same applies to subvol_root after a failed
+		 * __lookup_inode() — confirm.
+		 */
+		if (fsck_err_on(ret, c,
+				"dirent points to missing subvolume %u",
+				le32_to_cpu(d.v->d_child_subvol))) {
+			ret = __remove_dirent(trans, d.k->p);
+			goto err;
+		}
+
+		ret = __lookup_inode(trans, target_inum,
+				   &subvol_root, &target_snapshot);
+		if (ret && !bch2_err_matches(ret, ENOENT))
+			goto err;
+
+		if (fsck_err_on(ret, c,
+				"subvolume %u points to missing subvolume root %llu",
+				target_subvol,
+				target_inum)) {
+			bch_err(c, "repair not implemented yet");
+			ret = -EINVAL;
+			goto err;
+		}
+
+		if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
+				"subvol root %llu has wrong bi_subvol field: got %u, should be %u",
+				target_inum,
+				subvol_root.bi_subvol, target_subvol)) {
+			subvol_root.bi_subvol = target_subvol;
+			ret = __write_inode(trans, &subvol_root, target_snapshot);
+			if (ret)
+				goto err;
+		}
+
+		ret = check_dirent_target(trans, iter, d, &subvol_root,
+					  target_snapshot);
+		if (ret)
+			goto err;
+	} else {
+		/* Collect the versions of the target inode visible to this dirent */
+		ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
+		if (ret)
+			goto err;
+
+		if (fsck_err_on(!target->inodes.nr, c,
+				"dirent points to missing inode: (equiv %u)\n%s",
+				equiv.snapshot,
+				(printbuf_reset(&buf),
+				 bch2_bkey_val_to_text(&buf, c, k),
+				 buf.buf))) {
+			ret = __remove_dirent(trans, d.k->p);
+			if (ret)
+				goto err;
+		}
+
+		darray_for_each(target->inodes, i) {
+			ret = check_dirent_target(trans, iter, d,
+						  &i->inode, i->snapshot);
+			if (ret)
+				goto err;
+		}
+	}
+
+	/* Count this subdirectory towards the parent's link count check */
+	if (d.v->d_type == DT_DIR)
+		for_each_visible_inode(c, s, dir, equiv.snapshot, i)
+			i->count++;
+
+out:
+err:
+fsck_err:
+	printbuf_exit(&buf);
+	bch_err_fn(c, ret);
+	return ret;
+}
+
+/*
+ * Walk dirents: every dirent must live in an S_ISDIR inode and carry the
+ * right d_type; each key is handed to check_dirent().
+ */
+int bch2_check_dirents(struct bch_fs *c)
+{
+	struct inode_walker dir_inodes = inode_walker_init();
+	struct inode_walker target_inodes = inode_walker_init();
+	struct snapshots_seen seen;
+	struct bch_hash_info hash_info;
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	snapshots_seen_init(&seen);
+
+	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
+			POS(BCACHEFS_ROOT_INO, 0),
+			BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+			k,
+			NULL, NULL,
+			BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+		check_dirent(trans, &iter, k, &hash_info,
+			     &dir_inodes, &target_inodes, &seen));
+
+	bch2_trans_put(trans);
+	snapshots_seen_exit(&seen);
+	inode_walker_exit(&dir_inodes);
+	inode_walker_exit(&target_inodes);
+	bch_err_fn(c, ret);
+	return ret;
+}
+
+/*
+ * Check one key from the xattrs btree: its snapshot must exist, it must
+ * belong to an existing inode (otherwise it is deleted), and its hash must be
+ * correct.
+ *
+ * Returns 0 on success or a negative error code. Positive "handled, not an
+ * error" results from the helpers are mapped to 0, as check_extent() and
+ * check_dirent() do, so that they do not end the caller's btree walk.
+ */
+static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
+		       struct bkey_s_c k,
+		       struct bch_hash_info *hash_info,
+		       struct inode_walker *inode)
+{
+	struct bch_fs *c = trans->c;
+	struct inode_walker_entry *i;
+	int ret;
+
+	ret = check_key_has_snapshot(trans, iter, k);
+	if (ret < 0)
+		return ret;
+	/* Positive: nothing more to check for this key */
+	if (ret)
+		return 0;
+
+	i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout);
+	ret = PTR_ERR_OR_ZERO(i);
+	if (ret)
+		return ret;
+
+	/* First key in this inode: set up hashing from its first inode version */
+	if (inode->first_this_inode && inode->inodes.nr)
+		*hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
+	inode->first_this_inode = false;
+
+	if (fsck_err_on(!i, c,
+			"xattr for missing inode %llu",
+			k.k->p.inode))
+		return bch2_btree_delete_at(trans, iter, 0);
+
+	/* Error was not fixed; nothing further can be checked without an inode */
+	if (!i)
+		return 0;
+
+	ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
+	/* Positive: the key was dealt with by hash_check_key(), not an error */
+	if (ret > 0)
+		ret = 0;
+fsck_err:
+	bch_err_fn(c, ret);
+	return ret;
+}
+
+/*
+ * Walk xattrs: verify that they all have a corresponding inode
+ *
+ * Returns 0 on success or the error the walk stopped on.
+ */
+int bch2_check_xattrs(struct bch_fs *c)
+{
+	struct inode_walker inode = inode_walker_init();
+	struct bch_hash_info hash_info;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	ret = bch2_trans_run(c,
+		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
+			POS(BCACHEFS_ROOT_INO, 0),
+			BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+			k,
+			NULL, NULL,
+			BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+		check_xattr(trans, &iter, k, &hash_info, &inode)));
+
+	/* Release the walker, as the extents and dirents passes do */
+	inode_walker_exit(&inode);
+	bch_err_fn(c, ret);
+	return ret;
+}
+
+/*
+ * Make sure the root subvolume and the root directory inode exist, creating
+ * them if they are missing and re-creating the root inode if it is not a
+ * directory.
+ *
+ * Returns 0 on success or an error code.
+ */
+static int check_root_trans(struct btree_trans *trans)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_inode_unpacked root_inode;
+	u32 snapshot;
+	u64 inum;
+	int ret;
+
+	ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
+	if (ret && !bch2_err_matches(ret, ENOENT))
+		return ret;
+
+	if (mustfix_fsck_err_on(ret, c, "root subvol missing")) {
+		struct bkey_i_subvolume root_subvol;
+
+		/* Defaults used for the re-created root subvolume */
+		snapshot	= U32_MAX;
+		inum		= BCACHEFS_ROOT_INO;
+
+		bkey_subvolume_init(&root_subvol.k_i);
+		root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
+		root_subvol.v.flags	= 0;
+		root_subvol.v.snapshot	= cpu_to_le32(snapshot);
+		root_subvol.v.inode	= cpu_to_le64(inum);
+		ret = commit_do(trans, NULL, NULL,
+				      BTREE_INSERT_NOFAIL|
+				      BTREE_INSERT_LAZY_RW,
+			bch2_btree_insert_trans(trans, BTREE_ID_subvolumes,
+					    &root_subvol.k_i, 0));
+		bch_err_msg(c, ret, "writing root subvol");
+		if (ret)
+			goto err;
+
+	}
+
+	ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
+	if (ret && !bch2_err_matches(ret, ENOENT))
+		return ret;
+
+	if (mustfix_fsck_err_on(ret, c, "root directory missing") ||
+	    mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c,
+				"root inode not a directory")) {
+		/* Write a fresh directory inode (mode 0755) under the root inum */
+		bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
+				0, NULL);
+		root_inode.bi_inum = inum;
+
+		ret = __write_inode(trans, &root_inode, snapshot);
+		bch_err_msg(c, ret, "writing root inode");
+	}
+err:
+fsck_err:
+	return ret;
+}
+
+/*
+ * Look up the root directory, creating it if it doesn't exist; runs
+ * check_root_trans() through bch2_trans_do().
+ */
+int bch2_check_root(struct bch_fs *c)
+{
+	int ret = bch2_trans_do(c, NULL, NULL,
+				BTREE_INSERT_NOFAIL|
+				BTREE_INSERT_LAZY_RW,
+			check_root_trans(trans));
+
+	bch_err_fn(c, ret);
+	return ret;
+}
+
+/* One directory visited by check_path(): inode number plus snapshot ID */
+struct pathbuf_entry {
+	u64	inum;
+	u32	snapshot;
+};
+
+/* Directories visited so far on the way up to the root; used to detect loops */
+typedef DARRAY(struct pathbuf_entry) pathbuf;
+
+/* True if (@inum, @snapshot) has already been recorded in @p */
+static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
+{
+	struct pathbuf_entry *e;
+
+	darray_for_each(*p, e) {
+		if (e->inum != inum)
+			continue;
+		if (e->snapshot == snapshot)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Append (@inum, @snapshot) to @p.
+ *
+ * Returns 0 on success; on failure logs an error and returns the result of
+ * darray_push().
+ */
+static int path_down(struct bch_fs *c, pathbuf *p,
+		     u64 inum, u32 snapshot)
+{
+	struct pathbuf_entry entry = {
+		.inum		= inum,
+		.snapshot	= snapshot,
+	};
+	int ret = darray_push(p, entry);
+
+	if (!ret)
+		return 0;
+
+	bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
+		p->size);
+	return ret;
+}
+
+/*
+ * Check that a given inode is reachable from the root:
+ *
+ * Follows backpointers upwards from @inode, recording each directory visited
+ * in @p, until the root inode of the root subvolume is reached. An inode whose
+ * backpointer does not lead to a matching dirent is reported as unreachable
+ * and reattached; revisiting a directory already in @p is reported as a loop,
+ * which is broken by removing the backpointer and reattaching the inode.
+ *
+ * @inode is overwritten with each parent in turn as the walk proceeds.
+ *
+ * Returns 0 on success or an error code.
+ *
+ * XXX: we should also be verifying that inodes are in the right subvolumes
+ */
+static int check_path(struct btree_trans *trans,
+		      pathbuf *p,
+		      struct bch_inode_unpacked *inode,
+		      u32 snapshot)
+{
+	struct bch_fs *c = trans->c;
+	int ret = 0;
+
+	snapshot = bch2_snapshot_equiv(c, snapshot);
+	p->nr = 0;
+
+	while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
+		 inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
+		struct btree_iter dirent_iter;
+		struct bkey_s_c_dirent d;
+		u32 parent_snapshot = snapshot;
+
+		/* Subvolume root: its dirent lives in the parent subvolume's snapshot */
+		if (inode->bi_subvol) {
+			u64 inum;
+
+			ret = subvol_lookup(trans, inode->bi_parent_subvol,
+					    &parent_snapshot, &inum);
+			if (ret)
+				break;
+		}
+
+		/* Fetch the dirent the inode's backpointer names */
+		ret = lockrestart_do(trans,
+			PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter,
+					  SPOS(inode->bi_dir, inode->bi_dir_offset,
+					       parent_snapshot))).k));
+		if (ret && !bch2_err_matches(ret, ENOENT))
+			break;
+
+		/* A dirent that points elsewhere is treated the same as a missing one */
+		if (!ret && !dirent_points_to_inode(d, inode)) {
+			bch2_trans_iter_exit(trans, &dirent_iter);
+			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+		}
+
+		if (bch2_err_matches(ret, ENOENT)) {
+			if (fsck_err(c,  "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
+				     inode->bi_inum, snapshot,
+				     bch2_d_type_str(inode_d_type(inode)),
+				     inode->bi_nlink,
+				     inode->bi_dir,
+				     inode->bi_dir_offset))
+				ret = reattach_inode(trans, inode, snapshot);
+			break;
+		}
+
+		bch2_trans_iter_exit(trans, &dirent_iter);
+
+		/* Only directories are walked further up and recorded in the path */
+		if (!S_ISDIR(inode->bi_mode))
+			break;
+
+		ret = path_down(c, p, inode->bi_inum, snapshot);
+		if (ret) {
+			bch_err(c, "memory allocation failure");
+			return ret;
+		}
+
+		snapshot = parent_snapshot;
+
+		/* Step up: @inode now holds the parent directory */
+		ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
+		if (ret) {
+			/* Should have been caught in dirents pass */
+			bch_err(c, "error looking up parent directory: %i", ret);
+			break;
+		}
+
+		if (path_is_dup(p, inode->bi_inum, snapshot)) {
+			struct pathbuf_entry *i;
+
+			/* XXX print path */
+			bch_err(c, "directory structure loop");
+
+			darray_for_each(*p, i)
+				pr_err("%llu:%u", i->inum, i->snapshot);
+			pr_err("%llu:%u", inode->bi_inum, snapshot);
+
+			if (!fsck_err(c, "directory structure loop"))
+				return 0;
+
+			/* Break the loop at this directory, then reattach it */
+			ret = commit_do(trans, NULL, NULL,
+					      BTREE_INSERT_NOFAIL|
+					      BTREE_INSERT_LAZY_RW,
+					remove_backpointer(trans, inode));
+			if (ret) {
+				bch_err(c, "error removing dirent: %i", ret);
+				break;
+			}
+
+			ret = reattach_inode(trans, inode, snapshot);
+		}
+	}
+fsck_err:
+	bch_err_fn(c, ret);
+	return ret;
+}
+
+/*
+ * Check for unreachable inodes, as well as loops in the directory structure:
+ * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's
+ * unreachable:
+ *
+ * Runs check_path() on every inode key, in every snapshot, that is not
+ * flagged BCH_INODE_UNLINKED. Returns 0 on success or the first error.
+ */
+int bch2_check_directory_structure(struct bch_fs *c)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bch_inode_unpacked u;
+	pathbuf path = { 0, };
+	int ret;
+
+	for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
+			   BTREE_ITER_INTENT|
+			   BTREE_ITER_PREFETCH|
+			   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+		if (!bkey_is_inode(k.k))
+			continue;
+
+		ret = bch2_inode_unpack(k, &u);
+		if (ret) {
+			/* Should have been caught earlier in fsck: */
+			bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret);
+			break;
+		}
+
+		/* Unlinked inodes are not expected to be reachable */
+		if (u.bi_flags & BCH_INODE_UNLINKED)
+			continue;
+
+		/* @path is scratch space reused (and reset) by every check_path() call */
+		ret = check_path(trans, &path, &u, iter.pos.snapshot);
+		if (ret)
+			break;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
+	darray_exit(&path);
+	bch_err_fn(c, ret);
+	return ret;
+}
+
+/*
+ * Growable array of per-inode link counts, filled by add_nlink() and searched
+ * by inc_link().
+ */
+struct nlink_table {
+	size_t		nr;	/* entries in use */
+	size_t		size;	/* entries allocated */
+
+	/* One inode version: (inum, snapshot) and the number of links counted */
+	struct nlink {
+		u64	inum;
+		u32	snapshot;
+		u32	count;
+	}		*d;
+};
+
+/*
+ * Append an entry for (@inum, @snapshot) with a zero link count to @t,
+ * moving to a larger buffer first when the table is full.
+ *
+ * Returns 0 on success or -BCH_ERR_ENOMEM_fsck_add_nlink if the larger buffer
+ * cannot be allocated (the table is left unchanged in that case).
+ */
+static int add_nlink(struct bch_fs *c, struct nlink_table *t,
+		     u64 inum, u32 snapshot)
+{
+	struct nlink *slot;
+
+	if (t->nr == t->size) {
+		/* Double the capacity, starting from at least 128 entries */
+		size_t grown = max_t(size_t, 128UL, t->size * 2);
+		void *buf = kvmalloc_array(grown, sizeof(t->d[0]), GFP_KERNEL);
+
+		if (!buf) {
+			bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
+				grown);
+			return -BCH_ERR_ENOMEM_fsck_add_nlink;
+		}
+
+		if (t->d)
+			memcpy(buf, t->d, t->size * sizeof(t->d[0]));
+		kvfree(t->d);
+
+		t->d	= buf;
+		t->size	= grown;
+	}
+
+	slot = &t->d[t->nr++];
+	*slot = (struct nlink) {
+		.inum		= inum,
+		.snapshot	= snapshot,
+	};
+
+	return 0;
+}
+
+/*
+ * bsearch comparison for nlink table entries: orders by inode number only.
+ *
+ * inc_link() searches with a key that carries no meaningful snapshot and then
+ * visits every entry for the matching inum itself, so any entry with the same
+ * inum must compare equal. Comparing the snapshot as well made the search
+ * succeed only when an entry happened to have the key's snapshot ID.
+ */
+static int nlink_cmp(const void *_l, const void *_r)
+{
+	const struct nlink *l = _l;
+	const struct nlink *r = _r;
+
+	return cmp_int(l->inum, r->inum);
+}
+
+/*
+ * Account for one dirent, seen in @snapshot, that points at inode @inum:
+ * bump the link count of the table entries for @inum that it is visible to.
+ *
+ * Inodes outside [@range_start, @range_end), or with no entry found in
+ * @links, are ignored.
+ */
+static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
+		     struct nlink_table *links,
+		     u64 range_start, u64 range_end, u64 inum, u32 snapshot)
+{
+	struct nlink *link, key = {
+		.inum = inum, .snapshot = U32_MAX,
+	};
+
+	if (inum < range_start || inum >= range_end)
+		return;
+
+	link = __inline_bsearch(&key, links->d, links->nr,
+				sizeof(links->d[0]), nlink_cmp);
+	if (!link)
+		return;
+
+	/* Rewind to the first entry for this inum */
+	while (link > links->d && link[0].inum == link[-1].inum)
+		--link;
+
+	/*
+	 * Walk forward over this inum's entries, counting the link for each
+	 * one it is visible to, and stop after counting one whose snapshot ID
+	 * is >= the dirent's:
+	 */
+	for (; link < links->d + links->nr && link->inum == inum; link++)
+		if (ref_visible(c, s, snapshot, link->snapshot)) {
+			link->count++;
+			if (link->snapshot >= snapshot)
+				break;
+		}
+}
+
+/*
+ * First pass of the nlink check: walk inodes from inode number @start and add
+ * a table entry for every non-directory inode with a non-zero bi_nlink.
+ *
+ * If the table cannot grow, the walk stops early, *@end is set to the inode
+ * number it stopped at and 0 is returned, so the caller can process that
+ * range and continue from there. Otherwise *@end is left untouched.
+ *
+ * Returns 0 on success or a btree error code.
+ */
+noinline_for_stack
+static int check_nlinks_find_hardlinks(struct bch_fs *c,
+				       struct nlink_table *t,
+				       u64 start, u64 *end)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bch_inode_unpacked u;
+	int ret = 0;
+
+	for_each_btree_key(trans, iter, BTREE_ID_inodes,
+			   POS(0, start),
+			   BTREE_ITER_INTENT|
+			   BTREE_ITER_PREFETCH|
+			   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+		if (!bkey_is_inode(k.k))
+			continue;
+
+		/* Should never fail, checked by bch2_inode_invalid: */
+		BUG_ON(bch2_inode_unpack(k, &u));
+
+		/*
+		 * Backpointer and directory structure checks are sufficient for
+		 * directories, since they can't have hardlinks:
+		 */
+		if (S_ISDIR(u.bi_mode))
+			continue;
+
+		if (!u.bi_nlink)
+			continue;
+
+		ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
+		if (ret) {
+			/* Out of memory is not fatal here: end this range early */
+			*end = k.k->p.offset;
+			ret = 0;
+			break;
+		}
+
+	}
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
+
+	if (ret)
+		bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
+
+	return ret;
+}
+
+/*
+ * Second pass of the nlink check: walk every dirent and, for each one that is
+ * neither DT_DIR nor DT_SUBVOL, count a link against its target inode in
+ * @links (inc_link() ignores targets outside [@range_start, @range_end)).
+ *
+ * Returns 0 on success or an error code.
+ */
+noinline_for_stack
+static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
+				     u64 range_start, u64 range_end)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct snapshots_seen seen;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	snapshots_seen_init(&seen);
+
+	for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
+			   BTREE_ITER_INTENT|
+			   BTREE_ITER_PREFETCH|
+			   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+		ret = snapshots_seen_update(c, &seen, iter.btree_id, k.k->p);
+		if (ret)
+			break;
+
+		if (k.k->type == KEY_TYPE_dirent) {
+			struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+
+			if (d.v->d_type != DT_DIR &&
+			    d.v->d_type != DT_SUBVOL)
+				inc_link(c, &seen, links, range_start, range_end,
+					 le64_to_cpu(d.v->d_inum),
+					 bch2_snapshot_equiv(c, d.k->p.snapshot));
+		}
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
+	if (ret)
+		bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
+
+	bch2_trans_put(trans);
+	snapshots_seen_exit(&seen);
+	return ret;
+}
+
+/*
+ * Third pass of the nlink check, per inode key: compare the inode's link
+ * count with the count accumulated in @links and rewrite the inode if they
+ * differ.
+ *
+ * *@idx is a cursor into @links that only moves forward; it relies on inodes
+ * being visited in the same order the table was filled in.
+ *
+ * Returns 1 once the key is at or past @range_end (tells the caller to stop
+ * the walk), 0 on success, or a negative error code.
+ */
+static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
+				     struct bkey_s_c k,
+				     struct nlink_table *links,
+				     size_t *idx, u64 range_end)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_inode_unpacked u;
+	struct nlink *link = &links->d[*idx];
+	int ret = 0;
+
+	if (k.k->p.offset >= range_end)
+		return 1;
+
+	if (!bkey_is_inode(k.k))
+		return 0;
+
+	BUG_ON(bch2_inode_unpack(k, &u));
+
+	/* Same filters as check_nlinks_find_hardlinks(), so the table lines up */
+	if (S_ISDIR(u.bi_mode))
+		return 0;
+
+	if (!u.bi_nlink)
+		return 0;
+
+	/* Advance the cursor to this inode's (inum, snapshot) entry */
+	while ((cmp_int(link->inum, k.k->p.offset) ?:
+		cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
+		BUG_ON(*idx == links->nr);
+		link = &links->d[++*idx];
+	}
+
+	if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c,
+			"inode %llu type %s has wrong i_nlink (%u, should be %u)",
+			u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
+			bch2_inode_nlink_get(&u), link->count)) {
+		bch2_inode_nlink_set(&u, link->count);
+		ret = __write_inode(trans, &u, k.k->p.snapshot);
+	}
+fsck_err:
+	return ret;
+}
+
+/*
+ * Third pass of the nlink check: walk inodes from @range_start and let
+ * check_nlinks_update_inode() fix up link counts. A positive result from the
+ * walk (the "reached range_end" stop signal) is not an error.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+noinline_for_stack
+static int check_nlinks_update_hardlinks(struct bch_fs *c,
+			       struct nlink_table *links,
+			       u64 range_start, u64 range_end)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	size_t idx = 0;
+	int ret;
+
+	ret = bch2_trans_run(c,
+		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
+				POS(0, range_start),
+				BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+				NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+			check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
+	if (ret >= 0)
+		return 0;
+
+	bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
+	return ret;
+}
+
+/*
+ * Check the link counts of all non-directory inodes.
+ *
+ * The inode number space is processed in ranges: each round collects inodes
+ * into the table starting at this_iter_range_start (stopping early, and
+ * reporting where via next_iter_range_start, if the table cannot grow),
+ * counts the dirents pointing into that range, then fixes up the inodes.
+ * Rounds repeat until one covers everything up to U64_MAX.
+ *
+ * Returns 0 on success or the first error from any pass.
+ */
+int bch2_check_nlinks(struct bch_fs *c)
+{
+	struct nlink_table links = { 0 };
+	u64 this_iter_range_start, next_iter_range_start = 0;
+	int ret = 0;
+
+	do {
+		this_iter_range_start = next_iter_range_start;
+		next_iter_range_start = U64_MAX;
+
+		ret = check_nlinks_find_hardlinks(c, &links,
+						  this_iter_range_start,
+						  &next_iter_range_start);
+		/* Don't let the next pass overwrite (and so hide) this error */
+		if (ret)
+			break;
+
+		ret = check_nlinks_walk_dirents(c, &links,
+					  this_iter_range_start,
+					  next_iter_range_start);
+		if (ret)
+			break;
+
+		ret = check_nlinks_update_hardlinks(c, &links,
+					 this_iter_range_start,
+					 next_iter_range_start);
+		if (ret)
+			break;
+
+		/* Reuse the table's storage for the next range */
+		links.nr = 0;
+	} while (next_iter_range_start != U64_MAX);
+
+	kvfree(links.d);
+	bch_err_fn(c, ret);
+	return ret;
+}
+
+/*
+ * If @k is a reflink_p key with a non-zero front_pad or back_pad, queue an
+ * update that rewrites it with both fields cleared (triggers are not run).
+ *
+ * Returns 0 when there is nothing to do or the update was queued, otherwise a
+ * negative error code.
+ */
+static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
+			     struct bkey_s_c k)
+{
+	struct bkey_s_c_reflink_p p;
+	struct bkey_i_reflink_p *fixed;
+	int ret;
+
+	if (k.k->type != KEY_TYPE_reflink_p)
+		return 0;
+
+	p = bkey_s_c_to_reflink_p(k);
+	if (!(p.v->front_pad || p.v->back_pad))
+		return 0;
+
+	fixed = bch2_trans_kmalloc(trans, sizeof(*fixed));
+	ret = PTR_ERR_OR_ZERO(fixed);
+	if (ret)
+		return ret;
+
+	bkey_reassemble(&fixed->k_i, k);
+	fixed->v.front_pad	= 0;
+	fixed->v.back_pad	= 0;
+
+	return bch2_trans_update(trans, iter, &fixed->k_i, BTREE_TRIGGER_NORUN);
+}
+
+/*
+ * Run fix_reflink_p_key() over the whole extents btree. Skipped entirely when
+ * the superblock version is already at or past
+ * bcachefs_metadata_version_reflink_p_fix.
+ */
+int bch2_fix_reflink_p(struct bch_fs *c)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	if (c->sb.version < bcachefs_metadata_version_reflink_p_fix) {
+		ret = bch2_trans_run(c,
+			for_each_btree_key_commit(trans, iter,
+					BTREE_ID_extents, POS_MIN,
+					BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
+					BTREE_ITER_ALL_SNAPSHOTS, k,
+					NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+				fix_reflink_p_key(trans, &iter, k)));
+		bch_err_fn(c, ret);
+	}
+
+	return ret;
+}
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
new file mode 100644
index 000000000000..90c87b5089a0
--- /dev/null
+++ b/fs/bcachefs/fsck.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FSCK_H
+#define _BCACHEFS_FSCK_H
+
+int bch2_check_inodes(struct bch_fs *);
+int bch2_check_extents(struct bch_fs *);
+int bch2_check_dirents(struct bch_fs *);
+int bch2_check_xattrs(struct bch_fs *);
+int bch2_check_root(struct bch_fs *);
+int bch2_check_directory_structure(struct bch_fs *);
+int bch2_check_nlinks(struct bch_fs *);
+int bch2_fix_reflink_p(struct bch_fs *);
+
+#endif /* _BCACHEFS_FSCK_H */
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
new file mode 100644
index 000000000000..bb3f443d8381
--- /dev/null
+++ b/fs/bcachefs/inode.c
@@ -0,0 +1,1133 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_key_cache.h"
+#include "btree_write_buffer.h"
+#include "bkey_methods.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "error.h"
+#include "extents.h"
+#include "extent_update.h"
+#include "inode.h"
+#include "str_hash.h"
+#include "snapshot.h"
+#include "subvolume.h"
+#include "varint.h"
+
+#include <linux/random.h>
+
+#include <asm/unaligned.h>
+
+const char * const bch2_inode_opts[] = { /* one name string per BCH_INODE_OPTS() entry */
+#define x(name, ...)	#name,
+	BCH_INODE_OPTS()
+#undef  x
+	NULL, /* terminates the list */
+};
+
+static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; /* field length in bytes, by shift - 1 */
+
+static int inode_decode_field(const u8 *in, const u8 *end,
+			      u64 out[2], unsigned *out_bits)
+{ /* Decode one field into out[0] (high) and out[1] (low); returns bytes used or -1. */
+	__be64 be[2] = { 0, 0 };
+	unsigned bytes, shift;
+	u8 *p;
+
+	if (in >= end) /* no input left */
+		return -1;
+
+	if (!*in) /* a zero first byte carries no length marker bit */
+		return -1;
+
+	/*
+	 * position of highest set bit indicates number of bytes:
+	 * shift = number of bits to remove in high byte:
+	 */
+	shift	= 8 - __fls(*in); /* 1 <= shift <= 8 */
+	bytes	= byte_table[shift - 1];
+
+	if (in + bytes > end) /* field would run past the end of the input */
+		return -1;
+
+	p = (u8 *) be + 16 - bytes; /* right-align the field in the 16 byte big endian buffer */
+	memcpy(p, in, bytes);
+	*p ^= (1 << 8) >> shift; /* clear the length marker bit */
+
+	out[0] = be64_to_cpu(be[0]);
+	out[1] = be64_to_cpu(be[1]);
+	*out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]); /* width of the 128 bit value */
+
+	return bytes;
+}
+
+static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed,
+					   const struct bch_inode_unpacked *inode)
+{ /* Encode @inode as an inode_v3 key in @packed, leaving out trailing zero fields. */
+	struct bkey_i_inode_v3 *k = &packed->inode;
+	u8 *out = k->v.fields;
+	u8 *end = (void *) &packed[1]; /* one past the end of *packed */
+	u8 *last_nonzero_field = out;
+	unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
+	unsigned bytes;
+	int ret;
+
+	bkey_inode_v3_init(&packed->inode.k_i);
+	packed->inode.k.p.offset	= inode->bi_inum;
+	packed->inode.v.bi_journal_seq	= cpu_to_le64(inode->bi_journal_seq);
+	packed->inode.v.bi_hash_seed	= inode->bi_hash_seed;
+	packed->inode.v.bi_flags	= cpu_to_le64(inode->bi_flags);
+	packed->inode.v.bi_sectors	= cpu_to_le64(inode->bi_sectors);
+	packed->inode.v.bi_size		= cpu_to_le64(inode->bi_size);
+	packed->inode.v.bi_version	= cpu_to_le64(inode->bi_version);
+	SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode);
+	SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR);
+
+
+#define x(_name, _bits)	/* emit one field; an extra 0 byte follows if _bits > 64 */	\
+	nr_fields++;							\
+									\
+	if (inode->_name) {						\
+		ret = bch2_varint_encode_fast(out, inode->_name);	\
+		out += ret;						\
+									\
+		if (_bits > 64)						\
+			*out++ = 0;					\
+									\
+		last_nonzero_field = out;				\
+		last_nonzero_fieldnr = nr_fields;			\
+	} else {							\
+		*out++ = 0;						\
+									\
+		if (_bits > 64)						\
+			*out++ = 0;					\
+	}
+
+	BCH_INODE_FIELDS_v3()
+#undef  x
+	BUG_ON(out > end); /* checked only after all fields have been written */
+
+	out = last_nonzero_field; /* cut off everything after the last nonzero field */
+	nr_fields = last_nonzero_fieldnr;
+
+	bytes = out - (u8 *) &packed->inode.v;
+	set_bkey_val_bytes(&packed->inode.k, bytes);
+	memset_u64s_tail(&packed->inode.v, 0, bytes); /* presumably zeroes the tail padding - confirm */
+
+	SET_INODEv3_NR_FIELDS(&k->v, nr_fields);
+
+	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { /* check that the key unpacks to the same inode */
+		struct bch_inode_unpacked unpacked;
+
+		ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked);
+		BUG_ON(ret);
+		BUG_ON(unpacked.bi_inum		!= inode->bi_inum);
+		BUG_ON(unpacked.bi_hash_seed	!= inode->bi_hash_seed);
+		BUG_ON(unpacked.bi_sectors	!= inode->bi_sectors);
+		BUG_ON(unpacked.bi_size		!= inode->bi_size);
+		BUG_ON(unpacked.bi_version	!= inode->bi_version);
+		BUG_ON(unpacked.bi_mode		!= inode->bi_mode);
+
+#define x(_name, _bits)	if (unpacked._name != inode->_name)		\
+			panic("unpacked %llu should be %llu",		\
+			      (u64) unpacked._name, (u64) inode->_name);
+		BCH_INODE_FIELDS_v3()
+#undef  x
+	}
+}
+
+/* Out-of-line entry point for bch2_inode_pack_inlined(). */
+void bch2_inode_pack(struct bkey_inode_buf *packed, const struct bch_inode_unpacked *inode)
+{
+	bch2_inode_pack_inlined(packed, inode);
+}
+
+static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
+				struct bch_inode_unpacked *unpacked)
+{ /* Unpack the inode_decode_field() encoded fields of a KEY_TYPE_inode value. */
+	const u8 *in = inode.v->fields;
+	const u8 *end = bkey_val_end(inode);
+	u64 field[2];
+	unsigned fieldnr = 0, field_bits;
+	int ret;
+
+#define x(_name, _bits)	/* after INODE_NR_FIELDS fields: zero the rest, return 0 */ \
+	if (fieldnr++ == INODE_NR_FIELDS(inode.v)) {			\
+		unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
+		memset((void *) unpacked + offset, 0,			\
+		       sizeof(*unpacked) - offset);			\
+		return 0;						\
+	}								\
+									\
+	ret = inode_decode_field(in, end, field, &field_bits);		\
+	if (ret < 0)							\
+		return ret;						\
+									\
+	if (field_bits > sizeof(unpacked->_name) * 8)			\
+		return -1;						\
+									\
+	unpacked->_name = field[1];					\
+	in += ret;
+
+	BCH_INODE_FIELDS_v2()
+#undef  x
+
+	/* XXX: signal if there were more fields than expected? */
+	return 0;
+}
+
+static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked,
+				const u8 *in, const u8 *end,
+				unsigned nr_fields)
+{ /* Decode up to @nr_fields varint fields from [in, end); fields not stored become 0. */
+	unsigned fieldnr = 0;
+	int ret;
+	u64 v[2];
+
+#define x(_name, _bits)	/* decode one field; -1 if it does not fit the member */	\
+	if (fieldnr < nr_fields) {					\
+		ret = bch2_varint_decode_fast(in, end, &v[0]);		\
+		if (ret < 0)						\
+			return ret;					\
+		in += ret;						\
+									\
+		if (_bits > 64) {					\
+			ret = bch2_varint_decode_fast(in, end, &v[1]);	\
+			if (ret < 0)					\
+				return ret;				\
+			in += ret;					\
+		} else {						\
+			v[1] = 0;					\
+		}							\
+	} else {							\
+		v[0] = v[1] = 0;					\
+	}								\
+									\
+	unpacked->_name = v[0];						\
+	if (v[1] || v[0] != unpacked->_name)				\
+		return -1;						\
+	fieldnr++;
+
+	BCH_INODE_FIELDS_v2()
+#undef  x
+
+	/* XXX: signal if there were more fields than expected? */
+	return 0;
+}
+
+static int bch2_inode_unpack_v3(struct bkey_s_c k,
+				struct bch_inode_unpacked *unpacked)
+{ /* Unpack an inode_v3 key: fixed members first, then INODEv3_NR_FIELDS varint fields. */
+	struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
+	const u8 *in = inode.v->fields;
+	const u8 *end = bkey_val_end(inode);
+	unsigned nr_fields = INODEv3_NR_FIELDS(inode.v);
+	unsigned fieldnr = 0;
+	int ret;
+	u64 v[2];
+
+	unpacked->bi_inum	= inode.k->p.offset;
+	unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
+	unpacked->bi_hash_seed	= inode.v->bi_hash_seed;
+	unpacked->bi_flags	= le64_to_cpu(inode.v->bi_flags);
+	unpacked->bi_sectors	= le64_to_cpu(inode.v->bi_sectors);
+	unpacked->bi_size	= le64_to_cpu(inode.v->bi_size);
+	unpacked->bi_version	= le64_to_cpu(inode.v->bi_version);
+	unpacked->bi_mode	= INODEv3_MODE(inode.v);
+
+#define x(_name, _bits)	/* decode one field (0 if not stored); -1 if it does not fit */ \
+	if (fieldnr < nr_fields) {					\
+		ret = bch2_varint_decode_fast(in, end, &v[0]);		\
+		if (ret < 0)						\
+			return ret;					\
+		in += ret;						\
+									\
+		if (_bits > 64) {					\
+			ret = bch2_varint_decode_fast(in, end, &v[1]);	\
+			if (ret < 0)					\
+				return ret;				\
+			in += ret;					\
+		} else {						\
+			v[1] = 0;					\
+		}							\
+	} else {							\
+		v[0] = v[1] = 0;					\
+	}								\
+									\
+	unpacked->_name = v[0];						\
+	if (v[1] || v[0] != unpacked->_name)				\
+		return -1;						\
+	fieldnr++;
+
+	BCH_INODE_FIELDS_v3()
+#undef  x
+
+	/* XXX: signal if there were more fields than expected? */
+	return 0;
+}
+
+static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
+					       struct bch_inode_unpacked *unpacked)
+{ /* Unpack a KEY_TYPE_inode or KEY_TYPE_inode_v2 key; any other key type is a BUG(). */
+	memset(unpacked, 0, sizeof(*unpacked)); /* members not set below stay 0 */
+
+	switch (k.k->type) {
+	case KEY_TYPE_inode: {
+		struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+
+		unpacked->bi_inum	= inode.k->p.offset;
+		unpacked->bi_journal_seq= 0;
+		unpacked->bi_hash_seed	= inode.v->bi_hash_seed;
+		unpacked->bi_flags	= le32_to_cpu(inode.v->bi_flags);
+		unpacked->bi_mode	= le16_to_cpu(inode.v->bi_mode);
+
+		if (INODE_NEW_VARINT(inode.v)) { /* this flag selects the field encoding */
+			return bch2_inode_unpack_v2(unpacked, inode.v->fields,
+						    bkey_val_end(inode),
+						    INODE_NR_FIELDS(inode.v));
+		} else {
+			return bch2_inode_unpack_v1(inode, unpacked);
+		}
+		break; /* not reached: both branches above return */
+	}
+	case KEY_TYPE_inode_v2: {
+		struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
+
+		unpacked->bi_inum	= inode.k->p.offset;
+		unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
+		unpacked->bi_hash_seed	= inode.v->bi_hash_seed;
+		unpacked->bi_flags	= le64_to_cpu(inode.v->bi_flags);
+		unpacked->bi_mode	= le16_to_cpu(inode.v->bi_mode);
+
+		return bch2_inode_unpack_v2(unpacked, inode.v->fields,
+					    bkey_val_end(inode),
+					    INODEv2_NR_FIELDS(inode.v));
+	}
+	default:
+		BUG(); /* callers must only pass inode or inode_v2 keys here */
+	}
+}
+
+/* Unpack an inode key of any version into @unpacked; inode_v3 is the expected case. */
+int bch2_inode_unpack(struct bkey_s_c k, struct bch_inode_unpacked *unpacked)
+{
+	return likely(k.k->type == KEY_TYPE_inode_v3)
+		? bch2_inode_unpack_v3(k, unpacked)
+		: bch2_inode_unpack_slowpath(k, unpacked);
+}
+
+/*
+ * Look up inode @inum, in the snapshot bch2_subvolume_get_snapshot() returns
+ * for its subvolume, and unpack it into @inode.
+ *
+ * Returns 0 with @iter pointing at the inode key. If the key is not an inode
+ * or does not unpack, @iter is exited and the error is returned.
+ */
+static int bch2_inode_peek_nowarn(struct btree_trans *trans, struct btree_iter *iter,
+				  struct bch_inode_unpacked *inode,
+				  subvol_inum inum, unsigned flags)
+{
+	struct bkey_s_c k;
+	u32 snapshot;
+	int ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+
+	if (ret)
+		return ret;
+
+	k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes,
+			       SPOS(0, inum.inum, snapshot), flags|BTREE_ITER_CACHED);
+	ret = bkey_err(k);
+	if (ret)
+		return ret;
+
+	if (!bkey_is_inode(k.k))
+		ret = -BCH_ERR_ENOENT_inode;
+	else
+		ret = bch2_inode_unpack(k, inode);
+
+	if (ret)
+		bch2_trans_iter_exit(trans, iter);
+	return ret;
+}
+
+/* Same as bch2_inode_peek_nowarn(), but the result is also handed to bch_err_msg(). */
+int bch2_inode_peek(struct btree_trans *trans, struct btree_iter *iter,
+		    struct bch_inode_unpacked *inode, subvol_inum inum, unsigned flags)
+{
+	int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
+
+	bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum);
+	return ret;
+}
+
+/* Pack @inode, stamp the key with @iter's snapshot and queue it as an update at @iter. */
+int bch2_inode_write(struct btree_trans *trans, struct btree_iter *iter,
+		     struct bch_inode_unpacked *inode)
+{
+	struct bkey_inode_buf *packed = bch2_trans_kmalloc(trans, sizeof(*packed));
+	int ret = PTR_ERR_OR_ZERO(packed);
+
+	if (ret)
+		return ret;
+
+	bch2_inode_pack_inlined(packed, inode);
+	packed->inode.k.p.snapshot = iter->snapshot;
+	return bch2_trans_update(trans, iter, &packed->inode.k_i, 0);
+}
+
+/* Re-encode inode key @k as inode_v3 in bch2_trans_kmalloc() memory; ERR_PTR() on failure. */
+struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
+{
+	struct bkey_inode_buf *repacked;
+	struct bch_inode_unpacked unpacked;
+	int ret;
+
+	if (!bkey_is_inode(&k->k))
+		return ERR_PTR(-ENOENT);
+
+	repacked = bch2_trans_kmalloc(trans, sizeof(*repacked));
+	if (IS_ERR(repacked))
+		return ERR_CAST(repacked);
+
+	ret = bch2_inode_unpack(bkey_i_to_s_c(k), &unpacked);
+	if (ret)
+		return ERR_PTR(ret);
+	bch2_inode_pack(repacked, &unpacked);
+	return &repacked->inode.k_i;
+}
+
+/* Checks shared by all inode key versions: 0 if valid, else -BCH_ERR_invalid_bkey and @err. */
+static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err)
+{
+	struct bch_inode_unpacked unpacked;
+
+	if (k.k->p.inode) {
+		prt_printf(err, "nonzero k.p.inode");
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	if (k.k->p.offset < BLOCKDEV_INODE_MAX) {
+		prt_printf(err, "fs inode in blockdev range");
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	if (bch2_inode_unpack(k, &unpacked)) {
+		prt_printf(err, "invalid variable length fields");
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) { /* stored as option + 1 */
+		prt_printf(err, "invalid data checksum type (%u >= %u)",
+			unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) { /* stored as option + 1 */
+		prt_printf(err, "invalid compression type (%u >= %u)",
+		       unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1);
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && unpacked.bi_nlink != 0) {
+		prt_printf(err, "flagged as unlinked but bi_nlink != 0");
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) {
+		prt_printf(err, "subvolume root but not a directory");
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	return 0;
+}
+
+/* .key_invalid hook of bch2_bkey_ops_inode: str hash type check, then the shared checks. */
+int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k,
+		       enum bkey_invalid_flags flags, struct printbuf *err)
+{
+	struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+
+	if (INODE_STR_HASH(inode.v) < BCH_STR_HASH_NR)
+		return __bch2_inode_invalid(k, err);
+
+	/* hash type out of range: */
+	prt_printf(err, "invalid str hash type (%llu >= %u)",
+		   INODE_STR_HASH(inode.v), BCH_STR_HASH_NR);
+	return -BCH_ERR_invalid_bkey;
+}
+
+/* .key_invalid hook of bch2_bkey_ops_inode_v2: str hash type check, then the shared checks. */
+int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
+			  enum bkey_invalid_flags flags, struct printbuf *err)
+{
+	struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
+
+	if (INODEv2_STR_HASH(inode.v) < BCH_STR_HASH_NR)
+		return __bch2_inode_invalid(k, err);
+
+	/* hash type out of range: */
+	prt_printf(err, "invalid str hash type (%llu >= %u)",
+		   INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
+	return -BCH_ERR_invalid_bkey;
+}
+
+/*
+ * .key_invalid hook of bch2_bkey_ops_inode_v3: fields_start range and str hash type, then the shared checks.
+ */
+int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
+			  enum bkey_invalid_flags flags, struct printbuf *err)
+{
+	struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
+
+	if (INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
+	    INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k)) {
+		prt_printf(err, "invalid fields_start (got %llu, min %u max %zu)",
+			   INODEv3_FIELDS_START(inode.v), INODEv3_FIELDS_START_INITIAL,
+			   bkey_val_u64s(inode.k));
+	} else if (INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR) {
+		prt_printf(err, "invalid str hash type (%llu >= %u)",
+			   INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
+	} else {
+		return __bch2_inode_invalid(k, err);
+	}
+
+	return -BCH_ERR_invalid_bkey;
+}
+
+
+static void __bch2_inode_unpacked_to_text(struct printbuf *out,
+					  struct bch_inode_unpacked *inode)
+{ /* Print the fixed members, then each BCH_INODE_FIELDS_v3() field as " name value". */
+	prt_printf(out, "mode %o flags %x journal_seq %llu bi_size %llu bi_sectors %llu bi_version %llu",
+	       inode->bi_mode, inode->bi_flags,
+	       inode->bi_journal_seq,
+	       inode->bi_size,
+	       inode->bi_sectors,
+	       inode->bi_version);
+
+#define x(_name, _bits)						\
+	prt_printf(out, " "#_name " %llu", (u64) inode->_name);
+	BCH_INODE_FIELDS_v3()
+#undef  x
+}
+
+void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
+{ /* Same output as __bch2_inode_unpacked_to_text(), prefixed with the inode number. */
+	prt_printf(out, "inum: %llu ", inode->bi_inum);
+	__bch2_inode_unpacked_to_text(out, inode);
+}
+
+/* .val_to_text hook for inode keys: print the unpacked inode, or "(unpack error)". */
+void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bch_inode_unpacked unpacked;
+
+	if (!bch2_inode_unpack(k, &unpacked)) {
+		__bch2_inode_unpacked_to_text(out, &unpacked);
+	} else {
+		prt_printf(out, "(unpack error)");
+	}
+}
+
+/* CPU-endian bi_flags of an inode key of any version; 0 for every other key type. */
+static inline u64 bkey_inode_flags(struct bkey_s_c k)
+{
+	if (k.k->type == KEY_TYPE_inode)
+		return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
+
+	if (k.k->type == KEY_TYPE_inode_v2)
+		return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
+
+	if (k.k->type == KEY_TYPE_inode_v3)
+		return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
+	return 0;
+}
+
+static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
+{ /* True if @k is an inode key with BCH_INODE_UNLINKED set; false for non-inode keys. */
+	return bkey_inode_flags(k) & BCH_INODE_UNLINKED;
+}
+
+/*
+ * Transactional trigger for inode keys: account the change in inode count in
+ * trans->fs_usage_deltas and keep BTREE_ID_deleted_inodes in sync with the
+ * BCH_INODE_UNLINKED flag.
+ */
+int bch2_trans_mark_inode(struct btree_trans *trans,
+			  enum btree_id btree_id, unsigned level,
+			  struct bkey_s_c old, struct bkey_i *new,
+			  unsigned flags)
+{
+	bool was_unlinked = bkey_is_deleted_inode(old);
+	bool now_unlinked = bkey_is_deleted_inode(bkey_i_to_s_c(new));
+	int delta = bkey_is_inode(&new->k) - bkey_is_inode(old.k);
+	int ret;
+
+	if (delta) {
+		ret = bch2_replicas_deltas_realloc(trans, 0);
+		if (ret)
+			return ret;
+
+		trans->fs_usage_deltas->nr_inodes += delta;
+	}
+
+	if (was_unlinked == now_unlinked)
+		return 0;
+
+	return bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new->k.p, now_unlinked);
+}
+
+int bch2_mark_inode(struct btree_trans *trans,
+		    enum btree_id btree_id, unsigned level,
+		    struct bkey_s_c old, struct bkey_s_c new,
+		    unsigned flags)
+{ /* .atomic_trigger for inode keys: stamp the journal seq on insert, count inodes for GC. */
+	struct bch_fs *c = trans->c;
+	struct bch_fs_usage *fs_usage;
+	u64 journal_seq = trans->journal_res.seq;
+
+	if (flags & BTREE_TRIGGER_INSERT) {
+		struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v; /* written to below */
+
+		BUG_ON(!journal_seq);
+		BUG_ON(new.k->type != KEY_TYPE_inode_v3); /* only inode_v3 keys may be inserted */
+
+		v->bi_journal_seq = cpu_to_le64(journal_seq);
+	}
+
+	if (flags & BTREE_TRIGGER_GC) {
+		percpu_down_read(&c->mark_lock);
+		preempt_disable();
+
+		fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC);
+		fs_usage->nr_inodes += bkey_is_inode(new.k);
+		fs_usage->nr_inodes -= bkey_is_inode(old.k);
+
+		preempt_enable();
+		percpu_up_read(&c->mark_lock);
+	}
+	return 0;
+}
+
+/* Validity check for inode_generation keys: the inode part of the key position must be 0. */
+int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k,
+				  enum bkey_invalid_flags flags,
+				  struct printbuf *err)
+{
+	if (!k.k->p.inode)
+		return 0;
+
+	prt_printf(err, "nonzero k.p.inode");
+	return -BCH_ERR_invalid_bkey;
+}
+
+/* Print the bi_generation stored in an inode_generation key. */
+void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k);
+
+	prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
+}
+
+/* Zero @inode_u, then set its str hash type (from the fs options) and a random hash seed. */
+void bch2_inode_init_early(struct bch_fs *c, struct bch_inode_unpacked *inode_u)
+{
+	enum bch_str_hash_type hash_type;
+
+	memset(inode_u, 0, sizeof(*inode_u));
+
+	hash_type = bch2_str_hash_opt_to_type(c, c->opts.str_hash);
+
+	/* the hash type is kept inside bi_flags, at INODE_STR_HASH_OFFSET */
+	inode_u->bi_flags |= hash_type << INODE_STR_HASH_OFFSET;
+	get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed));
+}
+
+void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
+			  uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+			  struct bch_inode_unpacked *parent)
+{ /* Fill in ownership, mode, device and timestamps; @parent may be NULL. */
+	inode_u->bi_mode	= mode;
+	inode_u->bi_uid		= uid;
+	inode_u->bi_gid		= gid;
+	inode_u->bi_dev		= rdev;
+	inode_u->bi_atime	= now;
+	inode_u->bi_mtime	= now;
+	inode_u->bi_ctime	= now;
+	inode_u->bi_otime	= now;
+
+	if (parent && parent->bi_mode & S_ISGID) { /* setgid parent: its gid replaces @gid */
+		inode_u->bi_gid = parent->bi_gid;
+		if (S_ISDIR(mode)) /* new directories also get the setgid bit */
+			inode_u->bi_mode |= S_ISGID;
+	}
+
+	if (parent) { /* copy every BCH_INODE_OPTS() member from the parent */
+#define x(_name, ...)	inode_u->bi_##_name = parent->bi_##_name;
+		BCH_INODE_OPTS()
+#undef x
+	}
+}
+
+/* bch2_inode_init_early() followed by bch2_inode_init_late() with bch2_current_time(). */
+void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
+		     uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+		     struct bch_inode_unpacked *parent)
+{
+	bch2_inode_init_early(c, inode_u);
+	bch2_inode_init_late(inode_u, bch2_current_time(c), uid, gid, mode, rdev, parent);
+}
+
+/* bi_generation of an inode_generation key, 0 for other keys; inode/inode_v2 keys BUG(). */
+static inline u32 bkey_generation(struct bkey_s_c k)
+{
+	unsigned type = k.k->type;
+
+	if (type == KEY_TYPE_inode || type == KEY_TYPE_inode_v2)
+		BUG();
+
+	if (type != KEY_TYPE_inode_generation)
+		return 0;
+	return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
+}
+
+/*
+ * Find an unused inode number for @inode_u; on success @iter is left at that slot:
+ */
+int bch2_inode_create(struct btree_trans *trans,
+		      struct btree_iter *iter,
+		      struct bch_inode_unpacked *inode_u,
+		      u32 snapshot, u64 cpu)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_s_c k;
+	u64 min, max, start, pos, *hint;
+	int ret = 0;
+	unsigned bits = (c->opts.inodes_32bit ? 31 : 63);
+
+	if (c->opts.shard_inode_numbers) { /* @cpu selects the range of inode numbers and the hint */
+		bits -= c->inode_shard_bits;
+
+		min = (cpu << bits);
+		max = (cpu << bits) | ~(ULLONG_MAX << bits);
+
+		min = max_t(u64, min, BLOCKDEV_INODE_MAX);
+		hint = c->unused_inode_hints + cpu;
+	} else {
+		min = BLOCKDEV_INODE_MAX;
+		max = ~(ULLONG_MAX << bits);
+		hint = c->unused_inode_hints;
+	}
+
+	start = READ_ONCE(*hint); /* the hint is the number handed out by an earlier call */
+
+	if (start >= max || start < min)
+		start = min;
+
+	pos = start;
+	bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
+			     BTREE_ITER_ALL_SNAPSHOTS|
+			     BTREE_ITER_INTENT);
+again:
+	while ((k = bch2_btree_iter_peek(iter)).k &&
+	       !(ret = bkey_err(k)) &&
+	       bkey_lt(k.k->p, POS(0, max))) {
+		if (pos < iter->pos.offset) /* the next key lies beyond pos, so pos is unused */
+			goto found_slot;
+
+		/*
+		 * We don't need to iterate over keys in every snapshot once
+		 * we've found just one:
+		 */
+		pos = iter->pos.offset + 1;
+		bch2_btree_iter_set_pos(iter, POS(0, pos));
+	}
+
+	if (!ret && pos < max) /* no more keys below max: pos itself is free */
+		goto found_slot;
+
+	if (!ret && start == min) /* the search already covered the range from min */
+		ret = -BCH_ERR_ENOSPC_inode_create;
+
+	if (ret) {
+		bch2_trans_iter_exit(trans, iter);
+		return ret;
+	}
+
+	/* Retry from start */
+	pos = start = min;
+	bch2_btree_iter_set_pos(iter, POS(0, pos));
+	goto again;
+found_slot:
+	bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
+	k = bch2_btree_iter_peek_slot(iter);
+	ret = bkey_err(k);
+	if (ret) {
+		bch2_trans_iter_exit(trans, iter);
+		return ret;
+	}
+
+	*hint			= k.k->p.offset;
+	inode_u->bi_inum	= k.k->p.offset;
+	inode_u->bi_generation	= bkey_generation(k); /* nonzero only for an inode_generation key */
+	return 0;
+}
+
+static int bch2_inode_delete_keys(struct btree_trans *trans,
+				  subvol_inum inum, enum btree_id id)
+{ /* Delete the keys of inode @inum in btree @id, one committed transaction per key. */
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bkey_i delete;
+	struct bpos end = POS(inum.inum, U64_MAX);
+	u32 snapshot;
+	int ret = 0;
+
+	/*
+	 * We're never going to be deleting partial extents, no need to use an
+	 * extent iterator:
+	 */
+	bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
+			     BTREE_ITER_INTENT);
+
+	while (1) {
+		bch2_trans_begin(trans);
+
+		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+		if (ret)
+			goto err;
+
+		bch2_btree_iter_set_snapshot(&iter, snapshot); /* looked up again on every pass */
+
+		k = bch2_btree_iter_peek_upto(&iter, end);
+		ret = bkey_err(k);
+		if (ret)
+			goto err;
+
+		if (!k.k) /* no key left up to end: done, ret is 0 */
+			break;
+
+		bkey_init(&delete.k);
+		delete.k.p = iter.pos;
+
+		if (iter.flags & BTREE_ITER_IS_EXTENTS)
+			bch2_key_resize(&delete.k,
+					bpos_min(end, k.k->p).offset -
+					iter.pos.offset);
+
+		ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
+		      bch2_trans_commit(trans, NULL, NULL,
+					BTREE_INSERT_NOFAIL);
+err:
+		if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) /* restarts loop again */
+			break;
+	}
+
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
+{ /* Delete inode @inum's extents, xattrs and dirents, then replace it with a generation key. */
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter = { NULL };
+	struct bkey_i_inode_generation delete;
+	struct bch_inode_unpacked inode_u;
+	struct bkey_s_c k;
+	u32 snapshot;
+	int ret;
+
+	/*
+	 * If this was a directory, there shouldn't be any real dirents left -
+	 * but there could be whiteouts (from hash collisions) that we should
+	 * delete:
+	 *
+	 * XXX: the dirent code ideally would delete whiteouts when they're no
+	 * longer needed
+	 */
+	ret   = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
+		bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
+		bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
+	if (ret)
+		goto err;
+retry:
+	bch2_trans_begin(trans);
+
+	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+	if (ret)
+		goto err;
+
+	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+			       SPOS(0, inum.inum, snapshot),
+			       BTREE_ITER_INTENT|BTREE_ITER_CACHED);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	if (!bkey_is_inode(k.k)) {
+		bch2_fs_inconsistent(c,
+				     "inode %llu:%u not found when deleting",
+				     inum.inum, snapshot);
+		ret = -EIO;
+		goto err;
+	}
+
+	bch2_inode_unpack(k, &inode_u); /* NOTE(review): the return value is not checked */
+
+	bkey_inode_generation_init(&delete.k_i); /* the inode key is replaced by this key */
+	delete.k.p = iter.pos;
+	delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
+
+	ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
+		bch2_trans_commit(trans, NULL, NULL,
+				BTREE_INSERT_NOFAIL);
+err:
+	bch2_trans_iter_exit(trans, &iter); /* iter starts as { NULL }, so early gotos land here too */
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		goto retry;
+
+	bch2_trans_put(trans);
+	return ret;
+}
+
+/* Unpack inode @inum into @inode; the lookup iterator is released before returning. */
+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans, subvol_inum inum,
+					 struct bch_inode_unpacked *inode)
+{
+	struct btree_iter iter;
+	int ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
+
+	if (ret)
+		return ret;
+	bch2_trans_iter_exit(trans, &iter);
+	return 0;
+}
+
+/* Like bch2_inode_find_by_inum_nowarn_trans(), but looks up via bch2_inode_peek(). */
+int bch2_inode_find_by_inum_trans(struct btree_trans *trans, subvol_inum inum,
+				  struct bch_inode_unpacked *inode)
+{
+	struct btree_iter iter;
+	int ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
+
+	if (ret)
+		return ret;
+	bch2_trans_iter_exit(trans, &iter);
+	return 0;
+}
+
+/* bch2_inode_find_by_inum_trans() wrapped in bch2_trans_do(), for callers holding only @c. */
+int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, struct bch_inode_unpacked *inode)
+{
+	return bch2_trans_do(c, NULL, NULL, 0,
+		bch2_inode_find_by_inum_trans(trans, inum, inode));
+}
+
+/* Add a link: clear BCH_INODE_UNLINKED if set, else bump bi_nlink; -EINVAL at U32_MAX. */
+int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
+{
+	if (bi->bi_flags & BCH_INODE_UNLINKED) {
+		bi->bi_flags &= ~BCH_INODE_UNLINKED;
+		return 0;
+	}
+
+	if (bi->bi_nlink == U32_MAX)
+		return -EINVAL;
+	bi->bi_nlink++;
+	return 0;
+}
+
+/*
+ * Drop a link: decrement bi_nlink, or set BCH_INODE_UNLINKED once it is 0. An already unlinked inode is reported as inconsistent.
+ */
+void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
+{
+	bool unlinked = bi->bi_flags & BCH_INODE_UNLINKED;
+
+	if (unlinked && bi->bi_nlink) {
+		bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
+					bi->bi_inum);
+	} else if (unlinked) {
+		bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
+	} else if (bi->bi_nlink) {
+		bi->bi_nlink--;
+	} else {
+		bi->bi_flags |= BCH_INODE_UNLINKED;
+	}
+}
+
+struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
+{ /* Build a bch_opts holding every BCH_INODE_OPTS() option that is set in @inode. */
+	struct bch_opts ret = { 0 };
+#define x(_name, _bits)							\
+	if (inode->bi_##_name)	/* 0: not set; otherwise the option value + 1 */	\
+		opt_set(ret, _name, inode->bi_##_name - 1);
+	BCH_INODE_OPTS()
+#undef x
+	return ret;
+}
+
+void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
+			 struct bch_inode_unpacked *inode)
+{ /* Fill @opts with inode_opt_get() for every BCH_INODE_OPTS() option. */
+#define x(_name, _bits)		opts->_name = inode_opt_get(c, inode, _name);
+	BCH_INODE_OPTS()
+#undef x
+
+	if (opts->nocow) /* nocow zeroes compression, checksum and erasure_code options */
+		opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0;
+}
+
+int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
+{ /* Delete inode @inum and its extents, dirents and xattrs in @snapshot only. */
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter = { NULL };
+	struct bkey_i_inode_generation delete;
+	struct bch_inode_unpacked inode_u;
+	struct bkey_s_c k;
+	int ret;
+
+	do {
+		ret   = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+						      SPOS(inum, 0, snapshot),
+						      SPOS(inum, U64_MAX, snapshot),
+						      0, NULL) ?:
+			bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
+						      SPOS(inum, 0, snapshot),
+						      SPOS(inum, U64_MAX, snapshot),
+						      0, NULL) ?:
+			bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
+						      SPOS(inum, 0, snapshot),
+						      SPOS(inum, U64_MAX, snapshot),
+						      0, NULL);
+	} while (ret == -BCH_ERR_transaction_restart_nested); /* redo all three on a nested restart */
+	if (ret)
+		goto err;
+retry:
+	bch2_trans_begin(trans);
+
+	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+			       SPOS(0, inum, snapshot), BTREE_ITER_INTENT);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	if (!bkey_is_inode(k.k)) {
+		bch2_fs_inconsistent(c,
+				     "inode %llu:%u not found when deleting",
+				     inum, snapshot);
+		ret = -EIO;
+		goto err;
+	}
+
+	bch2_inode_unpack(k, &inode_u); /* NOTE(review): the return value is not checked */
+
+	/* Subvolume root? */
+	if (inode_u.bi_subvol)
+		bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum);
+
+	bkey_inode_generation_init(&delete.k_i); /* the inode key is replaced by this key */
+	delete.k.p = iter.pos;
+	delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
+
+	ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
+		bch2_trans_commit(trans, NULL, NULL,
+				BTREE_INSERT_NOFAIL);
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		goto retry;
+
+	return ret ?: -BCH_ERR_transaction_restart_nested; /* success is reported as a nested restart */
+}
+
+/* Returns 1 if the inode at @pos should be deleted, 0 if not, or a negative error code. */
+static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter = { NULL }; /* lets err: exit it even if it was never initialized */
+	struct bkey_s_c k;
+	struct bch_inode_unpacked inode;
+	int ret = 0;
+
+	if (bch2_snapshot_is_internal_node(c, pos.snapshot))
+		return 0;
+
+	if (c->sb.clean && /* entries are only unexpected when the filesystem is clean */
+	    !fsck_err_on(true, c, "filesystem marked as clean but have deleted inode %llu:%u",
+			 pos.offset, pos.snapshot))
+		return 0;
+
+	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
+	if (fsck_err_on(!bkey_is_inode(k.k), c,
+			"nonexistent inode %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot))
+		goto delete;
+
+	ret = ret ?: bch2_inode_unpack(k, &inode); /* never unpack a key that is not an inode */
+	if (ret)
+		goto err;
+
+	if (fsck_err_on(S_ISDIR(inode.bi_mode), c,
+			"directory %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot))
+		goto delete;
+
+	if (fsck_err_on(!(inode.bi_flags & BCH_INODE_UNLINKED), c,
+			"non-deleted inode %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot))
+		goto delete;
+
+	ret = 1;
+err:
+fsck_err:
+	bch2_trans_iter_exit(trans, &iter); /* every path past the lookup releases the iterator */
+	return ret;
+delete:
+	ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
+	goto err;
+}
+
+int bch2_delete_dead_inodes(struct bch_fs *c)
+{ /* Walk the deleted_inodes btree and remove each inode may_delete_deleted_inode() approves. */
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	ret = bch2_btree_write_buffer_flush_sync(trans);
+	if (ret)
+		goto err;
+
+	/*
+	 * Weird transaction restart handling here because on successful delete,
+	 * bch2_inode_rm_snapshot() will return a nested transaction restart,
+	 * but we can't retry because the btree write buffer won't have been
+	 * flushed and we'd spin:
+	 */
+	for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
+			   BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+		ret = lockrestart_do(trans, may_delete_deleted_inode(trans, k.k->p));
+		if (ret < 0) /* error */
+			break;
+
+		if (ret) { /* 1: this inode should be deleted */
+			if (!test_bit(BCH_FS_RW, &c->flags)) { /* not read-write yet */
+				bch2_trans_unlock(trans);
+				bch2_fs_lazy_rw(c);
+			}
+
+			ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
+			if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+				break;
+		}
+	}
+	bch2_trans_iter_exit(trans, &iter);
+err:
+	bch2_trans_put(trans);
+
+	return ret;
+}
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
new file mode 100644
index 000000000000..a7464e1b6960
--- /dev/null
+++ b/fs/bcachefs/inode.h
@@ -0,0 +1,207 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_INODE_H
+#define _BCACHEFS_INODE_H
+
+#include "bkey.h"
+#include "opts.h"
+
+enum bkey_invalid_flags;
+extern const char * const bch2_inode_opts[];
+
+int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c,
+		       enum bkey_invalid_flags, struct printbuf *);
+int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c,
+			  enum bkey_invalid_flags, struct printbuf *);
+int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c,
+			  enum bkey_invalid_flags, struct printbuf *);
+void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned,
+			  struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_mark_inode(struct btree_trans *, enum btree_id, unsigned,
+		    struct bkey_s_c, struct bkey_s_c, unsigned);
+
+#define bch2_bkey_ops_inode ((struct bkey_ops) {	\
+	.key_invalid	= bch2_inode_invalid,		\
+	.val_to_text	= bch2_inode_to_text,		\
+	.trans_trigger	= bch2_trans_mark_inode,	\
+	.atomic_trigger	= bch2_mark_inode,		\
+	.min_val_size	= 16,				\
+})
+
+#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) {	\
+	.key_invalid	= bch2_inode_v2_invalid,	\
+	.val_to_text	= bch2_inode_to_text,		\
+	.trans_trigger	= bch2_trans_mark_inode,	\
+	.atomic_trigger	= bch2_mark_inode,		\
+	.min_val_size	= 32,				\
+})
+
+#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) {	\
+	.key_invalid	= bch2_inode_v3_invalid,	\
+	.val_to_text	= bch2_inode_to_text,		\
+	.trans_trigger	= bch2_trans_mark_inode,	\
+	.atomic_trigger	= bch2_mark_inode,		\
+	.min_val_size	= 48,				\
+})
+
+static inline bool bkey_is_inode(const struct bkey *k)
+{
+	/* true for any of the three inode key types (v1, v2, v3) */
+	const bool pre_v3 = k->type == KEY_TYPE_inode || k->type == KEY_TYPE_inode_v2;
+	return pre_v3 || k->type == KEY_TYPE_inode_v3;
+}
+
+int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c,
+				  enum bkey_invalid_flags, struct printbuf *);
+void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_inode_generation ((struct bkey_ops) {	\
+	.key_invalid	= bch2_inode_generation_invalid,	\
+	.val_to_text	= bch2_inode_generation_to_text,	\
+	.min_val_size	= 8,					\
+})
+
+#if 0
+typedef struct {
+	u64			lo;
+	u32			hi;
+} __packed __aligned(4) u96;
+#endif
+typedef u64 u96;
+
+struct bch_inode_unpacked {	/* output of bch2_inode_unpack(), input of bch2_inode_pack() */
+	u64			bi_inum;
+	u64			bi_journal_seq;
+	__le64			bi_hash_seed;	/* kept little-endian, unlike the other members */
+	u64			bi_size;
+	u64			bi_sectors;
+	u64			bi_version;
+	u32			bi_flags;	/* BCH_INODE_* bits, e.g. BCH_INODE_UNLINKED */
+	u16			bi_mode;
+
+#define x(_name, _bits)	u##_bits _name;
+	BCH_INODE_FIELDS_v3()	/* expands to one u<bits> member per listed field */
+#undef  x
+};
+
+struct bkey_inode_buf {	/* buffer that bch2_inode_pack() writes a packed inode key into */
+	struct bkey_i_inode_v3	inode;
+
+#define x(_name, _bits)		+ 8 + _bits / 8
+	u8		_pad[0 + BCH_INODE_FIELDS_v3()];	/* 8 + bits/8 bytes reserved per field */
+#undef  x
+} __packed __aligned(8);
+
+void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
+int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *);
+struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *);
+
+void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
+
+int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
+		    struct bch_inode_unpacked *, subvol_inum, unsigned);
+int bch2_inode_write(struct btree_trans *, struct btree_iter *,
+		     struct bch_inode_unpacked *);
+
+void bch2_inode_init_early(struct bch_fs *,
+			   struct bch_inode_unpacked *);
+void bch2_inode_init_late(struct bch_inode_unpacked *, u64,
+			  uid_t, gid_t, umode_t, dev_t,
+			  struct bch_inode_unpacked *);
+void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
+		     uid_t, gid_t, umode_t, dev_t,
+		     struct bch_inode_unpacked *);
+
+int bch2_inode_create(struct btree_trans *, struct btree_iter *,
+		      struct bch_inode_unpacked *, u32, u64);
+
+int bch2_inode_rm(struct bch_fs *, subvol_inum);
+
+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *,
+				  subvol_inum,
+				  struct bch_inode_unpacked *);
+int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
+				  struct bch_inode_unpacked *);
+int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
+			    struct bch_inode_unpacked *);
+
+#define inode_opt_get(_c, _inode, _name)			\
+	((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name)
+
+static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,	/* store v in the bi_<name> field selected by id */
+				      enum inode_opt_id id, u64 v)
+{
+	switch (id) {	/* one case per BCH_INODE_OPTS() entry, generated by x() below */
+#define x(_name, ...)							\
+	case Inode_opt_##_name:						\
+		inode->bi_##_name = v;					\
+		break;
+	BCH_INODE_OPTS()
+#undef x
+	default:
+		BUG();	/* id is not a known inode option */
+	}
+}
+
+static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode,	/* return the raw bi_<name> field selected by id */
+				     enum inode_opt_id id)
+{
+	switch (id) {	/* one case per BCH_INODE_OPTS() entry, generated by x() below */
+#define x(_name, ...)							\
+	case Inode_opt_##_name:						\
+		return inode->bi_##_name;
+	BCH_INODE_OPTS()
+#undef x
+	default:
+		BUG();	/* id is not a known inode option */
+	}
+}
+
+static inline u8 mode_to_type(umode_t mode)
+{
+	return (mode & (15 << 12)) >> 12;	/* bits 12..15 of the mode */
+}
+
+static inline u8 inode_d_type(struct bch_inode_unpacked *inode)
+{
+	return !inode->bi_subvol ? mode_to_type(inode->bi_mode) : DT_SUBVOL;	/* subvolume roots get DT_SUBVOL */
+}
+
+/* i_nlink: */
+
+static inline unsigned nlink_bias(umode_t mode)
+{
+	return 1 + !!S_ISDIR(mode);	/* 2 for directories, 1 for everything else */
+}
+
+static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi)
+{
+	if (bi->bi_flags & BCH_INODE_UNLINKED)
+		return 0;	/* unlinked inodes always report zero links */
+	return bi->bi_nlink + nlink_bias(bi->bi_mode);	/* stored count excludes the bias */
+}
+
+static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
+					unsigned nlink)
+{
+	if (!nlink) {	/* zero links: flag as unlinked, store no count */
+		bi->bi_nlink = 0;
+		bi->bi_flags |= BCH_INODE_UNLINKED;
+		return;
+	}
+	bi->bi_nlink = nlink - nlink_bias(bi->bi_mode);	/* store the count minus the bias */
+	bi->bi_flags &= ~BCH_INODE_UNLINKED;
+}
+
+int bch2_inode_nlink_inc(struct bch_inode_unpacked *);
+void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
+
+struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
+void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
+			 struct bch_inode_unpacked *);
+
+int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
+int bch2_delete_dead_inodes(struct bch_fs *);
+
+#endif /* _BCACHEFS_INODE_H */
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
new file mode 100644
index 000000000000..119834cb8f9e
--- /dev/null
+++ b/fs/bcachefs/io_misc.c
@@ -0,0 +1,515 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * io_misc.c - fallocate, fpunch, truncate:
+ */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "clock.h"
+#include "error.h"
+#include "extents.h"
+#include "extent_update.h"
+#include "inode.h"
+#include "io_misc.h"
+#include "io_write.h"
+#include "logged_ops.h"
+#include "subvolume.h"
+
+/* Overwrites whatever was present with zeroes (a reservation key, or unwritten extent pointers): */
+int bch2_extent_fallocate(struct btree_trans *trans,
+			  subvol_inum inum,
+			  struct btree_iter *iter,
+			  unsigned sectors,
+			  struct bch_io_opts opts,
+			  s64 *i_sectors_delta,
+			  struct write_point_specifier write_point)
+{
+	struct bch_fs *c = trans->c;
+	struct disk_reservation disk_res = { 0 };
+	struct closure cl;
+	struct open_buckets open_buckets = { 0 };
+	struct bkey_s_c k;
+	struct bkey_buf old, new;
+	unsigned sectors_allocated = 0;
+	bool have_reservation = false;
+	bool unwritten = opts.nocow &&
+	    c->sb.version >= bcachefs_metadata_version_unwritten_extents;
+	int ret;
+
+	bch2_bkey_buf_init(&old);
+	bch2_bkey_buf_init(&new);
+	closure_init_stack(&cl);
+
+	k = bch2_btree_iter_peek_slot(iter);
+	ret = bkey_err(k);
+	if (ret)
+		return ret;
+
+	sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);	/* don't go past the end of the key at this slot */
+
+	if (!have_reservation) {	/* always taken: have_reservation is still false here */
+		unsigned new_replicas =
+			max(0, (int) opts.data_replicas -
+			    (int) bch2_bkey_nr_ptrs_fully_allocated(k));
+		/*
+		 * Get a disk reservation before (in the nocow case) calling
+		 * into the allocator:
+		 */
+		ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
+		if (unlikely(ret))
+			goto err;
+
+		bch2_bkey_buf_reassemble(&old, c, k);
+	}
+
+	if (have_reservation) {	/* not reachable on this single pass, see above */
+		if (!bch2_extents_match(k, bkey_i_to_s_c(old.k)))
+			goto err;
+
+		bch2_key_resize(&new.k->k, sectors);
+	} else if (!unwritten) {	/* insert a reservation key covering the range */
+		struct bkey_i_reservation *reservation;
+
+		bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
+		reservation = bkey_reservation_init(new.k);
+		reservation->k.p = iter->pos;
+		bch2_key_resize(&reservation->k, sectors);
+		reservation->v.nr_replicas = opts.data_replicas;
+	} else {	/* allocate real space and mark every pointer unwritten */
+		struct bkey_i_extent *e;
+		struct bch_devs_list devs_have;
+		struct write_point *wp;
+		struct bch_extent_ptr *ptr;
+
+		devs_have.nr = 0;
+
+		bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX);
+
+		e = bkey_extent_init(new.k);
+		e->k.p = iter->pos;
+
+		ret = bch2_alloc_sectors_start_trans(trans,
+				opts.foreground_target,
+				false,
+				write_point,
+				&devs_have,
+				opts.data_replicas,
+				opts.data_replicas,
+				BCH_WATERMARK_normal, 0, &cl, &wp);
+		if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
+			ret = -BCH_ERR_transaction_restart_nested;	/* report a blocked allocation as a restart */
+		if (ret)
+			goto err;
+
+		sectors = min(sectors, wp->sectors_free);	/* limited by what the write point has free */
+		sectors_allocated = sectors;
+
+		bch2_key_resize(&e->k, sectors);
+
+		bch2_open_bucket_get(c, wp, &open_buckets);
+		bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
+		bch2_alloc_sectors_done(c, wp);
+
+		extent_for_each_ptr(extent_i_to_s(e), ptr)
+			ptr->unwritten = true;
+	}
+
+	have_reservation = true;
+
+	ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
+				 0, i_sectors_delta, true);
+err:
+	if (!ret && sectors_allocated)
+		bch2_increment_clock(c, sectors_allocated, WRITE);
+
+	bch2_open_buckets_put(c, &open_buckets);
+	bch2_disk_reservation_put(c, &disk_res);
+	bch2_bkey_buf_exit(&new, c);
+	bch2_bkey_buf_exit(&old, c);
+
+	if (closure_nr_remaining(&cl) != 1) {	/* something is still outstanding on the closure */
+		bch2_trans_unlock(trans);	/* don't hold btree locks while waiting */
+		closure_sync(&cl);
+	}
+
+	return ret;
+}
+
+/*
+ * Delete extents from iter's position up to @end; returns -BCH_ERR_transaction_restart if we had to drop locks:
+ */
+int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
+		   subvol_inum inum, u64 end,
+		   s64 *i_sectors_delta)
+{
+	struct bch_fs *c	= trans->c;
+	unsigned max_sectors	= KEY_SIZE_MAX & (~0U << c->block_bits);	/* unsigned: shifting ~0 (int -1) left is undefined */
+	struct bpos end_pos = POS(inum.inum, end);
+	struct bkey_s_c k;
+	int ret = 0, ret2 = 0;
+	u32 snapshot;
+
+	while (!ret ||
+	       bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+		struct disk_reservation disk_res =
+			bch2_disk_reservation_init(c, 0);
+		struct bkey_i delete;
+
+		if (ret)
+			ret2 = ret;	/* remember that a restart happened so the caller hears about it */
+
+		bch2_trans_begin(trans);
+
+		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+		if (ret)
+			continue;
+
+		bch2_btree_iter_set_snapshot(iter, snapshot);
+
+		/*
+		 * peek_upto() doesn't have ideal semantics for extents:
+		 */
+		k = bch2_btree_iter_peek_upto(iter, end_pos);
+		if (!k.k)	/* nothing left before end_pos: done */
+			break;
+
+		ret = bkey_err(k);
+		if (ret)
+			continue;
+
+		bkey_init(&delete.k);
+		delete.k.p = iter->pos;
+
+		/* create the biggest key we can */
+		bch2_key_resize(&delete.k, max_sectors);
+		bch2_cut_back(end_pos, &delete);	/* but never extend past end_pos */
+
+		ret = bch2_extent_update(trans, inum, iter, &delete,
+				&disk_res, 0, i_sectors_delta, false);
+		bch2_disk_reservation_put(c, &disk_res);
+	}
+
+	return ret ?: ret2;	/* a hard error wins; otherwise report any restart seen */
+}
+
+int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,	/* punch [start, end) in its own transaction */
+		s64 *i_sectors_delta)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	int ret;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+			     POS(inum.inum, start),
+			     BTREE_ITER_INTENT);
+
+	ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta);
+
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
+
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		ret = 0;	/* a restart code here is not reported to the caller as an error */
+
+	return ret;
+}
+
+/* truncate: */
+
+void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)	/* print a truncate logged-op key */
+{
+	struct bkey_s_c_logged_op_truncate op = bkey_s_c_to_logged_op_truncate(k);
+
+	prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol));	/* fields are little-endian in the key */
+	prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum));
+	prt_printf(out, " new_i_size=%llu", le64_to_cpu(op.v->new_i_size));
+}
+
+static int truncate_set_isize(struct btree_trans *trans,	/* set bi_size of @inum to @new_i_size */
+			      subvol_inum inum,
+			      u64 new_i_size)
+{
+	struct btree_iter iter = { NULL };
+	struct bch_inode_unpacked inode_u;
+	int ret;
+
+	ret   = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT) ?:	/* first nonzero result ends the chain */
+		(inode_u.bi_size = new_i_size, 0) ?:	/* the assignment always yields 0, so the chain continues */
+		bch2_inode_write(trans, &iter, &inode_u);
+
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,	/* run (or re-run) a logged truncate: set i_size, then punch the tail */
+					    struct bkey_i *op_k,
+					    u64 *i_sectors_delta)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter fpunch_iter;
+	struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k);
+	subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
+	u64 new_i_size = le64_to_cpu(op->v.new_i_size);
+	int ret;
+
+	ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+			truncate_set_isize(trans, inum, new_i_size));
+	if (ret)
+		goto err;
+
+	bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents,
+			     POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9),	/* start at new size rounded up to a block, in 512-byte sectors */
+			     BTREE_ITER_INTENT);
+	ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta);
+	bch2_trans_iter_exit(trans, &fpunch_iter);
+
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		ret = 0;	/* a restart code from the punch is not an error here */
+err:
+	bch2_logged_op_finish(trans, op_k);	/* reached on both success and failure */
+	return ret;
+}
+
+int bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i *op_k)
+{
+	return __bch2_resume_logged_op_truncate(trans, op_k, NULL);	/* NULL: no i_sectors_delta pointer is passed on resume */
+}
+
+int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta)	/* truncate as a logged operation */
+{
+	struct bkey_i_logged_op_truncate op;
+
+	bkey_logged_op_truncate_init(&op.k_i);
+	op.v.subvol	= cpu_to_le32(inum.subvol);
+	op.v.inum	= cpu_to_le64(inum.inum);
+	op.v.new_i_size	= cpu_to_le64(new_i_size);
+
+	/*
+	 * Logged ops aren't atomic w.r.t. snapshot creation: creating a
+	 * snapshot while they're in progress, then crashing, will result in the
+	 * resume only proceeding in one of the snapshots
+	 */
+	down_read(&c->snapshot_create_lock);
+	int ret = bch2_trans_run(c,
+		bch2_logged_op_start(trans, &op.k_i) ?:	/* log the op first; only run it if that succeeded */
+		__bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta));
+	up_read(&c->snapshot_create_lock);
+
+	return ret;
+}
+
+/* finsert/fcollapse: */
+
+void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)	/* print a finsert/fcollapse logged-op key */
+{
+	struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k);
+
+	prt_printf(out, "subvol=%u",		le32_to_cpu(op.v->subvol));
+	prt_printf(out, " inum=%llu",		le64_to_cpu(op.v->inum));
+	prt_printf(out, " dst_offset=%llu",	le64_to_cpu(op.v->dst_offset));	/* unsigned 64-bit, same as src_offset */
+	prt_printf(out, " src_offset=%llu",	le64_to_cpu(op.v->src_offset));
+}
+
+static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset, s64 len)	/* grow/shrink bi_size by len sectors, bump mtime/ctime */
+{
+	struct btree_iter iter;
+	struct bch_inode_unpacked inode_u;
+	int ret;
+
+	offset	<<= 9;	/* sectors -> bytes */
+	len	<<= 9;
+
+	ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
+	if (ret)
+		return ret;
+
+	if (len > 0) {	/* growing: validate against file-size limit and current size */
+		if (MAX_LFS_FILESIZE - inode_u.bi_size < len) {
+			ret = -EFBIG;
+			goto err;
+		}
+
+		if (offset >= inode_u.bi_size) {
+			ret = -EINVAL;
+			goto err;
+		}
+	}
+
+	inode_u.bi_size += len;	/* len may be zero or negative */
+	inode_u.bi_mtime = inode_u.bi_ctime = bch2_current_time(trans->c);
+
+	ret = bch2_inode_write(trans, &iter, &inode_u);
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,	/* run (or resume) a logged finsert/fcollapse from op->v.state */
+					   struct bkey_i *op_k,
+					   u64 *i_sectors_delta)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
+	subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
+	u64 dst_offset = le64_to_cpu(op->v.dst_offset);
+	u64 src_offset = le64_to_cpu(op->v.src_offset);
+	s64 shift = dst_offset - src_offset;	/* positive: insert, negative: collapse */
+	u64 len = abs(shift);
+	u64 pos = le64_to_cpu(op->v.pos);	/* progress cursor persisted in the logged op */
+	bool insert = shift > 0;
+	int ret = 0;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+			     POS(inum.inum, 0),
+			     BTREE_ITER_INTENT);
+
+	switch (op->v.state) {
+case LOGGED_OP_FINSERT_start:
+	op->v.state = LOGGED_OP_FINSERT_shift_extents;
+
+	if (insert) {
+		ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+				adjust_i_size(trans, inum, src_offset, len) ?:
+				bch2_logged_op_update(trans, &op->k_i));
+	} else {
+		bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset));
+
+		ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta);
+		if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			goto err;
+
+		ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+				bch2_logged_op_update(trans, &op->k_i));
+	}
+	if (ret)	/* either branch: don't shift extents if the state update failed to commit */
+		goto err;
+
+	fallthrough;
+case LOGGED_OP_FINSERT_shift_extents:
+	while (1) {
+		struct disk_reservation disk_res =
+			bch2_disk_reservation_init(c, 0);
+		struct bkey_i delete, *copy;
+		struct bkey_s_c k;
+		struct bpos src_pos = POS(inum.inum, src_offset);
+		u32 snapshot;
+
+		bch2_trans_begin(trans);
+
+		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+		if (ret)
+			goto btree_err;
+
+		bch2_btree_iter_set_snapshot(&iter, snapshot);
+		bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot));
+
+		k = insert	/* insert walks backwards from pos, collapse walks forwards */
+			? bch2_btree_iter_peek_prev(&iter)
+			: bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX));
+		if ((ret = bkey_err(k)))
+			goto btree_err;
+
+		if (!k.k ||
+		    k.k->p.inode != inum.inum ||
+		    bkey_le(k.k->p, POS(inum.inum, src_offset)))
+			break;	/* no more extents past src_offset in this inode */
+
+		copy = bch2_bkey_make_mut_noupdate(trans, k);
+		if ((ret = PTR_ERR_OR_ZERO(copy)))
+			goto btree_err;
+
+		if (insert &&
+		    bkey_lt(bkey_start_pos(k.k), src_pos)) {
+			bch2_cut_front(src_pos, copy);	/* only the part at or after src_pos moves */
+
+			/* Splitting compressed extent? */
+			bch2_disk_reservation_add(c, &disk_res,
+					copy->k.size *
+					bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)),
+					BCH_DISK_RESERVATION_NOFAIL);
+		}
+
+		bkey_init(&delete.k);	/* deletion key covering the extent's old position */
+		delete.k.p = copy->k.p;
+		delete.k.p.snapshot = snapshot;
+		delete.k.size = copy->k.size;
+
+		copy->k.p.offset += shift;	/* the copy lands at the shifted position */
+		copy->k.p.snapshot = snapshot;
+
+		op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
+
+		ret =   bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
+			bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
+			bch2_logged_op_update(trans, &op->k_i) ?:
+			bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL);
+btree_err:
+		bch2_disk_reservation_put(c, &disk_res);
+
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			continue;	/* retry this step; pos has not advanced */
+		if (ret)
+			goto err;
+
+		pos = le64_to_cpu(op->v.pos);	/* committed: advance the cursor */
+	}
+
+	op->v.state = LOGGED_OP_FINSERT_finish;
+
+	if (!insert) {
+		ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+				adjust_i_size(trans, inum, src_offset, shift) ?:
+				bch2_logged_op_update(trans, &op->k_i));
+	} else {
+		/* We need an inode update to update bi_journal_seq for fsync: */
+		ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+				adjust_i_size(trans, inum, 0, 0) ?:
+				bch2_logged_op_update(trans, &op->k_i));
+	}
+
+	break;
+case LOGGED_OP_FINSERT_finish:
+	break;
+	}
+err:
+	bch2_logged_op_finish(trans, op_k);	/* reached on both success and failure */
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k)
+{
+	return __bch2_resume_logged_op_finsert(trans, op_k, NULL);	/* NULL: no i_sectors_delta pointer is passed on resume */
+}
+
+int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum,	/* insert or collapse a range as a logged operation */
+			   u64 offset, u64 len, bool insert,
+			   s64 *i_sectors_delta)
+{
+	struct bkey_i_logged_op_finsert op;
+	s64 shift = insert ? len : -len;
+
+	bkey_logged_op_finsert_init(&op.k_i);
+	op.v.subvol	= cpu_to_le32(inum.subvol);
+	op.v.inum	= cpu_to_le64(inum.inum);
+	op.v.dst_offset	= cpu_to_le64(offset + shift);
+	op.v.src_offset	= cpu_to_le64(offset);
+	op.v.pos	= cpu_to_le64(insert ? U64_MAX : offset);	/* starting cursor: U64_MAX for insert, offset for collapse */
+
+	/*
+	 * Logged ops aren't atomic w.r.t. snapshot creation: creating a
+	 * snapshot while they're in progress, then crashing, will result in the
+	 * resume only proceeding in one of the snapshots
+	 */
+	down_read(&c->snapshot_create_lock);
+	int ret = bch2_trans_run(c,
+		bch2_logged_op_start(trans, &op.k_i) ?:	/* log the op first; only run it if that succeeded */
+		__bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta));
+	up_read(&c->snapshot_create_lock);
+
+	return ret;
+}
diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h
new file mode 100644
index 000000000000..c9e6ed40e1b8
--- /dev/null
+++ b/fs/bcachefs/io_misc.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_MISC_H
+#define _BCACHEFS_IO_MISC_H
+
+int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
+			  unsigned, struct bch_io_opts, s64 *,
+			  struct write_point_specifier);
+int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
+		   subvol_inum, u64, s64 *);
+int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
+
+void bch2_logged_op_truncate_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_logged_op_truncate ((struct bkey_ops) {	\
+	.val_to_text	= bch2_logged_op_truncate_to_text,	\
+	.min_val_size	= 24,					\
+})
+
+int bch2_resume_logged_op_truncate(struct btree_trans *, struct bkey_i *);
+
+int bch2_truncate(struct bch_fs *, subvol_inum, u64, u64 *);
+
+void bch2_logged_op_finsert_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_logged_op_finsert ((struct bkey_ops) {	\
+	.val_to_text	= bch2_logged_op_finsert_to_text,	\
+	.min_val_size	= 24,					\
+})
+
+int bch2_resume_logged_op_finsert(struct btree_trans *, struct bkey_i *);
+
+int bch2_fcollapse_finsert(struct bch_fs *, subvol_inum, u64, u64, bool, s64 *);
+
+#endif /* _BCACHEFS_IO_MISC_H */
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
new file mode 100644
index 000000000000..443c3ea65527
--- /dev/null
+++ b/fs/bcachefs/io_read.c
@@ -0,0 +1,1210 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Some low level IO code, and hacks for various block layer limitations
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "clock.h"
+#include "compress.h"
+#include "data_update.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "io_read.h"
+#include "io_misc.h"
+#include "io_write.h"
+#include "subvolume.h"
+#include "trace.h"
+
+#include <linux/sched/mm.h>
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+
+static bool bch2_target_congested(struct bch_fs *c, u16 target)	/* randomised test of whether target's devices look congested */
+{
+	const struct bch_devs_mask *devs;
+	unsigned d, nr = 0, total = 0;
+	u64 now = local_clock(), last;
+	s64 congested;
+	struct bch_dev *ca;
+
+	if (!target)
+		return false;
+
+	rcu_read_lock();
+	devs = bch2_target_to_mask(c, target) ?:
+		&c->rw_devs[BCH_DATA_user];	/* fallback mask when the target yields none */
+
+	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
+		ca = rcu_dereference(c->devs[d]);
+		if (!ca)
+			continue;
+
+		congested = atomic_read(&ca->congested);
+		last = READ_ONCE(ca->congested_last);
+		if (time_after64(now, last))
+			congested -= (now - last) >> 12;	/* decay by time elapsed since congested_last */
+
+		total += max(congested, 0LL);	/* a fully decayed device contributes nothing */
+		nr++;
+	}
+	rcu_read_unlock();
+
+	return bch2_rand_range(nr * CONGESTED_MAX) < total;	/* random draw in the range, compared to the summed score */
+}
+
+#else
+
+static bool bch2_target_congested(struct bch_fs *c, u16 target)
+{
+	return false;	/* CONFIG_BCACHEFS_NO_LATENCY_ACCT build: never report congestion */
+}
+
+#endif
+
+/* Cache promotion on read */
+
+struct promote_op {	/* state for one in-flight promote, allocated by __promote_alloc() */
+	struct rcu_head		rcu;	/* for kfree_rcu() in promote_free() */
+	u64			start_time;
+
+	struct rhash_head	hash;	/* linkage in c->promote_table */
+	struct bpos		pos;	/* hash key, see bch_promote_params */
+
+	struct data_update	write;
+	struct bio_vec		bi_inline_vecs[]; /* flexible array member, must be last */
+};
+
+static const struct rhashtable_params bch_promote_params = {	/* promote table is keyed by promote_op.pos */
+	.head_offset	= offsetof(struct promote_op, hash),
+	.key_offset	= offsetof(struct promote_op, pos),
+	.key_len	= sizeof(struct bpos),
+};
+
+static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,	/* 0 to promote, else a -BCH_ERR_nopromote_* reason */
+				  struct bpos pos,
+				  struct bch_io_opts opts,
+				  unsigned flags)
+{
+	BUG_ON(!opts.promote_target);	/* callers must only ask when a promote target is set */
+
+	if (!(flags & BCH_READ_MAY_PROMOTE))
+		return -BCH_ERR_nopromote_may_not;
+
+	if (bch2_bkey_has_target(c, k, opts.promote_target))
+		return -BCH_ERR_nopromote_already_promoted;
+
+	if (bkey_extent_is_unwritten(k))
+		return -BCH_ERR_nopromote_unwritten;
+
+	if (bch2_target_congested(c, opts.promote_target))
+		return -BCH_ERR_nopromote_congested;
+
+	if (rhashtable_lookup_fast(&c->promote_table, &pos,
+				   bch_promote_params))
+		return -BCH_ERR_nopromote_in_flight;	/* a promote for this position is already in the table */
+
+	return 0;
+}
+
+static void promote_free(struct bch_fs *c, struct promote_op *op)	/* tear down a promote_op and drop its write ref */
+{
+	int ret;
+
+	bch2_data_update_exit(&op->write);
+
+	ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
+				     bch_promote_params);
+	BUG_ON(ret);	/* op must have been in the table */
+	bch2_write_ref_put(c, BCH_WRITE_REF_promote);	/* pairs with the tryget in __promote_alloc() */
+	kfree_rcu(op, rcu);
+}
+
+static void promote_done(struct bch_write_op *wop)	/* end_io callback installed by __promote_alloc() */
+{
+	struct promote_op *op =
+		container_of(wop, struct promote_op, write.op);
+	struct bch_fs *c = op->write.op.c;
+
+	bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
+			       op->start_time);	/* account time since the op was allocated */
+	promote_free(c, op);
+}
+
+static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)	/* hand the read's pages to the promote write and start it */
+{
+	struct bio *bio = &op->write.op.wbio.bio;
+
+	trace_and_count(op->write.op.c, read_promote, &rbio->bio);
+
+	/* we now own pages: */
+	BUG_ON(!rbio->bounce);
+	BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
+
+	memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
+	       sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
+	swap(bio->bi_vcnt, rbio->bio.bi_vcnt);	/* exchange the vec counts of the two bios */
+
+	bch2_data_update_read_done(&op->write, rbio->pick.crc);
+}
+
+static struct promote_op *__promote_alloc(struct btree_trans *trans,	/* returns the new op, or NULL (with *rbio cleared) on failure */
+					  enum btree_id btree_id,
+					  struct bkey_s_c k,
+					  struct bpos pos,
+					  struct extent_ptr_decoded *pick,
+					  struct bch_io_opts opts,
+					  unsigned sectors,
+					  struct bch_read_bio **rbio)
+{
+	struct bch_fs *c = trans->c;
+	struct promote_op *op = NULL;
+	struct bio *bio;
+	unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
+	int ret;
+
+	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
+		return NULL;	/* *rbio is left untouched on this path */
+
+	op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS);
+	if (!op)
+		goto err;
+
+	op->start_time = local_clock();
+	op->pos = pos;
+
+	/*
+	 * We don't use the mempool here because extents that aren't
+	 * checksummed or compressed can be too big for the mempool:
+	 */
+	*rbio = kzalloc(sizeof(struct bch_read_bio) +
+			sizeof(struct bio_vec) * pages,
+			GFP_NOFS);
+	if (!*rbio)
+		goto err;
+
+	rbio_init(&(*rbio)->bio, opts);
+	bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
+
+	if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
+				 GFP_NOFS))
+		goto err;
+
+	(*rbio)->bounce		= true;
+	(*rbio)->split		= true;
+	(*rbio)->kmalloc	= true;	/* tells bch2_rbio_free() to kfree() rather than bio_put() */
+
+	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
+					  bch_promote_params))
+		goto err;	/* insert failed, e.g. an entry for this pos already exists */
+
+	bio = &op->write.op.wbio.bio;
+	bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
+
+	ret = bch2_data_update_init(trans, NULL, &op->write,
+			writepoint_hashed((unsigned long) current),
+			opts,
+			(struct data_update_opts) {
+				.target		= opts.promote_target,
+				.extra_replicas	= 1,
+				.write_flags	= BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
+			},
+			btree_id, k);
+	/*
+	 * possible errors: -BCH_ERR_nocow_lock_blocked,
+	 * -BCH_ERR_ENOSPC_disk_reservation:
+	 */
+	if (ret) {
+		ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
+					bch_promote_params);	/* undo the table insert above */
+		BUG_ON(ret);
+		goto err;
+	}
+
+	op->write.op.end_io = promote_done;
+
+	return op;
+err:
+	if (*rbio)
+		bio_free_pages(&(*rbio)->bio);
+	kfree(*rbio);
+	*rbio = NULL;
+	kfree(op);
+	bch2_write_ref_put(c, BCH_WRITE_REF_promote);	/* drop the ref taken at the top */
+	return NULL;
+}
+
+noinline
+static struct promote_op *promote_alloc(struct btree_trans *trans,	/* decide whether to promote and, if so, allocate the op */
+					struct bvec_iter iter,
+					struct bkey_s_c k,
+					struct extent_ptr_decoded *pick,
+					struct bch_io_opts opts,
+					unsigned flags,
+					struct bch_read_bio **rbio,
+					bool *bounce,
+					bool *read_full)
+{
+	struct bch_fs *c = trans->c;
+	bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
+	/* data might have to be decompressed in the write path: */
+	unsigned sectors = promote_full
+		? max(pick->crc.compressed_size, pick->crc.live_size)
+		: bvec_iter_sectors(iter);
+	struct bpos pos = promote_full
+		? bkey_start_pos(k.k)
+		: POS(k.k->p.inode, iter.bi_sector);
+	struct promote_op *promote;
+	int ret;
+
+	ret = should_promote(c, k, pos, opts, flags);
+	if (ret)
+		goto nopromote;
+
+	promote = __promote_alloc(trans,
+				  k.k->type == KEY_TYPE_reflink_v
+				  ? BTREE_ID_reflink
+				  : BTREE_ID_extents,
+				  k, pos, pick, opts, sectors, rbio);
+	if (!promote) {
+		ret = -BCH_ERR_nopromote_enomem;	/* every NULL return is traced as enomem */
+		goto nopromote;
+	}
+
+	*bounce		= true;	/* outputs are only written on success */
+	*read_full	= promote_full;
+	return promote;
+nopromote:
+	trace_read_nopromote(c, ret);
+	return NULL;
+}
+
+/* Read */
+
+#define READ_RETRY_AVOID	1
+#define READ_RETRY		2
+#define READ_ERR		3
+
+enum rbio_context {	/* order matters: bch2_rbio_punt() compares these with <= */
+	RBIO_CONTEXT_NULL,
+	RBIO_CONTEXT_HIGHPRI,
+	RBIO_CONTEXT_UNBOUND,
+};
+
+static inline struct bch_read_bio *
+bch2_rbio_parent(struct bch_read_bio *rbio)
+{
+	return !rbio->split ? rbio : rbio->parent;	/* a non-split rbio is its own parent */
+}
+
+__always_inline
+static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,	/* run fn now, or queue it on wq if a higher context is needed */
+			   enum rbio_context context,
+			   struct workqueue_struct *wq)
+{
+	if (context <= rbio->context) {	/* already in a sufficient context: call inline */
+		fn(&rbio->work);
+	} else {
+		rbio->work.func		= fn;
+		rbio->context		= context;
+		queue_work(wq, &rbio->work);
+	}
+}
+
+static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)	/* release rbio's resources; returns the parent if it was a split */
+{
+	BUG_ON(rbio->bounce && !rbio->split);	/* only split rbios may own bounce pages */
+
+	if (rbio->promote)
+		promote_free(rbio->c, rbio->promote);
+	rbio->promote = NULL;
+
+	if (rbio->bounce)
+		bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
+
+	if (rbio->split) {
+		struct bch_read_bio *parent = rbio->parent;	/* read before rbio is freed below */
+
+		if (rbio->kmalloc)
+			kfree(rbio);	/* kmalloc flag is set by __promote_alloc() */
+		else
+			bio_put(&rbio->bio);
+
+		rbio = parent;
+	}
+
+	return rbio;
+}
+
+/*
+ * Only called on a top level bch_read_bio to complete an entire read request,
+ * not a split; records read latency if start_time is set, then ends the bio:
+ */
+static void bch2_rbio_done(struct bch_read_bio *rbio)
+{
+	if (rbio->start_time)
+		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
+				       rbio->start_time);
+	bio_endio(&rbio->bio);
+}
+
+static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,	/* retry path used by bch2_rbio_retry() for BCH_READ_NODECODE reads */
+				     struct bvec_iter bvec_iter,
+				     struct bch_io_failures *failed,
+				     unsigned flags)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_buf sk;
+	struct bkey_s_c k;
+	int ret;
+
+	flags &= ~BCH_READ_LAST_FRAGMENT;
+	flags |= BCH_READ_MUST_CLONE;
+
+	bch2_bkey_buf_init(&sk);
+
+	bch2_trans_iter_init(trans, &iter, rbio->data_btree,
+			     rbio->read_pos, BTREE_ITER_SLOTS);
+retry:
+	rbio->bio.bi_status = 0;
+
+	k = bch2_btree_iter_peek_slot(&iter);
+	if (bkey_err(k))
+		goto err;
+
+	bch2_bkey_buf_reassemble(&sk, c, k);	/* private copy so the key stays valid after unlocking */
+	k = bkey_i_to_s_c(sk.k);
+	bch2_trans_unlock(trans);
+
+	if (!bch2_bkey_matches_ptr(c, k,
+				   rbio->pick.ptr,
+				   rbio->data_pos.offset -
+				   rbio->pick.crc.offset)) {
+		/* extent we wanted to read no longer exists: */
+		rbio->hole = true;
+		goto out;
+	}
+
+	ret = __bch2_read_extent(trans, rbio, bvec_iter,
+				 rbio->read_pos,
+				 rbio->data_btree,
+				 k, 0, failed, flags);
+	if (ret == READ_RETRY)
+		goto retry;
+	if (ret)
+		goto err;
+out:
+	bch2_rbio_done(rbio);	/* the bio is completed on every path, including errors */
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
+	bch2_bkey_buf_exit(&sk, c);
+	return;
+err:
+	rbio->bio.bi_status = BLK_STS_IOERR;
+	goto out;
+}
+/* Work item queued by bch2_rbio_error(): free the failed rbio and reissue. */
+static void bch2_rbio_retry(struct work_struct *work)
+{
+	struct bch_read_bio *rbio =
+		container_of(work, struct bch_read_bio, work);
+	struct bch_fs *c	= rbio->c;
+	struct bvec_iter iter	= rbio->bvec_iter;
+	unsigned flags		= rbio->flags;
+	subvol_inum inum = {
+		.subvol = rbio->subvol,
+		.inum	= rbio->read_pos.inode,
+	};
+	struct bch_io_failures failed = { .nr = 0 };
+
+	trace_and_count(c, read_retry, &rbio->bio);
+
+	if (rbio->retry == READ_RETRY_AVOID)
+		bch2_mark_io_failure(&failed, &rbio->pick);
+
+	rbio->bio.bi_status = 0;
+
+	rbio = bch2_rbio_free(rbio); /* continue on the parent if this was a split */
+
+	flags |= BCH_READ_IN_RETRY;
+	flags &= ~BCH_READ_MAY_PROMOTE;
+
+	if (flags & BCH_READ_NODECODE) {
+		bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
+	} else {
+		flags &= ~BCH_READ_LAST_FRAGMENT;
+		flags |= BCH_READ_MUST_CLONE;
+
+		__bch2_read(c, rbio, iter, inum, &failed, flags);
+	}
+}
+/* Record the retry disposition in @rbio, then fail the read or queue a retry. */
+static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
+			    blk_status_t error)
+{
+	rbio->retry = retry;
+
+	if (rbio->flags & BCH_READ_IN_RETRY)
+		return; /* __bch2_read_extent() reads ->retry itself in this mode */
+
+	if (retry == READ_ERR) {
+		rbio = bch2_rbio_free(rbio);
+
+		rbio->bio.bi_status = error;
+		bch2_rbio_done(rbio);
+	} else {
+		bch2_rbio_punt(rbio, bch2_rbio_retry,
+			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+	}
+}
+/* Rechecksum the data just read and, if the extent is unchanged, update its crc */
+static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
+				   struct bch_read_bio *rbio)
+{
+	struct bch_fs *c = rbio->c;
+	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
+	struct bch_extent_crc_unpacked new_crc;
+	struct btree_iter iter;
+	struct bkey_i *new;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	if (crc_is_compressed(rbio->pick.crc))
+		return 0;
+
+	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
+			       BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+	if ((ret = bkey_err(k)))
+		goto out;
+
+	if (bversion_cmp(k.k->version, rbio->version) ||
+	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
+		goto out;
+
+	/* Extent was merged? */
+	if (bkey_start_offset(k.k) < data_offset ||
+	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
+		goto out;
+
+	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
+			rbio->pick.crc, NULL, &new_crc,
+			bkey_start_offset(k.k) - data_offset, k.k->size,
+			rbio->pick.crc.csum_type)) {
+		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
+		ret = 0; /* logged only: the failure is not returned to the caller */
+		goto out;
+	}
+
+	/*
+	 * going to be temporarily appending another checksum entry:
+	 */
+	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
+				 sizeof(struct bch_extent_crc128));
+	if ((ret = PTR_ERR_OR_ZERO(new)))
+		goto out;
+
+	bkey_reassemble(new, k);
+
+	if (!bch2_bkey_narrow_crcs(new, new_crc))
+		goto out;
+
+	ret = bch2_trans_update(trans, &iter, new,
+				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+out:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+/* Run __bch2_rbio_narrow_crcs() via bch2_trans_do(); the result is ignored. */
+static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
+{
+	bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL,
+		      __bch2_rbio_narrow_crcs(trans, rbio));
+}
+
+/* Inner part of read completion (may run in process context): verify + decode */
+static void __bch2_read_endio(struct work_struct *work)
+{
+	struct bch_read_bio *rbio =
+		container_of(work, struct bch_read_bio, work);
+	struct bch_fs *c	= rbio->c;
+	struct bch_dev *ca	= bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
+	struct bio *src		= &rbio->bio;
+	struct bio *dst		= &bch2_rbio_parent(rbio)->bio;
+	struct bvec_iter dst_iter = rbio->bvec_iter;
+	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
+	struct nonce nonce = extent_nonce(rbio->version, crc);
+	unsigned nofs_flags;
+	struct bch_csum csum;
+	int ret;
+
+	nofs_flags = memalloc_nofs_save();
+
+	/* Reset iterator for checksumming and copying bounced data: */
+	if (rbio->bounce) {
+		src->bi_iter.bi_size		= crc.compressed_size << 9;
+		src->bi_iter.bi_idx		= 0;
+		src->bi_iter.bi_bvec_done	= 0;
+	} else {
+		src->bi_iter			= rbio->bvec_iter;
+	}
+
+	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
+	if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
+		goto csum_err;
+
+	/*
+	 * XXX
+	 * We need to rework the narrow_crcs path to deliver the read completion
+	 * first, and then punt to a different workqueue, otherwise we're
+	 * holding up reads while doing btree updates which is bad for memory
+	 * reclaim.
+	 */
+	if (unlikely(rbio->narrow_crcs))
+		bch2_rbio_narrow_crcs(rbio);
+
+	if (rbio->flags & BCH_READ_NODECODE)
+		goto nodecode;
+
+	/* Adjust crc to point to subset of data we want: */
+	crc.offset     += rbio->offset_into_extent;
+	crc.live_size	= bvec_iter_sectors(rbio->bvec_iter);
+
+	if (crc_is_compressed(crc)) {
+		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+		if (ret)
+			goto decrypt_err;
+
+		if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
+		    !c->opts.no_data_io)
+			goto decompression_err;
+	} else {
+		/* don't need to decrypt the entire bio: */
+		nonce = nonce_add(nonce, crc.offset << 9);
+		bio_advance(src, crc.offset << 9);
+
+		BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
+		src->bi_iter.bi_size = dst_iter.bi_size;
+
+		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+		if (ret)
+			goto decrypt_err;
+
+		if (rbio->bounce) {
+			struct bvec_iter src_iter = src->bi_iter;
+
+			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
+		}
+	}
+
+	if (rbio->promote) {
+		/*
+		 * Re encrypt data we decrypted, so it's consistent with
+		 * rbio->crc:
+		 */
+		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+		if (ret)
+			goto decrypt_err;
+
+		promote_start(rbio->promote, rbio);
+		rbio->promote = NULL;
+	}
+nodecode:
+	if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
+		rbio = bch2_rbio_free(rbio);
+		bch2_rbio_done(rbio);
+	}
+out:
+	memalloc_nofs_restore(nofs_flags);
+	return;
+csum_err:
+	/*
+	 * Checksum error: if the bio wasn't bounced, we may have been
+	 * reading into buffers owned by userspace (that userspace can
+	 * scribble over) - retry the read, bouncing it this time:
+	 */
+	if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
+		rbio->flags |= BCH_READ_MUST_BOUNCE;
+		bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
+		goto out;
+	}
+
+	bch_err_inum_offset_ratelimited(ca,
+		rbio->read_pos.inode,
+		rbio->read_pos.offset << 9,
+		"data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
+		rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
+		csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
+	bch2_io_error(ca);
+	bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+	goto out;
+decompression_err:
+	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
+					rbio->read_pos.offset << 9,
+					"decompression error");
+	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+	goto out;
+decrypt_err:
+	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
+					rbio->read_pos.offset << 9,
+					"decrypt error");
+	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+	goto out;
+}
+/* bi_end_io for reads: handle IO errors and stale pointers, then punt decoding */
+static void bch2_read_endio(struct bio *bio)
+{
+	struct bch_read_bio *rbio =
+		container_of(bio, struct bch_read_bio, bio);
+	struct bch_fs *c	= rbio->c;
+	struct bch_dev *ca	= bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
+	struct workqueue_struct *wq = NULL;
+	enum rbio_context context = RBIO_CONTEXT_NULL;
+
+	if (rbio->have_ioref) {
+		bch2_latency_acct(ca, rbio->submit_time, READ);
+		percpu_ref_put(&ca->io_ref);
+	}
+
+	if (!rbio->split)
+		rbio->bio.bi_end_io = rbio->end_io;
+
+	if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
+				    rbio->read_pos.inode,
+				    rbio->read_pos.offset, /* NOTE(review): not << 9 as elsewhere? */
+				    "data read error: %s",
+			       bch2_blk_status_to_str(bio->bi_status))) {
+		bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
+		return;
+	}
+
+	if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
+	    ptr_stale(ca, &rbio->pick.ptr)) {
+		trace_and_count(c, read_reuse_race, &rbio->bio);
+
+		if (rbio->flags & BCH_READ_RETRY_IF_STALE)
+			bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
+		else
+			bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
+		return;
+	}
+
+	if (rbio->narrow_crcs ||
+	    rbio->promote ||
+	    crc_is_compressed(rbio->pick.crc) ||
+	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
+		context = RBIO_CONTEXT_UNBOUND,	wq = system_unbound_wq;
+	else if (rbio->pick.crc.csum_type)
+		context = RBIO_CONTEXT_HIGHPRI,	wq = system_highpri_wq;
+
+	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
+}
+/* Look up the indirect extent reflink_p @orig_k points to; replaces @orig_k. */
+int __bch2_read_indirect_extent(struct btree_trans *trans,
+				unsigned *offset_into_extent,
+				struct bkey_buf *orig_k)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	u64 reflink_offset;
+	int ret;
+
+	reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
+		*offset_into_extent;
+
+	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
+			       POS(0, reflink_offset), 0);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	if (k.k->type != KEY_TYPE_reflink_v &&
+	    k.k->type != KEY_TYPE_indirect_inline_data) {
+		bch_err_inum_offset_ratelimited(trans->c,
+			orig_k->k->k.p.inode,
+			orig_k->k->k.p.offset << 9,
+			"%llu len %u points to nonexistent indirect extent %llu",
+			orig_k->k->k.p.offset,
+			orig_k->k->k.size,
+			reflink_offset);
+		bch2_inconsistent_error(trans->c);
+		ret = -EIO;
+		goto err;
+	}
+
+	*offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
+	bch2_bkey_buf_reassemble(orig_k, trans->c, k);
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+/* Log @k, the in-memory bucket gen and the alloc key, then flag inconsistency. */
+static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
+						   struct bkey_s_c k,
+						   struct bch_extent_ptr ptr)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
+	struct btree_iter iter;
+	struct printbuf buf = PRINTBUF;
+	int ret;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+			     PTR_BUCKET_POS(c, &ptr),
+			     BTREE_ITER_CACHED);
+
+	prt_printf(&buf, "Attempting to read from stale dirty pointer:");
+	printbuf_indent_add(&buf, 2);
+	prt_newline(&buf);
+
+	bch2_bkey_val_to_text(&buf, c, k);
+	prt_newline(&buf);
+
+	prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
+
+	ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+	if (!ret) {
+		prt_newline(&buf);
+		bch2_bkey_val_to_text(&buf, c, k);
+	}
+
+	bch2_fs_inconsistent(c, "%s", buf.buf);
+
+	bch2_trans_iter_exit(trans, &iter);
+	printbuf_exit(&buf);
+}
+/* Read extent @k into @orig; nonzero (READ_RETRY/READ_ERR) only in IN_RETRY. */
+int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
+		       struct bvec_iter iter, struct bpos read_pos,
+		       enum btree_id data_btree, struct bkey_s_c k,
+		       unsigned offset_into_extent,
+		       struct bch_io_failures *failed, unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	struct extent_ptr_decoded pick;
+	struct bch_read_bio *rbio = NULL;
+	struct bch_dev *ca = NULL;
+	struct promote_op *promote = NULL;
+	bool bounce = false, read_full = false, narrow_crcs = false;
+	struct bpos data_pos = bkey_start_pos(k.k);
+	int pick_ret;
+
+	if (bkey_extent_is_inline_data(k.k)) {
+		unsigned bytes = min_t(unsigned, iter.bi_size,
+				       bkey_inline_data_bytes(k.k));
+
+		swap(iter.bi_size, bytes);
+		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
+		swap(iter.bi_size, bytes);
+		bio_advance_iter(&orig->bio, &iter, bytes);
+		zero_fill_bio_iter(&orig->bio, iter);
+		goto out_read_done;
+	}
+retry_pick:
+	pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
+
+	/* hole or reservation - just zero fill: */
+	if (!pick_ret)
+		goto hole;
+
+	if (pick_ret < 0) {
+		bch_err_inum_offset_ratelimited(c,
+				read_pos.inode, read_pos.offset << 9,
+				"no device to read from");
+		goto err;
+	}
+
+	ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
+	/*
+	 * Stale dirty pointers are treated as IO errors, but @failed isn't
+	 * allocated unless we're in the retry path - so if we're not in the
+	 * retry path, don't check here, it'll be caught in bch2_read_endio()
+	 * and we'll end up in the retry path:
+	 */
+	if ((flags & BCH_READ_IN_RETRY) &&
+	    !pick.ptr.cached &&
+	    unlikely(ptr_stale(ca, &pick.ptr))) {
+		read_from_stale_dirty_pointer(trans, k, pick.ptr);
+		bch2_mark_io_failure(failed, &pick);
+		goto retry_pick;
+	}
+
+	/*
+	 * Unlock the iterator while the btree node's lock is still in
+	 * cache, before doing the IO:
+	 */
+	bch2_trans_unlock(trans);
+
+	if (flags & BCH_READ_NODECODE) {
+		/*
+		 * can happen if we retry, and the extent we were going to read
+		 * has been merged in the meantime:
+		 */
+		if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
+			goto hole;
+
+		iter.bi_size	= pick.crc.compressed_size << 9;
+		goto get_bio;
+	}
+
+	if (!(flags & BCH_READ_LAST_FRAGMENT) ||
+	    bio_flagged(&orig->bio, BIO_CHAIN))
+		flags |= BCH_READ_MUST_CLONE;
+
+	narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
+		bch2_can_narrow_extent_crcs(k, pick.crc);
+
+	if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
+		flags |= BCH_READ_MUST_BOUNCE;
+
+	EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
+
+	if (crc_is_compressed(pick.crc) ||
+	    (pick.crc.csum_type != BCH_CSUM_none &&
+	     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+	      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
+	       (flags & BCH_READ_USER_MAPPED)) ||
+	      (flags & BCH_READ_MUST_BOUNCE)))) {
+		read_full = true;
+		bounce = true;
+	}
+
+	if (orig->opts.promote_target)
+		promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
+					&rbio, &bounce, &read_full);
+
+	if (!read_full) {
+		EBUG_ON(crc_is_compressed(pick.crc));
+		EBUG_ON(pick.crc.csum_type &&
+			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+			 bvec_iter_sectors(iter) != pick.crc.live_size ||
+			 pick.crc.offset ||
+			 offset_into_extent));
+
+		data_pos.offset += offset_into_extent;
+		pick.ptr.offset += pick.crc.offset +
+			offset_into_extent;
+		offset_into_extent		= 0;
+		pick.crc.compressed_size	= bvec_iter_sectors(iter);
+		pick.crc.uncompressed_size	= bvec_iter_sectors(iter);
+		pick.crc.offset			= 0;
+		pick.crc.live_size		= bvec_iter_sectors(iter);
+	}
+get_bio:
+	if (rbio) {
+		/*
+		 * promote already allocated bounce rbio:
+		 * promote needs to allocate a bio big enough for uncompressing
+		 * data in the write path, but we're not going to use it all
+		 * here:
+		 */
+		EBUG_ON(rbio->bio.bi_iter.bi_size <
+		       pick.crc.compressed_size << 9);
+		rbio->bio.bi_iter.bi_size =
+			pick.crc.compressed_size << 9;
+	} else if (bounce) {
+		unsigned sectors = pick.crc.compressed_size;
+
+		rbio = rbio_init(bio_alloc_bioset(NULL,
+						  DIV_ROUND_UP(sectors, PAGE_SECTORS),
+						  0,
+						  GFP_NOFS,
+						  &c->bio_read_split),
+				 orig->opts);
+
+		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
+		rbio->bounce	= true;
+		rbio->split	= true;
+	} else if (flags & BCH_READ_MUST_CLONE) {
+		/*
+		 * Have to clone if there were any splits, due to error
+		 * reporting issues (if a split errored, and retrying didn't
+		 * work, when it reports the error to its parent (us) we don't
+		 * know if the error was from our bio, and we should retry, or
+		 * from the whole bio, in which case we don't want to retry and
+		 * lose the error)
+		 */
+		rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
+						 &c->bio_read_split),
+				 orig->opts);
+		rbio->bio.bi_iter = iter;
+		rbio->split	= true;
+	} else {
+		rbio = orig;
+		rbio->bio.bi_iter = iter;
+		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
+	}
+
+	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
+
+	rbio->c			= c;
+	rbio->submit_time	= local_clock();
+	if (rbio->split)
+		rbio->parent	= orig;
+	else
+		rbio->end_io	= orig->bio.bi_end_io;
+	rbio->bvec_iter		= iter;
+	rbio->offset_into_extent= offset_into_extent;
+	rbio->flags		= flags;
+	rbio->have_ioref	= pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
+	rbio->narrow_crcs	= narrow_crcs;
+	rbio->hole		= 0;
+	rbio->retry		= 0;
+	rbio->context		= 0;
+	/* XXX: only initialize this if needed */
+	rbio->devs_have		= bch2_bkey_devs(k);
+	rbio->pick		= pick;
+	rbio->subvol		= orig->subvol;
+	rbio->read_pos		= read_pos;
+	rbio->data_btree	= data_btree;
+	rbio->data_pos		= data_pos;
+	rbio->version		= k.k->version;
+	rbio->promote		= promote;
+	INIT_WORK(&rbio->work, NULL);
+
+	rbio->bio.bi_opf	= orig->bio.bi_opf;
+	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
+	rbio->bio.bi_end_io	= bch2_read_endio;
+
+	if (rbio->bounce)
+		trace_and_count(c, read_bounce, &rbio->bio);
+
+	this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
+	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
+
+	/*
+	 * If it's being moved internally, we don't want to flag it as a cache
+	 * hit:
+	 */
+	if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
+		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
+			PTR_BUCKET_NR(ca, &pick.ptr), READ);
+
+	if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
+		bio_inc_remaining(&orig->bio);
+		trace_and_count(c, read_split, &orig->bio);
+	}
+
+	if (!rbio->pick.idx) {
+		if (!rbio->have_ioref) {
+			bch_err_inum_offset_ratelimited(c,
+					read_pos.inode,
+					read_pos.offset << 9,
+					"no device to read from");
+			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+			goto out;
+		}
+
+		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
+			     bio_sectors(&rbio->bio));
+		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
+
+		if (unlikely(c->opts.no_data_io)) {
+			if (likely(!(flags & BCH_READ_IN_RETRY)))
+				bio_endio(&rbio->bio);
+		} else {
+			if (likely(!(flags & BCH_READ_IN_RETRY)))
+				submit_bio(&rbio->bio);
+			else
+				submit_bio_wait(&rbio->bio);
+		}
+
+		/*
+		 * We just submitted IO which may block, we expect relock fail
+		 * events and shouldn't count them:
+		 */
+		trans->notrace_relock_fail = true;
+	} else {
+		/* Attempting reconstruct read: */
+		if (bch2_ec_read_extent(c, rbio)) {
+			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+			goto out;
+		}
+
+		if (likely(!(flags & BCH_READ_IN_RETRY)))
+			bio_endio(&rbio->bio);
+	}
+out:
+	if (likely(!(flags & BCH_READ_IN_RETRY))) {
+		return 0;
+	} else {
+		int ret;
+
+		rbio->context = RBIO_CONTEXT_UNBOUND;
+		bch2_read_endio(&rbio->bio);
+
+		ret = rbio->retry; /* set by bch2_rbio_error() if the read failed */
+		rbio = bch2_rbio_free(rbio);
+
+		if (ret == READ_RETRY_AVOID) {
+			bch2_mark_io_failure(failed, &pick);
+			ret = READ_RETRY;
+		}
+
+		if (!ret)
+			goto out_read_done;
+
+		return ret;
+	}
+
+err:
+	if (flags & BCH_READ_IN_RETRY)
+		return READ_ERR;
+
+	orig->bio.bi_status = BLK_STS_IOERR;
+	goto out_read_done;
+
+hole:
+	/*
+	 * won't normally happen in the BCH_READ_NODECODE
+	 * (bch2_move_extent()) path, but if we retry and the extent we wanted
+	 * to read no longer exists we have to signal that:
+	 */
+	if (flags & BCH_READ_NODECODE)
+		orig->hole = true;
+
+	zero_fill_bio_iter(&orig->bio, iter);
+out_read_done:
+	if (flags & BCH_READ_LAST_FRAGMENT)
+		bch2_rbio_done(orig);
+	return 0;
+}
+/* Read @bvec_iter of @rbio from inode @inum, one extent at a time; may retry. */
+void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
+		 struct bvec_iter bvec_iter, subvol_inum inum,
+		 struct bch_io_failures *failed, unsigned flags)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_buf sk;
+	struct bkey_s_c k;
+	u32 snapshot;
+	int ret;
+
+	BUG_ON(flags & BCH_READ_NODECODE);
+
+	bch2_bkey_buf_init(&sk);
+retry:
+	bch2_trans_begin(trans);
+	iter = (struct btree_iter) { NULL };
+
+	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+	if (ret)
+		goto err;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+			     SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
+			     BTREE_ITER_SLOTS);
+	while (1) {
+		unsigned bytes, sectors, offset_into_extent;
+		enum btree_id data_btree = BTREE_ID_extents;
+
+		/*
+		 * read_extent -> io_time_reset may cause a transaction restart
+		 * without returning an error, we need to check for that here:
+		 */
+		ret = bch2_trans_relock(trans);
+		if (ret)
+			break;
+
+		bch2_btree_iter_set_pos(&iter,
+				POS(inum.inum, bvec_iter.bi_sector));
+
+		k = bch2_btree_iter_peek_slot(&iter);
+		ret = bkey_err(k);
+		if (ret)
+			break;
+
+		offset_into_extent = iter.pos.offset -
+			bkey_start_offset(k.k);
+		sectors = k.k->size - offset_into_extent;
+
+		bch2_bkey_buf_reassemble(&sk, c, k);
+
+		ret = bch2_read_indirect_extent(trans, &data_btree,
+					&offset_into_extent, &sk);
+		if (ret)
+			break;
+
+		k = bkey_i_to_s_c(sk.k);
+
+		/*
+		 * With indirect extents, the amount of data to read is the min
+		 * of the original extent and the indirect extent:
+		 */
+		sectors = min(sectors, k.k->size - offset_into_extent);
+
+		bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
+		swap(bvec_iter.bi_size, bytes); /* bi_size: this fragment; bytes: total */
+
+		if (bvec_iter.bi_size == bytes)
+			flags |= BCH_READ_LAST_FRAGMENT;
+
+		ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
+					 data_btree, k,
+					 offset_into_extent, failed, flags);
+		if (ret)
+			break;
+
+		if (flags & BCH_READ_LAST_FRAGMENT)
+			break;
+
+		swap(bvec_iter.bi_size, bytes); /* restore total, then skip the fragment */
+		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
+
+		ret = btree_trans_too_many_iters(trans);
+		if (ret)
+			break;
+	}
+err:
+	bch2_trans_iter_exit(trans, &iter);
+
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+	    ret == READ_RETRY ||
+	    ret == READ_RETRY_AVOID)
+		goto retry;
+
+	bch2_trans_put(trans);
+	bch2_bkey_buf_exit(&sk, c);
+
+	if (ret) {
+		bch_err_inum_offset_ratelimited(c, inum.inum,
+						bvec_iter.bi_sector << 9,
+						"read error %i from btree lookup", ret);
+		rbio->bio.bi_status = BLK_STS_IOERR;
+		bch2_rbio_done(rbio);
+	}
+}
+/* Free the promote table (if initialized) and both read biosets. */
+void bch2_fs_io_read_exit(struct bch_fs *c)
+{
+	if (c->promote_table.tbl)
+		rhashtable_destroy(&c->promote_table);
+	bioset_exit(&c->bio_read_split);
+	bioset_exit(&c->bio_read);
+}
+/* Init both read biosets and the promote table; 0 or -BCH_ERR_ENOMEM_* code. */
+int bch2_fs_io_read_init(struct bch_fs *c)
+{
+	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
+			BIOSET_NEED_BVECS))
+		return -BCH_ERR_ENOMEM_bio_read_init;
+
+	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+			BIOSET_NEED_BVECS))
+		return -BCH_ERR_ENOMEM_bio_read_split_init;
+
+	if (rhashtable_init(&c->promote_table, &bch_promote_params))
+		return -BCH_ERR_ENOMEM_promote_table_init;
+
+	return 0;
+}
diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h
new file mode 100644
index 000000000000..d9c18bb7d403
--- /dev/null
+++ b/fs/bcachefs/io_read.h
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_READ_H
+#define _BCACHEFS_IO_READ_H
+
+#include "bkey_buf.h"
+/* State for one read; embeds the bio last (biosets pad by offsetof(..., bio)). */
+struct bch_read_bio {
+	struct bch_fs		*c;
+	u64			start_time;
+	u64			submit_time;
+
+	/*
+	 * Reads will often have to be split, and if the extent being read from
+	 * was checksummed or compressed we'll also have to allocate bounce
+	 * buffers and copy the data back into the original bio.
+	 *
+	 * If we didn't have to split, we have to save and restore the original
+	 * bi_end_io - @split below indicates which:
+	 */
+	union {
+	struct bch_read_bio	*parent;
+	bio_end_io_t		*end_io;
+	};
+
+	/*
+	 * Saved copy of bio->bi_iter, from submission time - allows us to
+	 * resubmit on IO error, and also to copy data back to the original bio
+	 * when we're bouncing:
+	 */
+	struct bvec_iter	bvec_iter;
+
+	unsigned		offset_into_extent;
+
+	u16			flags; /* BCH_READ_* bits */
+	union {
+	struct {
+	u16			bounce:1,
+				split:1,
+				kmalloc:1, /* freed with kfree(), not bio_put() */
+				have_ioref:1,
+				narrow_crcs:1,
+				hole:1,
+				retry:2, /* set by bch2_rbio_error() */
+				context:2;
+	};
+	u16			_state; /* all of the bits above, for bulk reset */
+	};
+
+	struct bch_devs_list	devs_have;
+
+	struct extent_ptr_decoded pick;
+
+	/*
+	 * pos we read from - different from data_pos for indirect extents:
+	 */
+	u32			subvol;
+	struct bpos		read_pos;
+
+	/*
+	 * start pos of data we read (may not be pos of data we want) - for
+	 * promote, narrow extents paths:
+	 */
+	enum btree_id		data_btree;
+	struct bpos		data_pos;
+	struct bversion		version;
+
+	struct promote_op	*promote;
+
+	struct bch_io_opts	opts;
+
+	struct work_struct	work;
+
+	struct bio		bio;
+};
+
+#define to_rbio(_bio)		container_of((_bio), struct bch_read_bio, bio)
+
+struct bch_devs_mask;
+struct cache_promote_op;
+struct extent_ptr_decoded;
+
+int __bch2_read_indirect_extent(struct btree_trans *, unsigned *,
+				struct bkey_buf *);
+/* For a reflink_p key, resolve it via the reflink btree; otherwise a no-op. */
+static inline int bch2_read_indirect_extent(struct btree_trans *trans,
+					    enum btree_id *data_btree,
+					    unsigned *offset_into_extent,
+					    struct bkey_buf *k)
+{
+	if (k->k->k.type != KEY_TYPE_reflink_p)
+		return 0;
+
+	*data_btree = BTREE_ID_reflink;
+	return __bch2_read_indirect_extent(trans, offset_into_extent, k);
+}
+/* Read flags, stored in bch_read_bio.flags (u16) by __bch2_read_extent(). */
+enum bch_read_flags {
+	BCH_READ_RETRY_IF_STALE		= 1 << 0,
+	BCH_READ_MAY_PROMOTE		= 1 << 1,
+	BCH_READ_USER_MAPPED		= 1 << 2,
+	BCH_READ_NODECODE		= 1 << 3,
+	BCH_READ_LAST_FRAGMENT		= 1 << 4,
+
+	/* internal: */
+	BCH_READ_MUST_BOUNCE		= 1 << 5,
+	BCH_READ_MUST_CLONE		= 1 << 6,
+	BCH_READ_IN_RETRY		= 1 << 7,
+};
+
+int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
+		       struct bvec_iter, struct bpos, enum btree_id,
+		       struct bkey_s_c, unsigned,
+		       struct bch_io_failures *, unsigned);
+/* __bch2_read_extent() over @rbio's own iter, with no failure list; ret dropped */
+static inline void bch2_read_extent(struct btree_trans *trans,
+			struct bch_read_bio *rbio, struct bpos read_pos,
+			enum btree_id data_btree, struct bkey_s_c k,
+			unsigned offset_into_extent, unsigned flags)
+{
+	__bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
+			   data_btree, k, offset_into_extent, NULL, flags);
+}
+
+void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
+		 subvol_inum, struct bch_io_failures *, unsigned flags);
+/* Start a user-mapped read of @rbio from @inum; @rbio state must be clear. */
+static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
+			     subvol_inum inum)
+{
+	struct bch_io_failures failed = { .nr = 0 };
+
+	BUG_ON(rbio->_state);
+
+	rbio->c = c;
+	rbio->start_time = local_clock();
+	rbio->subvol = inum.subvol;
+
+	__bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
+		    BCH_READ_RETRY_IF_STALE|
+		    BCH_READ_MAY_PROMOTE|
+		    BCH_READ_USER_MAPPED);
+}
+/* Reset state bits and promote, set opts on the bch_read_bio wrapping @bio. */
+static inline struct bch_read_bio *rbio_init(struct bio *bio,
+					     struct bch_io_opts opts)
+{
+	struct bch_read_bio *rbio = to_rbio(bio);
+
+	rbio->_state	= 0;
+	rbio->promote	= NULL;
+	rbio->opts	= opts;
+	return rbio;
+}
+
+void bch2_fs_io_read_exit(struct bch_fs *);
+int bch2_fs_io_read_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_IO_READ_H */
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
new file mode 100644
index 000000000000..6e4f85eb6ec8
--- /dev/null
+++ b/fs/bcachefs/io_write.c
@@ -0,0 +1,1671 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "bset.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "clock.h"
+#include "compress.h"
+#include "debug.h"
+#include "ec.h"
+#include "error.h"
+#include "extent_update.h"
+#include "inode.h"
+#include "io_write.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "nocow_locking.h"
+#include "rebalance.h"
+#include "subvolume.h"
+#include "super.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/blkdev.h>
+#include <linux/prefetch.h>
+#include <linux/random.h>
+#include <linux/sched/mm.h>
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+
+static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
+				       u64 now, int rw)
+{
+	u64 latency_capable =
+		ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
+	/* ideally we'd be taking into account the device's variance here: */
+	u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
+	s64 latency_over = io_latency - latency_threshold;
+
+	if (latency_threshold && latency_over > 0) {
+		/*
+		 * bump up congested by approximately latency_over * 4 /
+		 * latency_threshold - we don't need much accuracy here so don't
+		 * bother with the divide:
+		 */
+		if (atomic_read(&ca->congested) < CONGESTED_MAX)
+			atomic_add(latency_over >>
+				   max_t(int, ilog2(latency_threshold) - 2, 0),
+				   &ca->congested);
+
+		ca->congested_last = now;
+	} else if (atomic_read(&ca->congested) > 0) {
+		atomic_dec(&ca->congested);
+	}
+}
+
+void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
+{
+	atomic64_t *latency = &ca->cur_latency[rw];
+	u64 now = local_clock();
+	u64 io_latency = time_after64(now, submit_time)
+		? now - submit_time
+		: 0;
+	u64 old, new, v = atomic64_read(latency);
+
+	do {
+		old = v;
+
+		/*
+		 * If the io latency was reasonably close to the current
+		 * latency, skip doing the update and atomic operation - most of
+		 * the time:
+		 */
+		if (abs((int) (old - io_latency)) < (old >> 1) &&
+		    now & ~(~0U << 5))
+			break;
+
+		new = ewma_add(old, io_latency, 5);
+	} while ((v = atomic64_cmpxchg(latency, old, new)) != old);
+
+	bch2_congested_acct(ca, io_latency, now, rw);
+
+	__bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
+}
+
+#endif
+
+/* Allocate, free from mempool: */
+
+void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
+{
+	struct bvec_iter_all iter;
+	struct bio_vec *bv;
+
+	bio_for_each_segment_all(bv, bio, iter)
+		if (bv->bv_page != ZERO_PAGE(0))	/* the zero page is never handed to the pool */
+			mempool_free(bv->bv_page, &c->bio_bounce_pages);
+	bio->bi_vcnt = 0;	/* bio now holds no bvecs */
+}
+
+static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
+{
+	struct page *page;
+
+	if (likely(!*using_mempool)) {
+		page = alloc_page(GFP_NOFS);	/* common case: plain page allocation */
+		if (unlikely(!page)) {
+			mutex_lock(&c->bio_bounce_pages_lock);	/* not dropped here: caller unlocks */
+			*using_mempool = true;	/* later calls go straight to the mempool */
+			goto pool_alloc;
+
+		}
+	} else {
+pool_alloc:
+		page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS);
+	}
+
+	return page;
+}
+
+void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
+			       size_t size)
+{
+	bool using_mempool = false;
+
+	while (size) {
+		struct page *page = __bio_alloc_page_pool(c, &using_mempool);
+		unsigned len = min_t(size_t, PAGE_SIZE, size);	/* last page may be partial */
+
+		BUG_ON(!bio_add_page(bio, page, len, 0));	/* bio must have room for every page */
+		size -= len;
+	}
+
+	if (using_mempool)	/* lock was taken by __bio_alloc_page_pool() on fallback */
+		mutex_unlock(&c->bio_bounce_pages_lock);
+}
+
+/* Extent update path: */
+
+int bch2_sum_sector_overwrites(struct btree_trans *trans,
+			       struct btree_iter *extent_iter,
+			       struct bkey_i *new,
+			       bool *usage_increasing,
+			       s64 *i_sectors_delta,
+			       s64 *disk_sectors_delta)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct bkey_s_c old;
+	unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
+	bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
+	int ret = 0;
+
+	*usage_increasing	= false;
+	*i_sectors_delta	= 0;
+	*disk_sectors_delta	= 0;
+
+	bch2_trans_copy_iter(&iter, extent_iter);
+
+	for_each_btree_key_upto_continue_norestart(iter,
+				new->k.p, BTREE_ITER_SLOTS, old, ret) {
+		s64 sectors = min(new->k.p.offset, old.k->p.offset) -
+			max(bkey_start_offset(&new->k),
+			    bkey_start_offset(old.k));
+
+		*i_sectors_delta += sectors *
+			(bkey_extent_is_allocation(&new->k) -
+			 bkey_extent_is_allocation(old.k));
+
+		*disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new));
+		*disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot
+			? sectors * bch2_bkey_nr_ptrs_fully_allocated(old)
+			: 0;
+
+		if (!*usage_increasing &&
+		    (new->k.p.snapshot != old.k->p.snapshot ||
+		     new_replicas > bch2_bkey_replicas(c, old) ||
+		     (!new_compressed && bch2_bkey_sectors_compressed(old))))
+			*usage_increasing = true;
+
+		if (bkey_ge(old.k->p, new->k.p))
+			break;
+	}
+
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
+						    struct btree_iter *extent_iter,
+						    u64 new_i_size,
+						    s64 i_sectors_delta)
+{
+	struct btree_iter iter;
+	struct bkey_i *k;
+	struct bkey_i_inode_v3 *inode;
+	unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
+	int ret;
+
+	k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes,
+			      SPOS(0,
+				   extent_iter->pos.inode,
+				   extent_iter->snapshot),
+			      BTREE_ITER_CACHED);
+	ret = PTR_ERR_OR_ZERO(k);
+	if (unlikely(ret))
+		return ret;
+
+	if (unlikely(k->k.type != KEY_TYPE_inode_v3)) {
+		k = bch2_inode_to_v3(trans, k);
+		ret = PTR_ERR_OR_ZERO(k);
+		if (unlikely(ret))
+			goto err;
+	}
+
+	inode = bkey_i_to_inode_v3(k);
+
+	if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) &&
+	    new_i_size > le64_to_cpu(inode->v.bi_size)) {
+		inode->v.bi_size = cpu_to_le64(new_i_size);
+		inode_update_flags = 0;
+	}
+
+	if (i_sectors_delta) {
+		le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
+		inode_update_flags = 0;
+	}
+
+	if (inode->k.p.snapshot != iter.snapshot) {
+		inode->k.p.snapshot = iter.snapshot;
+		inode_update_flags = 0;
+	}
+
+	ret = bch2_trans_update(trans, &iter, &inode->k_i,
+				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+				inode_update_flags);
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+int bch2_extent_update(struct btree_trans *trans,
+		       subvol_inum inum,
+		       struct btree_iter *iter,
+		       struct bkey_i *k,
+		       struct disk_reservation *disk_res,
+		       u64 new_i_size,
+		       s64 *i_sectors_delta_total,
+		       bool check_enospc)
+{
+	struct bpos next_pos;
+	bool usage_increasing;
+	s64 i_sectors_delta = 0, disk_sectors_delta = 0;
+	int ret;
+
+	/*
+	 * This traverses the iterator without changing iter->path->pos to
+	 * search_key() (which is pos + 1 for extents): we want there to be a
+	 * path already traversed at iter->pos because
+	 * bch2_trans_extent_update() will use it to attempt extent merging
+	 */
+	ret = __bch2_btree_iter_traverse(iter);
+	if (ret)
+		return ret;
+
+	ret = bch2_extent_trim_atomic(trans, iter, k);
+	if (ret)
+		return ret;
+
+	next_pos = k->k.p;	/* iter is repositioned here after a successful commit */
+
+	ret = bch2_sum_sector_overwrites(trans, iter, k,
+			&usage_increasing,
+			&i_sectors_delta,
+			&disk_sectors_delta);
+	if (ret)
+		return ret;
+
+	if (disk_res &&
+	    disk_sectors_delta > (s64) disk_res->sectors) {	/* top up the shortfall */
+		ret = bch2_disk_reservation_add(trans->c, disk_res,
+					disk_sectors_delta - disk_res->sectors,
+					!check_enospc || !usage_increasing
+					? BCH_DISK_RESERVATION_NOFAIL : 0);
+		if (ret)
+			return ret;
+	}
+
+	/*
+	 * Note:
+	 * We always have to do an inode update - even when i_size/i_sectors
+	 * aren't changing - for fsync to work properly; fsync relies on
+	 * inode->bi_journal_seq which is updated by the trigger code:
+	 */
+	ret =   bch2_extent_update_i_size_sectors(trans, iter,
+						  min(k->k.p.offset << 9, new_i_size),
+						  i_sectors_delta) ?:
+		bch2_trans_update(trans, iter, k, 0) ?:
+		bch2_trans_commit(trans, disk_res, NULL,
+				BTREE_INSERT_NOCHECK_RW|
+				BTREE_INSERT_NOFAIL);
+	if (unlikely(ret))
+		return ret;
+
+	if (i_sectors_delta_total)
+		*i_sectors_delta_total += i_sectors_delta;	/* accumulated, not overwritten */
+	bch2_btree_iter_set_pos(iter, next_pos);
+	return 0;
+}
+
+static int bch2_write_index_default(struct bch_write_op *op)
+{
+	struct bch_fs *c = op->c;
+	struct bkey_buf sk;
+	struct keylist *keys = &op->insert_keys;
+	struct bkey_i *k = bch2_keylist_front(keys);
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	subvol_inum inum = {
+		.subvol = op->subvol,
+		.inum	= k->k.p.inode,
+	};
+	int ret;
+
+	BUG_ON(!inum.subvol);
+
+	bch2_bkey_buf_init(&sk);
+
+	do {
+		bch2_trans_begin(trans);
+
+		k = bch2_keylist_front(keys);
+		bch2_bkey_buf_copy(&sk, c, k);
+
+		ret = bch2_subvolume_get_snapshot(trans, inum.subvol,
+						  &sk.k->k.p.snapshot);
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			continue;
+		if (ret)
+			break;
+
+		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+				     bkey_start_pos(&sk.k->k),
+				     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+		ret = bch2_extent_update(trans, inum, &iter, sk.k,
+					 &op->res,
+					 op->new_i_size, &op->i_sectors_delta,
+					 op->flags & BCH_WRITE_CHECK_ENOSPC);
+		bch2_trans_iter_exit(trans, &iter);
+
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			continue;
+		if (ret)
+			break;
+
+		if (bkey_ge(iter.pos, k->k.p))
+			bch2_keylist_pop_front(&op->insert_keys);
+		else
+			bch2_cut_front(iter.pos, k);
+	} while (!bch2_keylist_empty(keys));
+
+	bch2_trans_put(trans);
+	bch2_bkey_buf_exit(&sk, c);
+
+	return ret;
+}
+
+/* Writes */
+
+void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
+			       enum bch_data_type type,
+			       const struct bkey_i *k,
+			       bool nocow)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
+	const struct bch_extent_ptr *ptr;
+	struct bch_write_bio *n;
+	struct bch_dev *ca;
+
+	BUG_ON(c->opts.nochanges);
+
+	bkey_for_each_ptr(ptrs, ptr) {
+		BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
+		       !c->devs[ptr->dev]);
+
+		ca = bch_dev_bkey_exists(c, ptr->dev);
+
+		if (to_entry(ptr + 1) < ptrs.end) {
+			n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
+						GFP_NOFS, &ca->replica_set));
+
+			n->bio.bi_end_io	= wbio->bio.bi_end_io;
+			n->bio.bi_private	= wbio->bio.bi_private;
+			n->parent		= wbio;
+			n->split		= true;
+			n->bounce		= false;
+			n->put_bio		= true;
+			n->bio.bi_opf		= wbio->bio.bi_opf;
+			bio_inc_remaining(&wbio->bio);
+		} else {
+			n = wbio;
+			n->split		= false;
+		}
+
+		n->c			= c;
+		n->dev			= ptr->dev;
+		n->have_ioref		= nocow || bch2_dev_get_ioref(ca,
+					type == BCH_DATA_btree ? READ : WRITE);
+		n->nocow		= nocow;
+		n->submit_time		= local_clock();
+		n->inode_offset		= bkey_start_offset(&k->k);
+		n->bio.bi_iter.bi_sector = ptr->offset;
+
+		if (likely(n->have_ioref)) {
+			this_cpu_add(ca->io_done->sectors[WRITE][type],
+				     bio_sectors(&n->bio));
+
+			bio_set_dev(&n->bio, ca->disk_sb.bdev);
+
+			if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) {
+				bio_endio(&n->bio);
+				continue;
+			}
+
+			submit_bio(&n->bio);
+		} else {
+			n->bio.bi_status	= BLK_STS_REMOVED;
+			bio_endio(&n->bio);
+		}
+	}
+}
+
+static void __bch2_write(struct bch_write_op *);
+
+static void bch2_write_done(struct closure *cl)
+{
+	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+	struct bch_fs *c = op->c;
+
+	EBUG_ON(op->open_buckets.nr);
+
+	bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
+	bch2_disk_reservation_put(c, &op->res);
+
+	if (!(op->flags & BCH_WRITE_MOVE))
+		bch2_write_ref_put(c, BCH_WRITE_REF_write);
+	bch2_keylist_free(&op->insert_keys, op->inline_keys);
+
+	EBUG_ON(cl->parent);
+	closure_debug_destroy(cl);
+	if (op->end_io)
+		op->end_io(op);
+}
+
+static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
+{
+	struct keylist *keys = &op->insert_keys;
+	struct bch_extent_ptr *ptr;
+	struct bkey_i *src, *dst = keys->keys, *n;
+
+	for (src = keys->keys; src != keys->top; src = n) {
+		n = bkey_next(src);
+
+		if (bkey_extent_is_direct_data(&src->k)) {
+			bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
+					    test_bit(ptr->dev, op->failed.d));
+
+			if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
+				return -EIO;
+		}
+
+		if (dst != src)
+			memmove_u64s_down(dst, src, src->k.u64s);
+		dst = bkey_next(dst);
+	}
+
+	keys->top = dst;
+	return 0;
+}
+
+/**
+ * __bch2_write_index - after a write, update index to point to new data
+ * @op:		bch_write_op to process
+ */
+static void __bch2_write_index(struct bch_write_op *op)
+{
+	struct bch_fs *c = op->c;
+	struct keylist *keys = &op->insert_keys;
+	struct bkey_i *k;
+	unsigned dev;
+	int ret = 0;
+
+	if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
+		ret = bch2_write_drop_io_error_ptrs(op);
+		if (ret)
+			goto err;
+	}
+
+	/*
+	 * probably not the ideal place to hook this in, but I don't
+	 * particularly want to plumb io_opts all the way through the btree
+	 * update stack right now
+	 */
+	for_each_keylist_key(keys, k)
+		bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
+
+	if (!bch2_keylist_empty(keys)) {
+		u64 sectors_start = keylist_sectors(keys);
+
+		ret = !(op->flags & BCH_WRITE_MOVE)
+			? bch2_write_index_default(op)
+			: bch2_data_update_index_update(op);
+
+		BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
+		BUG_ON(keylist_sectors(keys) && !ret);	/* success implies all keys consumed */
+
+		op->written += sectors_start - keylist_sectors(keys);
+
+		if (ret && !bch2_err_matches(ret, EROFS)) {
+			struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
+
+			bch_err_inum_offset_ratelimited(c,
+				insert->k.p.inode, insert->k.p.offset << 9,
+				"write error while doing btree update: %s",
+				bch2_err_str(ret));
+		}
+
+		if (ret)
+			goto err;
+	}
+out:
+	/* If a bucket wasn't written, we can't erasure code it: */
+	for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
+		bch2_open_bucket_write_error(c, &op->open_buckets, dev);
+
+	bch2_open_buckets_put(c, &op->open_buckets);
+	return;
+err:
+	keys->top = keys->keys;	/* empty the keylist */
+	op->error = ret;
+	op->flags |= BCH_WRITE_DONE;	/* no further __bch2_write() for this op */
+	goto out;
+}
+
+static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
+{
+	if (state != wp->state) {
+		u64 now = ktime_get_ns();
+
+		if (wp->last_state_change &&
+		    time_after64(now, wp->last_state_change))
+			wp->time[wp->state] += now - wp->last_state_change;
+		wp->state = state;
+		wp->last_state_change = now;
+	}
+}
+
+static inline void wp_update_state(struct write_point *wp, bool running)
+{
+	enum write_point_state state;
+
+	state = running			 ? WRITE_POINT_running :
+		!list_empty(&wp->writes) ? WRITE_POINT_waiting_io
+					 : WRITE_POINT_stopped;
+
+	__wp_update_state(wp, state);
+}
+
+static void bch2_write_index(struct closure *cl)
+{
+	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+	struct write_point *wp = op->wp;
+	struct workqueue_struct *wq = index_update_wq(op);
+	unsigned long flags;
+
+	if ((op->flags & BCH_WRITE_DONE) &&
+	    (op->flags & BCH_WRITE_MOVE))
+		bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
+
+	spin_lock_irqsave(&wp->writes_lock, flags);
+	if (wp->state == WRITE_POINT_waiting_io)
+		__wp_update_state(wp, WRITE_POINT_waiting_work);
+	list_add_tail(&op->wp_list, &wp->writes);	/* dequeued by bch2_write_point_do_index_updates() */
+	spin_unlock_irqrestore (&wp->writes_lock, flags);
+
+	queue_work(wq, &wp->index_update_work);	/* the index update itself runs from the workqueue */
+}
+
+static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
+{
+	op->wp = wp;
+
+	if (wp->state == WRITE_POINT_stopped) {
+		spin_lock_irq(&wp->writes_lock);
+		__wp_update_state(wp, WRITE_POINT_waiting_io);
+		spin_unlock_irq(&wp->writes_lock);
+	}
+}
+
+void bch2_write_point_do_index_updates(struct work_struct *work)
+{
+	struct write_point *wp =
+		container_of(work, struct write_point, index_update_work);
+	struct bch_write_op *op;
+
+	while (1) {
+		spin_lock_irq(&wp->writes_lock);
+		op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
+		if (op)
+			list_del(&op->wp_list);
+		wp_update_state(wp, op != NULL);
+		spin_unlock_irq(&wp->writes_lock);
+
+		if (!op)
+			break;
+
+		op->flags |= BCH_WRITE_IN_WORKER;
+
+		__bch2_write_index(op);
+
+		if (!(op->flags & BCH_WRITE_DONE))
+			__bch2_write(op);
+		else
+			bch2_write_done(&op->cl);
+	}
+}
+
+static void bch2_write_endio(struct bio *bio)
+{
+	struct closure *cl		= bio->bi_private;
+	struct bch_write_op *op		= container_of(cl, struct bch_write_op, cl);
+	struct bch_write_bio *wbio	= to_wbio(bio);
+	struct bch_write_bio *parent	= wbio->split ? wbio->parent : NULL;
+	struct bch_fs *c		= wbio->c;
+	struct bch_dev *ca		= bch_dev_bkey_exists(c, wbio->dev);
+
+	if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
+				    op->pos.inode,
+				    wbio->inode_offset << 9,
+				    "data write error: %s",
+				    bch2_blk_status_to_str(bio->bi_status))) {
+		set_bit(wbio->dev, op->failed.d);
+		op->flags |= BCH_WRITE_IO_ERROR;
+	}
+
+	if (wbio->nocow)
+		set_bit(wbio->dev, op->devs_need_flush->d);
+
+	if (wbio->have_ioref) {
+		bch2_latency_acct(ca, wbio->submit_time, WRITE);
+		percpu_ref_put(&ca->io_ref);
+	}
+
+	if (wbio->bounce)
+		bch2_bio_free_pages_pool(c, bio);
+
+	if (wbio->put_bio)
+		bio_put(bio);
+
+	if (parent)
+		bio_endio(&parent->bio);
+	else
+		closure_put(cl);
+}
+
+static void init_append_extent(struct bch_write_op *op,
+			       struct write_point *wp,
+			       struct bversion version,
+			       struct bch_extent_crc_unpacked crc)
+{
+	struct bkey_i_extent *e;
+
+	op->pos.offset += crc.uncompressed_size;
+
+	e = bkey_extent_init(op->insert_keys.top);
+	e->k.p		= op->pos;
+	e->k.size	= crc.uncompressed_size;
+	e->k.version	= version;
+
+	if (crc.csum_type ||
+	    crc.compression_type ||
+	    crc.nonce)
+		bch2_extent_crc_append(&e->k_i, crc);
+
+	bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
+				       op->flags & BCH_WRITE_CACHED);
+
+	bch2_keylist_push(&op->insert_keys);
+}
+
+static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
+					struct write_point *wp,
+					struct bio *src,
+					bool *page_alloc_failed,
+					void *buf)
+{
+	struct bch_write_bio *wbio;
+	struct bio *bio;
+	unsigned output_available =
+		min(wp->sectors_free << 9, src->bi_iter.bi_size);
+	unsigned pages = DIV_ROUND_UP(output_available +
+				      (buf
+				       ? ((unsigned long) buf & (PAGE_SIZE - 1))
+				       : 0), PAGE_SIZE);
+
+	pages = min(pages, BIO_MAX_VECS);
+
+	bio = bio_alloc_bioset(NULL, pages, 0,
+			       GFP_NOFS, &c->bio_write);
+	wbio			= wbio_init(bio);
+	wbio->put_bio		= true;
+	/* copy WRITE_SYNC flag */
+	wbio->bio.bi_opf	= src->bi_opf;
+
+	if (buf) {
+		bch2_bio_map(bio, buf, output_available);
+		return bio;
+	}
+
+	wbio->bounce		= true;
+
+	/*
+	 * We can't use mempool for more than c->sb.encoded_extent_max
+	 * worth of pages, but we'd like to allocate more if we can:
+	 */
+	bch2_bio_alloc_pages_pool(c, bio,
+				  min_t(unsigned, output_available,
+					c->opts.encoded_extent_max));
+
+	if (bio->bi_iter.bi_size < output_available)
+		*page_alloc_failed =
+			bch2_bio_alloc_pages(bio,
+					     output_available -
+					     bio->bi_iter.bi_size,
+					     GFP_NOFS) != 0;
+
+	return bio;
+}
+
+static int bch2_write_rechecksum(struct bch_fs *c,
+				 struct bch_write_op *op,
+				 unsigned new_csum_type)
+{
+	struct bio *bio = &op->wbio.bio;
+	struct bch_extent_crc_unpacked new_crc;
+	int ret;
+
+	/* bch2_rechecksum_bio() can't encrypt or decrypt data: */
+
+	if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
+	    bch2_csum_type_is_encryption(new_csum_type))
+		new_csum_type = op->crc.csum_type;
+
+	ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
+				  NULL, &new_crc,
+				  op->crc.offset, op->crc.live_size,
+				  new_csum_type);
+	if (ret)
+		return ret;
+
+	bio_advance(bio, op->crc.offset << 9);
+	bio->bi_iter.bi_size = op->crc.live_size << 9;
+	op->crc = new_crc;
+	return 0;
+}
+
+static int bch2_write_decrypt(struct bch_write_op *op)
+{
+	struct bch_fs *c = op->c;
+	struct nonce nonce = extent_nonce(op->version, op->crc);
+	struct bch_csum csum;
+	int ret;
+
+	if (!bch2_csum_type_is_encryption(op->crc.csum_type))
+		return 0;
+
+	/*
+	 * If we need to decrypt data in the write path, we'll no longer be able
+	 * to verify the existing checksum (poly1305 mac, in this case) after
+	 * it's decrypted - this is the last point we'll be able to reverify the
+	 * checksum:
+	 */
+	csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
+	if (bch2_crc_cmp(op->crc.csum, csum))
+		return -EIO;
+
+	ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
+	op->crc.csum_type = 0;
+	op->crc.csum = (struct bch_csum) { 0, 0 };
+	return ret;
+}
+
+static enum prep_encoded_ret {
+	PREP_ENCODED_OK,
+	PREP_ENCODED_ERR,
+	PREP_ENCODED_CHECKSUM_ERR,
+	PREP_ENCODED_DO_WRITE,
+} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
+{
+	struct bch_fs *c = op->c;
+	struct bio *bio = &op->wbio.bio;
+
+	if (!(op->flags & BCH_WRITE_DATA_ENCODED))
+		return PREP_ENCODED_OK;
+
+	BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
+
+	/* Can we just write the entire extent as is? */
+	if (op->crc.uncompressed_size == op->crc.live_size &&
+	    op->crc.compressed_size <= wp->sectors_free &&
+	    (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
+	     op->incompressible)) {
+		if (!crc_is_compressed(op->crc) &&
+		    op->csum_type != op->crc.csum_type &&
+		    bch2_write_rechecksum(c, op, op->csum_type) &&
+		    !c->opts.no_data_io)
+			return PREP_ENCODED_CHECKSUM_ERR;
+
+		return PREP_ENCODED_DO_WRITE;
+	}
+
+	/*
+	 * If the data is compressed and we couldn't write the entire extent as
+	 * is, we have to decompress it:
+	 */
+	if (crc_is_compressed(op->crc)) {
+		struct bch_csum csum;
+
+		if (bch2_write_decrypt(op))
+			return PREP_ENCODED_CHECKSUM_ERR;
+
+		/* Last point we can still verify checksum: */
+		csum = bch2_checksum_bio(c, op->crc.csum_type,
+					 extent_nonce(op->version, op->crc),
+					 bio);
+		if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
+			return PREP_ENCODED_CHECKSUM_ERR;
+
+		if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
+			return PREP_ENCODED_ERR;
+	}
+
+	/*
+	 * No longer have compressed data after this point - data might be
+	 * encrypted:
+	 */
+
+	/*
+	 * If the data is checksummed and we're only writing a subset,
+	 * rechecksum and adjust bio to point to currently live data:
+	 */
+	if ((op->crc.live_size != op->crc.uncompressed_size ||
+	     op->crc.csum_type != op->csum_type) &&
+	    bch2_write_rechecksum(c, op, op->csum_type) &&
+	    !c->opts.no_data_io)
+		return PREP_ENCODED_CHECKSUM_ERR;
+
+	/*
+	 * If we want to compress the data, it has to be decrypted:
+	 */
+	if ((op->compression_opt ||
+	     bch2_csum_type_is_encryption(op->crc.csum_type) !=
+	     bch2_csum_type_is_encryption(op->csum_type)) &&
+	    bch2_write_decrypt(op))
+		return PREP_ENCODED_CHECKSUM_ERR;
+
+	return PREP_ENCODED_OK;
+}
+
+static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
+			     struct bio **_dst)
+{
+	struct bch_fs *c = op->c;
+	struct bio *src = &op->wbio.bio, *dst = src;
+	struct bvec_iter saved_iter;
+	void *ec_buf;
+	unsigned total_output = 0, total_input = 0;
+	bool bounce = false;
+	bool page_alloc_failed = false;
+	int ret, more = 0;
+
+	BUG_ON(!bio_sectors(src));
+
+	ec_buf = bch2_writepoint_ec_buf(c, wp);
+
+	switch (bch2_write_prep_encoded_data(op, wp)) {
+	case PREP_ENCODED_OK:
+		break;
+	case PREP_ENCODED_ERR:
+		ret = -EIO;
+		goto err;
+	case PREP_ENCODED_CHECKSUM_ERR:
+		goto csum_err;
+	case PREP_ENCODED_DO_WRITE:
+		/* XXX look for bug here */
+		if (ec_buf) {
+			dst = bch2_write_bio_alloc(c, wp, src,
+						   &page_alloc_failed,
+						   ec_buf);
+			bio_copy_data(dst, src);
+			bounce = true;
+		}
+		init_append_extent(op, wp, op->version, op->crc);
+		goto do_write;
+	}
+
+	if (ec_buf ||
+	    op->compression_opt ||
+	    (op->csum_type &&
+	     !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
+	    (bch2_csum_type_is_encryption(op->csum_type) &&
+	     !(op->flags & BCH_WRITE_PAGES_OWNED))) {
+		dst = bch2_write_bio_alloc(c, wp, src,
+					   &page_alloc_failed,
+					   ec_buf);
+		bounce = true;
+	}
+
+	saved_iter = dst->bi_iter;
+
+	do {
+		struct bch_extent_crc_unpacked crc = { 0 };
+		struct bversion version = op->version;
+		size_t dst_len = 0, src_len = 0;
+
+		if (page_alloc_failed &&
+		    dst->bi_iter.bi_size  < (wp->sectors_free << 9) &&
+		    dst->bi_iter.bi_size < c->opts.encoded_extent_max)
+			break;
+
+		BUG_ON(op->compression_opt &&
+		       (op->flags & BCH_WRITE_DATA_ENCODED) &&
+		       bch2_csum_type_is_encryption(op->crc.csum_type));
+		BUG_ON(op->compression_opt && !bounce);
+
+		crc.compression_type = op->incompressible
+			? BCH_COMPRESSION_TYPE_incompressible
+			: op->compression_opt
+			? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
+					    op->compression_opt)
+			: 0;
+		if (!crc_is_compressed(crc)) {
+			dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+			dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
+
+			if (op->csum_type)
+				dst_len = min_t(unsigned, dst_len,
+						c->opts.encoded_extent_max);
+
+			if (bounce) {
+				swap(dst->bi_iter.bi_size, dst_len);
+				bio_copy_data(dst, src);
+				swap(dst->bi_iter.bi_size, dst_len);
+			}
+
+			src_len = dst_len;
+		}
+
+		BUG_ON(!src_len || !dst_len);
+
+		if (bch2_csum_type_is_encryption(op->csum_type)) {
+			if (bversion_zero(version)) {
+				version.lo = atomic64_inc_return(&c->key_version);
+			} else {
+				crc.nonce = op->nonce;
+				op->nonce += src_len >> 9;
+			}
+		}
+
+		if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+		    !crc_is_compressed(crc) &&
+		    bch2_csum_type_is_encryption(op->crc.csum_type) ==
+		    bch2_csum_type_is_encryption(op->csum_type)) {
+			u8 compression_type = crc.compression_type;
+			u16 nonce = crc.nonce;
+			/*
+			 * Note: when we're using rechecksum(), we need to be
+			 * checksumming @src because it has all the data our
+			 * existing checksum covers - if we bounced (because we
+			 * were trying to compress), @dst will only have the
+			 * part of the data the new checksum will cover.
+			 *
+			 * But normally we want to be checksumming post bounce,
+			 * because part of the reason for bouncing is so the
+			 * data can't be modified (by userspace) while it's in
+			 * flight.
+			 */
+			if (bch2_rechecksum_bio(c, src, version, op->crc,
+					&crc, &op->crc,
+					src_len >> 9,
+					bio_sectors(src) - (src_len >> 9),
+					op->csum_type))
+				goto csum_err;
+			/*
+			 * bch2_rechecksum_bio() sets compression_type on crc from
+			 * op->crc; this isn't always correct, as sometimes we're
+			 * changing an extent from uncompressed to incompressible.
+			 */
+			crc.compression_type = compression_type;
+			crc.nonce = nonce;
+		} else {
+			if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+			    bch2_rechecksum_bio(c, src, version, op->crc,
+					NULL, &op->crc,
+					src_len >> 9,
+					bio_sectors(src) - (src_len >> 9),
+					op->crc.csum_type))
+				goto csum_err;
+
+			crc.compressed_size	= dst_len >> 9;
+			crc.uncompressed_size	= src_len >> 9;
+			crc.live_size		= src_len >> 9;
+
+			swap(dst->bi_iter.bi_size, dst_len);
+			ret = bch2_encrypt_bio(c, op->csum_type,
+					       extent_nonce(version, crc), dst);
+			if (ret)
+				goto err;
+
+			crc.csum = bch2_checksum_bio(c, op->csum_type,
+					 extent_nonce(version, crc), dst);
+			crc.csum_type = op->csum_type;
+			swap(dst->bi_iter.bi_size, dst_len);
+		}
+
+		init_append_extent(op, wp, version, crc);
+
+		if (dst != src)
+			bio_advance(dst, dst_len);
+		bio_advance(src, src_len);
+		total_output	+= dst_len;
+		total_input	+= src_len;
+	} while (dst->bi_iter.bi_size &&
+		 src->bi_iter.bi_size &&
+		 wp->sectors_free &&
+		 !bch2_keylist_realloc(&op->insert_keys,
+				      op->inline_keys,
+				      ARRAY_SIZE(op->inline_keys),
+				      BKEY_EXTENT_U64s_MAX));
+
+	more = src->bi_iter.bi_size != 0;
+
+	dst->bi_iter = saved_iter;
+
+	if (dst == src && more) {
+		BUG_ON(total_output != total_input);
+
+		dst = bio_split(src, total_input >> 9,
+				GFP_NOFS, &c->bio_write);
+		wbio_init(dst)->put_bio	= true;
+		/* copy WRITE_SYNC flag */
+		dst->bi_opf		= src->bi_opf;
+	}
+
+	dst->bi_iter.bi_size = total_output;
+do_write:
+	*_dst = dst;
+	return more;
+csum_err:
+	bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)");
+	ret = -EIO;
+err:
+	if (to_wbio(dst)->bounce)
+		bch2_bio_free_pages_pool(c, dst);
+	if (to_wbio(dst)->put_bio)
+		bio_put(dst);
+
+	return ret;
+}
+
+static bool bch2_extent_is_writeable(struct bch_write_op *op,
+				     struct bkey_s_c k)
+{
+	struct bch_fs *c = op->c;
+	struct bkey_s_c_extent e;
+	struct extent_ptr_decoded p;
+	const union bch_extent_entry *entry;
+	unsigned replicas = 0;
+
+	if (k.k->type != KEY_TYPE_extent)
+		return false;
+
+	e = bkey_s_c_to_extent(k);
+	extent_for_each_ptr_decode(e, p, entry) {
+		if (p.crc.csum_type ||
+		    crc_is_compressed(p.crc) ||
+		    p.has_ec)
+			return false;
+
+		replicas += bch2_extent_ptr_durability(c, &p);
+	}
+
+	return replicas >= op->opts.data_replicas;
+}
+
+static inline void bch2_nocow_write_unlock(struct bch_write_op *op)
+{
+	struct bch_fs *c = op->c;
+	const struct bch_extent_ptr *ptr;
+	struct bkey_i *k;
+
+	for_each_keylist_key(&op->insert_keys, k) {
+		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
+
+		bkey_for_each_ptr(ptrs, ptr)
+			bch2_bucket_nocow_unlock(&c->nocow_locks,
+					       PTR_BUCKET_POS(c, ptr),
+					       BUCKET_NOCOW_LOCK_UPDATE);
+	}
+}
+
+static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
+						  struct btree_iter *iter,
+						  struct bkey_i *orig,
+						  struct bkey_s_c k,
+						  u64 new_i_size)
+{
+	struct bkey_i *new;
+	struct bkey_ptrs ptrs;
+	struct bch_extent_ptr *ptr;
+	int ret;
+
+	if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) {
+		/* k doesn't match orig: leave it untouched (TODO: trace this) */
+		return 0;
+	}
+
+	new = bch2_bkey_make_mut_noupdate(trans, k);
+	ret = PTR_ERR_OR_ZERO(new);
+	if (ret)
+		return ret;
+
+	bch2_cut_front(bkey_start_pos(&orig->k), new);	/* clip the copy to orig's range */
+	bch2_cut_back(orig->k.p, new);
+
+	ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+	bkey_for_each_ptr(ptrs, ptr)
+		ptr->unwritten = 0;
+
+	/*
+	 * Note that we're not calling bch2_subvolume_get_snapshot() in this
+	 * path - that was done when we kicked off the write, and here it's
+	 * important that we update the extent that we wrote to - even if a
+	 * snapshot has since been created. The write is still outstanding, so
+	 * we're ok w.r.t. snapshot atomicity:
+	 */
+	return  bch2_extent_update_i_size_sectors(trans, iter,
+					min(new->k.p.offset << 9, new_i_size), 0) ?:
+		bch2_trans_update(trans, iter, new,
+				  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+}
+
+static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
+{
+	struct bch_fs *c = op->c;
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_i *orig;
+	struct bkey_s_c k;
+	int ret;
+
+	for_each_keylist_key(&op->insert_keys, orig) {	/* one commit loop per key in the op's keylist */
+		ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
+				     bkey_start_pos(&orig->k), orig->k.p,
+				     BTREE_ITER_INTENT, k,
+				     NULL, NULL, BTREE_INSERT_NOFAIL, ({
+			bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
+		}));
+
+		if (ret && !bch2_err_matches(ret, EROFS)) {	/* errors matching EROFS are not logged */
+			struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);	/* NOTE(review): front of list, not @orig */
+
+			bch_err_inum_offset_ratelimited(c,
+				insert->k.p.inode, insert->k.p.offset << 9,
+				"write error while doing btree update: %s",
+				bch2_err_str(ret));
+		}
+
+		if (ret) {
+			op->error = ret;	/* first failure stops processing of the remaining keys */
+			break;
+		}
+	}
+
+	bch2_trans_put(trans);
+}
+
+static void __bch2_nocow_write_done(struct bch_write_op *op)
+{
+	bch2_nocow_write_unlock(op);	/* always unlock first, whatever the outcome */
+
+	if (unlikely(op->flags & BCH_WRITE_IO_ERROR))
+		op->error = -EIO;	/* on IO error no unwritten->written conversion is attempted */
+	else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
+		bch2_nocow_write_convert_unwritten(op);
+}
+
+static void bch2_nocow_write_done(struct closure *cl)
+{
+	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+
+	__bch2_nocow_write_done(op);	/* unlock, then record -EIO or convert unwritten extents */
+	bch2_write_done(cl);		/* common completion; defined outside this chunk */
+}
+
+static void bch2_nocow_write(struct bch_write_op *op)
+{
+	struct bch_fs *c = op->c;
+	struct btree_trans *trans;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bkey_ptrs_c ptrs;
+	const struct bch_extent_ptr *ptr;
+	struct {
+		struct bpos	b;
+		unsigned	gen;
+		struct nocow_lock_bucket *l;
+	} buckets[BCH_REPLICAS_MAX];
+	unsigned nr_buckets = 0;
+	u32 snapshot;
+	int ret, i;
+
+	if (op->flags & BCH_WRITE_MOVE)
+		return;	/* BCH_WRITE_MOVE ops never use the nocow path */
+
+	trans = bch2_trans_get(c);
+retry:
+	bch2_trans_begin(trans);
+
+	ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot);
+	if (unlikely(ret))
+		goto err;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+			     SPOS(op->pos.inode, op->pos.offset, snapshot),
+			     BTREE_ITER_SLOTS);
+	while (1) {
+		struct bio *bio = &op->wbio.bio;
+
+		nr_buckets = 0;	/* buckets[] is rebuilt for every extent */
+
+		k = bch2_btree_iter_peek_slot(&iter);
+		ret = bkey_err(k);
+		if (ret)
+			break;
+
+		/* fall back to normal cow write path? */
+		if (unlikely(k.k->p.snapshot != snapshot ||
+			     !bch2_extent_is_writeable(op, k)))
+			break;
+
+		if (bch2_keylist_realloc(&op->insert_keys,
+					op->inline_keys,
+					ARRAY_SIZE(op->inline_keys),
+					k.k->u64s))
+			break;	/* ret is 0 and BCH_WRITE_DONE is unset here */
+
+		/* Get iorefs before dropping btree locks: */
+		ptrs = bch2_bkey_ptrs_c(k);
+		bkey_for_each_ptr(ptrs, ptr) {
+			buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr);
+			buckets[nr_buckets].gen = ptr->gen;
+			buckets[nr_buckets].l =
+				bucket_nocow_lock(&c->nocow_locks,
+						  bucket_to_u64(buckets[nr_buckets].b));
+
+			prefetch(buckets[nr_buckets].l);
+
+			if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE)))
+				goto err_get_ioref;
+
+			nr_buckets++;	/* counts only entries whose ioref was obtained */
+
+			if (ptr->unwritten)
+				op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
+		}
+
+		/* Unlock before taking nocow locks, doing IO: */
+		bkey_reassemble(op->insert_keys.top, k);
+		bch2_trans_unlock(trans);
+
+		bch2_cut_front(op->pos, op->insert_keys.top);
+		if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
+			bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
+
+		for (i = 0; i < nr_buckets; i++) {
+			struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode);
+			struct nocow_lock_bucket *l = buckets[i].l;
+			bool stale;
+
+			__bch2_bucket_nocow_lock(&c->nocow_locks, l,
+						 bucket_to_u64(buckets[i].b),
+						 BUCKET_NOCOW_LOCK_UPDATE);
+
+			rcu_read_lock();
+			stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen);
+			rcu_read_unlock();
+
+			if (unlikely(stale))
+				goto err_bucket_stale;	/* buckets[0..i] are locked at this point */
+		}
+
+		bio = &op->wbio.bio;
+		if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) {	/* extent ends before the bio does */
+			bio = bio_split(bio, k.k->p.offset - op->pos.offset,
+					GFP_KERNEL, &c->bio_write);
+			wbio_init(bio)->put_bio = true;
+			bio->bi_opf = op->wbio.bio.bi_opf;
+		} else {
+			op->flags |= BCH_WRITE_DONE;	/* this extent covers the rest of the bio */
+		}
+
+		op->pos.offset += bio_sectors(bio);
+		op->written += bio_sectors(bio);
+
+		bio->bi_end_io	= bch2_write_endio;
+		bio->bi_private	= &op->cl;
+		bio->bi_opf |= REQ_OP_WRITE;
+		closure_get(&op->cl);
+		bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
+					  op->insert_keys.top, true);
+
+		bch2_keylist_push(&op->insert_keys);
+		if (op->flags & BCH_WRITE_DONE)
+			break;
+		bch2_btree_iter_advance(&iter);
+	}
+out:
+	bch2_trans_iter_exit(trans, &iter);
+err:
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		goto retry;
+
+	if (ret) {
+		bch_err_inum_offset_ratelimited(c,
+				op->pos.inode,
+				op->pos.offset << 9,
+				"%s: btree lookup error %s",
+				__func__, bch2_err_str(ret));
+		op->error = ret;
+		op->flags |= BCH_WRITE_DONE;	/* a hard error ends the op; no COW fallback */
+	}
+
+	bch2_trans_put(trans);
+
+	/* fallback to cow write path? */
+	if (!(op->flags & BCH_WRITE_DONE)) {
+		closure_sync(&op->cl);
+		__bch2_nocow_write_done(op);
+		op->insert_keys.top = op->insert_keys.keys;	/* reset the keylist: top back to keys */
+	} else if (op->flags & BCH_WRITE_SYNC) {
+		closure_sync(&op->cl);
+		bch2_nocow_write_done(&op->cl);
+	} else {
+		/*
+		 * XXX
+		 * needs to run out of process context because ei_quota_lock is
+		 * a mutex
+		 */
+		continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op));
+	}
+	return;
+err_get_ioref:
+	for (i = 0; i < nr_buckets; i++)	/* drop only the iorefs already taken */
+		percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
+
+	/* Fall back to COW path: */
+	goto out;
+err_bucket_stale:
+	while (i >= 0) {	/* unlock buckets[i] down to buckets[0] */
+		bch2_bucket_nocow_unlock(&c->nocow_locks,
+					 buckets[i].b,
+					 BUCKET_NOCOW_LOCK_UPDATE);
+		--i;
+	}
+	for (i = 0; i < nr_buckets; i++)
+		percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
+
+	/* We can retry this: */
+	ret = -BCH_ERR_transaction_restart;
+	goto out;
+}
+
+static void __bch2_write(struct bch_write_op *op)
+{
+	struct bch_fs *c = op->c;
+	struct write_point *wp = NULL;
+	struct bio *bio = NULL;
+	unsigned nofs_flags;
+	int ret;
+
+	nofs_flags = memalloc_nofs_save();	/* restored at out_nofs_restore */
+
+	if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
+		bch2_nocow_write(op);
+		if (op->flags & BCH_WRITE_DONE)
+			goto out_nofs_restore;	/* nocow path finished (or failed) the whole op */
+	}
+again:
+	memset(&op->failed, 0, sizeof(op->failed));
+
+	do {
+		struct bkey_i *key_to_write;
+		unsigned key_to_write_offset = op->insert_keys.top_p -
+			op->insert_keys.keys_p;	/* offset, not pointer: the keylist may be reallocated below */
+
+		/* +1 for possible cache device: */
+		if (op->open_buckets.nr + op->nr_replicas + 1 >
+		    ARRAY_SIZE(op->open_buckets.v))
+			break;
+
+		if (bch2_keylist_realloc(&op->insert_keys,
+					op->inline_keys,
+					ARRAY_SIZE(op->inline_keys),
+					BKEY_EXTENT_U64s_MAX))
+			break;
+
+		/*
+		 * The copygc thread is now global, which means it's no longer
+		 * freeing up space on specific disks, which means that
+		 * allocations for specific disks may hang arbitrarily long:
+		 */
+		ret = bch2_trans_do(c, NULL, NULL, 0,
+			bch2_alloc_sectors_start_trans(trans,
+				op->target,
+				op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
+				op->write_point,
+				&op->devs_have,
+				op->nr_replicas,
+				op->nr_replicas_required,
+				op->watermark,
+				op->flags,
+				(op->flags & (BCH_WRITE_ALLOC_NOWAIT|
+					      BCH_WRITE_ONLY_SPECIFIED_DEVS))
+				? NULL : &op->cl, &wp));
+		if (unlikely(ret)) {
+			if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
+				break;	/* leaves BCH_WRITE_DONE unset */
+
+			goto err;
+		}
+
+		EBUG_ON(!wp);
+
+		bch2_open_bucket_get(c, wp, &op->open_buckets);
+		ret = bch2_write_extent(op, wp, &bio);
+
+		bch2_alloc_sectors_done_inlined(c, wp);
+err:
+		if (ret <= 0) {	/* ret > 0 keeps the loop going, see while (ret) */
+			op->flags |= BCH_WRITE_DONE;
+
+			if (ret < 0) {
+				op->error = ret;
+				break;	/* error: do not touch or submit bio */
+			}
+		}
+
+		bio->bi_end_io	= bch2_write_endio;
+		bio->bi_private	= &op->cl;
+		bio->bi_opf |= REQ_OP_WRITE;
+
+		closure_get(bio->bi_private);
+
+		key_to_write = (void *) (op->insert_keys.keys_p +
+					 key_to_write_offset);
+
+		bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
+					  key_to_write, false);
+	} while (ret);
+
+	/*
+	 * Sync or no?
+	 *
+	 * If we're running asynchronously, we may still want to block
+	 * synchronously here if we weren't able to submit all of the IO at
+	 * once, as that signals backpressure to the caller.
+	 */
+	if ((op->flags & BCH_WRITE_SYNC) ||
+	    (!(op->flags & BCH_WRITE_DONE) &&
+	     !(op->flags & BCH_WRITE_IN_WORKER))) {
+		closure_sync(&op->cl);
+		__bch2_write_index(op);
+
+		if (!(op->flags & BCH_WRITE_DONE))
+			goto again;	/* more data left: run another submission round */
+		bch2_write_done(&op->cl);
+	} else {
+		bch2_write_queue(op, wp);
+		continue_at(&op->cl, bch2_write_index, NULL);
+	}
+out_nofs_restore:
+	memalloc_nofs_restore(nofs_flags);
+}
+
+static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
+{
+	struct bio *bio = &op->wbio.bio;
+	struct bvec_iter iter;
+	struct bkey_i_inline_data *id;
+	unsigned sectors;
+	int ret;
+
+	op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
+	op->flags |= BCH_WRITE_DONE;	/* set up front: the whole write is handled here */
+
+	bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
+
+	ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
+				   ARRAY_SIZE(op->inline_keys),
+				   BKEY_U64s + DIV_ROUND_UP(data_len, 8));	/* key header + value in u64s */
+	if (ret) {
+		op->error = ret;
+		goto err;
+	}
+
+	sectors = bio_sectors(bio);
+	op->pos.offset += sectors;	/* advance first: the key position below is the new op->pos */
+
+	id = bkey_inline_data_init(op->insert_keys.top);
+	id->k.p		= op->pos;
+	id->k.version	= op->version;
+	id->k.size	= sectors;
+
+	iter = bio->bi_iter;
+	iter.bi_size = data_len;	/* copy only @data_len bytes out of the bio */
+	memcpy_from_bio(id->v.data, bio, iter);
+
+	while (data_len & 7)	/* zero-pad the value to a multiple of 8 bytes */
+		id->v.data[data_len++] = '\0';
+	set_bkey_val_bytes(&id->k, data_len);
+	bch2_keylist_push(&op->insert_keys);
+
+	__bch2_write_index(op);
+err:
+	bch2_write_done(&op->cl);
+}
+
+/**
+ * bch2_write() - handle a write to a cache device or flash only volume
+ * @cl:		&bch_write_op->cl
+ *
+ * This is the starting point for any data to end up in a cache device; it could
+ * be from a normal write, or a writeback write, or a write to a flash only
+ * volume - it's also used by the moving garbage collector to compact data in
+ * mostly empty buckets.
+ *
+ * It first writes the data to the cache, creating a list of keys to be inserted
+ * (if the data won't fit in a single open bucket, there will be multiple keys);
+ * after the data is written it calls bch_journal, and after the keys have been
+ * added to the next journal write they're inserted into the btree.
+ *
+ * If the write is rejected up front (misaligned size, nochanges, no write ref),
+ * op->error is set, the reservation is put and op->end_io() is called if set.
+ */
+void bch2_write(struct closure *cl)
+{
+	struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+	struct bio *bio = &op->wbio.bio;
+	struct bch_fs *c = op->c;
+	unsigned data_len;
+
+	EBUG_ON(op->cl.parent);
+	BUG_ON(!op->nr_replicas);	/* caller must set these three after bch2_write_op_init() */
+	BUG_ON(!op->write_point.v);
+	BUG_ON(bkey_eq(op->pos, POS_MAX));
+
+	op->start_time = local_clock();
+	bch2_keylist_init(&op->insert_keys, op->inline_keys);
+	wbio_init(bio)->put_bio = false;
+
+	if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) {	/* size not a multiple of the block size */
+		bch_err_inum_offset_ratelimited(c,
+			op->pos.inode,
+			op->pos.offset << 9,
+			"misaligned write");
+		op->error = -EIO;
+		goto err;
+	}
+
+	if (c->opts.nochanges) {
+		op->error = -BCH_ERR_erofs_no_writes;
+		goto err;
+	}
+
+	if (!(op->flags & BCH_WRITE_MOVE) &&
+	    !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {	/* BCH_WRITE_MOVE ops skip taking this ref */
+		op->error = -BCH_ERR_erofs_no_writes;
+		goto err;
+	}
+
+	this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
+	bch2_increment_clock(c, bio_sectors(bio), WRITE);
+
+	data_len = min_t(u64, bio->bi_iter.bi_size,
+			 op->new_i_size - (op->pos.offset << 9));	/* bytes of the bio below new_i_size */
+
+	if (c->opts.inline_data &&
+	    data_len <= min(block_bytes(c) / 2, 1024U)) {	/* at most half a block, capped at 1024 bytes */
+		bch2_write_data_inline(op, data_len);
+		return;
+	}
+
+	__bch2_write(op);
+	return;
+err:
+	bch2_disk_reservation_put(c, &op->res);
+
+	closure_debug_destroy(&op->cl);
+	if (op->end_io)
+		op->end_io(op);
+}
+
+static const char * const bch2_write_flags[] = {	/* one name string per BCH_WRITE_FLAGS() entry, in order */
+#define x(f)	#f,
+	BCH_WRITE_FLAGS()
+#undef x
+	NULL	/* list terminator */
+};
+
+void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
+{
+	prt_str(out, "pos: ");
+	bch2_bpos_to_text(out, op->pos);
+	prt_newline(out);
+	printbuf_indent_add(out, 2);	/* indent the lines that follow; undone at the end */
+
+	prt_str(out, "started: ");
+	bch2_pr_time_units(out, local_clock() - op->start_time);	/* elapsed since bch2_write() set start_time */
+	prt_newline(out);
+
+	prt_str(out, "flags: ");
+	prt_bitflags(out, bch2_write_flags, op->flags);
+	prt_newline(out);
+
+	prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl));
+	prt_newline(out);
+
+	printbuf_indent_sub(out, 2);
+}
+
+void bch2_fs_io_write_exit(struct bch_fs *c)
+{
+	mempool_exit(&c->bio_bounce_pages);	/* reverse order of bch2_fs_io_write_init() */
+	bioset_exit(&c->bio_write);
+}
+
+int bch2_fs_io_write_init(struct bch_fs *c)
+{
+	if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),	/* front pad = bch_write_bio fields before .bio */
+			BIOSET_NEED_BVECS))
+		return -BCH_ERR_ENOMEM_bio_write_init;
+
+	if (mempool_init_page_pool(&c->bio_bounce_pages,
+				   max_t(unsigned,
+					 c->opts.btree_node_size,
+					 c->opts.encoded_extent_max) /
+				   PAGE_SIZE, 0))	/* pool sized in pages from the larger of the two options */
+		return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
+
+	return 0;	/* nothing is unwound here on failure */
+}
diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h
new file mode 100644
index 000000000000..9323167229ee
--- /dev/null
+++ b/fs/bcachefs/io_write.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_WRITE_H
+#define _BCACHEFS_IO_WRITE_H
+
+#include "checksum.h"
+#include "io_write_types.h"
+
+#define to_wbio(_bio)	/* struct bio * -> the bch_write_bio that embeds it */ \
+	container_of((_bio), struct bch_write_bio, bio)
+
+void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
+void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+void bch2_latency_acct(struct bch_dev *, u64, int);
+#else	/* latency accounting compiled out: empty stub */
+static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
+#endif /* CONFIG_BCACHEFS_NO_LATENCY_ACCT */
+
+void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
+			       enum bch_data_type, const struct bkey_i *, bool);
+
+#define BCH_WRITE_FLAGS()	/* x-macro: one x(NAME) per write flag */ \
+	x(ALLOC_NOWAIT)			\
+	x(CACHED)			\
+	x(DATA_ENCODED)			\
+	x(PAGES_STABLE)			\
+	x(PAGES_OWNED)			\
+	x(ONLY_SPECIFIED_DEVS)		\
+	x(WROTE_DATA_INLINE)		\
+	x(FROM_INTERNAL)		\
+	x(CHECK_ENOSPC)			\
+	x(SYNC)				\
+	x(MOVE)				\
+	x(IN_WORKER)			\
+	x(DONE)				\
+	x(IO_ERROR)			\
+	x(CONVERT_UNWRITTEN)
+
+enum __bch_write_flags {	/* bit numbers: __BCH_WRITE_<NAME>, in list order from 0 */
+#define x(f)	__BCH_WRITE_##f,
+	BCH_WRITE_FLAGS()
+#undef x
+};
+
+enum bch_write_flags {	/* bit masks: BCH_WRITE_<NAME> == BIT(__BCH_WRITE_<NAME>) */
+#define x(f)	BCH_WRITE_##f = BIT(__BCH_WRITE_##f),
+	BCH_WRITE_FLAGS()
+#undef x
+};
+
+static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
+{
+	if (op->watermark == BCH_WATERMARK_copygc)
+		return op->c->copygc_wq;	/* copygc ops get their own workqueue */
+	return op->c->btree_update_wq;		/* every other watermark */
+}
+
+int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
+			       struct bkey_i *, bool *, s64 *, s64 *);
+int bch2_extent_update(struct btree_trans *, subvol_inum,
+		       struct btree_iter *, struct bkey_i *,
+		       struct disk_reservation *, u64, s64 *, bool);
+
+static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
+				      struct bch_io_opts opts)
+{
+	op->c			= c;
+	op->end_io		= NULL;
+	op->flags		= 0;
+	op->written		= 0;
+	op->error		= 0;
+	op->csum_type		= bch2_data_checksum_type(c, opts);
+	op->compression_opt	= opts.compression;
+	op->nr_replicas		= 0;	/* caller must set: bch2_write() BUG_ONs on 0 */
+	op->nr_replicas_required = c->opts.data_replicas_required;
+	op->watermark		= BCH_WATERMARK_normal;
+	op->incompressible	= 0;
+	op->open_buckets.nr	= 0;
+	op->devs_have.nr	= 0;
+	op->target		= 0;
+	op->opts		= opts;
+	op->subvol		= 0;
+	op->pos			= POS_MAX;	/* caller must set: bch2_write() BUG_ONs on POS_MAX */
+	op->version		= ZERO_VERSION;
+	op->write_point		= (struct write_point_specifier) { 0 };	/* caller must set: .v == 0 BUG_ONs */
+	op->res			= (struct disk_reservation) { 0 };
+	op->new_i_size		= U64_MAX;	/* default: bch2_write() never clamps data_len */
+	op->i_sectors_delta	= 0;
+	op->devs_need_flush	= NULL;
+}
+
+void bch2_write(struct closure *);
+
+void bch2_write_point_do_index_updates(struct work_struct *);
+
+static inline struct bch_write_bio *wbio_init(struct bio *bio)
+{
+	struct bch_write_bio *wbio = to_wbio(bio);
+
+	memset(&wbio->wbio, 0, sizeof(wbio->wbio));	/* zero only the "wbio" member group, not the bio */
+	return wbio;	/* returned so callers can set fields directly, e.g. ->put_bio */
+}
+
+void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *);
+
+void bch2_fs_io_write_exit(struct bch_fs *);
+int bch2_fs_io_write_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_IO_WRITE_H */
diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h
new file mode 100644
index 000000000000..c7f97c2c4805
--- /dev/null
+++ b/fs/bcachefs/io_write_types.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_WRITE_TYPES_H
+#define _BCACHEFS_IO_WRITE_TYPES_H
+
+#include "alloc_types.h"
+#include "btree_types.h"
+#include "buckets_types.h"
+#include "extents_types.h"
+#include "keylist_types.h"
+#include "opts.h"
+#include "super_types.h"
+
+#include <linux/llist.h>
+#include <linux/workqueue.h>
+
+struct bch_write_bio {
+	struct_group(wbio,	/* this group is what wbio_init() zeroes */
+	struct bch_fs		*c;
+	struct bch_write_bio	*parent;
+
+	u64			submit_time;
+	u64			inode_offset;
+
+	struct bch_devs_list	failed;
+	u8			dev;
+
+	unsigned		split:1,
+				bounce:1,
+				put_bio:1,
+				have_ioref:1,
+				nocow:1,
+				used_mempool:1,
+				first_btree_write:1;
+	);
+
+	struct bio		bio;	/* kept after the group: to_wbio() container_of()s from it */
+};
+
+struct bch_write_op {
+	struct closure		cl;
+	struct bch_fs		*c;
+	void			(*end_io)(struct bch_write_op *);	/* optional completion callback */
+	u64			start_time;	/* local_clock() taken in bch2_write() */
+
+	unsigned		written; /* sectors */
+	u16			flags;	/* BCH_WRITE_* bit masks */
+	s16			error; /* dio write path expects it to hold -ERESTARTSYS... */
+
+	unsigned		compression_opt:8;
+	unsigned		csum_type:4;
+	unsigned		nr_replicas:4;
+	unsigned		nr_replicas_required:4;
+	unsigned		watermark:3;
+	unsigned		incompressible:1;
+	unsigned		stripe_waited:1;
+
+	struct bch_devs_list	devs_have;
+	u16			target;
+	u16			nonce;
+	struct bch_io_opts	opts;
+
+	u32			subvol;
+	struct bpos		pos;	/* offset is in 512-byte sectors (code shifts it by 9) */
+	struct bversion		version;
+
+	/* For BCH_WRITE_DATA_ENCODED: */
+	struct bch_extent_crc_unpacked crc;
+
+	struct write_point_specifier write_point;
+
+	struct write_point	*wp;
+	struct list_head	wp_list;
+
+	struct disk_reservation	res;
+
+	struct open_buckets	open_buckets;
+
+	u64			new_i_size;	/* bytes; U64_MAX after bch2_write_op_init() */
+	s64			i_sectors_delta;
+
+	struct bch_devs_mask	failed;
+
+	struct keylist		insert_keys;
+	u64			inline_keys[BKEY_EXTENT_U64s_MAX * 2];	/* initial storage for insert_keys */
+
+	/*
+	 * Bitmask of devices that have had nocow writes issued to them since
+	 * last flush:
+	 */
+	struct bch_devs_mask	*devs_need_flush;
+
+	/* Must be last: */
+	struct bch_write_bio	wbio;
+};
+
+#endif /* _BCACHEFS_IO_WRITE_TYPES_H */
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
new file mode 100644
index 000000000000..0e7a9ffa3671
--- /dev/null
+++ b/fs/bcachefs/journal.c
@@ -0,0 +1,1449 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * bcachefs journalling code, for btree insertions
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_methods.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "journal_sb.h"
+#include "journal_seq_blacklist.h"
+#include "trace.h"
+
+static const char * const bch2_journal_errors[] = {	/* name strings generated from JOURNAL_ERRORS() */
+#define x(n)	#n,
+	JOURNAL_ERRORS()
+#undef x
+	NULL	/* list terminator */
+};
+
+static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
+{
+	return seq > j->seq_ondisk;	/* strictly newer than j->seq_ondisk */
+}
+
+static bool __journal_entry_is_open(union journal_res_state state)
+{
+	return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;	/* any offset below the CLOSED value counts as open */
+}
+
+static inline unsigned nr_unwritten_journal_entries(struct journal *j)
+{
+	return atomic64_read(&j->seq) - j->seq_ondisk;	/* 64-bit difference, narrowed to unsigned */
+}
+
+static bool journal_entry_is_open(struct journal *j)
+{
+	return __journal_entry_is_open(j->reservations);	/* tests the journal's current reservation state */
+}
+
+static inline struct journal_buf *
+journal_seq_to_buf(struct journal *j, u64 seq)
+{
+	struct journal_buf *slot;
+
+	EBUG_ON(seq > journal_cur_seq(j));
+
+	if (!journal_seq_unwritten(j, seq))
+		return NULL;	/* @seq is not unwritten: no buffer to hand back */
+	slot = &j->buf[seq & JOURNAL_BUF_MASK];
+	EBUG_ON(le64_to_cpu(slot->data->seq) != seq);
+	return slot;
+}
+
+static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
+{
+	unsigned i;
+
+	for (i = 0; i < ARRAY_SIZE(p->list); i++)	/* every list head in p->list[] starts empty */
+		INIT_LIST_HEAD(&p->list[i]);
+	INIT_LIST_HEAD(&p->flushed);
+	atomic_set(&p->count, count);	/* initial count supplied by the caller */
+	p->devs.nr = 0;
+}
+
+/*
+ * Detect stuck journal conditions and trigger shutdown. Technically the journal
+ * can end up stuck for a variety of reasons, such as a blocked I/O, journal
+ * reservation lockup, etc. Since this is a fatal error with potentially
+ * unpredictable characteristics, we want to be fairly conservative before we
+ * decide to shut things down.
+ *
+ * Consider the journal stuck when it appears full with no ability to commit
+ * btree transactions, to discard journal buckets, nor acquire priority
+ * (reserved watermark) reservation.
+ */
+static inline bool
+journal_error_check_stuck(struct journal *j, int error, unsigned flags)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	bool stuck = false;
+	struct printbuf buf = PRINTBUF;
+
+	if (!(error == JOURNAL_ERR_journal_full ||
+	      error == JOURNAL_ERR_journal_pin_full) ||
+	    nr_unwritten_journal_entries(j) ||
+	    (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim)
+		return stuck;	/* false: need a "full" error, no unwritten entries, reclaim watermark */
+
+	spin_lock(&j->lock);
+
+	if (j->can_discard) {
+		spin_unlock(&j->lock);
+		return stuck;	/* false: j->can_discard is set */
+	}
+
+	stuck = true;
+
+	/*
+	 * The journal shutdown path will set ->err_seq, but do it here first to
+	 * serialize against concurrent failures and avoid duplicate error
+	 * reports.
+	 */
+	if (j->err_seq) {
+		spin_unlock(&j->lock);
+		return stuck;	/* true, but already reported: skip the dump below */
+	}
+	j->err_seq = journal_cur_seq(j);
+	spin_unlock(&j->lock);
+
+	bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)",	/* NOTE(review): "Hava" typo in log text */
+		bch2_journal_errors[error]);
+	bch2_journal_debug_to_text(&buf, j);
+	bch_err(c, "%s", buf.buf);
+
+	printbuf_reset(&buf);	/* reuse the same printbuf for the pin dump */
+	bch2_journal_pins_to_text(&buf, j);
+	bch_err(c, "Journal pins:\n%s", buf.buf);
+	printbuf_exit(&buf);
+
+	bch2_fatal_error(c);
+	dump_stack();
+
+	return stuck;
+}
+
+/*
+ * Final processing when the last reference of a journal buffer has been
+ * dropped. Drop the pin list reference acquired at journal entry open and write
+ * the buffer, if requested.
+ */
+void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
+	lockdep_assert_held(&j->lock);	/* caller must hold j->lock */
+
+	if (__bch2_journal_pin_put(j, seq))
+		bch2_journal_reclaim_fast(j);
+	if (write)
+		closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);	/* write runs via j->io on io_complete_wq */
+}
+
+/*
+ * Close the current journal entry, if open (void: nothing is returned):
+ *
+ * We don't close a journal_buf until the next journal_buf is finished writing,
+ * and can be opened again. NOTE(review): no next-buf init is visible here.
+ */
+static void __journal_entry_close(struct journal *j, unsigned closed_val)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct journal_buf *buf = journal_cur_buf(j);
+	union journal_res_state old, new;
+	u64 v = atomic64_read(&j->reservations.counter);
+	unsigned sectors;
+
+	BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL &&
+	       closed_val != JOURNAL_ENTRY_ERROR_VAL);
+
+	lockdep_assert_held(&j->lock);
+
+	do {
+		old.v = new.v = v;
+		new.cur_entry_offset = closed_val;
+
+		if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ||
+		    old.cur_entry_offset == new.cur_entry_offset)
+			return;	/* error state is never overwritten; same value needs no update */
+	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
+				       old.v, new.v)) != old.v);
+
+	if (!__journal_entry_is_open(old))
+		return;	/* state was updated, but there was no open entry to finish */
+
+	/* Close out old buffer: */
+	buf->data->u64s		= cpu_to_le32(old.cur_entry_offset);
+
+	sectors = vstruct_blocks_plus(buf->data, c->block_bits,
+				      buf->u64s_reserved) << c->block_bits;
+	BUG_ON(sectors > buf->sectors);
+	buf->sectors = sectors;
+
+	/*
+	 * We have to set last_seq here, _before_ opening a new journal entry:
+	 *
+	 * A thread may replace an old pin with a new pin on their current
+	 * journal reservation - the expectation being that the journal will
+	 * contain either what the old pin protected or what the new pin
+	 * protects.
+	 *
+	 * After the old pin is dropped journal_last_seq() won't include the old
+	 * pin, so we can only write the updated last_seq on the entry that
+	 * contains whatever the new pin protects.
+	 *
+	 * Restated, we can _not_ update last_seq for a given entry if there
+	 * could be a newer entry open with reservations/pins that have been
+	 * taken against it.
+	 *
+	 * Hence, we want to update/set last_seq on the current journal entry
+	 * right before we open a new one:
+	 */
+	buf->last_seq		= journal_last_seq(j);
+	buf->data->last_seq	= cpu_to_le64(buf->last_seq);
+	BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq));
+
+	cancel_delayed_work(&j->write_work);
+
+	bch2_journal_space_available(j);
+
+	__bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq));
+}
+
+void bch2_journal_halt(struct journal *j)
+{
+	spin_lock(&j->lock);
+	__journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL);	/* put the reservation state into the error value */
+	if (!j->err_seq)
+		j->err_seq = journal_cur_seq(j);	/* set only if not already set */
+	journal_wake(j);
+	spin_unlock(&j->lock);
+}
+
+static bool journal_entry_want_write(struct journal *j)
+{
+	bool ret = !journal_entry_is_open(j) ||
+		journal_cur_seq(j) == journal_last_unwritten_seq(j);
+
+	/* Don't close it yet if we already have a write in flight: */
+	if (ret)
+		__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+	else if (nr_unwritten_journal_entries(j)) {
+		struct journal_buf *buf = journal_cur_buf(j);
+
+		if (!buf->flush_time) {
+			buf->flush_time	= local_clock() ?: 1;	/* never store 0: 0 is the "not set" value tested above */
+			buf->expires = jiffies;	/* expire now, so journal_write_work() closes it on its next run */
+		}
+	}
+
+	return ret;
+}
+
+static bool journal_entry_close(struct journal *j)
+{
+	bool want_write;
+
+	spin_lock(&j->lock);	/* journal_entry_want_write() is called with j->lock held */
+	want_write = journal_entry_want_write(j);
+	spin_unlock(&j->lock);
+
+	return want_write;
+}
+
+/*
+ * should _only_ be called from journal_res_get() - when we actually want a
+ * journal reservation - journal entry is open means journal is dirty:
+ */
+static int journal_entry_open(struct journal *j)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct journal_buf *buf = j->buf +
+		((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK);	/* buffer slot for the *next* sequence number */
+	union journal_res_state old, new;
+	int u64s;
+	u64 v;
+
+	lockdep_assert_held(&j->lock);
+	BUG_ON(journal_entry_is_open(j));
+	BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+
+	if (j->blocked)
+		return JOURNAL_ERR_blocked;	/* note: positive JOURNAL_ERR_* codes, not -errno */
+
+	if (j->cur_entry_error)
+		return j->cur_entry_error;
+
+	if (bch2_journal_error(j))
+		return JOURNAL_ERR_insufficient_devices; /* -EROFS */
+
+	if (!fifo_free(&j->pin))
+		return JOURNAL_ERR_journal_pin_full;
+
+	if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
+		return JOURNAL_ERR_max_in_flight;	/* every buffer in j->buf[] is still unwritten */
+
+	BUG_ON(!j->cur_entry_sectors);
+
+	buf->expires		=
+		(journal_cur_seq(j) == j->flushed_seq_ondisk
+		 ? jiffies
+		 : j->last_flush_write) +
+		msecs_to_jiffies(c->opts.journal_flush_delay);
+
+	buf->u64s_reserved	= j->entry_u64s_reserved;
+	buf->disk_sectors	= j->cur_entry_sectors;
+	buf->sectors		= min(buf->disk_sectors, buf->buf_size >> 9);	/* capped by the in-memory buffer size */
+
+	u64s = (int) (buf->sectors << 9) / sizeof(u64) -
+		journal_entry_overhead(j);
+	u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);	/* keep below the CLOSED value */
+
+	if (u64s <= (ssize_t) j->early_journal_entries.nr)
+		return JOURNAL_ERR_journal_full;	/* no room beyond the early entries copied below */
+
+	if (fifo_empty(&j->pin) && j->reclaim_thread)
+		wake_up_process(j->reclaim_thread);
+
+	/*
+	 * The fifo_push() needs to happen at the same time as j->seq is
+	 * incremented for journal_last_seq() to be calculated correctly
+	 */
+	atomic64_inc(&j->seq);
+	journal_pin_list_init(fifo_push_ref(&j->pin), 1);
+
+	BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
+
+	bkey_extent_init(&buf->key);
+	buf->noflush	= false;
+	buf->must_flush	= false;
+	buf->separate_flush = false;
+	buf->flush_time	= 0;
+
+	memset(buf->data, 0, sizeof(*buf->data));
+	buf->data->seq	= cpu_to_le64(journal_cur_seq(j));
+	buf->data->u64s	= 0;
+
+	if (j->early_journal_entries.nr) {
+		memcpy(buf->data->_data, j->early_journal_entries.data,
+		       j->early_journal_entries.nr * sizeof(u64));
+		le32_add_cpu(&buf->data->u64s, j->early_journal_entries.nr);
+	}
+
+	/*
+	 * Must be set before marking the journal entry as open:
+	 */
+	j->cur_entry_u64s = u64s;
+
+	v = atomic64_read(&j->reservations.counter);
+	do {
+		old.v = new.v = v;
+
+		BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL);
+
+		new.idx++;
+		BUG_ON(journal_state_count(new, new.idx));
+		BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK));
+
+		journal_state_inc(&new);
+
+		/* Handle any already added entries */
+		new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
+	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
+				       old.v, new.v)) != old.v);
+
+	if (j->res_get_blocked_start)
+		bch2_time_stats_update(j->blocked_time,
+				       j->res_get_blocked_start);
+	j->res_get_blocked_start = 0;	/* 0 means "not currently blocked" */
+
+	mod_delayed_work(c->io_complete_wq,
+			 &j->write_work,
+			 msecs_to_jiffies(c->opts.journal_flush_delay));
+	journal_wake(j);
+
+	if (j->early_journal_entries.nr)
+		darray_exit(&j->early_journal_entries);	/* already copied into buf->data above */
+	return 0;
+}
+
+static bool journal_quiesced(struct journal *j)
+{
+	if (atomic64_read(&j->seq) == j->seq_ondisk)
+		return true;	/* j->seq has caught up with j->seq_ondisk */
+
+	journal_entry_close(j);	/* not there yet: try to close the current entry */
+	return false;
+}
+
+static void journal_quiesce(struct journal *j)
+{
+	wait_event(j->wait, journal_quiesced(j));	/* sleeps on j->wait until journal_quiesced() is true */
+}
+
+static void journal_write_work(struct work_struct *work)
+{
+	struct journal *j = container_of(work, struct journal, write_work.work);
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	long delta;
+
+	spin_lock(&j->lock);
+	if (!__journal_entry_is_open(j->reservations))
+		goto unlock;	/* nothing open: nothing to close */
+
+	delta = journal_cur_buf(j)->expires - jiffies;	/* jiffies left until the buffer expires */
+
+	if (delta > 0)
+		mod_delayed_work(c->io_complete_wq, &j->write_work, delta);	/* not due yet: re-arm for the remainder */
+	else
+		__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+unlock:
+	spin_unlock(&j->lock);
+}
+
+static int __journal_res_get(struct journal *j, struct journal_res *res,
+			     unsigned flags)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct journal_buf *buf;
+	bool can_discard;
+	int ret;
+retry:
+	if (journal_res_get_fast(j, res, flags))
+		return 0;
+
+	if (bch2_journal_error(j))
+		return -BCH_ERR_erofs_journal_err;
+
+	spin_lock(&j->lock);
+
+	/* check once more in case somebody else shut things down... */
+	if (bch2_journal_error(j)) {
+		spin_unlock(&j->lock);
+		return -BCH_ERR_erofs_journal_err;
+	}
+
+	/*
+	 * Recheck after taking the lock, so we don't race with another thread
+	 * that just did journal_entry_open() and call journal_entry_close()
+	 * unnecessarily
+	 */
+	if (journal_res_get_fast(j, res, flags)) {
+		spin_unlock(&j->lock);
+		return 0;
+	}
+
+	if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
+		/*
+		 * Don't want to close current journal entry, just need to
+		 * invoke reclaim:
+		 */
+		ret = JOURNAL_ERR_journal_full;
+		goto unlock;
+	}
+
+	/*
+	 * If we couldn't get a reservation because the current buf filled up,
+	 * and we had room for a bigger entry on disk, signal that we want to
+	 * realloc the journal bufs:
+	 */
+	buf = journal_cur_buf(j);
+	if (journal_entry_is_open(j) &&
+	    buf->buf_size >> 9 < buf->disk_sectors &&
+	    buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
+		j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);	/* ask for double the current size */
+
+	__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+	ret = journal_entry_open(j);	/* 0 or a positive JOURNAL_ERR_* code */
+
+	if (ret == JOURNAL_ERR_max_in_flight)
+		trace_and_count(c, journal_entry_full, c);
+unlock:
+	if ((ret && ret != JOURNAL_ERR_insufficient_devices) &&
+	    !j->res_get_blocked_start) {
+		j->res_get_blocked_start = local_clock() ?: 1;	/* never store 0: 0 means "not blocked" */
+		trace_and_count(c, journal_full, c);
+	}
+
+	can_discard = j->can_discard;	/* sampled under j->lock, used after unlocking */
+	spin_unlock(&j->lock);
+
+	if (!ret)
+		goto retry;	/* a new entry was opened: retry the fast path */
+	if (journal_error_check_stuck(j, ret, flags))
+		ret = -BCH_ERR_journal_res_get_blocked;
+
+	/*
+	 * Journal is full - can't rely on reclaim from work item due to
+	 * freezing:
+	 */
+	if ((ret == JOURNAL_ERR_journal_full ||
+	     ret == JOURNAL_ERR_journal_pin_full) &&
+	    !(flags & JOURNAL_RES_GET_NONBLOCK)) {
+		if (can_discard) {
+			bch2_journal_do_discards(j);
+			goto retry;
+		}
+
+		if (mutex_trylock(&j->reclaim_lock)) {	/* reclaim only if nobody else holds the lock */
+			bch2_journal_reclaim(j);
+			mutex_unlock(&j->reclaim_lock);
+		}
+	}
+
+	return ret == JOURNAL_ERR_insufficient_devices	/* everything else maps to "blocked" */
+		? -BCH_ERR_erofs_journal_err
+		: -BCH_ERR_journal_res_get_blocked;
+}
+
+/*
+ * Essentially the entry function to the journaling code. When bcachefs is doing
+ * a btree insert, it calls this function to get the current journal write.
+ * Journal write is the structure used to set up journal writes. The calling
+ * function will then add its keys to the structure, queuing them for the next
+ * write.
+ *
+ * To ensure forward progress, the current task must not be holding any
+ * btree node write locks.
+ */
+int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
+				  unsigned flags)
+{
+	int ret;
+
+	closure_wait_event(&j->async_wait,
+		   (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
+		   (flags & JOURNAL_RES_GET_NONBLOCK));
+	return ret;
+}
+
+/* journal_preres: */
+
+static bool journal_preres_available(struct journal *j,
+				     struct journal_preres *res,
+				     unsigned new_u64s,
+				     unsigned flags)
+{
+	bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true);
+
+	if (!ret && mutex_trylock(&j->reclaim_lock)) {
+		bch2_journal_reclaim(j);
+		mutex_unlock(&j->reclaim_lock);
+	}
+
+	return ret;
+}
+
+int __bch2_journal_preres_get(struct journal *j,
+			      struct journal_preres *res,
+			      unsigned new_u64s,
+			      unsigned flags)
+{
+	int ret;
+
+	closure_wait_event(&j->preres_wait,
+		   (ret = bch2_journal_error(j)) ||
+		   journal_preres_available(j, res, new_u64s, flags));
+	return ret;
+}
+
+/* journal_entry_res: */
+
+void bch2_journal_entry_res_resize(struct journal *j,
+				   struct journal_entry_res *res,
+				   unsigned new_u64s)
+{
+	union journal_res_state state;
+	int d = new_u64s - res->u64s;
+
+	spin_lock(&j->lock);
+
+	j->entry_u64s_reserved += d;
+	if (d <= 0)
+		goto out;
+
+	j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d);
+	smp_mb();
+	state = READ_ONCE(j->reservations);
+
+	if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL &&
+	    state.cur_entry_offset > j->cur_entry_u64s) {
+		j->cur_entry_u64s += d;
+		/*
+		 * Not enough room in current journal entry, have to flush it:
+		 */
+		__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+	} else {
+		journal_cur_buf(j)->u64s_reserved += d;
+	}
+out:
+	spin_unlock(&j->lock);
+	res->u64s += d;
+}
+
+/* journal flushing: */
+
+/**
+ * bch2_journal_flush_seq_async - wait for a journal entry to be written
+ * @j:		journal object
+ * @seq:	seq to flush
+ * @parent:	closure object to wait with
+ * Returns:	1 if @seq has already been flushed, 0 if @seq is being flushed,
+ *		-EIO if @seq will never be flushed, or a bch2_journal_res_get() error
+ *
+ * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if
+ * necessary
+ */
+int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
+				 struct closure *parent)
+{
+	struct journal_buf *buf;
+	int ret = 0;
+
+	if (seq <= j->flushed_seq_ondisk)
+		return 1;
+
+	spin_lock(&j->lock);
+
+	if (WARN_ONCE(seq > journal_cur_seq(j),
+		      "requested to flush journal seq %llu, but currently at %llu",
+		      seq, journal_cur_seq(j)))
+		goto out;
+
+	/* Recheck under lock: */
+	if (j->err_seq && seq >= j->err_seq) {
+		ret = -EIO;
+		goto out;
+	}
+
+	if (seq <= j->flushed_seq_ondisk) {
+		ret = 1;
+		goto out;
+	}
+
+	/* if seq was written, but not flushed - flush a newer one instead */
+	seq = max(seq, journal_last_unwritten_seq(j));
+
+recheck_need_open:
+	if (seq > journal_cur_seq(j)) {
+		struct journal_res res = { 0 };
+
+		if (journal_entry_is_open(j))
+			__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+
+		spin_unlock(&j->lock);
+
+		ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+		if (ret)
+			return ret;
+
+		seq = res.seq;
+		buf = j->buf + (seq & JOURNAL_BUF_MASK);
+		buf->must_flush = true;
+
+		if (!buf->flush_time) {
+			buf->flush_time	= local_clock() ?: 1;
+			buf->expires = jiffies;
+		}
+
+		if (parent && !closure_wait(&buf->wait, parent))
+			BUG();
+
+		bch2_journal_res_put(j, &res);
+
+		spin_lock(&j->lock);
+		goto want_write;
+	}
+
+	/*
+	 * if write was kicked off without a flush, flush the next sequence
+	 * number instead
+	 */
+	buf = journal_seq_to_buf(j, seq);
+	if (buf->noflush) {
+		seq++;
+		goto recheck_need_open;
+	}
+
+	buf->must_flush = true;
+
+	if (parent && !closure_wait(&buf->wait, parent))
+		BUG();
+want_write:
+	if (seq == journal_cur_seq(j))
+		journal_entry_want_write(j);
+out:
+	spin_unlock(&j->lock);
+	return ret;
+}
+
+int bch2_journal_flush_seq(struct journal *j, u64 seq)
+{
+	u64 start_time = local_clock();
+	int ret, ret2;
+
+	/*
+	 * Don't update time_stats when @seq is already flushed:
+	 */
+	if (seq <= j->flushed_seq_ondisk)
+		return 0;
+
+	ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
+
+	if (!ret)
+		bch2_time_stats_update(j->flush_seq_time, start_time);
+
+	return ret ?: ret2 < 0 ? ret2 : 0;
+}
+
+/*
+ * bch2_journal_flush_async - if there is an open journal entry, or a journal
+ * still being written, write it and wait for the write to complete
+ */
+void bch2_journal_flush_async(struct journal *j, struct closure *parent)
+{
+	bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent);
+}
+
+int bch2_journal_flush(struct journal *j)
+{
+	return bch2_journal_flush_seq(j, atomic64_read(&j->seq));
+}
+
+/*
+ * bch2_journal_noflush_seq - tell the journal not to issue any flushes before
+ * @seq
+ */
+bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	u64 unwritten_seq;
+	bool ret = false;
+
+	if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush)))
+		return false;
+
+	if (seq <= c->journal.flushed_seq_ondisk)
+		return false;
+
+	spin_lock(&j->lock);
+	if (seq <= c->journal.flushed_seq_ondisk)
+		goto out;
+
+	for (unwritten_seq = journal_last_unwritten_seq(j);
+	     unwritten_seq < seq;
+	     unwritten_seq++) {
+		struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
+
+		/* journal write is already in flight, and was a flush write: */
+		if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush)
+			goto out;
+
+		buf->noflush = true;
+	}
+
+	ret = true;
+out:
+	spin_unlock(&j->lock);
+	return ret;
+}
+
+int bch2_journal_meta(struct journal *j)
+{
+	struct journal_buf *buf;
+	struct journal_res res;
+	int ret;
+
+	memset(&res, 0, sizeof(res));
+
+	ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+	if (ret)
+		return ret;
+
+	buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
+	buf->must_flush = true;
+
+	if (!buf->flush_time) {
+		buf->flush_time	= local_clock() ?: 1;
+		buf->expires = jiffies;
+	}
+
+	bch2_journal_res_put(j, &res);
+
+	return bch2_journal_flush_seq(j, res.seq);
+}
+
+/* block/unblock the journal: */
+
+void bch2_journal_unblock(struct journal *j)
+{
+	spin_lock(&j->lock);
+	j->blocked--;
+	spin_unlock(&j->lock);
+
+	journal_wake(j);
+}
+
+void bch2_journal_block(struct journal *j)
+{
+	spin_lock(&j->lock);
+	j->blocked++;
+	spin_unlock(&j->lock);
+
+	journal_quiesce(j);
+}
+
+/* allocate journal on a device: */
+
+static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
+					 bool new_fs, struct closure *cl)
+{
+	struct bch_fs *c = ca->fs;
+	struct journal_device *ja = &ca->journal;
+	u64 *new_bucket_seq = NULL, *new_buckets = NULL;
+	struct open_bucket **ob = NULL;
+	long *bu = NULL;
+	unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr;
+	int ret = 0;
+
+	BUG_ON(nr <= ja->nr);
+
+	bu		= kcalloc(nr_want, sizeof(*bu), GFP_KERNEL);
+	ob		= kcalloc(nr_want, sizeof(*ob), GFP_KERNEL);
+	new_buckets	= kcalloc(nr, sizeof(u64), GFP_KERNEL);
+	new_bucket_seq	= kcalloc(nr, sizeof(u64), GFP_KERNEL);
+	if (!bu || !ob || !new_buckets || !new_bucket_seq) {
+		ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets;
+		goto err_free;
+	}
+
+	for (nr_got = 0; nr_got < nr_want; nr_got++) {
+		if (new_fs) {
+			bu[nr_got] = bch2_bucket_alloc_new_fs(ca);
+			if (bu[nr_got] < 0) {
+				ret = -BCH_ERR_ENOSPC_bucket_alloc;
+				break;
+			}
+		} else {
+			ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, cl);
+			ret = PTR_ERR_OR_ZERO(ob[nr_got]);
+			if (ret)
+				break;
+
+			ret = bch2_trans_run(c,
+				bch2_trans_mark_metadata_bucket(trans, ca,
+						ob[nr_got]->bucket, BCH_DATA_journal,
+						ca->mi.bucket_size));
+			if (ret) {
+				bch2_open_bucket_put(c, ob[nr_got]);
+				bch_err_msg(c, ret, "marking new journal buckets");
+				break;
+			}
+
+			bu[nr_got] = ob[nr_got]->bucket;
+		}
+	}
+
+	if (!nr_got)
+		goto err_free;
+
+	/* Don't return an error if we successfully allocated some buckets: */
+	ret = 0;
+
+	if (c) {
+		bch2_journal_flush_all_pins(&c->journal);
+		bch2_journal_block(&c->journal);
+		mutex_lock(&c->sb_lock);
+	}
+
+	memcpy(new_buckets,	ja->buckets,	ja->nr * sizeof(u64));
+	memcpy(new_bucket_seq,	ja->bucket_seq,	ja->nr * sizeof(u64));
+
+	BUG_ON(ja->discard_idx > ja->nr);
+
+	pos = ja->discard_idx ?: ja->nr;
+
+	memmove(new_buckets + pos + nr_got,
+		new_buckets + pos,
+		sizeof(new_buckets[0]) * (ja->nr - pos));
+	memmove(new_bucket_seq + pos + nr_got,
+		new_bucket_seq + pos,
+		sizeof(new_bucket_seq[0]) * (ja->nr - pos));
+
+	for (i = 0; i < nr_got; i++) {
+		new_buckets[pos + i] = bu[i];
+		new_bucket_seq[pos + i] = 0;
+	}
+
+	nr = ja->nr + nr_got;
+
+	ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr);
+	if (ret)
+		goto err_unblock;
+
+	if (!new_fs)
+		bch2_write_super(c);
+
+	/* Commit: */
+	if (c)
+		spin_lock(&c->journal.lock);
+
+	swap(new_buckets,	ja->buckets);
+	swap(new_bucket_seq,	ja->bucket_seq);
+	ja->nr = nr;
+
+	if (pos <= ja->discard_idx)
+		ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr;
+	if (pos <= ja->dirty_idx_ondisk)
+		ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr;
+	if (pos <= ja->dirty_idx)
+		ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr;
+	if (pos <= ja->cur_idx)
+		ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr;
+
+	if (c)
+		spin_unlock(&c->journal.lock);
+err_unblock:
+	if (c) {
+		bch2_journal_unblock(&c->journal);
+		mutex_unlock(&c->sb_lock);
+	}
+
+	if (ret && !new_fs)
+		for (i = 0; i < nr_got; i++)
+			bch2_trans_run(c,
+				bch2_trans_mark_metadata_bucket(trans, ca,
+						bu[i], BCH_DATA_free, 0));
+err_free:
+	if (!new_fs)
+		for (i = 0; i < nr_got; i++)
+			bch2_open_bucket_put(c, ob[i]);
+
+	kfree(new_bucket_seq);
+	kfree(new_buckets);
+	kfree(ob);
+	kfree(bu);
+	return ret;
+}
+
+/*
+ * Allocate more journal space at runtime - not currently making use of it, but
+ * the code works:
+ */
+int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
+				unsigned nr)
+{
+	struct journal_device *ja = &ca->journal;
+	struct closure cl;
+	int ret = 0;
+
+	closure_init_stack(&cl);
+
+	down_write(&c->state_lock);
+
+	/* don't handle reducing nr of buckets yet: */
+	if (nr < ja->nr)
+		goto unlock;
+
+	while (ja->nr < nr) {
+		struct disk_reservation disk_res = { 0, 0, 0 };
+
+		/*
+		 * note: journal buckets aren't really counted as _sectors_ used yet, so
+		 * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
+		 * when space used goes up without a reservation - but we do need the
+		 * reservation to ensure we'll actually be able to allocate:
+		 *
+		 * XXX: that's not right, disk reservations only ensure a
+		 * filesystem-wide allocation will succeed, this is a device
+		 * specific allocation - we can hang here:
+		 */
+
+		ret = bch2_disk_reservation_get(c, &disk_res,
+						bucket_to_sector(ca, nr - ja->nr), 1, 0);
+		if (ret)
+			break;
+
+		ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
+
+		bch2_disk_reservation_put(c, &disk_res);
+
+		closure_sync(&cl);
+
+		if (ret && ret != -BCH_ERR_bucket_alloc_blocked)
+			break;
+	}
+
+	if (ret)
+		bch_err_fn(c, ret);
+unlock:
+	up_write(&c->state_lock);
+	return ret;
+}
+
+int bch2_dev_journal_alloc(struct bch_dev *ca)
+{
+	unsigned nr;
+	int ret;
+
+	if (dynamic_fault("bcachefs:add:journal_alloc")) {
+		ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets;
+		goto err;
+	}
+
+	/* 1/128th of the device by default: */
+	nr = ca->mi.nbuckets >> 7;
+
+	/*
+	 * clamp journal size to 8192 buckets or 8GB (in sectors), whichever
+	 * is smaller:
+	 */
+	nr = clamp_t(unsigned, nr,
+		     BCH_JOURNAL_BUCKETS_MIN,
+		     min(1 << 13,
+			 (1 << 24) / ca->mi.bucket_size));
+
+	ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
+err:
+	if (ret)
+		bch_err_fn(ca, ret);
+	return ret;
+}
+
+/* startup/shutdown: */
+
+static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
+{
+	bool ret = false;
+	u64 seq;
+
+	spin_lock(&j->lock);
+	for (seq = journal_last_unwritten_seq(j);
+	     seq <= journal_cur_seq(j) && !ret;
+	     seq++) {
+		struct journal_buf *buf = journal_seq_to_buf(j, seq);
+
+		if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx))
+			ret = true;
+	}
+	spin_unlock(&j->lock);
+
+	return ret;
+}
+
+void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
+{
+	wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx));
+}
+
+void bch2_fs_journal_stop(struct journal *j)
+{
+	bch2_journal_reclaim_stop(j);
+	bch2_journal_flush_all_pins(j);
+
+	wait_event(j->wait, journal_entry_close(j));
+
+	/*
+	 * Always write a new journal entry, to make sure the clock hands are up
+	 * to date (and match the superblock)
+	 */
+	bch2_journal_meta(j);
+
+	journal_quiesce(j);
+
+	BUG_ON(!bch2_journal_error(j) &&
+	       test_bit(JOURNAL_REPLAY_DONE, &j->flags) &&
+	       j->last_empty_seq != journal_cur_seq(j));
+
+	cancel_delayed_work_sync(&j->write_work);
+}
+
+int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct journal_entry_pin_list *p;
+	struct journal_replay *i, **_i;
+	struct genradix_iter iter;
+	bool had_entries = false;
+	unsigned ptr;
+	u64 last_seq = cur_seq, nr, seq;
+
+	genradix_for_each_reverse(&c->journal_entries, iter, _i) {
+		i = *_i;
+
+		if (!i || i->ignore)
+			continue;
+
+		last_seq = le64_to_cpu(i->j.last_seq);
+		break;
+	}
+
+	nr = cur_seq - last_seq;
+
+	if (nr + 1 > j->pin.size) {
+		free_fifo(&j->pin);
+		init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL);
+		if (!j->pin.data) {
+			bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
+			return -BCH_ERR_ENOMEM_journal_pin_fifo;
+		}
+	}
+
+	j->replay_journal_seq	= last_seq;
+	j->replay_journal_seq_end = cur_seq;
+	j->last_seq_ondisk	= last_seq;
+	j->flushed_seq_ondisk	= cur_seq - 1;
+	j->seq_ondisk		= cur_seq - 1;
+	j->pin.front		= last_seq;
+	j->pin.back		= cur_seq;
+	atomic64_set(&j->seq, cur_seq - 1);
+
+	fifo_for_each_entry_ptr(p, &j->pin, seq)
+		journal_pin_list_init(p, 1);
+
+	genradix_for_each(&c->journal_entries, iter, _i) {
+		i = *_i;
+
+		if (!i || i->ignore)
+			continue;
+
+		seq = le64_to_cpu(i->j.seq);
+		BUG_ON(seq >= cur_seq);
+
+		if (seq < last_seq)
+			continue;
+
+		if (journal_entry_empty(&i->j))
+			j->last_empty_seq = le64_to_cpu(i->j.seq);
+
+		p = journal_seq_pin(j, seq);
+
+		p->devs.nr = 0;
+		for (ptr = 0; ptr < i->nr_ptrs; ptr++)
+			bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
+
+		had_entries = true;
+	}
+
+	if (!had_entries)
+		j->last_empty_seq = cur_seq;
+
+	spin_lock(&j->lock);
+
+	set_bit(JOURNAL_STARTED, &j->flags);
+	j->last_flush_write = jiffies;
+
+	j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
+	j->reservations.unwritten_idx++;
+
+	c->last_bucket_seq_cleanup = journal_cur_seq(j);
+
+	bch2_journal_space_available(j);
+	spin_unlock(&j->lock);
+
+	return bch2_journal_reclaim_start(j);
+}
+
+/* init/exit: */
+
+void bch2_dev_journal_exit(struct bch_dev *ca)
+{
+	kfree(ca->journal.bio);
+	kfree(ca->journal.buckets);
+	kfree(ca->journal.bucket_seq);
+
+	ca->journal.bio		= NULL;
+	ca->journal.buckets	= NULL;
+	ca->journal.bucket_seq	= NULL;
+}
+
+int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
+{
+	struct journal_device *ja = &ca->journal;
+	struct bch_sb_field_journal *journal_buckets =
+		bch2_sb_field_get(sb, journal);
+	struct bch_sb_field_journal_v2 *journal_buckets_v2 =
+		bch2_sb_field_get(sb, journal_v2);
+	unsigned i, nr_bvecs;
+
+	ja->nr = 0;
+
+	if (journal_buckets_v2) {
+		unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
+
+		for (i = 0; i < nr; i++)
+			ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
+	} else if (journal_buckets) {
+		ja->nr = bch2_nr_journal_buckets(journal_buckets);
+	}
+
+	ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
+	if (!ja->bucket_seq)
+		return -BCH_ERR_ENOMEM_dev_journal_init;
+
+	nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
+
+	ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
+	if (!ca->journal.bio)
+		return -BCH_ERR_ENOMEM_dev_journal_init;
+
+	bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0);
+
+	ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
+	if (!ja->buckets)
+		return -BCH_ERR_ENOMEM_dev_journal_init;
+
+	if (journal_buckets_v2) {
+		unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
+		unsigned j, dst = 0;
+
+		for (i = 0; i < nr; i++)
+			for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
+				ja->buckets[dst++] =
+					le64_to_cpu(journal_buckets_v2->d[i].start) + j;
+	} else if (journal_buckets) {
+		for (i = 0; i < ja->nr; i++)
+			ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
+	}
+
+	return 0;
+}
+
+void bch2_fs_journal_exit(struct journal *j)
+{
+	unsigned i;
+
+	darray_exit(&j->early_journal_entries);
+
+	for (i = 0; i < ARRAY_SIZE(j->buf); i++)
+		kvpfree(j->buf[i].data, j->buf[i].buf_size);
+	free_fifo(&j->pin);
+}
+
+int bch2_fs_journal_init(struct journal *j)
+{
+	static struct lock_class_key res_key;
+	unsigned i;
+
+	spin_lock_init(&j->lock);
+	spin_lock_init(&j->err_lock);
+	init_waitqueue_head(&j->wait);
+	INIT_DELAYED_WORK(&j->write_work, journal_write_work);
+	init_waitqueue_head(&j->reclaim_wait);
+	init_waitqueue_head(&j->pin_flush_wait);
+	mutex_init(&j->reclaim_lock);
+	mutex_init(&j->discard_lock);
+
+	lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
+
+	atomic64_set(&j->reservations.counter,
+		((union journal_res_state)
+		 { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
+
+	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)))
+		return -BCH_ERR_ENOMEM_journal_pin_fifo;
+
+	for (i = 0; i < ARRAY_SIZE(j->buf); i++) {
+		j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
+		j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL);
+		if (!j->buf[i].data)
+			return -BCH_ERR_ENOMEM_journal_buf;
+	}
+
+	j->pin.front = j->pin.back = 1;
+	return 0;
+}
+
+/* debug: */
+
+void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	union journal_res_state s;
+	struct bch_dev *ca;
+	unsigned long now = jiffies;
+	u64 seq;
+	unsigned i;
+
+	if (!out->nr_tabstops)
+		printbuf_tabstop_push(out, 24);
+	out->atomic++;
+
+	rcu_read_lock();
+	s = READ_ONCE(j->reservations);
+
+	prt_printf(out, "dirty journal entries:\t%llu/%llu\n",	fifo_used(&j->pin), j->pin.size);
+	prt_printf(out, "seq:\t\t\t%llu\n",			journal_cur_seq(j));
+	prt_printf(out, "seq_ondisk:\t\t%llu\n",		j->seq_ondisk);
+	prt_printf(out, "last_seq:\t\t%llu\n",		journal_last_seq(j));
+	prt_printf(out, "last_seq_ondisk:\t%llu\n",		j->last_seq_ondisk);
+	prt_printf(out, "flushed_seq_ondisk:\t%llu\n",	j->flushed_seq_ondisk);
+	prt_printf(out, "prereserved:\t\t%u/%u\n",		j->prereserved.reserved, j->prereserved.remaining);
+	prt_printf(out, "watermark:\t\t%s\n",		bch2_watermarks[j->watermark]);
+	prt_printf(out, "each entry reserved:\t%u\n",	j->entry_u64s_reserved);
+	prt_printf(out, "nr flush writes:\t%llu\n",		j->nr_flush_writes);
+	prt_printf(out, "nr noflush writes:\t%llu\n",	j->nr_noflush_writes);
+	prt_printf(out, "nr direct reclaim:\t%llu\n",	j->nr_direct_reclaim);
+	prt_printf(out, "nr background reclaim:\t%llu\n",	j->nr_background_reclaim);
+	prt_printf(out, "reclaim kicked:\t\t%u\n",		j->reclaim_kicked);
+	prt_printf(out, "reclaim runs in:\t%u ms\n",	time_after(j->next_reclaim, now)
+	       ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
+	prt_printf(out, "current entry sectors:\t%u\n",	j->cur_entry_sectors);
+	prt_printf(out, "current entry error:\t%s\n",	bch2_journal_errors[j->cur_entry_error]);
+	prt_printf(out, "current entry:\t\t");
+
+	switch (s.cur_entry_offset) {
+	case JOURNAL_ENTRY_ERROR_VAL:
+		prt_printf(out, "error");
+		break;
+	case JOURNAL_ENTRY_CLOSED_VAL:
+		prt_printf(out, "closed");
+		break;
+	default:
+		prt_printf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s);
+		break;
+	}
+
+	prt_newline(out);
+
+	for (seq = journal_cur_seq(j);
+	     seq >= journal_last_unwritten_seq(j);
+	     --seq) {
+		i = seq & JOURNAL_BUF_MASK;
+
+		prt_printf(out, "unwritten entry:");
+		prt_tab(out);
+		prt_printf(out, "%llu", seq);
+		prt_newline(out);
+		printbuf_indent_add(out, 2);
+
+		prt_printf(out, "refcount:");
+		prt_tab(out);
+		prt_printf(out, "%u", journal_state_count(s, i));
+		prt_newline(out);
+
+		prt_printf(out, "sectors:");
+		prt_tab(out);
+		prt_printf(out, "%u", j->buf[i].sectors);
+		prt_newline(out);
+
+		prt_printf(out, "expires");
+		prt_tab(out);
+		prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies);
+		prt_newline(out);
+
+		printbuf_indent_sub(out, 2);
+	}
+
+	prt_printf(out,
+	       "replay done:\t\t%i\n",
+	       test_bit(JOURNAL_REPLAY_DONE,	&j->flags));
+
+	prt_printf(out, "space:\n");
+	prt_printf(out, "\tdiscarded\t%u:%u\n",
+	       j->space[journal_space_discarded].next_entry,
+	       j->space[journal_space_discarded].total);
+	prt_printf(out, "\tclean ondisk\t%u:%u\n",
+	       j->space[journal_space_clean_ondisk].next_entry,
+	       j->space[journal_space_clean_ondisk].total);
+	prt_printf(out, "\tclean\t\t%u:%u\n",
+	       j->space[journal_space_clean].next_entry,
+	       j->space[journal_space_clean].total);
+	prt_printf(out, "\ttotal\t\t%u:%u\n",
+	       j->space[journal_space_total].next_entry,
+	       j->space[journal_space_total].total);
+
+	for_each_member_device_rcu(ca, c, i,
+				   &c->rw_devs[BCH_DATA_journal]) {
+		struct journal_device *ja = &ca->journal;
+
+		if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d))
+			continue;
+
+		if (!ja->nr)
+			continue;
+
+		prt_printf(out, "dev %u:\n",		i);
+		prt_printf(out, "\tnr\t\t%u\n",		ja->nr);
+		prt_printf(out, "\tbucket size\t%u\n",	ca->mi.bucket_size);
+		prt_printf(out, "\tavailable\t%u:%u\n",	bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
+		prt_printf(out, "\tdiscard_idx\t%u\n",	ja->discard_idx);
+		prt_printf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk,	ja->bucket_seq[ja->dirty_idx_ondisk]);
+		prt_printf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx,		ja->bucket_seq[ja->dirty_idx]);
+		prt_printf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx,		ja->bucket_seq[ja->cur_idx]);
+	}
+
+	rcu_read_unlock();
+
+	--out->atomic;
+}
+
+void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
+{
+	spin_lock(&j->lock);
+	__bch2_journal_debug_to_text(out, j);
+	spin_unlock(&j->lock);
+}
+
+bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
+{
+	struct journal_entry_pin_list *pin_list;
+	struct journal_entry_pin *pin;
+	unsigned i;
+
+	spin_lock(&j->lock);
+	*seq = max(*seq, j->pin.front);
+
+	if (*seq >= j->pin.back) {
+		spin_unlock(&j->lock);
+		return true;
+	}
+
+	out->atomic++;
+
+	pin_list = journal_seq_pin(j, *seq);
+
+	prt_printf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count));
+	prt_newline(out);
+	printbuf_indent_add(out, 2);
+
+	for (i = 0; i < ARRAY_SIZE(pin_list->list); i++)
+		list_for_each_entry(pin, &pin_list->list[i], list) {
+			prt_printf(out, "\t%px %ps", pin, pin->flush);
+			prt_newline(out);
+		}
+
+	if (!list_empty(&pin_list->flushed)) {
+		prt_printf(out, "flushed:");
+		prt_newline(out);
+	}
+
+	list_for_each_entry(pin, &pin_list->flushed, list) {
+		prt_printf(out, "\t%px %ps", pin, pin->flush);
+		prt_newline(out);
+	}
+
+	printbuf_indent_sub(out, 2);
+
+	--out->atomic;
+	spin_unlock(&j->lock);
+
+	return false;
+}
+
+void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
+{
+	u64 seq = 0;
+
+	while (!bch2_journal_seq_pins_to_text(out, j, &seq))
+		seq++;
+}
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
new file mode 100644
index 000000000000..491133cc52f3
--- /dev/null
+++ b/fs/bcachefs/journal.h
@@ -0,0 +1,548 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_H
+#define _BCACHEFS_JOURNAL_H
+
+/*
+ * THE JOURNAL:
+ *
+ * The primary purpose of the journal is to log updates (insertions) to the
+ * b-tree, to avoid having to do synchronous updates to the b-tree on disk.
+ *
+ * Without the journal, the b-tree is always internally consistent on
+ * disk - and in fact, in the earliest incarnations bcache didn't have a journal
+ * but did handle unclean shutdowns by doing all index updates synchronously
+ * (with coalescing).
+ *
+ * Updates to interior nodes still happen synchronously and without the journal
+ * (for simplicity) - this may change eventually but updates to interior nodes
+ * are rare enough it's not a huge priority.
+ *
+ * This means the journal is relatively separate from the b-tree; it consists of
+ * just a list of keys and journal replay consists of just redoing those
+ * insertions in the same order that they appear in the journal.
+ *
+ * PERSISTENCE:
+ *
+ * For synchronous updates (where we're waiting on the index update to hit
+ * disk), the journal entry will be written out immediately (or as soon as
+ * possible, if the write for the previous journal entry was still in flight).
+ *
+ * Synchronous updates are specified by passing a closure (@flush_cl) to
+ * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter
+ * down to the journalling code. That closure will wait on the journal write to
+ * complete (via closure_wait()).
+ *
+ * If the index update wasn't synchronous, the journal entry will be
+ * written out after 10 ms have elapsed, by default (the delay_ms field
+ * in struct journal).
+ *
+ * JOURNAL ENTRIES:
+ *
+ * A journal entry is variable size (struct jset), it's got a fixed length
+ * header and then a variable number of struct jset_entry entries.
+ *
+ * Journal entries are identified by monotonically increasing 64 bit sequence
+ * numbers - jset->seq; other places in the code refer to this sequence number.
+ *
+ * A jset_entry entry contains one or more bkeys (which is what gets inserted
+ * into the b-tree). We need a container to indicate which b-tree the key is
+ * for; also, the roots of the various b-trees are stored in jset_entry entries
+ * (one for each b-tree) - this lets us add new b-tree types without changing
+ * the on disk format.
+ *
+ * We also keep some things in the journal header that are logically part of the
+ * superblock - all the things that are frequently updated. This is for future
+ * bcache on raw flash support; the superblock (which will become another
+ * journal) can't be moved or wear leveled, so it contains just enough
+ * information to find the main journal, and the superblock only has to be
+ * rewritten when we want to move/wear level the main journal.
+ *
+ * JOURNAL LAYOUT ON DISK:
+ *
+ * The journal is written to a ringbuffer of buckets (which is kept in the
+ * superblock); the individual buckets are not necessarily contiguous on disk
+ * which means that journal entries are not allowed to span buckets, but also
+ * that we can resize the journal at runtime if desired (unimplemented).
+ *
+ * The journal buckets exist in the same pool as all the other buckets that are
+ * managed by the allocator and garbage collection - garbage collection marks
+ * the journal buckets as metadata buckets.
+ *
+ * OPEN/DIRTY JOURNAL ENTRIES:
+ *
+ * Open/dirty journal entries are journal entries that contain b-tree updates
+ * that have not yet been written out to the b-tree on disk. We have to track
+ * which journal entries are dirty, and we also have to avoid wrapping around
+ * the journal and overwriting old but still dirty journal entries with new
+ * journal entries.
+ *
+ * On disk, this is represented with the "last_seq" field of struct jset;
+ * last_seq is the first sequence number that journal replay has to replay.
+ *
+ * To avoid overwriting dirty journal entries on disk, we keep a mapping (in
+ * journal_device->bucket_seq) of, for each journal bucket, the highest sequence
+ * number of any journal entry it contains. Then, by comparing that against
+ * last_seq we can determine whether that journal bucket contains dirty journal
+ * entries or not.
+ *
+ * To track which journal entries are dirty, we maintain a fifo of refcounts
+ * (where each entry corresponds to a specific sequence number) - when a ref
+ * goes to 0, that journal entry is no longer dirty.
+ *
+ * Journalling of index updates is done at the same time as the b-tree itself is
+ * being modified (see btree_insert_key()); when we add the key to the journal
+ * the pending b-tree write takes a ref on the journal entry the key was added
+ * to. If a pending b-tree write would need to take refs on multiple dirty
+ * journal entries, it only keeps the ref on the oldest one (since a newer
+ * journal entry will still be replayed if an older entry was dirty).
+ *
+ * JOURNAL FILLING UP:
+ *
+ * There are two ways the journal could fill up; either we could run out of
+ * space to write to, or we could have too many open journal entries and run out
+ * of room in the fifo of refcounts. Since those refcounts are decremented
+ * without any locking we can't safely resize that fifo, so we handle it the
+ * same way.
+ *
+ * If the journal fills up, we start flushing dirty btree nodes until we can
+ * allocate space for a journal write again - preferentially flushing btree
+ * nodes that are pinning the oldest journal entries first.
+ */
+
+#include <linux/hash.h>
+
+#include "journal_types.h"
+
+struct bch_fs;
+
+static inline void journal_wake(struct journal *j)
+{
+	wake_up(&j->wait);
+	closure_wake_up(&j->async_wait);
+	closure_wake_up(&j->preres_wait);
+}
+
+static inline struct journal_buf *journal_cur_buf(struct journal *j)
+{
+	return j->buf + j->reservations.idx;
+}
+
+/* Sequence number of oldest dirty journal entry */
+
+static inline u64 journal_last_seq(struct journal *j)
+{
+	return j->pin.front;
+}
+
+static inline u64 journal_cur_seq(struct journal *j)
+{
+	EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
+
+	return j->pin.back - 1;
+}
+
+static inline u64 journal_last_unwritten_seq(struct journal *j)
+{
+	return j->seq_ondisk + 1;
+}
+
+static inline int journal_state_count(union journal_res_state s, int idx)
+{
+	switch (idx) {
+	case 0: return s.buf0_count;
+	case 1: return s.buf1_count;
+	case 2: return s.buf2_count;
+	case 3: return s.buf3_count;
+	}
+	BUG();
+}
+
+static inline void journal_state_inc(union journal_res_state *s)
+{
+	s->buf0_count += s->idx == 0;
+	s->buf1_count += s->idx == 1;
+	s->buf2_count += s->idx == 2;
+	s->buf3_count += s->idx == 3;
+}
+
+/*
+ * Amount of space that will be taken up by some keys in the journal (i.e.
+ * including the jset_entry header), in units of u64s
+ */
+static inline unsigned jset_u64s(unsigned u64s)
+{
+	return u64s + sizeof(struct jset_entry) / sizeof(u64);
+}
+
+static inline int journal_entry_overhead(struct journal *j)
+{
+	return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved;
+}
+
+static inline struct jset_entry *
+bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
+{
+	struct jset *jset = buf->data;
+	struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s));
+
+	memset(entry, 0, sizeof(*entry));
+	entry->u64s = cpu_to_le16(u64s);
+
+	le32_add_cpu(&jset->u64s, jset_u64s(u64s));
+
+	return entry;
+}
+
+static inline struct jset_entry *
+journal_res_entry(struct journal *j, struct journal_res *res)
+{
+	return vstruct_idx(j->buf[res->idx].data, res->offset);
+}
+
+static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type,
+					  enum btree_id id, unsigned level,
+					  unsigned u64s)
+{
+	entry->u64s	= cpu_to_le16(u64s);
+	entry->btree_id = id;
+	entry->level	= level;
+	entry->type	= type;
+	entry->pad[0]	= 0;
+	entry->pad[1]	= 0;
+	entry->pad[2]	= 0;
+	return jset_u64s(u64s);
+}
+
+static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type,
+					  enum btree_id id, unsigned level,
+					  const void *data, unsigned u64s)
+{
+	unsigned ret = journal_entry_init(entry, type, id, level, u64s);
+
+	memcpy_u64s_small(entry->_data, data, u64s);
+	return ret;
+}
+
+static inline struct jset_entry *
+bch2_journal_add_entry(struct journal *j, struct journal_res *res,
+			 unsigned type, enum btree_id id,
+			 unsigned level, unsigned u64s)
+{
+	struct jset_entry *entry = journal_res_entry(j, res);
+	unsigned actual = journal_entry_init(entry, type, id, level, u64s);
+
+	EBUG_ON(!res->ref);
+	EBUG_ON(actual > res->u64s);
+
+	res->offset	+= actual;
+	res->u64s	-= actual;
+	return entry;
+}
+
+static inline bool journal_entry_empty(struct jset *j)
+{
+	struct jset_entry *i;
+
+	if (j->seq != j->last_seq)
+		return false;
+
+	vstruct_for_each(j, i)
+		if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s)
+			return false;
+	return true;
+}
+
+/*
+ * Drop a reference on buffer @idx and return the updated reservation state.
+ */
+static inline union journal_res_state journal_state_buf_put(struct journal *j, unsigned idx)
+{
+	union journal_res_state s;
+
+	s.v = atomic64_sub_return(((union journal_res_state) {
+				    .buf0_count = idx == 0,
+				    .buf1_count = idx == 1,
+				    .buf2_count = idx == 2,
+				    .buf3_count = idx == 3,
+				    }).v, &j->reservations.counter);
+	return s;
+}
+
+void bch2_journal_buf_put_final(struct journal *, u64, bool);
+
+static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+{
+	union journal_res_state s;
+
+	s = journal_state_buf_put(j, idx);
+	if (!journal_state_count(s, idx))
+		bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+}
+
+static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+{
+	union journal_res_state s;
+
+	s = journal_state_buf_put(j, idx);
+	if (!journal_state_count(s, idx)) {
+		spin_lock(&j->lock);
+		bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+		spin_unlock(&j->lock);
+	}
+}
+
+/*
+ * This function releases the journal write structure so other threads can
+ * then proceed to add their keys as well.
+ */
+static inline void bch2_journal_res_put(struct journal *j,
+				       struct journal_res *res)
+{
+	if (!res->ref)
+		return;
+
+	lock_release(&j->res_map, _THIS_IP_);
+
+	while (res->u64s)
+		bch2_journal_add_entry(j, res,
+				       BCH_JSET_ENTRY_btree_keys,
+				       0, 0, 0);
+
+	bch2_journal_buf_put(j, res->idx, res->seq);
+
+	res->ref = 0;
+}
+
+int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
+				  unsigned);
+
+/* First bits for BCH_WATERMARK: */
+enum journal_res_flags {
+	__JOURNAL_RES_GET_NONBLOCK	= BCH_WATERMARK_BITS,
+	__JOURNAL_RES_GET_CHECK,
+};
+
+#define JOURNAL_RES_GET_NONBLOCK	(1 << __JOURNAL_RES_GET_NONBLOCK)
+#define JOURNAL_RES_GET_CHECK		(1 << __JOURNAL_RES_GET_CHECK)
+
+static inline int journal_res_get_fast(struct journal *j,
+				       struct journal_res *res,
+				       unsigned flags)
+{
+	union journal_res_state old, new;
+	u64 v = atomic64_read(&j->reservations.counter);
+
+	do {
+		old.v = new.v = v;
+
+		/*
+		 * Check if there is still room in the current journal
+		 * entry:
+		 */
+		if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
+			return 0;
+
+		EBUG_ON(!journal_state_count(new, new.idx));
+
+		if ((flags & BCH_WATERMARK_MASK) < j->watermark)
+			return 0;
+
+		new.cur_entry_offset += res->u64s;
+		journal_state_inc(&new);
+
+		/*
+		 * If the refcount would overflow, we have to wait:
+		 * XXX - tracepoint this:
+		 */
+		if (!journal_state_count(new, new.idx))
+			return 0;
+
+		if (flags & JOURNAL_RES_GET_CHECK)
+			return 1;
+	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
+				       old.v, new.v)) != old.v);
+
+	res->ref	= true;
+	res->idx	= old.idx;
+	res->offset	= old.cur_entry_offset;
+	res->seq	= le64_to_cpu(j->buf[old.idx].data->seq);
+	return 1;
+}
+
+static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
+				       unsigned u64s, unsigned flags)
+{
+	int ret;
+
+	EBUG_ON(res->ref);
+	EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
+
+	res->u64s = u64s;
+
+	if (journal_res_get_fast(j, res, flags))
+		goto out;
+
+	ret = bch2_journal_res_get_slowpath(j, res, flags);
+	if (ret)
+		return ret;
+out:
+	if (!(flags & JOURNAL_RES_GET_CHECK)) {
+		lock_acquire_shared(&j->res_map, 0,
+				    (flags & JOURNAL_RES_GET_NONBLOCK) != 0,
+				    NULL, _THIS_IP_);
+		EBUG_ON(!res->ref);
+	}
+	return 0;
+}
+
+/* journal_preres: */
+
+static inline void journal_set_watermark(struct journal *j)
+{
+	union journal_preres_state s = READ_ONCE(j->prereserved);
+	unsigned watermark = BCH_WATERMARK_stripe;
+
+	if (fifo_free(&j->pin) < j->pin.size / 4)
+		watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
+	if (fifo_free(&j->pin) < j->pin.size / 8)
+		watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
+
+	if (s.reserved > s.remaining)
+		watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
+	if (!s.remaining)
+		watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
+
+	if (watermark == j->watermark)
+		return;
+
+	swap(watermark, j->watermark);
+	if (watermark > j->watermark)
+		journal_wake(j);
+}
+
+static inline void bch2_journal_preres_put(struct journal *j,
+					   struct journal_preres *res)
+{
+	union journal_preres_state s = { .reserved = res->u64s };
+
+	if (!res->u64s)
+		return;
+
+	s.v = atomic64_sub_return(s.v, &j->prereserved.counter);
+	res->u64s = 0;
+
+	if (unlikely(s.waiting)) {
+		clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)),
+			  (unsigned long *) &j->prereserved.v);
+		closure_wake_up(&j->preres_wait);
+	}
+
+	if (s.reserved <= s.remaining && j->watermark)
+		journal_set_watermark(j);
+}
+
+int __bch2_journal_preres_get(struct journal *,
+			struct journal_preres *, unsigned, unsigned);
+
+static inline int bch2_journal_preres_get_fast(struct journal *j,
+					       struct journal_preres *res,
+					       unsigned new_u64s,
+					       unsigned flags,
+					       bool set_waiting)
+{
+	int d = new_u64s - res->u64s;
+	union journal_preres_state old, new;
+	u64 v = atomic64_read(&j->prereserved.counter);
+	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
+	int ret;
+
+	do {
+		old.v = new.v = v;
+		ret = 0;
+
+		if (watermark == BCH_WATERMARK_reclaim ||
+		    new.reserved + d < new.remaining) {
+			new.reserved += d;
+			ret = 1;
+		} else if (set_waiting && !new.waiting)
+			new.waiting = true;
+		else
+			return 0;
+	} while ((v = atomic64_cmpxchg(&j->prereserved.counter,
+				       old.v, new.v)) != old.v);
+
+	if (ret)
+		res->u64s += d;
+	return ret;
+}
+
+static inline int bch2_journal_preres_get(struct journal *j,
+					  struct journal_preres *res,
+					  unsigned new_u64s,
+					  unsigned flags)
+{
+	if (new_u64s <= res->u64s)
+		return 0;
+
+	if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false))
+		return 0;
+
+	if (flags & JOURNAL_RES_GET_NONBLOCK)
+		return -BCH_ERR_journal_preres_get_blocked;
+
+	return __bch2_journal_preres_get(j, res, new_u64s, flags);
+}
+
+/* journal_entry_res: */
+
+void bch2_journal_entry_res_resize(struct journal *,
+				   struct journal_entry_res *,
+				   unsigned);
+
+int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
+void bch2_journal_flush_async(struct journal *, struct closure *);
+
+int bch2_journal_flush_seq(struct journal *, u64);
+int bch2_journal_flush(struct journal *);
+bool bch2_journal_noflush_seq(struct journal *, u64);
+int bch2_journal_meta(struct journal *);
+
+void bch2_journal_halt(struct journal *);
+
+static inline int bch2_journal_error(struct journal *j)
+{
+	return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL
+		? -EIO : 0;
+}
+
+struct bch_dev;
+
+static inline void bch2_journal_set_replay_done(struct journal *j)
+{
+	BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
+	set_bit(JOURNAL_REPLAY_DONE, &j->flags);
+}
+
+void bch2_journal_unblock(struct journal *);
+void bch2_journal_block(struct journal *);
+
+void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
+void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
+void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
+bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
+
+int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
+				unsigned nr);
+int bch2_dev_journal_alloc(struct bch_dev *);
+
+void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
+
+void bch2_fs_journal_stop(struct journal *);
+int bch2_fs_journal_start(struct journal *, u64);
+
+void bch2_dev_journal_exit(struct bch_dev *);
+int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
+void bch2_fs_journal_exit(struct journal *);
+int bch2_fs_journal_init(struct journal *);
+
+#endif /* _BCACHEFS_JOURNAL_H */
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
new file mode 100644
index 000000000000..6a3d6a374e9c
--- /dev/null
+++ b/fs/bcachefs/journal_io.c
@@ -0,0 +1,1894 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "btree_io.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "disk_groups.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+#include "replicas.h"
+#include "sb-clean.h"
+#include "trace.h"
+
+static struct nonce journal_nonce(const struct jset *jset)
+{
+	return (struct nonce) {{
+		[0] = 0,
+		[1] = ((__le32 *) &jset->seq)[0],
+		[2] = ((__le32 *) &jset->seq)[1],
+		[3] = BCH_NONCE_JOURNAL,
+	}};
+}
+
+static bool jset_csum_good(struct bch_fs *c, struct jset *j)
+{
+	return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) &&
+		!bch2_crc_cmp(j->csum,
+			      csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j));
+}
+
+static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
+{
+	return (seq - c->journal_entries_base_seq) & (~0U >> 1);
+}
+
+static void __journal_replay_free(struct bch_fs *c,
+				  struct journal_replay *i)
+{
+	struct journal_replay **p =
+		genradix_ptr(&c->journal_entries,
+			     journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));
+
+	BUG_ON(*p != i);
+	*p = NULL;
+	kvpfree(i, offsetof(struct journal_replay, j) +
+		vstruct_bytes(&i->j));
+}
+
+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
+{
+	i->ignore = true;
+
+	if (!c->opts.read_entire_journal)
+		__journal_replay_free(c, i);
+}
+
+struct journal_list {
+	struct closure		cl;
+	u64			last_seq;
+	struct mutex		lock;
+	int			ret;
+};
+
+#define JOURNAL_ENTRY_ADD_OK		0
+#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE	5
+
+/*
+ * Given a journal entry we just read, add it to the list of journal entries to
+ * be replayed:
+ */
+static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
+			     struct journal_ptr entry_ptr,
+			     struct journal_list *jlist, struct jset *j)
+{
+	struct genradix_iter iter;
+	struct journal_replay **_i, *i, *dup;
+	struct journal_ptr *ptr;
+	size_t bytes = vstruct_bytes(j);
+	u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
+	int ret = JOURNAL_ENTRY_ADD_OK;
+
+	/* Is this entry older than the range we need? */
+	if (!c->opts.read_entire_journal &&
+	    le64_to_cpu(j->seq) < jlist->last_seq)
+		return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+
+	/*
+	 * genradixes are indexed by a ulong, not a u64, so we can't index them
+	 * by sequence number directly: Assume instead that they will all fall
+	 * within the range of +-2billion of the first one we find.
+	 */
+	if (!c->journal_entries_base_seq)
+		c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);
+
+	/* Drop entries we don't need anymore */
+	if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
+		genradix_for_each_from(&c->journal_entries, iter, _i,
+				       journal_entry_radix_idx(c, jlist->last_seq)) {
+			i = *_i;
+
+			if (!i || i->ignore)
+				continue;
+
+			if (le64_to_cpu(i->j.seq) >= last_seq)
+				break;
+			journal_replay_free(c, i);
+		}
+	}
+
+	jlist->last_seq = max(jlist->last_seq, last_seq);
+
+	_i = genradix_ptr_alloc(&c->journal_entries,
+				journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
+				GFP_KERNEL);
+	if (!_i)
+		return -BCH_ERR_ENOMEM_journal_entry_add;
+
+	/*
+	 * Duplicate journal entries? If so we want the one that didn't have a
+	 * checksum error:
+	 */
+	dup = *_i;
+	if (dup) {
+		if (bytes == vstruct_bytes(&dup->j) &&
+		    !memcmp(j, &dup->j, bytes)) {
+			i = dup;
+			goto found;
+		}
+
+		if (!entry_ptr.csum_good) {
+			i = dup;
+			goto found;
+		}
+
+		if (!dup->csum_good)
+			goto replace;
+
+		fsck_err(c, "found duplicate but non identical journal entries (seq %llu)",
+			 le64_to_cpu(j->seq));
+		i = dup;
+		goto found;
+	}
+replace:
+	i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
+	if (!i)
+		return -BCH_ERR_ENOMEM_journal_entry_add;
+
+	i->nr_ptrs	= 0;
+	i->csum_good	= entry_ptr.csum_good;
+	i->ignore	= false;
+	unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
+	i->ptrs[i->nr_ptrs++] = entry_ptr;
+
+	if (dup) {
+		if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) {
+			bch_err(c, "found too many copies of journal entry %llu",
+				le64_to_cpu(i->j.seq));
+			dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1;
+		}
+
+		/* The first ptr should represent the jset we kept: */
+		memcpy(i->ptrs + i->nr_ptrs,
+		       dup->ptrs,
+		       sizeof(dup->ptrs[0]) * dup->nr_ptrs);
+		i->nr_ptrs += dup->nr_ptrs;
+		__journal_replay_free(c, dup);
+	}
+
+	*_i = i;
+	return 0;
+found:
+	for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
+		if (ptr->dev == ca->dev_idx) {
+			bch_err(c, "duplicate journal entry %llu on same device",
+				le64_to_cpu(i->j.seq));
+			goto out;
+		}
+	}
+
+	if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) {
+		bch_err(c, "found too many copies of journal entry %llu",
+			le64_to_cpu(i->j.seq));
+		goto out;
+	}
+
+	i->ptrs[i->nr_ptrs++] = entry_ptr;
+out:
+fsck_err:
+	return ret;
+}
+
+/* this fills in a range with empty jset_entries: */
+static void journal_entry_null_range(void *start, void *end)
+{
+	struct jset_entry *entry;
+
+	for (entry = start; entry != end; entry = vstruct_next(entry))
+		memset(entry, 0, sizeof(*entry));
+}
+
+#define JOURNAL_ENTRY_REREAD	5
+#define JOURNAL_ENTRY_NONE	6
+#define JOURNAL_ENTRY_BAD	7
+
+/* Prefix for journal entry validation errors; @jset and @entry may be NULL */
+static void journal_entry_err_msg(struct printbuf *out, u32 version,
+				  struct jset *jset, struct jset_entry *entry)
+{
+	prt_str(out, "invalid journal entry, version=");
+	bch2_version_to_text(out, version);
+
+	if (entry) {
+		/* type is not range-checked here: never index past the name table */
+		if (entry->type < BCH_JSET_ENTRY_NR)
+			prt_printf(out, " type=%s", bch2_jset_entry_types[entry->type]);
+		else
+			prt_printf(out, " type=(unknown type %u)", entry->type);
+	}
+
+	if (!jset) {
+		prt_printf(out, " in superblock");
+	} else {
+		prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq));
+		if (entry)
+			prt_printf(out, " offset=%zi/%u",
+				   (u64 *) entry - jset->_data,
+				   le32_to_cpu(jset->u64s));
+	}
+
+	prt_str(out, ": ");
+}
+
+#define journal_entry_err(c, version, jset, entry, msg, ...)		\
+({									\
+	struct printbuf _buf = PRINTBUF;				\
+									\
+	journal_entry_err_msg(&_buf, version, jset, entry);		\
+	prt_printf(&_buf, msg, ##__VA_ARGS__);				\
+									\
+	switch (flags & BKEY_INVALID_WRITE) {				\
+	case READ:							\
+		mustfix_fsck_err(c, "%s", _buf.buf);			\
+		break;							\
+	case WRITE:							\
+		bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\
+		if (bch2_fs_inconsistent(c)) {				\
+			ret = -BCH_ERR_fsck_errors_not_fixed;		\
+			goto fsck_err;					\
+		}							\
+		break;							\
+	}								\
+									\
+	printbuf_exit(&_buf);						\
+	true;								\
+})
+
+#define journal_entry_err_on(cond, c, version, jset, entry, msg, ...)	\
+	((cond) ? journal_entry_err(c, version, jset, entry, msg, ##__VA_ARGS__) : false)
+
+#define FSCK_DELETED_KEY	5
+
+static int journal_validate_key(struct bch_fs *c,
+				struct jset *jset,
+				struct jset_entry *entry,
+				unsigned level, enum btree_id btree_id,
+				struct bkey_i *k,
+				unsigned version, int big_endian,
+				enum bkey_invalid_flags flags)
+{
+	int write = flags & BKEY_INVALID_WRITE;
+	void *next = vstruct_next(entry);
+	struct printbuf buf = PRINTBUF;
+	int ret = 0;
+
+	if (journal_entry_err_on(!k->k.u64s, c, version, jset, entry, "k->u64s 0")) {
+		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
+		journal_entry_null_range(vstruct_next(entry), next);
+		return FSCK_DELETED_KEY;
+	}
+
+	if (journal_entry_err_on((void *) bkey_next(k) >
+				 (void *) vstruct_next(entry),
+				 c, version, jset, entry,
+				 "extends past end of journal entry")) {
+		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
+		journal_entry_null_range(vstruct_next(entry), next);
+		return FSCK_DELETED_KEY;
+	}
+
+	if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
+				 c, version, jset, entry,
+				 "bad format %u", k->k.format)) {
+		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
+		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
+		journal_entry_null_range(vstruct_next(entry), next);
+		return FSCK_DELETED_KEY;
+	}
+
+	if (!write)
+		bch2_bkey_compat(level, btree_id, version, big_endian,
+				 write, NULL, bkey_to_packed(k));
+
+	if (bch2_bkey_invalid(c, bkey_i_to_s_c(k),
+			      __btree_node_type(level, btree_id), write, &buf)) {
+		printbuf_reset(&buf);
+		journal_entry_err_msg(&buf, version, jset, entry);
+		prt_newline(&buf);
+		printbuf_indent_add(&buf, 2);
+
+		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+		prt_newline(&buf);
+		bch2_bkey_invalid(c, bkey_i_to_s_c(k),
+				  __btree_node_type(level, btree_id), write, &buf);
+
+		mustfix_fsck_err(c, "%s", buf.buf);
+
+		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
+		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
+		journal_entry_null_range(vstruct_next(entry), next);
+
+		printbuf_exit(&buf);
+		return FSCK_DELETED_KEY;
+	}
+
+	if (write)
+		bch2_bkey_compat(level, btree_id, version, big_endian,
+				 write, NULL, bkey_to_packed(k));
+fsck_err:
+	printbuf_exit(&buf);
+	return ret;
+}
+
+static int journal_entry_btree_keys_validate(struct bch_fs *c,
+				struct jset *jset,
+				struct jset_entry *entry,
+				unsigned version, int big_endian,
+				enum bkey_invalid_flags flags)
+{
+	struct bkey_i *k = entry->start;
+
+	while (k != vstruct_last(entry)) {
+		int ret = journal_validate_key(c, jset, entry,
+					       entry->level,
+					       entry->btree_id,
+					       k, version, big_endian,
+					       flags|BKEY_INVALID_JOURNAL);
+		if (ret == FSCK_DELETED_KEY)
+			continue;
+
+		k = bkey_next(k);
+	}
+
+	return 0;
+}
+
+static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
+					     struct jset_entry *entry)
+{
+	struct bkey_i *k;
+	bool first = true;
+
+	jset_entry_for_each_key(entry, k) {
+		if (!first) {
+			prt_newline(out);
+			prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+		}
+		prt_printf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
+		bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
+		first = false;
+	}
+}
+
+static int journal_entry_btree_root_validate(struct bch_fs *c,
+				struct jset *jset,
+				struct jset_entry *entry,
+				unsigned version, int big_endian,
+				enum bkey_invalid_flags flags)
+{
+	struct bkey_i *k = entry->start;
+	int ret = 0;
+
+	if (journal_entry_err_on(!entry->u64s ||
+				 le16_to_cpu(entry->u64s) != k->k.u64s,
+				 c, version, jset, entry,
+				 "invalid btree root journal entry: wrong number of keys")) {
+		void *next = vstruct_next(entry);
+		/*
+		 * we don't want to null out this jset_entry,
+		 * just the contents, so that later we can tell
+		 * we were _supposed_ to have a btree root
+		 */
+		entry->u64s = 0;
+		journal_entry_null_range(vstruct_next(entry), next);
+		return 0;
+	}
+
+	return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
+				    version, big_endian, flags);
+fsck_err:
+	return ret;
+}
+
+static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
+					     struct jset_entry *entry)
+{
+	journal_entry_btree_keys_to_text(out, c, entry);
+}
+
+static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
+				struct jset *jset,
+				struct jset_entry *entry,
+				unsigned version, int big_endian,
+				enum bkey_invalid_flags flags)
+{
+	/* obsolete, don't care: */
+	return 0;
+}
+
+static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+					    struct jset_entry *entry)
+{
+}
+
+static int journal_entry_blacklist_validate(struct bch_fs *c,
+				struct jset *jset,
+				struct jset_entry *entry,
+				unsigned version, int big_endian,
+				enum bkey_invalid_flags flags)
+{
+	int ret = 0;
+
+	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
+				 c, version, jset, entry,
+		"invalid journal seq blacklist entry: bad size")) {
+		journal_entry_null_range(entry, vstruct_next(entry));
+	}
+fsck_err:
+	return ret;
+}
+
+static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
+					    struct jset_entry *entry)
+{
+	struct jset_entry_blacklist *bl =
+		container_of(entry, struct jset_entry_blacklist, entry);
+
+	prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
+}
+
+static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
+				struct jset *jset,
+				struct jset_entry *entry,
+				unsigned version, int big_endian,
+				enum bkey_invalid_flags flags)
+{
+	struct jset_entry_blacklist_v2 *bl_entry;
+	int ret = 0;
+
+	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
+				 c, version, jset, entry,
+		"invalid journal seq blacklist entry: bad size")) {
+		journal_entry_null_range(entry, vstruct_next(entry));
+		goto out;
+	}
+
+	bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
+
+	if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
+				 le64_to_cpu(bl_entry->end),
+				 c, version, jset, entry,
+		"invalid journal seq blacklist entry: start > end")) {
+		journal_entry_null_range(entry, vstruct_next(entry));
+	}
+out:
+fsck_err:
+	return ret;
+}
+
+static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
+					       struct jset_entry *entry)
+{
+	struct jset_entry_blacklist_v2 *bl =
+		container_of(entry, struct jset_entry_blacklist_v2, entry);
+
+	prt_printf(out, "start=%llu end=%llu",
+	       le64_to_cpu(bl->start),
+	       le64_to_cpu(bl->end));
+}
+
+static int journal_entry_usage_validate(struct bch_fs *c,
+				struct jset *jset,
+				struct jset_entry *entry,
+				unsigned version, int big_endian,
+				enum bkey_invalid_flags flags)
+{
+	struct jset_entry_usage *u =
+		container_of(entry, struct jset_entry_usage, entry);
+	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+	int ret = 0;
+
+	if (journal_entry_err_on(bytes < sizeof(*u),
+				 c, version, jset, entry,
+				 "invalid journal entry usage: bad size")) {
+		journal_entry_null_range(entry, vstruct_next(entry));
+		return ret;
+	}
+
+fsck_err:
+	return ret;
+}
+
+static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
+					struct jset_entry *entry)
+{
+	struct jset_entry_usage *u =
+		container_of(entry, struct jset_entry_usage, entry);
+
+	prt_printf(out, "type=%s v=%llu",
+	       bch2_fs_usage_types[u->entry.btree_id],
+	       le64_to_cpu(u->v));
+}
+
+/*
+ * A data_usage entry must hold its fixed part plus r.nr_devs trailing bytes.
+ */
+static int journal_entry_data_usage_validate(struct bch_fs *c, struct jset *jset,
+				struct jset_entry *entry,
+				unsigned version, int big_endian,
+				enum bkey_invalid_flags flags)
+{
+	struct jset_entry_data_usage *usage =
+		container_of(entry, struct jset_entry_data_usage, entry);
+	unsigned entry_bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+	int ret = 0;
+
+	/* r.nr_devs is only read once the fixed-size part is known to fit */
+	if (journal_entry_err_on(entry_bytes < sizeof(*usage) ||
+				 entry_bytes < sizeof(*usage) + usage->r.nr_devs,
+				 c, version, jset, entry,
+				 "invalid journal entry usage: bad size"))
+		journal_entry_null_range(entry, vstruct_next(entry));
+fsck_err:
+	return ret;
+}
+
+/* Print the replicas entry followed by "=<v>". */
+static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
+					     struct jset_entry *entry)
+{
+	struct jset_entry_data_usage *du = container_of(entry, struct jset_entry_data_usage, entry);
+
+	bch2_replicas_entry_to_text(out, &du->r);
+	prt_printf(out, "=%llu", le64_to_cpu(du->v));
+}
+
+/*
+ * Validate a clock entry: its size must be exactly
+ * sizeof(struct jset_entry_clock) and rw must be 0 or 1.  The first failing
+ * check is reported and the entry is handed to journal_entry_null_range().
+ */
+static int journal_entry_clock_validate(struct bch_fs *c, struct jset *jset,
+				struct jset_entry *entry,
+				unsigned version, int big_endian,
+				enum bkey_invalid_flags flags)
+{
+	struct jset_entry_clock *clk =
+		container_of(entry, struct jset_entry_clock, entry);
+	unsigned entry_bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+	int ret = 0;
+
+	/* rw is only looked at once the size check has passed */
+	if (journal_entry_err_on(entry_bytes != sizeof(*clk),
+				 c, version, jset, entry, "bad size"))
+		journal_entry_null_range(entry, vstruct_next(entry));
+	else if (journal_entry_err_on(clk->rw > 1,
+				      c, version, jset, entry, "bad rw"))
+		journal_entry_null_range(entry, vstruct_next(entry));
+
+fsck_err:
+	return ret;
+}
+
+/* Print "read=<time>" or "write=<time>" depending on the entry's rw field. */
+static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
+					struct jset_entry *entry)
+{
+	struct jset_entry_clock *clk = container_of(entry, struct jset_entry_clock, entry);
+
+	prt_printf(out, "%s=%llu", clk->rw ? "write" : "read", le64_to_cpu(clk->time));
+}
+
+/*
+ * Validate a dev_usage entry: it must be at least
+ * sizeof(struct jset_entry_dev_usage) bytes, name a device accepted by
+ * bch2_dev_exists2(), and have a zero pad field.  The first failing check is
+ * reported, the entry is handed to journal_entry_null_range(), and the
+ * remaining checks are skipped.
+ */
+static int journal_entry_dev_usage_validate(struct bch_fs *c, struct jset *jset,
+				struct jset_entry *entry,
+				unsigned version, int big_endian,
+				enum bkey_invalid_flags flags)
+{
+	struct jset_entry_dev_usage *usage =
+		container_of(entry, struct jset_entry_dev_usage, entry);
+	unsigned entry_bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+	unsigned min_bytes = sizeof(*usage);
+	int ret = 0;
+
+	if (journal_entry_err_on(entry_bytes < min_bytes,
+				 c, version, jset, entry, "bad size (%u < %u)",
+				 entry_bytes, min_bytes))
+		goto null_entry;
+
+	if (journal_entry_err_on(!bch2_dev_exists2(c, le32_to_cpu(usage->dev)),
+				 c, version, jset, entry, "bad dev"))
+		goto null_entry;
+
+	if (journal_entry_err_on(usage->pad,
+				 c, version, jset, entry, "bad pad"))
+		goto null_entry;
+
+	return ret;
+null_entry:
+	journal_entry_null_range(entry, vstruct_next(entry));
+fsck_err:
+	return ret;
+}
+
+/* Print the device index, the per-data-type counters and buckets_ec. */
+static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
+					    struct jset_entry *entry)
+{
+	struct jset_entry_dev_usage *usage =
+		container_of(entry, struct jset_entry_dev_usage, entry);
+	unsigned type, nr_types = jset_entry_dev_usage_nr_types(usage);
+
+	prt_printf(out, "dev=%u", le32_to_cpu(usage->dev));
+	for (type = 0; type < nr_types; type++) {
+		if (type >= BCH_DATA_NR)
+			prt_printf(out, " (unknown data type %u)", type);
+		else
+			prt_printf(out, " %s", bch2_data_types[type]);
+		prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
+			   le64_to_cpu(usage->d[type].buckets),
+			   le64_to_cpu(usage->d[type].sectors),
+			   le64_to_cpu(usage->d[type].fragmented));
+	}
+
+	prt_printf(out, " buckets_ec: %llu", le64_to_cpu(usage->buckets_ec));
+}
+
+/* No checks are performed on log entries; always returns 0. */
+static int journal_entry_log_validate(struct bch_fs *c, struct jset *jset,
+				struct jset_entry *entry,
+				unsigned version, int big_endian,
+				enum bkey_invalid_flags flags)
+{
+	return 0;
+}
+
+static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
+				      struct jset_entry *entry)
+{
+	struct jset_entry_log *log = container_of(entry, struct jset_entry_log, entry);
+	/* payload length: the whole entry minus everything that precedes 'd' */
+	unsigned len = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d);
+
+	prt_printf(out, "%.*s", len, log->d);
+}
+
+/* Overwrite entries are checked by the btree_keys validator. */
+static int journal_entry_overwrite_validate(struct bch_fs *c, struct jset *jset,
+				struct jset_entry *entry, unsigned version,
+				int big_endian, enum bkey_invalid_flags flags)
+{
+	/* READ is passed on here, not the caller's 'flags' */
+	return journal_entry_btree_keys_validate(c, jset, entry, version,
+						 big_endian, READ);
+}
+
+static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
+					    struct jset_entry *entry)
+{
+	journal_entry_btree_keys_to_text(out, c, entry); /* printed via the btree_keys printer */
+}
+
+struct jset_entry_ops {	/* handlers for one journal entry type */
+	int (*validate)(struct bch_fs *c, struct jset *jset,
+			struct jset_entry *entry, unsigned version,
+			int big_endian, enum bkey_invalid_flags flags);
+	void (*to_text)(struct printbuf *out, struct bch_fs *c, struct jset_entry *entry);
+};
+
+static const struct jset_entry_ops bch2_jset_entry_ops[] = {	/* indexed by BCH_JSET_ENTRY_<type> */
+#define x(f, nr)						\
+	[BCH_JSET_ENTRY_##f]	= (struct jset_entry_ops) {	\
+		.validate	= journal_entry_##f##_validate,	\
+		.to_text	= journal_entry_##f##_to_text,	\
+	},
+	BCH_JSET_ENTRY_TYPES()	/* presumably expands x() once per entry type (defined elsewhere) */
+#undef x
+};
+
+/* Run the type-specific validator; entries of unknown type are accepted (0). */
+int bch2_journal_entry_validate(struct bch_fs *c, struct jset *jset,
+				struct jset_entry *entry, unsigned version,
+				int big_endian, enum bkey_invalid_flags flags)
+{
+	if (entry->type >= BCH_JSET_ENTRY_NR)
+		return 0;
+
+	return bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
+				version, big_endian, flags);
+}
+
+void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
+				struct jset_entry *entry)
+{
+	if (entry->type >= BCH_JSET_ENTRY_NR) {	/* no handlers for this type */
+		prt_printf(out, "(unknown type %u)", entry->type);
+		return;
+	}
+	prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+	bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
+}
+
+/* Validate each entry in turn; a jset is cut short at an entry that overruns it. */
+static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
+				 enum bkey_invalid_flags flags)
+{
+	unsigned version = le32_to_cpu(jset->version);
+	struct jset_entry *entry;
+	int ret = 0;
+
+	vstruct_for_each(jset, entry) {
+		if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
+					 c, version, jset, entry,
+					 "journal entry extends past end of jset")) {
+			jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
+			return ret;
+		}
+		ret = bch2_journal_entry_validate(c, jset, entry, version,
+						  JSET_BIG_ENDIAN(jset), flags);
+		if (ret)
+			return ret;
+	}
+fsck_err:
+	return ret;
+}
+
+/*
+ * Returns JOURNAL_ENTRY_NONE on magic mismatch, -EINVAL on an incompatible
+ * version, JOURNAL_ENTRY_BAD if a flush entry has last_seq > seq (last_seq is
+ * then clamped to seq), else the result of jset_validate_entries().
+ */
+static int jset_validate(struct bch_fs *c, struct bch_dev *ca,
+			 struct jset *jset, u64 sector,
+			 enum bkey_invalid_flags flags)
+{
+	const char *where = ca ? ca->name : c->name;
+	u64 seq = le64_to_cpu(jset->seq);
+	unsigned version;
+	int ret = 0;
+
+	if (le64_to_cpu(jset->magic) != jset_magic(c))
+		return JOURNAL_ENTRY_NONE;
+
+	version = le32_to_cpu(jset->version);
+	if (journal_entry_err_on(!bch2_version_compatible(version),
+			c, version, jset, NULL,
+			"%s sector %llu seq %llu: incompatible journal entry version %u.%u",
+			where, sector, seq,
+			BCH_VERSION_MAJOR(version), BCH_VERSION_MINOR(version)))
+		return -EINVAL;		/* don't try to continue */
+
+	/* NOTE: this ret is overwritten by jset_validate_entries() below */
+	if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
+			c, version, jset, NULL,
+			"%s sector %llu seq %llu: journal entry with unknown csum type %llu",
+			where, sector, seq, JSET_CSUM_TYPE(jset)))
+		ret = JOURNAL_ENTRY_BAD;
+
+	/* last_seq is ignored when JSET_NO_FLUSH is true */
+	if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
+				 le64_to_cpu(jset->last_seq) > seq,
+				 c, version, jset, NULL,
+				 "invalid journal entry: last_seq > seq (%llu > %llu)",
+				 le64_to_cpu(jset->last_seq), seq)) {
+		jset->last_seq = jset->seq;
+		return JOURNAL_ENTRY_BAD;
+	}
+
+	ret = jset_validate_entries(c, jset, flags);
+fsck_err:
+	return ret;
+}
+
+/*
+ * Early checks on a jset that may only be partly read: magic, version, size.
+ * Returns JOURNAL_ENTRY_NONE, -EINVAL or JOURNAL_ENTRY_REREAD (more of the
+ * bucket must be read) on the corresponding condition.  A jset that claims
+ * more than is left in the bucket has u64s reduced by the excess.
+ */
+static int jset_validate_early(struct bch_fs *c, struct bch_dev *ca,
+			 struct jset *jset, u64 sector,
+			 unsigned bucket_sectors_left, unsigned sectors_read)
+{
+	enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
+	const char *where = ca ? ca->name : c->name;
+	size_t bytes = vstruct_bytes(jset);
+	unsigned version;
+	int ret = 0;
+
+	if (le64_to_cpu(jset->magic) != jset_magic(c))
+		return JOURNAL_ENTRY_NONE;
+
+	version = le32_to_cpu(jset->version);
+	if (journal_entry_err_on(!bch2_version_compatible(version),
+			 c, version, jset, NULL,
+			"%s sector %llu seq %llu: unknown journal entry version %u.%u",
+			where, sector, le64_to_cpu(jset->seq),
+			BCH_VERSION_MAJOR(version), BCH_VERSION_MINOR(version)))
+		return -EINVAL;		/* don't try to continue */
+
+	/* the jset is longer than what was read and the bucket has more to read */
+	if (sectors_read < bucket_sectors_left && bytes > (sectors_read << 9))
+		return JOURNAL_ENTRY_REREAD;
+
+	if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
+			 c, version, jset, NULL,
+			"%s sector %llu seq %llu: journal entry too big (%zu bytes)",
+			where, sector, le64_to_cpu(jset->seq), bytes))
+		le32_add_cpu(&jset->u64s,
+			     -((bytes - (bucket_sectors_left << 9)) / 8));
+fsck_err:
+	return ret;
+}
+
+struct journal_read_buf {
+	void		*data;	/* allocated with kvpmalloc() in journal_read_buf_realloc() */
+	size_t		size;	/* size passed to kvpmalloc()/kvpfree() for data */
+};
+
+/* Swap in a fresh buffer of at least new_size bytes; old contents are not kept. */
+static int journal_read_buf_realloc(struct journal_read_buf *b, size_t new_size)
+{
+	void *fresh;
+
+	/* the bios are sized for this many pages, max: */
+	if (new_size > JOURNAL_ENTRY_SIZE_MAX)
+		return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
+
+	new_size = roundup_pow_of_two(new_size);
+	fresh = kvpmalloc(new_size, GFP_KERNEL);
+	if (!fresh)
+		return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
+
+	kvpfree(b->data, b->size);
+	b->size = new_size;
+	b->data = fresh;
+	return 0;
+}
+
+/*
+ * Read journal bucket @bucket of @ca, passing each jset to journal_entry_add();
+ * @buf is grown on demand.  Returns 0 (also when a read error cuts the scan
+ * short), -ENOMEM if the bio cannot be allocated, or a nonzero helper result.
+ */
+static int journal_read_bucket(struct bch_dev *ca, struct journal_read_buf *buf,
+			       struct journal_list *jlist, unsigned bucket)
+{
+	struct bch_fs *c = ca->fs;
+	struct journal_device *ja = &ca->journal;
+	struct jset *j = NULL;
+	unsigned sectors, sectors_read = 0;
+	u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
+	    end = offset + ca->mi.bucket_size;
+	bool saw_bad = false, csum_good;
+	int ret = 0;
+
+	pr_debug("reading %u", bucket);
+
+	while (offset < end) {
+		if (!sectors_read) {
+			struct bio *bio;
+			unsigned nr_bvecs;
+reread:
+			sectors_read = min_t(unsigned, end - offset, buf->size >> 9);
+			nr_bvecs = buf_pages(buf->data, sectors_read << 9);
+
+			/* bio_kmalloc() may fail; bio_init() must not see NULL */
+			bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
+			if (!bio)
+				return -ENOMEM;
+			bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);
+
+			bio->bi_iter.bi_sector = offset;
+			bch2_bio_map(bio, buf->data, sectors_read << 9);
+
+			ret = submit_bio_wait(bio);
+			kfree(bio);
+
+			if (bch2_dev_io_err_on(ret, ca,
+					       "journal read error: sector %llu",
+					       offset) ||
+			    bch2_meta_read_fault("journal")) {
+				/*
+				 * We don't error out of the recovery process
+				 * here, since the relevant journal entry may be
+				 * found on a different device, and missing or
+				 * no journal entries will be handled later
+				 */
+				return 0;
+			}
+
+			j = buf->data;
+		}
+
+		ret = jset_validate_early(c, ca, j, offset, end - offset, sectors_read);
+		switch (ret) {
+		case 0:
+			sectors = vstruct_sectors(j, c->block_bits);
+			break;
+		case JOURNAL_ENTRY_REREAD:
+			if (vstruct_bytes(j) > buf->size) {
+				ret = journal_read_buf_realloc(buf, vstruct_bytes(j));
+				if (ret)
+					return ret;
+			}
+			goto reread;
+		case JOURNAL_ENTRY_NONE:
+			if (!saw_bad)
+				return 0;
+			/*
+			 * On checksum error we don't really trust the size
+			 * field of the journal entry we read, so try reading
+			 * again at next block boundary:
+			 */
+			sectors = block_sectors(c);
+			goto next_block;
+		default:
+			return ret;
+		}
+
+		/*
+		 * This happens sometimes if we don't have discards on -
+		 * when we've partially overwritten a bucket with new
+		 * journal entries. We don't need the rest of the
+		 * bucket:
+		 */
+		if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
+			return 0;
+
+		ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
+
+		csum_good = jset_csum_good(c, j);
+		if (!csum_good)
+			saw_bad = true;
+
+		ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), j->encrypted_start,
+				   vstruct_end(j) - (void *) j->encrypted_start);
+		bch2_fs_fatal_err_on(ret, c, "error decrypting journal entry: %i", ret);
+
+		mutex_lock(&jlist->lock);
+		ret = journal_entry_add(c, ca, (struct journal_ptr) {
+					.csum_good	= csum_good,
+					.dev		= ca->dev_idx,
+					.bucket		= bucket,
+					.bucket_offset	= offset -
+						bucket_to_sector(ca, ja->buckets[bucket]),
+					.sector		= offset,
+					}, jlist, j);
+		mutex_unlock(&jlist->lock);
+
+		switch (ret) {
+		case JOURNAL_ENTRY_ADD_OK:
+		case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
+			break;
+		default:
+			return ret;
+		}
+next_block:
+		pr_debug("next");
+		offset		+= sectors;
+		sectors_read	-= sectors;
+		j = ((void *) j) + (sectors << 9);
+	}
+
+	return 0;
+}
+
+static void bch2_journal_read_device(struct closure *cl)
+{	/* Closure function: reads every journal bucket of one device; results go to jlist */
+	struct journal_device *ja =
+		container_of(cl, struct journal_device, read);
+	struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
+	struct bch_fs *c = ca->fs;
+	struct journal_list *jlist =
+		container_of(cl->parent, struct journal_list, cl);
+	struct journal_replay *r, **_r;
+	struct genradix_iter iter;
+	struct journal_read_buf buf = { NULL, 0 };
+	unsigned i;
+	int ret = 0;
+
+	if (!ja->nr)	/* no journal buckets on this device */
+		goto out;
+
+	ret = journal_read_buf_realloc(&buf, PAGE_SIZE);	/* initial buffer; journal_read_bucket() may grow it */
+	if (ret)
+		goto err;
+
+	pr_debug("%u journal buckets", ja->nr);
+
+	for (i = 0; i < ja->nr; i++) {
+		ret = journal_read_bucket(ca, &buf, jlist, i);
+		if (ret)
+			goto err;
+	}
+
+	ja->sectors_free = ca->mi.bucket_size;	/* kept if no entry on this device is found below */
+
+	mutex_lock(&jlist->lock);
+	genradix_for_each_reverse(&c->journal_entries, iter, _r) {
+		r = *_r;
+
+		if (!r)
+			continue;
+
+		for (i = 0; i < r->nr_ptrs; i++) {
+			if (r->ptrs[i].dev == ca->dev_idx) {
+				unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
+					vstruct_sectors(&r->j, c->block_bits);
+
+				ja->cur_idx = r->ptrs[i].bucket;	/* first match in reverse iteration wins */
+				ja->sectors_free = ca->mi.bucket_size - wrote;
+				goto found;
+			}
+		}
+	}
+found:
+	mutex_unlock(&jlist->lock);
+
+	if (ja->bucket_seq[ja->cur_idx] &&
+	    ja->sectors_free == ca->mi.bucket_size) {
+		bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
+		bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
+		for (i = 0; i < 3; i++) {	/* dump bucket_seq for cur_idx - 1, cur_idx, cur_idx + 1 (mod nr) */
+			unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr;
+
+			bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
+		}
+		ja->sectors_free = 0;
+	}
+
+	/*
+	 * Set dirty_idx to indicate the entire journal is full and needs to be
+	 * reclaimed - journal reclaim will immediately reclaim whatever isn't
+	 * pinned when it first runs:
+	 */
+	ja->discard_idx = ja->dirty_idx_ondisk =
+		ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
+out:
+	bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
+	kvpfree(buf.data, buf.size);
+	percpu_ref_put(&ca->io_ref);	/* pairs with percpu_ref_tryget() in bch2_journal_read() */
+	closure_return(cl);
+	return;
+err:
+	mutex_lock(&jlist->lock);
+	jlist->ret = ret;	/* checked by bch2_journal_read() after closure_sync() */
+	mutex_unlock(&jlist->lock);
+	goto out;
+}
+
+/*
+ * Print every location of journal entry @j as a space-separated list of
+ * "dev:bucket:bucket_offset (sector N)" items, straight from j->ptrs[].
+ *
+ * The fields are printed as stored, so no device lookup is done here; @c is
+ * unused and only kept so that callers need not change.
+ */
+void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+			       struct journal_replay *j)
+{
+	unsigned i;
+
+	for (i = 0; i < j->nr_ptrs; i++) {
+		if (i)
+			prt_printf(out, " ");
+		prt_printf(out, "%u:%u:%u (sector %llu)",
+			   j->ptrs[i].dev, j->ptrs[i].bucket,
+			   j->ptrs[i].bucket_offset, j->ptrs[i].sector);
+	}
+}
+
+int bch2_journal_read(struct bch_fs *c,
+		      u64 *last_seq,
+		      u64 *blacklist_seq,
+		      u64 *start_seq)
+{	/* Reads the journal from all member devices, then picks and checks the entries to replay */
+	struct journal_list jlist;
+	struct journal_replay *i, **_i, *prev = NULL;
+	struct genradix_iter radix_iter;
+	struct bch_dev *ca;
+	unsigned iter;
+	struct printbuf buf = PRINTBUF;
+	bool degraded = false, last_write_torn = false;
+	u64 seq;
+	int ret = 0;
+
+	closure_init_stack(&jlist.cl);
+	mutex_init(&jlist.lock);
+	jlist.last_seq = 0;
+	jlist.ret = 0;
+
+	for_each_member_device(ca, c, iter) {
+		if (!c->opts.fsck &&
+		    !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
+			continue;
+
+		if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
+		     ca->mi.state == BCH_MEMBER_STATE_ro) &&
+		    percpu_ref_tryget(&ca->io_ref))	/* dropped at the end of bch2_journal_read_device() */
+			closure_call(&ca->journal.read,
+				     bch2_journal_read_device,
+				     system_unbound_wq,
+				     &jlist.cl);
+		else
+			degraded = true;
+	}
+
+	closure_sync(&jlist.cl);	/* wait for the per-device reads started above */
+
+	if (jlist.ret)	/* set by bch2_journal_read_device() on error */
+		return jlist.ret;
+
+	*last_seq	= 0;
+	*start_seq	= 0;
+	*blacklist_seq	= 0;
+
+	/*
+	 * Find most recent flush entry, and ignore newer non flush entries -
+	 * those entries will be blacklisted:
+	 */
+	genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
+		enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
+
+		i = *_i;
+
+		if (!i || i->ignore)
+			continue;
+
+		if (!*start_seq)	/* first usable entry seen in reverse order */
+			*blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;
+
+		if (JSET_NO_FLUSH(&i->j)) {
+			i->ignore = true;
+			continue;
+		}
+
+		if (!last_write_torn && !i->csum_good) {	/* only the first such flush entry is skipped */
+			last_write_torn = true;
+			i->ignore = true;
+			continue;
+		}
+
+		if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
+					 c, le32_to_cpu(i->j.version), &i->j, NULL,
+					 "invalid journal entry: last_seq > seq (%llu > %llu)",
+					 le64_to_cpu(i->j.last_seq),
+					 le64_to_cpu(i->j.seq)))
+			i->j.last_seq = i->j.seq;
+
+		*last_seq	= le64_to_cpu(i->j.last_seq);
+		*blacklist_seq	= le64_to_cpu(i->j.seq) + 1;
+		break;
+	}
+
+	if (!*start_seq) {
+		bch_info(c, "journal read done, but no entries found");
+		return 0;
+	}
+
+	if (!*last_seq) {
+		fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
+		return 0;
+	}
+
+	bch_info(c, "journal read done, replaying entries %llu-%llu",
+		 *last_seq, *blacklist_seq - 1);
+
+	if (*start_seq != *blacklist_seq)
+		bch_info(c, "dropped unflushed entries %llu-%llu",
+			 *blacklist_seq, *start_seq - 1);
+
+	/* Drop blacklisted entries and entries older than last_seq: */
+	genradix_for_each(&c->journal_entries, radix_iter, _i) {
+		i = *_i;
+
+		if (!i || i->ignore)
+			continue;
+
+		seq = le64_to_cpu(i->j.seq);
+		if (seq < *last_seq) {
+			journal_replay_free(c, i);
+			continue;
+		}
+
+		if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
+			fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
+				    "found blacklisted journal entry %llu", seq);
+			i->ignore = true;
+		}
+	}
+
+	/* Check for missing entries: */
+	seq = *last_seq;
+	genradix_for_each(&c->journal_entries, radix_iter, _i) {
+		i = *_i;
+
+		if (!i || i->ignore)
+			continue;
+
+		BUG_ON(seq > le64_to_cpu(i->j.seq));
+
+		while (seq < le64_to_cpu(i->j.seq)) {
+			u64 missing_start, missing_end;
+			struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+
+			while (seq < le64_to_cpu(i->j.seq) &&
+			       bch2_journal_seq_is_blacklisted(c, seq, false))
+				seq++;	/* blacklisted sequence numbers do not count as missing */
+
+			if (seq == le64_to_cpu(i->j.seq))
+				break;
+
+			missing_start = seq;
+
+			while (seq < le64_to_cpu(i->j.seq) &&
+			       !bch2_journal_seq_is_blacklisted(c, seq, false))
+				seq++;
+
+			if (prev) {
+				bch2_journal_ptrs_to_text(&buf1, c, prev);
+				prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
+			} else
+				prt_printf(&buf1, "(none)");
+			bch2_journal_ptrs_to_text(&buf2, c, i);
+
+			missing_end = seq - 1;
+			fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
+				 "  prev at %s\n"
+				 "  next at %s",
+				 missing_start, missing_end,
+				 *last_seq, *blacklist_seq - 1,
+				 buf1.buf, buf2.buf);	/* NOTE(review): if fsck_err() jumps to fsck_err:, buf1/buf2 look leaked - confirm */
+
+			printbuf_exit(&buf1);
+			printbuf_exit(&buf2);
+		}
+
+		prev = i;
+		seq++;
+	}
+
+	genradix_for_each(&c->journal_entries, radix_iter, _i) {
+		struct bch_replicas_padded replicas = {
+			.e.data_type = BCH_DATA_journal,
+			.e.nr_required = 1,
+		};
+		unsigned ptr;
+
+		i = *_i;
+		if (!i || i->ignore)
+			continue;
+
+		for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
+			ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
+
+			if (!i->ptrs[ptr].csum_good)
+				bch_err_dev_offset(ca, i->ptrs[ptr].sector,
+						   "invalid journal checksum, seq %llu%s",
+						   le64_to_cpu(i->j.seq),
+						   i->csum_good ? " (had good copy on another device)" : "");
+		}
+
+		ret = jset_validate(c,
+				    bch_dev_bkey_exists(c, i->ptrs[0].dev),
+				    &i->j,
+				    i->ptrs[0].sector,
+				    READ);
+		if (ret)
+			goto err;
+
+		for (ptr = 0; ptr < i->nr_ptrs; ptr++)	/* build the replicas entry from the devices holding this entry */
+			replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
+
+		bch2_replicas_entry_sort(&replicas.e);
+
+		printbuf_reset(&buf);
+		bch2_replicas_entry_to_text(&buf, &replicas.e);
+
+		if (!degraded &&
+		    !bch2_replicas_marked(c, &replicas.e) &&
+		    (le64_to_cpu(i->j.seq) == *last_seq ||
+		     fsck_err(c, "superblock not marked as containing replicas for journal entry %llu\n  %s",
+			      le64_to_cpu(i->j.seq), buf.buf))) {
+			ret = bch2_mark_replicas(c, &replicas.e);
+			if (ret)
+				goto err;
+		}
+	}
+err:
+fsck_err:
+	printbuf_exit(&buf);
+	return ret;
+}
+
+/* journal write: */
+
+/*
+ * Walk @devs_sorted and append a pointer to @w->key for each usable device
+ * until *@replicas reaches @replicas_want.  A device is skipped if it has
+ * zero durability, is not rw, has no journal buckets, is already in the key,
+ * or has fewer than @sectors free in its current journal bucket.
+ *
+ * For each device used, its stripe state is bumped, sectors_free shrinks by
+ * @sectors, bucket_seq[cur_idx] is set to the entry's seq and *@replicas
+ * grows by the device's durability.
+ *
+ * Devices are looked up with rcu_dereference(); journal_write_alloc() calls
+ * this under rcu_read_lock().
+ */
+static void __journal_write_alloc(struct journal *j, struct journal_buf *w,
+				  struct dev_alloc_list *devs_sorted,
+				  unsigned sectors, unsigned *replicas,
+				  unsigned replicas_want)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	unsigned idx;
+
+	for (idx = 0; idx < devs_sorted->nr && *replicas < replicas_want; idx++) {
+		struct bch_dev *ca = rcu_dereference(c->devs[devs_sorted->devs[idx]]);
+		struct journal_device *ja;
+
+		if (!ca)
+			continue;
+
+		ja = &ca->journal;
+
+		/* Skip devices we cannot use, or are already using: */
+		if (!ca->mi.durability ||
+		    ca->mi.state != BCH_MEMBER_STATE_rw ||
+		    !ja->nr ||
+		    bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
+		    sectors > ja->sectors_free)
+			continue;
+
+		bch2_dev_stripe_increment(ca, &j->wp.stripe);
+
+		/* the write starts right after what is already used in the bucket */
+		bch2_bkey_append_ptr(&w->key, (struct bch_extent_ptr) {
+			.offset	= bucket_to_sector(ca, ja->buckets[ja->cur_idx]) +
+				  ca->mi.bucket_size - ja->sectors_free,
+			.dev	= ca->dev_idx,
+		});
+
+		ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
+		ja->sectors_free -= sectors;
+
+		*replicas += ca->mi.durability;
+	}
+}
+
+/**
+ * journal_write_alloc - decide where to write next journal entry
+ * @j:		journal object
+ * @w:		journal buf (entry to be written); pointers are appended to w->key
+ *
+ * Returns: 0 if at least c->opts.metadata_replicas_required replicas were
+ * allocated, or -EROFS otherwise
+ */
+static int journal_write_alloc(struct journal *j, struct journal_buf *w)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct bch_devs_mask devs;
+	struct journal_device *ja;
+	struct bch_dev *ca;
+	struct dev_alloc_list devs_sorted;
+	unsigned sectors = vstruct_sectors(w->data, c->block_bits);
+	unsigned target = c->opts.metadata_target ?:
+		c->opts.foreground_target;	/* metadata_target if nonzero, else foreground_target */
+	unsigned i, replicas = 0, replicas_want =
+		READ_ONCE(c->opts.metadata_replicas);
+
+	rcu_read_lock();
+retry:
+	devs = target_rw_devs(c, BCH_DATA_journal, target);
+
+	devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
+
+	__journal_write_alloc(j, w, &devs_sorted,
+			      sectors, &replicas, replicas_want);	/* first pass: current buckets as they are */
+
+	if (replicas >= replicas_want)
+		goto done;
+
+	for (i = 0; i < devs_sorted.nr; i++) {	/* move devices whose current bucket is too full to their next bucket */
+		ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
+		if (!ca)
+			continue;
+
+		ja = &ca->journal;
+
+		if (sectors > ja->sectors_free &&
+		    sectors <= ca->mi.bucket_size &&
+		    bch2_journal_dev_buckets_available(j, ja,
+					journal_space_discarded)) {
+			ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
+			ja->sectors_free = ca->mi.bucket_size;
+
+			/*
+			 * ja->bucket_seq[ja->cur_idx] must always have
+			 * something sensible:
+			 */
+			ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
+		}
+	}
+
+	__journal_write_alloc(j, w, &devs_sorted,
+			      sectors, &replicas, replicas_want);	/* second pass, after advancing buckets */
+
+	if (replicas < replicas_want && target) {
+		/* Retry from all devices: */
+		target = 0;
+		goto retry;
+	}
+done:
+	rcu_read_unlock();
+
+	BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
+
+	return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
+}
+
+/* Grow @buf to j->buf_size_want (best effort); called without j->lock held. */
+static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
+{
+	unsigned want = READ_ONCE(j->buf_size_want);
+	void *spare;
+
+	if (want <= buf->buf_size)
+		return;
+
+	spare = kvpmalloc(want, GFP_NOFS|__GFP_NOWARN);
+	if (!spare)
+		return;
+
+	memcpy(spare, buf->data, buf->buf_size);
+
+	spin_lock(&j->lock);
+	swap(buf->data,		spare);
+	swap(buf->buf_size,	want);
+	spin_unlock(&j->lock);
+
+	kvpfree(spare, want);	/* after the swaps these are the old buffer and its size */
+}
+
+static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
+{	/* slot = last unwritten seq & JOURNAL_BUF_MASK */
+	return &j->buf[journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK];
+}
+
+static void journal_write_done(struct closure *cl)
+{	/* Closure continuation of do_journal_write(): records the result of one journal write */
+	struct journal *j = container_of(cl, struct journal, io);
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct journal_buf *w = journal_last_unwritten_buf(j);
+	struct bch_replicas_padded replicas;
+	union journal_res_state old, new;
+	u64 v, seq;
+	int err = 0;
+
+	bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
+			       ? j->flush_write_time
+			       : j->noflush_write_time, j->write_start_time);
+
+	if (!w->devs_written.nr) {	/* no device left in the list, e.g. all dropped by journal_write_endio() */
+		bch_err(c, "unable to write journal to sufficient devices");
+		err = -EIO;
+	} else {
+		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
+					 w->devs_written);
+		if (bch2_mark_replicas(c, &replicas.e))
+			err = -EIO;
+	}
+
+	if (err)
+		bch2_fatal_error(c);
+
+	spin_lock(&j->lock);
+	seq = le64_to_cpu(w->data->seq);
+
+	if (seq >= j->pin.front)
+		journal_seq_pin(j, seq)->devs = w->devs_written;
+
+	if (!err) {
+		if (!JSET_NO_FLUSH(w->data)) {	/* only flush writes advance the on-disk flushed/last seq */
+			j->flushed_seq_ondisk = seq;
+			j->last_seq_ondisk = w->last_seq;
+
+			bch2_do_discards(c);
+			closure_wake_up(&c->freelist_wait);
+
+			bch2_reset_alloc_cursors(c);
+		}
+	} else if (!j->err_seq || seq < j->err_seq)	/* keep the lowest failing seq */
+		j->err_seq	= seq;
+
+	j->seq_ondisk		= seq;
+
+	/*
+	 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
+	 * more buckets:
+	 *
+	 * Must come before signaling write completion, for
+	 * bch2_fs_journal_stop():
+	 */
+	if (j->watermark != BCH_WATERMARK_stripe)
+		journal_reclaim_kick(&c->journal);
+
+	/* also must come before signalling write completion: */
+	closure_debug_destroy(cl);
+
+	v = atomic64_read(&j->reservations.counter);
+	do {
+		old.v = new.v = v;
+		BUG_ON(journal_state_count(new, new.unwritten_idx));
+
+		new.unwritten_idx++;
+	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
+				       old.v, new.v)) != old.v);	/* retry until the increment is applied atomically */
+
+	bch2_journal_space_available(j);
+
+	closure_wake_up(&w->wait);
+	journal_wake(j);
+
+	if (!journal_state_count(new, new.unwritten_idx) &&
+	    journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
+		spin_unlock(&j->lock);
+		closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);	/* start the next write right away */
+	} else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
+		   new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
+		struct journal_buf *buf = journal_cur_buf(j);
+		long delta = buf->expires - jiffies;
+
+		/*
+		 * We don't close a journal entry to write it while there's
+		 * previous entries still in flight - the current journal entry
+		 * might want to be written now:
+		 */
+
+		spin_unlock(&j->lock);
+		mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
+	} else {
+		spin_unlock(&j->lock);
+	}
+}
+
+static void journal_write_endio(struct bio *bio)
+{	/* bi_end_io for journal writes: on error drop the device from devs_written, then put refs */
+	struct bch_dev *ca = bio->bi_private;	/* set in do_journal_write() */
+	struct journal *j = &ca->fs->journal;
+	struct journal_buf *w = journal_last_unwritten_buf(j);
+	unsigned long flags;
+
+	if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: %s",
+			       le64_to_cpu(w->data->seq),
+			       bch2_blk_status_to_str(bio->bi_status)) ||
+	    bch2_meta_write_fault("journal")) {
+		spin_lock_irqsave(&j->err_lock, flags);
+		bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
+		spin_unlock_irqrestore(&j->err_lock, flags);
+	}
+
+	closure_put(&j->io);
+	percpu_ref_put(&ca->io_ref);	/* taken by percpu_ref_tryget() in do_journal_write() */
+}
+
+/*
+ * Submit the journal buffer to every device pointed to by w->key, then
+ * continue at journal_write_done() on c->io_complete_wq.
+ */
+static void do_journal_write(struct closure *cl)
+{
+	struct journal *j = container_of(cl, struct journal, io);
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct bch_dev *ca;
+	struct journal_buf *w = journal_last_unwritten_buf(j);
+	struct bch_extent_ptr *ptr;
+	struct bio *bio;
+	unsigned sectors = vstruct_sectors(w->data, c->block_bits);
+
+	extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
+		ca = bch_dev_bkey_exists(c, ptr->dev);
+		if (!percpu_ref_tryget(&ca->io_ref)) {
+			/* XXX: fix this */
+			bch_err(c, "missing device for journal write");
+			continue;
+		}
+
+		this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], sectors);
+		bio = ca->journal.bio;
+		bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
+		bio->bi_iter.bi_sector	= ptr->offset;
+		bio->bi_end_io		= journal_write_endio;	/* puts io_ref and the closure ref */
+		bio->bi_private		= ca;
+
+		BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
+		ca->prev_journal_sector = bio->bi_iter.bi_sector;
+
+		if (!JSET_NO_FLUSH(w->data))
+			bio->bi_opf    |= REQ_FUA;
+		if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
+			bio->bi_opf    |= REQ_PREFLUSH;
+
+		bch2_bio_map(bio, w->data, sectors << 9);
+		trace_and_count(c, journal_write, bio);
+		closure_bio_submit(bio, cl);
+
+		ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq);
+	}
+
+	continue_at(cl, journal_write_done, c->io_complete_wq);
+}
+
+static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset)
+{	/* Compacts jset's entries in place and sets jset->u64s to the compacted length */
+	struct jset_entry *i, *next, *prev = NULL;	/* prev: last entry kept so far (NULL before the first) */
+
+	/*
+	 * Simple compaction, dropping empty jset_entries (from journal
+	 * reservations that weren't fully used) and merging jset_entries that
+	 * can be.
+	 *
+	 * If we wanted to be really fancy here, we could sort all the keys in
+	 * the jset and drop keys that were overwritten - probably not worth it:
+	 */
+	vstruct_for_each_safe(jset, i, next) {
+		unsigned u64s = le16_to_cpu(i->u64s);
+
+		/* Empty entry: */
+		if (!u64s)
+			continue;
+
+		if (i->type == BCH_JSET_ENTRY_btree_root)
+			bch2_journal_entry_to_btree_root(c, i);
+
+		/* Can we merge with previous entry? */
+		if (prev &&
+		    i->btree_id == prev->btree_id &&
+		    i->level	== prev->level &&
+		    i->type	== prev->type &&
+		    i->type	== BCH_JSET_ENTRY_btree_keys &&
+		    le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {	/* merged size must still fit the 16-bit u64s field */
+			memmove_u64s_down(vstruct_next(prev),
+					  i->_data,
+					  u64s);	/* append i's payload (not its header) right after prev */
+			le16_add_cpu(&prev->u64s, u64s);
+			continue;
+		}
+
+		/* Couldn't merge, move i into new position (after prev): */
+		prev = prev ? vstruct_next(prev) : jset->start;
+		if (i != prev)
+			memmove_u64s_down(prev, i, jset_u64s(u64s));	/* jset_u64s() presumably adds the entry header - confirm */
+	}
+
+	prev = prev ? vstruct_next(prev) : jset->start;	/* one past the last kept entry */
+	jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
+}
+
+void bch2_journal_write(struct closure *cl)
+{
+	struct journal *j = container_of(cl, struct journal, io);
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct bch_dev *ca;
+	struct journal_buf *w = journal_last_unwritten_buf(j);
+	struct bch_replicas_padded replicas;
+	struct jset_entry *start, *end;
+	struct jset *jset;
+	struct bio *bio;
+	struct printbuf journal_debug_buf = PRINTBUF;
+	bool validate_before_checksum = false;
+	unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
+	int ret;
+
+	BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+
+	journal_buf_realloc(j, w);
+	jset = w->data;
+
+	j->write_start_time = local_clock();
+
+	spin_lock(&j->lock);
+
+	/*
+	 * If the journal is in an error state - we did an emergency shutdown -
+	 * we prefer to continue doing journal writes. We just mark them as
+	 * noflush so they'll never be used, but they'll still be visible by the
+	 * list_journal tool - this helps in debugging.
+	 *
+	 * There's a caveat: the first journal write after marking the
+	 * superblock dirty must always be a flush write, because on startup
+	 * from a clean shutdown we didn't necessarily read the journal and the
+	 * new journal write might overwrite whatever was in the journal
+	 * previously - we can't leave the journal without any flush writes in
+	 * it.
+	 *
+	 * So if we're in an error state, and we're still starting up, we don't
+	 * write anything at all.
+	 */
+	if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) &&
+	    (bch2_journal_error(j) ||
+	     w->noflush ||
+	     (!w->must_flush &&
+	      (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
+	      test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) {
+		w->noflush = true;
+		SET_JSET_NO_FLUSH(jset, true);
+		jset->last_seq	= 0;
+		w->last_seq	= 0;
+
+		j->nr_noflush_writes++;
+	} else if (!bch2_journal_error(j)) {
+		j->last_flush_write = jiffies;
+		j->nr_flush_writes++;
+		clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
+	} else {
+		spin_unlock(&j->lock);
+		goto err;
+	}
+	spin_unlock(&j->lock);
+
+	/*
+	 * New btree roots are set by journalling them; when the journal entry
+	 * gets written we have to propagate them to c->btree_roots
+	 *
+	 * But, every journal entry we write has to contain all the btree roots
+	 * (at least for now); so after we copy btree roots to c->btree_roots we
+	 * have to get any missing btree roots and add them to this journal
+	 * entry:
+	 */
+
+	bch2_journal_entries_postprocess(c, jset);
+
+	start = end = vstruct_last(jset);
+
+	end	= bch2_btree_roots_to_journal_entries(c, jset->start, end);
+
+	bch2_journal_super_entries_add_common(c, &end,
+				le64_to_cpu(jset->seq));
+	u64s	= (u64 *) end - (u64 *) start;
+	BUG_ON(u64s > j->entry_u64s_reserved);
+
+	le32_add_cpu(&jset->u64s, u64s);
+
+	sectors = vstruct_sectors(jset, c->block_bits);
+	bytes	= vstruct_bytes(jset);
+
+	if (sectors > w->sectors) {
+		bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
+				    vstruct_bytes(jset), w->sectors << 9,
+				    u64s, w->u64s_reserved, j->entry_u64s_reserved);
+		goto err;
+	}
+
+	jset->magic		= cpu_to_le64(jset_magic(c));
+	jset->version		= cpu_to_le32(c->sb.version);
+
+	SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
+	SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
+
+	if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
+		j->last_empty_seq = le64_to_cpu(jset->seq);
+
+	if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
+		validate_before_checksum = true;
+
+	if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
+		validate_before_checksum = true;
+
+	if (validate_before_checksum &&
+	    jset_validate(c, NULL, jset, 0, WRITE))
+		goto err;
+
+	ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
+		    jset->encrypted_start,
+		    vstruct_end(jset) - (void *) jset->encrypted_start);
+	if (bch2_fs_fatal_err_on(ret, c,
+			"error encrypting journal entry: %i", ret))
+		goto err;
+
+	jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
+				  journal_nonce(jset), jset);
+
+	if (!validate_before_checksum &&
+	    jset_validate(c, NULL, jset, 0, WRITE))
+		goto err;
+
+	memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
+
+retry_alloc:
+	spin_lock(&j->lock);
+	ret = journal_write_alloc(j, w);
+
+	if (ret && j->can_discard) {
+		spin_unlock(&j->lock);
+		bch2_journal_do_discards(j);
+		goto retry_alloc;
+	}
+
+	if (ret)
+		__bch2_journal_debug_to_text(&journal_debug_buf, j);
+
+	/*
+	 * write is allocated, no longer need to account for it in
+	 * bch2_journal_space_available():
+	 */
+	w->sectors = 0;
+
+	/*
+	 * journal entry has been compacted and allocated, recalculate space
+	 * available:
+	 */
+	bch2_journal_space_available(j);
+	spin_unlock(&j->lock);
+
+	if (ret) {
+		bch_err(c, "Unable to allocate journal write:\n%s",
+			journal_debug_buf.buf);
+		printbuf_exit(&journal_debug_buf);
+		goto err;
+	}
+
+	w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
+
+	if (c->opts.nochanges)
+		goto no_io;
+
+	for_each_rw_member(ca, c, i)
+		nr_rw_members++;
+
+	if (nr_rw_members > 1)
+		w->separate_flush = true;
+
+	/*
+	 * Mark journal replicas before we submit the write to guarantee
+	 * recovery will find the journal entries after a crash.
+	 */
+	bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
+				 w->devs_written);
+	ret = bch2_mark_replicas(c, &replicas.e);
+	if (ret)
+		goto err;
+
+	if (!JSET_NO_FLUSH(jset) && w->separate_flush) {
+		for_each_rw_member(ca, c, i) {
+			percpu_ref_get(&ca->io_ref);
+
+			bio = ca->journal.bio;
+			bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
+			bio->bi_end_io		= journal_write_endio;
+			bio->bi_private		= ca;
+			closure_bio_submit(bio, cl);
+		}
+	}
+
+	continue_at(cl, do_journal_write, c->io_complete_wq);
+	return;
+no_io:
+	continue_at(cl, journal_write_done, c->io_complete_wq);
+	return;
+err:
+	bch2_fatal_error(c);
+	continue_at(cl, journal_write_done, c->io_complete_wq);
+}
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
new file mode 100644
index 000000000000..a88d097b13f1
--- /dev/null
+++ b/fs/bcachefs/journal_io.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_IO_H
+#define _BCACHEFS_JOURNAL_IO_H
+
+/*
+ * Only used for holding the journal entries we read in btree_journal_read()
+ * during cache_registration
+ */
+struct journal_replay {
+	struct journal_ptr {
+		bool		csum_good;
+		u8		dev;
+		u32		bucket;
+		u32		bucket_offset;
+		u64		sector;
+	}			ptrs[BCH_REPLICAS_MAX];
+	unsigned		nr_ptrs;
+
+	bool			csum_good;
+	bool			ignore;
+	/* must be last: */
+	struct jset		j;
+};
+
+static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
+					struct jset_entry *entry, unsigned type)
+{
+	while (entry < vstruct_last(jset)) {
+		if (entry->type == type)
+			return entry;
+
+		entry = vstruct_next(entry);
+	}
+
+	return NULL;
+}
+
+#define for_each_jset_entry_type(entry, jset, type)			\
+	for (entry = (jset)->start;					\
+	     (entry = __jset_entry_type_next(jset, entry, type));	\
+	     entry = vstruct_next(entry))
+
+#define jset_entry_for_each_key(_e, _k)					\
+	for (_k = (_e)->start;						\
+	     _k < vstruct_last(_e);					\
+	     _k = bkey_next(_k))
+
+#define for_each_jset_key(k, entry, jset)				\
+	for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)\
+		jset_entry_for_each_key(entry, k)
+
+int bch2_journal_entry_validate(struct bch_fs *, struct jset *,
+				struct jset_entry *, unsigned, int,
+				enum bkey_invalid_flags);
+void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
+				struct jset_entry *);
+
+void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
+			       struct journal_replay *);
+
+int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
+
+void bch2_journal_write(struct closure *);
+
+#endif /* _BCACHEFS_JOURNAL_IO_H */
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
new file mode 100644
index 000000000000..9a584aaaa2eb
--- /dev/null
+++ b/fs/bcachefs/journal_reclaim.c
@@ -0,0 +1,876 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "errcode.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "replicas.h"
+#include "sb-members.h"
+#include "trace.h"
+
+#include <linux/kthread.h>
+#include <linux/sched/mm.h>
+
+/* Free space calculations: */
+
+static unsigned journal_space_from(struct journal_device *ja,
+				   enum journal_space_from from)
+{
+	switch (from) {
+	case journal_space_discarded:
+		return ja->discard_idx;
+	case journal_space_clean_ondisk:
+		return ja->dirty_idx_ondisk;
+	case journal_space_clean:
+		return ja->dirty_idx;
+	default:
+		BUG();
+	}
+}
+
+unsigned bch2_journal_dev_buckets_available(struct journal *j,
+					    struct journal_device *ja,
+					    enum journal_space_from from)
+{
+	unsigned available = (journal_space_from(ja, from) -
+			      ja->cur_idx - 1 + ja->nr) % ja->nr;
+
+	/*
+	 * Don't use the last bucket unless writing the new last_seq
+	 * will make another bucket available:
+	 */
+	if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
+		--available;
+
+	return available;
+}
+
+static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
+{
+	union journal_preres_state old, new;
+	u64 v = atomic64_read(&j->prereserved.counter);
+
+	do {
+		old.v = new.v = v;
+		new.remaining = u64s_remaining;
+	} while ((v = atomic64_cmpxchg(&j->prereserved.counter,
+				       old.v, new.v)) != old.v);
+}
+
+static struct journal_space
+journal_dev_space_available(struct journal *j, struct bch_dev *ca,
+			    enum journal_space_from from)
+{
+	struct journal_device *ja = &ca->journal;
+	unsigned sectors, buckets, unwritten;
+	u64 seq;
+
+	if (from == journal_space_total)
+		return (struct journal_space) {
+			.next_entry	= ca->mi.bucket_size,
+			.total		= ca->mi.bucket_size * ja->nr,
+		};
+
+	buckets = bch2_journal_dev_buckets_available(j, ja, from);
+	sectors = ja->sectors_free;
+
+	/*
+	 * Note that we don't allocate the space for a journal entry
+	 * until we write it out - thus, account for it here:
+	 */
+	for (seq = journal_last_unwritten_seq(j);
+	     seq <= journal_cur_seq(j);
+	     seq++) {
+		unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors;
+
+		if (!unwritten)
+			continue;
+
+		/* entry won't fit on this device, skip: */
+		if (unwritten > ca->mi.bucket_size)
+			continue;
+
+		if (unwritten >= sectors) {
+			if (!buckets) {
+				sectors = 0;
+				break;
+			}
+
+			buckets--;
+			sectors = ca->mi.bucket_size;
+		}
+
+		sectors -= unwritten;
+	}
+
+	if (sectors < ca->mi.bucket_size && buckets) {
+		buckets--;
+		sectors = ca->mi.bucket_size;
+	}
+
+	return (struct journal_space) {
+		.next_entry	= sectors,
+		.total		= sectors + buckets * ca->mi.bucket_size,
+	};
+}
+
+static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want,
+			    enum journal_space_from from)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct bch_dev *ca;
+	unsigned i, pos, nr_devs = 0;
+	struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];
+
+	BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
+
+	rcu_read_lock();
+	for_each_member_device_rcu(ca, c, i,
+				   &c->rw_devs[BCH_DATA_journal]) {
+		if (!ca->journal.nr)
+			continue;
+
+		space = journal_dev_space_available(j, ca, from);
+		if (!space.next_entry)
+			continue;
+
+		for (pos = 0; pos < nr_devs; pos++)
+			if (space.total > dev_space[pos].total)
+				break;
+
+		array_insert_item(dev_space, nr_devs, pos, space);
+	}
+	rcu_read_unlock();
+
+	if (nr_devs < nr_devs_want)
+		return (struct journal_space) { 0, 0 };
+
+	/*
+	 * We sorted largest to smallest, and we want the smallest out of the
+	 * @nr_devs_want largest devices:
+	 */
+	return dev_space[nr_devs_want - 1];
+}
+
+void bch2_journal_space_available(struct journal *j)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct bch_dev *ca;
+	unsigned clean, clean_ondisk, total;
+	s64 u64s_remaining = 0;
+	unsigned max_entry_size	 = min(j->buf[0].buf_size >> 9,
+				       j->buf[1].buf_size >> 9);
+	unsigned i, nr_online = 0, nr_devs_want;
+	bool can_discard = false;
+	int ret = 0;
+
+	lockdep_assert_held(&j->lock);
+
+	rcu_read_lock();
+	for_each_member_device_rcu(ca, c, i,
+				   &c->rw_devs[BCH_DATA_journal]) {
+		struct journal_device *ja = &ca->journal;
+
+		if (!ja->nr)
+			continue;
+
+		while (ja->dirty_idx != ja->cur_idx &&
+		       ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
+			ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
+
+		while (ja->dirty_idx_ondisk != ja->dirty_idx &&
+		       ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
+			ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
+
+		if (ja->discard_idx != ja->dirty_idx_ondisk)
+			can_discard = true;
+
+		max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
+		nr_online++;
+	}
+	rcu_read_unlock();
+
+	j->can_discard = can_discard;
+
+	if (nr_online < c->opts.metadata_replicas_required) {
+		ret = JOURNAL_ERR_insufficient_devices;
+		goto out;
+	}
+
+	nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);
+
+	for (i = 0; i < journal_space_nr; i++)
+		j->space[i] = __journal_space_available(j, nr_devs_want, i);
+
+	clean_ondisk	= j->space[journal_space_clean_ondisk].total;
+	clean		= j->space[journal_space_clean].total;
+	total		= j->space[journal_space_total].total;
+
+	if (!j->space[journal_space_discarded].next_entry)
+		ret = JOURNAL_ERR_journal_full;
+
+	if ((j->space[journal_space_clean_ondisk].next_entry <
+	     j->space[journal_space_clean_ondisk].total) &&
+	    (clean - clean_ondisk <= total / 8) &&
+	    (clean_ondisk * 2 > clean))
+		set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+	else
+		clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+
+	u64s_remaining  = (u64) clean << 6;
+	u64s_remaining -= (u64) total << 3;
+	u64s_remaining = max(0LL, u64s_remaining);
+	u64s_remaining /= 4;
+	u64s_remaining = min_t(u64, u64s_remaining, U32_MAX);
+out:
+	j->cur_entry_sectors	= !ret ? j->space[journal_space_discarded].next_entry : 0;
+	j->cur_entry_error	= ret;
+	journal_set_remaining(j, u64s_remaining);
+	journal_set_watermark(j);
+
+	if (!ret)
+		journal_wake(j);
+}
+
+/* Discards - last part of journal reclaim: */
+
+static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+{
+	bool ret;
+
+	spin_lock(&j->lock);
+	ret = ja->discard_idx != ja->dirty_idx_ondisk;
+	spin_unlock(&j->lock);
+
+	return ret;
+}
+
+/*
+ * Advance ja->discard_idx as long as it points to buckets that are no longer
+ * dirty, issuing discards if necessary:
+ */
+void bch2_journal_do_discards(struct journal *j)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct bch_dev *ca;
+	unsigned iter;
+
+	mutex_lock(&j->discard_lock);
+
+	for_each_rw_member(ca, c, iter) {
+		struct journal_device *ja = &ca->journal;
+
+		while (should_discard_bucket(j, ja)) {
+			if (!c->opts.nochanges &&
+			    ca->mi.discard &&
+			    bdev_max_discard_sectors(ca->disk_sb.bdev))
+				blkdev_issue_discard(ca->disk_sb.bdev,
+					bucket_to_sector(ca,
+						ja->buckets[ja->discard_idx]),
+					ca->mi.bucket_size, GFP_NOFS);
+
+			spin_lock(&j->lock);
+			ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
+
+			bch2_journal_space_available(j);
+			spin_unlock(&j->lock);
+		}
+	}
+
+	mutex_unlock(&j->discard_lock);
+}
+
+/*
+ * Journal entry pinning - machinery for holding a reference on a given journal
+ * entry, holding it open to ensure it gets replayed during recovery:
+ */
+
+void bch2_journal_reclaim_fast(struct journal *j)
+{
+	bool popped = false;
+
+	lockdep_assert_held(&j->lock);
+
+	/*
+	 * Unpin journal entries whose reference counts reached zero, meaning
+	 * all btree nodes got written out
+	 */
+	while (!fifo_empty(&j->pin) &&
+	       !atomic_read(&fifo_peek_front(&j->pin).count)) {
+		j->pin.front++;
+		popped = true;
+	}
+
+	if (popped)
+		bch2_journal_space_available(j);
+}
+
+bool __bch2_journal_pin_put(struct journal *j, u64 seq)
+{
+	struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
+	return atomic_dec_and_test(&pin_list->count);
+}
+
+void bch2_journal_pin_put(struct journal *j, u64 seq)
+{
+	if (__bch2_journal_pin_put(j, seq)) {
+		spin_lock(&j->lock);
+		bch2_journal_reclaim_fast(j);
+		spin_unlock(&j->lock);
+	}
+}
+
+static inline bool __journal_pin_drop(struct journal *j,
+				      struct journal_entry_pin *pin)
+{
+	struct journal_entry_pin_list *pin_list;
+
+	if (!journal_pin_active(pin))
+		return false;
+
+	if (j->flush_in_progress == pin)
+		j->flush_in_progress_dropped = true;
+
+	pin_list = journal_seq_pin(j, pin->seq);
+	pin->seq = 0;
+	list_del_init(&pin->list);
+
+	/*
+	 * Unpinning a journal entry may make journal_next_bucket() succeed, if
+	 * writing a new last_seq will now make another bucket available:
+	 */
+	return atomic_dec_and_test(&pin_list->count) &&
+		pin_list == &fifo_peek_front(&j->pin);
+}
+
+void bch2_journal_pin_drop(struct journal *j,
+			   struct journal_entry_pin *pin)
+{
+	spin_lock(&j->lock);
+	if (__journal_pin_drop(j, pin))
+		bch2_journal_reclaim_fast(j);
+	spin_unlock(&j->lock);
+}
+
+static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
+{
+	if (fn == bch2_btree_node_flush0 ||
+	    fn == bch2_btree_node_flush1)
+		return JOURNAL_PIN_btree;
+	else if (fn == bch2_btree_key_cache_journal_flush)
+		return JOURNAL_PIN_key_cache;
+	else
+		return JOURNAL_PIN_other;
+}
+
+void bch2_journal_pin_set(struct journal *j, u64 seq,
+			  struct journal_entry_pin *pin,
+			  journal_pin_flush_fn flush_fn)
+{
+	struct journal_entry_pin_list *pin_list;
+	bool reclaim;
+
+	spin_lock(&j->lock);
+
+	if (seq < journal_last_seq(j)) {
+		/*
+		 * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
+		 * the src pin - with the pin dropped, the entry to pin might no
+		 * longer exist, but that means there's no longer anything to
+		 * copy and we can bail out here:
+		 */
+		spin_unlock(&j->lock);
+		return;
+	}
+
+	pin_list = journal_seq_pin(j, seq);
+
+	reclaim = __journal_pin_drop(j, pin);
+
+	atomic_inc(&pin_list->count);
+	pin->seq	= seq;
+	pin->flush	= flush_fn;
+
+	if (flush_fn)
+		list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]);
+	else
+		list_add(&pin->list, &pin_list->flushed);
+
+	if (reclaim)
+		bch2_journal_reclaim_fast(j);
+	spin_unlock(&j->lock);
+
+	/*
+	 * If the journal is currently full, we might want to call flush_fn
+	 * immediately:
+	 */
+	journal_wake(j);
+}
+
+/**
+ * bch2_journal_pin_flush: ensure journal pin callback is no longer running
+ * @j:		journal object
+ * @pin:	pin to flush
+ */
+void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
+{
+	BUG_ON(journal_pin_active(pin));
+
+	wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
+}
+
+/*
+ * Journal reclaim: flush references to open journal entries to reclaim space in
+ * the journal
+ *
+ * May be done by the journal code in the background as needed to free up space
+ * for more journal entries, or as part of doing a clean shutdown, or to migrate
+ * data off of a specific device:
+ */
+
+static struct journal_entry_pin *
+journal_get_next_pin(struct journal *j,
+		     u64 seq_to_flush,
+		     unsigned allowed_below_seq,
+		     unsigned allowed_above_seq,
+		     u64 *seq)
+{
+	struct journal_entry_pin_list *pin_list;
+	struct journal_entry_pin *ret = NULL;
+	unsigned i;
+
+	fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
+		if (*seq > seq_to_flush && !allowed_above_seq)
+			break;
+
+		for (i = 0; i < JOURNAL_PIN_NR; i++)
+			if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) ||
+			    ((1U << i) & allowed_above_seq)) {
+				ret = list_first_entry_or_null(&pin_list->list[i],
+					struct journal_entry_pin, list);
+				if (ret)
+					return ret;
+			}
+	}
+
+	return NULL;
+}
+
+/* Returns the number of pins successfully flushed (nonzero if we did work) */
+static size_t journal_flush_pins(struct journal *j,
+				 u64 seq_to_flush,
+				 unsigned allowed_below_seq,
+				 unsigned allowed_above_seq,
+				 unsigned min_any,
+				 unsigned min_key_cache)
+{
+	struct journal_entry_pin *pin;
+	size_t nr_flushed = 0;
+	journal_pin_flush_fn flush_fn;
+	u64 seq;
+	int err;
+
+	lockdep_assert_held(&j->reclaim_lock);
+
+	while (1) {
+		unsigned allowed_above = allowed_above_seq;
+		unsigned allowed_below = allowed_below_seq;
+
+		if (min_any) {
+			allowed_above |= ~0;
+			allowed_below |= ~0;
+		}
+
+		if (min_key_cache) {
+			allowed_above |= 1U << JOURNAL_PIN_key_cache;
+			allowed_below |= 1U << JOURNAL_PIN_key_cache;
+		}
+
+		cond_resched();
+
+		j->last_flushed = jiffies;
+
+		spin_lock(&j->lock);
+		pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq);
+		if (pin) {
+			BUG_ON(j->flush_in_progress);
+			j->flush_in_progress = pin;
+			j->flush_in_progress_dropped = false;
+			flush_fn = pin->flush;
+		}
+		spin_unlock(&j->lock);
+
+		if (!pin)
+			break;
+
+		if (min_key_cache && flush_fn == bch2_btree_key_cache_journal_flush)
+			min_key_cache--;
+
+		if (min_any)
+			min_any--;
+
+		err = flush_fn(j, pin, seq);
+
+		spin_lock(&j->lock);
+		/* Pin might have been dropped or rearmed: */
+		if (likely(!err && !j->flush_in_progress_dropped))
+			list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
+		j->flush_in_progress = NULL;
+		j->flush_in_progress_dropped = false;
+		spin_unlock(&j->lock);
+
+		wake_up(&j->pin_flush_wait);
+
+		if (err)
+			break;
+
+		nr_flushed++;
+	}
+
+	return nr_flushed;
+}
+
+static u64 journal_seq_to_flush(struct journal *j)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct bch_dev *ca;
+	u64 seq_to_flush = 0;
+	unsigned iter;
+
+	spin_lock(&j->lock);
+
+	for_each_rw_member(ca, c, iter) {
+		struct journal_device *ja = &ca->journal;
+		unsigned nr_buckets, bucket_to_flush;
+
+		if (!ja->nr)
+			continue;
+
+		/* Try to keep the journal at most half full: */
+		nr_buckets = ja->nr / 2;
+
+		/* And include pre-reservations: */
+		nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
+					   (ca->mi.bucket_size << 6) -
+					   journal_entry_overhead(j));
+
+		nr_buckets = min(nr_buckets, ja->nr);
+
+		bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
+		seq_to_flush = max(seq_to_flush,
+				   ja->bucket_seq[bucket_to_flush]);
+	}
+
+	/* Also flush if the pin fifo is more than half full */
+	seq_to_flush = max_t(s64, seq_to_flush,
+			     (s64) journal_cur_seq(j) -
+			     (j->pin.size >> 1));
+	spin_unlock(&j->lock);
+
+	return seq_to_flush;
+}
+
+/**
+ * __bch2_journal_reclaim - free up journal buckets
+ * @j:		journal object
+ * @direct:	direct or background reclaim?
+ * @kicked:	requested to run since we last ran?
+ * Returns:	0 on success, or -EIO if the journal has been shut down
+ *
+ * Background journal reclaim writes out btree nodes. It should be run
+ * early enough so that we never completely run out of journal buckets.
+ *
+ * High watermarks for triggering background reclaim:
+ * - FIFO has fewer than 512 entries left
+ * - fewer than 25% journal buckets free
+ *
+ * Background reclaim runs until low watermarks are reached:
+ * - FIFO has more than 1024 entries left
+ * - more than 50% journal buckets free
+ *
+ * As long as a reclaim can complete in the time it takes to fill up
+ * 512 journal entries or 25% of all journal buckets, then
+ * journal_next_bucket() should not stall.
+ */
+static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	bool kthread = (current->flags & PF_KTHREAD) != 0;
+	u64 seq_to_flush;
+	size_t min_nr, min_key_cache, nr_flushed;
+	unsigned flags;
+	int ret = 0;
+
+	/*
+	 * We can't invoke memory reclaim while holding the reclaim_lock -
+	 * journal reclaim is required to make progress for memory reclaim
+	 * (cleaning the caches), so we can't get stuck in memory reclaim while
+	 * we're holding the reclaim lock:
+	 */
+	lockdep_assert_held(&j->reclaim_lock);
+	flags = memalloc_noreclaim_save();
+
+	do {
+		if (kthread && kthread_should_stop())
+			break;
+
+		if (bch2_journal_error(j)) {
+			ret = -EIO;
+			break;
+		}
+
+		bch2_journal_do_discards(j);
+
+		seq_to_flush = journal_seq_to_flush(j);
+		min_nr = 0;
+
+		/*
+		 * If it's been longer than j->reclaim_delay_ms since we last flushed,
+		 * make sure to flush at least one journal pin:
+		 */
+		if (time_after(jiffies, j->last_flushed +
+			       msecs_to_jiffies(c->opts.journal_reclaim_delay)))
+			min_nr = 1;
+
+		if (j->prereserved.reserved * 4 > j->prereserved.remaining)
+			min_nr = 1;
+
+		if (fifo_free(&j->pin) <= 32)
+			min_nr = 1;
+
+		if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
+			min_nr = 1;
+
+		min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
+
+		trace_and_count(c, journal_reclaim_start, c,
+				direct, kicked,
+				min_nr, min_key_cache,
+				j->prereserved.reserved,
+				j->prereserved.remaining,
+				atomic_read(&c->btree_cache.dirty),
+				c->btree_cache.used,
+				atomic_long_read(&c->btree_key_cache.nr_dirty),
+				atomic_long_read(&c->btree_key_cache.nr_keys));
+
+		nr_flushed = journal_flush_pins(j, seq_to_flush,
+						~0, 0,
+						min_nr, min_key_cache);
+
+		if (direct)
+			j->nr_direct_reclaim += nr_flushed;
+		else
+			j->nr_background_reclaim += nr_flushed;
+		trace_and_count(c, journal_reclaim_finish, c, nr_flushed);
+
+		if (nr_flushed)
+			wake_up(&j->reclaim_wait);
+	} while ((min_nr || min_key_cache) && nr_flushed && !direct);
+
+	memalloc_noreclaim_restore(flags);
+
+	return ret;
+}
+
+int bch2_journal_reclaim(struct journal *j)
+{
+	return __bch2_journal_reclaim(j, true, true);
+}
+
+static int bch2_journal_reclaim_thread(void *arg)
+{
+	struct journal *j = arg;
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	unsigned long delay, now;
+	bool journal_empty;
+	int ret = 0;
+
+	set_freezable();
+
+	j->last_flushed = jiffies;
+
+	while (!ret && !kthread_should_stop()) {
+		bool kicked = j->reclaim_kicked;
+
+		j->reclaim_kicked = false;
+
+		mutex_lock(&j->reclaim_lock);
+		ret = __bch2_journal_reclaim(j, false, kicked);
+		mutex_unlock(&j->reclaim_lock);
+
+		now = jiffies;
+		delay = msecs_to_jiffies(c->opts.journal_reclaim_delay);
+		j->next_reclaim = j->last_flushed + delay;
+
+		if (!time_in_range(j->next_reclaim, now, now + delay))
+			j->next_reclaim = now + delay;
+
+		while (1) {
+			set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+			if (kthread_should_stop())
+				break;
+			if (j->reclaim_kicked)
+				break;
+
+			spin_lock(&j->lock);
+			journal_empty = fifo_empty(&j->pin);
+			spin_unlock(&j->lock);
+
+			if (journal_empty)
+				schedule();
+			else if (time_after(j->next_reclaim, jiffies))
+				schedule_timeout(j->next_reclaim - jiffies);
+			else
+				break;
+		}
+		__set_current_state(TASK_RUNNING);
+	}
+
+	return 0;
+}
+
+void bch2_journal_reclaim_stop(struct journal *j)
+{
+	struct task_struct *p = j->reclaim_thread;
+
+	j->reclaim_thread = NULL;
+
+	if (p) {
+		kthread_stop(p);
+		put_task_struct(p);
+	}
+}
+
+int bch2_journal_reclaim_start(struct journal *j)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct task_struct *p;
+	int ret;
+
+	if (j->reclaim_thread)
+		return 0;
+
+	p = kthread_create(bch2_journal_reclaim_thread, j,
+			   "bch-reclaim/%s", c->name);
+	ret = PTR_ERR_OR_ZERO(p);
+	if (ret) {
+		bch_err_msg(c, ret, "creating journal reclaim thread");
+		return ret;
+	}
+
+	get_task_struct(p);
+	j->reclaim_thread = p;
+	wake_up_process(p);
+	return 0;
+}
+
+static int journal_flush_done(struct journal *j, u64 seq_to_flush,
+			      bool *did_work)
+{
+	int ret;
+
+	ret = bch2_journal_error(j);
+	if (ret)
+		return ret;
+
+	mutex_lock(&j->reclaim_lock);
+
+	if (journal_flush_pins(j, seq_to_flush,
+			       (1U << JOURNAL_PIN_key_cache)|
+			       (1U << JOURNAL_PIN_other), 0, 0, 0) ||
+	    journal_flush_pins(j, seq_to_flush,
+			       (1U << JOURNAL_PIN_btree), 0, 0, 0))
+		*did_work = true;
+
+	spin_lock(&j->lock);
+	/*
+	 * If journal replay hasn't completed, the unreplayed journal entries
+	 * hold refs on their corresponding sequence numbers
+	 */
+	ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
+		journal_last_seq(j) > seq_to_flush ||
+		!fifo_used(&j->pin);
+
+	spin_unlock(&j->lock);
+	mutex_unlock(&j->reclaim_lock);
+
+	return ret;
+}
+
+bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
+{
+	bool did_work = false;
+
+	if (!test_bit(JOURNAL_STARTED, &j->flags))
+		return false;
+
+	closure_wait_event(&j->async_wait,
+		journal_flush_done(j, seq_to_flush, &did_work));
+
+	return did_work;
+}
+
+int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
+{
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct journal_entry_pin_list *p;
+	u64 iter, seq = 0;
+	int ret = 0;
+
+	spin_lock(&j->lock);
+	fifo_for_each_entry_ptr(p, &j->pin, iter)
+		if (dev_idx >= 0
+		    ? bch2_dev_list_has_dev(p->devs, dev_idx)
+		    : p->devs.nr < c->opts.metadata_replicas)
+			seq = iter;
+	spin_unlock(&j->lock);
+
+	bch2_journal_flush_pins(j, seq);
+
+	ret = bch2_journal_error(j);
+	if (ret)
+		return ret;
+
+	mutex_lock(&c->replicas_gc_lock);
+	bch2_replicas_gc_start(c, 1 << BCH_DATA_journal);
+
+	/*
+	 * Now that we've populated replicas_gc, write to the journal to mark
+	 * active journal devices. This handles the case where the journal might
+	 * be empty. Otherwise we could clear all journal replicas and
+	 * temporarily put the fs into an unrecoverable state. Journal recovery
+	 * expects to find devices marked for journal data on unclean mount.
+	 */
+	ret = bch2_journal_meta(&c->journal);
+	if (ret)
+		goto err;
+
+	seq = 0;
+	spin_lock(&j->lock);
+	while (!ret) {
+		struct bch_replicas_padded replicas;
+
+		seq = max(seq, journal_last_seq(j));
+		if (seq >= j->pin.back)
+			break;
+		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
+					 journal_seq_pin(j, seq)->devs);
+		seq++;
+
+		spin_unlock(&j->lock);
+		ret = bch2_mark_replicas(c, &replicas.e);
+		spin_lock(&j->lock);
+	}
+	spin_unlock(&j->lock);
+err:
+	ret = bch2_replicas_gc_end(c, ret);
+	mutex_unlock(&c->replicas_gc_lock);
+
+	return ret;
+}
diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h
new file mode 100644
index 000000000000..494d1a6eddb0
--- /dev/null
+++ b/fs/bcachefs/journal_reclaim.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_RECLAIM_H
+#define _BCACHEFS_JOURNAL_RECLAIM_H
+
+#define JOURNAL_PIN	(32 * 1024)
+
+static inline void journal_reclaim_kick(struct journal *j)
+{
+	struct task_struct *p = READ_ONCE(j->reclaim_thread);
+
+	j->reclaim_kicked = true;
+	if (p)
+		wake_up_process(p);
+}
+
+unsigned bch2_journal_dev_buckets_available(struct journal *,
+					    struct journal_device *,
+					    enum journal_space_from);
+void bch2_journal_space_available(struct journal *);
+
+static inline bool journal_pin_active(struct journal_entry_pin *pin)
+{
+	return pin->seq != 0;
+}
+
+static inline struct journal_entry_pin_list *
+journal_seq_pin(struct journal *j, u64 seq)
+{
+	EBUG_ON(seq < j->pin.front || seq >= j->pin.back);
+
+	return &j->pin.data[seq & j->pin.mask];
+}
+
+void bch2_journal_reclaim_fast(struct journal *);
+bool __bch2_journal_pin_put(struct journal *, u64);
+void bch2_journal_pin_put(struct journal *, u64);
+void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
+
+void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *,
+			  journal_pin_flush_fn);
+
+static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
+					struct journal_entry_pin *pin,
+					journal_pin_flush_fn flush_fn)
+{
+	if (unlikely(!journal_pin_active(pin) || pin->seq > seq))
+		bch2_journal_pin_set(j, seq, pin, flush_fn);
+}
+
+static inline void bch2_journal_pin_copy(struct journal *j,
+					 struct journal_entry_pin *dst,
+					 struct journal_entry_pin *src,
+					 journal_pin_flush_fn flush_fn)
+{
+	/* Guard against racing with journal_pin_drop(src): */
+	u64 seq = READ_ONCE(src->seq);
+
+	if (seq)
+		bch2_journal_pin_add(j, seq, dst, flush_fn);
+}
+
+static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
+					   struct journal_entry_pin *pin,
+					   journal_pin_flush_fn flush_fn)
+{
+	if (unlikely(!journal_pin_active(pin) || pin->seq < seq))
+		bch2_journal_pin_set(j, seq, pin, flush_fn);
+}
+
+void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
+
+void bch2_journal_do_discards(struct journal *);
+int bch2_journal_reclaim(struct journal *);
+
+void bch2_journal_reclaim_stop(struct journal *);
+int bch2_journal_reclaim_start(struct journal *);
+
+bool bch2_journal_flush_pins(struct journal *, u64);
+
+static inline bool bch2_journal_flush_all_pins(struct journal *j)
+{
+	return bch2_journal_flush_pins(j, U64_MAX);
+}
+
+int bch2_journal_flush_device_pins(struct journal *, int);
+
+#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */
diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c
new file mode 100644
index 000000000000..ae4fb8c3a2bc
--- /dev/null
+++ b/fs/bcachefs/journal_sb.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "journal_sb.h"
+#include "darray.h"
+
+#include <linux/sort.h>
+
+/* BCH_SB_FIELD_journal: */
+
+/* Comparator handed to sort(): compares the two u64s pointed to with cmp_int(). */
+static int u64_cmp(const void *_l, const void *_r)
+{
+	u64 lhs = *(const u64 *) _l, rhs = *(const u64 *) _r;
+
+	return cmp_int(lhs, rhs);
+}
+
+/*
+ * Validate BCH_SB_FIELD_journal: buckets must be nonzero, on-device and unique; fills @err.
+ */
+static int bch2_sb_journal_validate(struct bch_sb *sb, struct bch_sb_field *f,
+				    struct printbuf *err)
+{
+	struct bch_sb_field_journal *journal = field_to_type(f, journal);
+	struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
+	unsigned idx, nr = bch2_nr_journal_buckets(journal);
+	int ret = -BCH_ERR_invalid_sb_journal;
+	u64 *sorted;
+
+	if (!nr)
+		return 0;
+
+	sorted = kmalloc_array(nr, sizeof(u64), GFP_KERNEL);
+	if (!sorted)
+		return -BCH_ERR_ENOMEM_sb_journal_validate;
+
+	/* Check a sorted CPU-endian copy: first/last give the extremes, duplicates are adjacent */
+	for (idx = 0; idx < nr; idx++)
+		sorted[idx] = le64_to_cpu(journal->buckets[idx]);
+	sort(sorted, nr, sizeof(u64), u64_cmp, NULL);
+
+	if (!sorted[0]) {
+		prt_printf(err, "journal bucket at sector 0");
+		goto err;
+	}
+
+	if (sorted[0] < le16_to_cpu(m.first_bucket)) {
+		prt_printf(err, "journal bucket %llu before first bucket %u",
+		       sorted[0], le16_to_cpu(m.first_bucket));
+		goto err;
+	}
+
+	if (sorted[nr - 1] >= le64_to_cpu(m.nbuckets)) {
+		prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
+		       sorted[nr - 1], le64_to_cpu(m.nbuckets));
+		goto err;
+	}
+
+	for (idx = 1; idx < nr; idx++)
+		if (sorted[idx - 1] == sorted[idx]) {
+			prt_printf(err, "duplicate journal buckets %llu", sorted[idx]);
+			goto err;
+		}
+
+	ret = 0;
+err:
+	kfree(sorted);
+	return ret;
+}
+
+static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb,
+				    struct bch_sb_field *f)
+{
+	struct bch_sb_field_journal *journal = field_to_type(f, journal);
+	unsigned i, nr = bch2_nr_journal_buckets(journal);
+
+	prt_printf(out, "Buckets: ");
+	for (i = 0; i < nr; i++)
+		prt_printf(out, " %llu", le64_to_cpu(journal->buckets[i]));
+	prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_journal = {
+	.validate	= bch2_sb_journal_validate,
+	.to_text	= bch2_sb_journal_to_text,
+};
+
+struct u64_range {
+	u64	start;
+	u64	end;
+};
+
+/* Comparator handed to sort(): compares ranges by ->start only, via cmp_int(). */
+static int u64_range_cmp(const void *_l, const void *_r)
+{
+	const struct u64_range *lhs = _l, *rhs = _r;
+
+	return cmp_int(lhs->start, rhs->start);
+}
+
+/* Validate BCH_SB_FIELD_journal_v2: ranges must be sane, on-device and disjoint; fills @err */
+static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f,
+				       struct printbuf *err)
+{
+	struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
+	struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
+	int ret = -BCH_ERR_invalid_sb_journal;
+	unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal);
+	struct u64_range *b;
+
+	if (!nr)
+		return 0;
+
+	b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL);
+	if (!b)
+		return -BCH_ERR_ENOMEM_sb_journal_v2_validate;
+
+	for (i = 0; i < nr; i++) {
+		b[i].start = le64_to_cpu(journal->d[i].start);
+		b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr);
+		/* Superblock contents are untrusted: reject nr == 0 and u64 wraparound */
+		if (b[i].end <= b[i].start) {
+			prt_printf(err, "journal buckets entry with bad nr: %llu+%llu",
+				   b[i].start, le64_to_cpu(journal->d[i].nr));
+			goto err;
+		}
+	}
+	sort(b, nr, sizeof(*b), u64_range_cmp, NULL);
+
+	if (!b[0].start) {
+		prt_printf(err, "journal bucket at sector 0");
+		goto err;
+	}
+	if (b[0].start < le16_to_cpu(m.first_bucket)) {
+		prt_printf(err, "journal bucket %llu before first bucket %u",
+		       b[0].start, le16_to_cpu(m.first_bucket));
+		goto err;
+	}
+	if (b[nr - 1].end > le64_to_cpu(m.nbuckets)) {
+		prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
+		       b[nr - 1].end - 1, le64_to_cpu(m.nbuckets));
+		goto err;
+	}
+
+	for (i = 0; i + 1 < nr; i++)
+		if (b[i].end > b[i + 1].start) {
+			prt_printf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu",
+			       b[i].start, b[i].end, b[i + 1].start, b[i + 1].end);
+			goto err;
+		}
+
+	ret = 0;
+err:
+	kfree(b);
+	return ret;
+}
+
+static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb,
+				    struct bch_sb_field *f)
+{
+	struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
+	unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal);
+
+	prt_printf(out, "Buckets: ");
+	for (i = 0; i < nr; i++)
+		prt_printf(out, " %llu-%llu",
+		       le64_to_cpu(journal->d[i].start),
+		       le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr));
+	prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = {
+	.validate	= bch2_sb_journal_v2_validate,
+	.to_text	= bch2_sb_journal_v2_to_text,
+};
+
+/*
+ * Write @ca's journal buckets to its superblock as journal_v2 (start, nr) ranges.
+ */
+int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca, u64 *buckets, unsigned nr)
+{
+	struct bch_sb_field_journal_v2 *j;
+	unsigned i, dst = 0, nr_compacted = 1;
+
+	if (c)
+		lockdep_assert_held(&c->sb_lock);
+
+	if (!nr) {
+		bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
+		bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2);
+		return 0;
+	}
+
+	/* Every break in the run of consecutive bucket numbers starts a new range */
+	for (i = 1; i < nr; i++)
+		nr_compacted += buckets[i - 1] + 1 != buckets[i];
+
+	j = bch2_sb_field_resize(&ca->disk_sb, journal_v2,
+			 (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64));
+	if (!j)
+		return -BCH_ERR_ENOSPC_sb_journal;
+
+	bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
+
+	j->d[dst].start = cpu_to_le64(buckets[0]);
+	j->d[dst].nr	= cpu_to_le64(1);
+
+	for (i = 1; i < nr; i++) {
+		if (buckets[i] != buckets[i - 1] + 1) {
+			j->d[++dst].start = cpu_to_le64(buckets[i]);
+			j->d[dst].nr	  = cpu_to_le64(0);
+		}
+		le64_add_cpu(&j->d[dst].nr, 1);
+	}
+
+	BUG_ON(dst + 1 != nr_compacted);
+	return 0;
+}
diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h
new file mode 100644
index 000000000000..ba40a7e8d90a
--- /dev/null
+++ b/fs/bcachefs/journal_sb.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_SB_H
+#define _BCACHEFS_JOURNAL_SB_H
+
+#include "super-io.h"
+#include "vstructs.h"
+
+/* Entry counts of the v1/v2 journal superblock fields; 0 when the field is absent (NULL) */
+static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
+{
+	return j ? (__le64 *) vstruct_end(&j->field) - j->buckets : 0;
+}
+
+static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j)
+{
+	return j ? (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0] : 0;
+}
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal;
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2;
+
+int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *, u64 *, unsigned);
+
+#endif /* _BCACHEFS_JOURNAL_SB_H */
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
new file mode 100644
index 000000000000..f9d9aa95bf3a
--- /dev/null
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -0,0 +1,320 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_iter.h"
+#include "eytzinger.h"
+#include "journal_seq_blacklist.h"
+#include "super-io.h"
+
+/*
+ * journal_seq_blacklist machinery:
+ *
+ * To guarantee order of btree updates after a crash, we need to detect when a
+ * btree node entry (bset) is newer than the newest journal entry that was
+ * successfully written, and ignore it - effectively ignoring any btree updates
+ * that didn't make it into the journal.
+ *
+ * If we didn't do this, we might have two btree nodes, a and b, both with
+ * updates that weren't written to the journal yet: if b was updated after a,
+ * but b was flushed and not a - oops; on recovery we'll find that the updates
+ * to b happened, but not the updates to a that happened before it.
+ *
+ * Ignoring bsets that are newer than the newest journal entry is always safe,
+ * because everything they contain will also have been journalled - and must
+ * still be present in the journal on disk until a journal entry has been
+ * written _after_ that bset was written.
+ *
+ * To accomplish this, bsets record the newest journal sequence number they
+ * contain updates for; then, on startup, the btree code queries the journal
+ * code to ask "Is this sequence number newer than the newest journal entry? If
+ * so, ignore it."
+ *
+ * When this happens, we must blacklist that journal sequence number: the
+ * journal must not write any entries with that sequence number, and it must
+ * record that it was blacklisted so that a) on recovery we don't think we have
+ * missing journal entries and b) so that the btree code continues to ignore
+ * that bset, until that btree node is rewritten.
+ */
+
+/* Size, in u64s, of a journal_seq_blacklist superblock field holding @nr entries */
+static unsigned sb_blacklist_u64s(unsigned nr)
+{
+	return (sizeof(struct bch_sb_field_journal_seq_blacklist) +
+		sizeof(struct journal_seq_blacklist_entry) * nr) / sizeof(u64);
+}
+
+/* Fold entry @i + 1 into entry @i if they touch or overlap; returns the (resized) field */
+static struct bch_sb_field_journal_seq_blacklist *
+blacklist_entry_try_merge(struct bch_fs *c,
+			  struct bch_sb_field_journal_seq_blacklist *bl,
+			  unsigned i)
+{
+	unsigned nr = blacklist_nr_entries(bl);
+
+	if (le64_to_cpu(bl->start[i].end) < le64_to_cpu(bl->start[i + 1].start))
+		return bl;
+
+	/* Entry @i may already reach past entry @i + 1, so keep the larger end */
+	bl->start[i].end = cpu_to_le64(max(le64_to_cpu(bl->start[i].end),
+					   le64_to_cpu(bl->start[i + 1].end)));
+	/* Drop entry @i + 1 by shifting the tail down; merged entry @i stays put */
+	--nr;
+	memmove(&bl->start[i + 1], &bl->start[i + 2], sizeof(bl->start[0]) * (nr - i - 1));
+	bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
+				  sb_blacklist_u64s(nr));
+	BUG_ON(!bl);
+	return bl;
+}
+
+/* True unless [@start, @end] lies wholly before or wholly after @e's [start, end] */
+static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e, u64 start, u64 end)
+{
+	return end >= le64_to_cpu(e->start) && le64_to_cpu(e->end) >= start;
+}
+
+int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
+{
+	struct bch_sb_field_journal_seq_blacklist *bl;
+	unsigned i, nr;
+	int ret = 0;
+
+	mutex_lock(&c->sb_lock);
+	bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
+	nr = blacklist_nr_entries(bl);
+
+	for (i = 0; i < nr; i++) {
+		struct journal_seq_blacklist_entry *e =
+			bl->start + i;
+
+		if (bl_entry_contig_or_overlaps(e, start, end)) {
+			e->start = cpu_to_le64(min(start, le64_to_cpu(e->start)));
+			e->end	= cpu_to_le64(max(end, le64_to_cpu(e->end)));
+
+			if (i + 1 < nr)
+				bl = blacklist_entry_try_merge(c,
+							bl, i);
+			if (i)
+				bl = blacklist_entry_try_merge(c,
+							bl, i - 1);
+			goto out_write_sb;
+		}
+	}
+
+	bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
+				  sb_blacklist_u64s(nr + 1));
+	if (!bl) {
+		ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist;
+		goto out;
+	}
+
+	bl->start[nr].start	= cpu_to_le64(start);
+	bl->start[nr].end	= cpu_to_le64(end);
+out_write_sb:
+	c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3);
+
+	ret = bch2_write_super(c);
+out:
+	mutex_unlock(&c->sb_lock);
+
+	return ret ?: bch2_blacklist_table_initialize(c);
+}
+
+/* eytzinger comparator: compares table entries by ->start only; @size is not used */
+static int journal_seq_blacklist_table_cmp(const void *_l,
+					   const void *_r, size_t size)
+{
+	const struct journal_seq_blacklist_table_entry *lhs = _l, *rhs = _r;
+
+	return cmp_int(lhs->start, rhs->start);
+}
+
+/* True if @seq lies in a blacklisted range [start, end); @dirty also marks that entry dirty */
+bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq,
+				     bool dirty)
+{
+	struct journal_seq_blacklist_table *table = c->journal_seq_blacklist_table;
+	struct journal_seq_blacklist_table_entry key = { .start = seq };
+	int pos;
+
+	if (!table)
+		return false;
+
+	/* The BUG_ON below relies on find_le only returning entries with ->start <= @seq */
+	pos = eytzinger0_find_le(table->entries, table->nr, sizeof(table->entries[0]),
+				 journal_seq_blacklist_table_cmp, &key);
+	if (pos < 0)
+		return false;
+
+	BUG_ON(table->entries[pos].start > seq);
+
+	if (seq >= table->entries[pos].end)
+		return false;
+
+	if (dirty)
+		table->entries[pos].dirty = true;
+	return true;
+}
+
+int bch2_blacklist_table_initialize(struct bch_fs *c)
+{
+	struct bch_sb_field_journal_seq_blacklist *bl =
+		bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
+	struct journal_seq_blacklist_table *t;
+	unsigned i, nr = blacklist_nr_entries(bl);
+
+	if (!bl)
+		return 0;
+
+	t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr,
+		    GFP_KERNEL);
+	if (!t)
+		return -BCH_ERR_ENOMEM_blacklist_table_init;
+
+	t->nr = nr;
+
+	for (i = 0; i < nr; i++) {
+		t->entries[i].start	= le64_to_cpu(bl->start[i].start);
+		t->entries[i].end	= le64_to_cpu(bl->start[i].end);
+	}
+
+	eytzinger0_sort(t->entries,
+			t->nr,
+			sizeof(t->entries[0]),
+			journal_seq_blacklist_table_cmp,
+			NULL);
+
+	kfree(c->journal_seq_blacklist_table);
+	c->journal_seq_blacklist_table = t;
+	return 0;
+}
+
+static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb,
+						  struct bch_sb_field *f,
+						  struct printbuf *err)
+{
+	struct bch_sb_field_journal_seq_blacklist *bl =
+		field_to_type(f, journal_seq_blacklist);
+	unsigned i, nr = blacklist_nr_entries(bl);
+
+	for (i = 0; i < nr; i++) {
+		struct journal_seq_blacklist_entry *e = bl->start + i;
+
+		if (le64_to_cpu(e->start) >=
+		    le64_to_cpu(e->end)) {
+			prt_printf(err, "entry %u start >= end (%llu >= %llu)",
+			       i, le64_to_cpu(e->start), le64_to_cpu(e->end));
+			return -BCH_ERR_invalid_sb_journal_seq_blacklist;
+		}
+
+		if (i + 1 < nr &&
+		    le64_to_cpu(e[0].end) >
+		    le64_to_cpu(e[1].start)) {
+			prt_printf(err, "entry %u out of order with next entry (%llu > %llu)",
+			       i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start));
+			return -BCH_ERR_invalid_sb_journal_seq_blacklist;
+		}
+	}
+
+	return 0;
+}
+
+static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out,
+						  struct bch_sb *sb,
+						  struct bch_sb_field *f)
+{
+	struct bch_sb_field_journal_seq_blacklist *bl =
+		field_to_type(f, journal_seq_blacklist);
+	struct journal_seq_blacklist_entry *i;
+	unsigned nr = blacklist_nr_entries(bl);
+
+	for (i = bl->start; i < bl->start + nr; i++) {
+		if (i != bl->start)
+			prt_printf(out, " ");
+
+		prt_printf(out, "%llu-%llu",
+		       le64_to_cpu(i->start),
+		       le64_to_cpu(i->end));
+	}
+	prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = {
+	.validate	= bch2_sb_journal_seq_blacklist_validate,
+	.to_text	= bch2_sb_journal_seq_blacklist_to_text
+};
+
+/* Work item: walk all btree nodes, then drop sb blacklist entries never marked dirty */
+void bch2_blacklist_entries_gc(struct work_struct *work)
+{
+	struct bch_fs *c = container_of(work, struct bch_fs, journal_seq_blacklist_gc_work);
+	struct journal_seq_blacklist_table *t;
+	struct bch_sb_field_journal_seq_blacklist *bl;
+	struct journal_seq_blacklist_entry *src, *dst;
+	struct btree_trans *trans = bch2_trans_get(c);
+	unsigned i, nr, new_nr;
+	int ret = 0;
+
+	/* Stop at the first failing btree: a later success must not overwrite its error */
+	for (i = 0; i < BTREE_ID_NR && !ret; i++) {
+		struct btree_iter iter;
+		struct btree *b;
+
+		bch2_trans_node_iter_init(trans, &iter, i, POS_MIN, 0, 0, BTREE_ITER_PREFETCH);
+retry:
+		bch2_trans_begin(trans);
+
+		b = bch2_btree_iter_peek_node(&iter);
+
+		while (!(ret = PTR_ERR_OR_ZERO(b)) && b &&
+		       !test_bit(BCH_FS_STOPPING, &c->flags))
+			b = bch2_btree_iter_next_node(&iter);
+
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			goto retry;
+
+		bch2_trans_iter_exit(trans, &iter);
+	}
+
+	bch2_trans_put(trans);
+	if (ret)
+		return;
+
+	mutex_lock(&c->sb_lock);
+	bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
+	if (!bl)
+		goto out;
+
+	nr = blacklist_nr_entries(bl);
+	dst = bl->start;
+
+	t = c->journal_seq_blacklist_table;
+	BUG_ON(nr != t->nr);
+
+	/* Compact the sb entries in place, keeping only those marked dirty in @t */
+	for (src = bl->start, i = eytzinger0_first(t->nr);
+	     src < bl->start + nr;
+	     src++, i = eytzinger0_next(i, nr)) {
+		BUG_ON(t->entries[i].start	!= le64_to_cpu(src->start));
+		BUG_ON(t->entries[i].end	!= le64_to_cpu(src->end));
+
+		if (t->entries[i].dirty)
+			*dst++ = *src;
+	}
+
+	new_nr = dst - bl->start;
+
+	bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr);
+
+	if (new_nr != nr) {
+		bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
+				new_nr ? sb_blacklist_u64s(new_nr) : 0);
+		BUG_ON(new_nr && !bl);
+
+		if (!new_nr)
+			c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3));
+
+		bch2_write_super(c);
+	}
+out:
+	mutex_unlock(&c->sb_lock);
+}
diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h
new file mode 100644
index 000000000000..afb886ec8e25
--- /dev/null
+++ b/fs/bcachefs/journal_seq_blacklist.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
+#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
+
+static inline unsigned
+blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl)
+{
+	return bl
+		? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) /
+		   sizeof(struct journal_seq_blacklist_entry))
+		: 0;
+}
+
+bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool);
+int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64);
+int bch2_blacklist_table_initialize(struct bch_fs *);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist;
+
+void bch2_blacklist_entries_gc(struct work_struct *);
+
+#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
new file mode 100644
index 000000000000..42504e16acb6
--- /dev/null
+++ b/fs/bcachefs/journal_types.h
@@ -0,0 +1,345 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_TYPES_H
+#define _BCACHEFS_JOURNAL_TYPES_H
+
+#include <linux/cache.h>
+#include <linux/workqueue.h>
+
+#include "alloc_types.h"
+#include "super_types.h"
+#include "fifo.h"
+
+#define JOURNAL_BUF_BITS	2
+#define JOURNAL_BUF_NR		(1U << JOURNAL_BUF_BITS)
+#define JOURNAL_BUF_MASK	(JOURNAL_BUF_NR - 1)
+
+/*
+ * We put JOURNAL_BUF_NR of these in struct journal; they are used for writes
+ * to the journal that are being staged or in flight.
+ */
+struct journal_buf {
+	struct jset		*data;
+
+	__BKEY_PADDED(key, BCH_REPLICAS_MAX);
+	struct bch_devs_list	devs_written;
+
+	struct closure_waitlist	wait;
+	u64			last_seq;	/* copy of data->last_seq */
+	long			expires;
+	u64			flush_time;
+
+	unsigned		buf_size;	/* size in bytes of @data */
+	unsigned		sectors;	/* maximum size for current entry */
+	unsigned		disk_sectors;	/* maximum size entry could have been, if
+						   buf_size was bigger */
+	unsigned		u64s_reserved;
+	bool			noflush;	/* write has already been kicked off, and was noflush */
+	bool			must_flush;	/* something wants a flush */
+	bool			separate_flush;
+};
+
+/*
+ * Something that makes a journal entry dirty - i.e. a btree node that has to be
+ * flushed:
+ */
+
+enum journal_pin_type {
+	JOURNAL_PIN_btree,
+	JOURNAL_PIN_key_cache,
+	JOURNAL_PIN_other,
+	JOURNAL_PIN_NR,
+};
+
+struct journal_entry_pin_list {
+	struct list_head		list[JOURNAL_PIN_NR];
+	struct list_head		flushed;
+	atomic_t			count;
+	struct bch_devs_list		devs;
+};
+
+struct journal;
+struct journal_entry_pin;
+typedef int (*journal_pin_flush_fn)(struct journal *j,
+				struct journal_entry_pin *, u64);
+
+struct journal_entry_pin {
+	struct list_head		list;
+	journal_pin_flush_fn		flush;
+	u64				seq;
+};
+
+struct journal_res {
+	bool			ref;
+	u8			idx;
+	u16			u64s;
+	u32			offset;
+	u64			seq;
+};
+
+/*
+ * For reserving space in the journal prior to getting a reservation on a
+ * particular journal entry:
+ */
+struct journal_preres {
+	unsigned		u64s;
+};
+
+union journal_res_state {
+	struct {
+		atomic64_t	counter;
+	};
+
+	struct {
+		u64		v;
+	};
+
+	struct {
+		u64		cur_entry_offset:20,
+				idx:2,
+				unwritten_idx:2,
+				buf0_count:10,
+				buf1_count:10,
+				buf2_count:10,
+				buf3_count:10;
+	};
+};
+
+union journal_preres_state {
+	struct {
+		atomic64_t	counter;
+	};
+
+	struct {
+		u64		v;
+	};
+
+	struct {
+		u64		waiting:1,
+				reserved:31,
+				remaining:32;
+	};
+};
+
+/* bytes: */
+#define JOURNAL_ENTRY_SIZE_MIN		(64U << 10) /* 64k */
+#define JOURNAL_ENTRY_SIZE_MAX		(4U  << 20) /* 4M */
+
+/*
+ * We stash some journal state as sentinel values in cur_entry_offset:
+ * note - cur_entry_offset is in units of u64s
+ */
+#define JOURNAL_ENTRY_OFFSET_MAX	((1U << 20) - 1)
+
+#define JOURNAL_ENTRY_CLOSED_VAL	(JOURNAL_ENTRY_OFFSET_MAX - 1)
+#define JOURNAL_ENTRY_ERROR_VAL		(JOURNAL_ENTRY_OFFSET_MAX)
+
+struct journal_space {
+	/* Units of 512 bytes sectors: */
+	unsigned	next_entry; /* How big the next journal entry can be */
+	unsigned	total;
+};
+
+enum journal_space_from {
+	journal_space_discarded,
+	journal_space_clean_ondisk,
+	journal_space_clean,
+	journal_space_total,
+	journal_space_nr,
+};
+
+enum journal_flags {
+	JOURNAL_REPLAY_DONE,
+	JOURNAL_STARTED,
+	JOURNAL_MAY_SKIP_FLUSH,
+	JOURNAL_NEED_FLUSH_WRITE,
+};
+
+/* Reasons we may fail to get a journal reservation: */
+#define JOURNAL_ERRORS()		\
+	x(ok)				\
+	x(blocked)			\
+	x(max_in_flight)		\
+	x(journal_full)			\
+	x(journal_pin_full)		\
+	x(journal_stuck)		\
+	x(insufficient_devices)
+
+enum journal_errors {
+#define x(n)	JOURNAL_ERR_##n,
+	JOURNAL_ERRORS()
+#undef x
+};
+
+typedef DARRAY(u64)		darray_u64;
+
+/* Embedded in struct bch_fs */
+struct journal {
+	/* Fastpath stuff up front: */
+	struct {
+
+	union journal_res_state reservations;
+	enum bch_watermark	watermark;
+
+	union journal_preres_state prereserved;
+
+	} __aligned(SMP_CACHE_BYTES);
+
+	unsigned long		flags;
+
+	/* Max size of current journal entry */
+	unsigned		cur_entry_u64s;
+	unsigned		cur_entry_sectors;
+
+	/* Reserved space in journal entry to be used just prior to write */
+	unsigned		entry_u64s_reserved;
+
+
+	/*
+	 * One of enum journal_errors, not an errno: JOURNAL_ERR_ok (0), or the
+	 * JOURNAL_ERR_* reason a reservation cannot currently be obtained:
+	 */
+	enum journal_errors	cur_entry_error;
+
+	unsigned		buf_size_want;
+	/*
+	 * We may queue up some things to be journalled (log messages) before
+	 * the journal has actually started - stash them here:
+	 */
+	darray_u64		early_journal_entries;
+
+	/*
+	 * JOURNAL_BUF_NR journal buffers, used for writes to the journal that
+	 * are being staged or in flight (see struct journal_buf).
+	 */
+	struct journal_buf	buf[JOURNAL_BUF_NR];
+
+	spinlock_t		lock;
+
+	/* if nonzero, we may not open a new journal entry: */
+	unsigned		blocked;
+
+	/* Used when waiting because the journal was full */
+	wait_queue_head_t	wait;
+	struct closure_waitlist	async_wait;
+	struct closure_waitlist	preres_wait;
+
+	struct closure		io;
+	struct delayed_work	write_work;
+
+	/* Sequence number of most recent journal entry (last entry in @pin) */
+	atomic64_t		seq;
+
+	/* seq, last_seq from the most recent journal entry successfully written */
+	u64			seq_ondisk;
+	u64			flushed_seq_ondisk;
+	u64			last_seq_ondisk;
+	u64			err_seq;
+	u64			last_empty_seq;
+
+	/*
+	 * FIFO of journal entries whose btree updates have not yet been
+	 * written out.
+	 *
+	 * Each entry is a reference count. The position in the FIFO is the
+	 * entry's sequence number relative to @seq.
+	 *
+	 * The journal entry itself holds a reference count, put when the
+	 * journal entry is written out. Each btree node modified by the journal
+	 * entry also holds a reference count, put when the btree node is
+	 * written.
+	 *
+	 * When a reference count reaches zero, the journal entry is no longer
+	 * needed. When all journal entries in the oldest journal bucket are no
+	 * longer needed, the bucket can be discarded and reused.
+	 */
+	struct {
+		u64 front, back, size, mask;
+		struct journal_entry_pin_list *data;
+	}			pin;
+
+	struct journal_space	space[journal_space_nr];
+
+	u64			replay_journal_seq;
+	u64			replay_journal_seq_end;
+
+	struct write_point	wp;
+	spinlock_t		err_lock;
+
+	struct mutex		reclaim_lock;
+	/*
+	 * Used for waiting until journal reclaim has freed up space in the
+	 * journal:
+	 */
+	wait_queue_head_t	reclaim_wait;
+	struct task_struct	*reclaim_thread;
+	bool			reclaim_kicked;
+	unsigned long		next_reclaim;
+	u64			nr_direct_reclaim;
+	u64			nr_background_reclaim;
+
+	unsigned long		last_flushed;
+	struct journal_entry_pin *flush_in_progress;
+	bool			flush_in_progress_dropped;
+	wait_queue_head_t	pin_flush_wait;
+
+	/* protects advancing ja->discard_idx: */
+	struct mutex		discard_lock;
+	bool			can_discard;
+
+	unsigned long		last_flush_write;
+
+	u64			res_get_blocked_start;
+	u64			write_start_time;
+
+	u64			nr_flush_writes;
+	u64			nr_noflush_writes;
+
+	struct bch2_time_stats	*flush_write_time;
+	struct bch2_time_stats	*noflush_write_time;
+	struct bch2_time_stats	*blocked_time;
+	struct bch2_time_stats	*flush_seq_time;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	res_map;
+#endif
+} __aligned(SMP_CACHE_BYTES);
+
+/*
+ * Embedded in struct bch_dev. First three fields refer to the array of journal
+ * buckets, in bch_sb.
+ */
+struct journal_device {
+	/*
+	 * For each journal bucket, contains the max sequence number of the
+	 * journal writes it contains - so we know when a bucket can be reused.
+	 */
+	u64			*bucket_seq;
+
+	unsigned		sectors_free;
+
+	/*
+	 * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx:
+	 */
+	unsigned		discard_idx;		/* Next bucket to discard */
+	unsigned		dirty_idx_ondisk;
+	unsigned		dirty_idx;
+	unsigned		cur_idx;		/* Journal bucket we're currently writing to */
+	unsigned		nr;
+
+	u64			*buckets;
+
+	/* Bio for journal reads/writes to this device */
+	struct bio		*bio;
+
+	/* for bch_journal_read_device */
+	struct closure		read;
+};
+
+/*
+ * journal_entry_res - reserve space in every journal entry:
+ */
+struct journal_entry_res {
+	unsigned		u64s;
+};
+
+#endif /* _BCACHEFS_JOURNAL_TYPES_H */
diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c
new file mode 100644
index 000000000000..5699cd4873c8
--- /dev/null
+++ b/fs/bcachefs/keylist.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey.h"
+#include "keylist.h"
+
+int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
+			size_t nr_inline_u64s, size_t new_u64s)
+{
+	size_t oldsize = bch2_keylist_u64s(l);
+	size_t newsize = oldsize + new_u64s;
+	u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p;
+	u64 *new_keys;
+
+	newsize = roundup_pow_of_two(newsize);
+
+	if (newsize <= nr_inline_u64s ||
+	    (old_buf && roundup_pow_of_two(oldsize) == newsize))
+		return 0;
+
+	new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOFS);
+	if (!new_keys)
+		return -ENOMEM;
+
+	if (!old_buf)
+		memcpy_u64s(new_keys, inline_u64s, oldsize);
+
+	l->keys_p = new_keys;
+	l->top_p = new_keys + oldsize;
+
+	return 0;
+}
+
+void bch2_keylist_pop_front(struct keylist *l)
+{
+	l->top_p -= bch2_keylist_front(l)->k.u64s;
+
+	memmove_u64s_down(l->keys,
+			  bkey_next(l->keys),
+			  bch2_keylist_u64s(l));
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_verify_keylist_sorted(struct keylist *l)
+{
+	struct bkey_i *k;
+
+	for_each_keylist_key(l, k)
+		BUG_ON(bkey_next(k) != l->top &&
+		       bpos_ge(k->k.p, bkey_next(k)->k.p));
+}
+#endif
diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h
new file mode 100644
index 000000000000..fe759c7031e0
--- /dev/null
+++ b/fs/bcachefs/keylist.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_KEYLIST_H
+#define _BCACHEFS_KEYLIST_H
+
+#include "keylist_types.h"
+
+int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t);
+void bch2_keylist_pop_front(struct keylist *);
+
+static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys)
+{
+	l->top_p = l->keys_p = inline_keys;
+}
+
+static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys)
+{
+	if (l->keys_p != inline_keys)
+		kfree(l->keys_p);
+}
+
+static inline void bch2_keylist_push(struct keylist *l)
+{
+	l->top = bkey_next(l->top);
+}
+
+static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k)
+{
+	bkey_copy(l->top, k);
+	bch2_keylist_push(l);
+}
+
+static inline bool bch2_keylist_empty(struct keylist *l)
+{
+	return l->top == l->keys;
+}
+
+static inline size_t bch2_keylist_u64s(struct keylist *l)
+{
+	return l->top_p - l->keys_p;
+}
+
+static inline size_t bch2_keylist_bytes(struct keylist *l)
+{
+	return bch2_keylist_u64s(l) * sizeof(u64);
+}
+
+static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
+{
+	return l->keys;
+}
+
+#define for_each_keylist_key(_keylist, _k)			\
+	for (_k = (_keylist)->keys;				\
+	     _k != (_keylist)->top;				\
+	     _k = bkey_next(_k))
+
+static inline u64 keylist_sectors(struct keylist *keys)
+{
+	struct bkey_i *k;
+	u64 ret = 0;
+
+	for_each_keylist_key(keys, k)
+		ret += k->k.size;
+
+	return ret;
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_verify_keylist_sorted(struct keylist *);
+#else
+static inline void bch2_verify_keylist_sorted(struct keylist *l) {}
+#endif
+
+#endif /* _BCACHEFS_KEYLIST_H */
diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h
new file mode 100644
index 000000000000..4b3ff7d8a875
--- /dev/null
+++ b/fs/bcachefs/keylist_types.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_KEYLIST_TYPES_H
+#define _BCACHEFS_KEYLIST_TYPES_H
+
+struct keylist {
+	union {
+		struct bkey_i		*keys;
+		u64			*keys_p;
+	};
+	union {
+		struct bkey_i		*top;
+		u64			*top_p;
+	};
+};
+
+#endif /* _BCACHEFS_KEYLIST_TYPES_H */
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
new file mode 100644
index 000000000000..8640f7dee0de
--- /dev/null
+++ b/fs/bcachefs/logged_ops.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "error.h"
+#include "io_misc.h"
+#include "logged_ops.h"
+#include "super.h"
+
+struct bch_logged_op_fn {
+	u8		type;
+	int		(*resume)(struct btree_trans *, struct bkey_i *);
+};
+
+static const struct bch_logged_op_fn logged_op_fns[] = {
+#define x(n)		{					\
+	.type		= KEY_TYPE_logged_op_##n,		\
+	.resume		= bch2_resume_logged_op_##n,		\
+},
+	BCH_LOGGED_OPS()
+#undef x
+};
+
+static const struct bch_logged_op_fn *logged_op_fn(enum bch_bkey_type type)
+{
+	for (unsigned i = 0; i < ARRAY_SIZE(logged_op_fns); i++)
+		if (logged_op_fns[i].type == type)
+			return logged_op_fns + i;
+	return NULL;
+}
+
+static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
+			    struct bkey_s_c k)
+{
+	struct bch_fs *c = trans->c;
+	const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type);
+	struct bkey_buf sk;
+	u32 restart_count = trans->restart_count;
+	int ret;
+
+	if (!fn)
+		return 0;
+
+	bch2_bkey_buf_init(&sk);
+	bch2_bkey_buf_reassemble(&sk, c, k);
+
+	ret =   drop_locks_do(trans, (bch2_fs_lazy_rw(c), 0)) ?:
+		fn->resume(trans, sk.k) ?: trans_was_restarted(trans, restart_count);
+
+	bch2_bkey_buf_exit(&sk, c);
+	return ret;
+}
+
+int bch2_resume_logged_ops(struct bch_fs *c)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	ret = bch2_trans_run(c,
+		for_each_btree_key2(trans, iter,
+				BTREE_ID_logged_ops, POS_MIN, BTREE_ITER_PREFETCH, k,
+			resume_logged_op(trans, &iter, k)));
+	if (ret)
+		bch_err_fn(c, ret);
+	return ret;
+}
+
+static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
+{
+	struct btree_iter iter;
+	int ret;
+
+	ret = bch2_bkey_get_empty_slot(trans, &iter, BTREE_ID_logged_ops, POS_MAX);
+	if (ret)
+		return ret;
+
+	k->k.p = iter.pos;
+
+	ret = bch2_trans_update(trans, &iter, k, 0);
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
+{
+	return commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+			 __bch2_logged_op_start(trans, k));
+}
+
+void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
+{
+	int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+			    bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0));
+	/*
+	 * This needs to be a fatal error because we've left an unfinished
+	 * operation in the logged ops btree.
+	 *
+	 * We should only ever see an error here if the filesystem has already
+	 * been shut down, but make sure of that here:
+	 */
+	if (ret) {
+		struct bch_fs *c = trans->c;
+		struct printbuf buf = PRINTBUF;
+
+		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+		bch2_fs_fatal_error(c, "%s: error deleting logged operation %s: %s",
+				     __func__, buf.buf, bch2_err_str(ret));
+		printbuf_exit(&buf);
+	}
+}
diff --git a/fs/bcachefs/logged_ops.h b/fs/bcachefs/logged_ops.h
new file mode 100644
index 000000000000..4d1e786a27a8
--- /dev/null
+++ b/fs/bcachefs/logged_ops.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LOGGED_OPS_H
+#define _BCACHEFS_LOGGED_OPS_H
+
+#include "bkey.h"
+
+#define BCH_LOGGED_OPS()			\
+	x(truncate)				\
+	x(finsert)
+
+static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op)
+{
+	return bch2_btree_insert_nonextent(trans, BTREE_ID_logged_ops, op, 0);
+}
+
+int bch2_resume_logged_ops(struct bch_fs *);
+int bch2_logged_op_start(struct btree_trans *, struct bkey_i *);
+void bch2_logged_op_finish(struct btree_trans *, struct bkey_i *);
+
+#endif /* _BCACHEFS_LOGGED_OPS_H */
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
new file mode 100644
index 000000000000..215a653322f3
--- /dev/null
+++ b/fs/bcachefs/lru.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
+#include "error.h"
+#include "lru.h"
+#include "recovery.h"
+
+/* KEY_TYPE_lru is obsolete: */
+int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k,
+		     enum bkey_invalid_flags flags,
+		     struct printbuf *err)
+{
+	/*
+	 * Only the time encoded in the key position is validated: zero is rejected.
+	 */
+	if (lru_pos_time(k.k->p))
+		return 0;
+	prt_printf(err, "lru entry at time=0");
+	return -BCH_ERR_invalid_bkey;
+}
+
+void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c,
+		      struct bkey_s_c k)
+{
+	const struct bch_lru *lru = bkey_s_c_to_lru(k).v;
+
+	prt_printf(out, "idx %llu", le64_to_cpu(lru->idx));
+}
+
+void bch2_lru_pos_to_text(struct printbuf *out, struct bpos lru)
+{
+	prt_printf(out, "%llu:%llu -> %llu:%llu",
+		   lru_pos_id(lru),
+		   lru_pos_time(lru),
+		   u64_to_bucket(lru.offset).inode,
+		   u64_to_bucket(lru.offset).offset);
+}
+
+static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id,
+			  u64 dev_bucket, u64 time, bool set)
+{
+	return time
+		? bch2_btree_bit_mod(trans, BTREE_ID_lru,
+				     lru_pos(lru_id, dev_bucket, time), set)
+		: 0;
+}
+
+int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
+{
+	return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_deleted);
+}
+
+int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
+{
+	return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set);
+}
+
+int bch2_lru_change(struct btree_trans *trans,
+		    u16 lru_id, u64 dev_bucket,
+		    u64 old_time, u64 new_time)
+{
+	if (old_time == new_time)
+		return 0;
+
+	return  bch2_lru_del(trans, lru_id, dev_bucket, old_time) ?:
+		bch2_lru_set(trans, lru_id, dev_bucket, new_time);
+}
+
+static const char * const bch2_lru_types[] = {
+#define x(n) #n,
+	BCH_LRU_TYPES()
+#undef x
+	NULL
+};
+
+/* Verify one lru key against the alloc key of the bucket it names; stale keys may be deleted. */
+static int bch2_check_lru_key(struct btree_trans *trans,
+			      struct btree_iter *lru_iter,
+			      struct bkey_s_c lru_k,
+			      struct bpos *last_flushed_pos)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter = {}; /* NOTE(review): assumes fsck_err_on() can goto fsck_err */
+	struct bkey_s_c k;
+	struct bch_alloc_v4 a_convert;
+	const struct bch_alloc_v4 *a;
+	struct printbuf buf1 = PRINTBUF;
+	struct printbuf buf2 = PRINTBUF;
+	enum bch_lru_type type = lru_type(lru_k);
+	struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset);
+	u64 idx;
+	int ret;
+
+	if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c,
+			"lru key points to nonexistent device:bucket %llu:%llu",
+			alloc_pos.inode, alloc_pos.offset))
+		return bch2_btree_delete_at(trans, lru_iter, 0);
+
+	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	a = bch2_alloc_to_v4(k, &a_convert);
+
+	switch (type) {
+	case BCH_LRU_read:
+		idx = alloc_lru_idx_read(*a);
+		break;
+	case BCH_LRU_fragmentation:
+		idx = a->fragmentation_lru;
+		break;
+	}
+
+	if (lru_k.k->type != KEY_TYPE_set ||
+	    lru_pos_time(lru_k.k->p) != idx) {
+		if (!bpos_eq(*last_flushed_pos, lru_k.k->p)) { /* flush once per key, then restart */
+			*last_flushed_pos = lru_k.k->p;
+			ret = bch2_btree_write_buffer_flush_sync(trans) ?:
+				-BCH_ERR_transaction_restart_write_buffer_flush;
+			goto err;
+		}
+
+		if (c->opts.reconstruct_alloc ||
+		    fsck_err(c, "incorrect lru entry: lru %s time %llu\n"
+			     "  %s\n"
+			     "  for %s",
+			     bch2_lru_types[type],
+			     lru_pos_time(lru_k.k->p),
+			     (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
+			     (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf)))
+			ret = bch2_btree_delete_at(trans, lru_iter, 0);
+	}
+err:
+fsck_err:
+	bch2_trans_iter_exit(trans, &iter);
+	printbuf_exit(&buf2);
+	printbuf_exit(&buf1);
+	return ret;
+}
+
+/* Run bch2_check_lru_key() over every key in the lru btree, logging any failure. */
+int bch2_check_lrus(struct bch_fs *c)
+{
+	struct bpos last_flushed_pos = POS_MIN;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	ret = bch2_trans_run(c,
+		for_each_btree_key_commit(trans, iter,
+				BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k,
+				NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+			bch2_check_lru_key(trans, &iter, k, &last_flushed_pos)));
+	if (ret)
+		bch_err_fn(c, ret);
+	return ret;
+}
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
new file mode 100644
index 000000000000..be66bf9ad809
--- /dev/null
+++ b/fs/bcachefs/lru.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LRU_H
+#define _BCACHEFS_LRU_H
+
+#define LRU_TIME_BITS	48
+#define LRU_TIME_MAX	((1ULL << LRU_TIME_BITS) - 1)
+
+static inline u64 lru_pos_id(struct bpos pos)
+{
+	return pos.inode >> LRU_TIME_BITS;
+}
+
+static inline u64 lru_pos_time(struct bpos pos)
+{
+	return pos.inode & LRU_TIME_MAX;	/* low LRU_TIME_BITS bits of the inode field */
+}
+
+static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time)
+{
+	struct bpos pos = POS(((u64) lru_id << LRU_TIME_BITS)|time, dev_bucket);
+
+	EBUG_ON(time > LRU_TIME_MAX);
+	EBUG_ON(lru_pos_id(pos) != lru_id);
+	EBUG_ON(lru_pos_time(pos) != time);
+	EBUG_ON(pos.offset != dev_bucket);
+
+	return pos;
+}
+
+#define BCH_LRU_TYPES()		\
+	x(read)			\
+	x(fragmentation)
+
+enum bch_lru_type {
+#define x(n) BCH_LRU_##n,
+	BCH_LRU_TYPES()
+#undef x
+};
+
+#define BCH_LRU_FRAGMENTATION_START	((1U << 16) - 1)
+
+static inline enum bch_lru_type lru_type(struct bkey_s_c l)
+{
+	u16 lru_id = lru_pos_id(l.k->p);	/* id sits above the LRU_TIME_BITS time field */
+
+	if (lru_id == BCH_LRU_FRAGMENTATION_START)
+		return BCH_LRU_fragmentation;
+	return BCH_LRU_read;
+}
+
+int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c,
+		     enum bkey_invalid_flags, struct printbuf *);
+void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+void bch2_lru_pos_to_text(struct printbuf *, struct bpos);
+
+#define bch2_bkey_ops_lru ((struct bkey_ops) {	\
+	.key_invalid	= bch2_lru_invalid,	\
+	.val_to_text	= bch2_lru_to_text,	\
+	.min_val_size	= 8,			\
+})
+
+int bch2_lru_del(struct btree_trans *, u16, u64, u64);
+int bch2_lru_set(struct btree_trans *, u16, u64, u64);
+int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
+
+int bch2_check_lrus(struct bch_fs *);
+
+#endif /* _BCACHEFS_LRU_H */
diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c
new file mode 100644
index 000000000000..1f0801e2e565
--- /dev/null
+++ b/fs/bcachefs/mean_and_variance.c
@@ -0,0 +1,159 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Functions for incremental mean and variance.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Copyright © 2022 Daniel B. Hill
+ *
+ * Author: Daniel B. Hill <daniel@gluo.nz>
+ *
+ * Description:
+ *
+ * This includes some incremental algorithms for mean and variance calculation.
+ *
+ * Derived from the paper: https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf
+ *
+ * Create a struct and if it's the weighted variant set the w field (weight = 2^k).
+ *
+ * Use mean_and_variance[_weighted]_update() on the struct to update its state.
+ *
+ * Use the mean_and_variance[_weighted]_get_* functions to calculate the mean and variance, some computation
+ * is deferred to these functions for performance reasons.
+ *
+ * see fs/bcachefs/mean_and_variance_test.c for examples of usage.
+ *
+ * DO NOT access the mean and variance fields of the weighted variants directly.
+ * DO NOT change the weight after calling update.
+ */
+
+#include <linux/bug.h>
+#include <linux/compiler.h>
+#include <linux/export.h>
+#include <linux/limits.h>
+#include <linux/math.h>
+#include <linux/math64.h>
+#include <linux/module.h>
+
+#include "mean_and_variance.h"
+
+u128_u u128_div(u128_u n, u64 d)
+{
+	u128_u r;
+	u64 rem;
+	u64 hi = u128_hi(n);
+	u64 lo = u128_lo(n);
+	u64  h =  hi & ((u64) U32_MAX  << 32);
+	u64  l = (hi &  (u64) U32_MAX) << 32;
+
+	r =             u128_shl(u64_to_u128(div64_u64_rem(h,                d, &rem)), 64);
+	r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l  + (rem << 32), d, &rem)), 32));
+	r = u128_add(r,          u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem)));
+	return r;
+}
+EXPORT_SYMBOL_GPL(u128_div);
+
+/**
+ * mean_and_variance_get_mean() - get mean from @s (0 when @s holds no samples)
+ */
+s64 mean_and_variance_get_mean(struct mean_and_variance s)
+{
+	return s.n ? div64_s64(s.sum, s.n) : 0;	/* sum is signed: divide signed */
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_mean);
+
+/**
+ * mean_and_variance_get_variance() -  get variance from @s1
+ *
+ * see linked pdf equation 12.
+ */
+u64 mean_and_variance_get_variance(struct mean_and_variance s1)
+{
+	if (s1.n) {
+		u128_u s2 = u128_div(s1.sum_squares, s1.n);
+		u64  s3 = abs(mean_and_variance_get_mean(s1));
+
+		return u128_lo(u128_sub(s2, u128_square(s3)));
+	} else {
+		return 0;
+	}
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_variance);
+
+/**
+ * mean_and_variance_get_stddev() - get standard deviation from @s
+ */
+u32 mean_and_variance_get_stddev(struct mean_and_variance s)
+{
+	return int_sqrt64(mean_and_variance_get_variance(s));
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
+
+/**
+ * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update()
+ * @s: the weighted mean_and_variance state to update in place.
+ * @x: the new sample.
+ *
+ * see linked pdf: function derived from equations 140-143 where alpha = 2^w.
+ * values are stored bitshifted for performance and added precision.
+ */
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x)
+{
+	// previous weighted variance.
+	u8 w		= s->weight;
+	u64 var_w0	= s->variance;
+	// new value weighted.
+	s64 x_w		= x << w;
+	s64 diff_w	= x_w - s->mean;
+	s64 diff	= fast_divpow2(diff_w, w);
+	// new mean weighted.
+	s64 u_w1	= s->mean + diff;
+
+	if (!s->init) {	// first sample seeds the mean; variance starts at 0.
+		s->mean = x_w;
+		s->variance = 0;
+	} else {
+		s->mean = u_w1;
+		s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
+	}
+	s->init = true;
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
+
+/**
+ * mean_and_variance_weighted_get_mean() - get mean from @s
+ */
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s)
+{
+	return fast_divpow2(s.mean, s.weight);
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
+
+/**
+ * mean_and_variance_weighted_get_variance() -- get variance from @s
+ */
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s)
+{
+	// always positive don't need fast divpow2
+	return s.variance >> s.weight;
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
+
+/**
+ * mean_and_variance_weighted_get_stddev() - get standard deviation from @s
+ */
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s)
+{
+	return int_sqrt64(mean_and_variance_weighted_get_variance(s));
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev);
+
+MODULE_AUTHOR("Daniel B. Hill");
+MODULE_LICENSE("GPL");
diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h
new file mode 100644
index 000000000000..647505010b39
--- /dev/null
+++ b/fs/bcachefs/mean_and_variance.h
@@ -0,0 +1,198 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef MEAN_AND_VARIANCE_H_
+#define MEAN_AND_VARIANCE_H_
+
+#include <linux/types.h>
+#include <linux/limits.h>
+#include <linux/math.h>
+#include <linux/math64.h>
+
+#define SQRT_U64_MAX 4294967295ULL
+
+/*
+ * u128_u: u128 user mode, because not all architectures support a real int128
+ * type
+ */
+
+#ifdef __SIZEOF_INT128__
+
+typedef struct {
+	unsigned __int128 v;
+} __aligned(16) u128_u;
+
+static inline u128_u u64_to_u128(u64 a)
+{
+	return (u128_u) { .v = a };
+}
+
+static inline u64 u128_lo(u128_u a)
+{
+	return a.v;
+}
+
+static inline u64 u128_hi(u128_u a)
+{
+	return a.v >> 64;
+}
+
+static inline u128_u u128_add(u128_u a, u128_u b)
+{
+	a.v += b.v;
+	return a;
+}
+
+static inline u128_u u128_sub(u128_u a, u128_u b)
+{
+	a.v -= b.v;
+	return a;
+}
+
+static inline u128_u u128_shl(u128_u a, s8 shift)
+{
+	a.v <<= shift;
+	return a;
+}
+
+static inline u128_u u128_square(u64 a)
+{
+	u128_u b = u64_to_u128(a);
+
+	b.v *= b.v;
+	return b;
+}
+
+#else
+
+typedef struct {
+	u64 hi, lo;
+} __aligned(16) u128_u;
+
+/* conversions */
+
+static inline u128_u u64_to_u128(u64 a)
+{
+	return (u128_u) { .lo = a };
+}
+
+static inline u64 u128_lo(u128_u a)
+{
+	return a.lo;
+}
+
+static inline u64 u128_hi(u128_u a)
+{
+	return a.hi;
+}
+
+/* arithmetic */
+
+static inline u128_u u128_add(u128_u a, u128_u b)
+{
+	u128_u c;
+
+	c.lo = a.lo + b.lo;
+	c.hi = a.hi + b.hi + (c.lo < a.lo);
+	return c;
+}
+
+static inline u128_u u128_sub(u128_u a, u128_u b)
+{
+	u128_u c;
+
+	c.lo = a.lo - b.lo;
+	c.hi = a.hi - b.hi - (c.lo > a.lo);
+	return c;
+}
+
+static inline u128_u u128_shl(u128_u i, s8 shift)
+{
+	u128_u r = i;	/* shift <= 0 returns the input unchanged */
+
+	if (shift >= 64) {	/* keep every u64 shift count below 64 */
+		r.hi = i.lo << (shift - 64);
+		r.lo = 0;
+	} else if (shift > 0) {	/* shift == 0 would need an undefined ">> 64" */
+		r.hi = (i.hi << shift) | (i.lo >> (64 - shift));
+		r.lo = i.lo << shift;
+	}
+	return r;
+}
+
+static inline u128_u u128_square(u64 i)
+{
+	u128_u r;
+	u64  h = i >> 32, l = i & U32_MAX;
+
+	r =             u128_shl(u64_to_u128(h*h), 64);
+	r = u128_add(r, u128_shl(u64_to_u128(h*l), 32));
+	r = u128_add(r, u128_shl(u64_to_u128(l*h), 32));
+	r = u128_add(r,          u64_to_u128(l*l));
+	return r;
+}
+
+#endif
+
+static inline u128_u u64s_to_u128(u64 hi, u64 lo)
+{
+	u128_u c = u64_to_u128(hi);
+
+	c = u128_shl(c, 64);
+	c = u128_add(c, u64_to_u128(lo));
+	return c;
+}
+
+u128_u u128_div(u128_u n, u64 d);
+
+struct mean_and_variance {
+	s64	n;
+	s64	sum;
+	u128_u	sum_squares;
+};
+
+/* exponentially weighted variant */
+struct mean_and_variance_weighted {
+	bool	init;	/* set by the first mean_and_variance_weighted_update() */
+	u8	weight;	/* base 2 logarithm */
+	s64	mean;	/* stored shifted left by weight; read via the _get_mean() helper */
+	u64	variance;	/* stored shifted left by weight; read via _get_variance() */
+};
+
+/**
+ * fast_divpow2() - fast approximation for n / (1 << d)
+ * @n: numerator
+ * @d: the power of 2 denominator; the rounding bias is built as s64, valid for d < 63.
+ *
+ * note: this rounds towards 0.
+ */
+static inline s64 fast_divpow2(s64 n, u8 d)
+{
+	return (n + ((n < 0) ? (((s64) 1 << d) - 1) : 0)) >> d;
+}
+
+/**
+ * mean_and_variance_update() - update a mean_and_variance struct @s in place
+ * with a new sample @v.
+ * @s: the mean_and_variance to update.
+ * @v: the new sample.
+ *
+ * see linked pdf equation 12.
+ */
+static inline void
+mean_and_variance_update(struct mean_and_variance *s, s64 v)
+{
+	s->n++;
+	s->sum += v;
+	s->sum_squares = u128_add(s->sum_squares, u128_square(abs(v)));
+}
+
+s64 mean_and_variance_get_mean(struct mean_and_variance s);
+u64 mean_and_variance_get_variance(struct mean_and_variance s1);
+u32 mean_and_variance_get_stddev(struct mean_and_variance s);
+
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v);
+
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s);
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s);
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s);
+
+#endif // MEAN_AND_VARIANCE_H_
diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c
new file mode 100644
index 000000000000..019583c3ca0e
--- /dev/null
+++ b/fs/bcachefs/mean_and_variance_test.c
@@ -0,0 +1,240 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <kunit/test.h>
+
+#include "mean_and_variance.h"
+
+#define MAX_SQR (SQRT_U64_MAX*SQRT_U64_MAX)
+
+static void mean_and_variance_basic_test(struct kunit *test)
+{
+	struct mean_and_variance s = {};
+
+	mean_and_variance_update(&s, 2);
+	mean_and_variance_update(&s, 2);
+
+	KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 2);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 0);
+	KUNIT_EXPECT_EQ(test, s.n, 2);
+
+	mean_and_variance_update(&s, 4);
+	mean_and_variance_update(&s, 4);
+
+	KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 3);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 1);
+	KUNIT_EXPECT_EQ(test, s.n, 4);
+}
+
+/*
+ * Test values computed using a spreadsheet from the pseudocode at the bottom:
+ * https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf
+ */
+
+static void mean_and_variance_weighted_test(struct kunit *test)
+{
+	struct mean_and_variance_weighted s = { .weight = 2 };
+
+	mean_and_variance_weighted_update(&s, 10);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 10);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0);
+
+	mean_and_variance_weighted_update(&s, 20);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 12);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18);
+
+	mean_and_variance_weighted_update(&s, 30);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 16);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72);
+
+	s = (struct mean_and_variance_weighted) { .weight = 2 };
+
+	mean_and_variance_weighted_update(&s, -10);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -10);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0);
+
+	mean_and_variance_weighted_update(&s, -20);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -12);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18);
+
+	mean_and_variance_weighted_update(&s, -30);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -16);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72);
+}
+
+static void mean_and_variance_weighted_advanced_test(struct kunit *test)
+{
+	struct mean_and_variance_weighted s = { .weight = 8 };
+	s64 i;
+
+	for (i = 10; i <= 100; i += 10)
+		mean_and_variance_weighted_update(&s, i);
+
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 11);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107);
+
+	s = (struct mean_and_variance_weighted) { .weight = 8 };
+
+	for (i = -10; i >= -100; i -= 10)
+		mean_and_variance_weighted_update(&s, i);
+
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -11);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107);
+}
+
+static void do_mean_and_variance_test(struct kunit *test,
+				      s64 initial_value,
+				      s64 initial_n,
+				      s64 n,
+				      unsigned weight,
+				      s64 *data,
+				      s64 *mean,
+				      s64 *stddev,
+				      s64 *weighted_mean,
+				      s64 *weighted_stddev)
+{
+	struct mean_and_variance mv = {};
+	struct mean_and_variance_weighted vw = { .weight = weight };
+
+	for (unsigned i = 0; i < initial_n; i++) {
+		mean_and_variance_update(&mv, initial_value);
+		mean_and_variance_weighted_update(&vw, initial_value);
+
+		KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv),		initial_value);
+		KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv),		0);
+		KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw),	initial_value);
+		KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),0);
+	}
+
+	for (unsigned i = 0; i < n; i++) {
+		mean_and_variance_update(&mv, data[i]);
+		mean_and_variance_weighted_update(&vw, data[i]);
+
+		KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv),		mean[i]);
+		KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv),		stddev[i]);
+		KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw),	weighted_mean[i]);
+		KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),weighted_stddev[i]);
+	}
+
+	KUNIT_EXPECT_EQ(test, mv.n, initial_n + n);
+}
+
+/* Test behaviour with a single outlier, then back to steady state: */
+static void mean_and_variance_test_1(struct kunit *test)
+{
+	s64 d[]			= { 100, 10, 10, 10, 10, 10, 10 };
+	s64 mean[]		= {  22, 21, 20, 19, 18, 17, 16 };
+	s64 stddev[]		= {  32, 29, 28, 27, 26, 25, 24 };
+	s64 weighted_mean[]	= {  32, 27, 22, 19, 17, 15, 14 };
+	s64 weighted_stddev[]	= {  38, 35, 31, 27, 24, 21, 18 };
+
+	do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
+			d, mean, stddev, weighted_mean, weighted_stddev);
+}
+
+static void mean_and_variance_test_2(struct kunit *test)
+{
+	s64 d[]			= { 100, 10, 10, 10, 10, 10, 10 };
+	s64 mean[]		= {  10, 10, 10, 10, 10, 10, 10 };
+	s64 stddev[]		= {   9,  9,  9,  9,  9,  9,  9 };
+	s64 weighted_mean[]	= {  32, 27, 22, 19, 17, 15, 14 };
+	s64 weighted_stddev[]	= {  38, 35, 31, 27, 24, 21, 18 };
+
+	do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
+			d, mean, stddev, weighted_mean, weighted_stddev);
+}
+
+/* Test behaviour where we switch from one steady state to another: */
+static void mean_and_variance_test_3(struct kunit *test)
+{
+	s64 d[]			= { 100, 100, 100, 100, 100 };
+	s64 mean[]		= {  22,  32,  40,  46,  50 };
+	s64 stddev[]		= {  32,  39,  42,  44,  45 };
+	s64 weighted_mean[]	= {  32,  49,  61,  71,  78 };
+	s64 weighted_stddev[]	= {  38,  44,  44,  41,  38 };
+
+	do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
+			d, mean, stddev, weighted_mean, weighted_stddev);
+}
+
+static void mean_and_variance_test_4(struct kunit *test)
+{
+	s64 d[]			= { 100, 100, 100, 100, 100 };
+	s64 mean[]		= {  10,  11,  12,  13,  14 };
+	s64 stddev[]		= {   9,  13,  15,  17,  19 };
+	s64 weighted_mean[]	= {  32,  49,  61,  71,  78 };
+	s64 weighted_stddev[]	= {  38,  44,  44,  41,  38 };
+
+	do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
+			d, mean, stddev, weighted_mean, weighted_stddev);
+}
+
+static void mean_and_variance_fast_divpow2(struct kunit *test)
+{
+	s64 i;
+	u8 d;
+
+	for (i = 0; i < 100; i++) {
+		d = 0;
+		KUNIT_EXPECT_EQ(test, fast_divpow2(i, d), div_u64(i, 1LLU << d));
+		KUNIT_EXPECT_EQ(test, abs(fast_divpow2(-i, d)), div_u64(i, 1LLU << d));
+		for (d = 1; d < 32; d++) {	/* 1U below: 1 << 31 overflows a signed int */
+			KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(i, d)),
+					    div_u64(i, 1U << d), "%lld %u", i, d);
+			KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(-i, d)),
+					    div_u64(i, 1U << d), "%lld %u", -i, d);
+		}
+	}
+}
+
+static void mean_and_variance_u128_basic_test(struct kunit *test)
+{
+	u128_u a  = u64s_to_u128(0, U64_MAX);
+	u128_u a1 = u64s_to_u128(0, 1);
+	u128_u b  = u64s_to_u128(1, 0);
+	u128_u c  = u64s_to_u128(0, 1LLU << 63);
+	u128_u c2 = u64s_to_u128(U64_MAX, U64_MAX);
+
+	KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a, a1)), 1);
+	KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a, a1)), 0);
+	KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a1, a)), 1);
+	KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a1, a)), 0);
+
+	KUNIT_EXPECT_EQ(test, u128_lo(u128_sub(b, a1)), U64_MAX);
+	KUNIT_EXPECT_EQ(test, u128_hi(u128_sub(b, a1)), 0);
+
+	KUNIT_EXPECT_EQ(test, u128_hi(u128_shl(c, 1)), 1);
+	KUNIT_EXPECT_EQ(test, u128_lo(u128_shl(c, 1)), 0);
+
+	KUNIT_EXPECT_EQ(test, u128_hi(u128_square(U64_MAX)), U64_MAX - 1);
+	KUNIT_EXPECT_EQ(test, u128_lo(u128_square(U64_MAX)), 1);
+
+	KUNIT_EXPECT_EQ(test, u128_lo(u128_div(b, 2)), 1LLU << 63);
+
+	KUNIT_EXPECT_EQ(test, u128_hi(u128_div(c2, 2)), U64_MAX >> 1);
+	KUNIT_EXPECT_EQ(test, u128_lo(u128_div(c2, 2)), U64_MAX);
+
+	KUNIT_EXPECT_EQ(test, u128_hi(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U32_MAX >> 1);
+	KUNIT_EXPECT_EQ(test, u128_lo(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U64_MAX << 31);
+}
+
+static struct kunit_case mean_and_variance_test_cases[] = {
+	KUNIT_CASE(mean_and_variance_fast_divpow2),
+	KUNIT_CASE(mean_and_variance_u128_basic_test),
+	KUNIT_CASE(mean_and_variance_basic_test),
+	KUNIT_CASE(mean_and_variance_weighted_test),
+	KUNIT_CASE(mean_and_variance_weighted_advanced_test),
+	KUNIT_CASE(mean_and_variance_test_1),
+	KUNIT_CASE(mean_and_variance_test_2),
+	KUNIT_CASE(mean_and_variance_test_3),
+	KUNIT_CASE(mean_and_variance_test_4),
+	{}
+};
+
+static struct kunit_suite mean_and_variance_test_suite = {
+	.name		= "mean and variance tests",
+	.test_cases	= mean_and_variance_test_cases
+};
+
+kunit_test_suite(mean_and_variance_test_suite);
+
+MODULE_AUTHOR("Daniel B. Hill");
+MODULE_LICENSE("GPL");
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
new file mode 100644
index 000000000000..e3a51f6d6c9b
--- /dev/null
+++ b/fs/bcachefs/migrate.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Code for moving data off a device.
+ */
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "errcode.h"
+#include "extents.h"
+#include "io_write.h"
+#include "journal.h"
+#include "keylist.h"
+#include "migrate.h"
+#include "move.h"
+#include "replicas.h"
+#include "super-io.h"
+
+static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
+			 unsigned dev_idx, int flags, bool metadata)
+{
+	unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
+	unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
+	unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
+	unsigned nr_good;
+
+	bch2_bkey_drop_device(k, dev_idx);
+
+	nr_good = bch2_bkey_durability(c, k.s_c);
+	if ((!nr_good && !(flags & lost)) ||
+	    (nr_good < replicas && !(flags & degraded)))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
+				     struct btree_iter *iter,
+				     struct bkey_s_c k,
+				     unsigned dev_idx,
+				     int flags)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_i *n;
+	int ret;
+
+	if (!bch2_bkey_has_device_c(k, dev_idx))
+		return 0;
+
+	n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+	ret = PTR_ERR_OR_ZERO(n);
+	if (ret)
+		return ret;
+
+	ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false);
+	if (ret)
+		return ret;
+
+	/*
+	 * If the new extent no longer has any pointers, bch2_extent_normalize()
+	 * will do the appropriate thing with it (turning it into a
+	 * KEY_TYPE_error key, or just a discard if it was a cached extent)
+	 */
+	bch2_extent_normalize(c, bkey_i_to_s(n));
+
+	/*
+	 * Since we're not inserting through an extent iterator
+	 * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+	 * we aren't using the extent overwrite path to delete, we're
+	 * just using the normal key deletion path:
+	 */
+	if (bkey_deleted(&n->k))
+		n->k.size = 0;
+	return 0;
+}
+
+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	enum btree_id id;
+	int ret = 0;
+
+	for (id = 0; id < BTREE_ID_NR; id++) {
+		if (!btree_type_has_ptrs(id))
+			continue;
+
+		ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
+				BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+				NULL, NULL, BTREE_INSERT_NOFAIL,
+			bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags));
+		if (ret)
+			break;
+	}
+
+	bch2_trans_put(trans);
+
+	return ret;
+}
+
+static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+	struct btree_trans *trans;
+	struct btree_iter iter;
+	struct closure cl;
+	struct btree *b;
+	struct bkey_buf k;
+	unsigned id;
+	int ret;
+
+	/* don't handle this yet: */
+	if (flags & BCH_FORCE_IF_METADATA_LOST)
+		return -EINVAL;
+
+	trans = bch2_trans_get(c);
+	bch2_bkey_buf_init(&k);
+	closure_init_stack(&cl);
+
+	for (id = 0; id < BTREE_ID_NR; id++) {
+		bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
+					  BTREE_ITER_PREFETCH);
+retry:
+		ret = 0;
+		while (bch2_trans_begin(trans),
+		       (b = bch2_btree_iter_peek_node(&iter)) &&
+		       !(ret = PTR_ERR_OR_ZERO(b))) {
+			if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
+				goto next;
+
+			bch2_bkey_buf_copy(&k, c, &b->key);
+
+			ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
+					    dev_idx, flags, true);
+			if (ret) {
+				bch_err(c, "Cannot drop device without losing data");
+				break;
+			}
+
+			ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false);
+			if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+				ret = 0;
+				continue;
+			}
+
+			if (ret) {
+				bch_err_msg(c, ret, "updating btree node key");
+				break;
+			}
+next:
+			bch2_btree_iter_next_node(&iter);
+		}
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			goto retry;
+
+		bch2_trans_iter_exit(trans, &iter);
+
+		if (ret)
+			goto err;
+	}
+
+	bch2_btree_interior_updates_flush(c);
+	ret = 0;
+err:
+	bch2_bkey_buf_exit(&k, c);
+	bch2_trans_put(trans);
+
+	BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
+
+	return ret;
+}
+
+int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+	return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
+		bch2_dev_metadata_drop(c, dev_idx, flags);
+}
diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h
new file mode 100644
index 000000000000..027efaa0d575
--- /dev/null
+++ b/fs/bcachefs/migrate.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_MIGRATE_H
+#define _BCACHEFS_MIGRATE_H
+
+int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
+
+#endif /* _BCACHEFS_MIGRATE_H */
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
new file mode 100644
index 000000000000..39a14e321680
--- /dev/null
+++ b/fs/bcachefs/move.c
@@ -0,0 +1,1159 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "backpointers.h"
+#include "bkey_buf.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "errcode.h"
+#include "error.h"
+#include "inode.h"
+#include "io_read.h"
+#include "io_write.h"
+#include "journal_reclaim.h"
+#include "keylist.h"
+#include "move.h"
+#include "replicas.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/ioprio.h>
+#include <linux/kthread.h>
+
+/*
+ * Emit the move_extent tracepoint with @k rendered as text. The key is only
+ * formatted when the tracepoint is enabled.
+ */
+static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
+{
+	struct printbuf text = PRINTBUF;
+
+	if (!trace_move_extent_enabled())
+		return;
+
+	bch2_bkey_val_to_text(&text, c, k);
+	trace_move_extent(c, text.buf);
+	printbuf_exit(&text);
+}
+
+/*
+ * Emit the move_extent_read tracepoint with @k rendered as text. The key is
+ * only formatted when the tracepoint is enabled.
+ */
+static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
+{
+	struct printbuf text = PRINTBUF;
+
+	if (!trace_move_extent_read_enabled())
+		return;
+
+	bch2_bkey_val_to_text(&text, c, k);
+	trace_move_extent_read(c, text.buf);
+	printbuf_exit(&text);
+}
+
+/*
+ * Emit the move_extent_alloc_mem_fail tracepoint with @k rendered as text.
+ * The key is only formatted when the tracepoint is enabled.
+ */
+static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c k)
+{
+	struct printbuf text = PRINTBUF;
+
+	if (!trace_move_extent_alloc_mem_fail_enabled())
+		return;
+
+	bch2_bkey_val_to_text(&text, c, k);
+	trace_move_extent_alloc_mem_fail(c, text.buf);
+	printbuf_exit(&text);
+}
+
+/* Link @stats onto c->data_progress_list, under c->data_progress_lock. */
+static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats)
+{
+	mutex_lock(&c->data_progress_lock);
+	list_add(&stats->list, &c->data_progress_list);
+	mutex_unlock(&c->data_progress_lock);
+}
+
+/* Unlink @stats from c->data_progress_list, under c->data_progress_lock. */
+static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats)
+{
+	mutex_lock(&c->data_progress_lock);
+	list_del(&stats->list);
+	mutex_unlock(&c->data_progress_lock);
+}
+
+/*
+ * One in-flight extent move: a read followed by a data update (write).
+ * Allocated in bch2_move_extent() with room for the trailing bio_vecs and
+ * freed by move_free().
+ */
+struct moving_io {
+	/* entry on moving_context.reads, from read submission to write issue */
+	struct list_head		read_list;
+	/* entry on moving_context.ios, protected by moving_context.lock */
+	struct list_head		io_list;
+	/* optional in-flight bucket; its count is dropped in move_free() */
+	struct move_bucket_in_flight	*b;
+	struct closure			cl;
+	/* set by move_read_endio() once the read has finished */
+	bool				read_completed;
+
+	/* sector counts charged to the context's in-flight accounting */
+	unsigned			read_sectors;
+	unsigned			write_sectors;
+
+	struct bch_read_bio		rbio;
+
+	struct data_update		write;
+	/*
+	 * Must be last since it is variable size; used as the bio_vec table
+	 * for both the read bio and the write bio.
+	 */
+	struct bio_vec			bi_inline_vecs[];
+};
+
+/*
+ * Tear down a moving_io: drop its count on the in-flight bucket (if one was
+ * attached), release the data update, unlink it from the context's io list,
+ * wake anyone sleeping on ctxt->wait, and free it.
+ */
+static void move_free(struct moving_io *io)
+{
+	struct moving_context *ctxt = io->write.ctxt;
+
+	if (io->b)
+		atomic_dec(&io->b->count);
+
+	bch2_data_update_exit(&io->write);
+
+	mutex_lock(&ctxt->lock);
+	/*
+	 * io_list is set up with INIT_LIST_HEAD() in bch2_move_extent(), and
+	 * this can run before the io was ever added to ctxt->ios (the
+	 * unwritten-extent path there).
+	 */
+	list_del(&io->io_list);
+	wake_up(&ctxt->wait);
+	mutex_unlock(&ctxt->lock);
+
+	kfree(io);
+}
+
+/*
+ * Write completion for a moving_io: note a write error on the owning
+ * context, undo the in-flight write accounting taken in move_write(), free
+ * the io and drop the closure ref taken in move_write().
+ */
+static void move_write_done(struct bch_write_op *op)
+{
+	struct moving_io *io = container_of(op, struct moving_io, write.op);
+	struct moving_context *ctxt = io->write.ctxt;
+
+	if (op->error)
+		ctxt->write_error = true;
+
+	atomic_sub(io->write_sectors, &ctxt->write_sectors);
+	atomic_dec(&ctxt->write_ios);
+
+	/* @io is freed here; only the saved @ctxt may be touched afterwards */
+	move_free(io);
+	closure_put(&ctxt->cl);
+}
+
+/*
+ * Issue the write half of a move once its read has completed. If the read
+ * failed or hit a hole there is nothing to write and the io is freed;
+ * otherwise take a closure ref and charge the write to the context's
+ * in-flight counters (both undone in move_write_done()) before handing the
+ * data to bch2_data_update_read_done().
+ */
+static void move_write(struct moving_io *io)
+{
+	struct moving_context *ctxt = io->write.ctxt;
+	bool nothing_to_write = io->rbio.bio.bi_status || io->rbio.hole;
+
+	if (unlikely(nothing_to_write)) {
+		move_free(io);
+		return;
+	}
+
+	closure_get(&ctxt->cl);
+	atomic_add(io->write_sectors, &ctxt->write_sectors);
+	atomic_inc(&ctxt->write_ios);
+
+	bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
+}
+
+/*
+ * Return the io at the head of ctxt->reads if its read has completed,
+ * otherwise NULL (also when the list is empty). Only the head is examined.
+ */
+struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
+{
+	struct moving_io *head =
+		list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);
+
+	if (!head || !head->read_completed)
+		return NULL;
+
+	return head;
+}
+
+/*
+ * Read completion: remove the read from the context's in-flight read
+ * accounting, mark the io as ready for its write, wake waiters on
+ * ctxt->wait, and drop the closure ref taken before bch2_read_extent() in
+ * bch2_move_extent().
+ */
+static void move_read_endio(struct bio *bio)
+{
+	struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
+	struct moving_context *ctxt = io->write.ctxt;
+
+	atomic_sub(io->read_sectors, &ctxt->read_sectors);
+	atomic_dec(&ctxt->read_ios);
+	io->read_completed = true;
+
+	wake_up(&ctxt->wait);
+	closure_put(&ctxt->cl);
+}
+
+/*
+ * Issue writes for every io at the head of ctxt->reads whose read has
+ * completed, stopping at the first one still reading. If @trans is non-NULL
+ * it is unlocked first.
+ */
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
+					struct btree_trans *trans)
+{
+	if (trans)
+		bch2_trans_unlock(trans);
+
+	for (;;) {
+		struct moving_io *ready = bch2_moving_ctxt_next_pending_write(ctxt);
+
+		if (!ready)
+			break;
+
+		list_del(&ready->read_list);
+		move_write(ready);
+	}
+}
+
+/*
+ * Wait until either no write sectors are in flight, or the in-flight write
+ * sector count differs from the value sampled on entry. Pending writes are
+ * issued while waiting (see move_ctxt_wait_event()).
+ */
+static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
+				       struct btree_trans *trans)
+{
+	unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
+
+	move_ctxt_wait_event(ctxt, trans,
+		!atomic_read(&ctxt->write_sectors) ||
+		atomic_read(&ctxt->write_sectors) != sectors_pending);
+}
+
+/*
+ * Counterpart to bch2_moving_ctxt_init(): wait for every outstanding read to
+ * be turned into a write and for the context's closure to drain, then
+ * unregister the stats (if any) and the context from the filesystem-wide
+ * lists.
+ */
+void bch2_moving_ctxt_exit(struct moving_context *ctxt)
+{
+	struct bch_fs *c = ctxt->c;
+
+	move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
+	closure_sync(&ctxt->cl);
+
+	/* all in-flight accounting must have been returned by now */
+	EBUG_ON(atomic_read(&ctxt->write_sectors));
+	EBUG_ON(atomic_read(&ctxt->write_ios));
+	EBUG_ON(atomic_read(&ctxt->read_sectors));
+	EBUG_ON(atomic_read(&ctxt->read_ios));
+
+	if (ctxt->stats) {
+		progress_list_del(c, ctxt->stats);
+		trace_move_data(c,
+				atomic64_read(&ctxt->stats->sectors_moved),
+				atomic64_read(&ctxt->stats->keys_moved));
+	}
+
+	mutex_lock(&c->moving_context_lock);
+	list_del(&ctxt->list);
+	mutex_unlock(&c->moving_context_lock);
+}
+
+/*
+ * Initialise @ctxt and register it on c->moving_context_list.
+ *
+ * @rate and @stats may be NULL. When @stats is given it is added to the
+ * data progress list and its data_type is set to BCH_DATA_user.
+ * Must be paired with bch2_moving_ctxt_exit().
+ */
+void bch2_moving_ctxt_init(struct moving_context *ctxt,
+			   struct bch_fs *c,
+			   struct bch_ratelimit *rate,
+			   struct bch_move_stats *stats,
+			   struct write_point_specifier wp,
+			   bool wait_on_copygc)
+{
+	memset(ctxt, 0, sizeof(*ctxt));
+
+	ctxt->c		= c;
+	/* caller's return address, printed with %ps in bch2_moving_ctxt_to_text() */
+	ctxt->fn	= (void *) _RET_IP_;
+	ctxt->rate	= rate;
+	ctxt->stats	= stats;
+	ctxt->wp	= wp;
+	ctxt->wait_on_copygc = wait_on_copygc;
+
+	closure_init_stack(&ctxt->cl);
+
+	mutex_init(&ctxt->lock);
+	INIT_LIST_HEAD(&ctxt->reads);
+	INIT_LIST_HEAD(&ctxt->ios);
+	init_waitqueue_head(&ctxt->wait);
+
+	mutex_lock(&c->moving_context_lock);
+	list_add(&ctxt->list, &c->moving_context_list);
+	mutex_unlock(&c->moving_context_lock);
+
+	if (stats) {
+		progress_list_add(c, stats);
+		stats->data_type = BCH_DATA_user;
+	}
+}
+
+/*
+ * Zero @stats and copy @name into stats->name; scnprintf() bounds the copy
+ * to sizeof(stats->name).
+ */
+void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
+{
+	memset(stats, 0, sizeof(*stats));
+	scnprintf(stats->name, sizeof(stats->name), "%s", name);
+}
+
+/*
+ * Replace @k with a copy that has the pointers selected by
+ * data_opts.kill_ptrs removed, and commit the update.
+ *
+ * data_opts.kill_ptrs is treated as a bitmask of pointer indices within the
+ * key. Returns 0 on success, or the error from making the mutable copy,
+ * relocking, the update or the commit.
+ */
+static int bch2_extent_drop_ptrs(struct btree_trans *trans,
+				 struct btree_iter *iter,
+				 struct bkey_s_c k,
+				 struct data_update_opts data_opts)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_i *n;
+	int ret;
+
+	n = bch2_bkey_make_mut_noupdate(trans, k);
+	ret = PTR_ERR_OR_ZERO(n);
+	if (ret)
+		return ret;
+
+	/*
+	 * Drop one pointer per pass, highest index first, so that dropping a
+	 * pointer does not renumber the ones still to be dropped.
+	 */
+	while (data_opts.kill_ptrs) {
+		unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
+		struct bch_extent_ptr *ptr;
+
+		bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
+		data_opts.kill_ptrs ^= 1U << drop;
+	}
+
+	/*
+	 * If the new extent no longer has any pointers, bch2_extent_normalize()
+	 * will do the appropriate thing with it (turning it into a
+	 * KEY_TYPE_error key, or just a discard if it was a cached extent)
+	 */
+	bch2_extent_normalize(c, bkey_i_to_s(n));
+
+	/*
+	 * Since we're not inserting through an extent iterator
+	 * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+	 * we aren't using the extent overwrite path to delete, we're
+	 * just using the normal key deletion path:
+	 */
+	if (bkey_deleted(&n->k))
+		n->k.size = 0;
+
+	return bch2_trans_relock(trans) ?:
+		bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+		bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+}
+
+/*
+ * Start moving extent @k: allocate a moving_io sized for the extent (or for
+ * its largest uncompressed size, if bigger), set up the data update and
+ * submit the read. The write is issued later, from
+ * bch2_moving_ctxt_do_pending_writes(), once move_read_endio() has marked
+ * the read complete.
+ *
+ * If the normalized @data_opts ask for neither rewritten pointers nor extra
+ * replicas, no IO is started; pointers in kill_ptrs, if any, are dropped via
+ * bch2_extent_drop_ptrs().
+ *
+ * @bucket_in_flight, if non-NULL, has its count raised for the lifetime of
+ * the io (dropped in move_free()).
+ *
+ * Returns 0 if the move was started or there was nothing to do, -ENOMEM if
+ * the io or its pages could not be allocated, or the error from
+ * bch2_data_update_init() / bch2_extent_drop_ptrs().
+ *
+ * NOTE(review): the shared error exit bumps move_extent_alloc_mem_fail and
+ * fires the matching tracepoint for every failure, including non-allocation
+ * errors returned by bch2_data_update_init() - confirm that is intended.
+ */
+static int bch2_move_extent(struct btree_trans *trans,
+			    struct btree_iter *iter,
+			    struct moving_context *ctxt,
+			    struct move_bucket_in_flight *bucket_in_flight,
+			    struct bch_io_opts io_opts,
+			    enum btree_id btree_id,
+			    struct bkey_s_c k,
+			    struct data_update_opts data_opts)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	struct moving_io *io;
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	unsigned sectors = k.k->size, pages;
+	int ret = -ENOMEM;
+
+	trace_move_extent2(c, k);
+
+	bch2_data_update_opts_normalize(k, &data_opts);
+
+	if (!data_opts.rewrite_ptrs &&
+	    !data_opts.extra_replicas) {
+		if (data_opts.kill_ptrs)
+			return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
+		return 0;
+	}
+
+	/*
+	 * Before memory allocations & taking nocow locks in
+	 * bch2_data_update_init():
+	 */
+	bch2_trans_unlock(trans);
+
+	/* write path might have to decompress data: */
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+		sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
+
+	pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
+	io = kzalloc(sizeof(struct moving_io) +
+		     sizeof(struct bio_vec) * pages, GFP_KERNEL);
+	if (!io)
+		goto err;
+
+	/* so that move_free() can list_del() it even if it is never queued */
+	INIT_LIST_HEAD(&io->io_list);
+	io->write.ctxt		= ctxt;
+	io->read_sectors	= k.k->size;
+	io->write_sectors	= k.k->size;
+
+	bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
+	bio_set_prio(&io->write.op.wbio.bio,
+		     IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+
+	if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
+				 GFP_KERNEL))
+		goto err_free;
+
+	io->rbio.c		= c;
+	io->rbio.opts		= io_opts;
+	/* the read bio uses the same bio_vec table as the write bio */
+	bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
+	io->rbio.bio.bi_vcnt = pages;
+	bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+	io->rbio.bio.bi_iter.bi_size = sectors << 9;
+
+	io->rbio.bio.bi_opf		= REQ_OP_READ;
+	io->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(k.k);
+	io->rbio.bio.bi_end_io		= move_read_endio;
+
+	ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp,
+				    io_opts, data_opts, btree_id, k);
+	if (ret && ret != -BCH_ERR_unwritten_extent_update)
+		goto err_free_pages;
+
+	if (ret == -BCH_ERR_unwritten_extent_update) {
+		bch2_update_unwritten_extent(trans, &io->write);
+		move_free(io);
+		return 0;
+	}
+
+	BUG_ON(ret);
+
+	io->write.ctxt = ctxt;
+	io->write.op.end_io = move_write_done;
+
+	if (ctxt->stats) {
+		atomic64_inc(&ctxt->stats->keys_moved);
+		atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
+	}
+
+	if (bucket_in_flight) {
+		io->b = bucket_in_flight;
+		atomic_inc(&io->b->count);
+	}
+
+	this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
+	this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
+	trace_move_extent_read2(c, k);
+
+	mutex_lock(&ctxt->lock);
+	atomic_add(io->read_sectors, &ctxt->read_sectors);
+	atomic_inc(&ctxt->read_ios);
+
+	list_add_tail(&io->read_list, &ctxt->reads);
+	list_add_tail(&io->io_list, &ctxt->ios);
+	mutex_unlock(&ctxt->lock);
+
+	/*
+	 * dropped by move_read_endio() - guards against use after free of
+	 * ctxt when doing wakeup
+	 */
+	closure_get(&ctxt->cl);
+	bch2_read_extent(trans, &io->rbio,
+			 bkey_start_pos(k.k),
+			 btree_id, k, 0,
+			 BCH_READ_NODECODE|
+			 BCH_READ_LAST_FRAGMENT);
+	return 0;
+err_free_pages:
+	bio_free_pages(&io->write.op.wbio.bio);
+err_free:
+	kfree(io);
+err:
+	this_cpu_inc(c->counters[BCH_COUNTER_move_extent_alloc_mem_fail]);
+	trace_move_extent_alloc_mem_fail2(c, k);
+	return ret;
+}
+
+/*
+ * Look up the inode key at exactly @pos in the inodes btree (across all
+ * snapshots) and unpack it into @inode.
+ *
+ * Returns 0 on success, the iterator error if peeking failed,
+ * -BCH_ERR_ENOENT_inode if no key sits at @pos, -EIO if the key there is not
+ * an inode, or the error from bch2_inode_unpack().
+ */
+static int lookup_inode(struct btree_trans *trans, struct bpos pos,
+			struct bch_inode_unpacked *inode)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos,
+			     BTREE_ITER_ALL_SNAPSHOTS);
+
+	k = bch2_btree_iter_peek(&iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto out;
+
+	if (!k.k || !bkey_eq(k.k->p, pos))
+		ret = -BCH_ERR_ENOENT_inode;
+	else if (!bkey_is_inode(k.k))
+		ret = -EIO;
+	else
+		ret = bch2_inode_unpack(k, inode);
+out:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+/*
+ * Throttle the move path before the next extent is processed.
+ *
+ * Optionally waits for copygc to stop running, then sleeps for as long as
+ * the context's rate limiter asks, and finally waits until the in-flight
+ * read/write sector and IO counts are below the configured limits. Btree
+ * locks held by @trans are dropped before sleeping.
+ *
+ * Returns 1 if the current task is a kthread that has been asked to stop,
+ * 0 otherwise.
+ */
+static int move_ratelimit(struct btree_trans *trans,
+			  struct moving_context *ctxt)
+{
+	struct bch_fs *c = trans->c;
+	/* kthread_should_stop() may only be called from a kthread */
+	bool is_kthread = (current->flags & PF_KTHREAD) != 0;
+	u64 delay;
+
+	if (ctxt->wait_on_copygc) {
+		bch2_trans_unlock(trans);
+		wait_event_killable(c->copygc_running_wq,
+				    !c->copygc_running ||
+				    (is_kthread && kthread_should_stop()));
+	}
+
+	do {
+		delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
+
+		if (delay) {
+			bch2_trans_unlock(trans);
+			/* set before the stop check so a wakeup is not missed */
+			set_current_state(TASK_INTERRUPTIBLE);
+		}
+
+		if (is_kthread && kthread_should_stop()) {
+			__set_current_state(TASK_RUNNING);
+			return 1;
+		}
+
+		if (delay)
+			schedule_timeout(delay);
+
+		if (unlikely(freezing(current))) {
+			/* turn all outstanding reads into writes before freezing */
+			move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads));
+			try_to_freeze();
+		}
+	} while (delay);
+
+	/*
+	 * XXX: these limits really ought to be per device, SSDs and hard drives
+	 * will want different limits
+	 */
+	move_ctxt_wait_event(ctxt, trans,
+		atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
+		atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
+		atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
+		atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);
+
+	return 0;
+}
+
+/*
+ * Refresh @io_opts when @k belongs to a different inode than *@cur_inum.
+ *
+ * Looks up the inode at SPOS(0, inode, snapshot) of @k. On success the
+ * inode's options are used; on any failure other than a transaction restart
+ * the filesystem-wide defaults are used instead. *@cur_inum is then updated
+ * so later keys of the same inode skip the lookup.
+ *
+ * Returns a transaction restart error if the lookup restarted, 0 otherwise.
+ */
+static int move_get_io_opts(struct btree_trans *trans,
+			    struct bch_io_opts *io_opts,
+			    struct bkey_s_c k, u64 *cur_inum)
+{
+	struct bch_inode_unpacked inode;
+	int ret;
+
+	if (*cur_inum == k.k->p.inode)
+		return 0;
+
+	ret = lookup_inode(trans,
+			   SPOS(0, k.k->p.inode, k.k->p.snapshot),
+			   &inode);
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		return ret;
+
+	if (!ret)
+		bch2_inode_opts_get(io_opts, trans->c, &inode);
+	else
+		*io_opts = bch2_opts_to_inode_opts(trans->c->opts);
+	*cur_inum = k.k->p.inode;
+	return 0;
+}
+
+/*
+ * Walk the keys of @btree_id starting at @start, stopping at the first key
+ * whose start position is >= @end, and start a move (bch2_move_extent())
+ * for every direct-data extent that @pred accepts.
+ *
+ * The loop ends when move_ratelimit() returns nonzero, when the iterator
+ * runs out of keys or reports a non-restart error, or when @end is reached.
+ * Failures from bch2_move_extent() other than restarts and -ENOMEM are not
+ * propagated (see the XXX below); the key is skipped.
+ *
+ * Returns the value left in @ret by the last iterator peek or io-opts
+ * lookup, 0 in the normal case.
+ *
+ * NOTE(review): after a `continue` taken for a transaction restart, @ret
+ * still holds the restart error; if move_ratelimit() then ends the loop,
+ * that restart error is what gets returned - confirm callers tolerate it.
+ */
+static int __bch2_move_data(struct moving_context *ctxt,
+			    struct bpos start,
+			    struct bpos end,
+			    move_pred_fn pred, void *arg,
+			    enum btree_id btree_id)
+{
+	struct bch_fs *c = ctxt->c;
+	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
+	struct bkey_buf sk;
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct data_update_opts data_opts;
+	u64 cur_inum = U64_MAX;
+	int ret = 0, ret2;
+
+	bch2_bkey_buf_init(&sk);
+
+	if (ctxt->stats) {
+		ctxt->stats->data_type	= BCH_DATA_user;
+		ctxt->stats->btree_id	= btree_id;
+		ctxt->stats->pos	= start;
+	}
+
+	bch2_trans_iter_init(trans, &iter, btree_id, start,
+			     BTREE_ITER_PREFETCH|
+			     BTREE_ITER_ALL_SNAPSHOTS);
+
+	if (ctxt->rate)
+		bch2_ratelimit_reset(ctxt->rate);
+
+	while (!move_ratelimit(trans, ctxt)) {
+		bch2_trans_begin(trans);
+
+		k = bch2_btree_iter_peek(&iter);
+		if (!k.k)
+			break;
+
+		ret = bkey_err(k);
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			continue;
+		if (ret)
+			break;
+
+		if (bkey_ge(bkey_start_pos(k.k), end))
+			break;
+
+		if (ctxt->stats)
+			ctxt->stats->pos = iter.pos;
+
+		if (!bkey_extent_is_direct_data(k.k))
+			goto next_nondata;
+
+		/* only fails with a transaction restart: retry the same key */
+		ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
+		if (ret)
+			continue;
+
+		memset(&data_opts, 0, sizeof(data_opts));
+		if (!pred(c, arg, k, &io_opts, &data_opts))
+			goto next;
+
+		/*
+		 * The iterator gets unlocked by __bch2_read_extent - need to
+		 * save a copy of @k elsewhere:
+		 */
+		bch2_bkey_buf_reassemble(&sk, c, k);
+		k = bkey_i_to_s_c(sk.k);
+
+		ret2 = bch2_move_extent(trans, &iter, ctxt, NULL,
+					io_opts, btree_id, k, data_opts);
+		if (ret2) {
+			if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
+				continue;
+
+			if (ret2 == -ENOMEM) {
+				/* memory allocation failure, wait for some IO to finish */
+				bch2_move_ctxt_wait_for_io(ctxt, trans);
+				continue;
+			}
+
+			/* XXX signal failure */
+			goto next;
+		}
+
+		if (ctxt->rate)
+			bch2_ratelimit_increment(ctxt->rate, k.k->size);
+next:
+		if (ctxt->stats)
+			atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
+next_nondata:
+		bch2_btree_iter_advance(&iter);
+	}
+
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
+	bch2_bkey_buf_exit(&sk, c);
+
+	return ret;
+}
+
+/*
+ * Move user data in the btree range [start_btree_id:start_pos,
+ * end_btree_id:end_pos], as selected by @pred.
+ *
+ * Only the extents and reflink btrees are walked; other btree IDs in the
+ * range, and btrees without a root node, are skipped. @rate and @stats may
+ * be NULL, matching what bch2_moving_ctxt_init() accepts.
+ *
+ * Returns 0, or the first nonzero result from __bch2_move_data(), which
+ * also stops the walk.
+ */
+int bch2_move_data(struct bch_fs *c,
+		   enum btree_id start_btree_id, struct bpos start_pos,
+		   enum btree_id end_btree_id,   struct bpos end_pos,
+		   struct bch_ratelimit *rate,
+		   struct bch_move_stats *stats,
+		   struct write_point_specifier wp,
+		   bool wait_on_copygc,
+		   move_pred_fn pred, void *arg)
+{
+	struct moving_context ctxt;
+	enum btree_id id;
+	int ret = 0;
+
+	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+
+	for (id = start_btree_id;
+	     id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
+	     id++) {
+		/* stats is optional everywhere else in the moving context */
+		if (stats)
+			stats->btree_id = id;
+
+		if (id != BTREE_ID_extents &&
+		    id != BTREE_ID_reflink)
+			continue;
+
+		if (!bch2_btree_id_root(c, id)->b)
+			continue;
+
+		/* start/end positions only bound the first and last btree */
+		ret = __bch2_move_data(&ctxt,
+				       id == start_btree_id ? start_pos : POS_MIN,
+				       id == end_btree_id   ? end_pos   : POS_MAX,
+				       pred, arg, id);
+		if (ret)
+			break;
+	}
+
+	bch2_moving_ctxt_exit(&ctxt);
+
+	return ret;
+}
+
+/*
+ * Move everything that the backpointers of @bucket refer to.
+ *
+ * The bucket's alloc key is read first (its values are only used for the
+ * evacuate_bucket tracepoint), the btree write buffer is flushed, and then
+ * the backpointers are walked with bch2_get_next_backpointer(), to which
+ * @gen is passed:
+ *  - level 0 backpointers: the extent is moved with bch2_move_extent(),
+ *    rewriting every pointer whose device is bucket.inode; extents that
+ *    have a cached pointer on that device are skipped;
+ *  - higher levels: the btree node is rewritten with
+ *    bch2_btree_node_rewrite().
+ *
+ * @_data_opts is the template for extent moves; target and rewrite_ptrs are
+ * overridden per extent.
+ *
+ * Returns 0 once all backpointers were walked, the nonzero value from
+ * move_ratelimit() if that ended the loop, or a negative error. The
+ * evacuate_bucket tracepoint is skipped on the error exits.
+ */
+int __bch2_evacuate_bucket(struct btree_trans *trans,
+			   struct moving_context *ctxt,
+			   struct move_bucket_in_flight *bucket_in_flight,
+			   struct bpos bucket, int gen,
+			   struct data_update_opts _data_opts)
+{
+	struct bch_fs *c = ctxt->c;
+	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
+	struct btree_iter iter;
+	struct bkey_buf sk;
+	struct bch_backpointer bp;
+	struct bch_alloc_v4 a_convert;
+	const struct bch_alloc_v4 *a;
+	struct bkey_s_c k;
+	struct data_update_opts data_opts;
+	unsigned dirty_sectors, bucket_size;
+	u64 fragmentation;
+	u64 cur_inum = U64_MAX;
+	struct bpos bp_pos = POS_MIN;
+	int ret = 0;
+
+	trace_bucket_evacuate(c, &bucket);
+
+	bch2_bkey_buf_init(&sk);
+
+	/*
+	 * We're not run in a context that handles transaction restarts:
+	 */
+	bch2_trans_begin(trans);
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+			     bucket, BTREE_ITER_CACHED);
+	ret = lockrestart_do(trans,
+			bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+	bch2_trans_iter_exit(trans, &iter);
+
+	if (ret) {
+		bch_err_msg(c, ret, "looking up alloc key");
+		goto err;
+	}
+
+	a = bch2_alloc_to_v4(k, &a_convert);
+	dirty_sectors = a->dirty_sectors;
+	bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
+	fragmentation = a->fragmentation_lru;
+
+	ret = bch2_btree_write_buffer_flush(trans);
+	if (ret) {
+		bch_err_msg(c, ret, "flushing btree write buffer");
+		goto err;
+	}
+
+	while (!(ret = move_ratelimit(trans, ctxt))) {
+		bch2_trans_begin(trans);
+
+		ret = bch2_get_next_backpointer(trans, bucket, gen,
+						&bp_pos, &bp,
+						BTREE_ITER_CACHED);
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			continue;
+		if (ret)
+			goto err;
+		if (bkey_eq(bp_pos, POS_MAX))
+			break;
+
+		if (!bp.level) {
+			const struct bch_extent_ptr *ptr;
+			unsigned i = 0;
+
+			k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
+			ret = bkey_err(k);
+			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+				continue;
+			if (ret)
+				goto err;
+			if (!k.k)
+				goto next;
+
+			/* keep a private copy: @k must outlive the iterator */
+			bch2_bkey_buf_reassemble(&sk, c, k);
+			k = bkey_i_to_s_c(sk.k);
+
+			ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
+			if (ret) {
+				bch2_trans_iter_exit(trans, &iter);
+				continue;
+			}
+
+			data_opts = _data_opts;
+			data_opts.target	= io_opts.background_target;
+			data_opts.rewrite_ptrs = 0;
+
+			/* bit i of rewrite_ptrs selects the i'th pointer of @k */
+			bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
+				if (ptr->dev == bucket.inode) {
+					data_opts.rewrite_ptrs |= 1U << i;
+					if (ptr->cached) {
+						bch2_trans_iter_exit(trans, &iter);
+						goto next;
+					}
+				}
+				i++;
+			}
+
+			ret = bch2_move_extent(trans, &iter, ctxt,
+					bucket_in_flight,
+					io_opts, bp.btree_id, k, data_opts);
+			bch2_trans_iter_exit(trans, &iter);
+
+			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+				continue;
+			if (ret == -ENOMEM) {
+				/* memory allocation failure, wait for some IO to finish */
+				bch2_move_ctxt_wait_for_io(ctxt, trans);
+				continue;
+			}
+			if (ret)
+				goto err;
+
+			if (ctxt->rate)
+				bch2_ratelimit_increment(ctxt->rate, k.k->size);
+			if (ctxt->stats)
+				atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
+		} else {
+			struct btree *b;
+
+			b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp);
+			ret = PTR_ERR_OR_ZERO(b);
+			/*
+			 * NOTE(review): this retries the same bp_pos without
+			 * advancing - confirm it cannot spin.
+			 */
+			if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+				continue;
+			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+				continue;
+			if (ret)
+				goto err;
+			if (!b)
+				goto next;
+
+			ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
+			bch2_trans_iter_exit(trans, &iter);
+
+			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+				continue;
+			if (ret)
+				goto err;
+
+			if (ctxt->rate)
+				bch2_ratelimit_increment(ctxt->rate,
+							 c->opts.btree_node_size >> 9);
+			if (ctxt->stats) {
+				atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
+				atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
+			}
+		}
+next:
+		bp_pos = bpos_nosnap_successor(bp_pos);
+	}
+
+	trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
+err:
+	bch2_bkey_buf_exit(&sk, c);
+	return ret;
+}
+
+/*
+ * Convenience wrapper around __bch2_evacuate_bucket(): sets up a btree
+ * transaction and a moving context (with no in-flight bucket tracking),
+ * evacuates @bucket, then waits for outstanding IO in
+ * bch2_moving_ctxt_exit() before returning.
+ *
+ * Returns the result of __bch2_evacuate_bucket().
+ */
+int bch2_evacuate_bucket(struct bch_fs *c,
+			 struct bpos bucket, int gen,
+			 struct data_update_opts data_opts,
+			 struct bch_ratelimit *rate,
+			 struct bch_move_stats *stats,
+			 struct write_point_specifier wp,
+			 bool wait_on_copygc)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct moving_context ctxt;
+	int ret;
+
+	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+	ret = __bch2_evacuate_bucket(trans, &ctxt, NULL, bucket, gen, data_opts);
+	bch2_moving_ctxt_exit(&ctxt);
+	bch2_trans_put(trans);
+
+	return ret;
+}
+
+/*
+ * Predicate for bch2_move_btree(): called with the opaque argument and a
+ * btree node; returning true asks for the node to be rewritten.
+ */
+typedef bool (*move_btree_pred)(struct bch_fs *, void *,
+				struct btree *, struct bch_io_opts *,
+				struct data_update_opts *);
+
+/*
+ * Walk the nodes of btrees @start_btree_id..@end_btree_id (clamped to the
+ * btrees that exist) and rewrite, via bch2_btree_node_rewrite(), every node
+ * for which @pred returns true. Iteration stops past
+ * (@end_btree_id, @end_pos), on a non-restart error, or when the current
+ * kthread is asked to stop.
+ *
+ * @stats is registered on the data progress list for the duration of the
+ * walk and must not be NULL (it is dereferenced unconditionally).
+ *
+ * Returns 0 or the error that ended the walk.
+ *
+ * NOTE(review): @start_pos is not referenced in the body; every btree is
+ * iterated from POS_MIN - confirm that is intended.
+ */
+static int bch2_move_btree(struct bch_fs *c,
+			   enum btree_id start_btree_id, struct bpos start_pos,
+			   enum btree_id end_btree_id,   struct bpos end_pos,
+			   move_btree_pred pred, void *arg,
+			   struct bch_move_stats *stats)
+{
+	bool kthread = (current->flags & PF_KTHREAD) != 0;
+	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct btree *b;
+	enum btree_id id;
+	/* filled in by @pred; not read in this function */
+	struct data_update_opts data_opts;
+	int ret = 0;
+
+	progress_list_add(c, stats);
+
+	stats->data_type = BCH_DATA_btree;
+
+	for (id = start_btree_id;
+	     id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
+	     id++) {
+		stats->btree_id = id;
+
+		if (!bch2_btree_id_root(c, id)->b)
+			continue;
+
+		bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
+					  BTREE_ITER_PREFETCH);
+retry:
+		ret = 0;
+		while (bch2_trans_begin(trans),
+		       (b = bch2_btree_iter_peek_node(&iter)) &&
+		       !(ret = PTR_ERR_OR_ZERO(b))) {
+			if (kthread && kthread_should_stop())
+				break;
+
+			if ((cmp_int(id, end_btree_id) ?:
+			     bpos_cmp(b->key.k.p, end_pos)) > 0)
+				break;
+
+			stats->pos = iter.pos;
+
+			if (!pred(c, arg, b, &io_opts, &data_opts))
+				goto next;
+
+			ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
+			/* on restart, retry the same node without advancing */
+			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+				continue;
+			if (ret)
+				break;
+next:
+			bch2_btree_iter_next_node(&iter);
+		}
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			goto retry;
+
+		bch2_trans_iter_exit(trans, &iter);
+
+		if (kthread && kthread_should_stop())
+			break;
+	}
+
+	bch2_trans_put(trans);
+
+	if (ret)
+		bch_err_fn(c, ret);
+
+	bch2_btree_interior_updates_flush(c);
+
+	progress_list_del(c, stats);
+	return ret;
+}
+
+/*
+ * Move predicate: select keys whose durability is nonzero but below the
+ * wanted replica count (metadata_replicas for btree pointers, the io
+ * options' data_replicas otherwise), and request the missing replicas.
+ * Keys with zero durability are left alone.
+ */
+static bool rereplicate_pred(struct bch_fs *c, void *arg,
+			     struct bkey_s_c k,
+			     struct bch_io_opts *io_opts,
+			     struct data_update_opts *data_opts)
+{
+	unsigned durability = bch2_bkey_durability(c, k);
+	unsigned wanted;
+
+	if (bkey_is_btree_ptr(k.k))
+		wanted = c->opts.metadata_replicas;
+	else
+		wanted = io_opts->data_replicas;
+
+	if (!durability || durability >= wanted)
+		return false;
+
+	data_opts->target		= 0;
+	data_opts->extra_replicas	= wanted - durability;
+	data_opts->btree_insert_flags	= 0;
+	return true;
+}
+
+/*
+ * Move predicate for BCH_DATA_OP_MIGRATE: @arg is the struct bch_ioctl_data
+ * describing the job. Marks every pointer of @k that lives on
+ * op->migrate.dev for rewriting (bit N of rewrite_ptrs = N'th pointer) and
+ * returns true if at least one was marked.
+ */
+static bool migrate_pred(struct bch_fs *c, void *arg,
+			 struct bkey_s_c k,
+			 struct bch_io_opts *io_opts,
+			 struct data_update_opts *data_opts)
+{
+	struct bch_ioctl_data *job = arg;
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *ptr;
+	unsigned idx = 0;
+
+	data_opts->rewrite_ptrs		= 0;
+	data_opts->target		= 0;
+	data_opts->extra_replicas	= 0;
+	data_opts->btree_insert_flags	= 0;
+
+	bkey_for_each_ptr(ptrs, ptr) {
+		unsigned bit = 1U << idx++;
+
+		if (ptr->dev == job->migrate.dev)
+			data_opts->rewrite_ptrs |= bit;
+	}
+
+	return data_opts->rewrite_ptrs != 0;
+}
+
+/* move_btree_pred adapter: apply rereplicate_pred() to the node's own key. */
+static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
+				   struct btree *b,
+				   struct bch_io_opts *io_opts,
+				   struct data_update_opts *data_opts)
+{
+	return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
+}
+
+/* move_btree_pred adapter: apply migrate_pred() to the node's own key. */
+static bool migrate_btree_pred(struct bch_fs *c, void *arg,
+			       struct btree *b,
+			       struct bch_io_opts *io_opts,
+			       struct data_update_opts *data_opts)
+{
+	return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
+}
+
+/*
+ * Check whether packed key format @f can describe values that do not fit
+ * the current unpacked format.
+ *
+ * For each field, returns true if the packed field is wider than the
+ * unpacked one, if it is exactly as wide but has a nonzero offset, or if
+ * offset plus the largest packed value wraps once truncated to the unpacked
+ * width. Returns false if no field does.
+ *
+ * All masks are built without ever shifting a 64-bit value by 64 or by a
+ * negative amount, which would be undefined behaviour.
+ */
+static bool bformat_needs_redo(struct bkey_format *f)
+{
+	unsigned i;
+
+	for (i = 0; i < f->nr_fields; i++) {
+		unsigned f_bits = f->bits_per_field[i];
+		unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+		/* low unpacked_bits bits set; valid for 0..64 */
+		u64 unpacked_mask = unpacked_bits
+			? ~((~0ULL << 1) << (unpacked_bits - 1))
+			: 0;
+		u64 field_offset = le64_to_cpu(f->field_offset[i]);
+		u64 f_mask;
+
+		if (f_bits > unpacked_bits)
+			return true;
+
+		if ((f_bits == unpacked_bits) && field_offset)
+			return true;
+
+		/*
+		 * Largest value a packed field can hold, i.e. 2^f_bits - 1.
+		 * f_bits may be 64 here (same width, zero offset), so this
+		 * must not be written as (1ULL << f_bits) - 1.
+		 */
+		f_mask = f_bits
+			? ~((~0ULL << (f_bits - 1)) << 1)
+			: 0;
+
+		if (((field_offset + f_mask) & unpacked_mask) < field_offset)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * move_btree_pred for BCH_DATA_OP_REWRITE_OLD_NODES: select nodes whose
+ * on-disk version differs from the superblock version, that are flagged as
+ * needing a rewrite, or whose key format fails bformat_needs_redo().
+ */
+static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
+				   struct btree *b,
+				   struct bch_io_opts *io_opts,
+				   struct data_update_opts *data_opts)
+{
+	if (b->version_ondisk == c->sb.version &&
+	    !btree_node_need_rewrite(b) &&
+	    !bformat_needs_redo(&b->format))
+		return false;
+
+	data_opts->target		= 0;
+	data_opts->extra_replicas	= 0;
+	data_opts->btree_insert_flags	= 0;
+	return true;
+}
+
+/*
+ * Rewrite every btree node selected by rewrite_old_nodes_pred(). On success
+ * set the extents_above_btree_updates_done and bformat_overflow_done compat
+ * bits, raise version_min to the current superblock version, and write the
+ * superblock.
+ *
+ * Returns 0 on success or the error from bch2_move_btree(), which is also
+ * logged.
+ */
+int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
+{
+	int ret = bch2_move_btree(c,
+				  0,		POS_MIN,
+				  BTREE_ID_NR,	SPOS_MAX,
+				  rewrite_old_nodes_pred, c, stats);
+
+	if (ret) {
+		bch_err_fn(c, ret);
+		return ret;
+	}
+
+	mutex_lock(&c->sb_lock);
+	c->disk_sb.sb->compat[0] |=
+		cpu_to_le64((1ULL << BCH_COMPAT_extents_above_btree_updates_done) |
+			    (1ULL << BCH_COMPAT_bformat_overflow_done));
+	c->disk_sb.sb->version_min = c->disk_sb.sb->version;
+	bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	return ret;
+}
+
+/*
+ * Run the data job described by @op, reporting progress through @stats.
+ *
+ * REREPLICATE and MIGRATE both: flush journal pins, rewrite btree nodes
+ * selected by the matching btree predicate, run bch2_replicas_gc2(), move
+ * user data selected by the matching extent predicate, and run
+ * bch2_replicas_gc2() again. Every step runs even if an earlier one failed;
+ * `x ?: ret` keeps the most recent nonzero result.
+ * REWRITE_OLD_NODES runs bch2_scan_old_btree_nodes().
+ *
+ * Returns 0 on success, -EINVAL for an unknown op or an out-of-range
+ * migrate device, or the last error recorded as described above.
+ */
+int bch2_data_job(struct bch_fs *c,
+		  struct bch_move_stats *stats,
+		  struct bch_ioctl_data op)
+{
+	int ret = 0;
+
+	switch (op.op) {
+	case BCH_DATA_OP_REREPLICATE:
+		bch2_move_stats_init(stats, "rereplicate");
+		stats->data_type = BCH_DATA_journal;
+		ret = bch2_journal_flush_device_pins(&c->journal, -1);
+
+		ret = bch2_move_btree(c,
+				      op.start_btree,	op.start_pos,
+				      op.end_btree,	op.end_pos,
+				      rereplicate_btree_pred, c, stats) ?: ret;
+		ret = bch2_replicas_gc2(c) ?: ret;
+
+		ret = bch2_move_data(c,
+				     op.start_btree,	op.start_pos,
+				     op.end_btree,	op.end_pos,
+				     NULL,
+				     stats,
+				     writepoint_hashed((unsigned long) current),
+				     true,
+				     rereplicate_pred, c) ?: ret;
+		ret = bch2_replicas_gc2(c) ?: ret;
+		break;
+	case BCH_DATA_OP_MIGRATE:
+		if (op.migrate.dev >= c->sb.nr_devices)
+			return -EINVAL;
+
+		bch2_move_stats_init(stats, "migrate");
+		stats->data_type = BCH_DATA_journal;
+		ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
+
+		/* &op is the predicate argument: migrate_pred() reads op.migrate.dev */
+		ret = bch2_move_btree(c,
+				      op.start_btree,	op.start_pos,
+				      op.end_btree,	op.end_pos,
+				      migrate_btree_pred, &op, stats) ?: ret;
+		ret = bch2_replicas_gc2(c) ?: ret;
+
+		ret = bch2_move_data(c,
+				     op.start_btree,	op.start_pos,
+				     op.end_btree,	op.end_pos,
+				     NULL,
+				     stats,
+				     writepoint_hashed((unsigned long) current),
+				     true,
+				     migrate_pred, &op) ?: ret;
+		ret = bch2_replicas_gc2(c) ?: ret;
+		break;
+	case BCH_DATA_OP_REWRITE_OLD_NODES:
+		bch2_move_stats_init(stats, "rewrite_old_nodes");
+		ret = bch2_scan_old_btree_nodes(c, stats);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+/*
+ * Print a description of one moving context to @out: the stats name,
+ * data type, btree and position (when the context has stats), the function
+ * that created it, the in-flight read/write IO and sector counts against
+ * their limits, and every write op currently on ctxt->ios.
+ *
+ * ctxt->stats is optional (see bch2_moving_ctxt_init()), so the stats lines
+ * are skipped when it is NULL rather than dereferencing it.
+ */
+static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
+{
+	struct bch_move_stats *stats = ctxt->stats;
+	struct moving_io *io;
+
+	if (stats) {
+		prt_printf(out, "%s (%ps):", stats->name, ctxt->fn);
+		prt_newline(out);
+
+		prt_printf(out, " data type %s btree_id %s position: ",
+			   bch2_data_types[stats->data_type],
+			   bch2_btree_ids[stats->btree_id]);
+		bch2_bpos_to_text(out, stats->pos);
+		prt_newline(out);
+	} else {
+		prt_printf(out, "(%ps):", ctxt->fn);
+		prt_newline(out);
+	}
+	printbuf_indent_add(out, 2);
+
+	prt_printf(out, "reads: ios %u/%u sectors %u/%u",
+		   atomic_read(&ctxt->read_ios),
+		   c->opts.move_ios_in_flight,
+		   atomic_read(&ctxt->read_sectors),
+		   c->opts.move_bytes_in_flight >> 9);
+	prt_newline(out);
+
+	prt_printf(out, "writes: ios %u/%u sectors %u/%u",
+		   atomic_read(&ctxt->write_ios),
+		   c->opts.move_ios_in_flight,
+		   atomic_read(&ctxt->write_sectors),
+		   c->opts.move_bytes_in_flight >> 9);
+	prt_newline(out);
+
+	printbuf_indent_add(out, 2);
+
+	/* ctxt->lock protects the ios list against move_free() */
+	mutex_lock(&ctxt->lock);
+	list_for_each_entry(io, &ctxt->ios, io_list)
+		bch2_write_op_to_text(out, &io->write.op);
+	mutex_unlock(&ctxt->lock);
+
+	/* undo both indent_add(2) calls above */
+	printbuf_indent_sub(out, 4);
+}
+
+/*
+ * Print every moving context registered on c->moving_context_list to @out,
+ * holding c->moving_context_lock for the whole walk.
+ */
+void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
+{
+	struct moving_context *ctxt;
+
+	mutex_lock(&c->moving_context_lock);
+	list_for_each_entry(ctxt, &c->moving_context_list, list)
+		bch2_moving_ctxt_to_text(out, c, ctxt);
+	mutex_unlock(&c->moving_context_lock);
+}
+
+/*
+ * Initialise the per-filesystem lists (and their mutexes) that track moving
+ * contexts and data-job progress stats.
+ */
+void bch2_fs_move_init(struct bch_fs *c)
+{
+	INIT_LIST_HEAD(&c->moving_context_list);
+	mutex_init(&c->moving_context_lock);
+
+	INIT_LIST_HEAD(&c->data_progress_list);
+	mutex_init(&c->data_progress_lock);
+}
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
new file mode 100644
index 000000000000..cbdd58db8782
--- /dev/null
+++ b/fs/bcachefs/move.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_MOVE_H
+#define _BCACHEFS_MOVE_H
+
+#include "bcachefs_ioctl.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "data_update.h"
+#include "move_types.h"
+
+struct bch_read_bio;
+
+/*
+ * State shared by all the IO of one data-move operation. Set up with
+ * bch2_moving_ctxt_init() and torn down with bch2_moving_ctxt_exit().
+ */
+struct moving_context {
+	struct bch_fs		*c;
+	/* entry on bch_fs.moving_context_list */
+	struct list_head	list;
+	/* return address of the bch2_moving_ctxt_init() caller, for debug output */
+	void			*fn;
+
+	/* optional rate limiter and progress stats; either may be NULL */
+	struct bch_ratelimit	*rate;
+	struct bch_move_stats	*stats;
+	struct write_point_specifier wp;
+	/* if set, move_ratelimit() waits while copygc is running */
+	bool			wait_on_copygc;
+	/* set by move_write_done() when a write op reported an error */
+	bool			write_error;
+
+	/* For waiting on outstanding reads and writes: */
+	struct closure		cl;
+
+	/* held while modifying or walking @ios */
+	struct mutex		lock;
+	/* ios with a read submitted, in submission order, until their write is issued */
+	struct list_head	reads;
+	/* every io of this context, until move_free() */
+	struct list_head	ios;
+
+	/* in flight sectors: */
+	atomic_t		read_sectors;
+	atomic_t		write_sectors;
+	atomic_t		read_ios;
+	atomic_t		write_ios;
+
+	/* woken on read completion and from move_free() */
+	wait_queue_head_t	wait;
+};
+
+/*
+ * Wait on (_ctxt)->wait until @_cond becomes true, issuing pending writes
+ * along the way.
+ *
+ * Each pass first calls bch2_moving_ctxt_do_pending_writes() (which unlocks
+ * @_trans if it is non-NULL), then returns if @_cond holds; otherwise it
+ * sleeps until either @_cond holds or a completed read is at the head of the
+ * reads list, in which case the loop goes round again to issue its write.
+ *
+ * @_cond is evaluated multiple times and must be safe to re-evaluate.
+ */
+#define move_ctxt_wait_event(_ctxt, _trans, _cond)			\
+do {									\
+	bool cond_finished = false;					\
+	bch2_moving_ctxt_do_pending_writes(_ctxt, _trans);		\
+									\
+	if (_cond)							\
+		break;							\
+	__wait_event((_ctxt)->wait,					\
+		     bch2_moving_ctxt_next_pending_write(_ctxt) ||	\
+		     (cond_finished = (_cond)));			\
+	if (cond_finished)						\
+		break;							\
+} while (1)
+
+/*
+ * Predicate for bch2_move_data(): called with the opaque argument, a key and
+ * the io options in effect for it. Returning true asks for the key to be
+ * moved according to the data_update_opts the predicate filled in.
+ */
+typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c,
+			     struct bch_io_opts *, struct data_update_opts *);
+
+void bch2_moving_ctxt_exit(struct moving_context *);
+void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
+			   struct bch_ratelimit *, struct bch_move_stats *,
+			   struct write_point_specifier, bool);
+struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *,
+					struct btree_trans *);
+
+int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
+
+int bch2_move_data(struct bch_fs *,
+		   enum btree_id, struct bpos,
+		   enum btree_id, struct bpos,
+		   struct bch_ratelimit *,
+		   struct bch_move_stats *,
+		   struct write_point_specifier,
+		   bool,
+		   move_pred_fn, void *);
+
+int __bch2_evacuate_bucket(struct btree_trans *,
+			   struct moving_context *,
+			   struct move_bucket_in_flight *,
+			   struct bpos, int,
+			   struct data_update_opts);
+int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int,
+			 struct data_update_opts,
+			 struct bch_ratelimit *,
+			 struct bch_move_stats *,
+			 struct write_point_specifier,
+			 bool);
+int bch2_data_job(struct bch_fs *,
+		  struct bch_move_stats *,
+		  struct bch_ioctl_data);
+
+void bch2_move_stats_init(struct bch_move_stats *stats, char *name);
+void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
+
+void bch2_fs_move_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_MOVE_H */
diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
new file mode 100644
index 000000000000..baf1f8570b3f
--- /dev/null
+++ b/fs/bcachefs/move_types.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_MOVE_TYPES_H
+#define _BCACHEFS_MOVE_TYPES_H
+/* Counters describing one data-move job; copygc samples sectors_moved around each pass. */
+struct bch_move_stats {
+	enum bch_data_type	data_type;
+	enum btree_id		btree_id;
+	struct bpos		pos;
+	struct list_head	list;
+	char			name[32];	/* presumably filled by bch2_move_stats_init() - TODO confirm */
+
+	atomic64_t		keys_moved;
+	atomic64_t		keys_raced;
+	atomic64_t		sectors_moved;	/* read before and after bch2_copygc() to get a delta */
+	atomic64_t		sectors_seen;
+	atomic64_t		sectors_raced;
+};
+/* rhashtable key for in-flight buckets; key_len is sizeof(), so padding after gen is hashed. */
+struct move_bucket_key {
+	struct bpos		bucket;
+	u8			gen;
+};
+/* A copygc candidate: its hash key plus the dirty sector count read from its alloc key. */
+struct move_bucket {
+	struct move_bucket_key	k;
+	unsigned		sectors;
+};
+/* One bucket being evacuated: chained into a FIFO through next, indexed by bucket.k via hash. */
+struct move_bucket_in_flight {
+	struct move_bucket_in_flight *next;
+	struct rhash_head	hash;
+	struct move_bucket	bucket;
+	atomic_t		count;	/* the entry is only reaped once this reads zero */
+};
+
+#endif /* _BCACHEFS_MOVE_TYPES_H */
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
new file mode 100644
index 000000000000..4017120baeee
--- /dev/null
+++ b/fs/bcachefs/movinggc.c
@@ -0,0 +1,414 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Moving/copying garbage collector
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
+#include "buckets.h"
+#include "clock.h"
+#include "errcode.h"
+#include "error.h"
+#include "lru.h"
+#include "move.h"
+#include "movinggc.h"
+#include "trace.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <linux/sched/task.h>
+#include <linux/wait.h>
+/* copygc's in-flight set: a hash table for membership tests plus a FIFO for in-order reaping. */
+struct buckets_in_flight {
+	struct rhashtable		table;
+	struct move_bucket_in_flight	*first;	/* oldest entry, reaped first */
+	struct move_bucket_in_flight	*last;	/* newest entry, where new ones are appended */
+	size_t				nr;	/* number of entries on the FIFO */
+	size_t				sectors;	/* sum of bucket.sectors over the FIFO */
+};
+/* Hash table layout: entries are keyed by their embedded struct move_bucket_key (bucket.k). */
+static const struct rhashtable_params bch_move_bucket_params = {
+	.head_offset	= offsetof(struct move_bucket_in_flight, hash),
+	.key_offset	= offsetof(struct move_bucket_in_flight, bucket.k),
+	.key_len	= sizeof(struct move_bucket_key),
+};
+/* Allocate an entry for @b, hash it and append it to @list's FIFO; returns ERR_PTR on failure. */
+static struct move_bucket_in_flight *
+move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b)
+{
+	struct move_bucket_in_flight *new = kzalloc(sizeof(*new), GFP_KERNEL);
+	int ret;
+
+	if (!new)
+		return ERR_PTR(-ENOMEM);
+
+	new->bucket = b;
+
+	ret = rhashtable_lookup_insert_fast(&list->table, &new->hash,
+					    bch_move_bucket_params);
+	if (ret) {	/* insert refused (the caller special-cases -EEXIST): nothing was linked */
+		kfree(new);
+		return ERR_PTR(ret);
+	}
+
+	if (!list->first)
+		list->first = new;
+	else
+		list->last->next = new;
+
+	list->last = new;
+	list->nr++;
+	list->sectors += b.sectors;
+	return new;
+}
+/* Fill b->k.gen and b->sectors from the alloc key; <0 on error, else 0/1 for "is movable". */
+static int bch2_bucket_is_movable(struct btree_trans *trans,
+				  struct move_bucket *b, u64 time)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bch_alloc_v4 _a;
+	const struct bch_alloc_v4 *a;
+	int ret;
+
+	if (bch2_bucket_is_open(trans->c,
+				b->k.bucket.inode,
+				b->k.bucket.offset))
+		return 0;	/* open buckets count as not movable; @b is left unfilled */
+
+	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
+			       b->k.bucket, BTREE_ITER_CACHED);
+	ret = bkey_err(k);
+	if (ret)
+		return ret;
+
+	a = bch2_alloc_to_v4(k, &_a);
+	b->k.gen	= a->gen;
+	b->sectors	= a->dirty_sectors;
+
+	ret = data_type_movable(a->data_type) &&	/* the && chain leaves ret as 0 or 1 */
+		a->fragmentation_lru &&
+		a->fragmentation_lru <= time;
+
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+/* Reap finished entries from the head of the FIFO; with @flush, wait for each one first. */
+static void move_buckets_wait(struct btree_trans *trans,
+			      struct moving_context *ctxt,
+			      struct buckets_in_flight *list,
+			      bool flush)
+{
+	struct move_bucket_in_flight *i;
+	int ret;
+
+	while ((i = list->first)) {
+		if (flush)
+			move_ctxt_wait_event(ctxt, trans, !atomic_read(&i->count));
+
+		if (atomic_read(&i->count))	/* head still busy: stop, entries are reaped in order */
+			break;
+
+		list->first = i->next;
+		if (!list->first)
+			list->last = NULL;
+
+		list->nr--;
+		list->sectors -= i->bucket.sectors;
+
+		ret = rhashtable_remove_fast(&list->table, &i->hash,
+					     bch_move_bucket_params);
+		BUG_ON(ret);
+		kfree(i);
+	}
+
+	bch2_trans_unlock(trans);
+}
+/* True if an entry with key @k is already in @list's hash table (lookup result used as bool). */
+static bool bucket_in_flight(struct buckets_in_flight *list,
+			     struct move_bucket_key k)
+{
+	return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params);
+}
+
+typedef DARRAY(struct move_bucket) move_buckets;
+/* Collect up to nr_to_get movable, not-yet-in-flight buckets from the fragmentation LRU. */
+static int bch2_copygc_get_buckets(struct btree_trans *trans,
+			struct moving_context *ctxt,
+			struct buckets_in_flight *buckets_in_flight,
+			move_buckets *buckets)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
+	size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
+	int ret;
+
+	move_buckets_wait(trans, ctxt, buckets_in_flight, false);	/* reap finished buckets, no waiting */
+
+	ret = bch2_btree_write_buffer_flush(trans);
+	if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()",
+				 __func__, bch2_err_str(ret)))
+		return ret;
+
+	ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru,
+				  lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0),
+				  lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
+				  0, k, ({
+		struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
+		int ret2 = bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p));
+
+		saw++;
+		if (ret2 > 0 && bucket_in_flight(buckets_in_flight, b.k)) {
+			in_flight++;
+			ret2 = 0;
+		} else if (ret2 > 0) {
+			ret2 = darray_push(buckets, b) ?: buckets->nr >= nr_to_get;	/* 1 ends the scan */
+			if (ret2 >= 0)
+				sectors += b.sectors;
+		} else if (!ret2) {
+			not_movable++;
+		}	/* ret2 < 0: the lookup failed; return it instead of queueing a half-filled bucket */
+		ret2;
+	}));
+
+	pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i",
+		 buckets_in_flight->nr, buckets_in_flight->sectors,
+		 saw, in_flight, not_movable, buckets->nr, sectors, nr_to_get, ret);
+
+	return ret < 0 ? ret : 0;
+}
+/* One copygc pass: pick candidate buckets, track each as in flight and start evacuating it. */
+noinline
+static int bch2_copygc(struct btree_trans *trans,
+		       struct moving_context *ctxt,
+		       struct buckets_in_flight *buckets_in_flight)
+{
+	struct bch_fs *c = trans->c;
+	struct data_update_opts data_opts = {
+		.btree_insert_flags = BCH_WATERMARK_copygc,
+	};
+	move_buckets buckets = { 0 };
+	struct move_bucket_in_flight *f;
+	struct move_bucket *i;
+	u64 moved = atomic64_read(&ctxt->stats->sectors_moved);
+	int ret = 0;
+
+	ret = bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets);
+	if (ret)
+		goto err;
+
+	darray_for_each(buckets, i) {
+		if (unlikely(freezing(current)))
+			break;
+
+		f = move_bucket_in_flight_add(buckets_in_flight, *i);
+		ret = PTR_ERR_OR_ZERO(f);
+		if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */
+			ret = 0;
+			continue;
+		}
+		if (ret == -ENOMEM) /* flush IO, continue later */
+			ret = 0;
+		if (IS_ERR(f)) /* any failed add leaves f an ERR_PTR: stop before it is dereferenced */
+			break;
+
+		ret = __bch2_evacuate_bucket(trans, ctxt, f, f->bucket.k.bucket,
+					     f->bucket.k.gen, data_opts);
+		if (ret)
+			goto err;
+	}
+err:
+	darray_exit(&buckets);
+
+	/* no entries in LRU btree found, or got to end: */
+	if (bch2_err_matches(ret, ENOENT))
+		ret = 0;
+
+	if (ret < 0 && !bch2_err_matches(ret, EROFS))
+		bch_err_msg(c, ret, "from bch2_move_data()");
+
+	moved = atomic64_read(&ctxt->stats->sectors_moved) - moved;	/* sectors moved by this pass */
+	trace_and_count(c, copygc, c, moved, 0, 0, 0);
+	return ret;
+}
+
+/*
+ * Copygc runs when the amount of fragmented data is above some arbitrary
+ * threshold:
+ *
+ * The threshold at the limit - when the device is full - is the amount of space
+ * we reserved in bch2_recalc_capacity; we can't have more than that amount of
+ * disk space stranded due to fragmentation and store everything we have
+ * promised to store.
+ *
+ * But we don't want to be running copygc unnecessarily when the device still
+ * has plenty of free space - rather, we want copygc to smoothly run every so
+ * often and continually reduce the amount of fragmented space as the device
+ * fills up. So, we increase the threshold by half the current free space.
+ */
+unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned dev_idx;
+	s64 wait = S64_MAX, fragmented_allowed, fragmented;	/* wait stays S64_MAX with no rw member */
+	unsigned i;
+
+	for_each_rw_member(ca, c, dev_idx) {
+		struct bch_dev_usage usage = bch2_dev_usage_read(ca);
+
+		fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) *
+				       ca->mi.bucket_size) >> 1);	/* half of what is still available */
+		fragmented = 0;
+
+		for (i = 0; i < BCH_DATA_NR; i++)
+			if (data_type_movable(i))
+				fragmented += usage.d[i].fragmented;
+
+		wait = min(wait, max(0LL, fragmented_allowed - fragmented));	/* smallest margin of any device */
+	}
+
+	return wait;	/* NOTE(review): the s64 is converted to unsigned long on return */
+}
+/* Print copygc's wait state: time left, time already waited, and the freshly computed wait. */
+void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
+{
+	prt_printf(out, "Currently waiting for:     ");
+	prt_human_readable_u64(out, max(0LL, c->copygc_wait -
+					atomic64_read(&c->io_clock[WRITE].now)) << 9);	/* << 9: presumably sectors to bytes - TODO confirm */
+	prt_newline(out);
+
+	prt_printf(out, "Currently waiting since:   ");
+	prt_human_readable_u64(out, max(0LL,
+					atomic64_read(&c->io_clock[WRITE].now) -
+					c->copygc_wait_at) << 9);
+	prt_newline(out);
+
+	prt_printf(out, "Currently calculated wait: ");
+	prt_human_readable_u64(out, bch2_copygc_wait_amount(c));	/* NOTE(review): printed without the << 9 used above */
+	prt_newline(out);
+}
+/* Main loop of the copygc kthread: wait until enough fragmentation, then run bch2_copygc(). */
+static int bch2_copygc_thread(void *arg)
+{
+	struct bch_fs *c = arg;
+	struct btree_trans *trans;
+	struct moving_context ctxt;
+	struct bch_move_stats move_stats;
+	struct io_clock *clock = &c->io_clock[WRITE];
+	struct buckets_in_flight buckets;
+	u64 last, wait;
+	int ret = 0;
+
+	memset(&buckets, 0, sizeof(buckets));
+
+	ret = rhashtable_init(&buckets.table, &bch_move_bucket_params);
+	if (ret) {
+		bch_err_msg(c, ret, "allocating copygc buckets in flight");
+		return ret;
+	}
+
+	set_freezable();
+	trans = bch2_trans_get(c);
+
+	bch2_move_stats_init(&move_stats, "copygc");
+	bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
+			      writepoint_ptr(&c->copygc_write_point),
+			      false);
+
+	while (!ret && !kthread_should_stop()) {	/* any error from bch2_copygc() ends the loop */
+		bch2_trans_unlock(trans);
+		cond_resched();
+
+		if (!c->copy_gc_enabled) {	/* drain in-flight moves, then sleep until re-enabled */
+			move_buckets_wait(trans, &ctxt, &buckets, true);
+			kthread_wait_freezable(c->copy_gc_enabled);
+		}
+
+		if (unlikely(freezing(current))) {	/* drain in-flight moves before freezing */
+			move_buckets_wait(trans, &ctxt, &buckets, true);
+			__refrigerator(false);
+			continue;
+		}
+
+		last = atomic64_read(&clock->now);
+		wait = bch2_copygc_wait_amount(c);
+
+		if (wait > clock->max_slop) {	/* not due yet: record the deadline and sleep on the IO clock */
+			c->copygc_wait_at = last;
+			c->copygc_wait = last + wait;
+			move_buckets_wait(trans, &ctxt, &buckets, true);
+			trace_and_count(c, copygc_wait, c, wait, last + wait);
+			bch2_kthread_io_clock_wait(clock, last + wait,
+					MAX_SCHEDULE_TIMEOUT);
+			continue;
+		}
+
+		c->copygc_wait = 0;
+
+		c->copygc_running = true;
+		ret = bch2_copygc(trans, &ctxt, &buckets);
+		c->copygc_running = false;
+
+		wake_up(&c->copygc_running_wq);
+	}
+
+	move_buckets_wait(trans, &ctxt, &buckets, true);	/* flush so every entry is freed before the table goes */
+	rhashtable_destroy(&buckets.table);
+	bch2_trans_put(trans);
+	bch2_moving_ctxt_exit(&ctxt);
+
+	return 0;	/* an error that ended the loop is not reported through the thread's exit code */
+}
+/* Stop the copygc thread if there is one and drop the task reference held in c->copygc_thread. */
+void bch2_copygc_stop(struct bch_fs *c)
+{
+	if (c->copygc_thread) {
+		kthread_stop(c->copygc_thread);
+		put_task_struct(c->copygc_thread);	/* pairs with get_task_struct() in bch2_copygc_start() */
+	}
+	c->copygc_thread = NULL;
+}
+/* Create and wake the copygc thread; a no-op returning 0 if it exists or nochanges is set. */
+int bch2_copygc_start(struct bch_fs *c)
+{
+	struct task_struct *t;
+	int ret;
+
+	if (c->copygc_thread)
+		return 0;
+
+	if (c->opts.nochanges)
+		return 0;
+
+	if (bch2_fs_init_fault("copygc_start"))	/* presumably a fault-injection hook - TODO confirm */
+		return -ENOMEM;
+
+	t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
+	ret = PTR_ERR_OR_ZERO(t);
+	if (ret) {
+		bch_err_msg(c, ret, "creating copygc thread");
+		return ret;
+	}
+
+	get_task_struct(t);	/* reference dropped by put_task_struct() in bch2_copygc_stop() */
+
+	c->copygc_thread = t;
+	wake_up_process(c->copygc_thread);
+
+	return 0;
+}
+/* Initialise the copygc state kept in @c: the "running" wait queue and its flag. */
+void bch2_fs_copygc_init(struct bch_fs *c)
+{
+	init_waitqueue_head(&c->copygc_running_wq);
+	c->copygc_running = false;
+}
diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h
new file mode 100644
index 000000000000..ea181fef5bc9
--- /dev/null
+++ b/fs/bcachefs/movinggc.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_MOVINGGC_H
+#define _BCACHEFS_MOVINGGC_H
+
+unsigned long bch2_copygc_wait_amount(struct bch_fs *);
+void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *);
+
+void bch2_copygc_stop(struct bch_fs *);
+int bch2_copygc_start(struct bch_fs *);
+void bch2_fs_copygc_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_MOVINGGC_H */
diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c
new file mode 100644
index 000000000000..3c21981a4a1c
--- /dev/null
+++ b/fs/bcachefs/nocow_locking.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "nocow_locking.h"
+#include "util.h"
+
+#include <linux/closure.h>
+/* True if @bucket owns a slot in its hash bucket whose lock count is non-zero (lockless peek). */
+bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket)
+{
+	u64 key = bucket_to_u64(bucket);
+	struct nocow_lock_bucket *slots = bucket_nocow_lock(t, key);
+	unsigned idx = 0;
+
+	while (idx < ARRAY_SIZE(slots->b) &&
+	       (slots->b[idx] != key || !atomic_read(&slots->l[idx])))
+		idx++;
+	return idx < ARRAY_SIZE(slots->b);
+}
+/* Sign of @v as -1, 0 or 1. @v is parenthesized but still evaluated up to twice. */
+#define sign(v)		((v) < 0 ? -1 : (v) > 0 ? 1 : 0)
+/* Drop one reference on @bucket's slot (+1 kind if @flags, else -1 kind); BUG() if no slot. */
+void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags)
+{
+	u64 dev_bucket = bucket_to_u64(bucket);
+	struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
+	int lock_val = flags ? 1 : -1;
+	unsigned i;
+
+	for (i = 0; i < ARRAY_SIZE(l->b); i++)
+		if (l->b[i] == dev_bucket) {
+			int v = atomic_sub_return(lock_val, &l->l[i]);
+
+			BUG_ON(v && sign(v) != lock_val);	/* the remaining count must keep the holder's sign */
+			if (!v)
+				closure_wake_up(&l->wait);	/* slot is free again: wake blocked lockers */
+			return;
+		}
+
+	BUG();
+}
+/* Try to add one reference for @dev_bucket in @l: +1 if @flags is set, -1 otherwise. */
+bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l,
+				 u64 dev_bucket, int flags)
+{
+	int v, lock_val = flags ? 1 : -1;
+	unsigned i;
+
+	spin_lock(&l->lock);
+
+	for (i = 0; i < ARRAY_SIZE(l->b); i++)	/* first pass: a slot already labelled with this bucket */
+		if (l->b[i] == dev_bucket)
+			goto got_entry;
+
+	for (i = 0; i < ARRAY_SIZE(l->b); i++)	/* second pass: claim any slot whose count is zero */
+		if (!atomic_read(&l->l[i])) {
+			l->b[i] = dev_bucket;
+			goto take_lock;
+		}
+fail:
+	spin_unlock(&l->lock);
+	return false;
+got_entry:
+	v = atomic_read(&l->l[i]);
+	if (lock_val > 0 ? v < 0 : v > 0)	/* held with the opposite sign: the two kinds exclude each other */
+		goto fail;
+take_lock:
+	v = atomic_read(&l->l[i]);
+	/* Overflow? */
+	if (v && sign(v + lock_val) != sign(v))
+		goto fail;
+
+	atomic_add(lock_val, &l->l[i]);
+	spin_unlock(&l->lock);
+	return true;
+}
+/* Blocking lock: if trylock fails, wait on l->wait until it succeeds and record the wait time. */
+void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
+			      struct nocow_lock_bucket *l,
+			      u64 dev_bucket, int flags)
+{
+	if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) {
+		struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks);	/* @t is embedded in bch_fs */
+		u64 start_time = local_clock();
+
+		__closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags));
+		bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
+	}
+}
+/* Dump the lock table: held slots per hash bucket, with runs of empty hash buckets collapsed. */
+void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t)
+
+{
+	unsigned i, nr_zero = 0;
+	struct nocow_lock_bucket *l;
+
+	for (l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) {
+		unsigned v = 0;
+
+		for (i = 0; i < ARRAY_SIZE(l->l); i++)
+			v |= atomic_read(&l->l[i]);	/* non-zero iff any slot in this hash bucket is held */
+
+		if (!v) {
+			nr_zero++;
+			continue;
+		}
+
+		if (nr_zero)
+			prt_printf(out, "(%u empty entries)\n", nr_zero);
+		nr_zero = 0;
+
+		for (i = 0; i < ARRAY_SIZE(l->l); i++) {
+			int v = atomic_read(&l->l[i]);	/* shadows the outer v on purpose: per-slot count */
+			if (v) {
+				bch2_bpos_to_text(out, u64_to_bucket(l->b[i]));
+				prt_printf(out, ": %s %u ", v < 0 ? "copy" : "update", abs(v));
+			}
+		}
+		prt_newline(out);
+	}
+
+	if (nr_zero)
+		prt_printf(out, "(%u empty entries)\n", nr_zero);
+}
+/* Teardown check only: BUG if any slot in the nocow lock table still has a non-zero count. */
+void bch2_fs_nocow_locking_exit(struct bch_fs *c)
+{
+	struct bucket_nocow_lock_table *t = &c->nocow_locks;
+
+	for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++)
+		for (unsigned j = 0; j < ARRAY_SIZE(l->l); j++)
+			BUG_ON(atomic_read(&l->l[j]));
+}
+/* Initialise the spinlock of every hash bucket in the nocow lock table; always returns 0. */
+int bch2_fs_nocow_locking_init(struct bch_fs *c)
+{
+	struct bucket_nocow_lock_table *t = &c->nocow_locks;
+
+	for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++)
+		spin_lock_init(&l->lock);
+
+	return 0;
+}
diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h
new file mode 100644
index 000000000000..f9d6a426a960
--- /dev/null
+++ b/fs/bcachefs/nocow_locking.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_NOCOW_LOCKING_H
+#define _BCACHEFS_NOCOW_LOCKING_H
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "nocow_locking_types.h"
+
+#include <linux/hash.h>
+/* Map a device-bucket number to its hash bucket in the lock table via hash_64(). */
+static inline struct nocow_lock_bucket *bucket_nocow_lock(struct bucket_nocow_lock_table *t,
+							  u64 dev_bucket)
+{
+	unsigned h = hash_64(dev_bucket, BUCKET_NOCOW_LOCKS_BITS);
+
+	return t->l + (h & (BUCKET_NOCOW_LOCKS - 1));	/* mask keeps the index inside t->l[] */
+}
+
+#define BUCKET_NOCOW_LOCK_UPDATE	(1 << 0)
+
+bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *, struct bpos);
+void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, struct bpos, int);
+bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *, u64, int);
+void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *,
+			      struct nocow_lock_bucket *, u64, int);
+/* Blocking lock of @bucket: resolve its hash bucket, then defer to __bch2_bucket_nocow_lock(). */
+static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
+					  struct bpos bucket, int flags)
+{
+	u64 dev_bucket = bucket_to_u64(bucket);
+	struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
+
+	__bch2_bucket_nocow_lock(t, l, dev_bucket, flags);
+}
+/* Non-blocking lock of @bucket: resolve its hash bucket, then try once; true on success. */
+static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t,
+					  struct bpos bucket, int flags)
+{
+	u64 dev_bucket = bucket_to_u64(bucket);
+	struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
+
+	return __bch2_bucket_nocow_trylock(l, dev_bucket, flags);
+}
+
+void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *);
+
+void bch2_fs_nocow_locking_exit(struct bch_fs *);
+int bch2_fs_nocow_locking_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_NOCOW_LOCKING_H */
diff --git a/fs/bcachefs/nocow_locking_types.h b/fs/bcachefs/nocow_locking_types.h
new file mode 100644
index 000000000000..bd12bf677924
--- /dev/null
+++ b/fs/bcachefs/nocow_locking_types.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_NOCOW_LOCKING_TYPES_H
+#define _BCACHEFS_NOCOW_LOCKING_TYPES_H
+
+#define BUCKET_NOCOW_LOCKS_BITS		10
+#define BUCKET_NOCOW_LOCKS		(1U << BUCKET_NOCOW_LOCKS_BITS)
+/* One hash bucket of the nocow lock table: up to four device buckets can hold slots in it. */
+struct nocow_lock_bucket {
+	struct closure_waitlist		wait;
+	spinlock_t			lock;
+	u64				b[4];	/* device bucket that slot i was last assigned to */
+	atomic_t			l[4];	/* count for slot i: >0 printed as "update", <0 as "copy" */
+} __aligned(SMP_CACHE_BYTES);
+/* The lock table itself: BUCKET_NOCOW_LOCKS hash buckets, indexed by bucket_nocow_lock(). */
+struct bucket_nocow_lock_table {
+	struct nocow_lock_bucket	l[BUCKET_NOCOW_LOCKS];
+};
+
+#endif /* _BCACHEFS_NOCOW_LOCKING_TYPES_H */
+
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
new file mode 100644
index 000000000000..232f50c73a94
--- /dev/null
+++ b/fs/bcachefs/opts.c
@@ -0,0 +1,605 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/kernel.h>
+
+#include "bcachefs.h"
+#include "compress.h"
+#include "disk_groups.h"
+#include "error.h"
+#include "opts.h"
+#include "super-io.h"
+#include "util.h"
+
+#define x(t, n, ...) [n] = #t,
+
+const char * const bch2_iops_measurements[] = {
+	BCH_IOPS_MEASUREMENTS()
+	NULL
+};
+
+const char * const bch2_error_actions[] = {
+	BCH_ERROR_ACTIONS()
+	NULL
+};
+
+const char * const bch2_fsck_fix_opts[] = {
+	BCH_FIX_ERRORS_OPTS()
+	NULL
+};
+
+const char * const bch2_version_upgrade_opts[] = {
+	BCH_VERSION_UPGRADE_OPTS()
+	NULL
+};
+
+const char * const bch2_sb_features[] = {
+	BCH_SB_FEATURES()
+	NULL
+};
+
+const char * const bch2_sb_compat[] = {
+	BCH_SB_COMPAT()
+	NULL
+};
+
+const char * const bch2_btree_ids[] = {
+	BCH_BTREE_IDS()
+	"interior btree node",
+	NULL
+};
+
+const char * const bch2_csum_types[] = {
+	BCH_CSUM_TYPES()
+	NULL
+};
+
+const char * const bch2_csum_opts[] = {
+	BCH_CSUM_OPTS()
+	NULL
+};
+
+const char * const bch2_compression_types[] = {
+	BCH_COMPRESSION_TYPES()
+	NULL
+};
+
+const char * const bch2_compression_opts[] = {
+	BCH_COMPRESSION_OPTS()
+	NULL
+};
+
+const char * const bch2_str_hash_types[] = {
+	BCH_STR_HASH_TYPES()
+	NULL
+};
+
+const char * const bch2_str_hash_opts[] = {
+	BCH_STR_HASH_OPTS()
+	NULL
+};
+
+const char * const bch2_data_types[] = {
+	BCH_DATA_TYPES()
+	NULL
+};
+
+const char * const bch2_member_states[] = {
+	BCH_MEMBER_STATES()
+	NULL
+};
+
+const char * const bch2_jset_entry_types[] = {
+	BCH_JSET_ENTRY_TYPES()
+	NULL
+};
+
+const char * const bch2_fs_usage_types[] = {
+	BCH_FS_USAGE_TYPES()
+	NULL
+};
+
+#undef x
+/* Parse fix_errors: no value means FSCK_FIX_yes, else @val must match bch2_fsck_fix_opts[]. */
+static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res,
+				     struct printbuf *err)
+{
+	if (!val) {
+		*res = FSCK_FIX_yes;
+	} else {
+		int ret = match_string(bch2_fsck_fix_opts, -1, val);	/* -1: table is NULL-terminated */
+
+		if (ret < 0 && err)
+			prt_str(err, "fix_errors: invalid selection");
+		if (ret < 0)
+			return ret;
+		*res = ret;
+	}
+
+	return 0;
+}
+/* Print the fix_errors choice named by @v; @v indexes bch2_fsck_fix_opts[] without a check. */
+static void bch2_opt_fix_errors_to_text(struct printbuf *out,
+					struct bch_fs *c,
+					struct bch_sb *sb,
+					u64 v)
+{
+	prt_str(out, bch2_fsck_fix_opts[v]);
+}
+
+#define bch2_opt_fix_errors (struct bch_opt_fn) {	\
+	.parse = bch2_opt_fix_errors_parse,		\
+	.to_text = bch2_opt_fix_errors_to_text,		\
+}
+
+const char * const bch2_d_types[BCH_DT_MAX] = {
+	[DT_UNKNOWN]	= "unknown",
+	[DT_FIFO]	= "fifo",
+	[DT_CHR]	= "chr",
+	[DT_DIR]	= "dir",
+	[DT_BLK]	= "blk",
+	[DT_REG]	= "reg",
+	[DT_LNK]	= "lnk",
+	[DT_SOCK]	= "sock",
+	[DT_WHT]	= "whiteout",
+	[DT_SUBVOL]	= "subvol",
+};
+/* Placeholder getter; bch2_opts_from_sb() skips options that use it, so calling it is a BUG. */
+u64 BCH2_NO_SB_OPT(const struct bch_sb *sb)
+{
+	BUG();
+}
+/* Placeholder setter; the set_sb helpers return early when they see it, so calling it is a BUG. */
+void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v)
+{
+	BUG();
+}
+/* Copy into @dst every option that is defined in @src; options undefined in @src are kept. */
+void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
+{
+#define x(_name, ...)						\
+	if (opt_defined(src, _name))					\
+		opt_set(*dst, _name, src._name);
+
+	BCH_OPTS()
+#undef x
+}
+/* Whether the option numbered @id is defined in @opts; BUG() on an id outside BCH_OPTS(). */
+bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id)
+{
+	switch (id) {
+#define x(_name, ...)						\
+	case Opt_##_name:						\
+		return opt_defined(*opts, _name);
+	BCH_OPTS()
+#undef x
+	default:
+		BUG();
+	}
+}
+/* Read the value of the option numbered @id from @opts; BUG() on an id outside BCH_OPTS(). */
+u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
+{
+	switch (id) {
+#define x(_name, ...)						\
+	case Opt_##_name:						\
+		return opts->_name;
+	BCH_OPTS()
+#undef x
+	default:
+		BUG();
+	}
+}
+/* Set the option numbered @id in @opts to @v via opt_set(); BUG() on an unknown id. */
+void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
+{
+	switch (id) {
+#define x(_name, ...)						\
+	case Opt_##_name:						\
+		opt_set(*opts, _name, v);				\
+		break;
+	BCH_OPTS()
+#undef x
+	default:
+		BUG();
+	}
+}
+
+const struct bch_option bch2_opt_table[] = {
+#define OPT_BOOL()		.type = BCH_OPT_BOOL, .min = 0, .max = 2
+#define OPT_UINT(_min, _max)	.type = BCH_OPT_UINT,			\
+				.min = _min, .max = _max
+#define OPT_STR(_choices)	.type = BCH_OPT_STR,			\
+				.min = 0, .max = ARRAY_SIZE(_choices),	\
+				.choices = _choices
+#define OPT_FN(_fn)		.type = BCH_OPT_FN, .fn	= _fn
+
+#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help)	\
+	[Opt_##_name] = {						\
+		.attr	= {						\
+			.name	= #_name,				\
+			.mode = (_flags) & OPT_RUNTIME ? 0644 : 0444,	\
+		},							\
+		.flags	= _flags,					\
+		.hint	= _hint,					\
+		.help	= _help,					\
+		.get_sb = _sb_opt,					\
+		.set_sb	= SET_##_sb_opt,				\
+		_type							\
+	},
+
+	BCH_OPTS()
+#undef x
+};
+/* Map an option name to its index in bch2_opt_table[]; returns -1 when nothing matches. */
+int bch2_opt_lookup(const char *name)
+{
+	size_t idx;
+
+	for (idx = 0; idx < ARRAY_SIZE(bch2_opt_table); idx++) {
+		if (strcmp(name, bch2_opt_table[idx].attr.name) == 0)
+			return idx;
+	}
+	/* walked the whole table without a match */
+
+	return -1;
+}
+
+struct synonym {
+	const char	*s1, *s2;
+};
+
+static const struct synonym bch_opt_synonyms[] = {
+	{ "quota",	"usrquota" },
+};
+/* Like bch2_opt_lookup(), but first rewrites mount-option synonyms (s1 becomes s2). */
+static int bch2_mount_opt_lookup(const char *name)
+{
+	size_t n;
+
+	for (n = 0; n < ARRAY_SIZE(bch_opt_synonyms); n++) {
+		const struct synonym *syn = &bch_opt_synonyms[n];
+
+		if (strcmp(name, syn->s1) == 0)
+			name = syn->s2;
+	}
+	return bch2_opt_lookup(name);
+}
+/* Check @v against @opt's bounds and flags; 0 if valid, else -ERANGE/-EINVAL (text in @err). */
+int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
+{
+	if (v < opt->min) {
+		if (err)
+			prt_printf(err, "%s: too small (min %llu)",
+			       opt->attr.name, opt->min);
+		return -ERANGE;
+	}
+
+	if (opt->max && v >= opt->max) {	/* max is exclusive; max == 0 disables the upper bound */
+		if (err)
+			prt_printf(err, "%s: too big (max %llu)",
+			       opt->attr.name, opt->max);
+		return -ERANGE;
+	}
+
+	if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) {
+		if (err)
+			prt_printf(err, "%s: not a multiple of 512",
+			       opt->attr.name);
+		return -EINVAL;
+	}
+
+	if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) {
+		if (err)
+			prt_printf(err, "%s: must be a power of two",
+			       opt->attr.name);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+/* Parse @val for @opt into *@res, then range-check it; @err (optional) receives the message. */
+int bch2_opt_parse(struct bch_fs *c,
+		   const struct bch_option *opt,
+		   const char *val, u64 *res,
+		   struct printbuf *err)
+{
+	ssize_t ret;
+
+	switch (opt->type) {
+	case BCH_OPT_BOOL:
+		if (val) {
+			ret = kstrtou64(val, 10, res);
+		} else {
+			ret = 0;	/* a bare boolean option means "on" */
+			*res = 1;
+		}
+
+		if (ret < 0 || (*res != 0 && *res != 1)) {
+			if (err)
+				prt_printf(err, "%s: must be bool", opt->attr.name);
+			return ret < 0 ? ret : -EINVAL;	/* parsed but not 0/1: must not report success */
+		}
+		break;
+	case BCH_OPT_UINT:
+		if (!val) {
+			if (err)	/* @err may be NULL, as on every other error path here */
+				prt_printf(err, "%s: required value", opt->attr.name);
+			return -EINVAL;
+		}
+
+		ret = opt->flags & OPT_HUMAN_READABLE
+			? bch2_strtou64_h(val, res)
+			: kstrtou64(val, 10, res);
+		if (ret < 0) {
+			if (err)
+				prt_printf(err, "%s: must be a number",
+					   opt->attr.name);
+			return ret;
+		}
+		break;
+	case BCH_OPT_STR:
+		if (!val) {
+			if (err)	/* @err may be NULL, as on every other error path here */
+				prt_printf(err, "%s: required value", opt->attr.name);
+			return -EINVAL;
+		}
+
+		ret = match_string(opt->choices, -1, val);
+		if (ret < 0) {
+			if (err)
+				prt_printf(err, "%s: invalid selection",
+					   opt->attr.name);
+			return ret;
+		}
+
+		*res = ret;	/* the value of a string option is its index in choices[] */
+		break;
+	case BCH_OPT_FN:
+		ret = opt->fn.parse(c, val, res, err);
+		if (ret < 0) {
+			if (err)
+				prt_printf(err, "%s: parse error", opt->attr.name);
+			return ret;
+		}
+		break;
+	}
+
+	return bch2_opt_validate(opt, *res, err);
+}
+/* Print option value @v for @opt; OPT_SHOW_MOUNT_STYLE emits "name=value" or "[no]name". */
+void bch2_opt_to_text(struct printbuf *out,
+		      struct bch_fs *c, struct bch_sb *sb,
+		      const struct bch_option *opt, u64 v,
+		      unsigned flags)
+{
+	if (flags & OPT_SHOW_MOUNT_STYLE) {
+		if (opt->type == BCH_OPT_BOOL) {
+			prt_printf(out, "%s%s",
+			       v ? "" : "no",
+			       opt->attr.name);
+			return;
+		}
+
+		prt_printf(out, "%s=", opt->attr.name);
+	}
+
+	switch (opt->type) {
+	case BCH_OPT_BOOL:
+	case BCH_OPT_UINT:
+		if (opt->flags & OPT_HUMAN_READABLE)
+			prt_human_readable_u64(out, v);
+		else
+			prt_printf(out, "%lli", v);	/* NOTE(review): u64 printed with a signed conversion */
+		break;
+	case BCH_OPT_STR:
+		if (flags & OPT_SHOW_FULL_LIST)
+			prt_string_option(out, opt->choices, v);
+		else
+			prt_str(out, opt->choices[v]);	/* @v is used as an index without a bounds check */
+		break;
+	case BCH_OPT_FN:
+		opt->fn.to_text(out, c, sb, v);
+		break;
+	default:
+		BUG();
+	}
+}
+/* Per-option hook run before a value takes effect; only the cases below do anything. */
+int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
+{
+	int ret = 0;
+
+	switch (id) {
+	case Opt_compression:
+	case Opt_background_compression:
+		ret = bch2_check_set_has_compressed_data(c, v);
+		break;
+	case Opt_erasure_code:
+		if (v)
+			bch2_check_set_feature(c, BCH_FEATURE_ec);	/* return value, if any, is not checked */
+		break;
+	}
+
+	return ret;
+}
+/* Run bch2_opt_check_may_set() on the current value of every option; stops at the first error. */
+int bch2_opts_check_may_set(struct bch_fs *c)
+{
+	unsigned id = 0;
+	int ret = 0;
+
+	while (!ret && id < bch2_opts_nr) {
+		u64 cur = bch2_opt_get_by_id(&c->opts, id);
+
+		ret = bch2_opt_check_may_set(c, id, cur);
+		id++;
+	}
+
+	return ret;
+}
+/* Parse a comma-separated mount option string into @opts; 0 on success, negative errno else. */
+int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
+			  char *options)
+{
+	char *copied_opts, *copied_opts_start;
+	char *opt, *name, *val;
+	int ret, id;
+	struct printbuf err = PRINTBUF;
+	u64 v;
+
+	if (!options)
+		return 0;
+
+	/*
+	 * sys_fsconfig() is now occasionally providing us with option lists
+	 * starting with a comma - weird.
+	 */
+	if (*options == ',')
+		options++;
+
+	copied_opts = kstrdup(options, GFP_KERNEL);	/* strsep() below modifies its input */
+	if (!copied_opts)
+		return -ENOMEM;
+	copied_opts_start = copied_opts;
+
+	while ((opt = strsep(&copied_opts, ",")) != NULL) {
+		name	= strsep(&opt, "=");
+		val	= opt;
+
+		id = bch2_mount_opt_lookup(name);
+
+		/* Check for the form "noopt", negation of a boolean opt: */
+		if (id < 0 &&
+		    !val &&
+		    !strncmp("no", name, 2)) {
+			id = bch2_mount_opt_lookup(name + 2);
+			val = "0";
+		}
+
+		/* Unknown options are ignored: */
+		if (id < 0)
+			continue;
+
+		if (!(bch2_opt_table[id].flags & OPT_MOUNT))
+			goto bad_opt;
+
+		if (id == Opt_acl &&
+		    !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
+			goto bad_opt;
+
+		if ((id == Opt_usrquota ||
+		     id == Opt_grpquota) &&
+		    !IS_ENABLED(CONFIG_BCACHEFS_QUOTA))
+			goto bad_opt;
+
+		ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err);
+		if (ret < 0)
+			goto bad_val;
+
+		bch2_opt_set_by_id(opts, id, v);
+	}
+
+	ret = 0;
+	goto out;
+
+bad_opt:
+	pr_err("Bad mount option %s", name);
+	ret = -EINVAL;	/* a bare -1 would be reported to the caller as -EPERM */
+	goto out;
+bad_val:
+	pr_err("Invalid mount option %s", err.buf);
+	/* ret keeps the negative errno returned by bch2_opt_parse() */
+	goto out;
+out:
+	kfree(copied_opts_start);
+	printbuf_exit(&err);
+	return ret;
+}
+/* Read option @id from the superblock and undo its on-disk encoding (ilog2 and/or sectors). */
+u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id)
+{
+	const struct bch_option *opt = bch2_opt_table + id;
+	u64 v;
+
+	v = opt->get_sb(sb);
+
+	if (opt->flags & OPT_SB_FIELD_ILOG2)
+		v = 1ULL << v;	/* field holds a log2 */
+
+	if (opt->flags & OPT_SB_FIELD_SECTORS)
+		v <<= 9;	/* field is in 512-byte units */
+
+	return v;
+}
+
+/*
+ * Initial options from superblock - here we don't want any options undefined,
+ * any options the superblock doesn't specify are set to 0:
+ */
+int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
+{
+	unsigned id;
+
+	for (id = 0; id < bch2_opts_nr; id++) {
+		const struct bch_option *opt = bch2_opt_table + id;
+
+		if (opt->get_sb == BCH2_NO_SB_OPT)	/* option has no superblock field: leave it alone */
+			continue;
+
+		bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id));
+	}
+
+	return 0;	/* no failure path; always 0 */
+}
+/* Encode @v for @opt (sectors and/or ilog2) and store it in @sb; no-op without an sb field. */
+void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v)
+{
+	if (opt->set_sb == SET_BCH2_NO_SB_OPT)
+		return;
+
+	if (opt->flags & OPT_SB_FIELD_SECTORS)
+		v >>= 9;	/* inverse of the << 9 in bch2_opt_from_sb() */
+
+	if (opt->flags & OPT_SB_FIELD_ILOG2)
+		v = ilog2(v);	/* inverse of the 1ULL << v in bch2_opt_from_sb() */
+
+	opt->set_sb(sb, v);
+}
+/* Store @v for @opt in the filesystem's superblock and write it out, under c->sb_lock. */
+void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v)
+{
+	if (opt->set_sb == SET_BCH2_NO_SB_OPT)	/* checked up front so the lock and write are skipped */
+		return;
+
+	mutex_lock(&c->sb_lock);
+	__bch2_opt_set_sb(c->disk_sb.sb, opt, v);
+	bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+}
+
+/* io opts: */
+/* Build a struct bch_io_opts by copying each BCH_INODE_OPTS() field of the same name from @src. */
+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
+{
+	return (struct bch_io_opts) {
+#define x(_name, _bits)	._name = src._name,
+	BCH_INODE_OPTS()
+#undef x
+	};
+}
+/* True if @id is one of the per-inode options enumerated by BCH_INODE_OPTS(). */
+bool bch2_opt_is_inode_opt(enum bch_opt_id id)
+{
+	static const enum bch_opt_id inode_opt_list[] = {
+#define x(_name, _bits)	Opt_##_name,
+	BCH_INODE_OPTS()
+#undef x
+	};
+	const enum bch_opt_id *p = inode_opt_list;
+	const enum bch_opt_id *end = inode_opt_list + ARRAY_SIZE(inode_opt_list);
+
+	while (p < end && *p != id)
+		p++;
+
+	return p < end;
+}
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
new file mode 100644
index 000000000000..55014336c5f7
--- /dev/null
+++ b/fs/bcachefs/opts.h
@@ -0,0 +1,564 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_OPTS_H
+#define _BCACHEFS_OPTS_H
+
+#include <linux/bug.h>
+#include <linux/log2.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include "bcachefs_format.h"
+
+struct bch_fs;
+
+extern const char * const bch2_iops_measurements[];
+extern const char * const bch2_error_actions[];
+extern const char * const bch2_fsck_fix_opts[];
+extern const char * const bch2_version_upgrade_opts[];
+extern const char * const bch2_sb_features[];
+extern const char * const bch2_sb_compat[];
+extern const char * const bch2_btree_ids[];
+extern const char * const bch2_csum_types[];
+extern const char * const bch2_csum_opts[];
+extern const char * const bch2_compression_types[];
+extern const char * const bch2_compression_opts[];
+extern const char * const bch2_str_hash_types[];
+extern const char * const bch2_str_hash_opts[];
+extern const char * const bch2_data_types[];
+extern const char * const bch2_member_states[];
+extern const char * const bch2_jset_entry_types[];
+extern const char * const bch2_fs_usage_types[];
+extern const char * const bch2_d_types[];
+
+static inline const char *bch2_d_type_str(unsigned d_type)
+{
+	return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)";
+}
+
+/*
+ * Mount options; we also store defaults in the superblock.
+ *
+ * Also exposed via sysfs: if an option is writeable, and it's also stored in
+ * the superblock, changing it via sysfs (currently? might change this) also
+ * updates the superblock.
+ *
+ * Each option has a separate "defined" bit in struct bch_opts. This means we
+ * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only
+ * apply the options from that struct that are defined.
+ */
+
+/* dummy option, for options that aren't stored in the superblock */
+u64 BCH2_NO_SB_OPT(const struct bch_sb *);
+void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64);
+
+/* When can be set: */
+enum opt_flags {
+	OPT_FS		= (1 << 0),	/* Filesystem option */
+	OPT_DEVICE	= (1 << 1),	/* Device option */
+	OPT_INODE	= (1 << 2),	/* Inode option */
+	OPT_FORMAT	= (1 << 3),	/* May be specified at format time */
+	OPT_MOUNT	= (1 << 4),	/* May be specified at mount time */
+	OPT_RUNTIME	= (1 << 5),	/* May be specified at runtime */
+	OPT_HUMAN_READABLE = (1 << 6),
+	OPT_MUST_BE_POW_2 = (1 << 7),	/* Must be power of 2 */
+	OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */
+	OPT_SB_FIELD_ILOG2 = (1 << 9),	/* Superblock field is ilog2 of actual value */
+};
+
+enum opt_type {
+	BCH_OPT_BOOL,
+	BCH_OPT_UINT,
+	BCH_OPT_STR,
+	BCH_OPT_FN,
+};
+
+struct bch_opt_fn {
+	int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *);
+	void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
+};
+
+/**
+ * x(name, in mem type, mode, type, sb_opt, default, hint, help)
+ *
+ * @name	- name of mount option, sysfs attribute, and struct bch_opts
+ *		  member
+ *
+ * @mode	- when opt may be set (enum opt_flags)
+ *
+ * @sb_opt	- name of corresponding superblock option, or BCH2_NO_SB_OPT
+ *
+ * @type	- one of OPT_BOOL, OPT_UINT, OPT_STR, OPT_FN
+ */
+
+/*
+ * XXX: add fields for
+ *  - default value
+ *  - helptext
+ */
+
+#ifdef __KERNEL__
+#define RATELIMIT_ERRORS_DEFAULT true
+#else
+#define RATELIMIT_ERRORS_DEFAULT false
+#endif
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define BCACHEFS_VERBOSE_DEFAULT	true
+#else
+#define BCACHEFS_VERBOSE_DEFAULT	false
+#endif
+
+#define BCH_FIX_ERRORS_OPTS()		\
+	x(exit,	0)			\
+	x(yes,	1)			\
+	x(no,	2)			\
+	x(ask,	3)
+
+enum fsck_err_opts {
+#define x(t, n)	FSCK_FIX_##t,
+	BCH_FIX_ERRORS_OPTS()
+#undef x
+};
+
+#define BCH_OPTS()							\
+	x(block_size,			u16,				\
+	  OPT_FS|OPT_FORMAT|						\
+	  OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS,	\
+	  OPT_UINT(512, 1U << 16),					\
+	  BCH_SB_BLOCK_SIZE,		8,				\
+	  "size",	NULL)						\
+	x(btree_node_size,		u32,				\
+	  OPT_FS|OPT_FORMAT|						\
+	  OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS,	\
+	  OPT_UINT(512, 1U << 20),					\
+	  BCH_SB_BTREE_NODE_SIZE,	512,				\
+	  "size",	"Btree node size, default 256k")		\
+	x(errors,			u8,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,			\
+	  OPT_STR(bch2_error_actions),					\
+	  BCH_SB_ERROR_ACTION,		BCH_ON_ERROR_ro,		\
+	  NULL,		"Action to take on filesystem error")		\
+	x(metadata_replicas,		u8,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,			\
+	  OPT_UINT(1, BCH_REPLICAS_MAX),				\
+	  BCH_SB_META_REPLICAS_WANT,	1,				\
+	  "#",		"Number of metadata replicas")			\
+	x(data_replicas,		u8,				\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,		\
+	  OPT_UINT(1, BCH_REPLICAS_MAX),				\
+	  BCH_SB_DATA_REPLICAS_WANT,	1,				\
+	  "#",		"Number of data replicas")			\
+	x(metadata_replicas_required, u8,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT,					\
+	  OPT_UINT(1, BCH_REPLICAS_MAX),				\
+	  BCH_SB_META_REPLICAS_REQ,	1,				\
+	  "#",		NULL)						\
+	x(data_replicas_required,	u8,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT,					\
+	  OPT_UINT(1, BCH_REPLICAS_MAX),				\
+	  BCH_SB_DATA_REPLICAS_REQ,	1,				\
+	  "#",		NULL)						\
+	x(encoded_extent_max,		u32,				\
+	  OPT_FS|OPT_FORMAT|						\
+	  OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\
+	  OPT_UINT(4096, 2U << 20),					\
+	  BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10,			\
+	  "size",	"Maximum size of checksummed/compressed extents")\
+	x(metadata_checksum,		u8,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,			\
+	  OPT_STR(bch2_csum_opts),					\
+	  BCH_SB_META_CSUM_TYPE,	BCH_CSUM_OPT_crc32c,		\
+	  NULL,		NULL)						\
+	x(data_checksum,		u8,				\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,		\
+	  OPT_STR(bch2_csum_opts),					\
+	  BCH_SB_DATA_CSUM_TYPE,	BCH_CSUM_OPT_crc32c,		\
+	  NULL,		NULL)						\
+	x(compression,			u8,				\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,		\
+	  OPT_FN(bch2_opt_compression),					\
+	  BCH_SB_COMPRESSION_TYPE,	BCH_COMPRESSION_OPT_none,	\
+	  NULL,		NULL)						\
+	x(background_compression,	u8,				\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,		\
+	  OPT_FN(bch2_opt_compression),					\
+	  BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none,	\
+	  NULL,		NULL)						\
+	x(str_hash,			u8,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,			\
+	  OPT_STR(bch2_str_hash_opts),					\
+	  BCH_SB_STR_HASH_TYPE,		BCH_STR_HASH_OPT_siphash,	\
+	  NULL,		"Hash function for directory entries and xattrs")\
+	x(metadata_target,		u16,				\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,		\
+	  OPT_FN(bch2_opt_target),					\
+	  BCH_SB_METADATA_TARGET,	0,				\
+	  "(target)",	"Device or label for metadata writes")		\
+	x(foreground_target,		u16,				\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,		\
+	  OPT_FN(bch2_opt_target),					\
+	  BCH_SB_FOREGROUND_TARGET,	0,				\
+	  "(target)",	"Device or label for foreground writes")	\
+	x(background_target,		u16,				\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,		\
+	  OPT_FN(bch2_opt_target),					\
+	  BCH_SB_BACKGROUND_TARGET,	0,				\
+	  "(target)",	"Device or label to move data to in the background")\
+	x(promote_target,		u16,				\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,		\
+	  OPT_FN(bch2_opt_target),					\
+	  BCH_SB_PROMOTE_TARGET,	0,				\
+	  "(target)",	"Device or label to promote data to on read")	\
+	x(erasure_code,			u16,				\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,		\
+	  OPT_BOOL(),							\
+	  BCH_SB_ERASURE_CODE,		false,				\
+	  NULL,		"Enable erasure coding (DO NOT USE YET)")	\
+	x(inodes_32bit,			u8,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,			\
+	  OPT_BOOL(),							\
+	  BCH_SB_INODE_32BIT,		true,				\
+	  NULL,		"Constrain inode numbers to 32 bits")		\
+	x(shard_inode_numbers,		u8,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,			\
+	  OPT_BOOL(),							\
+	  BCH_SB_SHARD_INUMS,		true,				\
+	  NULL,		"Shard new inode numbers by CPU id")		\
+	x(inodes_use_key_cache,	u8,					\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT,					\
+	  OPT_BOOL(),							\
+	  BCH_SB_INODES_USE_KEY_CACHE,	true,				\
+	  NULL,		"Use the btree key cache for the inodes btree")	\
+	x(btree_node_mem_ptr_optimization, u8,				\
+	  OPT_FS|OPT_MOUNT|OPT_RUNTIME,					\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,		true,				\
+	  NULL,		"Stash pointer to in memory btree node in btree ptr")\
+	x(btree_write_buffer_size, u32,					\
+	  OPT_FS|OPT_MOUNT,						\
+	  OPT_UINT(16, (1U << 20) - 1),					\
+	  BCH2_NO_SB_OPT,		1U << 13,			\
+	  NULL,		"Number of btree write buffer entries")		\
+	x(gc_reserve_percent,		u8,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,			\
+	  OPT_UINT(5, 21),						\
+	  BCH_SB_GC_RESERVE,		8,				\
+	  "%",		"Percentage of disk space to reserve for copygc")\
+	x(gc_reserve_bytes,		u64,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|			\
+	  OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS,			\
+	  OPT_UINT(0, U64_MAX),						\
+	  BCH_SB_GC_RESERVE_BYTES,	0,				\
+	  "size",	"Amount of disk space to reserve for copygc\n"	\
+			"Takes precedence over gc_reserve_percent if set")\
+	x(root_reserve_percent,		u8,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT,					\
+	  OPT_UINT(0, 100),						\
+	  BCH_SB_ROOT_RESERVE,		0,				\
+	  "%",		"Percentage of disk space to reserve for superuser")\
+	x(wide_macs,			u8,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,			\
+	  OPT_BOOL(),							\
+	  BCH_SB_128_BIT_MACS,		false,				\
+	  NULL,		"Store full 128 bits of cryptographic MACs, instead of 80")\
+	x(inline_data,			u8,				\
+	  OPT_FS|OPT_MOUNT|OPT_RUNTIME,					\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,		true,				\
+	  NULL,		"Enable inline data extents")			\
+	x(acl,				u8,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT,					\
+	  OPT_BOOL(),							\
+	  BCH_SB_POSIX_ACL,		true,				\
+	  NULL,		"Enable POSIX acls")				\
+	x(usrquota,			u8,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT,					\
+	  OPT_BOOL(),							\
+	  BCH_SB_USRQUOTA,		false,				\
+	  NULL,		"Enable user quotas")				\
+	x(grpquota,			u8,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT,					\
+	  OPT_BOOL(),							\
+	  BCH_SB_GRPQUOTA,		false,				\
+	  NULL,		"Enable group quotas")				\
+	x(prjquota,			u8,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT,					\
+	  OPT_BOOL(),							\
+	  BCH_SB_PRJQUOTA,		false,				\
+	  NULL,		"Enable project quotas")			\
+	x(degraded,			u8,				\
+	  OPT_FS|OPT_MOUNT,						\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,		false,				\
+	  NULL,		"Allow mounting in degraded mode")		\
+	x(very_degraded,		u8,				\
+	  OPT_FS|OPT_MOUNT,						\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,		false,				\
+	  NULL,		"Allow mounting when data will be missing")	\
+	x(discard,			u8,				\
+	  OPT_FS|OPT_MOUNT|OPT_DEVICE,					\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,		true,				\
+	  NULL,		"Enable discard/TRIM support")			\
+	x(verbose,			u8,				\
+	  OPT_FS|OPT_MOUNT|OPT_RUNTIME,					\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,		BCACHEFS_VERBOSE_DEFAULT,	\
+	  NULL,		"Extra debugging information during mount/recovery")\
+	x(journal_flush_delay,		u32,				\
+	  OPT_FS|OPT_MOUNT|OPT_RUNTIME,					\
+	  OPT_UINT(1, U32_MAX),						\
+	  BCH_SB_JOURNAL_FLUSH_DELAY,	1000,				\
+	  NULL,		"Delay in milliseconds before automatic journal commits")\
+	x(journal_flush_disabled,	u8,				\
+	  OPT_FS|OPT_MOUNT|OPT_RUNTIME,					\
+	  OPT_BOOL(),							\
+	  BCH_SB_JOURNAL_FLUSH_DISABLED,false,				\
+	  NULL,		"Disable journal flush on sync/fsync\n"		\
+			"If enabled, writes can be lost, but only since the\n"\
+			"last journal write (default 1 second)")	\
+	x(journal_reclaim_delay,	u32,				\
+	  OPT_FS|OPT_MOUNT|OPT_RUNTIME,					\
+	  OPT_UINT(0, U32_MAX),						\
+	  BCH_SB_JOURNAL_RECLAIM_DELAY,	100,				\
+	  NULL,		"Delay in milliseconds before automatic journal reclaim")\
+	x(move_bytes_in_flight,		u32,				\
+	  OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME,		\
+	  OPT_UINT(1024, U32_MAX),					\
+	  BCH2_NO_SB_OPT,		1U << 20,			\
+	  NULL,		"Maximum Amount of IO to keep in flight by the move path")\
+	x(move_ios_in_flight,		u32,				\
+	  OPT_FS|OPT_MOUNT|OPT_RUNTIME,					\
+	  OPT_UINT(1, 1024),						\
+	  BCH2_NO_SB_OPT,		32,				\
+	  NULL,		"Maximum number of IOs to keep in flight by the move path")\
+	x(fsck,				u8,				\
+	  OPT_FS|OPT_MOUNT,						\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,		false,				\
+	  NULL,		"Run fsck on mount")				\
+	x(fix_errors,			u8,				\
+	  OPT_FS|OPT_MOUNT,						\
+	  OPT_FN(bch2_opt_fix_errors),					\
+	  BCH2_NO_SB_OPT,		FSCK_FIX_exit,			\
+	  NULL,		"Fix errors during fsck without asking")	\
+	x(ratelimit_errors,		u8,				\
+	  OPT_FS|OPT_MOUNT,						\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,		RATELIMIT_ERRORS_DEFAULT,	\
+	  NULL,		"Ratelimit error messages during fsck")		\
+	x(nochanges,			u8,				\
+	  OPT_FS|OPT_MOUNT,						\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,		false,				\
+	  NULL,		"Super read only mode - no writes at all will be issued,\n"\
+			"even if we have to replay the journal")	\
+	x(norecovery,			u8,				\
+	  OPT_FS|OPT_MOUNT,						\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,		false,				\
+	  NULL,		"Don't replay the journal")			\
+	x(keep_journal,			u8,				\
+	  0,								\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,		false,				\
+	  NULL,		"Don't free journal entries/keys after startup")\
+	x(read_entire_journal,		u8,				\
+	  0,								\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,		false,				\
+	  NULL,		"Read all journal entries, not just dirty ones")\
+	x(read_journal_only,		u8,				\
+	  0,								\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,		false,				\
+	  NULL,		"Only read the journal, skip the rest of recovery")\
+	x(journal_transaction_names,	u8,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,			\
+	  OPT_BOOL(),							\
+	  BCH_SB_JOURNAL_TRANSACTION_NAMES, true,			\
+	  NULL,		"Log transaction function names in journal")	\
+	x(noexcl,			u8,				\
+	  OPT_FS|OPT_MOUNT,						\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,		false,				\
+	  NULL,		"Don't open device in exclusive mode")		\
+	x(direct_io,			u8,				\
+	  OPT_FS|OPT_MOUNT,						\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,			true,			\
+	  NULL,		"Use O_DIRECT (userspace only)")		\
+	x(sb,				u64,				\
+	  OPT_MOUNT,							\
+	  OPT_UINT(0, S64_MAX),						\
+	  BCH2_NO_SB_OPT,		BCH_SB_SECTOR,			\
+	  "offset",	"Sector offset of superblock")			\
+	x(read_only,			u8,				\
+	  OPT_FS,							\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,		false,				\
+	  NULL,		NULL)						\
+	x(nostart,			u8,				\
+	  0,								\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,		false,				\
+	  NULL,		"Don\'t start filesystem, only open devices")	\
+	x(reconstruct_alloc,		u8,				\
+	  OPT_FS|OPT_MOUNT,						\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,		false,				\
+	  NULL,		"Reconstruct alloc btree")			\
+	x(version_upgrade,		u8,				\
+	  OPT_FS|OPT_MOUNT,						\
+	  OPT_STR(bch2_version_upgrade_opts),				\
+	  BCH_SB_VERSION_UPGRADE,	BCH_VERSION_UPGRADE_compatible,	\
+	  NULL,		"Set superblock to latest version,\n"		\
+			"allowing any new features to be used")		\
+	x(buckets_nouse,		u8,				\
+	  0,								\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,		false,				\
+	  NULL,		"Allocate the buckets_nouse bitmap")		\
+	x(project,			u8,				\
+	  OPT_INODE,							\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,		false,				\
+	  NULL,		NULL)						\
+	x(nocow,			u8,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,		\
+	  OPT_BOOL(),							\
+	  BCH_SB_NOCOW,			false,				\
+	  NULL,		"Nocow mode: Writes will be done in place when possible.\n"\
+			"Snapshots and reflink will still cause writes to be COW\n"\
+			"Implicitly disables data checksumming, compression and encryption")\
+	x(nocow_enabled,		u8,				\
+	  OPT_FS|OPT_MOUNT,						\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,			true,			\
+	  NULL,		"Enable nocow mode: enables runtime locking in\n"\
+			"data move path needed if nocow will ever be in use\n")\
+	x(no_data_io,			u8,				\
+	  OPT_MOUNT,							\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,		false,				\
+	  NULL,		"Skip submit_bio() for data reads and writes, "	\
+			"for performance testing purposes")		\
+	x(fs_size,			u64,				\
+	  OPT_DEVICE,							\
+	  OPT_UINT(0, S64_MAX),						\
+	  BCH2_NO_SB_OPT,		0,				\
+	  "size",	"Size of filesystem on device")			\
+	x(bucket,			u32,				\
+	  OPT_DEVICE,							\
+	  OPT_UINT(0, S64_MAX),						\
+	  BCH2_NO_SB_OPT,		0,				\
+	  "size",	"Bucket size")					\
+	x(durability,			u8,				\
+	  OPT_DEVICE,							\
+	  OPT_UINT(0, BCH_REPLICAS_MAX),				\
+	  BCH2_NO_SB_OPT,		1,				\
+	  "n",		"Data written to this device will be considered\n"\
+			"to have already been replicated n times")
+
+struct bch_opts {
+#define x(_name, _bits, ...)	unsigned _name##_defined:1;
+	BCH_OPTS()
+#undef x
+
+#define x(_name, _bits, ...)	_bits	_name;
+	BCH_OPTS()
+#undef x
+};
+
+static const __maybe_unused struct bch_opts bch2_opts_default = {
+#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...)		\
+	._name##_defined = true,					\
+	._name = _default,						\
+
+	BCH_OPTS()
+#undef x
+};
+
+#define opt_defined(_opts, _name)	((_opts)._name##_defined)
+
+#define opt_get(_opts, _name)						\
+	(opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name)
+
+#define opt_set(_opts, _name, _v)					\
+do {									\
+	(_opts)._name##_defined = true;					\
+	(_opts)._name = _v;						\
+} while (0)
+
+static inline struct bch_opts bch2_opts_empty(void)
+{
+	return (struct bch_opts) { 0 };
+}
+
+void bch2_opts_apply(struct bch_opts *, struct bch_opts);
+
+enum bch_opt_id {
+#define x(_name, ...)	Opt_##_name,
+	BCH_OPTS()
+#undef x
+	bch2_opts_nr
+};
+
+struct bch_fs;
+struct printbuf;
+
+struct bch_option {
+	struct attribute	attr;
+	u64			(*get_sb)(const struct bch_sb *);
+	void			(*set_sb)(struct bch_sb *, u64);
+	enum opt_type		type;
+	enum opt_flags		flags;
+	u64			min, max;
+
+	const char * const *choices;
+
+	struct bch_opt_fn	fn;
+
+	const char		*hint;
+	const char		*help;
+
+};
+
+extern const struct bch_option bch2_opt_table[];
+
+bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
+u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
+void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
+
+u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id);
+int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
+void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64);
+void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64);
+
+int bch2_opt_lookup(const char *);
+int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *);
+int bch2_opt_parse(struct bch_fs *, const struct bch_option *,
+		   const char *, u64 *, struct printbuf *);
+
+#define OPT_SHOW_FULL_LIST	(1 << 0)
+#define OPT_SHOW_MOUNT_STYLE	(1 << 1)
+
+void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *,
+		      const struct bch_option *, u64, unsigned);
+
+int bch2_opt_check_may_set(struct bch_fs *, int, u64);
+int bch2_opts_check_may_set(struct bch_fs *);
+int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, char *);
+
+/* inode opts: */
+
+struct bch_io_opts {
+#define x(_name, _bits)	u##_bits _name;
+	BCH_INODE_OPTS()
+#undef x
+};
+
+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
+bool bch2_opt_is_inode_opt(enum bch_opt_id);
+
+#endif /* _BCACHEFS_OPTS_H */
diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c
new file mode 100644
index 000000000000..de41f9a14492
--- /dev/null
+++ b/fs/bcachefs/printbuf.c
@@ -0,0 +1,425 @@
+// SPDX-License-Identifier: LGPL-2.1+
+/* Copyright (C) 2022 Kent Overstreet */
+
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string_helpers.h>
+
+#include "printbuf.h"
+
+static inline unsigned printbuf_linelen(struct printbuf *buf)
+{
+	return buf->pos - buf->last_newline;	/* bytes emitted since the last newline */
+}
+
+int bch2_printbuf_make_room(struct printbuf *out, unsigned extra)
+{
+	unsigned new_size;
+	char *buf;
+
+	if (!out->heap_allocated)	/* non-heap buffers are never grown, but report success */
+		return 0;
+
+	/* Reserved space for terminating nul: */
+	extra += 1;
+
+	if (out->pos + extra < out->size)	/* already enough room */
+		return 0;
+
+	new_size = roundup_pow_of_two(out->size + extra);
+
+	/*
+	 * Note: output buffer must be freeable with kfree(), it's not required
+	 * that the user use printbuf_exit().
+	 */
+	buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT);
+
+	if (!buf) {
+		out->allocation_failure = true;	/* out->buf and out->size are left as they were */
+		return -ENOMEM;
+	}
+
+	out->buf	= buf;
+	out->size	= new_size;
+	return 0;
+}
+
+void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
+{
+	va_list args2;
+	int len;
+
+	do {
+		va_copy(args2, args);	/* @args must stay usable for a retry */
+		len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2);
+		va_end(args2);		/* every va_copy() needs a matching va_end() */
+	} while (len + 1 >= printbuf_remaining(out) && out->heap_allocated &&
+		 !bch2_printbuf_make_room(out, len + 1)); /* retry only if it can grow */
+
+	len = min_t(size_t, len,	/* advance only past what actually fit */
+		  printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0);
+	out->pos += len;
+}
+
+void bch2_prt_printf(struct printbuf *out, const char *fmt, ...)
+{
+	va_list args;
+	int len;
+
+	do {
+		va_start(args, fmt);
+		len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args);
+		va_end(args);
+	} while (len + 1 >= printbuf_remaining(out) && out->heap_allocated &&
+		 !bch2_printbuf_make_room(out, len + 1)); /* retry only if it can grow */
+
+	len = min_t(size_t, len,	/* advance only past what actually fit */
+		  printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0);
+	out->pos += len;
+}
+
+/**
+ * bch2_printbuf_str() - returns printbuf's buf as a C string, guaranteed to be
+ * null terminated
+ * @buf:	printbuf to read
+ * Returns:	Printbuf contents, as a nul terminated C string
+ */
+const char *bch2_printbuf_str(const struct printbuf *buf)
+{
+	/*
+	 * Nothing written yet means there may be no buffer allocated at all,
+	 * so hand back an empty string; anything written is nul terminated:
+	 */
+	if (!buf->pos)
+		return "";
+
+	return buf->buf;
+}
+
+/**
+ * bch2_printbuf_exit() - exit a printbuf, freeing memory it owns and poisoning it
+ * against accidental use.
+ * @buf:	printbuf to exit
+ */
+void bch2_printbuf_exit(struct printbuf *buf)
+{
+	if (!buf->heap_allocated)
+		return;		/* not heap allocated: nothing to free */
+	kfree(buf->buf);
+	buf->buf = ERR_PTR(-EINTR); /* poison value */
+}
+
+void bch2_printbuf_tabstops_reset(struct printbuf *buf)
+{
+	buf->nr_tabstops = 0;	/* has_indent_or_tabstops is left unchanged */
+}
+
+void bch2_printbuf_tabstop_pop(struct printbuf *buf)
+{
+	if (buf->nr_tabstops)	/* popping with no tabstops set is a no-op */
+		--buf->nr_tabstops;
+}
+
+/*
+ * bch2_printbuf_tabstop_push() - add a tabstop, n spaces from the previous tabstop
+ *
+ * @buf: printbuf to control
+ * @spaces: number of spaces from previous tabstop
+ *
+ * In the future this function may allocate memory if setting more than
+ * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start
+ * of line.
+ */
+int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces)
+{
+	unsigned prev_tabstop = buf->nr_tabstops
+		? buf->_tabstops[buf->nr_tabstops - 1]
+		: 0;
+
+	if (WARN_ON(buf->nr_tabstops >= ARRAY_SIZE(buf->_tabstops)))
+		return -EINVAL;	/* _tabstops[] is full */
+
+	buf->_tabstops[buf->nr_tabstops++] = prev_tabstop + spaces;
+	buf->has_indent_or_tabstops = true;
+	return 0;
+}
+
+/**
+ * bch2_printbuf_indent_add() - add to the current indent level
+ *
+ * @buf: printbuf to control
+ * @spaces: number of spaces to add to the current indent level
+ *
+ * Subsequent lines, and the current line if the output position is at the start
+ * of the current line, will be indented by @spaces more spaces.
+ */
+void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces)
+{
+	if (WARN_ON_ONCE(buf->indent + spaces < buf->indent))	/* indent would wrap */
+		spaces = 0;
+
+	buf->indent += spaces;
+	prt_chars(buf, ' ', spaces);	/* emitted at the current output position */
+
+	buf->has_indent_or_tabstops = true;
+}
+
+/**
+ * bch2_printbuf_indent_sub() - subtract from the current indent level
+ *
+ * @buf: printbuf to control
+ * @spaces: number of spaces to subtract from the current indent level
+ *
+ * Subsequent lines, and the current line if the output position is at the start
+ * of the current line, will be indented by @spaces fewer spaces.
+ */
+void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces)
+{
+	if (WARN_ON_ONCE(spaces > buf->indent))	/* never drop below zero */
+		spaces = buf->indent;
+
+	if (buf->last_newline + buf->indent == buf->pos) {	/* line is exactly indent bytes long */
+		buf->pos -= spaces;	/* take the spaces back out of the output */
+		printbuf_nul_terminate(buf);
+	}
+	buf->indent -= spaces;
+
+	if (!buf->indent && !buf->nr_tabstops)
+		buf->has_indent_or_tabstops = false;
+}
+
+void bch2_prt_newline(struct printbuf *buf)
+{
+	unsigned i;
+
+	bch2_printbuf_make_room(buf, 1 + buf->indent);	/* the newline plus the indent */
+
+	__prt_char(buf, '\n');
+
+	buf->last_newline	= buf->pos;	/* first byte of the new line */
+
+	for (i = 0; i < buf->indent; i++)
+		__prt_char(buf, ' ');
+
+	printbuf_nul_terminate(buf);
+
+	buf->last_field		= buf->pos;
+	buf->cur_tabstop	= 0;	/* tabstops start over on each line */
+}
+
+/*
+ * Returns spaces from start of line, if set, or 0 if unset:
+ */
+static inline unsigned cur_tabstop(struct printbuf *buf)
+{
+	if (buf->cur_tabstop >= buf->nr_tabstops)
+		return 0;
+	return buf->_tabstops[buf->cur_tabstop];
+}
+
+static void __prt_tab(struct printbuf *out)
+{
+	int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out));	/* 0 if already past it */
+
+	prt_chars(out, ' ', spaces);
+
+	out->last_field = out->pos;	/* the next field starts here */
+	out->cur_tabstop++;
+}
+
+/**
+ * bch2_prt_tab() - Advance printbuf to the next tabstop
+ * @out:	printbuf to control
+ *
+ * Advance output to the next tabstop by printing spaces.
+ */
+void bch2_prt_tab(struct printbuf *out)
+{
+	if (WARN_ON(!cur_tabstop(out)))	/* no tabstop to advance to: warn, print nothing */
+		return;
+
+	__prt_tab(out);
+}
+
+static void __prt_tab_rjust(struct printbuf *buf)
+{
+	unsigned move = buf->pos - buf->last_field;	/* length of the field to shift right */
+	int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf);	/* columns short of the tabstop */
+
+	if (pad > 0) {
+		bch2_printbuf_make_room(buf, pad);
+
+		if (buf->last_field + pad < buf->size)	/* shift the field, clipped to the buffer */
+			memmove(buf->buf + buf->last_field + pad,
+				buf->buf + buf->last_field,
+				min(move, buf->size - 1 - buf->last_field - pad));
+
+		if (buf->last_field < buf->size)	/* fill the gap it left with spaces */
+			memset(buf->buf + buf->last_field, ' ',
+			       min((unsigned) pad, buf->size - buf->last_field));
+
+		buf->pos += pad;
+		printbuf_nul_terminate(buf);
+	}
+
+	buf->last_field = buf->pos;	/* the next field starts here */
+	buf->cur_tabstop++;
+}
+
+/**
+ * bch2_prt_tab_rjust - Advance printbuf to the next tabstop, right justifying
+ * previous output
+ *
+ * @buf: printbuf to control
+ *
+ * Advance output to the next tabstop by inserting spaces immediately after the
+ * previous tabstop, right justifying previously outputted text.
+ */
+void bch2_prt_tab_rjust(struct printbuf *buf)
+{
+	if (WARN_ON(!cur_tabstop(buf)))	/* no tabstop to advance to: warn, change nothing */
+		return;
+
+	__prt_tab_rjust(buf);
+}
+
+/**
+ * bch2_prt_bytes_indented() - Print an array of chars, handling embedded control characters
+ *
+ * @out:	output printbuf
+ * @str:	string to print
+ * @count:	number of bytes to print
+ *
+ * The following control characters are handled as follows:
+ *   \n: prt_newline	newline that obeys current indent level
+ *   \t: prt_tab	advance to next tabstop
+ *   \r: prt_tab_rjust	advance to next tabstop, with right justification
+ */
+void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count)
+{
+	const char *unprinted_start = str;	/* start of the run not yet copied out */
+	const char *end = str + count;
+
+	if (!out->has_indent_or_tabstops || out->suppress_indent_tabstop_handling) {
+		prt_bytes(out, str, count);	/* nothing to interpret: copy verbatim */
+		return;
+	}
+
+	while (str != end) {
+		switch (*str) {
+		case '\n':
+			prt_bytes(out, unprinted_start, str - unprinted_start);
+			unprinted_start = str + 1;
+			bch2_prt_newline(out);
+			break;
+		case '\t':
+			if (likely(cur_tabstop(out))) {	/* otherwise the byte stays in the run */
+				prt_bytes(out, unprinted_start, str - unprinted_start);
+				unprinted_start = str + 1;
+				__prt_tab(out);
+			}
+			break;
+		case '\r':
+			if (likely(cur_tabstop(out))) {	/* otherwise the byte stays in the run */
+				prt_bytes(out, unprinted_start, str - unprinted_start);
+				unprinted_start = str + 1;
+				__prt_tab_rjust(out);
+			}
+			break;
+		}
+
+		str++;
+	}
+
+	prt_bytes(out, unprinted_start, str - unprinted_start);	/* flush the final run */
+}
+
+/**
+ * bch2_prt_human_readable_u64() - Print out a u64 in human readable units
+ * @out:	output printbuf
+ * @v:		integer to print
+ *
+ * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
+ */
+void bch2_prt_human_readable_u64(struct printbuf *out, u64 v)
+{
+	bch2_printbuf_make_room(out, 10);
+	out->pos += string_get_size(v, 1, !out->si_units,	/* NOTE(review): relies on the return being the length written - confirm */
+				    out->buf + out->pos,
+				    printbuf_remaining_size(out));
+}
+
+/**
+ * bch2_prt_human_readable_s64() - Print out a s64 in human readable units
+ * @out:	output printbuf
+ * @v:		integer to print
+ *
+ * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
+ */
+void bch2_prt_human_readable_s64(struct printbuf *out, s64 v)
+{
+	if (v < 0)
+		prt_char(out, '-');	/* sign first, then the magnitude */
+	bch2_prt_human_readable_u64(out, abs(v));
+}
+
+/**
+ * bch2_prt_units_u64() - Print out a u64 according to printbuf unit options
+ * @out:	output printbuf
+ * @v:		integer to print
+ *
+ * Units are either raw (default), or human readable units (controlled via
+ * @out->human_readable_units)
+ */
+void bch2_prt_units_u64(struct printbuf *out, u64 v)
+{
+	if (out->human_readable_units)
+		bch2_prt_human_readable_u64(out, v);
+	else
+		bch2_prt_printf(out, "%llu", v);
+}
+
+/**
+ * bch2_prt_units_s64() - Print out a s64 according to printbuf unit options
+ * @out:	output printbuf
+ * @v:		integer to print
+ *
+ * Units are either raw (default), or human readable units (controlled via
+ * @out->human_readable_units)
+ */
+void bch2_prt_units_s64(struct printbuf *out, s64 v)
+{
+	if (v < 0)
+		prt_char(out, '-');	/* sign first, then the magnitude */
+	bch2_prt_units_u64(out, abs(v));
+}
+
+void bch2_prt_string_option(struct printbuf *out,
+			    const char * const list[],
+			    size_t selected)
+{
+	const char * const *opt;
+
+	for (opt = list; *opt; opt++)	/* the selected entry is shown in [brackets] */
+		bch2_prt_printf(out, (size_t) (opt - list) == selected ? "[%s] " : "%s ", *opt);
+}
+
+void bch2_prt_bitflags(struct printbuf *out,
+		       const char * const list[], u64 flags)
+{
+	unsigned bit, nr = 0;
+	bool first = true;
+
+	while (list[nr])	/* list[] is NULL terminated: count its names */
+		nr++;
+
+	while (flags && (bit = __ffs64(flags)) < nr) {	/* stops at the first unnamed bit */
+		if (!first)
+			bch2_prt_printf(out, ",");
+		first = false;
+		bch2_prt_printf(out, "%s", list[bit]);
+		flags ^= 1ULL << bit;	/* 64-bit shift: @bit may be >= 31 */
+	}
+}
diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h
new file mode 100644
index 000000000000..2191423d9f22
--- /dev/null
+++ b/fs/bcachefs/printbuf.h
@@ -0,0 +1,284 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+/* Copyright (C) 2022 Kent Overstreet */
+
+#ifndef _BCACHEFS_PRINTBUF_H
+#define _BCACHEFS_PRINTBUF_H
+
+/*
+ * Printbufs: Simple strings for printing to, with optional heap allocation
+ *
+ * This code has provisions for use in userspace, to aid in making other code
+ * portable between kernelspace and userspace.
+ *
+ * Basic example:
+ *   struct printbuf buf = PRINTBUF;
+ *
+ *   prt_printf(&buf, "foo=");
+ *   foo_to_text(&buf, foo);
+ *   printk("%s", buf.buf);
+ *   printbuf_exit(&buf);
+ *
+ * Or
+ *   struct printbuf buf = PRINTBUF_EXTERN(char_buf, char_buf_size)
+ *
+ * We can now write pretty printers instead of writing code that dumps
+ * everything to the kernel log buffer, and then those pretty-printers can be
+ * used by other code that outputs to kernel log, sysfs, debugfs, etc.
+ *
+ * Memory allocation: Outputting to a printbuf may allocate memory. This
+ * allocation is done with GFP_KERNEL, by default: use the newer
+ * memalloc_*_(save|restore) functions as needed.
+ *
+ * Since no equivalent yet exists for GFP_ATOMIC/GFP_NOWAIT, memory allocations
+ * will be done with GFP_NOWAIT if printbuf->atomic is nonzero.
+ *
+ * It's allowed to grab the output buffer and free it later with kfree() instead
+ * of using printbuf_exit(), if the user just needs a heap allocated string at
+ * the end.
+ *
+ * Memory allocation failures: We don't return errors directly, because on
+ * memory allocation failure we usually don't want to bail out and unwind - we
+ * want to print what we've got, on a best-effort basis. But code that does want
+ * to return -ENOMEM may check printbuf.allocation_failure.
+ *
+ * Indenting, tabstops:
+ *
+ * To aid in writing multi-line pretty printers spread across multiple
+ * functions, printbufs track the current indent level.
+ *
+ * printbuf_indent_add() and printbuf_indent_sub() increase and decrease the current indent
+ * level, respectively.
+ *
+ * To use tabstops, set printbuf->tabstops[]; they are in units of spaces, from
+ * start of line. Once set, prt_tab() will output spaces up to the next tabstop.
+ * prt_tab_rjust() will also advance the current line of text up to the next
+ * tabstop, but it does so by shifting text since the previous tabstop up to the
+ * next tabstop - right justifying it.
+ *
+ * Make sure you use prt_newline() instead of \n in the format string for indent
+ * level and tabstops to work correctly.
+ *
+ * Output units: printbuf->human_readable_units and printbuf->si_units tell
+ * pretty-printers how to output numbers: as a raw value (e.g. directly from a
+ * superblock field), or in human readable units. bch2_prt_units_*() obey them.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+enum printbuf_si {
+	PRINTBUF_UNITS_2,	/* use binary powers of 2^10 */
+	PRINTBUF_UNITS_10,	/* use powers of 10^3 (standard SI) */
+};
+
+#define PRINTBUF_INLINE_TABSTOPS	6
+
+struct printbuf {
+	char			*buf;
+	unsigned		size;
+	unsigned		pos;
+	unsigned		last_newline;
+	unsigned		last_field;
+	unsigned		indent;
+	/*
+	 * If nonzero, allocations will be done with GFP_NOWAIT:
+	 */
+	u8			atomic;
+	bool			allocation_failure:1;
+	bool			heap_allocated:1;
+	enum printbuf_si	si_units:1;
+	bool			human_readable_units:1;
+	bool			has_indent_or_tabstops:1;
+	bool			suppress_indent_tabstop_handling:1;
+	u8			nr_tabstops;
+
+	/*
+	 * Do not modify directly: use printbuf_tabstop_add(),
+	 * printbuf_tabstop_get()
+	 */
+	u8			cur_tabstop;
+	u8			_tabstops[PRINTBUF_INLINE_TABSTOPS];
+};
+
+int bch2_printbuf_make_room(struct printbuf *, unsigned);
+__printf(2, 3) void bch2_prt_printf(struct printbuf *out, const char *fmt, ...);
+__printf(2, 0) void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list);
+const char *bch2_printbuf_str(const struct printbuf *);
+void bch2_printbuf_exit(struct printbuf *);
+
+void bch2_printbuf_tabstops_reset(struct printbuf *);
+void bch2_printbuf_tabstop_pop(struct printbuf *);
+int bch2_printbuf_tabstop_push(struct printbuf *, unsigned);
+
+void bch2_printbuf_indent_add(struct printbuf *, unsigned);
+void bch2_printbuf_indent_sub(struct printbuf *, unsigned);
+
+void bch2_prt_newline(struct printbuf *);
+void bch2_prt_tab(struct printbuf *);
+void bch2_prt_tab_rjust(struct printbuf *);
+
+void bch2_prt_bytes_indented(struct printbuf *, const char *, unsigned);
+void bch2_prt_human_readable_u64(struct printbuf *, u64);
+void bch2_prt_human_readable_s64(struct printbuf *, s64);
+void bch2_prt_units_u64(struct printbuf *, u64);
+void bch2_prt_units_s64(struct printbuf *, s64);
+void bch2_prt_string_option(struct printbuf *, const char * const[], size_t);
+void bch2_prt_bitflags(struct printbuf *, const char * const[], u64);
+
+/* Initializer for a heap allocated printbuf: */
+#define PRINTBUF ((struct printbuf) { .heap_allocated = true })
+
+/* Initializer a printbuf that points to an external buffer: */
+#define PRINTBUF_EXTERN(_buf, _size)			\
+((struct printbuf) {					\
+	.buf	= _buf,					\
+	.size	= _size,				\
+})
+
+/*
+ * Returns size remaining of output buffer:
+ */
+static inline unsigned printbuf_remaining_size(struct printbuf *out)
+{
+	return out->pos < out->size ? out->size - out->pos : 0;
+}
+
+/*
+ * Returns number of characters we can print to the output buffer - i.e.
+ * excluding the terminating nul:
+ */
+static inline unsigned printbuf_remaining(struct printbuf *out)
+{
+	return out->pos < out->size ? out->size - out->pos - 1 : 0;
+}
+
+static inline unsigned printbuf_written(struct printbuf *out)
+{
+	return out->size ? min(out->pos, out->size - 1) : 0;
+}
+
+/*
+ * Returns true if output was truncated:
+ */
+static inline bool printbuf_overflowed(struct printbuf *out)
+{
+	return out->pos >= out->size;
+}
+
+static inline void printbuf_nul_terminate(struct printbuf *out)
+{
+	bch2_printbuf_make_room(out, 1);
+
+	if (out->pos < out->size)
+		out->buf[out->pos] = 0;
+	else if (out->size)
+		out->buf[out->size - 1] = 0;
+}
+
+/* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */
+static inline void __prt_char_reserved(struct printbuf *out, char c)
+{
+	if (printbuf_remaining(out))
+		out->buf[out->pos] = c;
+	out->pos++;
+}
+
+/* Doesn't nul terminate: */
+static inline void __prt_char(struct printbuf *out, char c)
+{
+	bch2_printbuf_make_room(out, 1);
+	__prt_char_reserved(out, c);
+}
+
+static inline void prt_char(struct printbuf *out, char c)
+{
+	__prt_char(out, c);
+	printbuf_nul_terminate(out);
+}
+
+static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n)
+{
+	unsigned i, can_print = min(n, printbuf_remaining(out));
+
+	for (i = 0; i < can_print; i++)
+		out->buf[out->pos++] = c;
+	out->pos += n - can_print;
+}
+
+static inline void prt_chars(struct printbuf *out, char c, unsigned n)
+{
+	bch2_printbuf_make_room(out, n);
+	__prt_chars_reserved(out, c, n);
+	printbuf_nul_terminate(out);
+}
+
+static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n)
+{
+	unsigned i, can_print;
+
+	bch2_printbuf_make_room(out, n);
+
+	can_print = min(n, printbuf_remaining(out));
+
+	for (i = 0; i < can_print; i++)
+		out->buf[out->pos++] = ((char *) b)[i];
+	out->pos += n - can_print;
+
+	printbuf_nul_terminate(out);
+}
+
+static inline void prt_str(struct printbuf *out, const char *str)
+{
+	prt_bytes(out, str, strlen(str));
+}
+
+static inline void prt_str_indented(struct printbuf *out, const char *str)
+{
+	bch2_prt_bytes_indented(out, str, strlen(str));
+}
+
+static inline void prt_hex_byte(struct printbuf *out, u8 byte)
+{
+	bch2_printbuf_make_room(out, 2);
+	__prt_char_reserved(out, hex_asc_hi(byte));
+	__prt_char_reserved(out, hex_asc_lo(byte));
+	printbuf_nul_terminate(out);
+}
+
+static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte)
+{
+	bch2_printbuf_make_room(out, 2);
+	__prt_char_reserved(out, hex_asc_upper_hi(byte));
+	__prt_char_reserved(out, hex_asc_upper_lo(byte));
+	printbuf_nul_terminate(out);
+}
+
+/**
+ * printbuf_reset - re-use a printbuf without freeing and re-initializing it:
+ */
+static inline void printbuf_reset(struct printbuf *buf)
+{
+	/* Rewind the position trackers together so none is left ahead of pos: */
+	buf->pos = buf->last_newline = buf->last_field = 0;
+	buf->allocation_failure	= 0;
+	buf->indent		= 0;
+	buf->nr_tabstops = buf->cur_tabstop = 0;
+}
+
+/**
+ * printbuf_atomic_inc - mark as entering an atomic section
+ */
+static inline void printbuf_atomic_inc(struct printbuf *buf)
+{
+	buf->atomic++;
+}
+
+/**
+ * printbuf_atomic_dec - mark as leaving an atomic section
+ */
+static inline void printbuf_atomic_dec(struct printbuf *buf)
+{
+	buf->atomic--;
+}
+
+#endif /* _BCACHEFS_PRINTBUF_H */
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
new file mode 100644
index 000000000000..cb68ae44d597
--- /dev/null
+++ b/fs/bcachefs/quota.c
@@ -0,0 +1,978 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "errcode.h"
+#include "error.h"
+#include "inode.h"
+#include "quota.h"
+#include "snapshot.h"
+#include "super-io.h"
+
+static const char * const bch2_quota_types[] = {
+	"user",
+	"group",
+	"project",
+};
+
+static const char * const bch2_quota_counters[] = {
+	"space",
+	"inodes",
+};
+
+static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f,
+				  struct printbuf *err)
+{
+	struct bch_sb_field_quota *q = field_to_type(f, quota);
+
+	if (vstruct_bytes(&q->field) < sizeof(*q)) {
+		prt_printf(err, "wrong size (got %zu should be %zu)",
+		       vstruct_bytes(&q->field), sizeof(*q));
+		return -BCH_ERR_invalid_sb_quota;
+	}
+
+	return 0;
+}
+
+static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb,
+				  struct bch_sb_field *f)
+{
+	struct bch_sb_field_quota *q = field_to_type(f, quota);
+	unsigned qtyp, counter;
+
+	for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) {
+		prt_printf(out, "%s: flags %llx",
+		       bch2_quota_types[qtyp],
+		       le64_to_cpu(q->q[qtyp].flags));
+
+		for (counter = 0; counter < Q_COUNTERS; counter++)
+			prt_printf(out, " %s timelimit %u warnlimit %u",
+			       bch2_quota_counters[counter],
+			       le32_to_cpu(q->q[qtyp].c[counter].timelimit),
+			       le32_to_cpu(q->q[qtyp].c[counter].warnlimit));
+
+		prt_newline(out);
+	}
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_quota = {
+	.validate	= bch2_sb_quota_validate,
+	.to_text	= bch2_sb_quota_to_text,
+};
+
+int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k,
+		       enum bkey_invalid_flags flags,
+		       struct printbuf *err)
+{
+	if (k.k->p.inode >= QTYP_NR) {
+		prt_printf(err, "invalid quota type (%llu >= %u)",
+		       k.k->p.inode, QTYP_NR);
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	return 0;
+}
+
+void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,
+			struct bkey_s_c k)
+{
+	struct bkey_s_c_quota dq = bkey_s_c_to_quota(k);
+	unsigned i;
+
+	for (i = 0; i < Q_COUNTERS; i++)	/* one "name hardlimit N softlimit N" per counter */
+		prt_printf(out, "%s%s hardlimit %llu softlimit %llu",
+		       i ? " " : "", bch2_quota_counters[i],	/* space-separate entries */
+		       le64_to_cpu(dq.v->c[i].hardlimit),
+		       le64_to_cpu(dq.v->c[i].softlimit));
+}
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+
+#include <linux/cred.h>
+#include <linux/fs.h>
+#include <linux/quota.h>
+
+static void qc_info_to_text(struct printbuf *out, struct qc_info *i)
+{
+	printbuf_tabstops_reset(out);
+	printbuf_tabstop_push(out, 20);
+
+	prt_str(out, "i_fieldmask");
+	prt_tab(out);
+	prt_printf(out, "%x", i->i_fieldmask);
+	prt_newline(out);
+
+	prt_str(out, "i_flags");
+	prt_tab(out);
+	prt_printf(out, "%u", i->i_flags);
+	prt_newline(out);
+
+	prt_str(out, "i_spc_timelimit");
+	prt_tab(out);
+	prt_printf(out, "%u", i->i_spc_timelimit);
+	prt_newline(out);
+
+	prt_str(out, "i_ino_timelimit");
+	prt_tab(out);
+	prt_printf(out, "%u", i->i_ino_timelimit);
+	prt_newline(out);
+
+	prt_str(out, "i_rt_spc_timelimit");
+	prt_tab(out);
+	prt_printf(out, "%u", i->i_rt_spc_timelimit);
+	prt_newline(out);
+
+	prt_str(out, "i_spc_warnlimit");
+	prt_tab(out);
+	prt_printf(out, "%u", i->i_spc_warnlimit);
+	prt_newline(out);
+
+	prt_str(out, "i_ino_warnlimit");
+	prt_tab(out);
+	prt_printf(out, "%u", i->i_ino_warnlimit);
+	prt_newline(out);
+
+	prt_str(out, "i_rt_spc_warnlimit");
+	prt_tab(out);
+	prt_printf(out, "%u", i->i_rt_spc_warnlimit);
+	prt_newline(out);
+}
+
+static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q)
+{
+	printbuf_tabstops_reset(out);
+	printbuf_tabstop_push(out, 20);
+
+	prt_str(out, "d_fieldmask");
+	prt_tab(out);
+	prt_printf(out, "%x", q->d_fieldmask);
+	prt_newline(out);
+
+	prt_str(out, "d_spc_hardlimit");
+	prt_tab(out);
+	prt_printf(out, "%llu", q->d_spc_hardlimit);
+	prt_newline(out);
+
+	prt_str(out, "d_spc_softlimit");
+	prt_tab(out);
+	prt_printf(out, "%llu", q->d_spc_softlimit);
+	prt_newline(out);
+
+	prt_str(out, "d_ino_hardlimit");
+	prt_tab(out);
+	prt_printf(out, "%llu", q->d_ino_hardlimit);
+	prt_newline(out);
+
+	prt_str(out, "d_ino_softlimit");
+	prt_tab(out);
+	prt_printf(out, "%llu", q->d_ino_softlimit);
+	prt_newline(out);
+
+	prt_str(out, "d_space");
+	prt_tab(out);
+	prt_printf(out, "%llu", q->d_space);
+	prt_newline(out);
+
+	prt_str(out, "d_ino_count");
+	prt_tab(out);
+	prt_printf(out, "%llu", q->d_ino_count);
+	prt_newline(out);
+
+	prt_str(out, "d_ino_timer");
+	prt_tab(out);
+	prt_printf(out, "%llu", q->d_ino_timer);
+	prt_newline(out);
+
+	prt_str(out, "d_spc_timer");
+	prt_tab(out);
+	prt_printf(out, "%llu", q->d_spc_timer);
+	prt_newline(out);
+
+	prt_str(out, "d_ino_warns");
+	prt_tab(out);
+	prt_printf(out, "%i", q->d_ino_warns);
+	prt_newline(out);
+
+	prt_str(out, "d_spc_warns");
+	prt_tab(out);
+	prt_printf(out, "%i", q->d_spc_warns);
+	prt_newline(out);
+}
+
+static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
+{
+	qtypes >>= i;
+	return qtypes ? i + __ffs(qtypes) : QTYP_NR;
+}
+
+#define for_each_set_qtype(_c, _i, _q, _qtypes)				\
+	for (_i = 0;							\
+	     (_i = __next_qtype(_i, _qtypes),				\
+	      _q = &(_c)->quotas[_i],					\
+	      _i < QTYP_NR);						\
+	     _i++)
+
+static bool ignore_hardlimit(struct bch_memquota_type *q)
+{
+	if (capable(CAP_SYS_RESOURCE))
+		return true;
+#if 0
+	struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
+
+	return capable(CAP_SYS_RESOURCE) &&
+	       (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
+		!(info->dqi_flags & DQF_ROOT_SQUASH));
+#endif
+	return false;
+}
+
+enum quota_msg {
+	SOFTWARN,	/* Softlimit reached */
+	SOFTLONGWARN,	/* Grace time expired */
+	HARDWARN,	/* Hardlimit reached */
+
+	HARDBELOW,	/* Usage got below inode hardlimit */
+	SOFTBELOW,	/* Usage got below inode softlimit */
+};
+
+static int quota_nl[][Q_COUNTERS] = {
+	[HARDWARN][Q_SPC]	= QUOTA_NL_BHARDWARN,
+	[SOFTLONGWARN][Q_SPC]	= QUOTA_NL_BSOFTLONGWARN,
+	[SOFTWARN][Q_SPC]	= QUOTA_NL_BSOFTWARN,
+	[HARDBELOW][Q_SPC]	= QUOTA_NL_BHARDBELOW,
+	[SOFTBELOW][Q_SPC]	= QUOTA_NL_BSOFTBELOW,
+
+	[HARDWARN][Q_INO]	= QUOTA_NL_IHARDWARN,
+	[SOFTLONGWARN][Q_INO]	= QUOTA_NL_ISOFTLONGWARN,
+	[SOFTWARN][Q_INO]	= QUOTA_NL_ISOFTWARN,
+	[HARDBELOW][Q_INO]	= QUOTA_NL_IHARDBELOW,
+	[SOFTBELOW][Q_INO]	= QUOTA_NL_ISOFTBELOW,
+};
+
+struct quota_msgs {
+	u8		nr;
+	struct {
+		u8	qtype;
+		u8	msg;
+	}		m[QTYP_NR * Q_COUNTERS];
+};
+
+static void prepare_msg(unsigned qtype,
+			enum quota_counters counter,
+			struct quota_msgs *msgs,
+			enum quota_msg msg_type)
+{
+	BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m));
+
+	msgs->m[msgs->nr].qtype	= qtype;
+	msgs->m[msgs->nr].msg	= quota_nl[msg_type][counter];
+	msgs->nr++;
+}
+
+static void prepare_warning(struct memquota_counter *qc,
+			    unsigned qtype,
+			    enum quota_counters counter,
+			    struct quota_msgs *msgs,
+			    enum quota_msg msg_type)
+{
+	if (qc->warning_issued & (1 << msg_type))
+		return;
+
+	prepare_msg(qtype, counter, msgs, msg_type);
+}
+
+static void flush_warnings(struct bch_qid qid, struct super_block *sb,
+			   struct quota_msgs *msgs)
+{
+	unsigned i;
+
+	for (i = 0; i < msgs->nr; i++)	/* id is picked by the msg's qtype, not its slot */
+		quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype,
+					     qid.q[msgs->m[i].qtype]),
+				   sb->s_dev, msgs->m[i].msg);
+}
+
+static int bch2_quota_check_limit(struct bch_fs *c,
+				  unsigned qtype,
+				  struct bch_memquota *mq,
+				  struct quota_msgs *msgs,
+				  enum quota_counters counter,
+				  s64 v,
+				  enum quota_acct_mode mode)
+{
+	struct bch_memquota_type *q = &c->quotas[qtype];
+	struct memquota_counter *qc = &mq->c[counter];
+	u64 n = qc->v + v;
+
+	BUG_ON((s64) n < 0);
+
+	if (mode == KEY_TYPE_QUOTA_NOCHECK)
+		return 0;
+
+	if (v <= 0) {
+		if (n < qc->hardlimit &&
+		    (qc->warning_issued & (1 << HARDWARN))) {
+			qc->warning_issued &= ~(1 << HARDWARN);
+			prepare_msg(qtype, counter, msgs, HARDBELOW);
+		}
+
+		if (n < qc->softlimit &&
+		    (qc->warning_issued & (1 << SOFTWARN))) {
+			qc->warning_issued &= ~(1 << SOFTWARN);
+			prepare_msg(qtype, counter, msgs, SOFTBELOW);
+		}
+
+		qc->warning_issued = 0;
+		return 0;
+	}
+
+	if (qc->hardlimit &&
+	    qc->hardlimit < n &&
+	    !ignore_hardlimit(q)) {
+		prepare_warning(qc, qtype, counter, msgs, HARDWARN);
+		return -EDQUOT;
+	}
+
+	if (qc->softlimit &&
+	    qc->softlimit < n) {
+		if (qc->timer == 0) {
+			qc->timer = ktime_get_real_seconds() + q->limits[counter].timelimit;
+			prepare_warning(qc, qtype, counter, msgs, SOFTWARN);
+		} else if (ktime_get_real_seconds() >= qc->timer &&
+			   !ignore_hardlimit(q)) {
+			prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN);
+			return -EDQUOT;
+		}
+	}
+
+	return 0;
+}
+
+int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
+		    enum quota_counters counter, s64 v,
+		    enum quota_acct_mode mode)
+{
+	unsigned qtypes = enabled_qtypes(c);
+	struct bch_memquota_type *q;
+	struct bch_memquota *mq[QTYP_NR];
+	struct quota_msgs msgs;
+	unsigned i;
+	int ret = 0;
+
+	memset(&msgs, 0, sizeof(msgs));
+
+	for_each_set_qtype(c, i, q, qtypes) {
+		mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_KERNEL);
+		if (!mq[i])
+			return -ENOMEM;
+	}
+
+	for_each_set_qtype(c, i, q, qtypes)
+		mutex_lock_nested(&q->lock, i);
+
+	for_each_set_qtype(c, i, q, qtypes) {
+		ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode);
+		if (ret)
+			goto err;
+	}
+
+	for_each_set_qtype(c, i, q, qtypes)
+		mq[i]->c[counter].v += v;
+err:
+	for_each_set_qtype(c, i, q, qtypes)
+		mutex_unlock(&q->lock);
+
+	flush_warnings(qid, c->vfs_sb, &msgs);
+
+	return ret;
+}
+
+static void __bch2_quota_transfer(struct bch_memquota *src_q,
+				  struct bch_memquota *dst_q,
+				  enum quota_counters counter, s64 v)
+{
+	BUG_ON(v > src_q->c[counter].v);
+	BUG_ON(v + dst_q->c[counter].v < v);
+
+	src_q->c[counter].v -= v;
+	dst_q->c[counter].v += v;
+}
+
+int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
+			struct bch_qid dst,
+			struct bch_qid src, u64 space,
+			enum quota_acct_mode mode)
+{
+	struct bch_memquota_type *q;
+	struct bch_memquota *src_q[QTYP_NR], *dst_q[QTYP_NR];
+	struct quota_msgs msgs;
+	unsigned i;
+	int ret = 0;
+
+	qtypes &= enabled_qtypes(c);	/* only touch quota types that are enabled */
+
+	memset(&msgs, 0, sizeof(msgs));
+
+	for_each_set_qtype(c, i, q, qtypes) {	/* look up/allocate entries before locking */
+		src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_KERNEL);
+		dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_KERNEL);
+		if (!src_q[i] || !dst_q[i])
+			return -ENOMEM;
+	}
+
+	for_each_set_qtype(c, i, q, qtypes)
+		mutex_lock_nested(&q->lock, i);
+
+	for_each_set_qtype(c, i, q, qtypes) {	/* check dst limits before moving anything */
+		ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC,
+					     space, /* delta: callee adds current usage */
+					     mode);
+		if (ret)
+			goto err;
+
+		ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO,
+					     1, /* delta: one inode moves to dst */
+					     mode);
+		if (ret)
+			goto err;
+	}
+
+	for_each_set_qtype(c, i, q, qtypes) {
+		__bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space);
+		__bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1);
+	}
+
+err:
+	for_each_set_qtype(c, i, q, qtypes)
+		mutex_unlock(&q->lock);
+
+	flush_warnings(dst, c->vfs_sb, &msgs);
+
+	return ret;
+}
+
+static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k,
+			    struct qc_dqblk *qdq)
+{
+	struct bkey_s_c_quota dq;
+	struct bch_memquota_type *q;
+	struct bch_memquota *mq;
+	unsigned i;
+
+	BUG_ON(k.k->p.inode >= QTYP_NR);
+
+	if (!((1U << k.k->p.inode) & enabled_qtypes(c)))
+		return 0;
+
+	switch (k.k->type) {
+	case KEY_TYPE_quota:
+		dq = bkey_s_c_to_quota(k);
+		q = &c->quotas[k.k->p.inode];
+
+		mutex_lock(&q->lock);
+		mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL);
+		if (!mq) {
+			mutex_unlock(&q->lock);
+			return -ENOMEM;
+		}
+
+		for (i = 0; i < Q_COUNTERS; i++) {
+			mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit);
+			mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit);
+		}
+
+		if (qdq && qdq->d_fieldmask & QC_SPC_TIMER)
+			mq->c[Q_SPC].timer	= qdq->d_spc_timer;
+		if (qdq && qdq->d_fieldmask & QC_SPC_WARNS)
+			mq->c[Q_SPC].warns	= qdq->d_spc_warns;
+		if (qdq && qdq->d_fieldmask & QC_INO_TIMER)
+			mq->c[Q_INO].timer	= qdq->d_ino_timer;
+		if (qdq && qdq->d_fieldmask & QC_INO_WARNS)
+			mq->c[Q_INO].warns	= qdq->d_ino_warns;
+
+		mutex_unlock(&q->lock);
+	}
+
+	return 0;
+}
+
+void bch2_fs_quota_exit(struct bch_fs *c)
+{
+	unsigned i;
+
+	for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
+		genradix_free(&c->quotas[i].table);
+}
+
+void bch2_fs_quota_init(struct bch_fs *c)
+{
+	unsigned i;
+
+	for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
+		mutex_init(&c->quotas[i].lock);
+}
+
+static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb)
+{
+	struct bch_sb_field_quota *sb_quota = bch2_sb_field_get(sb->sb, quota);
+
+	if (sb_quota)
+		return sb_quota;
+
+	sb_quota = bch2_sb_field_resize(sb, quota, sizeof(*sb_quota) / sizeof(u64));
+	if (sb_quota) {
+		unsigned qtype, qc;
+
+		for (qtype = 0; qtype < QTYP_NR; qtype++)
+			for (qc = 0; qc < Q_COUNTERS; qc++)
+				sb_quota->q[qtype].c[qc].timelimit =
+					cpu_to_le32(7 * 24 * 60 * 60);
+	}
+
+	return sb_quota;
+}
+
+static void bch2_sb_quota_read(struct bch_fs *c)
+{
+	struct bch_sb_field_quota *sb_quota;
+	unsigned i, j;
+
+	sb_quota = bch2_sb_field_get(c->disk_sb.sb, quota);
+	if (!sb_quota)
+		return;
+
+	for (i = 0; i < QTYP_NR; i++) {
+		struct bch_memquota_type *q = &c->quotas[i];
+
+		for (j = 0; j < Q_COUNTERS; j++) {
+			q->limits[j].timelimit =
+				le32_to_cpu(sb_quota->q[i].c[j].timelimit);
+			q->limits[j].warnlimit =
+				le32_to_cpu(sb_quota->q[i].c[j].warnlimit);
+		}
+	}
+}
+
+static int bch2_fs_quota_read_inode(struct btree_trans *trans,
+				    struct btree_iter *iter,
+				    struct bkey_s_c k)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_inode_unpacked u;
+	struct bch_snapshot_tree s_t;
+	int ret;
+
+	ret = bch2_snapshot_tree_lookup(trans,
+			bch2_snapshot_tree(c, k.k->p.snapshot), &s_t);
+	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+			"%s: snapshot tree %u not found", __func__,
+			snapshot_t(c, k.k->p.snapshot)->tree);
+	if (ret)
+		return ret;
+
+	if (!s_t.master_subvol)
+		goto advance;
+
+	ret = bch2_inode_find_by_inum_nowarn_trans(trans,
+				(subvol_inum) {
+					le32_to_cpu(s_t.master_subvol),
+					k.k->p.offset,
+				}, &u);
+	/*
+	 * Inode might be deleted in this snapshot - the easiest way to handle
+	 * that is to just skip it here:
+	 */
+	if (bch2_err_matches(ret, ENOENT))
+		goto advance;
+
+	if (ret)
+		return ret;
+
+	bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
+			KEY_TYPE_QUOTA_NOCHECK);
+	bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
+			KEY_TYPE_QUOTA_NOCHECK);
+advance:
+	bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
+	return 0;
+}
+
+int bch2_fs_quota_read(struct bch_fs *c)
+{
+	struct bch_sb_field_quota *sb_quota;
+	struct btree_trans *trans;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	mutex_lock(&c->sb_lock);
+	sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
+	if (!sb_quota) {
+		mutex_unlock(&c->sb_lock);
+		return -BCH_ERR_ENOSPC_sb_quota;
+	}
+
+	bch2_sb_quota_read(c);
+	mutex_unlock(&c->sb_lock);
+
+	trans = bch2_trans_get(c);
+
+	ret = for_each_btree_key2(trans, iter, BTREE_ID_quotas,
+			POS_MIN, BTREE_ITER_PREFETCH, k,
+		__bch2_quota_set(c, k, NULL)) ?:
+	      for_each_btree_key2(trans, iter, BTREE_ID_inodes,
+			POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+		bch2_fs_quota_read_inode(trans, &iter, k));
+
+	bch2_trans_put(trans);
+
+	if (ret)
+		bch_err_fn(c, ret);
+	return ret;
+}
+
+/* Enable/disable/delete quotas for an entire filesystem: */
+
+static int bch2_quota_enable(struct super_block	*sb, unsigned uflags)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	struct bch_sb_field_quota *sb_quota;
+	int ret = 0;
+
+	if (sb->s_flags & SB_RDONLY)
+		return -EROFS;
+
+	/* Accounting must be enabled at mount time: */
+	if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT))
+		return -EINVAL;
+
+	/* Can't enable enforcement without accounting: */
+	if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota)
+		return -EINVAL;
+
+	if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota)
+		return -EINVAL;
+
+	if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota)
+		return -EINVAL;
+
+	mutex_lock(&c->sb_lock);
+	sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
+	if (!sb_quota) {
+		ret = -BCH_ERR_ENOSPC_sb_quota;
+		goto unlock;
+	}
+
+	if (uflags & FS_QUOTA_UDQ_ENFD)
+		SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true);
+
+	if (uflags & FS_QUOTA_GDQ_ENFD)
+		SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true);
+
+	if (uflags & FS_QUOTA_PDQ_ENFD)
+		SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true);
+
+	bch2_write_super(c);
+unlock:
+	mutex_unlock(&c->sb_lock);
+
+	return bch2_err_class(ret);
+}
+
+static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
+{
+	struct bch_fs *c = sb->s_fs_info;
+
+	if (sb->s_flags & SB_RDONLY)
+		return -EROFS;
+
+	mutex_lock(&c->sb_lock);
+	if (uflags & FS_QUOTA_UDQ_ENFD)
+		SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false);
+
+	if (uflags & FS_QUOTA_GDQ_ENFD)
+		SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false);
+
+	if (uflags & FS_QUOTA_PDQ_ENFD)
+		SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false);
+
+	bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	return 0;
+}
+
+static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	int ret;
+
+	if (sb->s_flags & SB_RDONLY)
+		return -EROFS;
+
+	if (uflags & FS_USER_QUOTA) {
+		if (c->opts.usrquota)	/* refuse while this quota type is enabled */
+			return -EINVAL;
+
+		ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
+					      POS(QTYP_USR, 0),
+					      POS(QTYP_USR, U64_MAX),
+					      0, NULL);
+		if (ret)	/* map to an error class, like the other quotactl ops here */
+			return bch2_err_class(ret);
+	}
+
+	if (uflags & FS_GROUP_QUOTA) {
+		if (c->opts.grpquota)
+			return -EINVAL;
+
+		ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
+					      POS(QTYP_GRP, 0),
+					      POS(QTYP_GRP, U64_MAX),
+					      0, NULL);
+		if (ret)
+			return bch2_err_class(ret);
+	}
+
+	if (uflags & FS_PROJ_QUOTA) {
+		if (c->opts.prjquota)
+			return -EINVAL;
+
+		ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
+					      POS(QTYP_PRJ, 0),
+					      POS(QTYP_PRJ, U64_MAX),
+					      0, NULL);
+		if (ret)
+			return bch2_err_class(ret);
+	}
+
+	return 0;
+}
+
+/*
+ * Return quota status information, such as enforcements, quota file inode
+ * numbers etc.
+ */
+static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	unsigned qtypes = enabled_qtypes(c);
+	unsigned i;
+
+	memset(state, 0, sizeof(*state));
+
+	for (i = 0; i < QTYP_NR; i++) {
+		state->s_state[i].flags |= QCI_SYSFILE;
+
+		if (!(qtypes & (1 << i)))
+			continue;
+
+		state->s_state[i].flags |= QCI_ACCT_ENABLED;
+
+		state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit;
+		state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit;
+
+		state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit;
+		state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit;
+	}
+
+	return 0;
+}
+
+/*
+ * Adjust quota timers & warnings
+ */
+static int bch2_quota_set_info(struct super_block *sb, int type,
+			       struct qc_info *info)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	struct bch_sb_field_quota *sb_quota;
+	int ret = 0;
+
+	if (0) {
+		struct printbuf buf = PRINTBUF;
+
+		qc_info_to_text(&buf, info);
+		pr_info("setting:\n%s", buf.buf);
+		printbuf_exit(&buf);
+	}
+
+	if (sb->s_flags & SB_RDONLY)
+		return -EROFS;
+
+	if (type >= QTYP_NR)
+		return -EINVAL;
+
+	if (!((1 << type) & enabled_qtypes(c)))
+		return -ESRCH;
+
+	if (info->i_fieldmask &
+	    ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS))
+		return -EINVAL;
+
+	mutex_lock(&c->sb_lock);
+	sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
+	if (!sb_quota) {
+		ret = -BCH_ERR_ENOSPC_sb_quota;
+		goto unlock;
+	}
+
+	if (info->i_fieldmask & QC_SPC_TIMER)
+		sb_quota->q[type].c[Q_SPC].timelimit =
+			cpu_to_le32(info->i_spc_timelimit);
+
+	if (info->i_fieldmask & QC_SPC_WARNS)
+		sb_quota->q[type].c[Q_SPC].warnlimit =
+			cpu_to_le32(info->i_spc_warnlimit);
+
+	if (info->i_fieldmask & QC_INO_TIMER)
+		sb_quota->q[type].c[Q_INO].timelimit =
+			cpu_to_le32(info->i_ino_timelimit);
+
+	if (info->i_fieldmask & QC_INO_WARNS)
+		sb_quota->q[type].c[Q_INO].warnlimit =
+			cpu_to_le32(info->i_ino_warnlimit);
+
+	bch2_sb_quota_read(c);
+
+	bch2_write_super(c);
+unlock:
+	mutex_unlock(&c->sb_lock);
+
+	return bch2_err_class(ret);
+}
+
+/* Get/set individual quotas: */
+
+static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src)
+{
+	dst->d_space		= src->c[Q_SPC].v << 9;
+	dst->d_spc_hardlimit	= src->c[Q_SPC].hardlimit << 9;
+	dst->d_spc_softlimit	= src->c[Q_SPC].softlimit << 9;
+	dst->d_spc_timer	= src->c[Q_SPC].timer;
+	dst->d_spc_warns	= src->c[Q_SPC].warns;
+
+	dst->d_ino_count	= src->c[Q_INO].v;
+	dst->d_ino_hardlimit	= src->c[Q_INO].hardlimit;
+	dst->d_ino_softlimit	= src->c[Q_INO].softlimit;
+	dst->d_ino_timer	= src->c[Q_INO].timer;
+	dst->d_ino_warns	= src->c[Q_INO].warns;
+}
+
+static int bch2_get_quota(struct super_block *sb, struct kqid kqid,
+			  struct qc_dqblk *qdq)
+{
+	struct bch_fs *c		= sb->s_fs_info;
+	struct bch_memquota_type *q	= &c->quotas[kqid.type];
+	qid_t qid			= from_kqid(&init_user_ns, kqid);
+	struct bch_memquota *mq;
+
+	memset(qdq, 0, sizeof(*qdq));
+
+	mutex_lock(&q->lock);
+	mq = genradix_ptr(&q->table, qid);
+	if (mq)
+		__bch2_quota_get(qdq, mq);
+	mutex_unlock(&q->lock);
+
+	return 0;
+}
+
+static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid,
+			       struct qc_dqblk *qdq)
+{
+	struct bch_fs *c		= sb->s_fs_info;
+	struct bch_memquota_type *q	= &c->quotas[kqid->type];
+	qid_t qid			= from_kqid(&init_user_ns, *kqid);
+	struct genradix_iter iter;
+	struct bch_memquota *mq;
+	int ret = 0;
+
+	memset(qdq, 0, sizeof(*qdq));	/* as bch2_get_quota(): unset fields stay zero */
+	mutex_lock(&q->lock);
+	/* First entry at or after qid that is not all-zero, i.e. has been used: */
+	genradix_for_each_from(&q->table, iter, mq, qid)
+		if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) {
+			__bch2_quota_get(qdq, mq);
+			*kqid = make_kqid(current_user_ns(), kqid->type, iter.pos);
+			goto found;
+		}
+
+	ret = -ENOENT;
+found:
+	mutex_unlock(&q->lock);
+	return bch2_err_class(ret);
+}
+
+static int bch2_set_quota_trans(struct btree_trans *trans,
+				struct bkey_i_quota *new_quota,
+				struct qc_dqblk *qdq)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p,
+			       BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+	ret = bkey_err(k);
+	if (unlikely(ret))
+		return ret;
+
+	if (k.k->type == KEY_TYPE_quota)
+		new_quota->v = *bkey_s_c_to_quota(k).v;
+
+	if (qdq->d_fieldmask & QC_SPC_SOFT)
+		new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9);
+	if (qdq->d_fieldmask & QC_SPC_HARD)
+		new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9);
+
+	if (qdq->d_fieldmask & QC_INO_SOFT)
+		new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit);
+	if (qdq->d_fieldmask & QC_INO_HARD)
+		new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
+
+	ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0);
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static int bch2_set_quota(struct super_block *sb, struct kqid qid,
+			  struct qc_dqblk *qdq)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	struct bkey_i_quota new_quota;
+	int ret;
+
+	if (0) {
+		struct printbuf buf = PRINTBUF;
+
+		qc_dqblk_to_text(&buf, qdq);
+		pr_info("setting:\n%s", buf.buf);
+		printbuf_exit(&buf);
+	}
+
+	if (sb->s_flags & SB_RDONLY)
+		return -EROFS;
+
+	bkey_quota_init(&new_quota.k_i);
+	new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
+
+	ret = bch2_trans_do(c, NULL, NULL, 0,
+			    bch2_set_quota_trans(trans, &new_quota, qdq)) ?:
+		__bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq);
+
+	return bch2_err_class(ret);
+}
+
+const struct quotactl_ops bch2_quotactl_operations = {
+	.quota_enable		= bch2_quota_enable,
+	.quota_disable		= bch2_quota_disable,
+	.rm_xquota		= bch2_quota_remove,
+
+	.get_state		= bch2_quota_get_state,
+	.set_info		= bch2_quota_set_info,
+
+	.get_dqblk		= bch2_get_quota,
+	.get_nextdqblk		= bch2_get_next_quota,
+	.set_dqblk		= bch2_set_quota,
+};
+
+#endif /* CONFIG_BCACHEFS_QUOTA */
diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h
new file mode 100644
index 000000000000..2f463874a362
--- /dev/null
+++ b/fs/bcachefs/quota.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_QUOTA_H
+#define _BCACHEFS_QUOTA_H
+
+#include "inode.h"
+#include "quota_types.h"
+
+enum bkey_invalid_flags;
+extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
+
+int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c,
+		       enum bkey_invalid_flags, struct printbuf *);
+void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_quota ((struct bkey_ops) {	\
+	.key_invalid	= bch2_quota_invalid,		\
+	.val_to_text	= bch2_quota_to_text,		\
+	.min_val_size	= 32,				\
+})
+
+static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u)
+{
+	return (struct bch_qid) {
+		.q[QTYP_USR] = u->bi_uid,
+		.q[QTYP_GRP] = u->bi_gid,
+		.q[QTYP_PRJ] = u->bi_project ? u->bi_project - 1 : 0,
+	};
+}
+
+/* Bitmask of the quota types switched on in c->opts: bit QTYP_USR, QTYP_GRP, QTYP_PRJ. */
+static inline unsigned enabled_qtypes(struct bch_fs *c)
+{
+	return ((c->opts.usrquota << QTYP_USR) | (c->opts.grpquota << QTYP_GRP) |
+		(c->opts.prjquota << QTYP_PRJ));
+}
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+
+int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters,
+		    s64, enum quota_acct_mode);
+
+int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid,
+			struct bch_qid, u64, enum quota_acct_mode);
+
+void bch2_fs_quota_exit(struct bch_fs *);
+void bch2_fs_quota_init(struct bch_fs *);
+int bch2_fs_quota_read(struct bch_fs *);
+
+extern const struct quotactl_ops bch2_quotactl_operations;
+
+#else
+
+static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
+				  enum quota_counters counter, s64 v,
+				  enum quota_acct_mode mode)
+{
+	return 0;
+}
+
+static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
+				      struct bch_qid dst,
+				      struct bch_qid src, u64 space,
+				      enum quota_acct_mode mode)
+{
+	return 0;
+}
+
+static inline void bch2_fs_quota_exit(struct bch_fs *c) {}
+static inline void bch2_fs_quota_init(struct bch_fs *c) {}
+static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; }
+
+#endif
+
+#endif /* _BCACHEFS_QUOTA_H */
diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h
new file mode 100644
index 000000000000..6a136083d389
--- /dev/null
+++ b/fs/bcachefs/quota_types.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_QUOTA_TYPES_H
+#define _BCACHEFS_QUOTA_TYPES_H
+
+#include <linux/generic-radix-tree.h>
+
+struct bch_qid {
+	u32		q[QTYP_NR];
+};
+
+enum quota_acct_mode {
+	KEY_TYPE_QUOTA_PREALLOC,
+	KEY_TYPE_QUOTA_WARN,
+	KEY_TYPE_QUOTA_NOCHECK,
+};
+
+struct memquota_counter {
+	u64				v;
+	u64				hardlimit;
+	u64				softlimit;
+	s64				timer;
+	int				warns;
+	int				warning_issued;
+};
+
+struct bch_memquota {
+	struct memquota_counter		c[Q_COUNTERS];
+};
+
+typedef GENRADIX(struct bch_memquota)	bch_memquota_table;
+
+struct quota_limit {
+	u32				timelimit;
+	u32				warnlimit;
+};
+
+struct bch_memquota_type {
+	struct quota_limit		limits[Q_COUNTERS];
+	bch_memquota_table		table;
+	struct mutex			lock;
+};
+
+#endif /* _BCACHEFS_QUOTA_TYPES_H */
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
new file mode 100644
index 000000000000..568f1e8e7507
--- /dev/null
+++ b/fs/bcachefs/rebalance.c
@@ -0,0 +1,366 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "clock.h"
+#include "compress.h"
+#include "disk_groups.h"
+#include "errcode.h"
+#include "move.h"
+#include "rebalance.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/sched/cputime.h>
+
+/*
+ * Move predicate: decide whether extent @k needs rebalancing.
+ * Fills in @data_opts (bit i of rewrite_ptrs <=> i'th pointer must be rewritten)
+ * and returns true iff at least one pointer was flagged.
+ */
+static bool rebalance_pred(struct bch_fs *c, void *arg,
+			   struct bkey_s_c k,
+			   struct bch_io_opts *io_opts,
+			   struct data_update_opts *data_opts)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	unsigned i;
+
+	data_opts->rewrite_ptrs		= 0;
+	data_opts->target		= io_opts->background_target;
+	data_opts->extra_replicas	= 0;
+	data_opts->btree_insert_flags	= 0;
+
+	if (io_opts->background_compression &&
+	    !bch2_bkey_is_incompressible(k)) {
+		const union bch_extent_entry *entry;
+		struct extent_ptr_decoded p;
+
+		i = 0;
+		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+			if (!p.ptr.cached &&
+			    p.crc.compression_type !=
+			    bch2_compression_opt_to_type(io_opts->background_compression))
+				data_opts->rewrite_ptrs |= 1U << i;	/* wrong compression type */
+			i++;
+		}
+	}
+
+	if (io_opts->background_target) {
+		const struct bch_extent_ptr *ptr;
+
+		i = 0;
+		bkey_for_each_ptr(ptrs, ptr) {
+			if (!ptr->cached &&
+			    !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) &&
+			    bch2_target_accepts_data(c, BCH_DATA_user, io_opts->background_target))
+				data_opts->rewrite_ptrs |= 1U << i;	/* not on the background target */
+			i++;
+		}
+	}
+
+	return data_opts->rewrite_ptrs != 0;
+}
+
+/* Add @k's size to rebalance_work on each device whose pointer must be rewritten. */
+void bch2_rebalance_add_key(struct bch_fs *c, struct bkey_s_c k,
+			    struct bch_io_opts *io_opts)
+{
+	struct data_update_opts update_opts = { 0 };
+	struct bkey_ptrs_c ptrs;
+	const struct bch_extent_ptr *ptr;
+	unsigned i = 0;
+
+	if (!rebalance_pred(c, NULL, k, io_opts, &update_opts))
+		return;
+
+	/* bit i of rewrite_ptrs is set iff rebalance_pred() flagged the i'th pointer: */
+	ptrs = bch2_bkey_ptrs_c(k);
+	bkey_for_each_ptr(ptrs, ptr) {
+		if ((1U << i) & update_opts.rewrite_ptrs)
+			if (atomic64_add_return(k.k->size,
+					&bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) ==
+			    k.k->size)
+				rebalance_wakeup(c);
+		i++;
+	}
+}
+
+void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
+{
+	if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
+	    sectors)
+		rebalance_wakeup(c);
+}
+
+struct rebalance_work {
+	int		dev_most_full_idx;
+	unsigned	dev_most_full_percent;
+	u64		dev_most_full_work;
+	u64		dev_most_full_capacity;
+	u64		total_work;
+};
+
+/* Fold one work sample into @w, remembering the sample with the highest work/capacity ratio. */
+static void rebalance_work_accumulate(struct rebalance_work *w,
+		u64 dev_work, u64 unknown_dev, u64 capacity, int idx)
+{
+	u64 pending = dev_work + unknown_dev;
+	unsigned pct;
+
+	/* capacity is the divisor below, so a zero capacity cannot be accounted */
+	if (!capacity)
+		return;
+
+	if (pending < dev_work || pending < unknown_dev)
+		pending = U64_MAX;	/* the u64 sum wrapped: saturate */
+	pending = min(pending, capacity);
+	pct = div64_u64(pending * 100, capacity);
+
+	if (pct >= w->dev_most_full_percent) {
+		w->dev_most_full_idx		= idx;
+		w->dev_most_full_percent	= pct;
+		w->dev_most_full_work		= pending;
+		w->dev_most_full_capacity	= capacity;
+	}
+
+	/* leave total_work untouched rather than let the addition wrap */
+	if (w->total_work + dev_work >= max(w->total_work, dev_work))
+		w->total_work += dev_work;
+}
+
+static struct rebalance_work rebalance_work(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	struct rebalance_work ret = { .dev_most_full_idx = -1 };
+	u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev);
+	unsigned i;
+
+	for_each_online_member(ca, c, i)
+		rebalance_work_accumulate(&ret,
+			atomic64_read(&ca->rebalance_work),
+			unknown_dev,
+			bucket_to_sector(ca, ca->mi.nbuckets -
+					 ca->mi.first_bucket),
+			i);
+
+	rebalance_work_accumulate(&ret,
+		unknown_dev, 0, c->capacity, -1);
+
+	return ret;
+}
+
+static void rebalance_work_reset(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned i;
+
+	for_each_online_member(ca, c, i)
+		atomic64_set(&ca->rebalance_work, 0);
+
+	atomic64_set(&c->rebalance.work_unknown_dev, 0);
+}
+
+static unsigned long curr_cputime(void)
+{
+	u64 utime, stime;
+
+	task_cputime_adjusted(current, &utime, &stime);
+	return nsecs_to_jiffies(utime + stime);
+}
+
+static int bch2_rebalance_thread(void *arg)
+{
+	struct bch_fs *c = arg;
+	struct bch_fs_rebalance *r = &c->rebalance;
+	struct io_clock *clock = &c->io_clock[WRITE];
+	struct rebalance_work w, p;
+	struct bch_move_stats move_stats;
+	unsigned long start, prev_start;
+	unsigned long prev_run_time, prev_run_cputime;
+	unsigned long cputime, prev_cputime;
+	u64 io_start;
+	long throttle;
+
+	set_freezable();
+
+	io_start	= atomic64_read(&clock->now);
+	p		= rebalance_work(c);
+	prev_start	= jiffies;
+	prev_cputime	= curr_cputime();
+
+	bch2_move_stats_init(&move_stats, "rebalance");
+	while (!kthread_wait_freezable(r->enabled)) {
+		cond_resched();
+
+		start			= jiffies;
+		cputime			= curr_cputime();
+
+		prev_run_time		= start - prev_start;
+		prev_run_cputime	= cputime - prev_cputime;
+
+		w			= rebalance_work(c);
+		BUG_ON(!w.dev_most_full_capacity);
+
+		if (!w.total_work) {
+			r->state = REBALANCE_WAITING;
+			kthread_wait_freezable(rebalance_work(c).total_work);
+			continue;
+		}
+
+		/*
+		 * If there isn't much work to do, throttle cpu usage:
+		 */
+		throttle = prev_run_cputime * 100 /
+			max(1U, w.dev_most_full_percent) -
+			prev_run_time;
+
+		if (w.dev_most_full_percent < 20 && throttle > 0) {
+			r->throttled_until_iotime = io_start +
+				div_u64(w.dev_most_full_capacity *
+					(20 - w.dev_most_full_percent),
+					50);
+
+			if (atomic64_read(&clock->now) + clock->max_slop <
+			    r->throttled_until_iotime) {
+				r->throttled_until_cputime = start + throttle;
+				r->state = REBALANCE_THROTTLED;
+
+				bch2_kthread_io_clock_wait(clock,
+					r->throttled_until_iotime,
+					throttle);
+				continue;
+			}
+		}
+
+		/* minimum 1 mb/sec: */
+		r->pd.rate.rate =
+			max_t(u64, 1 << 11,
+			      r->pd.rate.rate *
+			      max(p.dev_most_full_percent, 1U) /
+			      max(w.dev_most_full_percent, 1U));
+
+		io_start	= atomic64_read(&clock->now);
+		p		= w;
+		prev_start	= start;
+		prev_cputime	= cputime;
+
+		r->state = REBALANCE_RUNNING;
+		memset(&move_stats, 0, sizeof(move_stats));
+		rebalance_work_reset(c);
+
+		bch2_move_data(c,
+			       0,		POS_MIN,
+			       BTREE_ID_NR,	POS_MAX,
+			       /* ratelimiting disabled for now */
+			       NULL, /*  &r->pd.rate, */
+			       &move_stats,
+			       writepoint_ptr(&c->rebalance_write_point),
+			       true,
+			       rebalance_pred, NULL);
+	}
+
+	return 0;
+}
+
+void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
+{
+	struct bch_fs_rebalance *r = &c->rebalance;
+	struct rebalance_work w = rebalance_work(c);
+
+	if (!out->nr_tabstops)
+		printbuf_tabstop_push(out, 20);
+
+	prt_printf(out, "fullest_dev (%i):", w.dev_most_full_idx);
+	prt_tab(out);
+
+	prt_human_readable_u64(out, w.dev_most_full_work << 9);
+	prt_printf(out, "/");
+	prt_human_readable_u64(out, w.dev_most_full_capacity << 9);
+	prt_newline(out);
+
+	prt_printf(out, "total work:");
+	prt_tab(out);
+
+	prt_human_readable_u64(out, w.total_work << 9);
+	prt_printf(out, "/");
+	prt_human_readable_u64(out, c->capacity << 9);
+	prt_newline(out);
+
+	prt_printf(out, "rate:");
+	prt_tab(out);
+	prt_printf(out, "%u", r->pd.rate.rate);
+	prt_newline(out);
+
+	switch (r->state) {
+	case REBALANCE_WAITING:
+		prt_printf(out, "waiting");
+		break;
+	case REBALANCE_THROTTLED:
+		prt_printf(out, "throttled for %lu sec or ",
+		       (r->throttled_until_cputime - jiffies) / HZ);
+		prt_human_readable_u64(out,
+			    (r->throttled_until_iotime -
+			     atomic64_read(&c->io_clock[WRITE].now)) << 9);
+		prt_printf(out, " io");
+		break;
+	case REBALANCE_RUNNING:
+		prt_printf(out, "running");
+		break;
+	}
+	prt_newline(out);
+}
+
+void bch2_rebalance_stop(struct bch_fs *c)
+{
+	struct task_struct *p;
+
+	c->rebalance.pd.rate.rate = UINT_MAX;
+	bch2_ratelimit_reset(&c->rebalance.pd.rate);
+
+	p = rcu_dereference_protected(c->rebalance.thread, 1);
+	c->rebalance.thread = NULL;
+
+	if (p) {
+		/* for synchronizing with rebalance_wakeup(), which reads ->thread under RCU */
+		synchronize_rcu();
+
+		kthread_stop(p);
+		put_task_struct(p);	/* pairs with get_task_struct() in bch2_rebalance_start() */
+	}
+}
+
+int bch2_rebalance_start(struct bch_fs *c)
+{
+	struct task_struct *p;
+	int ret;
+
+	if (c->rebalance.thread)
+		return 0;
+
+	if (c->opts.nochanges)
+		return 0;
+
+	p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
+	ret = PTR_ERR_OR_ZERO(p);
+	if (ret) {
+		bch_err_msg(c, ret, "creating rebalance thread");
+		return ret;
+	}
+
+	get_task_struct(p);
+	rcu_assign_pointer(c->rebalance.thread, p);
+	wake_up_process(p);
+	return 0;
+}
+
+void bch2_fs_rebalance_init(struct bch_fs *c)
+{
+	bch2_pd_controller_init(&c->rebalance.pd);
+
+	atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX);
+}
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
new file mode 100644
index 000000000000..7ade0bb81cce
--- /dev/null
+++ b/fs/bcachefs/rebalance.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REBALANCE_H
+#define _BCACHEFS_REBALANCE_H
+
+#include "rebalance_types.h"
+
+static inline void rebalance_wakeup(struct bch_fs *c)
+{
+	struct task_struct *p;
+
+	rcu_read_lock();
+	p = rcu_dereference(c->rebalance.thread);
+	if (p)
+		wake_up_process(p);
+	rcu_read_unlock();
+}
+
+void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
+			    struct bch_io_opts *);
+void bch2_rebalance_add_work(struct bch_fs *, u64);
+
+void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *);
+
+void bch2_rebalance_stop(struct bch_fs *);
+int bch2_rebalance_start(struct bch_fs *);
+void bch2_fs_rebalance_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_REBALANCE_H */
diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h
new file mode 100644
index 000000000000..7462a92e9598
--- /dev/null
+++ b/fs/bcachefs/rebalance_types.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REBALANCE_TYPES_H
+#define _BCACHEFS_REBALANCE_TYPES_H
+
+#include "move_types.h"
+
+enum rebalance_state {
+	REBALANCE_WAITING,
+	REBALANCE_THROTTLED,
+	REBALANCE_RUNNING,
+};
+
+struct bch_fs_rebalance {
+	struct task_struct __rcu *thread;
+	struct bch_pd_controller pd;
+
+	atomic64_t		work_unknown_dev;
+
+	enum rebalance_state	state;
+	u64			throttled_until_iotime;
+	unsigned long		throttled_until_cputime;
+
+	unsigned		enabled:1;
+};
+
+#endif /* _BCACHEFS_REBALANCE_TYPES_H */
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
new file mode 100644
index 000000000000..4cd660650e5b
--- /dev/null
+++ b/fs/bcachefs/recovery.c
@@ -0,0 +1,1049 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "backpointers.h"
+#include "bkey_buf.h"
+#include "alloc_background.h"
+#include "btree_gc.h"
+#include "btree_journal_iter.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "buckets.h"
+#include "dirent.h"
+#include "ec.h"
+#include "errcode.h"
+#include "error.h"
+#include "fs-common.h"
+#include "fsck.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+#include "lru.h"
+#include "logged_ops.h"
+#include "move.h"
+#include "quota.h"
+#include "recovery.h"
+#include "replicas.h"
+#include "sb-clean.h"
+#include "snapshot.h"
+#include "subvolume.h"
+#include "super-io.h"
+
+#include <linux/sort.h>
+#include <linux/stat.h>
+
+#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+
+static bool btree_id_is_alloc(enum btree_id id)
+{
+	switch (id) {
+	case BTREE_ID_alloc:
+	case BTREE_ID_backpointers:
+	case BTREE_ID_need_discard:
+	case BTREE_ID_freespace:
+	case BTREE_ID_bucket_gens:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/* for -o reconstruct_alloc: compact keys->d in place, dropping every alloc-btree key */
+static void drop_alloc_keys(struct journal_keys *keys)
+{
+	size_t kept = 0;
+
+	for (size_t idx = 0; idx < keys->nr; idx++)
+		if (!btree_id_is_alloc(keys->d[idx].btree_id))
+			keys->d[kept++] = keys->d[idx];
+
+	keys->nr = kept;
+}
+
+/*
+ * Btree node pointers have a field to stash a pointer to the in memory btree
+ * node; we need to zero out this field when reading in btree nodes, or when
+ * reading in keys from the journal:
+ */
+static void zero_out_btree_mem_ptr(struct journal_keys *keys)
+{
+	struct journal_key *i;
+
+	for (i = keys->d; i < keys->d + keys->nr; i++)
+		if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
+			bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
+}
+
+/* journal replay: */
+
+static void replay_now_at(struct journal *j, u64 seq)
+{
+	BUG_ON(seq < j->replay_journal_seq);
+
+	seq = min(seq, j->replay_journal_seq_end);
+
+	while (j->replay_journal_seq < seq)
+		bch2_journal_pin_put(j, j->replay_journal_seq++);
+}
+
+static int bch2_journal_replay_key(struct btree_trans *trans,
+				   struct journal_key *k)
+{
+	struct btree_iter iter;
+	unsigned iter_flags =
+		BTREE_ITER_INTENT|
+		BTREE_ITER_NOT_EXTENTS;
+	unsigned update_flags = BTREE_TRIGGER_NORUN;
+	int ret;
+
+	/*
+	 * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to
+	 * keep the key cache coherent with the underlying btree. Nothing
+	 * besides the allocator is doing updates yet so we don't need key cache
+	 * coherency for non-alloc btrees, and key cache fills for snapshots
+	 * btrees use BTREE_ITER_FILTER_SNAPSHOTS, which isn't available until
+	 * the snapshots recovery pass runs.
+	 */
+	if (!k->level && k->btree_id == BTREE_ID_alloc)
+		iter_flags |= BTREE_ITER_CACHED;
+	else
+		update_flags |= BTREE_UPDATE_KEY_CACHE_RECLAIM;
+
+	bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+				  BTREE_MAX_DEPTH, k->level,
+				  iter_flags);
+	ret = bch2_btree_iter_traverse(&iter);
+	if (ret)
+		goto out;
+
+	/* Must be checked with btree locked: */
+	if (k->overwritten)
+		goto out;
+
+	ret = bch2_trans_update(trans, &iter, k->k, update_flags);
+out:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static int journal_sort_seq_cmp(const void *_l, const void *_r)
+{
+	const struct journal_key *l = *((const struct journal_key **)_l);
+	const struct journal_key *r = *((const struct journal_key **)_r);
+
+	return cmp_int(l->journal_seq, r->journal_seq);
+}
+
+/* Replay every key read from the journal into the btrees, in journal sequence order. */
+static int bch2_journal_replay(struct bch_fs *c)
+{
+	struct journal_keys *keys = &c->journal_keys;
+	struct journal_key **keys_sorted, *k;
+	struct journal *j = &c->journal;
+	u64 start_seq	= c->journal_replay_seq_start;
+	u64 end_seq	= c->journal_replay_seq_end;
+	size_t i;
+	int ret;
+
+	move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+	keys->gap = keys->nr;
+
+	keys_sorted = kvmalloc_array(keys->nr, sizeof(*keys_sorted), GFP_KERNEL);
+	if (!keys_sorted)
+		return -BCH_ERR_ENOMEM_journal_replay;
+
+	for (i = 0; i < keys->nr; i++)
+		keys_sorted[i] = &keys->d[i];
+
+	/* sort the pointer array, not keys->d itself, by journal sequence number: */
+	sort(keys_sorted, keys->nr, sizeof(keys_sorted[0]), journal_sort_seq_cmp, NULL);
+
+	if (keys->nr) {
+		ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
+					   keys->nr, start_seq, end_seq);
+		if (ret)
+			goto err;
+	}
+
+	for (i = 0; i < keys->nr; i++) {
+		k = keys_sorted[i];
+
+		cond_resched();
+
+		replay_now_at(j, k->journal_seq);
+
+		ret = bch2_trans_do(c, NULL, NULL,
+				    BTREE_INSERT_LAZY_RW|
+				    BTREE_INSERT_NOFAIL|
+				    (!k->allocated
+				     ? BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim
+				     : 0),
+			     bch2_journal_replay_key(trans, k));
+		if (ret) {
+			bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s",
+				bch2_btree_ids[k->btree_id], k->level, bch2_err_str(ret));
+			goto err;
+		}
+	}
+
+	replay_now_at(j, j->replay_journal_seq_end);
+	j->replay_journal_seq = 0;
+
+	bch2_journal_set_replay_done(j);
+	bch2_journal_flush_all_pins(j);
+	ret = bch2_journal_error(j);
+
+	if (keys->nr && !ret)
+		bch2_journal_log_msg(c, "journal replay finished");
+err:
+	kvfree(keys_sorted);
+
+	if (ret)
+		bch_err_fn(c, ret);
+	return ret;
+}
+
+/* journal replay early: */
+
+static int journal_replay_entry_early(struct bch_fs *c,
+				      struct jset_entry *entry)
+{
+	int ret = 0;
+
+	switch (entry->type) {
+	case BCH_JSET_ENTRY_btree_root: {
+		struct btree_root *r;
+
+		while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) {
+			ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL });
+			if (ret)
+				return ret;
+		}
+
+		r = bch2_btree_id_root(c, entry->btree_id);
+
+		if (entry->u64s) {
+			r->level = entry->level;
+			bkey_copy(&r->key, &entry->start[0]);
+			r->error = 0;
+		} else {
+			r->error = -EIO;
+		}
+		r->alive = true;
+		break;
+	}
+	case BCH_JSET_ENTRY_usage: {
+		struct jset_entry_usage *u =
+			container_of(entry, struct jset_entry_usage, entry);
+
+		switch (entry->btree_id) {
+		case BCH_FS_USAGE_reserved:
+			if (entry->level < BCH_REPLICAS_MAX)
+				c->usage_base->persistent_reserved[entry->level] =
+					le64_to_cpu(u->v);
+			break;
+		case BCH_FS_USAGE_inodes:
+			c->usage_base->nr_inodes = le64_to_cpu(u->v);
+			break;
+		case BCH_FS_USAGE_key_version:
+			atomic64_set(&c->key_version,
+				     le64_to_cpu(u->v));
+			break;
+		}
+
+		break;
+	}
+	case BCH_JSET_ENTRY_data_usage: {
+		struct jset_entry_data_usage *u =
+			container_of(entry, struct jset_entry_data_usage, entry);
+
+		ret = bch2_replicas_set_usage(c, &u->r,
+					      le64_to_cpu(u->v));
+		break;
+	}
+	case BCH_JSET_ENTRY_dev_usage: {
+		struct jset_entry_dev_usage *u =
+			container_of(entry, struct jset_entry_dev_usage, entry);
+		struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
+		unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
+
+		ca->usage_base->buckets_ec		= le64_to_cpu(u->buckets_ec);
+
+		for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) {
+			ca->usage_base->d[i].buckets	= le64_to_cpu(u->d[i].buckets);
+			ca->usage_base->d[i].sectors	= le64_to_cpu(u->d[i].sectors);
+			ca->usage_base->d[i].fragmented	= le64_to_cpu(u->d[i].fragmented);
+		}
+
+		break;
+	}
+	case BCH_JSET_ENTRY_blacklist: {
+		struct jset_entry_blacklist *bl_entry =
+			container_of(entry, struct jset_entry_blacklist, entry);
+
+		ret = bch2_journal_seq_blacklist_add(c,
+				le64_to_cpu(bl_entry->seq),
+				le64_to_cpu(bl_entry->seq) + 1);
+		break;
+	}
+	case BCH_JSET_ENTRY_blacklist_v2: {
+		struct jset_entry_blacklist_v2 *bl_entry =
+			container_of(entry, struct jset_entry_blacklist_v2, entry);
+
+		ret = bch2_journal_seq_blacklist_add(c,
+				le64_to_cpu(bl_entry->start),
+				le64_to_cpu(bl_entry->end) + 1);
+		break;
+	}
+	case BCH_JSET_ENTRY_clock: {
+		struct jset_entry_clock *clock =
+			container_of(entry, struct jset_entry_clock, entry);
+
+		atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
+	}
+	}
+
+	return ret;
+}
+
+static int journal_replay_early(struct bch_fs *c,
+				struct bch_sb_field_clean *clean)
+{
+	struct jset_entry *entry;
+	int ret;
+
+	if (clean) {
+		for (entry = clean->start;
+		     entry != vstruct_end(&clean->field);
+		     entry = vstruct_next(entry)) {
+			ret = journal_replay_entry_early(c, entry);
+			if (ret)
+				return ret;
+		}
+	} else {
+		struct genradix_iter iter;
+		struct journal_replay *i, **_i;
+
+		genradix_for_each(&c->journal_entries, iter, _i) {
+			i = *_i;
+
+			if (!i || i->ignore)
+				continue;
+
+			vstruct_for_each(&i->j, entry) {
+				ret = journal_replay_entry_early(c, entry);
+				if (ret)
+					return ret;
+			}
+		}
+	}
+
+	bch2_fs_usage_initialize(c);
+
+	return 0;
+}
+
+/* sb clean section: */
+
+static int read_btree_roots(struct bch_fs *c)
+{
+	unsigned i;
+	int ret = 0;
+
+	for (i = 0; i < btree_id_nr_alive(c); i++) {
+		struct btree_root *r = bch2_btree_id_root(c, i);
+
+		if (!r->alive)
+			continue;
+
+		if (btree_id_is_alloc(i) &&
+		    c->opts.reconstruct_alloc) {
+			c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+			continue;
+		}
+
+		if (r->error) {
+			__fsck_err(c, btree_id_is_alloc(i)
+				   ? FSCK_CAN_IGNORE : 0,
+				   "invalid btree root %s",
+				   bch2_btree_ids[i]);
+			if (i == BTREE_ID_alloc)
+				c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+		}
+
+		ret = bch2_btree_root_read(c, i, &r->key, r->level);
+		if (ret) {
+			fsck_err(c,
+				 "error reading btree root %s",
+				 bch2_btree_ids[i]);
+			if (btree_id_is_alloc(i))
+				c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+			ret = 0;
+		}
+	}
+
+	for (i = 0; i < BTREE_ID_NR; i++) {
+		struct btree_root *r = bch2_btree_id_root(c, i);
+
+		if (!r->b) {
+			r->alive = false;
+			r->level = 0;
+			bch2_btree_root_alloc(c, i);
+		}
+	}
+fsck_err:
+	return ret;
+}
+
+static int bch2_initialize_subvolumes(struct bch_fs *c)
+{
+	struct bkey_i_snapshot_tree	root_tree;
+	struct bkey_i_snapshot		root_snapshot;
+	struct bkey_i_subvolume		root_volume;
+	int ret;
+
+	bkey_snapshot_tree_init(&root_tree.k_i);
+	root_tree.k.p.offset		= 1;
+	root_tree.v.master_subvol	= cpu_to_le32(1);
+	root_tree.v.root_snapshot	= cpu_to_le32(U32_MAX);
+
+	bkey_snapshot_init(&root_snapshot.k_i);
+	root_snapshot.k.p.offset = U32_MAX;
+	root_snapshot.v.flags	= 0;
+	root_snapshot.v.parent	= 0;
+	root_snapshot.v.subvol	= cpu_to_le32(BCACHEFS_ROOT_SUBVOL);
+	root_snapshot.v.tree	= cpu_to_le32(1);
+	SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
+
+	bkey_subvolume_init(&root_volume.k_i);
+	root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
+	root_volume.v.flags	= 0;
+	root_volume.v.snapshot	= cpu_to_le32(U32_MAX);
+	root_volume.v.inode	= cpu_to_le64(BCACHEFS_ROOT_INO);
+
+	ret =   bch2_btree_insert(c, BTREE_ID_snapshot_trees,	&root_tree.k_i, NULL, 0) ?:
+		bch2_btree_insert(c, BTREE_ID_snapshots,	&root_snapshot.k_i, NULL, 0) ?:
+		bch2_btree_insert(c, BTREE_ID_subvolumes,	&root_volume.k_i, NULL, 0);
+	if (ret)
+		bch_err_fn(c, ret);
+	return ret;
+}
+
+static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bch_inode_unpacked inode;
+	int ret;
+
+	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+			       SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
+	ret = bkey_err(k);
+	if (ret)
+		return ret;
+
+	if (!bkey_is_inode(k.k)) {
+		bch_err(trans->c, "root inode not found");
+		ret = -BCH_ERR_ENOENT_inode;
+		goto err;
+	}
+
+	ret = bch2_inode_unpack(k, &inode);
+	BUG_ON(ret);
+
+	inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
+
+	ret = bch2_inode_write(trans, &iter, &inode);
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+/* set bi_subvol on root inode */
+noinline_for_stack
+static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
+{
+	int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
+				__bch2_fs_upgrade_for_subvolumes(trans));
+	if (ret)
+		bch_err_fn(c, ret);
+	return ret;
+}
+
+const char * const bch2_recovery_passes[] = {
+#define x(_fn, _when)	#_fn,
+	BCH_RECOVERY_PASSES()
+#undef x
+	NULL
+};
+
+static int bch2_check_allocations(struct bch_fs *c)
+{
+	return bch2_gc(c, true, c->opts.norecovery);
+}
+
+static int bch2_set_may_go_rw(struct bch_fs *c)
+{
+	set_bit(BCH_FS_MAY_GO_RW, &c->flags);
+	return 0;
+}
+
+struct recovery_pass_fn {
+	int		(*fn)(struct bch_fs *);
+	unsigned	when;
+};
+
+static struct recovery_pass_fn recovery_pass_fns[] = {
+#define x(_fn, _when)	{ .fn = bch2_##_fn, .when = _when },
+	BCH_RECOVERY_PASSES()
+#undef x
+};
+
+static void check_version_upgrade(struct bch_fs *c)
+{
+	unsigned latest_compatible = bch2_latest_compatible_version(c->sb.version);
+	unsigned latest_version	= bcachefs_metadata_version_current;
+	unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
+	unsigned new_version = 0;
+	u64 recovery_passes;
+
+	if (old_version < bcachefs_metadata_required_upgrade_below) {
+		if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible ||
+		    latest_compatible < bcachefs_metadata_required_upgrade_below)
+			new_version = latest_version;
+		else
+			new_version = latest_compatible;
+	} else {
+		switch (c->opts.version_upgrade) {
+		case BCH_VERSION_UPGRADE_compatible:
+			new_version = latest_compatible;
+			break;
+		case BCH_VERSION_UPGRADE_incompatible:
+			new_version = latest_version;
+			break;
+		case BCH_VERSION_UPGRADE_none:
+			new_version = old_version;
+			break;
+		}
+	}
+
+	if (new_version > old_version) {
+		struct printbuf buf = PRINTBUF;
+
+		if (old_version < bcachefs_metadata_required_upgrade_below)
+			prt_str(&buf, "Version upgrade required:\n");
+
+		if (old_version != c->sb.version) {
+			prt_str(&buf, "Version upgrade from ");
+			bch2_version_to_text(&buf, c->sb.version_upgrade_complete);
+			prt_str(&buf, " to ");
+			bch2_version_to_text(&buf, c->sb.version);
+			prt_str(&buf, " incomplete\n");
+		}
+
+		prt_printf(&buf, "Doing %s version upgrade from ",
+			   BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version)
+			   ? "incompatible" : "compatible");
+		bch2_version_to_text(&buf, old_version);
+		prt_str(&buf, " to ");
+		bch2_version_to_text(&buf, new_version);
+		prt_newline(&buf);
+
+		recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version);
+		if (recovery_passes) {
+			if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK)
+				prt_str(&buf, "fsck required");
+			else {
+				prt_str(&buf, "running recovery passes: ");
+				prt_bitflags(&buf, bch2_recovery_passes, recovery_passes);
+			}
+
+			c->recovery_passes_explicit |= recovery_passes;
+			c->opts.fix_errors = FSCK_FIX_yes;
+		}
+
+		bch_info(c, "%s", buf.buf);
+
+		mutex_lock(&c->sb_lock);
+		bch2_sb_upgrade(c, new_version);
+		mutex_unlock(&c->sb_lock);
+
+		printbuf_exit(&buf);
+	}
+}
+
+u64 bch2_fsck_recovery_passes(void)
+{
+	u64 ret = 0;
+
+	for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
+		if (recovery_pass_fns[i].when & PASS_FSCK)
+			ret |= BIT_ULL(i);
+	return ret;
+}
+
+/* Should @pass run on this mount? Yes if explicitly requested or if its ->when flags match. */
+static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
+{
+	/* look up @pass itself, so the answer never depends on c->curr_recovery_pass */
+	struct recovery_pass_fn *p = recovery_pass_fns + pass;
+
+	if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read)
+		return false;
+	if (c->recovery_passes_explicit & BIT_ULL(pass))
+		return true;
+	if ((p->when & PASS_FSCK) && c->opts.fsck)
+		return true;
+	if ((p->when & PASS_UNCLEAN) && !c->sb.clean)
+		return true;
+	return (p->when & PASS_ALWAYS) != 0;
+}
+
+static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
+{
+	int ret;
+
+	c->curr_recovery_pass = pass;
+
+	if (should_run_recovery_pass(c, pass)) {
+		struct recovery_pass_fn *p = recovery_pass_fns + pass;
+
+		if (!(p->when & PASS_SILENT))
+			printk(KERN_INFO bch2_log_msg(c, "%s..."),
+			       bch2_recovery_passes[pass]);
+		ret = p->fn(c);
+		if (ret)
+			return ret;
+		if (!(p->when & PASS_SILENT))
+			printk(KERN_CONT " done\n");
+
+		c->recovery_passes_complete |= BIT_ULL(pass);
+	}
+
+	return 0;
+}
+
+static int bch2_run_recovery_passes(struct bch_fs *c)
+{
+	int ret = 0;
+
+	while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
+		ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
+		if (bch2_err_matches(ret, BCH_ERR_restart_recovery))
+			continue;
+		if (ret)
+			break;
+		c->curr_recovery_pass++;
+	}
+
+	return ret;
+}
+
+int bch2_fs_recovery(struct bch_fs *c)	/* returns 0 or an error; any error also forces emergency read-only */
+{
+	struct bch_sb_field_clean *clean = NULL;
+	struct jset *last_journal_entry = NULL;
+	u64 last_seq = 0, blacklist_seq, journal_seq;
+	bool write_sb = false;
+	int ret = 0;
+
+	if (c->sb.clean) {
+		clean = bch2_read_superblock_clean(c);
+		ret = PTR_ERR_OR_ZERO(clean);
+		if (ret)
+			goto err;
+
+		bch_info(c, "recovering from clean shutdown, journal seq %llu",
+			 le64_to_cpu(clean->journal_seq));
+	} else {
+		bch_info(c, "recovering from unclean shutdown");
+	}
+
+	if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
+		bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
+		ret = -EINVAL;
+		goto err;
+	}
+
+	if (!c->sb.clean &&
+	    !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
+		bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
+		ret = -EINVAL;
+		goto err;
+	}
+
+	if (c->opts.fsck || !(c->opts.nochanges && c->opts.norecovery))
+		check_version_upgrade(c);
+
+	if (c->opts.fsck && c->opts.norecovery) {
+		bch_err(c, "cannot select both norecovery and fsck");
+		ret = -EINVAL;
+		goto err;
+	}
+
+	ret = bch2_blacklist_table_initialize(c);
+	if (ret) {
+		bch_err(c, "error initializing blacklist table");
+		goto err;
+	}
+
+	if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
+		struct genradix_iter iter;
+		struct journal_replay **i;
+
+		bch_verbose(c, "starting journal read");
+		ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq);
+		if (ret)
+			goto err;
+
+		/*
+		 * note: cmd_list_journal needs the blacklist table fully up to date so
+		 * it can asterisk ignored journal entries:
+		 */
+		if (c->opts.read_journal_only)
+			goto out;	/* ret is 0 here: the journal read succeeded */
+
+		genradix_for_each_reverse(&c->journal_entries, iter, i)
+			if (*i && !(*i)->ignore) {
+				last_journal_entry = &(*i)->j;
+				break;
+			}
+
+		if (mustfix_fsck_err_on(c->sb.clean &&
+					last_journal_entry &&
+					!journal_entry_empty(last_journal_entry), c,
+				"filesystem marked clean but journal not empty")) {
+			c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+			SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+			c->sb.clean = false;
+		}
+
+		if (!last_journal_entry) {
+			fsck_err_on(!c->sb.clean, c, "no journal entries found");
+			if (clean)
+				goto use_clean;	/* fall back to the superblock clean section */
+
+			genradix_for_each_reverse(&c->journal_entries, iter, i)
+				if (*i) {
+					last_journal_entry = &(*i)->j;
+					(*i)->ignore = false;
+					break;
+				}
+		}
+
+		ret = bch2_journal_keys_sort(c);
+		if (ret)
+			goto err;
+
+		if (c->sb.clean && last_journal_entry) {
+			ret = bch2_verify_superblock_clean(c, &clean,
+						      last_journal_entry);
+			if (ret)
+				goto err;
+		}
+	} else {
+use_clean:
+		if (!clean) {
+			bch_err(c, "no superblock clean section found");
+			ret = -BCH_ERR_fsck_repair_impossible;
+			goto err;
+
+		}
+		blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
+	}
+
+	c->journal_replay_seq_start	= last_seq;
+	c->journal_replay_seq_end	= blacklist_seq - 1;
+
+	if (c->opts.reconstruct_alloc) {
+		c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+		drop_alloc_keys(&c->journal_keys);
+	}
+
+	zero_out_btree_mem_ptr(&c->journal_keys);
+
+	ret = journal_replay_early(c, clean);
+	if (ret)
+		goto err;
+
+	/*
+	 * After an unclean shutdown, skip the next few journal sequence
+	 * numbers as they may have been referenced by btree writes that
+	 * happened before their corresponding journal writes - those btree
+	 * writes need to be ignored, by skipping and blacklisting the next few
+	 * journal sequence numbers:
+	 */
+	if (!c->sb.clean)
+		journal_seq += 8;
+
+	if (blacklist_seq != journal_seq) {
+		ret =   bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
+					     blacklist_seq, journal_seq) ?:
+			bch2_journal_seq_blacklist_add(c,
+					blacklist_seq, journal_seq);
+		if (ret) {
+			bch_err(c, "error creating new journal seq blacklist entry");
+			goto err;
+		}
+	}
+
+	ret =   bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu",
+				     journal_seq, last_seq, blacklist_seq - 1) ?:
+		bch2_fs_journal_start(&c->journal, journal_seq);
+	if (ret)
+		goto err;
+
+	if (c->opts.reconstruct_alloc)
+		bch2_journal_log_msg(c, "dropping alloc info");
+
+	/*
+	 * Skip past versions that might have possibly been used (as nonces),
+	 * but hadn't had their pointers written:
+	 */
+	if (c->sb.encryption_type && !c->sb.clean)
+		atomic64_add(1 << 16, &c->key_version);
+
+	ret = read_btree_roots(c);
+	if (ret)
+		goto err;
+
+	if (c->opts.fsck &&
+	    (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) ||
+	     BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)))
+		c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
+
+	ret = bch2_run_recovery_passes(c);
+	if (ret)
+		goto err;
+
+	/* If we fixed errors, verify that fs is actually clean now: */
+	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+	    test_bit(BCH_FS_ERRORS_FIXED, &c->flags) &&
+	    !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags) &&
+	    !test_bit(BCH_FS_ERROR, &c->flags)) {
+		bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
+		clear_bit(BCH_FS_ERRORS_FIXED, &c->flags);
+
+		c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;	/* rewind: the second run starts at check_alloc_info */
+
+		ret = bch2_run_recovery_passes(c);
+		if (ret)
+			goto err;
+
+		if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags) ||
+		    test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) {
+			bch_err(c, "Second fsck run was not clean");
+			set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags);
+		}
+
+		set_bit(BCH_FS_ERRORS_FIXED, &c->flags);	/* restore the flag cleared before the second run */
+	}
+
+	if (enabled_qtypes(c)) {
+		bch_verbose(c, "reading quotas");
+		ret = bch2_fs_quota_read(c);
+		if (ret)
+			goto err;
+		bch_verbose(c, "quotas done");
+	}
+
+	mutex_lock(&c->sb_lock);
+	if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != c->sb.version) {
+		SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, c->sb.version);
+		write_sb = true;
+	}
+
+	if (!test_bit(BCH_FS_ERROR, &c->flags)) {
+		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
+		write_sb = true;
+	}
+
+	if (c->opts.fsck &&
+	    !test_bit(BCH_FS_ERROR, &c->flags) &&
+	    !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) {
+		SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
+		SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0);
+		write_sb = true;
+	}
+
+	if (write_sb)
+		bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
+	    c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) {
+		struct bch_move_stats stats;
+
+		bch2_move_stats_init(&stats, "recovery");
+
+		bch_info(c, "scanning for old btree nodes");
+		ret =   bch2_fs_read_write(c) ?:
+			bch2_scan_old_btree_nodes(c, &stats);
+		if (ret)
+			goto err;
+		bch_info(c, "scanning for old btree nodes done");
+	}
+
+	if (c->journal_seq_blacklist_table &&
+	    c->journal_seq_blacklist_table->nr > 128)
+		queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);
+
+	ret = 0;
+out:	/* common exit: reached on success and, via err, on failure */
+	set_bit(BCH_FS_FSCK_DONE, &c->flags);
+	bch2_flush_fsck_errs(c);
+
+	if (!c->opts.keep_journal &&
+	    test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) {
+		bch2_journal_keys_free(&c->journal_keys);
+		bch2_journal_entries_free(c);
+	}
+	kfree(clean);
+
+	if (!ret && test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) {
+		bch2_fs_read_write_early(c);
+		bch2_delete_dead_snapshots_async(c);
+	}
+
+	if (ret)
+		bch_err_fn(c, ret);
+	return ret;
+err:
+fsck_err:	/* NOTE(review): presumably the jump target of the fsck_err*() macros used above -- confirm */
+	bch2_fs_emergency_read_only(c);
+	goto out;
+}
+
+int bch2_fs_initialize(struct bch_fs *c)
+{
+	struct bch_inode_unpacked root_inode, lostfound_inode;
+	struct bkey_inode_buf packed_inode;
+	struct qstr lostfound = QSTR("lost+found");
+	struct bch_dev *ca;
+	unsigned i;
+	int ret;
+
+	bch_notice(c, "initializing new filesystem");
+
+	mutex_lock(&c->sb_lock);
+	c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
+	c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
+
+	bch2_sb_maybe_downgrade(c);
+
+	if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) {
+		bch2_sb_upgrade(c, bcachefs_metadata_version_current);
+		SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
+		bch2_write_super(c);
+	}
+	mutex_unlock(&c->sb_lock);
+
+	c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns);
+	set_bit(BCH_FS_MAY_GO_RW, &c->flags);
+	set_bit(BCH_FS_FSCK_DONE, &c->flags);
+
+	for (i = 0; i < BTREE_ID_NR; i++)
+		bch2_btree_root_alloc(c, i);
+
+	for_each_online_member(ca, c, i)
+		bch2_dev_usage_init(ca);
+
+	for_each_online_member(ca, c, i) {
+		ret = bch2_dev_journal_alloc(ca);
+		if (ret) {
+			percpu_ref_put(&ca->io_ref);
+			goto err;
+		}
+	}
+
+	/*
+	 * journal_res_get() will crash if called before this has
+	 * set up the journal.pin FIFO and journal.cur pointer:
+	 */
+	bch2_fs_journal_start(&c->journal, 1);
+	bch2_journal_set_replay_done(&c->journal);
+
+	ret = bch2_fs_read_write_early(c);
+	if (ret)
+		goto err;
+
+	/*
+	 * Write out the superblock and journal buckets, now that we can do
+	 * btree updates
+	 */
+	bch_verbose(c, "marking superblocks");
+	for_each_member_device(ca, c, i) {
+		ret = bch2_trans_mark_dev_sb(c, ca);
+		if (ret) {
+			percpu_ref_put(&ca->ref);
+			goto err;
+		}
+
+		ca->new_fs_bucket_idx = 0;
+	}
+
+	ret = bch2_fs_freespace_init(c);
+	if (ret)
+		goto err;
+
+	ret = bch2_initialize_subvolumes(c);
+	if (ret)
+		goto err;
+
+	bch_verbose(c, "reading snapshots table");
+	ret = bch2_snapshots_read(c);
+	if (ret)
+		goto err;
+	bch_verbose(c, "reading snapshots done");
+
+	bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL);
+	root_inode.bi_inum	= BCACHEFS_ROOT_INO;
+	root_inode.bi_subvol	= BCACHEFS_ROOT_SUBVOL;
+	bch2_inode_pack(&packed_inode, &root_inode);
+	packed_inode.inode.k.p.snapshot = U32_MAX;
+
+	ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0);
+	if (ret) {
+		bch_err_msg(c, ret, "creating root directory");
+		goto err;
+	}
+
+	bch2_inode_init_early(c, &lostfound_inode);
+
+	ret = bch2_trans_do(c, NULL, NULL, 0,
+		bch2_create_trans(trans,
+				  BCACHEFS_ROOT_SUBVOL_INUM,
+				  &root_inode, &lostfound_inode,
+				  &lostfound,
+				  0, 0, S_IFDIR|0700, 0,
+				  NULL, NULL, (subvol_inum) { 0 }, 0));
+	if (ret) {
+		bch_err_msg(c, ret, "creating lost+found");
+		goto err;
+	}
+
+	if (enabled_qtypes(c)) {
+		ret = bch2_fs_quota_read(c);
+		if (ret)
+			goto err;
+	}
+
+	ret = bch2_journal_flush(&c->journal);
+	if (ret) {
+		bch_err_msg(c, ret, "writing first journal entry");
+		goto err;
+	}
+
+	mutex_lock(&c->sb_lock);
+	SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
+	SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+
+	bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	return 0;
+err:
+	bch_err_fn(ca, ret);
+	return ret;
+}
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
new file mode 100644
index 000000000000..852d30567da9
--- /dev/null
+++ b/fs/bcachefs/recovery.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_RECOVERY_H
+#define _BCACHEFS_RECOVERY_H
+
+extern const char * const bch2_recovery_passes[];
+
+/*
+ * For when we need to rewind recovery passes and run a pass we skipped:
+ */
+static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c,
+						  enum bch_recovery_pass pass)
+{
+	bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
+		 bch2_recovery_passes[pass], pass,
+		 bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
+
+	c->recovery_passes_explicit |= BIT_ULL(pass);	/* record that @pass was requested */
+
+	if (c->curr_recovery_pass >= pass) {	/* already at or past @pass: rewind to it */
+		c->curr_recovery_pass = pass;
+		c->recovery_passes_complete &= BIT_ULL(pass) - 1;	/* keep every pass before @pass, drop the rest */
+		return -BCH_ERR_restart_recovery;	/* makes bch2_run_recovery_passes() retry from @pass */
+	} else {
+		return 0;	/* @pass is still ahead of the current pass: nothing to rewind */
+	}
+}
+
+u64 bch2_fsck_recovery_passes(void);
+
+int bch2_fs_recovery(struct bch_fs *);
+int bch2_fs_initialize(struct bch_fs *);
+
+#endif /* _BCACHEFS_RECOVERY_H */
diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h
new file mode 100644
index 000000000000..fbfa9d831d6f
--- /dev/null
+++ b/fs/bcachefs/recovery_types.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_RECOVERY_TYPES_H
+#define _BCACHEFS_RECOVERY_TYPES_H
+
+#define PASS_SILENT		BIT(0)	/* suppresses the " done" progress print in bch2_run_recovery_pass() */
+#define PASS_FSCK		BIT(1)	/* NOTE(review): presumably "run when fsck is requested" -- confirm in recovery.c */
+#define PASS_UNCLEAN		BIT(2)	/* NOTE(review): presumably "run after an unclean shutdown" -- confirm */
+#define PASS_ALWAYS		BIT(3)	/* NOTE(review): presumably "run on every recovery" -- confirm */
+
+#define BCH_RECOVERY_PASSES()									\
+	x(alloc_read,			PASS_ALWAYS)						\
+	x(stripes_read,			PASS_ALWAYS)						\
+	x(initialize_subvolumes,	0)							\
+	x(snapshots_read,		PASS_ALWAYS)						\
+	x(check_topology,		0)							\
+	x(check_allocations,		PASS_FSCK)						\
+	x(set_may_go_rw,		PASS_ALWAYS|PASS_SILENT)				\
+	x(journal_replay,		PASS_ALWAYS)						\
+	x(check_alloc_info,		PASS_FSCK)						\
+	x(check_lrus,			PASS_FSCK)						\
+	x(check_btree_backpointers,	PASS_FSCK)						\
+	x(check_backpointers_to_extents,PASS_FSCK)						\
+	x(check_extents_to_backpointers,PASS_FSCK)						\
+	x(check_alloc_to_lru_refs,	PASS_FSCK)						\
+	x(fs_freespace_init,		PASS_ALWAYS|PASS_SILENT)				\
+	x(bucket_gens_init,		0)							\
+	x(check_snapshot_trees,		PASS_FSCK)						\
+	x(check_snapshots,		PASS_FSCK)						\
+	x(check_subvols,		PASS_FSCK)						\
+	x(delete_dead_snapshots,	PASS_FSCK|PASS_UNCLEAN)					\
+	x(fs_upgrade_for_subvolumes,	0)							\
+	x(resume_logged_ops,		PASS_ALWAYS)						\
+	x(check_inodes,			PASS_FSCK)						\
+	x(check_extents,		PASS_FSCK)						\
+	x(check_dirents,		PASS_FSCK)						\
+	x(check_xattrs,			PASS_FSCK)						\
+	x(check_root,			PASS_FSCK)						\
+	x(check_directory_structure,	PASS_FSCK)						\
+	x(check_nlinks,			PASS_FSCK)						\
+	x(delete_dead_inodes,		PASS_FSCK|PASS_UNCLEAN)					\
+	x(fix_reflink_p,		0)							\
+
+enum bch_recovery_pass {	/* one BCH_RECOVERY_PASS_<name> per BCH_RECOVERY_PASSES() entry, in list order */
+#define x(n, when)	BCH_RECOVERY_PASS_##n,
+	BCH_RECOVERY_PASSES()
+#undef x
+};
+
+#endif /* _BCACHEFS_RECOVERY_TYPES_H */
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
new file mode 100644
index 000000000000..d77d0ea9afff
--- /dev/null
+++ b/fs/bcachefs/reflink.c
@@ -0,0 +1,405 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "extents.h"
+#include "inode.h"
+#include "io_misc.h"
+#include "io_write.h"
+#include "reflink.h"
+#include "subvolume.h"
+#include "super-io.h"
+
+#include <linux/sched/signal.h>
+
+static inline unsigned bkey_type_to_indirect(const struct bkey *k)
+{
+	/* Key types with no indirect counterpart map to 0. */
+	if (k->type == KEY_TYPE_extent)
+		return KEY_TYPE_reflink_v;
+
+	if (k->type == KEY_TYPE_inline_data)
+		return KEY_TYPE_indirect_inline_data;
+
+	return 0;
+}
+
+/* reflink pointers */
+
+int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k,
+			   enum bkey_invalid_flags flags,
+			   struct printbuf *err)
+{
+	struct bkey_s_c_reflink_p ptr = bkey_s_c_to_reflink_p(k);
+	u64 idx = le64_to_cpu(ptr.v->idx);
+	u32 front_pad = le32_to_cpu(ptr.v->front_pad);
+
+	/* idx >= front_pad is only enforced from the reflink_p_fix version on. */
+	if (c->sb.version < bcachefs_metadata_version_reflink_p_fix || idx >= front_pad)
+		return 0;
+
+	prt_printf(err, "idx < front_pad (%llu < %u)", idx, front_pad);
+	return -EINVAL;
+}
+
+void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
+			    struct bkey_s_c k)
+{
+	struct bkey_s_c_reflink_p ptr = bkey_s_c_to_reflink_p(k);
+	u64 idx = le64_to_cpu(ptr.v->idx);
+	u32 front = le32_to_cpu(ptr.v->front_pad), back = le32_to_cpu(ptr.v->back_pad);
+
+	/* All three value fields, converted from little endian. */
+	prt_printf(out, "idx %llu front_pad %u back_pad %u", idx, front, back);
+}
+
+bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
+{
+	struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l);
+	struct bkey_s_c_reflink_p r = bkey_s_c_to_reflink_p(_r);
+
+	/*
+	 * Disabled for now, the triggers code needs to be reworked for merging
+	 * of reflink pointers to work:
+	 */
+	return false;	/* unconditional: the merge logic below is currently unreachable */
+
+	if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx))	/* r's idx must directly follow l's range */
+		return false;
+
+	bch2_key_resize(l.k, l.k->size + r.k->size);	/* grow l by r's size */
+	return true;
+}
+
+/* indirect extents */
+
+int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k,
+			   enum bkey_invalid_flags flags,
+			   struct printbuf *err)
+{
+	return bch2_bkey_ptrs_invalid(c, k, flags, err);	/* validation is delegated entirely to the pointer check */
+}
+
+void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
+			    struct bkey_s_c k)
+{
+	u64 nr_refs = le64_to_cpu(bkey_s_c_to_reflink_v(k).v->refcount);
+
+	/* Refcount first, then the key's pointers. */
+	prt_printf(out, "refcount: %llu ", nr_refs);
+	bch2_bkey_ptrs_to_text(out, c, k);
+}
+
+#if 0
+Currently disabled, needs to be debugged:
+
+bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
+{
+	struct bkey_s_reflink_v   l = bkey_s_to_reflink_v(_l);
+	struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r);
+
+	return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r);
+}
+#endif
+
+int bch2_trans_mark_reflink_v(struct btree_trans *trans,
+			      enum btree_id btree_id, unsigned level,
+			      struct bkey_s_c old, struct bkey_i *new,
+			      unsigned flags)
+{
+	/* Not an overwrite and zero refcount: rewrite @new as an empty deleted key. */
+	if (!(flags & BTREE_TRIGGER_OVERWRITE) &&
+	    !bkey_i_to_reflink_v(new)->v.refcount) {
+		struct bkey *dead = &new->k;
+
+		dead->type = KEY_TYPE_deleted;
+		dead->size = 0;
+		set_bkey_val_u64s(dead, 0);
+		return 0;
+	}
+
+	return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags);
+}
+
+/* indirect inline data */
+
+int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
+				      enum bkey_invalid_flags flags,
+				      struct printbuf *err)
+{
+	return 0;	/* no type-specific validation is performed here */
+}
+
+void bch2_indirect_inline_data_to_text(struct printbuf *out,
+					struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_s_c_indirect_inline_data ind = bkey_s_c_to_indirect_inline_data(k);
+	unsigned nbytes = bkey_inline_data_bytes(k.k);
+	unsigned shown = min(nbytes, 32U);	/* hex-dump at most 32 bytes */
+
+	prt_printf(out, "refcount %llu datalen %u: %*phN",
+	       le64_to_cpu(ind.v->refcount), nbytes, shown, ind.v->data);
+}
+
+int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans,
+			      enum btree_id btree_id, unsigned level,
+			      struct bkey_s_c old, struct bkey_i *new,
+			      unsigned flags)
+{
+	struct bkey_i_indirect_inline_data *ind;
+
+	if (flags & BTREE_TRIGGER_OVERWRITE)
+		return 0;
+
+	ind = bkey_i_to_indirect_inline_data(new);
+	if (!ind->v.refcount) {	/* zero refcount: rewrite as an empty deleted key */
+		ind->k.type = KEY_TYPE_deleted;
+		ind->k.size = 0;
+		set_bkey_val_u64s(&ind->k, 0);
+	}
+	return 0;
+}
+
+static int bch2_make_extent_indirect(struct btree_trans *trans,
+				     struct btree_iter *extent_iter,
+				     struct bkey_i *orig)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter reflink_iter = { NULL };
+	struct bkey_s_c k;
+	struct bkey_i *r_v;
+	struct bkey_i_reflink_p *r_p;
+	__le64 *refcount;
+	int ret;
+
+	if (orig->k.type == KEY_TYPE_inline_data)
+		bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data);
+
+	bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX,
+			     BTREE_ITER_INTENT);
+	k = bch2_btree_iter_peek_prev(&reflink_iter);	/* k is only checked for error; reflink_iter.pos is used below */
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k));	/* room for a refcount plus a copy of orig */
+	ret = PTR_ERR_OR_ZERO(r_v);
+	if (ret)
+		goto err;
+
+	bkey_init(&r_v->k);
+	r_v->k.type	= bkey_type_to_indirect(&orig->k);
+	r_v->k.p	= reflink_iter.pos;
+	bch2_key_resize(&r_v->k, orig->k.size);
+	r_v->k.version	= orig->k.version;
+
+	set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k));
+
+	refcount	= bkey_refcount(r_v);
+	*refcount	= 0;
+	memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k));	/* orig's value is placed right after the refcount */
+
+	ret = bch2_trans_update(trans, &reflink_iter, r_v, 0);
+	if (ret)
+		goto err;
+
+	/*
+	 * orig is in a bkey_buf which statically allocates 5 64s for the val,
+	 * so we know it will be big enough:
+	 */
+	orig->k.type = KEY_TYPE_reflink_p;	/* orig is rewritten in place into a reflink pointer */
+	r_p = bkey_i_to_reflink_p(orig);
+	set_bkey_val_bytes(&r_p->k, sizeof(r_p->v));
+
+	/* FORTIFY_SOURCE is broken here, and doesn't provide unsafe_memset() */
+#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE)
+	__underlying_memset(&r_p->v, 0, sizeof(r_p->v));
+#else
+	memset(&r_p->v, 0, sizeof(r_p->v));
+#endif
+
+	r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k));	/* point at the start of the new indirect key */
+
+	ret = bch2_trans_update(trans, extent_iter, &r_p->k_i,
+				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+err:
+	bch2_trans_iter_exit(trans, &reflink_iter);	/* runs on success and on every error path */
+
+	return ret;
+}
+
+static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
+{
+	struct bkey_s_c k;
+	int ret;	/* NOTE(review): presumably always assigned by the iteration macro below -- confirm */
+
+	for_each_btree_key_upto_continue_norestart(*iter, end, 0, k, ret) {
+		if (bkey_extent_is_unwritten(k))	/* unwritten extents are skipped */
+			continue;
+
+		if (bkey_extent_is_data(k.k))	/* the first data extent found is the result */
+			return k;
+	}
+
+	if (bkey_ge(iter->pos, end))	/* nothing returned: clamp the iterator position to end */
+		bch2_btree_iter_set_pos(iter, end);
+	return ret ? bkey_s_c_err(ret) : bkey_s_c_null;	/* error key if ret is set, otherwise a null key */
+}
+
+s64 bch2_remap_range(struct bch_fs *c,
+		     subvol_inum dst_inum, u64 dst_offset,
+		     subvol_inum src_inum, u64 src_offset,
+		     u64 remap_sectors,
+		     u64 new_i_size, s64 *i_sectors_delta)
+{
+	struct btree_trans *trans;
+	struct btree_iter dst_iter, src_iter;
+	struct bkey_s_c src_k;
+	struct bkey_buf new_dst, new_src;
+	struct bpos dst_start = POS(dst_inum.inum, dst_offset);
+	struct bpos src_start = POS(src_inum.inum, src_offset);
+	struct bpos dst_end = dst_start, src_end = src_start;
+	struct bpos src_want;
+	u64 dst_done;
+	u32 dst_snapshot, src_snapshot;
+	int ret = 0, ret2 = 0;
+
+	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink))	/* ref is dropped again just before returning */
+		return -BCH_ERR_erofs_no_writes;
+
+	bch2_check_set_feature(c, BCH_FEATURE_reflink);
+
+	dst_end.offset += remap_sectors;
+	src_end.offset += remap_sectors;
+
+	bch2_bkey_buf_init(&new_dst);
+	bch2_bkey_buf_init(&new_src);
+	trans = bch2_trans_get(c);
+
+	bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
+			     BTREE_ITER_INTENT);
+	bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
+			     BTREE_ITER_INTENT);
+
+	while ((ret == 0 ||
+		bch2_err_matches(ret, BCH_ERR_transaction_restart)) &&
+	       bkey_lt(dst_iter.pos, dst_end)) {	/* loop until dst reaches dst_end or a non-restart error */
+		struct disk_reservation disk_res = { 0 };
+
+		bch2_trans_begin(trans);
+
+		if (fatal_signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+
+		ret = bch2_subvolume_get_snapshot(trans, src_inum.subvol,
+						  &src_snapshot);
+		if (ret)
+			continue;	/* the loop condition decides between retry and exit */
+
+		bch2_btree_iter_set_snapshot(&src_iter, src_snapshot);
+
+		ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol,
+						  &dst_snapshot);
+		if (ret)
+			continue;
+
+		bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot);
+
+		dst_done = dst_iter.pos.offset - dst_start.offset;
+		src_want = POS(src_start.inode, src_start.offset + dst_done);	/* source offset matching dst's progress */
+		bch2_btree_iter_set_pos(&src_iter, src_want);
+
+		src_k = get_next_src(&src_iter, src_end);
+		ret = bkey_err(src_k);
+		if (ret)
+			continue;
+
+		if (bkey_lt(src_want, src_iter.pos)) {	/* source iterator moved past src_want: punch that gap in dst */
+			ret = bch2_fpunch_at(trans, &dst_iter, dst_inum,
+					min(dst_end.offset,
+					    dst_iter.pos.offset +
+					    src_iter.pos.offset - src_want.offset),
+					i_sectors_delta);
+			continue;
+		}
+
+		if (src_k.k->type != KEY_TYPE_reflink_p) {	/* not yet a reflink pointer: convert the source first */
+			bch2_btree_iter_set_pos_to_extent_start(&src_iter);
+
+			bch2_bkey_buf_reassemble(&new_src, c, src_k);
+			src_k = bkey_i_to_s_c(new_src.k);
+
+			ret = bch2_make_extent_indirect(trans, &src_iter,
+						new_src.k);
+			if (ret)
+				continue;
+
+			BUG_ON(src_k.k->type != KEY_TYPE_reflink_p);
+		}
+
+		if (src_k.k->type == KEY_TYPE_reflink_p) {
+			struct bkey_s_c_reflink_p src_p =
+				bkey_s_c_to_reflink_p(src_k);
+			struct bkey_i_reflink_p *dst_p =
+				bkey_reflink_p_init(new_dst.k);
+
+			u64 offset = le64_to_cpu(src_p.v->idx) +
+				(src_want.offset -
+				 bkey_start_offset(src_k.k));	/* src idx advanced by how far src_want is into src_k */
+
+			dst_p->v.idx = cpu_to_le64(offset);
+		} else {
+			BUG();
+		}
+
+		new_dst.k->k.p = dst_iter.pos;
+		bch2_key_resize(&new_dst.k->k,
+				min(src_k.k->p.offset - src_want.offset,
+				    dst_end.offset - dst_iter.pos.offset));	/* limited by the rest of src_k and of the dst range */
+
+		ret = bch2_extent_update(trans, dst_inum, &dst_iter,
+					 new_dst.k, &disk_res,
+					 new_i_size, i_sectors_delta,
+					 true);
+		bch2_disk_reservation_put(c, &disk_res);
+	}
+	bch2_trans_iter_exit(trans, &dst_iter);
+	bch2_trans_iter_exit(trans, &src_iter);
+
+	BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end));
+	BUG_ON(bkey_gt(dst_iter.pos, dst_end));
+
+	dst_done = dst_iter.pos.offset - dst_start.offset;
+	new_i_size = min(dst_iter.pos.offset << 9, new_i_size);	/* cap the size at the byte offset actually reached */
+
+	do {
+		struct bch_inode_unpacked inode_u;
+		struct btree_iter inode_iter = { NULL };
+
+		bch2_trans_begin(trans);
+
+		ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u,
+				       dst_inum, BTREE_ITER_INTENT);
+
+		if (!ret2 &&
+		    inode_u.bi_size < new_i_size) {	/* bi_size is only ever raised here */
+			inode_u.bi_size = new_i_size;
+			ret2  = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+				bch2_trans_commit(trans, NULL, NULL,
+						  BTREE_INSERT_NOFAIL);
+		}
+
+		bch2_trans_iter_exit(trans, &inode_iter);
+	} while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
+
+	bch2_trans_put(trans);
+	bch2_bkey_buf_exit(&new_src, c);
+	bch2_bkey_buf_exit(&new_dst, c);
+
+	bch2_write_ref_put(c, BCH_WRITE_REF_reflink);
+
+	return dst_done ?: ret ?: ret2;	/* sectors remapped if nonzero, else ret, else ret2 */
+}
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
new file mode 100644
index 000000000000..fe52538efb52
--- /dev/null
+++ b/fs/bcachefs/reflink.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REFLINK_H
+#define _BCACHEFS_REFLINK_H
+
+enum bkey_invalid_flags;
+
+int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c,
+			   enum bkey_invalid_flags, struct printbuf *);
+void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
+			    struct bkey_s_c);
+bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+
+#define bch2_bkey_ops_reflink_p ((struct bkey_ops) {		\
+	.key_invalid	= bch2_reflink_p_invalid,		\
+	.val_to_text	= bch2_reflink_p_to_text,		\
+	.key_merge	= bch2_reflink_p_merge,			\
+	.trans_trigger	= bch2_trans_mark_reflink_p,		\
+	.atomic_trigger	= bch2_mark_reflink_p,			\
+	.min_val_size	= 16,					\
+})
+
+int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c,
+			   enum bkey_invalid_flags, struct printbuf *);
+void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
+			    struct bkey_s_c);
+int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned,
+			      struct bkey_s_c, struct bkey_i *, unsigned);
+
+#define bch2_bkey_ops_reflink_v ((struct bkey_ops) {		\
+	.key_invalid	= bch2_reflink_v_invalid,		\
+	.val_to_text	= bch2_reflink_v_to_text,		\
+	.swab		= bch2_ptr_swab,			\
+	.trans_trigger	= bch2_trans_mark_reflink_v,		\
+	.atomic_trigger	= bch2_mark_extent,			\
+	.min_val_size	= 8,					\
+})
+
+int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c,
+				      enum bkey_invalid_flags, struct printbuf *);
+void bch2_indirect_inline_data_to_text(struct printbuf *,
+				struct bch_fs *, struct bkey_s_c);
+int bch2_trans_mark_indirect_inline_data(struct btree_trans *,
+					 enum btree_id, unsigned,
+			      struct bkey_s_c, struct bkey_i *,
+			      unsigned);
+
+#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) {	\
+	.key_invalid	= bch2_indirect_inline_data_invalid,	\
+	.val_to_text	= bch2_indirect_inline_data_to_text,	\
+	.trans_trigger	= bch2_trans_mark_indirect_inline_data,	\
+	.min_val_size	= 8,					\
+})
+
+static inline const __le64 *bkey_refcount_c(struct bkey_s_c k)
+{
+	/* Only reflink_v and indirect_inline_data keys have a refcount. */
+	if (k.k->type == KEY_TYPE_reflink_v)
+		return &bkey_s_c_to_reflink_v(k).v->refcount;
+
+	if (k.k->type == KEY_TYPE_indirect_inline_data)
+		return &bkey_s_c_to_indirect_inline_data(k).v->refcount;
+
+	return NULL;
+}
+
+static inline __le64 *bkey_refcount(struct bkey_i *k)
+{
+	/* Only reflink_v and indirect_inline_data keys have a refcount. */
+	if (k->k.type == KEY_TYPE_reflink_v)
+		return &bkey_i_to_reflink_v(k)->v.refcount;
+
+	if (k->k.type == KEY_TYPE_indirect_inline_data)
+		return &bkey_i_to_indirect_inline_data(k)->v.refcount;
+
+	return NULL;
+}
+
+s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64,
+		     subvol_inum, u64, u64, u64, s64 *);
+
+#endif /* _BCACHEFS_REFLINK_H */
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
new file mode 100644
index 000000000000..cef2a0447b86
--- /dev/null
+++ b/fs/bcachefs/replicas.c
@@ -0,0 +1,1058 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "buckets.h"
+#include "journal.h"
+#include "replicas.h"
+#include "super-io.h"
+
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
+					    struct bch_replicas_cpu *);
+
+/* Replicas tracking - in memory: */
+
+static void verify_replicas_entry(struct bch_replicas_entry *e)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+	unsigned idx;
+
+	/* debug-only sanity checks; any violation is fatal */
+	BUG_ON(!(e->data_type < BCH_DATA_NR));
+	BUG_ON(e->nr_devs == 0);
+	BUG_ON(e->nr_required > 1 && e->nr_required >= e->nr_devs);
+	/* the device list must be strictly ascending: sorted, no duplicates */
+	for (idx = 1; idx < e->nr_devs; idx++)
+		BUG_ON(e->devs[idx - 1] >= e->devs[idx]);
+#endif
+}
+
+void bch2_replicas_entry_sort(struct bch_replicas_entry *e)
+{
+	bubble_sort(e->devs, e->nr_devs, u8_cmp);	/* sorts e->devs[0..nr_devs) in place using u8_cmp */
+}
+
+static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
+{
+	eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);	/* whole slots compared bytewise */
+}
+
+/* Print "<type>: <nr_devs> [<devs>]" for a v0 replicas entry. */
+static void bch2_replicas_entry_v0_to_text(struct printbuf *out, struct bch_replicas_entry_v0 *e)
+{
+	unsigned dev;
+
+	if (e->data_type >= BCH_DATA_NR)
+		prt_printf(out, "(invalid data type %u)", e->data_type);
+	else
+		prt_printf(out, "%s", bch2_data_types[e->data_type]);
+	/* device count, then the space-separated device list */
+	prt_printf(out, ": %u [", e->nr_devs);
+	for (dev = 0; dev < e->nr_devs; dev++)
+		prt_printf(out, dev ? " %u" : "%u", e->devs[dev]);
+	prt_printf(out, "]");
+}
+
+/* Print "<type>: <nr_required>/<nr_devs> [<devs>]" for a replicas entry. */
+void bch2_replicas_entry_to_text(struct printbuf *out, struct bch_replicas_entry *e)
+{
+	unsigned dev;
+
+	if (e->data_type >= BCH_DATA_NR)
+		prt_printf(out, "(invalid data type %u)", e->data_type);
+	else
+		prt_printf(out, "%s", bch2_data_types[e->data_type]);
+	/* required/total device counts, then the space-separated device list */
+	prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
+	for (dev = 0; dev < e->nr_devs; dev++)
+		prt_printf(out, dev ? " %u" : "%u", e->devs[dev]);
+	prt_printf(out, "]");
+}
+
+void bch2_cpu_replicas_to_text(struct printbuf *out,
+			       struct bch_replicas_cpu *r)
+{
+	struct bch_replicas_entry *cur;
+
+	/* entries are printed back to back, separated by single spaces */
+	for_each_cpu_replicas_entry(r, cur) {
+		/* only the very first entry has no separator in front of it */
+		if (cur != r->entries)
+			prt_printf(out, " ");
+
+		bch2_replicas_entry_to_text(out, cur);
+	}
+}
+
+/* Collect the devices of @k's non-cached, non-erasure-coded pointers into @r. */
+static void extent_to_replicas(struct bkey_s_c k, struct bch_replicas_entry *r)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+
+	r->nr_required	= 1;
+
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		if (p.ptr.cached)
+			continue;
+		/* an erasure-coded pointer is not listed; it resets nr_required to 0 */
+		if (p.has_ec)
+			r->nr_required = 0;
+		else
+			r->devs[r->nr_devs++] = p.ptr.dev;
+	}
+}
+
+/* Fill @r from stripe key @k: one device per block, nr_required = blocks - redundant. */
+static void stripe_to_replicas(struct bkey_s_c k,
+			       struct bch_replicas_entry *r)
+{
+	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+	const struct bch_extent_ptr *ptrs = s.v->ptrs;
+	unsigned blk;
+
+	r->nr_required	= s.v->nr_blocks - s.v->nr_redundant;
+
+	for (blk = 0; blk < s.v->nr_blocks; blk++)
+		r->devs[r->nr_devs++] = ptrs[blk].dev;
+}
+
+/* Build the replicas entry @e (data type + sorted device list) for key @k. */
+void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
+			   struct bkey_s_c k)
+{
+	unsigned type = k.k->type;
+
+	e->nr_devs = 0;
+
+	if (type == KEY_TYPE_btree_ptr ||
+	    type == KEY_TYPE_btree_ptr_v2) {
+		e->data_type = BCH_DATA_btree;
+		extent_to_replicas(k, e);
+	} else if (type == KEY_TYPE_extent ||
+		   type == KEY_TYPE_reflink_v) {
+		e->data_type = BCH_DATA_user;
+		extent_to_replicas(k, e);
+	} else if (type == KEY_TYPE_stripe) {
+		e->data_type = BCH_DATA_parity;
+		stripe_to_replicas(k, e);
+	}
+
+	/* any other key type leaves the entry with zero devices */
+	bch2_replicas_entry_sort(e);
+}
+
+/* Build a sorted entry of @data_type with nr_required == 1 from the list @devs. */
+void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
+			      enum bch_data_type data_type,
+			      struct bch_devs_list devs)
+{
+	unsigned n;
+
+	BUG_ON(!data_type || data_type == BCH_DATA_sb ||
+	       data_type >= BCH_DATA_NR);
+
+	e->nr_required	= 1;
+	e->nr_devs	= 0;
+	e->data_type	= data_type;
+
+	for (n = 0; n < devs.nr; n++)
+		e->devs[e->nr_devs++] = devs.devs[n];
+
+	bch2_replicas_entry_sort(e);
+}
+
+static struct bch_replicas_cpu
+cpu_replicas_add_entry(struct bch_replicas_cpu *old,
+		       struct bch_replicas_entry *new_entry)
+{
+	unsigned i;
+	struct bch_replicas_cpu new = {
+		.nr		= old->nr + 1,
+		.entry_size	= max_t(unsigned, old->entry_size,
+					replicas_entry_bytes(new_entry)),
+	};
+
+	BUG_ON(!new_entry->data_type);
+	verify_replicas_entry(new_entry);
+	/* returns a new zeroed table = old + new_entry; on OOM .entries is NULL (caller checks) */
+	new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
+	if (!new.entries)
+		return new;
+	/* copy the old entries into slots of the (possibly wider) new stride */
+	for (i = 0; i < old->nr; i++)
+		memcpy(cpu_replicas_entry(&new, i),
+		       cpu_replicas_entry(old, i),
+		       old->entry_size);
+	/* the new entry goes into the last slot ... */
+	memcpy(cpu_replicas_entry(&new, old->nr),
+	       new_entry,
+	       replicas_entry_bytes(new_entry));
+	/* ... and the table is then re-sorted */
+	bch2_cpu_replicas_sort(&new);
+	return new;
+}
+
+static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
+				       struct bch_replicas_entry *search)
+{
+	int idx, entry_size = replicas_entry_bytes(search);
+	/* returns the slot index of @search in @r, or -1; a key wider than the stride cannot match */
+	if (unlikely(entry_size > r->entry_size))
+		return -1;
+
+	verify_replicas_entry(search);
+	/* the comparison ignores its size argument and compares only the key's own byte length */
+#define entry_cmp(_l, _r, size)	memcmp(_l, _r, entry_size)
+	idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
+			      entry_cmp, search);
+#undef entry_cmp
+	/* an index at or past r->nr is reported as "not found" */
+	return idx < r->nr ? idx : -1;
+}
+
+int bch2_replicas_entry_idx(struct bch_fs *c,
+			    struct bch_replicas_entry *search)
+{
+	bch2_replicas_entry_sort(search);
+	/* note: @search was just sorted in place; result is the index in c->replicas or -1 */
+	return __replicas_entry_idx(&c->replicas, search);
+}
+
+static bool __replicas_has_entry(struct bch_replicas_cpu *r,
+				 struct bch_replicas_entry *search)
+{
+	return __replicas_entry_idx(r, search) >= 0;	/* -1 means absent */
+}
+
+bool bch2_replicas_marked(struct bch_fs *c,
+			  struct bch_replicas_entry *search)
+{
+	bool marked;
+	/* an entry with no devices is reported as marked without any lookup */
+	if (!search->nr_devs)
+		return true;
+
+	verify_replicas_entry(search);
+	/* must be in c->replicas and, while a gc table exists (entries != NULL), in that too */
+	percpu_down_read(&c->mark_lock);
+	marked = __replicas_has_entry(&c->replicas, search) &&
+		(likely((!c->replicas_gc.entries)) ||
+		 __replicas_has_entry(&c->replicas_gc, search));
+	percpu_up_read(&c->mark_lock);
+
+	return marked;
+}
+
+static void __replicas_table_update(struct bch_fs_usage *dst,
+				    struct bch_replicas_cpu *dst_r,
+				    struct bch_fs_usage *src,
+				    struct bch_replicas_cpu *src_r)
+{
+	int src_idx, dst_idx;
+	/* struct copy of *src first; the per-entry replicas[] counters are remapped below */
+	*dst = *src;
+
+	for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
+		if (!src->replicas[src_idx])
+			continue;
+		/* a nonzero counter's entry must also exist in the destination table */
+		dst_idx = __replicas_entry_idx(dst_r,
+				cpu_replicas_entry(src_r, src_idx));
+		BUG_ON(dst_idx < 0);
+
+		dst->replicas[dst_idx] = src->replicas[src_idx];
+	}
+}
+
+static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
+				    struct bch_replicas_cpu *dst_r,
+				    struct bch_fs_usage __percpu *src_p,
+				    struct bch_replicas_cpu *src_r)
+{
+	unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
+	struct bch_fs_usage *dst, *src = (void *)
+		bch2_acc_percpu_u64s((u64 __percpu *) src_p, src_nr);
+	/* src is what bch2_acc_percpu_u64s() returned; dst is the current CPU's slot of dst_p */
+	preempt_disable();
+	dst = this_cpu_ptr(dst_p);
+	preempt_enable();
+	/* NOTE(review): dst is used after preempt_enable() - presumably callers serialize; confirm */
+	__replicas_table_update(dst, dst_r, src, src_r);
+}
+
+/*
+ * Resize filesystem accounting:
+ */
+static int replicas_table_update(struct bch_fs *c,
+				 struct bch_replicas_cpu *new_r)
+{
+	struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR];
+	struct bch_fs_usage_online *new_scratch = NULL;
+	struct bch_fs_usage __percpu *new_gc = NULL;
+	struct bch_fs_usage *new_base = NULL;
+	unsigned i, bytes = sizeof(struct bch_fs_usage) +
+		sizeof(u64) * new_r->nr;
+	unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) +
+		sizeof(u64) * new_r->nr;
+	int ret = 0;
+	/* zero the pointer array so the shared exit path can free it unconditionally */
+	memset(new_usage, 0, sizeof(new_usage));
+	/* every replacement buffer is allocated before anything in @c is touched */
+	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+		if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
+					sizeof(u64), GFP_KERNEL)))
+			goto err;
+
+	if (!(new_base = kzalloc(bytes, GFP_KERNEL)) ||
+	    !(new_scratch  = kmalloc(scratch_bytes, GFP_KERNEL)) ||
+	    (c->usage_gc &&
+	     !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL))))
+		goto err;
+	/* carry the existing counters over, re-indexed for the layout of new_r */
+	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+		if (c->usage[i])
+			__replicas_table_update_pcpu(new_usage[i], new_r,
+						     c->usage[i], &c->replicas);
+	if (c->usage_base)
+		__replicas_table_update(new_base,		new_r,
+					c->usage_base,		&c->replicas);
+	if (c->usage_gc)
+		__replicas_table_update_pcpu(new_gc,		new_r,
+					     c->usage_gc,	&c->replicas);
+	/* publish: after these swaps the new_* variables and *new_r hold the old objects */
+	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+		swap(c->usage[i],	new_usage[i]);
+	swap(c->usage_base,	new_base);
+	swap(c->usage_scratch,	new_scratch);
+	swap(c->usage_gc,	new_gc);
+	swap(c->replicas,	*new_r);
+out:
+	free_percpu(new_gc);
+	kfree(new_scratch);
+	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+		free_percpu(new_usage[i]);
+	kfree(new_base);
+	return ret;
+err:
+	bch_err(c, "error updating replicas table: memory allocation failure");
+	ret = -BCH_ERR_ENOMEM_replicas_table;
+	goto out;
+}
+
+/* Number of u64s to reserve in the journal for usage entries covering table @r. */
+static unsigned reserve_journal_replicas(struct bch_fs *c,
+				     struct bch_replicas_cpu *r)
+{
+	const unsigned usage_u64s =
+		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));
+	struct bch_replicas_entry *e;
+	unsigned total = 0;
+
+	/* nr_inodes: */
+	total += usage_u64s;
+
+	/* key_version: */
+	total += usage_u64s;
+
+	/* persistent_reserved: */
+	total += usage_u64s * BCH_REPLICAS_MAX;
+
+	/* one data_usage entry per replicas entry, sized by its device count */
+	for_each_cpu_replicas_entry(r, e)
+		total += DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) +
+				      e->nr_devs, sizeof(u64));
+
+	return total;
+}
+
+noinline
+static int bch2_mark_replicas_slowpath(struct bch_fs *c,
+				struct bch_replicas_entry *new_entry)
+{
+	struct bch_replicas_cpu new_r, new_gc;
+	int ret = 0;
+
+	verify_replicas_entry(new_entry);
+	/* both start empty so the unconditional kfree()s at out: are safe */
+	memset(&new_r, 0, sizeof(new_r));
+	memset(&new_gc, 0, sizeof(new_gc));
+
+	mutex_lock(&c->sb_lock);
+	/* while a gc table exists, the entry has to be added to it as well */
+	if (c->replicas_gc.entries &&
+	    !__replicas_has_entry(&c->replicas_gc, new_entry)) {
+		new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry);
+		if (!new_gc.entries) {
+			ret = -BCH_ERR_ENOMEM_cpu_replicas;
+			goto err;
+		}
+	}
+	/* missing from the live table: build the enlarged copy and stage it in the superblock */
+	if (!__replicas_has_entry(&c->replicas, new_entry)) {
+		new_r = cpu_replicas_add_entry(&c->replicas, new_entry);
+		if (!new_r.entries) {
+			ret = -BCH_ERR_ENOMEM_cpu_replicas;
+			goto err;
+		}
+
+		ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
+		if (ret)
+			goto err;
+
+		bch2_journal_entry_res_resize(&c->journal,
+				&c->replicas_journal_res,
+				reserve_journal_replicas(c, &new_r));
+	}
+	/* the entry turned out to be present in every table that needs it */
+	if (!new_r.entries &&
+	    !new_gc.entries)
+		goto out;
+
+	/* allocations done, now commit: */
+	/* NOTE(review): the result of bch2_write_super() is not checked here */
+	if (new_r.entries)
+		bch2_write_super(c);
+
+	/* don't update in memory replicas until changes are persistent */
+	percpu_down_write(&c->mark_lock);
+	if (new_r.entries)
+		ret = replicas_table_update(c, &new_r);
+	if (new_gc.entries)
+		swap(new_gc, c->replicas_gc);
+	percpu_up_write(&c->mark_lock);
+out:
+	mutex_unlock(&c->sb_lock);
+	/* after a swap these hold the replaced tables; otherwise the unused new ones or NULL */
+	kfree(new_r.entries);
+	kfree(new_gc.entries);
+
+	return ret;
+err:
+	bch_err_msg(c, ret, "adding replicas entry");
+	goto out;
+}
+
+int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
+{
+	bool marked = bch2_replicas_marked(c, r);	/* fast-path lookup */
+	return likely(marked) ? 0 : bch2_mark_replicas_slowpath(c, r);
+}
+
+/* replicas delta list: */
+
+int bch2_replicas_delta_list_mark(struct bch_fs *c,
+				  struct replicas_delta_list *r)
+{
+	struct replicas_delta *cur, *end = (void *) r->d + r->used;
+	int ret = 0;
+
+	/* mark each delta's replicas entry in turn; the first error ends the walk */
+	for (cur = r->d; !ret && cur != end; cur = replicas_delta_next(cur))
+		ret = bch2_mark_replicas(c, &cur->r);
+	return ret;
+}
+
+/*
+ * Old replicas_gc mechanism: only used for journal replicas entries now, should
+ * die at some point:
+ */
+
+/* Finish a gc pass: commit c->replicas_gc unless @ret is set, then drop the gc table. */
+int bch2_replicas_gc_end(struct bch_fs *c, int ret)
+{
+	lockdep_assert_held(&c->replicas_gc_lock);
+
+	/* take both locks before any "goto err": the err path releases them */
+	mutex_lock(&c->sb_lock);
+	percpu_down_write(&c->mark_lock);
+
+	if (ret)
+		goto err;
+
+	ret = bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc);
+	if (ret)
+		goto err;
+
+	ret = replicas_table_update(c, &c->replicas_gc);
+err:
+	kfree(c->replicas_gc.entries);
+	c->replicas_gc.entries = NULL;
+
+	percpu_up_write(&c->mark_lock);
+	if (!ret)
+		bch2_write_super(c);
+
+	mutex_unlock(&c->sb_lock);
+	return ret;
+}
+
+int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
+{
+	struct bch_replicas_entry *e;
+	unsigned i = 0;
+
+	lockdep_assert_held(&c->replicas_gc_lock);
+
+	mutex_lock(&c->sb_lock);
+	BUG_ON(c->replicas_gc.entries);
+
+	c->replicas_gc.nr		= 0;
+	c->replicas_gc.entry_size	= 0;
+	/* pass 1: count the entries whose type is NOT in @typemask and find their widest size */
+	for_each_cpu_replicas_entry(&c->replicas, e)
+		if (!((1 << e->data_type) & typemask)) {
+			c->replicas_gc.nr++;
+			c->replicas_gc.entry_size =
+				max_t(unsigned, c->replicas_gc.entry_size,
+				      replicas_entry_bytes(e));
+		}
+
+	c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
+					 c->replicas_gc.entry_size,
+					 GFP_KERNEL);
+	if (!c->replicas_gc.entries) {
+		mutex_unlock(&c->sb_lock);
+		bch_err(c, "error allocating c->replicas_gc");
+		return -BCH_ERR_ENOMEM_replicas_gc;
+	}
+	/* pass 2: copy those same entries into the gc table, then sort it */
+	for_each_cpu_replicas_entry(&c->replicas, e)
+		if (!((1 << e->data_type) & typemask))
+			memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
+			       e, c->replicas_gc.entry_size);
+
+	bch2_cpu_replicas_sort(&c->replicas_gc);
+	mutex_unlock(&c->sb_lock);
+
+	return 0;
+}
+
+/*
+ * New much simpler mechanism for clearing out unneeded replicas entries - drop
+ * replicas entries that have 0 sectors used.
+ *
+ * However, we don't track sector counts for journal usage, so this doesn't drop
+ * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism
+ * is retained for that.
+ */
+int bch2_replicas_gc2(struct bch_fs *c)
+{
+	struct bch_replicas_cpu new = { 0 };
+	unsigned i, nr;
+	int ret = 0;
+
+	bch2_journal_meta(&c->journal);
+retry:
+	nr		= READ_ONCE(c->replicas.nr);
+	new.entry_size	= READ_ONCE(c->replicas.entry_size);
+	new.entries	= kcalloc(nr, new.entry_size, GFP_KERNEL);
+	if (!new.entries) {
+		bch_err(c, "error allocating c->replicas_gc");
+		return -BCH_ERR_ENOMEM_replicas_gc;
+	}
+
+	mutex_lock(&c->sb_lock);
+	percpu_down_write(&c->mark_lock);
+	/* the table was sized before the locks were taken; if it changed since, start over */
+	if (nr			!= c->replicas.nr ||
+	    new.entry_size	!= c->replicas.entry_size) {
+		percpu_up_write(&c->mark_lock);
+		mutex_unlock(&c->sb_lock);
+		kfree(new.entries);
+		goto retry;
+	}
+	/* keep journal entries and every entry with a nonzero counter in any usage buffer */
+	for (i = 0; i < c->replicas.nr; i++) {
+		struct bch_replicas_entry *e =
+			cpu_replicas_entry(&c->replicas, i);
+		/* NOTE(review): usage[0..3] is hardcoded - presumably JOURNAL_BUF_NR == 4; confirm */
+		if (e->data_type == BCH_DATA_journal ||
+		    c->usage_base->replicas[i] ||
+		    percpu_u64_get(&c->usage[0]->replicas[i]) ||
+		    percpu_u64_get(&c->usage[1]->replicas[i]) ||
+		    percpu_u64_get(&c->usage[2]->replicas[i]) ||
+		    percpu_u64_get(&c->usage[3]->replicas[i]))
+			memcpy(cpu_replicas_entry(&new, new.nr++),
+			       e, new.entry_size);
+	}
+
+	bch2_cpu_replicas_sort(&new);
+
+	ret = bch2_cpu_replicas_to_sb_replicas(c, &new);
+	if (ret)
+		goto err;
+
+	ret = replicas_table_update(c, &new);
+err:
+	kfree(new.entries);
+
+	percpu_up_write(&c->mark_lock);
+	/* the superblock is written only when both steps above succeeded */
+	if (!ret)
+		bch2_write_super(c);
+
+	mutex_unlock(&c->sb_lock);
+
+	return ret;
+}
+
+/* Set the base usage counter of @r to @sectors, adding @r to c->replicas if absent. */
+int bch2_replicas_set_usage(struct bch_fs *c,
+			    struct bch_replicas_entry *r,
+			    u64 sectors)
+{
+	int ret, idx = bch2_replicas_entry_idx(c, r);
+
+	if (idx < 0) {
+		struct bch_replicas_cpu n;
+
+		n = cpu_replicas_add_entry(&c->replicas, r);
+		if (!n.entries)
+			return -BCH_ERR_ENOMEM_cpu_replicas;
+
+		/* n holds the old table on success and the unused new one on failure: free both ways */
+		ret = replicas_table_update(c, &n);
+		kfree(n.entries);
+		if (ret)
+			return ret;
+
+		idx = bch2_replicas_entry_idx(c, r);
+		BUG_ON(idx < 0);
+	}
+
+	c->usage_base->replicas[idx] = sectors;
+	return 0;
+}
+
+/* Replicas tracking - superblock: */
+
+static int
+__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
+				   struct bch_replicas_cpu *cpu_r)
+{
+	struct bch_replicas_entry *e, *dst;
+	unsigned nr = 0, entry_size = 0, idx = 0;
+	/* pass 1: count the superblock entries and find the widest one */
+	for_each_replicas_entry(sb_r, e) {
+		entry_size = max_t(unsigned, entry_size,
+				   replicas_entry_bytes(e));
+		nr++;
+	}
+
+	cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
+	if (!cpu_r->entries)
+		return -BCH_ERR_ENOMEM_cpu_replicas;
+
+	cpu_r->nr		= nr;
+	cpu_r->entry_size	= entry_size;
+	/* pass 2: copy each entry into its fixed-size slot and sort its device list */
+	for_each_replicas_entry(sb_r, e) {
+		dst = cpu_replicas_entry(cpu_r, idx++);
+		memcpy(dst, e, replicas_entry_bytes(e));
+		bch2_replicas_entry_sort(dst);
+	}
+	/* the table itself is not sorted here; on failure @cpu_r is left without entries */
+	return 0;
+}
+
+static int
+__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
+				      struct bch_replicas_cpu *cpu_r)
+{
+	struct bch_replicas_entry_v0 *e;
+	unsigned nr = 0, entry_size = 0, idx = 0;
+	/* pass 1: count the v0 entries and find the widest one */
+	for_each_replicas_entry(sb_r, e) {
+		entry_size = max_t(unsigned, entry_size,
+				   replicas_entry_bytes(e));
+		nr++;
+	}
+	/* adjust the stride by the size difference between the current and the v0 entry struct */
+	entry_size += sizeof(struct bch_replicas_entry) -
+		sizeof(struct bch_replicas_entry_v0);
+
+	cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
+	if (!cpu_r->entries)
+		return -BCH_ERR_ENOMEM_cpu_replicas;
+
+	cpu_r->nr		= nr;
+	cpu_r->entry_size	= entry_size;
+	/* pass 2: convert field by field; nr_required is not read from v0 and is fixed at 1 */
+	for_each_replicas_entry(sb_r, e) {
+		struct bch_replicas_entry *dst =
+			cpu_replicas_entry(cpu_r, idx++);
+
+		dst->data_type	= e->data_type;
+		dst->nr_devs	= e->nr_devs;
+		dst->nr_required = 1;
+		memcpy(dst->devs, e->devs, e->nr_devs);
+		bch2_replicas_entry_sort(dst);
+	}
+
+	return 0;
+}
+
+/* Load the superblock replicas field (v1 if present, else v0) into c->replicas. */
+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
+{
+	struct bch_sb_field_replicas *sb_v1;
+	struct bch_sb_field_replicas_v0 *sb_v0;
+	struct bch_replicas_cpu new_r = { 0, 0, NULL };
+	int ret = 0;
+
+	if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas)))
+		ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
+	else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0)))
+		ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);
+	if (ret)
+		return ret;
+
+	bch2_cpu_replicas_sort(&new_r);
+
+	percpu_down_write(&c->mark_lock);
+	ret = replicas_table_update(c, &new_r);
+	percpu_up_write(&c->mark_lock);
+
+	/* new_r holds whichever table is not installed; report a failed update to the caller */
+	kfree(new_r.entries);
+	return ret;
+}
+
+static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
+					       struct bch_replicas_cpu *r)
+{
+	struct bch_sb_field_replicas_v0 *sb_r;
+	struct bch_replicas_entry_v0 *dst;
+	struct bch_replicas_entry *src;
+	size_t bytes;
+	/* NOTE(review): sized with the v1 field struct although a v0 field is written - confirm */
+	bytes = sizeof(struct bch_sb_field_replicas);
+	/* NOTE(review): "- 1" presumably drops the nr_required byte that v0 entries lack - confirm */
+	for_each_cpu_replicas_entry(r, src)
+		bytes += replicas_entry_bytes(src) - 1;
+
+	sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
+			DIV_ROUND_UP(bytes, sizeof(u64)));
+	if (!sb_r)
+		return -BCH_ERR_ENOSPC_sb_replicas;
+	/* only one layout is kept: the v1 field is deleted and the v0 pointer looked up again */
+	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
+	sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);
+
+	memset(&sb_r->entries, 0,
+	       vstruct_end(&sb_r->field) -
+	       (void *) &sb_r->entries);
+
+	dst = sb_r->entries;
+	for_each_cpu_replicas_entry(r, src) {
+		dst->data_type	= src->data_type;
+		dst->nr_devs	= src->nr_devs;
+		memcpy(dst->devs, src->devs, src->nr_devs);
+
+		dst = replicas_entry_next(dst);
+		/* the write cursor must never run past the end of the resized field */
+		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
+	}
+
+	return 0;
+}
+
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
+					    struct bch_replicas_cpu *r)
+{
+	struct bch_sb_field_replicas *sb_r;
+	struct bch_replicas_entry *dst, *src;
+	bool need_v1 = false;
+	size_t bytes;
+
+	bytes = sizeof(struct bch_sb_field_replicas);
+	/* the v1 layout is only needed when some entry has nr_required != 1 */
+	for_each_cpu_replicas_entry(r, src) {
+		bytes += replicas_entry_bytes(src);
+		if (src->nr_required != 1)
+			need_v1 = true;
+	}
+
+	if (!need_v1)
+		return bch2_cpu_replicas_to_sb_replicas_v0(c, r);
+
+	sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
+			DIV_ROUND_UP(bytes, sizeof(u64)));
+	if (!sb_r)
+		return -BCH_ERR_ENOSPC_sb_replicas;
+	/* only one layout is kept: the v0 field is deleted and the v1 pointer looked up again */
+	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
+	sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);
+
+	memset(&sb_r->entries, 0,
+	       vstruct_end(&sb_r->field) -
+	       (void *) &sb_r->entries);
+	/* entries are packed back to back at their own size, not at the table stride */
+	dst = sb_r->entries;
+	for_each_cpu_replicas_entry(r, src) {
+		memcpy(dst, src, replicas_entry_bytes(src));
+
+		dst = replicas_entry_next(dst);
+
+		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
+	}
+
+	return 0;
+}
+
+static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
+				      struct bch_sb *sb,
+				      struct printbuf *err)
+{
+	unsigned i, j;
+	/* sorts @cpu_r in place (memcmp order) so that identical entries end up adjacent */
+	sort_cmp_size(cpu_r->entries,
+		      cpu_r->nr,
+		      cpu_r->entry_size,
+		      memcmp, NULL);
+	/* the first failing check writes a message to @err and returns invalid_sb_replicas */
+	for (i = 0; i < cpu_r->nr; i++) {
+		struct bch_replicas_entry *e =
+			cpu_replicas_entry(cpu_r, i);
+
+		if (e->data_type >= BCH_DATA_NR) {
+			prt_printf(err, "invalid data type in entry ");
+			bch2_replicas_entry_to_text(err, e);
+			return -BCH_ERR_invalid_sb_replicas;
+		}
+
+		if (!e->nr_devs) {
+			prt_printf(err, "no devices in entry ");
+			bch2_replicas_entry_to_text(err, e);
+			return -BCH_ERR_invalid_sb_replicas;
+		}
+
+		if (e->nr_required > 1 &&
+		    e->nr_required >= e->nr_devs) {
+			prt_printf(err, "bad nr_required in entry ");
+			bch2_replicas_entry_to_text(err, e);
+			return -BCH_ERR_invalid_sb_replicas;
+		}
+
+		for (j = 0; j < e->nr_devs; j++)
+			if (!bch2_dev_exists(sb, e->devs[j])) {
+				prt_printf(err, "invalid device %u in entry ", e->devs[j]);
+				bch2_replicas_entry_to_text(err, e);
+				return -BCH_ERR_invalid_sb_replicas;
+			}
+		/* duplicate detection: compare each entry with its successor in sorted order */
+		if (i + 1 < cpu_r->nr) {
+			struct bch_replicas_entry *n =
+				cpu_replicas_entry(cpu_r, i + 1);
+
+			BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
+
+			if (!memcmp(e, n, cpu_r->entry_size)) {
+				prt_printf(err, "duplicate replicas entry ");
+				bch2_replicas_entry_to_text(err, e);
+				return -BCH_ERR_invalid_sb_replicas;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
+				     struct printbuf *err)
+{
+	struct bch_replicas_cpu cpu_r;
+	int ret;
+
+	/* validate a temporary in-memory copy of the field, then discard it */
+	ret = __bch2_sb_replicas_to_cpu_replicas(field_to_type(f, replicas), &cpu_r);
+	if (ret)
+		return ret;
+
+	ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
+	kfree(cpu_r.entries);
+	return ret;
+}
+
+/* Print all entries of superblock replicas field @f on one line, space separated. */
+static void bch2_sb_replicas_to_text(struct printbuf *out,
+				     struct bch_sb *sb,
+				     struct bch_sb_field *f)
+{
+	struct bch_sb_field_replicas *r = field_to_type(f, replicas);
+	struct bch_replicas_entry *cur;
+
+	for_each_replicas_entry(r, cur) {
+		/* every entry but the first is preceded by a separator */
+		if (cur != r->entries)
+			prt_printf(out, " ");
+
+		bch2_replicas_entry_to_text(out, cur);
+	}
+	prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_replicas = {	/* hooks for struct bch_sb_field_replicas */
+	.validate	= bch2_sb_replicas_validate,
+	.to_text	= bch2_sb_replicas_to_text,
+};
+
+static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
+					struct printbuf *err)
+{
+	struct bch_replicas_cpu cpu_r;
+	int ret;
+
+	/* convert the v0 field to a temporary in-memory table, validate it, discard it */
+	ret = __bch2_sb_replicas_v0_to_cpu_replicas(field_to_type(f, replicas_v0), &cpu_r);
+	if (ret)
+		return ret;
+
+	ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
+	kfree(cpu_r.entries);
+	return ret;
+}
+
+/* Print all entries of superblock replicas_v0 field @f on one line, space separated. */
+static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
+					struct bch_sb *sb,
+					struct bch_sb_field *f)
+{
+	struct bch_sb_field_replicas_v0 *r = field_to_type(f, replicas_v0);
+	struct bch_replicas_entry_v0 *cur;
+
+	for_each_replicas_entry(r, cur) {
+		/* every entry but the first is preceded by a separator */
+		if (cur != r->entries)
+			prt_printf(out, " ");
+
+		bch2_replicas_entry_v0_to_text(out, cur);
+	}
+	prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {	/* hooks for struct bch_sb_field_replicas_v0 */
+	.validate	= bch2_sb_replicas_v0_validate,
+	.to_text	= bch2_sb_replicas_v0_to_text,
+};
+
+/* Query replicas: */
+
+bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
+			   unsigned flags, bool print)
+{
+	struct bch_replicas_entry *e;
+	bool ret = true;
+	/* false as soon as one entry needs a BCH_FORCE_IF_* flag that @flags does not contain */
+	percpu_down_read(&c->mark_lock);
+	for_each_cpu_replicas_entry(&c->replicas, e) {
+		unsigned i, nr_online = 0, nr_failed = 0, dflags = 0;
+		bool metadata = e->data_type < BCH_DATA_user;
+
+		if (e->data_type == BCH_DATA_cached)
+			continue;
+
+		for (i = 0; i < e->nr_devs; i++) {
+			struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]);
+
+			nr_online += test_bit(e->devs[i], devs.d);
+			nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed;
+		}
+		/* an entry whose devices are all in the failed state is ignored */
+		if (nr_failed == e->nr_devs)
+			continue;
+		/* fewer online than nr_required: lost; fewer online than nr_devs: degraded */
+		if (nr_online < e->nr_required)
+			dflags |= metadata
+				? BCH_FORCE_IF_METADATA_LOST
+				: BCH_FORCE_IF_DATA_LOST;
+
+		if (nr_online < e->nr_devs)
+			dflags |= metadata
+				? BCH_FORCE_IF_METADATA_DEGRADED
+				: BCH_FORCE_IF_DATA_DEGRADED;
+
+		if (dflags & ~flags) {
+			if (print) {
+				struct printbuf buf = PRINTBUF;
+
+				bch2_replicas_entry_to_text(&buf, e);
+				bch_err(c, "insufficient devices online (%u) for replicas entry %s",
+					nr_online, buf.buf);
+				printbuf_exit(&buf);
+			}
+			ret = false;
+			break;
+		}
+
+	}
+	percpu_up_read(&c->mark_lock);
+
+	return ret;
+}
+
+unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
+{
+	struct bch_sb_field_replicas *replicas;
+	struct bch_sb_field_replicas_v0 *replicas_v0;
+	unsigned i, data_has = 0;
+	/* result: bit (1 << data_type) is set for each entry whose device list contains @dev */
+	replicas = bch2_sb_field_get(sb, replicas);
+	replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
+	/* when both fields exist only the v1 field is consulted */
+	if (replicas) {
+		struct bch_replicas_entry *r;
+
+		for_each_replicas_entry(replicas, r)
+			for (i = 0; i < r->nr_devs; i++)
+				if (r->devs[i] == dev)
+					data_has |= 1 << r->data_type;
+	} else if (replicas_v0) {
+		struct bch_replicas_entry_v0 *r;
+
+		for_each_replicas_entry_v0(replicas_v0, r)
+			for (i = 0; i < r->nr_devs; i++)
+				if (r->devs[i] == dev)
+					data_has |= 1 << r->data_type;
+	}
+
+
+	return data_has;
+}
+
+unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
+{
+	unsigned data_mask;
+
+	/* sb_lock is held only while the superblock field is scanned */
+	mutex_lock(&c->sb_lock);
+	data_mask = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
+	mutex_unlock(&c->sb_lock);
+	return data_mask;
+}
+
+void bch2_fs_replicas_exit(struct bch_fs *c)
+{
+	unsigned buf;
+
+	/* release the usage buffers, both replicas tables and the delta mempool */
+	kfree(c->usage_scratch);
+	for (buf = 0; buf < ARRAY_SIZE(c->usage); buf++)
+		free_percpu(c->usage[buf]);
+	kfree(c->usage_base);
+	kfree(c->replicas.entries);
+	kfree(c->replicas_gc.entries);
+	mempool_exit(&c->replicas_delta_pool);
+}
+
+int bch2_fs_replicas_init(struct bch_fs *c)
+{
+	int ret;
+
+	bch2_journal_entry_res_resize(&c->journal, &c->replicas_journal_res,
+				      reserve_journal_replicas(c, &c->replicas));
+
+	ret = mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1, REPLICAS_DELTA_LIST_MAX);
+	return ret ? ret : replicas_table_update(c, &c->replicas);
+}
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
new file mode 100644
index 000000000000..4887675a86f0
--- /dev/null
+++ b/fs/bcachefs/replicas.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REPLICAS_H
+#define _BCACHEFS_REPLICAS_H
+
+#include "bkey.h"
+#include "eytzinger.h"
+#include "replicas_types.h"
+
+void bch2_replicas_entry_sort(struct bch_replicas_entry *);
+void bch2_replicas_entry_to_text(struct printbuf *,
+				 struct bch_replicas_entry *);
+void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
+
+static inline struct bch_replicas_entry *
+cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
+{
+	return (void *) r->entries + r->entry_size * i;	/* i-th slot; slots are entry_size bytes apart */
+}
+
+int bch2_replicas_entry_idx(struct bch_fs *,
+			    struct bch_replicas_entry *);
+
+void bch2_devlist_to_replicas(struct bch_replicas_entry *,
+			      enum bch_data_type,
+			      struct bch_devs_list);
+bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *);
+int bch2_mark_replicas(struct bch_fs *,
+		       struct bch_replicas_entry *);
+
+static inline struct replicas_delta *
+replicas_delta_next(struct replicas_delta *d)
+{
+	return (void *) d + replicas_entry_bytes(&d->r) + 8;	/* 8: the s64 delta in front of r (packed struct) */
+}
+
+int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *);
+
+void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c);
+
+static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, unsigned dev)
+{
+	/* single-device entry of type BCH_DATA_cached */
+	e->devs[0]	= dev;
+	e->nr_devs	= 1;
+	e->nr_required	= 1;
+	e->data_type	= BCH_DATA_cached;
+}
+
+bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask,
+			   unsigned, bool);
+
+unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned);
+unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
+
+int bch2_replicas_gc_end(struct bch_fs *, int);
+int bch2_replicas_gc_start(struct bch_fs *, unsigned);
+int bch2_replicas_gc2(struct bch_fs *);
+
+int bch2_replicas_set_usage(struct bch_fs *,
+			    struct bch_replicas_entry *,
+			    u64);
+
+#define for_each_cpu_replicas_entry(_r, _i)				\
+	for (_i = (_r)->entries;					\
+	     (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
+	     _i = (void *) (_i) + (_r)->entry_size)	/* fixed stride; _r is evaluated repeatedly */
+
+/* iterate over superblock replicas - used by userspace tools: */
+
+#define replicas_entry_next(_i)						\
+	((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i)))	/* step over one variable-length entry */
+/* walk packed entries until the field ends or an entry with data_type == 0 is reached */
+#define for_each_replicas_entry(_r, _i)					\
+	for (_i = (_r)->entries;					\
+	     (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
+	     (_i) = replicas_entry_next(_i))
+/* identical expansion to for_each_replicas_entry(); a separate name used with v0 fields */
+#define for_each_replicas_entry_v0(_r, _i)				\
+	for (_i = (_r)->entries;					\
+	     (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
+	     (_i) = replicas_entry_next(_i))
+
+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_replicas;
+extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0;
+
+void bch2_fs_replicas_exit(struct bch_fs *);
+int bch2_fs_replicas_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_REPLICAS_H */
diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h
new file mode 100644
index 000000000000..5cfff489bbc3
--- /dev/null
+++ b/fs/bcachefs/replicas_types.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REPLICAS_TYPES_H
+#define _BCACHEFS_REPLICAS_TYPES_H
+
+struct bch_replicas_cpu {
+	unsigned		nr;		/* number of slots in entries */
+	unsigned		entry_size;	/* bytes per slot (stride used by cpu_replicas_entry()) */
+	struct bch_replicas_entry *entries;	/* kcalloc'd array of nr slots; NULL when empty */
+};
+
+struct replicas_delta {
+	s64			delta;	/* NOTE(review): presumably a signed sector-count change - confirm */
+	struct bch_replicas_entry r;	/* variable length; see replicas_delta_next() */
+} __packed;
+
+struct replicas_delta_list {
+	unsigned		size;	/* NOTE(review): presumably the capacity of d[] in bytes - confirm */
+	unsigned		used;	/* bytes of d[] in use: the walk ends at (void *) d + used */
+
+	struct			{} memset_start;	/* zero-size marker; presumably bounds a memset - confirm */
+	u64			nr_inodes;
+	u64			persistent_reserved[BCH_REPLICAS_MAX];
+	struct			{} memset_end;	/* zero-size marker paired with memset_start */
+	struct replicas_delta	d[0];	/* packed variable-length deltas follow the header */
+};
+
+#endif /* _BCACHEFS_REPLICAS_TYPES_H */
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
new file mode 100644
index 000000000000..61203d7c8d36
--- /dev/null
+++ b/fs/bcachefs/sb-clean.c
@@ -0,0 +1,395 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "error.h"
+#include "journal_io.h"
+#include "replicas.h"
+#include "sb-clean.h"
+#include "super-io.h"
+
+/*
+ * BCH_SB_FIELD_clean:
+ *
+ * Btree roots, and a few other things, are recovered from the journal after an
+ * unclean shutdown - but after a clean shutdown, to avoid having to read the
+ * journal, we can store them in the superblock.
+ *
+ * bch_sb_field_clean simply contains a list of journal entries, stored exactly
+ * as they would be in the journal:
+ */
+
+/* Run journal entry validation over every entry stored in the clean section. */
+int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean,
+				int write)
+{
+	struct jset_entry *e = clean->start;
+	struct jset_entry *last = (struct jset_entry *) vstruct_end(&clean->field);
+
+	for (; e < last; e = vstruct_next(e)) {
+		int err = bch2_journal_entry_validate(c, NULL, e,
+						      le16_to_cpu(c->disk_sb.sb->version),
+						      BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
+						      write);
+
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Look up the btree root entry for @id, searching the superblock clean section
+ * when @clean is non-NULL and the journal set @j otherwise.
+ *
+ * Returns NULL if there is no such entry, ERR_PTR(-EINVAL) if the entry is
+ * empty; otherwise returns the root key and stores its level in @level.
+ */
+static struct bkey_i *btree_root_find(struct bch_fs *c,
+				      struct bch_sb_field_clean *clean,
+				      struct jset *j,
+				      enum btree_id id, unsigned *level)
+{
+	struct jset_entry *pos = clean ? clean->start : j->start;
+	struct jset_entry *last = clean
+		? vstruct_end(&clean->field)
+		: vstruct_last(j);
+
+	while (pos < last &&
+	       !(pos->type == BCH_JSET_ENTRY_btree_root && pos->btree_id == id))
+		pos = vstruct_next(pos);
+
+	if (pos >= last)
+		return NULL;
+
+	if (!pos->u64s)
+		return ERR_PTR(-EINVAL);
+
+	*level = pos->level;
+	return pos->start;
+}
+
+/* Cross-check the superblock clean section against journal set @j. */
+int bch2_verify_superblock_clean(struct bch_fs *c,
+				 struct bch_sb_field_clean **cleanp,
+				 struct jset *j)
+{
+	unsigned i;
+	struct bch_sb_field_clean *clean = *cleanp;
+	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+	int ret = 0;
+
+	if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
+			"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
+			le64_to_cpu(clean->journal_seq),
+			le64_to_cpu(j->seq))) {
+		kfree(clean);
+		*cleanp = NULL;
+		return 0;
+	}
+
+	for (i = 0; i < BTREE_ID_NR; i++) {
+		struct bkey_i *k1, *k2;
+		unsigned l1 = 0, l2 = 0;
+
+		k1 = btree_root_find(c, clean, NULL, i, &l1);
+		k2 = btree_root_find(c, NULL, j, i, &l2);
+
+		if (!k1 && !k2)
+			continue;
+
+		printbuf_reset(&buf1);
+		printbuf_reset(&buf2);
+		/* btree_root_find() may return an ERR_PTR: never dereference it to print */
+		if (!IS_ERR_OR_NULL(k1))
+			bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1));
+		else
+			prt_printf(&buf1, "%s", k1 ? "(invalid)" : "(none)");
+
+		if (!IS_ERR_OR_NULL(k2))
+			bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2));
+		else
+			prt_printf(&buf2, "%s", k2 ? "(invalid)" : "(none)");
+
+		mustfix_fsck_err_on(!k1 || !k2 ||
+				    IS_ERR(k1) ||
+				    IS_ERR(k2) ||
+				    k1->k.u64s != k2->k.u64s ||
+				    memcmp(k1, k2, bkey_bytes(&k1->k)) ||
+				    l1 != l2, c,
+			"superblock btree root %u doesn't match journal after clean shutdown\n"
+			"sb:      l=%u %s\n"
+			"journal: l=%u %s\n", i,
+			l1, buf1.buf,
+			l2, buf2.buf);
+	}
+fsck_err:
+	printbuf_exit(&buf2);
+	printbuf_exit(&buf1);
+	return ret;
+}
+
+/*
+ * Return a kmalloc'd copy of the sb clean section (caller must kfree() it),
+ * NULL if the section was missing and the sb was marked not clean, or ERR_PTR().
+ */
+struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c)
+{
+	struct bch_sb_field_clean *clean, *sb_clean;
+	int ret;
+
+	mutex_lock(&c->sb_lock);
+	sb_clean = bch2_sb_field_get(c->disk_sb.sb, clean);
+
+	if (fsck_err_on(!sb_clean, c,
+			"superblock marked clean but clean section not present")) {
+		SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+		c->sb.clean = false;
+		mutex_unlock(&c->sb_lock);
+		return NULL;
+	}
+
+	clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
+			GFP_KERNEL);
+	ret = clean ? bch2_sb_clean_validate_late(c, clean, READ)
+		    : -BCH_ERR_ENOMEM_read_superblock_clean;
+	if (ret) {
+		/* don't leak the copy when late validation rejects it */
+		kfree(clean);
+		goto fsck_err;
+	}
+
+	mutex_unlock(&c->sb_lock);
+	return clean;
+fsck_err:
+	mutex_unlock(&c->sb_lock);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Zero a new entry of @size bytes (rounded up to whole u64s) at *@end, advance
+ * *@end past it, and return the entry.
+ */
+static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
+{
+	unsigned nr_u64s = DIV_ROUND_UP(size, sizeof(u64));
+	struct jset_entry *new = *end;
+
+	memset(new, 0, nr_u64s * sizeof(u64));
+	/* u64s counts from the start of data, so the shared fields are left out: */
+	new->u64s = cpu_to_le16(nr_u64s - 1);
+	*end = vstruct_next(new);
+	return new;
+}
+
+void bch2_journal_super_entries_add_common(struct bch_fs *c,
+					   struct jset_entry **end,
+					   u64 journal_seq)
+{
+	struct bch_dev *ca;
+	unsigned i, dev;
+	/* Append usage, per-device usage and clock entries at *end, advancing it. */
+	percpu_down_read(&c->mark_lock);
+	/* NOTE(review): presumably folds pending usage into c->usage_base - confirm */
+	if (!journal_seq) {
+		for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+			bch2_fs_usage_acc_to_base(c, i);
+	} else {
+		bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK);
+	}
+	/* Filesystem-wide usage entries: inode count, key version, reserved: */
+	{
+		struct jset_entry_usage *u =
+			container_of(jset_entry_init(end, sizeof(*u)),
+				     struct jset_entry_usage, entry);
+
+		u->entry.type	= BCH_JSET_ENTRY_usage;
+		u->entry.btree_id = BCH_FS_USAGE_inodes;
+		u->v		= cpu_to_le64(c->usage_base->nr_inodes);
+	}
+
+	{
+		struct jset_entry_usage *u =
+			container_of(jset_entry_init(end, sizeof(*u)),
+				     struct jset_entry_usage, entry);
+
+		u->entry.type	= BCH_JSET_ENTRY_usage;
+		u->entry.btree_id = BCH_FS_USAGE_key_version;
+		u->v		= cpu_to_le64(atomic64_read(&c->key_version));
+	}
+
+	for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+		struct jset_entry_usage *u =
+			container_of(jset_entry_init(end, sizeof(*u)),
+				     struct jset_entry_usage, entry);
+
+		u->entry.type	= BCH_JSET_ENTRY_usage;
+		u->entry.btree_id = BCH_FS_USAGE_reserved;
+		u->entry.level	= i;
+		u->v		= cpu_to_le64(c->usage_base->persistent_reserved[i]);
+	}
+	/* One data_usage entry per replicas entry in c->replicas: */
+	for (i = 0; i < c->replicas.nr; i++) {
+		struct bch_replicas_entry *e =
+			cpu_replicas_entry(&c->replicas, i);
+		struct jset_entry_data_usage *u =
+			container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
+				     struct jset_entry_data_usage, entry);
+
+		u->entry.type	= BCH_JSET_ENTRY_data_usage;
+		u->v		= cpu_to_le64(c->usage_base->replicas[i]);
+		unsafe_memcpy(&u->r, e, replicas_entry_bytes(e),
+			      "embedded variable length struct");
+	}
+	/* One dev_usage entry per member device, with BCH_DATA_NR type slots: */
+	for_each_member_device(ca, c, dev) {
+		unsigned b = sizeof(struct jset_entry_dev_usage) +
+			sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
+		struct jset_entry_dev_usage *u =
+			container_of(jset_entry_init(end, b),
+				     struct jset_entry_dev_usage, entry);
+
+		u->entry.type = BCH_JSET_ENTRY_dev_usage;
+		u->dev = cpu_to_le32(dev);
+		u->buckets_ec		= cpu_to_le64(ca->usage_base->buckets_ec);
+
+		for (i = 0; i < BCH_DATA_NR; i++) {
+			u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
+			u->d[i].sectors	= cpu_to_le64(ca->usage_base->d[i].sectors);
+			u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
+		}
+	}
+
+	percpu_up_read(&c->mark_lock);
+	/* Two clock entries, one per c->io_clock[] slot; ->rw records the index: */
+	for (i = 0; i < 2; i++) {
+		struct jset_entry_clock *clock =
+			container_of(jset_entry_init(end, sizeof(*clock)),
+				     struct jset_entry_clock, entry);
+
+		clock->entry.type = BCH_JSET_ENTRY_clock;
+		clock->rw	= i;
+		clock->time	= cpu_to_le64(atomic64_read(&c->io_clock[i].now));
+	}
+}
+
+/* Superblock validate hook: the section must at least hold its fixed header. */
+static int bch2_sb_clean_validate(struct bch_sb *sb,
+				  struct bch_sb_field *f,
+				  struct printbuf *err)
+{
+	struct bch_sb_field_clean *clean = field_to_type(f, clean);
+
+	if (vstruct_bytes(&clean->field) >= sizeof(*clean))
+		return 0;
+
+	prt_printf(err, "wrong size (got %zu should be %zu)",
+		   vstruct_bytes(&clean->field), sizeof(*clean));
+	return -BCH_ERR_invalid_sb_clean;
+}
+
+/* Print the clean section: flags, journal sequence number, then each entry. */
+static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
+				  struct bch_sb_field *f)
+{
+	struct bch_sb_field_clean *clean = field_to_type(f, clean);
+	struct jset_entry *entry;
+
+	prt_printf(out, "flags:          %x",	le32_to_cpu(clean->flags));
+	prt_newline(out);
+	prt_printf(out, "journal_seq:    %llu",	le64_to_cpu(clean->journal_seq));
+	prt_newline(out);
+	/* '<' rather than '!=': an entry overshooting the end must stop the walk */
+	for (entry = clean->start;
+	     entry < (struct jset_entry *) vstruct_end(&clean->field);
+	     entry = vstruct_next(entry)) {
+		if (entry->type == BCH_JSET_ENTRY_btree_keys && !entry->u64s)
+			continue;
+
+		bch2_journal_entry_to_text(out, NULL, entry);
+		prt_newline(out);
+	}
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_clean = {
+	.validate	= bch2_sb_clean_validate,
+	.to_text	= bch2_sb_clean_to_text,
+};
+
+/* Clear the superblock clean flag and write the superblock; returns the write result. */
+int bch2_fs_mark_dirty(struct bch_fs *c)
+{
+	int ret;
+
+	/*
+	 * Unconditionally write superblock, to verify it hasn't changed before
+	 * we go rw:
+	 */
+	mutex_lock(&c->sb_lock);
+	SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+
+	bch2_sb_maybe_downgrade(c);
+	c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS);
+
+	ret = bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	return ret;
+}
+
+void bch2_fs_mark_clean(struct bch_fs *c)
+{
+	struct bch_sb_field_clean *sb_clean;
+	struct jset_entry *entry;
+	unsigned u64s;
+	int ret;
+	/* Set the sb clean flag, rebuild the clean section and write the superblock. */
+	mutex_lock(&c->sb_lock);
+	if (BCH_SB_CLEAN(c->disk_sb.sb))
+		goto out;
+
+	SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
+	/* Set the alloc_info/alloc_metadata compat bits, clear two feature bits: */
+	c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
+	c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata);
+	c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates));
+	c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled));
+	/* Size: the fixed header plus c->journal.entry_u64s_reserved u64s of entries */
+	u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
+
+	sb_clean = bch2_sb_field_resize(&c->disk_sb, clean, u64s);
+	if (!sb_clean) {
+		bch_err(c, "error resizing superblock while setting filesystem clean");
+		goto out;
+	}
+
+	sb_clean->flags		= 0;
+	sb_clean->journal_seq	= cpu_to_le64(atomic64_read(&c->journal.seq));
+
+	/* Trying to catch outstanding bug: */
+	BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
+	/* Fill the section: the common entries first, then the btree roots: */
+	entry = sb_clean->start;
+	bch2_journal_super_entries_add_common(c, &entry, 0);
+	entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
+	BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
+	/* Zero whatever is left of the section after the last entry: */
+	memset(entry, 0,
+	       vstruct_end(&sb_clean->field) - (void *) entry);
+
+	/*
+	 * this should be in the write path, and we should be validating every
+	 * superblock section:
+	 */
+	ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE);
+	if (ret) {
+		bch_err(c, "error writing marking filesystem clean: validate error");
+		goto out;
+	}
+
+	bch2_write_super(c);
+out:
+	mutex_unlock(&c->sb_lock);
+}
diff --git a/fs/bcachefs/sb-clean.h b/fs/bcachefs/sb-clean.h
new file mode 100644
index 000000000000..71caef281239
--- /dev/null
+++ b/fs/bcachefs/sb-clean.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_CLEAN_H
+#define _BCACHEFS_SB_CLEAN_H
+
+int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int);
+int bch2_verify_superblock_clean(struct bch_fs *, struct bch_sb_field_clean **,
+				 struct jset *);
+struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *);
+void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_clean;
+
+int bch2_fs_mark_dirty(struct bch_fs *);
+void bch2_fs_mark_clean(struct bch_fs *);
+
+#endif /* _BCACHEFS_SB_CLEAN_H */
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
new file mode 100644
index 000000000000..6dd85bb996fe
--- /dev/null
+++ b/fs/bcachefs/sb-members.c
@@ -0,0 +1,339 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "disk_groups.h"
+#include "opts.h"
+#include "replicas.h"
+#include "sb-members.h"
+#include "super-io.h"
+
+/* Code for bch_sb_field_members_v1 and bch_sb_field_members_v2: */
+
+static struct bch_member *members_v2_get_mut(struct bch_sb_field_members_v2 *mi, int i)
+{
+	return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes));
+}
+
+struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i)
+{
+	return members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i);
+}
+
+static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int i)
+{
+	struct bch_member ret, *p = members_v2_get_mut(mi, i);
+	memset(&ret, 0, sizeof(ret));
+	memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret)));
+	return ret;
+}
+
+static struct bch_member *members_v1_get_mut(struct bch_sb_field_members_v1 *mi, int i)
+{
+	return (void *) mi->_members + (i * BCH_MEMBER_V1_BYTES);
+}
+
+static struct bch_member members_v1_get(struct bch_sb_field_members_v1 *mi, int i)
+{
+	struct bch_member ret, *p = members_v1_get_mut(mi, i);
+	memset(&ret, 0, sizeof(ret));	/* fields beyond the v1 entry stay zero */
+	memcpy(&ret, p, min_t(size_t, BCH_MEMBER_V1_BYTES, sizeof(ret))); return ret;
+}
+
+struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i)
+{
+	struct bch_sb_field_members_v2 *v2 = bch2_sb_field_get(sb, members_v2);
+	/* Prefer the v2 section; fall back to v1 only when v2 is absent. */
+	if (v2)
+		return members_v2_get(v2, i);
+	return members_v1_get(bch2_sb_field_get(sb, members_v1), i);
+}
+
+static int sb_members_v2_resize_entries(struct bch_fs *c)
+{
+	struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+	/* Grow every v2 entry to sizeof(struct bch_member) if the section's are smaller */
+	if (le16_to_cpu(mi->member_bytes) < sizeof(struct bch_member)) {
+		unsigned u64s = DIV_ROUND_UP((sizeof(*mi) + sizeof(mi->_members[0]) *
+					      c->disk_sb.sb->nr_devices), 8);
+
+		mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
+		if (!mi)
+			return -BCH_ERR_ENOSPC_sb_members_v2;
+		/* Last entry first: entries only move up, so unmoved ones stay intact */
+		for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) {
+			void *dst = (void *) mi->_members + (i * sizeof(struct bch_member));
+			memmove(dst, members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes));
+			memset(dst + le16_to_cpu(mi->member_bytes),
+			       0, (sizeof(struct bch_member) - le16_to_cpu(mi->member_bytes)));
+		}
+		mi->member_bytes = cpu_to_le16(sizeof(struct bch_member));
+	}
+	return 0;
+}
+
+/* Ensure a members_v2 section with full-sized entries exists, seeding it from v1. */
+int bch2_members_v2_init(struct bch_fs *c)
+{
+	if (!bch2_sb_field_get(c->disk_sb.sb, members_v2)) {
+		struct bch_sb_field_members_v1 *mi1;
+		struct bch_sb_field_members_v2 *mi2;
+		unsigned u64s = DIV_ROUND_UP(sizeof(*mi2) + sizeof(struct bch_member) *
+					     c->sb.nr_devices, sizeof(u64));
+
+		mi2 = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
+		if (!mi2)	/* resize failed: report it like the resize helper does */
+			return -BCH_ERR_ENOSPC_sb_members_v2;
+		mi1 = bch2_sb_field_get(c->disk_sb.sb, members_v1);
+		memcpy(&mi2->_members[0], &mi1->_members[0], BCH_MEMBER_V1_BYTES * c->sb.nr_devices);
+		memset(&mi2->pad[0], 0, sizeof(mi2->pad));
+		mi2->member_bytes = cpu_to_le16(BCH_MEMBER_V1_BYTES);
+	}
+	return sb_members_v2_resize_entries(c);
+}
+
+int bch_members_cpy_v2_v1(struct bch_sb_handle *disk_sb)
+{
+	struct bch_sb_field_members_v1 *mi1;
+	struct bch_sb_field_members_v2 *mi2;
+
+	mi1 = bch2_sb_field_resize(disk_sb, members_v1,
+			DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES *
+				     disk_sb->sb->nr_devices, sizeof(u64)));
+	if (!mi1)
+		return -BCH_ERR_ENOSPC_sb_members;
+	/* Copy the leading BCH_MEMBER_V1_BYTES of every v2 entry into the v1 section: */
+	mi2 = bch2_sb_field_get(disk_sb->sb, members_v2);
+
+	for (unsigned i = 0; i < disk_sb->sb->nr_devices; i++)
+		memcpy(members_v1_get_mut(mi1, i), members_v2_get_mut(mi2, i), BCH_MEMBER_V1_BYTES);
+
+	return 0;
+}
+
+static int validate_member(struct printbuf *err,
+			   struct bch_member m,
+			   struct bch_sb *sb,
+			   int i)
+{
+	if (le64_to_cpu(m.nbuckets) > LONG_MAX) {
+		prt_printf(err, "device %u: too many buckets (got %llu, max %lu)",
+			   i, le64_to_cpu(m.nbuckets), LONG_MAX);
+		return -BCH_ERR_invalid_sb_members;
+	}
+
+	/* compare without subtracting: first_bucket > nbuckets would wrap around */
+	if (le64_to_cpu(m.nbuckets) < (u64) le16_to_cpu(m.first_bucket) + BCH_MIN_NR_NBUCKETS) {
+		prt_printf(err, "device %u: not enough buckets (got %llu, min %u)",
+			   i, le64_to_cpu(m.nbuckets), BCH_MIN_NR_NBUCKETS);
+		return -BCH_ERR_invalid_sb_members;
+	}
+
+	if (le16_to_cpu(m.bucket_size) <
+	    le16_to_cpu(sb->block_size)) {
+		prt_printf(err, "device %u: bucket size %u smaller than block size %u",
+			   i, le16_to_cpu(m.bucket_size), le16_to_cpu(sb->block_size));
+		return -BCH_ERR_invalid_sb_members;
+	}
+
+	if (le16_to_cpu(m.bucket_size) <
+	    BCH_SB_BTREE_NODE_SIZE(sb)) {
+		prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu",
+			   i, le16_to_cpu(m.bucket_size), BCH_SB_BTREE_NODE_SIZE(sb));
+		return -BCH_ERR_invalid_sb_members;
+	}
+
+	return 0;
+}
+
+static void member_to_text(struct printbuf *out,
+			   struct bch_member m,
+			   struct bch_sb_field_disk_groups *gi,
+			   struct bch_sb *sb,
+			   int i)
+{
+	unsigned data_have = bch2_sb_dev_has_data(sb, i);
+	u64 bucket_size = le16_to_cpu(m.bucket_size);
+	u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size;
+	/* Print member @i as "Field:<tab>value" lines, indented under "Device:". */
+
+	prt_printf(out, "Device:");
+	prt_tab(out);
+	prt_printf(out, "%u", i);
+	prt_newline(out);
+
+	printbuf_indent_add(out, 2);
+
+	prt_printf(out, "UUID:");
+	prt_tab(out);
+	pr_uuid(out, m.uuid.b);
+	prt_newline(out);
+
+	prt_printf(out, "Size:");
+	prt_tab(out);
+	prt_units_u64(out, device_size << 9);
+	prt_newline(out);
+
+	for (unsigned j = 0; j < BCH_IOPS_NR; j++) {
+		prt_printf(out, "%s iops:", bch2_iops_measurements[j]);
+		prt_tab(out);
+		prt_printf(out, "%u", le32_to_cpu(m.iops[j]));
+		prt_newline(out);
+	}
+
+	prt_printf(out, "Bucket size:");
+	prt_tab(out);
+	prt_units_u64(out, bucket_size << 9);
+	prt_newline(out);
+
+	prt_printf(out, "First bucket:");
+	prt_tab(out);
+	prt_printf(out, "%u", le16_to_cpu(m.first_bucket));
+	prt_newline(out);
+
+	prt_printf(out, "Buckets:");
+	prt_tab(out);
+	prt_printf(out, "%llu", le64_to_cpu(m.nbuckets));
+	prt_newline(out);
+
+	prt_printf(out, "Last mount:");
+	prt_tab(out);
+	if (m.last_mount)
+		pr_time(out, le64_to_cpu(m.last_mount));
+	else
+		prt_printf(out, "(never)");
+	prt_newline(out);
+
+	prt_printf(out, "State:");
+	prt_tab(out);
+	prt_printf(out, "%s",
+		   BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR
+		   ? bch2_member_states[BCH_MEMBER_STATE(&m)]
+		   : "unknown");
+	prt_newline(out);
+
+	prt_printf(out, "Label:");
+	prt_tab(out);
+	if (BCH_MEMBER_GROUP(&m)) {
+		unsigned idx = BCH_MEMBER_GROUP(&m) - 1;
+
+		if (idx < disk_groups_nr(gi))
+			prt_printf(out, "%s (%u)",
+				   gi->entries[idx].label, idx);
+		else
+			prt_printf(out, "(bad disk labels section)");
+	} else {
+		prt_printf(out, "(none)");
+	}
+	prt_newline(out);
+
+	prt_printf(out, "Data allowed:");
+	prt_tab(out);
+	if (BCH_MEMBER_DATA_ALLOWED(&m))
+		prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
+	else
+		prt_printf(out, "(none)");
+	prt_newline(out);
+
+	prt_printf(out, "Has data:");
+	prt_tab(out);
+	if (data_have)
+		prt_bitflags(out, bch2_data_types, data_have);
+	else
+		prt_printf(out, "(none)");
+	prt_newline(out);
+
+	prt_printf(out, "Discard:");
+	prt_tab(out);
+	prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m));
+	prt_newline(out);
+
+	prt_printf(out, "Freespace initialized:");
+	prt_tab(out);
+	prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(&m));
+	prt_newline(out);
+
+	printbuf_indent_sub(out, 2);
+}
+
+/* Validate hook for the v1 members section: bounds check, then each member. */
+static int bch2_sb_members_v1_validate(struct bch_sb *sb,
+				    struct bch_sb_field *f,
+				    struct printbuf *err)
+{
+	struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
+	void *members_end = members_v1_get_mut(mi, sb->nr_devices);
+
+	if (members_end > vstruct_end(&mi->field)) {
+		prt_printf(err, "too many devices for section size");
+		return -BCH_ERR_invalid_sb_members;
+	}
+
+	/* Stop at the first member that fails validation: */
+	for (unsigned dev = 0; dev < sb->nr_devices; dev++) {
+		int ret = validate_member(err, members_v1_get(mi, dev), sb, dev);
+
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * to_text hook for the v1 members section: print each member in index order.
+ */
+static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb,
+				       struct bch_sb_field *f)
+{
+	struct bch_sb_field_members_v1 *members = field_to_type(f, members_v1);
+	struct bch_sb_field_disk_groups *groups = bch2_sb_field_get(sb, disk_groups);
+
+	for (unsigned dev = 0; dev < sb->nr_devices; dev++)
+		member_to_text(out, members_v1_get(members, dev), groups, sb, dev);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_members_v1 = {
+	.validate	= bch2_sb_members_v1_validate,
+	.to_text	= bch2_sb_members_v1_to_text,
+};
+
+/*
+ * to_text hook for the v2 members section: print each member in index order.
+ */
+static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb,
+				       struct bch_sb_field *f)
+{
+	struct bch_sb_field_members_v2 *members = field_to_type(f, members_v2);
+	struct bch_sb_field_disk_groups *groups = bch2_sb_field_get(sb, disk_groups);
+
+	for (unsigned dev = 0; dev < sb->nr_devices; dev++)
+		member_to_text(out, members_v2_get(members, dev), groups, sb, dev);
+}
+
+/* Validate hook for the v2 members section: size check, then each member. */
+static int bch2_sb_members_v2_validate(struct bch_sb *sb,
+				       struct bch_sb_field *f,
+				       struct printbuf *err)
+{
+	struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
+	void *members_end = members_v2_get_mut(mi, sb->nr_devices);
+	size_t needed = members_end - (void *) mi;
+	int ret = 0;
+
+	if (needed > vstruct_bytes(&mi->field)) {
+		prt_printf(err, "section too small (%zu > %zu)",
+			   needed, vstruct_bytes(&mi->field));
+		return -BCH_ERR_invalid_sb_members;
+	}
+
+	/* Stop at the first member that fails validation: */
+	for (unsigned dev = 0; !ret && dev < sb->nr_devices; dev++)
+		ret = validate_member(err, members_v2_get(mi, dev), sb, dev);
+
+	return ret;
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = {
+	.validate	= bch2_sb_members_v2_validate,
+	.to_text	= bch2_sb_members_v2_to_text,
+};
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
new file mode 100644
index 000000000000..430f3457bfd4
--- /dev/null
+++ b/fs/bcachefs/sb-members.h
@@ -0,0 +1,182 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_MEMBERS_H
+#define _BCACHEFS_SB_MEMBERS_H
+
+int bch2_members_v2_init(struct bch_fs *c);
+int bch_members_cpy_v2_v1(struct bch_sb_handle *disk_sb);
+struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i);
+struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i);
+
+static inline bool bch2_dev_is_online(struct bch_dev *ca)
+{
+	return !percpu_ref_is_zero(&ca->io_ref);
+}
+
+static inline bool bch2_dev_is_readable(struct bch_dev *ca)
+{
+	return bch2_dev_is_online(ca) &&
+		ca->mi.state != BCH_MEMBER_STATE_failed;
+}
+
+static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
+{
+	if (!percpu_ref_tryget(&ca->io_ref))
+		return false;
+
+	if (ca->mi.state == BCH_MEMBER_STATE_rw ||
+	    (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
+		return true;
+
+	percpu_ref_put(&ca->io_ref);
+	return false;
+}
+
+static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
+{
+	return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
+}
+
+/* Linear search of @devs for device index @dev. */
+static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
+					 unsigned dev)
+{
+	unsigned idx = 0;
+
+	while (idx < devs.nr && devs.devs[idx] != dev)
+		idx++;
+
+	return idx < devs.nr;
+}
+
+/* Remove the first occurrence of @dev from @devs, if there is one. */
+static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
+					  unsigned dev)
+{
+	unsigned idx = 0;
+
+	while (idx < devs->nr && devs->devs[idx] != dev)
+		idx++;
+	if (idx < devs->nr)
+		array_remove_item(devs->devs, devs->nr, idx);
+}
+
+static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
+					 unsigned dev)
+{
+	if (!bch2_dev_list_has_dev(*devs, dev)) {
+		BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs));
+		devs->devs[devs->nr++] = dev;
+	}
+}
+
+static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
+{
+	return (struct bch_devs_list) { .nr = 1, .devs[0] = dev };
+}
+
+static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
+					      const struct bch_devs_mask *mask)
+{
+	struct bch_dev *ca = NULL;
+
+	while ((*iter = mask
+		? find_next_bit(mask->d, c->sb.nr_devices, *iter)
+		: *iter) < c->sb.nr_devices &&
+	       !(ca = rcu_dereference_check(c->devs[*iter],
+					    lockdep_is_held(&c->state_lock))))
+		(*iter)++;
+
+	return ca;
+}
+
+#define for_each_member_device_rcu(ca, c, iter, mask)			\
+	for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++)
+
+static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter)
+{
+	struct bch_dev *ca;
+
+	rcu_read_lock();
+	if ((ca = __bch2_next_dev(c, iter, NULL)))
+		percpu_ref_get(&ca->ref);
+	rcu_read_unlock();
+
+	return ca;
+}
+
+/*
+ * If you break early, you must drop your ref on the current device
+ */
+#define for_each_member_device(ca, c, iter)				\
+	for ((iter) = 0;						\
+	     (ca = bch2_get_next_dev(c, &(iter)));			\
+	     percpu_ref_put(&ca->ref), (iter)++)
+
+static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
+						      unsigned *iter,
+						      int state_mask)
+{
+	struct bch_dev *ca;
+
+	rcu_read_lock();
+	while ((ca = __bch2_next_dev(c, iter, NULL)) &&
+	       (!((1 << ca->mi.state) & state_mask) ||
+		!percpu_ref_tryget(&ca->io_ref)))
+		(*iter)++;
+	rcu_read_unlock();
+
+	return ca;
+}
+
+#define __for_each_online_member(ca, c, iter, state_mask)		\
+	for ((iter) = 0;						\
+	     (ca = bch2_get_next_online_dev(c, &(iter), state_mask));	\
+	     percpu_ref_put(&ca->io_ref), (iter)++)
+
+#define for_each_online_member(ca, c, iter)				\
+	__for_each_online_member(ca, c, iter, ~0)
+
+#define for_each_rw_member(ca, c, iter)					\
+	__for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw)
+
+#define for_each_readable_member(ca, c, iter)				\
+	__for_each_online_member(ca, c, iter,				\
+		(1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro))
+
+/*
+ * If a key exists that references a device, the device won't be going away and
+ * we can omit rcu_read_lock():
+ */
+static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
+{
+	EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+
+	return rcu_dereference_check(c->devs[idx], 1);
+}
+
+static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
+{
+	EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+
+	return rcu_dereference_protected(c->devs[idx],
+					 lockdep_is_held(&c->sb_lock) ||
+					 lockdep_is_held(&c->state_lock));
+}
+
+/* XXX kill, move to struct bch_fs */
+static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
+{
+	struct bch_devs_mask devs;
+	struct bch_dev *ca;
+	unsigned i;
+
+	memset(&devs, 0, sizeof(devs));
+	for_each_online_member(ca, c, i)
+		__set_bit(ca->dev_idx, devs.d);
+	return devs;
+}
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1;
+extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2;
+
+#endif /* _BCACHEFS_SB_MEMBERS_H */
diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h
new file mode 100644
index 000000000000..c1860d8163fb
--- /dev/null
+++ b/fs/bcachefs/seqmutex.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SEQMUTEX_H
+#define _BCACHEFS_SEQMUTEX_H
+
+#include <linux/mutex.h>
+
+struct seqmutex {
+	struct mutex	lock;
+	u32		seq;
+};
+
+#define seqmutex_init(_lock)	mutex_init(&(_lock)->lock)
+
+static inline bool seqmutex_trylock(struct seqmutex *lock)
+{
+	return mutex_trylock(&lock->lock);
+}
+
+static inline void seqmutex_lock(struct seqmutex *lock)
+{
+	mutex_lock(&lock->lock);
+}
+
+static inline void seqmutex_unlock(struct seqmutex *lock)
+{
+	lock->seq++;
+	mutex_unlock(&lock->lock);
+}
+
+static inline u32 seqmutex_seq(struct seqmutex *lock)
+{
+	return lock->seq;
+}
+
+/* Retake @lock only if its sequence number still equals @seq. */
+static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq)
+{
+	if (lock->seq != seq || !mutex_trylock(&lock->lock))
+		return false;
+	/* Recheck with the lock held: seq may have moved before trylock succeeded */
+	if (lock->seq == seq)
+		return true;
+
+	mutex_unlock(&lock->lock);
+	return false;
+}
+
+#endif /* _BCACHEFS_SEQMUTEX_H */
diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c
new file mode 100644
index 000000000000..dc1a27cc31cd
--- /dev/null
+++ b/fs/bcachefs/siphash.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/*	$OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */
+
+/*-
+ * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d
+ * are the number of compression rounds and the number of finalization rounds.
+ * A compression round is identical to a finalization round and this round
+ * function is called SipRound.  Given a 128-bit key k and a (possibly empty)
+ * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m).
+ *
+ * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18,
+ * by Jean-Philippe Aumasson and Daniel J. Bernstein,
+ * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa
+ * https://131002.net/siphash/siphash.pdf
+ * https://131002.net/siphash/
+ */
+
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+#include <linux/bitops.h>
+#include <linux/string.h>
+
+#include "siphash.h"
+
+/*
+ * Run @rounds SipRounds over the four state words in @ctx; each round mixes
+ * the words with 64-bit additions, left rotations and xors.
+ */
+static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds)
+{
+	for (; rounds; rounds--) {
+		/* first half-round */
+		ctx->v[0] += ctx->v[1];
+		ctx->v[2] += ctx->v[3];
+		ctx->v[1] = rol64(ctx->v[1], 13) ^ ctx->v[0];
+		ctx->v[3] = rol64(ctx->v[3], 16) ^ ctx->v[2];
+		ctx->v[0] = rol64(ctx->v[0], 32);
+		/* second half-round */
+		ctx->v[2] += ctx->v[1];
+		ctx->v[0] += ctx->v[3];
+		ctx->v[1] = rol64(ctx->v[1], 17) ^ ctx->v[2];
+		ctx->v[3] = rol64(ctx->v[3], 21) ^ ctx->v[0];
+		ctx->v[2] = rol64(ctx->v[2], 32);
+	}
+	/* rounds is a by-value copy, so the caller's count is untouched */
+}
+
+/* Absorb the 8-byte little-endian message block at @ptr into the state. */
+static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds)
+{
+	const u64 block = get_unaligned_le64(ptr);
+	ctx->v[3] ^= block;
+	SipHash_Rounds(ctx, rounds);
+	ctx->v[0] ^= block;
+}
+
+/* Reset @ctx for a new message: key the four state words, empty the buffer. */
+void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
+{
+	const u64 k0 = le64_to_cpu(key->k0);
+	const u64 k1 = le64_to_cpu(key->k1);
+
+	/* The constants spell "somepseudorandomlygeneratedbytes" in ASCII. */
+	ctx->v[0] = 0x736f6d6570736575ULL ^ k0;
+	ctx->v[1] = 0x646f72616e646f6dULL ^ k1;
+	ctx->v[2] = 0x6c7967656e657261ULL ^ k0;
+	ctx->v[3] = 0x7465646279746573ULL ^ k1;
+
+	ctx->bytes = 0;
+	memset(ctx->buf, 0, sizeof(ctx->buf));
+}
+
+/* Feed @len bytes at @src into the hash, buffering any partial 8-byte block. */
+void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf,
+		    const void *src, size_t len)
+{
+	const u8 *ptr = src;
+	size_t left, used;
+
+	if (len == 0)
+		return;
+
+	used = ctx->bytes % sizeof(ctx->buf);
+	ctx->bytes += len;
+
+	if (used > 0) {
+		left = sizeof(ctx->buf) - used;
+		if (len >= left) {
+			memcpy(&ctx->buf[used], ptr, left);
+			SipHash_CRounds(ctx, ctx->buf, rc);
+			len -= left;
+			ptr += left;
+		} else {
+			memcpy(&ctx->buf[used], ptr, len);
+			return;
+		}
+	}
+
+	while (len >= sizeof(ctx->buf)) {
+		SipHash_CRounds(ctx, ptr, rc);
+		len -= sizeof(ctx->buf);
+		ptr += sizeof(ctx->buf);
+	}
+	/* Any buffered partial block was flushed above: the tail starts at buf[0] */
+	if (len > 0)
+		memcpy(ctx->buf, ptr, len);
+}
+
+/* Finish the hash and store the 64-bit result at @dst in little-endian order. */
+void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
+{
+	u64 r = SipHash_End(ctx, rc, rf);
+
+	/* @dst is an untyped pointer, so don't assume it is 8-byte aligned */
+	put_unaligned_le64(r, dst);
+}
+
+/* Pad and absorb the final block, finalize, wipe @ctx and return the hash. */
+u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
+{
+	const size_t tail = ctx->bytes % sizeof(ctx->buf);
+	u64 hash;
+
+	/* Zero-pad the partial block; its last byte carries the message length. */
+	memset(&ctx->buf[tail], 0, sizeof(ctx->buf) - tail - 1);
+	ctx->buf[7] = ctx->bytes;
+	SipHash_CRounds(ctx, ctx->buf, rc);
+
+	ctx->v[2] ^= 0xff;
+	SipHash_Rounds(ctx, rf);
+	hash = ctx->v[0] ^ ctx->v[1] ^ ctx->v[2] ^ ctx->v[3];
+
+	memset(ctx, 0, sizeof(*ctx));
+	return hash;
+}
+
+/* One-shot wrapper: Init with @key, Update over @src/@len, then End. */
+u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
+{
+	SIPHASH_CTX ctx;
+	SipHash_Init(&ctx, key);
+	SipHash_Update(&ctx, rc, rf, src, len);
+	return SipHash_End(&ctx, rc, rf);
+}
diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h
new file mode 100644
index 000000000000..3dfaf34a43b2
--- /dev/null
+++ b/fs/bcachefs/siphash.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */
+/*-
+ * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions)
+ * optimized for speed on short messages returning a 64bit hash/digest value.
+ *
+ * The number of rounds is selected by the family of wrapper macros used:
+ *  SipHash24_*() for the fast and reasonably strong version
+ *  SipHash48_*() for the strong version (half as fast)
+ *
+ * SIPHASH_CTX ctx;
+ * SipHash24_Init(&ctx, &key);	(key is a SIPHASH_KEY, i.e. 16 bytes)
+ * SipHash24_Update(&ctx, pointer_to_string, length_of_string);
+ * SipHash24_Final(output, &ctx);	(output receives 8 bytes, little endian)
+ * One-shot: hash = SipHash24(&key, pointer_to_string, length_of_string);
+ */
+
+#ifndef _SIPHASH_H_
+#define _SIPHASH_H_
+
+#include <linux/types.h>
+
+#define SIPHASH_BLOCK_LENGTH	 8
+#define SIPHASH_KEY_LENGTH	16
+#define SIPHASH_DIGEST_LENGTH	 8
+
+typedef struct _SIPHASH_CTX {
+	u64		v[4];
+	u8		buf[SIPHASH_BLOCK_LENGTH];
+	u32		bytes;
+} SIPHASH_CTX;
+
+typedef struct {
+	__le64		k0;
+	__le64		k1;
+} SIPHASH_KEY;
+
+void	SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *);
+void	SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t);
+u64	SipHash_End(SIPHASH_CTX *, int, int);
+void	SipHash_Final(void *, SIPHASH_CTX *, int, int);
+u64	SipHash(const SIPHASH_KEY *, int, int, const void *, size_t);
+
+#define SipHash24_Init(_c, _k)		SipHash_Init((_c), (_k))
+#define SipHash24_Update(_c, _p, _l)	SipHash_Update((_c), 2, 4, (_p), (_l))
+#define SipHash24_End(_d)		SipHash_End((_d), 2, 4)
+#define SipHash24_Final(_d, _c)		SipHash_Final((_d), (_c), 2, 4)
+#define SipHash24(_k, _p, _l)		SipHash((_k), 2, 4, (_p), (_l))
+
+#define SipHash48_Init(_c, _k)		SipHash_Init((_c), (_k))
+#define SipHash48_Update(_c, _p, _l)	SipHash_Update((_c), 4, 8, (_p), (_l))
+#define SipHash48_End(_d)		SipHash_End((_d), 4, 8)
+#define SipHash48_Final(_d, _c)		SipHash_Final((_d), (_c), 4, 8)
+#define SipHash48(_k, _p, _l)		SipHash((_k), 4, 8, (_p), (_l))
+
+#endif /* _SIPHASH_H_ */
diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c
new file mode 100644
index 000000000000..b684b9f00c1b
--- /dev/null
+++ b/fs/bcachefs/six.c
@@ -0,0 +1,913 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/export.h>
+#include <linux/log2.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+
+#include "six.h"
+
+#ifdef DEBUG
+#define EBUG_ON(cond)			BUG_ON(cond)
+#else
+#define EBUG_ON(cond)			do {} while (0)
+#endif
+
+#define six_acquire(l, t, r, ip)	lock_acquire(l, 0, t, r, 1, NULL, ip)
+#define six_release(l, ip)		lock_release(l, ip)
+
+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type);
+
+#define SIX_LOCK_HELD_read_OFFSET	0
+#define SIX_LOCK_HELD_read		~(~0U << 26)
+#define SIX_LOCK_HELD_intent		(1U << 26)
+#define SIX_LOCK_HELD_write		(1U << 27)
+#define SIX_LOCK_WAITING_read		(1U << (28 + SIX_LOCK_read))
+#define SIX_LOCK_WAITING_write		(1U << (28 + SIX_LOCK_write))
+#define SIX_LOCK_NOSPIN			(1U << 31)
+
+struct six_lock_vals {
+	/* Value we add to the lock in order to take the lock: */
+	u32			lock_val;
+
+	/* If the lock has this value (used as a mask), taking the lock fails: */
+	u32			lock_fail;
+
+	/* Mask that indicates lock is held for this type: */
+	u32			held_mask;
+
+	/* Waitlist we wakeup when releasing the lock: */
+	enum six_lock_type	unlock_wakeup;
+};
+
+static const struct six_lock_vals l[] = {
+	[SIX_LOCK_read] = {
+		.lock_val	= 1U << SIX_LOCK_HELD_read_OFFSET,
+		.lock_fail	= SIX_LOCK_HELD_write,
+		.held_mask	= SIX_LOCK_HELD_read,
+		.unlock_wakeup	= SIX_LOCK_write,
+	},
+	[SIX_LOCK_intent] = {
+		.lock_val	= SIX_LOCK_HELD_intent,
+		.lock_fail	= SIX_LOCK_HELD_intent,
+		.held_mask	= SIX_LOCK_HELD_intent,
+		.unlock_wakeup	= SIX_LOCK_intent,
+	},
+	[SIX_LOCK_write] = {
+		.lock_val	= SIX_LOCK_HELD_write,
+		.lock_fail	= SIX_LOCK_HELD_read,
+		.held_mask	= SIX_LOCK_HELD_write,
+		.unlock_wakeup	= SIX_LOCK_read,
+	},
+};
+
+static inline void six_set_bitmask(struct six_lock *lock, u32 mask)
+{
+	if ((atomic_read(&lock->state) & mask) != mask)
+		atomic_or(mask, &lock->state);
+}
+
+static inline void six_clear_bitmask(struct six_lock *lock, u32 mask)
+{
+	if (atomic_read(&lock->state) & mask)
+		atomic_and(~mask, &lock->state);
+}
+
+static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
+				 u32 old, struct task_struct *owner)
+{
+	if (type != SIX_LOCK_intent)
+		return;
+
+	if (!(old & SIX_LOCK_HELD_intent)) {
+		EBUG_ON(lock->owner);
+		lock->owner = owner;
+	} else {
+		EBUG_ON(lock->owner != current);
+	}
+}
+
+static inline unsigned pcpu_read_count(struct six_lock *lock)
+{
+	unsigned read_count = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		read_count += *per_cpu_ptr(lock->readers, cpu);
+	return read_count;
+}
+
+/*
+ * __do_six_trylock() - main trylock routine
+ *
+ * Returns 1 on success, 0 on failure
+ *
+ * In percpu reader mode, a failed trylock may cause a spurious trylock failure
+ * for another thread taking the competing lock type, and we may have to do a
+ * wakeup: when a wakeup is required, we return -1 - wakeup_type.
+ */
+static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
+			    struct task_struct *task, bool try)
+{
+	int ret;
+	u32 old;
+
+	EBUG_ON(type == SIX_LOCK_write && lock->owner != task);
+	EBUG_ON(type == SIX_LOCK_write &&
+		(try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write)));
+
+	/*
+	 * Percpu reader mode:
+	 *
+	 * The basic idea behind this algorithm is that you can implement a lock
+	 * between two threads without any atomics, just memory barriers:
+	 *
+	 * For two threads you'll need two variables, one variable for "thread a
+	 * has the lock" and another for "thread b has the lock".
+	 *
+	 * To take the lock, a thread sets its variable indicating that it holds
+	 * the lock, then issues a full memory barrier, then reads from the
+	 * other thread's variable to check if the other thread thinks it has
+	 * the lock. If we raced, we backoff and retry/sleep.
+	 *
+	 * Failure to take the lock may cause a spurious trylock failure in
+	 * another thread, because we temporarily set the lock to indicate that
+	 * we held it. This would be a problem for a thread in six_lock(), when
+	 * they are calling trylock after adding themself to the waitlist and
+	 * prior to sleeping.
+	 *
+	 * Therefore, if we fail to get the lock, and there were waiters of the
+	 * type we conflict with, we will have to issue a wakeup.
+	 *
+	 * Since we may be called under wait_lock (and by the wakeup code
+	 * itself), we return that the wakeup has to be done instead of doing it
+	 * here.
+	 */
+	if (type == SIX_LOCK_read && lock->readers) {
+		preempt_disable();
+		this_cpu_inc(*lock->readers); /* signal that we own lock */
+
+		smp_mb();
+
+		old = atomic_read(&lock->state);
+		ret = !(old & l[type].lock_fail);
+
+		this_cpu_sub(*lock->readers, !ret);
+		preempt_enable();
+
+		if (!ret && (old & SIX_LOCK_WAITING_write))
+			ret = -1 - SIX_LOCK_write;
+	} else if (type == SIX_LOCK_write && lock->readers) {
+		if (try) {
+			atomic_add(SIX_LOCK_HELD_write, &lock->state);
+			smp_mb__after_atomic();
+		}
+
+		ret = !pcpu_read_count(lock);
+
+		if (try && !ret) {
+			old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state);
+			if (old & SIX_LOCK_WAITING_read)
+				ret = -1 - SIX_LOCK_read;
+		}
+	} else {
+		old = atomic_read(&lock->state);
+		do {
+			ret = !(old & l[type].lock_fail);
+			if (!ret || (type == SIX_LOCK_write && !try)) {
+				smp_mb();
+				break;
+			}
+		} while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val));
+
+		EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask));
+	}
+
+	if (ret > 0)
+		six_set_owner(lock, type, old, task);
+
+	EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 &&
+		(atomic_read(&lock->state) & SIX_LOCK_HELD_write));
+
+	return ret;
+}
+
+static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type)
+{
+	struct six_lock_waiter *w, *next;
+	struct task_struct *task;
+	bool saw_one;
+	int ret;
+again:
+	ret = 0;
+	saw_one = false;
+	raw_spin_lock(&lock->wait_lock);
+
+	list_for_each_entry_safe(w, next, &lock->wait_list, list) {
+		if (w->lock_want != lock_type)
+			continue;
+
+		if (saw_one && lock_type != SIX_LOCK_read)
+			goto unlock;
+		saw_one = true;
+
+		ret = __do_six_trylock(lock, lock_type, w->task, false);
+		if (ret <= 0)
+			goto unlock;
+
+		/*
+		 * Similar to percpu_rwsem_wake_function(), we need to guard
+		 * against the wakee noticing w->lock_acquired, returning, and
+		 * then exiting before we do the wakeup:
+		 */
+		task = get_task_struct(w->task);
+		__list_del(w->list.prev, w->list.next);
+		/*
+		 * The release barrier here ensures the ordering of the
+		 * __list_del before setting w->lock_acquired; @w is on the
+		 * stack of the thread doing the waiting and will be reused
+		 * after it sees w->lock_acquired with no other locking:
+		 * pairs with smp_load_acquire() in six_lock_slowpath()
+		 */
+		smp_store_release(&w->lock_acquired, true);
+		wake_up_process(task);
+		put_task_struct(task);
+	}
+
+	six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type);
+unlock:
+	raw_spin_unlock(&lock->wait_lock);
+
+	if (ret < 0) {
+		lock_type = -ret - 1;
+		goto again;
+	}
+}
+
+__always_inline
+static void six_lock_wakeup(struct six_lock *lock, u32 state,
+			    enum six_lock_type lock_type)
+{
+	if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read))
+		return;
+
+	if (!(state & (SIX_LOCK_WAITING_read << lock_type)))
+		return;
+
+	__six_lock_wakeup(lock, lock_type);
+}
+
+__always_inline
+static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try)
+{
+	int ret;
+
+	ret = __do_six_trylock(lock, type, current, try);
+	if (ret < 0)
+		__six_lock_wakeup(lock, -ret - 1);
+
+	return ret > 0;
+}
+
+/**
+ * six_trylock_ip - attempt to take a six lock without blocking
+ * @lock:	lock to take
+ * @type:	SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @ip:		ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: true on success, false on failure.
+ */
+bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
+{
+	if (!do_six_trylock(lock, type, true))
+		return false;
+
+	if (type != SIX_LOCK_write)
+		six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
+	return true;
+}
+EXPORT_SYMBOL_GPL(six_trylock_ip);
+
+/**
+ * six_relock_ip - attempt to re-take a lock that was held previously
+ * @lock:	lock to take
+ * @type:	SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @seq:	lock sequence number obtained from six_lock_seq() while lock was
+ *		held previously
+ * @ip:		ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: true on success, false on failure.
+ */
+bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
+		   unsigned seq, unsigned long ip)
+{
+	if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip))
+		return false;
+
+	if (six_lock_seq(lock) != seq) {
+		six_unlock_ip(lock, type, ip);
+		return false;
+	}
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(six_relock_ip);
+
+#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER
+
+static inline bool six_can_spin_on_owner(struct six_lock *lock)
+{
+	struct task_struct *owner;
+	bool ret;
+
+	if (need_resched())
+		return false;
+
+	rcu_read_lock();
+	owner = READ_ONCE(lock->owner);
+	ret = !owner || owner_on_cpu(owner);
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static inline bool six_spin_on_owner(struct six_lock *lock,
+				     struct task_struct *owner,
+				     u64 end_time)
+{
+	bool ret = true;
+	unsigned loop = 0;
+
+	rcu_read_lock();
+	while (lock->owner == owner) {
+		/*
+		 * Ensure we emit the owner->on_cpu, dereference _after_
+		 * checking lock->owner still matches owner. If that fails,
+		 * owner might point to freed memory. If it still matches,
+		 * the rcu_read_lock() ensures the memory stays valid.
+		 */
+		barrier();
+
+		if (!owner_on_cpu(owner) || need_resched()) {
+			ret = false;
+			break;
+		}
+
+		if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) {
+			six_set_bitmask(lock, SIX_LOCK_NOSPIN);
+			ret = false;
+			break;
+		}
+
+		cpu_relax();
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+{
+	struct task_struct *task = current;
+	u64 end_time;
+
+	if (type == SIX_LOCK_write)
+		return false;
+
+	preempt_disable();
+	if (!six_can_spin_on_owner(lock))
+		goto fail;
+
+	if (!osq_lock(&lock->osq))
+		goto fail;
+
+	end_time = sched_clock() + 10 * NSEC_PER_USEC;
+
+	while (1) {
+		struct task_struct *owner;
+
+		/*
+		 * If there's an owner, wait for it to either
+		 * release the lock or go to sleep.
+		 */
+		owner = READ_ONCE(lock->owner);
+		if (owner && !six_spin_on_owner(lock, owner, end_time))
+			break;
+
+		if (do_six_trylock(lock, type, false)) {
+			osq_unlock(&lock->osq);
+			preempt_enable();
+			return true;
+		}
+
+		/*
+		 * When there's no owner, we might have preempted between the
+		 * owner acquiring the lock and setting the owner field. If
+		 * we're an RT task that will live-lock because we won't let
+		 * the owner complete.
+		 */
+		if (!owner && (need_resched() || rt_task(task)))
+			break;
+
+		/*
+		 * The cpu_relax() call is a compiler barrier which forces
+		 * everything in this loop to be re-loaded. We don't need
+		 * memory barriers as we'll eventually observe the right
+		 * values at the cost of a few extra spins.
+		 */
+		cpu_relax();
+	}
+
+	osq_unlock(&lock->osq);
+fail:
+	preempt_enable();
+
+	/*
+	 * If we fell out of the spin path because of need_resched(),
+	 * reschedule now, before we try-lock again. This avoids getting
+	 * scheduled out right after we obtained the lock.
+	 */
+	if (need_resched())
+		schedule();
+
+	return false;
+}
+
+#else /* CONFIG_SIX_LOCK_SPIN_ON_OWNER */
+
+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+{
+	return false;
+}
+
+#endif
+
+noinline
+static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
+			     struct six_lock_waiter *wait,
+			     six_lock_should_sleep_fn should_sleep_fn, void *p,
+			     unsigned long ip)
+{
+	int ret = 0;
+
+	if (type == SIX_LOCK_write) {
+		EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
+		atomic_add(SIX_LOCK_HELD_write, &lock->state);
+		smp_mb__after_atomic();
+	}
+
+	if (six_optimistic_spin(lock, type))
+		goto out;
+
+	lock_contended(&lock->dep_map, ip);
+
+	wait->task		= current;
+	wait->lock_want		= type;
+	wait->lock_acquired	= false;
+
+	raw_spin_lock(&lock->wait_lock);
+	six_set_bitmask(lock, SIX_LOCK_WAITING_read << type);
+	/*
+	 * Retry taking the lock after taking waitlist lock, in case we raced
+	 * with an unlock:
+	 */
+	ret = __do_six_trylock(lock, type, current, false);
+	if (ret <= 0) {
+		wait->start_time = local_clock();
+
+		if (!list_empty(&lock->wait_list)) {
+			struct six_lock_waiter *last =
+				list_last_entry(&lock->wait_list,
+					struct six_lock_waiter, list);
+
+			if (time_before_eq64(wait->start_time, last->start_time))
+				wait->start_time = last->start_time + 1;
+		}
+
+		list_add_tail(&wait->list, &lock->wait_list);
+	}
+	raw_spin_unlock(&lock->wait_lock);
+
+	if (unlikely(ret > 0)) {
+		ret = 0;
+		goto out;
+	}
+
+	if (unlikely(ret < 0)) {
+		__six_lock_wakeup(lock, -ret - 1);
+		ret = 0;
+	}
+
+	while (1) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+
+		/*
+		 * Ensures that writes to the waitlist entry happen after we see
+		 * wait->lock_acquired: pairs with the smp_store_release in
+		 * __six_lock_wakeup
+		 */
+		if (smp_load_acquire(&wait->lock_acquired))
+			break;
+
+		ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
+		if (unlikely(ret)) {
+			bool acquired;
+
+			/*
+			 * If should_sleep_fn() returns an error, we are
+			 * required to return that error even if we already
+			 * acquired the lock - should_sleep_fn() might have
+			 * modified external state (e.g. when the deadlock cycle
+			 * detector in bcachefs issued a transaction restart)
+			 */
+			raw_spin_lock(&lock->wait_lock);
+			acquired = wait->lock_acquired;
+			if (!acquired)
+				list_del(&wait->list);
+			raw_spin_unlock(&lock->wait_lock);
+
+			if (unlikely(acquired))
+				do_six_unlock_type(lock, type);
+			break;
+		}
+
+		schedule();
+	}
+
+	__set_current_state(TASK_RUNNING);
+out:
+	if (ret && type == SIX_LOCK_write) {
+		six_clear_bitmask(lock, SIX_LOCK_HELD_write);
+		six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read);
+	}
+
+	return ret;
+}
+
+/**
+ * six_lock_ip_waiter - take a lock, with full waitlist interface
+ * @lock:	lock to take
+ * @type:	SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @wait:	pointer to wait object, which will be added to lock's waitlist
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ *		to scheduling
+ * @p:		passed through to @should_sleep_fn
+ * @ip:		ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * This is the most general six_lock() variant, with parameters to support full
+ * cycle detection for deadlock avoidance.
+ *
+ * The code calling this function must implement tracking of held locks, and the
+ * @wait object should be embedded into the struct that tracks held locks -
+ * which must also be accessible in a thread-safe way.
+ *
+ * @should_sleep_fn should invoke the cycle detector; it should walk each
+ * lock's waiters, and for each waiter recursively walk their held locks.
+ *
+ * When this function must block, @wait will be added to @lock's waitlist before
+ * calling trylock, and before calling @should_sleep_fn, and @wait will not be
+ * removed from the lock waitlist until the lock has been successfully acquired,
+ * or we abort.
+ *
+ * @wait.start_time will be monotonically increasing for any given waitlist, and
+ * thus may be used as a loop cursor.
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
+		       struct six_lock_waiter *wait,
+		       six_lock_should_sleep_fn should_sleep_fn, void *p,
+		       unsigned long ip)
+{
+	int ret;
+
+	wait->start_time = 0;
+
+	if (type != SIX_LOCK_write)
+		six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip);
+
+	ret = do_six_trylock(lock, type, true) ? 0
+		: six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip);
+
+	if (ret && type != SIX_LOCK_write)
+		six_release(&lock->dep_map, ip);
+	if (!ret)
+		lock_acquired(&lock->dep_map, ip);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(six_lock_ip_waiter);
+
+__always_inline
+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+	u32 state;
+
+	if (type == SIX_LOCK_intent)
+		lock->owner = NULL;
+
+	if (type == SIX_LOCK_read &&
+	    lock->readers) {
+		smp_mb(); /* unlock barrier */
+		this_cpu_dec(*lock->readers);
+		smp_mb(); /* between unlocking and checking for waiters */
+		state = atomic_read(&lock->state);
+	} else {
+		u32 v = l[type].lock_val;
+
+		if (type != SIX_LOCK_read)
+			v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN;
+
+		EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask));
+		state = atomic_sub_return_release(v, &lock->state);
+	}
+
+	six_lock_wakeup(lock, state, l[type].unlock_wakeup);
+}
+
+/**
+ * six_unlock_ip - drop a six lock
+ * @lock:	lock to unlock
+ * @type:	SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @ip:		ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * When a lock is held multiple times (because six_lock_increment() was used),
+ * this decrements the 'lock held' counter by one.
+ *
+ * For example:
+ * six_lock_read(&foo->lock);				read count 1
+ * six_lock_increment(&foo->lock, SIX_LOCK_read);	read count 2
+ * six_unlock_type(&foo->lock, SIX_LOCK_read);		read count 1
+ * six_unlock_type(&foo->lock, SIX_LOCK_read);		read count 0
+ */
+void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
+{
+	EBUG_ON(type == SIX_LOCK_write &&
+		!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
+	EBUG_ON((type == SIX_LOCK_write ||
+		 type == SIX_LOCK_intent) &&
+		lock->owner != current);
+
+	if (type != SIX_LOCK_write)
+		six_release(&lock->dep_map, ip);
+	else
+		lock->seq++;
+
+	if (type == SIX_LOCK_intent &&
+	    lock->intent_lock_recurse) {
+		--lock->intent_lock_recurse;
+		return;
+	}
+
+	do_six_unlock_type(lock, type);
+}
+EXPORT_SYMBOL_GPL(six_unlock_ip);
+
+/**
+ * six_lock_downgrade - convert an intent lock to a read lock
+ * @lock:	lock to downgrade
+ *
+ * @lock will have read count incremented and intent count decremented
+ */
+void six_lock_downgrade(struct six_lock *lock)
+{
+	six_lock_increment(lock, SIX_LOCK_read);
+	six_unlock_intent(lock);
+}
+EXPORT_SYMBOL_GPL(six_lock_downgrade);
+
+/**
+ * six_lock_tryupgrade - attempt to convert read lock to an intent lock
+ * @lock:	lock to upgrade
+ *
+ * On success, @lock will have intent count incremented and read count
+ * decremented
+ *
+ * Return: true on success, false on failure
+ */
+bool six_lock_tryupgrade(struct six_lock *lock)
+{
+	u32 old = atomic_read(&lock->state), new;
+
+	do {
+		new = old;
+
+		if (new & SIX_LOCK_HELD_intent)
+			return false;
+
+		if (!lock->readers) {
+			EBUG_ON(!(new & SIX_LOCK_HELD_read));
+			new -= l[SIX_LOCK_read].lock_val;
+		}
+
+		new |= SIX_LOCK_HELD_intent;
+	} while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new));
+
+	if (lock->readers)
+		this_cpu_dec(*lock->readers);
+
+	six_set_owner(lock, SIX_LOCK_intent, old, current);
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(six_lock_tryupgrade);
+
+/**
+ * six_trylock_convert - attempt to convert a held lock from one type to another
+ * @lock:	lock to upgrade
+ * @from:	SIX_LOCK_read or SIX_LOCK_intent
+ * @to:		SIX_LOCK_read or SIX_LOCK_intent
+ *
+ * On success, @lock is held for @to instead of @from: one held count is
+ * moved from the @from lock type to the @to lock type
+ *
+ * Return: true on success, false on failure
+ */
+bool six_trylock_convert(struct six_lock *lock,
+			 enum six_lock_type from,
+			 enum six_lock_type to)
+{
+	EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write);
+
+	if (to == from)
+		return true;
+
+	if (to == SIX_LOCK_read) {
+		six_lock_downgrade(lock);
+		return true;
+	} else {
+		return six_lock_tryupgrade(lock);
+	}
+}
+EXPORT_SYMBOL_GPL(six_trylock_convert);
+
+/**
+ * six_lock_increment - increase held lock count on a lock that is already held
+ * @lock:	lock to increment
+ * @type:	SIX_LOCK_read or SIX_LOCK_intent
+ *
+ * @lock must already be held, with a lock type that is greater than or equal to
+ * @type
+ *
+ * A corresponding six_unlock_type() call will be required for @lock to be fully
+ * unlocked.
+ */
+void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
+{
+	six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_);
+
+	/* XXX: assert already locked, and that we don't overflow: */
+
+	switch (type) {
+	case SIX_LOCK_read:
+		if (lock->readers) {
+			this_cpu_inc(*lock->readers);
+		} else {
+			EBUG_ON(!(atomic_read(&lock->state) &
+				  (SIX_LOCK_HELD_read|
+				   SIX_LOCK_HELD_intent)));
+			atomic_add(l[type].lock_val, &lock->state);
+		}
+		break;
+	case SIX_LOCK_intent:
+		EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
+		lock->intent_lock_recurse++;
+		break;
+	case SIX_LOCK_write:
+		BUG();
+		break;
+	}
+}
+EXPORT_SYMBOL_GPL(six_lock_increment);
+
+/**
+ * six_lock_wakeup_all - wake up all waiters on @lock
+ * @lock:	lock to wake up waiters for
+ *
+ * Waking up waiters will cause them to re-run should_sleep_fn, which may then
+ * abort the lock operation.
+ *
+ * This function is never needed in a bug-free program; it's only useful in
+ * debug code, e.g. to determine if a cycle detector is at fault.
+ */
+void six_lock_wakeup_all(struct six_lock *lock)
+{
+	u32 state = atomic_read(&lock->state);
+	struct six_lock_waiter *w;
+
+	six_lock_wakeup(lock, state, SIX_LOCK_read);
+	six_lock_wakeup(lock, state, SIX_LOCK_intent);
+	six_lock_wakeup(lock, state, SIX_LOCK_write);
+
+	raw_spin_lock(&lock->wait_lock);
+	list_for_each_entry(w, &lock->wait_list, list)
+		wake_up_process(w->task);
+	raw_spin_unlock(&lock->wait_lock);
+}
+EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
+
+/**
+ * six_lock_counts - return held lock counts, for each lock type
+ * @lock:	lock to return counters for
+ *
+ * Return: the number of times a lock is held for read, intent and write.
+ */
+struct six_lock_count six_lock_counts(struct six_lock *lock)
+{
+	struct six_lock_count ret;
+
+	ret.n[SIX_LOCK_read]	= !lock->readers
+		? atomic_read(&lock->state) & SIX_LOCK_HELD_read
+		: pcpu_read_count(lock);
+	ret.n[SIX_LOCK_intent]	= !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) +
+		lock->intent_lock_recurse;
+	ret.n[SIX_LOCK_write]	= !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(six_lock_counts);
+
+/**
+ * six_lock_readers_add - directly manipulate reader count of a lock
+ * @lock:	lock to add/subtract readers for
+ * @nr:		reader count to add/subtract
+ *
+ * When an upper layer is implementing lock reentrancy, we may have both read
+ * and intent locks on the same lock.
+ *
+ * When we need to take a write lock, the read locks will cause self-deadlock,
+ * because six locks themselves do not track which read locks are held by the
+ * current thread and which are held by a different thread - it does no
+ * per-thread tracking of held locks.
+ *
+ * The upper layer that is tracking held locks may however, if trylock() has
+ * failed, count up its own read locks, subtract them, take the write lock, and
+ * then re-add them.
+ *
+ * As in any other situation when taking a write lock, @lock must be held for
+ * intent one (or more) times, so @lock will never be left unlocked.
+ */
+void six_lock_readers_add(struct six_lock *lock, int nr)
+{
+	if (lock->readers) {
+		this_cpu_add(*lock->readers, nr);
+	} else {
+		EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0);
+		/* reader count starts at bit 0 */
+		atomic_add(nr, &lock->state);
+	}
+}
+EXPORT_SYMBOL_GPL(six_lock_readers_add);
+
+/**
+ * six_lock_exit - release resources held by a lock prior to freeing
+ * @lock:	lock to exit
+ *
+ * When a lock was initialized in percpu mode (SIX_LOCK_INIT_PCPU), this is
+ * required to free the percpu read counts.
+ */
+void six_lock_exit(struct six_lock *lock)
+{
+	WARN_ON(lock->readers && pcpu_read_count(lock));
+	WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read);
+
+	free_percpu(lock->readers);
+	lock->readers = NULL;
+}
+EXPORT_SYMBOL_GPL(six_lock_exit);
+
+void __six_lock_init(struct six_lock *lock, const char *name,
+		     struct lock_class_key *key, enum six_lock_init_flags flags)
+{
+	atomic_set(&lock->state, 0);
+	raw_spin_lock_init(&lock->wait_lock);
+	INIT_LIST_HEAD(&lock->wait_list);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	debug_check_no_locks_freed((void *) lock, sizeof(*lock));
+	lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+
+	/*
+	 * Don't assume that we have real percpu variables available in
+	 * userspace:
+	 */
+#ifdef __KERNEL__
+	if (flags & SIX_LOCK_INIT_PCPU) {
+		/*
+		 * We don't return an error here on memory allocation failure
+		 * since percpu is an optimization, and locks will work with the
+		 * same semantics in non-percpu mode: callers can check for
+		 * failure if they wish by checking lock->readers, but generally
+		 * will not want to treat it as an error.
+		 */
+		lock->readers = alloc_percpu(unsigned);
+	}
+#endif
+}
+EXPORT_SYMBOL_GPL(__six_lock_init);
diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h
new file mode 100644
index 000000000000..4c268b0b8316
--- /dev/null
+++ b/fs/bcachefs/six.h
@@ -0,0 +1,393 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_SIX_H
+#define _LINUX_SIX_H
+
+/**
+ * DOC: SIX locks overview
+ *
+ * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores
+ * but with an additional state: read/shared, intent, exclusive/write
+ *
+ * The purpose of the intent state is to allow for greater concurrency on tree
+ * structures without deadlocking. In general, a read can't be upgraded to a
+ * write lock without deadlocking, so an operation that updates multiple nodes
+ * will have to take write locks for the full duration of the operation.
+ *
+ * But by adding an intent state, which is exclusive with other intent locks but
+ * not with readers, we can take intent locks at the start of the operation,
+ * and then take write locks only for the actual update to each individual
+ * node, without deadlocking.
+ *
+ * Example usage:
+ *   six_lock_read(&foo->lock);
+ *   six_unlock_read(&foo->lock);
+ *
+ * An intent lock must be held before taking a write lock:
+ *   six_lock_intent(&foo->lock);
+ *   six_lock_write(&foo->lock);
+ *   six_unlock_write(&foo->lock);
+ *   six_unlock_intent(&foo->lock);
+ *
+ * Other operations:
+ *   six_trylock_read()
+ *   six_trylock_intent()
+ *   six_trylock_write()
+ *
+ *   six_lock_downgrade()	convert from intent to read
+ *   six_lock_tryupgrade()	attempt to convert from read to intent, may fail
+ *
+ * There are also interfaces that take the lock type as an enum:
+ *
+ *   six_lock_type(&foo->lock, SIX_LOCK_read);
+ *   six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent)
+ *   six_lock_type(&foo->lock, SIX_LOCK_write);
+ *   six_unlock_type(&foo->lock, SIX_LOCK_write);
+ *   six_unlock_type(&foo->lock, SIX_LOCK_intent);
+ *
+ * Lock sequence numbers - unlock(), relock():
+ *
+ *   Locks embed sequence numbers, which are incremented on write lock/unlock.
+ *   This allows locks to be dropped and then retaken iff the state they protect
+ *   hasn't changed; this makes it much easier to avoid holding locks while e.g.
+ *   doing IO or allocating memory.
+ *
+ *   Example usage:
+ *     six_lock_read(&foo->lock);
+ *     u32 seq = six_lock_seq(&foo->lock);
+ *     six_unlock_read(&foo->lock);
+ *
+ *     some_operation_that_may_block();
+ *
+ *     if (six_relock_read(&foo->lock, seq)) { ... }
+ *
+ *   If the relock operation succeeds, it is as if the lock was never unlocked.
+ *
+ * Reentrancy:
+ *
+ *   Six locks are not by themselves reentrant, but have counters for both the
+ *   read and intent states that can be used to provide reentrancy by an upper
+ *   layer that tracks held locks. If a lock is known to already be held in the
+ *   read or intent state, six_lock_increment() can be used to bump the "lock
+ *   held in this state" counter, increasing the number of unlock calls that
+ *   will be required to fully unlock it.
+ *
+ *   Example usage:
+ *     six_lock_read(&foo->lock);
+ *     six_lock_increment(&foo->lock, SIX_LOCK_read);
+ *     six_unlock_read(&foo->lock);
+ *     six_unlock_read(&foo->lock);
+ *   foo->lock is now fully unlocked.
+ *
+ *   Since the intent state supersedes read, it's legal to increment the read
+ *   counter when holding an intent lock, but not the reverse.
+ *
+ *   A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write)
+ *   is not legal.
+ *
+ * should_sleep_fn:
+ *
+ *   There is a six_lock() variant that takes a function pointer that is called
+ *   immediately prior to schedule() when blocking, and may return an error to
+ *   abort.
+ *
+ *   One possible use for this feature is when objects being locked are part of
+ *   a cache and may be reused, and lock ordering is based on a property of the
+ *   object that will change when the object is reused - i.e. logical key order.
+ *
+ *   If looking up an object in the cache may race with object reuse, and lock
+ *   ordering is required to prevent deadlock, object reuse may change the
+ *   correct lock order for that object and cause a deadlock. should_sleep_fn
+ *   can be used to check if the object is still the object we want and avoid
+ *   this deadlock.
+ *
+ * Wait list entry interface:
+ *
+ *   There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a
+ *   wait list entry. By embedding six_lock_waiter into another object, and by
+ *   traversing lock waitlists, it is then possible for an upper layer to
+ *   implement full cycle detection for deadlock avoidance.
+ *
+ *   should_sleep_fn should be used for invoking the cycle detector, walking the
+ *   graph of held locks to check for a deadlock. The upper layer must track
+ *   held locks for each thread, and each thread's held locks must be reachable
+ *   from its six_lock_waiter object.
+ *
+ *   six_lock_waiter() will add the wait object to the waitlist before re-trying
+ *   taking the lock and before calling should_sleep_fn; the wait object will not
+ *   be removed from the waitlist until either the lock has been successfully
+ *   acquired, or we aborted because should_sleep_fn returned an error.
+ *
+ *   Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will
+ *   have timestamps in strictly ascending order - this is so the timestamp can
+ *   be used as a cursor for lock graph traverse.
+ */
+
+#include <linux/lockdep.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER
+#include <linux/osq_lock.h>
+#endif
+
+enum six_lock_type {
+	SIX_LOCK_read,		/* read/shared */
+	SIX_LOCK_intent,	/* exclusive with other intent locks, but not with readers */
+	SIX_LOCK_write,		/* exclusive/write; an intent lock must be held first */
+};
+
+struct six_lock {
+	atomic_t		state;	/* lock state word; the reader count starts at bit 0 */
+	u32			seq;	/* sequence number, see six_lock_seq() */
+	unsigned		intent_lock_recurse;	/* NOTE(review): presumably extra intent holds from six_lock_increment() - confirm */
+	struct task_struct	*owner;
+	unsigned __percpu	*readers;	/* percpu read counts; NULL unless allocated for SIX_LOCK_INIT_PCPU */
+#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER
+	struct optimistic_spin_queue osq;
+#endif
+	raw_spinlock_t		wait_lock;	/* NOTE(review): presumably protects wait_list - confirm */
+	struct list_head	wait_list;	/* waitlist of struct six_lock_waiter entries */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	dep_map;	/* set up by lockdep_init_map() in __six_lock_init() */
+#endif
+};
+
+struct six_lock_waiter {
+	struct list_head	list;	/* entry on the lock's waitlist */
+	struct task_struct	*task;	/* NOTE(review): presumably the waiting task - confirm */
+	enum six_lock_type	lock_want;	/* NOTE(review): presumably the lock type being waited for - confirm */
+	bool			lock_acquired;
+	u64			start_time;	/* timestamp; strictly ascending along a waitlist */
+};
+
+typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
+
+void six_lock_exit(struct six_lock *lock);
+
+enum six_lock_init_flags {
+	SIX_LOCK_INIT_PCPU	= 1U << 0,	/* ask __six_lock_init() to allocate percpu reader counts */
+};
+
+void __six_lock_init(struct six_lock *lock, const char *name,
+		     struct lock_class_key *key, enum six_lock_init_flags flags);
+
+/**
+ * six_lock_init - initialize a six lock
+ * @lock:	lock to initialize
+ * @flags:	optional flags, i.e. SIX_LOCK_INIT_PCPU
+ */
+#define six_lock_init(lock, flags)					\
+do {									\
+	static struct lock_class_key __key; /* one per expansion site */	\
+									\
+	__six_lock_init((lock), #lock, &__key, (flags));		\
+} while (0)
+
+/**
+ * six_lock_seq - obtain current lock sequence number
+ * @lock:	six_lock to obtain sequence number for
+ *
+ * @lock should be held for read or intent, and not write.
+ *
+ * By saving the lock sequence number, we can unlock @lock and then (typically
+ * after some blocking operation) attempt to relock it: the relock will succeed
+ * if the sequence number hasn't changed, meaning no write locks have been taken
+ * and state corresponding to what @lock protects is still valid.
+ */
+static inline u32 six_lock_seq(const struct six_lock *lock)
+{
+	return lock->seq;	/* plain read of the field, nothing else happens here */
+}
+
+bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
+
+/**
+ * six_trylock_type - attempt to take a six lock without blocking
+ * @lock:	lock to take
+ * @type:	SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ *
+ * Return: true on success, false on failure.
+ */
+static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
+{
+	return six_trylock_ip(lock, type, _THIS_IP_);	/* _THIS_IP_ is the ip handed on for lockdep/lockstat */
+}
+
+int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
+		       struct six_lock_waiter *wait,
+		       six_lock_should_sleep_fn should_sleep_fn, void *p,
+		       unsigned long ip);
+
+/**
+ * six_lock_waiter - take a lock, with full waitlist interface
+ * @lock:	lock to take
+ * @type:	SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @wait:	pointer to wait object, which will be added to lock's waitlist
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ *		to scheduling
+ * @p:		passed through to @should_sleep_fn
+ *
+ * This is a convenience wrapper around six_lock_ip_waiter(), see that function
+ * for full documentation.
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type,
+				  struct six_lock_waiter *wait,
+				  six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+	return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_);	/* only adds the ip argument */
+}
+
+/**
+ * six_lock_ip - take a six lock
+ * @lock:	lock to take
+ * @type:	SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ *		to scheduling
+ * @p:		passed through to @should_sleep_fn
+ * @ip:		ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type,
+			      six_lock_should_sleep_fn should_sleep_fn, void *p,
+			      unsigned long ip)
+{
+	struct six_lock_waiter wait;	/* on-stack wait object, passed on uninitialized */
+
+	return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip);
+}
+
+/**
+ * six_lock_type - take a six lock
+ * @lock:	lock to take
+ * @type:	SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ *		to scheduling
+ * @p:		passed through to @should_sleep_fn
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
+				six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+	struct six_lock_waiter wait;	/* on-stack wait object, passed on uninitialized */
+
+	return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_);
+}
+
+bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
+		   unsigned seq, unsigned long ip);
+
+/**
+ * six_relock_type - attempt to re-take a lock that was held previously
+ * @lock:	lock to take
+ * @type:	SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @seq:	lock sequence number obtained from six_lock_seq() while lock was
+ *		held previously
+ *
+ * Return: true on success, false on failure.
+ */
+static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
+				   unsigned seq)
+{
+	return six_relock_ip(lock, type, seq, _THIS_IP_);	/* only adds the ip argument */
+}
+
+void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
+
+/**
+ * six_unlock_type - drop a six lock
+ * @lock:	lock to unlock
+ * @type:	SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ *
+ * When a lock is held multiple times (because six_lock_increment() was used),
+ * this decrements the 'lock held' counter by one.
+ *
+ * For example:
+ * six_lock_read(&foo->lock);				read count 1
+ * six_lock_increment(&foo->lock, SIX_LOCK_read);	read count 2
+ * six_unlock_type(&foo->lock, SIX_LOCK_read);		read count 1
+ * six_unlock_type(&foo->lock, SIX_LOCK_read);		read count 0
+ */
+static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+	six_unlock_ip(lock, type, _THIS_IP_);
+}
+
+#define __SIX_LOCK(type)						\
+static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\
+{									\
+	return six_trylock_ip(lock, SIX_LOCK_##type, ip);		\
+}									\
+/* the plain variants below supply _THIS_IP_ to their _ip_ form */	\
+static inline bool six_trylock_##type(struct six_lock *lock)		\
+{									\
+	return six_trylock_ip_##type(lock, _THIS_IP_);			\
+}									\
+									\
+static inline int six_lock_ip_waiter_##type(struct six_lock *lock,	\
+			   struct six_lock_waiter *wait,		\
+			   six_lock_should_sleep_fn should_sleep_fn, void *p,\
+			   unsigned long ip)				\
+{									\
+	return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\
+}									\
+									\
+static inline int six_lock_ip_##type(struct six_lock *lock,		\
+		    six_lock_should_sleep_fn should_sleep_fn, void *p,	\
+		    unsigned long ip)					\
+{									\
+	return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\
+}									\
+									\
+static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\
+{									\
+	return six_relock_ip(lock, SIX_LOCK_##type, seq, ip);		\
+}									\
+									\
+static inline bool six_relock_##type(struct six_lock *lock, u32 seq)	\
+{									\
+	return six_relock_ip_##type(lock, seq, _THIS_IP_);		\
+}									\
+									\
+static inline int six_lock_##type(struct six_lock *lock,		\
+				  six_lock_should_sleep_fn should_sleep_fn, void *p)\
+{									\
+	return six_lock_ip_##type(lock, should_sleep_fn, p, _THIS_IP_);	\
+}									\
+									\
+static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip)	\
+{									\
+	six_unlock_ip(lock, SIX_LOCK_##type, ip);			\
+}									\
+									\
+static inline void six_unlock_##type(struct six_lock *lock)		\
+{									\
+	six_unlock_ip_##type(lock, _THIS_IP_);				\
+}
+
+__SIX_LOCK(read)
+__SIX_LOCK(intent)
+__SIX_LOCK(write)
+#undef __SIX_LOCK
+
+void six_lock_downgrade(struct six_lock *);
+bool six_lock_tryupgrade(struct six_lock *);
+bool six_trylock_convert(struct six_lock *, enum six_lock_type,
+			 enum six_lock_type);
+
+void six_lock_increment(struct six_lock *, enum six_lock_type);
+
+void six_lock_wakeup_all(struct six_lock *);
+
+struct six_lock_count {
+	unsigned n[3];	/* NOTE(review): presumably one count per enum six_lock_type value - confirm */
+};
+
+struct six_lock_count six_lock_counts(struct six_lock *);
+void six_lock_readers_add(struct six_lock *, int);
+
+#endif /* _LINUX_SIX_H */
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
new file mode 100644
index 000000000000..4982468bfe11
--- /dev/null
+++ b/fs/bcachefs/snapshot.c
@@ -0,0 +1,1689 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "errcode.h"
+#include "error.h"
+#include "fs.h"
+#include "snapshot.h"
+
+#include <linux/random.h>
+
+/*
+ * Snapshot trees:
+ *
+ * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
+ * exist to provide a stable identifier for the whole lifetime of a snapshot
+ * tree.
+ */
+
+void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
+				struct bkey_s_c k)
+{
+	struct bkey_s_c_snapshot_tree tree = bkey_s_c_to_snapshot_tree(k);
+	u32 master_subvol = le32_to_cpu(tree.v->master_subvol);
+	u32 root_snapshot = le32_to_cpu(tree.v->root_snapshot);
+
+	prt_printf(out, "subvol %u root snapshot %u", master_subvol, root_snapshot);
+}
+
+int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k,
+			       enum bkey_invalid_flags flags,
+			       struct printbuf *err)
+{
+	bool in_range = !bkey_lt(k.k->p, POS(0, 1)) &&
+			!bkey_gt(k.k->p, POS(0, U32_MAX));
+
+	if (in_range)
+		return 0;
+	prt_printf(err, "bad pos");	/* outside POS(0, 1)..POS(0, U32_MAX) */
+	return -BCH_ERR_invalid_bkey;
+}
+
+int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
+			      struct bch_snapshot_tree *s)
+{
+	int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
+					  BTREE_ITER_WITH_UPDATES, snapshot_tree, s);
+
+	/* report a missing tree with the snapshot_tree-specific ENOENT code */
+	return bch2_err_matches(ret, ENOENT)
+		? -BCH_ERR_ENOENT_snapshot_tree : ret;
+}
+
+struct bkey_i_snapshot_tree *
+__bch2_snapshot_tree_create(struct btree_trans *trans)
+{
+	struct bkey_i_snapshot_tree *new_tree;
+	struct btree_iter iter;
+	int ret;
+
+	ret = bch2_bkey_get_empty_slot(trans, &iter,
+			BTREE_ID_snapshot_trees, POS(0, U32_MAX));
+	if (ret == -BCH_ERR_ENOSPC_btree_slot)
+		return ERR_PTR(-BCH_ERR_ENOSPC_snapshot_tree);
+	if (ret)
+		return ERR_PTR(ret);
+
+	new_tree = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
+	bch2_trans_iter_exit(trans, &iter);	/* released whether or not the alloc succeeded */
+	return new_tree;			/* either the new key or an ERR_PTR() */
+}
+
+static int bch2_snapshot_tree_create(struct btree_trans *trans,
+				u32 root_id, u32 subvol_id, u32 *tree_id)
+{
+	struct bkey_i_snapshot_tree *new_tree;
+
+	new_tree = __bch2_snapshot_tree_create(trans);
+	if (IS_ERR(new_tree))
+		return PTR_ERR(new_tree);
+
+	*tree_id = new_tree->k.p.offset;	/* hand the allocated slot back to the caller */
+	new_tree->v.root_snapshot	= cpu_to_le32(root_id);
+	new_tree->v.master_subvol	= cpu_to_le32(subvol_id);
+	return 0;
+}
+
+/* Snapshot nodes: */
+
+static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
+{
+	struct snapshot_table *table;
+
+	rcu_read_lock();
+	table = rcu_dereference(c->snapshots);
+	/* follow ->parent links until we reach or pass @ancestor, or run out */
+	for (; id && id < ancestor; id = __snapshot_t(table, id)->parent)
+		;
+	rcu_read_unlock();
+
+	return id == ancestor;
+}
+
+static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
+{
+	const struct snapshot_t *s = __snapshot_t(t, id);
+	int i;
+
+	/* try skip[2] first, then skip[1], skip[0]: first one not past @ancestor wins */
+	for (i = 2; i >= 0; --i)
+		if (s->skip[i] <= ancestor)
+			return s->skip[i];
+
+	return s->parent;
+}
+
+bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
+{
+	struct snapshot_table *table;
+	bool is_ancestor;
+
+	EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots);
+
+	rcu_read_lock();
+	table = rcu_dereference(c->snapshots);
+
+	/* hop via the skiplist until @ancestor is within bitmap range of @id */
+	while (id && id < ancestor - IS_ANCESTOR_BITMAP)
+		id = get_ancestor_below(table, id, ancestor);
+
+	if (!id || id >= ancestor) {
+		is_ancestor = id == ancestor;
+	} else {
+		/* bit (ancestor - id - 1) of ->is_ancestor answers the question */
+		is_ancestor = test_bit(ancestor - id - 1, __snapshot_t(table, id)->is_ancestor);
+		EBUG_ON(is_ancestor != bch2_snapshot_is_ancestor_early(c, id, ancestor));
+	}
+
+	rcu_read_unlock();
+	return is_ancestor;
+}
+
+static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
+{
+	size_t idx = U32_MAX - id;
+	size_t new_size = max(16UL, roundup_pow_of_two(idx + 1));
+	struct snapshot_table *old, *new;
+
+	new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+	/* carry the existing entries over; the rest of the new table is zeroed */
+	old = rcu_dereference_protected(c->snapshots, true);
+	if (old)
+		memcpy(new->s, old->s,
+		       sizeof(new->s[0]) * c->snapshot_table_size);
+
+	/* publish the new table, then hand the old one to RCU for freeing */
+	rcu_assign_pointer(c->snapshots, new);
+	c->snapshot_table_size = new_size;
+	kvfree_rcu_mightsleep(old);
+
+	/* the table is indexed by U32_MAX - id */
+	return &new->s[idx];
+}
+
+static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
+{
+	size_t idx = U32_MAX - id;
+
+	lockdep_assert_held(&c->snapshot_table_lock);
+	/* slow path: the table has to grow before @id fits (may return NULL) */
+	if (unlikely(idx >= c->snapshot_table_size))
+		return __snapshot_t_mut(c, id);
+
+	return &rcu_dereference_protected(c->snapshots, true)->s[idx];
+}
+
+void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
+			   struct bkey_s_c k)
+{
+	struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
+
+	prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
+	       BCH_SNAPSHOT_SUBVOL(snap.v),
+	       BCH_SNAPSHOT_DELETED(snap.v),
+	       le32_to_cpu(snap.v->parent),
+	       le32_to_cpu(snap.v->children[0]),
+	       le32_to_cpu(snap.v->children[1]),
+	       le32_to_cpu(snap.v->subvol),
+	       le32_to_cpu(snap.v->tree));
+
+	/* values that end at or before the depth field carry no depth/skiplist */
+	if (bkey_val_bytes(k.k) <= offsetof(struct bch_snapshot, depth))
+		return;
+	prt_printf(out, " depth %u skiplist %u %u %u", le32_to_cpu(snap.v->depth),
+		   le32_to_cpu(snap.v->skip[0]), le32_to_cpu(snap.v->skip[1]),
+		   le32_to_cpu(snap.v->skip[2]));
+}
+
+int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k,
+			  enum bkey_invalid_flags flags,
+			  struct printbuf *err)
+{
+	struct bkey_s_c_snapshot s;
+	u32 i, id;
+
+	if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
+	    bkey_lt(k.k->p, POS(0, 1))) {	/* keys must lie in POS(0, 1)..POS(0, U32_MAX) */
+		prt_printf(err, "bad pos");
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	s = bkey_s_c_to_snapshot(k);
+
+	id = le32_to_cpu(s.v->parent);
+	if (id && id <= k.k->p.offset) {	/* a parent, if set, must have a larger id than this node */
+		prt_printf(err, "bad parent node (%u <= %llu)",
+		       id, k.k->p.offset);
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) {	/* normalized: children[0] >= children[1] */
+		prt_printf(err, "children not normalized");
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	if (s.v->children[0] &&
+	    s.v->children[0] == s.v->children[1]) {	/* two zero (absent) children are fine */
+		prt_printf(err, "duplicate child nodes");
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	for (i = 0; i < 2; i++) {
+		id = le32_to_cpu(s.v->children[i]);
+
+		if (id >= k.k->p.offset) {	/* children must have smaller ids than this node */
+			prt_printf(err, "bad child node (%u >= %llu)",
+			       id, k.k->p.offset);
+			return -BCH_ERR_invalid_bkey;
+		}
+	}
+
+	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {	/* only when the value extends past the start of skip[] */
+		if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
+		    le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) {	/* normalized: ascending order */
+			prt_printf(err, "skiplist not normalized");
+			return -BCH_ERR_invalid_bkey;
+		}
+
+		for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
+			id = le32_to_cpu(s.v->skip[i]);
+
+			if ((id && !s.v->parent) ||	/* no parent: skiplist must be empty */
+			    (id && id <= k.k->p.offset)) {	/* skip targets must have larger ids */
+				prt_printf(err, "bad skiplist node %u", id);
+				return -BCH_ERR_invalid_bkey;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
+{
+	struct snapshot_t *t = snapshot_t_mut(c, id);
+	u32 anc = bch2_snapshot_parent_early(c, id);
+
+	/* record each ancestor close enough to @id to fit in the bitmap */
+	for (; anc && anc - id - 1 < IS_ANCESTOR_BITMAP; anc = bch2_snapshot_parent_early(c, anc))
+		__set_bit(anc - id - 1, t->is_ancestor);
+}
+
+static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
+{
+	mutex_lock(&c->snapshot_table_lock);	/* snapshot_t_mut() asserts this lock is held */
+	__set_is_ancestor_bitmap(c, id);
+	mutex_unlock(&c->snapshot_table_lock);
+}
+
+int bch2_mark_snapshot(struct btree_trans *trans,
+		       enum btree_id btree, unsigned level,
+		       struct bkey_s_c old, struct bkey_s_c new,
+		       unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	struct snapshot_t *t;
+	u32 id = new.k->p.offset;
+	int ret = 0;
+
+	mutex_lock(&c->snapshot_table_lock);
+
+	t = snapshot_t_mut(c, id);	/* may grow the table; NULL if that allocation fails */
+	if (!t) {
+		ret = -BCH_ERR_ENOMEM_mark_snapshot;
+		goto err;
+	}
+
+	if (new.k->type == KEY_TYPE_snapshot) {	/* mirror the key's fields into the in-memory entry */
+		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
+
+		t->parent	= le32_to_cpu(s.v->parent);
+		t->children[0]	= le32_to_cpu(s.v->children[0]);
+		t->children[1]	= le32_to_cpu(s.v->children[1]);
+		t->subvol	= BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
+		t->tree		= le32_to_cpu(s.v->tree);
+
+		if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {	/* value is long enough to carry depth + skiplist */
+			t->depth	= le32_to_cpu(s.v->depth);
+			t->skip[0]	= le32_to_cpu(s.v->skip[0]);
+			t->skip[1]	= le32_to_cpu(s.v->skip[1]);
+			t->skip[2]	= le32_to_cpu(s.v->skip[2]);
+		} else {	/* shorter value: no depth/skiplist stored */
+			t->depth	= 0;
+			t->skip[0]	= 0;
+			t->skip[1]	= 0;
+			t->skip[2]	= 0;
+		}
+
+		__set_is_ancestor_bitmap(c, id);	/* lock already held, so use the __ variant */
+
+		if (BCH_SNAPSHOT_DELETED(s.v)) {
+			set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
+			c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_delete_dead_snapshots);
+		}
+	} else {	/* any other key type clears the in-memory entry */
+		memset(t, 0, sizeof(*t));
+	}
+err:
+	mutex_unlock(&c->snapshot_table_lock);
+	return ret;
+}
+
+int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
+			 struct bch_snapshot *s)
+{
+	return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),	/* snapshot keys live at POS(0, id) */
+				       BTREE_ITER_WITH_UPDATES, snapshot, s);
+}
+
+static int bch2_snapshot_live(struct btree_trans *trans, u32 id)
+{
+	struct bch_snapshot snap;
+	int ret;
+
+	/* id 0 stands for "no snapshot" and is reported as not live */
+	if (!id)
+		return 0;
+
+	ret = bch2_snapshot_lookup(trans, id, &snap);
+	if (!ret)
+		return !BCH_SNAPSHOT_DELETED(&snap);
+	if (bch2_err_matches(ret, ENOENT))
+		bch_err(trans->c, "snapshot node %u not found", id);
+	return ret;
+}
+
+/*
+ * If @k is a snapshot with just one live child, it's part of a linear chain,
+ * which we consider to be an equivalence class: and then after snapshot
+ * deletion cleanup, there should only be a single key at a given position in
+ * this equivalence class.
+ *
+ * This sets the equivalence class of @k to be the child's equivalence class, if
+ * it's part of such a linear chain: this correctly sets equivalence classes on
+ * startup if we run leaf to root (i.e. in natural key order).
+ */
+static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
+{
+	struct bch_fs *c = trans->c;
+	unsigned i, nr_live = 0, live_idx = 0;
+	struct bkey_s_c_snapshot snap;
+	struct snapshot_t *t;
+	u32 id = k.k->p.offset, child[2], equiv = id;
+
+	if (k.k->type != KEY_TYPE_snapshot)
+		return 0;
+
+	snap = bkey_s_c_to_snapshot(k);
+	child[0] = le32_to_cpu(snap.v->children[0]);
+	child[1] = le32_to_cpu(snap.v->children[1]);
+
+	for (i = 0; i < 2; i++) {
+		int ret = bch2_snapshot_live(trans, child[i]);
+		if (ret < 0)
+			return ret;
+		if (ret)
+			live_idx = i;
+		nr_live += ret;
+	}
+
+	mutex_lock(&c->snapshot_table_lock);
+	/* one lookup at a time: snapshot_t_mut() may reallocate the table or fail */
+	t = nr_live == 1 ? snapshot_t_mut(c, child[live_idx]) : NULL;
+	if (t)
+		equiv = t->equiv;
+	if (nr_live != 1 || t)
+		t = snapshot_t_mut(c, id);
+	if (t)
+		t->equiv = equiv;
+	mutex_unlock(&c->snapshot_table_lock);
+	return t ? 0 : -BCH_ERR_ENOMEM_mark_snapshot;	/* NULL: the table could not be grown */
+}
+
+/* fsck: */
+
+static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
+{
+	return snapshot_t(c, id)->children[child];	/* callers in this file pass 0 or 1 */
+}
+
+static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
+{
+	return bch2_snapshot_child(c, id, 0);	/* "left" is children[0] */
+}
+
+static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
+{
+	return bch2_snapshot_child(c, id, 1);	/* "right" is children[1] */
+}
+
+static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
+{
+	u32 next = bch2_snapshot_left_child(c, id);
+	u32 parent;
+
+	/* descend left when possible, else climb until a right sibling turns up */
+	if (next)
+		return next;
+
+	for (; (parent = bch2_snapshot_parent(c, id)); id = parent) {
+		next = bch2_snapshot_right_child(c, parent);
+		if (next && next != id)
+			return next;
+	}
+
+	return 0;
+}
+
+static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
+{
+	u32 oldest = 0, id;
+
+	/* "oldest" here is the smallest nonzero subvol id met during the walk */
+	for (id = snapshot_root; id; id = bch2_snapshot_tree_next(c, id)) {
+		u32 s = snapshot_t(c, id)->subvol;
+
+		if (!s)
+			continue;
+		if (!oldest || s < oldest)
+			oldest = s;
+	}
+
+	return oldest;
+}
+
+static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
+					    u32 snapshot_root, u32 *subvol_id)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bkey_s_c_subvolume s;
+	bool found = false;
+	int ret;
+
+	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
+				     0, k, ret) {
+		if (k.k->type != KEY_TYPE_subvolume)
+			continue;
+
+		s = bkey_s_c_to_subvolume(k);
+		if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
+			continue;
+		if (!BCH_SUBVOLUME_SNAP(s.v)) {	/* first non-snapshot subvolume in this tree wins */
+			*subvol_id = s.k->p.offset;
+			found = true;
+			break;
+		}
+	}
+
+	bch2_trans_iter_exit(trans, &iter);
+
+	if (!ret && !found) {	/* none found: clear the SNAP flag on the oldest subvolume instead */
+		struct bkey_i_subvolume *u;
+
+		*subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);
+		u = bch2_bkey_get_mut_typed(trans, &iter,
+					    BTREE_ID_subvolumes, POS(0, *subvol_id),
+					    0, subvolume);
+		ret = PTR_ERR_OR_ZERO(u);
+		if (ret)
+			return ret;
+		/* @iter was re-armed by the get_mut above: release it once done */
+		SET_BCH_SUBVOLUME_SNAP(&u->v, false);
+		bch2_trans_iter_exit(trans, &iter);
+	}
+
+	return ret;
+}
+
+static int check_snapshot_tree(struct btree_trans *trans,
+			       struct btree_iter *iter,
+			       struct bkey_s_c k)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_s_c_snapshot_tree st;
+	struct bch_snapshot s;
+	struct bch_subvolume subvol = {};	/* zeroed: read below even if the lookup hit ENOENT */
+	struct printbuf buf = PRINTBUF;
+	u32 root_id;
+	int ret;
+
+	if (k.k->type != KEY_TYPE_snapshot_tree)
+		return 0;
+
+	st = bkey_s_c_to_snapshot_tree(k);
+	root_id = le32_to_cpu(st.v->root_snapshot);
+
+	ret = bch2_snapshot_lookup(trans, root_id, &s);
+	if (ret && !bch2_err_matches(ret, ENOENT))
+		goto err;
+
+	if (fsck_err_on(ret ||	/* ret is ENOENT here; @s is only read when the lookup succeeded */
+			root_id != bch2_snapshot_root(c, root_id) ||
+			st.k->p.offset != le32_to_cpu(s.tree),
+			c,
+			"snapshot tree points to missing/incorrect snapshot:\n  %s",
+			(bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
+		ret = bch2_btree_delete_at(trans, iter, 0);
+		goto err;
+	}
+
+	ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol),
+				 false, 0, &subvol);
+	if (ret && !bch2_err_matches(ret, ENOENT))
+		goto err;
+
+	if (fsck_err_on(ret, c,
+			"snapshot tree points to missing subvolume:\n  %s",
+			(printbuf_reset(&buf),
+			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
+	    fsck_err_on(!bch2_snapshot_is_ancestor_early(c,
+						le32_to_cpu(subvol.snapshot),
+						root_id), c,
+			"snapshot tree points to subvolume that does not point to snapshot in this tree:\n  %s",
+			(printbuf_reset(&buf),
+			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
+	    fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c,
+			"snapshot tree points to snapshot subvolume:\n  %s",
+			(printbuf_reset(&buf),
+			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
+		struct bkey_i_snapshot_tree *u;
+		u32 subvol_id;
+
+		ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);	/* pick a replacement master */
+		if (ret)
+			goto err;
+
+		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
+		ret = PTR_ERR_OR_ZERO(u);
+		if (ret)
+			goto err;
+
+		u->v.master_subvol = cpu_to_le32(subvol_id);
+		st = snapshot_tree_i_to_s_c(u);
+	}
+err:
+fsck_err:
+	printbuf_exit(&buf);
+	return ret;
+}
+
+/*
+ * For each snapshot_tree, make sure it points to the root of a snapshot tree
+ * and that snapshot entry points back to it, or delete it.
+ *
+ * And, make sure it points to a subvolume within that snapshot tree, or correct
+ * it to point to the oldest subvolume within that snapshot tree.
+ */
+int bch2_check_snapshot_trees(struct bch_fs *c)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	ret = bch2_trans_run(c,
+		for_each_btree_key_commit(trans, iter,
+			BTREE_ID_snapshot_trees, POS_MIN,
+			BTREE_ITER_PREFETCH, k,
+			NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+		check_snapshot_tree(trans, &iter, k)));	/* the per-key check and repair */
+
+	if (ret)
+		bch_err(c, "error %i checking snapshot trees", ret);
+	return ret;	/* 0, or the first error the walk produced */
+}
+
+/*
+ * Fetch the snapshot tree @tree_id, take its root snapshot, and report whether
+ * @snap_id is a descendant of that root (a missing tree counts as "no"):
+ */
+static int snapshot_tree_ptr_good(struct btree_trans *trans,
+				  u32 snap_id, u32 tree_id)
+{
+	struct bch_snapshot_tree tree;
+	int ret;
+
+	ret = bch2_snapshot_tree_lookup(trans, tree_id, &tree);
+	/* ENOENT is an answer (0 = not good); any other failure is passed up */
+	if (ret)
+		return bch2_err_matches(ret, ENOENT) ? 0 : ret;
+
+	return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(tree.root_snapshot));
+}
+
+u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
+{
+	const struct snapshot_t *s;
+
+	if (id) {
+		rcu_read_lock();
+		s = snapshot_t(c, id);
+		/* with a parent: go up a random (< depth) number of parents */
+		if (s->parent)
+			id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
+		rcu_read_unlock();
+	}
+
+	return id;
+}
+
+static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
+{
+	unsigned i;
+
+	for (i = 0; i < 3; i++) {
+		/* a node without a parent must have an empty skiplist ... */
+		if (!s.parent && s.skip[i])
+			return false;
+		/* ... any other node may only skip to its own ancestors */
+		if (s.parent &&
+		    !bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
+			return false;
+	}
+	return true;
+}
+
+/*
+ * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
+ * its snapshot_tree pointer is correct (allocate new one if necessary), then
+ * update this node's pointer to root node's pointer:
+ */
+static int snapshot_tree_ptr_repair(struct btree_trans *trans,
+				    struct btree_iter *iter,
+				    struct bkey_s_c k,
+				    struct bch_snapshot *s)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter root_iter;
+	struct bch_snapshot_tree s_t;
+	struct bkey_s_c_snapshot root;
+	struct bkey_i_snapshot *u;
+	u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
+	int ret;
+
+	root = bch2_bkey_get_iter_typed(trans, &root_iter,
+			       BTREE_ID_snapshots, POS(0, root_id),
+			       BTREE_ITER_WITH_UPDATES, snapshot);
+	ret = bkey_err(root);
+	if (ret)
+		goto err;
+
+	tree_id = le32_to_cpu(root.v->tree);
+
+	ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
+	if (ret && !bch2_err_matches(ret, ENOENT))
+		goto err;	/* root_iter is live here and must be released */
+
+	if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {	/* tree missing, or it belongs to another root */
+		u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
+		ret =   PTR_ERR_OR_ZERO(u) ?:
+			bch2_snapshot_tree_create(trans, root_id,
+				bch2_snapshot_tree_oldest_subvol(c, root_id),
+				&tree_id);
+		if (ret)
+			goto err;
+
+		u->v.tree = cpu_to_le32(tree_id);
+		if (k.k->p.offset == root_id)	/* @k is the root itself: report its new value */
+			*s = u->v;
+	}
+
+	if (k.k->p.offset != root_id) {	/* non-root node: point it at the root's tree */
+		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
+		ret = PTR_ERR_OR_ZERO(u);
+		if (ret)
+			goto err;
+
+		u->v.tree = cpu_to_le32(tree_id);
+		*s = u->v;
+	}
+err:
+	bch2_trans_iter_exit(trans, &root_iter);
+	return ret;
+}
+
+static int check_snapshot(struct btree_trans *trans,
+			  struct btree_iter *iter,
+			  struct bkey_s_c k)
+{	/* fsck one snapshots-btree key: links, subvol, tree pointer, depth, skiplist */
+	struct bch_fs *c = trans->c;
+	struct bch_snapshot s;
+	struct bch_subvolume subvol;
+	struct bch_snapshot v;
+	struct bkey_i_snapshot *u;
+	u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
+	u32 real_depth;
+	struct printbuf buf = PRINTBUF;
+	bool should_have_subvol;
+	u32 i, id;
+	int ret = 0;
+
+	if (k.k->type != KEY_TYPE_snapshot)	/* other key types are not checked */
+		return 0;
+
+	memset(&s, 0, sizeof(s));	/* zero first: the on-disk value may be shorter than s */
+	memcpy(&s, k.v, bkey_val_bytes(k.k));
+
+	id = le32_to_cpu(s.parent);
+	if (id) {	/* the parent must exist and list this node as one of its children */
+		ret = bch2_snapshot_lookup(trans, id, &v);
+		if (bch2_err_matches(ret, ENOENT))
+			bch_err(c, "snapshot with nonexistent parent:\n  %s",
+				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+		if (ret)
+			goto err;
+
+		if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
+		    le32_to_cpu(v.children[1]) != k.k->p.offset) {
+			bch_err(c, "snapshot parent %u missing pointer to child %llu",
+				id, k.k->p.offset);
+			ret = -EINVAL;
+			goto err;
+		}
+	}
+
+	for (i = 0; i < 2 && s.children[i]; i++) {	/* each child must exist and point back here */
+		id = le32_to_cpu(s.children[i]);
+
+		ret = bch2_snapshot_lookup(trans, id, &v);
+		if (bch2_err_matches(ret, ENOENT))
+			bch_err(c, "snapshot node %llu has nonexistent child %u",
+				k.k->p.offset, id);
+		if (ret)
+			goto err;
+
+		if (le32_to_cpu(v.parent) != k.k->p.offset) {
+			bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
+				id, le32_to_cpu(v.parent), k.k->p.offset);
+			ret = -EINVAL;
+			goto err;
+		}
+	}
+
+	should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
+		!BCH_SNAPSHOT_DELETED(&s);
+
+	if (should_have_subvol) {	/* the subvolume must exist and point back at this node */
+		id = le32_to_cpu(s.subvol);
+		ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
+		if (bch2_err_matches(ret, ENOENT))
+			bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
+				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+		if (ret)
+			goto err;
+
+		if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
+			bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
+				k.k->p.offset);
+			ret = -EINVAL;
+			goto err;
+		}
+	} else {	/* no subvolume expected: a nonzero subvol field is cleared as a repair */
+		if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n  %s",
+				(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+			u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
+			ret = PTR_ERR_OR_ZERO(u);
+			if (ret)
+				goto err;
+
+			u->v.subvol = 0;
+			s = u->v;
+		}
+	}
+
+	ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
+	if (ret < 0)	/* negative: error; zero: pointer is bad; positive: pointer is good */
+		goto err;
+
+	if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n  %s",
+			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+		ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
+		if (ret)
+			goto err;
+	}
+	ret = 0;	/* discard the positive "good" result before continuing */
+
+	real_depth = bch2_snapshot_depth(c, parent_id);
+
+	if (le32_to_cpu(s.depth) != real_depth &&	/* fixed without asking before the skiplist upgrade completes */
+	    (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
+	     fsck_err(c, "snapshot with incorrect depth field, should be %u:\n  %s",
+		      real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
+		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
+		ret = PTR_ERR_OR_ZERO(u);
+		if (ret)
+			goto err;
+
+		u->v.depth = cpu_to_le32(real_depth);
+		s = u->v;
+	}
+
+	ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
+	if (ret < 0)
+		goto err;
+
+	if (!ret &&	/* bad skiplist: regenerate all entries from the parent and sort them */
+	    (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
+	     fsck_err(c, "snapshot with bad skiplist field:\n  %s",
+		      (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
+		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
+		ret = PTR_ERR_OR_ZERO(u);
+		if (ret)
+			goto err;
+
+		for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
+			u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));
+
+		bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
+		s = u->v;
+	}
+	ret = 0;
+err:
+fsck_err:
+	printbuf_exit(&buf);	/* buf is released on every exit path past the type check */
+	return ret;
+}
+
+int bch2_check_snapshots(struct bch_fs *c)
+{	/* Run check_snapshot() over the snapshots btree, starting from POS_MAX */
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	/*
+	 * We iterate backwards as checking/fixing the depth field requires that
+	 * the parent's depth already be correct:
+	 */
+	ret = bch2_trans_run(c,
+		for_each_btree_key_reverse_commit(trans, iter,
+			BTREE_ID_snapshots, POS_MAX,
+			BTREE_ITER_PREFETCH, k,
+			NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+		check_snapshot(trans, &iter, k)));
+	if (ret)	/* log the failure; the error is still returned to the caller */
+		bch_err_fn(c, ret);
+	return ret;
+}
+
+/*
+ * Mark snapshot @id as deleted (and clear its subvol link), for future cleanup:
+ */
+int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
+{
+	struct btree_iter iter;
+	struct bkey_i_snapshot *s;
+	int ret = 0;
+
+	s = bch2_bkey_get_mut_typed(trans, &iter,
+				    BTREE_ID_snapshots, POS(0, id),
+				    0, snapshot);
+	ret = PTR_ERR_OR_ZERO(s);
+	if (unlikely(ret)) {	/* a missing snapshot key is reported as a filesystem inconsistency */
+		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
+					trans->c, "missing snapshot %u", id);
+		return ret;
+	}
+
+	/* already deleted? */
+	if (BCH_SNAPSHOT_DELETED(&s->v))
+		goto err;	/* nothing to change; ret is still 0 */
+
+	SET_BCH_SNAPSHOT_DELETED(&s->v, true);
+	SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
+	s->v.subvol = 0;
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
+{	/* Order the two child ids so that children[0] >= children[1] */
+	if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
+		swap(s->children[0], s->children[1]);
+}
+
+static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
+{	/* Unlink snapshot @id (at most one child) from parent/child/tree and delete its key */
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
+	struct btree_iter c_iter = (struct btree_iter) { NULL };
+	struct btree_iter tree_iter = (struct btree_iter) { NULL };
+	struct bkey_s_c_snapshot s;
+	u32 parent_id, child_id;
+	unsigned i;
+	int ret = 0;
+
+	s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
+				     BTREE_ITER_INTENT, snapshot);
+	ret = bkey_err(s);
+	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+				"missing snapshot %u", id);
+
+	if (ret)
+		goto err;
+
+	BUG_ON(s.v->children[1]);	/* only nodes with at most one child may be deleted */
+
+	parent_id = le32_to_cpu(s.v->parent);
+	child_id = le32_to_cpu(s.v->children[0]);
+
+	if (parent_id) {	/* splice our only child (or 0) into the parent's slot for us */
+		struct bkey_i_snapshot *parent;
+
+		parent = bch2_bkey_get_mut_typed(trans, &p_iter,
+				     BTREE_ID_snapshots, POS(0, parent_id),
+				     0, snapshot);
+		ret = PTR_ERR_OR_ZERO(parent);
+		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+					"missing snapshot %u", parent_id);
+		if (unlikely(ret))
+			goto err;
+
+		/* find entry in parent->children for node being deleted */
+		for (i = 0; i < 2; i++)
+			if (le32_to_cpu(parent->v.children[i]) == id)
+				break;
+
+		if (bch2_fs_inconsistent_on(i == 2, c,
+					"snapshot %u missing child pointer to %u",
+					parent_id, id))
+			goto err;	/* NOTE(review): ret is still 0 on this path - confirm intended */
+
+		parent->v.children[i] = cpu_to_le32(child_id);	/* CPU-order id into an __le32 field */
+
+		normalize_snapshot_child_pointers(&parent->v);
+	}
+
+	if (child_id) {	/* re-parent our child to our parent (0 makes it a root) */
+		struct bkey_i_snapshot *child;
+
+		child = bch2_bkey_get_mut_typed(trans, &c_iter,
+				     BTREE_ID_snapshots, POS(0, child_id),
+				     0, snapshot);
+		ret = PTR_ERR_OR_ZERO(child);
+		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+					"missing snapshot %u", child_id);
+		if (unlikely(ret))
+			goto err;
+
+		child->v.parent = cpu_to_le32(parent_id);
+
+		if (!child->v.parent) {	/* a node without a parent carries an all-zero skiplist */
+			child->v.skip[0] = 0;
+			child->v.skip[1] = 0;
+			child->v.skip[2] = 0;
+		}
+	}
+
+	if (!parent_id) {
+		/*
+		 * We're deleting the root of a snapshot tree: update the
+		 * snapshot_tree entry to point to the new root, or delete it if
+		 * this is the last snapshot ID in this tree:
+		 */
+		struct bkey_i_snapshot_tree *s_t;
+
+		BUG_ON(s.v->children[1]);
+
+		s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
+				BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
+				0, snapshot_tree);
+		ret = PTR_ERR_OR_ZERO(s_t);
+		if (ret)
+			goto err;
+
+		if (s.v->children[0]) {
+			s_t->v.root_snapshot = s.v->children[0];	/* both sides are little-endian */
+		} else {
+			s_t->k.type = KEY_TYPE_deleted;
+			set_bkey_val_u64s(&s_t->k, 0);
+		}
+	}
+
+	ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+	bch2_trans_iter_exit(trans, &tree_iter);
+	bch2_trans_iter_exit(trans, &p_iter);
+	bch2_trans_iter_exit(trans, &c_iter);
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
+			  u32 *new_snapids,
+			  u32 *snapshot_subvols,
+			  unsigned nr_snapids)
+{	/* Allocate @nr_snapids snapshot keys under @parent in @tree; ids go to @new_snapids */
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct bkey_i_snapshot *n;
+	struct bkey_s_c k;
+	unsigned i, j;
+	u32 depth = bch2_snapshot_depth(c, parent);
+	int ret;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
+			     POS_MIN, BTREE_ITER_INTENT);
+	k = bch2_btree_iter_peek(&iter);	/* position at the first existing key */
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	for (i = 0; i < nr_snapids; i++) {
+		k = bch2_btree_iter_prev_slot(&iter);	/* step back one slot; its offset becomes the new id */
+		ret = bkey_err(k);
+		if (ret)
+			goto err;
+
+		if (!k.k || !k.k->p.offset) {	/* no slot, or offset 0: no id available */
+			ret = -BCH_ERR_ENOSPC_snapshot_create;
+			goto err;
+		}
+
+		n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
+		ret = PTR_ERR_OR_ZERO(n);
+		if (ret)
+			goto err;
+
+		n->v.flags	= 0;
+		n->v.parent	= cpu_to_le32(parent);
+		n->v.subvol	= cpu_to_le32(snapshot_subvols[i]);
+		n->v.tree	= cpu_to_le32(tree);
+		n->v.depth	= cpu_to_le32(depth);
+
+		for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
+			n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));
+
+		bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
+		SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
+
+		ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
+					 bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
+		if (ret)
+			goto err;
+
+		new_snapids[i]	= iter.pos.offset;
+
+		mutex_lock(&c->snapshot_table_lock);	/* a new node starts as its own equiv */
+		snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i];
+		mutex_unlock(&c->snapshot_table_lock);
+	}
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+/*
+ * Create new snapshot IDs as children of an existing, childless snapshot ID:
+ */
+static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
+			      u32 *new_snapids,
+			      u32 *snapshot_subvols,
+			      unsigned nr_snapids)
+{
+	struct btree_iter iter;
+	struct bkey_i_snapshot *n_parent;
+	int ret = 0;
+
+	n_parent = bch2_bkey_get_mut_typed(trans, &iter,
+			BTREE_ID_snapshots, POS(0, parent),
+			0, snapshot);
+	ret = PTR_ERR_OR_ZERO(n_parent);
+	if (unlikely(ret)) {
+		if (bch2_err_matches(ret, ENOENT))
+			bch_err(trans->c, "snapshot %u not found", parent);
+		return ret;
+	}
+
+	if (n_parent->v.children[0] || n_parent->v.children[1]) {	/* parent must have no children yet */
+		bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
+		ret = -EINVAL;
+		goto err;
+	}
+
+	ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
+			     new_snapids, snapshot_subvols, nr_snapids);
+	if (ret)
+		goto err;
+
+	n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);	/* reads two ids: caller passes nr_snapids == 2 */
+	n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
+	n_parent->v.subvol = 0;	/* the parent stops pointing at a subvolume */
+	SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+/*
+ * Create a snapshot node that is the root of a new (freshly allocated) tree:
+ */
+static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
+			      u32 *new_snapids,
+			      u32 *snapshot_subvols,
+			      unsigned nr_snapids)
+{
+	struct bkey_i_snapshot_tree *n_tree;
+	int ret;
+
+	n_tree = __bch2_snapshot_tree_create(trans);
+	ret =   PTR_ERR_OR_ZERO(n_tree) ?:	/* create_snapids() runs only if n_tree is valid */
+		create_snapids(trans, 0, n_tree->k.p.offset,
+			     new_snapids, snapshot_subvols, nr_snapids);
+	if (ret)
+		return ret;
+
+	n_tree->v.master_subvol	= cpu_to_le32(snapshot_subvols[0]);	/* first entry of each array */
+	n_tree->v.root_snapshot	= cpu_to_le32(new_snapids[0]);
+	return 0;
+}
+
+int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
+			      u32 *new_snapids,
+			      u32 *snapshot_subvols,
+			      unsigned nr_snapids)
+{	/* Dispatch: parent == 0 creates a new tree root, otherwise two children of @parent */
+	BUG_ON((parent == 0) != (nr_snapids == 1));	/* a new root takes exactly one id */
+	BUG_ON((parent != 0) != (nr_snapids == 2));	/* children are created exactly two at a time */
+
+	return parent
+		? bch2_snapshot_node_create_children(trans, parent,
+				new_snapids, snapshot_subvols, nr_snapids)
+		: bch2_snapshot_node_create_tree(trans,
+				new_snapids, snapshot_subvols, nr_snapids);
+
+}
+
+/*
+ * If we have an unlinked inode in an internal snapshot node, and the inode
+ * really has been deleted in all child snapshots, how does this get cleaned up?
+ *
+ * first there is the problem of how keys that have been overwritten in all
+ * child snapshots get deleted (unimplemented?), but inodes may perhaps be
+ * special?
+ *
+ * also: unlinked inode in internal snapshot appears to not be getting deleted
+ * correctly if inode doesn't exist in leaf snapshots
+ *
+ * solution:
+ *
+ * for a key in an interior snapshot node that needs work to be done that
+ * requires it to be mutated: iterate over all descendent leaf nodes and copy
+ * that key to snapshot leaf nodes, where we can mutate it
+ */
+
+static int snapshot_delete_key(struct btree_trans *trans,
+			       struct btree_iter *iter,
+			       struct bkey_s_c k,
+			       snapshot_id_list *deleted,
+			       snapshot_id_list *equiv_seen,
+			       struct bpos *last_pos)
+{	/* Delete @k if its snapshot is deleted or its equiv class was already seen here */
+	struct bch_fs *c = trans->c;
+	u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
+	if (!bkey_eq(k.k->p, *last_pos))	/* new position (per bkey_eq): restart the seen list */
+		equiv_seen->nr = 0;
+	*last_pos = k.k->p;
+
+	if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
+	    snapshot_list_has_id(equiv_seen, equiv)) {
+		return bch2_btree_delete_at(trans, iter,
+					    BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+	} else {	/* first key of this equivalence class at this position: keep and record it */
+		return snapshot_list_add(c, equiv_seen, equiv);
+	}
+}
+
+static int move_key_to_correct_snapshot(struct btree_trans *trans,
+			       struct btree_iter *iter,
+			       struct bkey_s_c k)
+{	/* Rewrite @k under its equivalence-class snapshot id if it is not already there */
+	struct bch_fs *c = trans->c;
+	u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
+	/*
+	 * When we have a linear chain of snapshot nodes, we consider
+	 * those to form an equivalence class: we're going to collapse
+	 * them all down to a single node, and keep the leaf-most node -
+	 * which has the same id as the equivalence class id.
+	 *
+	 * If there are multiple keys in different snapshots at the same
+	 * position, we're only going to keep the one in the newest
+	 * snapshot - the rest have been overwritten and are redundant,
+	 * and for the key we're going to keep we need to move it to the
+	 * equivalence class ID if it's not there already.
+	 */
+	if (equiv != k.k->p.snapshot) {
+		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+		struct btree_iter new_iter;
+		int ret;
+
+		ret = PTR_ERR_OR_ZERO(new);
+		if (ret)
+			return ret;
+
+		new->k.p.snapshot = equiv;
+
+		bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p,
+				     BTREE_ITER_ALL_SNAPSHOTS|
+				     BTREE_ITER_CACHED|
+				     BTREE_ITER_INTENT);
+
+		ret =   bch2_btree_iter_traverse(&new_iter) ?:	/* insert the copy, then delete the original */
+			bch2_trans_update(trans, &new_iter, new,
+					BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+			bch2_btree_delete_at(trans, iter,
+					BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+		bch2_trans_iter_exit(trans, &new_iter);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * For a given snapshot, if it doesn't have a subvolume that points to it, and
+ * it doesn't have live child snapshot nodes - it's now redundant and we can
+ * mark it as deleted.
+ */
+static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter,
+					  struct bkey_s_c k)
+{
+	struct bkey_s_c_snapshot snap;
+	u32 children[2];
+	int ret;
+
+	if (k.k->type != KEY_TYPE_snapshot)
+		return 0;
+
+	snap = bkey_s_c_to_snapshot(k);
+	if (BCH_SNAPSHOT_DELETED(snap.v) ||	/* already deleted, or still owned by a subvolume */
+	    BCH_SNAPSHOT_SUBVOL(snap.v))
+		return 0;
+
+	children[0] = le32_to_cpu(snap.v->children[0]);
+	children[1] = le32_to_cpu(snap.v->children[1]);
+
+	ret   = bch2_snapshot_live(trans, children[0]) ?:	/* nonzero if either child is live, or on error */
+		bch2_snapshot_live(trans, children[1]);
+	if (ret < 0)
+		return ret;
+
+	if (!ret)	/* neither child reported live */
+		return bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
+	return 0;
+}
+
+static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
+						snapshot_id_list *skip)
+{	/* Like an n-th parent walk from @id, but ids listed in @skip are stepped over */
+	rcu_read_lock();
+	while (snapshot_list_has_id(skip, id))	/* first move the start off any skipped id */
+		id = __bch2_snapshot_parent(c, id);
+
+	while (n--) {	/* each of the @n steps lands on a parent that is not in @skip */
+		do {
+			id = __bch2_snapshot_parent(c, id);
+		} while (snapshot_list_has_id(skip, id));
+	}
+	rcu_read_unlock();
+
+	return id;
+}
+
+static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
+					      struct btree_iter *iter, struct bkey_s_c k,
+					      snapshot_id_list *deleted)
+{	/* Fix depth and skiplist of a surviving node whose ancestors are in @deleted */
+	struct bch_fs *c = trans->c;
+	u32 nr_deleted_ancestors = 0;
+	struct bkey_i_snapshot *s;
+	u32 *i;
+	int ret;
+
+	if (k.k->type != KEY_TYPE_snapshot)
+		return 0;
+
+	if (snapshot_list_has_id(deleted, k.k->p.offset))	/* this node is itself going away */
+		return 0;
+
+	s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
+	ret = PTR_ERR_OR_ZERO(s);
+	if (ret)
+		return ret;
+
+	darray_for_each(*deleted, i)	/* count deleted ids that are ancestors of this node */
+		nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i);
+
+	if (!nr_deleted_ancestors)	/* nothing above us is deleted: no update queued */
+		return 0;
+
+	le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);
+
+	if (!s->v.depth) {	/* depth 0: skiplist is cleared */
+		s->v.skip[0] = 0;
+		s->v.skip[1] = 0;
+		s->v.skip[2] = 0;
+	} else {
+		u32 depth = le32_to_cpu(s->v.depth);
+		u32 parent = bch2_snapshot_parent(c, s->k.p.offset);
+
+		for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
+			u32 id = le32_to_cpu(s->v.skip[j]);
+
+			if (snapshot_list_has_id(deleted, id)) {	/* entry names a deleted node: replace it */
+				id = depth > 1
+					? bch2_snapshot_nth_parent_skip(c,
+							parent,
+							get_random_u32_below(depth - 1),
+							deleted)
+					: parent;
+				s->v.skip[j] = cpu_to_le32(id);
+			}
+		}
+
+		bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
+	}
+
+	return bch2_trans_update(trans, iter, &s->k_i, 0);
+}
+
+int bch2_delete_dead_snapshots(struct bch_fs *c)
+{	/* Find dead snapshots, drop their keys in every snapshotted btree, then delete the nodes */
+	struct btree_trans *trans;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bkey_s_c_snapshot snap;
+	snapshot_id_list deleted = { 0 };
+	snapshot_id_list deleted_interior = { 0 };
+	u32 *i, id;
+	int ret = 0;
+
+	if (!test_bit(BCH_FS_STARTED, &c->flags)) {	/* not started yet: go read-write early */
+		ret = bch2_fs_read_write_early(c);
+		if (ret) {
+			bch_err_msg(c, ret, "deleting dead snapshots: error going rw");
+			return ret;
+		}
+	}
+
+	trans = bch2_trans_get(c);
+
+	/*
+	 * For every snapshot node: If we have no live children and it's not
+	 * pointed to by a subvolume, delete it:
+	 */
+	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots,
+			POS_MIN, 0, k,
+			NULL, NULL, 0,
+		bch2_delete_redundant_snapshot(trans, &iter, k));
+	if (ret) {
+		bch_err_msg(c, ret, "deleting redundant snapshots");
+		goto err;
+	}
+
+	ret = for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
+				  POS_MIN, 0, k,
+		bch2_snapshot_set_equiv(trans, k));
+	if (ret) {
+		bch_err_msg(c, ret, "in bch2_snapshots_set_equiv");
+		goto err;
+	}
+
+	for_each_btree_key(trans, iter, BTREE_ID_snapshots,	/* collect ids flagged deleted */
+			   POS_MIN, 0, k, ret) {
+		if (k.k->type != KEY_TYPE_snapshot)
+			continue;
+
+		snap = bkey_s_c_to_snapshot(k);
+		if (BCH_SNAPSHOT_DELETED(snap.v)) {
+			ret = snapshot_list_add(c, &deleted, k.k->p.offset);
+			if (ret)
+				break;
+		}
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
+	if (ret) {
+		bch_err_msg(c, ret, "walking snapshots");
+		goto err;
+	}
+
+	for (id = 0; id < BTREE_ID_NR; id++) {	/* two passes per btree: delete, then move to equiv */
+		struct bpos last_pos = POS_MIN;
+		snapshot_id_list equiv_seen = { 0 };
+		struct disk_reservation res = { 0 };
+
+		if (!btree_type_has_snapshots(id))
+			continue;
+
+		ret = for_each_btree_key_commit(trans, iter,
+				id, POS_MIN,
+				BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+				&res, NULL, BTREE_INSERT_NOFAIL,
+			snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
+		      for_each_btree_key_commit(trans, iter,
+				id, POS_MIN,
+				BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+				&res, NULL, BTREE_INSERT_NOFAIL,
+			move_key_to_correct_snapshot(trans, &iter, k));
+
+		bch2_disk_reservation_put(c, &res);
+		darray_exit(&equiv_seen);
+
+		if (ret) {
+			bch_err_msg(c, ret, "deleting keys from dying snapshots");
+			goto err;
+		}
+	}
+
+	down_write(&c->snapshot_create_lock);
+
+	for_each_btree_key(trans, iter, BTREE_ID_snapshots,	/* ids that differ from their equiv */
+			   POS_MIN, 0, k, ret) {
+		u32 snapshot = k.k->p.offset;
+
+		if (bch2_snapshot_equiv(c, snapshot) != snapshot &&
+		    (ret = snapshot_list_add(c, &deleted_interior, snapshot)))
+			break;	/* propagate the list allocation failure instead of dropping it */
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
+	if (ret)
+		goto err_create_lock;
+
+	/*
+	 * Fixing children of deleted snapshots can't be done completely
+	 * atomically, if we crash between here and when we delete the interior
+	 * nodes some depth fields will be off:
+	 */
+	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
+				  BTREE_ITER_INTENT, k,
+				  NULL, NULL, BTREE_INSERT_NOFAIL,
+		bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
+	if (ret)
+		goto err_create_lock;
+
+	darray_for_each(deleted, i) {
+		ret = commit_do(trans, NULL, NULL, 0,
+			bch2_snapshot_node_delete(trans, *i));
+		if (ret) {
+			bch_err_msg(c, ret, "deleting snapshot %u", *i);
+			goto err_create_lock;
+		}
+	}
+
+	darray_for_each(deleted_interior, i) {
+		ret = commit_do(trans, NULL, NULL, 0,
+			bch2_snapshot_node_delete(trans, *i));
+		if (ret) {
+			bch_err_msg(c, ret, "deleting snapshot %u", *i);
+			goto err_create_lock;
+		}
+	}
+
+	clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);	/* reached only when every step succeeded */
+err_create_lock:
+	up_write(&c->snapshot_create_lock);
+err:
+	darray_exit(&deleted_interior);
+	darray_exit(&deleted);
+	bch2_trans_put(trans);
+	if (ret)
+		bch_err_fn(c, ret);
+	return ret;
+}
+
+void bch2_delete_dead_snapshots_work(struct work_struct *work)
+{	/* Work item: run the deletion pass if flagged, then drop the write ref */
+	struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
+
+	if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags))
+		bch2_delete_dead_snapshots(c);	/* return value is not examined here */
+	bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
+}
+
+void bch2_delete_dead_snapshots_async(struct bch_fs *c)
+{	/* Take a write ref and queue the work item; give the ref back if queue_work() returns false */
+	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
+	    !queue_work(c->write_ref_wq, &c->snapshot_delete_work))
+		bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
+}
+
+int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
+				    struct btree_trans_commit_hook *h)
+{	/* Commit hook: flag that deleted snapshots exist and, past recovery, queue cleanup */
+	struct bch_fs *c = trans->c;
+
+	set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
+
+	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots)
+		return 0;	/* recovery has not gone past that pass: only the flag is set */
+
+	bch2_delete_dead_snapshots_async(c);
+	return 0;	/* always 0; @h is unused */
+}
+
+int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
+				       enum btree_id id,
+				       struct bpos pos)
+{	/* Returns 1 if an earlier key at @pos is in a descendant snapshot, 0 if none, <0 on error */
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	bch2_trans_iter_init(trans, &iter, id, pos,
+			     BTREE_ITER_NOT_EXTENTS|
+			     BTREE_ITER_ALL_SNAPSHOTS);
+	while (1) {
+		k = bch2_btree_iter_prev(&iter);	/* walk backwards from @pos */
+		ret = bkey_err(k);
+		if (ret)
+			break;
+
+		if (!k.k)	/* ran off the start of the btree */
+			break;
+
+		if (!bkey_eq(pos, k.k->p))	/* left @pos (as compared by bkey_eq) */
+			break;
+
+		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
+			ret = 1;	/* pos.snapshot is an ancestor of this key's snapshot */
+			break;
+		}
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
+	return ret;
+}
+
+static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id)
+{	/* Returns children[1] if nonzero, else children[0]; 0 when @id has no children */
+	const struct snapshot_t *s = snapshot_t(c, id);	/* NOTE(review): no rcu_read_lock() taken here - confirm caller context */
+
+	return s->children[1] ?: s->children[0];
+}
+
+static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id)
+{	/* Follow bch2_snapshot_smallest_child() down from @id until a node has no children */
+	u32 child;
+
+	while ((child = bch2_snapshot_smallest_child(c, id)))
+		id = child;
+	return id;	/* @id itself when it has no children */
+}
+
+static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans,
+					       enum btree_id btree,
+					       struct bkey_s_c interior_k,
+					       u32 leaf_id, struct bpos *new_min_pos)
+{	/* Copy @interior_k into snapshot @leaf_id unless the leaf already sees a different key */
+	struct btree_iter iter;
+	struct bpos pos = interior_k.k->p;
+	struct bkey_s_c k;
+	struct bkey_i *new;
+	int ret;
+
+	pos.snapshot = leaf_id;
+
+	bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT);
+	k = bch2_btree_iter_peek_slot(&iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto out;
+
+	/* key already overwritten in this snapshot? */
+	if (k.k->p.snapshot != interior_k.k->p.snapshot)
+		goto out;
+
+	if (bpos_eq(*new_min_pos, POS_MIN)) {	/* record only the first propagated position */
+		*new_min_pos = k.k->p;
+		new_min_pos->snapshot = leaf_id;
+	}
+
+	new = bch2_bkey_make_mut_noupdate(trans, interior_k);
+	ret = PTR_ERR_OR_ZERO(new);
+	if (ret)
+		goto out;
+
+	new->k.p.snapshot = leaf_id;	/* same key, retargeted at the leaf snapshot */
+	ret = bch2_trans_update(trans, &iter, new, 0);
+out:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
+					  enum btree_id btree,
+					  struct bkey_s_c k,
+					  struct bpos *new_min_pos)
+{	/* Copy @k into every leaf snapshot descended from k's snapshot, committing per leaf */
+	struct bch_fs *c = trans->c;
+	struct bkey_buf sk;
+	u32 restart_count = trans->restart_count;
+	int ret = 0;
+
+	bch2_bkey_buf_init(&sk);
+	bch2_bkey_buf_reassemble(&sk, c, k);	/* work from a private copy of the key from here on */
+	k = bkey_i_to_s_c(sk.k);
+
+	*new_min_pos = POS_MIN;
+
+	for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot);
+	     id < k.k->p.snapshot;
+	     id++) {	/* scan ids from the smallest descendant up to, not including, k's snapshot */
+		if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) ||
+		    !bch2_snapshot_is_leaf(c, id))
+			continue;
+again:
+		ret =   btree_trans_too_many_iters(trans) ?:
+			bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?:
+			bch2_trans_commit(trans, NULL, NULL, 0);
+		if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+			bch2_trans_begin(trans);	/* transaction restart: retry the same leaf */
+			goto again;
+		}
+
+		if (ret)
+			break;
+	}
+
+	bch2_bkey_buf_exit(&sk, c);
+
+	return ret ?: trans_was_restarted(trans, restart_count);
+}
+
+int bch2_snapshots_read(struct bch_fs *c)
+{	/* Two passes over the snapshots btree: mark + set equiv, then build is_ancestor bitmaps */
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	ret = bch2_trans_run(c,
+		for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
+			   POS_MIN, 0, k,
+			bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
+			bch2_snapshot_set_equiv(trans, k)) ?:
+		for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
+			   POS_MIN, 0, k,
+			   (set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
+	if (ret)	/* log the failure; the error is still returned */
+		bch_err_fn(c, ret);
+	return ret;
+}
+
+void bch2_fs_snapshots_exit(struct bch_fs *c)
+{	/* Free the snapshot table pointed to by c->snapshots */
+	kfree(rcu_dereference_protected(c->snapshots, true));
+}
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
new file mode 100644
index 000000000000..de215d9d1252
--- /dev/null
+++ b/fs/bcachefs/snapshot.h
@@ -0,0 +1,270 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SNAPSHOT_H
+#define _BCACHEFS_SNAPSHOT_H
+
+enum bkey_invalid_flags;
+
+void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+int bch2_snapshot_tree_invalid(const struct bch_fs *, struct bkey_s_c,
+			       enum bkey_invalid_flags, struct printbuf *);
+
+#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) {	\
+	.key_invalid	= bch2_snapshot_tree_invalid,		\
+	.val_to_text	= bch2_snapshot_tree_to_text,		\
+	.min_val_size	= 8,					\
+})	/* bkey_ops initializer for snapshot_tree keys: validation and to-text hooks */
+
+struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *);
+
+int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *);
+
+void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c,
+			  enum bkey_invalid_flags, struct printbuf *);
+int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
+		       struct bkey_s_c, struct bkey_s_c, unsigned);
+
+#define bch2_bkey_ops_snapshot ((struct bkey_ops) {		\
+	.key_invalid	= bch2_snapshot_invalid,		\
+	.val_to_text	= bch2_snapshot_to_text,		\
+	.atomic_trigger	= bch2_mark_snapshot,			\
+	.min_val_size	= 24,					\
+})	/* bkey_ops initializer for snapshot keys; bch2_mark_snapshot is the atomic trigger */
+
+static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id)
+{	/* Table slot for @id: the array is indexed by U32_MAX - id; no bounds check here */
+	return &t->s[U32_MAX - id];
+}
+
+static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
+{	/* Read-only table entry for @id, via rcu_dereference() of c->snapshots */
+	return __snapshot_t(rcu_dereference(c->snapshots), id);
+}
+
+static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
+{	/* Returns the tree field of snapshot @id, read under rcu_read_lock() */
+	rcu_read_lock();
+	id = snapshot_t(c, id)->tree;	/* @id is reused to hold the result */
+	rcu_read_unlock();
+
+	return id;
+}
+
+static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
+{	/* Parent id of @id straight from the table; takes no RCU lock itself */
+	return snapshot_t(c, id)->parent;
+}
+
+static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
+{	/* __bch2_snapshot_parent_early() wrapped in rcu_read_lock()/unlock() */
+	rcu_read_lock();
+	id = __bch2_snapshot_parent_early(c, id);
+	rcu_read_unlock();
+
+	return id;
+}
+
+static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
+{	/* Parent id of @id; debug builds also verify the depth fields; takes no RCU lock itself */
+#ifdef CONFIG_BCACHEFS_DEBUG
+	u32 parent = snapshot_t(c, id)->parent;
+
+	if (parent &&	/* a child's depth must be exactly its parent's depth + 1 */
+	    snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1)
+		panic("id %u depth=%u parent %u depth=%u\n",
+		      id, snapshot_t(c, id)->depth,
+		      parent, snapshot_t(c, parent)->depth);
+
+	return parent;
+#else
+	return snapshot_t(c, id)->parent;
+#endif
+}
+
+static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
+{	/* __bch2_snapshot_parent() wrapped in rcu_read_lock()/unlock() */
+	rcu_read_lock();
+	id = __bch2_snapshot_parent(c, id);
+	rcu_read_unlock();
+
+	return id;
+}
+
+static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n)
+{	/* Follow the parent link @n times starting at @id; n == 0 returns @id */
+	rcu_read_lock();
+	while (n--)
+		id = __bch2_snapshot_parent(c, id);
+	rcu_read_unlock();
+
+	return id;
+}
+
+u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32);
+
+static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
+{	/* Follow parent links from @id until a node with parent 0; returns that node's id */
+	u32 parent;
+
+	rcu_read_lock();
+	while ((parent = __bch2_snapshot_parent(c, id)))
+		id = parent;
+	rcu_read_unlock();
+
+	return id;
+}
+
+static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id)
+{	/* equiv field of @id straight from the table; takes no RCU lock itself */
+	return snapshot_t(c, id)->equiv;
+}
+
+static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id)
+{	/* __bch2_snapshot_equiv() wrapped in rcu_read_lock()/unlock() */
+	rcu_read_lock();
+	id = __bch2_snapshot_equiv(c, id);
+	rcu_read_unlock();
+
+	return id;
+}
+
+static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id)
+{	/* True when @id is its own equiv id */
+	return id == bch2_snapshot_equiv(c, id);
+}
+
+static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
+{	/* True when children[0] of @id is nonzero */
+	const struct snapshot_t *s;
+	bool ret;
+
+	rcu_read_lock();
+	s = snapshot_t(c, id);
+	ret = s->children[0];	/* only children[0] is tested */
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
+{	/* Negation of bch2_snapshot_is_internal_node(); returns 1 or 0 as a u32 */
+	return !bch2_snapshot_is_internal_node(c, id);
+}
+
+static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id)
+{	/* Returns the other child of @id's parent, or 0 if @id has no parent or is not listed */
+	const struct snapshot_t *s;
+	u32 parent = __bch2_snapshot_parent(c, id);	/* takes no RCU lock itself */
+
+	if (!parent)
+		return 0;
+
+	s = snapshot_t(c, parent);	/* reuse the id validated above rather than re-reading it */
+	if (id == s->children[0])
+		return s->children[1];
+	if (id == s->children[1])
+		return s->children[0];
+	return 0;
+}
+
+static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
+{	/* Depth for a node whose parent is @parent: parent's depth + 1, or 0 with no parent */
+	u32 depth;
+
+	rcu_read_lock();
+	depth = parent ? snapshot_t(c, parent)->depth + 1 : 0;
+	rcu_read_unlock();
+
+	return depth;
+}
+
+bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32);
+
+static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
+{	/* An id counts as its own ancestor; otherwise defer to __bch2_snapshot_is_ancestor() */
+	return id == ancestor
+		? true
+		: __bch2_snapshot_is_ancestor(c, id, ancestor);
+}
+
+static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
+{	/* True when either child slot of @id is nonzero */
+	const struct snapshot_t *t;
+	bool ret;
+
+	rcu_read_lock();
+	t = snapshot_t(c, id);
+	ret = (t->children[0]|t->children[1]) != 0;
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
+{	/* Linear search of list @s for @id */
+	u32 *i;
+
+	darray_for_each(*s, i)
+		if (*i == id)
+			return true;
+	return false;
+}
+
+static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id)
+{	/* True when some entry of @s is an ancestor of @id (or equal to it) */
+	u32 *i;
+
+	darray_for_each(*s, i)
+		if (bch2_snapshot_is_ancestor(c, id, *i))
+			return true;
+	return false;
+}
+
+static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id)
+{	/* Append @id to @s; returns darray_push()'s result, logging when it is nonzero */
+	int ret;
+
+	BUG_ON(snapshot_list_has_id(s, id));	/* callers must not add an id twice */
+	ret = darray_push(s, id);
+	if (ret)
+		bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
+	return ret;
+}
+
+int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
+			 struct bch_snapshot *s);
+int bch2_snapshot_get_subvol(struct btree_trans *, u32,
+			     struct bch_subvolume *);
+
+/* only exported for tests: */
+int bch2_snapshot_node_create(struct btree_trans *, u32,
+			      u32 *, u32 *, unsigned);
+
+int bch2_check_snapshot_trees(struct bch_fs *);
+int bch2_check_snapshots(struct bch_fs *);
+
+int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
+int bch2_delete_dead_snapshots_hook(struct btree_trans *,
+				    struct btree_trans_commit_hook *);
+void bch2_delete_dead_snapshots_work(struct work_struct *);
+
+int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos);
+
+static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
+					  enum btree_id id,
+					  struct bpos pos)
+{	/* Fast path: btrees without snapshots and leaf snapshots return 0 without a lookup */
+	if (!btree_type_has_snapshots(id) ||
+	    bch2_snapshot_is_leaf(trans->c, pos.snapshot))
+		return 0;
+
+	return __bch2_key_has_snapshot_overwrites(trans, id, pos);
+}
+
+int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *, enum btree_id,
+					  struct bkey_s_c, struct bpos *);
+
+int bch2_snapshots_read(struct bch_fs *);
+void bch2_fs_snapshots_exit(struct bch_fs *);
+
+#endif /* _BCACHEFS_SNAPSHOT_H */
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
new file mode 100644
index 000000000000..ae21a8cca1b4
--- /dev/null
+++ b/fs/bcachefs/str_hash.h
@@ -0,0 +1,370 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_STR_HASH_H
+#define _BCACHEFS_STR_HASH_H
+
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "checksum.h"
+#include "error.h"
+#include "inode.h"
+#include "siphash.h"
+#include "subvolume.h"
+#include "super.h"
+
+#include <linux/crc32c.h>
+#include <crypto/hash.h>
+#include <crypto/sha2.h>
+
+static inline enum bch_str_hash_type
+bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
+{
+	if (opt == BCH_STR_HASH_OPT_crc32c)
+		return BCH_STR_HASH_crc32c;
+	if (opt == BCH_STR_HASH_OPT_crc64)
+		return BCH_STR_HASH_crc64;
+	if (opt != BCH_STR_HASH_OPT_siphash)
+		BUG();		/* unknown option */
+
+	/* siphash: the superblock feature bit selects the new or the old flavour */
+	if (c->sb.features & (1ULL << BCH_FEATURE_new_siphash))
+		return BCH_STR_HASH_siphash;
+
+	return BCH_STR_HASH_siphash_old;
+}
+
+struct bch_hash_info {
+	u8			type;
+	/*
+	 * For crc32 or crc64 string hashes the first key value of
+	 * the siphash_key (k0) is used as the key.
+	 */
+	SIPHASH_KEY	siphash_key;
+};
+
+static inline struct bch_hash_info
+bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
+{
+	/* XXX ick: the hash type is a bitfield inside bi_flags; k0 starts as the seed */
+	struct bch_hash_info info = {
+		.type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) &
+			~(~0U << INODE_STR_HASH_BITS),
+		.siphash_key = { .k0 = bi->bi_hash_seed }
+	};
+
+	if (unlikely(info.type == BCH_STR_HASH_siphash_old)) {
+		SHASH_DESC_ON_STACK(desc, c->sha256);
+		u8 digest[SHA256_DIGEST_SIZE];
+
+		desc->tfm = c->sha256;
+		/* old siphash: the key is the leading bytes of the digest of bi_hash_seed */
+		crypto_shash_digest(desc, (void *) &bi->bi_hash_seed,
+				    sizeof(bi->bi_hash_seed), digest);	/* NOTE(review): result unchecked */
+		memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
+	}
+
+	return info;
+}
+
+struct bch_str_hash_ctx {
+	union {
+		u32		crc32c;
+		u64		crc64;
+		SIPHASH_CTX	siphash;
+	};
+};
+
+static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx,
+				     const struct bch_hash_info *info)
+{
+	switch (info->type) {	/* initialise the ctx union member matching info->type */
+	case BCH_STR_HASH_crc32c:
+		ctx->crc32c = crc32c(~0, &info->siphash_key.k0,	/* start state: CRC of k0, seeded with ~0 */
+				     sizeof(info->siphash_key.k0));
+		break;
+	case BCH_STR_HASH_crc64:
+		ctx->crc64 = crc64_be(~0, &info->siphash_key.k0,	/* same scheme, 64-bit CRC */
+				      sizeof(info->siphash_key.k0));
+		break;
+	case BCH_STR_HASH_siphash_old:
+	case BCH_STR_HASH_siphash:
+		SipHash24_Init(&ctx->siphash, &info->siphash_key);	/* both siphash types share this path */
+		break;
+	default:
+		BUG();	/* unknown hash type */
+	}
+}
+
+static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx,
+				       const struct bch_hash_info *info,
+				       const void *data, size_t len)
+{
+	switch (info->type) {	/* fold @len bytes at @data into the running state */
+	case BCH_STR_HASH_crc32c:
+		ctx->crc32c = crc32c(ctx->crc32c, data, len);
+		break;
+	case BCH_STR_HASH_crc64:
+		ctx->crc64 = crc64_be(ctx->crc64, data, len);
+		break;
+	case BCH_STR_HASH_siphash_old:
+	case BCH_STR_HASH_siphash:
+		SipHash24_Update(&ctx->siphash, data, len);
+		break;
+	default:
+		BUG();	/* unknown hash type */
+	}
+}
+
+static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
+				   const struct bch_hash_info *info)
+{
+	switch (info->type) {	/* return the final hash value for the configured type */
+	case BCH_STR_HASH_crc32c:
+		return ctx->crc32c;	/* 32-bit value, returned as-is */
+	case BCH_STR_HASH_crc64:
+		return ctx->crc64 >> 1;	/* the 64-bit hashes drop their low bit */
+	case BCH_STR_HASH_siphash_old:
+	case BCH_STR_HASH_siphash:
+		return SipHash24_End(&ctx->siphash) >> 1;
+	default:
+		BUG();	/* unknown hash type */
+	}
+}
+
+struct bch_hash_desc {
+	enum btree_id	btree_id;
+	u8		key_type;
+
+	u64		(*hash_key)(const struct bch_hash_info *, const void *);
+	u64		(*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c);
+	bool		(*cmp_key)(struct bkey_s_c, const void *);
+	bool		(*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
+	bool		(*is_visible)(subvol_inum inum, struct bkey_s_c);
+};
+
+static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k)
+{
+	if (k.k->type != desc.key_type)		/* wrong key type: never visible */
+		return false;
+	/* No filter callback, or a zero inum, means every key of the right type counts. */
+	return !desc.is_visible || !inum.inum || desc.is_visible(inum, k);
+}
+
+static __always_inline int
+bch2_hash_lookup(struct btree_trans *trans,
+		 struct btree_iter *iter,
+		 const struct bch_hash_desc desc,
+		 const struct bch_hash_info *info,
+		 subvol_inum inum, const void *key,
+		 unsigned flags)
+{
+	struct bkey_s_c k;
+	u32 snapshot;
+	int ret;
+
+	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+	if (ret)
+		return ret;
+	/* Probe forward from @key's hash slot, bounded by the end of inum.inum's range. */
+	for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
+			   SPOS(inum.inum, desc.hash_key(info, key), snapshot),
+			   POS(inum.inum, U64_MAX),
+			   BTREE_ITER_SLOTS|flags, k, ret) {
+		if (is_visible_key(desc, inum, k)) {
+			if (!desc.cmp_key(k, key))	/* cmp_key() returning false is treated as a match */
+				return 0;		/* found: @iter is not exited on this path */
+		} else if (k.k->type == KEY_TYPE_hash_whiteout) {
+			;	/* whiteouts do not end the probe sequence */
+		} else {
+			/* hole, not found */
+			break;
+		}
+	}
+	bch2_trans_iter_exit(trans, iter);
+
+	return ret ?: -BCH_ERR_ENOENT_str_hash_lookup;	/* iteration error, else not found */
+}
+
+static __always_inline int
+bch2_hash_hole(struct btree_trans *trans,
+	       struct btree_iter *iter,
+	       const struct bch_hash_desc desc,
+	       const struct bch_hash_info *info,
+	       subvol_inum inum, const void *key)
+{
+	struct bkey_s_c k;
+	u32 snapshot;
+	int ret;
+
+	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+	if (ret)
+		return ret;
+	/* Scan from @key's hash slot for the first position not holding a visible key. */
+	for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
+			   SPOS(inum.inum, desc.hash_key(info, key), snapshot),
+			   POS(inum.inum, U64_MAX),
+			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret)
+		if (!is_visible_key(desc, inum, k))
+			return 0;	/* found one: @iter is not exited on this path */
+	bch2_trans_iter_exit(trans, iter);
+
+	return ret ?: -BCH_ERR_ENOSPC_str_hash_create;	/* iteration error, else no free slot */
+}
+
+static __always_inline
+int bch2_hash_needs_whiteout(struct btree_trans *trans,
+			     const struct bch_hash_desc desc,
+			     const struct bch_hash_info *info,
+			     struct btree_iter *start)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	bch2_trans_copy_iter(&iter, start);	/* work on a copy of @start */
+
+	bch2_btree_iter_advance(&iter);		/* begin with the position after @start */
+
+	for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, k, ret) {
+		if (k.k->type != desc.key_type &&
+		    k.k->type != KEY_TYPE_hash_whiteout)
+			break;	/* end of the run of entries and whiteouts */
+
+		if (k.k->type == desc.key_type &&
+		    desc.hash_bkey(info, k) <= start->pos.offset) {
+			ret = 1;	/* a later entry hashes at or before @start's offset */
+			break;
+		}
+	}
+
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;	/* 1 as set above, otherwise whatever the iteration macro left in ret */
+}
+
+static __always_inline
+int bch2_hash_set_snapshot(struct btree_trans *trans,
+			   const struct bch_hash_desc desc,
+			   const struct bch_hash_info *info,
+			   subvol_inum inum, u32 snapshot,
+			   struct bkey_i *insert,
+			   int flags,
+			   int update_flags)
+{
+	struct btree_iter iter, slot = { NULL };
+	struct bkey_s_c k;
+	bool found = false;
+	int ret;
+
+	for_each_btree_key_upto_norestart(trans, iter, desc.btree_id,
+			   SPOS(insert->k.p.inode,
+				desc.hash_bkey(info, bkey_i_to_s_c(insert)),
+				snapshot),
+			   POS(insert->k.p.inode, U64_MAX),
+			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+		if (is_visible_key(desc, inum, k)) {
+			if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
+				goto found;
+
+			/* hash collision: */
+			continue;
+		}
+		/* first non-visible position seen becomes the candidate insert slot */
+		if (!slot.path &&
+		    !(flags & BCH_HASH_SET_MUST_REPLACE))
+			bch2_trans_copy_iter(&slot, &iter);
+
+		if (k.k->type != KEY_TYPE_hash_whiteout)
+			goto not_found;
+	}
+	/* ran off the end of the probe range, or the iteration itself failed */
+	if (!ret)
+		ret = -BCH_ERR_ENOSPC_str_hash_create;
+out:
+	bch2_trans_iter_exit(trans, &slot);
+	bch2_trans_iter_exit(trans, &iter);
+
+	return ret;
+found:
+	found = true;
+not_found:
+	/* @flags: MUST_REPLACE requires an existing entry, MUST_CREATE forbids one */
+	if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) {
+		ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
+	} else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) {
+		ret = -EEXIST;
+	} else {
+		if (!found && slot.path)
+			swap(iter, slot);
+		/* the caller's @update_flags apply to the actual btree update */
+		insert->k.p = iter.pos;
+		ret = bch2_trans_update(trans, &iter, insert, update_flags);
+	}
+
+	goto out;
+}
+
+static __always_inline
+int bch2_hash_set(struct btree_trans *trans,
+		  const struct bch_hash_desc desc,
+		  const struct bch_hash_info *info,
+		  subvol_inum inum,
+		  struct bkey_i *insert, int flags)
+{
+	u32 snapshot;
+	int ret;
+
+	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);	/* subvolume -> snapshot ID */
+	if (ret)
+		return ret;
+
+	insert->k.p.inode = inum.inum;	/* the key is placed in @inum's range */
+
+	return bch2_hash_set_snapshot(trans, desc, info, inum,
+				      snapshot, insert, flags, 0);
+}
+
+static __always_inline
+int bch2_hash_delete_at(struct btree_trans *trans,
+			const struct bch_hash_desc desc,
+			const struct bch_hash_info *info,
+			struct btree_iter *iter,
+			unsigned update_flags)
+{
+	struct bkey_i *delete;
+	int ret;
+
+	delete = bch2_trans_kmalloc(trans, sizeof(*delete));	/* key that will overwrite the entry at @iter */
+	ret = PTR_ERR_OR_ZERO(delete);
+	if (ret)
+		return ret;
+	/* negative result: error; positive: a whiteout must be left behind; zero: plain delete */
+	ret = bch2_hash_needs_whiteout(trans, desc, info, iter);
+	if (ret < 0)
+		return ret;
+
+	bkey_init(&delete->k);
+	delete->k.p = iter->pos;
+	delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted;
+
+	return bch2_trans_update(trans, iter, delete, update_flags);
+}
+
+static __always_inline
+int bch2_hash_delete(struct btree_trans *trans,
+		     const struct bch_hash_desc desc,
+		     const struct bch_hash_info *info,
+		     subvol_inum inum, const void *key)
+{
+	struct btree_iter iter;
+	int ret;
+
+	ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key,
+				BTREE_ITER_INTENT);
+	if (ret)
+		return ret;	/* includes "not found"; the iterator is not exited here */
+
+	ret = bch2_hash_delete_at(trans, desc, info, &iter, 0);	/* whiteout or plain delete at the match */
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+#endif /* _BCACHEFS_STR_HASH_H */
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
new file mode 100644
index 000000000000..caf2dd7dafff
--- /dev/null
+++ b/fs/bcachefs/subvolume.c
@@ -0,0 +1,450 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "errcode.h"
+#include "error.h"
+#include "fs.h"
+#include "snapshot.h"
+#include "subvolume.h"
+
+#include <linux/random.h>
+
+static int bch2_subvolume_delete(struct btree_trans *, u32);
+
+static int check_subvol(struct btree_trans *trans,
+			struct btree_iter *iter,
+			struct bkey_s_c k)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_s_c_subvolume subvol;
+	struct bch_snapshot snapshot;
+	unsigned snapid;
+	int ret = 0;
+
+	if (k.k->type != KEY_TYPE_subvolume)
+		return 0;
+	/* The snapshot ID stored in the subvolume must resolve to an existing snapshot. */
+	subvol = bkey_s_c_to_subvolume(k);
+	snapid = le32_to_cpu(subvol.v->snapshot);
+	ret = bch2_snapshot_lookup(trans, snapid, &snapshot);
+
+	if (bch2_err_matches(ret, ENOENT))
+		bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
+			k.k->p.offset, snapid);
+	if (ret)
+		return ret;
+	/* A subvolume still flagged UNLINKED is deleted here, then a nested restart is returned. */
+	if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
+		bch2_fs_lazy_rw(c);	/* NOTE(review): return value is not checked */
+
+		ret = bch2_subvolume_delete(trans, iter->pos.offset);
+		if (ret)
+			bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
+		return ret ?: -BCH_ERR_transaction_restart_nested;
+	}
+	/* A subvolume not flagged SNAP is expected to be its snapshot tree's master subvolume. */
+	if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
+		u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
+		u32 snapshot_tree;
+		struct bch_snapshot_tree st;
+
+		rcu_read_lock();
+		snapshot_tree = snapshot_t(c, snapshot_root)->tree;
+		rcu_read_unlock();
+
+		ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st);
+
+		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+				"%s: snapshot tree %u not found", __func__, snapshot_tree);
+
+		if (ret)
+			return ret;
+
+		if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, c,
+				"subvolume %llu is not set as snapshot but is not master subvolume",
+				k.k->p.offset)) {
+			struct bkey_i_subvolume *s =
+				bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
+			ret = PTR_ERR_OR_ZERO(s);
+			if (ret)
+				return ret;
+
+			SET_BCH_SUBVOLUME_SNAP(&s->v, true);	/* repair: mark it as a snapshot */
+		}
+	}
+
+fsck_err:
+	return ret;
+}
+
+int bch2_check_subvols(struct bch_fs *c)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	ret = bch2_trans_run(c,	/* check_subvol() on each key of the subvolumes btree, from POS_MIN */
+		for_each_btree_key_commit(trans, iter,
+			BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+			NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+		check_subvol(trans, &iter, k)));
+	if (ret)
+		bch_err_fn(c, ret);	/* log the failure; the error is still returned */
+	return ret;
+}
+
+/* Subvolumes: */
+
+int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k,
+			   enum bkey_invalid_flags flags, struct printbuf *err)
+{
+	/* A subvolume key is accepted only inside [SUBVOL_POS_MIN, SUBVOL_POS_MAX]. */
+	if (!bkey_lt(k.k->p, SUBVOL_POS_MIN) &&
+	    !bkey_gt(k.k->p, SUBVOL_POS_MAX))
+		return 0;
+
+	prt_printf(err, "invalid pos");
+	return -BCH_ERR_invalid_bkey;
+}
+
+void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
+			    struct bkey_s_c k)
+{
+	struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+
+	prt_printf(out, "root %llu snapshot id %u",
+		   le64_to_cpu(s.v->inode),
+		   le32_to_cpu(s.v->snapshot));
+	/* The parent field is printed only if the value extends past its offset. */
+	if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent))
+		prt_printf(out, " parent %u", le32_to_cpu(s.v->parent));
+}
+
+static __always_inline int
+bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol,
+			   bool inconsistent_if_not_found,
+			   int iter_flags,
+			   struct bch_subvolume *s)
+{
+	int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol),
+					  iter_flags, subvolume, s);	/* presumably fills *s on success */
+	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) &&
+				inconsistent_if_not_found,	/* flag a missing subvolume only on request */
+				trans->c, "missing subvolume %u", subvol);
+	return ret;	/* the lookup result is returned either way */
+}
+
+int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
+		       bool inconsistent_if_not_found,
+		       int iter_flags,
+		       struct bch_subvolume *s)
+{
+	return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s);
+}
+
+int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
+			     struct bch_subvolume *subvol)
+{
+	struct bch_snapshot snap;
+
+	return  bch2_snapshot_lookup(trans, snapshot, &snap) ?:
+		bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol);
+}
+
+int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
+				u32 *snapid)
+{
+	struct btree_iter iter;
+	struct bkey_s_c_subvolume subvol;
+	int ret;
+
+	subvol = bch2_bkey_get_iter_typed(trans, &iter,
+					  BTREE_ID_subvolumes, POS(0, subvolid),
+					  BTREE_ITER_CACHED|BTREE_ITER_WITH_UPDATES,
+					  subvolume);
+	ret = bkey_err(subvol);
+	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+				"missing subvolume %u", subvolid);
+	/* *snapid is written only when the lookup succeeded */
+	if (likely(!ret))
+		*snapid = le32_to_cpu(subvol.v->snapshot);
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static int bch2_subvolume_reparent(struct btree_trans *trans,
+				   struct btree_iter *iter,
+				   struct bkey_s_c k,
+				   u32 old_parent, u32 new_parent)
+{
+	struct bkey_i_subvolume *s;
+	int ret;
+
+	if (k.k->type != KEY_TYPE_subvolume)
+		return 0;
+	/* Skip a recorded parent != @old_parent. NOTE(review): values too short to hold a parent are not skipped. */
+	if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) &&
+	    le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent)
+		return 0;
+
+	s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume);
+	ret = PTR_ERR_OR_ZERO(s);
+	if (ret)
+		return ret;
+
+	s->v.parent = cpu_to_le32(new_parent);	/* point the child at @new_parent */
+	return 0;
+}
+
+/*
+ * Separate from the snapshot tree in the snapshots btree, we record the tree
+ * structure of how snapshot subvolumes were created - the parent subvolume of
+ * each snapshot subvolume.
+ *
+ * When a subvolume is deleted, we scan for child subvolumes and reparent them,
+ * to avoid dangling references:
+ */
+static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bch_subvolume s;	/* the subvolume being deleted; s.parent becomes the new parent */
+
+	return lockrestart_do(trans,
+			bch2_subvolume_get(trans, subvolid_to_delete, true,
+				   BTREE_ITER_CACHED, &s)) ?:
+		for_each_btree_key_commit(trans, iter,
+				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+				NULL, NULL, BTREE_INSERT_NOFAIL,
+			bch2_subvolume_reparent(trans, &iter, k,
+					subvolid_to_delete, le32_to_cpu(s.parent)));
+}
+
+/*
+ * Delete subvolume, mark snapshot ID as deleted, queue up snapshot
+ * deletion/cleanup:
+ */
+static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
+{
+	struct btree_iter iter;
+	struct bkey_s_c_subvolume subvol;
+	struct btree_trans_commit_hook *h;
+	u32 snapid;
+	int ret = 0;
+
+	subvol = bch2_bkey_get_iter_typed(trans, &iter,
+				BTREE_ID_subvolumes, POS(0, subvolid),
+				BTREE_ITER_CACHED|BTREE_ITER_INTENT,
+				subvolume);
+	ret = bkey_err(subvol);
+	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+				"missing subvolume %u", subvolid);
+	if (ret)
+		return ret;
+
+	snapid = le32_to_cpu(subvol.v->snapshot);
+	/* Three steps follow: delete the key, mark its snapshot deleted, add a commit hook. */
+	ret = bch2_btree_delete_at(trans, &iter, 0);
+	if (ret)
+		goto err;
+
+	ret = bch2_snapshot_node_set_deleted(trans, snapid);
+	if (ret)
+		goto err;
+
+	h = bch2_trans_kmalloc(trans, sizeof(*h));
+	ret = PTR_ERR_OR_ZERO(h);
+	if (ret)
+		goto err;
+	/* bch2_delete_dead_snapshots_hook is registered as a commit hook on @trans */
+	h->fn = bch2_delete_dead_snapshots_hook;
+	bch2_trans_commit_hook(trans, h);
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
+{
+	return bch2_subvolumes_reparent(trans, subvolid) ?:
+		commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+			  __bch2_subvolume_delete(trans, subvolid));
+}
+
+static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
+{
+	struct bch_fs *c = container_of(work, struct bch_fs,
+				snapshot_wait_for_pagecache_and_delete_work);
+	snapshot_id_list s;
+	u32 *id;
+	int ret = 0;
+
+	while (!ret) {
+		mutex_lock(&c->snapshots_unlinked_lock);
+		s = c->snapshots_unlinked;		/* take over the pending list under the lock */
+		darray_init(&c->snapshots_unlinked);	/* and leave a freshly initialised one behind */
+		mutex_unlock(&c->snapshots_unlinked_lock);
+
+		if (!s.nr)
+			break;	/* nothing pending */
+
+		bch2_evict_subvolume_inodes(c, &s);
+
+		for (id = s.data; id < s.data + s.nr; id++) {
+			ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id));
+			if (ret) {
+				bch_err_msg(c, ret, "deleting subvolume %u", *id);
+				break;	/* a nonzero ret also ends the outer loop */
+			}
+		}
+
+		darray_exit(&s);	/* release the batch just processed */
+	}
+
+	bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
+}
+
+struct subvolume_unlink_hook {
+	struct btree_trans_commit_hook	h;
+	u32				subvol;
+};
+
+static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
+						      struct btree_trans_commit_hook *_h)
+{
+	struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h);
+	struct bch_fs *c = trans->c;
+	int ret = 0;
+
+	mutex_lock(&c->snapshots_unlinked_lock);
+	if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))	/* avoid the BUG_ON on duplicates */
+		ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol);
+	mutex_unlock(&c->snapshots_unlinked_lock);
+
+	if (ret)
+		return ret;
+
+	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache))
+		return -EROFS;
+	/* The work function puts this write ref when it ends; if no work was queued, put it here. */
+	if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
+		bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
+	return 0;
+}
+
+int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
+{
+	struct btree_iter iter;
+	struct bkey_i_subvolume *n;
+	struct subvolume_unlink_hook *h;
+	int ret = 0;
+
+	h = bch2_trans_kmalloc(trans, sizeof(*h));
+	ret = PTR_ERR_OR_ZERO(h);
+	if (ret)
+		return ret;
+	/* Register the commit hook that adds @subvolid to the unlinked list and queues the delete work. */
+	h->h.fn		= bch2_subvolume_wait_for_pagecache_and_delete_hook;
+	h->subvol	= subvolid;
+	bch2_trans_commit_hook(trans, &h->h);
+	/* Then flag the subvolume key itself as unlinked. */
+	n = bch2_bkey_get_mut_typed(trans, &iter,
+			BTREE_ID_subvolumes, POS(0, subvolid),
+			BTREE_ITER_CACHED, subvolume);
+	ret = PTR_ERR_OR_ZERO(n);
+	if (unlikely(ret)) {
+		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+					"missing subvolume %u", subvolid);
+		return ret;
+	}
+
+	SET_BCH_SUBVOLUME_UNLINKED(&n->v, true);
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
+			  u32 src_subvolid,
+			  u32 *new_subvolid,
+			  u32 *new_snapshotid,
+			  bool ro)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL };
+	struct bkey_i_subvolume *new_subvol = NULL;
+	struct bkey_i_subvolume *src_subvol = NULL;
+	u32 parent = 0, new_nodes[2], snapshot_subvols[2];
+	int ret = 0;
+
+	ret = bch2_bkey_get_empty_slot(trans, &dst_iter,
+				BTREE_ID_subvolumes, POS(0, U32_MAX));
+	if (ret == -BCH_ERR_ENOSPC_btree_slot)
+		ret = -BCH_ERR_ENOSPC_subvolume_create;
+	if (ret)
+		return ret;
+	/* Index 0 describes the new subvolume, index 1 the source (used only for snapshots). */
+	snapshot_subvols[0] = dst_iter.pos.offset;
+	snapshot_subvols[1] = src_subvolid;
+
+	if (src_subvolid) {
+		/* Creating a snapshot: */
+
+		src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter,
+				BTREE_ID_subvolumes, POS(0, src_subvolid),
+				BTREE_ITER_CACHED, subvolume);
+		ret = PTR_ERR_OR_ZERO(src_subvol);
+		if (unlikely(ret)) {
+			bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+						"subvolume %u not found", src_subvolid);
+			goto err;
+		}
+
+		parent = le32_to_cpu(src_subvol->v.snapshot);
+	}
+	/* One new snapshot node for a fresh subvolume, two when snapshotting a source. */
+	ret = bch2_snapshot_node_create(trans, parent, new_nodes,
+					snapshot_subvols,
+					src_subvolid ? 2 : 1);
+	if (ret)
+		goto err;
+	/* The source subvolume is switched to its own new node, new_nodes[1]. */
+	if (src_subvolid) {
+		src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]);
+		ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0);
+		if (ret)
+			goto err;
+	}
+
+	new_subvol = bch2_bkey_alloc(trans, &dst_iter, 0, subvolume);
+	ret = PTR_ERR_OR_ZERO(new_subvol);
+	if (ret)
+		goto err;
+
+	new_subvol->v.flags	= 0;
+	new_subvol->v.snapshot	= cpu_to_le32(new_nodes[0]);
+	new_subvol->v.inode	= cpu_to_le64(inode);
+	new_subvol->v.parent	= cpu_to_le32(src_subvolid);
+	new_subvol->v.otime.lo	= cpu_to_le64(bch2_current_time(c));
+	new_subvol->v.otime.hi	= 0;
+
+	SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
+	SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
+
+	*new_subvolid	= new_subvol->k.p.offset;	/* outputs are written only on success */
+	*new_snapshotid	= new_nodes[0];
+err:
+	bch2_trans_iter_exit(trans, &src_iter);
+	bch2_trans_iter_exit(trans, &dst_iter);
+	return ret;
+}
+
+int bch2_fs_subvolumes_init(struct bch_fs *c)
+{
+	INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
+	INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
+		  bch2_subvolume_wait_for_pagecache_and_delete);
+	mutex_init(&c->snapshots_unlinked_lock);
+	return 0;
+}
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
new file mode 100644
index 000000000000..bb14f92e8687
--- /dev/null
+++ b/fs/bcachefs/subvolume.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_H
+#define _BCACHEFS_SUBVOLUME_H
+
+#include "darray.h"
+#include "subvolume_types.h"
+
+enum bkey_invalid_flags;
+
+int bch2_check_subvols(struct bch_fs *);
+
+int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c,
+			   enum bkey_invalid_flags, struct printbuf *);
+void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_subvolume ((struct bkey_ops) {		\
+	.key_invalid	= bch2_subvolume_invalid,		\
+	.val_to_text	= bch2_subvolume_to_text,		\
+	.min_val_size	= 16,					\
+})
+
+int bch2_subvolume_get(struct btree_trans *, unsigned,
+		       bool, int, struct bch_subvolume *);
+int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
+
+int bch2_delete_dead_snapshots(struct bch_fs *);
+void bch2_delete_dead_snapshots_async(struct bch_fs *);
+
+int bch2_subvolume_unlink(struct btree_trans *, u32);
+int bch2_subvolume_create(struct btree_trans *, u64, u32,
+			  u32 *, u32 *, bool);
+
+int bch2_fs_subvolumes_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_SUBVOLUME_H */
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
new file mode 100644
index 000000000000..86833445af20
--- /dev/null
+++ b/fs/bcachefs/subvolume_types.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_TYPES_H
+#define _BCACHEFS_SUBVOLUME_TYPES_H
+
+#include "darray.h"
+
+typedef DARRAY(u32) snapshot_id_list;
+
+#define IS_ANCESTOR_BITMAP	128
+
+struct snapshot_t {
+	u32			parent;
+	u32			skip[3];
+	u32			depth;
+	u32			children[2];
+	u32			subvol; /* Nonzero only if a subvolume points to this node: */
+	u32			tree;
+	u32			equiv;
+	unsigned long		is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)];
+};
+
+struct snapshot_table {
+	struct snapshot_t	s[0];
+};
+
+typedef struct {
+	u32		subvol;
+	u64		inum;
+} subvol_inum;
+
+#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
new file mode 100644
index 000000000000..332d41e1c0a3
--- /dev/null
+++ b/fs/bcachefs/super-io.c
@@ -0,0 +1,1258 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "checksum.h"
+#include "counters.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_sb.h"
+#include "journal_seq_blacklist.h"
+#include "recovery.h"
+#include "replicas.h"
+#include "quota.h"
+#include "sb-clean.h"
+#include "sb-members.h"
+#include "super-io.h"
+#include "super.h"
+#include "trace.h"
+#include "vstructs.h"
+
+#include <linux/backing-dev.h>
+#include <linux/sort.h>
+
+static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
+};
+
+struct bch2_metadata_version {
+	u16		version;
+	const char	*name;
+	u64		recovery_passes;
+};
+
+static const struct bch2_metadata_version bch2_metadata_versions[] = {
+#define x(n, v, _recovery_passes) {		\
+	.version = v,				\
+	.name = #n,				\
+	.recovery_passes = _recovery_passes,	\
+},
+	BCH_METADATA_VERSIONS()
+#undef x
+};
+
+void bch2_version_to_text(struct printbuf *out, unsigned v)
+{
+	const struct bch2_metadata_version *m = bch2_metadata_versions;
+	const struct bch2_metadata_version *end = m + ARRAY_SIZE(bch2_metadata_versions);
+
+	/* Walk the version table; stop at the first entry whose number equals v. */
+	while (m < end && m->version != v)
+		m++;
+
+	prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v),
+		   m < end ? m->name : "(unknown version)");
+}
+
+unsigned bch2_latest_compatible_version(unsigned v)
+{
+	const struct bch2_metadata_version *m;
+
+	if (!BCH_VERSION_MAJOR(v))	/* major version 0 is returned unchanged */
+		return v;
+	/* Raise v to each table entry that is newer and shares v's major version. */
+	for (m = bch2_metadata_versions;
+	     m < bch2_metadata_versions + ARRAY_SIZE(bch2_metadata_versions); m++)
+		if (m->version > v && BCH_VERSION_MAJOR(m->version) == BCH_VERSION_MAJOR(v))
+			v = m->version;
+	return v;
+}
+
+u64 bch2_upgrade_recovery_passes(struct bch_fs *c,
+				 unsigned old_version,
+				 unsigned new_version)
+{
+	u64 passes = 0;
+
+	for (unsigned idx = 0; idx < ARRAY_SIZE(bch2_metadata_versions); idx++) {
+		const struct bch2_metadata_version *m = &bch2_metadata_versions[idx];
+
+		if (m->version <= old_version || m->version > new_version)
+			continue;	/* outside (old_version, new_version] */
+		if (m->recovery_passes & RECOVERY_PASS_ALL_FSCK)
+			passes |= bch2_fsck_recovery_passes();
+		passes |= m->recovery_passes;
+	}
+	return passes & ~RECOVERY_PASS_ALL_FSCK;	/* the ALL_FSCK bit itself is never returned */
+}
+
+const char * const bch2_sb_fields[] = {
+#define x(name, nr)	#name,
+	BCH_SB_FIELDS()
+#undef x
+	NULL
+};
+
+static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *,
+				  struct printbuf *);
+
+struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb,
+				      enum bch_sb_field_type type)
+{
+	struct bch_sb_field *f;
+
+	/* XXX: need locking around superblock to access optional fields */
+	/* Returns the first field in @sb whose type equals @type, or NULL if there is none. */
+	vstruct_for_each(sb, f)
+		if (le32_to_cpu(f->type) == type)
+			return f;
+	return NULL;
+}
+
+static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb,
+						   struct bch_sb_field *f,
+						   unsigned u64s)
+{
+	unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+	unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s;
+
+	BUG_ON(__vstruct_bytes(struct bch_sb, sb_u64s) > sb->buffer_size);
+	/* Three cases: no-op, append a new field at the end, or resize/delete @f in place. */
+	if (!f && !u64s) {
+		/* nothing to do: */
+	} else if (!f) {
+		f = vstruct_last(sb->sb);
+		memset(f, 0, sizeof(u64) * u64s);
+		f->u64s = cpu_to_le32(u64s);
+		f->type = 0;
+	} else {
+		void *src, *dst;
+
+		src = vstruct_end(f);
+		/* src: old end of @f; dst: its end after the resize, or @f itself when deleting */
+		if (u64s) {
+			f->u64s = cpu_to_le32(u64s);
+			dst = vstruct_end(f);
+		} else {
+			dst = f;
+		}
+
+		memmove(dst, src, vstruct_end(sb->sb) - src);
+		/* when the field grew, zero the gap the tail was moved out of */
+		if (dst > src)
+			memset(src, 0, dst - src);
+	}
+
+	sb->sb->u64s = cpu_to_le32(sb_u64s);
+
+	return u64s ? f : NULL;	/* NULL when the field was deleted or never created */
+}
+
+void bch2_sb_field_delete(struct bch_sb_handle *sb,
+			  enum bch_sb_field_type type)
+{
+	struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
+
+	if (f)
+		__bch2_sb_field_resize(sb, f, 0);
+}
+
+/* Superblock realloc/free: */
+
+void bch2_free_super(struct bch_sb_handle *sb)
+{
+	kfree(sb->bio);
+	if (!IS_ERR_OR_NULL(sb->bdev))		/* bdev may be NULL or an ERR_PTR */
+		blkdev_put(sb->bdev, sb->holder);
+	kfree(sb->holder);			/* freed only after blkdev_put() has used it */
+
+	kfree(sb->sb);
+	memset(sb, 0, sizeof(*sb));		/* leave the handle fully zeroed */
+}
+
+int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
+{
+	size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s);
+	size_t new_buffer_size;
+	struct bch_sb *new_sb;
+	struct bio *bio;
+
+	if (sb->bdev)
+		new_bytes = max_t(size_t, new_bytes, bdev_logical_block_size(sb->bdev));
+
+	new_buffer_size = roundup_pow_of_two(new_bytes);
+	/* Already large enough: nothing to do. */
+	if (sb->sb && sb->buffer_size >= new_buffer_size)
+		return 0;
+	/* With a known layout, the superblock may not outgrow the maximum size it records. */
+	if (sb->sb && sb->have_layout) {
+		u64 max_bytes = 512ULL << sb->sb->layout.sb_max_size_bits;	/* 64-bit shift: no int overflow */
+
+		if (new_bytes > max_bytes) {
+			pr_err("%pg: superblock too big: want %zu but have %llu",
+			       sb->bdev, new_bytes, max_bytes);
+			return -BCH_ERR_ENOSPC_sb;
+		}
+	}
+
+	if (sb->buffer_size >= new_buffer_size && sb->sb)
+		return 0;
+
+	if (dynamic_fault("bcachefs:add:super_realloc"))
+		return -BCH_ERR_ENOMEM_sb_realloc_injected;
+
+	new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO);
+	if (!new_sb)
+		return -BCH_ERR_ENOMEM_sb_buf_realloc;	/* sb->sb is still the old, valid buffer */
+
+	sb->sb = new_sb;
+	/* The buffer grew, so the bio is reallocated for the new page count as well. */
+	if (sb->have_bio) {
+		unsigned nr_bvecs = buf_pages(sb->sb, new_buffer_size);
+
+		bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
+		if (!bio)
+			return -BCH_ERR_ENOMEM_sb_bio_realloc;	/* buffer_size keeps its old, smaller value */
+
+		bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0);
+
+		kfree(sb->bio);
+		sb->bio = bio;
+	}
+
+	sb->buffer_size = new_buffer_size;
+
+	return 0;
+}
+
+struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
+					  enum bch_sb_field_type type,
+					  unsigned u64s)
+{
+	struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
+	ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+	ssize_t d = -old_u64s + u64s;
+
+	if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
+		return NULL;
+	/* If @sb is a filesystem's disk_sb, every online member's buffer is grown by the same delta. */
+	if (sb->fs_sb) {
+		struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb);
+		struct bch_dev *ca;
+		unsigned i;
+
+		lockdep_assert_held(&c->sb_lock);
+
+		/* XXX: we're not checking that offline device have enough space */
+		/* NOTE(review): the iterator is taken to hold ca->io_ref; confirm against its definition. */
+		for_each_online_member(ca, c, i) {
+			struct bch_sb_handle *dev_sb = &ca->disk_sb;
+
+			if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
+				percpu_ref_put(&ca->io_ref);	/* leaving the loop early: drop the iterator's ref */
+				return NULL;
+			}
+		}
+	}
+	/* bch2_sb_realloc() may have moved the buffer, so the field is looked up again. */
+	f = bch2_sb_field_get_id(sb->sb, type);
+	f = __bch2_sb_field_resize(sb, f, u64s);
+	if (f)
+		f->type = cpu_to_le32(type);
+	return f;
+}
+
+/* Superblock validate: */
+
+static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out)
+{
+	u64 offset, prev_offset, max_sectors;
+	unsigned i;
+
+	BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
+
+	if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) &&
+	    !uuid_equal(&layout->magic, &BCHFS_MAGIC)) {
+		prt_printf(out, "Not a bcachefs superblock layout");
+		return -BCH_ERR_invalid_sb_layout;
+	}
+
+	if (layout->layout_type != 0) {
+		prt_printf(out, "Invalid superblock layout type %u",
+		       layout->layout_type);
+		return -BCH_ERR_invalid_sb_layout_type;
+	}
+
+	if (!layout->nr_superblocks) {
+		prt_printf(out, "Invalid superblock layout: no superblocks");
+		return -BCH_ERR_invalid_sb_layout_nr_superblocks;
+	}
+
+	if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) {
+		prt_printf(out, "Invalid superblock layout: too many superblocks");
+		return -BCH_ERR_invalid_sb_layout_nr_superblocks;
+	}
+	/* 64-bit shift; NOTE(review): sb_max_size_bits is not range-checked in this function. */
+	max_sectors = 1ULL << layout->sb_max_size_bits;
+
+	prev_offset = le64_to_cpu(layout->sb_offset[0]);
+	/* Each superblock must start at or after the previous one's offset + max_sectors. */
+	for (i = 1; i < layout->nr_superblocks; i++) {
+		offset = le64_to_cpu(layout->sb_offset[i]);
+
+		if (offset < prev_offset + max_sectors) {
+			prt_printf(out, "Invalid superblock layout: superblocks overlap\n"
+			       "  (sb %u ends at %llu next starts at %llu)",
+			       i - 1, prev_offset + max_sectors, offset);
+			return -BCH_ERR_invalid_sb_layout_superblocks_overlap;
+		}
+		prev_offset = offset;
+	}
+
+	return 0;
+}
+
+/* Print the range of metadata versions this code supports, as " (min X, max Y)": */
+static void bch2_sb_prt_supported_versions(struct printbuf *out)
+{
+	prt_str(out, " (min ");
+	bch2_version_to_text(out, bcachefs_metadata_version_min);
+	prt_str(out, ", max ");
+	bch2_version_to_text(out, bcachefs_metadata_version_current);
+	prt_str(out, ")");
+}
+
+/* Check that @sb's version and version_min are ones we can use; errors go to @out: */
+static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
+{
+	u16 sb_version		= le16_to_cpu(sb->version);
+	u16 sb_version_min	= le16_to_cpu(sb->version_min);
+	bool version_bad	= !bch2_version_compatible(sb_version);
+
+	if (version_bad || !bch2_version_compatible(sb_version_min)) {
+		prt_str(out, version_bad
+			? "Unsupported superblock version "
+			: "Unsupported superblock version_min ");
+		bch2_version_to_text(out, version_bad ? sb_version : sb_version_min);
+		bch2_sb_prt_supported_versions(out);
+		return -BCH_ERR_invalid_sb_version;
+	}
+
+	/* version_min may not be newer than version: */
+	if (sb_version_min > sb_version) {
+		prt_str(out, "Bad minimum version ");
+		bch2_version_to_text(out, sb_version_min);
+		prt_str(out, ", greater than version field ");
+		bch2_version_to_text(out, sb_version);
+		return -BCH_ERR_invalid_sb_version;
+	}
+
+	return 0;
+}
+
+static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
+			    int rw)
+{
+	struct bch_sb *sb = disk_sb->sb;
+	struct bch_sb_field *f;
+	struct bch_sb_field_members_v1 *mi;
+	enum bch_opt_id opt_id;
+	u16 block_size;
+	int ret;
+	/* Returns 0 if @disk_sb->sb is valid, otherwise an error with the reason printed to @out: */
+	ret = bch2_sb_compatible(sb, out);
+	if (ret)
+		return ret;
+
+	if (sb->features[1] ||
+	    (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) {
+		prt_printf(out, "Filesystem has incompatible features");
+		return -BCH_ERR_invalid_sb_features;
+	}
+
+	block_size = le16_to_cpu(sb->block_size);
+
+	if (block_size > PAGE_SECTORS) {
+		prt_printf(out, "Block size too big (got %u, max %u)",
+		       block_size, PAGE_SECTORS);
+		return -BCH_ERR_invalid_sb_block_size;
+	}
+
+	if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) {
+		prt_printf(out, "Bad user UUID (got zeroes)");
+		return -BCH_ERR_invalid_sb_uuid;
+	}
+
+	if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) {
+		prt_printf(out, "Bad internal UUID (got zeroes)");
+		return -BCH_ERR_invalid_sb_uuid;
+	}
+
+	if (!sb->nr_devices ||
+	    sb->nr_devices > BCH_SB_MEMBERS_MAX) {
+		prt_printf(out, "Bad number of member devices %u (max %u)",
+		       sb->nr_devices, BCH_SB_MEMBERS_MAX);
+		return -BCH_ERR_invalid_sb_too_many_members;
+	}
+
+	if (sb->dev_idx >= sb->nr_devices) {
+		prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)",
+		       sb->dev_idx, sb->nr_devices);
+		return -BCH_ERR_invalid_sb_dev_idx;
+	}
+
+	if (!sb->time_precision ||
+	    le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) {
+		prt_printf(out, "Invalid time precision: %u (min 1, max %lu)",
+		       le32_to_cpu(sb->time_precision), NSEC_PER_SEC);
+		return -BCH_ERR_invalid_sb_time_precision;
+	}
+
+	if (rw == READ) {
+		/*
+		 * Been seeing a bug where these are getting inexplicably
+		 * zeroed, so we're now validating them, but we have to be
+		 * careful not to prevent people's filesystems from mounting:
+		 */
+		if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb))
+			SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);
+		if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
+			SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000);
+
+		if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb))
+			SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version));
+	}
+	/* Every option that is stored in the superblock must hold a valid value: */
+	for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
+		const struct bch_option *opt = bch2_opt_table + opt_id;
+
+		if (opt->get_sb != BCH2_NO_SB_OPT) {
+			u64 v = bch2_opt_from_sb(sb, opt_id);
+			/* the prefix is printed up front and discarded again if the option is fine: */
+			prt_printf(out, "Invalid option ");
+			ret = bch2_opt_validate(opt, v, out);
+			if (ret)
+				return ret;
+
+			printbuf_reset(out);
+		}
+	}
+
+	/* validate layout */
+	ret = validate_sb_layout(&sb->layout, out);
+	if (ret)
+		return ret;
+
+	vstruct_for_each(sb, f) {
+		if (!f->u64s) {
+			prt_printf(out, "Invalid superblock: optional field with size 0 (type %u)",
+			       le32_to_cpu(f->type));
+			return -BCH_ERR_invalid_sb_field_size;
+		}
+
+		if (vstruct_next(f) > vstruct_last(sb)) {
+			prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)",
+			       le32_to_cpu(f->type));
+			return -BCH_ERR_invalid_sb_field_size;
+		}
+	}
+
+	/* members must be validated first: */
+	mi = bch2_sb_field_get(sb, members_v1);
+	if (!mi) {
+		prt_printf(out, "Invalid superblock: member info area missing");
+		return -BCH_ERR_invalid_sb_members_missing;
+	}
+
+	ret = bch2_sb_field_validate(sb, &mi->field, out);
+	if (ret)
+		return ret;
+
+	vstruct_for_each(sb, f) {
+		if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1)
+			continue;
+
+		ret = bch2_sb_field_validate(sb, f, out);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/* device open: */
+
+/* Refresh the in-memory c->sb and each member device's ca->mi from c->disk_sb: */
+static void bch2_sb_update(struct bch_fs *c)
+{
+	struct bch_sb *disk = c->disk_sb.sb;
+	struct bch_dev *ca;
+	unsigned i;
+
+	lockdep_assert_held(&c->sb_lock);
+
+	c->sb.uuid = disk->uuid;
+	c->sb.user_uuid = disk->user_uuid;
+	c->sb.nr_devices = disk->nr_devices;
+	c->sb.version = le16_to_cpu(disk->version);
+	c->sb.version_min = le16_to_cpu(disk->version_min);
+	c->sb.features = le64_to_cpu(disk->features[0]);
+	c->sb.compat = le64_to_cpu(disk->compat[0]);
+
+	c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(disk);
+	c->sb.clean = BCH_SB_CLEAN(disk);
+	c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(disk);
+
+	c->sb.nsec_per_time_unit = le32_to_cpu(disk->time_precision);
+	c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit;
+	c->sb.time_base_hi = le32_to_cpu(disk->time_base_hi);
+	/* XXX this is wrong, we need a 96 or 128 bit integer type */
+	c->sb.time_base_lo = div_u64(le64_to_cpu(disk->time_base_lo),
+				     c->sb.nsec_per_time_unit);
+
+	for_each_member_device(ca, c, i) {
+		struct bch_member member = bch2_sb_member_get(disk, i);
+		ca->mi = bch2_mi_to_cpu(&member);
+	}
+}
+
+static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
+{
+	struct bch_sb_field *src_f, *dst_f;
+	struct bch_sb *dst = dst_handle->sb;
+	unsigned i;
+	/* Copy the header fields listed below from @src, then every field that isn't per-device: */
+	dst->version		= src->version;
+	dst->version_min	= src->version_min;
+	dst->seq		= src->seq;
+	dst->uuid		= src->uuid;
+	dst->user_uuid		= src->user_uuid;
+	memcpy(dst->label,	src->label, sizeof(dst->label));
+
+	dst->block_size		= src->block_size;
+	dst->nr_devices		= src->nr_devices;
+
+	dst->time_base_lo	= src->time_base_lo;
+	dst->time_base_hi	= src->time_base_hi;
+	dst->time_precision	= src->time_precision;
+
+	memcpy(dst->flags,	src->flags,	sizeof(dst->flags));
+	memcpy(dst->features,	src->features,	sizeof(dst->features));
+	memcpy(dst->compat,	src->compat,	sizeof(dst->compat));
+
+	for (i = 0; i < BCH_SB_FIELD_NR; i++) {
+		int d;
+		/* field types in BCH_SINGLE_DEVICE_SB_FIELDS are left untouched in the destination: */
+		if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS)
+			continue;
+
+		src_f = bch2_sb_field_get_id(src, i);
+		dst_f = bch2_sb_field_get_id(dst, i);
+		/* d: number of u64s by which the destination field has to grow (if positive): */
+		d = (src_f ? le32_to_cpu(src_f->u64s) : 0) -
+		    (dst_f ? le32_to_cpu(dst_f->u64s) : 0);
+		if (d > 0) {
+			int ret = bch2_sb_realloc(dst_handle,
+					le32_to_cpu(dst_handle->sb->u64s) + d);
+
+			if (ret)
+				return ret;
+			/* re-read our pointers into dst_handle->sb after the realloc: */
+			dst = dst_handle->sb;
+			dst_f = bch2_sb_field_get_id(dst, i);
+		}
+
+		dst_f = __bch2_sb_field_resize(dst_handle, dst_f,
+				src_f ? le32_to_cpu(src_f->u64s) : 0);
+
+		if (src_f)
+			memcpy(dst_f, src_f, vstruct_bytes(src_f));
+	}
+
+	return 0;
+}
+
+/* Copy @src into c->disk_sb, then refresh the replicas, disk groups and c->sb built from it: */
+int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
+{
+	int ret;
+
+	lockdep_assert_held(&c->sb_lock);
+
+	ret = bch2_sb_realloc(&c->disk_sb, 0);
+	ret = ret ?: __copy_super(&c->disk_sb, src);
+	ret = ret ?: bch2_sb_replicas_to_cpu_replicas(c);
+	ret = ret ?: bch2_sb_disk_groups_to_cpu(c);
+
+	if (!ret)
+		bch2_sb_update(c);
+	return ret;
+}
+
+int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
+{
+	return __copy_super(&ca->disk_sb, c->disk_sb.sb);	/* filesystem sb -> @ca's sb buffer */
+}
+
+/* read superblock: */
+
+/* Read the superblock at sector @offset and check magic, version, size and checksum: */
+static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err)
+{
+	struct bch_csum csum;
+	unsigned long max_bytes;
+	size_t bytes;
+	int ret;
+reread:
+	bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
+	sb->bio->bi_iter.bi_sector = offset;
+	bch2_bio_map(sb->bio, sb->sb, sb->buffer_size);
+
+	ret = submit_bio_wait(sb->bio);
+	if (ret) {
+		prt_printf(err, "IO error: %i", ret);
+		return ret;
+	}
+
+	if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) &&
+	    !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) {
+		prt_printf(err, "Not a bcachefs superblock");
+		return -BCH_ERR_invalid_sb_magic;
+	}
+
+	ret = bch2_sb_compatible(sb->sb, err);
+	if (ret)
+		return ret;
+
+	/* unvalidated on disk value: clamp it so that the shift can't overflow: */
+	max_bytes = 512UL << min_t(unsigned, sb->sb->layout.sb_max_size_bits, BITS_PER_LONG - 10);
+	bytes = vstruct_bytes(sb->sb);
+	if (bytes > max_bytes) {
+		prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)",
+		       bytes, max_bytes);
+		return -BCH_ERR_invalid_sb_too_big;
+	}
+
+	if (bytes > sb->buffer_size) {
+		ret = bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s));
+		if (ret)
+			return ret;
+		goto reread;
+	}
+
+	if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) {
+		prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
+		return -BCH_ERR_invalid_sb_csum_type;
+	}
+
+	/* XXX: verify MACs */
+	csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), null_nonce(), sb->sb);
+	if (bch2_crc_cmp(csum, sb->sb->csum)) {
+		prt_printf(err, "bad checksum");
+		return -BCH_ERR_invalid_sb_csum;
+	}
+
+	sb->seq = le64_to_cpu(sb->sb->seq);
+	return 0;
+}
+
+/*
+ * Open @path and read and validate its superblock into @sb, falling back to the
+ * backup superblocks from the layout unless an sb offset was given in @opts.
+ * Returns 0 on success; after a read or validation error @sb has been freed.
+ */
+int bch2_read_super(const char *path, struct bch_opts *opts, struct bch_sb_handle *sb)
+{
+	u64 offset = opt_get(*opts, sb);
+	struct bch_sb_layout layout;
+	struct printbuf err = PRINTBUF;
+	__le64 *i;
+	int ret, primary_ret;
+#ifndef __KERNEL__
+retry:
+#endif
+	memset(sb, 0, sizeof(*sb));
+	sb->mode	= BLK_OPEN_READ;
+	sb->have_bio	= true;
+	sb->holder	= kmalloc(1, GFP_KERNEL);
+	if (!sb->holder)
+		return -ENOMEM;
+
+#ifndef __KERNEL__
+	if (opt_get(*opts, direct_io) == false)
+		sb->mode |= BLK_OPEN_BUFFERED;
+#endif
+
+	if (!opt_get(*opts, noexcl))
+		sb->mode |= BLK_OPEN_EXCL;
+
+	if (!opt_get(*opts, nochanges))
+		sb->mode |= BLK_OPEN_WRITE;
+
+	sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+	if (IS_ERR(sb->bdev) && PTR_ERR(sb->bdev) == -EACCES &&
+	    opt_get(*opts, read_only)) {
+		sb->mode &= ~BLK_OPEN_WRITE;
+		sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+		if (!IS_ERR(sb->bdev))
+			opt_set(*opts, nochanges, true);
+	}
+
+	if (IS_ERR(sb->bdev)) {
+		ret = PTR_ERR(sb->bdev);
+		goto out;
+	}
+
+	ret = bch2_sb_realloc(sb, 0);
+	if (ret) {
+		prt_printf(&err, "error allocating memory for superblock");
+		goto err;
+	}
+
+	if (bch2_fs_init_fault("read_super")) {
+		prt_printf(&err, "dynamic fault");
+		ret = -EFAULT;
+		goto err;
+	}
+
+	ret = read_one_super(sb, offset, &err);
+	if (!ret)
+		goto got_super;
+
+	if (opt_defined(*opts, sb))
+		goto err;
+
+	primary_ret = ret;
+	printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s", path, err.buf);
+	printbuf_reset(&err);
+
+	/*
+	 * Error reading primary superblock - read location of backup
+	 * superblocks:
+	 */
+	bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
+	sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
+	/*
+	 * use sb buffer to read layout, since sb buffer is page aligned but
+	 * layout won't be:
+	 */
+	bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout));
+
+	ret = submit_bio_wait(sb->bio);
+	if (ret) {
+		prt_printf(&err, "IO error: %i", ret);
+		goto err;
+	}
+
+	memcpy(&layout, sb->sb, sizeof(layout));
+	ret = validate_sb_layout(&layout, &err);
+	if (ret)
+		goto err;
+
+	for (i = layout.sb_offset; i < layout.sb_offset + layout.nr_superblocks; i++) {
+		offset = le64_to_cpu(*i);
+		if (offset == opt_get(*opts, sb))
+			continue;
+		ret = read_one_super(sb, offset, &err);
+		if (!ret)
+			goto got_super;
+	}
+
+	/* Every listed location may have been skipped, leaving ret == 0: that is not success */
+	if (!ret) {
+		prt_printf(&err, "no backup superblocks to try");
+		ret = primary_ret;
+	}
+	goto err;
+
+got_super:
+	if (le16_to_cpu(sb->sb->block_size) << 9 < bdev_logical_block_size(sb->bdev) &&
+	    opt_get(*opts, direct_io)) {
+#ifndef __KERNEL__
+		opt_set(*opts, direct_io, false);
+		bch2_free_super(sb);
+		goto retry;
+#endif
+		prt_printf(&err, "block size (%u) smaller than device block size (%u)",
+		       le16_to_cpu(sb->sb->block_size) << 9,
+		       bdev_logical_block_size(sb->bdev));
+		ret = -BCH_ERR_block_size_too_small;
+		goto err;
+	}
+
+	sb->have_layout = true;
+	ret = bch2_sb_validate(sb, &err, READ);
+	if (ret) {
+		printk(KERN_ERR "bcachefs (%s): error validating superblock: %s",
+		       path, err.buf);
+		goto err_no_print;
+	}
+out:
+	printbuf_exit(&err);
+	return ret;
+err:
+	printk(KERN_ERR "bcachefs (%s): error reading superblock: %s", path, err.buf);
+err_no_print:
+	bch2_free_super(sb);
+	goto out;
+}
+
+/* write superblock: */
+
+/* Completion for superblock bios: note any error, then drop the closure ref and io_ref: */
+static void write_super_endio(struct bio *bio)
+{
+	struct bch_dev *dev = bio->bi_private;
+
+	/* XXX: return errors directly */
+	if (bch2_dev_io_err_on(bio->bi_status, dev, "superblock write error: %s",
+			       bch2_blk_status_to_str(bio->bi_status)))
+		dev->sb_write_error = 1;
+
+	closure_put(&dev->fs->sb_write);
+	percpu_ref_put(&dev->io_ref);
+}
+
+/* Submit a read of @ca's first superblock (PAGE_SIZE bytes) into ca->sb_read_scratch: */
+static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct bch_sb_handle *handle = &ca->disk_sb;
+	struct bio *bio = handle->bio;
+
+	bio_reset(bio, handle->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
+	bio->bi_private		= ca;
+	bio->bi_end_io		= write_super_endio;
+	bio->bi_iter.bi_sector	= le64_to_cpu(handle->sb->layout.sb_offset[0]);
+	bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE);
+
+	this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio));
+
+	percpu_ref_get(&ca->io_ref);
+	closure_bio_submit(bio, &c->sb_write);
+}
+
+/* Checksum @ca's superblock and submit a write of it to layout slot @idx: */
+static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
+{
+	struct block_device *bdev = ca->disk_sb.bdev;
+	struct bch_sb *sb = ca->disk_sb.sb;
+	struct bio *bio = ca->disk_sb.bio;
+	size_t io_bytes;
+
+	/* sb->offset is set for this copy before the checksum is computed: */
+	sb->offset = sb->layout.sb_offset[idx];
+	SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false));
+	sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), null_nonce(), sb);
+
+	/* I/O length is the superblock size rounded up to the logical block size: */
+	io_bytes = roundup((size_t) vstruct_bytes(sb), bdev_logical_block_size(bdev));
+	bio_reset(bio, bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
+	bio->bi_private		= ca;
+	bio->bi_end_io		= write_super_endio;
+	bio->bi_iter.bi_sector	= le64_to_cpu(sb->offset);
+	bch2_bio_map(bio, sb, io_bytes);
+
+	this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], bio_sectors(bio));
+	percpu_ref_get(&ca->io_ref);
+	closure_bio_submit(bio, &c->sb_write);
+}
+
+/*
+ * Write c->disk_sb to all online members (caller holds c->sb_lock); returns 0 or an error.
+ * Nothing is written if opts.nochanges is set or the sb isn't marked initialized yet.
+ */
+int bch2_write_super(struct bch_fs *c)
+{
+	struct closure *cl = &c->sb_write;
+	struct bch_dev *ca;
+	struct printbuf err = PRINTBUF;
+	unsigned i, sb = 0, nr_wrote;
+	struct bch_devs_mask sb_written;
+	bool wrote, can_mount_without_written, can_mount_with_written;
+	unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
+	int ret = 0;
+
+	trace_and_count(c, write_super, c, _RET_IP_);
+
+	if (c->opts.very_degraded)
+		degraded_flags |= BCH_FORCE_IF_LOST;
+
+	lockdep_assert_held(&c->sb_lock);
+	closure_init_stack(cl);
+	memset(&sb_written, 0, sizeof(sb_written));
+
+	/* Make sure we're using the new magic numbers: */
+	c->disk_sb.sb->magic = BCHFS_MAGIC;
+	c->disk_sb.sb->layout.magic = BCHFS_MAGIC;
+	le64_add_cpu(&c->disk_sb.sb->seq, 1);
+
+	if (test_bit(BCH_FS_ERROR, &c->flags))
+		SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1);
+	if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags))
+		SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1);
+	SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
+	bch2_sb_counters_from_cpu(c);
+	bch_members_cpy_v2_v1(&c->disk_sb);
+
+	/* The copy can fail (it may need to grow the device's buffer): never write a partial one */
+	for_each_online_member(ca, c, i) {
+		ret = bch2_sb_from_fs(c, ca);
+		if (ret) {
+			percpu_ref_put(&ca->io_ref);
+			goto out;
+		}
+	}
+
+	for_each_online_member(ca, c, i) {
+		printbuf_reset(&err);
+		ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE);
+		if (ret) {
+			bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
+			percpu_ref_put(&ca->io_ref);
+			goto out;
+		}
+	}
+
+	if (c->opts.nochanges)
+		goto out;
+
+	/*
+	 * Defer writing the superblock until filesystem initialization is
+	 * complete - don't write out a partly initialized superblock:
+	 */
+	if (!BCH_SB_INITIALIZED(c->disk_sb.sb))
+		goto out;
+
+	for_each_online_member(ca, c, i) {
+		__set_bit(ca->dev_idx, sb_written.d);
+		ca->sb_write_error = 0;
+	}
+
+	for_each_online_member(ca, c, i)
+		read_back_super(c, ca);
+	closure_sync(cl);
+
+	for_each_online_member(ca, c, i) {
+		if (ca->sb_write_error)
+			continue;
+		if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) {
+			bch2_fs_fatal_error(c,
+				"Superblock write was silently dropped! (seq %llu expected %llu)",
+				le64_to_cpu(ca->sb_read_scratch->seq),
+				ca->disk_sb.seq);
+			percpu_ref_put(&ca->io_ref);
+			ret = -BCH_ERR_erofs_sb_err;
+			goto out;
+		}
+
+		if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
+			bch2_fs_fatal_error(c,
+				"Superblock modified by another process (seq %llu expected %llu)",
+				le64_to_cpu(ca->sb_read_scratch->seq),
+				ca->disk_sb.seq);
+			percpu_ref_put(&ca->io_ref);
+			ret = -BCH_ERR_erofs_sb_err;
+			goto out;
+		}
+	}
+
+	do {
+		wrote = false;
+		for_each_online_member(ca, c, i)
+			if (!ca->sb_write_error &&
+			    sb < ca->disk_sb.sb->layout.nr_superblocks) {
+				write_one_super(c, ca, sb);
+				wrote = true;
+			}
+		closure_sync(cl);
+		sb++;
+	} while (wrote);
+
+	for_each_online_member(ca, c, i) {
+		if (ca->sb_write_error)
+			__clear_bit(ca->dev_idx, sb_written.d);
+		else
+			ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq);
+	}
+
+	nr_wrote = dev_mask_nr(&sb_written);
+	can_mount_with_written = bch2_have_enough_devs(c, sb_written, degraded_flags, false);
+
+	for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
+		sb_written.d[i] = ~sb_written.d[i];
+	can_mount_without_written = bch2_have_enough_devs(c, sb_written, degraded_flags, false);
+
+	/*
+	 * If we would be able to mount _without_ the devices we successfully
+	 * wrote superblocks to, we weren't able to write to enough devices:
+	 *
+	 * Exception: if we can mount without the successes because we haven't
+	 * written anything (new filesystem), we continue if we'd be able to
+	 * mount with the devices we did successfully write to:
+	 */
+	if (bch2_fs_fatal_err_on(!nr_wrote ||
+				 !can_mount_with_written ||
+				 (can_mount_without_written &&
+				  !can_mount_with_written), c,
+		"Unable to write superblock to sufficient devices (from %ps)",
+		(void *) _RET_IP_))
+		ret = -BCH_ERR_erofs_sb_err;
+out:
+	/* Make new options visible after they're persistent: */
+	bch2_sb_update(c);
+	printbuf_exit(&err);
+	return ret;
+}
+
+/* Under sb_lock: if feature bit @feat isn't set yet, set it and write the superblock: */
+void __bch2_check_set_feature(struct bch_fs *c, unsigned feat)
+{
+	mutex_lock(&c->sb_lock);
+	if (!(c->sb.features & (1ULL << feat))) {
+		c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat);
+		bch2_write_super(c);
+	}
+	mutex_unlock(&c->sb_lock);
+}
+
+/* Downgrade if superblock is at a higher version than currently supported: */
+void bch2_sb_maybe_downgrade(struct bch_fs *c)
+{
+	struct bch_sb *sb = c->disk_sb.sb;
+
+	lockdep_assert_held(&c->sb_lock);
+
+	if (BCH_SB_VERSION_UPGRADE_COMPLETE(sb) > bcachefs_metadata_version_current)
+		SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, bcachefs_metadata_version_current);
+	if (c->sb.version > bcachefs_metadata_version_current)
+		sb->version = cpu_to_le16(bcachefs_metadata_version_current);
+	if (c->sb.version_min > bcachefs_metadata_version_current)
+		sb->version_min = cpu_to_le16(bcachefs_metadata_version_current);
+
+	/* Clear compat bits numbered BCH_COMPAT_NR and above: */
+	sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1);
+}
+
+/* Set the superblock version to @new_version and turn on all of BCH_SB_FEATURES_ALL: */
+void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version)
+{
+	lockdep_assert_held(&c->sb_lock);
+	c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
+	c->disk_sb.sb->version = cpu_to_le16(new_version);
+}
+
+/* Per field type ops, indexed by BCH_SB_FIELD_* and generated from BCH_SB_FIELDS(): */
+static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
+#define x(f, nr)					\
+	[BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
+	BCH_SB_FIELDS()
+#undef x
+};
+static const struct bch_sb_field_ops bch2_sb_field_null_ops;
+/* Ops for field @type; types past the end of the table get the method-less null ops: */
+static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type)
+{
+	if (unlikely(type >= ARRAY_SIZE(bch2_sb_field_ops)))
+		return &bch2_sb_field_null_ops;
+	return bch2_sb_field_ops[type];
+}
+
+/* Run the type specific validator for @f, if any; on failure describe the field in @err: */
+static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, struct printbuf *err)
+{
+	unsigned field_type = le32_to_cpu(f->type);
+	const struct bch_sb_field_ops *field_ops = bch2_sb_field_type_ops(field_type);
+	struct printbuf why = PRINTBUF;
+	int ret = 0;
+
+	if (field_ops->validate)
+		ret = field_ops->validate(sb, f, &why);
+	if (ret) {
+		prt_printf(err, "Invalid superblock section %s: %s",
+			   bch2_sb_fields[field_type], why.buf);
+		prt_newline(err);
+		bch2_sb_field_to_text(err, sb, f);
+	}
+	printbuf_exit(&why);
+	return ret;
+}
+
+/* Print a heading for @f (its name or type number, and size), then its contents if known: */
+void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f)
+{
+	unsigned field_type = le32_to_cpu(f->type);
+	const struct bch_sb_field_ops *field_ops = bch2_sb_field_type_ops(field_type);
+
+	if (!out->nr_tabstops)
+		printbuf_tabstop_push(out, 32);
+
+	if (field_type >= BCH_SB_FIELD_NR)
+		prt_printf(out, "(unknown field %u)", field_type);
+	else
+		prt_printf(out, "%s", bch2_sb_fields[field_type]);
+
+	prt_printf(out, " (size %zu):", vstruct_bytes(f));
+	prt_newline(out);
+
+	if (!field_ops->to_text)
+		return;
+	printbuf_indent_add(out, 2);
+	field_ops->to_text(out, sb, f);
+	printbuf_indent_sub(out, 2);
+}
+
+/* Print a human readable description of superblock layout @l to @out: */
+void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
+{
+	unsigned i, nr = min_t(unsigned, l->nr_superblocks, ARRAY_SIZE(l->sb_offset));
+
+	prt_printf(out, "Type:                    %u", l->layout_type);
+	prt_newline(out);
+
+	/* 64 bit shift, clamped (9 + 54 = 63) so a garbage sb_max_size_bits can't overflow it: */
+	prt_str(out, "Superblock max size:     ");
+	prt_units_u64(out, 512ULL << min_t(unsigned, l->sb_max_size_bits, 54));
+	prt_newline(out);
+
+	prt_printf(out, "Nr superblocks:          %u", l->nr_superblocks);
+	prt_newline(out);
+
+	/* nr is clamped above so that we never read past the end of sb_offset[]: */
+	prt_str(out, "Offsets:                 ");
+	for (i = 0; i < nr; i++)
+		prt_printf(out, "%s%llu", i ? ", " : "", le64_to_cpu(l->sb_offset[i]));
+	prt_newline(out);
+}
+
+/* Print @sb to @out: header, options, layout if @print_layout, fields selected by @fields: */
+void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
+		     bool print_layout, unsigned fields)
+{
+	struct bch_sb_field *f;
+	enum bch_opt_id id;
+	u64 fields_have = 0;
+	unsigned nr_devices = 0;
+
+	if (!out->nr_tabstops)
+		printbuf_tabstop_push(out, 44);
+
+	for (int i = 0; i < sb->nr_devices; i++)
+		nr_devices += bch2_dev_exists(sb, i);
+
+	prt_printf(out, "External UUID:");
+	prt_tab(out);
+	pr_uuid(out, sb->user_uuid.b);
+	prt_newline(out);
+
+	prt_printf(out, "Internal UUID:");
+	prt_tab(out);
+	pr_uuid(out, sb->uuid.b);
+	prt_newline(out);
+
+	prt_str(out, "Device index:");
+	prt_tab(out);
+	prt_printf(out, "%u", sb->dev_idx);
+	prt_newline(out);
+
+	prt_str(out, "Label:");
+	prt_tab(out);
+	prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label);
+	prt_newline(out);
+
+	prt_str(out, "Version:");
+	prt_tab(out);
+	bch2_version_to_text(out, le16_to_cpu(sb->version));
+	prt_newline(out);
+
+	prt_str(out, "Version upgrade complete:");
+	prt_tab(out);
+	bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb));
+	prt_newline(out);
+
+	prt_printf(out, "Oldest version on disk:");
+	prt_tab(out);
+	bch2_version_to_text(out, le16_to_cpu(sb->version_min));
+	prt_newline(out);
+
+	prt_printf(out, "Created:");
+	prt_tab(out);
+	if (sb->time_base_lo)
+		pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
+	else
+		prt_printf(out, "(not set)");
+	prt_newline(out);
+
+	prt_printf(out, "Sequence number:");
+	prt_tab(out);
+	prt_printf(out, "%llu", le64_to_cpu(sb->seq));
+	prt_newline(out);
+
+	prt_printf(out, "Superblock size:");
+	prt_tab(out);
+	prt_printf(out, "%zu", vstruct_bytes(sb));
+	prt_newline(out);
+
+	prt_printf(out, "Clean:");
+	prt_tab(out);
+	prt_printf(out, "%llu", BCH_SB_CLEAN(sb));
+	prt_newline(out);
+
+	prt_printf(out, "Devices:");
+	prt_tab(out);
+	prt_printf(out, "%u", nr_devices);
+	prt_newline(out);
+
+	prt_printf(out, "Sections:");
+	/* 64 bit mask: a type that doesn't fit is skipped instead of shifted out of range */
+	vstruct_for_each(sb, f)
+		if (le32_to_cpu(f->type) < 64)
+			fields_have |= 1ULL << le32_to_cpu(f->type);
+	prt_tab(out);
+	prt_bitflags(out, bch2_sb_fields, fields_have);
+	prt_newline(out);
+
+	prt_printf(out, "Features:");
+	prt_tab(out);
+	prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0]));
+	prt_newline(out);
+
+	prt_printf(out, "Compat features:");
+	prt_tab(out);
+	prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0]));
+	prt_newline(out);
+
+	prt_newline(out);
+	prt_printf(out, "Options:");
+	prt_newline(out);
+	printbuf_indent_add(out, 2);
+	for (id = 0; id < bch2_opts_nr; id++) {
+		const struct bch_option *opt = bch2_opt_table + id;
+
+		if (opt->get_sb == BCH2_NO_SB_OPT)
+			continue;
+
+		prt_printf(out, "%s:", opt->attr.name);
+		prt_tab(out);
+		bch2_opt_to_text(out, NULL, sb, opt, bch2_opt_from_sb(sb, id),
+				 OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST);
+		prt_newline(out);
+	}
+	printbuf_indent_sub(out, 2);
+
+	if (print_layout) {
+		prt_newline(out);
+		prt_printf(out, "layout:");
+		prt_newline(out);
+		printbuf_indent_add(out, 2);
+		bch2_sb_layout_to_text(out, &sb->layout);
+		printbuf_indent_sub(out, 2);
+	}
+
+	/* @fields is a 32 bit mask: a type of 32 or more can't have been selected */
+	vstruct_for_each(sb, f)
+		if (le32_to_cpu(f->type) < 32 &&
+		    (fields & (1U << le32_to_cpu(f->type)))) {
+			prt_newline(out);
+			bch2_sb_field_to_text(out, sb, f);
+		}
+}
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
new file mode 100644
index 000000000000..b0d8584f475f
--- /dev/null
+++ b/fs/bcachefs/super-io.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUPER_IO_H
+#define _BCACHEFS_SUPER_IO_H
+
+#include "extents.h"
+#include "eytzinger.h"
+#include "super_types.h"
+#include "super.h"
+#include "sb-members.h"
+
+#include <asm/byteorder.h>
+
+static inline bool bch2_version_compatible(u16 version)
+{
+	return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) &&
+		version >= bcachefs_metadata_version_min;	/* major not newer than ours, not below min */
+}
+
+void bch2_version_to_text(struct printbuf *, unsigned);
+unsigned bch2_latest_compatible_version(unsigned);
+
+u64 bch2_upgrade_recovery_passes(struct bch_fs *c,
+				 unsigned,
+				 unsigned);
+
+#define field_to_type(_f, _name)					\
+	container_of_or_null(_f, struct bch_sb_field_##_name, field)
+
+struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *, enum bch_sb_field_type);
+#define bch2_sb_field_get(_sb, _name)					\
+	field_to_type(bch2_sb_field_get_id(_sb, BCH_SB_FIELD_##_name), _name)
+
+struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *,
+					     enum bch_sb_field_type, unsigned);
+#define bch2_sb_field_resize(_sb, _name, _u64s)				\
+	field_to_type(bch2_sb_field_resize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name)
+
+void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type);
+
+extern const char * const bch2_sb_fields[];
+
+struct bch_sb_field_ops {
+	int	(*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *); /* optional; nonzero = invalid */
+	void	(*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *); /* optional pretty printer */
+};
+
+/* First 8 bytes of the filesystem's internal UUID; jset/bset magics xor a constant into it: */
+static inline __le64 bch2_sb_magic(struct bch_fs *c)
+{
+	__le64 magic;
+	memcpy(&magic, &c->sb.uuid, sizeof(magic));
+	return magic;
+}
+
+static inline __u64 jset_magic(struct bch_fs *c)
+{
+	return __le64_to_cpu(JSET_MAGIC ^ bch2_sb_magic(c));
+}
+
+static inline __u64 bset_magic(struct bch_fs *c)
+{
+	return __le64_to_cpu(BSET_MAGIC ^ bch2_sb_magic(c));
+}
+
+int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *);
+int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
+
+void bch2_free_super(struct bch_sb_handle *);
+int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
+
+int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
+int bch2_write_super(struct bch_fs *);
+void __bch2_check_set_feature(struct bch_fs *, unsigned);
+
+static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
+{
+	if (!(c->sb.features & (1ULL << feat)))	/* check without the lock; rechecked under sb_lock */
+		__bch2_check_set_feature(c, feat);
+}
+
+/* BCH_SB_FIELD_members_v1: */
+
+static inline bool bch2_member_exists(struct bch_member *m)
+{
+	return !bch2_is_zero(&m->uuid, sizeof(m->uuid));	/* exists iff its UUID is not all zeroes */
+}
+
+/* True if @dev is below sb->nr_devices and that member's UUID is set: */
+static inline bool bch2_dev_exists(struct bch_sb *sb, unsigned dev)
+{
+	if (dev < sb->nr_devices) {
+		struct bch_member member = bch2_sb_member_get(sb, dev);
+		return bch2_member_exists(&member);
+	}
+	return false;
+}
+
+/* Unpack on-disk member info @mi into its in-memory (CPU endian) representation: */
+static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
+{
+	return (struct bch_member_cpu) {
+		.nbuckets = le64_to_cpu(mi->nbuckets),
+		.first_bucket = le16_to_cpu(mi->first_bucket),
+		.bucket_size = le16_to_cpu(mi->bucket_size),
+		.group = BCH_MEMBER_GROUP(mi),
+		.state = BCH_MEMBER_STATE(mi),
+		.discard = BCH_MEMBER_DISCARD(mi),
+		.data_allowed = BCH_MEMBER_DATA_ALLOWED(mi),
+		/* a nonzero stored durability is one more than the real value; zero maps to 1 */
+		.durability = BCH_MEMBER_DURABILITY(mi) ? BCH_MEMBER_DURABILITY(mi) - 1 : 1,
+		.freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
+		.valid = bch2_member_exists(mi),
+	};
+}
+
+void bch2_sb_maybe_downgrade(struct bch_fs *);
+void bch2_sb_upgrade(struct bch_fs *, unsigned);
+
+void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
+			   struct bch_sb_field *);
+void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *);
+void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned);
+
+#endif /* _BCACHEFS_SUPER_IO_H */
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
new file mode 100644
index 000000000000..0e85c22672be
--- /dev/null
+++ b/fs/bcachefs/super.c
@@ -0,0 +1,2022 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * bcachefs setup/teardown code, and some metadata io - read a superblock and
+ * figure out what to do with it.
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "bkey_sort.h"
+#include "btree_cache.h"
+#include "btree_gc.h"
+#include "btree_journal_iter.h"
+#include "btree_key_cache.h"
+#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "btree_write_buffer.h"
+#include "buckets_waiting_for_journal.h"
+#include "chardev.h"
+#include "checksum.h"
+#include "clock.h"
+#include "compress.h"
+#include "counters.h"
+#include "debug.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "errcode.h"
+#include "error.h"
+#include "fs.h"
+#include "fs-io.h"
+#include "fs-io-buffered.h"
+#include "fs-io-direct.h"
+#include "fsck.h"
+#include "inode.h"
+#include "io_read.h"
+#include "io_write.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+#include "move.h"
+#include "migrate.h"
+#include "movinggc.h"
+#include "nocow_locking.h"
+#include "quota.h"
+#include "rebalance.h"
+#include "recovery.h"
+#include "replicas.h"
+#include "sb-clean.h"
+#include "sb-members.h"
+#include "snapshot.h"
+#include "subvolume.h"
+#include "super.h"
+#include "super-io.h"
+#include "sysfs.h"
+#include "trace.h"
+
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/idr.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/random.h>
+#include <linux/sysfs.h>
+#include <crypto/hash.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+MODULE_DESCRIPTION("bcachefs filesystem");
+
+#define KTYPE(type)	/* emits type##_group, type##_groups[] and type##_ktype; needs type##_files, type##_release, type##_sysfs_ops */	\
+static const struct attribute_group type ## _group = {			\
+	.attrs = type ## _files						\
+};									\
+									\
+static const struct attribute_group *type ## _groups[] = {		\
+	&type ## _group,						\
+	NULL								\
+};									\
+									\
+static const struct kobj_type type ## _ktype = {			\
+	.release	= type ## _release,				\
+	.sysfs_ops	= &type ## _sysfs_ops,				\
+	.default_groups = type ## _groups				\
+}
+
+static void bch2_fs_release(struct kobject *);
+static void bch2_dev_release(struct kobject *);
+static void bch2_fs_counters_release(struct kobject *k)
+{	/* no-op; becomes bch2_fs_counters_ktype.release via KTYPE() */
+}
+
+static void bch2_fs_internal_release(struct kobject *k)
+{	/* no-op; becomes bch2_fs_internal_ktype.release via KTYPE() */
+}
+
+static void bch2_fs_opts_dir_release(struct kobject *k)
+{	/* no-op; becomes bch2_fs_opts_dir_ktype.release via KTYPE() */
+}
+
+static void bch2_fs_time_stats_release(struct kobject *k)
+{	/* no-op; becomes bch2_fs_time_stats_ktype.release via KTYPE() */
+}
+
+KTYPE(bch2_fs);
+KTYPE(bch2_fs_counters);
+KTYPE(bch2_fs_internal);
+KTYPE(bch2_fs_opts_dir);
+KTYPE(bch2_fs_time_stats);
+KTYPE(bch2_dev);
+
+static struct kset *bcachefs_kset;
+static LIST_HEAD(bch_fs_list);
+static DEFINE_MUTEX(bch_fs_list_lock);
+
+DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait);
+
+static void bch2_dev_free(struct bch_dev *);
+static int bch2_dev_alloc(struct bch_fs *, unsigned);
+static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
+static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
+
+struct bch_fs *bch2_dev_to_fs(dev_t dev)
+{
+	struct bch_fs *c;
+	struct bch_dev *ca;
+	unsigned i;
+
+	mutex_lock(&bch_fs_list_lock);
+	rcu_read_lock();
+
+	list_for_each_entry(c, &bch_fs_list, list)
+		for_each_member_device_rcu(ca, c, i, NULL)
+			if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) {
+				closure_get(&c->cl);	/* the returned fs carries an extra c->cl ref */
+				goto found;
+			}
+	c = NULL;	/* both loops ran to completion: no member has this dev_t */
+found:
+	rcu_read_unlock();
+	mutex_unlock(&bch_fs_list_lock);
+
+	return c;
+}
+
+static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)
+{
+	struct bch_fs *fs;
+
+	lockdep_assert_held(&bch_fs_list_lock);	/* caller must hold the list lock */
+
+	list_for_each_entry(fs, &bch_fs_list, list)
+		if (memcmp(&fs->disk_sb.sb->uuid, &uuid, sizeof(uuid)) == 0)
+			return fs;
+
+	return NULL;	/* no filesystem on bch_fs_list has this UUID */
+}
+
+struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
+{
+	struct bch_fs *fs;
+
+	mutex_lock(&bch_fs_list_lock);
+	fs = __bch2_uuid_to_fs(uuid);
+	if (fs != NULL)
+		closure_get(&fs->cl);	/* take the ref while the list lock is still held */
+	mutex_unlock(&bch_fs_list_lock);
+
+	return fs;
+}
+
+static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned i, nr = 0, u64s =
+		((sizeof(struct jset_entry_dev_usage) +
+		  sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) /
+		sizeof(u64);	/* size of one dev_usage entry with BCH_DATA_NR types, in u64s */
+
+	rcu_read_lock();
+	for_each_member_device_rcu(ca, c, i, NULL)
+		nr++;	/* count the member devices */
+	rcu_read_unlock();
+
+	bch2_journal_entry_res_resize(&c->journal,
+			&c->dev_usage_journal_res, u64s * nr);	/* one entry's worth per device */
+}
+
+/* Filesystem RO/RW: */
+
+/*
+ * For startup/shutdown of RW stuff, the dependencies are:
+ *
+ * - foreground writes depend on copygc and rebalance (to free up space)
+ *
+ * - copygc and rebalance depend on mark and sweep gc (they actually probably
+ *   don't because they either reserve ahead of time or don't block if
+ *   allocations fail, but allocations can require mark and sweep gc to run
+ *   because of generation number wraparound)
+ *
+ * - all of the above depends on the allocator threads
+ *
+ * - allocator depends on the journal (when it rewrites prios and gens)
+ */
+
+static void __bch2_fs_read_only(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned i, clean_passes = 0;
+	u64 seq = 0;
+
+	bch2_fs_ec_stop(c);
+	bch2_open_buckets_stop(c, NULL, true);
+	bch2_rebalance_stop(c);
+	bch2_copygc_stop(c);
+	bch2_gc_thread_stop(c);
+	bch2_fs_ec_flush(c);
+
+	bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu",
+		    journal_cur_seq(&c->journal));
+
+	do {
+		clean_passes++;
+
+		if (bch2_btree_interior_updates_flush(c) ||
+		    bch2_journal_flush_all_pins(&c->journal) ||
+		    bch2_btree_flush_all_writes(c) ||
+		    seq != atomic64_read(&c->journal.seq)) {
+			seq = atomic64_read(&c->journal.seq);
+			clean_passes = 0;	/* something flushed or the journal seq moved: start over */
+		}
+	} while (clean_passes < 2);	/* stop after two consecutive passes with no reset */
+
+	bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu",
+		    journal_cur_seq(&c->journal));
+
+	if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) &&
+	    !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+		set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);	/* checked again by bch2_fs_read_only() */
+	bch2_fs_journal_stop(&c->journal);
+
+	/*
+	 * After stopping journal:
+	 */
+	for_each_member_device(ca, c, i)
+		bch2_dev_allocator_remove(c, ca);
+}
+
+#ifndef BCH_WRITE_REF_DEBUG
+static void bch2_writes_disabled(struct percpu_ref *writes)
+{
+	struct bch_fs *fs = container_of(writes, struct bch_fs, writes);
+
+	set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &fs->flags);
+	wake_up(&bch2_read_only_wait);	/* bch2_fs_read_only() waits here for the flag set above */
+}
+#endif
+
+void bch2_fs_read_only(struct bch_fs *c)
+{
+	if (!test_bit(BCH_FS_RW, &c->flags)) {	/* not RW: only stop journal reclaim */
+		bch2_journal_reclaim_stop(&c->journal);
+		return;
+	}
+
+	BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+
+	/*
+	 * Block new foreground-end write operations from starting - any new
+	 * writes will return -EROFS:
+	 */
+	set_bit(BCH_FS_GOING_RO, &c->flags);
+#ifndef BCH_WRITE_REF_DEBUG
+	percpu_ref_kill(&c->writes);	/* bch2_writes_disabled() is this ref's callback */
+#else
+	for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
+		bch2_write_ref_put(c, i);
+#endif
+
+	/*
+	 * If we're not doing an emergency shutdown, we want to wait on
+	 * outstanding writes to complete so they don't see spurious errors due
+	 * to shutting down the allocator:
+	 *
+	 * If we are doing an emergency shutdown, outstanding writes may
+	 * hang until we shut down the allocator, so we don't want to wait
+	 * on outstanding writes before shutting everything down - but
+	 * we do need to wait on them before returning and signalling
+	 * that going RO is complete:
+	 */
+	wait_event(bch2_read_only_wait,
+		   test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
+		   test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
+
+	__bch2_fs_read_only(c);
+
+	wait_event(bch2_read_only_wait,
+		   test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));	/* unconditional this time */
+
+	clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+	clear_bit(BCH_FS_GOING_RO, &c->flags);
+
+	if (!bch2_journal_error(&c->journal) &&
+	    !test_bit(BCH_FS_ERROR, &c->flags) &&
+	    !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) &&
+	    test_bit(BCH_FS_STARTED, &c->flags) &&
+	    test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) &&
+	    !c->opts.norecovery) {
+		BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
+		BUG_ON(atomic_read(&c->btree_cache.dirty));
+		BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
+		BUG_ON(c->btree_write_buffer.state.nr);
+
+		bch_verbose(c, "marking filesystem clean");
+		bch2_fs_mark_clean(c);
+	}
+
+	clear_bit(BCH_FS_RW, &c->flags);	/* cleared last, after everything above has run */
+}
+
+static void bch2_fs_read_only_work(struct work_struct *work)
+{
+	/* Recover the filesystem that embeds this work item: */
+	struct bch_fs *fs = container_of(work, struct bch_fs, read_only_work);
+
+	down_write(&fs->state_lock);
+	bch2_fs_read_only(fs);
+	up_write(&fs->state_lock);
+}
+
+static void bch2_fs_read_only_async(struct bch_fs *c)
+{
+	queue_work(system_long_wq, &c->read_only_work);	/* handler: bch2_fs_read_only_work() */
+}
+
+bool bch2_fs_emergency_read_only(struct bch_fs *c)
+{
+	bool newly_set = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);	/* false if the bit was already set */
+
+	bch2_journal_halt(&c->journal);
+	bch2_fs_read_only_async(c);
+
+	wake_up(&bch2_read_only_wait);
+	return newly_set;
+}
+
+static int bch2_fs_read_write_late(struct bch_fs *c)
+{
+	int err;
+
+	/*
+	 * copygc and rebalance move data, and data moves need
+	 * bch2_snapshot_is_ancestor(), which is only available once
+	 * check_snapshots has completed - so they are started here.
+	 *
+	 * It would be nicer not to wait for all of recovery/fsck first:
+	 */
+	err = bch2_copygc_start(c);
+	if (err) {
+		bch_err(c, "error starting copygc thread");
+		return err;
+	}
+
+	err = bch2_rebalance_start(c);
+	if (err) {
+		bch_err(c, "error starting rebalance thread");
+		return err;
+	}
+
+	return 0;
+}
+
+static int __bch2_fs_read_write(struct bch_fs *c, bool early)
+{
+	struct bch_dev *ca;
+	unsigned i;
+	int ret;
+
+	if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) {
+		bch_err(c, "cannot go rw, unfixed btree errors");
+		return -BCH_ERR_erofs_unfixed_errors;
+	}
+
+	if (test_bit(BCH_FS_RW, &c->flags))	/* already read-write */
+		return 0;
+
+	if (c->opts.norecovery)
+		return -BCH_ERR_erofs_norecovery;
+
+	/*
+	 * nochanges is used for fsck -n mode - we have to allow going rw
+	 * during recovery for that to work:
+	 */
+	if (c->opts.nochanges && (!early || c->opts.read_only))
+		return -BCH_ERR_erofs_nochanges;
+
+	bch_info(c, "going read-write");
+
+	ret = bch2_members_v2_init(c);
+	if (ret)
+		goto err;
+
+	ret = bch2_fs_mark_dirty(c);
+	if (ret)
+		goto err;
+
+	clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
+
+	/*
+	 * First journal write must be a flush write: after a clean shutdown we
+	 * don't read the journal, so the first journal write may end up
+	 * overwriting whatever was there previously, and there must always be
+	 * at least one flush write in the journal or recovery will fail:
+	 */
+	set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags);
+
+	for_each_rw_member(ca, c, i)
+		bch2_dev_allocator_add(c, ca);
+	bch2_recalc_capacity(c);
+
+	ret = bch2_gc_thread_start(c);
+	if (ret) {
+		bch_err(c, "error starting gc thread");
+		goto err;	/* undo the allocator setup above, like the other failures below */
+	}
+
+	ret = bch2_journal_reclaim_start(&c->journal);
+	if (ret)
+		goto err;
+
+	if (!early) {
+		ret = bch2_fs_read_write_late(c);	/* starts copygc and rebalance */
+		if (ret)
+			goto err;
+	}
+
+#ifndef BCH_WRITE_REF_DEBUG
+	percpu_ref_reinit(&c->writes);
+#else
+	for (i = 0; i < BCH_WRITE_REF_NR; i++) {
+		BUG_ON(atomic_long_read(&c->writes[i]));
+		atomic_long_inc(&c->writes[i]);
+	}
+#endif
+	set_bit(BCH_FS_RW, &c->flags);
+	set_bit(BCH_FS_WAS_RW, &c->flags);
+
+	bch2_do_discards(c);
+	bch2_do_invalidates(c);
+	bch2_do_stripe_deletes(c);
+	bch2_do_pending_node_rewrites(c);
+	return 0;
+err:
+	__bch2_fs_read_only(c);	/* stops the threads and removes devices from the allocator again */
+	return ret;
+}
+
+int bch2_fs_read_write(struct bch_fs *c)
+{
+	return __bch2_fs_read_write(c, false);	/* early == false: bch2_fs_read_write_late() runs too */
+}
+
+int bch2_fs_read_write_early(struct bch_fs *c)
+{
+	lockdep_assert_held(&c->state_lock);
+
+	return __bch2_fs_read_write(c, true);	/* early == true: bch2_fs_read_write_late() is skipped */
+}
+
+/* Filesystem startup/shutdown: */
+
+static void __bch2_fs_free(struct bch_fs *c)
+{
+	unsigned i;
+
+	for (i = 0; i < BCH_TIME_STAT_NR; i++)
+		bch2_time_stats_exit(&c->times[i]);	/* initialised in bch2_fs_alloc() */
+
+	bch2_free_pending_node_rewrites(c);
+	bch2_fs_counters_exit(c);
+	bch2_fs_snapshots_exit(c);
+	bch2_fs_quota_exit(c);
+	bch2_fs_fs_io_direct_exit(c);
+	bch2_fs_fs_io_buffered_exit(c);
+	bch2_fs_fsio_exit(c);
+	bch2_fs_ec_exit(c);
+	bch2_fs_encryption_exit(c);
+	bch2_fs_nocow_locking_exit(c);
+	bch2_fs_io_write_exit(c);
+	bch2_fs_io_read_exit(c);
+	bch2_fs_buckets_waiting_for_journal_exit(c);
+	bch2_fs_btree_interior_update_exit(c);
+	bch2_fs_btree_iter_exit(c);
+	bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
+	bch2_fs_btree_cache_exit(c);
+	bch2_fs_replicas_exit(c);
+	bch2_fs_journal_exit(&c->journal);
+	bch2_io_clock_exit(&c->io_clock[WRITE]);
+	bch2_io_clock_exit(&c->io_clock[READ]);
+	bch2_fs_compress_exit(c);
+	bch2_journal_keys_free(&c->journal_keys);
+	bch2_journal_entries_free(c);
+	bch2_fs_btree_write_buffer_exit(c);
+	percpu_free_rwsem(&c->mark_lock);
+	free_percpu(c->online_reserved);
+
+	darray_exit(&c->btree_roots_extra);
+	free_percpu(c->pcpu);
+	mempool_exit(&c->large_bkey_pool);
+	mempool_exit(&c->btree_bounce_pool);
+	bioset_exit(&c->btree_bio);
+	mempool_exit(&c->fill_iter);
+#ifndef BCH_WRITE_REF_DEBUG
+	percpu_ref_exit(&c->writes);
+#endif
+	kfree(rcu_dereference_protected(c->disk_groups, 1));
+	kfree(c->journal_seq_blacklist_table);
+	kfree(c->unused_inode_hints);
+
+	if (c->write_ref_wq)	/* each workqueue pointer is checked before it is destroyed */
+		destroy_workqueue(c->write_ref_wq);
+	if (c->io_complete_wq)
+		destroy_workqueue(c->io_complete_wq);
+	if (c->copygc_wq)
+		destroy_workqueue(c->copygc_wq);
+	if (c->btree_io_complete_wq)
+		destroy_workqueue(c->btree_io_complete_wq);
+	if (c->btree_update_wq)
+		destroy_workqueue(c->btree_update_wq);
+
+	bch2_free_super(&c->disk_sb);
+	kvpfree(c, sizeof(*c));	/* c came from kvpmalloc() in bch2_fs_alloc() */
+	module_put(THIS_MODULE);	/* pairs with __module_get() in bch2_fs_alloc() */
+}
+
+static void bch2_fs_release(struct kobject *kobj)
+{
+	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
+
+	__bch2_fs_free(c);	/* .release of bch2_fs_ktype: frees the bch_fs that embeds kobj */
+}
+
+void __bch2_fs_stop(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned i;
+
+	bch_verbose(c, "shutting down");
+
+	set_bit(BCH_FS_STOPPING, &c->flags);
+
+	cancel_work_sync(&c->journal_seq_blacklist_gc_work);
+
+	down_write(&c->state_lock);
+	bch2_fs_read_only(c);
+	up_write(&c->state_lock);
+
+	for_each_member_device(ca, c, i)
+		if (ca->kobj.state_in_sysfs &&
+		    ca->disk_sb.bdev)
+			sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");	/* link made in bch2_dev_sysfs_online() */
+
+	if (c->kobj.state_in_sysfs)
+		kobject_del(&c->kobj);
+
+	bch2_fs_debug_exit(c);
+	bch2_fs_chardev_exit(c);
+
+	kobject_put(&c->counters_kobj);	/* these four were kobject_init()ed in bch2_fs_alloc() */
+	kobject_put(&c->time_stats);
+	kobject_put(&c->opts_dir);
+	kobject_put(&c->internal);
+
+	/* btree prefetch might have kicked off reads in the background: */
+	bch2_btree_flush_all_reads(c);
+
+	for_each_member_device(ca, c, i)
+		cancel_work_sync(&ca->io_error_work);
+
+	cancel_work_sync(&c->read_only_work);	/* queued by bch2_fs_read_only_async() */
+}
+
+void bch2_fs_free(struct bch_fs *c)
+{
+	unsigned i;
+
+	mutex_lock(&bch_fs_list_lock);
+	list_del(&c->list);	/* no longer findable via bch2_dev_to_fs()/bch2_uuid_to_fs() */
+	mutex_unlock(&bch_fs_list_lock);
+
+	closure_sync(&c->cl);
+	closure_debug_destroy(&c->cl);
+
+	for (i = 0; i < c->sb.nr_devices; i++) {
+		struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
+
+		if (ca) {
+			bch2_free_super(&ca->disk_sb);	/* NOTE(review): bch2_dev_free() calls this again */
+			bch2_dev_free(ca);
+		}
+	}
+
+	bch_verbose(c, "shutdown complete");
+
+	kobject_put(&c->kobj);	/* bch2_fs_release() -> __bch2_fs_free() once the last ref is gone */
+}
+
+void bch2_fs_stop(struct bch_fs *c)
+{
+	__bch2_fs_stop(c);	/* go RO, remove sysfs/debug/chardev, cancel pending work */
+	bch2_fs_free(c);	/* unlist, free member devices, drop the fs kobject ref */
+}
+
+static int bch2_fs_online(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned i;
+	int ret = 0;
+
+	lockdep_assert_held(&bch_fs_list_lock);
+
+	if (__bch2_uuid_to_fs(c->sb.uuid)) {	/* a listed fs already has this UUID */
+		bch_err(c, "filesystem UUID already open");
+		return -EINVAL;
+	}
+
+	ret = bch2_fs_chardev_init(c);
+	if (ret) {
+		bch_err(c, "error creating character device");
+		return ret;
+	}
+
+	bch2_fs_debug_init(c);
+
+	ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?:	/* ?: chain stops at the first failure */
+	    kobject_add(&c->internal, &c->kobj, "internal") ?:
+	    kobject_add(&c->opts_dir, &c->kobj, "options") ?:
+	    kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
+	    kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
+	    bch2_opts_create_sysfs_files(&c->opts_dir);
+	if (ret) {
+		bch_err(c, "error creating sysfs objects");
+		return ret;
+	}
+
+	down_write(&c->state_lock);
+
+	for_each_member_device(ca, c, i) {
+		ret = bch2_dev_sysfs_online(c, ca);
+		if (ret) {
+			bch_err(c, "error creating sysfs objects");
+			percpu_ref_put(&ca->ref);	/* presumably the iterator's ref on ca - TODO confirm */
+			goto err;
+		}
+	}
+
+	BUG_ON(!list_empty(&c->list));
+	list_add(&c->list, &bch_fs_list);	/* only now visible to the bch_fs_list lookups */
+err:
+	up_write(&c->state_lock);
+	return ret;
+}
+
+static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
+{
+	struct bch_fs *c;
+	struct printbuf name = PRINTBUF;
+	unsigned i, iter_size;
+	int ret = 0;
+
+	c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
+	if (!c) {
+		c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);
+		goto out;
+	}
+
+	__module_get(THIS_MODULE);	/* dropped by module_put() in __bch2_fs_free() */
+
+	closure_init(&c->cl, NULL);
+
+	c->kobj.kset = bcachefs_kset;
+	kobject_init(&c->kobj, &bch2_fs_ktype);
+	kobject_init(&c->internal, &bch2_fs_internal_ktype);
+	kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
+	kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
+	kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype);
+
+	c->minor		= -1;
+	c->disk_sb.fs_sb	= true;
+
+	init_rwsem(&c->state_lock);
+	mutex_init(&c->sb_lock);
+	mutex_init(&c->replicas_gc_lock);
+	mutex_init(&c->btree_root_lock);
+	INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);
+
+	init_rwsem(&c->gc_lock);
+	mutex_init(&c->gc_gens_lock);
+
+	for (i = 0; i < BCH_TIME_STAT_NR; i++)
+		bch2_time_stats_init(&c->times[i]);
+
+	bch2_fs_copygc_init(c);
+	bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
+	bch2_fs_btree_interior_update_init_early(c);
+	bch2_fs_allocator_background_init(c);
+	bch2_fs_allocator_foreground_init(c);
+	bch2_fs_rebalance_init(c);
+	bch2_fs_quota_init(c);
+	bch2_fs_ec_init_early(c);
+	bch2_fs_move_init(c);
+
+	INIT_LIST_HEAD(&c->list);	/* so list_del() in bch2_fs_free() is safe on the error path */
+
+	mutex_init(&c->usage_scratch_lock);
+
+	mutex_init(&c->bio_bounce_pages_lock);
+	mutex_init(&c->snapshot_table_lock);
+	init_rwsem(&c->snapshot_create_lock);
+
+	spin_lock_init(&c->btree_write_error_lock);
+
+	INIT_WORK(&c->journal_seq_blacklist_gc_work,
+		  bch2_blacklist_entries_gc);
+
+	INIT_LIST_HEAD(&c->journal_iters);
+
+	INIT_LIST_HEAD(&c->fsck_errors);
+	mutex_init(&c->fsck_error_lock);
+
+	seqcount_init(&c->gc_pos_lock);
+
+	seqcount_init(&c->usage_lock);
+
+	sema_init(&c->io_in_flight, 128);
+
+	INIT_LIST_HEAD(&c->vfs_inodes_list);
+	mutex_init(&c->vfs_inodes_lock);
+
+	c->copy_gc_enabled		= 1;
+	c->rebalance.enabled		= 1;
+	c->promote_whole_extents	= true;
+
+	c->journal.flush_write_time	= &c->times[BCH_TIME_journal_flush_write];
+	c->journal.noflush_write_time	= &c->times[BCH_TIME_journal_noflush_write];
+	c->journal.blocked_time		= &c->times[BCH_TIME_blocked_journal];
+	c->journal.flush_seq_time	= &c->times[BCH_TIME_journal_flush_seq];
+
+	bch2_fs_btree_cache_init_early(&c->btree_cache);
+
+	mutex_init(&c->sectors_available_lock);
+
+	ret = percpu_init_rwsem(&c->mark_lock);
+	if (ret)
+		goto err;
+
+	mutex_lock(&c->sb_lock);
+	ret = bch2_sb_to_fs(c, sb);
+	mutex_unlock(&c->sb_lock);
+
+	if (ret)
+		goto err;
+
+	pr_uuid(&name, c->sb.user_uuid.b);
+	strscpy(c->name, name.buf, sizeof(c->name));
+	printbuf_exit(&name);
+
+	ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
+	if (ret)
+		goto err;
+
+	/* Compat: */
+	if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
+	    !BCH_SB_JOURNAL_FLUSH_DELAY(sb))
+		SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);
+
+	if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
+	    !BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
+		SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100);
+
+	c->opts = bch2_opts_default;
+	ret = bch2_opts_from_sb(&c->opts, sb);
+	if (ret)
+		goto err;
+
+	bch2_opts_apply(&c->opts, opts);	/* caller's opts applied on top of the superblock's */
+
+	c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
+	if (c->opts.inodes_use_key_cache)
+		c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
+	c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops;
+
+	c->block_bits		= ilog2(block_sectors(c));
+	c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
+
+	if (bch2_fs_init_fault("fs_alloc")) {
+		bch_err(c, "fs_alloc fault injected");
+		ret = -EFAULT;
+		goto err;
+	}
+
+	iter_size = sizeof(struct sort_iter) +
+		(btree_blocks(c) + 1) * 2 *
+		sizeof(struct sort_iter_set);
+
+	c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
+
+	if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
+				WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) ||
+	    !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
+				WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
+	    !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
+				WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
+	    !(c->io_complete_wq = alloc_workqueue("bcachefs_io",
+				WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) ||
+	    !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
+				WQ_FREEZABLE, 0)) ||
+#ifndef BCH_WRITE_REF_DEBUG
+	    percpu_ref_init(&c->writes, bch2_writes_disabled,
+			    PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
+#endif
+	    mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
+	    bioset_init(&c->btree_bio, 1,
+			max(offsetof(struct btree_read_bio, bio),
+			    offsetof(struct btree_write_bio, wbio.bio)),
+			BIOSET_NEED_BVECS) ||
+	    !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
+	    !(c->online_reserved = alloc_percpu(u64)) ||
+	    mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
+					btree_bytes(c)) ||
+	    mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
+	    !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
+					      sizeof(u64), GFP_KERNEL))) {
+		ret = -BCH_ERR_ENOMEM_fs_other_alloc;
+		goto err;
+	}
+
+	ret = bch2_fs_counters_init(c) ?:
+	    bch2_io_clock_init(&c->io_clock[READ]) ?:
+	    bch2_io_clock_init(&c->io_clock[WRITE]) ?:
+	    bch2_fs_journal_init(&c->journal) ?:
+	    bch2_fs_replicas_init(c) ?:
+	    bch2_fs_btree_cache_init(c) ?:
+	    bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
+	    bch2_fs_btree_iter_init(c) ?:
+	    bch2_fs_btree_interior_update_init(c) ?:
+	    bch2_fs_buckets_waiting_for_journal_init(c) ?:
+	    bch2_fs_btree_write_buffer_init(c) ?:
+	    bch2_fs_subvolumes_init(c) ?:
+	    bch2_fs_io_read_init(c) ?:
+	    bch2_fs_io_write_init(c) ?:
+	    bch2_fs_nocow_locking_init(c) ?:
+	    bch2_fs_encryption_init(c) ?:
+	    bch2_fs_compress_init(c) ?:
+	    bch2_fs_ec_init(c) ?:
+	    bch2_fs_fsio_init(c) ?:
+	    bch2_fs_fs_io_buffered_init(c) ?:
+	    bch2_fs_fs_io_direct_init(c);
+	if (ret)
+		goto err;
+
+	for (i = 0; i < c->sb.nr_devices; i++)
+		if (bch2_dev_exists(c->disk_sb.sb, i)) {
+			ret = bch2_dev_alloc(c, i);	/* propagate its error code as-is */
+			if (ret)
+				goto err;
+		}
+
+	bch2_journal_entry_res_resize(&c->journal,
+			&c->btree_root_journal_res,
+			BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX));
+	bch2_dev_usage_journal_reserve(c);
+	bch2_journal_entry_res_resize(&c->journal,
+			&c->clock_journal_res,
+			(sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);
+
+	mutex_lock(&bch_fs_list_lock);
+	ret = bch2_fs_online(c);
+	mutex_unlock(&bch_fs_list_lock);
+
+	if (ret)
+		goto err;
+out:
+	return c;	/* valid pointer on success, ERR_PTR() on failure */
+err:
+	bch2_fs_free(c);
+	c = ERR_PTR(ret);
+	goto out;
+}
+
+noinline_for_stack
+static void print_mount_opts(struct bch_fs *c)
+{
+	enum bch_opt_id i;
+	struct printbuf p = PRINTBUF;
+	bool first = true;	/* true until the " opts=" prefix has been printed */
+
+	prt_str(&p, "mounting version ");
+	bch2_version_to_text(&p, c->sb.version);
+
+	if (c->opts.read_only) {
+		prt_str(&p, " opts=");
+		first = false;
+		prt_printf(&p, "ro");
+	}
+
+	for (i = 0; i < bch2_opts_nr; i++) {
+		const struct bch_option *opt = &bch2_opt_table[i];
+		u64 v = bch2_opt_get_by_id(&c->opts, i);
+
+		if (!(opt->flags & OPT_MOUNT))
+			continue;	/* option lacks the OPT_MOUNT flag */
+
+		if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
+			continue;	/* value equals the default */
+
+		prt_str(&p, first ? " opts=" : ",");
+		first = false;
+		bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
+	}
+
+	bch_info(c, "%s", p.buf);
+	printbuf_exit(&p);
+}
+
+int bch2_fs_start(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	time64_t now = ktime_get_real_seconds();
+	unsigned i;
+	int ret;
+
+	print_mount_opts(c);
+
+	down_write(&c->state_lock);	/* held until the out: label */
+
+	BUG_ON(test_bit(BCH_FS_STARTED, &c->flags));
+
+	mutex_lock(&c->sb_lock);
+
+	ret = bch2_members_v2_init(c);
+	if (ret) {
+		mutex_unlock(&c->sb_lock);	/* err: only drops state_lock, so release sb_lock here */
+		goto err;
+	}
+
+	for_each_online_member(ca, c, i)
+		bch2_sb_from_fs(c, ca);
+
+	for_each_online_member(ca, c, i)
+		bch2_members_v2_get_mut(c->disk_sb.sb, i)->last_mount = cpu_to_le64(now);
+
+	mutex_unlock(&c->sb_lock);
+
+	for_each_rw_member(ca, c, i)
+		bch2_dev_allocator_add(c, ca);
+	bch2_recalc_capacity(c);
+
+	for (i = 0; i < BCH_TRANSACTIONS_NR; i++) {
+		mutex_lock(&c->btree_transaction_stats[i].lock);
+		bch2_time_stats_init(&c->btree_transaction_stats[i].lock_hold_times);
+		mutex_unlock(&c->btree_transaction_stats[i].lock);
+	}
+
+	ret = BCH_SB_INITIALIZED(c->disk_sb.sb)	/* recovery if the sb is flagged initialized, else initialize */
+		? bch2_fs_recovery(c)
+		: bch2_fs_initialize(c);
+	if (ret)
+		goto err;
+
+	ret = bch2_opts_check_may_set(c);
+	if (ret)
+		goto err;
+
+	if (bch2_fs_init_fault("fs_start")) {
+		bch_err(c, "fs_start fault injected");
+		ret = -EINVAL;
+		goto err;
+	}
+
+	set_bit(BCH_FS_STARTED, &c->flags);
+
+	if (c->opts.read_only || c->opts.nochanges) {
+		bch2_fs_read_only(c);
+	} else {
+		ret = !test_bit(BCH_FS_RW, &c->flags)	/* already RW: only the late part remains */
+			? bch2_fs_read_write(c)
+			: bch2_fs_read_write_late(c);
+		if (ret)
+			goto err;
+	}
+
+	ret = 0;
+out:
+	up_write(&c->state_lock);
+	return ret;
+err:
+	bch_err_msg(c, ret, "starting filesystem");
+	goto out;
+}
+
+static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
+{
+	struct bch_member new_member = bch2_sb_member_get(sb, sb->dev_idx);
+
+	if (block_sectors(c) != le16_to_cpu(sb->block_size))
+		return -BCH_ERR_mismatched_block_size;
+
+	if (BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb) >
+	    le16_to_cpu(new_member.bucket_size))
+		return -BCH_ERR_bucket_size_too_small;
+
+	return 0;
+}
+
+static int bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
+{
+	/* Membership is checked against whichever superblock has the higher seq: */
+	struct bch_sb *newest = le64_to_cpu(sb->seq) < le64_to_cpu(fs->seq) ? fs : sb;
+
+	if (!uuid_equal(&sb->uuid, &fs->uuid))
+		return -BCH_ERR_device_not_a_member_of_filesystem;
+
+	if (!bch2_dev_exists(newest, sb->dev_idx))
+		return -BCH_ERR_device_has_been_removed;
+
+	if (sb->block_size != fs->block_size)
+		return -BCH_ERR_mismatched_block_size;
+
+	return 0;
+}
+
+/* Device startup/shutdown: */
+
+static void bch2_dev_release(struct kobject *kobj)
+{
+	struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
+
+	kfree(ca);	/* .release of bch2_dev_ktype; ca was kzalloc()ed in __bch2_dev_alloc() */
+}
+
+static void bch2_dev_free(struct bch_dev *ca)
+{
+	cancel_work_sync(&ca->io_error_work);
+
+	if (ca->kobj.state_in_sysfs &&
+	    ca->disk_sb.bdev)
+		sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");	/* link made in bch2_dev_sysfs_online() */
+
+	if (ca->kobj.state_in_sysfs)
+		kobject_del(&ca->kobj);
+
+	bch2_free_super(&ca->disk_sb);
+	bch2_dev_journal_exit(ca);
+
+	free_percpu(ca->io_done);
+	bioset_exit(&ca->replica_set);
+	bch2_dev_buckets_free(ca);
+	free_page((unsigned long) ca->sb_read_scratch);
+
+	bch2_time_stats_exit(&ca->io_latency[WRITE]);
+	bch2_time_stats_exit(&ca->io_latency[READ]);
+
+	percpu_ref_exit(&ca->io_ref);
+	percpu_ref_exit(&ca->ref);
+	kobject_put(&ca->kobj);	/* ca itself is kfree()d by bch2_dev_release() on the last put */
+}
+
+static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
+{
+
+	lockdep_assert_held(&c->state_lock);
+
+	if (percpu_ref_is_zero(&ca->io_ref))	/* io_ref already at zero: nothing to take offline */
+		return;
+
+	__bch2_dev_read_only(c, ca);
+
+	reinit_completion(&ca->io_ref_completion);
+	percpu_ref_kill(&ca->io_ref);
+	wait_for_completion(&ca->io_ref_completion);	/* completed by bch2_dev_io_ref_complete() */
+
+	if (ca->kobj.state_in_sysfs) {
+		sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
+		sysfs_remove_link(&ca->kobj, "block");
+	}
+
+	bch2_free_super(&ca->disk_sb);
+	bch2_dev_journal_exit(ca);
+}
+
+static void bch2_dev_ref_complete(struct percpu_ref *ref)
+{
+	struct bch_dev *dev = container_of(ref, struct bch_dev, ref);
+
+	complete(&dev->ref_completion);	/* registered for ca->ref in __bch2_dev_alloc() */
+}
+
+static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
+{
+	struct bch_dev *dev = container_of(ref, struct bch_dev, io_ref);
+
+	complete(&dev->io_ref_completion);	/* registered for ca->io_ref in __bch2_dev_alloc() */
+}
+
+static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
+{
+	int ret;
+
+	if (!c->kobj.state_in_sysfs)	/* fs kobject not in sysfs: nothing to do, report success */
+		return 0;
+
+	if (!ca->kobj.state_in_sysfs) {
+		ret = kobject_add(&ca->kobj, &c->kobj,
+				  "dev-%u", ca->dev_idx);
+		if (ret)
+			return ret;
+	}
+
+	if (ca->disk_sb.bdev) {
+		struct kobject *block = bdev_kobj(ca->disk_sb.bdev);
+
+		ret = sysfs_create_link(block, &ca->kobj, "bcachefs");	/* block device -> dev-N */
+		if (ret)
+			return ret;
+
+		ret = sysfs_create_link(&ca->kobj, block, "block");	/* dev-N -> block device */
+		if (ret)
+			return ret;	/* NOTE(review): the "bcachefs" link above is not removed here */
+	}
+
+	return 0;
+}
+
+static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
+					struct bch_member *member)
+{
+	struct bch_dev *ca;
+
+	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+	if (!ca)
+		return NULL;
+
+	kobject_init(&ca->kobj, &bch2_dev_ktype);	/* from here on ca is freed via kobject_put() */
+	init_completion(&ca->ref_completion);
+	init_completion(&ca->io_ref_completion);
+
+	init_rwsem(&ca->bucket_lock);
+
+	INIT_WORK(&ca->io_error_work, bch2_io_error_work);
+
+	bch2_time_stats_init(&ca->io_latency[READ]);
+	bch2_time_stats_init(&ca->io_latency[WRITE]);
+
+	ca->mi = bch2_mi_to_cpu(member);
+	ca->uuid = member->uuid;
+
+	ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
+			     ca->mi.bucket_size / btree_sectors(c));
+
+	if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
+			    0, GFP_KERNEL) ||
+	    percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
+			    PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||	/* io_ref starts dead; reinit in __bch2_dev_attach_bdev() */
+	    !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) ||
+	    bch2_dev_buckets_alloc(c, ca) ||
+	    bioset_init(&ca->replica_set, 4,
+			offsetof(struct bch_write_bio, bio), 0) ||
+	    !(ca->io_done	= alloc_percpu(*ca->io_done)))
+		goto err;
+
+	return ca;
+err:
+	bch2_dev_free(ca);	/* runs the full teardown on the partially set-up ca */
+	return NULL;
+}
+
+static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
+			    unsigned dev_idx)
+{
+	ca->dev_idx = dev_idx;
+	__set_bit(ca->dev_idx, ca->self.d);	/* ca->self: mask with this device's own bit set */
+	scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
+
+	ca->fs = c;
+	rcu_assign_pointer(c->devs[ca->dev_idx], ca);	/* publish ca in c->devs[] */
+
+	if (bch2_dev_sysfs_online(c, ca))
+		pr_warn("error creating sysfs objects");	/* only warned about, not propagated */
+}
+
+static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
+{
+	struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
+	struct bch_dev *ca = NULL;
+	int ret = 0;	/* never reassigned: the success return below is always 0 */
+
+	if (bch2_fs_init_fault("dev_alloc"))
+		goto err;
+
+	ca = __bch2_dev_alloc(c, &member);
+	if (!ca)
+		goto err;
+
+	ca->fs = c;	/* bch2_dev_attach() assigns this again */
+
+	bch2_dev_attach(c, ca, dev_idx);
+	return ret;
+err:
+	if (ca)	/* ca is NULL on both paths that jump here, so this branch never runs */
+		bch2_dev_free(ca);
+	return -BCH_ERR_ENOMEM_dev_alloc;
+}
+
+static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
+{
+	int ret;	/* signed: holds the error code from bch2_dev_journal_init() */
+
+	if (bch2_dev_is_online(ca)) {
+		bch_err(ca, "already have device online in slot %u",
+			sb->sb->dev_idx);
+		return -BCH_ERR_device_already_online;
+	}
+
+	if (get_capacity(sb->bdev->bd_disk) <
+	    ca->mi.bucket_size * ca->mi.nbuckets) {
+		bch_err(ca, "cannot online: device too small");
+		return -BCH_ERR_device_size_too_small;
+	}
+
+	BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
+
+	ret = bch2_dev_journal_init(ca, sb->sb);
+	if (ret)
+		return ret;
+
+	/* Commit: */
+	ca->disk_sb = *sb;
+	memset(sb, 0, sizeof(*sb));	/* the handle now lives in ca->disk_sb; clear the caller's copy */
+
+	ca->dev = ca->disk_sb.bdev->bd_dev;
+
+	percpu_ref_reinit(&ca->io_ref);	/* io_ref was initialised dead in __bch2_dev_alloc() */
+
+	return 0;
+}
+
+static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
+{
+	struct bch_dev *ca;
+	int ret;
+
+	lockdep_assert_held(&c->state_lock);
+
+	if (le64_to_cpu(sb->sb->seq) >
+	    le64_to_cpu(c->disk_sb.sb->seq))
+		bch2_sb_to_fs(c, sb->sb);	/* device sb has the higher seq; NOTE(review): return value ignored */
+
+	BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
+	       !c->devs[sb->sb->dev_idx]);
+
+	ca = bch_dev_locked(c, sb->sb->dev_idx);
+
+	ret = __bch2_dev_attach_bdev(ca, sb);
+	if (ret)
+		return ret;
+
+	bch2_dev_sysfs_online(c, ca);	/* NOTE(review): failure is ignored here */
+
+	if (c->sb.nr_devices == 1)
+		snprintf(c->name, sizeof(c->name), "%pg", ca->disk_sb.bdev);	/* single device: fs is named after it */
+	snprintf(ca->name, sizeof(ca->name), "%pg", ca->disk_sb.bdev);
+
+	rebalance_wakeup(c);
+	return 0;
+}
+
+/* Device management: */
+
+/*
+ * Note: this function is also used by the error paths - when a particular
+ * device sees an error, we call it to determine whether we can just set the
+ * device RO, or - if this function returns false - we'll set the whole
+ * filesystem RO:
+ *
+ * XXX: maybe we should be more explicit about whether we're changing state
+ * because we got an error or what have you?
+ */
+bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
+			    enum bch_member_state new_state, int flags)
+{
+	struct bch_devs_mask new_online_devs;
+	struct bch_dev *ca2;
+	int i, nr_rw = 0, required;
+
+	lockdep_assert_held(&c->state_lock);
+
+	switch (new_state) {
+	case BCH_MEMBER_STATE_rw:
+		/* Going read-write is always permitted */
+		return true;
+	case BCH_MEMBER_STATE_ro:
+		/* Not currently rw, so no write capacity would be lost */
+		if (ca->mi.state != BCH_MEMBER_STATE_rw)
+			return true;
+
+		/* do we have enough devices to write to?  */
+		for_each_member_device(ca2, c, i)
+			if (ca2 != ca)
+				nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw;
+
+		/*
+		 * Without the matching BCH_FORCE_IF_*_DEGRADED flag the full
+		 * replica count is needed; with it, only the *_required
+		 * option. The larger of the metadata and data figures is used.
+		 */
+		required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
+			       ? c->opts.metadata_replicas
+			       : c->opts.metadata_replicas_required,
+			       !(flags & BCH_FORCE_IF_DATA_DEGRADED)
+			       ? c->opts.data_replicas
+			       : c->opts.data_replicas_required);
+
+		return nr_rw >= required;
+	case BCH_MEMBER_STATE_failed:
+	case BCH_MEMBER_STATE_spare:
+		/* Already neither rw nor ro: nothing changes for readers */
+		if (ca->mi.state != BCH_MEMBER_STATE_rw &&
+		    ca->mi.state != BCH_MEMBER_STATE_ro)
+			return true;
+
+		/* do we have enough devices to read from?  */
+		new_online_devs = bch2_online_devs(c);
+		/* Evaluate the online set as it would be without @ca */
+		__clear_bit(ca->dev_idx, new_online_devs.d);
+
+		return bch2_have_enough_devs(c, new_online_devs, flags, false);
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Decide whether the filesystem can be started with the devices currently
+ * online.
+ *
+ * Unless the degraded or very_degraded option is set, every member that
+ * exists in the superblock and is in the rw or ro state must be online.
+ * After that, bch2_have_enough_devs() has the final say, with force flags
+ * derived from those options.
+ */
+static bool bch2_fs_may_start(struct bch_fs *c)
+{
+	unsigned flags = 0;
+
+	if (c->opts.very_degraded)
+		flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
+
+	if (c->opts.degraded)
+		flags |= BCH_FORCE_IF_DEGRADED;
+
+	if (!(c->opts.degraded || c->opts.very_degraded)) {
+		bool missing = false;
+		unsigned idx;
+
+		mutex_lock(&c->sb_lock);
+
+		for (idx = 0;
+		     !missing && idx < c->disk_sb.sb->nr_devices;
+		     idx++) {
+			struct bch_dev *dev;
+
+			if (!bch2_dev_exists(c->disk_sb.sb, idx))
+				continue;
+
+			dev = bch_dev_locked(c, idx);
+
+			/* A member expected to carry data is not online */
+			missing = !bch2_dev_is_online(dev) &&
+				(dev->mi.state == BCH_MEMBER_STATE_rw ||
+				 dev->mi.state == BCH_MEMBER_STATE_ro);
+		}
+
+		mutex_unlock(&c->sb_lock);
+
+		if (missing)
+			return false;
+	}
+
+	return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true);
+}
+
+/*
+ * Common step for taking @ca out of read-write use: remove it from the
+ * allocator, then stop the journal on it. Used by __bch2_dev_set_state() and
+ * bch2_dev_remove().
+ */
+static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
+{
+	/*
+	 * The allocator thread itself allocates btree nodes, so stop it first:
+	 */
+	bch2_dev_allocator_remove(c, ca);
+	bch2_dev_journal_stop(&c->journal, ca);
+}
+
+/*
+ * Bring @ca into read-write use: add it to the allocator and recompute
+ * filesystem capacity. The caller must hold c->state_lock and the member
+ * state must already be BCH_MEMBER_STATE_rw.
+ */
+static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
+{
+	lockdep_assert_held(&c->state_lock);
+
+	BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw);
+
+	bch2_dev_allocator_add(c, ca);
+	bch2_recalc_capacity(c);
+}
+
+/*
+ * Change the member state of @ca to @new_state and persist it in the
+ * superblock. bch2_dev_state_allowed() asserts that c->state_lock is held.
+ *
+ * Returns 0 on success (including when the state is already @new_state), or
+ * -BCH_ERR_device_state_not_allowed if bch2_dev_state_allowed() refuses.
+ */
+int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
+			 enum bch_member_state new_state, int flags)
+{
+	const bool going_rw = new_state == BCH_MEMBER_STATE_rw;
+	struct bch_member *member;
+
+	if (new_state == ca->mi.state)
+		return 0;
+
+	if (!bch2_dev_state_allowed(c, ca, new_state, flags))
+		return -BCH_ERR_device_state_not_allowed;
+
+	/* Leaving rw: quiesce the device before recording the new state */
+	if (!going_rw)
+		__bch2_dev_read_only(c, ca);
+
+	bch_notice(ca, "%s", bch2_member_states[new_state]);
+
+	mutex_lock(&c->sb_lock);
+	member = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+	SET_BCH_MEMBER_STATE(member, new_state);
+	bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	/* Entering rw: enable the device only after the superblock write */
+	if (going_rw)
+		__bch2_dev_read_write(c, ca);
+
+	rebalance_wakeup(c);
+
+	return 0;
+}
+
+/*
+ * Locked wrapper: runs __bch2_dev_set_state() with c->state_lock held for
+ * write and returns its result.
+ */
+int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
+		       enum bch_member_state new_state, int flags)
+{
+	int ret;
+
+	down_write(&c->state_lock);
+	ret = __bch2_dev_set_state(c, ca, new_state, flags);
+	up_write(&c->state_lock);
+
+	return ret;
+}
+
+/* Device add/removal: */
+
+/*
+ * Delete every key belonging to @ca (positions ca->dev_idx:0 through
+ * ca->dev_idx:U64_MAX) from the per-device allocation btrees, stopping at the
+ * first error.
+ *
+ * Returns 0 or the first error from bch2_btree_delete_range().
+ */
+static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
+{
+	/*
+	 * We clear the LRU and need_discard btrees first so that we don't race
+	 * with bch2_do_invalidates() and bch2_do_discards()
+	 */
+	static const enum btree_id btrees[] = {
+		BTREE_ID_lru,
+		BTREE_ID_need_discard,
+		BTREE_ID_freespace,
+		BTREE_ID_backpointers,
+		BTREE_ID_alloc,
+		BTREE_ID_bucket_gens,
+	};
+	struct bpos first	= POS(ca->dev_idx, 0);
+	struct bpos last	= POS(ca->dev_idx, U64_MAX);
+	int ret = 0;
+
+	for (unsigned nr = 0; !ret && nr < ARRAY_SIZE(btrees); nr++)
+		ret = bch2_btree_delete_range(c, btrees[nr], first, last,
+					      BTREE_TRIGGER_NORUN, NULL);
+
+	if (ret)
+		bch_err_msg(c, ret, "removing dev alloc info");
+
+	return ret;
+}
+
+/*
+ * Permanently remove member device @ca from filesystem @c.
+ *
+ * Drops the device's data, deletes its allocation info, flushes the journal,
+ * verifies that no data remains on it, then tears down the in-memory device
+ * and clears its member UUID in the superblock.
+ *
+ * Always drops one reference on ca->ref. Returns 0 on success or a negative
+ * error code; on failure the device is put back into read-write use if its
+ * member state is still rw and its io_ref is live.
+ */
+int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
+{
+	struct bch_member *m;
+	/* dev_idx is saved up front: @ca is freed before the sb update below */
+	unsigned dev_idx = ca->dev_idx, data;
+	int ret;
+
+	down_write(&c->state_lock);
+
+	/*
+	 * We consume a reference to ca->ref, regardless of whether we succeed
+	 * or fail:
+	 */
+	percpu_ref_put(&ca->ref);
+
+	if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
+		bch_err(ca, "Cannot remove without losing data");
+		ret = -BCH_ERR_device_state_not_allowed;
+		goto err;
+	}
+
+	__bch2_dev_read_only(c, ca);
+
+	ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
+	if (ret) {
+		bch_err_msg(ca, ret, "dropping data");
+		goto err;
+	}
+
+	ret = bch2_dev_remove_alloc(c, ca);
+	if (ret) {
+		bch_err_msg(ca, ret, "deleting alloc info");
+		goto err;
+	}
+
+	ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
+	if (ret) {
+		bch_err_msg(ca, ret, "flushing journal");
+		goto err;
+	}
+
+	ret = bch2_journal_flush(&c->journal);
+	if (ret) {
+		bch_err(ca, "journal error");
+		goto err;
+	}
+
+	ret = bch2_replicas_gc2(c);
+	if (ret) {
+		bch_err_msg(ca, ret, "in replicas_gc2()");
+		goto err;
+	}
+
+	/* Bitmask of data types still present; must be empty to proceed */
+	data = bch2_dev_has_data(c, ca);
+	if (data) {
+		struct printbuf data_has = PRINTBUF;
+
+		prt_bitflags(&data_has, bch2_data_types, data);
+		bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
+		printbuf_exit(&data_has);
+		ret = -EBUSY;
+		goto err;
+	}
+
+	__bch2_dev_offline(c, ca);
+
+	/* Unpublish the device from c->devs[] */
+	mutex_lock(&c->sb_lock);
+	rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
+	mutex_unlock(&c->sb_lock);
+
+	/* Wait for ca->ref_completion after killing the ref, then free */
+	percpu_ref_kill(&ca->ref);
+	wait_for_completion(&ca->ref_completion);
+
+	bch2_dev_free(ca);
+
+	/*
+	 * At this point the device object has been removed in-core, but the
+	 * on-disk journal might still refer to the device index via sb device
+	 * usage entries. Recovery fails if it sees usage information for an
+	 * invalid device. Flush journal pins to push the back of the journal
+	 * past now invalid device index references before we update the
+	 * superblock, but after the device object has been removed so any
+	 * further journal writes elide usage info for the device.
+	 */
+	bch2_journal_flush_all_pins(&c->journal);
+
+	/*
+	 * Free this device's slot in the bch_member array - all pointers to
+	 * this device must be gone:
+	 */
+	mutex_lock(&c->sb_lock);
+	m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
+	memset(&m->uuid, 0, sizeof(m->uuid));
+
+	bch2_write_super(c);
+
+	mutex_unlock(&c->sb_lock);
+	up_write(&c->state_lock);
+
+	bch2_dev_usage_journal_reserve(c);
+	return 0;
+err:
+	/* Undo __bch2_dev_read_only() if the device can still take writes */
+	if (ca->mi.state == BCH_MEMBER_STATE_rw &&
+	    !percpu_ref_is_zero(&ca->io_ref))
+		__bch2_dev_read_write(c, ca);
+	up_write(&c->state_lock);
+	return ret;
+}
+
+/*
+ * Add new device to running filesystem:
+ *
+ * Reads the superblock at @path, allocates a device for it, picks the first
+ * free member slot in the filesystem superblock, copies the member entry
+ * there and attaches the device. If the member carried a disk group, the
+ * label is re-created in this filesystem.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int bch2_dev_add(struct bch_fs *c, const char *path)
+{
+	struct bch_opts opts = bch2_opts_empty();
+	struct bch_sb_handle sb;
+	struct bch_dev *ca = NULL;
+	struct bch_sb_field_members_v2 *mi;
+	struct bch_member dev_mi;
+	unsigned dev_idx, nr_devices, u64s;
+	struct printbuf errbuf = PRINTBUF;
+	struct printbuf label = PRINTBUF;
+	int ret;
+
+	ret = bch2_read_super(path, &opts, &sb);
+	if (ret) {
+		bch_err_msg(c, ret, "reading super");
+		goto err;
+	}
+
+	dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
+
+	/* Capture the label text now, from the device's own superblock */
+	if (BCH_MEMBER_GROUP(&dev_mi)) {
+		bch2_disk_path_to_text(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
+		if (label.allocation_failure) {
+			ret = -ENOMEM;
+			goto err;
+		}
+	}
+
+	ret = bch2_dev_may_add(sb.sb, c);
+	if (ret) {
+		bch_err_fn(c, ret);
+		goto err;
+	}
+
+	ca = __bch2_dev_alloc(c, &dev_mi);
+	if (!ca) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	bch2_dev_usage_init(ca);
+
+	ret = __bch2_dev_attach_bdev(ca, &sb);
+	if (ret)
+		goto err;
+
+	ret = bch2_dev_journal_alloc(ca);
+	if (ret) {
+		bch_err_msg(c, ret, "allocating journal");
+		goto err;
+	}
+
+	down_write(&c->state_lock);
+	mutex_lock(&c->sb_lock);
+
+	ret = bch2_sb_from_fs(c, ca);
+	if (ret) {
+		bch_err_msg(c, ret, "setting up new superblock");
+		goto err_unlock;
+	}
+
+	mi = bch2_sb_field_get(ca->disk_sb.sb, members_v2);
+
+	if (!bch2_sb_field_resize(&ca->disk_sb, members_v2,
+				le32_to_cpu(mi->field.u64s) +
+				sizeof(dev_mi) / sizeof(u64))) {
+		ret = -BCH_ERR_ENOSPC_sb_members;
+		bch_err_msg(c, ret, "setting up new superblock");
+		goto err_unlock;
+	}
+
+	if (dynamic_fault("bcachefs:add:no_slot"))
+		goto no_slot;
+
+	/* First member index not in use in the filesystem superblock */
+	for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
+		if (!bch2_dev_exists(c->disk_sb.sb, dev_idx))
+			goto have_slot;
+no_slot:
+	ret = -BCH_ERR_ENOSPC_sb_members;
+	bch_err_msg(c, ret, "setting up new superblock");
+	goto err_unlock;
+
+have_slot:
+	nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
+
+	/*
+	 * The resize of ca->disk_sb above may have reallocated that
+	 * superblock, so the old @mi must not be dereferenced. Size the
+	 * filesystem's members field from the filesystem's own superblock.
+	 */
+	mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+	u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
+			    le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
+
+	mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
+	if (!mi) {
+		ret = -BCH_ERR_ENOSPC_sb_members;
+		bch_err_msg(c, ret, "setting up new superblock");
+		goto err_unlock;
+	}
+	struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
+
+	/* success: */
+
+	*m = dev_mi;
+	m->last_mount = cpu_to_le64(ktime_get_real_seconds());
+	c->disk_sb.sb->nr_devices	= nr_devices;
+
+	ca->disk_sb.sb->dev_idx	= dev_idx;
+	bch2_dev_attach(c, ca, dev_idx);
+
+	if (BCH_MEMBER_GROUP(&dev_mi)) {
+		ret = __bch2_dev_group_set(c, ca, label.buf);
+		if (ret) {
+			bch_err_msg(c, ret, "creating new label");
+			goto err_unlock;
+		}
+	}
+
+	bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	bch2_dev_usage_journal_reserve(c);
+
+	ret = bch2_trans_mark_dev_sb(c, ca);
+	if (ret) {
+		bch_err_msg(c, ret, "marking new superblock");
+		goto err_late;
+	}
+
+	ret = bch2_fs_freespace_init(c);
+	if (ret) {
+		bch_err_msg(c, ret, "initializing free space");
+		goto err_late;
+	}
+
+	ca->new_fs_bucket_idx = 0;
+
+	if (ca->mi.state == BCH_MEMBER_STATE_rw)
+		__bch2_dev_read_write(c, ca);
+
+	up_write(&c->state_lock);
+
+	/* The label text is no longer needed; don't leak it on success */
+	printbuf_exit(&label);
+	printbuf_exit(&errbuf);
+	return 0;
+
+err_unlock:
+	mutex_unlock(&c->sb_lock);
+	up_write(&c->state_lock);
+err:
+	if (ca)
+		bch2_dev_free(ca);
+	bch2_free_super(&sb);
+	printbuf_exit(&label);
+	printbuf_exit(&errbuf);
+	return ret;
+err_late:
+	/* The device is already attached to @c: do not free it here */
+	up_write(&c->state_lock);
+	ca = NULL;
+	goto err;
+}
+
+/*
+ * Hot add existing device to running filesystem:
+ *
+ * Reads the superblock at @path, checks that it belongs to @c, attaches the
+ * block device to its existing member slot, marks its superblock buckets and
+ * records a new last_mount time. Runs with c->state_lock held for write.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int bch2_dev_online(struct bch_fs *c, const char *path)
+{
+	struct bch_opts opts = bch2_opts_empty();
+	struct bch_sb_handle sb = { NULL };
+	struct bch_dev *ca;
+	unsigned dev_idx;
+	int ret;
+
+	down_write(&c->state_lock);
+
+	ret = bch2_read_super(path, &opts, &sb);
+	if (ret) {
+		up_write(&c->state_lock);
+		return ret;
+	}
+
+	/* Saved now: a successful attach zeroes @sb */
+	dev_idx = sb.sb->dev_idx;
+
+	ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
+	if (ret) {
+		bch_err_msg(c, ret, "bringing %s online", path);
+		goto err;
+	}
+
+	ret = bch2_dev_attach_bdev(c, &sb);
+	if (ret)
+		goto err;
+
+	ca = bch_dev_locked(c, dev_idx);
+
+	ret = bch2_trans_mark_dev_sb(c, ca);
+	if (ret) {
+		bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
+		goto err;
+	}
+
+	if (ca->mi.state == BCH_MEMBER_STATE_rw)
+		__bch2_dev_read_write(c, ca);
+
+	mutex_lock(&c->sb_lock);
+	struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+
+	m->last_mount =
+		cpu_to_le64(ktime_get_real_seconds());
+
+	bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	/*
+	 * NOTE(review): a failure here is logged but 0 is still returned -
+	 * confirm this is intentional.
+	 */
+	ret = bch2_fs_freespace_init(c);
+	if (ret)
+		bch_err_msg(c, ret, "initializing free space");
+
+	up_write(&c->state_lock);
+	return 0;
+err:
+	up_write(&c->state_lock);
+	bch2_free_super(&sb);
+	return ret;
+}
+
+/*
+ * Take @ca offline, holding c->state_lock for write.
+ *
+ * Returns 0 if the device was taken offline or was already offline, or
+ * -BCH_ERR_device_state_not_allowed if bch2_dev_state_allowed() refuses.
+ */
+int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
+{
+	int ret = 0;
+
+	down_write(&c->state_lock);
+
+	if (!bch2_dev_is_online(ca)) {
+		bch_err(ca, "Already offline");
+		goto out;
+	}
+
+	if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
+		bch_err(ca, "Cannot offline required disk");
+		ret = -BCH_ERR_device_state_not_allowed;
+		goto out;
+	}
+
+	__bch2_dev_offline(c, ca);
+out:
+	up_write(&c->state_lock);
+	return ret;
+}
+
+/*
+ * Grow @ca to @nbuckets buckets.
+ *
+ * Shrinking is rejected with -EINVAL. If the device is online, the new size
+ * must fit within the block device's capacity. Runs with c->state_lock held
+ * for write.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
+{
+	struct bch_member *m;
+	u64 old_nbuckets;
+	int ret = 0;
+
+	down_write(&c->state_lock);
+	old_nbuckets = ca->mi.nbuckets;
+
+	if (nbuckets < ca->mi.nbuckets) {
+		bch_err(ca, "Cannot shrink yet");
+		ret = -EINVAL;
+		goto err;
+	}
+
+	if (bch2_dev_is_online(ca) &&
+	    get_capacity(ca->disk_sb.bdev->bd_disk) <
+	    ca->mi.bucket_size * nbuckets) {
+		bch_err(ca, "New size larger than device");
+		ret = -BCH_ERR_device_size_too_small;
+		goto err;
+	}
+
+	ret = bch2_dev_buckets_resize(c, ca, nbuckets);
+	if (ret) {
+		bch_err_msg(ca, ret, "resizing buckets");
+		goto err;
+	}
+
+	ret = bch2_trans_mark_dev_sb(c, ca);
+	if (ret)
+		goto err;
+
+	/* Persist the new bucket count in the member entry */
+	mutex_lock(&c->sb_lock);
+	m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+	m->nbuckets = cpu_to_le64(nbuckets);
+
+	bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	if (ca->mi.freespace_initialized) {
+		/* Only the newly added range [old_nbuckets, nbuckets) */
+		ret = bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
+		if (ret)
+			goto err;
+
+		/*
+		 * XXX: this is all wrong transactionally - we'll be able to do
+		 * this correctly after the disk space accounting rewrite
+		 */
+		ca->usage_base->d[BCH_DATA_free].buckets += nbuckets - old_nbuckets;
+	}
+
+	bch2_recalc_capacity(c);
+err:
+	/* Also reached on success, with ret == 0 */
+	up_write(&c->state_lock);
+	return ret;
+}
+
+/*
+ * Find the member device whose name equals @name.
+ *
+ * return with ref on ca->ref:
+ * NOTE(review): no explicit reference acquisition is visible in this body -
+ * confirm whether for_each_member_device_rcu() or the caller takes it.
+ *
+ * Returns the device, or ERR_PTR(-BCH_ERR_ENOENT_dev_not_found) if no member
+ * matches.
+ */
+struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
+{
+	struct bch_dev *ca;
+	unsigned i;
+
+	rcu_read_lock();
+	for_each_member_device_rcu(ca, c, i, NULL)
+		if (!strcmp(name, ca->name))
+			goto found;
+	/* Loop ran to completion without a match */
+	ca = ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
+found:
+	rcu_read_unlock();
+
+	return ca;
+}
+
+/* Filesystem open: */
+
+/*
+ * Open a filesystem from the @nr_devices paths in @devices.
+ *
+ * Reads every superblock, picks the one with the highest seq as the
+ * reference, drops devices that the reference superblock no longer lists as
+ * members, checks that the rest belong to the same filesystem, then
+ * allocates the filesystem, attaches each device and (unless opts.nostart)
+ * starts it.
+ *
+ * Returns the filesystem on success or an ERR_PTR() on failure. A module
+ * reference is held for the duration of the call.
+ */
+struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
+			    struct bch_opts opts)
+{
+	struct bch_sb_handle *sb = NULL;
+	struct bch_fs *c = NULL;
+	unsigned i, best_sb = 0;
+	struct printbuf errbuf = PRINTBUF;
+	int ret = 0;
+
+	if (!try_module_get(THIS_MODULE))
+		return ERR_PTR(-ENODEV);
+
+	if (!nr_devices) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
+	if (!sb) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	for (i = 0; i < nr_devices; i++) {
+		ret = bch2_read_super(devices[i], &opts, &sb[i]);
+		if (ret)
+			goto err;
+
+	}
+
+	/* The superblock with the highest sequence number is the reference */
+	for (i = 1; i < nr_devices; i++)
+		if (le64_to_cpu(sb[i].sb->seq) >
+		    le64_to_cpu(sb[best_sb].sb->seq))
+			best_sb = i;
+
+	i = 0;
+	while (i < nr_devices) {
+		if (i != best_sb &&
+		    !bch2_dev_exists(sb[best_sb].sb, sb[i].sb->dev_idx)) {
+			pr_info("%pg has been removed, skipping", sb[i].bdev);
+			bch2_free_super(&sb[i]);
+			array_remove_item(sb, nr_devices, i);
+
+			/*
+			 * Entries after @i moved down one slot; keep best_sb
+			 * pointing at the same superblock.
+			 */
+			if (best_sb > i)
+				best_sb--;
+			continue;
+		}
+
+		ret = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb);
+		if (ret)
+			goto err_print;
+		i++;
+	}
+
+	c = bch2_fs_alloc(sb[best_sb].sb, opts);
+	if (IS_ERR(c)) {
+		ret = PTR_ERR(c);
+		goto err;
+	}
+
+	down_write(&c->state_lock);
+	for (i = 0; i < nr_devices; i++) {
+		ret = bch2_dev_attach_bdev(c, &sb[i]);
+		if (ret) {
+			up_write(&c->state_lock);
+			goto err;
+		}
+	}
+	up_write(&c->state_lock);
+
+	if (!bch2_fs_may_start(c)) {
+		ret = -BCH_ERR_insufficient_devices_to_start;
+		goto err_print;
+	}
+
+	if (!c->opts.nostart) {
+		ret = bch2_fs_start(c);
+		if (ret)
+			goto err;
+	}
+out:
+	kfree(sb);
+	printbuf_exit(&errbuf);
+	module_put(THIS_MODULE);
+	return c;
+err_print:
+	pr_err("bch_fs_open err opening %s: %s",
+	       devices[0], bch2_err_str(ret));
+err:
+	if (!IS_ERR_OR_NULL(c))
+		bch2_fs_stop(c);
+	/* Handles already moved into a device were zeroed by the attach */
+	if (sb)
+		for (i = 0; i < nr_devices; i++)
+			bch2_free_super(&sb[i]);
+	c = ERR_PTR(ret);
+	goto out;
+}
+
+/* Global interfaces/init */
+
+/*
+ * Module teardown. Also called by bcachefs_init() on a partial failure, so
+ * the kset is only unregistered if it was created.
+ */
+static void bcachefs_exit(void)
+{
+	bch2_debug_exit();
+	bch2_vfs_exit();
+	bch2_chardev_exit();
+	bch2_btree_key_cache_exit();
+	if (bcachefs_kset)
+		kset_unregister(bcachefs_kset);
+}
+
+/*
+ * Module init: create the "bcachefs" kset under fs_kobj and initialise the
+ * subsystems in order. The first step that fails short-circuits the chain;
+ * bcachefs_exit() is then run and -ENOMEM returned regardless of which step
+ * failed.
+ */
+static int __init bcachefs_init(void)
+{
+	bch2_bkey_pack_test();
+
+	if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
+	    bch2_btree_key_cache_init() ||
+	    bch2_chardev_init() ||
+	    bch2_vfs_init() ||
+	    bch2_debug_init())
+		goto err;
+
+	return 0;
+err:
+	bcachefs_exit();
+	return -ENOMEM;
+}
+
+/*
+ * For each entry in BCH_DEBUG_PARAMS(), define a bool bch2_<name> and expose
+ * it as module parameter <name> (mode 0644) with its description.
+ */
+#define BCH_DEBUG_PARAM(name, description)			\
+	bool bch2_##name;					\
+	module_param_named(name, bch2_##name, bool, 0644);	\
+	MODULE_PARM_DESC(name, description);
+BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+/* Exposes the current metadata version as module parameter "version" (0400) */
+__maybe_unused
+static unsigned bch2_metadata_version = bcachefs_metadata_version_current;
+module_param_named(version, bch2_metadata_version, uint, 0400);
+
+module_exit(bcachefs_exit);
+module_init(bcachefs_init);
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
new file mode 100644
index 000000000000..bf762df18012
--- /dev/null
+++ b/fs/bcachefs/super.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUPER_H
+#define _BCACHEFS_SUPER_H
+
+#include "extents.h"
+
+#include "bcachefs_ioctl.h"
+
+#include <linux/math64.h>
+
+/* Filesystem lookup by device number or UUID */
+struct bch_fs *bch2_dev_to_fs(dev_t);
+struct bch_fs *bch2_uuid_to_fs(__uuid_t);
+
+/*
+ * Member state changes. bch2_dev_state_allowed() and __bch2_dev_set_state()
+ * expect c->state_lock to be held; bch2_dev_set_state() takes it itself.
+ */
+bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
+			   enum bch_member_state, int);
+int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
+			enum bch_member_state, int);
+int bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
+		      enum bch_member_state, int);
+
+/* Device management on a running filesystem */
+int bch2_dev_fail(struct bch_dev *, int);
+int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int);
+int bch2_dev_add(struct bch_fs *, const char *);
+int bch2_dev_online(struct bch_fs *, const char *);
+int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int);
+int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
+struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
+
+/* Filesystem read-only / read-write transitions */
+bool bch2_fs_emergency_read_only(struct bch_fs *);
+void bch2_fs_read_only(struct bch_fs *);
+
+int bch2_fs_read_write(struct bch_fs *);
+int bch2_fs_read_write_early(struct bch_fs *);
+
+/*
+ * Only for use in the recovery/fsck path:
+ */
+static inline void bch2_fs_lazy_rw(struct bch_fs *c)
+{
+	/* Nothing to do if the filesystem is, or has already been, rw */
+	if (test_bit(BCH_FS_RW, &c->flags) ||
+	    test_bit(BCH_FS_WAS_RW, &c->flags))
+		return;
+
+	bch2_fs_read_write_early(c);
+}
+
+/* Filesystem shutdown and release */
+void __bch2_fs_stop(struct bch_fs *);
+void bch2_fs_free(struct bch_fs *);
+void bch2_fs_stop(struct bch_fs *);
+
+/* bch2_fs_open() calls bch2_fs_start() itself unless opts.nostart is set */
+int bch2_fs_start(struct bch_fs *);
+struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);
+
+#endif /* _BCACHEFS_SUPER_H */
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
new file mode 100644
index 000000000000..78d6138db62d
--- /dev/null
+++ b/fs/bcachefs/super_types.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUPER_TYPES_H
+#define _BCACHEFS_SUPER_TYPES_H
+
+/*
+ * An opened superblock together with the block device it was read from.
+ * NOTE(review): field roles below are inferred from names - confirm against
+ * the superblock I/O code.
+ */
+struct bch_sb_handle {
+	struct bch_sb		*sb;		/* superblock contents */
+	struct block_device	*bdev;		/* device it belongs to */
+	struct bio		*bio;
+	void			*holder;
+	size_t			buffer_size;	/* presumably size of the sb buffer */
+	blk_mode_t		mode;
+	unsigned		have_layout:1;
+	unsigned		have_bio:1;
+	unsigned		fs_sb:1;
+	u64			seq;
+};
+
+/* Bitmap with room for one bit per possible member (BCH_SB_MEMBERS_MAX) */
+struct bch_devs_mask {
+	unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
+};
+
+/*
+ * Small fixed-capacity list, at most BCH_BKEY_PTRS_MAX entries.
+ * NOTE(review): entries are presumably device indices - confirm.
+ */
+struct bch_devs_list {
+	u8			nr;	/* presumably number of valid entries in devs[] */
+	u8			devs[BCH_BKEY_PTRS_MAX];
+};
+
+/*
+ * Per-member settings in native integer types.
+ * NOTE(review): presumably the CPU-side form of struct bch_member - confirm.
+ */
+struct bch_member_cpu {
+	u64			nbuckets;	/* device size */
+	u16			first_bucket;   /* index of first bucket used */
+	u16			bucket_size;	/* sectors */
+	u16			group;
+	u8			state;
+	u8			discard;
+	u8			data_allowed;
+	u8			durability;
+	u8			freespace_initialized;
+	u8			valid;
+};
+
+/*
+ * One disk group entry.
+ * NOTE(review): field meanings are inferred from names - confirm.
+ */
+struct bch_disk_group_cpu {
+	bool				deleted;
+	u16				parent;
+	struct bch_devs_mask		devs;	/* presumably the devices in this group */
+};
+
+/*
+ * Variable-length table of disk groups: @nr entries follow the header. The
+ * embedded rcu_head suggests the table is freed via RCU - confirm.
+ */
+struct bch_disk_groups_cpu {
+	struct rcu_head			rcu;
+	unsigned			nr;
+	struct bch_disk_group_cpu	entries[] __counted_by(nr);
+};
+
+#endif /* _BCACHEFS_SUPER_TYPES_H */
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
new file mode 100644
index 000000000000..eb764b9a4629
--- /dev/null
+++ b/fs/bcachefs/sysfs.c
@@ -0,0 +1,1031 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * bcache sysfs interfaces
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#ifndef NO_BCACHEFS_SYSFS
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "sysfs.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "clock.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "inode.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "movinggc.h"
+#include "nocow_locking.h"
+#include "opts.h"
+#include "rebalance.h"
+#include "replicas.h"
+#include "super-io.h"
+#include "tests.h"
+
+#include <linux/blkdev.h>
+#include <linux/sort.h>
+#include <linux/sched/clock.h>
+
+#include "util.h"
+
+/*
+ * Define <type>_sysfs_ops, wiring up the <type>_show and <type>_store
+ * functions produced by SHOW(type) and STORE(type).
+ */
+#define SYSFS_OPS(type)							\
+const struct sysfs_ops type ## _sysfs_ops = {				\
+	.show	= type ## _show,					\
+	.store	= type ## _store					\
+}
+
+/*
+ * SHOW(fn) { body } defines fn_show(), a sysfs show handler, around a
+ * fn_to_text(out, kobj, attr) body that prints into a printbuf.
+ *
+ * The wrapper appends a trailing newline if output is non-empty and lacks
+ * one, turns a printbuf allocation failure into -ENOMEM, and on success
+ * copies at most PAGE_SIZE - 1 bytes into the sysfs buffer and returns the
+ * byte count. Errors are passed through bch2_err_class().
+ */
+#define SHOW(fn)							\
+static ssize_t fn ## _to_text(struct printbuf *,			\
+			      struct kobject *, struct attribute *);	\
+									\
+static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
+			   char *buf)					\
+{									\
+	struct printbuf out = PRINTBUF;					\
+	ssize_t ret = fn ## _to_text(&out, kobj, attr);			\
+									\
+	if (out.pos && out.buf[out.pos - 1] != '\n')			\
+		prt_newline(&out);					\
+									\
+	if (!ret && out.allocation_failure)				\
+		ret = -ENOMEM;						\
+									\
+	if (!ret) {							\
+		ret = min_t(size_t, out.pos, PAGE_SIZE - 1);		\
+		memcpy(buf, out.buf, ret);				\
+	}								\
+	printbuf_exit(&out);						\
+	return bch2_err_class(ret);					\
+}									\
+									\
+static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\
+			      struct attribute *attr)
+
+/*
+ * STORE(fn) { body } defines fn_store(), a sysfs store handler that calls
+ * the fn_store_inner(kobj, attr, buf, size) body and passes its result
+ * through bch2_err_class().
+ */
+#define STORE(fn)							\
+static ssize_t fn ## _store_inner(struct kobject *, struct attribute *,\
+			    const char *, size_t);			\
+									\
+static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
+			    const char *buf, size_t size)		\
+{									\
+	return bch2_err_class(fn##_store_inner(kobj, attr, buf, size));	\
+}									\
+									\
+static ssize_t fn ## _store_inner(struct kobject *kobj, struct attribute *attr,\
+				  const char *buf, size_t size)
+
+/* Define a static struct attribute sysfs_<name> with the given file mode */
+#define __sysfs_attribute(_name, _mode)					\
+	static struct attribute sysfs_##_name =				\
+		{ .name = #_name, .mode = _mode }
+
+/* Shorthands: write-only (0200), read-only (0444), read-write (0644) */
+#define write_attribute(n)	__sysfs_attribute(n, 0200)
+#define read_attribute(n)	__sysfs_attribute(n, 0444)
+#define rw_attribute(n)		__sysfs_attribute(n, 0644)
+
+/*
+ * Show-side helpers. Each expands to "if this is attribute <file>, print
+ * it"; they rely on `attr` and `out` being in scope, as in a SHOW() body.
+ */
+
+/* printf-style output followed by a newline */
+#define sysfs_printf(file, fmt, ...)					\
+do {									\
+	if (attr == &sysfs_ ## file)					\
+		prt_printf(out, fmt "\n", __VA_ARGS__);			\
+} while (0)
+
+/* Print @var via snprint() */
+#define sysfs_print(file, var)						\
+do {									\
+	if (attr == &sysfs_ ## file)					\
+		snprint(out, var);					\
+} while (0)
+
+/* Print @val via prt_human_readable_s64() */
+#define sysfs_hprint(file, val)						\
+do {									\
+	if (attr == &sysfs_ ## file)					\
+		prt_human_readable_s64(out, val);			\
+} while (0)
+
+/*
+ * Store-side helpers. They rely on `attr`, `buf` and `size` being in scope,
+ * as in a STORE() body, and RETURN from the enclosing function when the
+ * attribute matches: the parse error if any, otherwise @size.
+ */
+#define sysfs_strtoul(file, var)					\
+do {									\
+	if (attr == &sysfs_ ## file)					\
+		return strtoul_safe(buf, var) ?: (ssize_t) size;	\
+} while (0)
+
+/* As sysfs_strtoul(), parsing via strtoul_safe_clamp() with @min and @max */
+#define sysfs_strtoul_clamp(file, var, min, max)			\
+do {									\
+	if (attr == &sysfs_ ## file)					\
+		return strtoul_safe_clamp(buf, var, min, max)		\
+			?: (ssize_t) size;				\
+} while (0)
+
+/*
+ * Parse @cp as a base-10 unsigned long and evaluate to it; on a kstrtoul()
+ * error, RETURN that error from the enclosing function.
+ */
+#define strtoul_or_return(cp)						\
+({									\
+	unsigned long _v;						\
+	int _r = kstrtoul(cp, 10, &_v);					\
+	if (_r)								\
+		return _r;						\
+	_v;								\
+})
+
+write_attribute(trigger_gc);
+write_attribute(trigger_discards);
+write_attribute(trigger_invalidates);
+write_attribute(prune_cache);
+write_attribute(btree_wakeup);
+rw_attribute(btree_gc_periodic);
+rw_attribute(gc_gens_pos);
+
+read_attribute(uuid);
+read_attribute(minor);
+read_attribute(bucket_size);
+read_attribute(first_bucket);
+read_attribute(nbuckets);
+rw_attribute(durability);
+read_attribute(iodone);
+
+read_attribute(io_latency_read);
+read_attribute(io_latency_write);
+read_attribute(io_latency_stats_read);
+read_attribute(io_latency_stats_write);
+read_attribute(congested);
+
+read_attribute(btree_write_stats);
+
+read_attribute(btree_cache_size);
+read_attribute(compression_stats);
+read_attribute(journal_debug);
+read_attribute(btree_updates);
+read_attribute(btree_cache);
+read_attribute(btree_key_cache);
+read_attribute(stripes_heap);
+read_attribute(open_buckets);
+read_attribute(open_buckets_partial);
+read_attribute(write_points);
+read_attribute(nocow_lock_table);
+
+#ifdef BCH_WRITE_REF_DEBUG
+read_attribute(write_refs);
+
+static const char * const bch2_write_refs[] = {
+#define x(n)	#n,
+	BCH_WRITE_REFS()
+#undef x
+	NULL
+};
+
+/*
+ * Print one line per entry of c->writes[]: the name from bch2_write_refs[],
+ * a tab stop at column 24, then the current counter value.
+ */
+static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c)
+{
+	unsigned ref;
+
+	bch2_printbuf_tabstop_push(out, 24);
+
+	for (ref = 0; ref < ARRAY_SIZE(c->writes); ref++) {
+		prt_str(out, bch2_write_refs[ref]);
+		prt_tab(out);
+		prt_printf(out, "%li", atomic_long_read(&c->writes[ref]));
+		prt_newline(out);
+	}
+}
+#endif
+
+read_attribute(internal_uuid);
+read_attribute(disk_groups);
+
+read_attribute(has_data);
+read_attribute(alloc_debug);
+
+#define x(t, n, ...) read_attribute(t);
+BCH_PERSISTENT_COUNTERS()
+#undef x
+
+rw_attribute(discard);
+rw_attribute(label);
+
+rw_attribute(copy_gc_enabled);
+read_attribute(copy_gc_wait);
+
+rw_attribute(rebalance_enabled);
+sysfs_pd_controller_attribute(rebalance);
+read_attribute(rebalance_work);
+rw_attribute(promote_whole_extents);
+
+read_attribute(new_stripes);
+
+read_attribute(io_timers_read);
+read_attribute(io_timers_write);
+
+read_attribute(moving_ctxts);
+
+#ifdef CONFIG_BCACHEFS_TESTS
+write_attribute(perf_test);
+#endif /* CONFIG_BCACHEFS_TESTS */
+
+#define x(_name)						\
+	static struct attribute sysfs_time_stat_##_name =		\
+		{ .name = #_name, .mode = 0444 };
+	BCH_TIME_STATS()
+#undef x
+
+static struct attribute sysfs_state_rw = {
+	.name = "state",
+	.mode =  0444,
+};
+
+/*
+ * Total size of the live btree node cache: btree_bytes(c) summed once per
+ * node on c->btree_cache.live, walked under c->btree_cache.lock.
+ */
+static size_t bch2_btree_cache_size(struct bch_fs *c)
+{
+	struct btree *node;
+	size_t bytes = 0;
+
+	mutex_lock(&c->btree_cache.lock);
+
+	list_for_each_entry(node, &c->btree_cache.live, list)
+		bytes += btree_bytes(c);
+
+	mutex_unlock(&c->btree_cache.lock);
+
+	return bytes;
+}
+
+/*
+ * Walk every btree whose keys carry pointers and print extent counts and
+ * sizes split into uncompressed, compressed and incompressible.
+ *
+ * Returns -EPERM if the filesystem has not been started, the iteration error
+ * if one is left in ret, and 0 otherwise.
+ */
+static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
+{
+	struct btree_trans *trans;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	enum btree_id id;
+	u64 nr_uncompressed_extents = 0,
+	    nr_compressed_extents = 0,
+	    nr_incompressible_extents = 0,
+	    uncompressed_sectors = 0,
+	    incompressible_sectors = 0,
+	    compressed_sectors_compressed = 0,
+	    compressed_sectors_uncompressed = 0;
+	int ret = 0;
+
+	if (!test_bit(BCH_FS_STARTED, &c->flags))
+		return -EPERM;
+
+	trans = bch2_trans_get(c);
+
+	for (id = 0; id < BTREE_ID_NR; id++) {
+		if (!btree_type_has_ptrs(id))
+			continue;
+
+		for_each_btree_key(trans, iter, id, POS_MIN,
+				   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+			struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+			const union bch_extent_entry *entry;
+			struct extent_ptr_decoded p;
+			bool compressed = false, uncompressed = false, incompressible = false;
+
+			/* Sector totals are accumulated once per pointer */
+			bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+				switch (p.crc.compression_type) {
+				case BCH_COMPRESSION_TYPE_none:
+					uncompressed = true;
+					uncompressed_sectors += k.k->size;
+					break;
+				case BCH_COMPRESSION_TYPE_incompressible:
+					incompressible = true;
+					incompressible_sectors += k.k->size;
+					break;
+				default:
+					compressed_sectors_compressed +=
+						p.crc.compressed_size;
+					compressed_sectors_uncompressed +=
+						p.crc.uncompressed_size;
+					compressed = true;
+					break;
+				}
+			}
+
+			/*
+			 * Each key is counted in exactly one class, with
+			 * precedence incompressible > uncompressed > compressed
+			 */
+			if (incompressible)
+				nr_incompressible_extents++;
+			else if (uncompressed)
+				nr_uncompressed_extents++;
+			else if (compressed)
+				nr_compressed_extents++;
+		}
+		bch2_trans_iter_exit(trans, &iter);
+	}
+
+	bch2_trans_put(trans);
+
+	if (ret)
+		return ret;
+
+	/* Totals are in sectors; << 9 converts them to bytes for printing */
+	prt_printf(out, "uncompressed:\n");
+	prt_printf(out, "	nr extents:		%llu\n", nr_uncompressed_extents);
+	prt_printf(out, "	size:			");
+	prt_human_readable_u64(out, uncompressed_sectors << 9);
+	prt_printf(out, "\n");
+
+	prt_printf(out, "compressed:\n");
+	prt_printf(out, "	nr extents:		%llu\n", nr_compressed_extents);
+	prt_printf(out, "	compressed size:	");
+	prt_human_readable_u64(out, compressed_sectors_compressed << 9);
+	prt_printf(out, "\n");
+	prt_printf(out, "	uncompressed size:	");
+	prt_human_readable_u64(out, compressed_sectors_uncompressed << 9);
+	prt_printf(out, "\n");
+
+	prt_printf(out, "incompressible:\n");
+	prt_printf(out, "	nr extents:		%llu\n", nr_incompressible_extents);
+	prt_printf(out, "	size:			");
+	prt_human_readable_u64(out, incompressible_sectors << 9);
+	prt_printf(out, "\n");
+	return 0;
+}
+
+/* Print "<btree name>: <position>" from c->gc_gens_btree and c->gc_gens_pos */
+static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
+{
+	prt_printf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]);
+	bch2_bpos_to_text(out, c->gc_gens_pos);
+	prt_printf(out, "\n");
+}
+
+/*
+ * For every transaction on c->btree_trans_list that has a non-NULL
+ * trans->locking, wake all waiters on that object's six lock. The list is
+ * walked under c->btree_trans_lock.
+ */
+static void bch2_btree_wakeup_all(struct bch_fs *c)
+{
+	struct btree_trans *trans;
+
+	seqmutex_lock(&c->btree_trans_lock);
+
+	list_for_each_entry(trans, &c->btree_trans_list, list) {
+		struct btree_bkey_cached_common *locking =
+			READ_ONCE(trans->locking);
+
+		if (!locking)
+			continue;
+
+		six_lock_wakeup_all(&locking->lock);
+	}
+
+	seqmutex_unlock(&c->btree_trans_lock);
+}
+
+/*
+ * sysfs show handler body for the filesystem kobject. Each statement below
+ * is guarded on `attr`, either explicitly or inside a sysfs_* helper macro.
+ * Always returns 0.
+ */
+SHOW(bch2_fs)
+{
+	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
+
+	sysfs_print(minor,			c->minor);
+	sysfs_printf(internal_uuid, "%pU",	c->sb.uuid.b);
+
+	sysfs_hprint(btree_cache_size,		bch2_btree_cache_size(c));
+
+	if (attr == &sysfs_btree_write_stats)
+		bch2_btree_write_stats_to_text(out, c);
+
+	sysfs_printf(btree_gc_periodic, "%u",	(int) c->btree_gc_periodic);
+
+	if (attr == &sysfs_gc_gens_pos)
+		bch2_gc_gens_pos_to_text(out, c);
+
+	sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
+
+	sysfs_printf(rebalance_enabled,		"%i", c->rebalance.enabled);
+	sysfs_pd_controller_show(rebalance,	&c->rebalance.pd); /* XXX */
+
+	if (attr == &sysfs_copy_gc_wait)
+		bch2_copygc_wait_to_text(out, c);
+
+	if (attr == &sysfs_rebalance_work)
+		bch2_rebalance_work_to_text(out, c);
+
+	sysfs_print(promote_whole_extents,	c->promote_whole_extents);
+
+	/* Debugging: */
+
+	if (attr == &sysfs_journal_debug)
+		bch2_journal_debug_to_text(out, &c->journal);
+
+	if (attr == &sysfs_btree_updates)
+		bch2_btree_updates_to_text(out, c);
+
+	if (attr == &sysfs_btree_cache)
+		bch2_btree_cache_to_text(out, c);
+
+	if (attr == &sysfs_btree_key_cache)
+		bch2_btree_key_cache_to_text(out, &c->btree_key_cache);
+
+	if (attr == &sysfs_stripes_heap)
+		bch2_stripes_heap_to_text(out, c);
+
+	if (attr == &sysfs_open_buckets)
+		bch2_open_buckets_to_text(out, c);
+
+	if (attr == &sysfs_open_buckets_partial)
+		bch2_open_buckets_partial_to_text(out, c);
+
+	if (attr == &sysfs_write_points)
+		bch2_write_points_to_text(out, c);
+
+	/*
+	 * NOTE(review): the return value (e.g. -EPERM before the filesystem
+	 * has started) is discarded, so the read yields empty output rather
+	 * than an error - confirm this is intended.
+	 */
+	if (attr == &sysfs_compression_stats)
+		bch2_compression_stats_to_text(out, c);
+
+	if (attr == &sysfs_new_stripes)
+		bch2_new_stripes_to_text(out, c);
+
+	if (attr == &sysfs_io_timers_read)
+		bch2_io_timers_to_text(out, &c->io_clock[READ]);
+
+	if (attr == &sysfs_io_timers_write)
+		bch2_io_timers_to_text(out, &c->io_clock[WRITE]);
+
+	if (attr == &sysfs_moving_ctxts)
+		bch2_fs_moving_ctxts_to_text(out, c);
+
+#ifdef BCH_WRITE_REF_DEBUG
+	if (attr == &sysfs_write_refs)
+		bch2_write_refs_to_text(out, c);
+#endif
+
+	if (attr == &sysfs_nocow_lock_table)
+		bch2_nocow_locks_to_text(out, &c->nocow_locks);
+
+	if (attr == &sysfs_disk_groups)
+		bch2_disk_groups_to_text(out, c);
+
+	return 0;
+}
+
+STORE(bch2_fs)
+{
+	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
+	/* NOTE(review): kobj, attr, buf and size are presumably declared by the STORE() macro */
+	if (attr == &sysfs_btree_gc_periodic) {
+		ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) ?: (ssize_t) size;
+
+		if (c->gc_thread)	/* same guard as for copygc_thread below */
+			wake_up_process(c->gc_thread);
+		return ret;
+	}
+
+	if (attr == &sysfs_copy_gc_enabled) {
+		ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled)
+			?: (ssize_t) size;
+
+		if (c->copygc_thread)
+			wake_up_process(c->copygc_thread);
+		return ret;
+	}
+
+	if (attr == &sysfs_rebalance_enabled) {
+		ssize_t ret = strtoul_safe(buf, c->rebalance.enabled)
+			?: (ssize_t) size;
+
+		rebalance_wakeup(c);
+		return ret;
+	}
+
+	sysfs_pd_controller_store(rebalance,	&c->rebalance.pd);
+
+	sysfs_strtoul(promote_whole_extents,	c->promote_whole_extents);
+
+	/* Debugging: */
+
+	if (!test_bit(BCH_FS_STARTED, &c->flags))
+		return -EPERM;
+
+	/* Everything below is additionally refused unless BCH_FS_RW is set: */
+
+	if (!test_bit(BCH_FS_RW, &c->flags))
+		return -EROFS;
+
+	if (attr == &sysfs_prune_cache) {
+		struct shrink_control sc = { .gfp_mask = GFP_KERNEL };
+
+		/* Fields not set explicitly are zero instead of uninitialised stack */
+		sc.nr_to_scan = strtoul_or_return(buf);
+		c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
+	}
+
+	if (attr == &sysfs_btree_wakeup)
+		bch2_btree_wakeup_all(c);
+
+	if (attr == &sysfs_trigger_gc) {
+		/*
+		 * Full gc is currently incompatible with btree key cache:
+		 */
+#if 0
+		down_read(&c->state_lock);
+		bch2_gc(c, false, false);
+		up_read(&c->state_lock);
+#else
+		bch2_gc_gens(c);
+#endif
+	}
+
+	if (attr == &sysfs_trigger_discards)
+		bch2_do_discards(c);
+
+	if (attr == &sysfs_trigger_invalidates)
+		bch2_do_invalidates(c);
+
+#ifdef CONFIG_BCACHEFS_TESTS
+	if (attr == &sysfs_perf_test) {
+		char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
+		char *test		= strsep(&p, " \t\n");
+		char *nr_str		= strsep(&p, " \t\n");
+		char *threads_str	= strsep(&p, " \t\n");
+		unsigned threads;
+		u64 nr;
+		int ret = tmp ? -EINVAL : -ENOMEM;	/* failed copy: all tokens are NULL */
+
+		if (threads_str &&
+		    !(ret = kstrtouint(threads_str, 10, &threads)) &&
+		    !(ret = bch2_strtoull_h(nr_str, &nr)))
+			ret = bch2_btree_perf_test(c, test, nr, threads);
+		kfree(tmp);
+
+		if (ret)
+			size = ret;
+	}
+#endif
+	return size;
+}
+SYSFS_OPS(bch2_fs);
+
+struct attribute *bch2_fs_files[] = {
+	&sysfs_minor,
+	&sysfs_btree_cache_size,
+	&sysfs_btree_write_stats,
+
+	&sysfs_promote_whole_extents,
+
+	&sysfs_compression_stats,
+
+#ifdef CONFIG_BCACHEFS_TESTS
+	&sysfs_perf_test,	/* written via STORE(bch2_fs), which calls bch2_btree_perf_test() */
+#endif
+	NULL			/* end-of-list marker */
+};
+
+/* counters dir */
+
+SHOW(bch2_fs_counters)
+{
+	struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj);
+	u64 lifetime = 0, this_mount = 0;
+
+	/* Tab stop used by the prt_tab() call between each label and its value: */
+	printbuf_tabstop_push(out, 32);
+
+	#define x(t, ...) \
+		if (attr == &sysfs_##t) {					\
+			lifetime   = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\
+			this_mount = lifetime - c->counters_on_mount[BCH_COUNTER_##t];\
+			prt_printf(out, "since mount:");				\
+			prt_tab(out);						\
+			prt_human_readable_u64(out, this_mount);		\
+			prt_newline(out);					\
+										\
+			prt_printf(out, "since filesystem creation:");		\
+			prt_tab(out);						\
+			prt_human_readable_u64(out, lifetime);			\
+			prt_newline(out);					\
+		}
+	BCH_PERSISTENT_COUNTERS()
+	#undef x
+	return 0;
+}
+
+STORE(bch2_fs_counters)
+{
+	return size;	/* nothing to set: consume the write, as STORE(bch2_fs_time_stats) does */
+}
+SYSFS_OPS(bch2_fs_counters);
+
+struct attribute *bch2_fs_counters_files[] = {
+#define x(t, ...) \
+	&sysfs_##t,
+	BCH_PERSISTENT_COUNTERS()	/* presumably invokes x() once per counter — confirm */
+#undef x
+	NULL				/* end-of-list marker */
+};
+/* internal dir - just a wrapper */
+
+SHOW(bch2_fs_internal)
+{
+	struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
+
+	return bch2_fs_to_text(out, &c->kobj, attr);	/* forward to the handler for c->kobj */
+}
+
+STORE(bch2_fs_internal)
+{
+	struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
+
+	return bch2_fs_store(&c->kobj, attr, buf, size);	/* forward using c->kobj */
+}
+SYSFS_OPS(bch2_fs_internal);
+
+struct attribute *bch2_fs_internal_files[] = {
+	&sysfs_journal_debug,
+	&sysfs_btree_updates,
+	&sysfs_btree_cache,
+	&sysfs_btree_key_cache,
+	&sysfs_new_stripes,
+	&sysfs_stripes_heap,
+	&sysfs_open_buckets,
+	&sysfs_open_buckets_partial,
+	&sysfs_write_points,
+#ifdef BCH_WRITE_REF_DEBUG
+	&sysfs_write_refs,
+#endif
+	&sysfs_nocow_lock_table,
+	&sysfs_io_timers_read,
+	&sysfs_io_timers_write,
+	/* attributes whose writes trigger an action in STORE(bch2_fs): */
+	&sysfs_trigger_gc,
+	&sysfs_trigger_discards,
+	&sysfs_trigger_invalidates,
+	&sysfs_prune_cache,
+	&sysfs_btree_wakeup,
+
+	&sysfs_gc_gens_pos,
+
+	&sysfs_copy_gc_enabled,
+	&sysfs_copy_gc_wait,
+
+	&sysfs_rebalance_enabled,
+	&sysfs_rebalance_work,
+	sysfs_pd_controller_files(rebalance),
+
+	&sysfs_moving_ctxts,
+
+	&sysfs_internal_uuid,
+
+	&sysfs_disk_groups,
+	NULL	/* end-of-list marker */
+};
+
+/* options */
+
+SHOW(bch2_fs_opts_dir)
+{
+	struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
+	const struct bch_option *opt = container_of(attr, struct bch_option, attr);
+	/* The option id is the entry's index within bch2_opt_table: */
+	u64 cur = bch2_opt_get_by_id(&c->opts, opt - bch2_opt_table);
+
+	bch2_opt_to_text(out, c, c->disk_sb.sb, opt, cur, OPT_SHOW_FULL_LIST);
+	prt_char(out, '\n');
+
+	return 0;
+}
+
+STORE(bch2_fs_opts_dir)
+{
+	struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
+	const struct bch_option *opt = container_of(attr, struct bch_option, attr);
+	int ret, id = opt - bch2_opt_table;
+	char *copy;
+	u64 val;
+
+	/*
+	 * We don't need to take c->writes for correctness, but it eliminates an
+	 * unsightly error message in the dmesg log when we're RO:
+	 */
+	if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)))
+		return -EROFS;
+
+	copy = kstrdup(buf, GFP_KERNEL);
+	if (!copy) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* Parse a private, whitespace-trimmed copy of the input: */
+	ret = bch2_opt_parse(c, opt, strim(copy), &val, NULL);
+	kfree(copy);
+
+	/* The may-set check only runs when parsing succeeded; either failure bails out */
+	if (ret >= 0)
+		ret = bch2_opt_check_may_set(c, id, val);
+	if (ret < 0)
+		goto out;
+
+	bch2_opt_set_sb(c, opt, val);
+	bch2_opt_set_by_id(&c->opts, id, val);
+
+	if (val && (id == Opt_background_target ||
+		    id == Opt_background_compression)) {
+		bch2_rebalance_add_work(c, S64_MAX);
+		rebalance_wakeup(c);
+	}
+
+	ret = size;
+out:
+	bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
+	return ret;
+}
+SYSFS_OPS(bch2_fs_opts_dir);
+
+struct attribute *bch2_fs_opts_dir_files[] = { NULL }; /* empty; presumably filled via bch2_opts_create_sysfs_files() */
+
+int bch2_opts_create_sysfs_files(struct kobject *kobj)
+{
+	const struct bch_option *opt = bch2_opt_table;
+	const struct bch_option *end = bch2_opt_table + bch2_opts_nr;
+
+	/* Only options flagged OPT_FS get a file; the first failure is returned as-is */
+	for (; opt < end; opt++) {
+		int ret;
+
+		if (!(opt->flags & OPT_FS))
+			continue;
+
+		ret = sysfs_create_file(kobj, &opt->attr);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+/* time stats */
+
+SHOW(bch2_fs_time_stats)
+{
+	struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
+	/* x(name): if attr is that stat's file, print c->times[BCH_TIME_<name>] */
+#define x(name)								\
+	if (attr == &sysfs_time_stat_##name)				\
+		bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]);
+	BCH_TIME_STATS()
+#undef x
+
+	return 0;
+}
+
+STORE(bch2_fs_time_stats)
+{
+	return size;	/* input is not examined; the whole write is reported as consumed */
+}
+SYSFS_OPS(bch2_fs_time_stats);
+
+struct attribute *bch2_fs_time_stats_files[] = {
+#define x(name)						\
+	&sysfs_time_stat_##name,
+	BCH_TIME_STATS()	/* presumably invokes x() once per time stat — confirm */
+#undef x
+	NULL			/* end-of-list marker */
+};
+
+static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
+{
+	struct bch_fs *c = ca->fs;
+	struct bch_dev_usage stats = bch2_dev_usage_read(ca);
+	unsigned i, nr[BCH_DATA_NR];
+
+	memset(nr, 0, sizeof(nr));
+	/* Tally every c->open_buckets[] entry by data type (reported further down) */
+	for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
+		nr[c->open_buckets[i].data_type]++;
+	/* Tab stops for the table: one label column, then the value columns */
+	printbuf_tabstop_push(out, 8);
+	printbuf_tabstop_push(out, 16);
+	printbuf_tabstop_push(out, 16);
+	printbuf_tabstop_push(out, 16);
+	printbuf_tabstop_push(out, 16);
+
+	prt_tab(out);
+	prt_str(out, "buckets");
+	prt_tab_rjust(out);
+	prt_str(out, "sectors");
+	prt_tab_rjust(out);
+	prt_str(out, "fragmented");
+	prt_tab_rjust(out);
+	prt_newline(out);
+	/* One row per data type: buckets, sectors, fragmented */
+	for (i = 0; i < BCH_DATA_NR; i++) {
+		prt_str(out, bch2_data_types[i]);
+		prt_tab(out);
+		prt_u64(out, stats.d[i].buckets);
+		prt_tab_rjust(out);
+		prt_u64(out, stats.d[i].sectors);
+		prt_tab_rjust(out);
+		prt_u64(out, stats.d[i].fragmented);
+		prt_tab_rjust(out);
+		prt_newline(out);
+	}
+
+	prt_str(out, "ec");
+	prt_tab(out);
+	prt_u64(out, stats.buckets_ec);
+	prt_tab_rjust(out);
+	prt_newline(out);
+
+	prt_newline(out);
+
+	prt_printf(out, "reserves:");
+	prt_newline(out);
+	for (i = 0; i < BCH_WATERMARK_NR; i++) {
+		prt_str(out, bch2_watermarks[i]);
+		prt_tab(out);
+		prt_u64(out, bch2_dev_buckets_reserved(ca, i));
+		prt_tab_rjust(out);
+		prt_newline(out);
+	}
+
+	prt_newline(out);
+	/* The remaining "label<tab>value" lines use a single tab stop */
+	printbuf_tabstops_reset(out);
+	printbuf_tabstop_push(out, 24);
+
+	prt_str(out, "freelist_wait");
+	prt_tab(out);
+	prt_str(out, c->freelist_wait.list.first ? "waiting" : "empty");
+	prt_newline(out);
+
+	prt_str(out, "open buckets allocated");
+	prt_tab(out);
+	prt_u64(out, OPEN_BUCKETS_COUNT - c->open_buckets_nr_free);
+	prt_newline(out);
+
+	prt_str(out, "open buckets this dev");
+	prt_tab(out);
+	prt_u64(out, ca->nr_open_buckets);
+	prt_newline(out);
+
+	prt_str(out, "open buckets total");
+	prt_tab(out);
+	prt_u64(out, OPEN_BUCKETS_COUNT);
+	prt_newline(out);
+
+	prt_str(out, "open_buckets_wait");
+	prt_tab(out);
+	prt_str(out, c->open_buckets_wait.list.first ? "waiting" : "empty");
+	prt_newline(out);
+
+	prt_str(out, "open_buckets_btree");
+	prt_tab(out);
+	prt_u64(out, nr[BCH_DATA_btree]);
+	prt_newline(out);
+
+	prt_str(out, "open_buckets_user");
+	prt_tab(out);
+	prt_u64(out, nr[BCH_DATA_user]);
+	prt_newline(out);
+
+	prt_str(out, "buckets_to_invalidate");
+	prt_tab(out);
+	prt_u64(out, should_invalidate_buckets(ca, stats));
+	prt_newline(out);
+
+	prt_str(out, "btree reserve cache");
+	prt_tab(out);
+	prt_u64(out, c->btree_reserve_cache_nr);
+	prt_newline(out);
+}
+
+static const char * const bch2_rw[] = {	/* indexed by rw (0, 1) in dev_iodone_to_text() */
+	"read",
+	"write",
+	NULL
+};
+
+static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca)
+{
+	int dir, type;
+
+	for (dir = 0; dir < 2; dir++) {
+		prt_printf(out, "%s:\n", bch2_rw[dir]);
+
+		/* Data type 0 is not listed; each sector counter is printed shifted left by 9 */
+		for (type = 1; type < BCH_DATA_NR; type++)
+			prt_printf(out, "%-12s:%12llu\n", bch2_data_types[type],
+			       percpu_u64_get(&ca->io_done->sectors[dir][type]) << 9);
+	}
+}
+
+SHOW(bch2_dev)
+{
+	struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
+	struct bch_fs *c = ca->fs;
+
+	sysfs_printf(uuid,		"%pU\n", ca->uuid.b);
+
+	sysfs_print(bucket_size,	bucket_bytes(ca));
+	sysfs_print(first_bucket,	ca->mi.first_bucket);
+	sysfs_print(nbuckets,		ca->mi.nbuckets);
+	sysfs_print(durability,		ca->mi.durability);
+	sysfs_print(discard,		ca->mi.discard);
+	/* label: disk path for group - 1 (read under sb_lock), or just "\n" when group is 0 */
+	if (attr == &sysfs_label) {
+		if (ca->mi.group) {
+			mutex_lock(&c->sb_lock);
+			bch2_disk_path_to_text(out, c->disk_sb.sb,
+					       ca->mi.group - 1);
+			mutex_unlock(&c->sb_lock);
+		}
+
+		prt_char(out, '\n');
+	}
+
+	if (attr == &sysfs_has_data) {
+		prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca));
+		prt_char(out, '\n');
+	}
+
+	if (attr == &sysfs_state_rw) {
+		prt_string_option(out, bch2_member_states, ca->mi.state);
+		prt_char(out, '\n');
+	}
+
+	if (attr == &sysfs_iodone)
+		dev_iodone_to_text(out, ca);
+
+	sysfs_print(io_latency_read,		atomic64_read(&ca->cur_latency[READ]));
+	sysfs_print(io_latency_write,		atomic64_read(&ca->cur_latency[WRITE]));
+
+	if (attr == &sysfs_io_latency_stats_read)
+		bch2_time_stats_to_text(out, &ca->io_latency[READ]);
+
+	if (attr == &sysfs_io_latency_stats_write)
+		bch2_time_stats_to_text(out, &ca->io_latency[WRITE]);
+	/* congested: ca->congested clamped to [0, CONGESTED_MAX], scaled to a percentage */
+	sysfs_printf(congested,			"%u%%",
+		     clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
+		     * 100 / CONGESTED_MAX);
+
+	if (attr == &sysfs_alloc_debug)
+		dev_alloc_debug_to_text(out, ca);
+
+	return 0;
+}
+
+STORE(bch2_dev)
+{
+	struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
+	struct bch_fs *c = ca->fs;
+	struct bch_member *member;
+
+	if (attr == &sysfs_discard) {
+		bool discard = strtoul_or_return(buf);
+
+		mutex_lock(&c->sb_lock);
+		member = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+		/* The superblock is only rewritten when the stored flag actually changes */
+		if (discard != BCH_MEMBER_DISCARD(member)) {
+			SET_BCH_MEMBER_DISCARD(member, discard);
+			bch2_write_super(c);
+		}
+		mutex_unlock(&c->sb_lock);
+	}
+
+	if (attr == &sysfs_durability) {
+		u64 durability = strtoul_or_return(buf);
+
+		mutex_lock(&c->sb_lock);
+		member = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+		/* Stored as durability + 1. NOTE(review): no upper bound is enforced here */
+		if (durability + 1 != BCH_MEMBER_DURABILITY(member)) {
+			SET_BCH_MEMBER_DURABILITY(member, durability + 1);
+			bch2_write_super(c);
+		}
+		mutex_unlock(&c->sb_lock);
+	}
+
+	if (attr == &sysfs_label) {
+		char *label = kstrdup(buf, GFP_KERNEL);
+		int ret;
+
+		if (!label)
+			return -ENOMEM;
+
+		/* The group name is the whitespace-trimmed copy of the input */
+		ret = bch2_dev_group_set(c, ca, strim(label));
+		kfree(label);
+		if (ret)
+			return ret;
+	}
+
+	return size;
+}
+SYSFS_OPS(bch2_dev);
+
+struct attribute *bch2_dev_files[] = {
+	&sysfs_uuid,
+	&sysfs_bucket_size,
+	&sysfs_first_bucket,
+	&sysfs_nbuckets,
+	&sysfs_durability,	/* also handled in STORE(bch2_dev) */
+
+	/* settings: */
+	&sysfs_discard,
+	&sysfs_state_rw,
+	&sysfs_label,
+
+	&sysfs_has_data,
+	&sysfs_iodone,
+
+	&sysfs_io_latency_read,
+	&sysfs_io_latency_write,
+	&sysfs_io_latency_stats_read,
+	&sysfs_io_latency_stats_write,
+	&sysfs_congested,
+
+	/* debug: */
+	&sysfs_alloc_debug,
+	NULL			/* end-of-list marker */
+};
+
+#endif  /* NO_BCACHEFS_SYSFS */
diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h
new file mode 100644
index 000000000000..222cd5062702
--- /dev/null
+++ b/fs/bcachefs/sysfs.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SYSFS_H_
+#define _BCACHEFS_SYSFS_H_
+
+#include <linux/sysfs.h>
+
+#ifndef NO_BCACHEFS_SYSFS
+/* sysfs support enabled: the tables and ops below are only declared here */
+struct attribute;
+struct sysfs_ops;
+
+extern struct attribute *bch2_fs_files[];
+extern struct attribute *bch2_fs_counters_files[];
+extern struct attribute *bch2_fs_internal_files[];
+extern struct attribute *bch2_fs_opts_dir_files[];
+extern struct attribute *bch2_fs_time_stats_files[];
+extern struct attribute *bch2_dev_files[];
+
+extern const struct sysfs_ops bch2_fs_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_counters_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_internal_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
+extern const struct sysfs_ops bch2_dev_sysfs_ops;
+
+int bch2_opts_create_sysfs_files(struct kobject *);
+
+#else
+/* NO_BCACHEFS_SYSFS: empty per-file tables, default-initialised ops, no-op create */
+static struct attribute *bch2_fs_files[] = {};
+static struct attribute *bch2_fs_counters_files[] = {};
+static struct attribute *bch2_fs_internal_files[] = {};
+static struct attribute *bch2_fs_opts_dir_files[] = {};
+static struct attribute *bch2_fs_time_stats_files[] = {};
+static struct attribute *bch2_dev_files[] = {};
+
+static const struct sysfs_ops bch2_fs_sysfs_ops;
+static const struct sysfs_ops bch2_fs_counters_sysfs_ops;
+static const struct sysfs_ops bch2_fs_internal_sysfs_ops;
+static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
+static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
+static const struct sysfs_ops bch2_dev_sysfs_ops;
+
+static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; }
+
+#endif /* NO_BCACHEFS_SYSFS */
+
+#endif  /* _BCACHEFS_SYSFS_H_ */
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
new file mode 100644
index 000000000000..2fc9e60c754b
--- /dev/null
+++ b/fs/bcachefs/tests.c
@@ -0,0 +1,919 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifdef CONFIG_BCACHEFS_TESTS
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "journal_reclaim.h"
+#include "snapshot.h"
+#include "tests.h"
+
+#include "linux/kthread.h"
+#include "linux/random.h"
+
+/* Delete SPOS(0, 0, U32_MAX)..POS(0, U64_MAX) from extents, then xattrs; BUG on failure. */
+static void delete_test_keys(struct bch_fs *c)
+{
+	static const enum btree_id btrees[] = { BTREE_ID_extents, BTREE_ID_xattrs };
+	unsigned i;
+
+	for (i = 0; i < ARRAY_SIZE(btrees); i++) {
+		int ret;
+
+		ret = bch2_btree_delete_range(c, btrees[i],
+					      SPOS(0, 0, U32_MAX),
+					      POS(0, U64_MAX),
+					      0, NULL);
+		BUG_ON(ret);
+	}
+}
+
+/* unit tests */
+
+/* Insert one cookie key in xattrs, then delete it twice through the same iterator. */
+static int test_delete(struct bch_fs *c, u64 nr)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_i_cookie cookie;
+	int ret;
+
+	bkey_cookie_init(&cookie.k_i);
+	cookie.k.p.snapshot = U32_MAX;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, cookie.k.p,
+			     BTREE_ITER_INTENT);
+
+	ret = commit_do(trans, NULL, NULL, 0,
+		bch2_btree_iter_traverse(&iter) ?:
+		bch2_trans_update(trans, &iter, &cookie.k_i, 0));
+	bch_err_msg(c, ret, "update error");
+	if (ret)
+		goto out;
+
+	pr_info("deleting once");
+	ret = commit_do(trans, NULL, NULL, 0,
+		bch2_btree_iter_traverse(&iter) ?:
+		bch2_btree_delete_at(trans, &iter, 0));
+	bch_err_msg(c, ret, "delete error (first)");
+	if (ret)
+		goto out;
+
+	pr_info("deleting twice");
+	ret = commit_do(trans, NULL, NULL, 0,
+		bch2_btree_iter_traverse(&iter) ?:
+		bch2_btree_delete_at(trans, &iter, 0));
+	bch_err_msg(c, ret, "delete error (second)");
+	/* Success and failure share the cleanup below */
+out:
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
+	return ret;
+}
+
+/* Like test_delete(), but journal pins are flushed between the insert and the delete. */
+static int test_delete_written(struct bch_fs *c, u64 nr)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_i_cookie cookie;
+	int ret;
+
+	bkey_cookie_init(&cookie.k_i);
+	cookie.k.p.snapshot = U32_MAX;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, cookie.k.p,
+			     BTREE_ITER_INTENT);
+
+	ret = commit_do(trans, NULL, NULL, 0,
+		bch2_btree_iter_traverse(&iter) ?:
+		bch2_trans_update(trans, &iter, &cookie.k_i, 0));
+	bch_err_msg(c, ret, "update error");
+	if (ret)
+		goto out;
+
+	/* Unlock first, then flush; presumably so the new key is written before the delete */
+	bch2_trans_unlock(trans);
+	bch2_journal_flush_all_pins(&c->journal);
+
+	ret = commit_do(trans, NULL, NULL, 0,
+		bch2_btree_iter_traverse(&iter) ?:
+		bch2_btree_delete_at(trans, &iter, 0));
+	bch_err_msg(c, ret, "delete error");
+out:
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
+	return ret;
+}
+
+/* Insert cookies at offsets 0..nr-1 in xattrs, then verify forward and reverse iteration. */
+static int test_iterate(struct bch_fs *c, u64 nr)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter = { NULL };
+	struct bkey_s_c k;
+	u64 i;
+	int ret = 0;
+
+	delete_test_keys(c);
+
+	pr_info("inserting test keys");
+
+	for (i = 0; i < nr; i++) {
+		struct bkey_i_cookie cookie;
+
+		bkey_cookie_init(&cookie.k_i);
+		cookie.k.p.offset = i;
+		cookie.k.p.snapshot = U32_MAX;
+
+		ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0);
+		bch_err_msg(c, ret, "insert error");
+		if (ret)
+			goto out;
+	}
+
+	pr_info("iterating forwards");
+	i = 0;
+
+	ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
+				  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+				  0, k, ({
+		BUG_ON(k.k->p.offset != i++);
+		0;
+	}));
+	bch_err_msg(c, ret, "error iterating forwards");
+	if (ret)
+		goto out;
+
+	BUG_ON(i != nr);
+
+	pr_info("iterating backwards");
+
+	ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs,
+					 SPOS(0, U64_MAX, U32_MAX), 0, k,
+		({
+			BUG_ON(k.k->p.offset != --i);
+			0;
+		}));
+	bch_err_msg(c, ret, "error iterating backwards");
+	if (ret)
+		goto out;
+
+	BUG_ON(i);
+out:
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
+	return ret;
+}
+
+/* Insert back-to-back 8-sector extents up to nr, then verify both iteration directions. */
+static int test_iterate_extents(struct bch_fs *c, u64 nr)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter = { NULL };
+	struct bkey_s_c k;
+	u64 i;
+	int ret = 0;
+
+	delete_test_keys(c);
+
+	pr_info("inserting test extents");
+
+	for (i = 0; i < nr; i += 8) {
+		struct bkey_i_cookie extent;
+
+		bkey_cookie_init(&extent.k_i);
+		extent.k.p.offset = i + 8;
+		extent.k.p.snapshot = U32_MAX;
+		extent.k.size = 8;
+
+		ret = bch2_btree_insert(c, BTREE_ID_extents, &extent.k_i, NULL, 0);
+		bch_err_msg(c, ret, "insert error");
+		if (ret)
+			goto out;
+	}
+
+	pr_info("iterating forwards");
+	i = 0;
+
+	ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
+				  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+				  0, k, ({
+		BUG_ON(bkey_start_offset(k.k) != i);
+		i = k.k->p.offset;
+		0;
+	}));
+	bch_err_msg(c, ret, "error iterating forwards");
+	if (ret)
+		goto out;
+
+	BUG_ON(i != nr);
+
+	pr_info("iterating backwards");
+
+	ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_extents,
+					 SPOS(0, U64_MAX, U32_MAX), 0, k,
+		({
+			BUG_ON(k.k->p.offset != i);
+			i = bkey_start_offset(k.k);
+			0;
+		}));
+	bch_err_msg(c, ret, "error iterating backwards");
+	if (ret)
+		goto out;
+
+	BUG_ON(i);
+out:
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
+	return ret;
+}
+
+static int test_iterate_slots(struct bch_fs *c, u64 nr)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter = { NULL };
+	struct bkey_s_c k;
+	u64 i;
+	int ret = 0;
+	/* Keys go at even offsets 0, 2, ..., 2 * (nr - 1); odd offsets stay empty */
+	delete_test_keys(c);
+
+	pr_info("inserting test keys");
+
+	for (i = 0; i < nr; i++) {
+		struct bkey_i_cookie ck;
+
+		bkey_cookie_init(&ck.k_i);
+		ck.k.p.offset = i * 2;
+		ck.k.p.snapshot = U32_MAX;
+
+		ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
+		bch_err_msg(c, ret, "insert error");
+		if (ret)
+			goto err;
+	}
+
+	pr_info("iterating forwards");
+
+	i = 0;
+	/* Without BTREE_ITER_SLOTS the test expects only the inserted (even) offsets */
+	ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
+				  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+				  0, k, ({
+		BUG_ON(k.k->p.offset != i);
+		i += 2;
+		0;
+	}));
+	bch_err_msg(c, ret, "error iterating forwards");
+	if (ret)
+		goto err;
+
+	BUG_ON(i != nr * 2);
+
+	pr_info("iterating forwards by slots");
+
+	i = 0;
+
+	ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
+				  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+				  BTREE_ITER_SLOTS, k, ({
+		if (i >= nr * 2)
+			break;
+		/* With slots every offset is expected in turn; odd ones must be deleted keys */
+		BUG_ON(k.k->p.offset != i);
+		BUG_ON(bkey_deleted(k.k) != (i & 1));
+
+		i++;
+		0;
+	}));
+	if (ret < 0) {
+		bch_err_msg(c, ret, "error iterating forwards by slots");
+		goto err;
+	}
+	ret = 0;
+err:
+	bch2_trans_put(trans);
+	return ret;
+}
+
+static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter = { NULL };
+	struct bkey_s_c k;
+	u64 i;
+	int ret = 0;
+
+	delete_test_keys(c);
+	/* Every 16-sector stride gets one 8-sector extent ending at i + 16 */
+	pr_info("inserting test keys");
+
+	for (i = 0; i < nr; i += 16) {
+		struct bkey_i_cookie ck;
+
+		bkey_cookie_init(&ck.k_i);
+		ck.k.p.offset = i + 16;
+		ck.k.p.snapshot = U32_MAX;
+		ck.k.size = 8;
+
+		ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
+		bch_err_msg(c, ret, "insert error");
+		if (ret)
+			goto err;
+	}
+
+	pr_info("iterating forwards");
+
+	i = 0;
+
+	ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
+				  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+				  0, k, ({
+		BUG_ON(bkey_start_offset(k.k) != i + 8);
+		BUG_ON(k.k->size != 8);
+		i += 16;
+		0;
+	}));
+	bch_err_msg(c, ret, "error iterating forwards");
+	if (ret)
+		goto err;
+
+	BUG_ON(i != nr);
+
+	pr_info("iterating forwards by slots");
+
+	i = 0;
+
+	ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
+				 SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+				 BTREE_ITER_SLOTS, k, ({
+		if (i == nr)
+			break;
+		BUG_ON(bkey_deleted(k.k) != !(i % 16));
+		/* Holes and extents alternate, each expected to be 8 sectors long */
+		BUG_ON(bkey_start_offset(k.k) != i);
+		BUG_ON(k.k->size != 8);
+		i = k.k->p.offset;
+		0;
+	}));
+	bch_err_msg(c, ret, "error iterating forwards by slots");
+	/*
+	 * Every path reports ret to the caller; it used to be dropped here.
+	 */
+err:
+	bch2_trans_put(trans);
+	return ret;
+}
+
+/*
+ * XXX: we really want to make sure we've got a btree with depth > 0 for these
+ * tests
+ */
+static int test_peek_end(struct bch_fs *c, u64 nr)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_s_c k;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
+			     SPOS(0, 0, U32_MAX), 0);
+	/* NOTE(review): lockrestart_do()'s result is discarded; only k.k is checked */
+	lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+	BUG_ON(k.k);
+	/* A second peek through the same iterator must also return no key: */
+	lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+	BUG_ON(k.k);
+
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
+	return 0;
+}
+
+static int test_peek_end_extents(struct bch_fs *c, u64 nr)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_s_c k;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+			     SPOS(0, 0, U32_MAX), 0);
+	/* NOTE(review): lockrestart_do()'s result is discarded; only k.k is checked */
+	lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+	BUG_ON(k.k);
+	/* A second peek through the same iterator must also return no key: */
+	lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+	BUG_ON(k.k);
+
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
+	return 0;
+}
+
+/* extent unit tests */
+
+static u64 test_version;	/* incremented for every extent insert_test_extent() creates */
+
+static int insert_test_extent(struct bch_fs *c,
+			      u64 start, u64 end)
+{
+	struct bkey_i_cookie extent;
+	int ret;
+
+	/* The key sits at the end offset; size reaches back to start */
+	bkey_cookie_init(&extent.k_i);
+	extent.k.p.offset	= end;
+	extent.k.p.snapshot	= U32_MAX;
+	extent.k.size		= end - start;
+	extent.k.version.lo	= test_version++;
+	ret = bch2_btree_insert(c, BTREE_ID_extents, &extent.k_i, NULL, 0);
+	bch_err_fn(c, ret);
+	return ret;
+}
+
+static int __test_extent_overwrite(struct bch_fs *c,
+				    u64 e1_start, u64 e1_end,
+				    u64 e2_start, u64 e2_end)
+{
+	/* The second extent is only inserted when the first insert succeeded */
+	int ret = insert_test_extent(c, e1_start, e1_end);
+
+	if (!ret)
+		ret = insert_test_extent(c, e2_start, e2_end);
+	delete_test_keys(c);
+	return ret;
+}
+
+static int test_extent_overwrite_front(struct bch_fs *c, u64 nr)
+{
+	return  __test_extent_overwrite(c, 0, 64, 0, 32) ?:	/* [0,32) over the start of [0,64) */
+		__test_extent_overwrite(c, 8, 64, 0, 32);	/* [0,32) over the start of [8,64) */
+}
+
+static int test_extent_overwrite_back(struct bch_fs *c, u64 nr)
+{
+	return  __test_extent_overwrite(c, 0, 64, 32, 64) ?:	/* [32,64) over the tail of [0,64) */
+		__test_extent_overwrite(c, 0, 64, 32, 72);	/* [32,72) runs past the end of [0,64) */
+}
+
+static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr)
+{
+	return __test_extent_overwrite(c, 0, 64, 32, 40);	/* [32,40) lies inside [0,64) */
+}
+
+static int test_extent_overwrite_all(struct bch_fs *c, u64 nr)
+{
+	return  __test_extent_overwrite(c, 32, 64,  0,  64) ?:	/* starts earlier, same end */
+		__test_extent_overwrite(c, 32, 64,  0, 128) ?:	/* starts earlier, ends later */
+		__test_extent_overwrite(c, 32, 64, 32,  64) ?:	/* identical range */
+		__test_extent_overwrite(c, 32, 64, 32, 128);	/* same start, ends later */
+}
+
+static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, u32 len, u32 snapid)
+{
+	struct bkey_i_cookie extent;
+	int ret;
+
+	bkey_cookie_init(&extent.k_i);
+	extent.k.p.inode	= inum;
+	extent.k.p.offset	= start + len;
+	extent.k.p.snapshot	= snapid;
+	extent.k.size		= len;
+	/* "trans" is not declared in this function; presumably bch2_trans_do() supplies it */
+	ret = bch2_trans_do(c, NULL, NULL, 0,
+		bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &extent.k_i,
+					    BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
+	bch_err_fn(c, ret);
+	return ret;
+}
+
+static int test_extent_create_overlapping(struct bch_fs *c, u64 inum)
+{
+	return  insert_test_overlapping_extent(c, inum,  0, 16, U32_MAX - 2) ?: /* overwrite entire */
+		insert_test_overlapping_extent(c, inum,  2,  8, U32_MAX - 2) ?:
+		insert_test_overlapping_extent(c, inum,  4,  4, U32_MAX) ?:
+		insert_test_overlapping_extent(c, inum, 32,  8, U32_MAX - 2) ?: /* overwrite front/back */
+		insert_test_overlapping_extent(c, inum, 36,  8, U32_MAX) ?:
+		insert_test_overlapping_extent(c, inum, 60,  8, U32_MAX - 2) ?:
+		insert_test_overlapping_extent(c, inum, 64,  8, U32_MAX); /* args: start, len, snapshot id */
+}
+
+/* snapshot unit tests */
+
+/* Test skipping over keys in unrelated snapshots: */
+static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
+{
+	struct btree_trans *trans;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bkey_i_cookie cookie;
+	int ret;
+
+	bkey_cookie_init(&cookie.k_i);
+	cookie.k.p.snapshot = snapid_hi;
+	ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0);
+	if (ret)
+		return ret;
+
+	trans = bch2_trans_get(c);
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
+			     SPOS(0, 0, snapid_lo), 0);
+	ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+	/* A failed lookup is returned; on success the key must exist in snapshot U32_MAX */
+	BUG_ON(!ret && (!k.k || k.k->p.snapshot != U32_MAX));
+
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
+	return ret;
+}
+
+static int test_snapshots(struct bch_fs *c, u64 nr)
+{
+	struct bkey_i_cookie root_key;
+	u32 ids[2];
+	u32 subvols[2] = { 1, 1 };
+	int ret;
+
+	bkey_cookie_init(&root_key.k_i);
+	root_key.k.p.snapshot = U32_MAX;
+	ret = bch2_btree_insert(c, BTREE_ID_xattrs, &root_key.k_i, NULL, 0);
+	if (ret)
+		return ret;
+
+	ret = bch2_trans_do(c, NULL, NULL, 0,
+		      bch2_snapshot_node_create(trans, U32_MAX,
+						ids,
+						subvols,
+						2));
+	if (ret)
+		return ret;
+
+	/* test_snapshot_filter() is handed the smaller id first */
+	if (ids[0] > ids[1])
+		swap(ids[0], ids[1]);
+	ret = test_snapshot_filter(c, ids[0], ids[1]);
+	bch_err_msg(c, ret, "from test_snapshot_filter");
+	return ret;
+}
+
+/* perf tests */
+
+static u64 test_rand(void)
+{
+	u64 rnd;	/* filled entirely by get_random_bytes() below */
+
+	get_random_bytes(&rnd, sizeof(rnd));
+	return rnd;
+}
+
+static int rand_insert(struct bch_fs *c, u64 nr)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct bkey_i_cookie cookie;
+	u64 done;
+	int ret = 0;
+
+	/* One commit per key at a random offset; the first error ends the loop */
+	for (done = 0; done < nr; done++) {
+		bkey_cookie_init(&cookie.k_i);
+		cookie.k.p.offset = test_rand();
+		cookie.k.p.snapshot = U32_MAX;
+
+		ret = commit_do(trans, NULL, NULL, 0,
+			bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &cookie.k_i, 0));
+		if (ret)
+			break;
+	}
+	bch2_trans_put(trans);
+	return ret;
+}
+
+static int rand_insert_multi(struct bch_fs *c, u64 nr)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct bkey_i_cookie keys[8];
+	unsigned slot;
+	int ret = 0;
+	u64 i;
+
+	for (i = 0; i < nr; i += ARRAY_SIZE(keys)) {
+		for (slot = 0; slot < ARRAY_SIZE(keys); slot++) {
+			bkey_cookie_init(&keys[slot].k_i);
+			keys[slot].k.p.offset = test_rand();
+			keys[slot].k.p.snapshot = U32_MAX;
+		}
+
+		/* All eight inserts go through a single commit_do() */
+		ret = commit_do(trans, NULL, NULL, 0,
+			bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &keys[0].k_i, 0) ?:
+			bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &keys[1].k_i, 0) ?:
+			bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &keys[2].k_i, 0) ?:
+			bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &keys[3].k_i, 0) ?:
+			bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &keys[4].k_i, 0) ?:
+			bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &keys[5].k_i, 0) ?:
+			bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &keys[6].k_i, 0) ?:
+			bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &keys[7].k_i, 0));
+		if (ret)
+			break;
+	}
+	bch2_trans_put(trans);
+	return ret;
+}
+
+static int rand_lookup(struct bch_fs *c, u64 nr)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_s_c found;
+	u64 lookups;
+	int ret = 0;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
+			     SPOS(0, 0, U32_MAX), 0);
+
+	for (lookups = 0; lookups < nr; lookups++) {
+		bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX));
+
+		/* Peek from a random offset; only the error status is used afterwards */
+		lockrestart_do(trans, bkey_err(found = bch2_btree_iter_peek(&iter)));
+		ret = bkey_err(found);
+		if (ret)
+			break;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
+	return ret;
+}
+
+static int rand_mixed_trans(struct btree_trans *trans,
+			    struct btree_iter *iter,
+			    struct bkey_i_cookie *cookie,
+			    u64 i, u64 pos)
+{
+	struct bkey_s_c found;
+	int ret;
+
+	bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX));
+	found = bch2_btree_iter_peek(iter);
+
+	ret = bkey_err(found);
+	bch_err_msg(trans->c, ret, "lookup error");
+	if (ret)
+		return ret;
+
+	/* Lookup only, unless i is a multiple of four and the peek returned a key */
+	if ((i & 3) || !found.k)
+		return 0;
+
+	bkey_cookie_init(&cookie->k_i);
+	cookie->k.p = iter->pos;
+	return bch2_trans_update(trans, iter, &cookie->k_i, 0);
+}
+
+static int rand_mixed(struct bch_fs *c, u64 nr)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_i_cookie cookie;
+	u64 i;
+	int ret = 0;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
+			     SPOS(0, 0, U32_MAX), 0);
+
+	for (i = 0; i < nr; i++) {
+		u64 pos = test_rand();	/* drawn once per iteration, before commit_do() */
+
+		ret = commit_do(trans, NULL, NULL, 0,
+			rand_mixed_trans(trans, &iter, &cookie, i, pos));
+		if (ret)
+			break;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+	bch2_trans_put(trans);
+	return ret;
+}
+
+static int __do_delete(struct btree_trans *trans, struct bpos pos)
+{
+	struct btree_iter iter;
+	struct bkey_s_c found;
+	int err;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
+			     BTREE_ITER_INTENT);
+	found = bch2_btree_iter_peek(&iter);
+	err = bkey_err(found);
+
+	/*
+	 * Delete only when the peek succeeded and returned a key; an empty
+	 * result is not an error and leaves err == 0.
+	 */
+	if (!err && found.k)
+		err = bch2_btree_delete_at(trans, &iter, 0);
+
+	bch2_trans_iter_exit(trans, &iter);
+	return err;
+}
+
+static int rand_delete(struct bch_fs *c, u64 nr)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	int ret = 0;
+	u64 i;
+
+	for (i = 0; i < nr; i++) {
+		struct bpos pos = SPOS(0, test_rand(), U32_MAX);
+		/* one __do_delete() at pos per iteration, wrapped in commit_do(); stop on the first error */
+		ret = commit_do(trans, NULL, NULL, 0,
+			__do_delete(trans, pos));
+		if (ret)
+			break;
+	}
+
+	bch2_trans_put(trans);
+	return ret;
+}
+
+static int seq_insert(struct bch_fs *c, u64 nr)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bkey_i_cookie insert;
+
+	bkey_cookie_init(&insert.k_i);
+	/* Walk xattrs slots from offset 0, updating each slot position with the cookie, until the offset reaches nr. */
+	return bch2_trans_run(c,
+		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
+					SPOS(0, 0, U32_MAX),
+					BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k,
+					NULL, NULL, 0, ({
+			if (iter.pos.offset >= nr)
+				break;
+			insert.k.p = iter.pos;
+			bch2_trans_update(trans, &iter, &insert.k_i, 0);
+		})));
+}
+
+static int seq_lookup(struct bch_fs *c, u64 nr)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	/* Iterate xattrs from SPOS(0, 0, U32_MAX) up to POS(0, U64_MAX); the per-key expression is just 0 and nr is unused. */
+	return bch2_trans_run(c,
+		for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
+				  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+				  0, k,
+		0));
+}
+
+static int seq_overwrite(struct bch_fs *c, u64 nr)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	/* Re-submit each key the iterator yields via bch2_trans_update(), using a reassembled copy; nr is unused. */
+	return bch2_trans_run(c,
+		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
+					SPOS(0, 0, U32_MAX),
+					BTREE_ITER_INTENT, k,
+					NULL, NULL, 0, ({
+			struct bkey_i_cookie u;
+			/* NOTE(review): u is cookie-sized; assumes every key visited fits in it - confirm */
+			bkey_reassemble(&u.k_i, k);
+			bch2_trans_update(trans, &iter, &u.k_i, 0);
+		})));
+}
+
+static int seq_delete(struct bch_fs *c, u64 nr)
+{
+	/* nr is unused: the deleted range is fixed */
+	struct bpos first = SPOS(0, 0, U32_MAX), last = POS(0, U64_MAX);
+
+	return bch2_btree_delete_range(c, BTREE_ID_xattrs, first, last, 0, NULL);
+}
+
+typedef int (*perf_test_fn)(struct bch_fs *, u64);
+
+struct test_job {
+	struct bch_fs			*c;
+	u64				nr;
+	unsigned			nr_threads;
+	perf_test_fn			fn;
+	/* start barrier: counted down by each thread; the others sleep on ready_wait until it hits zero */
+	atomic_t			ready;
+	wait_queue_head_t		ready_wait;
+	/* counted down as threads finish; the thread that brings it to zero completes done_completion */
+	atomic_t			done;
+	struct completion		done_completion;
+	/* sched_clock() stamps written by the worker threads, and a nonzero return from fn if any */
+	u64				start;
+	u64				finish;
+	int				ret;
+};
+
+static int btree_perf_test_thread(void *data)
+{
+	struct test_job *job = data;
+	int err;
+
+	/* Start barrier: the last thread to arrive wakes the rest and stamps start. */
+	if (!atomic_dec_and_test(&job->ready)) {
+		wait_event(job->ready_wait, !atomic_read(&job->ready));
+	} else {
+		wake_up(&job->ready_wait);
+		job->start = sched_clock();
+	}
+	err = job->fn(job->c, div64_u64(job->nr, job->nr_threads));
+	if (err) {
+		bch_err(job->c, "%ps: error %s", job->fn, bch2_err_str(err));
+		job->ret = err;
+	}
+
+	/* The last thread to finish stamps finish and signals the completion. */
+	if (atomic_dec_and_test(&job->done)) {
+		job->finish = sched_clock();
+		complete(&job->done_completion);
+	}
+	return 0;
+}
+
+/* Run test @testname: @nr iterations split across @nr_threads threads, then log the timing. */
+int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
+			 u64 nr, unsigned nr_threads)
+{
+	struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads };
+	char name_buf[20];
+	struct printbuf nr_buf = PRINTBUF, per_sec_buf = PRINTBUF;
+	unsigned i;
+	u64 time;
+
+	atomic_set(&j.ready, nr_threads);
+	init_waitqueue_head(&j.ready_wait);
+	atomic_set(&j.done, nr_threads);
+	init_completion(&j.done_completion);
+
+#define perf_test(_test)				\
+	if (!strcmp(testname, #_test)) j.fn = _test
+
+	perf_test(rand_insert);
+	perf_test(rand_insert_multi);
+	perf_test(rand_lookup);
+	perf_test(rand_mixed);
+	perf_test(rand_delete);
+
+	perf_test(seq_insert);
+	perf_test(seq_lookup);
+	perf_test(seq_overwrite);
+	perf_test(seq_delete);
+
+	/* a unit test, not a perf test: */
+	perf_test(test_delete);
+	perf_test(test_delete_written);
+	perf_test(test_iterate);
+	perf_test(test_iterate_extents);
+	perf_test(test_iterate_slots);
+	perf_test(test_iterate_slots_extents);
+	perf_test(test_peek_end);
+	perf_test(test_peek_end_extents);
+
+	perf_test(test_extent_overwrite_front);
+	perf_test(test_extent_overwrite_back);
+	perf_test(test_extent_overwrite_middle);
+	perf_test(test_extent_overwrite_all);
+	perf_test(test_extent_create_overlapping);
+
+	perf_test(test_snapshots);
+
+	if (!j.fn) {
+		pr_err("unknown test %s\n", testname);
+		return -EINVAL;
+	}
+
+	if (!nr || !nr_threads)		/* both are divisors below; 0 threads never completes */
+		return -EINVAL;
+
+	if (nr_threads == 1)
+		btree_perf_test_thread(&j);
+	else
+		for (i = 0; i < nr_threads; i++)
+			kthread_run(btree_perf_test_thread, &j,
+				    "bcachefs perf test[%u]", i);
+
+	while (wait_for_completion_interruptible(&j.done_completion))
+		;
+
+	time = (j.finish - j.start) ?: 1;	/* a zero interval would be a zero divisor */
+
+	scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
+	prt_human_readable_u64(&nr_buf, nr);
+	prt_human_readable_u64(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time));
+	printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
+		name_buf, nr_buf.buf, nr_threads,
+		div_u64(time, NSEC_PER_SEC),
+		div64_u64(time * nr_threads, nr),	/* nr is u64: div_u64() would truncate it */
+		per_sec_buf.buf);
+	printbuf_exit(&per_sec_buf);
+	printbuf_exit(&nr_buf);
+	return j.ret;
+}
+
+#endif /* CONFIG_BCACHEFS_TESTS */
diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h
new file mode 100644
index 000000000000..c73b18aea7e0
--- /dev/null
+++ b/fs/bcachefs/tests.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_TEST_H
+#define _BCACHEFS_TEST_H
+
+struct bch_fs;
+
+#ifdef CONFIG_BCACHEFS_TESTS
+
+int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);
+
+#else
+
+#endif /* CONFIG_BCACHEFS_TESTS */
+
+#endif /* _BCACHEFS_TEST_H */
diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c
new file mode 100644
index 000000000000..33efa6005c6f
--- /dev/null
+++ b/fs/bcachefs/trace.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "alloc_types.h"
+#include "buckets.h"
+#include "btree_cache.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "btree_update_interior.h"
+#include "keylist.h"
+#include "opts.h"
+#include "six.h"
+
+#include <linux/blktrace_api.h>
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
new file mode 100644
index 000000000000..19264492151b
--- /dev/null
+++ b/fs/bcachefs/trace.h
@@ -0,0 +1,1284 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM bcachefs
+
+#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_BCACHEFS_H
+
+#include <linux/tracepoint.h>
+
+#define TRACE_BPOS_entries(name)				\
+	__field(u64,			name##_inode	)	\
+	__field(u64,			name##_offset	)	\
+	__field(u32,			name##_snapshot	)
+
+#define TRACE_BPOS_assign(dst, src)				\
+	__entry->dst##_inode		= (src).inode;		\
+	__entry->dst##_offset		= (src).offset;		\
+	__entry->dst##_snapshot		= (src).snapshot
+
+DECLARE_EVENT_CLASS(bpos,
+	TP_PROTO(const struct bpos *p),
+	TP_ARGS(p),
+
+	TP_STRUCT__entry(
+		TRACE_BPOS_entries(p)
+	),
+
+	TP_fast_assign(
+		TRACE_BPOS_assign(p, *p);
+	),
+
+	TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot)
+);
+
+DECLARE_EVENT_CLASS(bkey,
+	TP_PROTO(struct bch_fs *c, const char *k),
+	TP_ARGS(c, k),
+
+	TP_STRUCT__entry(
+		__string(k,	k				)
+	),
+
+	TP_fast_assign(
+		__assign_str(k, k);
+	),
+
+	TP_printk("%s", __get_str(k))
+);
+
+DECLARE_EVENT_CLASS(btree_node,
+	TP_PROTO(struct bch_fs *c, struct btree *b),
+	TP_ARGS(c, b),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev			)
+		__field(u8,		level			)
+		__field(u8,		btree_id		)
+		TRACE_BPOS_entries(pos)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= c->dev;
+		__entry->level		= b->c.level;
+		__entry->btree_id	= b->c.btree_id;
+		TRACE_BPOS_assign(pos, b->key.k.p);
+	),
+
+	TP_printk("%d,%d %u %s %llu:%llu:%u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->level,
+		  bch2_btree_ids[__entry->btree_id],
+		  __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
+);
+
+DECLARE_EVENT_CLASS(bch_fs,
+	TP_PROTO(struct bch_fs *c),
+	TP_ARGS(c),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev			)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= c->dev;
+	),
+
+	TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
+);
+
+DECLARE_EVENT_CLASS(bio,
+	TP_PROTO(struct bio *bio),
+	TP_ARGS(bio),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev			)
+		__field(sector_t,	sector			)
+		__field(unsigned int,	nr_sector		)
+		__array(char,		rwbs,	6		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev ? bio_dev(bio) : 0;
+		__entry->sector		= bio->bi_iter.bi_sector;
+		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
+	),
+
+	TP_printk("%d,%d  %s %llu + %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  (unsigned long long)__entry->sector, __entry->nr_sector)
+);
+
+/* super-io.c: */
+TRACE_EVENT(write_super,
+	TP_PROTO(struct bch_fs *c, unsigned long ip),
+	TP_ARGS(c, ip),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev	)
+		__field(unsigned long,	ip	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= c->dev;
+		__entry->ip		= ip;
+	),
+
+	TP_printk("%d,%d for %pS",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (void *) __entry->ip)
+);
+
+/* io.c: */
+
+DEFINE_EVENT(bio, read_promote,
+	TP_PROTO(struct bio *bio),
+	TP_ARGS(bio)
+);
+
+TRACE_EVENT(read_nopromote,
+	TP_PROTO(struct bch_fs *c, int ret),
+	TP_ARGS(c, ret),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev		)
+		__array(char,		ret, 32		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= c->dev;
+		strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
+	),
+
+	TP_printk("%d,%d ret %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ret)
+);
+
+DEFINE_EVENT(bio, read_bounce,
+	TP_PROTO(struct bio *bio),
+	TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bio, read_split,
+	TP_PROTO(struct bio *bio),
+	TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bio, read_retry,
+	TP_PROTO(struct bio *bio),
+	TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bio, read_reuse_race,
+	TP_PROTO(struct bio *bio),
+	TP_ARGS(bio)
+);
+
+/* Journal */
+
+DEFINE_EVENT(bch_fs, journal_full,
+	TP_PROTO(struct bch_fs *c),
+	TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, journal_entry_full,
+	TP_PROTO(struct bch_fs *c),
+	TP_ARGS(c)
+);
+
+DEFINE_EVENT(bio, journal_write,
+	TP_PROTO(struct bio *bio),
+	TP_ARGS(bio)
+);
+
+TRACE_EVENT(journal_reclaim_start,
+	TP_PROTO(struct bch_fs *c, bool direct, bool kicked,
+		 u64 min_nr, u64 min_key_cache,
+		 u64 prereserved, u64 prereserved_total,
+		 u64 btree_cache_dirty, u64 btree_cache_total,
+		 u64 btree_key_cache_dirty, u64 btree_key_cache_total),
+	TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total,
+		btree_cache_dirty, btree_cache_total,
+		btree_key_cache_dirty, btree_key_cache_total),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev			)
+		__field(bool,		direct			)
+		__field(bool,		kicked			)
+		__field(u64,		min_nr			)
+		__field(u64,		min_key_cache		)
+		__field(u64,		prereserved		)
+		__field(u64,		prereserved_total	)
+		__field(u64,		btree_cache_dirty	)
+		__field(u64,		btree_cache_total	)
+		__field(u64,		btree_key_cache_dirty	)
+		__field(u64,		btree_key_cache_total	)
+	),
+
+	TP_fast_assign(
+		__entry->dev			= c->dev;
+		__entry->direct			= direct;
+		__entry->kicked			= kicked;
+		__entry->min_nr			= min_nr;
+		__entry->min_key_cache		= min_key_cache;
+		__entry->prereserved		= prereserved;
+		__entry->prereserved_total	= prereserved_total;
+		__entry->btree_cache_dirty	= btree_cache_dirty;
+		__entry->btree_cache_total	= btree_cache_total;
+		__entry->btree_key_cache_dirty	= btree_key_cache_dirty;
+		__entry->btree_key_cache_total	= btree_key_cache_total;
+	),
+
+	TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->direct,
+		  __entry->kicked,
+		  __entry->min_nr,
+		  __entry->min_key_cache,
+		  __entry->prereserved,
+		  __entry->prereserved_total,
+		  __entry->btree_cache_dirty,
+		  __entry->btree_cache_total,
+		  __entry->btree_key_cache_dirty,
+		  __entry->btree_key_cache_total)
+);
+
+TRACE_EVENT(journal_reclaim_finish,
+	TP_PROTO(struct bch_fs *c, u64 nr_flushed),
+	TP_ARGS(c, nr_flushed),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev			)
+		__field(u64,		nr_flushed		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= c->dev;
+		__entry->nr_flushed	= nr_flushed;
+	),
+
+	TP_printk("%d,%d flushed %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->nr_flushed)
+);
+
+/* bset.c: */
+
+DEFINE_EVENT(bpos, bkey_pack_pos_fail,
+	TP_PROTO(const struct bpos *p),
+	TP_ARGS(p)
+);
+
+/* Btree cache: */
+
+TRACE_EVENT(btree_cache_scan,
+	TP_PROTO(long nr_to_scan, long can_free, long ret),
+	TP_ARGS(nr_to_scan, can_free, ret),
+
+	TP_STRUCT__entry(
+		__field(long,	nr_to_scan		)
+		__field(long,	can_free		)
+		__field(long,	ret			)
+	),
+
+	TP_fast_assign(
+		__entry->nr_to_scan	= nr_to_scan;
+		__entry->can_free	= can_free;
+		__entry->ret		= ret;
+	),
+
+	TP_printk("scanned for %li nodes, can free %li, ret %li",
+		  __entry->nr_to_scan, __entry->can_free, __entry->ret)
+);
+
+DEFINE_EVENT(btree_node, btree_cache_reap,
+	TP_PROTO(struct bch_fs *c, struct btree *b),
+	TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock_fail,
+	TP_PROTO(struct bch_fs *c),
+	TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock,
+	TP_PROTO(struct bch_fs *c),
+	TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize,
+	TP_PROTO(struct bch_fs *c),
+	TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_unlock,
+	TP_PROTO(struct bch_fs *c),
+	TP_ARGS(c)
+);
+
+/* Btree */
+
+DEFINE_EVENT(btree_node, btree_node_read,
+	TP_PROTO(struct bch_fs *c, struct btree *b),
+	TP_ARGS(c, b)
+);
+
+TRACE_EVENT(btree_node_write,
+	TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors),
+	TP_ARGS(b, bytes, sectors),
+
+	TP_STRUCT__entry(
+		__field(enum btree_node_type,	type)
+		__field(unsigned,	bytes			)
+		__field(unsigned,	sectors			)
+	),
+
+	TP_fast_assign(
+		__entry->type	= btree_node_type(b);
+		__entry->bytes	= bytes;
+		__entry->sectors = sectors;
+	),
+
+	TP_printk("bkey type %u bytes %u sectors %u",
+		  __entry->type , __entry->bytes, __entry->sectors)
+);
+
+DEFINE_EVENT(btree_node, btree_node_alloc,
+	TP_PROTO(struct bch_fs *c, struct btree *b),
+	TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(btree_node, btree_node_free,
+	TP_PROTO(struct bch_fs *c, struct btree *b),
+	TP_ARGS(c, b)
+);
+
+TRACE_EVENT(btree_reserve_get_fail,
+	TP_PROTO(const char *trans_fn,
+		 unsigned long caller_ip,
+		 size_t required,
+		 int ret),
+	TP_ARGS(trans_fn, caller_ip, required, ret),
+
+	TP_STRUCT__entry(
+		__array(char,			trans_fn, 32	)
+		__field(unsigned long,		caller_ip	)
+		__field(size_t,			required	)
+		__array(char,			ret, 32		)
+	),
+
+	TP_fast_assign(
+		strscpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+		__entry->caller_ip	= caller_ip;
+		__entry->required	= required;
+		strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
+	),
+
+	TP_printk("%s %pS required %zu ret %s",
+		  __entry->trans_fn,
+		  (void *) __entry->caller_ip,
+		  __entry->required,
+		  __entry->ret)
+);
+
+DEFINE_EVENT(btree_node, btree_node_compact,
+	TP_PROTO(struct bch_fs *c, struct btree *b),
+	TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(btree_node, btree_node_merge,
+	TP_PROTO(struct bch_fs *c, struct btree *b),
+	TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(btree_node, btree_node_split,
+	TP_PROTO(struct bch_fs *c, struct btree *b),
+	TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(btree_node, btree_node_rewrite,
+	TP_PROTO(struct bch_fs *c, struct btree *b),
+	TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(btree_node, btree_node_set_root,
+	TP_PROTO(struct bch_fs *c, struct btree *b),
+	TP_ARGS(c, b)
+);
+
+TRACE_EVENT(btree_path_relock_fail,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip,
+		 struct btree_path *path,
+		 unsigned level),
+	TP_ARGS(trans, caller_ip, path, level),
+
+	TP_STRUCT__entry(
+		__array(char,			trans_fn, 32	)
+		__field(unsigned long,		caller_ip	)
+		__field(u8,			btree_id	)
+		__field(u8,			level		)
+		TRACE_BPOS_entries(pos)
+		__array(char,			node, 24	)
+		__field(u8,			self_read_count	)
+		__field(u8,			self_intent_count)
+		__field(u8,			read_count	)
+		__field(u8,			intent_count	)
+		__field(u32,			iter_lock_seq	)
+		__field(u32,			node_lock_seq	)
+	),
+
+	TP_fast_assign(
+		struct btree *b = btree_path_node(path, level);
+		struct six_lock_count c;
+
+		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+		__entry->caller_ip		= caller_ip;
+		__entry->btree_id		= path->btree_id;
+		__entry->level			= level;	/* the level the node/lock fields describe */
+		TRACE_BPOS_assign(pos, path->pos);
+		c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
+		__entry->self_read_count	= c.n[SIX_LOCK_read];
+		__entry->self_intent_count	= c.n[SIX_LOCK_intent];
+
+		if (IS_ERR(b)) {
+			__entry->read_count	= 0;	/* no node to sample: don't log stale bytes */
+			__entry->intent_count	= 0;
+			strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node));
+		} else {
+			c = six_lock_counts(&path->l[level].b->c.lock);
+			__entry->read_count	= c.n[SIX_LOCK_read];
+			__entry->intent_count	= c.n[SIX_LOCK_intent];
+			scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
+		}
+		__entry->iter_lock_seq		= path->l[level].lock_seq;
+		__entry->node_lock_seq		= is_btree_node(path, level)
+			? six_lock_seq(&path->l[level].b->c.lock) : 0;
+	),
+
+	TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u",
+		  __entry->trans_fn,
+		  (void *) __entry->caller_ip,
+		  bch2_btree_ids[__entry->btree_id],
+		  __entry->pos_inode,
+		  __entry->pos_offset,
+		  __entry->pos_snapshot,
+		  __entry->level,
+		  __entry->node,
+		  __entry->self_read_count,
+		  __entry->self_intent_count,
+		  __entry->read_count,
+		  __entry->intent_count,
+		  __entry->iter_lock_seq,
+		  __entry->node_lock_seq)
+);
+
+TRACE_EVENT(btree_path_upgrade_fail,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip,
+		 struct btree_path *path,
+		 unsigned level),
+	TP_ARGS(trans, caller_ip, path, level),
+
+	TP_STRUCT__entry(
+		__array(char,			trans_fn, 32	)
+		__field(unsigned long,		caller_ip	)
+		__field(u8,			btree_id	)
+		__field(u8,			level		)
+		TRACE_BPOS_entries(pos)
+		__field(u8,			locked		)
+		__field(u8,			self_read_count	)
+		__field(u8,			self_intent_count)
+		__field(u8,			read_count	)
+		__field(u8,			intent_count	)
+		__field(u32,			iter_lock_seq	)
+		__field(u32,			node_lock_seq	)
+	),
+
+	TP_fast_assign(
+		struct six_lock_count c;
+
+		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+		__entry->caller_ip		= caller_ip;
+		__entry->btree_id		= path->btree_id;
+		__entry->level			= level;
+		TRACE_BPOS_assign(pos, path->pos);
+		__entry->locked			= btree_node_locked(path, level);
+		/* printed as "held": bch2_btree_node_lock_counts(); as "lock count": six_lock_counts() */
+		c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
+		__entry->self_read_count	= c.n[SIX_LOCK_read];
+		__entry->self_intent_count	= c.n[SIX_LOCK_intent];
+		c = six_lock_counts(&path->l[level].b->c.lock);
+		__entry->read_count		= c.n[SIX_LOCK_read];
+		__entry->intent_count		= c.n[SIX_LOCK_intent];
+		__entry->iter_lock_seq		= path->l[level].lock_seq;
+		__entry->node_lock_seq		= is_btree_node(path, level)
+			? six_lock_seq(&path->l[level].b->c.lock)
+			: 0;
+	),
+
+	TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
+		  __entry->trans_fn,
+		  (void *) __entry->caller_ip,
+		  bch2_btree_ids[__entry->btree_id],
+		  __entry->pos_inode,
+		  __entry->pos_offset,
+		  __entry->pos_snapshot,
+		  __entry->level,
+		  __entry->locked,
+		  __entry->self_read_count,
+		  __entry->self_intent_count,
+		  __entry->read_count,
+		  __entry->intent_count,
+		  __entry->iter_lock_seq,
+		  __entry->node_lock_seq)
+);
+
+/* Garbage collection */
+
+DEFINE_EVENT(bch_fs, gc_gens_start,
+	TP_PROTO(struct bch_fs *c),
+	TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, gc_gens_end,
+	TP_PROTO(struct bch_fs *c),
+	TP_ARGS(c)
+);
+
+/* Allocator */
+
+DECLARE_EVENT_CLASS(bucket_alloc,
+	TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+		 u64 bucket,
+		 u64 free,
+		 u64 avail,
+		 u64 copygc_wait_amount,
+		 s64 copygc_waiting_for,
+		 struct bucket_alloc_state *s,
+		 bool nonblocking,
+		 const char *err),
+	TP_ARGS(ca, alloc_reserve, bucket, free, avail,
+		copygc_wait_amount, copygc_waiting_for,
+		s, nonblocking, err),
+
+	TP_STRUCT__entry(
+		__field(u8,			dev			)
+		__array(char,	reserve,	16			)
+		__field(u64,			bucket	)
+		__field(u64,			free			)
+		__field(u64,			avail			)
+		__field(u64,			copygc_wait_amount	)
+		__field(s64,			copygc_waiting_for	)
+		__field(u64,			seen			)
+		__field(u64,			open			)
+		__field(u64,			need_journal_commit	)
+		__field(u64,			nouse			)
+		__field(bool,			nonblocking		)
+		__field(u64,			nocow			)
+		__array(char,			err,	32		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= ca->dev_idx;
+		strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve));
+		__entry->bucket		= bucket;
+		__entry->free		= free;
+		__entry->avail		= avail;
+		__entry->copygc_wait_amount	= copygc_wait_amount;
+		__entry->copygc_waiting_for	= copygc_waiting_for;
+		__entry->seen		= s->buckets_seen;
+		__entry->open		= s->skipped_open;
+		__entry->need_journal_commit = s->skipped_need_journal_commit;
+		__entry->nouse		= s->skipped_nouse;
+		__entry->nonblocking	= nonblocking;
+		__entry->nocow		= s->skipped_nocow;
+		strscpy(__entry->err, err, sizeof(__entry->err));
+	),
+
+	TP_printk("reserve %s bucket %u:%llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s",
+		  __entry->reserve,
+		  __entry->dev,
+		  __entry->bucket,
+		  __entry->free,
+		  __entry->avail,
+		  __entry->copygc_wait_amount,
+		  __entry->copygc_waiting_for,
+		  __entry->seen,
+		  __entry->open,
+		  __entry->need_journal_commit,
+		  __entry->nouse,
+		  __entry->nocow,
+		  __entry->nonblocking,
+		  __entry->err)
+);
+
+DEFINE_EVENT(bucket_alloc, bucket_alloc,
+	TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+		 u64 bucket,
+		 u64 free,
+		 u64 avail,
+		 u64 copygc_wait_amount,
+		 s64 copygc_waiting_for,
+		 struct bucket_alloc_state *s,
+		 bool nonblocking,
+		 const char *err),
+	TP_ARGS(ca, alloc_reserve, bucket, free, avail,
+		copygc_wait_amount, copygc_waiting_for,
+		s, nonblocking, err)
+);
+
+DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
+	TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+		 u64 bucket,
+		 u64 free,
+		 u64 avail,
+		 u64 copygc_wait_amount,
+		 s64 copygc_waiting_for,
+		 struct bucket_alloc_state *s,
+		 bool nonblocking,
+		 const char *err),
+	TP_ARGS(ca, alloc_reserve, bucket, free, avail,
+		copygc_wait_amount, copygc_waiting_for,
+		s, nonblocking, err)
+);
+
+TRACE_EVENT(discard_buckets,
+	TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
+		 u64 need_journal_commit, u64 discarded, const char *err),
+	TP_ARGS(c, seen, open, need_journal_commit, discarded, err),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev			)
+		__field(u64,		seen			)
+		__field(u64,		open			)
+		__field(u64,		need_journal_commit	)
+		__field(u64,		discarded		)
+		__array(char,		err,	16		)
+	),
+
+	TP_fast_assign(
+		__entry->dev			= c->dev;
+		__entry->seen			= seen;
+		__entry->open			= open;
+		__entry->need_journal_commit	= need_journal_commit;
+		__entry->discarded		= discarded;
+		strscpy(__entry->err, err, sizeof(__entry->err));
+	),
+	/* major and minor need a separator, otherwise e.g. 8 and 16 print as "816" */
+	TP_printk("%d,%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->seen,
+		  __entry->open,
+		  __entry->need_journal_commit,
+		  __entry->discarded,
+		  __entry->err)
+);
+
+TRACE_EVENT(bucket_invalidate,
+	TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors),
+	TP_ARGS(c, dev, bucket, sectors),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev			)
+		__field(u32,		dev_idx			)
+		__field(u32,		sectors			)
+		__field(u64,		bucket			)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= c->dev;
+		__entry->dev_idx	= dev;
+		__entry->sectors	= sectors;
+		__entry->bucket		= bucket;
+	),
+
+	TP_printk("%d:%d invalidated %u:%llu cached sectors %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->dev_idx, __entry->bucket,
+		  __entry->sectors)
+);
+
+/* Moving IO */
+
+TRACE_EVENT(bucket_evacuate,
+	TP_PROTO(struct bch_fs *c, struct bpos *bucket),
+	TP_ARGS(c, bucket),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev			)
+		__field(u32,		dev_idx			)
+		__field(u64,		bucket			)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= c->dev;
+		__entry->dev_idx	= bucket->inode;
+		__entry->bucket		= bucket->offset;
+	),
+
+	TP_printk("%d:%d %u:%llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->dev_idx, __entry->bucket)
+);
+
+DEFINE_EVENT(bkey, move_extent,
+	TP_PROTO(struct bch_fs *c, const char *k),
+	TP_ARGS(c, k)
+);
+
+DEFINE_EVENT(bkey, move_extent_read,
+	TP_PROTO(struct bch_fs *c, const char *k),
+	TP_ARGS(c, k)
+);
+
+DEFINE_EVENT(bkey, move_extent_write,
+	TP_PROTO(struct bch_fs *c, const char *k),
+	TP_ARGS(c, k)
+);
+
+DEFINE_EVENT(bkey, move_extent_finish,
+	TP_PROTO(struct bch_fs *c, const char *k),
+	TP_ARGS(c, k)
+);
+
+TRACE_EVENT(move_extent_fail,
+	TP_PROTO(struct bch_fs *c, const char *msg),
+	TP_ARGS(c, msg),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev			)
+		__string(msg,		msg			)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= c->dev;
+		__assign_str(msg, msg);
+	),
+
+	TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg))
+);
+
+DEFINE_EVENT(bkey, move_extent_alloc_mem_fail,
+	TP_PROTO(struct bch_fs *c, const char *k),
+	TP_ARGS(c, k)
+);
+
+TRACE_EVENT(move_data,
+	TP_PROTO(struct bch_fs *c, u64 sectors_moved,
+		 u64 keys_moved),
+	TP_ARGS(c, sectors_moved, keys_moved),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev			)
+		__field(u64,		sectors_moved	)
+		__field(u64,		keys_moved	)
+	),
+
+	TP_fast_assign(
+		__entry->dev			= c->dev;
+		__entry->sectors_moved = sectors_moved;
+		__entry->keys_moved = keys_moved;
+	),
+
+	TP_printk("%d,%d sectors_moved %llu keys_moved %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->sectors_moved, __entry->keys_moved)
+);
+
+TRACE_EVENT(evacuate_bucket,
+	TP_PROTO(struct bch_fs *c, struct bpos *bucket,
+		 unsigned sectors, unsigned bucket_size,
+		 u64 fragmentation, int ret),
+	TP_ARGS(c, bucket, sectors, bucket_size, fragmentation, ret),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev		)
+		__field(u64,		member		)
+		__field(u64,		bucket		)
+		__field(u32,		sectors		)
+		__field(u32,		bucket_size	)
+		__field(u64,		fragmentation	)
+		__field(int,		ret		)
+	),
+
+	TP_fast_assign(
+		__entry->dev			= c->dev;
+		__entry->member			= bucket->inode;
+		__entry->bucket			= bucket->offset;
+		__entry->sectors		= sectors;
+		__entry->bucket_size		= bucket_size;
+		__entry->fragmentation		= fragmentation;
+		__entry->ret			= ret;
+	),
+
+	TP_printk("%d,%d %llu:%llu sectors %u/%u fragmentation %llu ret %i",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->member, __entry->bucket,
+		  __entry->sectors, __entry->bucket_size,
+		  __entry->fragmentation, __entry->ret)
+);
+
+TRACE_EVENT(copygc,
+	TP_PROTO(struct bch_fs *c,
+		 u64 sectors_moved, u64 sectors_not_moved,
+		 u64 buckets_moved, u64 buckets_not_moved),
+	TP_ARGS(c,
+		sectors_moved, sectors_not_moved,
+		buckets_moved, buckets_not_moved),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev			)
+		__field(u64,		sectors_moved		)
+		__field(u64,		sectors_not_moved	)
+		__field(u64,		buckets_moved		)
+		__field(u64,		buckets_not_moved	)
+	),
+
+	TP_fast_assign(
+		__entry->dev			= c->dev;
+		__entry->sectors_moved		= sectors_moved;
+		__entry->sectors_not_moved	= sectors_not_moved;
+		__entry->buckets_moved		= buckets_moved;
+		__entry->buckets_not_moved	= buckets_not_moved;	/* printed as the second "remain" */
+	),
+
+	TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->sectors_moved, __entry->sectors_not_moved,
+		  __entry->buckets_moved, __entry->buckets_not_moved)
+);
+
+TRACE_EVENT(copygc_wait,
+	TP_PROTO(struct bch_fs *c,
+		 u64 wait_amount, u64 until),
+	TP_ARGS(c, wait_amount, until),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev			)
+		__field(u64,		wait_amount		)
+		__field(u64,		until			)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= c->dev;
+		__entry->wait_amount	= wait_amount;
+		__entry->until		= until;
+	),
+
+	TP_printk("%d,%u waiting for %llu sectors until %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->wait_amount, __entry->until)
+);
+
+/* btree transactions: */
+
+DECLARE_EVENT_CLASS(transaction_event,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip),
+	TP_ARGS(trans, caller_ip),
+
+	TP_STRUCT__entry(
+		__array(char,			trans_fn, 32	)
+		__field(unsigned long,		caller_ip	)
+	),
+
+	TP_fast_assign(
+		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+		__entry->caller_ip		= caller_ip;
+	),
+
+	TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
+);
+
+DEFINE_EVENT(transaction_event,	transaction_commit,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip),
+	TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_event,	trans_restart_injected,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip),
+	TP_ARGS(trans, caller_ip)
+);
+
+TRACE_EVENT(trans_restart_split_race,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip,
+		 struct btree *b),
+	TP_ARGS(trans, caller_ip, b),
+
+	TP_STRUCT__entry(
+		__array(char,			trans_fn, 32	)
+		__field(unsigned long,		caller_ip	)
+		__field(u8,			level		)
+		__field(u16,			written		)
+		__field(u16,			blocks		)
+		__field(u16,			u64s_remaining	)
+	),
+
+	TP_fast_assign(
+		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+		__entry->caller_ip		= caller_ip;
+		__entry->level		= b->c.level;
+		__entry->written	= b->written;
+		__entry->blocks		= btree_blocks(trans->c);
+		__entry->u64s_remaining	= bch_btree_keys_u64s_remaining(trans->c, b);
+	),
+
+	TP_printk("%s %pS l=%u written %u/%u u64s remaining %u",
+		  __entry->trans_fn, (void *) __entry->caller_ip,
+		  __entry->level,
+		  __entry->written, __entry->blocks,
+		  __entry->u64s_remaining)
+);
+
+DEFINE_EVENT(transaction_event,	trans_blocked_journal_reclaim,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip),
+	TP_ARGS(trans, caller_ip)
+);
+
+TRACE_EVENT(trans_restart_journal_preres_get,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip,
+		 unsigned flags),
+	TP_ARGS(trans, caller_ip, flags),
+
+	TP_STRUCT__entry(
+		__array(char,			trans_fn, 32	)
+		__field(unsigned long,		caller_ip	)
+		__field(unsigned,		flags		)
+	),
+
+	TP_fast_assign(
+		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+		__entry->caller_ip		= caller_ip;
+		__entry->flags			= flags;
+	),
+
+	TP_printk("%s %pS %x", __entry->trans_fn,
+		  (void *) __entry->caller_ip,
+		  __entry->flags)
+);
+
+DEFINE_EVENT(transaction_event,	trans_restart_fault_inject,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip),
+	TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_event,	trans_traverse_all,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip),
+	TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_event,	trans_restart_key_cache_raced,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip),
+	TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_event,	trans_restart_too_many_iters,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip),
+	TP_ARGS(trans, caller_ip)
+);
+
+DECLARE_EVENT_CLASS(transaction_restart_iter,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip,
+		 struct btree_path *path),
+	TP_ARGS(trans, caller_ip, path),
+
+	TP_STRUCT__entry(
+		__array(char,			trans_fn, 32	)
+		__field(unsigned long,		caller_ip	)
+		__field(u8,			btree_id	)
+		TRACE_BPOS_entries(pos)
+	),
+
+	TP_fast_assign(
+		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+		__entry->caller_ip		= caller_ip;
+		__entry->btree_id		= path->btree_id;
+		TRACE_BPOS_assign(pos, path->pos)
+	),
+
+	TP_printk("%s %pS btree %s pos %llu:%llu:%u",
+		  __entry->trans_fn,
+		  (void *) __entry->caller_ip,
+		  bch2_btree_ids[__entry->btree_id],
+		  __entry->pos_inode,
+		  __entry->pos_offset,
+		  __entry->pos_snapshot)
+);
+
+DEFINE_EVENT(transaction_restart_iter,	trans_restart_btree_node_reused,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip,
+		 struct btree_path *path),
+	TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter,	trans_restart_btree_node_split,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip,
+		 struct btree_path *path),
+	TP_ARGS(trans, caller_ip, path)
+);
+
+TRACE_EVENT(trans_restart_upgrade,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip,
+		 struct btree_path *path,
+		 unsigned old_locks_want,
+		 unsigned new_locks_want),
+	TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want),
+
+	TP_STRUCT__entry(
+		__array(char,			trans_fn, 32	)
+		__field(unsigned long,		caller_ip	)
+		__field(u8,			btree_id	)
+		__field(u8,			old_locks_want	)
+		__field(u8,			new_locks_want	)
+		TRACE_BPOS_entries(pos)
+	),
+
+	TP_fast_assign(
+		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+		__entry->caller_ip		= caller_ip;
+		__entry->btree_id		= path->btree_id;
+		__entry->old_locks_want		= old_locks_want;
+		__entry->new_locks_want		= new_locks_want;
+		TRACE_BPOS_assign(pos, path->pos)
+	),
+
+	TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u",
+		  __entry->trans_fn,
+		  (void *) __entry->caller_ip,
+		  bch2_btree_ids[__entry->btree_id],
+		  __entry->pos_inode,
+		  __entry->pos_offset,
+		  __entry->pos_snapshot,
+		  __entry->old_locks_want,
+		  __entry->new_locks_want)
+);
+
+DEFINE_EVENT(transaction_restart_iter,	trans_restart_relock,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip,
+		 struct btree_path *path),
+	TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter,	trans_restart_relock_next_node,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip,
+		 struct btree_path *path),
+	TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter,	trans_restart_relock_parent_for_fill,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip,
+		 struct btree_path *path),
+	TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter,	trans_restart_relock_after_fill,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip,
+		 struct btree_path *path),
+	TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_event,	trans_restart_key_cache_upgrade,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip),
+	TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_restart_iter,	trans_restart_relock_key_cache_fill,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip,
+		 struct btree_path *path),
+	TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter,	trans_restart_relock_path,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip,
+		 struct btree_path *path),
+	TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter,	trans_restart_relock_path_intent,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip,
+		 struct btree_path *path),
+	TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter,	trans_restart_traverse,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip,
+		 struct btree_path *path),
+	TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter,	trans_restart_memory_allocation_failure,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip,
+		 struct btree_path *path),
+	TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_event,	trans_restart_would_deadlock,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip),
+	TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_event,	trans_restart_would_deadlock_recursion_limit,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip),
+	TP_ARGS(trans, caller_ip)
+);
+
+TRACE_EVENT(trans_restart_would_deadlock_write,
+	TP_PROTO(struct btree_trans *trans),
+	TP_ARGS(trans),
+
+	TP_STRUCT__entry(
+		__array(char,			trans_fn, 32	)
+	),
+
+	TP_fast_assign(
+		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+	),
+
+	TP_printk("%s", __entry->trans_fn)
+);
+
+TRACE_EVENT(trans_restart_mem_realloced,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip,
+		 unsigned long bytes),
+	TP_ARGS(trans, caller_ip, bytes),
+
+	TP_STRUCT__entry(
+		__array(char,			trans_fn, 32	)
+		__field(unsigned long,		caller_ip	)
+		__field(unsigned long,		bytes		)
+	),
+
+	TP_fast_assign(
+		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+		__entry->caller_ip	= caller_ip;
+		__entry->bytes		= bytes;
+	),
+
+	TP_printk("%s %pS bytes %lu",
+		  __entry->trans_fn,
+		  (void *) __entry->caller_ip,
+		  __entry->bytes)
+);
+
+TRACE_EVENT(trans_restart_key_cache_key_realloced,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip,
+		 struct btree_path *path,
+		 unsigned old_u64s,
+		 unsigned new_u64s),
+	TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s),
+
+	TP_STRUCT__entry(
+		__array(char,			trans_fn, 32	)
+		__field(unsigned long,		caller_ip	)
+		__field(enum btree_id,		btree_id	)
+		TRACE_BPOS_entries(pos)
+		__field(u32,			old_u64s	)
+		__field(u32,			new_u64s	)
+	),
+
+	TP_fast_assign(
+		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+		__entry->caller_ip		= caller_ip;
+
+		__entry->btree_id	= path->btree_id;
+		TRACE_BPOS_assign(pos, path->pos);
+		__entry->old_u64s	= old_u64s;
+		__entry->new_u64s	= new_u64s;
+	),
+
+	TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u",
+		  __entry->trans_fn,
+		  (void *) __entry->caller_ip,
+		  bch2_btree_ids[__entry->btree_id],
+		  __entry->pos_inode,
+		  __entry->pos_offset,
+		  __entry->pos_snapshot,
+		  __entry->old_u64s,
+		  __entry->new_u64s)
+);
+
+DEFINE_EVENT(transaction_event,	trans_restart_write_buffer_flush,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip),
+	TP_ARGS(trans, caller_ip)
+);
+
+TRACE_EVENT(write_buffer_flush,
+	TP_PROTO(struct btree_trans *trans, size_t nr, size_t skipped, size_t fast, size_t size),
+	TP_ARGS(trans, nr, skipped, fast, size),
+
+	TP_STRUCT__entry(
+		__field(size_t,		nr		)
+		__field(size_t,		skipped		)
+		__field(size_t,		fast		)
+		__field(size_t,		size		)
+	),
+
+	TP_fast_assign(
+		__entry->nr	= nr;
+		__entry->skipped = skipped;
+		__entry->fast	= fast;
+		__entry->size	= size;
+	),
+
+	TP_printk("%zu/%zu skipped %zu fast %zu",
+		  __entry->nr, __entry->size, __entry->skipped, __entry->fast)
+);
+
+TRACE_EVENT(write_buffer_flush_slowpath,
+	TP_PROTO(struct btree_trans *trans, size_t nr, size_t size),
+	TP_ARGS(trans, nr, size),
+
+	TP_STRUCT__entry(
+		__field(size_t,		nr		)
+		__field(size_t,		size		)
+	),
+
+	TP_fast_assign(
+		__entry->nr	= nr;
+		__entry->size	= size;
+	),
+
+	TP_printk("%zu/%zu", __entry->nr, __entry->size)
+);
+
+#endif /* _TRACE_BCACHEFS_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../fs/bcachefs
+
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+#include <trace/define_trace.h>
diff --git a/fs/bcachefs/two_state_shared_lock.c b/fs/bcachefs/two_state_shared_lock.c
new file mode 100644
index 000000000000..9764c2e6a910
--- /dev/null
+++ b/fs/bcachefs/two_state_shared_lock.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "two_state_shared_lock.h"
+
+void __bch2_two_state_lock(two_state_lock_t *lock, int s)
+{
+	__wait_event(lock->wait, bch2_two_state_trylock(lock, s));	/* wait on lock->wait until the trylock succeeds */
+}
diff --git a/fs/bcachefs/two_state_shared_lock.h b/fs/bcachefs/two_state_shared_lock.h
new file mode 100644
index 000000000000..905801772002
--- /dev/null
+++ b/fs/bcachefs/two_state_shared_lock.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_TWO_STATE_LOCK_H
+#define _BCACHEFS_TWO_STATE_LOCK_H
+
+#include <linux/atomic.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+
+#include "util.h"
+
+/*
+ * Two-state lock - can be taken for add or block - both states are shared,
+ * like read side of rwsem, but conflict with other state:
+ */
+typedef struct {
+	atomic_long_t		v;
+	wait_queue_head_t	wait;
+} two_state_lock_t;
+
+static inline void two_state_lock_init(two_state_lock_t *lock)
+{
+	atomic_long_set(&lock->v, 0);
+	init_waitqueue_head(&lock->wait);
+}
+
+static inline void bch2_two_state_unlock(two_state_lock_t *lock, int s)
+{
+	long i = s ? 1 : -1;	/* same per-state delta that trylock added */
+
+	EBUG_ON(atomic_long_read(&lock->v) == 0);	/* unlocking a lock nobody holds */
+
+	if (atomic_long_sub_return_release(i, &lock->v) == 0)
+		wake_up_all(&lock->wait);	/* count back at zero: wake every waiter */
+}
+
+static inline bool bch2_two_state_trylock(two_state_lock_t *lock, int s)
+{
+	long i = s ? 1 : -1;	/* nonzero @s counts v upwards, zero @s counts it downwards */
+	long v = atomic_long_read(&lock->v), old;
+
+	do {
+		old = v;
+
+		if (i > 0 ? v < 0 : v > 0)	/* currently held in the other state */
+			return false;
+	} while ((v = atomic_long_cmpxchg_acquire(&lock->v,
+					old, old + i)) != old);	/* v changed under us: retry */
+	return true;
+}
+
+void __bch2_two_state_lock(two_state_lock_t *, int);
+
+static inline void bch2_two_state_lock(two_state_lock_t *lock, int s)
+{
+	if (!bch2_two_state_trylock(lock, s))	/* fast path failed: take the out-of-line slow path */
+		__bch2_two_state_lock(lock, s);
+}
+
+#endif /* _BCACHEFS_TWO_STATE_LOCK_H */
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
new file mode 100644
index 000000000000..08bac0ba8d0b
--- /dev/null
+++ b/fs/bcachefs/util.c
@@ -0,0 +1,1141 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * random utility code, for bcache but in theory not specific to bcache
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/console.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/log2.h>
+#include <linux/math64.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/sched/clock.h>
+
+#include "eytzinger.h"
+#include "mean_and_variance.h"
+#include "util.h"
+
+static const char si_units[] = "?kMGTPEZY";
+
+/* string_get_size units: */
+static const char *const units_2[] = {
+	"B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"
+};
+static const char *const units_10[] = {
+	"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"
+};
+
+/* Parse a run of decimal digits into *res; returns its length or -errno. */
+static int parse_u64(const char *cp, u64 *res)
+{
+	const char *p = cp;
+	u64 acc = 0;
+
+	if (!isdigit(*p))
+		return -EINVAL;
+
+	for (; isdigit(*p); p++) {
+		unsigned digit = *p - '0';
+
+		/* reject anything that would wrap: acc * 10 + digit > U64_MAX */
+		if (acc > U64_MAX / 10 || acc * 10 > U64_MAX - digit)
+			return -ERANGE;
+		acc = acc * 10 + digit;
+	}
+
+	*res = acc;
+	return p - cp;
+}
+
+/* Compute n^p into *res: 0 on success, -ERANGE on u64 overflow. */
+static int bch2_pow(u64 n, u64 p, u64 *res)
+{
+	*res = 1;
+	while (p--) {
+		if (*res > div64_u64(U64_MAX, n))	/* n is u64, not u32 */
+			return -ERANGE;
+		*res *= n;
+	}
+	return 0;
+}
+
+/* Parse an optional unit suffix: multiplier in *res, returns chars consumed. */
+static int parse_unit_suffix(const char *cp, u64 *res)
+{
+	const char *start = cp;
+	u64 base = 1024;
+	unsigned u;
+	int ret;
+
+	if (*cp == ' ')
+		cp++;
+	/*
+	 * Match the multi-character names before the bare SI letters: testing
+	 * the letters first would eat the 'M' of "MiB"/"MB" and leave the rest.
+	 */
+	for (u = 0; u < ARRAY_SIZE(units_2); u++)
+		if (!strncmp(cp, units_2[u], strlen(units_2[u]))) {
+			cp += strlen(units_2[u]);
+			goto got_unit;
+		}
+	for (u = 0; u < ARRAY_SIZE(units_10); u++)
+		if (!strncmp(cp, units_10[u], strlen(units_10[u]))) {
+			cp += strlen(units_10[u]);
+			base = 1000;
+			goto got_unit;
+		}
+	for (u = 1; u < strlen(si_units); u++)
+		if (*cp == si_units[u]) {
+			cp++;
+			goto got_unit;
+		}
+	*res = 1;	/* no suffix: nothing consumed */
+	return 0;
+got_unit:
+	ret = bch2_pow(base, u, res);
+	if (ret)
+		return ret;
+	return cp - start;
+}
+
+#define parse_or_ret(cp, _f)			\
+do {						\
+	int _ret = _f;				\
+	if (_ret < 0)				\
+		return _ret;			\
+	cp += _ret;				\
+} while (0)
+
+/* Parse "<int>[.<frac>][<unit>]" into *res; returns chars consumed or -errno. */
+static int __bch2_strtou64_h(const char *cp, u64 *res)
+{
+	const char *start = cp;
+	u64 v = 0, b, f_n = 0, f_d = 1;
+	int ret;
+
+	parse_or_ret(cp, parse_u64(cp, &v));
+	if (*cp == '.') {
+		cp++;
+		ret = parse_u64(cp, &f_n);
+		if (ret < 0)
+			return ret;
+		cp += ret;
+		/* f_d = 10^(number of fractional digits) */
+		ret = bch2_pow(10, ret, &f_d);
+		if (ret)
+			return ret;
+	}
+	parse_or_ret(cp, parse_unit_suffix(cp, &b));
+
+	/*
+	 * b and f_d are full 64-bit values (a "T" suffix gives 2^40), so they
+	 * must not go through div_u64(), which takes a 32-bit divisor.
+	 */
+	if (v > div64_u64(U64_MAX, b))
+		return -ERANGE;
+	v *= b;
+	if (f_n > div64_u64(U64_MAX, b))
+		return -ERANGE;
+	f_n = div64_u64(f_n * b, f_d);
+	if (v + f_n < v)
+		return -ERANGE;
+	v += f_n;
+	*res = v;
+	return cp - start;
+}
+
+static int __bch2_strtoh(const char *cp, u64 *res,
+			 u64 t_max, bool t_signed)
+{
+	bool positive = *cp != '-';
+	u64 v = 0;
+
+	if (*cp == '+' || *cp == '-')
+		cp++;
+
+	parse_or_ret(cp, __bch2_strtou64_h(cp, &v));
+
+	if (*cp == '\n')
+		cp++;
+	if (*cp)
+		return -EINVAL;
+
+	if (positive) {
+		if (v > t_max)
+			return -ERANGE;
+	} else {
+		if (v && !t_signed)
+			return -ERANGE;
+
+		if (v > t_max + 1)
+			return -ERANGE;
+		v = -v;
+	}
+
+	*res = v;
+	return 0;
+}
+
+#define STRTO_H(name, type)					\
+int bch2_ ## name ## _h(const char *cp, type *res)		\
+{								\
+	u64 v = 0;						\
+	int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type),	\
+			ANYSINT_MAX(type) != ((type) ~0ULL));	\
+	*res = v;						\
+	return ret;						\
+}
+
+STRTO_H(strtoint, int)
+STRTO_H(strtouint, unsigned int)
+STRTO_H(strtoll, long long)
+STRTO_H(strtoull, unsigned long long)
+STRTO_H(strtou64, u64)
+
+/*
+ * Parse comma separated names from @list into a bitmask (bit N = list[N]).
+ * Returns (u64) -1 on an unknown name, (u64) -ENOMEM if out of memory.
+ */
+u64 bch2_read_flag_list(char *opt, const char * const list[])
+{
+	u64 ret = 0;
+	char *p, *s, *d = kstrdup(opt, GFP_KERNEL);
+
+	if (!d)
+		return -ENOMEM;
+	s = strim(d);
+	while ((p = strsep(&s, ","))) {
+		int flag = match_string(list, -1, p);
+
+		if (flag < 0) {
+			ret = -1;
+			break;
+		}
+		/* 64-bit shift: a plain int "1 << flag" breaks for flag >= 31 */
+		ret |= 1ULL << flag;
+	}
+	kfree(d);
+	return ret;
+}
+
+/* Returns true iff all @n bytes at @_p are zero (trivially true for n == 0). */
+bool bch2_is_zero(const void *_p, size_t n)
+{
+	const char *byte = _p, *end = byte + n;
+
+	while (byte < end)
+		if (*byte++)
+			return false;
+	return true;
+}
+
+void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits)
+{
+	while (nr_bits)
+		prt_char(out, '0' + ((v >> --nr_bits) & 1));
+}
+
+void bch2_print_string_as_lines(const char *prefix, const char *lines)
+{
+	const char *p;
+
+	if (!lines) {
+		printk("%s (null)\n", prefix);
+		return;
+	}
+
+	console_lock();
+	while (1) {
+		p = strchrnul(lines, '\n');
+		printk("%s%.*s\n", prefix, (int) (p - lines), lines);
+		if (!*p)
+			break;
+		lines = p + 1;
+	}
+	console_unlock();
+}
+
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task)
+{
+#ifdef CONFIG_STACKTRACE
+	unsigned nr_entries = 0;
+	int ret = 0;
+
+	stack->nr = 0;
+	ret = darray_make_room(stack, 32);
+	if (ret)
+		return ret;
+
+	if (!down_read_trylock(&task->signal->exec_update_lock))
+		return -1;
+
+	do {
+		nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, 0);
+	} while (nr_entries == stack->size &&
+		 !(ret = darray_make_room(stack, stack->size * 2)));
+
+	stack->nr = nr_entries;
+	up_read(&task->signal->exec_update_lock);
+
+	return ret;
+#else
+	return 0;
+#endif
+}
+
+void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack)
+{
+	unsigned long *i;
+
+	darray_for_each(*stack, i) {
+		prt_printf(out, "[<0>] %pB", (void *) *i);
+		prt_newline(out);
+	}
+}
+
+int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task)
+{
+	bch_stacktrace stack = { 0 };
+	int ret = bch2_save_backtrace(&stack, task);
+
+	bch2_prt_backtrace(out, &stack);
+	darray_exit(&stack);
+	return ret;
+}
+
+/* time stats: */
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v)
+{
+	unsigned i = 0;	/* walk q->entries from index 0 via eytzinger0_child() */
+
+	while (i < ARRAY_SIZE(q->entries)) {
+		struct bch2_quantile_entry *e = q->entries + i;
+
+		if (unlikely(!e->step)) {	/* step still zero: seed this entry from @v */
+			e->m = v;
+			e->step = max_t(unsigned, v / 2, 1024);
+		} else if (e->m > v) {	/* estimate above the sample: step down, floor at 0 */
+			e->m = e->m >= e->step
+				? e->m - e->step
+				: 0;
+		} else if (e->m < v) {	/* estimate below the sample: step up, U32_MAX if the add wraps */
+			e->m = e->m + e->step > e->m
+				? e->m + e->step
+				: U32_MAX;
+		}
+
+		if ((e->m > v ? e->m - v : v - e->m) < e->step)
+			e->step = max_t(unsigned, e->step / 2, 1);	/* within one step of @v: halve the step */
+
+		if (v >= e->m)
+			break;
+
+		i = eytzinger0_child(i, v > e->m);	/* NOTE(review): v < e->m here, so the child argument is always 0 - confirm intended */
+	}
+}
+
+static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
+					      u64 start, u64 end)
+{
+	u64 duration, freq;
+
+	if (time_after64(end, start)) {
+		duration = end - start;
+		mean_and_variance_update(&stats->duration_stats, duration);
+		mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
+		stats->max_duration = max(stats->max_duration, duration);
+		stats->min_duration = min(stats->min_duration, duration);
+		bch2_quantiles_update(&stats->quantiles, duration);
+	}
+
+	if (time_after64(end, stats->last_event)) {
+		freq = end - stats->last_event;
+		mean_and_variance_update(&stats->freq_stats, freq);
+		mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
+		stats->max_freq = max(stats->max_freq, freq);
+		stats->min_freq = min(stats->min_freq, freq);
+		stats->last_event = end;
+	}
+}
+
+static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
+						  struct bch2_time_stat_buffer *b)
+{
+	struct bch2_time_stat_buffer_entry *i;
+	unsigned long flags;
+
+	spin_lock_irqsave(&stats->lock, flags);	/* fold the buffered samples into @stats under its lock */
+	for (i = b->entries;
+	     i < b->entries + ARRAY_SIZE(b->entries);	/* every slot, not just the first b->nr */
+	     i++)
+		bch2_time_stats_update_one(stats, i->start, i->end);
+	spin_unlock_irqrestore(&stats->lock, flags);
+
+	b->nr = 0;	/* mark the buffer empty again */
+}
+
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
+{
+	unsigned long flags;
+
+	WARN_RATELIMIT(!stats->min_duration || !stats->min_freq,
+		       "time_stats: min_duration = %llu, min_freq = %llu",
+		       stats->min_duration, stats->min_freq);
+
+	if (!stats->buffer) {
+		spin_lock_irqsave(&stats->lock, flags);
+		bch2_time_stats_update_one(stats, start, end);
+
+		if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 &&
+		    stats->duration_stats.n > 1024)
+			stats->buffer =
+				alloc_percpu_gfp(struct bch2_time_stat_buffer,
+						 GFP_ATOMIC);
+		spin_unlock_irqrestore(&stats->lock, flags);
+	} else {
+		struct bch2_time_stat_buffer *b;
+
+		preempt_disable();
+		b = this_cpu_ptr(stats->buffer);
+
+		BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
+		b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) {
+			.start = start,
+			.end = end
+		};
+
+		if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
+			bch2_time_stats_clear_buffer(stats, b);
+		preempt_enable();
+	}
+}
+#endif
+
+static const struct time_unit {
+	const char	*name;
+	u64		nsecs;
+} time_units[] = {
+	{ "ns",		1		 },
+	{ "us",		NSEC_PER_USEC	 },
+	{ "ms",		NSEC_PER_MSEC	 },
+	{ "s",		NSEC_PER_SEC	 },
+	{ "m",          (u64) NSEC_PER_SEC * 60},
+	{ "h",          (u64) NSEC_PER_SEC * 3600},
+	{ "eon",        U64_MAX          },
+};
+
+static const struct time_unit *pick_time_units(u64 ns)
+{
+	const struct time_unit *u;
+
+	for (u = time_units;
+	     u + 1 < time_units + ARRAY_SIZE(time_units) &&
+	     ns >= u[1].nsecs << 1;
+	     u++)
+		;
+
+	return u;
+}
+
+/* Print @ns as "<value> <unit>"; div64_u64() since u->nsecs may exceed 32 bits */
+void bch2_pr_time_units(struct printbuf *out, u64 ns)
+{
+	const struct time_unit *u = pick_time_units(ns);
+	prt_printf(out, "%llu %s", div64_u64(ns, u->nsecs), u->name);
+}
+
+static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
+{
+	const struct time_unit *u = pick_time_units(ns);
+
+	prt_printf(out, "%llu ", div64_u64(ns, u->nsecs));
+	prt_tab_rjust(out);
+	prt_printf(out, "%s", u->name);
+}
+
+#define TABSTOP_SIZE 12
+
+static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
+{
+	prt_str(out, name);
+	prt_tab(out);
+	bch2_pr_time_units_aligned(out, ns);
+	prt_newline(out);
+}
+
+/*
+ * Print count, duration and inter-event time statistics (min/max/mean/stddev,
+ * since mount and recent weighted) plus the duration quantiles of @stats.
+ */
+void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
+{
+	const struct time_unit *u;
+	s64 f_mean = 0, d_mean = 0;
+	u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
+	int i;
+	/* avoid divide by zero */
+	if (stats->freq_stats.n) {
+		f_mean = mean_and_variance_get_mean(stats->freq_stats);
+		f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
+		d_mean = mean_and_variance_get_mean(stats->duration_stats);
+		d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
+	}
+
+	printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE);
+	prt_printf(out, "count:");
+	prt_tab(out);
+	prt_printf(out, "%llu ", stats->duration_stats.n);
+	printbuf_tabstop_pop(out);
+	prt_newline(out);
+	printbuf_tabstops_reset(out);
+
+	printbuf_tabstop_push(out, out->indent + 20);
+	printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
+	printbuf_tabstop_push(out, 0);
+	printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
+
+	prt_tab(out);
+	prt_printf(out, "since mount");
+	prt_tab_rjust(out);
+	prt_tab(out);
+	prt_printf(out, "recent");
+	prt_tab_rjust(out);
+	prt_newline(out);
+
+	printbuf_tabstops_reset(out);
+	printbuf_tabstop_push(out, out->indent + 20);
+	printbuf_tabstop_push(out, TABSTOP_SIZE);
+	printbuf_tabstop_push(out, 2);
+	printbuf_tabstop_push(out, TABSTOP_SIZE);
+
+	prt_printf(out, "duration of events");
+	prt_newline(out);
+	printbuf_indent_add(out, 2);
+
+	pr_name_and_units(out, "min:", stats->min_duration);
+	pr_name_and_units(out, "max:", stats->max_duration);
+
+	prt_printf(out, "mean:");
+	prt_tab(out);
+	bch2_pr_time_units_aligned(out, d_mean);
+	prt_tab(out);
+	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
+	prt_newline(out);
+
+	prt_printf(out, "stddev:");
+	prt_tab(out);
+	bch2_pr_time_units_aligned(out, d_stddev);
+	prt_tab(out);
+	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
+
+	printbuf_indent_sub(out, 2);
+	prt_newline(out);
+
+	prt_printf(out, "time between events");
+	prt_newline(out);
+	printbuf_indent_add(out, 2);
+
+	pr_name_and_units(out, "min:", stats->min_freq);
+	pr_name_and_units(out, "max:", stats->max_freq);
+
+	prt_printf(out, "mean:");
+	prt_tab(out);
+	bch2_pr_time_units_aligned(out, f_mean);
+	prt_tab(out);
+	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
+	prt_newline(out);
+
+	prt_printf(out, "stddev:");
+	prt_tab(out);
+	bch2_pr_time_units_aligned(out, f_stddev);
+	prt_tab(out);
+	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
+
+	printbuf_indent_sub(out, 2);
+	prt_newline(out);
+
+	printbuf_tabstops_reset(out);
+
+	i = eytzinger0_first(NR_QUANTILES);
+	u = pick_time_units(stats->quantiles.entries[i].m);
+
+	prt_printf(out, "quantiles (%s):\t", u->name);
+	eytzinger0_for_each(i, NR_QUANTILES) {
+		bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+
+		q = max(stats->quantiles.entries[i].m, last_q);
+		/* u->nsecs is u64: div_u64() would truncate it to 32 bits */
+		prt_printf(out, "%llu ", div64_u64(q, u->nsecs));
+		if (is_last)
+			prt_newline(out);
+		last_q = q;
+	}
+}
+
+void bch2_time_stats_exit(struct bch2_time_stats *stats)
+{
+	free_percpu(stats->buffer);
+}
+
+void bch2_time_stats_init(struct bch2_time_stats *stats)
+{
+	memset(stats, 0, sizeof(*stats));
+	stats->duration_stats_weighted.weight = 8;
+	stats->freq_stats_weighted.weight = 8;
+	stats->min_duration = U64_MAX;
+	stats->min_freq = U64_MAX;
+	spin_lock_init(&stats->lock);
+}
+
+/* ratelimit: */
+
+/**
+ * bch2_ratelimit_delay() - return how long to delay until the next time to do
+ *		some work
+ * @d:		the struct bch_ratelimit to update
+ * Returns:	the amount of time to delay by, in jiffies
+ */
+u64 bch2_ratelimit_delay(struct bch_ratelimit *d)
+{
+	u64 now = local_clock();
+
+	return time_after64(d->next, now)
+		? nsecs_to_jiffies(d->next - now)
+		: 0;
+}
+
+/**
+ * bch2_ratelimit_increment() - increment @d by the amount of work done
+ * @d:		the struct bch_ratelimit to update
+ * @done:	the amount of work done, in arbitrary units
+ */
+void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done)
+{
+	u64 now = local_clock();
+
+	d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+
+	if (time_before64(now + NSEC_PER_SEC, d->next))
+		d->next = now + NSEC_PER_SEC;
+
+	if (time_after64(now - NSEC_PER_SEC * 2, d->next))
+		d->next = now - NSEC_PER_SEC * 2;
+}
+
+/* pd controller: */
+
+/*
+ * Updates pd_controller. Attempts to scale input values to units per second.
+ * @target: desired value
+ * @actual: current value
+ *
+ * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing
+ * it makes actual go down.
+ */
+void bch2_pd_controller_update(struct bch_pd_controller *pd,
+			      s64 target, s64 actual, int sign)
+{
+	s64 proportional, derivative, change;
+
+	unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ;
+
+	if (seconds_since_update == 0)
+		return;
+
+	pd->last_update = jiffies;
+
+	proportional = actual - target;
+	proportional *= seconds_since_update;
+	proportional = div_s64(proportional, pd->p_term_inverse);
+
+	derivative = actual - pd->last_actual;
+	derivative = div_s64(derivative, seconds_since_update);
+	derivative = ewma_add(pd->smoothed_derivative, derivative,
+			      (pd->d_term / seconds_since_update) ?: 1);
+	derivative = derivative * pd->d_term;
+	derivative = div_s64(derivative, pd->p_term_inverse);
+
+	change = proportional + derivative;
+
+	/* Don't increase rate if not keeping up */
+	if (change > 0 &&
+	    pd->backpressure &&
+	    time_after64(local_clock(),
+			 pd->rate.next + NSEC_PER_MSEC))
+		change = 0;
+
+	change *= (sign * -1);
+
+	pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change,
+				1, UINT_MAX);
+
+	pd->last_actual		= actual;
+	pd->last_derivative	= derivative;
+	pd->last_proportional	= proportional;
+	pd->last_change		= change;
+	pd->last_target		= target;
+}
+
+void bch2_pd_controller_init(struct bch_pd_controller *pd)
+{
+	pd->rate.rate		= 1024;
+	pd->last_update		= jiffies;
+	pd->p_term_inverse	= 6000;
+	pd->d_term		= 30;
+	pd->d_smooth		= pd->d_term;
+	pd->backpressure	= 1;
+}
+
+void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd)
+{
+	if (!out->nr_tabstops)
+		printbuf_tabstop_push(out, 20);
+
+	prt_printf(out, "rate:");
+	prt_tab(out);
+	prt_human_readable_s64(out, pd->rate.rate);
+	prt_newline(out);
+
+	prt_printf(out, "target:");
+	prt_tab(out);
+	prt_human_readable_u64(out, pd->last_target);
+	prt_newline(out);
+
+	prt_printf(out, "actual:");
+	prt_tab(out);
+	prt_human_readable_u64(out, pd->last_actual);
+	prt_newline(out);
+
+	prt_printf(out, "proportional:");
+	prt_tab(out);
+	prt_human_readable_s64(out, pd->last_proportional);
+	prt_newline(out);
+
+	prt_printf(out, "derivative:");
+	prt_tab(out);
+	prt_human_readable_s64(out, pd->last_derivative);
+	prt_newline(out);
+
+	prt_printf(out, "change:");
+	prt_tab(out);
+	prt_human_readable_s64(out, pd->last_change);
+	prt_newline(out);
+
+	prt_printf(out, "next io:");
+	prt_tab(out);
+	prt_printf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC));
+	prt_newline(out);
+}
+
+/* misc: */
+
+void bch2_bio_map(struct bio *bio, void *base, size_t size)
+{
+	while (size) {
+		struct page *page = is_vmalloc_addr(base)
+				? vmalloc_to_page(base)
+				: virt_to_page(base);
+		unsigned offset = offset_in_page(base);
+		unsigned len = min_t(size_t, PAGE_SIZE - offset, size);
+
+		BUG_ON(!bio_add_page(bio, page, len, offset));
+		size -= len;
+		base += len;
+	}
+}
+
+int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
+{
+	while (size) {
+		struct page *page = alloc_pages(gfp_mask, 0);	/* one order-0 page per pass */
+		unsigned len = min_t(size_t, PAGE_SIZE, size);
+
+		if (!page)
+			return -ENOMEM;	/* pages already added are left attached to @bio */
+
+		if (unlikely(!bio_add_page(bio, page, len, 0))) {
+			__free_page(page);
+			break;	/* NOTE(review): falls through to "return 0" with @size only partly added - confirm intended */
+		}
+
+		size -= len;
+	}
+
+	return 0;
+}
+
+/* Return a random value in [0, max); returns 0 when max is 0. */
+size_t bch2_rand_range(size_t max)
+{
+	size_t mask, pick;
+
+	if (!max)
+		return 0;
+	/* rejection sampling: mask down to the next power of two, retry if >= max */
+	mask = roundup_pow_of_two(max) - 1;
+	do {
+		pick = get_random_long() & mask;
+	} while (pick >= max);
+	return pick;
+}
+
+void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src)
+{
+	struct bio_vec bv;
+	struct bvec_iter iter;
+
+	__bio_for_each_segment(bv, dst, iter, dst_iter) {
+		void *dstp = kmap_local_page(bv.bv_page);
+
+		memcpy(dstp + bv.bv_offset, src, bv.bv_len);
+		kunmap_local(dstp);
+
+		src += bv.bv_len;
+	}
+}
+
+void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
+{
+	struct bio_vec bv;
+	struct bvec_iter iter;
+
+	__bio_for_each_segment(bv, src, iter, src_iter) {
+		void *srcp = kmap_local_page(bv.bv_page);
+
+		memcpy(dst, srcp + bv.bv_offset, bv.bv_len);
+		kunmap_local(srcp);
+
+		dst += bv.bv_len;
+	}
+}
+
+static int alignment_ok(const void *base, size_t align)
+{
+	return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
+		((unsigned long)base & (align - 1)) == 0;
+}
+
+static void u32_swap(void *a, void *b, size_t size)
+{
+	u32 t = *(u32 *)a;
+	*(u32 *)a = *(u32 *)b;
+	*(u32 *)b = t;
+}
+
+static void u64_swap(void *a, void *b, size_t size)
+{
+	u64 t = *(u64 *)a;
+	*(u64 *)a = *(u64 *)b;
+	*(u64 *)b = t;
+}
+
+static void generic_swap(void *a, void *b, size_t size)
+{
+	char t;
+
+	do {
+		t = *(char *)a;
+		*(char *)a++ = *(char *)b;
+		*(char *)b++ = t;
+	} while (--size > 0);
+}
+
+static inline int do_cmp(void *base, size_t n, size_t size,
+			 int (*cmp_func)(const void *, const void *, size_t),
+			 size_t l, size_t r)
+{
+	return cmp_func(base + inorder_to_eytzinger0(l, n) * size,
+			base + inorder_to_eytzinger0(r, n) * size,
+			size);
+}
+
+static inline void do_swap(void *base, size_t n, size_t size,
+			   void (*swap_func)(void *, void *, size_t),
+			   size_t l, size_t r)
+{
+	swap_func(base + inorder_to_eytzinger0(l, n) * size,
+		  base + inorder_to_eytzinger0(r, n) * size,
+		  size);
+}
+
+void eytzinger0_sort(void *base, size_t n, size_t size,
+		     int (*cmp_func)(const void *, const void *, size_t),
+		     void (*swap_func)(void *, void *, size_t))
+{
+	int i, c, r;
+
+	if (!swap_func) {
+		if (size == 4 && alignment_ok(base, 4))
+			swap_func = u32_swap;
+		else if (size == 8 && alignment_ok(base, 8))
+			swap_func = u64_swap;
+		else
+			swap_func = generic_swap;
+	}
+
+	/* heapify */
+	for (i = n / 2 - 1; i >= 0; --i) {
+		for (r = i; r * 2 + 1 < n; r = c) {
+			c = r * 2 + 1;
+
+			if (c + 1 < n &&
+			    do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
+				c++;
+
+			if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
+				break;
+
+			do_swap(base, n, size, swap_func, r, c);
+		}
+	}
+
+	/* sort */
+	for (i = n - 1; i > 0; --i) {
+		do_swap(base, n, size, swap_func, 0, i);
+
+		for (r = 0; r * 2 + 1 < i; r = c) {
+			c = r * 2 + 1;
+
+			if (c + 1 < i &&
+			    do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
+				c++;
+
+			if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
+				break;
+
+			do_swap(base, n, size, swap_func, r, c);
+		}
+	}
+}
+
+void sort_cmp_size(void *base, size_t num, size_t size,
+	  int (*cmp_func)(const void *, const void *, size_t),
+	  void (*swap_func)(void *, void *, size_t size))
+{
+	/* pre-scaled byte offsets; ssize_t so arrays larger than INT_MAX bytes don't truncate */
+	ssize_t i = (num/2 - 1) * size, n = num * size, c, r;
+
+	if (!swap_func) {	/* no swap supplied: pick one by element size and alignment */
+		if (size == 4 && alignment_ok(base, 4))
+			swap_func = u32_swap;
+		else if (size == 8 && alignment_ok(base, 8))
+			swap_func = u64_swap;
+		else
+			swap_func = generic_swap;
+	}
+
+	/* heapify */
+	for ( ; i >= 0; i -= size) {	/* num < 2 starts with i < 0 and skips this loop */
+		for (r = i; r * 2 + size < n; r = c) {
+			c = r * 2 + size;	/* byte offset of the left child */
+			if (c < n - size &&
+			    cmp_func(base + c, base + c + size, size) < 0)
+				c += size;	/* right child is larger */
+			if (cmp_func(base + r, base + c, size) >= 0)
+				break;
+			swap_func(base + r, base + c, size);
+		}
+	}
+
+	/* sort */
+	for (i = n - size; i > 0; i -= size) {
+		swap_func(base, base + i, size);	/* move current max behind the shrinking heap */
+		for (r = 0; r * 2 + size < i; r = c) {
+			c = r * 2 + size;
+			if (c < i - size &&
+			    cmp_func(base + c, base + c + size, size) < 0)
+				c += size;
+			if (cmp_func(base + r, base + c, size) >= 0)
+				break;
+			swap_func(base + r, base + c, size);
+		}
+	}
+}
+
+static void mempool_free_vp(void *element, void *pool_data)
+{
+	size_t size = (size_t) pool_data;
+
+	vpfree(element, size);
+}
+
+static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
+{
+	size_t size = (size_t) pool_data;
+
+	return vpmalloc(size, gfp_mask);
+}
+
+int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
+{
+	return size < PAGE_SIZE
+		? mempool_init_kmalloc_pool(pool, min_nr, size)
+		: mempool_init(pool, min_nr, mempool_alloc_vp,
+			       mempool_free_vp, (void *) size);
+}
+
+#if 0
+void eytzinger1_test(void)
+{
+	unsigned inorder, eytz, size;
+
+	pr_info("1 based eytzinger test:");
+
+	for (size = 2;
+	     size < 65536;
+	     size++) {
+		unsigned extra = eytzinger1_extra(size);
+
+		if (!(size % 4096))
+			pr_info("tree size %u", size);
+
+		BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size));
+		BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size));
+
+		BUG_ON(eytzinger1_prev(eytzinger1_first(size), size)	!= 0);
+		BUG_ON(eytzinger1_next(eytzinger1_last(size), size)	!= 0);
+
+		inorder = 1;
+		eytzinger1_for_each(eytz, size) {
+			BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz);
+			BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder);
+			BUG_ON(eytz != eytzinger1_last(size) &&
+			       eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz);
+
+			inorder++;
+		}
+	}
+}
+
+void eytzinger0_test(void)
+{
+
+	unsigned inorder, eytz, size;
+
+	pr_info("0 based eytzinger test:");
+
+	for (size = 1;
+	     size < 65536;
+	     size++) {
+		unsigned extra = eytzinger0_extra(size);
+
+		if (!(size % 4096))
+			pr_info("tree size %u", size);
+
+		BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size));
+		BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size));
+
+		BUG_ON(eytzinger0_prev(eytzinger0_first(size), size)	!= -1);
+		BUG_ON(eytzinger0_next(eytzinger0_last(size), size)	!= -1);
+
+		inorder = 0;
+		eytzinger0_for_each(eytz, size) {
+			BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz);
+			BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder);
+			BUG_ON(eytz != eytzinger0_last(size) &&
+			       eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz);
+
+			inorder++;
+		}
+	}
+}
+
+static inline int cmp_u16(const void *_l, const void *_r, size_t size)
+{
+	const u16 *l = _l, *r = _r;
+
+	return (*l > *r) - (*l < *r);	/* three-way compare: -1, 0 or 1 (@size is unused) */
+}
+
+static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
+{
+	int i, c1 = -1, c2 = -1;	/* c1: value found by find_le, c2: reference value; -1 = none */
+	ssize_t r;
+
+	r = eytzinger0_find_le(test_array, nr,
+			       sizeof(test_array[0]),
+			       cmp_u16, &search);
+	if (r >= 0)
+		c1 = test_array[r];
+
+	for (i = 0; i < nr; i++)	/* reference answer: linear scan for the largest value <= search */
+		if (test_array[i] <= search && test_array[i] > c2)
+			c2 = test_array[i];
+
+	if (c1 != c2) {
+		eytzinger0_for_each(i, nr)
+			pr_info("[%3u] = %12u", i, test_array[i]);
+		pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i",
+			search, r, c1, c2);	/* report the key searched for, not the dump loop's index */
+	}
+}
+
+void eytzinger0_find_test(void)
+{
+	unsigned i, nr, allocated = 1 << 12;
+	u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL);
+
+	for (nr = 1; nr < allocated; nr++) {
+		pr_info("testing %u elems", nr);
+
+		get_random_bytes(test_array, nr * sizeof(test_array[0]));
+		eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL);
+
+		/* verify array is sorted correctly: */
+		eytzinger0_for_each(i, nr)
+			BUG_ON(i != eytzinger0_last(nr) &&
+			       test_array[i] > test_array[eytzinger0_next(i, nr)]);
+
+		for (i = 0; i < U16_MAX; i += 1 << 12)
+			eytzinger0_find_test_val(test_array, nr, i);
+
+		for (i = 0; i < nr; i++) {
+			eytzinger0_find_test_val(test_array, nr, test_array[i] - 1);
+			eytzinger0_find_test_val(test_array, nr, test_array[i]);
+			eytzinger0_find_test_val(test_array, nr, test_array[i] + 1);
+		}
+	}
+
+	kfree(test_array);
+}
+#endif
+
+/*
+ * Accumulate percpu counters onto one cpu's copy - only valid when all other
+ * access to the percpu counters is blocked by other locking
+ */
+u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
+{
+	u64 *ret;
+	int cpu;
+
+	/* access to pcpu vars has to be blocked by other locking */
+	preempt_disable();
+	ret = this_cpu_ptr(p);
+	preempt_enable();
+
+	for_each_possible_cpu(cpu) {
+		u64 *i = per_cpu_ptr(p, cpu);
+
+		if (i != ret) {
+			acc_u64s(ret, i, nr);
+			memset(i, 0, nr * sizeof(u64));
+		}
+	}
+
+	return ret;
+}
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
new file mode 100644
index 000000000000..849a37ae497c
--- /dev/null
+++ b/fs/bcachefs/util.h
@@ -0,0 +1,852 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_UTIL_H
+#define _BCACHEFS_UTIL_H
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/closure.h>
+#include <linux/errno.h>
+#include <linux/freezer.h>
+#include <linux/kernel.h>
+#include <linux/sched/clock.h>
+#include <linux/llist.h>
+#include <linux/log2.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/ratelimit.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+
+#include "mean_and_variance.h"
+
+#include "darray.h"
+
+struct closure;
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define EBUG_ON(cond)		BUG_ON(cond)
+#else
+#define EBUG_ON(cond)
+#endif
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define CPU_BIG_ENDIAN		0
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define CPU_BIG_ENDIAN		1
+#endif
+
+/* type hackery */
+
+#define type_is_exact(_val, _type)					\
+	__builtin_types_compatible_p(typeof(_val), _type)
+
+#define type_is(_val, _type)						\
+	(__builtin_types_compatible_p(typeof(_val), _type) ||		\
+	 __builtin_types_compatible_p(typeof(_val), const _type))
+
+/* Userspace doesn't align allocations as nicely as the kernel allocators: */
+static inline size_t buf_pages(void *p, size_t len)
+{
+	return DIV_ROUND_UP(len +
+			    ((unsigned long) p & (PAGE_SIZE - 1)),
+			    PAGE_SIZE);
+}
+
+static inline void vpfree(void *p, size_t size)
+{
+	if (is_vmalloc_addr(p))
+		vfree(p);
+	else
+		free_pages((unsigned long) p, get_order(size));
+}
+
+static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
+{
+	return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
+					 get_order(size)) ?:
+		__vmalloc(size, gfp_mask);
+}
+
+static inline void kvpfree(void *p, size_t size)
+{
+	if (size < PAGE_SIZE)
+		kfree(p);
+	else
+		vpfree(p, size);
+}
+
+static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
+{
+	return size < PAGE_SIZE
+		? kmalloc(size, gfp_mask)
+		: vpmalloc(size, gfp_mask);
+}
+
+int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
+
+#define HEAP(type)							\
+struct {								\
+	size_t size, used;						\
+	type *data;							\
+}
+
+#define DECLARE_HEAP(type, name) HEAP(type) name
+
+#define init_heap(heap, _size, gfp)					\
+({									\
+	(heap)->used = 0;						\
+	(heap)->size = (_size);						\
+	(heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\
+				 (gfp));				\
+})
+
+#define free_heap(heap)							\
+do {									\
+	kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0]));	\
+	(heap)->data = NULL;						\
+} while (0)
+
+#define heap_set_backpointer(h, i, _fn)					\
+do {									\
+	void (*fn)(typeof(h), size_t) = _fn;				\
+	if (fn)								\
+		fn(h, i);						\
+} while (0)
+
+#define heap_swap(h, i, j, set_backpointer)				\
+do {									\
+	swap((h)->data[i], (h)->data[j]);				\
+	heap_set_backpointer(h, i, set_backpointer);			\
+	heap_set_backpointer(h, j, set_backpointer);			\
+} while (0)
+
+#define heap_peek(h)							\
+({									\
+	EBUG_ON(!(h)->used);						\
+	(h)->data[0];							\
+})
+
+#define heap_full(h)	((h)->used == (h)->size)
+
+#define heap_sift_down(h, i, cmp, set_backpointer)			\
+do {									\
+	size_t _c, _j = i;						\
+									\
+	for (; _j * 2 + 1 < (h)->used; _j = _c) {			\
+		_c = _j * 2 + 1;					\
+		if (_c + 1 < (h)->used &&				\
+		    cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0)	\
+			_c++;						\
+									\
+		if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0)		\
+			break;						\
+		heap_swap(h, _c, _j, set_backpointer);			\
+	}								\
+} while (0)
+
+#define heap_sift_up(h, i, cmp, set_backpointer)			\
+do {									\
+	while (i) {							\
+		size_t p = (i - 1) / 2;					\
+		if (cmp(h, (h)->data[i], (h)->data[p]) >= 0)		\
+			break;						\
+		heap_swap(h, i, p, set_backpointer);			\
+		i = p;							\
+	}								\
+} while (0)
+
+#define __heap_add(h, d, cmp, set_backpointer)				\
+({									\
+	size_t _i = (h)->used++;					\
+	(h)->data[_i] = d;						\
+	heap_set_backpointer(h, _i, set_backpointer);			\
+									\
+	heap_sift_up(h, _i, cmp, set_backpointer);			\
+	_i;								\
+})
+
+#define heap_add(h, d, cmp, set_backpointer)				\
+({									\
+	bool _r = !heap_full(h);					\
+	if (_r)								\
+		__heap_add(h, d, cmp, set_backpointer);			\
+	_r;								\
+})
+
+#define heap_add_or_replace(h, new, cmp, set_backpointer)		\
+do {									\
+	if (!heap_add(h, new, cmp, set_backpointer) &&			\
+	    cmp(h, new, heap_peek(h)) >= 0) {				\
+		(h)->data[0] = new;					\
+		heap_set_backpointer(h, 0, set_backpointer);		\
+		heap_sift_down(h, 0, cmp, set_backpointer);		\
+	}								\
+} while (0)
+
+#define heap_del(h, i, cmp, set_backpointer)				\
+do {									\
+	size_t _i = (i);						\
+									\
+	BUG_ON(_i >= (h)->used);					\
+	(h)->used--;							\
+	if ((_i) < (h)->used) {						\
+		heap_swap(h, _i, (h)->used, set_backpointer);		\
+		heap_sift_up(h, _i, cmp, set_backpointer);		\
+		heap_sift_down(h, _i, cmp, set_backpointer);		\
+	}								\
+} while (0)
+
+#define heap_pop(h, d, cmp, set_backpointer)				\
+({									\
+	bool _r = (h)->used;						\
+	if (_r) {							\
+		(d) = (h)->data[0];					\
+		heap_del(h, 0, cmp, set_backpointer);			\
+	}								\
+	_r;								\
+})
+
+#define heap_resort(heap, cmp, set_backpointer)				\
+do {									\
+	ssize_t _i;							\
+	for (_i = (ssize_t) (heap)->used / 2 -  1; _i >= 0; --_i)	\
+		heap_sift_down(heap, _i, cmp, set_backpointer);		\
+} while (0)
+
+#define ANYSINT_MAX(t)							\
+	((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
+
+#include "printbuf.h"
+
+#define prt_vprintf(_out, ...)		bch2_prt_vprintf(_out, __VA_ARGS__)
+#define prt_printf(_out, ...)		bch2_prt_printf(_out, __VA_ARGS__)
+#define printbuf_str(_buf)		bch2_printbuf_str(_buf)
+#define printbuf_exit(_buf)		bch2_printbuf_exit(_buf)
+
+#define printbuf_tabstops_reset(_buf)	bch2_printbuf_tabstops_reset(_buf)
+#define printbuf_tabstop_pop(_buf)	bch2_printbuf_tabstop_pop(_buf)
+#define printbuf_tabstop_push(_buf, _n)	bch2_printbuf_tabstop_push(_buf, _n)
+
+#define printbuf_indent_add(_out, _n)	bch2_printbuf_indent_add(_out, _n)
+#define printbuf_indent_sub(_out, _n)	bch2_printbuf_indent_sub(_out, _n)
+
+#define prt_newline(_out)		bch2_prt_newline(_out)
+#define prt_tab(_out)			bch2_prt_tab(_out)
+#define prt_tab_rjust(_out)		bch2_prt_tab_rjust(_out)
+
+#define prt_bytes_indented(...)		bch2_prt_bytes_indented(__VA_ARGS__)
+#define prt_u64(_out, _v)		prt_printf(_out, "%llu", (u64) (_v))
+#define prt_human_readable_u64(...)	bch2_prt_human_readable_u64(__VA_ARGS__)
+#define prt_human_readable_s64(...)	bch2_prt_human_readable_s64(__VA_ARGS__)
+#define prt_units_u64(...)		bch2_prt_units_u64(__VA_ARGS__)
+#define prt_units_s64(...)		bch2_prt_units_s64(__VA_ARGS__)
+#define prt_string_option(...)		bch2_prt_string_option(__VA_ARGS__)
+#define prt_bitflags(...)		bch2_prt_bitflags(__VA_ARGS__)
+
+void bch2_pr_time_units(struct printbuf *, u64);
+
+#ifdef __KERNEL__
+static inline void pr_time(struct printbuf *out, u64 time)
+{
+	prt_printf(out, "%llu", time);
+}
+#else
+#include <time.h>
+static inline void pr_time(struct printbuf *out, u64 _time)
+{
+	char time_str[64];
+	time_t time = _time;
+	struct tm *tm = localtime(&time);
+	size_t err = strftime(time_str, sizeof(time_str), "%c", tm);
+	if (!err)
+		prt_printf(out, "(formatting error)");
+	else
+		prt_printf(out, "%s", time_str);
+}
+#endif
+
+#ifdef __KERNEL__
+static inline void uuid_unparse_lower(u8 *uuid, char *out)
+{
+	sprintf(out, "%pUb", uuid);
+}
+#else
+#include <uuid/uuid.h>
+#endif
+
+static inline void pr_uuid(struct printbuf *out, u8 *uuid)
+{
+	char uuid_str[40];
+
+	uuid_unparse_lower(uuid, uuid_str);
+	prt_printf(out, "%s", uuid_str);
+}
+
+int bch2_strtoint_h(const char *, int *);
+int bch2_strtouint_h(const char *, unsigned int *);
+int bch2_strtoll_h(const char *, long long *);
+int bch2_strtoull_h(const char *, unsigned long long *);
+int bch2_strtou64_h(const char *, u64 *);
+
+static inline int bch2_strtol_h(const char *cp, long *res)
+{
+#if BITS_PER_LONG == 32
+	return bch2_strtoint_h(cp, (int *) res);
+#else
+	return bch2_strtoll_h(cp, (long long *) res);
+#endif
+}
+
+static inline int bch2_strtoul_h(const char *cp, long *res)
+{
+#if BITS_PER_LONG == 32
+	return bch2_strtouint_h(cp, (unsigned int *) res);
+#else
+	return bch2_strtoull_h(cp, (unsigned long long *) res);
+#endif
+}
+
+#define strtoi_h(cp, res)						\
+	( type_is(*(res), int)		? bch2_strtoint_h(cp, (void *) (res))\
+	: type_is(*(res), long)		? bch2_strtol_h(cp, (void *) (res))\
+	: type_is(*(res), long long)	? bch2_strtoll_h(cp, (void *) (res))\
+	: type_is(*(res), unsigned)	? bch2_strtouint_h(cp, (void *) (res))\
+	: type_is(*(res), unsigned long)	? bch2_strtoul_h(cp, (void *) (res))\
+	: type_is(*(res), unsigned long long) ? bch2_strtoull_h(cp, (void *) (res))\
+	: -EINVAL)
+
+#define strtoul_safe(cp, var)						\
+({									\
+	unsigned long _v;						\
+	int _r = kstrtoul(cp, 10, &_v);					\
+	if (!_r)							\
+		var = _v;						\
+	_r;								\
+})
+
+#define strtoul_safe_clamp(cp, var, min, max)				\
+({									\
+	unsigned long _v;						\
+	int _r = kstrtoul(cp, 10, &_v);					\
+	if (!_r)							\
+		var = clamp_t(typeof(var), _v, min, max);		\
+	_r;								\
+})
+
+#define strtoul_safe_restrict(cp, var, min, max)			\
+({									\
+	unsigned long _v;						\
+	int _r = kstrtoul(cp, 10, &_v);					\
+	if (!_r && _v >= min && _v <= max)				\
+		var = _v;						\
+	else								\
+		_r = -EINVAL;						\
+	_r;								\
+})
+
+#define snprint(out, var)						\
+	prt_printf(out,							\
+		   type_is(var, int)		? "%i\n"		\
+		 : type_is(var, unsigned)	? "%u\n"		\
+		 : type_is(var, long)		? "%li\n"		\
+		 : type_is(var, unsigned long)	? "%lu\n"		\
+		 : type_is(var, s64)		? "%lli\n"		\
+		 : type_is(var, u64)		? "%llu\n"		\
+		 : type_is(var, char *)		? "%s\n"		\
+		 : "%i\n", var)
+
+bool bch2_is_zero(const void *, size_t);
+
+u64 bch2_read_flag_list(char *, const char * const[]);
+
+void bch2_prt_u64_binary(struct printbuf *, u64, unsigned);
+
+void bch2_print_string_as_lines(const char *prefix, const char *lines);
+
+typedef DARRAY(unsigned long) bch_stacktrace;
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *);
+void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *);
+int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *);
+
+#define NR_QUANTILES	15
+#define QUANTILE_IDX(i)	inorder_to_eytzinger0(i, NR_QUANTILES)
+#define QUANTILE_FIRST	eytzinger0_first(NR_QUANTILES)
+#define QUANTILE_LAST	eytzinger0_last(NR_QUANTILES)
+
+struct bch2_quantiles {
+	struct bch2_quantile_entry {
+		u64	m;
+		u64	step;
+	}		entries[NR_QUANTILES];
+};
+
+struct bch2_time_stat_buffer {
+	unsigned	nr;
+	struct bch2_time_stat_buffer_entry {
+		u64	start;
+		u64	end;
+	}		entries[32];
+};
+
+struct bch2_time_stats {
+	spinlock_t	lock;
+	/* all fields are in nanoseconds */
+	u64		max_duration;
+	u64             min_duration;
+	u64             max_freq;
+	u64             min_freq;
+	u64		last_event;
+	struct bch2_quantiles quantiles;
+
+	struct mean_and_variance	  duration_stats;
+	struct mean_and_variance_weighted duration_stats_weighted;
+	struct mean_and_variance	  freq_stats;
+	struct mean_and_variance_weighted freq_stats_weighted;
+	struct bch2_time_stat_buffer __percpu *buffer;
+};
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
+#else
+static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
+#endif
+
+static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
+{
+	__bch2_time_stats_update(stats, start, local_clock());
+}
+
+void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
+
+void bch2_time_stats_exit(struct bch2_time_stats *);
+void bch2_time_stats_init(struct bch2_time_stats *);
+
+#define ewma_add(ewma, val, weight)					\
+({									\
+	typeof(ewma) _ewma = (ewma);					\
+	typeof(weight) _weight = (weight);				\
+									\
+	(((_ewma << _weight) - _ewma) + (val)) >> _weight;		\
+})
+
+struct bch_ratelimit {
+	/* Next time we want to do some work, in nanoseconds */
+	u64			next;
+
+	/*
+	 * Rate at which we want to do work, in units per nanosecond
+	 * The units here correspond to the units passed to
+	 * bch2_ratelimit_increment()
+	 */
+	unsigned		rate;
+};
+
+static inline void bch2_ratelimit_reset(struct bch_ratelimit *d)
+{
+	d->next = local_clock();
+}
+
+u64 bch2_ratelimit_delay(struct bch_ratelimit *);
+void bch2_ratelimit_increment(struct bch_ratelimit *, u64);
+
+struct bch_pd_controller {
+	struct bch_ratelimit	rate;
+	unsigned long		last_update;
+
+	s64			last_actual;
+	s64			smoothed_derivative;
+
+	unsigned		p_term_inverse;
+	unsigned		d_smooth;
+	unsigned		d_term;
+
+	/* for exporting to sysfs (no effect on behavior) */
+	s64			last_derivative;
+	s64			last_proportional;
+	s64			last_change;
+	s64			last_target;
+
+	/*
+	 * If true, the rate will not increase if bch2_ratelimit_delay()
+	 * is not being called often enough.
+	 */
+	bool			backpressure;
+};
+
+void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int);
+void bch2_pd_controller_init(struct bch_pd_controller *);
+void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *);
+
+#define sysfs_pd_controller_attribute(name)				\
+	rw_attribute(name##_rate);					\
+	rw_attribute(name##_rate_bytes);				\
+	rw_attribute(name##_rate_d_term);				\
+	rw_attribute(name##_rate_p_term_inverse);			\
+	read_attribute(name##_rate_debug)
+
+#define sysfs_pd_controller_files(name)					\
+	&sysfs_##name##_rate,						\
+	&sysfs_##name##_rate_bytes,					\
+	&sysfs_##name##_rate_d_term,					\
+	&sysfs_##name##_rate_p_term_inverse,				\
+	&sysfs_##name##_rate_debug
+
+#define sysfs_pd_controller_show(name, var)				\
+do {									\
+	sysfs_hprint(name##_rate,		(var)->rate.rate);	\
+	sysfs_print(name##_rate_bytes,		(var)->rate.rate);	\
+	sysfs_print(name##_rate_d_term,		(var)->d_term);		\
+	sysfs_print(name##_rate_p_term_inverse,	(var)->p_term_inverse);	\
+									\
+	if (attr == &sysfs_##name##_rate_debug)				\
+		bch2_pd_controller_debug_to_text(out, var);		\
+} while (0)
+
+#define sysfs_pd_controller_store(name, var)				\
+do {									\
+	sysfs_strtoul_clamp(name##_rate,				\
+			    (var)->rate.rate, 1, UINT_MAX);		\
+	sysfs_strtoul_clamp(name##_rate_bytes,				\
+			    (var)->rate.rate, 1, UINT_MAX);		\
+	sysfs_strtoul(name##_rate_d_term,	(var)->d_term);		\
+	sysfs_strtoul_clamp(name##_rate_p_term_inverse,			\
+			    (var)->p_term_inverse, 1, INT_MAX);		\
+} while (0)
+
+#define container_of_or_null(ptr, type, member)				\
+({									\
+	typeof(ptr) _ptr = ptr;						\
+	_ptr ? container_of(_ptr, type, member) : NULL;			\
+})
+
+/* Does linear interpolation between powers of two; the low @fract_bits bits of @x are the fraction */
+static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
+{
+	unsigned fract = x & ~(~0U << fract_bits);	/* unsigned mask: shifting ~0 (int -1) is undefined */
+
+	x >>= fract_bits;
+	x   = 1U << x;					/* 1U so a shift by 31 stays defined */
+	x  += (x * fract) >> fract_bits;
+
+	return x;
+}
+
+void bch2_bio_map(struct bio *bio, void *base, size_t);
+int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
+
+static inline sector_t bdev_sectors(struct block_device *bdev)
+{
+	return bdev->bd_inode->i_size >> 9;
+}
+
+#define closure_bio_submit(bio, cl)					\
+do {									\
+	closure_get(cl);						\
+	submit_bio(bio);						\
+} while (0)
+
+#define kthread_wait(cond)						\
+({									\
+	int _ret = 0;							\
+									\
+	while (1) {							\
+		set_current_state(TASK_INTERRUPTIBLE);			\
+		if (kthread_should_stop()) {				\
+			_ret = -1;					\
+			break;						\
+		}							\
+									\
+		if (cond)						\
+			break;						\
+									\
+		schedule();						\
+	}								\
+	set_current_state(TASK_RUNNING);				\
+	_ret;								\
+})
+
+#define kthread_wait_freezable(cond)					\
+({									\
+	int _ret = 0;							\
+	while (1) {							\
+		set_current_state(TASK_INTERRUPTIBLE);			\
+		if (kthread_should_stop()) {				\
+			_ret = -1;					\
+			break;						\
+		}							\
+									\
+		if (cond)						\
+			break;						\
+									\
+		schedule();						\
+		try_to_freeze();					\
+	}								\
+	set_current_state(TASK_RUNNING);				\
+	_ret;								\
+})
+
+size_t bch2_rand_range(size_t);
+
+void memcpy_to_bio(struct bio *, struct bvec_iter, const void *);
+void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
+
+static inline void memcpy_u64s_small(void *dst, const void *src,
+				     unsigned u64s)
+{
+	u64 *d = dst;
+	const u64 *s = src;
+
+	while (u64s--)
+		*d++ = *s++;
+}
+
+static inline void __memcpy_u64s(void *dst, const void *src,
+				 unsigned u64s)
+{
+#ifdef CONFIG_X86_64
+	long d0, d1, d2;
+
+	asm volatile("rep ; movsq"
+		     : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+		     : "0" (u64s), "1" (dst), "2" (src)
+		     : "memory");
+#else
+	u64 *d = dst;
+	const u64 *s = src;
+
+	while (u64s--)
+		*d++ = *s++;
+#endif
+}
+
+static inline void memcpy_u64s(void *dst, const void *src,
+			       unsigned u64s)
+{
+	EBUG_ON(!(dst >= src + u64s * sizeof(u64) ||
+		 dst + u64s * sizeof(u64) <= src));
+
+	__memcpy_u64s(dst, src, u64s);
+}
+
+static inline void __memmove_u64s_down(void *dst, const void *src,
+				       unsigned u64s)
+{
+	__memcpy_u64s(dst, src, u64s);
+}
+
+static inline void memmove_u64s_down(void *dst, const void *src,
+				     unsigned u64s)
+{
+	EBUG_ON(dst > src);
+
+	__memmove_u64s_down(dst, src, u64s);
+}
+
+static inline void __memmove_u64s_down_small(void *dst, const void *src,
+				       unsigned u64s)
+{
+	memcpy_u64s_small(dst, src, u64s);
+}
+
+static inline void memmove_u64s_down_small(void *dst, const void *src,
+				     unsigned u64s)
+{
+	EBUG_ON(dst > src);
+
+	__memmove_u64s_down_small(dst, src, u64s);
+}
+
+static inline void __memmove_u64s_up_small(void *_dst, const void *_src,
+					   unsigned u64s)
+{
+	u64 *dst = (u64 *) _dst + u64s;
+	u64 *src = (u64 *) _src + u64s;
+
+	while (u64s--)
+		*--dst = *--src;
+}
+
+static inline void memmove_u64s_up_small(void *dst, const void *src,
+					 unsigned u64s)
+{
+	EBUG_ON(dst < src);
+
+	__memmove_u64s_up_small(dst, src, u64s);
+}
+
+static inline void __memmove_u64s_up(void *_dst, const void *_src,
+				     unsigned u64s)
+{
+	u64 *dst = (u64 *) _dst + u64s - 1;
+	u64 *src = (u64 *) _src + u64s - 1;
+
+#ifdef CONFIG_X86_64
+	long d0, d1, d2;
+
+	asm volatile("std ;\n"
+		     "rep ; movsq\n"
+		     "cld ;\n"
+		     : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+		     : "0" (u64s), "1" (dst), "2" (src)
+		     : "memory");
+#else
+	while (u64s--)
+		*dst-- = *src--;
+#endif
+}
+
+static inline void memmove_u64s_up(void *dst, const void *src,
+				   unsigned u64s)
+{
+	EBUG_ON(dst < src);
+
+	__memmove_u64s_up(dst, src, u64s);
+}
+
+static inline void memmove_u64s(void *dst, const void *src,
+				unsigned u64s)
+{
+	if (dst < src)
+		__memmove_u64s_down(dst, src, u64s);
+	else
+		__memmove_u64s_up(dst, src, u64s);
+}
+
+/* Set the last few bytes up to a u64 boundary given an offset into a buffer. */
+static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
+{
+	unsigned rem = round_up(bytes, sizeof(u64)) - bytes;
+
+	memset(s + bytes, c, rem);
+}
+
+void sort_cmp_size(void *base, size_t num, size_t size,
+	  int (*cmp_func)(const void *, const void *, size_t),
+	  void (*swap_func)(void *, void *, size_t));
+
+/* just the memmove, doesn't update @_nr */
+#define __array_insert_item(_array, _nr, _pos)				\
+	memmove(&(_array)[(_pos) + 1],					\
+		&(_array)[(_pos)],					\
+		sizeof((_array)[0]) * ((_nr) - (_pos)))
+
+#define array_insert_item(_array, _nr, _pos, _new_item)			\
+do {									\
+	__array_insert_item(_array, _nr, _pos);				\
+	(_nr)++;							\
+	(_array)[(_pos)] = (_new_item);					\
+} while (0)
+
+#define array_remove_items(_array, _nr, _pos, _nr_to_remove)		\
+do {									\
+	(_nr) -= (_nr_to_remove);					\
+	memmove(&(_array)[(_pos)],					\
+		&(_array)[(_pos) + (_nr_to_remove)],			\
+		sizeof((_array)[0]) * ((_nr) - (_pos)));		\
+} while (0)
+
+#define array_remove_item(_array, _nr, _pos)				\
+	array_remove_items(_array, _nr, _pos, 1)
+
+static inline void __move_gap(void *array, size_t element_size,
+			      size_t nr, size_t size,
+			      size_t old_gap, size_t new_gap)
+{
+	size_t gap_end = old_gap + size - nr;
+
+	if (new_gap < old_gap) {
+		size_t move = old_gap - new_gap;
+
+		memmove(array + element_size * (gap_end - move),
+			array + element_size * (old_gap - move),
+				element_size * move);
+	} else if (new_gap > old_gap) {
+		size_t move = new_gap - old_gap;
+
+		memmove(array + element_size * old_gap,
+			array + element_size * gap_end,
+				element_size * move);
+	}
+}
+
+/* Move the gap in a gap buffer: */
+#define move_gap(_array, _nr, _size, _old_gap, _new_gap)	\
+	__move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap)
+
+#define bubble_sort(_base, _nr, _cmp)					\
+do {									\
+	ssize_t _i, _last;						\
+	bool _swapped = true;						\
+									\
+	for (_last= (ssize_t) (_nr) - 1; _last > 0 && _swapped; --_last) {\
+		_swapped = false;					\
+		for (_i = 0; _i < _last; _i++)				\
+			if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) {	\
+				swap((_base)[_i], (_base)[_i + 1]);	\
+				_swapped = true;			\
+			}						\
+	}								\
+} while (0)
+
+static inline u64 percpu_u64_get(u64 __percpu *src)
+{
+	u64 ret = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		ret += *per_cpu_ptr(src, cpu);
+	return ret;
+}
+
+static inline void percpu_u64_set(u64 __percpu *dst, u64 src)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		*per_cpu_ptr(dst, cpu) = 0;
+	this_cpu_write(*dst, src);
+}
+
+static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr)
+{
+	unsigned i;
+
+	for (i = 0; i < nr; i++)
+		acc[i] += src[i];
+}
+
+static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src,
+				   unsigned nr)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		acc_u64s(acc, per_cpu_ptr(src, cpu), nr);
+}
+
+static inline void percpu_memset(void __percpu *p, int c, size_t bytes)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		memset(per_cpu_ptr(p, cpu), c, bytes);
+}
+
+u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
+
+#define cmp_int(l, r)		(((l) > (r)) - ((l) < (r)))	/* -1, 0 or 1; evaluates each argument twice */
+
+static inline int u8_cmp(u8 l, u8 r)
+{
+	return cmp_int(l, r);
+}
+
+static inline int cmp_le32(__le32 l, __le32 r)
+{
+	return cmp_int(le32_to_cpu(l), le32_to_cpu(r));
+}
+
+#include <linux/uuid.h>
+
+#endif /* _BCACHEFS_UTIL_H */
diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c
new file mode 100644
index 000000000000..cb4f33ed9ab3
--- /dev/null
+++ b/fs/bcachefs/varint.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bitops.h>
+#include <linux/math.h>
+#include <linux/string.h>
+#include <asm/unaligned.h>
+
+#ifdef CONFIG_VALGRIND
+#include <valgrind/memcheck.h>
+#endif
+
+#include "varint.h"
+
+/**
+ * bch2_varint_encode - encode a variable length integer
+ * @out:	destination to encode to
+ * @v:		unsigned integer to encode
+ * Returns:	size in bytes of the encoded integer - at most 9 bytes
+ */
+int bch2_varint_encode(u8 *out, u64 v)
+{
+	unsigned bits = fls64(v|1);
+	unsigned bytes = DIV_ROUND_UP(bits, 7);
+	__le64 v_le;
+
+	if (likely(bytes < 9)) {
+		v <<= bytes;
+		v |= ~(~0ULL << (bytes - 1));	/* length prefix: bytes - 1 one bits below a zero bit */
+		v_le = cpu_to_le64(v);
+		memcpy(out, &v_le, bytes);
+	} else {
+		*out++ = 255;			/* marker byte, followed by the raw 8-byte value */
+		bytes = 9;
+		put_unaligned_le64(v, out);
+	}
+
+	return bytes;
+}
+
+/**
+ * bch2_varint_decode - decode a variable length integer
+ * @in:		varint to decode
+ * @end:	end of buffer to decode from
+ * @out:	on success, decoded integer
+ * Returns:	size in bytes of the decoded integer - or -1 on failure (would
+ * have read past the end of the buffer)
+ */
+int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
+{
+	unsigned bytes = likely(in < end)
+		? ffz(*in & 255) + 1
+		: 1;	/* empty buffer: 1 makes the bounds check below fail */
+	u64 v;
+
+	if (unlikely(in + bytes > end))
+		return -1;
+
+	if (likely(bytes < 9)) {
+		__le64 v_le = 0;
+
+		memcpy(&v_le, in, bytes);
+		v = le64_to_cpu(v_le);
+		v >>= bytes;	/* drop the length-prefix bits */
+	} else {
+		v = get_unaligned_le64(++in);	/* 9-byte form: skip the first byte, value is the next 8 */
+	}
+
+	*out = v;
+	return bytes;
+}
+
+/**
+ * bch2_varint_encode_fast - fast version of bch2_varint_encode
+ * @out:	destination to encode to
+ * @v:		unsigned integer to encode
+ * Returns:	size in bytes of the encoded integer - at most 9 bytes
+ *
+ * This version assumes it's always safe to write 8 bytes to @out, even if the
+ * encoded integer would be smaller.
+ */
+int bch2_varint_encode_fast(u8 *out, u64 v)
+{
+	unsigned bits = fls64(v|1);
+	unsigned bytes = DIV_ROUND_UP(bits, 7);
+
+	if (likely(bytes < 9)) {
+		v <<= bytes;
+		v |= ~(~0ULL << (bytes - 1));	/* length prefix: bytes - 1 one bits below a zero bit */
+	} else {
+		*out++ = 255;			/* marker byte, followed by the raw 8-byte value */
+		bytes = 9;
+	}
+
+	put_unaligned_le64(v, out);		/* always stores 8 bytes, even for shorter encodings */
+	return bytes;
+}
+
+/**
+ * bch2_varint_decode_fast - fast version of bch2_varint_decode
+ * @in:		varint to decode
+ * @end:	end of buffer to decode from
+ * @out:	on success, decoded integer
+ * Returns:	size in bytes of the decoded integer - or -1 on failure (would
+ * have read past the end of the buffer)
+ *
+ * This version assumes that it is safe to read at most 8 bytes past the end of
+ * @end (we still return an error if the varint extends past @end).
+ */
+int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out)
+{
+#ifdef CONFIG_VALGRIND
+	VALGRIND_MAKE_MEM_DEFINED(in, 8);
+#endif
+	u64 v = get_unaligned_le64(in);
+	unsigned bytes = ffz(*in) + 1;
+
+	if (unlikely(in + bytes > end))
+		return -1;
+
+	if (likely(bytes < 9)) {
+		v >>= bytes;
+		v &= ~(~0ULL << (7 * bytes));
+	} else {
+		v = get_unaligned_le64(++in);
+	}
+
+	*out = v;
+	return bytes;
+}
diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h
new file mode 100644
index 000000000000..92a182fb3d7a
--- /dev/null
+++ b/fs/bcachefs/varint.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_VARINT_H
+#define _BCACHEFS_VARINT_H
+
+int bch2_varint_encode(u8 *, u64);
+int bch2_varint_decode(const u8 *, const u8 *, u64 *);
+
+int bch2_varint_encode_fast(u8 *, u64);
+int bch2_varint_decode_fast(const u8 *, const u8 *, u64 *);
+
+#endif /* _BCACHEFS_VARINT_H */
diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h
new file mode 100644
index 000000000000..a6561b4b36a6
--- /dev/null
+++ b/fs/bcachefs/vstructs.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _VSTRUCTS_H
+#define _VSTRUCTS_H
+
+#include "util.h"
+
+/*
+ * NOTE: we can't differentiate between __le64 and u64 with type_is - this
+ * assumes u64 is little endian:
+ */
+#define __vstruct_u64s(_s)						\
+({									\
+	( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s)		\
+	: type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s)		\
+	: type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s)		\
+	: ((__force u8) ((_s)->u64s)));						\
+})
+
+#define __vstruct_bytes(_type, _u64s)					\
+({									\
+	BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64));		\
+									\
+	(size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64));	\
+})
+
+#define vstruct_bytes(_s)						\
+	__vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s))
+
+#define __vstruct_blocks(_type, _sector_block_bits, _u64s)		\
+	(round_up(__vstruct_bytes(_type, _u64s),			\
+		  512 << (_sector_block_bits)) >> (9 + (_sector_block_bits)))
+
+#define vstruct_blocks(_s, _sector_block_bits)				\
+	__vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s))
+
+#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s)		\
+	__vstruct_blocks(typeof(*(_s)), _sector_block_bits,		\
+			 __vstruct_u64s(_s) + (_u64s))
+
+#define vstruct_sectors(_s, _sector_block_bits)				\
+	(round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9)
+
+#define vstruct_next(_s)						\
+	((typeof(_s))			((u64 *) (_s)->_data + __vstruct_u64s(_s)))
+#define vstruct_last(_s)						\
+	((typeof(&(_s)->start[0]))	((u64 *) (_s)->_data + __vstruct_u64s(_s)))
+#define vstruct_end(_s)							\
+	((void *)			((u64 *) (_s)->_data + __vstruct_u64s(_s)))
+
+#define vstruct_for_each(_s, _i)					\
+	for (_i = (_s)->start;						\
+	     _i < vstruct_last(_s);					\
+	     _i = vstruct_next(_i))
+
+#define vstruct_for_each_safe(_s, _i, _t)				\
+	for (_i = (_s)->start;						\
+	     _i < vstruct_last(_s) && (_t = vstruct_next(_i), true);	\
+	     _i = _t)
+
+#define vstruct_idx(_s, _idx)						\
+	((typeof(&(_s)->start[0])) ((_s)->_data + (_idx)))
+
+#endif /* _VSTRUCTS_H */
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
new file mode 100644
index 000000000000..b069b1a62e25
--- /dev/null
+++ b/fs/bcachefs/xattr.c
@@ -0,0 +1,651 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "acl.h"
+#include "bkey_methods.h"
+#include "btree_update.h"
+#include "extents.h"
+#include "fs.h"
+#include "rebalance.h"
+#include "str_hash.h"
+#include "xattr.h"
+
+#include <linux/dcache.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned);
+
+static u64 bch2_xattr_hash(const struct bch_hash_info *info,
+			  const struct xattr_search_key *key)
+{
+	struct bch_str_hash_ctx ctx;
+
+	bch2_str_hash_init(&ctx, info);
+	bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type));
+	bch2_str_hash_update(&ctx, info, key->name.name, key->name.len);
+
+	return bch2_str_hash_end(&ctx, info);
+}
+
+static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key)
+{
+	return bch2_xattr_hash(info, key);
+}
+
+static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
+{
+	struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k);
+
+	return bch2_xattr_hash(info,
+		 &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len));
+}
+
+static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r)
+{
+	struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
+	const struct xattr_search_key *r = _r;	/* @_r is treated as a search key */
+
+	return l.v->x_type != r->type ||	/* true means "does NOT match" */
+		l.v->x_name_len != r->name.len ||
+		memcmp(l.v->x_name, r->name.name, r->name.len);
+}
+
+static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
+{
+	struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
+	struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r);
+
+	return l.v->x_type != r.v->x_type ||	/* true means "does NOT match" */
+		l.v->x_name_len != r.v->x_name_len ||
+		memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len);
+}
+
+const struct bch_hash_desc bch2_xattr_hash_desc = {
+	.btree_id	= BTREE_ID_xattrs,
+	.key_type	= KEY_TYPE_xattr,
+	.hash_key	= xattr_hash_key,
+	.hash_bkey	= xattr_hash_bkey,
+	.cmp_key	= xattr_cmp_key,
+	.cmp_bkey	= xattr_cmp_bkey,
+};
+
+int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k,
+		       enum bkey_invalid_flags flags,
+		       struct printbuf *err)
+{
+	const struct xattr_handler *handler;
+	struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
+
+	if (bkey_val_u64s(k.k) <
+	    xattr_val_u64s(xattr.v->x_name_len,
+			   le16_to_cpu(xattr.v->x_val_len))) {
+		prt_printf(err, "value too small (%zu < %u)",
+		       bkey_val_u64s(k.k),
+		       xattr_val_u64s(xattr.v->x_name_len,
+				      le16_to_cpu(xattr.v->x_val_len)));
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	/* XXX why +4 ? */
+	if (bkey_val_u64s(k.k) >
+	    xattr_val_u64s(xattr.v->x_name_len,
+			   le16_to_cpu(xattr.v->x_val_len) + 4)) {
+		prt_printf(err, "value too big (%zu > %u)",
+		       bkey_val_u64s(k.k),
+		       xattr_val_u64s(xattr.v->x_name_len,
+				      le16_to_cpu(xattr.v->x_val_len) + 4));
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	handler = bch2_xattr_type_to_handler(xattr.v->x_type);
+	if (!handler) {
+		prt_printf(err, "invalid type (%u)", xattr.v->x_type);
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) {
+		prt_printf(err, "xattr name has invalid characters");
+		return -BCH_ERR_invalid_bkey;
+	}
+
+	return 0;
+}
+
+void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
+			struct bkey_s_c k)
+{
+	const struct xattr_handler *handler;
+	struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
+
+	handler = bch2_xattr_type_to_handler(xattr.v->x_type);
+	if (handler && handler->prefix)
+		prt_printf(out, "%s", handler->prefix);
+	else if (handler)
+		prt_printf(out, "(type %u)", xattr.v->x_type);
+	else
+		prt_printf(out, "(unknown type %u)", xattr.v->x_type);
+
+	prt_printf(out, "%.*s:%.*s",
+	       xattr.v->x_name_len,
+	       xattr.v->x_name,
+	       le16_to_cpu(xattr.v->x_val_len),
+	       (char *) xattr_val(xattr.v));
+
+	if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS ||
+	    xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) {
+		prt_char(out, ' ');
+		bch2_acl_to_text(out, xattr_val(xattr.v),
+				 le16_to_cpu(xattr.v->x_val_len));
+	}
+}
+
+static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode,
+				const char *name, void *buffer, size_t size, int type)
+{
+	struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode);
+	struct xattr_search_key search = X_SEARCH(type, name, strlen(name));
+	struct btree_iter iter;
+	struct bkey_s_c_xattr xattr;
+	struct bkey_s_c k;
+	int ret;
+
+	ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash,
+			       inode_inum(inode), &search, 0);
+	if (ret)
+		goto err1;	/* lookup failed: skip the iterator exit at err2 */
+
+	k = bch2_btree_iter_peek_slot(&iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto err2;
+
+	xattr = bkey_s_c_to_xattr(k);
+	ret = le16_to_cpu(xattr.v->x_val_len);	/* success value: length of the xattr value */
+	if (buffer) {				/* NULL @buffer: length query, nothing is copied */
+		if (ret > size)
+			ret = -ERANGE;
+		else
+			memcpy(buffer, xattr_val(xattr.v), ret);
+	}
+err2:
+	bch2_trans_iter_exit(trans, &iter);
+err1:
+	return ret < 0 && bch2_err_matches(ret, ENOENT) ? -ENODATA : ret;	/* ENOENT-class -> -ENODATA */
+}
+
+int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
+		   struct bch_inode_unpacked *inode_u,
+		   const struct bch_hash_info *hash_info,
+		   const char *name, const void *value, size_t size,
+		   int type, int flags)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter inode_iter = { NULL };
+	int ret;
+
+	ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
+	if (ret)
+		return ret;
+
+	inode_u->bi_ctime = bch2_current_time(c);
+
+	ret = bch2_inode_write(trans, &inode_iter, inode_u);
+	bch2_trans_iter_exit(trans, &inode_iter);
+
+	if (ret)
+		return ret;
+
+	if (value) {
+		struct bkey_i_xattr *xattr;
+		unsigned namelen = strlen(name);
+		unsigned u64s = BKEY_U64s +
+			xattr_val_u64s(namelen, size);
+
+		if (u64s > U8_MAX)
+			return -ERANGE;
+
+		xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
+		if (IS_ERR(xattr))
+			return PTR_ERR(xattr);
+
+		bkey_xattr_init(&xattr->k_i);
+		xattr->k.u64s		= u64s;
+		xattr->v.x_type		= type;
+		xattr->v.x_name_len	= namelen;
+		xattr->v.x_val_len	= cpu_to_le16(size);
+		memcpy(xattr->v.x_name, name, namelen);
+		memcpy(xattr_val(&xattr->v), value, size);
+
+		ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
+			      inum, &xattr->k_i,
+			      (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)|
+			      (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0));
+	} else {
+		struct xattr_search_key search =
+			X_SEARCH(type, name, strlen(name));
+
+		ret = bch2_hash_delete(trans, bch2_xattr_hash_desc,
+				       hash_info, inum, &search);
+	}
+
+	if (bch2_err_matches(ret, ENOENT))
+		ret = flags & XATTR_REPLACE ? -ENODATA : 0;
+
+	return ret;
+}
+
+struct xattr_buf {
+	char		*buf;
+	size_t		len;
+	size_t		used;
+};
+
+static int __bch2_xattr_emit(const char *prefix,
+			     const char *name, size_t name_len,
+			     struct xattr_buf *buf)
+{
+	const size_t prefix_len = strlen(prefix);
+	const size_t total_len = prefix_len + name_len + 1;	/* +1 for the '\0' written below */
+
+	if (buf->buf) {		/* NULL buf->buf: only tally the space required */
+		if (buf->used + total_len > buf->len)
+			return -ERANGE;
+
+		memcpy(buf->buf + buf->used, prefix, prefix_len);
+		memcpy(buf->buf + buf->used + prefix_len,
+		       name, name_len);
+		buf->buf[buf->used + prefix_len + name_len] = '\0';
+	}
+
+	buf->used += total_len;	/* advanced in both modes */
+	return 0;
+}
+
+static int bch2_xattr_emit(struct dentry *dentry,
+			    const struct bch_xattr *xattr,
+			    struct xattr_buf *buf)
+{
+	const struct xattr_handler *handler =
+		bch2_xattr_type_to_handler(xattr->x_type);
+
+	if (!handler || (handler->list && !handler->list(dentry)))
+		return 0;	/* no handler for this type, or its ->list() hook declined */
+	return __bch2_xattr_emit(handler->prefix ?: handler->name,
+				 xattr->x_name, xattr->x_name_len, buf);
+}
+
+static int bch2_xattr_list_bcachefs(struct bch_fs *c,
+				    struct bch_inode_unpacked *inode,
+				    struct xattr_buf *buf,
+				    bool all)
+{
+	const char *prefix = all ? "bcachefs_effective." : "bcachefs.";
+	unsigned id;
+	int ret = 0;
+	u64 v;
+
+	for (id = 0; id < Inode_opt_nr; id++) {
+		v = bch2_inode_opt_get(inode, id);
+		if (!v)		/* a zero option value is not listed */
+			continue;
+
+		if (!all &&	/* "bcachefs." lists only options flagged in bi_fields_set */
+		    !(inode->bi_fields_set & (1U << id)))
+			continue;
+
+		ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id],
+					strlen(bch2_inode_opts[id]), buf);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+	struct bch_fs *c = dentry->d_sb->s_fs_info;
+	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
+	u64 offset = 0, inum = inode->ei_inode.bi_inum;
+	u32 snapshot;
+	int ret;
+retry:
+	bch2_trans_begin(trans);
+	iter = (struct btree_iter) { NULL };
+
+	ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
+	if (ret)
+		goto err;
+
+	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_xattrs,
+			   SPOS(inum, offset, snapshot),
+			   POS(inum, U64_MAX), 0, k, ret) {
+		if (k.k->type != KEY_TYPE_xattr)
+			continue;
+
+		ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf);
+		if (ret)
+			break;
+	}
+
+	offset = iter.pos.offset;
+	bch2_trans_iter_exit(trans, &iter);
+err:
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		goto retry;
+
+	bch2_trans_put(trans);
+
+	if (ret)
+		goto out;
+
+	ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false);
+	if (ret)
+		goto out;
+
+	ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
+	if (ret)
+		goto out;
+
+	return buf.used;
+out:
+	return bch2_err_class(ret);
+}
+
+static int bch2_xattr_get_handler(const struct xattr_handler *handler,
+				  struct dentry *dentry, struct inode *vinode,
+				  const char *name, void *buffer, size_t size)
+{
+	struct bch_inode_info *inode = to_bch_ei(vinode);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	int ret = bch2_trans_do(c, NULL, NULL, 0,
+		bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags));
+
+	return bch2_err_class(ret);
+}
+
+static int bch2_xattr_set_handler(const struct xattr_handler *handler,
+				  struct mnt_idmap *idmap,
+				  struct dentry *dentry, struct inode *vinode,
+				  const char *name, const void *value,
+				  size_t size, int flags)
+{
+	struct bch_inode_info *inode = to_bch_ei(vinode);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
+	struct bch_inode_unpacked inode_u;
+	int ret;
+
+	ret = bch2_trans_run(c,
+		commit_do(trans, NULL, NULL, 0,
+			bch2_xattr_set(trans, inode_inum(inode), &inode_u,
+				       &hash, name, value, size,
+				       handler->flags, flags)) ?:
+		(bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME), 0));
+
+	return bch2_err_class(ret);
+}
+
+static const struct xattr_handler bch_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.get	= bch2_xattr_get_handler,
+	.set	= bch2_xattr_set_handler,
+	.flags	= KEY_TYPE_XATTR_INDEX_USER,
+};
+
+static bool bch2_xattr_trusted_list(struct dentry *dentry)
+{
+	return capable(CAP_SYS_ADMIN);
+}
+
+static const struct xattr_handler bch_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.list	= bch2_xattr_trusted_list,
+	.get	= bch2_xattr_get_handler,
+	.set	= bch2_xattr_set_handler,
+	.flags	= KEY_TYPE_XATTR_INDEX_TRUSTED,
+};
+
+static const struct xattr_handler bch_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.get	= bch2_xattr_get_handler,
+	.set	= bch2_xattr_set_handler,
+	.flags	= KEY_TYPE_XATTR_INDEX_SECURITY,
+};
+
+#ifndef NO_BCACHEFS_FS
+
+static int opt_to_inode_opt(int id)
+{
+	switch (id) {
+#define x(name, ...)				\
+	case Opt_##name: return Inode_opt_##name;
+	BCH_INODE_OPTS()
+#undef  x
+	default:
+		return -1;
+	}
+}
+
+static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
+				struct dentry *dentry, struct inode *vinode,
+				const char *name, void *buffer, size_t size,
+				bool all)
+{
+	struct bch_inode_info *inode = to_bch_ei(vinode);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_opts opts =
+		bch2_inode_opts_to_opts(&inode->ei_inode);
+	const struct bch_option *opt;
+	int id, inode_opt_id;
+	struct printbuf out = PRINTBUF;
+	int ret;
+	u64 v;
+
+	id = bch2_opt_lookup(name);
+	if (id < 0 || !bch2_opt_is_inode_opt(id))
+		return -EINVAL;
+
+	inode_opt_id = opt_to_inode_opt(id);
+	if (inode_opt_id < 0)
+		return -EINVAL;
+
+	opt = bch2_opt_table + id;
+
+	if (!bch2_opt_defined_by_id(&opts, id))
+		return -ENODATA;
+
+	if (!all &&	/* !all: report only options flagged in bi_fields_set */
+	    !(inode->ei_inode.bi_fields_set & (1U << inode_opt_id)))
+		return -ENODATA;
+
+	v = bch2_opt_get_by_id(&opts, id);
+	bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0);
+
+	ret = out.pos;	/* success value: length of the formatted option text */
+
+	if (out.allocation_failure) {
+		ret = -ENOMEM;
+	} else if (buffer) {	/* NULL @buffer: length query, nothing is copied */
+		if (out.pos > size)
+			ret = -ERANGE;
+		else
+			memcpy(buffer, out.buf, out.pos);
+	}
+
+	printbuf_exit(&out);
+	return ret;
+}
+
+static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
+				   struct dentry *dentry, struct inode *vinode,
+				   const char *name, void *buffer, size_t size)
+{
+	return __bch2_xattr_bcachefs_get(handler, dentry, vinode,
+					 name, buffer, size, false);
+}
+
+struct inode_opt_set {
+	int			id;
+	u64			v;
+	bool			defined;
+};
+
+static int inode_opt_set_fn(struct btree_trans *trans,
+			    struct bch_inode_info *inode,
+			    struct bch_inode_unpacked *bi,
+			    void *p)
+{
+	struct inode_opt_set *s = p;
+
+	if (s->defined)
+		bi->bi_fields_set |= 1U << s->id;
+	else
+		bi->bi_fields_set &= ~(1U << s->id);
+
+	bch2_inode_opt_set(bi, s->id, s->v);
+
+	return 0;
+}
+
+static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
+				   struct mnt_idmap *idmap,
+				   struct dentry *dentry, struct inode *vinode,
+				   const char *name, const void *value,
+				   size_t size, int flags)
+{
+	struct bch_inode_info *inode = to_bch_ei(vinode);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	const struct bch_option *opt;
+	char *buf;
+	struct inode_opt_set s;
+	int opt_id, inode_opt_id, ret;
+
+	opt_id = bch2_opt_lookup(name);
+	if (opt_id < 0)
+		return -EINVAL;
+
+	opt = bch2_opt_table + opt_id;
+
+	inode_opt_id = opt_to_inode_opt(opt_id);
+	if (inode_opt_id < 0)
+		return -EINVAL;
+
+	s.id = inode_opt_id;
+
+	if (value) {
+		u64 v = 0;
+
+		buf = kmalloc(size + 1, GFP_KERNEL);
+		if (!buf)
+			return -ENOMEM;
+		memcpy(buf, value, size);
+		buf[size] = '\0';
+
+		ret = bch2_opt_parse(c, opt, buf, &v, NULL);
+		kfree(buf);
+
+		if (ret < 0)
+			return ret;
+
+		ret = bch2_opt_check_may_set(c, opt_id, v);
+		if (ret < 0)
+			return ret;
+
+		s.v = v + 1;
+		s.defined = true;
+	} else {
+		if (!IS_ROOT(dentry)) {
+			struct bch_inode_info *dir =
+				to_bch_ei(d_inode(dentry->d_parent));
+
+			s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id);
+		} else {
+			s.v = 0;
+		}
+
+		s.defined = false;
+	}
+
+	mutex_lock(&inode->ei_update_lock);
+	if (inode_opt_id == Inode_opt_project) {
+		/*
+		 * inode fields accessible via the xattr interface are stored
+		 * with a +1 bias, so that 0 means unset:
+		 */
+		ret = bch2_set_projid(c, inode, s.v ? s.v - 1 : 0);
+		if (ret)
+			goto err;
+	}
+
+	ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0);
+err:
+	mutex_unlock(&inode->ei_update_lock);
+
+	if (value &&
+	    (opt_id == Opt_background_compression ||
+	     opt_id == Opt_background_target))
+		bch2_rebalance_add_work(c, inode->v.i_blocks);
+
+	return bch2_err_class(ret);
+}
+
+static const struct xattr_handler bch_xattr_bcachefs_handler = {
+	.prefix	= "bcachefs.",
+	.get	= bch2_xattr_bcachefs_get,
+	.set	= bch2_xattr_bcachefs_set,
+};
+
+static int bch2_xattr_bcachefs_get_effective(
+				const struct xattr_handler *handler,
+				struct dentry *dentry, struct inode *vinode,
+				const char *name, void *buffer, size_t size)
+{
+	return __bch2_xattr_bcachefs_get(handler, dentry, vinode,
+					 name, buffer, size, true);
+}
+
+static const struct xattr_handler bch_xattr_bcachefs_effective_handler = {
+	.prefix	= "bcachefs_effective.",
+	.get	= bch2_xattr_bcachefs_get_effective,
+	.set	= bch2_xattr_bcachefs_set,
+};
+
+#endif /* NO_BCACHEFS_FS */
+
+const struct xattr_handler *bch2_xattr_handlers[] = {
+	&bch_xattr_user_handler,
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+	&nop_posix_acl_access,
+	&nop_posix_acl_default,
+#endif
+	&bch_xattr_trusted_handler,
+	&bch_xattr_security_handler,
+#ifndef NO_BCACHEFS_FS
+	&bch_xattr_bcachefs_handler,
+	&bch_xattr_bcachefs_effective_handler,
+#endif
+	NULL
+};
+
+static const struct xattr_handler *bch_xattr_handler_map[] = {
+	[KEY_TYPE_XATTR_INDEX_USER]			= &bch_xattr_user_handler,
+	[KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS]	=
+		&nop_posix_acl_access,
+	[KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT]	=
+		&nop_posix_acl_default,
+	[KEY_TYPE_XATTR_INDEX_TRUSTED]		= &bch_xattr_trusted_handler,
+	[KEY_TYPE_XATTR_INDEX_SECURITY]		= &bch_xattr_security_handler,
+};
+
+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type)
+{
+	if (type >= ARRAY_SIZE(bch_xattr_handler_map))
+		return NULL;	/* type index is past the end of the map */
+	return bch_xattr_handler_map[type];
+}
diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h
new file mode 100644
index 000000000000..f5a52e3a6016
--- /dev/null
+++ b/fs/bcachefs/xattr.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_XATTR_H
+#define _BCACHEFS_XATTR_H
+
+#include "str_hash.h"
+
+extern const struct bch_hash_desc bch2_xattr_hash_desc;
+
+int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c,
+		       enum bkey_invalid_flags, struct printbuf *);
+void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_xattr ((struct bkey_ops) {	\
+	.key_invalid	= bch2_xattr_invalid,		\
+	.val_to_text	= bch2_xattr_to_text,		\
+	.min_val_size	= 8,				\
+})
+
+static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
+{
+	return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) +
+			    name_len + val_len, sizeof(u64));
+}
+
+#define xattr_val(_xattr)					\
+	((void *) (_xattr)->x_name + (_xattr)->x_name_len)
+
+struct xattr_search_key {
+	u8		type;
+	struct qstr	name;
+};
+
+#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key)	\
+	{ .type = _type, .name = QSTR_INIT(_name, _len) })
+
+struct dentry;
+struct xattr_handler;
+struct bch_hash_info;
+struct bch_inode_info;
+
+/* Exported for cmd_migrate.c in tools: */
+int bch2_xattr_set(struct btree_trans *, subvol_inum,
+		   struct bch_inode_unpacked *, const struct bch_hash_info *,
+		   const char *, const void *, size_t, int, int);
+
+ssize_t bch2_xattr_list(struct dentry *, char *, size_t);
+
+extern const struct xattr_handler *bch2_xattr_handlers[];
+
+#endif /* _BCACHEFS_XATTR_H */
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 9a16a51fbb88..9acdec56f626 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -360,11 +360,11 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
 	 * for indexing purposes. (PFD, page 54)
 	 */
 
-	inode->i_mtime.tv_sec =
-	    fs64_to_cpu(sb, raw_inode->last_modified_time) >> 16;
-	inode->i_mtime.tv_nsec = 0;   /* lower 16 bits are not a time */
-	inode_set_ctime_to_ts(inode, inode->i_mtime);
-	inode->i_atime = inode->i_mtime;
+	inode_set_mtime(inode,
+			fs64_to_cpu(sb, raw_inode->last_modified_time) >> 16,
+			0);/* lower 16 bits are not a time */
+	inode_set_ctime_to_ts(inode, inode_get_mtime(inode));
+	inode_set_atime_to_ts(inode, inode_get_mtime(inode));
 
 	befs_ino->i_inode_num = fsrun_to_cpu(sb, raw_inode->inode_num);
 	befs_ino->i_parent = fsrun_to_cpu(sb, raw_inode->parent);
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 12b8af04dcb3..fbc4ae80a4b2 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -97,7 +97,7 @@ static int bfs_create(struct mnt_idmap *idmap, struct inode *dir,
 	set_bit(ino, info->si_imap);
 	info->si_freei--;
 	inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	inode->i_blocks = 0;
 	inode->i_op = &bfs_file_inops;
 	inode->i_fop = &bfs_file_operations;
@@ -187,7 +187,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry)
 	}
 	de->ino = 0;
 	mark_buffer_dirty_inode(bh, dir);
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	mark_inode_dirty(dir);
 	inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
 	inode_dec_link_count(inode);
@@ -240,7 +240,7 @@ static int bfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 			goto end_rename;
 	}
 	old_de->ino = 0;
-	old_dir->i_mtime = inode_set_ctime_current(old_dir);
+	inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
 	mark_inode_dirty(old_dir);
 	if (new_inode) {
 		inode_set_ctime_current(new_inode);
@@ -294,7 +294,8 @@ static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino)
 					dir->i_size += BFS_DIRENT_SIZE;
 					inode_set_ctime_current(dir);
 				}
-				dir->i_mtime = inode_set_ctime_current(dir);
+				inode_set_mtime_to_ts(dir,
+						      inode_set_ctime_current(dir));
 				mark_inode_dirty(dir);
 				de->ino = cpu_to_le16((u16)ino);
 				for (i = 0; i < BFS_NAMELEN; i++)
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index e6a76ae9eb44..355957dbce39 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -80,11 +80,9 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
 	set_nlink(inode, le32_to_cpu(di->i_nlink));
 	inode->i_size = BFS_FILESIZE(di);
 	inode->i_blocks = BFS_FILEBLOCKS(di);
-	inode->i_atime.tv_sec =  le32_to_cpu(di->i_atime);
-	inode->i_mtime.tv_sec =  le32_to_cpu(di->i_mtime);
+	inode_set_atime(inode, le32_to_cpu(di->i_atime), 0);
+	inode_set_mtime(inode, le32_to_cpu(di->i_mtime), 0);
 	inode_set_ctime(inode, le32_to_cpu(di->i_ctime), 0);
-	inode->i_atime.tv_nsec = 0;
-	inode->i_mtime.tv_nsec = 0;
 
 	brelse(bh);
 	unlock_new_inode(inode);
@@ -140,9 +138,9 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	di->i_uid = cpu_to_le32(i_uid_read(inode));
 	di->i_gid = cpu_to_le32(i_gid_read(inode));
 	di->i_nlink = cpu_to_le32(inode->i_nlink);
-	di->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
-	di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
-	di->i_ctime = cpu_to_le32(inode_get_ctime(inode).tv_sec);
+	di->i_atime = cpu_to_le32(inode_get_atime_sec(inode));
+	di->i_mtime = cpu_to_le32(inode_get_mtime_sec(inode));
+	di->i_ctime = cpu_to_le32(inode_get_ctime_sec(inode));
 	i_sblock = BFS_I(inode)->i_sblock;
 	di->i_sblock = cpu_to_le32(i_sblock);
 	di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 7b3d2d491407..5397b552fbeb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -110,38 +110,19 @@ static struct linux_binfmt elf_format = {
 
 #define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE))
 
-static int set_brk(unsigned long start, unsigned long end, int prot)
-{
-	start = ELF_PAGEALIGN(start);
-	end = ELF_PAGEALIGN(end);
-	if (end > start) {
-		/*
-		 * Map the last of the bss segment.
-		 * If the header is requesting these pages to be
-		 * executable, honour that (ppc32 needs this).
-		 */
-		int error = vm_brk_flags(start, end - start,
-				prot & PROT_EXEC ? VM_EXEC : 0);
-		if (error)
-			return error;
-	}
-	current->mm->start_brk = current->mm->brk = end;
-	return 0;
-}
-
-/* We need to explicitly zero any fractional pages
-   after the data section (i.e. bss).  This would
-   contain the junk from the file that should not
-   be in memory
+/*
+ * We need to explicitly zero any trailing portion of the page that follows
+ * p_filesz when it ends before the page ends (e.g. bss), otherwise this
+ * memory will contain the junk from the file that should not be present.
  */
-static int padzero(unsigned long elf_bss)
+static int padzero(unsigned long address)
 {
 	unsigned long nbyte;
 
-	nbyte = ELF_PAGEOFFSET(elf_bss);
+	nbyte = ELF_PAGEOFFSET(address);
 	if (nbyte) {
 		nbyte = ELF_MIN_ALIGN - nbyte;
-		if (clear_user((void __user *) elf_bss, nbyte))
+		if (clear_user((void __user *)address, nbyte))
 			return -EFAULT;
 	}
 	return 0;
@@ -367,6 +348,11 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
 	return 0;
 }
 
+/*
+ * Map "eppnt->p_filesz" bytes from "filep" offset "eppnt->p_offset"
+ * into memory at "addr". (Note that p_filesz is rounded up to the
+ * next page, so any extra bytes from the file must be wiped.)
+ */
 static unsigned long elf_map(struct file *filep, unsigned long addr,
 		const struct elf_phdr *eppnt, int prot, int type,
 		unsigned long total_size)
@@ -406,6 +392,60 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
 	return(map_addr);
 }
 
+/*
+ * Map "eppnt->p_filesz" bytes from "filep" offset "eppnt->p_offset"
+ * into memory at "addr". Memory from "p_filesz" through "p_memsz"
+ * rounded up to the next page is zeroed.
+ */
+static unsigned long elf_load(struct file *filep, unsigned long addr,
+		const struct elf_phdr *eppnt, int prot, int type,
+		unsigned long total_size)
+{
+	unsigned long zero_start, zero_end;
+	unsigned long map_addr;
+
+	if (eppnt->p_filesz) {
+		map_addr = elf_map(filep, addr, eppnt, prot, type, total_size);
+		if (BAD_ADDR(map_addr))
+			return map_addr;
+		if (eppnt->p_memsz > eppnt->p_filesz) {
+			zero_start = map_addr + ELF_PAGEOFFSET(eppnt->p_vaddr) +
+				eppnt->p_filesz;
+			zero_end = map_addr + ELF_PAGEOFFSET(eppnt->p_vaddr) +
+				eppnt->p_memsz;
+
+			/*
+			 * Zero the end of the last mapped page but ignore
+			 * any errors if the segment isn't writable.
+			 */
+			if (padzero(zero_start) && (prot & PROT_WRITE))
+				return -EFAULT;
+		}
+	} else {
+		map_addr = zero_start = ELF_PAGESTART(addr);
+		zero_end = zero_start + ELF_PAGEOFFSET(eppnt->p_vaddr) +
+			eppnt->p_memsz;
+	}
+	if (eppnt->p_memsz > eppnt->p_filesz) {
+		/*
+		 * Map the last of the segment.
+		 * If the header is requesting these pages to be
+		 * executable, honour that (ppc32 needs this).
+		 */
+		int error;
+
+		zero_start = ELF_PAGEALIGN(zero_start);
+		zero_end = ELF_PAGEALIGN(zero_end);
+
+		error = vm_brk_flags(zero_start, zero_end - zero_start,
+				     prot & PROT_EXEC ? VM_EXEC : 0);
+		if (error)
+			map_addr = error;
+	}
+	return map_addr;
+}
+
+
 static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr)
 {
 	elf_addr_t min_addr = -1;
@@ -596,8 +636,6 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 	struct elf_phdr *eppnt;
 	unsigned long load_addr = 0;
 	int load_addr_set = 0;
-	unsigned long last_bss = 0, elf_bss = 0;
-	int bss_prot = 0;
 	unsigned long error = ~0UL;
 	unsigned long total_size;
 	int i;
@@ -634,7 +672,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 			else if (no_base && interp_elf_ex->e_type == ET_DYN)
 				load_addr = -vaddr;
 
-			map_addr = elf_map(interpreter, load_addr + vaddr,
+			map_addr = elf_load(interpreter, load_addr + vaddr,
 					eppnt, elf_prot, elf_type, total_size);
 			total_size = 0;
 			error = map_addr;
@@ -660,51 +698,9 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 				error = -ENOMEM;
 				goto out;
 			}
-
-			/*
-			 * Find the end of the file mapping for this phdr, and
-			 * keep track of the largest address we see for this.
-			 */
-			k = load_addr + eppnt->p_vaddr + eppnt->p_filesz;
-			if (k > elf_bss)
-				elf_bss = k;
-
-			/*
-			 * Do the same thing for the memory mapping - between
-			 * elf_bss and last_bss is the bss section.
-			 */
-			k = load_addr + eppnt->p_vaddr + eppnt->p_memsz;
-			if (k > last_bss) {
-				last_bss = k;
-				bss_prot = elf_prot;
-			}
 		}
 	}
 
-	/*
-	 * Now fill out the bss section: first pad the last page from
-	 * the file up to the page boundary, and zero it from elf_bss
-	 * up to the end of the page.
-	 */
-	if (padzero(elf_bss)) {
-		error = -EFAULT;
-		goto out;
-	}
-	/*
-	 * Next, align both the file and mem bss up to the page size,
-	 * since this is where elf_bss was just zeroed up to, and where
-	 * last_bss will end after the vm_brk_flags() below.
-	 */
-	elf_bss = ELF_PAGEALIGN(elf_bss);
-	last_bss = ELF_PAGEALIGN(last_bss);
-	/* Finally, if there is still more bss to allocate, do it. */
-	if (last_bss > elf_bss) {
-		error = vm_brk_flags(elf_bss, last_bss - elf_bss,
-				bss_prot & PROT_EXEC ? VM_EXEC : 0);
-		if (error)
-			goto out;
-	}
-
 	error = load_addr;
 out:
 	return error;
@@ -828,8 +824,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
 	unsigned long error;
 	struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
 	struct elf_phdr *elf_property_phdata = NULL;
-	unsigned long elf_bss, elf_brk;
-	int bss_prot = 0;
+	unsigned long elf_brk;
 	int retval, i;
 	unsigned long elf_entry;
 	unsigned long e_entry;
@@ -1020,7 +1015,6 @@ out_free_interp:
 	if (retval < 0)
 		goto out_free_dentry;
 
-	elf_bss = 0;
 	elf_brk = 0;
 
 	start_code = ~0UL;
@@ -1040,33 +1034,6 @@ out_free_interp:
 		if (elf_ppnt->p_type != PT_LOAD)
 			continue;
 
-		if (unlikely (elf_brk > elf_bss)) {
-			unsigned long nbyte;
-
-			/* There was a PT_LOAD segment with p_memsz > p_filesz
-			   before this one. Map anonymous pages, if needed,
-			   and clear the area.  */
-			retval = set_brk(elf_bss + load_bias,
-					 elf_brk + load_bias,
-					 bss_prot);
-			if (retval)
-				goto out_free_dentry;
-			nbyte = ELF_PAGEOFFSET(elf_bss);
-			if (nbyte) {
-				nbyte = ELF_MIN_ALIGN - nbyte;
-				if (nbyte > elf_brk - elf_bss)
-					nbyte = elf_brk - elf_bss;
-				if (clear_user((void __user *)elf_bss +
-							load_bias, nbyte)) {
-					/*
-					 * This bss-zeroing can fail if the ELF
-					 * file specifies odd protections. So
-					 * we don't check the return value
-					 */
-				}
-			}
-		}
-
 		elf_prot = make_prot(elf_ppnt->p_flags, &arch_state,
 				     !!interpreter, false);
 
@@ -1162,7 +1129,7 @@ out_free_interp:
 			}
 		}
 
-		error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
+		error = elf_load(bprm->file, load_bias + vaddr, elf_ppnt,
 				elf_prot, elf_flags, total_size);
 		if (BAD_ADDR(error)) {
 			retval = IS_ERR_VALUE(error) ?
@@ -1210,40 +1177,24 @@ out_free_interp:
 
 		k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz;
 
-		if (k > elf_bss)
-			elf_bss = k;
 		if ((elf_ppnt->p_flags & PF_X) && end_code < k)
 			end_code = k;
 		if (end_data < k)
 			end_data = k;
 		k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz;
-		if (k > elf_brk) {
-			bss_prot = elf_prot;
+		if (k > elf_brk)
 			elf_brk = k;
-		}
 	}
 
 	e_entry = elf_ex->e_entry + load_bias;
 	phdr_addr += load_bias;
-	elf_bss += load_bias;
 	elf_brk += load_bias;
 	start_code += load_bias;
 	end_code += load_bias;
 	start_data += load_bias;
 	end_data += load_bias;
 
-	/* Calling set_brk effectively mmaps the pages that we need
-	 * for the bss and break sections.  We must do this before
-	 * mapping in the interpreter, to make sure it doesn't wind
-	 * up getting placed where the bss needs to go.
-	 */
-	retval = set_brk(elf_bss, elf_brk, bss_prot);
-	if (retval)
-		goto out_free_dentry;
-	if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) {
-		retval = -EFAULT; /* Nobody gets to see this, but.. */
-		goto out_free_dentry;
-	}
+	current->mm->start_brk = current->mm->brk = ELF_PAGEALIGN(elf_brk);
 
 	if (interpreter) {
 		elf_entry = load_elf_interp(interp_elf_ex,
@@ -1369,7 +1320,6 @@ static int load_elf_library(struct file *file)
 {
 	struct elf_phdr *elf_phdata;
 	struct elf_phdr *eppnt;
-	unsigned long elf_bss, bss, len;
 	int retval, error, i, j;
 	struct elfhdr elf_ex;
 
@@ -1414,30 +1364,15 @@ static int load_elf_library(struct file *file)
 		eppnt++;
 
 	/* Now use mmap to map the library into memory. */
-	error = vm_mmap(file,
-			ELF_PAGESTART(eppnt->p_vaddr),
-			(eppnt->p_filesz +
-			 ELF_PAGEOFFSET(eppnt->p_vaddr)),
+	error = elf_load(file, ELF_PAGESTART(eppnt->p_vaddr),
+			eppnt,
 			PROT_READ | PROT_WRITE | PROT_EXEC,
 			MAP_FIXED_NOREPLACE | MAP_PRIVATE,
-			(eppnt->p_offset -
-			 ELF_PAGEOFFSET(eppnt->p_vaddr)));
+			0);
+
 	if (error != ELF_PAGESTART(eppnt->p_vaddr))
 		goto out_free_ph;
 
-	elf_bss = eppnt->p_vaddr + eppnt->p_filesz;
-	if (padzero(elf_bss)) {
-		error = -EFAULT;
-		goto out_free_ph;
-	}
-
-	len = ELF_PAGEALIGN(eppnt->p_filesz + eppnt->p_vaddr);
-	bss = ELF_PAGEALIGN(eppnt->p_memsz + eppnt->p_vaddr);
-	if (bss > len) {
-		error = vm_brk(len, bss - len);
-		if (error)
-			goto out_free_ph;
-	}
 	error = 0;
 
 out_free_ph:
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 206812ce544a..fefc642541cb 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -899,10 +899,12 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params,
 	kdebug("- DYNAMIC[]: %lx", params->dynamic_addr);
 	seg = loadmap->segs;
 	for (loop = 0; loop < loadmap->nsegs; loop++, seg++)
-		kdebug("- LOAD[%d] : %08x-%08x [va=%x ms=%x]",
+		kdebug("- LOAD[%d] : %08llx-%08llx [va=%llx ms=%llx]",
 		       loop,
-		       seg->addr, seg->addr + seg->p_memsz - 1,
-		       seg->p_vaddr, seg->p_memsz);
+		       (unsigned long long) seg->addr,
+		       (unsigned long long) seg->addr + seg->p_memsz - 1,
+		       (unsigned long long) seg->p_vaddr,
+		       (unsigned long long) seg->p_memsz);
 
 	return 0;
 
@@ -1081,9 +1083,10 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		maddr = vm_mmap(file, maddr, phdr->p_memsz + disp, prot, flags,
 				phdr->p_offset - disp);
 
-		kdebug("mmap[%d] <file> sz=%lx pr=%x fl=%x of=%lx --> %08lx",
-		       loop, phdr->p_memsz + disp, prot, flags,
-		       phdr->p_offset - disp, maddr);
+		kdebug("mmap[%d] <file> sz=%llx pr=%x fl=%x of=%llx --> %08lx",
+		       loop, (unsigned long long) phdr->p_memsz + disp,
+		       prot, flags, (unsigned long long) phdr->p_offset - disp,
+		       maddr);
 
 		if (IS_ERR_VALUE(maddr))
 			return (int) maddr;
@@ -1145,8 +1148,9 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 
 #else
 		if (excess > 0) {
-			kdebug("clear[%d] ad=%lx sz=%lx",
-			       loop, maddr + phdr->p_filesz, excess);
+			kdebug("clear[%d] ad=%llx sz=%lx", loop,
+			       (unsigned long long) maddr + phdr->p_filesz,
+			       excess);
 			if (clear_user((void *) maddr + phdr->p_filesz, excess))
 				return -EFAULT;
 		}
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index e0108d17b085..68fa225f89e5 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -40,9 +40,6 @@ enum {
 	VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */
 };
 
-static LIST_HEAD(entries);
-static int enabled = 1;
-
 enum {Enabled, Magic};
 #define MISC_FMT_PRESERVE_ARGV0 (1UL << 31)
 #define MISC_FMT_OPEN_BINARY (1UL << 30)
@@ -60,12 +57,10 @@ typedef struct {
 	char *name;
 	struct dentry *dentry;
 	struct file *interp_file;
+	refcount_t users;		/* sync removal with load_misc_binary() */
 } Node;
 
-static DEFINE_RWLOCK(entries_lock);
 static struct file_system_type bm_fs_type;
-static struct vfsmount *bm_mnt;
-static int entry_count;
 
 /*
  * Max length of the register string.  Determined by:
@@ -82,19 +77,24 @@ static int entry_count;
  */
 #define MAX_REGISTER_LENGTH 1920
 
-/*
- * Check if we support the binfmt
- * if we do, return the node, else NULL
- * locking is done in load_misc_binary
+/**
+ * search_binfmt_handler - search for a binary handler for @bprm
+ * @misc: handle to binfmt_misc instance
+ * @bprm: binary for which we are looking for a handler
+ *
+ * Search for a binary type handler for @bprm in the list of registered binary
+ * type handlers.
+ *
+ * Return: binary type list entry on success, NULL on failure
  */
-static Node *check_file(struct linux_binprm *bprm)
+static Node *search_binfmt_handler(struct binfmt_misc *misc,
+				   struct linux_binprm *bprm)
 {
 	char *p = strrchr(bprm->interp, '.');
-	struct list_head *l;
+	Node *e;
 
 	/* Walk all the registered handlers. */
-	list_for_each(l, &entries) {
-		Node *e = list_entry(l, Node, list);
+	list_for_each_entry(e, &misc->entries, list) {
 		char *s;
 		int j;
 
@@ -123,9 +123,79 @@ static Node *check_file(struct linux_binprm *bprm)
 		if (j == e->size)
 			return e;
 	}
+
 	return NULL;
 }
 
+/**
+ * get_binfmt_handler - try to find a binary type handler
+ * @misc: handle to binfmt_misc instance
+ * @bprm: binary for which we are looking for a handler
+ *
+ * Try to find a binfmt handler for the binary type. If one is found take a
+ * reference to protect against removal via bm_{entry,status}_write().
+ *
+ * Return: binary type list entry on success, NULL on failure
+ */
+static Node *get_binfmt_handler(struct binfmt_misc *misc,
+				struct linux_binprm *bprm)
+{
+	Node *e;
+
+	read_lock(&misc->entries_lock);
+	e = search_binfmt_handler(misc, bprm);
+	if (e)
+		refcount_inc(&e->users);
+	read_unlock(&misc->entries_lock);
+	return e;
+}
+
+/**
+ * put_binfmt_handler - put binary handler node
+ * @e: node to put
+ *
+ * Free node syncing with load_misc_binary() and defer final free to
+ * load_misc_binary() in case it is using the binary type handler we were
+ * requested to remove.
+ */
+static void put_binfmt_handler(Node *e)
+{
+	if (refcount_dec_and_test(&e->users)) {
+		if (e->flags & MISC_FMT_OPEN_FILE)
+			filp_close(e->interp_file, NULL);
+		kfree(e);
+	}
+}
+
+/**
+ * load_binfmt_misc - load the binfmt_misc of the caller's user namespace
+ *
+ * To be called in load_misc_binary() to load the relevant struct binfmt_misc.
+ * If a user namespace doesn't have its own binfmt_misc mount it can make use
+ * of its ancestor's binfmt_misc handlers. This mimics the behavior of
+ * pre-namespaced binfmt_misc where all registered binfmt_misc handlers were
+ * available to all users and user namespaces on the system.
+ *
+ * Return: the binfmt_misc instance of the caller's user namespace
+ */
+static struct binfmt_misc *load_binfmt_misc(void)
+{
+	const struct user_namespace *user_ns;
+	struct binfmt_misc *misc;
+
+	user_ns = current_user_ns();
+	while (user_ns) {
+		/* Pairs with smp_store_release() in bm_fill_super(). */
+		misc = smp_load_acquire(&user_ns->binfmt_misc);
+		if (misc)
+			return misc;
+
+		user_ns = user_ns->parent;
+	}
+
+	return &init_binfmt_misc;
+}
+
 /*
  * the loader itself
  */
@@ -133,18 +203,14 @@ static int load_misc_binary(struct linux_binprm *bprm)
 {
 	Node *fmt;
 	struct file *interp_file = NULL;
-	int retval;
+	int retval = -ENOEXEC;
+	struct binfmt_misc *misc;
 
-	retval = -ENOEXEC;
-	if (!enabled)
+	misc = load_binfmt_misc();
+	if (!misc->enabled)
 		return retval;
 
-	/* to keep locking time low, we copy the interpreter string */
-	read_lock(&entries_lock);
-	fmt = check_file(bprm);
-	if (fmt)
-		dget(fmt->dentry);
-	read_unlock(&entries_lock);
+	fmt = get_binfmt_handler(misc, bprm);
 	if (!fmt)
 		return retval;
 
@@ -198,7 +264,16 @@ static int load_misc_binary(struct linux_binprm *bprm)
 
 	retval = 0;
 ret:
-	dput(fmt->dentry);
+
+	/*
+	 * If we actually put the node here all concurrent calls to
+	 * load_misc_binary() will have finished. We also know
+	 * that for the refcount to be zero someone must have concurrently
+	 * removed the binary type handler from the list and it's our job to
+	 * free it.
+	 */
+	put_binfmt_handler(fmt);
+
 	return retval;
 }
 
@@ -287,7 +362,7 @@ static Node *create_entry(const char __user *buffer, size_t count)
 
 	err = -ENOMEM;
 	memsize = sizeof(Node) + count + 8;
-	e = kmalloc(memsize, GFP_KERNEL);
+	e = kmalloc(memsize, GFP_KERNEL_ACCOUNT);
 	if (!e)
 		goto out;
 
@@ -399,7 +474,7 @@ static Node *create_entry(const char __user *buffer, size_t count)
 
 			if (e->mask) {
 				int i;
-				char *masked = kmalloc(e->size, GFP_KERNEL);
+				char *masked = kmalloc(e->size, GFP_KERNEL_ACCOUNT);
 
 				print_hex_dump_bytes(
 					KBUILD_MODNAME ": register:  mask[decoded]: ",
@@ -547,35 +622,114 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
 	if (inode) {
 		inode->i_ino = get_next_ino();
 		inode->i_mode = mode;
-		inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+		simple_inode_init_ts(inode);
 	}
 	return inode;
 }
 
+/**
+ * i_binfmt_misc - retrieve struct binfmt_misc from a binfmt_misc inode
+ * @inode: inode of the relevant binfmt_misc instance
+ *
+ * This helper retrieves struct binfmt_misc from a binfmt_misc inode. This can
+ * be done without any memory barriers because we are guaranteed that
+ * user_ns->binfmt_misc is fully initialized. It was fully initialized when the
+ * binfmt_misc mount was first created.
+ *
+ * Return: struct binfmt_misc of the relevant binfmt_misc instance
+ */
+static struct binfmt_misc *i_binfmt_misc(struct inode *inode)
+{
+	return inode->i_sb->s_user_ns->binfmt_misc;
+}
+
+/**
+ * bm_evict_inode - cleanup data associated with @inode
+ * @inode: inode to which the data is attached
+ *
+ * Cleanup the binary type handler data associated with @inode if a binary type
+ * entry is removed or the filesystem is unmounted and the super block is
+ * shutdown.
+ *
+ * If the ->evict call was not caused by a super block shutdown but by a write
+ * to remove the entry or all entries via bm_{entry,status}_write() the entry
+ * will have already been removed from the list. We keep the list_empty() check
+ * to make that explicit.
+ */
 static void bm_evict_inode(struct inode *inode)
 {
 	Node *e = inode->i_private;
 
-	if (e && e->flags & MISC_FMT_OPEN_FILE)
-		filp_close(e->interp_file, NULL);
-
 	clear_inode(inode);
-	kfree(e);
+
+	if (e) {
+		struct binfmt_misc *misc;
+
+		misc = i_binfmt_misc(inode);
+		write_lock(&misc->entries_lock);
+		if (!list_empty(&e->list))
+			list_del_init(&e->list);
+		write_unlock(&misc->entries_lock);
+		put_binfmt_handler(e);
+	}
 }
 
-static void kill_node(Node *e)
+/**
+ * unlink_binfmt_dentry - remove the dentry for the binary type handler
+ * @dentry: dentry associated with the binary type handler
+ *
+ * Do the actual filesystem work to remove a dentry for a registered binary
+ * type handler. Since binfmt_misc only allows simple files to be created
+ * directly under the root dentry of the filesystem we ensure that we are
+ * indeed passed a dentry directly beneath the root dentry, that the inode
+ * associated with the root dentry is locked, and that it is a regular file we
+ * are asked to remove.
+ */
+static void unlink_binfmt_dentry(struct dentry *dentry)
 {
-	struct dentry *dentry;
+	struct dentry *parent = dentry->d_parent;
+	struct inode *inode, *parent_inode;
 
-	write_lock(&entries_lock);
-	list_del_init(&e->list);
-	write_unlock(&entries_lock);
+	/* All entries are immediate descendants of the root dentry. */
+	if (WARN_ON_ONCE(dentry->d_sb->s_root != parent))
+		return;
 
-	dentry = e->dentry;
-	drop_nlink(d_inode(dentry));
-	d_drop(dentry);
-	dput(dentry);
-	simple_release_fs(&bm_mnt, &entry_count);
+	/* We only expect to be called on regular files. */
+	inode = d_inode(dentry);
+	if (WARN_ON_ONCE(!S_ISREG(inode->i_mode)))
+		return;
+
+	/* The parent inode must be locked. */
+	parent_inode = d_inode(parent);
+	if (WARN_ON_ONCE(!inode_is_locked(parent_inode)))
+		return;
+
+	if (simple_positive(dentry)) {
+		dget(dentry);
+		simple_unlink(parent_inode, dentry);
+		d_delete(dentry);
+		dput(dentry);
+	}
+}
+
+/**
+ * remove_binfmt_handler - remove a binary type handler
+ * @misc: handle to binfmt_misc instance
+ * @e: binary type handler to remove
+ *
+ * Remove a binary type handler from the list of binary type handlers and
+ * remove its associated dentry. This is called from
+ * bm_{entry,status}_write(). In the future, we might want to think about
+ * adding a proper ->unlink() method to binfmt_misc instead of forcing callers
+ * to use writes to files in order to delete binary type handlers. But it has
+ * worked for so long that it's not a pressing issue.
+ */
+static void remove_binfmt_handler(struct binfmt_misc *misc, Node *e)
+{
+	write_lock(&misc->entries_lock);
+	list_del_init(&e->list);
+	write_unlock(&misc->entries_lock);
+	unlink_binfmt_dentry(e->dentry);
 }
 
 /* /<entry> */
@@ -602,8 +756,8 @@ bm_entry_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
 				size_t count, loff_t *ppos)
 {
-	struct dentry *root;
-	Node *e = file_inode(file)->i_private;
+	struct inode *inode = file_inode(file);
+	Node *e = inode->i_private;
 	int res = parse_command(buffer, count);
 
 	switch (res) {
@@ -617,13 +771,22 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
 		break;
 	case 3:
 		/* Delete this handler. */
-		root = file_inode(file)->i_sb->s_root;
-		inode_lock(d_inode(root));
+		inode = d_inode(inode->i_sb->s_root);
+		inode_lock(inode);
 
+		/*
+		 * In order to add or remove elements from the list
+		 * via bm_{entry,register,status}_write() inode_lock() on the
+		 * root inode must be held.
+		 * The lock is exclusive ensuring that the list can't be
+		 * modified. Only load_misc_binary() can access but does so
+		 * read-only. So we only need to take the write lock when we
+		 * actually remove the entry from the list.
+		 */
 		if (!list_empty(&e->list))
-			kill_node(e);
+			remove_binfmt_handler(i_binfmt_misc(inode), e);
 
-		inode_unlock(d_inode(root));
+		inode_unlock(inode);
 		break;
 	default:
 		return res;
@@ -647,6 +810,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 	struct inode *inode;
 	struct super_block *sb = file_inode(file)->i_sb;
 	struct dentry *root = sb->s_root, *dentry;
+	struct binfmt_misc *misc;
 	int err = 0;
 	struct file *f = NULL;
 
@@ -656,7 +820,18 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 		return PTR_ERR(e);
 
 	if (e->flags & MISC_FMT_OPEN_FILE) {
+		const struct cred *old_cred;
+
+		/*
+		 * Now that we support unprivileged binfmt_misc mounts make
+		 * sure we use the credentials that the register @file was
+		 * opened with to also open the interpreter. Before that this
+		 * didn't matter much as only a privileged process could open
+		 * the register file.
+		 */
+		old_cred = override_creds(file->f_cred);
 		f = open_exec(e->interpreter);
+		revert_creds(old_cred);
 		if (IS_ERR(f)) {
 			pr_notice("register: failed to install interpreter file %s\n",
 				 e->interpreter);
@@ -682,21 +857,16 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 	if (!inode)
 		goto out2;
 
-	err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count);
-	if (err) {
-		iput(inode);
-		inode = NULL;
-		goto out2;
-	}
-
+	refcount_set(&e->users, 1);
 	e->dentry = dget(dentry);
 	inode->i_private = e;
 	inode->i_fop = &bm_entry_operations;
 
 	d_instantiate(dentry, inode);
-	write_lock(&entries_lock);
-	list_add(&e->list, &entries);
-	write_unlock(&entries_lock);
+	misc = i_binfmt_misc(inode);
+	write_lock(&misc->entries_lock);
+	list_add(&e->list, &misc->entries);
+	write_unlock(&misc->entries_lock);
 
 	err = 0;
 out2:
@@ -723,35 +893,50 @@ static const struct file_operations bm_register_operations = {
 static ssize_t
 bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 {
-	char *s = enabled ? "enabled\n" : "disabled\n";
+	struct binfmt_misc *misc;
+	char *s;
 
+	misc = i_binfmt_misc(file_inode(file));
+	s = misc->enabled ? "enabled\n" : "disabled\n";
 	return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
 }
 
 static ssize_t bm_status_write(struct file *file, const char __user *buffer,
 		size_t count, loff_t *ppos)
 {
+	struct binfmt_misc *misc;
 	int res = parse_command(buffer, count);
-	struct dentry *root;
+	Node *e, *next;
+	struct inode *inode;
 
+	misc = i_binfmt_misc(file_inode(file));
 	switch (res) {
 	case 1:
 		/* Disable all handlers. */
-		enabled = 0;
+		misc->enabled = false;
 		break;
 	case 2:
 		/* Enable all handlers. */
-		enabled = 1;
+		misc->enabled = true;
 		break;
 	case 3:
 		/* Delete all handlers. */
-		root = file_inode(file)->i_sb->s_root;
-		inode_lock(d_inode(root));
+		inode = d_inode(file_inode(file)->i_sb->s_root);
+		inode_lock(inode);
 
-		while (!list_empty(&entries))
-			kill_node(list_first_entry(&entries, Node, list));
+		/*
+		 * In order to add or remove elements from the list
+		 * via bm_{entry,register,status}_write() inode_lock() on the
+		 * root inode must be held.
+		 * The lock is exclusive ensuring that the list can't be
+		 * modified. Only load_misc_binary() can access but does so
+		 * read-only. So we only need to take the write lock when we
+		 * actually remove the entry from the list.
+		 */
+		list_for_each_entry_safe(e, next, &misc->entries, list)
+			remove_binfmt_handler(misc, e);
 
-		inode_unlock(d_inode(root));
+		inode_unlock(inode);
 		break;
 	default:
 		return res;
@@ -768,32 +953,100 @@ static const struct file_operations bm_status_operations = {
 
 /* Superblock handling */
 
+static void bm_put_super(struct super_block *sb)
+{
+	struct user_namespace *user_ns = sb->s_fs_info;
+
+	sb->s_fs_info = NULL;
+	put_user_ns(user_ns);
+}
+
 static const struct super_operations s_ops = {
 	.statfs		= simple_statfs,
 	.evict_inode	= bm_evict_inode,
+	.put_super	= bm_put_super,
 };
 
 static int bm_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	int err;
+	struct user_namespace *user_ns = sb->s_user_ns;
+	struct binfmt_misc *misc;
 	static const struct tree_descr bm_files[] = {
 		[2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO},
 		[3] = {"register", &bm_register_operations, S_IWUSR},
 		/* last one */ {""}
 	};
 
+	if (WARN_ON(user_ns != current_user_ns()))
+		return -EINVAL;
+
+	/*
+	 * Lazily allocate a new binfmt_misc instance for this namespace, i.e.
+	 * do it here during the first mount of binfmt_misc. We don't need to
+	 * waste memory for every user namespace allocation. It's likely much
+	 * more common to not mount a separate binfmt_misc instance than it is
+	 * to mount one.
+	 *
+	 * While multiple superblocks can exist they are keyed by userns in
+	 * s_fs_info for binfmt_misc. Hence, the vfs guarantees that
+	 * bm_fill_super() is called exactly once whenever a binfmt_misc
+	 * superblock for a userns is created. This in turn lets us conclude
+	 * that when a binfmt_misc superblock is created for the first time for
+	 * a userns there's no one racing us. Therefore we don't need any
+	 * barriers when we dereference binfmt_misc.
+	 */
+	misc = user_ns->binfmt_misc;
+	if (!misc) {
+		/*
+		 * If it turns out that most user namespaces actually want to
+		 * register their own binary type handler and therefore all
+		 * create their own separate binfmt_misc mounts we should
+		 * consider turning this into a kmem cache.
+		 */
+		misc = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL);
+		if (!misc)
+			return -ENOMEM;
+
+		INIT_LIST_HEAD(&misc->entries);
+		rwlock_init(&misc->entries_lock);
+
+		/* Pairs with smp_load_acquire() in load_binfmt_misc(). */
+		smp_store_release(&user_ns->binfmt_misc, misc);
+	}
+
+	/*
+	 * When the binfmt_misc superblock for this userns is shutdown
+	 * ->enabled might have been set to false and we don't reinitialize
+	 * ->enabled again in put_super() as someone might already be mounting
+	 * binfmt_misc again. It also would be pointless since by the time
+	 * ->put_super() is called we know that the binary type list for this
+	 * binfmt_misc mount is empty making load_misc_binary() return
+	 * -ENOEXEC independent of whether ->enabled is true. Instead, if
+	 * someone mounts binfmt_misc for the first time or again we simply
+	 * reset ->enabled to true.
+	 */
+	misc->enabled = true;
+
 	err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
 	if (!err)
 		sb->s_op = &s_ops;
 	return err;
 }
 
+static void bm_free(struct fs_context *fc)
+{
+	if (fc->s_fs_info)
+		put_user_ns(fc->s_fs_info);
+}
+
 static int bm_get_tree(struct fs_context *fc)
 {
-	return get_tree_single(fc, bm_fill_super);
+	return get_tree_keyed(fc, bm_fill_super, get_user_ns(fc->user_ns));
 }
 
 static const struct fs_context_operations bm_context_ops = {
+	.free		= bm_free,
 	.get_tree	= bm_get_tree,
 };
 
@@ -812,6 +1065,7 @@ static struct file_system_type bm_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "binfmt_misc",
 	.init_fs_context = bm_init_fs_context,
+	.fs_flags	= FS_USERNS_MOUNT,
 	.kill_sb	= kill_litter_super,
 };
 MODULE_ALIAS_FS("binfmt_misc");
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index a25c9910d90b..4fb925e8c981 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -48,27 +48,6 @@ config BTRFS_FS_POSIX_ACL
 
 	  If you don't know what Access Control Lists are, say N
 
-config BTRFS_FS_CHECK_INTEGRITY
-	bool "Btrfs with integrity check tool compiled in (DEPRECATED)"
-	depends on BTRFS_FS
-	help
-	  This feature has been deprecated and will be removed in 6.7.
-
-	  Adds code that examines all block write requests (including
-	  writes of the super block). The goal is to verify that the
-	  state of the filesystem on disk is always consistent, i.e.,
-	  after a power-loss or kernel panic event the filesystem is
-	  in a consistent state.
-
-	  If the integrity check tool is included and activated in
-	  the mount options, plenty of kernel memory is used, and
-	  plenty of additional CPU cycles are spent. Enabling this
-	  functionality is not intended for normal use.
-
-	  In most cases, unless you are a btrfs developer who needs
-	  to verify the integrity of (super)-block write requests
-	  during the run of a regression test, say N
-
 config BTRFS_FS_RUN_SANITY_TESTS
 	bool "Btrfs will run sanity tests upon loading"
 	depends on BTRFS_FS
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 90d53209755b..525af975f61c 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -33,10 +33,9 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
 	   block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
 	   subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \
-	   lru_cache.o
+	   lru_cache.o raid-stripe-tree.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
-btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
 btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
 btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o
 btrfs-$(CONFIG_FS_VERITY) += verity.o
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index 8cfc8214109c..aa0844535644 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -4,6 +4,7 @@
 #define BTRFS_ACCESSORS_H
 
 #include <linux/stddef.h>
+#include <asm/unaligned.h>
 
 struct btrfs_map_token {
 	struct extent_buffer *eb;
@@ -305,6 +306,14 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
 BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
 
+BTRFS_SETGET_FUNCS(stripe_extent_encoding, struct btrfs_stripe_extent, encoding, 8);
+BTRFS_SETGET_FUNCS(raid_stride_devid, struct btrfs_raid_stride, devid, 64);
+BTRFS_SETGET_FUNCS(raid_stride_physical, struct btrfs_raid_stride, physical, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_extent_encoding,
+			 struct btrfs_stripe_extent, encoding, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_devid, struct btrfs_raid_stride, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_physical, struct btrfs_raid_stride, physical, 64);
+
 /* struct btrfs_dev_extent */
 BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64);
 BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
@@ -349,6 +358,9 @@ BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, count, 3
 
 BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, count, 32);
 
+BTRFS_SETGET_FUNCS(extent_owner_ref_root_id, struct btrfs_extent_owner_ref,
+		   root_id, 64);
+
 BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref,
 		   type, 8);
 BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref,
@@ -365,6 +377,8 @@ static inline u32 btrfs_extent_inline_ref_size(int type)
 	if (type == BTRFS_EXTENT_DATA_REF_KEY)
 		return sizeof(struct btrfs_extent_data_ref) +
 		       offsetof(struct btrfs_extent_inline_ref, offset);
+	if (type == BTRFS_EXTENT_OWNER_REF_KEY)
+		return sizeof(struct btrfs_extent_inline_ref);
 	return 0;
 }
 
@@ -966,6 +980,8 @@ BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
 		   flags, 64);
 BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item,
 		   rescan, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_enable_gen, struct btrfs_qgroup_status_item,
+		   enable_gen, 64);
 
 /* btrfs_qgroup_info_item */
 BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index ce083e99ef68..9e261aac671e 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -9,6 +9,7 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/freezer.h>
+#include <trace/events/btrfs.h>
 #include "async-thread.h"
 #include "ctree.h"
 
@@ -242,7 +243,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq,
 			break;
 		trace_btrfs_ordered_sched(work);
 		spin_unlock_irqrestore(lock, flags);
-		work->ordered_func(work);
+		work->ordered_func(work, false);
 
 		/* now take the lock again and drop our item from the list */
 		spin_lock_irqsave(lock, flags);
@@ -277,7 +278,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq,
 			 * We don't want to call the ordered free functions with
 			 * the lock held.
 			 */
-			work->ordered_free(work);
+			work->ordered_func(work, true);
 			/* NB: work must not be dereferenced past this point. */
 			trace_btrfs_all_work_done(wq->fs_info, work);
 		}
@@ -285,7 +286,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq,
 	spin_unlock_irqrestore(lock, flags);
 
 	if (free_self) {
-		self->ordered_free(self);
+		self->ordered_func(self, true);
 		/* NB: self must not be dereferenced past this point. */
 		trace_btrfs_all_work_done(wq->fs_info, self);
 	}
@@ -300,7 +301,7 @@ static void btrfs_work_helper(struct work_struct *normal_work)
 
 	/*
 	 * We should not touch things inside work in the following cases:
-	 * 1) after work->func() if it has no ordered_free
+	 * 1) after work->func() if it has no ordered_func(..., true) to free
 	 *    Since the struct is freed in work->func().
 	 * 2) after setting WORK_DONE_BIT
 	 *    The work may be freed in other threads almost instantly.
@@ -329,11 +330,10 @@ static void btrfs_work_helper(struct work_struct *normal_work)
 }
 
 void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
-		     btrfs_func_t ordered_func, btrfs_func_t ordered_free)
+		     btrfs_ordered_func_t ordered_func)
 {
 	work->func = func;
 	work->ordered_func = ordered_func;
-	work->ordered_free = ordered_free;
 	INIT_WORK(&work->normal_work, btrfs_work_helper);
 	INIT_LIST_HEAD(&work->ordered_list);
 	work->flags = 0;
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 30f66c5e2e6e..62b8a0d57898 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -13,11 +13,11 @@ struct btrfs_fs_info;
 struct btrfs_workqueue;
 struct btrfs_work;
 typedef void (*btrfs_func_t)(struct btrfs_work *arg);
+typedef void (*btrfs_ordered_func_t)(struct btrfs_work *arg, bool);
 
 struct btrfs_work {
 	btrfs_func_t func;
-	btrfs_func_t ordered_func;
-	btrfs_func_t ordered_free;
+	btrfs_ordered_func_t ordered_func;
 
 	/* Don't touch things below */
 	struct work_struct normal_work;
@@ -35,7 +35,7 @@ struct btrfs_workqueue *btrfs_alloc_ordered_workqueue(
 				struct btrfs_fs_info *fs_info, const char *name,
 				unsigned int flags);
 void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
-		     btrfs_func_t ordered_func, btrfs_func_t ordered_free);
+		     btrfs_ordered_func_t ordered_func);
 void btrfs_queue_work(struct btrfs_workqueue *wq,
 		      struct btrfs_work *work);
 void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index a4a809efc92f..beed7e459dab 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1129,6 +1129,9 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx,
 						       count, sc, GFP_NOFS);
 			break;
 		}
+		case BTRFS_EXTENT_OWNER_REF_KEY:
+			ASSERT(btrfs_fs_incompat(ctx->fs_info, SIMPLE_QUOTA));
+			break;
 		default:
 			WARN_ON(1);
 		}
@@ -2998,7 +3001,7 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
 }
 
 void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
-			      struct btrfs_backref_cache *cache, int is_reloc)
+			      struct btrfs_backref_cache *cache, bool is_reloc)
 {
 	int i;
 
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 71d535e03dca..ab4ca0eda605 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -247,7 +247,7 @@ struct prelim_ref {
 	struct rb_node rbnode;
 	u64 root_id;
 	struct btrfs_key key_for_search;
-	int level;
+	u8 level;
 	int count;
 	struct extent_inode_elem *inode_list;
 	u64 parent;
@@ -440,11 +440,11 @@ struct btrfs_backref_cache {
 	 * Reloction backref cache require more info for reloc root compared
 	 * to generic backref cache.
 	 */
-	unsigned int is_reloc;
+	bool is_reloc;
 };
 
 void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
-			      struct btrfs_backref_cache *cache, int is_reloc);
+			      struct btrfs_backref_cache *cache, bool is_reloc);
 struct btrfs_backref_node *btrfs_backref_alloc_node(
 		struct btrfs_backref_cache *cache, u64 bytenr, int level);
 struct btrfs_backref_edge *btrfs_backref_alloc_edge(
@@ -533,9 +533,9 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
 void btrfs_backref_release_cache(struct btrfs_backref_cache *cache);
 
 static inline void btrfs_backref_panic(struct btrfs_fs_info *fs_info,
-				       u64 bytenr, int errno)
+				       u64 bytenr, int error)
 {
-	btrfs_panic(fs_info, errno,
+	btrfs_panic(fs_info, error,
 		    "Inconsistency in backref cache found at offset %llu",
 		    bytenr);
 }
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 12b12443efaa..4f3b693a16b1 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -10,11 +10,11 @@
 #include "volumes.h"
 #include "raid56.h"
 #include "async-thread.h"
-#include "check-integrity.h"
 #include "dev-replace.h"
 #include "rcu-string.h"
 #include "zoned.h"
 #include "file-item.h"
+#include "raid-stripe-tree.h"
 
 static struct bio_set btrfs_bioset;
 static struct bio_set btrfs_clone_bioset;
@@ -416,6 +416,9 @@ static void btrfs_orig_write_end_io(struct bio *bio)
 	else
 		bio->bi_status = BLK_STS_OK;
 
+	if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
+		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+
 	btrfs_orig_bbio_end_io(bbio);
 	btrfs_put_bioc(bioc);
 }
@@ -427,6 +430,8 @@ static void btrfs_clone_write_end_io(struct bio *bio)
 	if (bio->bi_status) {
 		atomic_inc(&stripe->bioc->error);
 		btrfs_log_dev_io_error(bio, stripe->dev);
+	} else if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
 	}
 
 	/* Pass on control to the original bio this one was cloned from */
@@ -463,8 +468,6 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
 		(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
 		dev->devid, bio->bi_iter.bi_size);
 
-	btrfsic_check_bio(bio);
-
 	if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
 		blkcg_punt_bio_submit(bio);
 	else
@@ -490,6 +493,7 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
 	bio->bi_private = &bioc->stripes[dev_nr];
 	bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
 	bioc->stripes[dev_nr].bioc = bioc;
+	bioc->size = bio->bi_iter.bi_size;
 	btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
 }
 
@@ -499,6 +503,8 @@ static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
 	if (!bioc) {
 		/* Single mirror read/write fast path. */
 		btrfs_bio(bio)->mirror_num = mirror_num;
+		if (bio_op(bio) != REQ_OP_READ)
+			btrfs_bio(bio)->orig_physical = smap->physical;
 		bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
 		if (bio_op(bio) != REQ_OP_READ)
 			btrfs_bio(bio)->orig_physical = smap->physical;
@@ -568,13 +574,20 @@ static void run_one_async_start(struct btrfs_work *work)
  *
  * At IO completion time the csums attached on the ordered extent record are
  * inserted into the tree.
+ *
+ * If @do_free is true, only free the enclosing async_submit_bio and return.
  */
-static void run_one_async_done(struct btrfs_work *work)
+static void run_one_async_done(struct btrfs_work *work, bool do_free)
 {
 	struct async_submit_bio *async =
 		container_of(work, struct async_submit_bio, work);
 	struct bio *bio = &async->bbio->bio;
 
+	if (do_free) {
+		kfree(container_of(work, struct async_submit_bio, work));
+		return;
+	}
+
 	/* If an error occurred we just want to clean up the bio and move on. */
 	if (bio->bi_status) {
 		btrfs_orig_bbio_end_io(async->bbio);
@@ -590,11 +603,6 @@ static void run_one_async_done(struct btrfs_work *work)
 	__btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
 }
 
-static void run_one_async_free(struct btrfs_work *work)
-{
-	kfree(container_of(work, struct async_submit_bio, work));
-}
-
 static bool should_async_write(struct btrfs_bio *bbio)
 {
 	/* Submit synchronously if the checksum implementation is fast. */
@@ -636,8 +644,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
 	async->smap = *smap;
 	async->mirror_num = mirror_num;
 
-	btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
-			run_one_async_free);
+	btrfs_init_work(&async->work, run_one_async_start, run_one_async_done);
 	btrfs_queue_work(fs_info->workers, &async->work);
 	return true;
 }
@@ -657,9 +664,11 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 	blk_status_t ret;
 	int error;
 
+	smap.is_scrub = !bbio->inode;
+
 	btrfs_bio_counter_inc_blocked(fs_info);
 	error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
-				&bioc, &smap, &mirror_num, 1);
+				&bioc, &smap, &mirror_num);
 	if (error) {
 		ret = errno_to_blk_status(error);
 		goto fail;
@@ -691,6 +700,18 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 			bio->bi_opf |= REQ_OP_ZONE_APPEND;
 		}
 
+		if (is_data_bbio(bbio) && bioc &&
+		    btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
+			/*
+			 * No locking for the list update, as we only add to
+			 * the list in the I/O submission path, and list
+			 * iteration only happens in the completion path, which
+			 * can't happen until after the last submission.
+			 */
+			btrfs_get_bioc(bioc);
+			list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list);
+		}
+
 		/*
 		 * Csum items for reloc roots have already been cloned at this
 		 * point, so they are handled as part of the no-checksum case.
@@ -779,8 +800,6 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 	bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
 	bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
 	__bio_add_page(&bio, page, length, pg_offset);
-
-	btrfsic_check_bio(&bio);
 	ret = submit_bio_wait(&bio);
 	if (ret) {
 		/* try to remap that extent elsewhere? */
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index b2e5107b7cec..6e5dc68ff661 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -935,7 +935,7 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
 	caching_ctl->block_group = cache;
 	refcount_set(&caching_ctl->count, 2);
 	atomic_set(&caching_ctl->progress, 0);
-	btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
+	btrfs_init_work(&caching_ctl->work, caching_thread, NULL);
 
 	spin_lock(&cache->lock);
 	if (cache->cached != BTRFS_CACHE_NO) {
@@ -1286,7 +1286,7 @@ out:
 	/* Once for the lookup reference */
 	btrfs_put_block_group(block_group);
 	if (remove_rsv)
-		btrfs_delayed_refs_rsv_release(fs_info, 1);
+		btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -2601,7 +2601,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
 
 	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 out:
 	btrfs_free_path(path);
 	return ret;
@@ -2709,7 +2709,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
 
 		/* Already aborted the transaction if it failed. */
 next:
-		btrfs_delayed_refs_rsv_release(fs_info, 1);
+		btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
 		list_del_init(&block_group->bg_list);
 		clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
 	}
@@ -2819,8 +2819,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
 #endif
 
 	list_add_tail(&cache->bg_list, &trans->new_bgs);
-	trans->delayed_ref_updates++;
-	btrfs_update_delayed_refs_rsv(trans);
+	btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info);
 
 	set_avail_alloc_bits(fs_info, type);
 	return cache;
@@ -3025,7 +3024,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
 						   cache->global_root_id);
 	btrfs_set_stack_block_group_flags(&bgi, cache->flags);
 	write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 fail:
 	btrfs_release_path(path);
 	/*
@@ -3051,7 +3050,6 @@ static int cache_save_setup(struct btrfs_block_group *block_group,
 			    struct btrfs_path *path)
 {
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
-	struct btrfs_root *root = fs_info->tree_root;
 	struct inode *inode = NULL;
 	struct extent_changeset *data_reserved = NULL;
 	u64 alloc_hint = 0;
@@ -3103,7 +3101,7 @@ again:
 	 * time.
 	 */
 	BTRFS_I(inode)->generation = 0;
-	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+	ret = btrfs_update_inode(trans, BTRFS_I(inode));
 	if (ret) {
 		/*
 		 * So theoretically we could recover from this, simply set the
@@ -3370,7 +3368,7 @@ again:
 		if (should_put)
 			btrfs_put_block_group(cache);
 		if (drop_reserve)
-			btrfs_delayed_refs_rsv_release(fs_info, 1);
+			btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
 		/*
 		 * Avoid blocking other tasks for too long. It might even save
 		 * us from writing caches for block groups that are going to be
@@ -3474,8 +3472,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
 		cache_save_setup(cache, trans, path);
 
 		if (!ret)
-			ret = btrfs_run_delayed_refs(trans,
-						     (unsigned long) -1);
+			ret = btrfs_run_delayed_refs(trans, U64_MAX);
 
 		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
 			cache->io_ctl.inode = NULL;
@@ -3518,7 +3515,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
 		/* If its not on the io list, we need to put the block group */
 		if (should_put)
 			btrfs_put_block_group(cache);
-		btrfs_delayed_refs_rsv_release(fs_info, 1);
+		btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
 		spin_lock(&cur_trans->dirty_bgs_lock);
 	}
 	spin_unlock(&cur_trans->dirty_bgs_lock);
@@ -3543,12 +3540,12 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
 			     u64 bytenr, u64 num_bytes, bool alloc)
 {
 	struct btrfs_fs_info *info = trans->fs_info;
-	struct btrfs_block_group *cache = NULL;
-	u64 total = num_bytes;
+	struct btrfs_space_info *space_info;
+	struct btrfs_block_group *cache;
 	u64 old_val;
-	u64 byte_in_group;
+	bool reclaim = false;
+	bool bg_already_dirty = true;
 	int factor;
-	int ret = 0;
 
 	/* Block accounting for super block */
 	spin_lock(&info->delalloc_root_lock);
@@ -3560,97 +3557,86 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
 	btrfs_set_super_bytes_used(info->super_copy, old_val);
 	spin_unlock(&info->delalloc_root_lock);
 
-	while (total) {
-		struct btrfs_space_info *space_info;
-		bool reclaim = false;
-
-		cache = btrfs_lookup_block_group(info, bytenr);
-		if (!cache) {
-			ret = -ENOENT;
-			break;
-		}
-		space_info = cache->space_info;
-		factor = btrfs_bg_type_to_factor(cache->flags);
+	cache = btrfs_lookup_block_group(info, bytenr);
+	if (!cache)
+		return -ENOENT;
 
-		/*
-		 * If this block group has free space cache written out, we
-		 * need to make sure to load it if we are removing space.  This
-		 * is because we need the unpinning stage to actually add the
-		 * space back to the block group, otherwise we will leak space.
-		 */
-		if (!alloc && !btrfs_block_group_done(cache))
-			btrfs_cache_block_group(cache, true);
+	/* An extent can not span multiple block groups. */
+	ASSERT(bytenr + num_bytes <= cache->start + cache->length);
 
-		byte_in_group = bytenr - cache->start;
-		WARN_ON(byte_in_group > cache->length);
+	space_info = cache->space_info;
+	factor = btrfs_bg_type_to_factor(cache->flags);
 
-		spin_lock(&space_info->lock);
-		spin_lock(&cache->lock);
+	/*
+	 * If this block group has free space cache written out, we need to make
+	 * sure to load it if we are removing space.  This is because we need
+	 * the unpinning stage to actually add the space back to the block group,
+	 * otherwise we will leak space.
+	 */
+	if (!alloc && !btrfs_block_group_done(cache))
+		btrfs_cache_block_group(cache, true);
 
-		if (btrfs_test_opt(info, SPACE_CACHE) &&
-		    cache->disk_cache_state < BTRFS_DC_CLEAR)
-			cache->disk_cache_state = BTRFS_DC_CLEAR;
+	spin_lock(&space_info->lock);
+	spin_lock(&cache->lock);
 
-		old_val = cache->used;
-		num_bytes = min(total, cache->length - byte_in_group);
-		if (alloc) {
-			old_val += num_bytes;
-			cache->used = old_val;
-			cache->reserved -= num_bytes;
-			space_info->bytes_reserved -= num_bytes;
-			space_info->bytes_used += num_bytes;
-			space_info->disk_used += num_bytes * factor;
-			spin_unlock(&cache->lock);
-			spin_unlock(&space_info->lock);
-		} else {
-			old_val -= num_bytes;
-			cache->used = old_val;
-			cache->pinned += num_bytes;
-			btrfs_space_info_update_bytes_pinned(info, space_info,
-							     num_bytes);
-			space_info->bytes_used -= num_bytes;
-			space_info->disk_used -= num_bytes * factor;
+	if (btrfs_test_opt(info, SPACE_CACHE) &&
+	    cache->disk_cache_state < BTRFS_DC_CLEAR)
+		cache->disk_cache_state = BTRFS_DC_CLEAR;
 
-			reclaim = should_reclaim_block_group(cache, num_bytes);
+	old_val = cache->used;
+	if (alloc) {
+		old_val += num_bytes;
+		cache->used = old_val;
+		cache->reserved -= num_bytes;
+		space_info->bytes_reserved -= num_bytes;
+		space_info->bytes_used += num_bytes;
+		space_info->disk_used += num_bytes * factor;
+		spin_unlock(&cache->lock);
+		spin_unlock(&space_info->lock);
+	} else {
+		old_val -= num_bytes;
+		cache->used = old_val;
+		cache->pinned += num_bytes;
+		btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes);
+		space_info->bytes_used -= num_bytes;
+		space_info->disk_used -= num_bytes * factor;
 
-			spin_unlock(&cache->lock);
-			spin_unlock(&space_info->lock);
+		reclaim = should_reclaim_block_group(cache, num_bytes);
 
-			set_extent_bit(&trans->transaction->pinned_extents,
-				       bytenr, bytenr + num_bytes - 1,
-				       EXTENT_DIRTY, NULL);
-		}
+		spin_unlock(&cache->lock);
+		spin_unlock(&space_info->lock);
 
-		spin_lock(&trans->transaction->dirty_bgs_lock);
-		if (list_empty(&cache->dirty_list)) {
-			list_add_tail(&cache->dirty_list,
-				      &trans->transaction->dirty_bgs);
-			trans->delayed_ref_updates++;
-			btrfs_get_block_group(cache);
-		}
-		spin_unlock(&trans->transaction->dirty_bgs_lock);
+		set_extent_bit(&trans->transaction->pinned_extents, bytenr,
+			       bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
+	}
 
-		/*
-		 * No longer have used bytes in this block group, queue it for
-		 * deletion. We do this after adding the block group to the
-		 * dirty list to avoid races between cleaner kthread and space
-		 * cache writeout.
-		 */
-		if (!alloc && old_val == 0) {
-			if (!btrfs_test_opt(info, DISCARD_ASYNC))
-				btrfs_mark_bg_unused(cache);
-		} else if (!alloc && reclaim) {
-			btrfs_mark_bg_to_reclaim(cache);
-		}
+	spin_lock(&trans->transaction->dirty_bgs_lock);
+	if (list_empty(&cache->dirty_list)) {
+		list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs);
+		bg_already_dirty = false;
+		btrfs_get_block_group(cache);
+	}
+	spin_unlock(&trans->transaction->dirty_bgs_lock);
 
-		btrfs_put_block_group(cache);
-		total -= num_bytes;
-		bytenr += num_bytes;
+	/*
+	 * No longer have used bytes in this block group, queue it for deletion.
+	 * We do this after adding the block group to the dirty list to avoid
+	 * races between cleaner kthread and space cache writeout.
+	 */
+	if (!alloc && old_val == 0) {
+		if (!btrfs_test_opt(info, DISCARD_ASYNC))
+			btrfs_mark_bg_unused(cache);
+	} else if (!alloc && reclaim) {
+		btrfs_mark_bg_to_reclaim(cache);
 	}
 
+	btrfs_put_block_group(cache);
+
 	/* Modified block groups are accounted for in the delayed_refs_rsv. */
-	btrfs_update_delayed_refs_rsv(trans);
-	return ret;
+	if (!bg_already_dirty)
+		btrfs_inc_delayed_refs_rsv_bg_updates(info);
+
+	return 0;
 }
 
 /*
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 77684c5e0c8b..ceb5f586a2d5 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -221,7 +221,8 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
 	if (num_bytes == 0)
 		return 0;
 
-	ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
+	ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
+					   num_bytes, flush);
 	if (!ret)
 		btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
 
@@ -261,7 +262,8 @@ int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
 	if (!ret)
 		return 0;
 
-	ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
+	ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
+					   num_bytes, flush);
 	if (!ret) {
 		btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
 		return 0;
@@ -279,10 +281,10 @@ u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
 	struct btrfs_block_rsv *target = NULL;
 
 	/*
-	 * If we are the delayed_rsv then push to the global rsv, otherwise dump
-	 * into the delayed rsv if it is not full.
+	 * If we are a delayed block reserve then push to the global rsv,
+	 * otherwise dump into the global delayed reserve if it is not full.
 	 */
-	if (block_rsv == delayed_rsv)
+	if (block_rsv->type == BTRFS_BLOCK_RSV_DELOPS)
 		target = global_rsv;
 	else if (block_rsv != global_rsv && !btrfs_block_rsv_full(delayed_rsv))
 		target = delayed_rsv;
@@ -354,6 +356,11 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
 		min_items++;
 	}
 
+	if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) {
+		num_bytes += btrfs_root_used(&fs_info->stripe_root->root_item);
+		min_items++;
+	}
+
 	/*
 	 * But we also want to reserve enough space so we can do the fallback
 	 * global reserve for an unlink, which is an additional
@@ -405,6 +412,7 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root)
 	case BTRFS_EXTENT_TREE_OBJECTID:
 	case BTRFS_FREE_SPACE_TREE_OBJECTID:
 	case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
+	case BTRFS_RAID_STRIPE_TREE_OBJECTID:
 		root->block_rsv = &fs_info->delayed_refs_rsv;
 		break;
 	case BTRFS_ROOT_TREE_OBJECTID:
@@ -517,8 +525,8 @@ again:
 				block_rsv->type, ret);
 	}
 try_reserve:
-	ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize,
-					   BTRFS_RESERVE_NO_FLUSH);
+	ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
+					   blocksize, BTRFS_RESERVE_NO_FLUSH);
 	if (!ret)
 		return block_rsv;
 	/*
@@ -539,7 +547,7 @@ try_reserve:
 	 * one last time to force a reservation if there's enough actual space
 	 * on disk to make the reservation.
 	 */
-	ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize,
+	ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, blocksize,
 					   BTRFS_RESERVE_FLUSH_EMERGENCY);
 	if (!ret)
 		return block_rsv;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index bda1fdbba666..5572ae52444e 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -8,6 +8,8 @@
 
 #include <linux/hash.h>
 #include <linux/refcount.h>
+#include <linux/fscrypt.h>
+#include <trace/events/btrfs.h>
 #include "extent_map.h"
 #include "extent_io.h"
 #include "ordered-data.h"
@@ -79,11 +81,21 @@ struct btrfs_inode {
 	 */
 	struct btrfs_key location;
 
+	/* Cached value of inode property 'compression'. */
+	u8 prop_compress;
+
+	/*
+	 * Force compression on the file using the defrag ioctl, could be
+	 * different from prop_compress and takes precedence if set.
+	 */
+	u8 defrag_compress;
+
 	/*
 	 * Lock for counters and all fields used to determine if the inode is in
 	 * the log or not (last_trans, last_sub_trans, last_log_commit,
-	 * logged_trans), to access/update new_delalloc_bytes and to update the
-	 * VFS' inode number of bytes used.
+	 * logged_trans), to access/update delalloc_bytes, new_delalloc_bytes,
+	 * defrag_bytes, disk_i_size, outstanding_extents, csum_bytes and to
+	 * update the VFS' inode number of bytes used.
 	 */
 	spinlock_t lock;
 
@@ -102,8 +114,18 @@ struct btrfs_inode {
 	/* held while logging the inode in tree-log.c */
 	struct mutex log_mutex;
 
+	/*
+	 * Counters to keep track of the number of extent items we may use due
+	 * to delalloc and such.  outstanding_extents is the number of extent
+	 * items we think we'll end up using, and reserved_extents is the number
+	 * of extent items we've reserved metadata for. Protected by 'lock'.
+	 */
+	unsigned outstanding_extents;
+
 	/* used to order data wrt metadata */
-	struct btrfs_ordered_inode_tree ordered_tree;
+	spinlock_t ordered_tree_lock;
+	struct rb_root ordered_tree;
+	struct rb_node *ordered_tree_last;
 
 	/* list of all the delalloc inodes in the FS.  There are times we need
 	 * to write all the delalloc pages to disk, and this list is used
@@ -122,28 +144,31 @@ struct btrfs_inode {
 	u64 generation;
 
 	/*
-	 * transid of the trans_handle that last modified this inode
+	 * ID of the transaction handle that last modified this inode.
+	 * Protected by 'lock'.
 	 */
 	u64 last_trans;
 
 	/*
-	 * transid that last logged this inode
+	 * ID of the transaction that last logged this inode.
+	 * Protected by 'lock'.
 	 */
 	u64 logged_trans;
 
 	/*
-	 * log transid when this inode was last modified
+	 * Log transaction ID when this inode was last modified.
+	 * Protected by 'lock'.
 	 */
 	int last_sub_trans;
 
-	/* a local copy of root's last_log_commit */
+	/* A local copy of root's last_log_commit. Protected by 'lock'. */
 	int last_log_commit;
 
 	union {
 		/*
 		 * Total number of bytes pending delalloc, used by stat to
 		 * calculate the real block usage of the file. This is used
-		 * only for files.
+		 * only for files. Protected by 'lock'.
 		 */
 		u64 delalloc_bytes;
 		/*
@@ -161,7 +186,7 @@ struct btrfs_inode {
 		 * Total number of bytes pending delalloc that fall within a file
 		 * range that is either a hole or beyond EOF (and no prealloc extent
 		 * exists in the range). This is always <= delalloc_bytes and this
-		 * is used only for files.
+		 * is used only for files. Protected by 'lock'.
 		 */
 		u64 new_delalloc_bytes;
 		/*
@@ -172,15 +197,15 @@ struct btrfs_inode {
 	};
 
 	/*
-	 * total number of bytes pending defrag, used by stat to check whether
-	 * it needs COW.
+	 * Total number of bytes pending defrag, used by stat to check whether
+	 * it needs COW. Protected by 'lock'.
 	 */
 	u64 defrag_bytes;
 
 	/*
-	 * the size of the file stored in the metadata on disk.  data=ordered
+	 * The size of the file stored in the metadata on disk.  data=ordered
 	 * means the in-memory i_size might be larger than the size on disk
-	 * because not all the blocks are written yet.
+	 * because not all the blocks are written yet. Protected by 'lock'.
 	 */
 	u64 disk_i_size;
 
@@ -214,7 +239,7 @@ struct btrfs_inode {
 
 	/*
 	 * Number of bytes outstanding that are going to need csums.  This is
-	 * used in ENOSPC accounting.
+	 * used in ENOSPC accounting. Protected by 'lock'.
 	 */
 	u64 csum_bytes;
 
@@ -223,30 +248,13 @@ struct btrfs_inode {
 	/* Read-only compatibility flags, upper half of inode_item::flags */
 	u32 ro_flags;
 
-	/*
-	 * Counters to keep track of the number of extent item's we may use due
-	 * to delalloc and such.  outstanding_extents is the number of extent
-	 * items we think we'll end up using, and reserved_extents is the number
-	 * of extent items we've reserved metadata for.
-	 */
-	unsigned outstanding_extents;
-
 	struct btrfs_block_rsv block_rsv;
 
-	/*
-	 * Cached values of inode properties
-	 */
-	unsigned prop_compress;		/* per-file compression algorithm */
-	/*
-	 * Force compression on the file using the defrag ioctl, could be
-	 * different from prop_compress and takes precedence if set
-	 */
-	unsigned defrag_compress;
-
 	struct btrfs_delayed_node *delayed_node;
 
 	/* File creation time. */
-	struct timespec64 i_otime;
+	u64 i_otime_sec;
+	u32 i_otime_nsec;
 
 	/* Hook into fs_info->delayed_iputs */
 	struct list_head delayed_iput;
@@ -387,7 +395,7 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
 	spin_lock(&inode->lock);
 	if (inode->logged_trans == generation &&
 	    inode->last_sub_trans <= inode->last_log_commit &&
-	    inode->last_sub_trans <= inode->root->last_log_commit)
+	    inode->last_sub_trans <= btrfs_get_root_last_log_commit(inode->root))
 		ret = true;
 	spin_unlock(&inode->lock);
 	return ret;
@@ -481,9 +489,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
 				    struct page *page, size_t pg_offset,
 				    u64 start, u64 end);
 int btrfs_update_inode(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root, struct btrfs_inode *inode);
+		       struct btrfs_inode *inode);
 int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root, struct btrfs_inode *inode);
+				struct btrfs_inode *inode);
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode);
 int btrfs_orphan_cleanup(struct btrfs_root *root);
 int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size);
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
deleted file mode 100644
index 3caf339c4bb3..000000000000
--- a/fs/btrfs/check-integrity.c
+++ /dev/null
@@ -1,2871 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) STRATO AG 2011.  All rights reserved.
- */
-
-/*
- * This module can be used to catch cases when the btrfs kernel
- * code executes write requests to the disk that bring the file
- * system in an inconsistent state. In such a state, a power-loss
- * or kernel panic event would cause that the data on disk is
- * lost or at least damaged.
- *
- * Code is added that examines all block write requests during
- * runtime (including writes of the super block). Three rules
- * are verified and an error is printed on violation of the
- * rules:
- * 1. It is not allowed to write a disk block which is
- *    currently referenced by the super block (either directly
- *    or indirectly).
- * 2. When a super block is written, it is verified that all
- *    referenced (directly or indirectly) blocks fulfill the
- *    following requirements:
- *    2a. All referenced blocks have either been present when
- *        the file system was mounted, (i.e., they have been
- *        referenced by the super block) or they have been
- *        written since then and the write completion callback
- *        was called and no write error was indicated and a
- *        FLUSH request to the device where these blocks are
- *        located was received and completed.
- *    2b. All referenced blocks need to have a generation
- *        number which is equal to the parent's number.
- *
- * One issue that was found using this module was that the log
- * tree on disk became temporarily corrupted because disk blocks
- * that had been in use for the log tree had been freed and
- * reused too early, while being referenced by the written super
- * block.
- *
- * The search term in the kernel log that can be used to filter
- * on the existence of detected integrity issues is
- * "btrfs: attempt".
- *
- * The integrity check is enabled via mount options. These
- * mount options are only supported if the integrity check
- * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY.
- *
- * Example #1, apply integrity checks to all metadata:
- * mount /dev/sdb1 /mnt -o check_int
- *
- * Example #2, apply integrity checks to all metadata and
- * to data extents:
- * mount /dev/sdb1 /mnt -o check_int_data
- *
- * Example #3, apply integrity checks to all metadata and dump
- * the tree that the super block references to kernel messages
- * each time after a super block was written:
- * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
- *
- * If the integrity check tool is included and activated in
- * the mount options, plenty of kernel memory is used, and
- * plenty of additional CPU cycles are spent. Enabling this
- * functionality is not intended for normal use. In most
- * cases, unless you are a btrfs developer who needs to verify
- * the integrity of (super)-block write requests, do not
- * enable the config option BTRFS_FS_CHECK_INTEGRITY to
- * include and compile the integrity check tool.
- *
- * Expect millions of lines of information in the kernel log with an
- * enabled check_int_print_mask. Therefore set LOG_BUF_SHIFT in the
- * kernel config to at least 26 (which is 64MB). Usually the value is
- * limited to 21 (which is 2MB) in init/Kconfig. The file needs to be
- * changed like this before LOG_BUF_SHIFT can be set to a high value:
- * config LOG_BUF_SHIFT
- *       int "Kernel log buffer size (16 => 64KB, 17 => 128KB)"
- *       range 12 30
- */
-
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/mutex.h>
-#include <linux/blkdev.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <crypto/hash.h>
-#include "messages.h"
-#include "ctree.h"
-#include "disk-io.h"
-#include "transaction.h"
-#include "extent_io.h"
-#include "volumes.h"
-#include "print-tree.h"
-#include "locking.h"
-#include "check-integrity.h"
-#include "rcu-string.h"
-#include "compression.h"
-#include "accessors.h"
-
-#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
-#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
-#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
-#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
-#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
-#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
-#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
-#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6)	/* in characters,
-							 * excluding " [...]" */
-#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
-
-/*
- * The definition of the bitmask fields for the print_mask.
- * They are specified with the mount option check_integrity_print_mask.
- */
-#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE			0x00000001
-#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION		0x00000002
-#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE			0x00000004
-#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE			0x00000008
-#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH			0x00000010
-#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH			0x00000020
-#define BTRFSIC_PRINT_MASK_VERBOSE				0x00000040
-#define BTRFSIC_PRINT_MASK_VERY_VERBOSE				0x00000080
-#define BTRFSIC_PRINT_MASK_INITIAL_TREE				0x00000100
-#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES			0x00000200
-#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE			0x00000400
-#define BTRFSIC_PRINT_MASK_NUM_COPIES				0x00000800
-#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS		0x00001000
-#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE		0x00002000
-
-struct btrfsic_dev_state;
-struct btrfsic_state;
-
-struct btrfsic_block {
-	u32 magic_num;		/* only used for debug purposes */
-	unsigned int is_metadata:1;	/* if it is meta-data, not data-data */
-	unsigned int is_superblock:1;	/* if it is one of the superblocks */
-	unsigned int is_iodone:1;	/* if is done by lower subsystem */
-	unsigned int iodone_w_error:1;	/* error was indicated to endio */
-	unsigned int never_written:1;	/* block was added because it was
-					 * referenced, not because it was
-					 * written */
-	unsigned int mirror_num;	/* large enough to hold
-					 * BTRFS_SUPER_MIRROR_MAX */
-	struct btrfsic_dev_state *dev_state;
-	u64 dev_bytenr;		/* key, physical byte num on disk */
-	u64 logical_bytenr;	/* logical byte num on disk */
-	u64 generation;
-	struct btrfs_disk_key disk_key;	/* extra info to print in case of
-					 * issues, will not always be correct */
-	struct list_head collision_resolving_node;	/* list node */
-	struct list_head all_blocks_node;	/* list node */
-
-	/* the following two lists contain block_link items */
-	struct list_head ref_to_list;	/* list */
-	struct list_head ref_from_list;	/* list */
-	struct btrfsic_block *next_in_same_bio;
-	void *orig_bio_private;
-	bio_end_io_t *orig_bio_end_io;
-	blk_opf_t submit_bio_bh_rw;
-	u64 flush_gen; /* only valid if !never_written */
-};
-
-/*
- * Elements of this type are allocated dynamically and required because
- * each block object can refer to and can be ref from multiple blocks.
- * The key to lookup them in the hashtable is the dev_bytenr of
- * the block ref to plus the one from the block referred from.
- * The fact that they are searchable via a hashtable and that a
- * ref_cnt is maintained is not required for the btrfs integrity
- * check algorithm itself, it is only used to make the output more
- * beautiful in case that an error is detected (an error is defined
- * as a write operation to a block while that block is still referenced).
- */
-struct btrfsic_block_link {
-	u32 magic_num;		/* only used for debug purposes */
-	u32 ref_cnt;
-	struct list_head node_ref_to;	/* list node */
-	struct list_head node_ref_from;	/* list node */
-	struct list_head collision_resolving_node;	/* list node */
-	struct btrfsic_block *block_ref_to;
-	struct btrfsic_block *block_ref_from;
-	u64 parent_generation;
-};
-
-struct btrfsic_dev_state {
-	u32 magic_num;		/* only used for debug purposes */
-	struct block_device *bdev;
-	struct btrfsic_state *state;
-	struct list_head collision_resolving_node;	/* list node */
-	struct btrfsic_block dummy_block_for_bio_bh_flush;
-	u64 last_flush_gen;
-};
-
-struct btrfsic_block_hashtable {
-	struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE];
-};
-
-struct btrfsic_block_link_hashtable {
-	struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE];
-};
-
-struct btrfsic_dev_state_hashtable {
-	struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE];
-};
-
-struct btrfsic_block_data_ctx {
-	u64 start;		/* virtual bytenr */
-	u64 dev_bytenr;		/* physical bytenr on device */
-	u32 len;
-	struct btrfsic_dev_state *dev;
-	char **datav;
-	struct page **pagev;
-	void *mem_to_free;
-};
-
-/* This structure is used to implement recursion without occupying
- * any stack space, refer to btrfsic_process_metablock() */
-struct btrfsic_stack_frame {
-	u32 magic;
-	u32 nr;
-	int error;
-	int i;
-	int limit_nesting;
-	int num_copies;
-	int mirror_num;
-	struct btrfsic_block *block;
-	struct btrfsic_block_data_ctx *block_ctx;
-	struct btrfsic_block *next_block;
-	struct btrfsic_block_data_ctx next_block_ctx;
-	struct btrfs_header *hdr;
-	struct btrfsic_stack_frame *prev;
-};
-
-/* Some state per mounted filesystem */
-struct btrfsic_state {
-	u32 print_mask;
-	int include_extent_data;
-	struct list_head all_blocks_list;
-	struct btrfsic_block_hashtable block_hashtable;
-	struct btrfsic_block_link_hashtable block_link_hashtable;
-	struct btrfs_fs_info *fs_info;
-	u64 max_superblock_generation;
-	struct btrfsic_block *latest_superblock;
-	u32 metablock_size;
-	u32 datablock_size;
-};
-
-static int btrfsic_process_metablock(struct btrfsic_state *state,
-				     struct btrfsic_block *block,
-				     struct btrfsic_block_data_ctx *block_ctx,
-				     int limit_nesting, int force_iodone_flag);
-static void btrfsic_read_from_block_data(
-	struct btrfsic_block_data_ctx *block_ctx,
-	void *dst, u32 offset, size_t len);
-static int btrfsic_create_link_to_next_block(
-		struct btrfsic_state *state,
-		struct btrfsic_block *block,
-		struct btrfsic_block_data_ctx
-		*block_ctx, u64 next_bytenr,
-		int limit_nesting,
-		struct btrfsic_block_data_ctx *next_block_ctx,
-		struct btrfsic_block **next_blockp,
-		int force_iodone_flag,
-		int *num_copiesp, int *mirror_nump,
-		struct btrfs_disk_key *disk_key,
-		u64 parent_generation);
-static int btrfsic_handle_extent_data(struct btrfsic_state *state,
-				      struct btrfsic_block *block,
-				      struct btrfsic_block_data_ctx *block_ctx,
-				      u32 item_offset, int force_iodone_flag);
-static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
-			     struct btrfsic_block_data_ctx *block_ctx_out,
-			     int mirror_num);
-static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
-static int btrfsic_read_block(struct btrfsic_state *state,
-			      struct btrfsic_block_data_ctx *block_ctx);
-static int btrfsic_process_written_superblock(
-		struct btrfsic_state *state,
-		struct btrfsic_block *const block,
-		struct btrfs_super_block *const super_hdr);
-static void btrfsic_bio_end_io(struct bio *bp);
-static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
-					      const struct btrfsic_block *block,
-					      int recursion_level);
-static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
-					struct btrfsic_block *const block,
-					int recursion_level);
-static void btrfsic_print_add_link(const struct btrfsic_state *state,
-				   const struct btrfsic_block_link *l);
-static void btrfsic_print_rem_link(const struct btrfsic_state *state,
-				   const struct btrfsic_block_link *l);
-static char btrfsic_get_block_type(const struct btrfsic_state *state,
-				   const struct btrfsic_block *block);
-static void btrfsic_dump_tree(const struct btrfsic_state *state);
-static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
-				  const struct btrfsic_block *block,
-				  int indent_level);
-static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
-		struct btrfsic_state *state,
-		struct btrfsic_block_data_ctx *next_block_ctx,
-		struct btrfsic_block *next_block,
-		struct btrfsic_block *from_block,
-		u64 parent_generation);
-static struct btrfsic_block *btrfsic_block_lookup_or_add(
-		struct btrfsic_state *state,
-		struct btrfsic_block_data_ctx *block_ctx,
-		const char *additional_string,
-		int is_metadata,
-		int is_iodone,
-		int never_written,
-		int mirror_num,
-		int *was_created);
-static int btrfsic_process_superblock_dev_mirror(
-		struct btrfsic_state *state,
-		struct btrfsic_dev_state *dev_state,
-		struct btrfs_device *device,
-		int superblock_mirror_num,
-		struct btrfsic_dev_state **selected_dev_state,
-		struct btrfs_super_block *selected_super);
-static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev);
-static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
-					   u64 bytenr,
-					   struct btrfsic_dev_state *dev_state,
-					   u64 dev_bytenr);
-
-static struct mutex btrfsic_mutex;
-static int btrfsic_is_initialized;
-static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable;
-
-
-static void btrfsic_block_init(struct btrfsic_block *b)
-{
-	b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER;
-	b->dev_state = NULL;
-	b->dev_bytenr = 0;
-	b->logical_bytenr = 0;
-	b->generation = BTRFSIC_GENERATION_UNKNOWN;
-	b->disk_key.objectid = 0;
-	b->disk_key.type = 0;
-	b->disk_key.offset = 0;
-	b->is_metadata = 0;
-	b->is_superblock = 0;
-	b->is_iodone = 0;
-	b->iodone_w_error = 0;
-	b->never_written = 0;
-	b->mirror_num = 0;
-	b->next_in_same_bio = NULL;
-	b->orig_bio_private = NULL;
-	b->orig_bio_end_io = NULL;
-	INIT_LIST_HEAD(&b->collision_resolving_node);
-	INIT_LIST_HEAD(&b->all_blocks_node);
-	INIT_LIST_HEAD(&b->ref_to_list);
-	INIT_LIST_HEAD(&b->ref_from_list);
-	b->submit_bio_bh_rw = 0;
-	b->flush_gen = 0;
-}
-
-static struct btrfsic_block *btrfsic_block_alloc(void)
-{
-	struct btrfsic_block *b;
-
-	b = kzalloc(sizeof(*b), GFP_NOFS);
-	if (NULL != b)
-		btrfsic_block_init(b);
-
-	return b;
-}
-
-static void btrfsic_block_free(struct btrfsic_block *b)
-{
-	BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num));
-	kfree(b);
-}
-
-static void btrfsic_block_link_init(struct btrfsic_block_link *l)
-{
-	l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER;
-	l->ref_cnt = 1;
-	INIT_LIST_HEAD(&l->node_ref_to);
-	INIT_LIST_HEAD(&l->node_ref_from);
-	INIT_LIST_HEAD(&l->collision_resolving_node);
-	l->block_ref_to = NULL;
-	l->block_ref_from = NULL;
-}
-
-static struct btrfsic_block_link *btrfsic_block_link_alloc(void)
-{
-	struct btrfsic_block_link *l;
-
-	l = kzalloc(sizeof(*l), GFP_NOFS);
-	if (NULL != l)
-		btrfsic_block_link_init(l);
-
-	return l;
-}
-
-static void btrfsic_block_link_free(struct btrfsic_block_link *l)
-{
-	BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num));
-	kfree(l);
-}
-
-static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
-{
-	ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
-	ds->bdev = NULL;
-	ds->state = NULL;
-	INIT_LIST_HEAD(&ds->collision_resolving_node);
-	ds->last_flush_gen = 0;
-	btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
-	ds->dummy_block_for_bio_bh_flush.is_iodone = 1;
-	ds->dummy_block_for_bio_bh_flush.dev_state = ds;
-}
-
-static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void)
-{
-	struct btrfsic_dev_state *ds;
-
-	ds = kzalloc(sizeof(*ds), GFP_NOFS);
-	if (NULL != ds)
-		btrfsic_dev_state_init(ds);
-
-	return ds;
-}
-
-static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds)
-{
-	BUG_ON(!(NULL == ds ||
-		 BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num));
-	kfree(ds);
-}
-
-static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h)
-{
-	int i;
-
-	for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++)
-		INIT_LIST_HEAD(h->table + i);
-}
-
-static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
-					struct btrfsic_block_hashtable *h)
-{
-	const unsigned int hashval =
-	    (((unsigned int)(b->dev_bytenr >> 16)) ^
-	     ((unsigned int)((uintptr_t)b->dev_state->bdev))) &
-	     (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
-
-	list_add(&b->collision_resolving_node, h->table + hashval);
-}
-
-static void btrfsic_block_hashtable_remove(struct btrfsic_block *b)
-{
-	list_del(&b->collision_resolving_node);
-}
-
-static struct btrfsic_block *btrfsic_block_hashtable_lookup(
-		struct block_device *bdev,
-		u64 dev_bytenr,
-		struct btrfsic_block_hashtable *h)
-{
-	const unsigned int hashval =
-	    (((unsigned int)(dev_bytenr >> 16)) ^
-	     ((unsigned int)((uintptr_t)bdev))) &
-	     (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
-	struct btrfsic_block *b;
-
-	list_for_each_entry(b, h->table + hashval, collision_resolving_node) {
-		if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
-			return b;
-	}
-
-	return NULL;
-}
-
-static void btrfsic_block_link_hashtable_init(
-		struct btrfsic_block_link_hashtable *h)
-{
-	int i;
-
-	for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++)
-		INIT_LIST_HEAD(h->table + i);
-}
-
-static void btrfsic_block_link_hashtable_add(
-		struct btrfsic_block_link *l,
-		struct btrfsic_block_link_hashtable *h)
-{
-	const unsigned int hashval =
-	    (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^
-	     ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^
-	     ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^
-	     ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev)))
-	     & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
-
-	BUG_ON(NULL == l->block_ref_to);
-	BUG_ON(NULL == l->block_ref_from);
-	list_add(&l->collision_resolving_node, h->table + hashval);
-}
-
-static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l)
-{
-	list_del(&l->collision_resolving_node);
-}
-
-static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
-		struct block_device *bdev_ref_to,
-		u64 dev_bytenr_ref_to,
-		struct block_device *bdev_ref_from,
-		u64 dev_bytenr_ref_from,
-		struct btrfsic_block_link_hashtable *h)
-{
-	const unsigned int hashval =
-	    (((unsigned int)(dev_bytenr_ref_to >> 16)) ^
-	     ((unsigned int)(dev_bytenr_ref_from >> 16)) ^
-	     ((unsigned int)((uintptr_t)bdev_ref_to)) ^
-	     ((unsigned int)((uintptr_t)bdev_ref_from))) &
-	     (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
-	struct btrfsic_block_link *l;
-
-	list_for_each_entry(l, h->table + hashval, collision_resolving_node) {
-		BUG_ON(NULL == l->block_ref_to);
-		BUG_ON(NULL == l->block_ref_from);
-		if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
-		    l->block_ref_to->dev_bytenr == dev_bytenr_ref_to &&
-		    l->block_ref_from->dev_state->bdev == bdev_ref_from &&
-		    l->block_ref_from->dev_bytenr == dev_bytenr_ref_from)
-			return l;
-	}
-
-	return NULL;
-}
-
-static void btrfsic_dev_state_hashtable_init(
-		struct btrfsic_dev_state_hashtable *h)
-{
-	int i;
-
-	for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++)
-		INIT_LIST_HEAD(h->table + i);
-}
-
-static void btrfsic_dev_state_hashtable_add(
-		struct btrfsic_dev_state *ds,
-		struct btrfsic_dev_state_hashtable *h)
-{
-	const unsigned int hashval =
-	    (((unsigned int)((uintptr_t)ds->bdev->bd_dev)) &
-	     (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
-
-	list_add(&ds->collision_resolving_node, h->table + hashval);
-}
-
-static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds)
-{
-	list_del(&ds->collision_resolving_node);
-}
-
-static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev,
-		struct btrfsic_dev_state_hashtable *h)
-{
-	const unsigned int hashval =
-		dev & (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1);
-	struct btrfsic_dev_state *ds;
-
-	list_for_each_entry(ds, h->table + hashval, collision_resolving_node) {
-		if (ds->bdev->bd_dev == dev)
-			return ds;
-	}
-
-	return NULL;
-}
-
-static int btrfsic_process_superblock(struct btrfsic_state *state,
-				      struct btrfs_fs_devices *fs_devices)
-{
-	struct btrfs_super_block *selected_super;
-	struct list_head *dev_head = &fs_devices->devices;
-	struct btrfs_device *device;
-	struct btrfsic_dev_state *selected_dev_state = NULL;
-	int ret = 0;
-	int pass;
-
-	selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
-	if (!selected_super)
-		return -ENOMEM;
-
-	list_for_each_entry(device, dev_head, dev_list) {
-		int i;
-		struct btrfsic_dev_state *dev_state;
-
-		if (!device->bdev || !device->name)
-			continue;
-
-		dev_state = btrfsic_dev_state_lookup(device->bdev->bd_dev);
-		BUG_ON(NULL == dev_state);
-		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
-			ret = btrfsic_process_superblock_dev_mirror(
-					state, dev_state, device, i,
-					&selected_dev_state, selected_super);
-			if (0 != ret && 0 == i) {
-				kfree(selected_super);
-				return ret;
-			}
-		}
-	}
-
-	if (NULL == state->latest_superblock) {
-		pr_info("btrfsic: no superblock found!\n");
-		kfree(selected_super);
-		return -1;
-	}
-
-	for (pass = 0; pass < 3; pass++) {
-		int num_copies;
-		int mirror_num;
-		u64 next_bytenr;
-
-		switch (pass) {
-		case 0:
-			next_bytenr = btrfs_super_root(selected_super);
-			if (state->print_mask &
-			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
-				pr_info("root@%llu\n", next_bytenr);
-			break;
-		case 1:
-			next_bytenr = btrfs_super_chunk_root(selected_super);
-			if (state->print_mask &
-			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
-				pr_info("chunk@%llu\n", next_bytenr);
-			break;
-		case 2:
-			next_bytenr = btrfs_super_log_root(selected_super);
-			if (0 == next_bytenr)
-				continue;
-			if (state->print_mask &
-			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
-				pr_info("log@%llu\n", next_bytenr);
-			break;
-		}
-
-		num_copies = btrfs_num_copies(state->fs_info, next_bytenr,
-					      state->metablock_size);
-		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
-			pr_info("num_copies(log_bytenr=%llu) = %d\n",
-			       next_bytenr, num_copies);
-
-		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
-			struct btrfsic_block *next_block;
-			struct btrfsic_block_data_ctx tmp_next_block_ctx;
-			struct btrfsic_block_link *l;
-
-			ret = btrfsic_map_block(state, next_bytenr,
-						state->metablock_size,
-						&tmp_next_block_ctx,
-						mirror_num);
-			if (ret) {
-				pr_info("btrfsic: btrfsic_map_block(root @%llu, mirror %d) failed!\n",
-				       next_bytenr, mirror_num);
-				kfree(selected_super);
-				return -1;
-			}
-
-			next_block = btrfsic_block_hashtable_lookup(
-					tmp_next_block_ctx.dev->bdev,
-					tmp_next_block_ctx.dev_bytenr,
-					&state->block_hashtable);
-			BUG_ON(NULL == next_block);
-
-			l = btrfsic_block_link_hashtable_lookup(
-					tmp_next_block_ctx.dev->bdev,
-					tmp_next_block_ctx.dev_bytenr,
-					state->latest_superblock->dev_state->
-					bdev,
-					state->latest_superblock->dev_bytenr,
-					&state->block_link_hashtable);
-			BUG_ON(NULL == l);
-
-			ret = btrfsic_read_block(state, &tmp_next_block_ctx);
-			if (ret < (int)PAGE_SIZE) {
-				pr_info("btrfsic: read @logical %llu failed!\n",
-				       tmp_next_block_ctx.start);
-				btrfsic_release_block_ctx(&tmp_next_block_ctx);
-				kfree(selected_super);
-				return -1;
-			}
-
-			ret = btrfsic_process_metablock(state,
-							next_block,
-							&tmp_next_block_ctx,
-							BTRFS_MAX_LEVEL + 3, 1);
-			btrfsic_release_block_ctx(&tmp_next_block_ctx);
-		}
-	}
-
-	kfree(selected_super);
-	return ret;
-}
-
-static int btrfsic_process_superblock_dev_mirror(
-		struct btrfsic_state *state,
-		struct btrfsic_dev_state *dev_state,
-		struct btrfs_device *device,
-		int superblock_mirror_num,
-		struct btrfsic_dev_state **selected_dev_state,
-		struct btrfs_super_block *selected_super)
-{
-	struct btrfs_fs_info *fs_info = state->fs_info;
-	struct btrfs_super_block *super_tmp;
-	u64 dev_bytenr;
-	struct btrfsic_block *superblock_tmp;
-	int pass;
-	struct block_device *const superblock_bdev = device->bdev;
-	struct page *page;
-	struct address_space *mapping = superblock_bdev->bd_inode->i_mapping;
-	int ret = 0;
-
-	/* super block bytenr is always the unmapped device bytenr */
-	dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
-	if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes)
-		return -1;
-
-	page = read_cache_page_gfp(mapping, dev_bytenr >> PAGE_SHIFT, GFP_NOFS);
-	if (IS_ERR(page))
-		return -1;
-
-	super_tmp = page_address(page);
-
-	if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
-	    btrfs_super_magic(super_tmp) != BTRFS_MAGIC ||
-	    memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
-	    btrfs_super_nodesize(super_tmp) != state->metablock_size ||
-	    btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
-		ret = 0;
-		goto out;
-	}
-
-	superblock_tmp =
-	    btrfsic_block_hashtable_lookup(superblock_bdev,
-					   dev_bytenr,
-					   &state->block_hashtable);
-	if (NULL == superblock_tmp) {
-		superblock_tmp = btrfsic_block_alloc();
-		if (NULL == superblock_tmp) {
-			ret = -1;
-			goto out;
-		}
-		/* for superblock, only the dev_bytenr makes sense */
-		superblock_tmp->dev_bytenr = dev_bytenr;
-		superblock_tmp->dev_state = dev_state;
-		superblock_tmp->logical_bytenr = dev_bytenr;
-		superblock_tmp->generation = btrfs_super_generation(super_tmp);
-		superblock_tmp->is_metadata = 1;
-		superblock_tmp->is_superblock = 1;
-		superblock_tmp->is_iodone = 1;
-		superblock_tmp->never_written = 0;
-		superblock_tmp->mirror_num = 1 + superblock_mirror_num;
-		if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
-			btrfs_info_in_rcu(fs_info,
-			"new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)",
-				     superblock_bdev,
-				     btrfs_dev_name(device), dev_bytenr,
-				     dev_state->bdev, dev_bytenr,
-				     superblock_mirror_num);
-		list_add(&superblock_tmp->all_blocks_node,
-			 &state->all_blocks_list);
-		btrfsic_block_hashtable_add(superblock_tmp,
-					    &state->block_hashtable);
-	}
-
-	/* select the one with the highest generation field */
-	if (btrfs_super_generation(super_tmp) >
-	    state->max_superblock_generation ||
-	    0 == state->max_superblock_generation) {
-		memcpy(selected_super, super_tmp, sizeof(*selected_super));
-		*selected_dev_state = dev_state;
-		state->max_superblock_generation =
-		    btrfs_super_generation(super_tmp);
-		state->latest_superblock = superblock_tmp;
-	}
-
-	for (pass = 0; pass < 3; pass++) {
-		u64 next_bytenr;
-		int num_copies;
-		int mirror_num;
-		const char *additional_string = NULL;
-		struct btrfs_disk_key tmp_disk_key;
-
-		tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
-		tmp_disk_key.offset = 0;
-		switch (pass) {
-		case 0:
-			btrfs_set_disk_key_objectid(&tmp_disk_key,
-						    BTRFS_ROOT_TREE_OBJECTID);
-			additional_string = "initial root ";
-			next_bytenr = btrfs_super_root(super_tmp);
-			break;
-		case 1:
-			btrfs_set_disk_key_objectid(&tmp_disk_key,
-						    BTRFS_CHUNK_TREE_OBJECTID);
-			additional_string = "initial chunk ";
-			next_bytenr = btrfs_super_chunk_root(super_tmp);
-			break;
-		case 2:
-			btrfs_set_disk_key_objectid(&tmp_disk_key,
-						    BTRFS_TREE_LOG_OBJECTID);
-			additional_string = "initial log ";
-			next_bytenr = btrfs_super_log_root(super_tmp);
-			if (0 == next_bytenr)
-				continue;
-			break;
-		}
-
-		num_copies = btrfs_num_copies(fs_info, next_bytenr,
-					      state->metablock_size);
-		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
-			pr_info("num_copies(log_bytenr=%llu) = %d\n",
-			       next_bytenr, num_copies);
-		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
-			struct btrfsic_block *next_block;
-			struct btrfsic_block_data_ctx tmp_next_block_ctx;
-			struct btrfsic_block_link *l;
-
-			if (btrfsic_map_block(state, next_bytenr,
-					      state->metablock_size,
-					      &tmp_next_block_ctx,
-					      mirror_num)) {
-				pr_info("btrfsic: btrfsic_map_block(bytenr @%llu, mirror %d) failed!\n",
-				       next_bytenr, mirror_num);
-				ret = -1;
-				goto out;
-			}
-
-			next_block = btrfsic_block_lookup_or_add(
-					state, &tmp_next_block_ctx,
-					additional_string, 1, 1, 0,
-					mirror_num, NULL);
-			if (NULL == next_block) {
-				btrfsic_release_block_ctx(&tmp_next_block_ctx);
-				ret = -1;
-				goto out;
-			}
-
-			next_block->disk_key = tmp_disk_key;
-			next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
-			l = btrfsic_block_link_lookup_or_add(
-					state, &tmp_next_block_ctx,
-					next_block, superblock_tmp,
-					BTRFSIC_GENERATION_UNKNOWN);
-			btrfsic_release_block_ctx(&tmp_next_block_ctx);
-			if (NULL == l) {
-				ret = -1;
-				goto out;
-			}
-		}
-	}
-	if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
-		btrfsic_dump_tree_sub(state, superblock_tmp, 0);
-
-out:
-	put_page(page);
-	return ret;
-}
-
-static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
-{
-	struct btrfsic_stack_frame *sf;
-
-	sf = kzalloc(sizeof(*sf), GFP_NOFS);
-	if (sf)
-		sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
-	return sf;
-}
-
-static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
-{
-	BUG_ON(!(NULL == sf ||
-		 BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic));
-	kfree(sf);
-}
-
-static noinline_for_stack int btrfsic_process_metablock(
-		struct btrfsic_state *state,
-		struct btrfsic_block *const first_block,
-		struct btrfsic_block_data_ctx *const first_block_ctx,
-		int first_limit_nesting, int force_iodone_flag)
-{
-	struct btrfsic_stack_frame initial_stack_frame = { 0 };
-	struct btrfsic_stack_frame *sf;
-	struct btrfsic_stack_frame *next_stack;
-	struct btrfs_header *const first_hdr =
-		(struct btrfs_header *)first_block_ctx->datav[0];
-
-	BUG_ON(!first_hdr);
-	sf = &initial_stack_frame;
-	sf->error = 0;
-	sf->i = -1;
-	sf->limit_nesting = first_limit_nesting;
-	sf->block = first_block;
-	sf->block_ctx = first_block_ctx;
-	sf->next_block = NULL;
-	sf->hdr = first_hdr;
-	sf->prev = NULL;
-
-continue_with_new_stack_frame:
-	sf->block->generation = btrfs_stack_header_generation(sf->hdr);
-	if (0 == sf->hdr->level) {
-		struct btrfs_leaf *const leafhdr =
-		    (struct btrfs_leaf *)sf->hdr;
-
-		if (-1 == sf->i) {
-			sf->nr = btrfs_stack_header_nritems(&leafhdr->header);
-
-			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-				pr_info("leaf %llu items %d generation %llu owner %llu\n",
-				       sf->block_ctx->start, sf->nr,
-				       btrfs_stack_header_generation(
-					       &leafhdr->header),
-				       btrfs_stack_header_owner(
-					       &leafhdr->header));
-		}
-
-continue_with_current_leaf_stack_frame:
-		if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
-			sf->i++;
-			sf->num_copies = 0;
-		}
-
-		if (sf->i < sf->nr) {
-			struct btrfs_item disk_item;
-			u32 disk_item_offset =
-				(uintptr_t)(leafhdr->items + sf->i) -
-				(uintptr_t)leafhdr;
-			struct btrfs_disk_key *disk_key;
-			u8 type;
-			u32 item_offset;
-			u32 item_size;
-
-			if (disk_item_offset + sizeof(struct btrfs_item) >
-			    sf->block_ctx->len) {
-leaf_item_out_of_bounce_error:
-				pr_info(
-		"btrfsic: leaf item out of bounce at logical %llu, dev %pg\n",
-				       sf->block_ctx->start,
-				       sf->block_ctx->dev->bdev);
-				goto one_stack_frame_backwards;
-			}
-			btrfsic_read_from_block_data(sf->block_ctx,
-						     &disk_item,
-						     disk_item_offset,
-						     sizeof(struct btrfs_item));
-			item_offset = btrfs_stack_item_offset(&disk_item);
-			item_size = btrfs_stack_item_size(&disk_item);
-			disk_key = &disk_item.key;
-			type = btrfs_disk_key_type(disk_key);
-
-			if (BTRFS_ROOT_ITEM_KEY == type) {
-				struct btrfs_root_item root_item;
-				u32 root_item_offset;
-				u64 next_bytenr;
-
-				root_item_offset = item_offset +
-					offsetof(struct btrfs_leaf, items);
-				if (root_item_offset + item_size >
-				    sf->block_ctx->len)
-					goto leaf_item_out_of_bounce_error;
-				btrfsic_read_from_block_data(
-					sf->block_ctx, &root_item,
-					root_item_offset,
-					item_size);
-				next_bytenr = btrfs_root_bytenr(&root_item);
-
-				sf->error =
-				    btrfsic_create_link_to_next_block(
-						state,
-						sf->block,
-						sf->block_ctx,
-						next_bytenr,
-						sf->limit_nesting,
-						&sf->next_block_ctx,
-						&sf->next_block,
-						force_iodone_flag,
-						&sf->num_copies,
-						&sf->mirror_num,
-						disk_key,
-						btrfs_root_generation(
-						&root_item));
-				if (sf->error)
-					goto one_stack_frame_backwards;
-
-				if (NULL != sf->next_block) {
-					struct btrfs_header *const next_hdr =
-					    (struct btrfs_header *)
-					    sf->next_block_ctx.datav[0];
-
-					next_stack =
-					    btrfsic_stack_frame_alloc();
-					if (NULL == next_stack) {
-						sf->error = -1;
-						btrfsic_release_block_ctx(
-								&sf->
-								next_block_ctx);
-						goto one_stack_frame_backwards;
-					}
-
-					next_stack->i = -1;
-					next_stack->block = sf->next_block;
-					next_stack->block_ctx =
-					    &sf->next_block_ctx;
-					next_stack->next_block = NULL;
-					next_stack->hdr = next_hdr;
-					next_stack->limit_nesting =
-					    sf->limit_nesting - 1;
-					next_stack->prev = sf;
-					sf = next_stack;
-					goto continue_with_new_stack_frame;
-				}
-			} else if (BTRFS_EXTENT_DATA_KEY == type &&
-				   state->include_extent_data) {
-				sf->error = btrfsic_handle_extent_data(
-						state,
-						sf->block,
-						sf->block_ctx,
-						item_offset,
-						force_iodone_flag);
-				if (sf->error)
-					goto one_stack_frame_backwards;
-			}
-
-			goto continue_with_current_leaf_stack_frame;
-		}
-	} else {
-		struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr;
-
-		if (-1 == sf->i) {
-			sf->nr = btrfs_stack_header_nritems(&nodehdr->header);
-
-			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-				pr_info("node %llu level %d items %d generation %llu owner %llu\n",
-				       sf->block_ctx->start,
-				       nodehdr->header.level, sf->nr,
-				       btrfs_stack_header_generation(
-				       &nodehdr->header),
-				       btrfs_stack_header_owner(
-				       &nodehdr->header));
-		}
-
-continue_with_current_node_stack_frame:
-		if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
-			sf->i++;
-			sf->num_copies = 0;
-		}
-
-		if (sf->i < sf->nr) {
-			struct btrfs_key_ptr key_ptr;
-			u32 key_ptr_offset;
-			u64 next_bytenr;
-
-			key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) -
-					  (uintptr_t)nodehdr;
-			if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
-			    sf->block_ctx->len) {
-				pr_info(
-		"btrfsic: node item out of bounce at logical %llu, dev %pg\n",
-				       sf->block_ctx->start,
-				       sf->block_ctx->dev->bdev);
-				goto one_stack_frame_backwards;
-			}
-			btrfsic_read_from_block_data(
-				sf->block_ctx, &key_ptr, key_ptr_offset,
-				sizeof(struct btrfs_key_ptr));
-			next_bytenr = btrfs_stack_key_blockptr(&key_ptr);
-
-			sf->error = btrfsic_create_link_to_next_block(
-					state,
-					sf->block,
-					sf->block_ctx,
-					next_bytenr,
-					sf->limit_nesting,
-					&sf->next_block_ctx,
-					&sf->next_block,
-					force_iodone_flag,
-					&sf->num_copies,
-					&sf->mirror_num,
-					&key_ptr.key,
-					btrfs_stack_key_generation(&key_ptr));
-			if (sf->error)
-				goto one_stack_frame_backwards;
-
-			if (NULL != sf->next_block) {
-				struct btrfs_header *const next_hdr =
-				    (struct btrfs_header *)
-				    sf->next_block_ctx.datav[0];
-
-				next_stack = btrfsic_stack_frame_alloc();
-				if (NULL == next_stack) {
-					sf->error = -1;
-					goto one_stack_frame_backwards;
-				}
-
-				next_stack->i = -1;
-				next_stack->block = sf->next_block;
-				next_stack->block_ctx = &sf->next_block_ctx;
-				next_stack->next_block = NULL;
-				next_stack->hdr = next_hdr;
-				next_stack->limit_nesting =
-				    sf->limit_nesting - 1;
-				next_stack->prev = sf;
-				sf = next_stack;
-				goto continue_with_new_stack_frame;
-			}
-
-			goto continue_with_current_node_stack_frame;
-		}
-	}
-
-one_stack_frame_backwards:
-	if (NULL != sf->prev) {
-		struct btrfsic_stack_frame *const prev = sf->prev;
-
-		/* the one for the initial block is freed in the caller */
-		btrfsic_release_block_ctx(sf->block_ctx);
-
-		if (sf->error) {
-			prev->error = sf->error;
-			btrfsic_stack_frame_free(sf);
-			sf = prev;
-			goto one_stack_frame_backwards;
-		}
-
-		btrfsic_stack_frame_free(sf);
-		sf = prev;
-		goto continue_with_new_stack_frame;
-	} else {
-		BUG_ON(&initial_stack_frame != sf);
-	}
-
-	return sf->error;
-}
-
-static void btrfsic_read_from_block_data(
-	struct btrfsic_block_data_ctx *block_ctx,
-	void *dstv, u32 offset, size_t len)
-{
-	size_t cur;
-	size_t pgoff;
-	char *kaddr;
-	char *dst = (char *)dstv;
-	size_t start_offset = offset_in_page(block_ctx->start);
-	unsigned long i = (start_offset + offset) >> PAGE_SHIFT;
-
-	WARN_ON(offset + len > block_ctx->len);
-	pgoff = offset_in_page(start_offset + offset);
-
-	while (len > 0) {
-		cur = min(len, ((size_t)PAGE_SIZE - pgoff));
-		BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE));
-		kaddr = block_ctx->datav[i];
-		memcpy(dst, kaddr + pgoff, cur);
-
-		dst += cur;
-		len -= cur;
-		pgoff = 0;
-		i++;
-	}
-}
-
-static int btrfsic_create_link_to_next_block(
-		struct btrfsic_state *state,
-		struct btrfsic_block *block,
-		struct btrfsic_block_data_ctx *block_ctx,
-		u64 next_bytenr,
-		int limit_nesting,
-		struct btrfsic_block_data_ctx *next_block_ctx,
-		struct btrfsic_block **next_blockp,
-		int force_iodone_flag,
-		int *num_copiesp, int *mirror_nump,
-		struct btrfs_disk_key *disk_key,
-		u64 parent_generation)
-{
-	struct btrfs_fs_info *fs_info = state->fs_info;
-	struct btrfsic_block *next_block = NULL;
-	int ret;
-	struct btrfsic_block_link *l;
-	int did_alloc_block_link;
-	int block_was_created;
-
-	*next_blockp = NULL;
-	if (0 == *num_copiesp) {
-		*num_copiesp = btrfs_num_copies(fs_info, next_bytenr,
-						state->metablock_size);
-		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
-			pr_info("num_copies(log_bytenr=%llu) = %d\n",
-			       next_bytenr, *num_copiesp);
-		*mirror_nump = 1;
-	}
-
-	if (*mirror_nump > *num_copiesp)
-		return 0;
-
-	if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-		pr_info("btrfsic_create_link_to_next_block(mirror_num=%d)\n",
-		       *mirror_nump);
-	ret = btrfsic_map_block(state, next_bytenr,
-				state->metablock_size,
-				next_block_ctx, *mirror_nump);
-	if (ret) {
-		pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
-		       next_bytenr, *mirror_nump);
-		btrfsic_release_block_ctx(next_block_ctx);
-		*next_blockp = NULL;
-		return -1;
-	}
-
-	next_block = btrfsic_block_lookup_or_add(state,
-						 next_block_ctx, "referenced ",
-						 1, force_iodone_flag,
-						 !force_iodone_flag,
-						 *mirror_nump,
-						 &block_was_created);
-	if (NULL == next_block) {
-		btrfsic_release_block_ctx(next_block_ctx);
-		*next_blockp = NULL;
-		return -1;
-	}
-	if (block_was_created) {
-		l = NULL;
-		next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
-	} else {
-		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
-			if (next_block->logical_bytenr != next_bytenr &&
-			    !(!next_block->is_metadata &&
-			      0 == next_block->logical_bytenr))
-				pr_info(
-"referenced block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
-				       next_bytenr, next_block_ctx->dev->bdev,
-				       next_block_ctx->dev_bytenr, *mirror_nump,
-				       btrfsic_get_block_type(state,
-							      next_block),
-				       next_block->logical_bytenr);
-			else
-				pr_info(
-		"referenced block @%llu (%pg/%llu/%d) found in hash table, %c\n",
-				       next_bytenr, next_block_ctx->dev->bdev,
-				       next_block_ctx->dev_bytenr, *mirror_nump,
-				       btrfsic_get_block_type(state,
-							      next_block));
-		}
-		next_block->logical_bytenr = next_bytenr;
-
-		next_block->mirror_num = *mirror_nump;
-		l = btrfsic_block_link_hashtable_lookup(
-				next_block_ctx->dev->bdev,
-				next_block_ctx->dev_bytenr,
-				block_ctx->dev->bdev,
-				block_ctx->dev_bytenr,
-				&state->block_link_hashtable);
-	}
-
-	next_block->disk_key = *disk_key;
-	if (NULL == l) {
-		l = btrfsic_block_link_alloc();
-		if (NULL == l) {
-			btrfsic_release_block_ctx(next_block_ctx);
-			*next_blockp = NULL;
-			return -1;
-		}
-
-		did_alloc_block_link = 1;
-		l->block_ref_to = next_block;
-		l->block_ref_from = block;
-		l->ref_cnt = 1;
-		l->parent_generation = parent_generation;
-
-		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-			btrfsic_print_add_link(state, l);
-
-		list_add(&l->node_ref_to, &block->ref_to_list);
-		list_add(&l->node_ref_from, &next_block->ref_from_list);
-
-		btrfsic_block_link_hashtable_add(l,
-						 &state->block_link_hashtable);
-	} else {
-		did_alloc_block_link = 0;
-		if (0 == limit_nesting) {
-			l->ref_cnt++;
-			l->parent_generation = parent_generation;
-			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-				btrfsic_print_add_link(state, l);
-		}
-	}
-
-	if (limit_nesting > 0 && did_alloc_block_link) {
-		ret = btrfsic_read_block(state, next_block_ctx);
-		if (ret < (int)next_block_ctx->len) {
-			pr_info("btrfsic: read block @logical %llu failed!\n",
-			       next_bytenr);
-			btrfsic_release_block_ctx(next_block_ctx);
-			*next_blockp = NULL;
-			return -1;
-		}
-
-		*next_blockp = next_block;
-	} else {
-		*next_blockp = NULL;
-	}
-	(*mirror_nump)++;
-
-	return 0;
-}
-
-static int btrfsic_handle_extent_data(
-		struct btrfsic_state *state,
-		struct btrfsic_block *block,
-		struct btrfsic_block_data_ctx *block_ctx,
-		u32 item_offset, int force_iodone_flag)
-{
-	struct btrfs_fs_info *fs_info = state->fs_info;
-	struct btrfs_file_extent_item file_extent_item;
-	u64 file_extent_item_offset;
-	u64 next_bytenr;
-	u64 num_bytes;
-	u64 generation;
-	struct btrfsic_block_link *l;
-	int ret;
-
-	file_extent_item_offset = offsetof(struct btrfs_leaf, items) +
-				  item_offset;
-	if (file_extent_item_offset +
-	    offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
-	    block_ctx->len) {
-		pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n",
-		       block_ctx->start, block_ctx->dev->bdev);
-		return -1;
-	}
-
-	btrfsic_read_from_block_data(block_ctx, &file_extent_item,
-		file_extent_item_offset,
-		offsetof(struct btrfs_file_extent_item, disk_num_bytes));
-	if (BTRFS_FILE_EXTENT_REG != file_extent_item.type ||
-	    btrfs_stack_file_extent_disk_bytenr(&file_extent_item) == 0) {
-		if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
-			pr_info("extent_data: type %u, disk_bytenr = %llu\n",
-			       file_extent_item.type,
-			       btrfs_stack_file_extent_disk_bytenr(
-			       &file_extent_item));
-		return 0;
-	}
-
-	if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
-	    block_ctx->len) {
-		pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n",
-		       block_ctx->start, block_ctx->dev->bdev);
-		return -1;
-	}
-	btrfsic_read_from_block_data(block_ctx, &file_extent_item,
-				     file_extent_item_offset,
-				     sizeof(struct btrfs_file_extent_item));
-	next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item);
-	if (btrfs_stack_file_extent_compression(&file_extent_item) ==
-	    BTRFS_COMPRESS_NONE) {
-		next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item);
-		num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item);
-	} else {
-		num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item);
-	}
-	generation = btrfs_stack_file_extent_generation(&file_extent_item);
-
-	if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
-		pr_info("extent_data: type %u, disk_bytenr = %llu, offset = %llu, num_bytes = %llu\n",
-		       file_extent_item.type,
-		       btrfs_stack_file_extent_disk_bytenr(&file_extent_item),
-		       btrfs_stack_file_extent_offset(&file_extent_item),
-		       num_bytes);
-	while (num_bytes > 0) {
-		u32 chunk_len;
-		int num_copies;
-		int mirror_num;
-
-		if (num_bytes > state->datablock_size)
-			chunk_len = state->datablock_size;
-		else
-			chunk_len = num_bytes;
-
-		num_copies = btrfs_num_copies(fs_info, next_bytenr,
-					      state->datablock_size);
-		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
-			pr_info("num_copies(log_bytenr=%llu) = %d\n",
-			       next_bytenr, num_copies);
-		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
-			struct btrfsic_block_data_ctx next_block_ctx;
-			struct btrfsic_block *next_block;
-			int block_was_created;
-
-			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-				pr_info("btrfsic_handle_extent_data(mirror_num=%d)\n",
-					mirror_num);
-			if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
-				pr_info("\tdisk_bytenr = %llu, num_bytes %u\n",
-				       next_bytenr, chunk_len);
-			ret = btrfsic_map_block(state, next_bytenr,
-						chunk_len, &next_block_ctx,
-						mirror_num);
-			if (ret) {
-				pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
-				       next_bytenr, mirror_num);
-				return -1;
-			}
-
-			next_block = btrfsic_block_lookup_or_add(
-					state,
-					&next_block_ctx,
-					"referenced ",
-					0,
-					force_iodone_flag,
-					!force_iodone_flag,
-					mirror_num,
-					&block_was_created);
-			if (NULL == next_block) {
-				btrfsic_release_block_ctx(&next_block_ctx);
-				return -1;
-			}
-			if (!block_was_created) {
-				if ((state->print_mask &
-				     BTRFSIC_PRINT_MASK_VERBOSE) &&
-				    next_block->logical_bytenr != next_bytenr &&
-				    !(!next_block->is_metadata &&
-				      0 == next_block->logical_bytenr)) {
-					pr_info(
-"referenced block @%llu (%pg/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu)\n",
-					       next_bytenr,
-					       next_block_ctx.dev->bdev,
-					       next_block_ctx.dev_bytenr,
-					       mirror_num,
-					       next_block->logical_bytenr);
-				}
-				next_block->logical_bytenr = next_bytenr;
-				next_block->mirror_num = mirror_num;
-			}
-
-			l = btrfsic_block_link_lookup_or_add(state,
-							     &next_block_ctx,
-							     next_block, block,
-							     generation);
-			btrfsic_release_block_ctx(&next_block_ctx);
-			if (NULL == l)
-				return -1;
-		}
-
-		next_bytenr += chunk_len;
-		num_bytes -= chunk_len;
-	}
-
-	return 0;
-}
-
-static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
-			     struct btrfsic_block_data_ctx *block_ctx_out,
-			     int mirror_num)
-{
-	struct btrfs_fs_info *fs_info = state->fs_info;
-	int ret;
-	u64 length;
-	struct btrfs_io_context *bioc = NULL;
-	struct btrfs_io_stripe smap, *map;
-	struct btrfs_device *device;
-
-	length = len;
-	ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, bytenr, &length, &bioc,
-			      NULL, &mirror_num, 0);
-	if (ret) {
-		block_ctx_out->start = 0;
-		block_ctx_out->dev_bytenr = 0;
-		block_ctx_out->len = 0;
-		block_ctx_out->dev = NULL;
-		block_ctx_out->datav = NULL;
-		block_ctx_out->pagev = NULL;
-		block_ctx_out->mem_to_free = NULL;
-
-		return ret;
-	}
-
-	if (bioc)
-		map = &bioc->stripes[0];
-	else
-		map = &smap;
-
-	device = map->dev;
-	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) ||
-	    !device->bdev || !device->name)
-		block_ctx_out->dev = NULL;
-	else
-		block_ctx_out->dev = btrfsic_dev_state_lookup(
-							device->bdev->bd_dev);
-	block_ctx_out->dev_bytenr = map->physical;
-	block_ctx_out->start = bytenr;
-	block_ctx_out->len = len;
-	block_ctx_out->datav = NULL;
-	block_ctx_out->pagev = NULL;
-	block_ctx_out->mem_to_free = NULL;
-
-	kfree(bioc);
-	if (NULL == block_ctx_out->dev) {
-		ret = -ENXIO;
-		pr_info("btrfsic: error, cannot lookup dev (#1)!\n");
-	}
-
-	return ret;
-}
-
-static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
-{
-	if (block_ctx->mem_to_free) {
-		unsigned int num_pages;
-
-		BUG_ON(!block_ctx->datav);
-		BUG_ON(!block_ctx->pagev);
-		num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
-			    PAGE_SHIFT;
-		/* Pages must be unmapped in reverse order */
-		while (num_pages > 0) {
-			num_pages--;
-			if (block_ctx->datav[num_pages])
-				block_ctx->datav[num_pages] = NULL;
-			if (block_ctx->pagev[num_pages]) {
-				__free_page(block_ctx->pagev[num_pages]);
-				block_ctx->pagev[num_pages] = NULL;
-			}
-		}
-
-		kfree(block_ctx->mem_to_free);
-		block_ctx->mem_to_free = NULL;
-		block_ctx->pagev = NULL;
-		block_ctx->datav = NULL;
-	}
-}
-
-static int btrfsic_read_block(struct btrfsic_state *state,
-			      struct btrfsic_block_data_ctx *block_ctx)
-{
-	unsigned int num_pages;
-	unsigned int i;
-	size_t size;
-	u64 dev_bytenr;
-	int ret;
-
-	BUG_ON(block_ctx->datav);
-	BUG_ON(block_ctx->pagev);
-	BUG_ON(block_ctx->mem_to_free);
-	if (!PAGE_ALIGNED(block_ctx->dev_bytenr)) {
-		pr_info("btrfsic: read_block() with unaligned bytenr %llu\n",
-		       block_ctx->dev_bytenr);
-		return -1;
-	}
-
-	num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
-		    PAGE_SHIFT;
-	size = sizeof(*block_ctx->datav) + sizeof(*block_ctx->pagev);
-	block_ctx->mem_to_free = kcalloc(num_pages, size, GFP_NOFS);
-	if (!block_ctx->mem_to_free)
-		return -ENOMEM;
-	block_ctx->datav = block_ctx->mem_to_free;
-	block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
-	ret = btrfs_alloc_page_array(num_pages, block_ctx->pagev);
-	if (ret)
-		return ret;
-
-	dev_bytenr = block_ctx->dev_bytenr;
-	for (i = 0; i < num_pages;) {
-		struct bio *bio;
-		unsigned int j;
-
-		bio = bio_alloc(block_ctx->dev->bdev, num_pages - i,
-				REQ_OP_READ, GFP_NOFS);
-		bio->bi_iter.bi_sector = dev_bytenr >> SECTOR_SHIFT;
-
-		for (j = i; j < num_pages; j++) {
-			ret = bio_add_page(bio, block_ctx->pagev[j],
-					   PAGE_SIZE, 0);
-			if (PAGE_SIZE != ret)
-				break;
-		}
-		if (j == i) {
-			pr_info("btrfsic: error, failed to add a single page!\n");
-			return -1;
-		}
-		if (submit_bio_wait(bio)) {
-			pr_info("btrfsic: read error at logical %llu dev %pg!\n",
-			       block_ctx->start, block_ctx->dev->bdev);
-			bio_put(bio);
-			return -1;
-		}
-		bio_put(bio);
-		dev_bytenr += (j - i) * PAGE_SIZE;
-		i = j;
-	}
-	for (i = 0; i < num_pages; i++)
-		block_ctx->datav[i] = page_address(block_ctx->pagev[i]);
-
-	return block_ctx->len;
-}
-
-static void btrfsic_dump_database(struct btrfsic_state *state)
-{
-	const struct btrfsic_block *b_all;
-
-	BUG_ON(NULL == state);
-
-	pr_info("all_blocks_list:\n");
-	list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) {
-		const struct btrfsic_block_link *l;
-
-		pr_info("%c-block @%llu (%pg/%llu/%d)\n",
-		       btrfsic_get_block_type(state, b_all),
-		       b_all->logical_bytenr, b_all->dev_state->bdev,
-		       b_all->dev_bytenr, b_all->mirror_num);
-
-		list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) {
-			pr_info(
-		" %c @%llu (%pg/%llu/%d) refers %u* to %c @%llu (%pg/%llu/%d)\n",
-			       btrfsic_get_block_type(state, b_all),
-			       b_all->logical_bytenr, b_all->dev_state->bdev,
-			       b_all->dev_bytenr, b_all->mirror_num,
-			       l->ref_cnt,
-			       btrfsic_get_block_type(state, l->block_ref_to),
-			       l->block_ref_to->logical_bytenr,
-			       l->block_ref_to->dev_state->bdev,
-			       l->block_ref_to->dev_bytenr,
-			       l->block_ref_to->mirror_num);
-		}
-
-		list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) {
-			pr_info(
-		" %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
-			       btrfsic_get_block_type(state, b_all),
-			       b_all->logical_bytenr, b_all->dev_state->bdev,
-			       b_all->dev_bytenr, b_all->mirror_num,
-			       l->ref_cnt,
-			       btrfsic_get_block_type(state, l->block_ref_from),
-			       l->block_ref_from->logical_bytenr,
-			       l->block_ref_from->dev_state->bdev,
-			       l->block_ref_from->dev_bytenr,
-			       l->block_ref_from->mirror_num);
-		}
-
-		pr_info("\n");
-	}
-}
-
-/*
- * Test whether the disk block contains a tree block (leaf or node)
- * (note that this test fails for the super block)
- */
-static noinline_for_stack int btrfsic_test_for_metadata(
-		struct btrfsic_state *state,
-		char **datav, unsigned int num_pages)
-{
-	struct btrfs_fs_info *fs_info = state->fs_info;
-	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
-	struct btrfs_header *h;
-	u8 csum[BTRFS_CSUM_SIZE];
-	unsigned int i;
-
-	if (num_pages * PAGE_SIZE < state->metablock_size)
-		return 1; /* not metadata */
-	num_pages = state->metablock_size >> PAGE_SHIFT;
-	h = (struct btrfs_header *)datav[0];
-
-	if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE))
-		return 1;
-
-	shash->tfm = fs_info->csum_shash;
-	crypto_shash_init(shash);
-
-	for (i = 0; i < num_pages; i++) {
-		u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
-		size_t sublen = i ? PAGE_SIZE :
-				    (PAGE_SIZE - BTRFS_CSUM_SIZE);
-
-		crypto_shash_update(shash, data, sublen);
-	}
-	crypto_shash_final(shash, csum);
-	if (memcmp(csum, h->csum, fs_info->csum_size))
-		return 1;
-
-	return 0; /* is metadata */
-}
-
-static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
-					  u64 dev_bytenr, char **mapped_datav,
-					  unsigned int num_pages,
-					  struct bio *bio, int *bio_is_patched,
-					  blk_opf_t submit_bio_bh_rw)
-{
-	int is_metadata;
-	struct btrfsic_block *block;
-	struct btrfsic_block_data_ctx block_ctx;
-	int ret;
-	struct btrfsic_state *state = dev_state->state;
-	struct block_device *bdev = dev_state->bdev;
-	unsigned int processed_len;
-
-	if (NULL != bio_is_patched)
-		*bio_is_patched = 0;
-
-again:
-	if (num_pages == 0)
-		return;
-
-	processed_len = 0;
-	is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav,
-						      num_pages));
-
-	block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
-					       &state->block_hashtable);
-	if (NULL != block) {
-		u64 bytenr = 0;
-		struct btrfsic_block_link *l, *tmp;
-
-		if (block->is_superblock) {
-			bytenr = btrfs_super_bytenr((struct btrfs_super_block *)
-						    mapped_datav[0]);
-			if (num_pages * PAGE_SIZE <
-			    BTRFS_SUPER_INFO_SIZE) {
-				pr_info("btrfsic: cannot work with too short bios!\n");
-				return;
-			}
-			is_metadata = 1;
-			BUG_ON(!PAGE_ALIGNED(BTRFS_SUPER_INFO_SIZE));
-			processed_len = BTRFS_SUPER_INFO_SIZE;
-			if (state->print_mask &
-			    BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
-				pr_info("[before new superblock is written]:\n");
-				btrfsic_dump_tree_sub(state, block, 0);
-			}
-		}
-		if (is_metadata) {
-			if (!block->is_superblock) {
-				if (num_pages * PAGE_SIZE <
-				    state->metablock_size) {
-					pr_info("btrfsic: cannot work with too short bios!\n");
-					return;
-				}
-				processed_len = state->metablock_size;
-				bytenr = btrfs_stack_header_bytenr(
-						(struct btrfs_header *)
-						mapped_datav[0]);
-				btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
-							       dev_state,
-							       dev_bytenr);
-			}
-			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
-				if (block->logical_bytenr != bytenr &&
-				    !(!block->is_metadata &&
-				      block->logical_bytenr == 0))
-					pr_info(
-"written block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
-					       bytenr, dev_state->bdev,
-					       dev_bytenr,
-					       block->mirror_num,
-					       btrfsic_get_block_type(state,
-								      block),
-					       block->logical_bytenr);
-				else
-					pr_info(
-		"written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
-					       bytenr, dev_state->bdev,
-					       dev_bytenr, block->mirror_num,
-					       btrfsic_get_block_type(state,
-								      block));
-			}
-			block->logical_bytenr = bytenr;
-		} else {
-			if (num_pages * PAGE_SIZE <
-			    state->datablock_size) {
-				pr_info("btrfsic: cannot work with too short bios!\n");
-				return;
-			}
-			processed_len = state->datablock_size;
-			bytenr = block->logical_bytenr;
-			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-				pr_info(
-		"written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
-				       bytenr, dev_state->bdev, dev_bytenr,
-				       block->mirror_num,
-				       btrfsic_get_block_type(state, block));
-		}
-
-		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-			pr_info("ref_to_list: %cE, ref_from_list: %cE\n",
-			       list_empty(&block->ref_to_list) ? ' ' : '!',
-			       list_empty(&block->ref_from_list) ? ' ' : '!');
-		if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
-			pr_info(
-"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n",
-			       btrfsic_get_block_type(state, block), bytenr,
-			       dev_state->bdev, dev_bytenr, block->mirror_num,
-			       block->generation,
-			       btrfs_disk_key_objectid(&block->disk_key),
-			       block->disk_key.type,
-			       btrfs_disk_key_offset(&block->disk_key),
-			       btrfs_stack_header_generation(
-				       (struct btrfs_header *) mapped_datav[0]),
-			       state->max_superblock_generation);
-			btrfsic_dump_tree(state);
-		}
-
-		if (!block->is_iodone && !block->never_written) {
-			pr_info(
-"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n",
-			       btrfsic_get_block_type(state, block), bytenr,
-			       dev_state->bdev, dev_bytenr, block->mirror_num,
-			       block->generation,
-			       btrfs_stack_header_generation(
-				       (struct btrfs_header *)
-				       mapped_datav[0]));
-			/* it would not be safe to go on */
-			btrfsic_dump_tree(state);
-			goto continue_loop;
-		}
-
-		/*
-		 * Clear all references of this block. Do not free
-		 * the block itself even if is not referenced anymore
-		 * because it still carries valuable information
-		 * like whether it was ever written and IO completed.
-		 */
-		list_for_each_entry_safe(l, tmp, &block->ref_to_list,
-					 node_ref_to) {
-			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-				btrfsic_print_rem_link(state, l);
-			l->ref_cnt--;
-			if (0 == l->ref_cnt) {
-				list_del(&l->node_ref_to);
-				list_del(&l->node_ref_from);
-				btrfsic_block_link_hashtable_remove(l);
-				btrfsic_block_link_free(l);
-			}
-		}
-
-		block_ctx.dev = dev_state;
-		block_ctx.dev_bytenr = dev_bytenr;
-		block_ctx.start = bytenr;
-		block_ctx.len = processed_len;
-		block_ctx.pagev = NULL;
-		block_ctx.mem_to_free = NULL;
-		block_ctx.datav = mapped_datav;
-
-		if (is_metadata || state->include_extent_data) {
-			block->never_written = 0;
-			block->iodone_w_error = 0;
-			if (NULL != bio) {
-				block->is_iodone = 0;
-				BUG_ON(NULL == bio_is_patched);
-				if (!*bio_is_patched) {
-					block->orig_bio_private =
-					    bio->bi_private;
-					block->orig_bio_end_io =
-					    bio->bi_end_io;
-					block->next_in_same_bio = NULL;
-					bio->bi_private = block;
-					bio->bi_end_io = btrfsic_bio_end_io;
-					*bio_is_patched = 1;
-				} else {
-					struct btrfsic_block *chained_block =
-					    (struct btrfsic_block *)
-					    bio->bi_private;
-
-					BUG_ON(NULL == chained_block);
-					block->orig_bio_private =
-					    chained_block->orig_bio_private;
-					block->orig_bio_end_io =
-					    chained_block->orig_bio_end_io;
-					block->next_in_same_bio = chained_block;
-					bio->bi_private = block;
-				}
-			} else {
-				block->is_iodone = 1;
-				block->orig_bio_private = NULL;
-				block->orig_bio_end_io = NULL;
-				block->next_in_same_bio = NULL;
-			}
-		}
-
-		block->flush_gen = dev_state->last_flush_gen + 1;
-		block->submit_bio_bh_rw = submit_bio_bh_rw;
-		if (is_metadata) {
-			block->logical_bytenr = bytenr;
-			block->is_metadata = 1;
-			if (block->is_superblock) {
-				BUG_ON(PAGE_SIZE !=
-				       BTRFS_SUPER_INFO_SIZE);
-				ret = btrfsic_process_written_superblock(
-						state,
-						block,
-						(struct btrfs_super_block *)
-						mapped_datav[0]);
-				if (state->print_mask &
-				    BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
-					pr_info("[after new superblock is written]:\n");
-					btrfsic_dump_tree_sub(state, block, 0);
-				}
-			} else {
-				block->mirror_num = 0;	/* unknown */
-				ret = btrfsic_process_metablock(
-						state,
-						block,
-						&block_ctx,
-						0, 0);
-			}
-			if (ret)
-				pr_info("btrfsic: btrfsic_process_metablock(root @%llu) failed!\n",
-				       dev_bytenr);
-		} else {
-			block->is_metadata = 0;
-			block->mirror_num = 0;	/* unknown */
-			block->generation = BTRFSIC_GENERATION_UNKNOWN;
-			if (!state->include_extent_data
-			    && list_empty(&block->ref_from_list)) {
-				/*
-				 * disk block is overwritten with extent
-				 * data (not meta data) and we are configured
-				 * to not include extent data: take the
-				 * chance and free the block's memory
-				 */
-				btrfsic_block_hashtable_remove(block);
-				list_del(&block->all_blocks_node);
-				btrfsic_block_free(block);
-			}
-		}
-		btrfsic_release_block_ctx(&block_ctx);
-	} else {
-		/* block has not been found in hash table */
-		u64 bytenr;
-
-		if (!is_metadata) {
-			processed_len = state->datablock_size;
-			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-				pr_info(
-			"written block (%pg/%llu/?) !found in hash table, D\n",
-				       dev_state->bdev, dev_bytenr);
-			if (!state->include_extent_data) {
-				/* ignore that written D block */
-				goto continue_loop;
-			}
-
-			/* this is getting ugly for the
-			 * include_extent_data case... */
-			bytenr = 0;	/* unknown */
-		} else {
-			processed_len = state->metablock_size;
-			bytenr = btrfs_stack_header_bytenr(
-					(struct btrfs_header *)
-					mapped_datav[0]);
-			btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
-						       dev_bytenr);
-			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-				pr_info(
-			"written block @%llu (%pg/%llu/?) !found in hash table, M\n",
-				       bytenr, dev_state->bdev, dev_bytenr);
-		}
-
-		block_ctx.dev = dev_state;
-		block_ctx.dev_bytenr = dev_bytenr;
-		block_ctx.start = bytenr;
-		block_ctx.len = processed_len;
-		block_ctx.pagev = NULL;
-		block_ctx.mem_to_free = NULL;
-		block_ctx.datav = mapped_datav;
-
-		block = btrfsic_block_alloc();
-		if (NULL == block) {
-			btrfsic_release_block_ctx(&block_ctx);
-			goto continue_loop;
-		}
-		block->dev_state = dev_state;
-		block->dev_bytenr = dev_bytenr;
-		block->logical_bytenr = bytenr;
-		block->is_metadata = is_metadata;
-		block->never_written = 0;
-		block->iodone_w_error = 0;
-		block->mirror_num = 0;	/* unknown */
-		block->flush_gen = dev_state->last_flush_gen + 1;
-		block->submit_bio_bh_rw = submit_bio_bh_rw;
-		if (NULL != bio) {
-			block->is_iodone = 0;
-			BUG_ON(NULL == bio_is_patched);
-			if (!*bio_is_patched) {
-				block->orig_bio_private = bio->bi_private;
-				block->orig_bio_end_io = bio->bi_end_io;
-				block->next_in_same_bio = NULL;
-				bio->bi_private = block;
-				bio->bi_end_io = btrfsic_bio_end_io;
-				*bio_is_patched = 1;
-			} else {
-				struct btrfsic_block *chained_block =
-				    (struct btrfsic_block *)
-				    bio->bi_private;
-
-				BUG_ON(NULL == chained_block);
-				block->orig_bio_private =
-				    chained_block->orig_bio_private;
-				block->orig_bio_end_io =
-				    chained_block->orig_bio_end_io;
-				block->next_in_same_bio = chained_block;
-				bio->bi_private = block;
-			}
-		} else {
-			block->is_iodone = 1;
-			block->orig_bio_private = NULL;
-			block->orig_bio_end_io = NULL;
-			block->next_in_same_bio = NULL;
-		}
-		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-			pr_info("new written %c-block @%llu (%pg/%llu/%d)\n",
-			       is_metadata ? 'M' : 'D',
-			       block->logical_bytenr, block->dev_state->bdev,
-			       block->dev_bytenr, block->mirror_num);
-		list_add(&block->all_blocks_node, &state->all_blocks_list);
-		btrfsic_block_hashtable_add(block, &state->block_hashtable);
-
-		if (is_metadata) {
-			ret = btrfsic_process_metablock(state, block,
-							&block_ctx, 0, 0);
-			if (ret)
-				pr_info("btrfsic: process_metablock(root @%llu) failed!\n",
-				       dev_bytenr);
-		}
-		btrfsic_release_block_ctx(&block_ctx);
-	}
-
-continue_loop:
-	BUG_ON(!processed_len);
-	dev_bytenr += processed_len;
-	mapped_datav += processed_len >> PAGE_SHIFT;
-	num_pages -= processed_len >> PAGE_SHIFT;
-	goto again;
-}
-
-static void btrfsic_bio_end_io(struct bio *bp)
-{
-	struct btrfsic_block *block = bp->bi_private;
-	int iodone_w_error;
-
-	/* mutex is not held! This is not save if IO is not yet completed
-	 * on umount */
-	iodone_w_error = 0;
-	if (bp->bi_status)
-		iodone_w_error = 1;
-
-	BUG_ON(NULL == block);
-	bp->bi_private = block->orig_bio_private;
-	bp->bi_end_io = block->orig_bio_end_io;
-
-	do {
-		struct btrfsic_block *next_block;
-		struct btrfsic_dev_state *const dev_state = block->dev_state;
-
-		if ((dev_state->state->print_mask &
-		     BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
-			pr_info("bio_end_io(err=%d) for %c @%llu (%pg/%llu/%d)\n",
-			       bp->bi_status,
-			       btrfsic_get_block_type(dev_state->state, block),
-			       block->logical_bytenr, dev_state->bdev,
-			       block->dev_bytenr, block->mirror_num);
-		next_block = block->next_in_same_bio;
-		block->iodone_w_error = iodone_w_error;
-		if (block->submit_bio_bh_rw & REQ_PREFLUSH) {
-			dev_state->last_flush_gen++;
-			if ((dev_state->state->print_mask &
-			     BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
-				pr_info("bio_end_io() new %pg flush_gen=%llu\n",
-				       dev_state->bdev,
-				       dev_state->last_flush_gen);
-		}
-		if (block->submit_bio_bh_rw & REQ_FUA)
-			block->flush_gen = 0; /* FUA completed means block is
-					       * on disk */
-		block->is_iodone = 1; /* for FLUSH, this releases the block */
-		block = next_block;
-	} while (NULL != block);
-
-	bp->bi_end_io(bp);
-}
-
-static int btrfsic_process_written_superblock(
-		struct btrfsic_state *state,
-		struct btrfsic_block *const superblock,
-		struct btrfs_super_block *const super_hdr)
-{
-	struct btrfs_fs_info *fs_info = state->fs_info;
-	int pass;
-
-	superblock->generation = btrfs_super_generation(super_hdr);
-	if (!(superblock->generation > state->max_superblock_generation ||
-	      0 == state->max_superblock_generation)) {
-		if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
-			pr_info(
-	"btrfsic: superblock @%llu (%pg/%llu/%d) with old gen %llu <= %llu\n",
-			       superblock->logical_bytenr,
-			       superblock->dev_state->bdev,
-			       superblock->dev_bytenr, superblock->mirror_num,
-			       btrfs_super_generation(super_hdr),
-			       state->max_superblock_generation);
-	} else {
-		if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
-			pr_info(
-	"btrfsic: got new superblock @%llu (%pg/%llu/%d) with new gen %llu > %llu\n",
-			       superblock->logical_bytenr,
-			       superblock->dev_state->bdev,
-			       superblock->dev_bytenr, superblock->mirror_num,
-			       btrfs_super_generation(super_hdr),
-			       state->max_superblock_generation);
-
-		state->max_superblock_generation =
-		    btrfs_super_generation(super_hdr);
-		state->latest_superblock = superblock;
-	}
-
-	for (pass = 0; pass < 3; pass++) {
-		int ret;
-		u64 next_bytenr;
-		struct btrfsic_block *next_block;
-		struct btrfsic_block_data_ctx tmp_next_block_ctx;
-		struct btrfsic_block_link *l;
-		int num_copies;
-		int mirror_num;
-		const char *additional_string = NULL;
-		struct btrfs_disk_key tmp_disk_key = {0};
-
-		btrfs_set_disk_key_objectid(&tmp_disk_key,
-					    BTRFS_ROOT_ITEM_KEY);
-		btrfs_set_disk_key_objectid(&tmp_disk_key, 0);
-
-		switch (pass) {
-		case 0:
-			btrfs_set_disk_key_objectid(&tmp_disk_key,
-						    BTRFS_ROOT_TREE_OBJECTID);
-			additional_string = "root ";
-			next_bytenr = btrfs_super_root(super_hdr);
-			if (state->print_mask &
-			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
-				pr_info("root@%llu\n", next_bytenr);
-			break;
-		case 1:
-			btrfs_set_disk_key_objectid(&tmp_disk_key,
-						    BTRFS_CHUNK_TREE_OBJECTID);
-			additional_string = "chunk ";
-			next_bytenr = btrfs_super_chunk_root(super_hdr);
-			if (state->print_mask &
-			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
-				pr_info("chunk@%llu\n", next_bytenr);
-			break;
-		case 2:
-			btrfs_set_disk_key_objectid(&tmp_disk_key,
-						    BTRFS_TREE_LOG_OBJECTID);
-			additional_string = "log ";
-			next_bytenr = btrfs_super_log_root(super_hdr);
-			if (0 == next_bytenr)
-				continue;
-			if (state->print_mask &
-			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
-				pr_info("log@%llu\n", next_bytenr);
-			break;
-		}
-
-		num_copies = btrfs_num_copies(fs_info, next_bytenr,
-					      BTRFS_SUPER_INFO_SIZE);
-		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
-			pr_info("num_copies(log_bytenr=%llu) = %d\n",
-			       next_bytenr, num_copies);
-		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
-			int was_created;
-
-			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-				pr_info("btrfsic_process_written_superblock(mirror_num=%d)\n", mirror_num);
-			ret = btrfsic_map_block(state, next_bytenr,
-						BTRFS_SUPER_INFO_SIZE,
-						&tmp_next_block_ctx,
-						mirror_num);
-			if (ret) {
-				pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
-				       next_bytenr, mirror_num);
-				return -1;
-			}
-
-			next_block = btrfsic_block_lookup_or_add(
-					state,
-					&tmp_next_block_ctx,
-					additional_string,
-					1, 0, 1,
-					mirror_num,
-					&was_created);
-			if (NULL == next_block) {
-				btrfsic_release_block_ctx(&tmp_next_block_ctx);
-				return -1;
-			}
-
-			next_block->disk_key = tmp_disk_key;
-			if (was_created)
-				next_block->generation =
-				    BTRFSIC_GENERATION_UNKNOWN;
-			l = btrfsic_block_link_lookup_or_add(
-					state,
-					&tmp_next_block_ctx,
-					next_block,
-					superblock,
-					BTRFSIC_GENERATION_UNKNOWN);
-			btrfsic_release_block_ctx(&tmp_next_block_ctx);
-			if (NULL == l)
-				return -1;
-		}
-	}
-
-	if (WARN_ON(-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)))
-		btrfsic_dump_tree(state);
-
-	return 0;
-}
-
-static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
-					struct btrfsic_block *const block,
-					int recursion_level)
-{
-	const struct btrfsic_block_link *l;
-	int ret = 0;
-
-	if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
-		/*
-		 * Note that this situation can happen and does not
-		 * indicate an error in regular cases. It happens
-		 * when disk blocks are freed and later reused.
-		 * The check-integrity module is not aware of any
-		 * block free operations, it just recognizes block
-		 * write operations. Therefore it keeps the linkage
-		 * information for a block until a block is
-		 * rewritten. This can temporarily cause incorrect
-		 * and even circular linkage information. This
-		 * causes no harm unless such blocks are referenced
-		 * by the most recent super block.
-		 */
-		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-			pr_info("btrfsic: abort cyclic linkage (case 1).\n");
-
-		return ret;
-	}
-
-	/*
-	 * This algorithm is recursive because the amount of used stack
-	 * space is very small and the max recursion depth is limited.
-	 */
-	list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
-		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-			pr_info(
-		"rl=%d, %c @%llu (%pg/%llu/%d) %u* refers to %c @%llu (%pg/%llu/%d)\n",
-			       recursion_level,
-			       btrfsic_get_block_type(state, block),
-			       block->logical_bytenr, block->dev_state->bdev,
-			       block->dev_bytenr, block->mirror_num,
-			       l->ref_cnt,
-			       btrfsic_get_block_type(state, l->block_ref_to),
-			       l->block_ref_to->logical_bytenr,
-			       l->block_ref_to->dev_state->bdev,
-			       l->block_ref_to->dev_bytenr,
-			       l->block_ref_to->mirror_num);
-		if (l->block_ref_to->never_written) {
-			pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is never written!\n",
-			       btrfsic_get_block_type(state, l->block_ref_to),
-			       l->block_ref_to->logical_bytenr,
-			       l->block_ref_to->dev_state->bdev,
-			       l->block_ref_to->dev_bytenr,
-			       l->block_ref_to->mirror_num);
-			ret = -1;
-		} else if (!l->block_ref_to->is_iodone) {
-			pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not yet iodone!\n",
-			       btrfsic_get_block_type(state, l->block_ref_to),
-			       l->block_ref_to->logical_bytenr,
-			       l->block_ref_to->dev_state->bdev,
-			       l->block_ref_to->dev_bytenr,
-			       l->block_ref_to->mirror_num);
-			ret = -1;
-		} else if (l->block_ref_to->iodone_w_error) {
-			pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which has write error!\n",
-			       btrfsic_get_block_type(state, l->block_ref_to),
-			       l->block_ref_to->logical_bytenr,
-			       l->block_ref_to->dev_state->bdev,
-			       l->block_ref_to->dev_bytenr,
-			       l->block_ref_to->mirror_num);
-			ret = -1;
-		} else if (l->parent_generation !=
-			   l->block_ref_to->generation &&
-			   BTRFSIC_GENERATION_UNKNOWN !=
-			   l->parent_generation &&
-			   BTRFSIC_GENERATION_UNKNOWN !=
-			   l->block_ref_to->generation) {
-			pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) with generation %llu != parent generation %llu!\n",
-			       btrfsic_get_block_type(state, l->block_ref_to),
-			       l->block_ref_to->logical_bytenr,
-			       l->block_ref_to->dev_state->bdev,
-			       l->block_ref_to->dev_bytenr,
-			       l->block_ref_to->mirror_num,
-			       l->block_ref_to->generation,
-			       l->parent_generation);
-			ret = -1;
-		} else if (l->block_ref_to->flush_gen >
-			   l->block_ref_to->dev_state->last_flush_gen) {
-			pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n",
-			       btrfsic_get_block_type(state, l->block_ref_to),
-			       l->block_ref_to->logical_bytenr,
-			       l->block_ref_to->dev_state->bdev,
-			       l->block_ref_to->dev_bytenr,
-			       l->block_ref_to->mirror_num, block->flush_gen,
-			       l->block_ref_to->dev_state->last_flush_gen);
-			ret = -1;
-		} else if (-1 == btrfsic_check_all_ref_blocks(state,
-							      l->block_ref_to,
-							      recursion_level +
-							      1)) {
-			ret = -1;
-		}
-	}
-
-	return ret;
-}
-
-static int btrfsic_is_block_ref_by_superblock(
-		const struct btrfsic_state *state,
-		const struct btrfsic_block *block,
-		int recursion_level)
-{
-	const struct btrfsic_block_link *l;
-
-	if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
-		/* refer to comment at "abort cyclic linkage (case 1)" */
-		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-			pr_info("btrfsic: abort cyclic linkage (case 2).\n");
-
-		return 0;
-	}
-
-	/*
-	 * This algorithm is recursive because the amount of used stack space
-	 * is very small and the max recursion depth is limited.
-	 */
-	list_for_each_entry(l, &block->ref_from_list, node_ref_from) {
-		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-			pr_info(
-	"rl=%d, %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
-			       recursion_level,
-			       btrfsic_get_block_type(state, block),
-			       block->logical_bytenr, block->dev_state->bdev,
-			       block->dev_bytenr, block->mirror_num,
-			       l->ref_cnt,
-			       btrfsic_get_block_type(state, l->block_ref_from),
-			       l->block_ref_from->logical_bytenr,
-			       l->block_ref_from->dev_state->bdev,
-			       l->block_ref_from->dev_bytenr,
-			       l->block_ref_from->mirror_num);
-		if (l->block_ref_from->is_superblock &&
-		    state->latest_superblock->dev_bytenr ==
-		    l->block_ref_from->dev_bytenr &&
-		    state->latest_superblock->dev_state->bdev ==
-		    l->block_ref_from->dev_state->bdev)
-			return 1;
-		else if (btrfsic_is_block_ref_by_superblock(state,
-							    l->block_ref_from,
-							    recursion_level +
-							    1))
-			return 1;
-	}
-
-	return 0;
-}
-
-static void btrfsic_print_add_link(const struct btrfsic_state *state,
-				   const struct btrfsic_block_link *l)
-{
-	pr_info("add %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
-	       l->ref_cnt,
-	       btrfsic_get_block_type(state, l->block_ref_from),
-	       l->block_ref_from->logical_bytenr,
-	       l->block_ref_from->dev_state->bdev,
-	       l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
-	       btrfsic_get_block_type(state, l->block_ref_to),
-	       l->block_ref_to->logical_bytenr,
-	       l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
-	       l->block_ref_to->mirror_num);
-}
-
-static void btrfsic_print_rem_link(const struct btrfsic_state *state,
-				   const struct btrfsic_block_link *l)
-{
-	pr_info("rem %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
-	       l->ref_cnt,
-	       btrfsic_get_block_type(state, l->block_ref_from),
-	       l->block_ref_from->logical_bytenr,
-	       l->block_ref_from->dev_state->bdev,
-	       l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
-	       btrfsic_get_block_type(state, l->block_ref_to),
-	       l->block_ref_to->logical_bytenr,
-	       l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
-	       l->block_ref_to->mirror_num);
-}
-
-static char btrfsic_get_block_type(const struct btrfsic_state *state,
-				   const struct btrfsic_block *block)
-{
-	if (block->is_superblock &&
-	    state->latest_superblock->dev_bytenr == block->dev_bytenr &&
-	    state->latest_superblock->dev_state->bdev == block->dev_state->bdev)
-		return 'S';
-	else if (block->is_superblock)
-		return 's';
-	else if (block->is_metadata)
-		return 'M';
-	else
-		return 'D';
-}
-
-static void btrfsic_dump_tree(const struct btrfsic_state *state)
-{
-	btrfsic_dump_tree_sub(state, state->latest_superblock, 0);
-}
-
-static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
-				  const struct btrfsic_block *block,
-				  int indent_level)
-{
-	const struct btrfsic_block_link *l;
-	int indent_add;
-	static char buf[80];
-	int cursor_position;
-
-	/*
-	 * Should better fill an on-stack buffer with a complete line and
-	 * dump it at once when it is time to print a newline character.
-	 */
-
-	/*
-	 * This algorithm is recursive because the amount of used stack space
-	 * is very small and the max recursion depth is limited.
-	 */
-	indent_add = sprintf(buf, "%c-%llu(%pg/%llu/%u)",
-			     btrfsic_get_block_type(state, block),
-			     block->logical_bytenr, block->dev_state->bdev,
-			     block->dev_bytenr, block->mirror_num);
-	if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
-		printk("[...]\n");
-		return;
-	}
-	printk(buf);
-	indent_level += indent_add;
-	if (list_empty(&block->ref_to_list)) {
-		printk("\n");
-		return;
-	}
-	if (block->mirror_num > 1 &&
-	    !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) {
-		printk(" [...]\n");
-		return;
-	}
-
-	cursor_position = indent_level;
-	list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
-		while (cursor_position < indent_level) {
-			printk(" ");
-			cursor_position++;
-		}
-		if (l->ref_cnt > 1)
-			indent_add = sprintf(buf, " %d*--> ", l->ref_cnt);
-		else
-			indent_add = sprintf(buf, " --> ");
-		if (indent_level + indent_add >
-		    BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
-			printk("[...]\n");
-			cursor_position = 0;
-			continue;
-		}
-
-		printk(buf);
-
-		btrfsic_dump_tree_sub(state, l->block_ref_to,
-				      indent_level + indent_add);
-		cursor_position = 0;
-	}
-}
-
-static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
-		struct btrfsic_state *state,
-		struct btrfsic_block_data_ctx *next_block_ctx,
-		struct btrfsic_block *next_block,
-		struct btrfsic_block *from_block,
-		u64 parent_generation)
-{
-	struct btrfsic_block_link *l;
-
-	l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev,
-						next_block_ctx->dev_bytenr,
-						from_block->dev_state->bdev,
-						from_block->dev_bytenr,
-						&state->block_link_hashtable);
-	if (NULL == l) {
-		l = btrfsic_block_link_alloc();
-		if (!l)
-			return NULL;
-
-		l->block_ref_to = next_block;
-		l->block_ref_from = from_block;
-		l->ref_cnt = 1;
-		l->parent_generation = parent_generation;
-
-		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-			btrfsic_print_add_link(state, l);
-
-		list_add(&l->node_ref_to, &from_block->ref_to_list);
-		list_add(&l->node_ref_from, &next_block->ref_from_list);
-
-		btrfsic_block_link_hashtable_add(l,
-						 &state->block_link_hashtable);
-	} else {
-		l->ref_cnt++;
-		l->parent_generation = parent_generation;
-		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-			btrfsic_print_add_link(state, l);
-	}
-
-	return l;
-}
-
-static struct btrfsic_block *btrfsic_block_lookup_or_add(
-		struct btrfsic_state *state,
-		struct btrfsic_block_data_ctx *block_ctx,
-		const char *additional_string,
-		int is_metadata,
-		int is_iodone,
-		int never_written,
-		int mirror_num,
-		int *was_created)
-{
-	struct btrfsic_block *block;
-
-	block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev,
-					       block_ctx->dev_bytenr,
-					       &state->block_hashtable);
-	if (NULL == block) {
-		struct btrfsic_dev_state *dev_state;
-
-		block = btrfsic_block_alloc();
-		if (!block)
-			return NULL;
-
-		dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev->bd_dev);
-		if (NULL == dev_state) {
-			pr_info("btrfsic: error, lookup dev_state failed!\n");
-			btrfsic_block_free(block);
-			return NULL;
-		}
-		block->dev_state = dev_state;
-		block->dev_bytenr = block_ctx->dev_bytenr;
-		block->logical_bytenr = block_ctx->start;
-		block->is_metadata = is_metadata;
-		block->is_iodone = is_iodone;
-		block->never_written = never_written;
-		block->mirror_num = mirror_num;
-		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-			pr_info("New %s%c-block @%llu (%pg/%llu/%d)\n",
-			       additional_string,
-			       btrfsic_get_block_type(state, block),
-			       block->logical_bytenr, dev_state->bdev,
-			       block->dev_bytenr, mirror_num);
-		list_add(&block->all_blocks_node, &state->all_blocks_list);
-		btrfsic_block_hashtable_add(block, &state->block_hashtable);
-		if (NULL != was_created)
-			*was_created = 1;
-	} else {
-		if (NULL != was_created)
-			*was_created = 0;
-	}
-
-	return block;
-}
-
-static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
-					   u64 bytenr,
-					   struct btrfsic_dev_state *dev_state,
-					   u64 dev_bytenr)
-{
-	struct btrfs_fs_info *fs_info = state->fs_info;
-	struct btrfsic_block_data_ctx block_ctx;
-	int num_copies;
-	int mirror_num;
-	int match = 0;
-	int ret;
-
-	num_copies = btrfs_num_copies(fs_info, bytenr, state->metablock_size);
-
-	for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
-		ret = btrfsic_map_block(state, bytenr, state->metablock_size,
-					&block_ctx, mirror_num);
-		if (ret) {
-			pr_info("btrfsic: btrfsic_map_block(logical @%llu, mirror %d) failed!\n",
-			       bytenr, mirror_num);
-			continue;
-		}
-
-		if (dev_state->bdev == block_ctx.dev->bdev &&
-		    dev_bytenr == block_ctx.dev_bytenr) {
-			match++;
-			btrfsic_release_block_ctx(&block_ctx);
-			break;
-		}
-		btrfsic_release_block_ctx(&block_ctx);
-	}
-
-	if (WARN_ON(!match)) {
-		pr_info(
-"btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%pg, phys_bytenr=%llu)!\n",
-		       bytenr, dev_state->bdev, dev_bytenr);
-		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
-			ret = btrfsic_map_block(state, bytenr,
-						state->metablock_size,
-						&block_ctx, mirror_num);
-			if (ret)
-				continue;
-
-			pr_info("read logical bytenr @%llu maps to (%pg/%llu/%d)\n",
-			       bytenr, block_ctx.dev->bdev,
-			       block_ctx.dev_bytenr, mirror_num);
-		}
-	}
-}
-
-static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev)
-{
-	return btrfsic_dev_state_hashtable_lookup(dev,
-						  &btrfsic_dev_state_hashtable);
-}
-
-static void btrfsic_check_write_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
-{
-	unsigned int segs = bio_segments(bio);
-	u64 dev_bytenr = 512 * bio->bi_iter.bi_sector;
-	u64 cur_bytenr = dev_bytenr;
-	struct bvec_iter iter;
-	struct bio_vec bvec;
-	char **mapped_datav;
-	int bio_is_patched = 0;
-	int i = 0;
-
-	if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
-		pr_info(
-"submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
-		       bio_op(bio), bio->bi_opf, segs,
-		       bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev);
-
-	mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS);
-	if (!mapped_datav)
-		return;
-
-	bio_for_each_segment(bvec, bio, iter) {
-		BUG_ON(bvec.bv_len != PAGE_SIZE);
-		mapped_datav[i] = page_address(bvec.bv_page);
-		i++;
-
-		if (dev_state->state->print_mask &
-		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
-			pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
-			       i, cur_bytenr, bvec.bv_len, bvec.bv_offset);
-		cur_bytenr += bvec.bv_len;
-	}
-
-	btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs,
-				      bio, &bio_is_patched, bio->bi_opf);
-	kfree(mapped_datav);
-}
-
-static void btrfsic_check_flush_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
-{
-	if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
-		pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
-		       bio_op(bio), bio->bi_opf, bio->bi_bdev);
-
-	if (dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
-		struct btrfsic_block *const block =
-			&dev_state->dummy_block_for_bio_bh_flush;
-
-		block->is_iodone = 0;
-		block->never_written = 0;
-		block->iodone_w_error = 0;
-		block->flush_gen = dev_state->last_flush_gen + 1;
-		block->submit_bio_bh_rw = bio->bi_opf;
-		block->orig_bio_private = bio->bi_private;
-		block->orig_bio_end_io = bio->bi_end_io;
-		block->next_in_same_bio = NULL;
-		bio->bi_private = block;
-		bio->bi_end_io = btrfsic_bio_end_io;
-	} else if ((dev_state->state->print_mask &
-		   (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
-		    BTRFSIC_PRINT_MASK_VERBOSE))) {
-		pr_info(
-"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
-		       dev_state->bdev);
-	}
-}
-
-void btrfsic_check_bio(struct bio *bio)
-{
-	struct btrfsic_dev_state *dev_state;
-
-	if (!btrfsic_is_initialized)
-		return;
-
-	/*
-	 * We can be called before btrfsic_mount, so there might not be a
-	 * dev_state.
-	 */
-	dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
-	mutex_lock(&btrfsic_mutex);
-	if (dev_state) {
-		if (bio_op(bio) == REQ_OP_WRITE && bio_has_data(bio))
-			btrfsic_check_write_bio(bio, dev_state);
-		else if (bio->bi_opf & REQ_PREFLUSH)
-			btrfsic_check_flush_bio(bio, dev_state);
-	}
-	mutex_unlock(&btrfsic_mutex);
-}
-
-int btrfsic_mount(struct btrfs_fs_info *fs_info,
-		  struct btrfs_fs_devices *fs_devices,
-		  int including_extent_data, u32 print_mask)
-{
-	int ret;
-	struct btrfsic_state *state;
-	struct list_head *dev_head = &fs_devices->devices;
-	struct btrfs_device *device;
-
-	if (!PAGE_ALIGNED(fs_info->nodesize)) {
-		pr_info("btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n",
-		       fs_info->nodesize, PAGE_SIZE);
-		return -1;
-	}
-	if (!PAGE_ALIGNED(fs_info->sectorsize)) {
-		pr_info("btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n",
-		       fs_info->sectorsize, PAGE_SIZE);
-		return -1;
-	}
-	state = kvzalloc(sizeof(*state), GFP_KERNEL);
-	if (!state)
-		return -ENOMEM;
-
-	if (!btrfsic_is_initialized) {
-		mutex_init(&btrfsic_mutex);
-		btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable);
-		btrfsic_is_initialized = 1;
-	}
-	mutex_lock(&btrfsic_mutex);
-	state->fs_info = fs_info;
-	state->print_mask = print_mask;
-	state->include_extent_data = including_extent_data;
-	state->metablock_size = fs_info->nodesize;
-	state->datablock_size = fs_info->sectorsize;
-	INIT_LIST_HEAD(&state->all_blocks_list);
-	btrfsic_block_hashtable_init(&state->block_hashtable);
-	btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
-	state->max_superblock_generation = 0;
-	state->latest_superblock = NULL;
-
-	list_for_each_entry(device, dev_head, dev_list) {
-		struct btrfsic_dev_state *ds;
-
-		if (!device->bdev || !device->name)
-			continue;
-
-		ds = btrfsic_dev_state_alloc();
-		if (NULL == ds) {
-			mutex_unlock(&btrfsic_mutex);
-			return -ENOMEM;
-		}
-		ds->bdev = device->bdev;
-		ds->state = state;
-		btrfsic_dev_state_hashtable_add(ds,
-						&btrfsic_dev_state_hashtable);
-	}
-
-	ret = btrfsic_process_superblock(state, fs_devices);
-	if (0 != ret) {
-		mutex_unlock(&btrfsic_mutex);
-		btrfsic_unmount(fs_devices);
-		return ret;
-	}
-
-	if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE)
-		btrfsic_dump_database(state);
-	if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE)
-		btrfsic_dump_tree(state);
-
-	mutex_unlock(&btrfsic_mutex);
-	return 0;
-}
-
-void btrfsic_unmount(struct btrfs_fs_devices *fs_devices)
-{
-	struct btrfsic_block *b_all, *tmp_all;
-	struct btrfsic_state *state;
-	struct list_head *dev_head = &fs_devices->devices;
-	struct btrfs_device *device;
-
-	if (!btrfsic_is_initialized)
-		return;
-
-	mutex_lock(&btrfsic_mutex);
-
-	state = NULL;
-	list_for_each_entry(device, dev_head, dev_list) {
-		struct btrfsic_dev_state *ds;
-
-		if (!device->bdev || !device->name)
-			continue;
-
-		ds = btrfsic_dev_state_hashtable_lookup(
-				device->bdev->bd_dev,
-				&btrfsic_dev_state_hashtable);
-		if (NULL != ds) {
-			state = ds->state;
-			btrfsic_dev_state_hashtable_remove(ds);
-			btrfsic_dev_state_free(ds);
-		}
-	}
-
-	if (NULL == state) {
-		pr_info("btrfsic: error, cannot find state information on umount!\n");
-		mutex_unlock(&btrfsic_mutex);
-		return;
-	}
-
-	/*
-	 * Don't care about keeping the lists' state up to date,
-	 * just free all memory that was allocated dynamically.
-	 * Free the blocks and the block_links.
-	 */
-	list_for_each_entry_safe(b_all, tmp_all, &state->all_blocks_list,
-				 all_blocks_node) {
-		struct btrfsic_block_link *l, *tmp;
-
-		list_for_each_entry_safe(l, tmp, &b_all->ref_to_list,
-					 node_ref_to) {
-			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-				btrfsic_print_rem_link(state, l);
-
-			l->ref_cnt--;
-			if (0 == l->ref_cnt)
-				btrfsic_block_link_free(l);
-		}
-
-		if (b_all->is_iodone || b_all->never_written)
-			btrfsic_block_free(b_all);
-		else
-			pr_info(
-"btrfs: attempt to free %c-block @%llu (%pg/%llu/%d) on umount which is not yet iodone!\n",
-			       btrfsic_get_block_type(state, b_all),
-			       b_all->logical_bytenr, b_all->dev_state->bdev,
-			       b_all->dev_bytenr, b_all->mirror_num);
-	}
-
-	mutex_unlock(&btrfsic_mutex);
-
-	kvfree(state);
-}
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
deleted file mode 100644
index e4c8aed7996f..000000000000
--- a/fs/btrfs/check-integrity.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) STRATO AG 2011.  All rights reserved.
- */
-
-#ifndef BTRFS_CHECK_INTEGRITY_H
-#define BTRFS_CHECK_INTEGRITY_H
-
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-void btrfsic_check_bio(struct bio *bio);
-#else
-static inline void btrfsic_check_bio(struct bio *bio) { }
-#endif
-
-int btrfsic_mount(struct btrfs_fs_info *fs_info,
-		  struct btrfs_fs_devices *fs_devices,
-		  int including_extent_data, u32 print_mask);
-void btrfsic_unmount(struct btrfs_fs_devices *fs_devices);
-
-#endif
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8818ed5c390f..19b22b4653c8 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -193,12 +193,12 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb)
 	unsigned long index = cb->start >> PAGE_SHIFT;
 	unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
 	struct folio_batch fbatch;
-	const int errno = blk_status_to_errno(cb->bbio.bio.bi_status);
+	const int error = blk_status_to_errno(cb->bbio.bio.bi_status);
 	int i;
 	int ret;
 
-	if (errno)
-		mapping_set_error(inode->i_mapping, errno);
+	if (error)
+		mapping_set_error(inode->i_mapping, error);
 
 	folio_batch_init(&fbatch);
 	while (index <= end_index) {
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 617d4827eec2..2a9344a3fcee 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -230,9 +230,9 @@ noinline void btrfs_release_path(struct btrfs_path *p)
  * cause could be a bug, eg. due to ENOSPC, and not for common errors that are
  * caused by external factors.
  */
-bool __cold abort_should_print_stack(int errno)
+bool __cold abort_should_print_stack(int error)
 {
-	switch (errno) {
+	switch (error) {
 	case -EIO:
 	case -EROFS:
 	case -ENOMEM:
@@ -316,6 +316,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	int ret = 0;
 	int level;
 	struct btrfs_disk_key disk_key;
+	u64 reloc_src_root = 0;
 
 	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
 		trans->transid != fs_info->running_transaction->transid);
@@ -328,9 +329,11 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	else
 		btrfs_node_key(buf, &disk_key, 0);
 
+	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+		reloc_src_root = btrfs_header_owner(buf);
 	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
 				     &disk_key, level, buf->start, 0,
-				     BTRFS_NESTING_NEW_ROOT);
+				     reloc_src_root, BTRFS_NESTING_NEW_ROOT);
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
@@ -359,7 +362,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 		return ret;
 	}
 
-	btrfs_mark_buffer_dirty(cow);
+	btrfs_mark_buffer_dirty(trans, cow);
 	*cow_ret = cow;
 	return 0;
 }
@@ -518,13 +521,13 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
  * bytes the allocator should try to find free next to the block it returns.
  * This is just a hint and may be ignored by the allocator.
  */
-static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root,
-			     struct extent_buffer *buf,
-			     struct extent_buffer *parent, int parent_slot,
-			     struct extent_buffer **cow_ret,
-			     u64 search_start, u64 empty_size,
-			     enum btrfs_lock_nesting nest)
+int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  struct extent_buffer *buf,
+			  struct extent_buffer *parent, int parent_slot,
+			  struct extent_buffer **cow_ret,
+			  u64 search_start, u64 empty_size,
+			  enum btrfs_lock_nesting nest)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_disk_key disk_key;
@@ -533,6 +536,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	int last_ref = 0;
 	int unlock_orig = 0;
 	u64 parent_start = 0;
+	u64 reloc_src_root = 0;
 
 	if (*cow_ret == buf)
 		unlock_orig = 1;
@@ -551,12 +555,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	else
 		btrfs_node_key(buf, &disk_key, 0);
 
-	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
-		parent_start = parent->start;
-
+	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+		if (parent)
+			parent_start = parent->start;
+		reloc_src_root = btrfs_header_owner(buf);
+	}
 	cow = btrfs_alloc_tree_block(trans, root, parent_start,
 				     root->root_key.objectid, &disk_key, level,
-				     search_start, empty_size, nest);
+				     search_start, empty_size, reloc_src_root, nest);
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
@@ -627,7 +633,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 					cow->start);
 		btrfs_set_node_ptr_generation(parent, parent_slot,
 					      trans->transid);
-		btrfs_mark_buffer_dirty(parent);
+		btrfs_mark_buffer_dirty(trans, parent);
 		if (last_ref) {
 			ret = btrfs_tree_mod_log_free_eb(buf);
 			if (ret) {
@@ -643,7 +649,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	if (unlock_orig)
 		btrfs_tree_unlock(buf);
 	free_extent_buffer_stale(buf);
-	btrfs_mark_buffer_dirty(cow);
+	btrfs_mark_buffer_dirty(trans, cow);
 	*cow_ret = cow;
 	return 0;
 }
@@ -679,11 +685,11 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
 }
 
 /*
- * cows a single block, see __btrfs_cow_block for the real work.
+ * COWs a single block, see btrfs_force_cow_block() for the real work.
  * This version of it has extra checks so that a block isn't COWed more than
  * once per transaction, as long as it hasn't been written yet
  */
-noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
+int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
 		    struct extent_buffer **cow_ret,
@@ -723,7 +729,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		return 0;
 	}
 
-	search_start = buf->start & ~((u64)SZ_1G - 1);
+	search_start = round_down(buf->start, SZ_1G);
 
 	/*
 	 * Before CoWing this block for later modification, check if it's
@@ -732,8 +738,8 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 	 * Also We don't care about the error, as it's handled internally.
 	 */
 	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
-	ret = __btrfs_cow_block(trans, root, buf, parent,
-				 parent_slot, cow_ret, search_start, 0, nest);
+	ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
+				    cow_ret, search_start, 0, nest);
 
 	trace_btrfs_cow_block(root, buf, *cow_ret);
 
@@ -742,49 +748,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
 
 /*
- * helper function for defrag to decide if two blocks pointed to by a
- * node are actually close by
- */
-static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
-{
-	if (blocknr < other && other - (blocknr + blocksize) < 32768)
-		return 1;
-	if (blocknr > other && blocknr - (other + blocksize) < 32768)
-		return 1;
-	return 0;
-}
-
-#ifdef __LITTLE_ENDIAN
-
-/*
- * Compare two keys, on little-endian the disk order is same as CPU order and
- * we can avoid the conversion.
- */
-static int comp_keys(const struct btrfs_disk_key *disk_key,
-		     const struct btrfs_key *k2)
-{
-	const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
-
-	return btrfs_comp_cpu_keys(k1, k2);
-}
-
-#else
-
-/*
- * compare two keys in a memcmp fashion
- */
-static int comp_keys(const struct btrfs_disk_key *disk,
-		     const struct btrfs_key *k2)
-{
-	struct btrfs_key k1;
-
-	btrfs_disk_key_to_cpu(&k1, disk);
-
-	return btrfs_comp_cpu_keys(&k1, k2);
-}
-#endif
-
-/*
  * same as comp_keys only with two btrfs_key's
  */
 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
@@ -805,105 +768,6 @@ int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_ke
 }
 
 /*
- * this is used by the defrag code to go through all the
- * leaves pointed to by a node and reallocate them so that
- * disk order is close to key order
- */
-int btrfs_realloc_node(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root, struct extent_buffer *parent,
-		       int start_slot, u64 *last_ret,
-		       struct btrfs_key *progress)
-{
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct extent_buffer *cur;
-	u64 blocknr;
-	u64 search_start = *last_ret;
-	u64 last_block = 0;
-	u64 other;
-	u32 parent_nritems;
-	int end_slot;
-	int i;
-	int err = 0;
-	u32 blocksize;
-	int progress_passed = 0;
-	struct btrfs_disk_key disk_key;
-
-	/*
-	 * COWing must happen through a running transaction, which always
-	 * matches the current fs generation (it's a transaction with a state
-	 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
-	 * into error state to prevent the commit of any transaction.
-	 */
-	if (unlikely(trans->transaction != fs_info->running_transaction ||
-		     trans->transid != fs_info->generation)) {
-		btrfs_abort_transaction(trans, -EUCLEAN);
-		btrfs_crit(fs_info,
-"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
-			   parent->start, btrfs_root_id(root), trans->transid,
-			   fs_info->running_transaction->transid,
-			   fs_info->generation);
-		return -EUCLEAN;
-	}
-
-	parent_nritems = btrfs_header_nritems(parent);
-	blocksize = fs_info->nodesize;
-	end_slot = parent_nritems - 1;
-
-	if (parent_nritems <= 1)
-		return 0;
-
-	for (i = start_slot; i <= end_slot; i++) {
-		int close = 1;
-
-		btrfs_node_key(parent, &disk_key, i);
-		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
-			continue;
-
-		progress_passed = 1;
-		blocknr = btrfs_node_blockptr(parent, i);
-		if (last_block == 0)
-			last_block = blocknr;
-
-		if (i > 0) {
-			other = btrfs_node_blockptr(parent, i - 1);
-			close = close_blocks(blocknr, other, blocksize);
-		}
-		if (!close && i < end_slot) {
-			other = btrfs_node_blockptr(parent, i + 1);
-			close = close_blocks(blocknr, other, blocksize);
-		}
-		if (close) {
-			last_block = blocknr;
-			continue;
-		}
-
-		cur = btrfs_read_node_slot(parent, i);
-		if (IS_ERR(cur))
-			return PTR_ERR(cur);
-		if (search_start == 0)
-			search_start = last_block;
-
-		btrfs_tree_lock(cur);
-		err = __btrfs_cow_block(trans, root, cur, parent, i,
-					&cur, search_start,
-					min(16 * blocksize,
-					    (end_slot - i) * blocksize),
-					BTRFS_NESTING_COW);
-		if (err) {
-			btrfs_tree_unlock(cur);
-			free_extent_buffer(cur);
-			break;
-		}
-		search_start = cur->start;
-		last_block = cur->start;
-		*last_ret = search_start;
-		btrfs_tree_unlock(cur);
-		free_extent_buffer(cur);
-	}
-	return err;
-}
-
-/*
  * Search for a key in the given extent_buffer.
  *
  * The lower boundary for the search is specified by the slot number @first_slot.
@@ -969,7 +833,7 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
 			tmp = &unaligned;
 		}
 
-		ret = comp_keys(tmp, key);
+		ret = btrfs_comp_keys(tmp, key);
 
 		if (ret < 0)
 			low = mid + 1;
@@ -984,19 +848,19 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
 	return 1;
 }
 
-static void root_add_used(struct btrfs_root *root, u32 size)
+static void root_add_used_bytes(struct btrfs_root *root)
 {
 	spin_lock(&root->accounting_lock);
 	btrfs_set_root_used(&root->root_item,
-			    btrfs_root_used(&root->root_item) + size);
+		btrfs_root_used(&root->root_item) + root->fs_info->nodesize);
 	spin_unlock(&root->accounting_lock);
 }
 
-static void root_sub_used(struct btrfs_root *root, u32 size)
+static void root_sub_used_bytes(struct btrfs_root *root)
 {
 	spin_lock(&root->accounting_lock);
 	btrfs_set_root_used(&root->root_item,
-			    btrfs_root_used(&root->root_item) - size);
+		btrfs_root_used(&root->root_item) - root->fs_info->nodesize);
 	spin_unlock(&root->accounting_lock);
 }
 
@@ -1112,7 +976,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		/* once for the path */
 		free_extent_buffer(mid);
 
-		root_sub_used(root, mid->len);
+		root_sub_used_bytes(root);
 		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
 		/* once for the root ptr */
 		free_extent_buffer_stale(mid);
@@ -1182,7 +1046,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 				right = NULL;
 				goto out;
 			}
-			root_sub_used(root, right->len);
+			root_sub_used_bytes(root);
 			btrfs_free_tree_block(trans, btrfs_root_id(root), right,
 					      0, 1);
 			free_extent_buffer_stale(right);
@@ -1197,7 +1061,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 				goto out;
 			}
 			btrfs_set_node_key(parent, &right_key, pslot + 1);
-			btrfs_mark_buffer_dirty(parent);
+			btrfs_mark_buffer_dirty(trans, parent);
 		}
 	}
 	if (btrfs_header_nritems(mid) == 1) {
@@ -1240,7 +1104,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 			mid = NULL;
 			goto out;
 		}
-		root_sub_used(root, mid->len);
+		root_sub_used_bytes(root);
 		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
 		free_extent_buffer_stale(mid);
 		mid = NULL;
@@ -1255,7 +1119,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 			goto out;
 		}
 		btrfs_set_node_key(parent, &mid_key, pslot);
-		btrfs_mark_buffer_dirty(parent);
+		btrfs_mark_buffer_dirty(trans, parent);
 	}
 
 	/* update the path */
@@ -1362,7 +1226,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 				return ret;
 			}
 			btrfs_set_node_key(parent, &disk_key, pslot);
-			btrfs_mark_buffer_dirty(parent);
+			btrfs_mark_buffer_dirty(trans, parent);
 			if (btrfs_header_nritems(left) > orig_slot) {
 				path->nodes[level] = left;
 				path->slots[level + 1] -= 1;
@@ -1422,7 +1286,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 				return ret;
 			}
 			btrfs_set_node_key(parent, &disk_key, pslot + 1);
-			btrfs_mark_buffer_dirty(parent);
+			btrfs_mark_buffer_dirty(trans, parent);
 
 			if (btrfs_header_nritems(mid) <= orig_slot) {
 				path->nodes[level] = right;
@@ -2006,7 +1870,7 @@ static int search_leaf(struct btrfs_trans_handle *trans,
 			 * the extent buffer's header and we have recently accessed
 			 * the header's level field.
 			 */
-			ret = comp_keys(&first_key, key);
+			ret = btrfs_comp_keys(&first_key, key);
 			if (ret < 0) {
 				/*
 				 * The first key is smaller than the key we want
@@ -2091,8 +1955,8 @@ static int search_leaf(struct btrfs_trans_handle *trans,
 }
 
 /*
- * btrfs_search_slot - look for a key in a tree and perform necessary
- * modifications to preserve tree invariants.
+ * Look for a key in a tree and perform necessary modifications to preserve
+ * tree invariants.
  *
  * @trans:	Handle of transaction, used when modifying the tree
  * @p:		Holds all btree nodes along the search path
@@ -2515,7 +2379,7 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	 */
 	if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
 		btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
-		ret = comp_keys(&found_key, &orig_key);
+		ret = btrfs_comp_keys(&found_key, &orig_key);
 		if (ret == 0) {
 			if (path->slots[0] > 0) {
 				path->slots[0]--;
@@ -2530,7 +2394,7 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	}
 
 	btrfs_item_key(path->nodes[0], &found_key, 0);
-	ret = comp_keys(&found_key, &key);
+	ret = btrfs_comp_keys(&found_key, &key);
 	/*
 	 * We might have had an item with the previous key in the tree right
 	 * before we released our path. And after we released our path, that
@@ -2678,7 +2542,8 @@ int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
  * higher levels
  *
  */
-static void fixup_low_keys(struct btrfs_path *path,
+static void fixup_low_keys(struct btrfs_trans_handle *trans,
+			   struct btrfs_path *path,
 			   struct btrfs_disk_key *key, int level)
 {
 	int i;
@@ -2695,7 +2560,7 @@ static void fixup_low_keys(struct btrfs_path *path,
 						    BTRFS_MOD_LOG_KEY_REPLACE);
 		BUG_ON(ret < 0);
 		btrfs_set_node_key(t, key, tslot);
-		btrfs_mark_buffer_dirty(path->nodes[i]);
+		btrfs_mark_buffer_dirty(trans, path->nodes[i]);
 		if (tslot != 0)
 			break;
 	}
@@ -2707,10 +2572,11 @@ static void fixup_low_keys(struct btrfs_path *path,
  * This function isn't completely safe. It's the caller's responsibility
  * that the new key won't break the order
  */
-void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
 			     struct btrfs_path *path,
 			     const struct btrfs_key *new_key)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_disk_key disk_key;
 	struct extent_buffer *eb;
 	int slot;
@@ -2719,7 +2585,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
 	slot = path->slots[0];
 	if (slot > 0) {
 		btrfs_item_key(eb, &disk_key, slot - 1);
-		if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
+		if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) {
 			btrfs_print_leaf(eb);
 			btrfs_crit(fs_info,
 		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
@@ -2733,7 +2599,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
 	}
 	if (slot < btrfs_header_nritems(eb) - 1) {
 		btrfs_item_key(eb, &disk_key, slot + 1);
-		if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
+		if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) {
 			btrfs_print_leaf(eb);
 			btrfs_crit(fs_info,
 		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
@@ -2748,9 +2614,9 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
 
 	btrfs_cpu_key_to_disk(&disk_key, new_key);
 	btrfs_set_item_key(eb, &disk_key, slot);
-	btrfs_mark_buffer_dirty(eb);
+	btrfs_mark_buffer_dirty(trans, eb);
 	if (slot == 0)
-		fixup_low_keys(path, &disk_key, 1);
+		fixup_low_keys(trans, path, &disk_key, 1);
 }
 
 /*
@@ -2881,8 +2747,8 @@ static int push_node_left(struct btrfs_trans_handle *trans,
 	}
 	btrfs_set_header_nritems(src, src_nritems - push_items);
 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
-	btrfs_mark_buffer_dirty(src);
-	btrfs_mark_buffer_dirty(dst);
+	btrfs_mark_buffer_dirty(trans, src);
+	btrfs_mark_buffer_dirty(trans, dst);
 
 	return ret;
 }
@@ -2957,8 +2823,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 	btrfs_set_header_nritems(src, src_nritems - push_items);
 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
 
-	btrfs_mark_buffer_dirty(src);
-	btrfs_mark_buffer_dirty(dst);
+	btrfs_mark_buffer_dirty(trans, src);
+	btrfs_mark_buffer_dirty(trans, dst);
 
 	return ret;
 }
@@ -2974,7 +2840,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_path *path, int level)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	u64 lower_gen;
 	struct extent_buffer *lower;
 	struct extent_buffer *c;
@@ -2993,11 +2858,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 
 	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
 				   &lower_key, level, root->node->start, 0,
-				   BTRFS_NESTING_NEW_ROOT);
+				   0, BTRFS_NESTING_NEW_ROOT);
 	if (IS_ERR(c))
 		return PTR_ERR(c);
 
-	root_add_used(root, fs_info->nodesize);
+	root_add_used_bytes(root);
 
 	btrfs_set_header_nritems(c, 1);
 	btrfs_set_node_key(c, &lower_key, 0);
@@ -3007,7 +2872,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 
 	btrfs_set_node_ptr_generation(c, 0, lower_gen);
 
-	btrfs_mark_buffer_dirty(c);
+	btrfs_mark_buffer_dirty(trans, c);
 
 	old = root->node;
 	ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
@@ -3079,7 +2944,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans,
 	WARN_ON(trans->transid == 0);
 	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
 	btrfs_set_header_nritems(lower, nritems + 1);
-	btrfs_mark_buffer_dirty(lower);
+	btrfs_mark_buffer_dirty(trans, lower);
 
 	return 0;
 }
@@ -3137,11 +3002,11 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 
 	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
 				       &disk_key, level, c->start, 0,
-				       BTRFS_NESTING_SPLIT);
+				       0, BTRFS_NESTING_SPLIT);
 	if (IS_ERR(split))
 		return PTR_ERR(split);
 
-	root_add_used(root, fs_info->nodesize);
+	root_add_used_bytes(root);
 	ASSERT(btrfs_header_level(c) == level);
 
 	ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
@@ -3158,8 +3023,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 	btrfs_set_header_nritems(split, c_nritems - mid);
 	btrfs_set_header_nritems(c, mid);
 
-	btrfs_mark_buffer_dirty(c);
-	btrfs_mark_buffer_dirty(split);
+	btrfs_mark_buffer_dirty(trans, c);
+	btrfs_mark_buffer_dirty(trans, split);
 
 	ret = insert_ptr(trans, path, &disk_key, split->start,
 			 path->slots[level + 1] + 1, level + 1);
@@ -3325,15 +3190,15 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 	btrfs_set_header_nritems(left, left_nritems);
 
 	if (left_nritems)
-		btrfs_mark_buffer_dirty(left);
+		btrfs_mark_buffer_dirty(trans, left);
 	else
 		btrfs_clear_buffer_dirty(trans, left);
 
-	btrfs_mark_buffer_dirty(right);
+	btrfs_mark_buffer_dirty(trans, right);
 
 	btrfs_item_key(right, &disk_key, 0);
 	btrfs_set_node_key(upper, &disk_key, slot + 1);
-	btrfs_mark_buffer_dirty(upper);
+	btrfs_mark_buffer_dirty(trans, upper);
 
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] >= left_nritems) {
@@ -3545,14 +3410,14 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 		btrfs_set_token_item_offset(&token, i, push_space);
 	}
 
-	btrfs_mark_buffer_dirty(left);
+	btrfs_mark_buffer_dirty(trans, left);
 	if (right_nritems)
-		btrfs_mark_buffer_dirty(right);
+		btrfs_mark_buffer_dirty(trans, right);
 	else
 		btrfs_clear_buffer_dirty(trans, right);
 
 	btrfs_item_key(right, &disk_key, 0);
-	fixup_low_keys(path, &disk_key, 1);
+	fixup_low_keys(trans, path, &disk_key, 1);
 
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] < push_items) {
@@ -3683,8 +3548,8 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
 	if (ret < 0)
 		return ret;
 
-	btrfs_mark_buffer_dirty(right);
-	btrfs_mark_buffer_dirty(l);
+	btrfs_mark_buffer_dirty(trans, right);
+	btrfs_mark_buffer_dirty(trans, l);
 	BUG_ON(path->slots[0] != slot);
 
 	if (mid <= slot) {
@@ -3888,13 +3753,13 @@ again:
 	 * use BTRFS_NESTING_NEW_ROOT.
 	 */
 	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
-				       &disk_key, 0, l->start, 0,
+				       &disk_key, 0, l->start, 0, 0,
 				       num_doubles ? BTRFS_NESTING_NEW_ROOT :
 				       BTRFS_NESTING_SPLIT);
 	if (IS_ERR(right))
 		return PTR_ERR(right);
 
-	root_add_used(root, fs_info->nodesize);
+	root_add_used_bytes(root);
 
 	if (split == 0) {
 		if (mid <= slot) {
@@ -3925,7 +3790,7 @@ again:
 			path->nodes[0] = right;
 			path->slots[0] = 0;
 			if (path->slots[1] == 0)
-				fixup_low_keys(path, &disk_key, 1);
+				fixup_low_keys(trans, path, &disk_key, 1);
 		}
 		/*
 		 * We create a new leaf 'right' for the required ins_len and
@@ -4024,7 +3889,8 @@ err:
 	return ret;
 }
 
-static noinline int split_item(struct btrfs_path *path,
+static noinline int split_item(struct btrfs_trans_handle *trans,
+			       struct btrfs_path *path,
 			       const struct btrfs_key *new_key,
 			       unsigned long split_offset)
 {
@@ -4083,7 +3949,7 @@ static noinline int split_item(struct btrfs_path *path,
 	write_extent_buffer(leaf, buf + split_offset,
 			    btrfs_item_ptr_offset(leaf, slot),
 			    item_size - split_offset);
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 
 	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
 	kfree(buf);
@@ -4117,7 +3983,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
 	if (ret)
 		return ret;
 
-	ret = split_item(path, new_key, split_offset);
+	ret = split_item(trans, path, new_key, split_offset);
 	return ret;
 }
 
@@ -4127,7 +3993,8 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
  * off the end of the item or if we shift the item to chop bytes off
  * the front.
  */
-void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
+void btrfs_truncate_item(struct btrfs_trans_handle *trans,
+			 struct btrfs_path *path, u32 new_size, int from_end)
 {
 	int slot;
 	struct extent_buffer *leaf;
@@ -4203,11 +4070,11 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
 		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
 		btrfs_set_item_key(leaf, &disk_key, slot);
 		if (slot == 0)
-			fixup_low_keys(path, &disk_key, 1);
+			fixup_low_keys(trans, path, &disk_key, 1);
 	}
 
 	btrfs_set_item_size(leaf, slot, new_size);
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 
 	if (btrfs_leaf_free_space(leaf) < 0) {
 		btrfs_print_leaf(leaf);
@@ -4218,7 +4085,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
 /*
  * make the item pointed to by the path bigger, data_size is the added size.
  */
-void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
+void btrfs_extend_item(struct btrfs_trans_handle *trans,
+		       struct btrfs_path *path, u32 data_size)
 {
 	int slot;
 	struct extent_buffer *leaf;
@@ -4268,7 +4136,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
 	data_end = old_data;
 	old_size = btrfs_item_size(leaf, slot);
 	btrfs_set_item_size(leaf, slot, old_size + data_size);
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 
 	if (btrfs_leaf_free_space(leaf) < 0) {
 		btrfs_print_leaf(leaf);
@@ -4279,6 +4147,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
 /*
  * Make space in the node before inserting one or more items.
  *
+ * @trans:	transaction handle
  * @root:	root we are inserting items to
  * @path:	points to the leaf/slot where we are going to insert new items
  * @batch:      information about the batch of items to insert
@@ -4286,7 +4155,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
  * Main purpose is to save stack depth by doing the bulk of the work in a
  * function that doesn't call btrfs_search_slot
  */
-static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
+static void setup_items_for_insert(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root, struct btrfs_path *path,
 				   const struct btrfs_item_batch *batch)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4306,7 +4176,7 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
 	 */
 	if (path->slots[0] == 0) {
 		btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
-		fixup_low_keys(path, &disk_key, 1);
+		fixup_low_keys(trans, path, &disk_key, 1);
 	}
 	btrfs_unlock_up_safe(path, 1);
 
@@ -4365,7 +4235,7 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
 	}
 
 	btrfs_set_header_nritems(leaf, nritems + batch->nr);
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 
 	if (btrfs_leaf_free_space(leaf) < 0) {
 		btrfs_print_leaf(leaf);
@@ -4376,12 +4246,14 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
 /*
  * Insert a new item into a leaf.
  *
+ * @trans:     Transaction handle.
  * @root:      The root of the btree.
  * @path:      A path pointing to the target leaf and slot.
  * @key:       The key of the new item.
  * @data_size: The size of the data associated with the new key.
  */
-void btrfs_setup_item_for_insert(struct btrfs_root *root,
+void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
 				 struct btrfs_path *path,
 				 const struct btrfs_key *key,
 				 u32 data_size)
@@ -4393,7 +4265,7 @@ void btrfs_setup_item_for_insert(struct btrfs_root *root,
 	batch.total_data_size = data_size;
 	batch.nr = 1;
 
-	setup_items_for_insert(root, path, &batch);
+	setup_items_for_insert(trans, root, path, &batch);
 }
 
 /*
@@ -4419,7 +4291,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 	slot = path->slots[0];
 	BUG_ON(slot < 0);
 
-	setup_items_for_insert(root, path, batch);
+	setup_items_for_insert(trans, root, path, batch);
 	return 0;
 }
 
@@ -4444,7 +4316,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		leaf = path->nodes[0];
 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
 		write_extent_buffer(leaf, data, ptr, data_size);
-		btrfs_mark_buffer_dirty(leaf);
+		btrfs_mark_buffer_dirty(trans, leaf);
 	}
 	btrfs_free_path(path);
 	return ret;
@@ -4475,7 +4347,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
 		return ret;
 
 	path->slots[0]++;
-	btrfs_setup_item_for_insert(root, path, new_key, item_size);
+	btrfs_setup_item_for_insert(trans, root, path, new_key, item_size);
 	leaf = path->nodes[0];
 	memcpy_extent_buffer(leaf,
 			     btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -4533,9 +4405,9 @@ int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		struct btrfs_disk_key disk_key;
 
 		btrfs_node_key(parent, &disk_key, 0);
-		fixup_low_keys(path, &disk_key, level + 1);
+		fixup_low_keys(trans, path, &disk_key, level + 1);
 	}
-	btrfs_mark_buffer_dirty(parent);
+	btrfs_mark_buffer_dirty(trans, parent);
 	return 0;
 }
 
@@ -4567,7 +4439,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
 	 */
 	btrfs_unlock_up_safe(path, 0);
 
-	root_sub_used(root, leaf->len);
+	root_sub_used_bytes(root);
 
 	atomic_inc(&leaf->refs);
 	btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
@@ -4632,7 +4504,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			struct btrfs_disk_key disk_key;
 
 			btrfs_item_key(leaf, &disk_key, 0);
-			fixup_low_keys(path, &disk_key, 1);
+			fixup_low_keys(trans, path, &disk_key, 1);
 		}
 
 		/*
@@ -4697,11 +4569,11 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 				 * dirtied this buffer
 				 */
 				if (path->nodes[0] == leaf)
-					btrfs_mark_buffer_dirty(leaf);
+					btrfs_mark_buffer_dirty(trans, leaf);
 				free_extent_buffer(leaf);
 			}
 		} else {
-			btrfs_mark_buffer_dirty(leaf);
+			btrfs_mark_buffer_dirty(trans, leaf);
 		}
 	}
 	return ret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ff40acd63a37..196c005c31f6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -6,37 +6,10 @@
 #ifndef BTRFS_CTREE_H
 #define BTRFS_CTREE_H
 
-#include <linux/mm.h>
-#include <linux/sched/signal.h>
-#include <linux/highmem.h>
-#include <linux/fs.h>
-#include <linux/rwsem.h>
-#include <linux/semaphore.h>
-#include <linux/completion.h>
-#include <linux/backing-dev.h>
-#include <linux/wait.h>
-#include <linux/slab.h>
-#include <trace/events/btrfs.h>
-#include <asm/unaligned.h>
 #include <linux/pagemap.h>
-#include <linux/btrfs.h>
-#include <linux/btrfs_tree.h>
-#include <linux/workqueue.h>
-#include <linux/security.h>
-#include <linux/sizes.h>
-#include <linux/dynamic_debug.h>
-#include <linux/refcount.h>
-#include <linux/crc32c.h>
-#include <linux/iomap.h>
-#include <linux/fscrypt.h>
-#include "extent-io-tree.h"
-#include "extent_io.h"
-#include "extent_map.h"
-#include "async-thread.h"
-#include "block-rsv.h"
 #include "locking.h"
-#include "misc.h"
 #include "fs.h"
+#include "accessors.h"
 
 struct btrfs_trans_handle;
 struct btrfs_transaction;
@@ -218,10 +191,22 @@ struct btrfs_root {
 	atomic_t log_commit[2];
 	/* Used only for log trees of subvolumes, not for the log root tree */
 	atomic_t log_batch;
+	/*
+	 * Protected by the 'log_mutex' lock but can be read without holding
+	 * that lock to avoid unnecessary lock contention, in which case it
+	 * should be read using btrfs_get_root_log_transid() except if it's a
+	 * log tree in which case it can be directly accessed. Updates to this
+	 * field should always use btrfs_set_root_log_transid(), except for log
+	 * trees where the field can be updated directly.
+	 */
 	int log_transid;
 	/* No matter the commit succeeds or not*/
 	int log_transid_committed;
-	/* Just be updated when the commit succeeds. */
+	/*
+	 * Only updated when the commit succeeds. Use
+	 * btrfs_get_root_last_log_commit() and btrfs_set_root_last_log_commit()
+	 * to access this field.
+	 */
 	int last_log_commit;
 	pid_t log_start_pid;
 
@@ -326,6 +311,9 @@ struct btrfs_root {
 	/* Used only by log trees, when logging csum items */
 	struct extent_io_tree log_csum_range;
 
+	/* Used by simple quotas to track the root during relocation. */
+	u64 relocation_src_root;
+
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 	u64 alloc_bytenr;
 #endif
@@ -352,6 +340,26 @@ static inline u64 btrfs_root_id(const struct btrfs_root *root)
 	return root->root_key.objectid;
 }
 
+static inline int btrfs_get_root_log_transid(const struct btrfs_root *root)
+{
+	return READ_ONCE(root->log_transid);
+}
+
+static inline void btrfs_set_root_log_transid(struct btrfs_root *root, int log_transid)
+{
+	WRITE_ONCE(root->log_transid, log_transid);
+}
+
+static inline int btrfs_get_root_last_log_commit(const struct btrfs_root *root)
+{
+	return READ_ONCE(root->last_log_commit);
+}
+
+static inline void btrfs_set_root_last_log_commit(struct btrfs_root *root, int commit_id)
+{
+	WRITE_ONCE(root->last_log_commit, commit_id);
+}
+
 /*
  * Structure that conveys information about an extent that is going to replace
  * all the extents in a file range.
@@ -470,30 +478,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
 #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \
 				((bytes) >> (fs_info)->sectorsize_bits)
 
-static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length)
-{
-	return crc32c(crc, address, length);
-}
-
-static inline void btrfs_crc32c_final(u32 crc, u8 *result)
-{
-	put_unaligned_le32(~crc, result);
-}
-
-static inline u64 btrfs_name_hash(const char *name, int len)
-{
-       return crc32c((u32)~1, name, len);
-}
-
-/*
- * Figure the key offset of an extended inode ref
- */
-static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
-                                   int len)
-{
-       return (u64) crc32c(parent_objectid, name, len);
-}
-
 static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
 {
 	return mapping_gfp_constraint(mapping, ~__GFP_FS);
@@ -513,12 +497,42 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
 		     const struct btrfs_key *key, int *slot);
 
 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);
+
+#ifdef __LITTLE_ENDIAN
+
+/*
+ * Compare two keys, on little-endian the disk order is same as CPU order and
+ * we can avoid the conversion.
+ */
+static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk_key,
+				  const struct btrfs_key *k2)
+{
+	const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
+
+	return btrfs_comp_cpu_keys(k1, k2);
+}
+
+#else
+
+/* Compare two keys in a memcmp fashion. */
+static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk,
+				  const struct btrfs_key *k2)
+{
+	struct btrfs_key k1;
+
+	btrfs_disk_key_to_cpu(&k1, disk);
+
+	return btrfs_comp_cpu_keys(&k1, k2);
+}
+
+#endif
+
 int btrfs_previous_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid,
 			int type);
 int btrfs_previous_extent_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid);
-void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
 			     struct btrfs_path *path,
 			     const struct btrfs_key *new_key);
 struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
@@ -536,6 +550,13 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct extent_buffer *parent, int parent_slot,
 		    struct extent_buffer **cow_ret,
 		    enum btrfs_lock_nesting nest);
+int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  struct extent_buffer *buf,
+			  struct extent_buffer *parent, int parent_slot,
+			  struct extent_buffer **cow_ret,
+			  u64 search_start, u64 empty_size,
+			  enum btrfs_lock_nesting nest);
 int btrfs_copy_root(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      struct extent_buffer *buf,
@@ -545,8 +566,10 @@ int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
 			      struct extent_buffer *buf);
 int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct btrfs_path *path, int level, int slot);
-void btrfs_extend_item(struct btrfs_path *path, u32 data_size);
-void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end);
+void btrfs_extend_item(struct btrfs_trans_handle *trans,
+		       struct btrfs_path *path, u32 data_size);
+void btrfs_truncate_item(struct btrfs_trans_handle *trans,
+			 struct btrfs_path *path, u32 new_size, int from_end);
 int btrfs_split_item(struct btrfs_trans_handle *trans,
 		     struct btrfs_root *root,
 		     struct btrfs_path *path,
@@ -567,10 +590,6 @@ int btrfs_search_slot_for_read(struct btrfs_root *root,
 			       const struct btrfs_key *key,
 			       struct btrfs_path *p, int find_higher,
 			       int return_any);
-int btrfs_realloc_node(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root, struct extent_buffer *parent,
-		       int start_slot, u64 *last_ret,
-		       struct btrfs_key *progress);
 void btrfs_release_path(struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
@@ -610,7 +629,8 @@ struct btrfs_item_batch {
 	int nr;
 };
 
-void btrfs_setup_item_for_insert(struct btrfs_root *root,
+void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
 				 struct btrfs_path *path,
 				 const struct btrfs_key *key,
 				 u32 data_size);
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index f2ff4cbe8656..5244561e2016 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -338,13 +338,118 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
 }
 
 /*
+ * Check if two block addresses are close, used by defrag.
+ */
+static bool close_blocks(u64 blocknr, u64 other, u32 blocksize)
+{
+	if (blocknr < other && other - (blocknr + blocksize) < SZ_32K)
+		return true;
+	if (blocknr > other && blocknr - (other + blocksize) < SZ_32K)
+		return true;
+	return false;
+}
+
+/*
+ * Go through all the leaves pointed to by a node and reallocate them so that
+ * disk order is close to key order.
+ */
+static int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct extent_buffer *parent,
+			      int start_slot, u64 *last_ret,
+			      struct btrfs_key *progress)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	const u32 blocksize = fs_info->nodesize;
+	const int end_slot = btrfs_header_nritems(parent) - 1;
+	u64 search_start = *last_ret;
+	u64 last_block = 0;
+	int ret = 0;
+	bool progress_passed = false;
+
+	/*
+	 * COWing must happen through a running transaction, which always
+	 * matches the current fs generation (it's a transaction with a state
+	 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
+	 * into error state to prevent the commit of any transaction.
+	 */
+	if (unlikely(trans->transaction != fs_info->running_transaction ||
+		     trans->transid != fs_info->generation)) {
+		btrfs_abort_transaction(trans, -EUCLEAN);
+		btrfs_crit(fs_info,
+"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
+			   parent->start, btrfs_root_id(root), trans->transid,
+			   fs_info->running_transaction->transid,
+			   fs_info->generation);
+		return -EUCLEAN;
+	}
+
+	if (btrfs_header_nritems(parent) <= 1)
+		return 0;
+
+	for (int i = start_slot; i <= end_slot; i++) {
+		struct extent_buffer *cur;
+		struct btrfs_disk_key disk_key;
+		u64 blocknr;
+		u64 other;
+		bool close = true;
+
+		btrfs_node_key(parent, &disk_key, i);
+		if (!progress_passed && btrfs_comp_keys(&disk_key, progress) < 0)
+			continue;
+
+		progress_passed = true;
+		blocknr = btrfs_node_blockptr(parent, i);
+		if (last_block == 0)
+			last_block = blocknr;
+
+		if (i > 0) {
+			other = btrfs_node_blockptr(parent, i - 1);
+			close = close_blocks(blocknr, other, blocksize);
+		}
+		if (!close && i < end_slot) {
+			other = btrfs_node_blockptr(parent, i + 1);
+			close = close_blocks(blocknr, other, blocksize);
+		}
+		if (close) {
+			last_block = blocknr;
+			continue;
+		}
+
+		cur = btrfs_read_node_slot(parent, i);
+		if (IS_ERR(cur))
+			return PTR_ERR(cur);
+		if (search_start == 0)
+			search_start = last_block;
+
+		btrfs_tree_lock(cur);
+		ret = btrfs_force_cow_block(trans, root, cur, parent, i,
+					    &cur, search_start,
+					    min(16 * blocksize,
+						(end_slot - i) * blocksize),
+					    BTRFS_NESTING_COW);
+		if (ret) {
+			btrfs_tree_unlock(cur);
+			free_extent_buffer(cur);
+			break;
+		}
+		search_start = cur->start;
+		last_block = cur->start;
+		*last_ret = search_start;
+		btrfs_tree_unlock(cur);
+		free_extent_buffer(cur);
+	}
+	return ret;
+}
+
+/*
  * Defrag all the leaves in a given btree.
  * Read all the leaves and try to get key order to
  * better reflect disk order
  */
 
-int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root)
+static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root)
 {
 	struct btrfs_path *path = NULL;
 	struct btrfs_key key;
@@ -461,6 +566,45 @@ done:
 }
 
 /*
+ * Defrag a given btree.  Every leaf in the btree is read and defragmented.
+ */
+int btrfs_defrag_root(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	int ret;
+
+	if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
+		return 0;
+
+	while (1) {
+		struct btrfs_trans_handle *trans;
+
+		trans = btrfs_start_transaction(root, 0);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			break;
+		}
+
+		ret = btrfs_defrag_leaves(trans, root);
+
+		btrfs_end_transaction(trans);
+		btrfs_btree_balance_dirty(fs_info);
+		cond_resched();
+
+		if (btrfs_fs_closing(fs_info) || ret != -EAGAIN)
+			break;
+
+		if (btrfs_defrag_cancelled(fs_info)) {
+			btrfs_debug(fs_info, "defrag_root cancelled");
+			ret = -EAGAIN;
+			break;
+		}
+	}
+	clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
+	return ret;
+}
+
+/*
  * Defrag specific helper to get an extent map.
  *
  * Differences between this and btrfs_get_extent() are:
@@ -891,8 +1035,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
 		 *    very likely resulting in a larger extent after writeback is
 		 *    triggered (except in a case of free space fragmentation).
 		 */
-		if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
-				   EXTENT_DELALLOC, 0, NULL))
+		if (test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1,
+					  EXTENT_DELALLOC))
 			goto next;
 
 		/*
diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h
index 5305f2283b5e..5a62763528d1 100644
--- a/fs/btrfs/defrag.h
+++ b/fs/btrfs/defrag.h
@@ -12,7 +12,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 			   struct btrfs_inode *inode, u32 extent_thresh);
 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
 void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
-int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+int btrfs_defrag_root(struct btrfs_root *root);
 
 static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
 {
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 427abaf608b8..51453d4928fa 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -322,9 +322,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
 	} else {
 		if (current->journal_info)
 			flush = BTRFS_RESERVE_FLUSH_LIMIT;
-
-		if (btrfs_transaction_in_commit(fs_info))
-			schedule_timeout(1);
 	}
 
 	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
@@ -346,7 +343,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
 						 noflush);
 	if (ret)
 		return ret;
-	ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush);
+	ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
+					   meta_reserve, flush);
 	if (ret) {
 		btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
 		return ret;
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 90aaedce1548..7381241334e8 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -328,7 +328,8 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
 }
 
 /*
- * __btrfs_lookup_delayed_item - look up the delayed item by key
+ * Look up the delayed item by key.
+ *
  * @delayed_node: pointer to the delayed node
  * @index:	  the dir index value to lookup (offset of a dir index key)
  *
@@ -517,7 +518,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
 		/*
 		 * For insertions we track reserved metadata space by accounting
 		 * for the number of leaves that will be used, based on the delayed
-		 * node's index_items_size field.
+		 * node's curr_index_batch_size and index_item_leaves fields.
 		 */
 		if (item->type == BTRFS_DELAYED_DELETION_ITEM)
 			item->bytes_reserved = num_bytes;
@@ -1030,7 +1031,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
 				    struct btrfs_inode_item);
 	write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
 			    sizeof(struct btrfs_inode_item));
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 
 	if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
 		goto out;
@@ -1378,8 +1379,7 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
 		return -ENOMEM;
 
 	async_work->delayed_root = delayed_root;
-	btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL,
-			NULL);
+	btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL);
 	async_work->nr = nr;
 
 	btrfs_queue_work(fs_info->delayed_workers, &async_work->work);
@@ -1760,8 +1760,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
 }
 
 /*
- * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
- *
+ * Read dir info stored in the delayed tree.
  */
 int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
 				    struct list_head *ins_list)
@@ -1834,24 +1833,22 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
 	btrfs_set_stack_inode_block_group(inode_item, 0);
 
 	btrfs_set_stack_timespec_sec(&inode_item->atime,
-				     inode->i_atime.tv_sec);
+				     inode_get_atime_sec(inode));
 	btrfs_set_stack_timespec_nsec(&inode_item->atime,
-				      inode->i_atime.tv_nsec);
+				      inode_get_atime_nsec(inode));
 
 	btrfs_set_stack_timespec_sec(&inode_item->mtime,
-				     inode->i_mtime.tv_sec);
+				     inode_get_mtime_sec(inode));
 	btrfs_set_stack_timespec_nsec(&inode_item->mtime,
-				      inode->i_mtime.tv_nsec);
+				      inode_get_mtime_nsec(inode));
 
 	btrfs_set_stack_timespec_sec(&inode_item->ctime,
-				     inode_get_ctime(inode).tv_sec);
+				     inode_get_ctime_sec(inode));
 	btrfs_set_stack_timespec_nsec(&inode_item->ctime,
-				      inode_get_ctime(inode).tv_nsec);
+				      inode_get_ctime_nsec(inode));
 
-	btrfs_set_stack_timespec_sec(&inode_item->otime,
-				     BTRFS_I(inode)->i_otime.tv_sec);
-	btrfs_set_stack_timespec_nsec(&inode_item->otime,
-				     BTRFS_I(inode)->i_otime.tv_nsec);
+	btrfs_set_stack_timespec_sec(&inode_item->otime, BTRFS_I(inode)->i_otime_sec);
+	btrfs_set_stack_timespec_nsec(&inode_item->otime, BTRFS_I(inode)->i_otime_nsec);
 }
 
 int btrfs_fill_inode(struct inode *inode, u32 *rdev)
@@ -1891,19 +1888,17 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
 	btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item),
 				&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
 
-	inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
-	inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
+	inode_set_atime(inode, btrfs_stack_timespec_sec(&inode_item->atime),
+			btrfs_stack_timespec_nsec(&inode_item->atime));
 
-	inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime);
-	inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime);
+	inode_set_mtime(inode, btrfs_stack_timespec_sec(&inode_item->mtime),
+			btrfs_stack_timespec_nsec(&inode_item->mtime));
 
 	inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime),
 			btrfs_stack_timespec_nsec(&inode_item->ctime));
 
-	BTRFS_I(inode)->i_otime.tv_sec =
-		btrfs_stack_timespec_sec(&inode_item->otime);
-	BTRFS_I(inode)->i_otime.tv_nsec =
-		btrfs_stack_timespec_nsec(&inode_item->otime);
+	BTRFS_I(inode)->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime);
+	BTRFS_I(inode)->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime);
 
 	inode->i_generation = BTRFS_I(inode)->generation;
 	BTRFS_I(inode)->index_cnt = (u64)-1;
@@ -1914,9 +1909,9 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
 }
 
 int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root,
 			       struct btrfs_inode *inode)
 {
+	struct btrfs_root *root = inode->root;
 	struct btrfs_delayed_node *delayed_node;
 	int ret = 0;
 
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 1da213197f55..5cceb31bbd16 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -135,7 +135,6 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode);
 
 
 int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root,
 			       struct btrfs_inode *inode);
 int btrfs_fill_inode(struct inode *inode, u32 *rdev);
 int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 9fe4ccca50a0..9223934d95f4 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -57,16 +57,20 @@ bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
  * Release a ref head's reservation.
  *
  * @fs_info:  the filesystem
- * @nr:       number of items to drop
+ * @nr_refs:  number of delayed refs to drop
+ * @nr_csums: number of csum items to drop
  *
  * Drops the delayed ref head's count from the delayed refs rsv and free any
  * excess reservation we had.
  */
-void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums)
 {
 	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
-	const u64 num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr);
-	u64 released = 0;
+	u64 num_bytes;
+	u64 released;
+
+	num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr_refs);
+	num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
 
 	released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
 	if (released)
@@ -77,26 +81,118 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
 /*
  * Adjust the size of the delayed refs rsv.
  *
- * This is to be called anytime we may have adjusted trans->delayed_ref_updates,
- * it'll calculate the additional size and add it to the delayed_refs_rsv.
+ * This is to be called anytime we may have adjusted trans->delayed_ref_updates
+ * or trans->delayed_ref_csum_deletions, it'll calculate the additional size and
+ * add it to the delayed_refs_rsv.
  */
 void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+	struct btrfs_block_rsv *local_rsv = &trans->delayed_rsv;
 	u64 num_bytes;
+	u64 reserved_bytes;
 
-	if (!trans->delayed_ref_updates)
+	num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates);
+	num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info,
+						       trans->delayed_ref_csum_deletions);
+
+	if (num_bytes == 0)
 		return;
 
-	num_bytes = btrfs_calc_delayed_ref_bytes(fs_info,
-						 trans->delayed_ref_updates);
+	/*
+	 * Try to take num_bytes from the transaction's local delayed reserve.
+	 * If not possible, try to take as much as is available. If the local
+	 * reserve doesn't have enough reserved space, the delayed refs reserve
+	 * will be refilled next time btrfs_delayed_refs_rsv_refill() is called
+	 * by someone or if a transaction commit is triggered before that, the
+	 * global block reserve will be used. We want to minimize using the
+	 * global block reserve for cases we can account for in advance, to
+	 * avoid exhausting it and reaching -ENOSPC during a transaction commit.
+	 */
+	spin_lock(&local_rsv->lock);
+	reserved_bytes = min(num_bytes, local_rsv->reserved);
+	local_rsv->reserved -= reserved_bytes;
+	local_rsv->full = (local_rsv->reserved >= local_rsv->size);
+	spin_unlock(&local_rsv->lock);
 
 	spin_lock(&delayed_rsv->lock);
 	delayed_rsv->size += num_bytes;
-	delayed_rsv->full = false;
+	delayed_rsv->reserved += reserved_bytes;
+	delayed_rsv->full = (delayed_rsv->reserved >= delayed_rsv->size);
 	spin_unlock(&delayed_rsv->lock);
 	trans->delayed_ref_updates = 0;
+	trans->delayed_ref_csum_deletions = 0;
+}
+
+/*
+ * Adjust the size of the delayed refs block reserve for 1 block group item
+ * insertion, used after allocating a block group.
+ */
+void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+
+	spin_lock(&delayed_rsv->lock);
+	/*
+	 * Inserting a block group item does not require changing the free space
+	 * tree, only the extent tree or the block group tree, so this is all we
+	 * need.
+	 */
+	delayed_rsv->size += btrfs_calc_insert_metadata_size(fs_info, 1);
+	delayed_rsv->full = false;
+	spin_unlock(&delayed_rsv->lock);
+}
+
+/*
+ * Adjust the size of the delayed refs block reserve to release space for 1
+ * block group item insertion.
+ */
+void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+	const u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+	u64 released;
+
+	released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL);
+	if (released > 0)
+		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+					      0, released, 0);
+}
+
+/*
+ * Adjust the size of the delayed refs block reserve for 1 block group item
+ * update.
+ */
+void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+
+	spin_lock(&delayed_rsv->lock);
+	/*
+	 * Updating a block group item does not result in new nodes/leaves and
+	 * does not require changing the free space tree, only the extent tree
+	 * or the block group tree, so this is all we need.
+	 */
+	delayed_rsv->size += btrfs_calc_metadata_size(fs_info, 1);
+	delayed_rsv->full = false;
+	spin_unlock(&delayed_rsv->lock);
+}
+
+/*
+ * Adjust the size of the delayed refs block reserve to release space for 1
+ * block group item update.
+ */
+void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+	const u64 num_bytes = btrfs_calc_metadata_size(fs_info, 1);
+	u64 released;
+
+	released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL);
+	if (released > 0)
+		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+					      0, released, 0);
 }
 
 /*
@@ -154,6 +250,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
 				  enum btrfs_reserve_flush_enum flush)
 {
 	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
+	struct btrfs_space_info *space_info = block_rsv->space_info;
 	u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1);
 	u64 num_bytes = 0;
 	u64 refilled_bytes;
@@ -170,7 +267,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
 	if (!num_bytes)
 		return 0;
 
-	ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
+	ret = btrfs_reserve_metadata_bytes(fs_info, space_info, num_bytes, flush);
 	if (ret)
 		return ret;
 
@@ -199,8 +296,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
 	spin_unlock(&block_rsv->lock);
 
 	if (to_free > 0)
-		btrfs_space_info_free_bytes_may_use(fs_info, block_rsv->space_info,
-						    to_free);
+		btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free);
 
 	if (refilled_bytes > 0)
 		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0,
@@ -422,7 +518,8 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
 	return 0;
 }
 
-static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs,
+static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info,
+				    struct btrfs_delayed_ref_root *delayed_refs,
 				    struct btrfs_delayed_ref_head *head,
 				    struct btrfs_delayed_ref_node *ref)
 {
@@ -433,9 +530,11 @@ static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs,
 		list_del(&ref->add_list);
 	btrfs_put_delayed_ref(ref);
 	atomic_dec(&delayed_refs->num_entries);
+	btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
 }
 
-static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs,
+static bool merge_ref(struct btrfs_fs_info *fs_info,
+		      struct btrfs_delayed_ref_root *delayed_refs,
 		      struct btrfs_delayed_ref_head *head,
 		      struct btrfs_delayed_ref_node *ref,
 		      u64 seq)
@@ -464,10 +563,10 @@ static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs,
 			mod = -next->ref_mod;
 		}
 
-		drop_delayed_ref(delayed_refs, head, next);
+		drop_delayed_ref(fs_info, delayed_refs, head, next);
 		ref->ref_mod += mod;
 		if (ref->ref_mod == 0) {
-			drop_delayed_ref(delayed_refs, head, ref);
+			drop_delayed_ref(fs_info, delayed_refs, head, ref);
 			done = true;
 		} else {
 			/*
@@ -505,7 +604,7 @@ again:
 		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
 		if (seq && ref->seq >= seq)
 			continue;
-		if (merge_ref(delayed_refs, head, ref, seq))
+		if (merge_ref(fs_info, delayed_refs, head, ref, seq))
 			goto again;
 	}
 }
@@ -584,10 +683,11 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
  * Return true if the ref was merged into an existing one (and therefore can be
  * freed by the caller).
  */
-static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root,
+static bool insert_delayed_ref(struct btrfs_trans_handle *trans,
 			       struct btrfs_delayed_ref_head *href,
 			       struct btrfs_delayed_ref_node *ref)
 {
+	struct btrfs_delayed_ref_root *root = &trans->transaction->delayed_refs;
 	struct btrfs_delayed_ref_node *exist;
 	int mod;
 
@@ -598,6 +698,7 @@ static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root,
 			list_add_tail(&ref->add_list, &href->ref_add_list);
 		atomic_inc(&root->num_entries);
 		spin_unlock(&href->lock);
+		trans->delayed_ref_updates++;
 		return false;
 	}
 
@@ -626,7 +727,7 @@ static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root,
 
 	/* remove existing tail if its ref_mod is zero */
 	if (exist->ref_mod == 0)
-		drop_delayed_ref(root, href, exist);
+		drop_delayed_ref(trans->fs_info, root, href, exist);
 	spin_unlock(&href->lock);
 	return true;
 }
@@ -647,6 +748,15 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
 	BUG_ON(existing->is_data != update->is_data);
 
 	spin_lock(&existing->lock);
+
+	/*
+	 * When freeing an extent, we may not know the owning root when we
+	 * first create the head_ref. However, some deref before the last deref
+	 * will know it, so we just need to update the head_ref accordingly.
+	 */
+	if (!existing->owning_root)
+		existing->owning_root = update->owning_root;
+
 	if (update->must_insert_reserved) {
 		/* if the extent was freed and then
 		 * reallocated before the delayed ref
@@ -656,6 +766,7 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
 		 * Set it again here
 		 */
 		existing->must_insert_reserved = update->must_insert_reserved;
+		existing->owning_root = update->owning_root;
 
 		/*
 		 * update the num_bytes so we make sure the accounting
@@ -695,6 +806,8 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
 	/*
 	 * If we are going to from a positive ref mod to a negative or vice
 	 * versa we need to make sure to adjust pending_csums accordingly.
+	 * We reserve bytes for csum deletion when adding or updating a ref
+	 * head, see add_delayed_ref_head() for more details.
 	 */
 	if (existing->is_data) {
 		u64 csum_leaves =
@@ -703,11 +816,11 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
 
 		if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
 			delayed_refs->pending_csums -= existing->num_bytes;
-			btrfs_delayed_refs_rsv_release(fs_info, csum_leaves);
+			btrfs_delayed_refs_rsv_release(fs_info, 0, csum_leaves);
 		}
 		if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
 			delayed_refs->pending_csums += existing->num_bytes;
-			trans->delayed_ref_updates += csum_leaves;
+			trans->delayed_ref_csum_deletions += csum_leaves;
 		}
 	}
 
@@ -718,7 +831,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
 				  struct btrfs_qgroup_extent_record *qrecord,
 				  u64 bytenr, u64 num_bytes, u64 ref_root,
 				  u64 reserved, int action, bool is_data,
-				  bool is_system)
+				  bool is_system, u64 owning_root)
 {
 	int count_mod = 1;
 	bool must_insert_reserved = false;
@@ -758,7 +871,9 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
 	head_ref->bytenr = bytenr;
 	head_ref->num_bytes = num_bytes;
 	head_ref->ref_mod = count_mod;
+	head_ref->reserved_bytes = reserved;
 	head_ref->must_insert_reserved = must_insert_reserved;
+	head_ref->owning_root = owning_root;
 	head_ref->is_data = is_data;
 	head_ref->is_system = is_system;
 	head_ref->ref_tree = RB_ROOT_CACHED;
@@ -819,16 +934,21 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
 		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
 		head_ref = existing;
 	} else {
+		/*
+		 * We reserve the amount of bytes needed to delete csums when
+		 * adding the ref head and not when adding individual drop refs
+		 * since the csum items are deleted only after running the last
+		 * delayed drop ref (the data extent's ref count drops to 0).
+		 */
 		if (head_ref->is_data && head_ref->ref_mod < 0) {
 			delayed_refs->pending_csums += head_ref->num_bytes;
-			trans->delayed_ref_updates +=
+			trans->delayed_ref_csum_deletions +=
 				btrfs_csum_bytes_to_leaves(trans->fs_info,
 							   head_ref->num_bytes);
 		}
 		delayed_refs->num_heads++;
 		delayed_refs->num_heads_ready++;
 		atomic_inc(&delayed_refs->num_entries);
-		trans->delayed_ref_updates++;
 	}
 	if (qrecord_inserted_ret)
 		*qrecord_inserted_ret = qrecord_inserted;
@@ -837,8 +957,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
 }
 
 /*
- * init_delayed_ref_common - Initialize the structure which represents a
- *			     modification to a an extent.
+ * Initialize the structure which represents a modification to an extent.
  *
  * @fs_info:    Internal to the mounted filesystem mount structure.
  *
@@ -909,7 +1028,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
 	u64 parent = generic_ref->parent;
 	u8 ref_type;
 
-	is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID);
+	is_system = (generic_ref->tree_ref.ref_root == BTRFS_CHUNK_TREE_OBJECTID);
 
 	ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
 	ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
@@ -922,8 +1041,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 	}
 
-	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
-	    !generic_ref->skip_qgroup) {
+	if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) {
 		record = kzalloc(sizeof(*record), GFP_NOFS);
 		if (!record) {
 			kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
@@ -938,15 +1056,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
 		ref_type = BTRFS_TREE_BLOCK_REF_KEY;
 
 	init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
-				generic_ref->tree_ref.owning_root, action,
+				generic_ref->tree_ref.ref_root, action,
 				ref_type);
-	ref->root = generic_ref->tree_ref.owning_root;
+	ref->root = generic_ref->tree_ref.ref_root;
 	ref->parent = parent;
 	ref->level = level;
 
 	init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
-			      generic_ref->tree_ref.owning_root, 0, action,
-			      false, is_system);
+			      generic_ref->tree_ref.ref_root, 0, action,
+			      false, is_system, generic_ref->owning_root);
 	head_ref->extent_op = extent_op;
 
 	delayed_refs = &trans->transaction->delayed_refs;
@@ -959,7 +1077,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
 	head_ref = add_delayed_ref_head(trans, head_ref, record,
 					action, &qrecord_inserted);
 
-	merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node);
+	merged = insert_delayed_ref(trans, head_ref, &ref->node);
 	spin_unlock(&delayed_refs->lock);
 
 	/*
@@ -998,7 +1116,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
 	u64 bytenr = generic_ref->bytenr;
 	u64 num_bytes = generic_ref->len;
 	u64 parent = generic_ref->parent;
-	u64 ref_root = generic_ref->data_ref.owning_root;
+	u64 ref_root = generic_ref->data_ref.ref_root;
 	u64 owner = generic_ref->data_ref.ino;
 	u64 offset = generic_ref->data_ref.offset;
 	u8 ref_type;
@@ -1026,8 +1144,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 	}
 
-	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
-	    !generic_ref->skip_qgroup) {
+	if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) {
 		record = kzalloc(sizeof(*record), GFP_NOFS);
 		if (!record) {
 			kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
@@ -1038,7 +1155,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
 	}
 
 	init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
-			      reserved, action, true, false);
+			      reserved, action, true, false, generic_ref->owning_root);
 	head_ref->extent_op = NULL;
 
 	delayed_refs = &trans->transaction->delayed_refs;
@@ -1051,7 +1168,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
 	head_ref = add_delayed_ref_head(trans, head_ref, record,
 					action, &qrecord_inserted);
 
-	merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node);
+	merged = insert_delayed_ref(trans, head_ref, &ref->node);
 	spin_unlock(&delayed_refs->lock);
 
 	/*
@@ -1084,7 +1201,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
-			      BTRFS_UPDATE_DELAYED_HEAD, false, false);
+			      BTRFS_UPDATE_DELAYED_HEAD, false, false, 0);
 	head_ref->extent_op = extent_op;
 
 	delayed_refs = &trans->transaction->delayed_refs;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index fd9bf2b709c0..62d679d40f4f 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -9,10 +9,16 @@
 #include <linux/refcount.h>
 
 /* these are the possible values of struct btrfs_delayed_ref_node->action */
-#define BTRFS_ADD_DELAYED_REF    1 /* add one backref to the tree */
-#define BTRFS_DROP_DELAYED_REF   2 /* delete one backref from the tree */
-#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
-#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
+enum btrfs_delayed_ref_action {
+	/* Add one backref to the tree */
+	BTRFS_ADD_DELAYED_REF = 1,
+	/* Delete one backref from the tree */
+	BTRFS_DROP_DELAYED_REF,
+	/* Record a full extent allocation */
+	BTRFS_ADD_DELAYED_EXTENT,
+	/* Not changing ref count on head ref */
+	BTRFS_UPDATE_DELAYED_HEAD,
+} __packed;
 
 struct btrfs_delayed_ref_node {
 	struct rb_node ref_node;
@@ -105,6 +111,18 @@ struct btrfs_delayed_ref_head {
 	int ref_mod;
 
 	/*
+	 * The root that triggered the allocation when must_insert_reserved is
+	 * set to true.
+	 */
+	u64 owning_root;
+
+	/*
+	 * Track reserved bytes when setting must_insert_reserved.  On success
+	 * or cleanup, we will need to free the reservation.
+	 */
+	u64 reserved_bytes;
+
+	/*
 	 * when a new extent is allocated, it is just reserved in memory
 	 * The actual extent isn't inserted into the extent allocation tree
 	 * until the delayed ref is processed.  must_insert_reserved is
@@ -117,6 +135,7 @@ struct btrfs_delayed_ref_head {
 	 * the free has happened.
 	 */
 	bool must_insert_reserved;
+
 	bool is_data;
 	bool is_system;
 	bool processing;
@@ -183,13 +202,13 @@ enum btrfs_ref_type {
 	BTRFS_REF_DATA,
 	BTRFS_REF_METADATA,
 	BTRFS_REF_LAST,
-};
+} __packed;
 
 struct btrfs_data_ref {
 	/* For EXTENT_DATA_REF */
 
-	/* Original root this data extent belongs to */
-	u64 owning_root;
+	/* Root which owns this data reference. */
+	u64 ref_root;
 
 	/* Inode which refers to this data extent */
 	u64 ino;
@@ -212,18 +231,18 @@ struct btrfs_tree_ref {
 	int level;
 
 	/*
-	 * Root which owns this tree block.
+	 * Root which owns this tree block reference.
 	 *
 	 * For TREE_BLOCK_REF (skinny metadata, either inline or keyed)
 	 */
-	u64 owning_root;
+	u64 ref_root;
 
 	/* For non-skinny metadata, no special member needed */
 };
 
 struct btrfs_ref {
 	enum btrfs_ref_type type;
-	int action;
+	enum btrfs_delayed_ref_action action;
 
 	/*
 	 * Whether this extent should go through qgroup record.
@@ -239,6 +258,7 @@ struct btrfs_ref {
 #endif
 	u64 bytenr;
 	u64 len;
+	u64 owning_root;
 
 	/* Bytenr of the parent tree block */
 	u64 parent;
@@ -277,24 +297,37 @@ static inline u64 btrfs_calc_delayed_ref_bytes(const struct btrfs_fs_info *fs_in
 	return num_bytes;
 }
 
+static inline u64 btrfs_calc_delayed_ref_csum_bytes(const struct btrfs_fs_info *fs_info,
+						    int num_csum_items)
+{
+	/*
+	 * Deleting csum items does not result in new nodes/leaves and does not
+	 * require changing the free space tree, only the csum tree, so this is
+	 * all we need.
+	 */
+	return btrfs_calc_metadata_size(fs_info, num_csum_items);
+}
+
 static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
-				int action, u64 bytenr, u64 len, u64 parent)
+					  int action, u64 bytenr, u64 len,
+					  u64 parent, u64 owning_root)
 {
 	generic_ref->action = action;
 	generic_ref->bytenr = bytenr;
 	generic_ref->len = len;
 	generic_ref->parent = parent;
+	generic_ref->owning_root = owning_root;
 }
 
-static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref,
-				int level, u64 root, u64 mod_root, bool skip_qgroup)
+static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level,
+				       u64 root, u64 mod_root, bool skip_qgroup)
 {
 #ifdef CONFIG_BTRFS_FS_REF_VERIFY
 	/* If @real_root not set, use @root as fallback */
 	generic_ref->real_root = mod_root ?: root;
 #endif
 	generic_ref->tree_ref.level = level;
-	generic_ref->tree_ref.owning_root = root;
+	generic_ref->tree_ref.ref_root = root;
 	generic_ref->type = BTRFS_REF_METADATA;
 	if (skip_qgroup || !(is_fstree(root) &&
 			     (!mod_root || is_fstree(mod_root))))
@@ -312,7 +345,7 @@ static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref,
 	/* If @real_root not set, use @root as fallback */
 	generic_ref->real_root = mod_root ?: ref_root;
 #endif
-	generic_ref->data_ref.owning_root = ref_root;
+	generic_ref->data_ref.ref_root = ref_root;
 	generic_ref->data_ref.ino = ino;
 	generic_ref->data_ref.offset = offset;
 	generic_ref->type = BTRFS_REF_DATA;
@@ -338,7 +371,6 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
 
 static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
 {
-	WARN_ON(refcount_read(&ref->refs) == 0);
 	if (refcount_dec_and_test(&ref->refs)) {
 		WARN_ON(!RB_EMPTY_NODE(&ref->ref_node));
 		switch (ref->type) {
@@ -402,8 +434,12 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head(
 
 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
 
-void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr);
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums);
 void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
+void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info);
+void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info);
+void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info);
+void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info);
 int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
 				  enum btrfs_reserve_flush_enum flush);
 void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index fff22ed55c42..f9544fda38e9 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -17,7 +17,6 @@
 #include "print-tree.h"
 #include "volumes.h"
 #include "async-thread.h"
-#include "check-integrity.h"
 #include "dev-replace.h"
 #include "sysfs.h"
 #include "zoned.h"
@@ -247,6 +246,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 {
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	struct btrfs_device *device;
+	struct bdev_handle *bdev_handle;
 	struct block_device *bdev;
 	u64 devid = BTRFS_DEV_REPLACE_DEVID;
 	int ret = 0;
@@ -257,12 +257,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 		return -EINVAL;
 	}
 
-	bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE,
-				  fs_info->bdev_holder, NULL);
-	if (IS_ERR(bdev)) {
+	bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE,
+					fs_info->bdev_holder, NULL);
+	if (IS_ERR(bdev_handle)) {
 		btrfs_err(fs_info, "target device %s is invalid!", device_path);
-		return PTR_ERR(bdev);
+		return PTR_ERR(bdev_handle);
 	}
+	bdev = bdev_handle->bdev;
 
 	if (!btrfs_check_device_zone_type(fs_info, bdev)) {
 		btrfs_err(fs_info,
@@ -313,9 +314,9 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	device->commit_bytes_used = device->bytes_used;
 	device->fs_info = fs_info;
 	device->bdev = bdev;
+	device->bdev_handle = bdev_handle;
 	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 	set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
-	device->holder = fs_info->bdev_holder;
 	device->dev_stats_valid = 1;
 	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
 	device->fs_devices = fs_devices;
@@ -334,7 +335,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	return 0;
 
 error:
-	blkdev_put(bdev, fs_info->bdev_holder);
+	bdev_release(bdev_handle);
 	return ret;
 }
 
@@ -442,7 +443,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
 	dev_replace->item_needs_writeback = 0;
 	up_write(&dev_replace->rwsem);
 
-	btrfs_mark_buffer_dirty(eb);
+	btrfs_mark_buffer_dirty(trans, eb);
 
 out:
 	btrfs_free_path(path);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 082eb0e19598..9c07d5c3e5ad 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -38,7 +38,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
 		di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
 		if (di)
 			return ERR_PTR(-EEXIST);
-		btrfs_extend_item(path, data_size);
+		btrfs_extend_item(trans, path, data_size);
 	} else if (ret < 0)
 		return ERR_PTR(ret);
 	WARN_ON(ret > 0);
@@ -93,7 +93,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
 
 	write_extent_buffer(leaf, name, name_ptr, name_len);
 	write_extent_buffer(leaf, data, data_ptr, data_len);
-	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
 
 	return ret;
 }
@@ -153,7 +153,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
 	name_ptr = (unsigned long)(dir_item + 1);
 
 	write_extent_buffer(leaf, name->name, name_ptr, name->len);
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 
 second_insert:
 	/* FIXME, use some real flag for selecting the extra index */
@@ -439,7 +439,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
 		start = btrfs_item_ptr_offset(leaf, path->slots[0]);
 		memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
 			item_len - (ptr + sub_item_len - start));
-		btrfs_truncate_item(path, item_len - sub_item_len, 1);
+		btrfs_truncate_item(trans, path, item_len - sub_item_len, 1);
 	}
 	return ret;
 }
diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h
index aab4b7cc7fa0..e40a226373d7 100644
--- a/fs/btrfs/dir-item.h
+++ b/fs/btrfs/dir-item.h
@@ -3,6 +3,10 @@
 #ifndef BTRFS_DIR_ITEM_H
 #define BTRFS_DIR_ITEM_H
 
+#include <linux/crc32c.h>
+
+struct fscrypt_str;
+
 int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
 			  const struct fscrypt_str *name);
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
@@ -39,4 +43,9 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
 						 const char *name,
 						 int name_len);
 
+static inline u64 btrfs_name_hash(const char *name, int len)
+{
+       return crc32c((u32)~1, name, len);
+}
+
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 68f60d50e1fd..401ea09ae4b8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -29,7 +29,6 @@
 #include "tree-log.h"
 #include "free-space-cache.h"
 #include "free-space-tree.h"
-#include "check-integrity.h"
 #include "rcu-string.h"
 #include "dev-replace.h"
 #include "raid56.h"
@@ -245,6 +244,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
 	struct extent_buffer *eb = bbio->private;
 	struct btrfs_fs_info *fs_info = eb->fs_info;
 	u64 found_start = btrfs_header_bytenr(eb);
+	u64 last_trans;
 	u8 result[BTRFS_CSUM_SIZE];
 	int ret;
 
@@ -282,12 +282,12 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
 	 * Also check the generation, the eb reached here must be newer than
 	 * last committed. Or something seriously wrong happened.
 	 */
-	if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) {
+	last_trans = btrfs_get_last_trans_committed(fs_info);
+	if (unlikely(btrfs_header_generation(eb) <= last_trans)) {
 		ret = -EUCLEAN;
 		btrfs_err(fs_info,
 			"block=%llu bad generation, have %llu expect > %llu",
-			  eb->start, btrfs_header_generation(eb),
-			  fs_info->last_trans_committed);
+			  eb->start, btrfs_header_generation(eb), last_trans);
 		goto error;
 	}
 	write_extent_buffer(eb, result, 0, fs_info->csum_size);
@@ -318,9 +318,10 @@ static bool check_tree_block_fsid(struct extent_buffer *eb)
 			   BTRFS_FSID_SIZE);
 
 	/*
-	 * alloc_fs_devices() copies the fsid into metadata_uuid if the
-	 * metadata_uuid is unset in the superblock, including for a seed device.
-	 * So, we can use fs_devices->metadata_uuid.
+	 * alloc_fsid_devices() copies the fsid into fs_devices::metadata_uuid.
+	 * This is then overwritten by metadata_uuid if it is present in the
+	 * device_list_add(). The same is true for a seed device as well. So use of
+	 * fs_devices::metadata_uuid is appropriate here.
 	 */
 	if (memcmp(fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) == 0)
 		return false;
@@ -675,9 +676,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 	refcount_set(&root->refs, 1);
 	atomic_set(&root->snapshot_force_cow, 0);
 	atomic_set(&root->nr_swapfiles, 0);
-	root->log_transid = 0;
+	btrfs_set_root_log_transid(root, 0);
 	root->log_transid_committed = -1;
-	root->last_log_commit = 0;
+	btrfs_set_root_last_log_commit(root, 0);
 	root->anon_dev = 0;
 	if (!dummy) {
 		extent_io_tree_init(fs_info, &root->dirty_log_pages,
@@ -859,7 +860,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 	root->root_key.offset = 0;
 
 	leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
-				      BTRFS_NESTING_NORMAL);
+				      0, BTRFS_NESTING_NORMAL);
 	if (IS_ERR(leaf)) {
 		ret = PTR_ERR(leaf);
 		leaf = NULL;
@@ -867,7 +868,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 	}
 
 	root->node = leaf;
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 
 	root->commit_root = btrfs_root_node(root);
 	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
@@ -936,13 +937,13 @@ int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
 	 */
 
 	leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
-			NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
+			NULL, 0, 0, 0, 0, BTRFS_NESTING_NORMAL);
 	if (IS_ERR(leaf))
 		return PTR_ERR(leaf);
 
 	root->node = leaf;
 
-	btrfs_mark_buffer_dirty(root->node);
+	btrfs_mark_buffer_dirty(trans, root->node);
 	btrfs_tree_unlock(root->node);
 
 	return 0;
@@ -1004,9 +1005,9 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 
 	WARN_ON(root->log_root);
 	root->log_root = log_root;
-	root->log_transid = 0;
+	btrfs_set_root_log_transid(root, 0);
 	root->log_transid_committed = -1;
-	root->last_log_commit = 0;
+	btrfs_set_root_last_log_commit(root, 0);
 	return 0;
 }
 
@@ -1179,6 +1180,8 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
 		return btrfs_grab_root(fs_info->block_group_root);
 	case BTRFS_FREE_SPACE_TREE_OBJECTID:
 		return btrfs_grab_root(btrfs_global_root(fs_info, &key));
+	case BTRFS_RAID_STRIPE_TREE_OBJECTID:
+		return btrfs_grab_root(fs_info->stripe_root);
 	default:
 		return NULL;
 	}
@@ -1259,6 +1262,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
 	btrfs_put_root(fs_info->fs_root);
 	btrfs_put_root(fs_info->data_reloc_root);
 	btrfs_put_root(fs_info->block_group_root);
+	btrfs_put_root(fs_info->stripe_root);
 	btrfs_check_leaked_roots(fs_info);
 	btrfs_extent_buffer_leak_debug_check(fs_info);
 	kfree(fs_info->super_copy);
@@ -1402,7 +1406,8 @@ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
 }
 
 /*
- * btrfs_get_fs_root_commit_root - return a root for the given objectid
+ * Return a root for the given objectid.
+ *
  * @fs_info:	the fs_info
  * @objectid:	the objectid we need to lookup
  *
@@ -1699,11 +1704,11 @@ static void backup_super_roots(struct btrfs_fs_info *info)
 }
 
 /*
- * read_backup_root - Reads a backup root based on the passed priority. Prio 0
- * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
+ * Reads a backup root based on the passed priority. Prio 0 is the newest, prio
+ * 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
  *
- * fs_info - filesystem whose backup roots need to be read
- * priority - priority of backup root required
+ * @fs_info:  filesystem whose backup roots need to be read
+ * @priority: priority of backup root required
  *
  * Returns backup root index on success and -EINVAL otherwise.
  */
@@ -1803,6 +1808,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
 	free_root_extent_buffers(info->fs_root);
 	free_root_extent_buffers(info->data_reloc_root);
 	free_root_extent_buffers(info->block_group_root);
+	free_root_extent_buffers(info->stripe_root);
 	if (free_chunk_root)
 		free_root_extent_buffers(info->chunk_root);
 }
@@ -2262,7 +2268,6 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
 	root = btrfs_read_tree_root(tree_root, &location);
 	if (!IS_ERR(root)) {
 		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
-		set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
 		fs_info->quota_root = root;
 	}
 
@@ -2279,6 +2284,20 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
 		fs_info->uuid_root = root;
 	}
 
+	if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) {
+		location.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID;
+		root = btrfs_read_tree_root(tree_root, &location);
+		if (IS_ERR(root)) {
+			if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+				ret = PTR_ERR(root);
+				goto out;
+			}
+		} else {
+			set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+			fs_info->stripe_root = root;
+		}
+	}
+
 	return 0;
 out:
 	btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
@@ -2381,7 +2400,8 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info,
 		ret = -EINVAL;
 	}
 
-	if (memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) {
+	if (!fs_info->fs_devices->temp_fsid &&
+	    memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) {
 		btrfs_err(fs_info,
 		"superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
 			  sb->fsid, fs_info->fs_devices->fsid);
@@ -2634,7 +2654,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
 
 		/* All successful */
 		fs_info->generation = btrfs_header_generation(tree_root->node);
-		fs_info->last_trans_committed = fs_info->generation;
+		btrfs_set_last_trans_committed(fs_info, fs_info->generation);
 		fs_info->last_reloc_trans = 0;
 
 		/* Always begin writing backup roots after the one being used */
@@ -2735,9 +2755,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	spin_lock_init(&fs_info->ordered_root_lock);
 
 	btrfs_init_scrub(fs_info);
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-	fs_info->check_integrity_print_mask = 0;
-#endif
 	btrfs_init_balance(fs_info);
 	btrfs_init_async_reclaim_work(fs_info);
 
@@ -3157,7 +3174,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	u32 nodesize;
 	u32 stripesize;
 	u64 generation;
-	u64 features;
 	u16 csum_type;
 	struct btrfs_super_block *disk_super;
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -3239,15 +3255,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 
 	disk_super = fs_info->super_copy;
 
-
-	features = btrfs_super_flags(disk_super);
-	if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
-		features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2;
-		btrfs_set_super_flags(disk_super, features);
-		btrfs_info(fs_info,
-			"found metadata UUID change in progress flag, clearing");
-	}
-
 	memcpy(fs_info->super_for_commit, fs_info->super_copy,
 	       sizeof(*fs_info->super_for_commit));
 
@@ -3509,18 +3516,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 				   "auto enabling async discard");
 	}
 
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-	if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
-		ret = btrfsic_mount(fs_info, fs_devices,
-				    btrfs_test_opt(fs_info,
-					CHECK_INTEGRITY_DATA) ? 1 : 0,
-				    fs_info->check_integrity_print_mask);
-		if (ret)
-			btrfs_warn(fs_info,
-				"failed to initialize integrity check module: %d",
-				ret);
-	}
-#endif
 	ret = btrfs_read_qgroup_config(fs_info);
 	if (ret)
 		goto fail_trans_kthread;
@@ -3820,8 +3815,6 @@ static int write_dev_supers(struct btrfs_device *device,
 		 */
 		if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
 			bio->bi_opf |= REQ_FUA;
-
-		btrfsic_check_bio(bio);
 		submit_bio(bio);
 
 		if (btrfs_advance_sb_log(device, i))
@@ -3917,28 +3910,11 @@ static void write_dev_flush(struct btrfs_device *device)
 
 	device->last_flush_error = BLK_STS_OK;
 
-#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-	/*
-	 * When a disk has write caching disabled, we skip submission of a bio
-	 * with flush and sync requests before writing the superblock, since
-	 * it's not needed. However when the integrity checker is enabled, this
-	 * results in reports that there are metadata blocks referred by a
-	 * superblock that were not properly flushed. So don't skip the bio
-	 * submission only when the integrity checker is enabled for the sake
-	 * of simplicity, since this is a debug tool and not meant for use in
-	 * non-debug builds.
-	 */
-	if (!bdev_write_cache(device->bdev))
-		return;
-#endif
-
 	bio_init(bio, device->bdev, NULL, 0,
 		 REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
 	bio->bi_end_io = btrfs_end_empty_barrier;
 	init_completion(&device->flush_wait);
 	bio->bi_private = &device->flush_wait;
-
-	btrfsic_check_bio(bio);
 	submit_bio(bio);
 	set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
 }
@@ -4414,16 +4390,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
 
 	iput(fs_info->btree_inode);
 
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-	if (btrfs_test_opt(fs_info, CHECK_INTEGRITY))
-		btrfsic_unmount(fs_info->fs_devices);
-#endif
-
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 	btrfs_close_devices(fs_info->fs_devices);
 }
 
-void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
+void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
+			     struct extent_buffer *buf)
 {
 	struct btrfs_fs_info *fs_info = buf->fs_info;
 	u64 transid = btrfs_header_generation(buf);
@@ -4437,21 +4409,16 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 	if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
 		return;
 #endif
+	/* This is an active transaction (its state < TRANS_STATE_UNBLOCKED). */
+	ASSERT(trans->transid == fs_info->generation);
 	btrfs_assert_tree_write_locked(buf);
-	if (transid != fs_info->generation)
-		WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
-			buf->start, transid, fs_info->generation);
-	set_extent_buffer_dirty(buf);
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-	/*
-	 * btrfs_check_leaf() won't check item data if we don't have WRITTEN
-	 * set, so this will only validate the basic structure of the items.
-	 */
-	if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(buf)) {
-		btrfs_print_leaf(buf);
-		ASSERT(0);
+	if (unlikely(transid != fs_info->generation)) {
+		btrfs_abort_transaction(trans, -EUCLEAN);
+		btrfs_crit(fs_info,
+"dirty buffer transid mismatch, logical %llu found transid %llu running transid %llu",
+			   buf->start, transid, fs_info->generation);
 	}
-#endif
+	set_extent_buffer_dirty(buf);
 }
 
 static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
@@ -4611,6 +4578,7 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 				list_del(&ref->add_list);
 			atomic_dec(&delayed_refs->num_entries);
 			btrfs_put_delayed_ref(ref);
+			btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
 		}
 		if (head->must_insert_reserved)
 			pin_bytes = true;
@@ -4808,7 +4776,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
 
 		spin_unlock(&cur_trans->dirty_bgs_lock);
 		btrfs_put_block_group(cache);
-		btrfs_delayed_refs_rsv_release(fs_info, 1);
+		btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
 		spin_lock(&cur_trans->dirty_bgs_lock);
 	}
 	spin_unlock(&cur_trans->dirty_bgs_lock);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 02b645744a82..50dab8f639dc 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -104,7 +104,8 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
 }
 
 void btrfs_put_root(struct btrfs_root *root);
-void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
+void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
+			     struct extent_buffer *buf);
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
 			  int atomic);
 int btrfs_read_extent_buffer(struct extent_buffer *buf,
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index ff8e117a1ace..ea149be28dff 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -105,32 +105,40 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info,
 		lockdep_set_class(&tree->lock, &file_extent_tree_class);
 }
 
+/*
+ * Empty an io tree, removing and freeing every extent state record from the
+ * tree. This should be called once we are sure no other task can access the
+ * tree anymore, so no tree updates happen after we empty the tree and there
+ * aren't any waiters on any extent state record (EXTENT_LOCKED bit is never
+ * set on any extent state when calling this function).
+ */
 void extent_io_tree_release(struct extent_io_tree *tree)
 {
+	struct rb_root root;
+	struct extent_state *state;
+	struct extent_state *tmp;
+
 	spin_lock(&tree->lock);
-	/*
-	 * Do a single barrier for the waitqueue_active check here, the state
-	 * of the waitqueue should not change once extent_io_tree_release is
-	 * called.
-	 */
-	smp_mb();
-	while (!RB_EMPTY_ROOT(&tree->state)) {
-		struct rb_node *node;
-		struct extent_state *state;
-
-		node = rb_first(&tree->state);
-		state = rb_entry(node, struct extent_state, rb_node);
-		rb_erase(&state->rb_node, &tree->state);
+	root = tree->state;
+	tree->state = RB_ROOT;
+	rbtree_postorder_for_each_entry_safe(state, tmp, &root, rb_node) {
+		/* Clear node to keep free_extent_state() happy. */
 		RB_CLEAR_NODE(&state->rb_node);
+		ASSERT(!(state->state & EXTENT_LOCKED));
 		/*
-		 * btree io trees aren't supposed to have tasks waiting for
-		 * changes in the flags of extent states ever.
+		 * No need for a memory barrier here, as we are holding the tree
+		 * lock and we only change the waitqueue while holding that lock
+		 * (see wait_extent_bit()).
 		 */
 		ASSERT(!waitqueue_active(&state->wq));
 		free_extent_state(state);
-
 		cond_resched_lock(&tree->lock);
 	}
+	/*
+	 * Should still be empty even after a reschedule, no other task should
+	 * be accessing the tree anymore.
+	 */
+	ASSERT(RB_EMPTY_ROOT(&tree->state));
 	spin_unlock(&tree->lock);
 }
 
@@ -327,6 +335,36 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
 	"locking error: extent tree was modified by another thread while locked");
 }
 
+static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state)
+{
+	struct extent_state *prev;
+
+	prev = prev_state(state);
+	if (prev && prev->end == state->start - 1 && prev->state == state->state) {
+		if (tree->inode)
+			btrfs_merge_delalloc_extent(tree->inode, state, prev);
+		state->start = prev->start;
+		rb_erase(&prev->rb_node, &tree->state);
+		RB_CLEAR_NODE(&prev->rb_node);
+		free_extent_state(prev);
+	}
+}
+
+static void merge_next_state(struct extent_io_tree *tree, struct extent_state *state)
+{
+	struct extent_state *next;
+
+	next = next_state(state);
+	if (next && next->start == state->end + 1 && next->state == state->state) {
+		if (tree->inode)
+			btrfs_merge_delalloc_extent(tree->inode, state, next);
+		state->end = next->end;
+		rb_erase(&next->rb_node, &tree->state);
+		RB_CLEAR_NODE(&next->rb_node);
+		free_extent_state(next);
+	}
+}
+
 /*
  * Utility function to look for merge candidates inside a given range.  Any
  * extents with matching state are merged together into a single extent in the
@@ -338,31 +376,11 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
  */
 static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
 {
-	struct extent_state *other;
-
 	if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
 		return;
 
-	other = prev_state(state);
-	if (other && other->end == state->start - 1 &&
-	    other->state == state->state) {
-		if (tree->inode)
-			btrfs_merge_delalloc_extent(tree->inode, state, other);
-		state->start = other->start;
-		rb_erase(&other->rb_node, &tree->state);
-		RB_CLEAR_NODE(&other->rb_node);
-		free_extent_state(other);
-	}
-	other = next_state(state);
-	if (other && other->start == state->end + 1 &&
-	    other->state == state->state) {
-		if (tree->inode)
-			btrfs_merge_delalloc_extent(tree->inode, state, other);
-		state->end = other->end;
-		rb_erase(&other->rb_node, &tree->state);
-		RB_CLEAR_NODE(&other->rb_node);
-		free_extent_state(other);
-	}
+	merge_prev_state(tree, state);
+	merge_next_state(tree, state);
 }
 
 static void set_state_bits(struct extent_io_tree *tree,
@@ -384,19 +402,27 @@ static void set_state_bits(struct extent_io_tree *tree,
  * Insert an extent_state struct into the tree.  'bits' are set on the
  * struct before it is inserted.
  *
- * This may return -EEXIST if the extent is already there, in which case the
- * state struct is freed.
+ * Returns a pointer to the struct extent_state record containing the range
+ * requested for insertion, which may be the same as the given struct or it
+ * may be an existing record in the tree that was expanded to accommodate the
+ * requested range. In case of an extent_state different from the one that was
+ * given, the latter can be freed or reused by the caller.
+ *
+ * On error it returns an error pointer.
  *
  * The tree lock is not taken internally.  This is a utility function and
  * probably isn't what you want to call (see set/clear_extent_bit).
  */
-static int insert_state(struct extent_io_tree *tree,
-			struct extent_state *state,
-			u32 bits, struct extent_changeset *changeset)
+static struct extent_state *insert_state(struct extent_io_tree *tree,
+					 struct extent_state *state,
+					 u32 bits,
+					 struct extent_changeset *changeset)
 {
 	struct rb_node **node;
 	struct rb_node *parent = NULL;
-	const u64 end = state->end;
+	const u64 start = state->start - 1;
+	const u64 end = state->end + 1;
+	const bool try_merge = !(bits & (EXTENT_LOCKED | EXTENT_BOUNDARY));
 
 	set_state_bits(tree, state, bits, changeset);
 
@@ -407,23 +433,42 @@ static int insert_state(struct extent_io_tree *tree,
 		parent = *node;
 		entry = rb_entry(parent, struct extent_state, rb_node);
 
-		if (end < entry->start) {
+		if (state->end < entry->start) {
+			if (try_merge && end == entry->start &&
+			    state->state == entry->state) {
+				if (tree->inode)
+					btrfs_merge_delalloc_extent(tree->inode,
+								    state, entry);
+				entry->start = state->start;
+				merge_prev_state(tree, entry);
+				state->state = 0;
+				return entry;
+			}
 			node = &(*node)->rb_left;
-		} else if (end > entry->end) {
+		} else if (state->end > entry->end) {
+			if (try_merge && entry->end == start &&
+			    state->state == entry->state) {
+				if (tree->inode)
+					btrfs_merge_delalloc_extent(tree->inode,
+								    state, entry);
+				entry->end = state->end;
+				merge_next_state(tree, entry);
+				state->state = 0;
+				return entry;
+			}
 			node = &(*node)->rb_right;
 		} else {
 			btrfs_err(tree->fs_info,
 			       "found node %llu %llu on insert of %llu %llu",
-			       entry->start, entry->end, state->start, end);
-			return -EEXIST;
+			       entry->start, entry->end, state->start, state->end);
+			return ERR_PTR(-EEXIST);
 		}
 	}
 
 	rb_link_node(&state->rb_node, parent, node);
 	rb_insert_color(&state->rb_node, &tree->state);
 
-	merge_state(tree, state);
-	return 0;
+	return state;
 }
 
 /*
@@ -708,26 +753,13 @@ out:
 
 }
 
-static void wait_on_state(struct extent_io_tree *tree,
-			  struct extent_state *state)
-		__releases(tree->lock)
-		__acquires(tree->lock)
-{
-	DEFINE_WAIT(wait);
-	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
-	spin_unlock(&tree->lock);
-	schedule();
-	spin_lock(&tree->lock);
-	finish_wait(&state->wq, &wait);
-}
-
 /*
  * Wait for one or more bits to clear on a range in the state tree.
  * The range [start, end] is inclusive.
  * The tree lock is taken by this function
  */
-void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
-		     struct extent_state **cached_state)
+static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+			    u32 bits, struct extent_state **cached_state)
 {
 	struct extent_state *state;
 
@@ -758,9 +790,15 @@ process_node:
 			goto out;
 
 		if (state->state & bits) {
+			DEFINE_WAIT(wait);
+
 			start = state->start;
 			refcount_inc(&state->refs);
-			wait_on_state(tree, state);
+			prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
+			spin_unlock(&tree->lock);
+			schedule();
+			spin_lock(&tree->lock);
+			finish_wait(&state->wq, &wait);
 			free_extent_state(state);
 			goto again;
 		}
@@ -847,10 +885,19 @@ bool find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 		if (state->end == start - 1 && extent_state_in_tree(state)) {
 			while ((state = next_state(state)) != NULL) {
 				if (state->state & bits)
-					goto got_it;
+					break;
 			}
+			/*
+			 * If we found the next extent state, clear cached_state
+			 * so that we can cache the next extent state below and
+			 * avoid future calls going over the same extent state
+			 * again. If we haven't found any, clear as well since
+			 * it's now useless.
+			 */
 			free_extent_state(*cached_state);
 			*cached_state = NULL;
+			if (state)
+				goto got_it;
 			goto out;
 		}
 		free_extent_state(*cached_state);
@@ -1133,6 +1180,8 @@ hit_next:
 	 */
 	if (state->start > start) {
 		u64 this_end;
+		struct extent_state *inserted_state;
+
 		if (end < last_start)
 			this_end = end;
 		else
@@ -1148,12 +1197,15 @@ hit_next:
 		 */
 		prealloc->start = start;
 		prealloc->end = this_end;
-		err = insert_state(tree, prealloc, bits, changeset);
-		if (err)
+		inserted_state = insert_state(tree, prealloc, bits, changeset);
+		if (IS_ERR(inserted_state)) {
+			err = PTR_ERR(inserted_state);
 			extent_io_tree_panic(tree, err);
+		}
 
-		cache_state(prealloc, cached_state);
-		prealloc = NULL;
+		cache_state(inserted_state, cached_state);
+		if (inserted_state == prealloc)
+			prealloc = NULL;
 		start = this_end + 1;
 		goto search_again;
 	}
@@ -1356,6 +1408,8 @@ hit_next:
 	 */
 	if (state->start > start) {
 		u64 this_end;
+		struct extent_state *inserted_state;
+
 		if (end < last_start)
 			this_end = end;
 		else
@@ -1373,11 +1427,14 @@ hit_next:
 		 */
 		prealloc->start = start;
 		prealloc->end = this_end;
-		err = insert_state(tree, prealloc, bits, NULL);
-		if (err)
+		inserted_state = insert_state(tree, prealloc, bits, NULL);
+		if (IS_ERR(inserted_state)) {
+			err = PTR_ERR(inserted_state);
 			extent_io_tree_panic(tree, err);
-		cache_state(prealloc, cached_state);
-		prealloc = NULL;
+		}
+		cache_state(inserted_state, cached_state);
+		if (inserted_state == prealloc)
+			prealloc = NULL;
 		start = this_end + 1;
 		goto search_again;
 	}
@@ -1640,15 +1697,46 @@ search:
 }
 
 /*
- * Search a range in the state tree for a given mask.  If 'filled' == 1, this
- * returns 1 only if every extent in the tree has the bits set.  Otherwise, 1
- * is returned if any bit in the range is found set.
+ * Check if the single @bit is set anywhere in the range [@start,@end].
+ */
+bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit)
+{
+	struct extent_state *state = NULL;
+	bool bitset = false;
+
+	ASSERT(is_power_of_2(bit));
+
+	spin_lock(&tree->lock);
+	state = tree_search(tree, start);
+	while (state && start <= end) {
+		if (state->start > end)
+			break;
+
+		if (state->state & bit) {
+			bitset = true;
+			break;
+		}
+
+		/* If state->end is (u64)-1, start will overflow to 0 */
+		start = state->end + 1;
+		if (start > end || start == 0)
+			break;
+		state = next_state(state);
+	}
+	spin_unlock(&tree->lock);
+	return bitset;
+}
+
+/*
+ * Check if the whole range [@start,@end] contains the single @bit set.
  */
-int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		   u32 bits, int filled, struct extent_state *cached)
+bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
+		    struct extent_state *cached)
 {
 	struct extent_state *state = NULL;
-	int bitset = 0;
+	bool bitset = true;
+
+	ASSERT(is_power_of_2(bit));
 
 	spin_lock(&tree->lock);
 	if (cached && extent_state_in_tree(cached) && cached->start <= start &&
@@ -1657,35 +1745,35 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	else
 		state = tree_search(tree, start);
 	while (state && start <= end) {
-		if (filled && state->start > start) {
-			bitset = 0;
+		if (state->start > start) {
+			bitset = false;
 			break;
 		}
 
 		if (state->start > end)
 			break;
 
-		if (state->state & bits) {
-			bitset = 1;
-			if (!filled)
-				break;
-		} else if (filled) {
-			bitset = 0;
+		if ((state->state & bit) == 0) {
+			bitset = false;
 			break;
 		}
 
 		if (state->end == (u64)-1)
 			break;
 
+		/*
+		 * Last entry (if state->end is (u64)-1 and overflow happens),
+		 * or next entry starts after the range.
+		 */
 		start = state->end + 1;
-		if (start > end)
+		if (start > end || start == 0)
 			break;
 		state = next_state(state);
 	}
 
 	/* We ran out of states and were still inside of our range. */
-	if (filled && !state)
-		bitset = 0;
+	if (!state)
+		bitset = false;
 	spin_unlock(&tree->lock);
 	return bitset;
 }
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 28c23a23d121..5602b0137fcd 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -131,8 +131,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
 		     struct extent_state **cached_state);
 
 void free_extent_state(struct extent_state *state);
-int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		   u32 bits, int filled, struct extent_state *cached_state);
+bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
+		    struct extent_state *cached_state);
+bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit);
 int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 			     u32 bits, struct extent_changeset *changeset);
 int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -192,7 +193,5 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
 bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
 			       u64 *end, u64 max_bytes,
 			       struct extent_state **cached_state);
-void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
-		     struct extent_state **cached_state);
 
 #endif /* BTRFS_EXTENT_IO_TREE_H */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fc313fce5bbd..c8e5b4715b49 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -42,14 +42,16 @@
 #include "file-item.h"
 #include "orphan.h"
 #include "tree-checker.h"
+#include "raid-stripe-tree.h"
 
 #undef SCRAMBLE_DELAYED_REFS
 
 
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+			       struct btrfs_delayed_ref_head *href,
 			       struct btrfs_delayed_ref_node *node, u64 parent,
 			       u64 root_objectid, u64 owner_objectid,
-			       u64 owner_offset, int refs_to_drop,
+			       u64 owner_offset,
 			       struct btrfs_delayed_extent_op *extra_op);
 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
 				    struct extent_buffer *leaf,
@@ -57,7 +59,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 				      u64 parent, u64 root_objectid,
 				      u64 flags, u64 owner, u64 offset,
-				      struct btrfs_key *ins, int ref_mod);
+				      struct btrfs_key *ins, int ref_mod, u64 oref_root);
 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 				     struct btrfs_delayed_ref_node *node,
 				     struct btrfs_delayed_extent_op *extent_op);
@@ -344,9 +346,15 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
 				     struct btrfs_extent_inline_ref *iref,
 				     enum btrfs_inline_ref_type is_data)
 {
+	struct btrfs_fs_info *fs_info = eb->fs_info;
 	int type = btrfs_extent_inline_ref_type(eb, iref);
 	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
 
+	if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
+		ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+		return type;
+	}
+
 	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
 	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
 	    type == BTRFS_SHARED_DATA_REF_KEY ||
@@ -355,26 +363,25 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
 			if (type == BTRFS_TREE_BLOCK_REF_KEY)
 				return type;
 			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
-				ASSERT(eb->fs_info);
+				ASSERT(fs_info);
 				/*
 				 * Every shared one has parent tree block,
 				 * which must be aligned to sector size.
 				 */
-				if (offset &&
-				    IS_ALIGNED(offset, eb->fs_info->sectorsize))
+				if (offset && IS_ALIGNED(offset, fs_info->sectorsize))
 					return type;
 			}
 		} else if (is_data == BTRFS_REF_TYPE_DATA) {
 			if (type == BTRFS_EXTENT_DATA_REF_KEY)
 				return type;
 			if (type == BTRFS_SHARED_DATA_REF_KEY) {
-				ASSERT(eb->fs_info);
+				ASSERT(fs_info);
 				/*
 				 * Every shared one has parent tree block,
 				 * which must be aligned to sector size.
 				 */
 				if (offset &&
-				    IS_ALIGNED(offset, eb->fs_info->sectorsize))
+				    IS_ALIGNED(offset, fs_info->sectorsize))
 					return type;
 			}
 		} else {
@@ -385,7 +392,7 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
 
 	WARN_ON(1);
 	btrfs_print_leaf(eb);
-	btrfs_err(eb->fs_info,
+	btrfs_err(fs_info,
 		  "eb %llu iref 0x%lx invalid extent inline ref type %d",
 		  eb->start, (unsigned long)iref, type);
 
@@ -399,11 +406,11 @@ u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
 	__le64 lenum;
 
 	lenum = cpu_to_le64(root_objectid);
-	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
+	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
 	lenum = cpu_to_le64(owner);
-	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
 	lenum = cpu_to_le64(offset);
-	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
 
 	return ((u64)high_crc << 31) ^ (u64)low_crc;
 }
@@ -575,7 +582,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
 			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
 		}
 	}
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 	ret = 0;
 fail:
 	btrfs_release_path(path);
@@ -623,7 +630,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
 			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
 		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
 			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
-		btrfs_mark_buffer_dirty(leaf);
+		btrfs_mark_buffer_dirty(trans, leaf);
 	}
 	return ret;
 }
@@ -789,7 +796,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
 	int type;
 	int want;
 	int ret;
-	int err = 0;
 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
 	int needed;
 
@@ -816,10 +822,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
 
 again:
 	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
-	if (ret < 0) {
-		err = ret;
+	if (ret < 0)
 		goto out;
-	}
 
 	/*
 	 * We may be a newly converted file system which still has the old fat
@@ -846,7 +850,7 @@ again:
 	}
 
 	if (ret && !insert) {
-		err = -ENOENT;
+		ret = -ENOENT;
 		goto out;
 	} else if (WARN_ON(ret)) {
 		btrfs_print_leaf(path->nodes[0]);
@@ -854,18 +858,18 @@ again:
 "extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
 			  bytenr, num_bytes, parent, root_objectid, owner,
 			  offset);
-		err = -EIO;
+		ret = -EUCLEAN;
 		goto out;
 	}
 
 	leaf = path->nodes[0];
 	item_size = btrfs_item_size(leaf, path->slots[0]);
 	if (unlikely(item_size < sizeof(*ei))) {
-		err = -EUCLEAN;
+		ret = -EUCLEAN;
 		btrfs_err(fs_info,
 			  "unexpected extent item size, has %llu expect >= %zu",
 			  item_size, sizeof(*ei));
-		btrfs_abort_transaction(trans, err);
+		btrfs_abort_transaction(trans, ret);
 		goto out;
 	}
 
@@ -885,22 +889,17 @@ again:
 	else
 		needed = BTRFS_REF_TYPE_BLOCK;
 
-	err = -ENOENT;
-	while (1) {
-		if (ptr >= end) {
-			if (ptr > end) {
-				err = -EUCLEAN;
-				btrfs_print_leaf(path->nodes[0]);
-				btrfs_crit(fs_info,
-"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
-					path->slots[0], root_objectid, owner, offset, parent);
-			}
-			break;
-		}
+	ret = -ENOENT;
+	while (ptr < end) {
 		iref = (struct btrfs_extent_inline_ref *)ptr;
 		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
+		if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
+			ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+			ptr += btrfs_extent_inline_ref_size(type);
+			continue;
+		}
 		if (type == BTRFS_REF_TYPE_INVALID) {
-			err = -EUCLEAN;
+			ret = -EUCLEAN;
 			goto out;
 		}
 
@@ -916,7 +915,7 @@ again:
 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
 			if (match_extent_data_ref(leaf, dref, root_objectid,
 						  owner, offset)) {
-				err = 0;
+				ret = 0;
 				break;
 			}
 			if (hash_extent_data_ref_item(leaf, dref) <
@@ -927,14 +926,14 @@ again:
 			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
 			if (parent > 0) {
 				if (parent == ref_offset) {
-					err = 0;
+					ret = 0;
 					break;
 				}
 				if (ref_offset < parent)
 					break;
 			} else {
 				if (root_objectid == ref_offset) {
-					err = 0;
+					ret = 0;
 					break;
 				}
 				if (ref_offset < root_objectid)
@@ -943,10 +942,20 @@ again:
 		}
 		ptr += btrfs_extent_inline_ref_size(type);
 	}
-	if (err == -ENOENT && insert) {
+
+	if (unlikely(ptr > end)) {
+		ret = -EUCLEAN;
+		btrfs_print_leaf(path->nodes[0]);
+		btrfs_crit(fs_info,
+"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
+			   path->slots[0], root_objectid, owner, offset, parent);
+		goto out;
+	}
+
+	if (ret == -ENOENT && insert) {
 		if (item_size + extra_size >=
 		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
-			err = -EAGAIN;
+			ret = -EAGAIN;
 			goto out;
 		}
 		/*
@@ -958,7 +967,7 @@ again:
 		if (find_next_key(path, 0, &key) == 0 &&
 		    key.objectid == bytenr &&
 		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
-			err = -EAGAIN;
+			ret = -EAGAIN;
 			goto out;
 		}
 	}
@@ -969,14 +978,14 @@ out:
 		path->search_for_extension = 0;
 		btrfs_unlock_up_safe(path, 1);
 	}
-	return err;
+	return ret;
 }
 
 /*
  * helper to add new inline back ref
  */
 static noinline_for_stack
-void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
+void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
 				 struct btrfs_path *path,
 				 struct btrfs_extent_inline_ref *iref,
 				 u64 parent, u64 root_objectid,
@@ -999,7 +1008,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
 	type = extent_ref_type(parent, owner);
 	size = btrfs_extent_inline_ref_size(type);
 
-	btrfs_extend_item(path, size);
+	btrfs_extend_item(trans, path, size);
 
 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 	refs = btrfs_extent_refs(leaf, ei);
@@ -1033,7 +1042,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
 	} else {
 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
 	}
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 }
 
 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
@@ -1066,7 +1075,9 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans,
 /*
  * helper to update/remove inline back ref
  */
-static noinline_for_stack int update_inline_extent_backref(struct btrfs_path *path,
+static noinline_for_stack int update_inline_extent_backref(
+				  struct btrfs_trans_handle *trans,
+				  struct btrfs_path *path,
 				  struct btrfs_extent_inline_ref *iref,
 				  int refs_to_mod,
 				  struct btrfs_delayed_extent_op *extent_op)
@@ -1174,9 +1185,9 @@ static noinline_for_stack int update_inline_extent_backref(struct btrfs_path *pa
 			memmove_extent_buffer(leaf, ptr, ptr + size,
 					      end - ptr - size);
 		item_size -= size;
-		btrfs_truncate_item(path, item_size, 1);
+		btrfs_truncate_item(trans, path, item_size, 1);
 	}
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 	return 0;
 }
 
@@ -1206,9 +1217,10 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
 				   bytenr, num_bytes, root_objectid, path->slots[0]);
 			return -EUCLEAN;
 		}
-		ret = update_inline_extent_backref(path, iref, refs_to_add, extent_op);
+		ret = update_inline_extent_backref(trans, path, iref,
+						   refs_to_add, extent_op);
 	} else if (ret == -ENOENT) {
-		setup_inline_extent_backref(trans->fs_info, path, iref, parent,
+		setup_inline_extent_backref(trans, path, iref, parent,
 					    root_objectid, owner, offset,
 					    refs_to_add, extent_op);
 		ret = 0;
@@ -1226,7 +1238,8 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
 
 	BUG_ON(!is_data && refs_to_drop != 1);
 	if (iref)
-		ret = update_inline_extent_backref(path, iref, -refs_to_drop, NULL);
+		ret = update_inline_extent_backref(trans, path, iref,
+						   -refs_to_drop, NULL);
 	else if (is_data)
 		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
 	else
@@ -1422,7 +1435,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
 	       generic_ref->action);
 	BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
-	       generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID);
+	       generic_ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID);
 
 	if (generic_ref->type == BTRFS_REF_METADATA)
 		ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
@@ -1435,7 +1448,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 }
 
 /*
- * __btrfs_inc_extent_ref - insert backreference for a given extent
+ * Insert backreference for a given extent.
  *
  * The counterpart is in __btrfs_free_extent(), with examples and more details
  * how it works.
@@ -1465,8 +1478,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
  *		    always passed as 0. For data extents it is the fileoffset
  *		    this extent belongs to.
  *
- * @refs_to_add     Number of references to add
- *
  * @extent_op       Pointer to a structure, holding information necessary when
  *                  updating a tree block's flags
  *
@@ -1474,7 +1485,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				  struct btrfs_delayed_ref_node *node,
 				  u64 parent, u64 root_objectid,
-				  u64 owner, u64 offset, int refs_to_add,
+				  u64 owner, u64 offset,
 				  struct btrfs_delayed_extent_op *extent_op)
 {
 	struct btrfs_path *path;
@@ -1484,6 +1495,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	u64 bytenr = node->bytenr;
 	u64 num_bytes = node->num_bytes;
 	u64 refs;
+	int refs_to_add = node->ref_mod;
 	int ret;
 
 	path = btrfs_alloc_path();
@@ -1510,7 +1522,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	if (extent_op)
 		__run_delayed_extent_op(extent_op, leaf, item);
 
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 	btrfs_release_path(path);
 
 	/* now insert the actual backref */
@@ -1530,44 +1542,57 @@ out:
 }
 
 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
+				struct btrfs_delayed_ref_head *href,
 				struct btrfs_delayed_ref_node *node,
 				struct btrfs_delayed_extent_op *extent_op,
 				bool insert_reserved)
 {
 	int ret = 0;
 	struct btrfs_delayed_data_ref *ref;
-	struct btrfs_key ins;
 	u64 parent = 0;
-	u64 ref_root = 0;
 	u64 flags = 0;
 
-	ins.objectid = node->bytenr;
-	ins.offset = node->num_bytes;
-	ins.type = BTRFS_EXTENT_ITEM_KEY;
-
 	ref = btrfs_delayed_node_to_data_ref(node);
 	trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
 
 	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
 		parent = ref->parent;
-	ref_root = ref->root;
 
 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
+		struct btrfs_key key;
+		struct btrfs_squota_delta delta = {
+			.root = href->owning_root,
+			.num_bytes = node->num_bytes,
+			.rsv_bytes = href->reserved_bytes,
+			.is_data = true,
+			.is_inc	= true,
+			.generation = trans->transid,
+		};
+
 		if (extent_op)
 			flags |= extent_op->flags_to_set;
-		ret = alloc_reserved_file_extent(trans, parent, ref_root,
+
+		key.objectid = node->bytenr;
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+		key.offset = node->num_bytes;
+
+		ret = alloc_reserved_file_extent(trans, parent, ref->root,
 						 flags, ref->objectid,
-						 ref->offset, &ins,
-						 node->ref_mod);
+						 ref->offset, &key,
+						 node->ref_mod, href->owning_root);
+		if (!ret)
+			ret = btrfs_record_squota_delta(trans->fs_info, &delta);
+		else
+			btrfs_qgroup_free_refroot(trans->fs_info, delta.root,
+						  delta.rsv_bytes, BTRFS_QGROUP_RSV_DATA);
 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
-		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
+		ret = __btrfs_inc_extent_ref(trans, node, parent, ref->root,
 					     ref->objectid, ref->offset,
-					     node->ref_mod, extent_op);
+					     extent_op);
 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
-		ret = __btrfs_free_extent(trans, node, parent,
-					  ref_root, ref->objectid,
-					  ref->offset, node->ref_mod,
-					  extent_op);
+		ret = __btrfs_free_extent(trans, href, node, parent,
+					  ref->root, ref->objectid,
+					  ref->offset, extent_op);
 	} else {
 		BUG();
 	}
@@ -1604,7 +1629,6 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
 	struct extent_buffer *leaf;
 	u32 item_size;
 	int ret;
-	int err = 0;
 	int metadata = 1;
 
 	if (TRANS_ABORTED(trans))
@@ -1631,10 +1655,8 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
 again:
 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 	if (ret < 0) {
-		err = ret;
 		goto out;
-	}
-	if (ret > 0) {
+	} else if (ret > 0) {
 		if (metadata) {
 			if (path->slots[0] > 0) {
 				path->slots[0]--;
@@ -1655,7 +1677,7 @@ again:
 				goto again;
 			}
 		} else {
-			err = -EUCLEAN;
+			ret = -EUCLEAN;
 			btrfs_err(fs_info,
 		  "missing extent item for extent %llu num_bytes %llu level %d",
 				  head->bytenr, head->num_bytes, extent_op->level);
@@ -1667,29 +1689,31 @@ again:
 	item_size = btrfs_item_size(leaf, path->slots[0]);
 
 	if (unlikely(item_size < sizeof(*ei))) {
-		err = -EUCLEAN;
+		ret = -EUCLEAN;
 		btrfs_err(fs_info,
 			  "unexpected extent item size, has %u expect >= %zu",
 			  item_size, sizeof(*ei));
-		btrfs_abort_transaction(trans, err);
+		btrfs_abort_transaction(trans, ret);
 		goto out;
 	}
 
 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 	__run_delayed_extent_op(extent_op, leaf, ei);
 
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 out:
 	btrfs_free_path(path);
-	return err;
+	return ret;
 }
 
 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
+				struct btrfs_delayed_ref_head *href,
 				struct btrfs_delayed_ref_node *node,
 				struct btrfs_delayed_extent_op *extent_op,
 				bool insert_reserved)
 {
 	int ret = 0;
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_delayed_tree_ref *ref;
 	u64 parent = 0;
 	u64 ref_root = 0;
@@ -1709,14 +1733,25 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 		return -EUCLEAN;
 	}
 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
+		struct btrfs_squota_delta delta = {
+			.root = href->owning_root,
+			.num_bytes = fs_info->nodesize,
+			.rsv_bytes = 0,
+			.is_data = false,
+			.is_inc = true,
+			.generation = trans->transid,
+		};
+
 		BUG_ON(!extent_op || !extent_op->update_flags);
 		ret = alloc_reserved_tree_block(trans, node, extent_op);
+		if (!ret)
+			btrfs_record_squota_delta(fs_info, &delta);
 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
 		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
-					     ref->level, 0, 1, extent_op);
+					     ref->level, 0, extent_op);
 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
-		ret = __btrfs_free_extent(trans, node, parent, ref_root,
-					  ref->level, 0, 1, extent_op);
+		ret = __btrfs_free_extent(trans, href, node, parent, ref_root,
+					  ref->level, 0, extent_op);
 	} else {
 		BUG();
 	}
@@ -1725,6 +1760,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 
 /* helper function to actually process a single delayed ref entry */
 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
+			       struct btrfs_delayed_ref_head *href,
 			       struct btrfs_delayed_ref_node *node,
 			       struct btrfs_delayed_extent_op *extent_op,
 			       bool insert_reserved)
@@ -1739,12 +1775,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 
 	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
 	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
-		ret = run_delayed_tree_ref(trans, node, extent_op,
+		ret = run_delayed_tree_ref(trans, href, node, extent_op,
 					   insert_reserved);
 	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
 		 node->type == BTRFS_SHARED_DATA_REF_KEY)
-		ret = run_delayed_data_ref(trans, node, extent_op,
+		ret = run_delayed_data_ref(trans, href, node, extent_op,
 					   insert_reserved);
+	else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY)
+		ret = 0;
 	else
 		BUG();
 	if (ret && insert_reserved)
@@ -1823,28 +1861,37 @@ static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
 	return ret ? ret : 1;
 }
 
-void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
+u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
 				  struct btrfs_delayed_ref_root *delayed_refs,
 				  struct btrfs_delayed_ref_head *head)
 {
-	int nr_items = 1;	/* Dropping this ref head update. */
-
 	/*
 	 * We had csum deletions accounted for in our delayed refs rsv, we need
 	 * to drop the csum leaves for this update from our delayed_refs_rsv.
 	 */
 	if (head->total_ref_mod < 0 && head->is_data) {
+		int nr_csums;
+
 		spin_lock(&delayed_refs->lock);
 		delayed_refs->pending_csums -= head->num_bytes;
 		spin_unlock(&delayed_refs->lock);
-		nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
+		nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
+
+		btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums);
+
+		return btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
 	}
+	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE &&
+	    head->must_insert_reserved && head->is_data)
+		btrfs_qgroup_free_refroot(fs_info, head->owning_root,
+					  head->reserved_bytes, BTRFS_QGROUP_RSV_DATA);
 
-	btrfs_delayed_refs_rsv_release(fs_info, nr_items);
+	return 0;
 }
 
 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
-			    struct btrfs_delayed_ref_head *head)
+			    struct btrfs_delayed_ref_head *head,
+			    u64 *bytes_released)
 {
 
 	struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1889,7 +1936,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
 		}
 	}
 
-	btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
+	*bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
 
 	trace_run_delayed_ref_head(fs_info, head, 0);
 	btrfs_delayed_ref_unlock(head);
@@ -1931,7 +1978,8 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
 }
 
 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
-					   struct btrfs_delayed_ref_head *locked_ref)
+					   struct btrfs_delayed_ref_head *locked_ref,
+					   u64 *bytes_released)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_delayed_ref_root *delayed_refs;
@@ -1985,8 +2033,10 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
 		locked_ref->extent_op = NULL;
 		spin_unlock(&locked_ref->lock);
 
-		ret = run_one_delayed_ref(trans, ref, extent_op,
+		ret = run_one_delayed_ref(trans, locked_ref, ref, extent_op,
 					  must_insert_reserved);
+		btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
+		*bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1);
 
 		btrfs_free_delayed_extent_op(extent_op);
 		if (ret) {
@@ -2010,15 +2060,22 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
  */
 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
-					     unsigned long nr)
+					     u64 min_bytes)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_delayed_ref_head *locked_ref = NULL;
 	int ret;
 	unsigned long count = 0;
+	unsigned long max_count = 0;
+	u64 bytes_processed = 0;
 
 	delayed_refs = &trans->transaction->delayed_refs;
+	if (min_bytes == 0) {
+		max_count = delayed_refs->num_heads_ready;
+		min_bytes = U64_MAX;
+	}
+
 	do {
 		if (!locked_ref) {
 			locked_ref = btrfs_obtain_ref_head(trans);
@@ -2046,7 +2103,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 		spin_lock(&locked_ref->lock);
 		btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
 
-		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref);
+		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed);
 		if (ret < 0 && ret != -EAGAIN) {
 			/*
 			 * Error, btrfs_run_delayed_refs_for_head already
@@ -2058,7 +2115,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			 * Success, perform the usual cleanup of a processed
 			 * head
 			 */
-			ret = cleanup_ref_head(trans, locked_ref);
+			ret = cleanup_ref_head(trans, locked_ref, &bytes_processed);
 			if (ret > 0 ) {
 				/* We dropped our lock, we need to loop. */
 				ret = 0;
@@ -2075,7 +2132,9 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 
 		locked_ref = NULL;
 		cond_resched();
-	} while ((nr != -1 && count < nr) || locked_ref);
+	} while ((min_bytes != U64_MAX && bytes_processed < min_bytes) ||
+		 (max_count > 0 && count < max_count) ||
+		 locked_ref);
 
 	return 0;
 }
@@ -2124,24 +2183,25 @@ static u64 find_middle(struct rb_root *root)
 #endif
 
 /*
- * this starts processing the delayed reference count updates and
- * extent insertions we have queued up so far.  count can be
- * 0, which means to process everything in the tree at the start
- * of the run (but not newly added entries), or it can be some target
- * number you'd like to process.
+ * Start processing the delayed reference count updates and extent insertions
+ * we have queued up so far.
+ *
+ * @trans:	Transaction handle.
+ * @min_bytes:	How many bytes of delayed references to process. After this
+ *		many bytes we stop processing delayed references if there are
+ *		any more. If 0 it means to run all existing delayed references,
+ *		but not new ones added after running all existing ones.
+ *		Use (u64)-1 (U64_MAX) to run all existing delayed references
+ *		plus any new ones that are added.
  *
  * Returns 0 on success or if called with an aborted transaction
  * Returns <0 on error and aborts the transaction
  */
-int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
-			   unsigned long count)
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct rb_node *node;
 	struct btrfs_delayed_ref_root *delayed_refs;
-	struct btrfs_delayed_ref_head *head;
 	int ret;
-	int run_all = count == (unsigned long)-1;
 
 	/* We'll clean this up in btrfs_cleanup_transaction */
 	if (TRANS_ABORTED(trans))
@@ -2151,42 +2211,30 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 		return 0;
 
 	delayed_refs = &trans->transaction->delayed_refs;
-	if (count == 0)
-		count = delayed_refs->num_heads_ready;
-
 again:
 #ifdef SCRAMBLE_DELAYED_REFS
 	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
 #endif
-	ret = __btrfs_run_delayed_refs(trans, count);
+	ret = __btrfs_run_delayed_refs(trans, min_bytes);
 	if (ret < 0) {
 		btrfs_abort_transaction(trans, ret);
 		return ret;
 	}
 
-	if (run_all) {
+	if (min_bytes == U64_MAX) {
 		btrfs_create_pending_block_groups(trans);
 
 		spin_lock(&delayed_refs->lock);
-		node = rb_first_cached(&delayed_refs->href_root);
-		if (!node) {
+		if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) {
 			spin_unlock(&delayed_refs->lock);
-			goto out;
+			return 0;
 		}
-		head = rb_entry(node, struct btrfs_delayed_ref_head,
-				href_node);
-		refcount_inc(&head->refs);
 		spin_unlock(&delayed_refs->lock);
 
-		/* Mutex was contended, block until it's released and retry. */
-		mutex_lock(&head->mutex);
-		mutex_unlock(&head->mutex);
-
-		btrfs_put_delayed_ref_head(head);
 		cond_resched();
 		goto again;
 	}
-out:
+
 	return 0;
 }
 
@@ -2311,6 +2359,7 @@ static noinline int check_committed_ref(struct btrfs_root *root,
 	struct btrfs_extent_item *ei;
 	struct btrfs_key key;
 	u32 item_size;
+	u32 expected_size;
 	int type;
 	int ret;
 
@@ -2337,10 +2386,22 @@ static noinline int check_committed_ref(struct btrfs_root *root,
 	ret = 1;
 	item_size = btrfs_item_size(leaf, path->slots[0]);
 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY);
+
+	/* No inline refs; we need to bail before checking for owner ref. */
+	if (item_size == sizeof(*ei))
+		goto out;
+
+	/* Check for an owner ref; skip over it to the real inline refs. */
+	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
+	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
+	if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) {
+		expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
+		iref = (struct btrfs_extent_inline_ref *)(iref + 1);
+	}
 
 	/* If extent item has more than 1 inline ref then it's shared */
-	if (item_size != sizeof(*ei) +
-	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
+	if (item_size != expected_size)
 		goto out;
 
 	/*
@@ -2352,8 +2413,6 @@ static noinline int check_committed_ref(struct btrfs_root *root,
 	     btrfs_root_last_snapshot(&root->root_item)))
 		goto out;
 
-	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
-
 	/* If this extent has SHARED_DATA_REF then it's shared */
 	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
 	if (type != BTRFS_EXTENT_DATA_REF_KEY)
@@ -2450,7 +2509,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
 			key.offset -= btrfs_file_extent_offset(buf, fi);
 			btrfs_init_generic_ref(&generic_ref, action, bytenr,
-					       num_bytes, parent);
+					       num_bytes, parent, ref_root);
 			btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
 					    key.offset, root->root_key.objectid,
 					    for_reloc);
@@ -2463,8 +2522,9 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 		} else {
 			bytenr = btrfs_node_blockptr(buf, i);
 			num_bytes = fs_info->nodesize;
+			/* We don't know the owning_root, use 0. */
 			btrfs_init_generic_ref(&generic_ref, action, bytenr,
-					       num_bytes, parent);
+					       num_bytes, parent, 0);
 			btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
 					    root->root_key.objectid, for_reloc);
 			if (inc)
@@ -2565,16 +2625,13 @@ int btrfs_pin_extent(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-/*
- * this function must be called within transaction
- */
 int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
-				    u64 bytenr, u64 num_bytes)
+				    const struct extent_buffer *eb)
 {
 	struct btrfs_block_group *cache;
 	int ret;
 
-	cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
+	cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
 	if (!cache)
 		return -EINVAL;
 
@@ -2586,10 +2643,10 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
 	if (ret)
 		goto out;
 
-	pin_down_extent(trans, cache, bytenr, num_bytes, 0);
+	pin_down_extent(trans, cache, eb->start, eb->len, 0);
 
 	/* remove us from the free space cache (if we're there at all) */
-	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
+	ret = btrfs_remove_free_space(cache, eb->start, eb->len);
 out:
 	btrfs_put_block_group(cache);
 	return ret;
@@ -2844,12 +2901,61 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
 	return 0;
 }
 
+/*
+ * Parse an extent item's inline refs looking for a simple quotas owner ref.
+ *
+ * @fs_info:	the btrfs_fs_info for this mount
+ * @leaf:	a leaf in the extent tree containing the extent item
+ * @slot:	the slot in the leaf where the extent item is found
+ *
+ * Returns the objectid of the root that originally allocated the extent item
+ * if the inline owner ref is expected and present, otherwise 0.
+ *
+ * If an extent item has an owner ref item, it will be the first inline ref
+ * item. Therefore the logic is to check whether there are any inline ref
+ * items, then check the type of the first one.
+ */
+u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
+				struct extent_buffer *leaf, int slot)
+{
+	struct btrfs_extent_item *ei;
+	struct btrfs_extent_inline_ref *iref;
+	struct btrfs_extent_owner_ref *oref;
+	unsigned long ptr;
+	unsigned long end;
+	int type;
+
+	if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA))
+		return 0;
+
+	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+	ptr = (unsigned long)(ei + 1);
+	end = (unsigned long)ei + btrfs_item_size(leaf, slot);
+
+	/* No inline ref items of any kind, can't check type. */
+	if (ptr == end)
+		return 0;
+
+	iref = (struct btrfs_extent_inline_ref *)ptr;
+	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
+
+	/* We found an owner ref, get the root out of it. */
+	if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
+		oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
+		return btrfs_extent_owner_ref_root_id(leaf, oref);
+	}
+
+	/* We have inline refs, but not an owner ref. */
+	return 0;
+}
+
 static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
-				     u64 bytenr, u64 num_bytes, bool is_data)
+				     u64 bytenr, struct btrfs_squota_delta *delta)
 {
 	int ret;
+	u64 num_bytes = delta->num_bytes;
 
-	if (is_data) {
+	if (delta->is_data) {
 		struct btrfs_root *csum_root;
 
 		csum_root = btrfs_csum_root(trans->fs_info, bytenr);
@@ -2858,6 +2964,18 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
 			btrfs_abort_transaction(trans, ret);
 			return ret;
 		}
+
+		ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes);
+		if (ret) {
+			btrfs_abort_transaction(trans, ret);
+			return ret;
+		}
+	}
+
+	ret = btrfs_record_squota_delta(trans->fs_info, delta);
+	if (ret) {
+		btrfs_abort_transaction(trans, ret);
+		return ret;
 	}
 
 	ret = add_to_free_space_tree(trans, bytenr, num_bytes);
@@ -2940,9 +3058,10 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
  * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
  */
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+			       struct btrfs_delayed_ref_head *href,
 			       struct btrfs_delayed_ref_node *node, u64 parent,
 			       u64 root_objectid, u64 owner_objectid,
-			       u64 owner_offset, int refs_to_drop,
+			       u64 owner_offset,
 			       struct btrfs_delayed_extent_op *extent_op)
 {
 	struct btrfs_fs_info *info = trans->fs_info;
@@ -2957,11 +3076,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	int extent_slot = 0;
 	int found_extent = 0;
 	int num_to_del = 1;
+	int refs_to_drop = node->ref_mod;
 	u32 item_size;
 	u64 refs;
 	u64 bytenr = node->bytenr;
 	u64 num_bytes = node->num_bytes;
 	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
+	u64 delayed_ref_root = href->owning_root;
 
 	extent_root = btrfs_extent_root(info, bytenr);
 	ASSERT(extent_root);
@@ -3151,7 +3272,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			}
 		} else {
 			btrfs_set_extent_refs(leaf, ei, refs);
-			btrfs_mark_buffer_dirty(leaf);
+			btrfs_mark_buffer_dirty(trans, leaf);
 		}
 		if (found_extent) {
 			ret = remove_extent_backref(trans, extent_root, path,
@@ -3162,6 +3283,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			}
 		}
 	} else {
+		struct btrfs_squota_delta delta = {
+			.root = delayed_ref_root,
+			.num_bytes = num_bytes,
+			.rsv_bytes = 0,
+			.is_data = is_data,
+			.is_inc = false,
+			.generation = btrfs_extent_generation(leaf, ei),
+		};
+
 		/* In this branch refs == 1 */
 		if (found_extent) {
 			if (is_data && refs_to_drop !=
@@ -3200,6 +3330,16 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 				num_to_del = 2;
 			}
 		}
+		/*
+		 * We can't infer the data owner from the delayed ref, so we need
+		 * to try to get it from the owning ref item.
+		 *
+		 * If it is not present, then that extent was not written under
+		 * simple quotas mode, so we don't need to account for its deletion.
+		 */
+		if (is_data)
+			delta.root = btrfs_get_extent_owner_root(trans->fs_info,
+								 leaf, extent_slot);
 
 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
 				      num_to_del);
@@ -3209,7 +3349,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 		}
 		btrfs_release_path(path);
 
-		ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data);
+		ret = do_free_extent_accounting(trans, bytenr, &delta);
 	}
 	btrfs_release_path(path);
 
@@ -3283,7 +3423,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 	int ret;
 
 	btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
-			       buf->start, buf->len, parent);
+			       buf->start, buf->len, parent, btrfs_header_owner(buf));
 	btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
 			    root_id, 0, false);
 
@@ -3370,10 +3510,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
 	 * tree, just update pinning info and exit early.
 	 */
 	if ((ref->type == BTRFS_REF_METADATA &&
-	     ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
+	     ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) ||
 	    (ref->type == BTRFS_REF_DATA &&
-	     ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) {
-		/* unlocks the pinned mutex */
+	     ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
 		btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
 		ret = 0;
 	} else if (ref->type == BTRFS_REF_METADATA) {
@@ -3383,9 +3522,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
 	}
 
 	if (!((ref->type == BTRFS_REF_METADATA &&
-	       ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
+	       ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) ||
 	      (ref->type == BTRFS_REF_DATA &&
-	       ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)))
+	       ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
 		btrfs_ref_tree_mod(fs_info, ref);
 
 	return ret;
@@ -4442,8 +4581,8 @@ loop:
 }
 
 /*
- * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
- *			  hole that is at least as big as @num_bytes.
+ * Entry point to the extent allocator. Tries to find a hole that is at least
+ * as big as @num_bytes.
  *
  * @root           -	The root that will contain this extent
  *
@@ -4562,20 +4701,20 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
 	return 0;
 }
 
-int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
-			      u64 len)
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
+			      const struct extent_buffer *eb)
 {
 	struct btrfs_block_group *cache;
 	int ret = 0;
 
-	cache = btrfs_lookup_block_group(trans->fs_info, start);
+	cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
 	if (!cache) {
 		btrfs_err(trans->fs_info, "unable to find block group for %llu",
-			  start);
+			  eb->start);
 		return -ENOSPC;
 	}
 
-	ret = pin_down_extent(trans, cache, start, len, 1);
+	ret = pin_down_extent(trans, cache, eb->start, eb->len, 1);
 	btrfs_put_block_group(cache);
 	return ret;
 }
@@ -4605,24 +4744,29 @@ static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 				      u64 parent, u64 root_objectid,
 				      u64 flags, u64 owner, u64 offset,
-				      struct btrfs_key *ins, int ref_mod)
+				      struct btrfs_key *ins, int ref_mod, u64 oref_root)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_root *extent_root;
 	int ret;
 	struct btrfs_extent_item *extent_item;
+	struct btrfs_extent_owner_ref *oref;
 	struct btrfs_extent_inline_ref *iref;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	int type;
 	u32 size;
+	const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE);
 
 	if (parent > 0)
 		type = BTRFS_SHARED_DATA_REF_KEY;
 	else
 		type = BTRFS_EXTENT_DATA_REF_KEY;
 
-	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
+	size = sizeof(*extent_item);
+	if (simple_quota)
+		size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
+	size += btrfs_extent_inline_ref_size(type);
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -4644,7 +4788,14 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 			       flags | BTRFS_EXTENT_FLAG_DATA);
 
 	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
+	if (simple_quota) {
+		btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY);
+		oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
+		btrfs_set_extent_owner_ref_root_id(leaf, oref, oref_root);
+		iref = (struct btrfs_extent_inline_ref *)(oref + 1);
+	}
 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
+
 	if (parent > 0) {
 		struct btrfs_shared_data_ref *ref;
 		ref = (struct btrfs_shared_data_ref *)(iref + 1);
@@ -4659,7 +4810,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
 	}
 
-	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
 	btrfs_free_path(path);
 
 	return alloc_reserved_extent(trans, ins->objectid, ins->offset);
@@ -4734,7 +4885,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
 	}
 
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 	btrfs_free_path(path);
 
 	return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
@@ -4746,12 +4897,17 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 				     struct btrfs_key *ins)
 {
 	struct btrfs_ref generic_ref = { 0 };
+	u64 root_objectid = root->root_key.objectid;
+	u64 owning_root = root_objectid;
 
-	BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+	BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
+
+	if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root))
+		owning_root = root->relocation_src_root;
 
 	btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
-			       ins->objectid, ins->offset, 0);
-	btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner,
+			       ins->objectid, ins->offset, 0, owning_root);
+	btrfs_init_data_ref(&generic_ref, root_objectid, owner,
 			    offset, 0, false);
 	btrfs_ref_tree_mod(root->fs_info, &generic_ref);
 
@@ -4771,6 +4927,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 	int ret;
 	struct btrfs_block_group *block_group;
 	struct btrfs_space_info *space_info;
+	struct btrfs_squota_delta delta = {
+		.root = root_objectid,
+		.num_bytes = ins->offset,
+		.generation = trans->transid,
+		.rsv_bytes = 0,
+		.is_data = true,
+		.is_inc = true,
+	};
 
 	/*
 	 * Mixed block groups will exclude before processing the log so we only
@@ -4796,13 +4960,36 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 	spin_unlock(&space_info->lock);
 
 	ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
-					 offset, ins, 1);
+					 offset, ins, 1, root_objectid);
 	if (ret)
 		btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
+	ret = ret ? ret : btrfs_record_squota_delta(fs_info, &delta);
 	btrfs_put_block_group(block_group);
 	return ret;
 }
 
+#ifdef CONFIG_BTRFS_DEBUG
+/*
+ * Detect a corrupted extent tree handing out a tree block this task already
+ * holds locked. Returns true (after a rate-limited error) if so, else false.
+ */
+static bool check_eb_lock_owner(const struct extent_buffer *eb)
+{
+	if (eb->lock_owner == current->pid) {
+		btrfs_err_rl(eb->fs_info,
+"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
+			     eb->start, btrfs_header_owner(eb), current->pid);
+		return true;
+	}
+	return false;
+}
+#else
+static bool check_eb_lock_owner(const struct extent_buffer *eb)
+{
+	return false;
+}
+#endif
+
+
 static struct extent_buffer *
 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		      u64 bytenr, int level, u64 owner,
@@ -4816,15 +5003,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	if (IS_ERR(buf))
 		return buf;
 
-	/*
-	 * Extra safety check in case the extent tree is corrupted and extent
-	 * allocator chooses to use a tree block which is already used and
-	 * locked.
-	 */
-	if (buf->lock_owner == current->pid) {
-		btrfs_err_rl(fs_info,
-"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
-			buf->start, btrfs_header_owner(buf), current->pid);
+	if (check_eb_lock_owner(buf)) {
 		free_extent_buffer(buf);
 		return ERR_PTR(-EUCLEAN);
 	}
@@ -4901,6 +5080,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
 					     const struct btrfs_disk_key *key,
 					     int level, u64 hint,
 					     u64 empty_size,
+					     u64 reloc_src_root,
 					     enum btrfs_lock_nesting nest)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4913,6 +5093,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
 	int ret;
 	u32 blocksize = fs_info->nodesize;
 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
+	u64 owning_root;
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 	if (btrfs_is_testing(fs_info)) {
@@ -4939,11 +5120,13 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
 		ret = PTR_ERR(buf);
 		goto out_free_reserved;
 	}
+	owning_root = btrfs_header_owner(buf);
 
 	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
 		if (parent == 0)
 			parent = ins.objectid;
 		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+		owning_root = reloc_src_root;
 	} else
 		BUG_ON(parent > 0);
 
@@ -4963,7 +5146,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
 		extent_op->level = level;
 
 		btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
-				       ins.objectid, ins.offset, parent);
+				       ins.objectid, ins.offset, parent, owning_root);
 		btrfs_init_tree_ref(&generic_ref, level, root_objectid,
 				    root->root_key.objectid, false);
 		btrfs_ref_tree_mod(fs_info, &generic_ref);
@@ -5384,7 +5567,8 @@ skip:
 		find_next_key(path, level, &wc->drop_progress);
 
 		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
-				       fs_info->nodesize, parent);
+				       fs_info->nodesize, parent,
+				       btrfs_header_owner(next));
 		btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
 				    0, false);
 		ret = btrfs_free_extent(trans, &ref);
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index 88c249c37516..0716f65d9753 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -7,6 +7,7 @@
 #include "block-group.h"
 
 struct btrfs_free_cluster;
+struct btrfs_delayed_ref_head;
 
 enum btrfs_extent_allocation_policy {
 	BTRFS_EXTENT_ALLOC_CLUSTERED,
@@ -91,8 +92,8 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
 				     enum btrfs_inline_ref_type is_data);
 u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
 
-int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count);
-void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes);
+u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
 				  struct btrfs_delayed_ref_root *delayed_refs,
 				  struct btrfs_delayed_ref_head *head);
 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
@@ -102,7 +103,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
 		     int reserved);
 int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
-				    u64 bytenr, u64 num_bytes);
+				    const struct extent_buffer *eb);
 int btrfs_exclude_logged_extents(struct extent_buffer *eb);
 int btrfs_cross_ref_exist(struct btrfs_root *root,
 			  u64 objectid, u64 offset, u64 bytenr, bool strict,
@@ -113,6 +114,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
 					     const struct btrfs_disk_key *key,
 					     int level, u64 hint,
 					     u64 empty_size,
+					     u64 reloc_src_root,
 					     enum btrfs_lock_nesting nest);
 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 			   u64 root_id,
@@ -136,12 +138,15 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 				struct extent_buffer *eb, u64 flags);
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
 
+u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
+				struct extent_buffer *leaf, int slot);
 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
 			       u64 start, u64 len, int delalloc);
-int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len);
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
+			      const struct extent_buffer *eb);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref);
-int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
+int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
 				     int for_reloc);
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index caccd0376342..03cef28d9e37 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -21,7 +21,6 @@
 #include "ctree.h"
 #include "btrfs_inode.h"
 #include "bio.h"
-#include "check-integrity.h"
 #include "locking.h"
 #include "rcu-string.h"
 #include "backref.h"
@@ -395,7 +394,7 @@ again:
 
 	/* then test to make sure it is all still delalloc */
 	ret = test_range_bit(tree, delalloc_start, delalloc_end,
-			     EXTENT_DELALLOC, 1, cached_state);
+			     EXTENT_DELALLOC, cached_state);
 	if (!ret) {
 		unlock_extent(tree, delalloc_start, delalloc_end,
 			      &cached_state);
@@ -2294,7 +2293,7 @@ static int try_release_extent_state(struct extent_io_tree *tree,
 	u64 end = start + PAGE_SIZE - 1;
 	int ret = 1;
 
-	if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
+	if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) {
 		ret = 0;
 	} else {
 		u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM |
@@ -2353,9 +2352,9 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
 				free_extent_map(em);
 				break;
 			}
-			if (test_range_bit(tree, em->start,
-					   extent_map_end(em) - 1,
-					   EXTENT_LOCKED, 0, NULL))
+			if (test_range_bit_exists(tree, em->start,
+						  extent_map_end(em) - 1,
+						  EXTENT_LOCKED))
 				goto next;
 			/*
 			 * If it's not in the list of modified extents, used
@@ -3455,6 +3454,12 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
 			  start, fs_info->nodesize);
 		return -EINVAL;
 	}
+	if (!IS_ALIGNED(start, fs_info->nodesize) &&
+	    !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) {
+		btrfs_warn(fs_info,
+"tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance",
+			      start, fs_info->nodesize);
+	}
 	return 0;
 }
 
@@ -4248,14 +4253,14 @@ void copy_extent_buffer(const struct extent_buffer *dst,
 }
 
 /*
- * eb_bitmap_offset() - calculate the page and offset of the byte containing the
- * given bit number
- * @eb: the extent buffer
- * @start: offset of the bitmap item in the extent buffer
- * @nr: bit number
- * @page_index: return index of the page in the extent buffer that contains the
- * given bit number
- * @page_offset: return offset into the page given by page_index
+ * Calculate the page and offset of the byte containing the given bit number.
+ *
+ * @eb:           the extent buffer
+ * @start:        offset of the bitmap item in the extent buffer
+ * @nr:           bit number
+ * @page_index:   return index of the page in the extent buffer that contains
+ *                the given bit number
+ * @page_offset:  return offset into the page given by page_index
  *
  * This helper hides the ugliness of finding the byte in an extent buffer which
  * contains a given bit.
@@ -4614,7 +4619,8 @@ int try_release_extent_buffer(struct page *page)
 }
 
 /*
- * btrfs_readahead_tree_block - attempt to readahead a child block
+ * Attempt to readahead a child block.
+ *
  * @fs_info:	the fs_info
  * @bytenr:	bytenr to read
  * @owner_root: objectid of the root that owns this eb
@@ -4653,7 +4659,8 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
 }
 
 /*
- * btrfs_readahead_node_child - readahead a node's child block
+ * Readahead a node's child block.
+ *
  * @node:	parent node we're reading from
  * @slot:	slot in the parent node for the child we want to read
  *
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 68368ba99321..2171057a4477 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -80,16 +80,16 @@ struct extent_buffer {
 	spinlock_t refs_lock;
 	atomic_t refs;
 	int read_mirror;
-	struct rcu_head rcu_head;
-	pid_t lock_owner;
 	/* >= 0 if eb belongs to a log tree, -1 otherwise */
 	s8 log_index;
+	struct rcu_head rcu_head;
 
 	struct rw_semaphore lock;
 
 	struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
 #ifdef CONFIG_BTRFS_DEBUG
 	struct list_head leak_list;
+	pid_t lock_owner;
 #endif
 };
 
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 1ce5dd154499..45cae356e89b 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -194,7 +194,7 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_file_extent_encryption(leaf, item, 0);
 	btrfs_set_file_extent_other_encoding(leaf, item, 0);
 
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 out:
 	btrfs_free_path(path);
 	return ret;
@@ -811,11 +811,12 @@ blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio)
  * This calls btrfs_truncate_item with the correct args based on the overlap,
  * and fixes up the key as required.
  */
-static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
+static noinline void truncate_one_csum(struct btrfs_trans_handle *trans,
 				       struct btrfs_path *path,
 				       struct btrfs_key *key,
 				       u64 bytenr, u64 len)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct extent_buffer *leaf;
 	const u32 csum_size = fs_info->csum_size;
 	u64 csum_end;
@@ -836,7 +837,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
 		 */
 		u32 new_size = (bytenr - key->offset) >> blocksize_bits;
 		new_size *= csum_size;
-		btrfs_truncate_item(path, new_size, 1);
+		btrfs_truncate_item(trans, path, new_size, 1);
 	} else if (key->offset >= bytenr && csum_end > end_byte &&
 		   end_byte > key->offset) {
 		/*
@@ -848,10 +849,10 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
 		u32 new_size = (csum_end - end_byte) >> blocksize_bits;
 		new_size *= csum_size;
 
-		btrfs_truncate_item(path, new_size, 0);
+		btrfs_truncate_item(trans, path, new_size, 0);
 
 		key->offset = end_byte;
-		btrfs_set_item_key_safe(fs_info, path, key);
+		btrfs_set_item_key_safe(trans, path, key);
 	} else {
 		BUG();
 	}
@@ -994,7 +995,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 
 			key.offset = end_byte - 1;
 		} else {
-			truncate_one_csum(fs_info, path, &key, bytenr, len);
+			truncate_one_csum(trans, path, &key, bytenr, len);
 			if (key.offset < bytenr)
 				break;
 		}
@@ -1202,7 +1203,7 @@ extend_csum:
 		diff /= csum_size;
 		diff *= csum_size;
 
-		btrfs_extend_item(path, diff);
+		btrfs_extend_item(trans, path, diff);
 		ret = 0;
 		goto csum;
 	}
@@ -1249,7 +1250,7 @@ found:
 	ins_size /= csum_size;
 	total_bytes += ins_size * fs_info->sectorsize;
 
-	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
 	if (total_bytes < sums->len) {
 		btrfs_release_path(path);
 		cond_resched();
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 361535c71c0f..f47731c45bb5 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -17,6 +17,7 @@
 #include <linux/uio.h>
 #include <linux/iversion.h>
 #include <linux/fsverity.h>
+#include <linux/iomap.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -368,12 +369,13 @@ next_slot:
 			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							extent_end - args->start);
-			btrfs_mark_buffer_dirty(leaf);
+			btrfs_mark_buffer_dirty(trans, leaf);
 
 			if (update_refs && disk_bytenr > 0) {
 				btrfs_init_generic_ref(&ref,
 						BTRFS_ADD_DELAYED_REF,
-						disk_bytenr, num_bytes, 0);
+						disk_bytenr, num_bytes, 0,
+						root->root_key.objectid);
 				btrfs_init_data_ref(&ref,
 						root->root_key.objectid,
 						new_key.objectid,
@@ -405,13 +407,13 @@ next_slot:
 
 			memcpy(&new_key, &key, sizeof(new_key));
 			new_key.offset = args->end;
-			btrfs_set_item_key_safe(fs_info, path, &new_key);
+			btrfs_set_item_key_safe(trans, path, &new_key);
 
 			extent_offset += args->end - key.offset;
 			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							extent_end - args->end);
-			btrfs_mark_buffer_dirty(leaf);
+			btrfs_mark_buffer_dirty(trans, leaf);
 			if (update_refs && disk_bytenr > 0)
 				args->bytes_found += args->end - key.offset;
 			break;
@@ -431,7 +433,7 @@ next_slot:
 
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							args->start - key.offset);
-			btrfs_mark_buffer_dirty(leaf);
+			btrfs_mark_buffer_dirty(trans, leaf);
 			if (update_refs && disk_bytenr > 0)
 				args->bytes_found += extent_end - args->start;
 			if (args->end == extent_end)
@@ -463,7 +465,8 @@ delete_extent_item:
 			} else if (update_refs && disk_bytenr > 0) {
 				btrfs_init_generic_ref(&ref,
 						BTRFS_DROP_DELAYED_REF,
-						disk_bytenr, num_bytes, 0);
+						disk_bytenr, num_bytes, 0,
+						root->root_key.objectid);
 				btrfs_init_data_ref(&ref,
 						root->root_key.objectid,
 						key.objectid,
@@ -536,7 +539,8 @@ delete_extent_item:
 			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
 				path->slots[0]++;
 		}
-		btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size);
+		btrfs_setup_item_for_insert(trans, root, path, &key,
+					    args->extent_item_size);
 		args->extent_inserted = true;
 	}
 
@@ -593,7 +597,6 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot,
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 			      struct btrfs_inode *inode, u64 start, u64 end)
 {
-	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_root *root = inode->root;
 	struct extent_buffer *leaf;
 	struct btrfs_path *path;
@@ -664,7 +667,7 @@ again:
 				     ino, bytenr, orig_offset,
 				     &other_start, &other_end)) {
 			new_key.offset = end;
-			btrfs_set_item_key_safe(fs_info, path, &new_key);
+			btrfs_set_item_key_safe(trans, path, &new_key);
 			fi = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
 			btrfs_set_file_extent_generation(leaf, fi,
@@ -679,7 +682,7 @@ again:
 							 trans->transid);
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							end - other_start);
-			btrfs_mark_buffer_dirty(leaf);
+			btrfs_mark_buffer_dirty(trans, leaf);
 			goto out;
 		}
 	}
@@ -698,7 +701,7 @@ again:
 							 trans->transid);
 			path->slots[0]++;
 			new_key.offset = start;
-			btrfs_set_item_key_safe(fs_info, path, &new_key);
+			btrfs_set_item_key_safe(trans, path, &new_key);
 
 			fi = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
@@ -708,7 +711,7 @@ again:
 							other_end - start);
 			btrfs_set_file_extent_offset(leaf, fi,
 						     start - orig_offset);
-			btrfs_mark_buffer_dirty(leaf);
+			btrfs_mark_buffer_dirty(trans, leaf);
 			goto out;
 		}
 	}
@@ -742,10 +745,10 @@ again:
 		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
 		btrfs_set_file_extent_num_bytes(leaf, fi,
 						extent_end - split);
-		btrfs_mark_buffer_dirty(leaf);
+		btrfs_mark_buffer_dirty(trans, leaf);
 
 		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
-				       num_bytes, 0);
+				       num_bytes, 0, root->root_key.objectid);
 		btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
 				    orig_offset, 0, false);
 		ret = btrfs_inc_extent_ref(trans, &ref);
@@ -771,7 +774,7 @@ again:
 	other_start = end;
 	other_end = 0;
 	btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
-			       num_bytes, 0);
+			       num_bytes, 0, root->root_key.objectid);
 	btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
 			    0, false);
 	if (extent_mergeable(leaf, path->slots[0] + 1,
@@ -814,7 +817,7 @@ again:
 		btrfs_set_file_extent_type(leaf, fi,
 					   BTRFS_FILE_EXTENT_REG);
 		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
-		btrfs_mark_buffer_dirty(leaf);
+		btrfs_mark_buffer_dirty(trans, leaf);
 	} else {
 		fi = btrfs_item_ptr(leaf, del_slot - 1,
 			   struct btrfs_file_extent_item);
@@ -823,7 +826,7 @@ again:
 		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 		btrfs_set_file_extent_num_bytes(leaf, fi,
 						extent_end - key.offset);
-		btrfs_mark_buffer_dirty(leaf);
+		btrfs_mark_buffer_dirty(trans, leaf);
 
 		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
 		if (ret < 0) {
@@ -1108,17 +1111,18 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
 
 static void update_time_for_write(struct inode *inode)
 {
-	struct timespec64 now, ctime;
+	struct timespec64 now, ts;
 
 	if (IS_NOCMTIME(inode))
 		return;
 
 	now = current_time(inode);
-	if (!timespec64_equal(&inode->i_mtime, &now))
-		inode->i_mtime = now;
+	ts = inode_get_mtime(inode);
+	if (!timespec64_equal(&ts, &now))
+		inode_set_mtime_to_ts(inode, now);
 
-	ctime = inode_get_ctime(inode);
-	if (!timespec64_equal(&ctime, &now))
+	ts = inode_get_ctime(inode);
+	if (!timespec64_equal(&ts, &now))
 		inode_set_ctime_to_ts(inode, now);
 
 	if (IS_I_VERSION(inode))
@@ -1746,7 +1750,7 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
 	struct btrfs_inode *inode = BTRFS_I(ctx->inode);
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 
-	if (btrfs_inode_in_log(inode, fs_info->generation) &&
+	if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) &&
 	    list_empty(&ctx->ordered_extents))
 		return true;
 
@@ -1757,7 +1761,7 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
 	 * and for a fast fsync we don't wait for that, we only wait for the
 	 * writeback to complete.
 	 */
-	if (inode->last_trans <= fs_info->last_trans_committed &&
+	if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) &&
 	    (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
 	     list_empty(&ctx->ordered_extents)))
 		return true;
@@ -1886,7 +1890,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
 	atomic_inc(&root->log_batch);
 
-	smp_mb();
 	if (skip_inode_logging(&ctx)) {
 		/*
 		 * We've had everything committed since the last time we were
@@ -2104,7 +2107,7 @@ static int fill_holes(struct btrfs_trans_handle *trans,
 		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
 		btrfs_set_file_extent_offset(leaf, fi, 0);
 		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
-		btrfs_mark_buffer_dirty(leaf);
+		btrfs_mark_buffer_dirty(trans, leaf);
 		goto out;
 	}
 
@@ -2112,7 +2115,7 @@ static int fill_holes(struct btrfs_trans_handle *trans,
 		u64 num_bytes;
 
 		key.offset = offset;
-		btrfs_set_item_key_safe(fs_info, path, &key);
+		btrfs_set_item_key_safe(trans, path, &key);
 		fi = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_file_extent_item);
 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
@@ -2121,7 +2124,7 @@ static int fill_holes(struct btrfs_trans_handle *trans,
 		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
 		btrfs_set_file_extent_offset(leaf, fi, 0);
 		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
-		btrfs_mark_buffer_dirty(leaf);
+		btrfs_mark_buffer_dirty(trans, leaf);
 		goto out;
 	}
 	btrfs_release_path(path);
@@ -2273,7 +2276,7 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
 	if (extent_info->is_new_extent)
 		btrfs_set_file_extent_generation(leaf, extent, trans->transid);
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 	btrfs_release_path(path);
 
 	ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
@@ -2303,7 +2306,8 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
 
 		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
 				       extent_info->disk_offset,
-				       extent_info->disk_len, 0);
+				       extent_info->disk_len, 0,
+				       root->root_key.objectid);
 		ref_offset = extent_info->file_offset - extent_info->data_offset;
 		btrfs_init_data_ref(&ref, root->root_key.objectid,
 				    btrfs_ino(inode), ref_offset, 0, false);
@@ -2473,9 +2477,10 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
 		inode_inc_iversion(&inode->vfs_inode);
 
 		if (!extent_info || extent_info->update_times)
-			inode->vfs_inode.i_mtime = inode_set_ctime_current(&inode->vfs_inode);
+			inode_set_mtime_to_ts(&inode->vfs_inode,
+					      inode_set_ctime_current(&inode->vfs_inode));
 
-		ret = btrfs_update_inode(trans, root, inode);
+		ret = btrfs_update_inode(trans, inode);
 		if (ret)
 			break;
 
@@ -2714,8 +2719,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
 
 	ASSERT(trans != NULL);
 	inode_inc_iversion(inode);
-	inode->i_mtime = inode_set_ctime_current(inode);
-	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+	ret = btrfs_update_inode(trans, BTRFS_I(inode));
 	updated_inode = true;
 	btrfs_end_transaction(trans);
 	btrfs_btree_balance_dirty(fs_info);
@@ -2734,14 +2739,14 @@ out_only_mutex:
 		struct timespec64 now = inode_set_ctime_current(inode);
 
 		inode_inc_iversion(inode);
-		inode->i_mtime = now;
+		inode_set_mtime_to_ts(inode, now);
 		trans = btrfs_start_transaction(root, 1);
 		if (IS_ERR(trans)) {
 			ret = PTR_ERR(trans);
 		} else {
 			int ret2;
 
-			ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+			ret = btrfs_update_inode(trans, BTRFS_I(inode));
 			ret2 = btrfs_end_transaction(trans);
 			if (!ret)
 				ret = ret2;
@@ -2808,7 +2813,7 @@ static int btrfs_fallocate_update_isize(struct inode *inode,
 	inode_set_ctime_current(inode);
 	i_size_write(inode, end);
 	btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
-	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+	ret = btrfs_update_inode(trans, BTRFS_I(inode));
 	ret2 = btrfs_end_transaction(trans);
 
 	return ret ? ret : ret2;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 27fad70451aa..6f93c9a2c3e3 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -57,6 +57,11 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
 			      struct btrfs_free_space *info, u64 offset,
 			      u64 bytes, bool update_stats);
 
+static void btrfs_crc32c_final(u32 crc, u8 *result)
+{
+	put_unaligned_le32(~crc, result);
+}
+
 static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
 {
 	struct btrfs_free_space *info;
@@ -195,7 +200,7 @@ static int __create_free_space_inode(struct btrfs_root *root,
 	btrfs_set_inode_nlink(leaf, inode_item, 1);
 	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
 	btrfs_set_inode_block_group(leaf, inode_item, offset);
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 	btrfs_release_path(path);
 
 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -213,7 +218,7 @@ static int __create_free_space_inode(struct btrfs_root *root,
 				struct btrfs_free_space_header);
 	memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
 	btrfs_set_free_space_key(leaf, header, &disk_key);
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 	btrfs_release_path(path);
 
 	return 0;
@@ -354,7 +359,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
 	if (ret)
 		goto fail;
 
-	ret = btrfs_update_inode(trans, root, inode);
+	ret = btrfs_update_inode(trans, inode);
 
 fail:
 	if (locked)
@@ -540,7 +545,7 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
 	if (index == 0)
 		offset = sizeof(u32) * io_ctl->num_pages;
 
-	crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
+	crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
 	btrfs_crc32c_final(crc, (u8 *)&crc);
 	io_ctl_unmap_page(io_ctl);
 	tmp = page_address(io_ctl->pages[0]);
@@ -562,7 +567,7 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
 	val = *tmp;
 
 	io_ctl_map_page(io_ctl, 0);
-	crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
+	crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
 	btrfs_crc32c_final(crc, (u8 *)&crc);
 	if (val != crc) {
 		btrfs_err_rl(io_ctl->fs_info,
@@ -1185,7 +1190,7 @@ update_cache_item(struct btrfs_trans_handle *trans,
 	btrfs_set_free_space_entries(leaf, header, entries);
 	btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
 	btrfs_set_free_space_generation(leaf, header, trans->transid);
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 	btrfs_release_path(path);
 
 	return 0;
@@ -1321,7 +1326,7 @@ out:
 	  "failed to write free space cache for block group %llu error %d",
 				  block_group->start, ret);
 	}
-	btrfs_update_inode(trans, root, BTRFS_I(inode));
+	btrfs_update_inode(trans, BTRFS_I(inode));
 
 	if (block_group) {
 		/* the dirty list is protected by the dirty_bgs_lock */
@@ -1362,7 +1367,6 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
 /*
  * Write out cached info to an inode.
  *
- * @root:        root the inode belongs to
  * @inode:       freespace inode we are writing out
  * @ctl:         free space cache we are going to write out
  * @block_group: block_group for this cache if it belongs to a block_group
@@ -1373,7 +1377,7 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
  * on mount.  This will return 0 if it was successful in writing the cache out,
  * or an errno if it was not.
  */
-static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
+static int __btrfs_write_out_cache(struct inode *inode,
 				   struct btrfs_free_space_ctl *ctl,
 				   struct btrfs_block_group *block_group,
 				   struct btrfs_io_ctl *io_ctl,
@@ -1506,7 +1510,7 @@ out:
 		invalidate_inode_pages2(inode->i_mapping);
 		BTRFS_I(inode)->generation = 0;
 	}
-	btrfs_update_inode(trans, root, BTRFS_I(inode));
+	btrfs_update_inode(trans, BTRFS_I(inode));
 	if (must_iput)
 		iput(inode);
 	return ret;
@@ -1532,8 +1536,8 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
 	if (IS_ERR(inode))
 		return 0;
 
-	ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
-				block_group, &block_group->io_ctl, trans);
+	ret = __btrfs_write_out_cache(inode, ctl, block_group,
+				      &block_group->io_ctl, trans);
 	if (ret) {
 		btrfs_debug(fs_info,
 	  "failed to write free space cache for block group %llu error %d",
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index c0e734082dcc..7b598b070700 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -89,7 +89,7 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans,
 			      struct btrfs_free_space_info);
 	btrfs_set_free_space_extent_count(leaf, info, 0);
 	btrfs_set_free_space_flags(leaf, info, 0);
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 
 	ret = 0;
 out:
@@ -287,7 +287,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
 	flags |= BTRFS_FREE_SPACE_USING_BITMAPS;
 	btrfs_set_free_space_flags(leaf, info, flags);
 	expected_extent_count = btrfs_free_space_extent_count(leaf, info);
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 	btrfs_release_path(path);
 
 	if (extent_count != expected_extent_count) {
@@ -324,7 +324,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
 		write_extent_buffer(leaf, bitmap_cursor, ptr,
 				    data_size);
-		btrfs_mark_buffer_dirty(leaf);
+		btrfs_mark_buffer_dirty(trans, leaf);
 		btrfs_release_path(path);
 
 		i += extent_size;
@@ -430,7 +430,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
 	flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS;
 	btrfs_set_free_space_flags(leaf, info, flags);
 	expected_extent_count = btrfs_free_space_extent_count(leaf, info);
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 	btrfs_release_path(path);
 
 	nrbits = block_group->length >> block_group->fs_info->sectorsize_bits;
@@ -495,7 +495,7 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
 
 	extent_count += new_extents;
 	btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count);
-	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
 	btrfs_release_path(path);
 
 	if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
@@ -533,7 +533,8 @@ int free_space_test_bit(struct btrfs_block_group *block_group,
 	return !!extent_buffer_test_bit(leaf, ptr, i);
 }
 
-static void free_space_set_bits(struct btrfs_block_group *block_group,
+static void free_space_set_bits(struct btrfs_trans_handle *trans,
+				struct btrfs_block_group *block_group,
 				struct btrfs_path *path, u64 *start, u64 *size,
 				int bit)
 {
@@ -563,7 +564,7 @@ static void free_space_set_bits(struct btrfs_block_group *block_group,
 		extent_buffer_bitmap_set(leaf, ptr, first, last - first);
 	else
 		extent_buffer_bitmap_clear(leaf, ptr, first, last - first);
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 
 	*size -= end - *start;
 	*start = end;
@@ -656,7 +657,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
 	cur_start = start;
 	cur_size = size;
 	while (1) {
-		free_space_set_bits(block_group, path, &cur_start, &cur_size,
+		free_space_set_bits(trans, block_group, path, &cur_start, &cur_size,
 				    !remove);
 		if (cur_size == 0)
 			break;
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index a523d64d5491..318df6f9d9cb 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -139,6 +139,12 @@ enum {
 	 */
 	BTRFS_FS_FEATURE_CHANGED,
 
+	/*
+	 * Indicate that we have found a tree block which is only aligned to
+	 * sectorsize, but not to nodesize.  This should be rare nowadays.
+	 */
+	BTRFS_FS_UNALIGNED_TREE_BLOCK,
+
 #if BITS_PER_LONG == 32
 	/* Indicate if we have error/warn message printed on 32bit systems */
 	BTRFS_FS_32BIT_ERROR,
@@ -171,19 +177,17 @@ enum {
 	BTRFS_MOUNT_AUTO_DEFRAG			= (1UL << 16),
 	BTRFS_MOUNT_USEBACKUPROOT		= (1UL << 17),
 	BTRFS_MOUNT_SKIP_BALANCE		= (1UL << 18),
-	BTRFS_MOUNT_CHECK_INTEGRITY		= (1UL << 19),
-	BTRFS_MOUNT_CHECK_INTEGRITY_DATA	= (1UL << 20),
-	BTRFS_MOUNT_PANIC_ON_FATAL_ERROR	= (1UL << 21),
-	BTRFS_MOUNT_RESCAN_UUID_TREE		= (1UL << 22),
-	BTRFS_MOUNT_FRAGMENT_DATA		= (1UL << 23),
-	BTRFS_MOUNT_FRAGMENT_METADATA		= (1UL << 24),
-	BTRFS_MOUNT_FREE_SPACE_TREE		= (1UL << 25),
-	BTRFS_MOUNT_NOLOGREPLAY			= (1UL << 26),
-	BTRFS_MOUNT_REF_VERIFY			= (1UL << 27),
-	BTRFS_MOUNT_DISCARD_ASYNC		= (1UL << 28),
-	BTRFS_MOUNT_IGNOREBADROOTS		= (1UL << 29),
-	BTRFS_MOUNT_IGNOREDATACSUMS		= (1UL << 30),
-	BTRFS_MOUNT_NODISCARD			= (1UL << 31),
+	BTRFS_MOUNT_PANIC_ON_FATAL_ERROR	= (1UL << 19),
+	BTRFS_MOUNT_RESCAN_UUID_TREE		= (1UL << 20),
+	BTRFS_MOUNT_FRAGMENT_DATA		= (1UL << 21),
+	BTRFS_MOUNT_FRAGMENT_METADATA		= (1UL << 22),
+	BTRFS_MOUNT_FREE_SPACE_TREE		= (1UL << 23),
+	BTRFS_MOUNT_NOLOGREPLAY			= (1UL << 24),
+	BTRFS_MOUNT_REF_VERIFY			= (1UL << 25),
+	BTRFS_MOUNT_DISCARD_ASYNC		= (1UL << 26),
+	BTRFS_MOUNT_IGNOREBADROOTS		= (1UL << 27),
+	BTRFS_MOUNT_IGNOREDATACSUMS		= (1UL << 28),
+	BTRFS_MOUNT_NODISCARD			= (1UL << 29),
 };
 
 /*
@@ -216,7 +220,8 @@ enum {
 	 BTRFS_FEATURE_INCOMPAT_NO_HOLES	|	\
 	 BTRFS_FEATURE_INCOMPAT_METADATA_UUID	|	\
 	 BTRFS_FEATURE_INCOMPAT_RAID1C34	|	\
-	 BTRFS_FEATURE_INCOMPAT_ZONED)
+	 BTRFS_FEATURE_INCOMPAT_ZONED		|	\
+	 BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA)
 
 #ifdef CONFIG_BTRFS_DEBUG
 	/*
@@ -225,6 +230,7 @@ enum {
 	 */
 #define BTRFS_FEATURE_INCOMPAT_SUPP		\
 	(BTRFS_FEATURE_INCOMPAT_SUPP_STABLE |	\
+	 BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE | \
 	 BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)
 
 #else
@@ -369,6 +375,7 @@ struct btrfs_fs_info {
 	struct btrfs_root *uuid_root;
 	struct btrfs_root *data_reloc_root;
 	struct btrfs_root *block_group_root;
+	struct btrfs_root *stripe_root;
 
 	/* The log root tree is a directory of all the other log roots */
 	struct btrfs_root *log_root_tree;
@@ -409,7 +416,17 @@ struct btrfs_fs_info {
 
 	struct btrfs_block_rsv empty_block_rsv;
 
+	/*
+	 * Updated while holding the lock 'trans_lock'. Due to the life cycle of
+	 * a transaction, it can be directly read while holding a transaction
+	 * handle, everywhere else must be read with btrfs_get_fs_generation().
+	 * Should always be updated using btrfs_set_fs_generation().
+	 */
 	u64 generation;
+	/*
+	 * Always use btrfs_get_last_trans_committed() and
+	 * btrfs_set_last_trans_committed() to read and update this field.
+	 */
 	u64 last_trans_committed;
 	/*
 	 * Generation of the last transaction used for block group relocation
@@ -645,9 +662,6 @@ struct btrfs_fs_info {
 
 	struct btrfs_discard_ctl discard_ctl;
 
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-	u32 check_integrity_print_mask;
-#endif
 	/* Is qgroup tracking in a consistent state? */
 	u64 qgroup_flags;
 
@@ -683,6 +697,7 @@ struct btrfs_fs_info {
 	/* Protected by qgroup_rescan_lock */
 	bool qgroup_rescan_running;
 	u8 qgroup_drop_subtree_thres;
+	u64 qgroup_enable_gen;
 
 	/*
 	 * If this is not 0, then it indicates a serious filesystem error has
@@ -812,6 +827,26 @@ struct btrfs_fs_info {
 #endif
 };
 
+static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info)
+{
+	return READ_ONCE(fs_info->generation);
+}
+
+static inline void btrfs_set_fs_generation(struct btrfs_fs_info *fs_info, u64 gen)
+{
+	WRITE_ONCE(fs_info->generation, gen);
+}
+
+static inline u64 btrfs_get_last_trans_committed(const struct btrfs_fs_info *fs_info)
+{
+	return READ_ONCE(fs_info->last_trans_committed);
+}
+
+static inline void btrfs_set_last_trans_committed(struct btrfs_fs_info *fs_info, u64 gen)
+{
+	WRITE_ONCE(fs_info->last_trans_committed, gen);
+}
+
 static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info,
 						u64 gen)
 {
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 4c322b720a80..7d734830e514 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -167,7 +167,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
 	memmove_extent_buffer(leaf, ptr, ptr + del_len,
 			      item_size - (ptr + del_len - item_start));
 
-	btrfs_truncate_item(path, item_size - del_len, 1);
+	btrfs_truncate_item(trans, path, item_size - del_len, 1);
 
 out:
 	btrfs_free_path(path);
@@ -229,7 +229,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 	item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
 	memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
 			      item_size - (ptr + sub_item_len - item_start));
-	btrfs_truncate_item(path, item_size - sub_item_len, 1);
+	btrfs_truncate_item(trans, path, item_size - sub_item_len, 1);
 out:
 	btrfs_free_path(path);
 
@@ -247,7 +247,7 @@ out:
 }
 
 /*
- * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree.
+ * Insert an extended inode ref into a tree.
  *
  * The caller must have checked against BTRFS_LINK_MAX already.
  */
@@ -282,7 +282,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
 						   name))
 			goto out;
 
-		btrfs_extend_item(path, ins_len);
+		btrfs_extend_item(trans, path, ins_len);
 		ret = 0;
 	}
 	if (ret < 0)
@@ -299,7 +299,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
 
 	ptr = (unsigned long)&extref->name;
 	write_extent_buffer(path->nodes[0], name->name, ptr, name->len);
-	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
 
 out:
 	btrfs_free_path(path);
@@ -338,7 +338,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 			goto out;
 
 		old_size = btrfs_item_size(path->nodes[0], path->slots[0]);
-		btrfs_extend_item(path, ins_len);
+		btrfs_extend_item(trans, path, ins_len);
 		ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				     struct btrfs_inode_ref);
 		ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
@@ -364,7 +364,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 		ptr = (unsigned long)(ref + 1);
 	}
 	write_extent_buffer(path->nodes[0], name->name, ptr, name->len);
-	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
 
 out:
 	btrfs_free_path(path);
@@ -591,7 +591,7 @@ search_again:
 				num_dec = (orig_num_bytes - extent_num_bytes);
 				if (extent_start != 0)
 					control->sub_bytes += num_dec;
-				btrfs_mark_buffer_dirty(leaf);
+				btrfs_mark_buffer_dirty(trans, leaf);
 			} else {
 				extent_num_bytes =
 					btrfs_file_extent_disk_num_bytes(leaf, fi);
@@ -617,7 +617,7 @@ search_again:
 
 				btrfs_set_file_extent_ram_bytes(leaf, fi, size);
 				size = btrfs_file_extent_calc_inline_size(size);
-				btrfs_truncate_item(path, size, 1);
+				btrfs_truncate_item(trans, path, size, 1);
 			} else if (!del_item) {
 				/*
 				 * We have to bail so the last_size is set to
@@ -676,7 +676,8 @@ delete:
 			bytes_deleted += extent_num_bytes;
 
 			btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
-					extent_start, extent_num_bytes, 0);
+					extent_start, extent_num_bytes, 0,
+					root->root_key.objectid);
 			btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
 					control->ino, extent_offset,
 					root->root_key.objectid, false);
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
index ede43b6c6559..4337bb26f419 100644
--- a/fs/btrfs/inode-item.h
+++ b/fs/btrfs/inode-item.h
@@ -4,6 +4,7 @@
 #define BTRFS_INODE_ITEM_H
 
 #include <linux/types.h>
+#include <linux/crc32c.h>
 
 struct btrfs_trans_handle;
 struct btrfs_root;
@@ -12,6 +13,7 @@ struct btrfs_key;
 struct btrfs_inode_extref;
 struct btrfs_inode;
 struct extent_buffer;
+struct fscrypt_str;
 
 /*
  * Return this if we need to call truncate_block for the last bit of the
@@ -76,6 +78,12 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags,
 	*ro_flags = (u32)(inode_item_flags >> 32);
 }
 
+/* Key offset of an extended inode ref: crc32c of @name seeded with @parent_objectid. */
+static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, int len)
+{
+	return (u64)crc32c(parent_objectid, name, len);
+}
+
 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       struct btrfs_truncate_control *control);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7814b9d654ce..5e3fccddde0c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -71,6 +71,7 @@
 #include "super.h"
 #include "orphan.h"
 #include "backref.h"
+#include "raid-stripe-tree.h"
 
 struct btrfs_iget_args {
 	u64 ino;
@@ -348,7 +349,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
 }
 
 /*
- * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
+ * Lock inode i_rwsem based on arguments passed.
  *
  * ilock_flags can have the following bit set:
  *
@@ -382,7 +383,7 @@ int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
 }
 
 /*
- * btrfs_inode_unlock - unock inode i_rwsem
+ * Unlock inode i_rwsem.
  *
  * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
  * to decide whether the lock acquired is shared or exclusive.
@@ -573,7 +574,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
 		kunmap_local(kaddr);
 		put_page(page);
 	}
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 	btrfs_release_path(path);
 
 	/*
@@ -670,7 +671,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
 	}
 
 	btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
-	ret = btrfs_update_inode(trans, root, inode);
+	ret = btrfs_update_inode(trans, inode);
 	if (ret && ret != -ENOSPC) {
 		btrfs_abort_transaction(trans, ret);
 		goto out;
@@ -1565,8 +1566,11 @@ out_unlock:
  * Phase two of compressed writeback.  This is the ordered portion of the code,
  * which only gets called in the order the work was queued.  We walk all the
  * async extents created by compress_file_range and send them down to the disk.
+ *
+ * If called with @do_free == true then it'll try to finish the work and free
+ * the work struct eventually.
  */
-static noinline void submit_compressed_extents(struct btrfs_work *work)
+static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free)
 {
 	struct async_chunk *async_chunk = container_of(work, struct async_chunk,
 						     work);
@@ -1575,6 +1579,21 @@ static noinline void submit_compressed_extents(struct btrfs_work *work)
 	unsigned long nr_pages;
 	u64 alloc_hint = 0;
 
+	if (do_free) {
+		struct async_chunk *async_chunk;
+		struct async_cow *async_cow;
+
+		async_chunk = container_of(work, struct async_chunk, work);
+		btrfs_add_delayed_iput(async_chunk->inode);
+		if (async_chunk->blkcg_css)
+			css_put(async_chunk->blkcg_css);
+
+		async_cow = async_chunk->async_cow;
+		if (atomic_dec_and_test(&async_cow->num_chunks))
+			kvfree(async_cow);
+		return;
+	}
+
 	nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
 		PAGE_SHIFT;
 
@@ -1591,21 +1610,6 @@ static noinline void submit_compressed_extents(struct btrfs_work *work)
 		cond_wake_up_nomb(&fs_info->async_submit_wait);
 }
 
-static noinline void async_cow_free(struct btrfs_work *work)
-{
-	struct async_chunk *async_chunk;
-	struct async_cow *async_cow;
-
-	async_chunk = container_of(work, struct async_chunk, work);
-	btrfs_add_delayed_iput(async_chunk->inode);
-	if (async_chunk->blkcg_css)
-		css_put(async_chunk->blkcg_css);
-
-	async_cow = async_chunk->async_cow;
-	if (atomic_dec_and_test(&async_cow->num_chunks))
-		kvfree(async_cow);
-}
-
 static bool run_delalloc_compressed(struct btrfs_inode *inode,
 				    struct page *locked_page, u64 start,
 				    u64 end, struct writeback_control *wbc)
@@ -1683,7 +1687,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
 		}
 
 		btrfs_init_work(&async_chunk[i].work, compress_file_range,
-				submit_compressed_extents, async_cow_free);
+				submit_compressed_extents);
 
 		nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
 		atomic_add(nr_pages, &fs_info->async_delalloc_pages);
@@ -2235,8 +2239,7 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
 {
 	if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
 		if (inode->defrag_bytes &&
-		    test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
-				   0, NULL))
+		    test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
 			return false;
 		return true;
 	}
@@ -2847,7 +2850,7 @@ int btrfs_writepage_cow_fixup(struct page *page)
 	ihold(inode);
 	btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
 	get_page(page);
-	btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
+	btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
 	fixup->page = page;
 	fixup->inode = BTRFS_I(inode);
 	btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
@@ -2912,7 +2915,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 			btrfs_item_ptr_offset(leaf, path->slots[0]),
 			sizeof(struct btrfs_file_extent_item));
 
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 	btrfs_release_path(path);
 
 	/*
@@ -3070,7 +3073,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
 			goto out;
 		}
 		trans->block_rsv = &inode->block_rsv;
-		ret = btrfs_update_inode_fallback(trans, root, inode);
+		ret = btrfs_update_inode_fallback(trans, inode);
 		if (ret) /* -ENOMEM or corruption */
 			btrfs_abort_transaction(trans, ret);
 		goto out;
@@ -3091,6 +3094,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
 
 	trans->block_rsv = &inode->block_rsv;
 
+	ret = btrfs_insert_raid_extent(trans, ordered_extent);
+	if (ret)
+		goto out;
+
 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
 		compress_type = ordered_extent->compress_type;
 	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
@@ -3136,7 +3143,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
 				 &cached_state);
 
 	btrfs_inode_safe_disk_i_size_write(inode, 0);
-	ret = btrfs_update_inode_fallback(trans, root, inode);
+	ret = btrfs_update_inode_fallback(trans, inode);
 	if (ret) { /* -ENOMEM or corruption */
 		btrfs_abort_transaction(trans, ret);
 		goto out;
@@ -3224,7 +3231,8 @@ out:
 int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
 {
 	if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) &&
-	    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
+	    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
+	    list_empty(&ordered->bioc_list))
 		btrfs_finish_ordered_zoned(ordered);
 	return btrfs_finish_one_ordered(ordered);
 }
@@ -3282,7 +3290,7 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
 
 	if (btrfs_is_data_reloc_root(inode->root) &&
 	    test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
-			   1, NULL)) {
+			   NULL)) {
 		/* Skip the range without csum for data reloc inode */
 		clear_extent_bits(&inode->io_tree, file_offset, end,
 				  EXTENT_NODATASUM);
@@ -3306,7 +3314,7 @@ zeroit:
 }
 
 /*
- * btrfs_add_delayed_iput - perform a delayed iput on @inode
+ * Perform a delayed iput on @inode.
  *
  * @inode: The inode we want to perform iput on
  *
@@ -3754,19 +3762,17 @@ static int btrfs_read_locked_inode(struct inode *inode,
 	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
 			round_up(i_size_read(inode), fs_info->sectorsize));
 
-	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
-	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
+	inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime),
+			btrfs_timespec_nsec(leaf, &inode_item->atime));
 
-	inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
-	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
+	inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
+			btrfs_timespec_nsec(leaf, &inode_item->mtime));
 
 	inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
 			btrfs_timespec_nsec(leaf, &inode_item->ctime));
 
-	BTRFS_I(inode)->i_otime.tv_sec =
-		btrfs_timespec_sec(leaf, &inode_item->otime);
-	BTRFS_I(inode)->i_otime.tv_nsec =
-		btrfs_timespec_nsec(leaf, &inode_item->otime);
+	BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
+	BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);
 
 	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
 	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
@@ -3792,7 +3798,7 @@ cache_index:
 	 * This is required for both inode re-read from disk and delayed inode
 	 * in delayed_nodes_tree.
 	 */
-	if (BTRFS_I(inode)->last_trans == fs_info->generation)
+	if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
 		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 			&BTRFS_I(inode)->runtime_flags);
 
@@ -3922,24 +3928,22 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
 
 	btrfs_set_token_timespec_sec(&token, &item->atime,
-				     inode->i_atime.tv_sec);
+				     inode_get_atime_sec(inode));
 	btrfs_set_token_timespec_nsec(&token, &item->atime,
-				      inode->i_atime.tv_nsec);
+				      inode_get_atime_nsec(inode));
 
 	btrfs_set_token_timespec_sec(&token, &item->mtime,
-				     inode->i_mtime.tv_sec);
+				     inode_get_mtime_sec(inode));
 	btrfs_set_token_timespec_nsec(&token, &item->mtime,
-				      inode->i_mtime.tv_nsec);
+				      inode_get_mtime_nsec(inode));
 
 	btrfs_set_token_timespec_sec(&token, &item->ctime,
-				     inode_get_ctime(inode).tv_sec);
+				     inode_get_ctime_sec(inode));
 	btrfs_set_token_timespec_nsec(&token, &item->ctime,
-				      inode_get_ctime(inode).tv_nsec);
+				      inode_get_ctime_nsec(inode));
 
-	btrfs_set_token_timespec_sec(&token, &item->otime,
-				     BTRFS_I(inode)->i_otime.tv_sec);
-	btrfs_set_token_timespec_nsec(&token, &item->otime,
-				      BTRFS_I(inode)->i_otime.tv_nsec);
+	btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec);
+	btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec);
 
 	btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
 	btrfs_set_token_inode_generation(&token, item,
@@ -3957,8 +3961,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
  * copy everything in the in-memory inode into the btree.
  */
 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				struct btrfs_inode *inode)
+					    struct btrfs_inode *inode)
 {
 	struct btrfs_inode_item *inode_item;
 	struct btrfs_path *path;
@@ -3969,7 +3972,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
+	ret = btrfs_lookup_inode(trans, inode->root, path, &inode->location, 1);
 	if (ret) {
 		if (ret > 0)
 			ret = -ENOENT;
@@ -3981,7 +3984,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
 				    struct btrfs_inode_item);
 
 	fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 	btrfs_set_inode_last_trans(trans, inode);
 	ret = 0;
 failed:
@@ -3992,10 +3995,10 @@ failed:
 /*
  * copy everything in the in-memory inode into the btree.
  */
-noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				struct btrfs_inode *inode)
+int btrfs_update_inode(struct btrfs_trans_handle *trans,
+		       struct btrfs_inode *inode)
 {
+	struct btrfs_root *root = inode->root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
 
@@ -4011,23 +4014,23 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
 	    && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
 		btrfs_update_root_times(trans, root);
 
-		ret = btrfs_delayed_update_inode(trans, root, inode);
+		ret = btrfs_delayed_update_inode(trans, inode);
 		if (!ret)
 			btrfs_set_inode_last_trans(trans, inode);
 		return ret;
 	}
 
-	return btrfs_update_inode_item(trans, root, inode);
+	return btrfs_update_inode_item(trans, inode);
 }
 
 int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root, struct btrfs_inode *inode)
+				struct btrfs_inode *inode)
 {
 	int ret;
 
-	ret = btrfs_update_inode(trans, root, inode);
+	ret = btrfs_update_inode(trans, inode);
 	if (ret == -ENOSPC)
-		return btrfs_update_inode_item(trans, root, inode);
+		return btrfs_update_inode_item(trans, inode);
 	return ret;
 }
 
@@ -4132,9 +4135,9 @@ err:
 	btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
 	inode_inc_iversion(&inode->vfs_inode);
 	inode_inc_iversion(&dir->vfs_inode);
 	inode_set_ctime_current(&inode->vfs_inode);
-	dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode);
-	ret = btrfs_update_inode(trans, root, dir);
+	inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
+	ret = btrfs_update_inode(trans, dir);
 out:
 	return ret;
 }
@@ -4148,7 +4150,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 	ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
 	if (!ret) {
 		drop_nlink(&inode->vfs_inode);
-		ret = btrfs_update_inode(trans, inode->root, inode);
+		ret = btrfs_update_inode(trans, inode);
 	}
 	return ret;
 }
@@ -4306,8 +4308,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
 
 	btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
 	inode_inc_iversion(&dir->vfs_inode);
-	dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode);
-	ret = btrfs_update_inode_fallback(trans, root, dir);
+	inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
+	ret = btrfs_update_inode_fallback(trans, dir);
 	if (ret)
 		btrfs_abort_transaction(trans, ret);
 out:
@@ -4641,7 +4643,8 @@ out_notrans:
 }
 
 /*
- * btrfs_truncate_block - read, zero a chunk and write a block
+ * Read, zero a chunk and write a block.
+ *
  * @inode - inode that we're zeroing
  * @from - the offset to start zeroing
  * @len - the length to zero, 0 to zero the entire range respective to the
@@ -4791,9 +4794,9 @@ out:
 	return ret;
 }
 
-static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
-			     u64 offset, u64 len)
+static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len)
 {
+	struct btrfs_root *root = inode->root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_drop_extents_args drop_args = { 0 };
@@ -4833,7 +4836,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
 		btrfs_abort_transaction(trans, ret);
 	} else {
 		btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
-		btrfs_update_inode(trans, root, inode);
+		btrfs_update_inode(trans, inode);
 	}
 	btrfs_end_transaction(trans);
 	return ret;
@@ -4889,8 +4892,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
 			struct extent_map *hole_em;
 
-			err = maybe_insert_hole(root, inode, cur_offset,
-						hole_size);
+			err = maybe_insert_hole(inode, cur_offset, hole_size);
 			if (err)
 				break;
 
@@ -4916,7 +4918,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
 			hole_em->orig_block_len = 0;
 			hole_em->ram_bytes = hole_size;
 			hole_em->compress_type = BTRFS_COMPRESS_NONE;
-			hole_em->generation = fs_info->generation;
+			hole_em->generation = btrfs_get_fs_generation(fs_info);
 
 			err = btrfs_replace_extent_map_range(inode, hole_em, true);
 			free_extent_map(hole_em);
@@ -4956,7 +4958,8 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 	if (newsize != oldsize) {
 		inode_inc_iversion(inode);
 		if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
-			inode->i_mtime = inode_set_ctime_current(inode);
+			inode_set_mtime_to_ts(inode,
+					      inode_set_ctime_current(inode));
 		}
 	}
 
@@ -4984,7 +4987,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 		i_size_write(inode, newsize);
 		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
 		pagecache_isize_extended(inode, oldsize, newsize);
-		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+		ret = btrfs_update_inode(trans, BTRFS_I(inode));
 		btrfs_drew_write_unlock(&root->snapshot_lock);
 		btrfs_end_transaction(trans);
 	} else {
@@ -5582,6 +5585,7 @@ static struct inode *new_simple_dir(struct inode *dir,
 				    struct btrfs_key *key,
 				    struct btrfs_root *root)
 {
+	struct timespec64 ts;
 	struct inode *inode = new_inode(dir->i_sb);
 
 	if (!inode)
@@ -5600,9 +5604,13 @@ static struct inode *new_simple_dir(struct inode *dir,
 	inode->i_opflags &= ~IOP_XATTR;
 	inode->i_fop = &simple_dir_operations;
 	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
-	inode->i_mtime = inode_set_ctime_current(inode);
-	inode->i_atime = dir->i_atime;
-	BTRFS_I(inode)->i_otime = inode->i_mtime;
+
+	ts = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, ts);
+	inode_set_atime_to_ts(inode, inode_get_atime(dir));
+	BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
+	BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
+
 	inode->i_uid = dir->i_uid;
 	inode->i_gid = dir->i_gid;
 
@@ -6000,15 +6008,15 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode)
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
-	ret = btrfs_update_inode(trans, root, inode);
-	if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
+	ret = btrfs_update_inode(trans, inode);
+	if (ret == -ENOSPC || ret == -EDQUOT) {
 		/* whoops, lets try again with the full transaction */
 		btrfs_end_transaction(trans);
 		trans = btrfs_start_transaction(root, 1);
 		if (IS_ERR(trans))
 			return PTR_ERR(trans);
 
-		ret = btrfs_update_inode(trans, root, inode);
+		ret = btrfs_update_inode(trans, inode);
 	}
 	btrfs_end_transaction(trans);
 	if (inode->delayed_node)
@@ -6024,7 +6032,7 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode)
 static int btrfs_update_time(struct inode *inode, int flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	bool dirty = flags & ~S_VERSION;
+	bool dirty;
 
 	if (btrfs_root_readonly(root))
 		return -EROFS;
@@ -6160,6 +6168,7 @@ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *
 int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
 			   struct btrfs_new_inode_args *args)
 {
+	struct timespec64 ts;
 	struct inode *dir = args->dir;
 	struct inode *inode = args->inode;
 	const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
@@ -6277,9 +6286,9 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
 		goto discard;
 	}
 
-	inode->i_mtime = inode_set_ctime_current(inode);
-	inode->i_atime = inode->i_mtime;
-	BTRFS_I(inode)->i_otime = inode->i_mtime;
+	ts = simple_inode_init_ts(inode);
+	BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
+	BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
 
 	/*
 	 * We're going to fill the inode item now, so at this point the inode
@@ -6310,7 +6319,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
 		}
 	}
 
-	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
 	/*
 	 * We don't need the path anymore, plus inheriting properties, adding
 	 * ACLs, security xattrs, orphan item or adding the link, will result in
@@ -6444,10 +6453,10 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
 	 * values (the ones it had when the fsync was done).
 	 */
 	if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
-		parent_inode->vfs_inode.i_mtime =
-			inode_set_ctime_current(&parent_inode->vfs_inode);
+		inode_set_mtime_to_ts(&parent_inode->vfs_inode,
+				      inode_set_ctime_current(&parent_inode->vfs_inode));
 
-	ret = btrfs_update_inode(trans, root, parent_inode);
+	ret = btrfs_update_inode(trans, parent_inode);
 	if (ret)
 		btrfs_abort_transaction(trans, ret);
 	return ret;
@@ -6598,7 +6607,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	} else {
 		struct dentry *parent = dentry->d_parent;
 
-		err = btrfs_update_inode(trans, root, BTRFS_I(inode));
+		err = btrfs_update_inode(trans, BTRFS_I(inode));
 		if (err)
 			goto fail;
 		if (inode->i_nlink == 1) {
@@ -7103,8 +7112,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
 
 		range_end = round_up(offset + nocow_args.num_bytes,
 				     root->fs_info->sectorsize) - 1;
-		ret = test_range_bit(io_tree, offset, range_end,
-				     EXTENT_DELALLOC, 0, NULL);
+		ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC);
 		if (ret) {
 			ret = -EAGAIN;
 			goto out;
@@ -8005,11 +8013,11 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
 					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
 					 EXTENT_DEFRAG, &cached_state);
 
-		spin_lock_irq(&inode->ordered_tree.lock);
+		spin_lock_irq(&inode->ordered_tree_lock);
 		set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
 		ordered->truncated_len = min(ordered->truncated_len,
 					     cur - ordered->file_offset);
-		spin_unlock_irq(&inode->ordered_tree.lock);
+		spin_unlock_irq(&inode->ordered_tree_lock);
 
 		/*
 		 * If the ordered extent has finished, we're safe to delete all
@@ -8339,7 +8347,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
 		if (ret != -ENOSPC && ret != -EAGAIN)
 			break;
 
-		ret = btrfs_update_inode(trans, root, inode);
+		ret = btrfs_update_inode(trans, inode);
 		if (ret)
 			break;
 
@@ -8392,7 +8400,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
 		int ret2;
 
 		trans->block_rsv = &fs_info->trans_block_rsv;
-		ret2 = btrfs_update_inode(trans, root, inode);
+		ret2 = btrfs_update_inode(trans, inode);
 		if (ret2 && !ret)
 			ret = ret2;
 
@@ -8481,8 +8489,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 
 	ei->delayed_node = NULL;
 
-	ei->i_otime.tv_sec = 0;
-	ei->i_otime.tv_nsec = 0;
+	ei->i_otime_sec = 0;
+	ei->i_otime_nsec = 0;
 
 	inode = &ei->vfs_inode;
 	extent_map_tree_init(&ei->extent_tree);
@@ -8491,7 +8499,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	extent_io_tree_init(fs_info, &ei->file_extent_tree,
 			    IO_TREE_INODE_FILE_EXTENT);
 	mutex_init(&ei->log_mutex);
-	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
+	spin_lock_init(&ei->ordered_tree_lock);
+	ei->ordered_tree = RB_ROOT;
+	ei->ordered_tree_last = NULL;
 	INIT_LIST_HEAD(&ei->delalloc_inodes);
 	INIT_LIST_HEAD(&ei->delayed_iput);
 	RB_CLEAR_NODE(&ei->rb_node);
@@ -8634,8 +8644,8 @@ static int btrfs_getattr(struct mnt_idmap *idmap,
 	u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
 
 	stat->result_mask |= STATX_BTIME;
-	stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
-	stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
+	stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec;
+	stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec;
 	if (bi_flags & BTRFS_INODE_APPEND)
 		stat->attributes |= STATX_ATTR_APPEND;
 	if (bi_flags & BTRFS_INODE_COMPRESS)
@@ -8823,7 +8833,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
 					   BTRFS_I(old_dentry->d_inode),
 					   old_name, &old_rename_ctx);
 		if (!ret)
-			ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
+			ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
 	}
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
@@ -8838,7 +8848,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
 					   BTRFS_I(new_dentry->d_inode),
 					   new_name, &new_rename_ctx);
 		if (!ret)
-			ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
+			ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
 	}
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
@@ -9083,7 +9093,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
 					   BTRFS_I(d_inode(old_dentry)),
 					   &old_fname.disk_name, &rename_ctx);
 		if (!ret)
-			ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
+			ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
 	}
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
@@ -9208,7 +9218,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
 	init_completion(&work->completion);
 	INIT_LIST_HEAD(&work->list);
 	work->inode = inode;
-	btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
+	btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL);
 
 	return work;
 }
@@ -9446,7 +9456,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 
 	ptr = btrfs_file_extent_inline_start(ei);
 	write_extent_buffer(leaf, symname, ptr, name_len);
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 	btrfs_free_path(path);
 
 	d_instantiate_new(dentry, inode);
@@ -9639,7 +9649,7 @@ next:
 			btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
 		}
 
-		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+		ret = btrfs_update_inode(trans, BTRFS_I(inode));
 
 		if (ret) {
 			btrfs_abort_transaction(trans, ret);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8e7d03bc1b56..752acff2c734 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -385,7 +385,7 @@ update_flags:
 	btrfs_sync_inode_flags_to_i_flags(inode);
 	inode_inc_iversion(inode);
 	inode_set_ctime_current(inode);
-	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+	ret = btrfs_update_inode(trans, BTRFS_I(inode));
 
  out_end_trans:
 	btrfs_end_transaction(trans);
@@ -652,18 +652,18 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
 	/* Tree log can't currently deal with an inode which is a new root. */
 	btrfs_set_log_full_commit(trans);
 
-	ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit);
+	ret = btrfs_qgroup_inherit(trans, 0, objectid, root->root_key.objectid, inherit);
 	if (ret)
 		goto out;
 
 	leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
-				      BTRFS_NESTING_NORMAL);
+				      0, BTRFS_NESTING_NORMAL);
 	if (IS_ERR(leaf)) {
 		ret = PTR_ERR(leaf);
 		goto out;
 	}
 
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 
 	inode_item = &root_item->inode;
 	btrfs_set_stack_inode_generation(inode_item, 1);
@@ -2635,6 +2635,12 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
 		return -EINVAL;
 	}
 
+	if (fs_info->fs_devices->temp_fsid) {
+		btrfs_err(fs_info,
+			  "device add not supported on cloned temp-fsid mount");
+		return -EINVAL;
+	}
+
 	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) {
 		if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD))
 			return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
@@ -2676,8 +2682,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
 	struct inode *inode = file_inode(file);
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_ioctl_vol_args_v2 *vol_args;
-	struct block_device *bdev = NULL;
-	void *holder;
+	struct bdev_handle *bdev_handle = NULL;
 	int ret;
 	bool cancel = false;
 
@@ -2714,7 +2719,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
 		goto err_drop;
 
 	/* Exclusive operation is now claimed */
-	ret = btrfs_rm_device(fs_info, &args, &bdev, &holder);
+	ret = btrfs_rm_device(fs_info, &args, &bdev_handle);
 
 	btrfs_exclop_finish(fs_info);
 
@@ -2728,8 +2733,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
 	}
 err_drop:
 	mnt_drop_write_file(file);
-	if (bdev)
-		blkdev_put(bdev, holder);
+	if (bdev_handle)
+		bdev_release(bdev_handle);
 out:
 	btrfs_put_dev_args_from_path(&args);
 	kfree(vol_args);
@@ -2742,8 +2747,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 	struct inode *inode = file_inode(file);
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_ioctl_vol_args *vol_args;
-	struct block_device *bdev = NULL;
-	void *holder;
+	struct bdev_handle *bdev_handle = NULL;
 	int ret;
 	bool cancel = false;
 
@@ -2770,15 +2774,15 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 	ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
 					   cancel);
 	if (ret == 0) {
-		ret = btrfs_rm_device(fs_info, &args, &bdev, &holder);
+		ret = btrfs_rm_device(fs_info, &args, &bdev_handle);
 		if (!ret)
 			btrfs_info(fs_info, "disk deleted %s", vol_args->name);
 		btrfs_exclop_finish(fs_info);
 	}
 
 	mnt_drop_write_file(file);
-	if (bdev)
-		blkdev_put(bdev, holder);
+	if (bdev_handle)
+		bdev_release(bdev_handle);
 out:
 	btrfs_put_dev_args_from_path(&args);
 	kfree(vol_args);
@@ -2822,7 +2826,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
 	}
 
 	if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) {
-		fi_args->generation = fs_info->generation;
+		fi_args->generation = btrfs_get_fs_generation(fs_info);
 		fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION;
 	}
 
@@ -2947,7 +2951,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 
 	btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
 	btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
-	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
 	btrfs_release_path(path);
 
 	btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
@@ -3131,7 +3135,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
 			return PTR_ERR(trans);
 
 		/* No running transaction, don't bother */
-		transid = root->fs_info->last_trans_committed;
+		transid = btrfs_get_last_trans_committed(root->fs_info);
 		goto out;
 	}
 	transid = trans->transid;
@@ -3697,7 +3701,8 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
 
 	switch (sa->cmd) {
 	case BTRFS_QUOTA_CTL_ENABLE:
-		ret = btrfs_quota_enable(fs_info);
+	case BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA:
+		ret = btrfs_quota_enable(fs_info, sa);
 		break;
 	case BTRFS_QUOTA_CTL_DISABLE:
 		ret = btrfs_quota_disable(fs_info);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 7979449a58d6..74d8e2003f58 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -8,6 +8,7 @@
 #include <linux/spinlock.h>
 #include <linux/page-flags.h>
 #include <asm/bug.h>
+#include <trace/events/btrfs.h>
 #include "misc.h"
 #include "ctree.h"
 #include "extent_io.h"
@@ -73,6 +74,7 @@ static struct btrfs_lockdep_keyset {
 	{ .id = BTRFS_UUID_TREE_OBJECTID,	DEFINE_NAME("uuid")	},
 	{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID,	DEFINE_NAME("free-space") },
 	{ .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") },
+	{ .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") },
 	{ .id = 0,				DEFINE_NAME("tree")	},
 };
 
@@ -102,6 +104,15 @@ void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buff
 
 #endif
 
+#ifdef CONFIG_BTRFS_DEBUG
+static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner)
+{
+	eb->lock_owner = owner;
+}
+#else
+static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { }
+#endif
+
 /*
  * Extent buffer locking
  * =====================
@@ -164,7 +175,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
 int btrfs_try_tree_write_lock(struct extent_buffer *eb)
 {
 	if (down_write_trylock(&eb->lock)) {
-		eb->lock_owner = current->pid;
+		btrfs_set_eb_lock_owner(eb, current->pid);
 		trace_btrfs_try_tree_write_lock(eb);
 		return 1;
 	}
@@ -181,7 +192,8 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
 }
 
 /*
- * __btrfs_tree_lock - lock eb for write
+ * Lock eb for write.
+ *
  * @eb:		the eb to lock
  * @nest:	the nesting to use for the lock
  *
@@ -196,7 +208,7 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
 		start_ns = ktime_get_ns();
 
 	down_write_nested(&eb->lock, nest);
-	eb->lock_owner = current->pid;
+	btrfs_set_eb_lock_owner(eb, current->pid);
 	trace_btrfs_tree_lock(eb, start_ns);
 }
 
@@ -211,7 +223,7 @@ void btrfs_tree_lock(struct extent_buffer *eb)
 void btrfs_tree_unlock(struct extent_buffer *eb)
 {
 	trace_btrfs_tree_unlock(eb);
-	eb->lock_owner = 0;
+	btrfs_set_eb_lock_owner(eb, 0);
 	up_write(&eb->lock);
 }
 
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index 7695decc7243..b8f9c9e56c8c 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -72,11 +72,11 @@ static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf)
  *        over the error.  Each subsequent error that doesn't have any context
  *        of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR.
  */
-const char * __attribute_const__ btrfs_decode_error(int errno)
+const char * __attribute_const__ btrfs_decode_error(int error)
 {
 	char *errstr = "unknown";
 
-	switch (errno) {
+	switch (error) {
 	case -ENOENT:		/* -2 */
 		errstr = "No such entry";
 		break;
@@ -110,12 +110,12 @@ const char * __attribute_const__ btrfs_decode_error(int errno)
 }
 
 /*
- * __btrfs_handle_fs_error decodes expected errors from the caller and
- * invokes the appropriate error response.
+ * Decodes expected errors from the caller and invokes the appropriate error
+ * response.
  */
 __cold
 void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
-		       unsigned int line, int errno, const char *fmt, ...)
+		       unsigned int line, int error, const char *fmt, ...)
 {
 	struct super_block *sb = fs_info->sb;
 #ifdef CONFIG_PRINTK
@@ -132,11 +132,11 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
 	 * Special case: if the error is EROFS, and we're already under
 	 * SB_RDONLY, then it is safe here.
 	 */
-	if (errno == -EROFS && sb_rdonly(sb))
+	if (error == -EROFS && sb_rdonly(sb))
 		return;
 
 #ifdef CONFIG_PRINTK
-	errstr = btrfs_decode_error(errno);
+	errstr = btrfs_decode_error(error);
 	btrfs_state_to_string(fs_info, statestr);
 	if (fmt) {
 		struct va_format vaf;
@@ -147,11 +147,11 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
 		vaf.va = &args;
 
 		pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n",
-			sb->s_id, statestr, function, line, errno, errstr, &vaf);
+			sb->s_id, statestr, function, line, error, errstr, &vaf);
 		va_end(args);
 	} else {
 		pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n",
-			sb->s_id, statestr, function, line, errno, errstr);
+			sb->s_id, statestr, function, line, error, errstr);
 	}
 #endif
 
@@ -159,7 +159,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
 	 * Today we only save the error info to memory.  Long term we'll also
 	 * send it down to the disk.
 	 */
-	WRITE_ONCE(fs_info->fs_error, errno);
+	WRITE_ONCE(fs_info->fs_error, error);
 
 	/* Don't go through full error handling during mount. */
 	if (!(sb->s_flags & SB_BORN))
@@ -283,12 +283,12 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
 #endif
 
 /*
- * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an
- * alert, and either panics or BUGs, depending on mount options.
+ * Decode unexpected, fatal errors from the caller, issue an alert, and either
+ * panic or BUG, depending on mount options.
  */
 __cold
 void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
-		   unsigned int line, int errno, const char *fmt, ...)
+		   unsigned int line, int error, const char *fmt, ...)
 {
 	char *s_id = "<unknown>";
 	const char *errstr;
@@ -301,13 +301,13 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
 	va_start(args, fmt);
 	vaf.va = &args;
 
-	errstr = btrfs_decode_error(errno);
+	errstr = btrfs_decode_error(error);
 	if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR)))
 		panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
-			s_id, function, line, &vaf, errno, errstr);
+			s_id, function, line, &vaf, error, errstr);
 
 	btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)",
-		   function, line, &vaf, errno, errstr);
+		   function, line, &vaf, error, errstr);
 	va_end(args);
 	/* Caller calls BUG() */
 }
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
index 1ae6f8e23e07..4d04c1fa5899 100644
--- a/fs/btrfs/messages.h
+++ b/fs/btrfs/messages.h
@@ -184,25 +184,25 @@ do {								\
 __printf(5, 6)
 __cold
 void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
-		     unsigned int line, int errno, const char *fmt, ...);
+		     unsigned int line, int error, const char *fmt, ...);
 
-const char * __attribute_const__ btrfs_decode_error(int errno);
+const char * __attribute_const__ btrfs_decode_error(int error);
 
-#define btrfs_handle_fs_error(fs_info, errno, fmt, args...)		\
+#define btrfs_handle_fs_error(fs_info, error, fmt, args...)		\
 	__btrfs_handle_fs_error((fs_info), __func__, __LINE__,		\
-				(errno), fmt, ##args)
+				(error), fmt, ##args)
 
 __printf(5, 6)
 __cold
 void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
-		   unsigned int line, int errno, const char *fmt, ...);
+		   unsigned int line, int error, const char *fmt, ...);
 /*
  * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
  * will panic().  Otherwise we BUG() here.
  */
-#define btrfs_panic(fs_info, errno, fmt, args...)			\
+#define btrfs_panic(fs_info, error, fmt, args...)			\
 do {									\
-	__btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args);	\
+	__btrfs_panic(fs_info, __func__, __LINE__, error, fmt, ##args);	\
 	BUG();								\
 } while (0)
 
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 345c449d588c..574e8a55e24a 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -124,25 +124,24 @@ static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
  * look find the first ordered struct that has this offset, otherwise
  * the first one less than this offset
  */
-static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
-					  u64 file_offset)
+static inline struct rb_node *ordered_tree_search(struct btrfs_inode *inode,
+						  u64 file_offset)
 {
-	struct rb_root *root = &tree->tree;
 	struct rb_node *prev = NULL;
 	struct rb_node *ret;
 	struct btrfs_ordered_extent *entry;
 
-	if (tree->last) {
-		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
+	if (inode->ordered_tree_last) {
+		entry = rb_entry(inode->ordered_tree_last, struct btrfs_ordered_extent,
 				 rb_node);
 		if (in_range(file_offset, entry->file_offset, entry->num_bytes))
-			return tree->last;
+			return inode->ordered_tree_last;
 	}
-	ret = __tree_search(root, file_offset, &prev);
+	ret = __tree_search(&inode->ordered_tree, file_offset, &prev);
 	if (!ret)
 		ret = prev;
 	if (ret)
-		tree->last = ret;
+		inode->ordered_tree_last = ret;
 	return ret;
 }
 
@@ -191,6 +190,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
 	INIT_LIST_HEAD(&entry->log_list);
 	INIT_LIST_HEAD(&entry->root_extent_list);
 	INIT_LIST_HEAD(&entry->work_list);
+	INIT_LIST_HEAD(&entry->bioc_list);
 	init_completion(&entry->completion);
 
 	/*
@@ -208,7 +208,6 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
 static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
 {
 	struct btrfs_inode *inode = BTRFS_I(entry->inode);
-	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
 	struct btrfs_root *root = inode->root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct rb_node *node;
@@ -221,13 +220,14 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
 	/* One ref for the tree. */
 	refcount_inc(&entry->refs);
 
-	spin_lock_irq(&tree->lock);
-	node = tree_insert(&tree->tree, entry->file_offset, &entry->rb_node);
+	spin_lock_irq(&inode->ordered_tree_lock);
+	node = tree_insert(&inode->ordered_tree, entry->file_offset,
+			   &entry->rb_node);
 	if (node)
 		btrfs_panic(fs_info, -EEXIST,
 				"inconsistency in ordered tree at offset %llu",
 				entry->file_offset);
-	spin_unlock_irq(&tree->lock);
+	spin_unlock_irq(&inode->ordered_tree_lock);
 
 	spin_lock(&root->ordered_extent_lock);
 	list_add_tail(&entry->root_extent_list,
@@ -287,12 +287,11 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
 void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
 			   struct btrfs_ordered_sum *sum)
 {
-	struct btrfs_ordered_inode_tree *tree;
+	struct btrfs_inode *inode = BTRFS_I(entry->inode);
 
-	tree = &BTRFS_I(entry->inode)->ordered_tree;
-	spin_lock_irq(&tree->lock);
+	spin_lock_irq(&inode->ordered_tree_lock);
 	list_add_tail(&sum->list, &entry->list);
-	spin_unlock_irq(&tree->lock);
+	spin_unlock_irq(&inode->ordered_tree_lock);
 }
 
 static void finish_ordered_fn(struct btrfs_work *work)
@@ -310,7 +309,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
 	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 
-	lockdep_assert_held(&inode->ordered_tree.lock);
+	lockdep_assert_held(&inode->ordered_tree_lock);
 
 	if (page) {
 		ASSERT(page->mapping);
@@ -364,7 +363,7 @@ static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered)
 	struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ?
 		fs_info->endio_freespace_worker : fs_info->endio_write_workers;
 
-	btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
+	btrfs_init_work(&ordered->work, finish_ordered_fn, NULL);
 	btrfs_queue_work(wq, &ordered->work);
 }
 
@@ -378,9 +377,9 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
 
 	trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate);
 
-	spin_lock_irqsave(&inode->ordered_tree.lock, flags);
+	spin_lock_irqsave(&inode->ordered_tree_lock, flags);
 	ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate);
-	spin_unlock_irqrestore(&inode->ordered_tree.lock, flags);
+	spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
 
 	if (ret)
 		btrfs_queue_ordered_fn(ordered);
@@ -404,7 +403,6 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
 				    struct page *page, u64 file_offset,
 				    u64 num_bytes, bool uptodate)
 {
-	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
 	struct rb_node *node;
 	struct btrfs_ordered_extent *entry = NULL;
 	unsigned long flags;
@@ -414,13 +412,13 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
 					  file_offset + num_bytes - 1,
 					  uptodate);
 
-	spin_lock_irqsave(&tree->lock, flags);
+	spin_lock_irqsave(&inode->ordered_tree_lock, flags);
 	while (cur < file_offset + num_bytes) {
 		u64 entry_end;
 		u64 end;
 		u32 len;
 
-		node = tree_search(tree, cur);
+		node = ordered_tree_search(inode, cur);
 		/* No ordered extents at all */
 		if (!node)
 			break;
@@ -467,13 +465,13 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
 		len = end + 1 - cur;
 
 		if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) {
-			spin_unlock_irqrestore(&tree->lock, flags);
+			spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
 			btrfs_queue_ordered_fn(entry);
-			spin_lock_irqsave(&tree->lock, flags);
+			spin_lock_irqsave(&inode->ordered_tree_lock, flags);
 		}
 		cur += len;
 	}
-	spin_unlock_irqrestore(&tree->lock, flags);
+	spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
 }
 
 /*
@@ -497,19 +495,18 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
 				    struct btrfs_ordered_extent **cached,
 				    u64 file_offset, u64 io_size)
 {
-	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
 	struct rb_node *node;
 	struct btrfs_ordered_extent *entry = NULL;
 	unsigned long flags;
 	bool finished = false;
 
-	spin_lock_irqsave(&tree->lock, flags);
+	spin_lock_irqsave(&inode->ordered_tree_lock, flags);
 	if (cached && *cached) {
 		entry = *cached;
 		goto have_entry;
 	}
 
-	node = tree_search(tree, file_offset);
+	node = ordered_tree_search(inode, file_offset);
 	if (!node)
 		goto out;
 
@@ -540,7 +537,7 @@ out:
 		refcount_inc(&entry->refs);
 		trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
 	}
-	spin_unlock_irqrestore(&tree->lock, flags);
+	spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
 	return finished;
 }
 
@@ -578,7 +575,6 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
 void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
 				 struct btrfs_ordered_extent *entry)
 {
-	struct btrfs_ordered_inode_tree *tree;
 	struct btrfs_root *root = btrfs_inode->root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct rb_node *node;
@@ -609,16 +605,15 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
 	percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
 				 fs_info->delalloc_batch);
 
-	tree = &btrfs_inode->ordered_tree;
-	spin_lock_irq(&tree->lock);
+	spin_lock_irq(&btrfs_inode->ordered_tree_lock);
 	node = &entry->rb_node;
-	rb_erase(node, &tree->tree);
+	rb_erase(node, &btrfs_inode->ordered_tree);
 	RB_CLEAR_NODE(node);
-	if (tree->last == node)
-		tree->last = NULL;
+	if (btrfs_inode->ordered_tree_last == node)
+		btrfs_inode->ordered_tree_last = NULL;
 	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
 	pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
-	spin_unlock_irq(&tree->lock);
+	spin_unlock_irq(&btrfs_inode->ordered_tree_lock);
 
 	/*
 	 * The current running transaction is waiting on us, we need to let it
@@ -711,7 +706,7 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
 		spin_unlock(&root->ordered_extent_lock);
 
 		btrfs_init_work(&ordered->flush_work,
-				btrfs_run_ordered_extent_work, NULL, NULL);
+				btrfs_run_ordered_extent_work, NULL);
 		list_add_tail(&ordered->work_list, &works);
 		btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
 
@@ -875,14 +870,12 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
 							 u64 file_offset)
 {
-	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
 	struct btrfs_ordered_extent *entry = NULL;
 	unsigned long flags;
 
-	tree = &inode->ordered_tree;
-	spin_lock_irqsave(&tree->lock, flags);
-	node = tree_search(tree, file_offset);
+	spin_lock_irqsave(&inode->ordered_tree_lock, flags);
+	node = ordered_tree_search(inode, file_offset);
 	if (!node)
 		goto out;
 
@@ -894,7 +887,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
 		trace_btrfs_ordered_extent_lookup(inode, entry);
 	}
 out:
-	spin_unlock_irqrestore(&tree->lock, flags);
+	spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
 	return entry;
 }
 
@@ -904,15 +897,13 @@ out:
 struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
 		struct btrfs_inode *inode, u64 file_offset, u64 len)
 {
-	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
 	struct btrfs_ordered_extent *entry = NULL;
 
-	tree = &inode->ordered_tree;
-	spin_lock_irq(&tree->lock);
-	node = tree_search(tree, file_offset);
+	spin_lock_irq(&inode->ordered_tree_lock);
+	node = ordered_tree_search(inode, file_offset);
 	if (!node) {
-		node = tree_search(tree, file_offset + len);
+		node = ordered_tree_search(inode, file_offset + len);
 		if (!node)
 			goto out;
 	}
@@ -936,7 +927,7 @@ out:
 		refcount_inc(&entry->refs);
 		trace_btrfs_ordered_extent_lookup_range(inode, entry);
 	}
-	spin_unlock_irq(&tree->lock);
+	spin_unlock_irq(&inode->ordered_tree_lock);
 	return entry;
 }
 
@@ -947,13 +938,12 @@ out:
 void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
 					   struct list_head *list)
 {
-	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
 	struct rb_node *n;
 
 	ASSERT(inode_is_locked(&inode->vfs_inode));
 
-	spin_lock_irq(&tree->lock);
-	for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+	spin_lock_irq(&inode->ordered_tree_lock);
+	for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) {
 		struct btrfs_ordered_extent *ordered;
 
 		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
@@ -966,7 +956,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
 		refcount_inc(&ordered->refs);
 		trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
 	}
-	spin_unlock_irq(&tree->lock);
+	spin_unlock_irq(&inode->ordered_tree_lock);
 }
 
 /*
@@ -976,13 +966,11 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
 struct btrfs_ordered_extent *
 btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
 {
-	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
 	struct btrfs_ordered_extent *entry = NULL;
 
-	tree = &inode->ordered_tree;
-	spin_lock_irq(&tree->lock);
-	node = tree_search(tree, file_offset);
+	spin_lock_irq(&inode->ordered_tree_lock);
+	node = ordered_tree_search(inode, file_offset);
 	if (!node)
 		goto out;
 
@@ -990,7 +978,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
 	refcount_inc(&entry->refs);
 	trace_btrfs_ordered_extent_lookup_first(inode, entry);
 out:
-	spin_unlock_irq(&tree->lock);
+	spin_unlock_irq(&inode->ordered_tree_lock);
 	return entry;
 }
 
@@ -1006,15 +994,14 @@ out:
 struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
 			struct btrfs_inode *inode, u64 file_offset, u64 len)
 {
-	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
 	struct rb_node *node;
 	struct rb_node *cur;
 	struct rb_node *prev;
 	struct rb_node *next;
 	struct btrfs_ordered_extent *entry = NULL;
 
-	spin_lock_irq(&tree->lock);
-	node = tree->tree.rb_node;
+	spin_lock_irq(&inode->ordered_tree_lock);
+	node = inode->ordered_tree.rb_node;
 	/*
 	 * Here we don't want to use tree_search() which will use tree->last
 	 * and screw up the search order.
@@ -1068,7 +1055,7 @@ out:
 		trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
 	}
 
-	spin_unlock_irq(&tree->lock);
+	spin_unlock_irq(&inode->ordered_tree_lock);
 	return entry;
 }
 
@@ -1147,7 +1134,6 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
 			struct btrfs_ordered_extent *ordered, u64 len)
 {
 	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
-	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
 	struct btrfs_root *root = inode->root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	u64 file_offset = ordered->file_offset;
@@ -1187,13 +1173,13 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
 	refcount_inc(&new->refs);
 
 	spin_lock_irq(&root->ordered_extent_lock);
-	spin_lock(&tree->lock);
+	spin_lock(&inode->ordered_tree_lock);
 	/* Remove from tree once */
 	node = &ordered->rb_node;
-	rb_erase(node, &tree->tree);
+	rb_erase(node, &inode->ordered_tree);
 	RB_CLEAR_NODE(node);
-	if (tree->last == node)
-		tree->last = NULL;
+	if (inode->ordered_tree_last == node)
+		inode->ordered_tree_last = NULL;
 
 	ordered->file_offset += len;
 	ordered->disk_bytenr += len;
@@ -1224,18 +1210,19 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
 	}
 
 	/* Re-insert the node */
-	node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
+	node = tree_insert(&inode->ordered_tree, ordered->file_offset,
+			   &ordered->rb_node);
 	if (node)
 		btrfs_panic(fs_info, -EEXIST,
 			"zoned: inconsistency in ordered tree at offset %llu",
 			ordered->file_offset);
 
-	node = tree_insert(&tree->tree, new->file_offset, &new->rb_node);
+	node = tree_insert(&inode->ordered_tree, new->file_offset, &new->rb_node);
 	if (node)
 		btrfs_panic(fs_info, -EEXIST,
 			"zoned: inconsistency in ordered tree at offset %llu",
 			new->file_offset);
-	spin_unlock(&tree->lock);
+	spin_unlock(&inode->ordered_tree_lock);
 
 	list_add_tail(&new->root_extent_list, &root->ordered_extents);
 	root->nr_ordered_extents++;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 173bd5c5df26..567a6d3d4712 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -6,13 +6,6 @@
 #ifndef BTRFS_ORDERED_DATA_H
 #define BTRFS_ORDERED_DATA_H
 
-/* one of these per inode */
-struct btrfs_ordered_inode_tree {
-	spinlock_t lock;
-	struct rb_root tree;
-	struct rb_node *last;
-};
-
 struct btrfs_ordered_sum {
 	/*
 	 * Logical start address and length for of the blocks covered by
@@ -151,15 +144,9 @@ struct btrfs_ordered_extent {
 	struct completion completion;
 	struct btrfs_work flush_work;
 	struct list_head work_list;
-};
 
-static inline void
-btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
-{
-	spin_lock_init(&t->lock);
-	t->tree = RB_ROOT;
-	t->last = NULL;
-}
+	struct list_head bioc_list;
+};
 
 int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent);
 int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 0c93439e929f..7e46aa8a0444 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -9,6 +9,8 @@
 #include "print-tree.h"
 #include "accessors.h"
 #include "tree-checker.h"
+#include "volumes.h"
+#include "raid-stripe-tree.h"
 
 struct root_name_map {
 	u64 id;
@@ -28,6 +30,7 @@ static const struct root_name_map root_map[] = {
 	{ BTRFS_FREE_SPACE_TREE_OBJECTID,	"FREE_SPACE_TREE"	},
 	{ BTRFS_BLOCK_GROUP_TREE_OBJECTID,	"BLOCK_GROUP_TREE"	},
 	{ BTRFS_DATA_RELOC_TREE_OBJECTID,	"DATA_RELOC_TREE"	},
+	{ BTRFS_RAID_STRIPE_TREE_OBJECTID,	"RAID_STRIPE_TREE"	},
 };
 
 const char *btrfs_root_name(const struct btrfs_key *key, char *buf)
@@ -80,12 +83,20 @@ static void print_extent_data_ref(const struct extent_buffer *eb,
 	       btrfs_extent_data_ref_count(eb, ref));
 }
 
+static void print_extent_owner_ref(const struct extent_buffer *eb,
+				   const struct btrfs_extent_owner_ref *ref)
+{
+	ASSERT(btrfs_fs_incompat(eb->fs_info, SIMPLE_QUOTA));
+	pr_cont("extent data owner root %llu\n", btrfs_extent_owner_ref_root_id(eb, ref));
+}
+
 static void print_extent_item(const struct extent_buffer *eb, int slot, int type)
 {
 	struct btrfs_extent_item *ei;
 	struct btrfs_extent_inline_ref *iref;
 	struct btrfs_extent_data_ref *dref;
 	struct btrfs_shared_data_ref *sref;
+	struct btrfs_extent_owner_ref *oref;
 	struct btrfs_disk_key key;
 	unsigned long end;
 	unsigned long ptr;
@@ -161,6 +172,10 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type
 			"\t\t\t(parent %llu not aligned to sectorsize %u)\n",
 				     offset, eb->fs_info->sectorsize);
 			break;
+		case BTRFS_EXTENT_OWNER_REF_KEY:
+			oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
+			print_extent_owner_ref(eb, oref);
+			break;
 		default:
 			pr_cont("(extent %llu has INVALID ref type %d)\n",
 				  eb->start, type);
@@ -189,6 +204,22 @@ static void print_uuid_item(const struct extent_buffer *l, unsigned long offset,
 	}
 }
 
+static void print_raid_stripe_key(const struct extent_buffer *eb, u32 item_size,
+				  struct btrfs_stripe_extent *stripe)
+{
+	const int num_stripes = btrfs_num_raid_stripes(item_size);
+	const u8 encoding = btrfs_stripe_extent_encoding(eb, stripe);
+
+	pr_info("\t\t\tencoding: %s\n",
+		(encoding && encoding < BTRFS_NR_RAID_TYPES) ?
+		btrfs_raid_array[encoding].raid_name : "unknown");
+
+	for (int i = 0; i < num_stripes; i++)
+		pr_info("\t\t\tstride %d devid %llu physical %llu\n",
+			i, btrfs_raid_stride_devid(eb, &stripe->strides[i]),
+			btrfs_raid_stride_physical(eb, &stripe->strides[i]));
+}
+
 /*
  * Helper to output refs and locking status of extent buffer.  Useful to debug
  * race condition related problems.
@@ -349,6 +380,10 @@ void btrfs_print_leaf(const struct extent_buffer *l)
 			print_uuid_item(l, btrfs_item_ptr_offset(l, i),
 					btrfs_item_size(l, i));
 			break;
+		case BTRFS_RAID_STRIPE_KEY:
+			print_raid_stripe_key(l, btrfs_item_size(l, i),
+				btrfs_item_ptr(l, i, struct btrfs_stripe_extent));
+			break;
 		}
 	}
 }
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 0755af0e53e3..f9bf591a0718 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -15,6 +15,7 @@
 #include "fs.h"
 #include "accessors.h"
 #include "super.h"
+#include "dir-item.h"
 
 #define BTRFS_PROP_HANDLERS_HT_BITS 8
 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index b99230db3c82..edb84cc03237 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -30,6 +30,25 @@
 #include "root-tree.h"
 #include "tree-checker.h"
 
+enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info)
+{
+	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+		return BTRFS_QGROUP_MODE_DISABLED;
+	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE)
+		return BTRFS_QGROUP_MODE_SIMPLE;
+	return BTRFS_QGROUP_MODE_FULL;
+}
+
+bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info)
+{
+	return btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_DISABLED;
+}
+
+bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info)
+{
+	return btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL;
+}
+
 /*
  * Helpers to access qgroup reservation
  *
@@ -146,16 +165,6 @@ struct btrfs_qgroup_list {
 	struct btrfs_qgroup *member;
 };
 
-static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg)
-{
-	return (u64)(uintptr_t)qg;
-}
-
-static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n)
-{
-	return (struct btrfs_qgroup *)(uintptr_t)n->aux;
-}
-
 static int
 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
 		   int init_flags);
@@ -180,34 +189,46 @@ static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
 	return NULL;
 }
 
-/* must be called with qgroup_lock held */
+/*
+ * Add qgroup to the filesystem's qgroup tree.
+ *
+ * Must be called with qgroup_lock held and @prealloc preallocated.
+ *
+ * The control on the lifespan of @prealloc would be transferred to this
+ * function, thus caller should no longer touch @prealloc.
+ */
 static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
+					  struct btrfs_qgroup *prealloc,
 					  u64 qgroupid)
 {
 	struct rb_node **p = &fs_info->qgroup_tree.rb_node;
 	struct rb_node *parent = NULL;
 	struct btrfs_qgroup *qgroup;
 
+	/* Caller must have pre-allocated @prealloc. */
+	ASSERT(prealloc);
+
 	while (*p) {
 		parent = *p;
 		qgroup = rb_entry(parent, struct btrfs_qgroup, node);
 
-		if (qgroup->qgroupid < qgroupid)
+		if (qgroup->qgroupid < qgroupid) {
 			p = &(*p)->rb_left;
-		else if (qgroup->qgroupid > qgroupid)
+		} else if (qgroup->qgroupid > qgroupid) {
 			p = &(*p)->rb_right;
-		else
+		} else {
+			kfree(prealloc);
 			return qgroup;
+		}
 	}
 
-	qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
-	if (!qgroup)
-		return ERR_PTR(-ENOMEM);
-
+	qgroup = prealloc;
 	qgroup->qgroupid = qgroupid;
 	INIT_LIST_HEAD(&qgroup->groups);
 	INIT_LIST_HEAD(&qgroup->members);
 	INIT_LIST_HEAD(&qgroup->dirty);
+	INIT_LIST_HEAD(&qgroup->iterator);
+	INIT_LIST_HEAD(&qgroup->nested_iterator);
 
 	rb_link_node(&qgroup->node, parent, p);
 	rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
@@ -254,27 +275,26 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
 /*
  * Add relation specified by two qgroups.
  *
- * Must be called with qgroup_lock held.
+ * Must be called with qgroup_lock held, the ownership of @prealloc is
+ * transferred to this function and caller should not touch it anymore.
  *
  * Return: 0        on success
  *         -ENOENT  if one of the qgroups is NULL
  *         <0       other errors
  */
-static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent)
+static int __add_relation_rb(struct btrfs_qgroup_list *prealloc,
+			     struct btrfs_qgroup *member,
+			     struct btrfs_qgroup *parent)
 {
-	struct btrfs_qgroup_list *list;
-
-	if (!member || !parent)
+	if (!member || !parent) {
+		kfree(prealloc);
 		return -ENOENT;
+	}
 
-	list = kzalloc(sizeof(*list), GFP_ATOMIC);
-	if (!list)
-		return -ENOMEM;
-
-	list->group = parent;
-	list->member = member;
-	list_add_tail(&list->next_group, &member->groups);
-	list_add_tail(&list->next_member, &parent->members);
+	prealloc->group = parent;
+	prealloc->member = member;
+	list_add_tail(&prealloc->next_group, &member->groups);
+	list_add_tail(&prealloc->next_member, &parent->members);
 
 	return 0;
 }
@@ -288,7 +308,9 @@ static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *p
  *         -ENOENT  if one of the ids does not exist
  *         <0       other errors
  */
-static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid)
+static int add_relation_rb(struct btrfs_fs_info *fs_info,
+			   struct btrfs_qgroup_list *prealloc,
+			   u64 memberid, u64 parentid)
 {
 	struct btrfs_qgroup *member;
 	struct btrfs_qgroup *parent;
@@ -296,7 +318,7 @@ static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 pare
 	member = find_qgroup_rb(fs_info, memberid);
 	parent = find_qgroup_rb(fs_info, parentid);
 
-	return __add_relation_rb(member, parent);
+	return __add_relation_rb(prealloc, member, parent);
 }
 
 /* Must be called with qgroup_lock held */
@@ -340,11 +362,22 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
 
 static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info)
 {
+	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+		return;
 	fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT |
 				  BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
 				  BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
 }
 
+static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info,
+				   struct extent_buffer *leaf, int slot,
+				   struct btrfs_qgroup_status_item *ptr)
+{
+	ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+	ASSERT(btrfs_item_size(leaf, slot) >= sizeof(*ptr));
+	fs_info->qgroup_enable_gen = btrfs_qgroup_status_enable_gen(leaf, ptr);
+}
+
 /*
  * The full config is read in one go, only called from open_ctree()
  * It doesn't use any locking, as at this point we're still single-threaded
@@ -361,7 +394,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
 	u64 flags = 0;
 	u64 rescan_progress = 0;
 
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+	if (!fs_info->quota_root)
 		return 0;
 
 	fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
@@ -411,14 +444,14 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
 				 "old qgroup version, quota disabled");
 				goto out;
 			}
-			if (btrfs_qgroup_status_generation(l, ptr) !=
-			    fs_info->generation) {
+			fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr);
+			if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) {
+				qgroup_read_enable_gen(fs_info, l, slot, ptr);
+			} else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) {
 				qgroup_mark_inconsistent(fs_info);
 				btrfs_err(fs_info,
 					"qgroup generation mismatch, marked as inconsistent");
 			}
-			fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
-									  ptr);
 			rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
 			goto next1;
 		}
@@ -434,11 +467,14 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
 			qgroup_mark_inconsistent(fs_info);
 		}
 		if (!qgroup) {
-			qgroup = add_qgroup_rb(fs_info, found_key.offset);
-			if (IS_ERR(qgroup)) {
-				ret = PTR_ERR(qgroup);
+			struct btrfs_qgroup *prealloc;
+
+			prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
+			if (!prealloc) {
+				ret = -ENOMEM;
 				goto out;
 			}
+			qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
 		}
 		ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
 		if (ret < 0)
@@ -489,6 +525,8 @@ next1:
 	if (ret)
 		goto out;
 	while (1) {
+		struct btrfs_qgroup_list *list = NULL;
+
 		slot = path->slots[0];
 		l = path->nodes[0];
 		btrfs_item_key_to_cpu(l, &found_key, slot);
@@ -502,8 +540,14 @@ next1:
 			goto next2;
 		}
 
-		ret = add_relation_rb(fs_info, found_key.objectid,
+		list = kzalloc(sizeof(*list), GFP_KERNEL);
+		if (!list) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ret = add_relation_rb(fs_info, list, found_key.objectid,
 				      found_key.offset);
+		list = NULL;
 		if (ret == -ENOENT) {
 			btrfs_warn(fs_info,
 				"orphan qgroup relation 0x%llx->0x%llx",
@@ -522,13 +566,12 @@ next2:
 out:
 	btrfs_free_path(path);
 	fs_info->qgroup_flags |= flags;
-	if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
-		clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
-	else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
-		 ret >= 0)
-		ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
-
-	if (ret < 0) {
+	if (ret >= 0) {
+		if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)
+			set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+		if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+			ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
+	} else {
 		ulist_free(fs_info->qgroup_ulist);
 		fs_info->qgroup_ulist = NULL;
 		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
@@ -550,7 +593,7 @@ bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
 	struct rb_node *node;
 	bool ret = false;
 
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
 		return ret;
 	/*
 	 * Since we're unmounting, there is no race and no need to grab qgroup
@@ -622,7 +665,7 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
 
 	ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
 
-	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
 
 	btrfs_free_path(path);
 	return ret;
@@ -700,7 +743,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
 	btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
 	btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
 
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 
 	btrfs_release_path(path);
 
@@ -719,7 +762,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
 	btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
 	btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
 
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 
 	ret = 0;
 out:
@@ -808,7 +851,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
 	btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
 	btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
 
-	btrfs_mark_buffer_dirty(l);
+	btrfs_mark_buffer_dirty(trans, l);
 
 out:
 	btrfs_free_path(path);
@@ -854,7 +897,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
 	btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
 	btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
 
-	btrfs_mark_buffer_dirty(l);
+	btrfs_mark_buffer_dirty(trans, l);
 
 out:
 	btrfs_free_path(path);
@@ -896,7 +939,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
 	btrfs_set_qgroup_status_rescan(l, ptr,
 				fs_info->qgroup_rescan_progress.objectid);
 
-	btrfs_mark_buffer_dirty(l);
+	btrfs_mark_buffer_dirty(trans, l);
 
 out:
 	btrfs_free_path(path);
@@ -949,7 +992,8 @@ out:
 	return ret;
 }
 
-int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
+int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
+		       struct btrfs_ioctl_quota_ctl_args *quota_ctl_args)
 {
 	struct btrfs_root *quota_root;
 	struct btrfs_root *tree_root = fs_info->tree_root;
@@ -959,8 +1003,10 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct btrfs_qgroup *qgroup = NULL;
+	struct btrfs_qgroup *prealloc = NULL;
 	struct btrfs_trans_handle *trans = NULL;
 	struct ulist *ulist = NULL;
+	const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA);
 	int ret = 0;
 	int slot;
 
@@ -1063,13 +1109,18 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
 				 struct btrfs_qgroup_status_item);
 	btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
 	btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
-	fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
-				BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+	fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON;
+	if (simple) {
+		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
+		btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid);
+	} else {
+		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+	}
 	btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags &
 				      BTRFS_QGROUP_STATUS_FLAGS_MASK);
 	btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
 
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 
 	key.objectid = 0;
 	key.type = BTRFS_ROOT_REF_KEY;
@@ -1094,6 +1145,15 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
 			/* Release locks on tree_root before we access quota_root */
 			btrfs_release_path(path);
 
+			/* We should not have a stray @prealloc pointer. */
+			ASSERT(prealloc == NULL);
+			prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
+			if (!prealloc) {
+				ret = -ENOMEM;
+				btrfs_abort_transaction(trans, ret);
+				goto out_free_path;
+			}
+
 			ret = add_qgroup_item(trans, quota_root,
 					      found_key.offset);
 			if (ret) {
@@ -1101,7 +1161,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
 				goto out_free_path;
 			}
 
-			qgroup = add_qgroup_rb(fs_info, found_key.offset);
+			qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
+			prealloc = NULL;
 			if (IS_ERR(qgroup)) {
 				ret = PTR_ERR(qgroup);
 				btrfs_abort_transaction(trans, ret);
@@ -1144,18 +1205,22 @@ out_add_root:
 		goto out_free_path;
 	}
 
-	qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
-	if (IS_ERR(qgroup)) {
-		ret = PTR_ERR(qgroup);
-		btrfs_abort_transaction(trans, ret);
+	ASSERT(prealloc == NULL);
+	prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
+	if (!prealloc) {
+		ret = -ENOMEM;
 		goto out_free_path;
 	}
+	qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID);
+	prealloc = NULL;
 	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
 	if (ret < 0) {
 		btrfs_abort_transaction(trans, ret);
 		goto out_free_path;
 	}
 
+	fs_info->qgroup_enable_gen = trans->transid;
+
 	mutex_unlock(&fs_info->qgroup_ioctl_lock);
 	/*
 	 * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
@@ -1180,8 +1245,14 @@ out_add_root:
 	spin_lock(&fs_info->qgroup_lock);
 	fs_info->quota_root = quota_root;
 	set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+	if (simple)
+		btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA);
 	spin_unlock(&fs_info->qgroup_lock);
 
+	/* Skip rescan for simple qgroups. */
+	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+		goto out_free_path;
+
 	ret = qgroup_rescan_init(fs_info, 0, 1);
 	if (!ret) {
 	        qgroup_rescan_zero_tracking(fs_info);
@@ -1222,6 +1293,39 @@ out:
 	else if (trans)
 		ret = btrfs_end_transaction(trans);
 	ulist_free(ulist);
+	kfree(prealloc);
+	return ret;
+}
+
+/*
+ * It is possible to have outstanding ordered extents which reserved bytes
+ * before we disabled. We need to fully flush delalloc, ordered extents, and a
+ * commit to ensure that we don't leak such reservations, only to have them
+ * come back if we re-enable.
+ *
+ * - enable simple quotas
+ * - reserve space
+ * - release it, store rsv_bytes in OE
+ * - disable quotas
+ * - enable simple quotas (qgroup rsv are all 0)
+ * - OE finishes
+ * - run delayed refs
+ * - free rsv_bytes, resulting in miscounting or even underflow
+ *
+ * Return 0 on success, or the error from the delalloc flush, join or commit.
+ */
+static int flush_reservations(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
+	if (ret)
+		return ret;
+	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+	trans = btrfs_join_transaction(fs_info->tree_root);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+	return btrfs_commit_transaction(trans);
+}
 
@@ -1269,6 +1373,10 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
 	clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
 	btrfs_qgroup_wait_for_completion(fs_info, false);
 
+	ret = flush_reservations(fs_info);
+	if (ret)
+		goto out_unlock_cleaner;
+
 	/*
 	 * 1 For the root item
 	 *
@@ -1295,6 +1403,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
 	quota_root = fs_info->quota_root;
 	fs_info->quota_root = NULL;
 	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
+	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
 	fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
 	spin_unlock(&fs_info->qgroup_lock);
 
@@ -1329,7 +1438,8 @@ out:
 	if (ret && trans)
 		btrfs_end_transaction(trans);
 	else if (trans)
-		ret = btrfs_end_transaction(trans);
+		ret = btrfs_commit_transaction(trans);
+out_unlock_cleaner:
 	mutex_unlock(&fs_info->cleaner_mutex);
 
 	return ret;
@@ -1342,6 +1452,24 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,
 		list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
 }
 
+/* Queue @qgroup at the tail of @head, at most once. */
+static void qgroup_iterator_add(struct list_head *head, struct btrfs_qgroup *qgroup)
+{
+	/* A non-empty iterator node means @qgroup is already queued on some list. */
+	if (list_empty(&qgroup->iterator))
+		list_add_tail(&qgroup->iterator, head);
+}
+
+/* Unlink every qgroup queued on @head, re-initializing each iterator node. */
+static void qgroup_iterator_clean(struct list_head *head)
+{
+	struct btrfs_qgroup *entry;
+	struct btrfs_qgroup *next;
+
+	list_for_each_entry_safe(entry, next, head, iterator)
+		list_del_init(&entry->iterator);
+}
+
 /*
  * The easy accounting, we're updating qgroup relationship whose child qgroup
  * only has exclusive extents.
@@ -1356,14 +1484,12 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,
  *
  * Caller should hold fs_info->qgroup_lock.
  */
-static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
-				    struct ulist *tmp, u64 ref_root,
+static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
 				    struct btrfs_qgroup *src, int sign)
 {
 	struct btrfs_qgroup *qgroup;
-	struct btrfs_qgroup_list *glist;
-	struct ulist_node *unode;
-	struct ulist_iterator uiter;
+	struct btrfs_qgroup *cur;
+	LIST_HEAD(qgroup_list);
 	u64 num_bytes = src->excl;
 	int ret = 0;
 
@@ -1371,53 +1497,30 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
 	if (!qgroup)
 		goto out;
 
-	qgroup->rfer += sign * num_bytes;
-	qgroup->rfer_cmpr += sign * num_bytes;
-
-	WARN_ON(sign < 0 && qgroup->excl < num_bytes);
-	qgroup->excl += sign * num_bytes;
-	qgroup->excl_cmpr += sign * num_bytes;
-
-	if (sign > 0)
-		qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
-	else
-		qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
-
-	qgroup_dirty(fs_info, qgroup);
-
-	/* Get all of the parent groups that contain this qgroup */
-	list_for_each_entry(glist, &qgroup->groups, next_group) {
-		ret = ulist_add(tmp, glist->group->qgroupid,
-				qgroup_to_aux(glist->group), GFP_ATOMIC);
-		if (ret < 0)
-			goto out;
-	}
+	qgroup_iterator_add(&qgroup_list, qgroup);
+	list_for_each_entry(cur, &qgroup_list, iterator) {
+		struct btrfs_qgroup_list *glist;
 
-	/* Iterate all of the parents and adjust their reference counts */
-	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(tmp, &uiter))) {
-		qgroup = unode_aux_to_qgroup(unode);
-		qgroup->rfer += sign * num_bytes;
-		qgroup->rfer_cmpr += sign * num_bytes;
-		WARN_ON(sign < 0 && qgroup->excl < num_bytes);
-		qgroup->excl += sign * num_bytes;
+		cur->rfer += sign * num_bytes;
+		cur->rfer_cmpr += sign * num_bytes;
+
+		WARN_ON(sign < 0 && cur->excl < num_bytes);
+		cur->excl += sign * num_bytes;
+		cur->excl_cmpr += sign * num_bytes;
+
 		if (sign > 0)
-			qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
+			qgroup_rsv_add_by_qgroup(fs_info, cur, src);
 		else
-			qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
-		qgroup->excl_cmpr += sign * num_bytes;
-		qgroup_dirty(fs_info, qgroup);
+			qgroup_rsv_release_by_qgroup(fs_info, cur, src);
+		qgroup_dirty(fs_info, cur);
 
-		/* Add any parents of the parents */
-		list_for_each_entry(glist, &qgroup->groups, next_group) {
-			ret = ulist_add(tmp, glist->group->qgroupid,
-					qgroup_to_aux(glist->group), GFP_ATOMIC);
-			if (ret < 0)
-				goto out;
-		}
+		/* Append parent qgroups to @qgroup_list. */
+		list_for_each_entry(glist, &cur->groups, next_group)
+			qgroup_iterator_add(&qgroup_list, glist->group);
 	}
 	ret = 0;
 out:
+	qgroup_iterator_clean(&qgroup_list);
 	return ret;
 }
 
@@ -1434,8 +1537,7 @@ out:
  * Return < 0 for other error.
  */
 static int quick_update_accounting(struct btrfs_fs_info *fs_info,
-				   struct ulist *tmp, u64 src, u64 dst,
-				   int sign)
+				   u64 src, u64 dst, int sign)
 {
 	struct btrfs_qgroup *qgroup;
 	int ret = 1;
@@ -1446,8 +1548,7 @@ static int quick_update_accounting(struct btrfs_fs_info *fs_info,
 		goto out;
 	if (qgroup->excl == qgroup->rfer) {
 		ret = 0;
-		err = __qgroup_excl_accounting(fs_info, tmp, dst,
-					       qgroup, sign);
+		err = __qgroup_excl_accounting(fs_info, dst, qgroup, sign);
 		if (err < 0) {
 			ret = err;
 			goto out;
@@ -1459,28 +1560,19 @@ out:
 	return ret;
 }
 
-int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
-			      u64 dst)
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_qgroup *parent;
 	struct btrfs_qgroup *member;
 	struct btrfs_qgroup_list *list;
-	struct ulist *tmp;
-	unsigned int nofs_flag;
+	struct btrfs_qgroup_list *prealloc = NULL;
 	int ret = 0;
 
 	/* Check the level of src and dst first */
 	if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
 		return -EINVAL;
 
-	/* We hold a transaction handle open, must do a NOFS allocation. */
-	nofs_flag = memalloc_nofs_save();
-	tmp = ulist_alloc(GFP_KERNEL);
-	memalloc_nofs_restore(nofs_flag);
-	if (!tmp)
-		return -ENOMEM;
-
 	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	if (!fs_info->quota_root) {
 		ret = -ENOTCONN;
@@ -1501,6 +1593,11 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
 		}
 	}
 
+	prealloc = kzalloc(sizeof(*list), GFP_NOFS);
+	if (!prealloc) {
+		ret = -ENOMEM;
+		goto out;
+	}
 	ret = add_qgroup_relation_item(trans, src, dst);
 	if (ret)
 		goto out;
@@ -1512,16 +1609,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
 	}
 
 	spin_lock(&fs_info->qgroup_lock);
-	ret = __add_relation_rb(member, parent);
+	ret = __add_relation_rb(prealloc, member, parent);
+	prealloc = NULL;
 	if (ret < 0) {
 		spin_unlock(&fs_info->qgroup_lock);
 		goto out;
 	}
-	ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
+	ret = quick_update_accounting(fs_info, src, dst, 1);
 	spin_unlock(&fs_info->qgroup_lock);
 out:
+	kfree(prealloc);
 	mutex_unlock(&fs_info->qgroup_ioctl_lock);
-	ulist_free(tmp);
 	return ret;
 }
 
@@ -1532,19 +1630,10 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
 	struct btrfs_qgroup *parent;
 	struct btrfs_qgroup *member;
 	struct btrfs_qgroup_list *list;
-	struct ulist *tmp;
 	bool found = false;
-	unsigned int nofs_flag;
 	int ret = 0;
 	int ret2;
 
-	/* We hold a transaction handle open, must do a NOFS allocation. */
-	nofs_flag = memalloc_nofs_save();
-	tmp = ulist_alloc(GFP_KERNEL);
-	memalloc_nofs_restore(nofs_flag);
-	if (!tmp)
-		return -ENOMEM;
-
 	if (!fs_info->quota_root) {
 		ret = -ENOTCONN;
 		goto out;
@@ -1582,11 +1671,10 @@ delete_item:
 	if (found) {
 		spin_lock(&fs_info->qgroup_lock);
 		del_relation_rb(fs_info, src, dst);
-		ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
+		ret = quick_update_accounting(fs_info, src, dst, -1);
 		spin_unlock(&fs_info->qgroup_lock);
 	}
 out:
-	ulist_free(tmp);
 	return ret;
 }
 
@@ -1608,8 +1696,12 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_root *quota_root;
 	struct btrfs_qgroup *qgroup;
+	struct btrfs_qgroup *prealloc = NULL;
 	int ret = 0;
 
+	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
+		return 0;
+
 	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	if (!fs_info->quota_root) {
 		ret = -ENOTCONN;
@@ -1622,21 +1714,25 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
 		goto out;
 	}
 
+	prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
+	if (!prealloc) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
 	ret = add_qgroup_item(trans, quota_root, qgroupid);
 	if (ret)
 		goto out;
 
 	spin_lock(&fs_info->qgroup_lock);
-	qgroup = add_qgroup_rb(fs_info, qgroupid);
+	qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid);
 	spin_unlock(&fs_info->qgroup_lock);
+	prealloc = NULL;
 
-	if (IS_ERR(qgroup)) {
-		ret = PTR_ERR(qgroup);
-		goto out;
-	}
 	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
 out:
 	mutex_unlock(&fs_info->qgroup_ioctl_lock);
+	kfree(prealloc);
 	return ret;
 }
 
@@ -1771,6 +1867,17 @@ out:
 	return ret;
 }
 
+/*
+ * Inform qgroup to trace one dirty extent, whose info is recorded in @record,
+ * so that qgroup can account for it at transaction commit time.
+ *
+ * No-lock version: the caller must hold the delayed ref lock, have allocated
+ * @record, and call btrfs_qgroup_trace_extent_post() after leaving the lock.
+ *
+ * Return 0 for a successful insert.
+ * Return >0 for an existing record; the caller can free @record safely.
+ * Errors are not possible.
+ */
 int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
 				struct btrfs_delayed_ref_root *delayed_refs,
 				struct btrfs_qgroup_extent_record *record)
@@ -1780,6 +1887,9 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
 	struct btrfs_qgroup_extent_record *entry;
 	u64 bytenr = record->bytenr;
 
+	if (!btrfs_qgroup_full_accounting(fs_info))
+		return 0;
+
 	lockdep_assert_held(&delayed_refs->lock);
 	trace_btrfs_qgroup_trace_extent(fs_info, record);
 
@@ -1806,12 +1916,35 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
 	return 0;
 }
 
+/*
+ * Post handler after qgroup_trace_extent_nolock().
+ *
+ * NOTE: Current qgroup does the expensive backref walk at transaction
+ * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming
+ * new transaction.
+ * This is designed to allow btrfs_find_all_roots() to get correct new_roots
+ * result.
+ *
+ * However for old_roots there is no need to do backref walk at that time,
+ * since we search commit roots to walk backref and result will always be
+ * correct.
+ *
+ * Due to the nature of no lock version, we can't do backref there.
+ * So we must call btrfs_qgroup_trace_extent_post() after exiting
+ * spinlock context.
+ *
+ * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result
+ * using current root, then we can move all expensive backref walk out of
+ * transaction committing, but not now as qgroup accounting will be wrong again.
+ */
 int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
 				   struct btrfs_qgroup_extent_record *qrecord)
 {
 	struct btrfs_backref_walk_ctx ctx = { 0 };
 	int ret;
 
+	if (!btrfs_qgroup_full_accounting(trans->fs_info))
+		return 0;
 	/*
 	 * We are always called in a context where we are already holding a
 	 * transaction handle. Often we are called when adding a data delayed
@@ -1859,6 +1992,19 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+/*
+ * Inform qgroup to trace one dirty extent, specified by @bytenr and
+ * @num_bytes.
+ * So qgroup can account it at commit trans time.
+ *
+ * Better encapsulated version, with memory allocation and backref walk for
+ * commit roots.
+ * So this can sleep.
+ *
+ * Return 0 if the operation is done.
+ * Return <0 for error, like memory allocation failure or invalid parameter
+ * (NULL trans)
+ */
 int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
 			      u64 num_bytes)
 {
@@ -1867,8 +2013,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
 	struct btrfs_delayed_ref_root *delayed_refs;
 	int ret;
 
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
-	    || bytenr == 0 || num_bytes == 0)
+	if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0)
 		return 0;
 	record = kzalloc(sizeof(*record), GFP_NOFS);
 	if (!record)
@@ -1889,6 +2034,12 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
 	return btrfs_qgroup_trace_extent_post(trans, record);
 }
 
+/*
+ * Inform qgroup to trace all leaf items of data
+ *
+ * Return 0 for success
+ * Return <0 for error(ENOMEM)
+ */
 int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
 				  struct extent_buffer *eb)
 {
@@ -1900,7 +2051,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
 	u64 bytenr, num_bytes;
 
 	/* We can be called directly from walk_up_proc() */
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+	if (!btrfs_qgroup_full_accounting(fs_info))
 		return 0;
 
 	for (i = 0; i < nr; i++) {
@@ -2276,7 +2427,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
 	int level;
 	int ret;
 
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+	if (!btrfs_qgroup_full_accounting(fs_info))
 		return 0;
 
 	/* Wrong parameter order */
@@ -2319,6 +2470,16 @@ out:
 	return ret;
 }
 
+/*
+ * Inform qgroup to trace a whole subtree, including all its child tree
+ * blocks and data.
+ * The root tree block is specified by @root_eb.
+ *
+ * Normally used by relocation(tree block swap) and subvolume deletion.
+ *
+ * Return 0 for success
+ * Return <0 for error(ENOMEM or tree search error)
+ */
 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
 			       struct extent_buffer *root_eb,
 			       u64 root_gen, int root_level)
@@ -2333,7 +2494,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
 	BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
 	BUG_ON(root_eb == NULL);
 
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+	if (!btrfs_qgroup_full_accounting(fs_info))
 		return 0;
 
 	spin_lock(&fs_info->qgroup_lock);
@@ -2445,62 +2606,64 @@ out:
 	return ret;
 }
 
+/* Queue @qgroup at the tail of @head via its nested_iterator node, at most once. */
+static void qgroup_iterator_nested_add(struct list_head *head, struct btrfs_qgroup *qgroup)
+{
+	/* A non-empty nested_iterator node means @qgroup is already queued. */
+	if (list_empty(&qgroup->nested_iterator))
+		list_add_tail(&qgroup->nested_iterator, head);
+}
+
+/* Unlink every qgroup queued on @head, re-initializing each nested_iterator. */
+static void qgroup_iterator_nested_clean(struct list_head *head)
+{
+	struct btrfs_qgroup *entry;
+	struct btrfs_qgroup *next;
+
+	list_for_each_entry_safe(entry, next, head, nested_iterator)
+		list_del_init(&entry->nested_iterator);
+}
+
 #define UPDATE_NEW	0
 #define UPDATE_OLD	1
 /*
  * Walk all of the roots that points to the bytenr and adjust their refcnts.
  */
-static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
-				struct ulist *roots, struct ulist *tmp,
-				struct ulist *qgroups, u64 seq, int update_old)
+static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
+				 struct ulist *roots, struct list_head *qgroups,
+				 u64 seq, int update_old)
 {
 	struct ulist_node *unode;
 	struct ulist_iterator uiter;
-	struct ulist_node *tmp_unode;
-	struct ulist_iterator tmp_uiter;
 	struct btrfs_qgroup *qg;
-	int ret = 0;
 
 	if (!roots)
-		return 0;
+		return;
 	ULIST_ITER_INIT(&uiter);
 	while ((unode = ulist_next(roots, &uiter))) {
+		LIST_HEAD(tmp);
+
 		qg = find_qgroup_rb(fs_info, unode->val);
 		if (!qg)
 			continue;
 
-		ulist_reinit(tmp);
-		ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg),
-				GFP_ATOMIC);
-		if (ret < 0)
-			return ret;
-		ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC);
-		if (ret < 0)
-			return ret;
-		ULIST_ITER_INIT(&tmp_uiter);
-		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
+		qgroup_iterator_nested_add(qgroups, qg);
+		qgroup_iterator_add(&tmp, qg);
+		list_for_each_entry(qg, &tmp, iterator) {
 			struct btrfs_qgroup_list *glist;
 
-			qg = unode_aux_to_qgroup(tmp_unode);
 			if (update_old)
 				btrfs_qgroup_update_old_refcnt(qg, seq, 1);
 			else
 				btrfs_qgroup_update_new_refcnt(qg, seq, 1);
+
 			list_for_each_entry(glist, &qg->groups, next_group) {
-				ret = ulist_add(qgroups, glist->group->qgroupid,
-						qgroup_to_aux(glist->group),
-						GFP_ATOMIC);
-				if (ret < 0)
-					return ret;
-				ret = ulist_add(tmp, glist->group->qgroupid,
-						qgroup_to_aux(glist->group),
-						GFP_ATOMIC);
-				if (ret < 0)
-					return ret;
+				qgroup_iterator_nested_add(qgroups, glist->group);
+				qgroup_iterator_add(&tmp, glist->group);
 			}
 		}
+		qgroup_iterator_clean(&tmp);
 	}
-	return 0;
 }
 
 /*
@@ -2539,22 +2702,16 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
  * But this time we don't need to consider other things, the codes and logic
  * is easy to understand now.
  */
-static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
-				  struct ulist *qgroups,
-				  u64 nr_old_roots,
-				  u64 nr_new_roots,
-				  u64 num_bytes, u64 seq)
+static void qgroup_update_counters(struct btrfs_fs_info *fs_info,
+				   struct list_head *qgroups, u64 nr_old_roots,
+				   u64 nr_new_roots, u64 num_bytes, u64 seq)
 {
-	struct ulist_node *unode;
-	struct ulist_iterator uiter;
 	struct btrfs_qgroup *qg;
-	u64 cur_new_count, cur_old_count;
 
-	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(qgroups, &uiter))) {
+	list_for_each_entry(qg, qgroups, nested_iterator) {
+		u64 cur_new_count, cur_old_count;
 		bool dirty = false;
 
-		qg = unode_aux_to_qgroup(unode);
 		cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
 		cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
 
@@ -2625,7 +2782,6 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
 		if (dirty)
 			qgroup_dirty(fs_info, qg);
 	}
-	return 0;
 }
 
 /*
@@ -2662,8 +2818,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
 				struct ulist *new_roots)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct ulist *qgroups = NULL;
-	struct ulist *tmp = NULL;
+	LIST_HEAD(qgroups);
 	u64 seq;
 	u64 nr_new_roots = 0;
 	u64 nr_old_roots = 0;
@@ -2673,7 +2828,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
 	 * If quotas get disabled meanwhile, the resources need to be freed and
 	 * we can't just exit here.
 	 */
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+	if (!btrfs_qgroup_full_accounting(fs_info) ||
 	    fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
 		goto out_free;
 
@@ -2697,17 +2852,6 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
 	trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
 					num_bytes, nr_old_roots, nr_new_roots);
 
-	qgroups = ulist_alloc(GFP_NOFS);
-	if (!qgroups) {
-		ret = -ENOMEM;
-		goto out_free;
-	}
-	tmp = ulist_alloc(GFP_NOFS);
-	if (!tmp) {
-		ret = -ENOMEM;
-		goto out_free;
-	}
-
 	mutex_lock(&fs_info->qgroup_rescan_lock);
 	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
 		if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
@@ -2722,29 +2866,21 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
 	seq = fs_info->qgroup_seq;
 
 	/* Update old refcnts using old_roots */
-	ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
-				   UPDATE_OLD);
-	if (ret < 0)
-		goto out;
+	qgroup_update_refcnt(fs_info, old_roots, &qgroups, seq, UPDATE_OLD);
 
 	/* Update new refcnts using new_roots */
-	ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
-				   UPDATE_NEW);
-	if (ret < 0)
-		goto out;
+	qgroup_update_refcnt(fs_info, new_roots, &qgroups, seq, UPDATE_NEW);
 
-	qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
+	qgroup_update_counters(fs_info, &qgroups, nr_old_roots, nr_new_roots,
 			       num_bytes, seq);
 
 	/*
 	 * Bump qgroup_seq to avoid seq overlap
 	 */
 	fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
-out:
 	spin_unlock(&fs_info->qgroup_lock);
 out_free:
-	ulist_free(tmp);
-	ulist_free(qgroups);
+	qgroup_iterator_nested_clean(&qgroups);
 	ulist_free(old_roots);
 	ulist_free(new_roots);
 	return ret;
@@ -2761,6 +2897,9 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
 	u64 qgroup_to_skip;
 	int ret = 0;
 
+	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+		return 0;
+
 	delayed_refs = &trans->transaction->delayed_refs;
 	qgroup_to_skip = delayed_refs->qgroup_to_skip;
 	while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
@@ -2876,7 +3015,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
 			qgroup_mark_inconsistent(fs_info);
 		spin_lock(&fs_info->qgroup_lock);
 	}
-	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+	if (btrfs_qgroup_enabled(fs_info))
 		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
 	else
 		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
@@ -2889,6 +3028,47 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
 	return ret;
 }
 
+/*
+ * Build an inherit struct listing every parent qgroup of @inode_rootid's
+ * qgroup. Returns 0 (leaving *@inherit NULL if no parents) or a negative errno.
+ */
+static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info, u64 inode_rootid,
+			       struct btrfs_qgroup_inherit **inherit)
+{
+	int i = 0;
+	u64 num_qgroups = 0;
+	struct btrfs_qgroup *inode_qg;
+	struct btrfs_qgroup_list *qg_list;
+	struct btrfs_qgroup_inherit *res;
+	size_t struct_sz;
+
+	if (*inherit)
+		return -EEXIST;
+
+	inode_qg = find_qgroup_rb(fs_info, inode_rootid);
+	if (!inode_qg)
+		return -ENOENT;
+
+	num_qgroups = list_count_nodes(&inode_qg->groups);
+	if (!num_qgroups)
+		return 0;
+
+	struct_sz = struct_size(res, qgroups, num_qgroups);
+	if (struct_sz == SIZE_MAX)
+		return -ERANGE;
+
+	res = kzalloc(struct_sz, GFP_NOFS);
+	if (!res)
+		return -ENOMEM;
+	res->num_qgroups = num_qgroups;
+	/* Advance the slot for each parent so no id overwrites another. */
+	list_for_each_entry(qg_list, &inode_qg->groups, next_group)
+		res->qgroups[i++] = qg_list->group->qgroupid;
+
+	*inherit = res;
+	return 0;
+}
+
 /*
  * Copy the accounting information between qgroups. This is necessary
  * when a snapshot or a subvolume is created. Throwing an error will
@@ -2896,7 +3076,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
  * when a readonly fs is a reasonable outcome.
  */
 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
-			 u64 objectid, struct btrfs_qgroup_inherit *inherit)
+			 u64 objectid, u64 inode_rootid,
+			 struct btrfs_qgroup_inherit *inherit)
 {
 	int ret = 0;
 	int i;
@@ -2906,10 +3087,17 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
 	struct btrfs_root *quota_root;
 	struct btrfs_qgroup *srcgroup;
 	struct btrfs_qgroup *dstgroup;
+	struct btrfs_qgroup *prealloc;
+	struct btrfs_qgroup_list **qlist_prealloc = NULL;
+	bool free_inherit = false;
 	bool need_rescan = false;
 	u32 level_size = 0;
 	u64 nums;
 
+	prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
+	if (!prealloc)
+		return -ENOMEM;
+
 	/*
 	 * There are only two callers of this function.
 	 *
@@ -2929,7 +3117,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
 
 	if (!committing)
 		mutex_lock(&fs_info->qgroup_ioctl_lock);
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+	if (!btrfs_qgroup_enabled(fs_info))
 		goto out;
 
 	quota_root = fs_info->quota_root;
@@ -2938,6 +3126,13 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
 		goto out;
 	}
 
+	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && !inherit) {
+		ret = qgroup_auto_inherit(fs_info, inode_rootid, &inherit);
+		if (ret)
+			goto out;
+		free_inherit = true;
+	}
+
 	if (inherit) {
 		i_qgroups = (u64 *)(inherit + 1);
 		nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
@@ -2982,16 +3177,28 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
 				goto out;
 		}
 		ret = 0;
-	}
 
+		qlist_prealloc = kcalloc(inherit->num_qgroups,
+					 sizeof(struct btrfs_qgroup_list *),
+					 GFP_NOFS);
+		if (!qlist_prealloc) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		for (int i = 0; i < inherit->num_qgroups; i++) {
+			qlist_prealloc[i] = kzalloc(sizeof(struct btrfs_qgroup_list),
+						    GFP_NOFS);
+			if (!qlist_prealloc[i]) {
+				ret = -ENOMEM;
+				goto out;
+			}
+		}
+	}
 
 	spin_lock(&fs_info->qgroup_lock);
 
-	dstgroup = add_qgroup_rb(fs_info, objectid);
-	if (IS_ERR(dstgroup)) {
-		ret = PTR_ERR(dstgroup);
-		goto unlock;
-	}
+	dstgroup = add_qgroup_rb(fs_info, prealloc, objectid);
+	prealloc = NULL;
 
 	if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
 		dstgroup->lim_flags = inherit->lim.flags;
@@ -3003,7 +3210,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
 		qgroup_dirty(fs_info, dstgroup);
 	}
 
-	if (srcid) {
+	if (srcid && btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) {
 		srcgroup = find_qgroup_rb(fs_info, srcid);
 		if (!srcgroup)
 			goto unlock;
@@ -3038,7 +3245,9 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
 	i_qgroups = (u64 *)(inherit + 1);
 	for (i = 0; i < inherit->num_qgroups; ++i) {
 		if (*i_qgroups) {
-			ret = add_relation_rb(fs_info, objectid, *i_qgroups);
+			ret = add_relation_rb(fs_info, qlist_prealloc[i], objectid,
+					      *i_qgroups);
+			qlist_prealloc[i] = NULL;
 			if (ret)
 				goto unlock;
 		}
@@ -3102,6 +3311,14 @@ out:
 		mutex_unlock(&fs_info->qgroup_ioctl_lock);
 	if (need_rescan)
 		qgroup_mark_inconsistent(fs_info);
+	if (qlist_prealloc) {
+		for (int i = 0; i < inherit->num_qgroups; i++)
+			kfree(qlist_prealloc[i]);
+		kfree(qlist_prealloc);
+	}
+	if (free_inherit)
+		kfree(inherit);
+	kfree(prealloc);
 	return ret;
 }
 
@@ -3125,8 +3342,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	u64 ref_root = root->root_key.objectid;
 	int ret = 0;
-	struct ulist_node *unode;
-	struct ulist_iterator uiter;
+	LIST_HEAD(qgroup_list);
 
 	if (!is_fstree(ref_root))
 		return 0;
@@ -3146,49 +3362,28 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
 	if (!qgroup)
 		goto out;
 
-	/*
-	 * in a first step, we check all affected qgroups if any limits would
-	 * be exceeded
-	 */
-	ulist_reinit(fs_info->qgroup_ulist);
-	ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
-			qgroup_to_aux(qgroup), GFP_ATOMIC);
-	if (ret < 0)
-		goto out;
-	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
-		struct btrfs_qgroup *qg;
+	qgroup_iterator_add(&qgroup_list, qgroup);
+	list_for_each_entry(qgroup, &qgroup_list, iterator) {
 		struct btrfs_qgroup_list *glist;
 
-		qg = unode_aux_to_qgroup(unode);
-
-		if (enforce && !qgroup_check_limits(qg, num_bytes)) {
+		if (enforce && !qgroup_check_limits(qgroup, num_bytes)) {
 			ret = -EDQUOT;
 			goto out;
 		}
 
-		list_for_each_entry(glist, &qg->groups, next_group) {
-			ret = ulist_add(fs_info->qgroup_ulist,
-					glist->group->qgroupid,
-					qgroup_to_aux(glist->group), GFP_ATOMIC);
-			if (ret < 0)
-				goto out;
-		}
+		list_for_each_entry(glist, &qgroup->groups, next_group)
+			qgroup_iterator_add(&qgroup_list, glist->group);
 	}
+
 	ret = 0;
 	/*
 	 * no limits exceeded, now record the reservation into all qgroups
 	 */
-	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
-		struct btrfs_qgroup *qg;
-
-		qg = unode_aux_to_qgroup(unode);
-
-		qgroup_rsv_add(fs_info, qg, num_bytes, type);
-	}
+	list_for_each_entry(qgroup, &qgroup_list, iterator)
+		qgroup_rsv_add(fs_info, qgroup, num_bytes, type);
 
 out:
+	qgroup_iterator_clean(&qgroup_list);
 	spin_unlock(&fs_info->qgroup_lock);
 	return ret;
 }
@@ -3207,9 +3402,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
 			       enum btrfs_qgroup_rsv_type type)
 {
 	struct btrfs_qgroup *qgroup;
-	struct ulist_node *unode;
-	struct ulist_iterator uiter;
-	int ret = 0;
+	LIST_HEAD(qgroup_list);
 
 	if (!is_fstree(ref_root))
 		return;
@@ -3237,30 +3430,17 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
 		 */
 		num_bytes = qgroup->rsv.values[type];
 
-	ulist_reinit(fs_info->qgroup_ulist);
-	ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
-			qgroup_to_aux(qgroup), GFP_ATOMIC);
-	if (ret < 0)
-		goto out;
-	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
-		struct btrfs_qgroup *qg;
+	qgroup_iterator_add(&qgroup_list, qgroup);
+	list_for_each_entry(qgroup, &qgroup_list, iterator) {
 		struct btrfs_qgroup_list *glist;
 
-		qg = unode_aux_to_qgroup(unode);
-
-		qgroup_rsv_release(fs_info, qg, num_bytes, type);
-
-		list_for_each_entry(glist, &qg->groups, next_group) {
-			ret = ulist_add(fs_info->qgroup_ulist,
-					glist->group->qgroupid,
-					qgroup_to_aux(glist->group), GFP_ATOMIC);
-			if (ret < 0)
-				goto out;
+		qgroup_rsv_release(fs_info, qgroup, num_bytes, type);
+		list_for_each_entry(glist, &qgroup->groups, next_group) {
+			qgroup_iterator_add(&qgroup_list, glist->group);
 		}
 	}
-
 out:
+	qgroup_iterator_clean(&qgroup_list);
 	spin_unlock(&fs_info->qgroup_lock);
 }
 
@@ -3295,6 +3475,9 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
 	int slot;
 	int ret;
 
+	if (!btrfs_qgroup_full_accounting(fs_info))
+		return 1;
+
 	mutex_lock(&fs_info->qgroup_rescan_lock);
 	extent_root = btrfs_extent_root(fs_info,
 				fs_info->qgroup_rescan_progress.objectid);
@@ -3375,10 +3558,15 @@ out:
 
 static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
 {
-	return btrfs_fs_closing(fs_info) ||
-		test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
-		!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
-			  fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
+	if (btrfs_fs_closing(fs_info))
+		return true;
+	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
+		return true;
+	if (!btrfs_qgroup_enabled(fs_info))
+		return true;
+	if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
+		return true;
+	return false;
 }
 
 static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
@@ -3392,6 +3580,9 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 	bool stopped = false;
 	bool did_leaf_rescans = false;
 
+	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+		return;
+
 	path = btrfs_alloc_path();
 	if (!path)
 		goto out;
@@ -3495,6 +3686,11 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
 {
 	int ret = 0;
 
+	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) {
+		btrfs_warn(fs_info, "qgroup rescan init failed, running in simple mode");
+		return -EINVAL;
+	}
+
 	if (!init_flags) {
 		/* we're resuming qgroup rescan at mount time */
 		if (!(fs_info->qgroup_flags &
@@ -3525,7 +3721,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
 			btrfs_warn(fs_info,
 			"qgroup rescan init failed, qgroup is not enabled");
 			ret = -EINVAL;
-		} else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+		} else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
 			/* Quota disable is in progress */
 			ret = -EBUSY;
 		}
@@ -3546,7 +3742,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
 	mutex_unlock(&fs_info->qgroup_rescan_lock);
 
 	btrfs_init_work(&fs_info->qgroup_rescan_work,
-			btrfs_qgroup_rescan_worker, NULL, NULL);
+			btrfs_qgroup_rescan_worker, NULL);
 	return 0;
 }
 
@@ -3784,7 +3980,7 @@ static int qgroup_reserve_data(struct btrfs_inode *inode,
 	u64 to_reserve;
 	int ret;
 
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
+	if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
 	    !is_fstree(root->root_key.objectid) || len == 0)
 		return 0;
 
@@ -3916,8 +4112,12 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
 	int trace_op = QGROUP_RELEASE;
 	int ret;
 
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags))
-		return 0;
+	if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
+		extent_changeset_init(&changeset);
+		return clear_record_extent_bits(&inode->io_tree, start,
+						start + len - 1,
+						EXTENT_QGROUP_RESERVED, &changeset);
+	}
 
 	/* In release case, we shouldn't have @reserved */
 	WARN_ON(!free && reserved);
@@ -4027,7 +4227,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
 
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
 	    !is_fstree(root->root_key.objectid) || num_bytes == 0)
 		return 0;
 
@@ -4064,11 +4264,15 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
 	return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
 }
 
+/*
+ * Per-transaction meta reservation should be all freed at transaction commit
+ * time
+ */
 void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
 	    !is_fstree(root->root_key.objectid))
 		return;
 
@@ -4084,7 +4288,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
 	    !is_fstree(root->root_key.objectid))
 		return;
 
@@ -4104,9 +4308,7 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
 				int num_bytes)
 {
 	struct btrfs_qgroup *qgroup;
-	struct ulist_node *unode;
-	struct ulist_iterator uiter;
-	int ret = 0;
+	LIST_HEAD(qgroup_list);
 
 	if (num_bytes == 0)
 		return;
@@ -4117,39 +4319,35 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
 	qgroup = find_qgroup_rb(fs_info, ref_root);
 	if (!qgroup)
 		goto out;
-	ulist_reinit(fs_info->qgroup_ulist);
-	ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
-		       qgroup_to_aux(qgroup), GFP_ATOMIC);
-	if (ret < 0)
-		goto out;
-	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
-		struct btrfs_qgroup *qg;
-		struct btrfs_qgroup_list *glist;
 
-		qg = unode_aux_to_qgroup(unode);
+	qgroup_iterator_add(&qgroup_list, qgroup);
+	list_for_each_entry(qgroup, &qgroup_list, iterator) {
+		struct btrfs_qgroup_list *glist;
 
-		qgroup_rsv_release(fs_info, qg, num_bytes,
+		qgroup_rsv_release(fs_info, qgroup, num_bytes,
 				BTRFS_QGROUP_RSV_META_PREALLOC);
-		qgroup_rsv_add(fs_info, qg, num_bytes,
+		qgroup_rsv_add(fs_info, qgroup, num_bytes,
 				BTRFS_QGROUP_RSV_META_PERTRANS);
-		list_for_each_entry(glist, &qg->groups, next_group) {
-			ret = ulist_add(fs_info->qgroup_ulist,
-					glist->group->qgroupid,
-					qgroup_to_aux(glist->group), GFP_ATOMIC);
-			if (ret < 0)
-				goto out;
-		}
+
+		list_for_each_entry(glist, &qgroup->groups, next_group)
+			qgroup_iterator_add(&qgroup_list, glist->group);
 	}
 out:
+	qgroup_iterator_clean(&qgroup_list);
 	spin_unlock(&fs_info->qgroup_lock);
 }
 
+/*
+ * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS.
+ *
+ * This is called when preallocated meta reservation needs to be used.
+ * Normally after btrfs_join_transaction() call.
+ */
 void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
 	    !is_fstree(root->root_key.objectid))
 		return;
 	/* Same as btrfs_qgroup_free_meta_prealloc() */
@@ -4257,7 +4455,7 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
 	int level = btrfs_header_level(subvol_parent) - 1;
 	int ret = 0;
 
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+	if (!btrfs_qgroup_full_accounting(fs_info))
 		return 0;
 
 	if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
@@ -4367,7 +4565,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
 	int ret = 0;
 	int i;
 
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+	if (!btrfs_qgroup_full_accounting(fs_info))
 		return 0;
 	if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
 		return 0;
@@ -4450,3 +4648,59 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
 	}
 	*root = RB_ROOT;
 }
+
+/*
+ * Account one simple quota delta against the qgroup of @delta->root and every
+ * qgroup reachable from it through the ->groups lists.
+ *
+ * Nothing is done unless qgroups run in simple mode, @delta->root is an fs
+ * tree and the extent generation is not older than qgroup_enable_gen.
+ *
+ * Return 0 on success or when the delta is not counted, -ENOENT if no qgroup
+ * exists for @delta->root.
+ */
+int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
+			      struct btrfs_squota_delta *delta)
+{
+	const u64 root = delta->root;
+	const u64 num_bytes = delta->num_bytes;
+	const int sign = delta->is_inc ? 1 : -1;
+	struct btrfs_qgroup *qgroup;
+	struct btrfs_qgroup *cur;
+	LIST_HEAD(qgroup_list);
+
+	if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
+		return 0;
+	if (!is_fstree(root))
+		return 0;
+	/* If the extent predates enabling quotas, don't count it. */
+	if (delta->generation < fs_info->qgroup_enable_gen)
+		return 0;
+
+	spin_lock(&fs_info->qgroup_lock);
+	qgroup = find_qgroup_rb(fs_info, root);
+	if (!qgroup) {
+		spin_unlock(&fs_info->qgroup_lock);
+		return -ENOENT;
+	}
+
+	qgroup_iterator_add(&qgroup_list, qgroup);
+	list_for_each_entry(cur, &qgroup_list, iterator) {
+		struct btrfs_qgroup_list *glist;
+
+		cur->excl += num_bytes * sign;
+		cur->rfer += num_bytes * sign;
+		qgroup_dirty(fs_info, cur);
+
+		list_for_each_entry(glist, &cur->groups, next_group)
+			qgroup_iterator_add(&qgroup_list, glist->group);
+	}
+	qgroup_iterator_clean(&qgroup_list);
+	spin_unlock(&fs_info->qgroup_lock);
+
+	/* The delta was recorded: free the data reservation it carried. */
+	if (delta->rsv_bytes)
+		btrfs_qgroup_free_refroot(fs_info, root, delta->rsv_bytes,
+					  BTRFS_QGROUP_RSV_DATA);
+	return 0;
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 7bffa10589d6..855a4f978761 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -101,8 +101,15 @@
  *     subtree rescan for them.
  */
 
-#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN		(1UL << 3)
-#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING		(1UL << 4)
+/*
+ * These flags share the flags field of the btrfs_qgroup_status_item with the
+ * persisted flags defined in btrfs_tree.h.
+ *
+ * To minimize the chance of collision with new persisted status flags, these
+ * count backwards from the MSB.
+ */
+#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN		(1ULL << 63)
+#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING		(1ULL << 62)
 
 /*
  * Record a dirty extent, and info qgroup to update quota on it
@@ -220,6 +227,33 @@ struct btrfs_qgroup {
 	struct list_head groups;  /* groups this group is member of */
 	struct list_head members; /* groups that are members of this group */
 	struct list_head dirty;   /* dirty groups */
+
+	/*
+	 * For qgroup iteration usage.
+	 *
+	 * The iteration list should always be empty until qgroup_iterator_add()
+	 * is called.  And should be reset to empty after the iteration is
+	 * finished.
+	 */
+	struct list_head iterator;
+
+	/*
+	 * For nested iterator usage.
+	 *
+	 * Here we support at most one level of nested iterator calls like:
+	 *
+	 *	LIST_HEAD(all_qgroups);
+	 *	{
+	 *		LIST_HEAD(local_qgroups);
+	 *		qgroup_iterator_add(local_qgroups, qg);
+	 *		qgroup_iterator_nested_add(all_qgroups, qg);
+	 *		do_some_work(local_qgroups);
+	 *		qgroup_iterator_clean(local_qgroups);
+	 *	}
+	 *	do_some_work(all_qgroups);
+	 *	qgroup_iterator_nested_clean(all_qgroups);
+	 */
+	struct list_head nested_iterator;
 	struct rb_node node;	  /* tree of qgroups */
 
 	/*
@@ -235,6 +269,21 @@ struct btrfs_qgroup {
 	struct kobject kobj;
 };
 
+struct btrfs_squota_delta {
+	/* The fstree root this delta counts against. */
+	u64 root;
+	/* The number of bytes in the extent being counted. */
+	u64 num_bytes;
+	/* The number of bytes reserved for this extent. */
+	u64 rsv_bytes;
+	/* The generation the extent was created in. */
+	u64 generation;
+	/* Whether we are using or freeing the extent. */
+	bool is_inc;
+	/* Whether the extent is data or metadata. */
+	bool is_data;
+};
+
 static inline u64 btrfs_qgroup_subvolid(u64 qgroupid)
 {
 	return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1));
@@ -249,14 +298,23 @@ enum {
 	ENUM_BIT(QGROUP_FREE),
 };
 
-int btrfs_quota_enable(struct btrfs_fs_info *fs_info);
+enum btrfs_qgroup_mode {
+	BTRFS_QGROUP_MODE_DISABLED,
+	BTRFS_QGROUP_MODE_FULL,
+	BTRFS_QGROUP_MODE_SIMPLE
+};
+
+enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info);
+bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info);
+bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info);
+int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
+		       struct btrfs_ioctl_quota_ctl_args *quota_ctl_args);
 int btrfs_quota_disable(struct btrfs_fs_info *fs_info);
 int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
 void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
 int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
 				     bool interruptible);
-int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
-			      u64 dst);
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst);
 int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
 			      u64 dst);
 int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
@@ -267,80 +325,16 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
 void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
 struct btrfs_delayed_extent_op;
 
-/*
- * Inform qgroup to trace one dirty extent, its info is recorded in @record.
- * So qgroup can account it at transaction committing time.
- *
- * No lock version, caller must acquire delayed ref lock and allocated memory,
- * then call btrfs_qgroup_trace_extent_post() after exiting lock context.
- *
- * Return 0 for success insert
- * Return >0 for existing record, caller can free @record safely.
- * Error is not possible
- */
 int btrfs_qgroup_trace_extent_nolock(
 		struct btrfs_fs_info *fs_info,
 		struct btrfs_delayed_ref_root *delayed_refs,
 		struct btrfs_qgroup_extent_record *record);
-
-/*
- * Post handler after qgroup_trace_extent_nolock().
- *
- * NOTE: Current qgroup does the expensive backref walk at transaction
- * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming
- * new transaction.
- * This is designed to allow btrfs_find_all_roots() to get correct new_roots
- * result.
- *
- * However for old_roots there is no need to do backref walk at that time,
- * since we search commit roots to walk backref and result will always be
- * correct.
- *
- * Due to the nature of no lock version, we can't do backref there.
- * So we must call btrfs_qgroup_trace_extent_post() after exiting
- * spinlock context.
- *
- * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result
- * using current root, then we can move all expensive backref walk out of
- * transaction committing, but not now as qgroup accounting will be wrong again.
- */
 int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
 				   struct btrfs_qgroup_extent_record *qrecord);
-
-/*
- * Inform qgroup to trace one dirty extent, specified by @bytenr and
- * @num_bytes.
- * So qgroup can account it at commit trans time.
- *
- * Better encapsulated version, with memory allocation and backref walk for
- * commit roots.
- * So this can sleep.
- *
- * Return 0 if the operation is done.
- * Return <0 for error, like memory allocation failure or invalid parameter
- * (NULL trans)
- */
 int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
 			      u64 num_bytes);
-
-/*
- * Inform qgroup to trace all leaf items of data
- *
- * Return 0 for success
- * Return <0 for error(ENOMEM)
- */
 int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
 				  struct extent_buffer *eb);
-/*
- * Inform qgroup to trace a whole subtree, including all its child tree
- * blocks and data.
- * The root tree block is specified by @root_eb.
- *
- * Normally used by relocation(tree block swap) and subvolume deletion.
- *
- * Return 0 for success
- * Return <0 for error(ENOMEM or tree search error)
- */
 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
 			       struct extent_buffer *root_eb,
 			       u64 root_gen, int root_level);
@@ -350,7 +344,8 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
 int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans);
 int btrfs_run_qgroups(struct btrfs_trans_handle *trans);
 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
-			 u64 objectid, struct btrfs_qgroup_inherit *inherit);
+			 u64 objectid, u64 inode_rootid,
+			 struct btrfs_qgroup_inherit *inherit);
 void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
 			       u64 ref_root, u64 num_bytes,
 			       enum btrfs_qgroup_rsv_type type);
@@ -408,20 +403,8 @@ static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root,
 			BTRFS_QGROUP_RSV_META_PREALLOC);
 }
 
-/*
- * Per-transaction meta reservation should be all freed at transaction commit
- * time
- */
 void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root);
-
-/*
- * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS.
- *
- * This is called when preallocated meta reservation needs to be used.
- * Normally after btrfs_join_transaction() call.
- */
 void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
-
 void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode);
 
 /* btrfs_qgroup_swapped_blocks related functions */
@@ -439,5 +422,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
 		struct btrfs_root *root, struct extent_buffer *eb);
 void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
 bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info);
+int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
+			      struct btrfs_squota_delta *delta);
 
 #endif
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
new file mode 100644
index 000000000000..944e8f1862aa
--- /dev/null
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -0,0 +1,324 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/btrfs_tree.h>
+#include "ctree.h"
+#include "fs.h"
+#include "accessors.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "raid-stripe-tree.h"
+#include "volumes.h"
+#include "misc.h"
+#include "print-tree.h"
+
+/*
+ * Delete the RAID stripe extents covering the logical range
+ * [@start, @start + @length) from the stripe tree.
+ *
+ * Repeatedly looks up the item at, or immediately before, the search key and
+ * deletes it, until the found item ends at or before @start.
+ *
+ * Return 0 on success, including when the filesystem has no stripe root,
+ * -ENOMEM if no path could be allocated, or the negative error returned by
+ * the tree search or the item deletion.
+ */
+int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_root *stripe_root = fs_info->stripe_root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	u64 found_start;
+	u64 found_end;
+	u64 end = start + length;
+	int slot;
+	int ret;
+
+	if (!stripe_root)
+		return 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	while (1) {
+		key.objectid = start;
+		key.type = BTRFS_RAID_STRIPE_KEY;
+		key.offset = length;
+
+		ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1);
+		if (ret < 0)
+			break;
+		if (ret > 0) {
+			ret = 0;
+			if (path->slots[0] == 0)
+				break;
+			path->slots[0]--;
+		}
+
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		found_start = key.objectid;
+		found_end = found_start + key.offset;
+
+		/* That stripe ends before we start, we're done. */
+		if (found_end <= start)
+			break;
+
+		trace_btrfs_raid_extent_delete(fs_info, start, end,
+					       found_start, found_end);
+
+		ASSERT(found_start >= start && found_end <= end);
+		ret = btrfs_del_item(trans, stripe_root, path);
+		if (ret)
+			break;
+
+		btrfs_release_path(path);
+	}
+
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Insert a single RAID stripe extent item describing @bioc.
+ *
+ * The item is keyed by (bioc->logical, BTRFS_RAID_STRIPE_KEY, bioc->size) and
+ * carries the encoding plus one (devid, physical) stride for each of the
+ * btrfs_bg_type_to_factor(bioc->map_type) stripes.
+ *
+ * The transaction is aborted on failure.
+ *
+ * Return 0 on success, -ENOMEM if the item buffer could not be allocated, or
+ * the error returned by btrfs_insert_item().
+ */
+static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
+					struct btrfs_io_context *bioc)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_key stripe_key;
+	struct btrfs_root *stripe_root = fs_info->stripe_root;
+	const int num_stripes = btrfs_bg_type_to_factor(bioc->map_type);
+	u8 encoding = btrfs_bg_flags_to_raid_index(bioc->map_type);
+	struct btrfs_stripe_extent *stripe_extent;
+	const size_t item_size = struct_size(stripe_extent, strides, num_stripes);
+	int ret;
+
+	stripe_extent = kzalloc(item_size, GFP_NOFS);
+	if (!stripe_extent) {
+		/*
+		 * NOTE(review): this is the only failure path here that also
+		 * ends the transaction; confirm callers do not end it again.
+		 */
+		btrfs_abort_transaction(trans, -ENOMEM);
+		btrfs_end_transaction(trans);
+		return -ENOMEM;
+	}
+
+	trace_btrfs_insert_one_raid_extent(fs_info, bioc->logical, bioc->size,
+					   num_stripes);
+	btrfs_set_stack_stripe_extent_encoding(stripe_extent, encoding);
+	for (int i = 0; i < num_stripes; i++) {
+		u64 devid = bioc->stripes[i].dev->devid;
+		u64 physical = bioc->stripes[i].physical;
+		struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i];
+
+		btrfs_set_stack_raid_stride_devid(raid_stride, devid);
+		btrfs_set_stack_raid_stride_physical(raid_stride, physical);
+	}
+
+	stripe_key.objectid = bioc->logical;
+	stripe_key.type = BTRFS_RAID_STRIPE_KEY;
+	stripe_key.offset = bioc->size;
+
+	ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
+				item_size);
+	if (ret)
+		btrfs_abort_transaction(trans, ret);
+
+	kfree(stripe_extent);
+
+	return ret;
+}
+
+/*
+ * Insert a RAID stripe extent for every io context queued on
+ * @ordered_extent->bioc_list, then unlink and put the queued io contexts.
+ *
+ * Does nothing if the RAID_STRIPE_TREE incompat feature is not set.
+ *
+ * Return 0 on success (including an empty list), or the first error returned
+ * by btrfs_insert_one_raid_extent(), in which case the list is left as is.
+ */
+int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
+			     struct btrfs_ordered_extent *ordered_extent)
+{
+	struct btrfs_io_context *bioc;
+	int ret;
+
+	if (!btrfs_fs_incompat(trans->fs_info, RAID_STRIPE_TREE))
+		return 0;
+
+	list_for_each_entry(bioc, &ordered_extent->bioc_list, rst_ordered_entry) {
+		ret = btrfs_insert_one_raid_extent(trans, bioc);
+		if (ret)
+			return ret;
+	}
+
+	while (!list_empty(&ordered_extent->bioc_list)) {
+		bioc = list_first_entry(&ordered_extent->bioc_list,
+					typeof(*bioc), rst_ordered_entry);
+		list_del(&bioc->rst_ordered_entry);
+		btrfs_put_bioc(bioc);
+	}
+
+	/* Every insertion succeeded, or the list was empty and ret is unset. */
+	return 0;
+}
+
+/*
+ * Look up the physical address of @logical on the device of @stripe.
+ *
+ * @fs_info:      the filesystem
+ * @logical:      logical start address of the range to look up
+ * @length:       in: length of the range; out: shortened to end at the found
+ *                stripe extent when the range extends beyond it
+ * @map_type:     block group flags, checked against the on-disk encoding
+ * @stripe_index: stride index that must match for DUP block groups
+ * @stripe:       supplies the device to match; ->physical is set on success
+ *
+ * For scrub (@stripe->is_scrub) the commit root is searched without locking.
+ *
+ * Return 0 on success, -ENOMEM if no path could be allocated, -EUCLEAN if the
+ * on-disk encoding does not match @map_type, -ENOENT if no matching stripe
+ * extent or stride is found, or another negative error from the tree search.
+ */
+int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
+				 u64 logical, u64 *length, u64 map_type,
+				 u32 stripe_index, struct btrfs_io_stripe *stripe)
+{
+	struct btrfs_root *stripe_root = fs_info->stripe_root;
+	struct btrfs_stripe_extent *stripe_extent;
+	struct btrfs_key stripe_key;
+	struct btrfs_key found_key;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	const u64 end = logical + *length;
+	int num_stripes;
+	u8 encoding;
+	u64 offset;
+	u64 found_logical;
+	u64 found_length;
+	u64 found_end;
+	int slot;
+	int ret;
+
+	stripe_key.objectid = logical;
+	stripe_key.type = BTRFS_RAID_STRIPE_KEY;
+	stripe_key.offset = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	if (stripe->is_scrub) {
+		path->skip_locking = 1;
+		path->search_commit_root = 1;
+	}
+
+	ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0);
+	if (ret < 0)
+		goto free_path;
+	if (ret) {
+		if (path->slots[0] != 0)
+			path->slots[0]--;
+	}
+
+	while (1) {
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+		found_logical = found_key.objectid;
+		found_length = found_key.offset;
+		found_end = found_logical + found_length;
+
+		if (found_logical > end) {
+			ret = -ENOENT;
+			goto out;
+		}
+
+		if (in_range(logical, found_logical, found_length))
+			break;
+
+		ret = btrfs_next_item(stripe_root, path);
+		if (ret)
+			goto out;
+	}
+
+	offset = logical - found_logical;
+
+	/*
+	 * If we have a logically contiguous, but physically non-continuous
+	 * range, we need to split the bio. Record the length after which we
+	 * must split the bio.
+	 */
+	if (end > found_end)
+		*length -= end - found_end;
+
+	num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
+	stripe_extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
+	encoding = btrfs_stripe_extent_encoding(leaf, stripe_extent);
+
+	if (encoding != btrfs_bg_flags_to_raid_index(map_type)) {
+		ret = -EUCLEAN;
+		btrfs_handle_fs_error(fs_info, ret,
+				      "on-disk stripe encoding %d doesn't match RAID index %d",
+				      encoding,
+				      btrfs_bg_flags_to_raid_index(map_type));
+		goto out;
+	}
+
+	for (int i = 0; i < num_stripes; i++) {
+		struct btrfs_raid_stride *stride = &stripe_extent->strides[i];
+		u64 devid = btrfs_raid_stride_devid(leaf, stride);
+		u64 physical = btrfs_raid_stride_physical(leaf, stride);
+
+		if (devid != stripe->dev->devid)
+			continue;
+
+		if ((map_type & BTRFS_BLOCK_GROUP_DUP) && stripe_index != i)
+			continue;
+
+		stripe->physical = physical + offset;
+
+		trace_btrfs_get_raid_extent_offset(fs_info, logical, *length,
+						   stripe->physical, devid);
+
+		ret = 0;
+		goto free_path;
+	}
+
+	/* If we're here, we haven't found the requested devid in the stripe. */
+	ret = -ENOENT;
+out:
+	if (ret > 0)
+		ret = -ENOENT;
+	if (ret && ret != -EIO && !stripe->is_scrub) {
+		if (IS_ENABLED(CONFIG_BTRFS_DEBUG))
+			btrfs_print_tree(leaf, 1);
+		btrfs_err(fs_info,
+		"cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s",
+			  logical, logical + *length, stripe->dev->devid,
+			  btrfs_bg_type_to_raid_name(map_type));
+	}
+free_path:
+	btrfs_free_path(path);
+
+	return ret;
+}
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
new file mode 100644
index 000000000000..cdb58b38fcb5
--- /dev/null
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2023 Western Digital Corporation or its affiliates.
+ */
+
+#ifndef BTRFS_RAID_STRIPE_TREE_H
+#define BTRFS_RAID_STRIPE_TREE_H
+
+/* Profiles accepted by btrfs_need_stripe_tree_update(). */
+#define BTRFS_RST_SUPP_BLOCK_GROUP_MASK    (BTRFS_BLOCK_GROUP_DUP |		\
+					    BTRFS_BLOCK_GROUP_RAID1_MASK |	\
+					    BTRFS_BLOCK_GROUP_RAID0 |		\
+					    BTRFS_BLOCK_GROUP_RAID10)
+
+struct btrfs_io_context;
+struct btrfs_io_stripe;
+struct btrfs_ordered_extent;
+struct btrfs_trans_handle;
+
+int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length);
+int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
+				 u64 logical, u64 *length, u64 map_type,
+				 u32 stripe_index, struct btrfs_io_stripe *stripe);
+int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
+			     struct btrfs_ordered_extent *ordered_extent);
+
+/*
+ * Tell whether the stripe tree needs updating for a block group with flags
+ * @map_type: the RAID_STRIPE_TREE incompat feature must be set, the block
+ * group type must be exactly DATA and the profile must intersect
+ * BTRFS_RST_SUPP_BLOCK_GROUP_MASK.
+ */
+static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info,
+						 u64 map_type)
+{
+	if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE))
+		return false;
+
+	if ((map_type & BTRFS_BLOCK_GROUP_TYPE_MASK) != BTRFS_BLOCK_GROUP_DATA)
+		return false;
+
+	return (map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK &
+		BTRFS_RST_SUPP_BLOCK_GROUP_MASK) != 0;
+}
+
+/*
+ * Number of struct btrfs_raid_stride entries following the fixed part of a
+ * struct btrfs_stripe_extent in an item of @item_size bytes.
+ */
+static inline int btrfs_num_raid_stripes(u32 item_size)
+{
+	const size_t strides_size = item_size -
+				    offsetof(struct btrfs_stripe_extent, strides);
+
+	return strides_size / sizeof(struct btrfs_raid_stride);
+}
+
+#endif
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 95d28497de7c..1f62976bee82 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -485,6 +485,9 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
 			ret = add_shared_data_ref(fs_info, offset, count,
 						  key->objectid, key->offset);
 			break;
+		case BTRFS_EXTENT_OWNER_REF_KEY:
+			WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+			break;
 		default:
 			btrfs_err(fs_info, "invalid key type in iref");
 			ret = -EINVAL;
@@ -652,7 +655,7 @@ static void dump_block_entry(struct btrfs_fs_info *fs_info,
 }
 
 /*
- * btrfs_ref_tree_mod: called when we modify a ref for a bytenr
+ * Called when we modify a ref for a bytenr.
  *
  * This will add an action item to the given bytenr and do sanity checks to make
  * sure we haven't messed something up.  If we are making a new allocation and
@@ -681,10 +684,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
 
 	if (generic_ref->type == BTRFS_REF_METADATA) {
 		if (!parent)
-			ref_root = generic_ref->tree_ref.owning_root;
+			ref_root = generic_ref->tree_ref.ref_root;
 		owner = generic_ref->tree_ref.level;
 	} else if (!parent) {
-		ref_root = generic_ref->data_ref.owning_root;
+		ref_root = generic_ref->data_ref.ref_root;
 		owner = generic_ref->data_ref.ino;
 		offset = generic_ref->data_ref.offset;
 	}
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 65d2bd6910f2..f88b0c2ac3fe 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -25,12 +25,11 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
 				     const u64 olen,
 				     int no_time_update)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
 
 	inode_inc_iversion(inode);
 	if (!no_time_update) {
-		inode->i_mtime = inode_set_ctime_current(inode);
+		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	}
 	/*
 	 * We round up to the block size at eof when determining which
@@ -43,7 +42,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
 		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
 	}
 
-	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+	ret = btrfs_update_inode(trans, BTRFS_I(inode));
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
 		btrfs_end_transaction(trans);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index c6d4bb8cbe29..f5d9e5f74a52 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -111,8 +111,8 @@ struct tree_block {
 	}; /* Use rb_simple_node for search/insert */
 	u64 owner;
 	struct btrfs_key key;
-	unsigned int level:8;
-	unsigned int key_ready:1;
+	u8 level;
+	bool key_ready;
 };
 
 #define MAX_EXTENTS 128
@@ -122,6 +122,13 @@ struct file_extent_cluster {
 	u64 end;
 	u64 boundary[MAX_EXTENTS];
 	unsigned int nr;
+	u64 owning_root;
+};
+
+/* Stages of data relocation. */
+enum reloc_stage {
+	MOVE_DATA_EXTENTS,
+	UPDATE_DATA_PTRS
 };
 
 struct reloc_control {
@@ -155,16 +162,12 @@ struct reloc_control {
 	u64 search_start;
 	u64 extents_found;
 
-	unsigned int stage:8;
-	unsigned int create_reloc_tree:1;
-	unsigned int merge_reloc_tree:1;
-	unsigned int found_file_extent:1;
+	enum reloc_stage stage;
+	bool create_reloc_tree;
+	bool merge_reloc_tree;
+	bool found_file_extent;
 };
 
-/* stages of data relocation */
-#define MOVE_DATA_EXTENTS	0
-#define UPDATE_DATA_PTRS	1
-
 static void mark_block_processed(struct reloc_control *rc,
 				 struct btrfs_backref_node *node)
 {
@@ -180,13 +183,6 @@ static void mark_block_processed(struct reloc_control *rc,
 	node->processed = 1;
 }
 
-
-static void mapping_tree_init(struct mapping_tree *tree)
-{
-	tree->rb_root = RB_ROOT;
-	spin_lock_init(&tree->lock);
-}
-
 /*
  * walk up backref nodes until reach node presents tree root
  */
@@ -299,7 +295,7 @@ static int update_backref_cache(struct btrfs_trans_handle *trans,
 	return 1;
 }
 
-static bool reloc_root_is_dead(struct btrfs_root *root)
+static bool reloc_root_is_dead(const struct btrfs_root *root)
 {
 	/*
 	 * Pair with set_bit/clear_bit in clean_dirty_subvols and
@@ -320,7 +316,7 @@ static bool reloc_root_is_dead(struct btrfs_root *root)
  * from no reloc root.  But btrfs_should_ignore_reloc_root() below is a
  * special case.
  */
-static bool have_reloc_root(struct btrfs_root *root)
+static bool have_reloc_root(const struct btrfs_root *root)
 {
 	if (reloc_root_is_dead(root))
 		return false;
@@ -329,31 +325,30 @@ static bool have_reloc_root(struct btrfs_root *root)
 	return true;
 }
 
-int btrfs_should_ignore_reloc_root(struct btrfs_root *root)
+bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root)
 {
 	struct btrfs_root *reloc_root;
 
 	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
-		return 0;
+		return false;
 
 	/* This root has been merged with its reloc tree, we can ignore it */
 	if (reloc_root_is_dead(root))
-		return 1;
+		return true;
 
 	reloc_root = root->reloc_root;
 	if (!reloc_root)
-		return 0;
+		return false;
 
 	if (btrfs_header_generation(reloc_root->commit_root) ==
 	    root->fs_info->running_transaction->transid)
-		return 0;
+		return false;
 	/*
-	 * if there is reloc tree and it was created in previous
-	 * transaction backref lookup can find the reloc tree,
-	 * so backref node for the fs tree root is useless for
-	 * relocation.
+	 * If there is reloc tree and it was created in previous transaction
+	 * backref lookup can find the reloc tree, so backref node for the fs
+	 * tree root is useless for relocation.
 	 */
-	return 1;
+	return true;
 }
 
 /*
@@ -547,7 +542,7 @@ out:
  */
 static int clone_backref_node(struct btrfs_trans_handle *trans,
 			      struct reloc_control *rc,
-			      struct btrfs_root *src,
+			      const struct btrfs_root *src,
 			      struct btrfs_root *dest)
 {
 	struct btrfs_root *reloc_root = src->reloc_root;
@@ -632,7 +627,7 @@ fail:
 /*
  * helper to add 'address of tree root -> reloc tree' mapping
  */
-static int __must_check __add_reloc_root(struct btrfs_root *root)
+static int __add_reloc_root(struct btrfs_root *root)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct rb_node *rb_node;
@@ -1159,7 +1154,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
 
 		key.offset -= btrfs_file_extent_offset(leaf, fi);
 		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
-				       num_bytes, parent);
+				       num_bytes, parent, root->root_key.objectid);
 		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
 				    key.objectid, key.offset,
 				    root->root_key.objectid, false);
@@ -1170,7 +1165,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
 		}
 
 		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
-				       num_bytes, parent);
+				       num_bytes, parent, root->root_key.objectid);
 		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
 				    key.objectid, key.offset,
 				    root->root_key.objectid, false);
@@ -1181,15 +1176,15 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
 		}
 	}
 	if (dirty)
-		btrfs_mark_buffer_dirty(leaf);
+		btrfs_mark_buffer_dirty(trans, leaf);
 	if (inode)
 		btrfs_add_delayed_iput(BTRFS_I(inode));
 	return ret;
 }
 
-static noinline_for_stack
-int memcmp_node_keys(struct extent_buffer *eb, int slot,
-		     struct btrfs_path *path, int level)
+static noinline_for_stack int memcmp_node_keys(const struct extent_buffer *eb,
+					       int slot, const struct btrfs_path *path,
+					       int level)
 {
 	struct btrfs_disk_key key1;
 	struct btrfs_disk_key key2;
@@ -1374,16 +1369,17 @@ again:
 		 */
 		btrfs_set_node_blockptr(parent, slot, new_bytenr);
 		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
-		btrfs_mark_buffer_dirty(parent);
+		btrfs_mark_buffer_dirty(trans, parent);
 
 		btrfs_set_node_blockptr(path->nodes[level],
 					path->slots[level], old_bytenr);
 		btrfs_set_node_ptr_generation(path->nodes[level],
 					      path->slots[level], old_ptr_gen);
-		btrfs_mark_buffer_dirty(path->nodes[level]);
+		btrfs_mark_buffer_dirty(trans, path->nodes[level]);
 
 		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
-				       blocksize, path->nodes[level]->start);
+				       blocksize, path->nodes[level]->start,
+				       src->root_key.objectid);
 		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
 				    0, true);
 		ret = btrfs_inc_extent_ref(trans, &ref);
@@ -1392,7 +1388,7 @@ again:
 			break;
 		}
 		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
-				       blocksize, 0);
+				       blocksize, 0, dest->root_key.objectid);
 		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0,
 				    true);
 		ret = btrfs_inc_extent_ref(trans, &ref);
@@ -1401,8 +1397,9 @@ again:
 			break;
 		}
 
+		/* We don't know the real owning_root, use 0. */
 		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
-				       blocksize, path->nodes[level]->start);
+				       blocksize, path->nodes[level]->start, 0);
 		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
 				    0, true);
 		ret = btrfs_free_extent(trans, &ref);
@@ -1411,8 +1408,9 @@ again:
 			break;
 		}
 
+		/* We don't know the real owning_root, use 0. */
 		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
-				       blocksize, 0);
+				       blocksize, 0, 0);
 		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid,
 				    0, true);
 		ret = btrfs_free_extent(trans, &ref);
@@ -1518,8 +1516,8 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
  * [min_key, max_key)
  */
 static int invalidate_extent_cache(struct btrfs_root *root,
-				   struct btrfs_key *min_key,
-				   struct btrfs_key *max_key)
+				   const struct btrfs_key *min_key,
+				   const struct btrfs_key *max_key)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct inode *inode = NULL;
@@ -1897,7 +1895,7 @@ again:
 		}
 	}
 
-	rc->merge_reloc_tree = 1;
+	rc->merge_reloc_tree = true;
 
 	while (!list_empty(&rc->reloc_roots)) {
 		reloc_root = list_entry(rc->reloc_roots.next,
@@ -2517,11 +2515,12 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 						node->eb->start);
 			btrfs_set_node_ptr_generation(upper->eb, slot,
 						      trans->transid);
-			btrfs_mark_buffer_dirty(upper->eb);
+			btrfs_mark_buffer_dirty(trans, upper->eb);
 
 			btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
 					       node->eb->start, blocksize,
-					       upper->eb->start);
+					       upper->eb->start,
+					       btrfs_header_owner(upper->eb));
 			btrfs_init_tree_ref(&ref, node->level,
 					    btrfs_header_owner(upper->eb),
 					    root->root_key.objectid, false);
@@ -2633,7 +2632,7 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
 	u32 blocksize = rc->extent_root->fs_info->nodesize;
 
 	if (test_range_bit(&rc->processed_blocks, bytenr,
-			   bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
+			   bytenr + blocksize - 1, EXTENT_DIRTY, NULL))
 		return 1;
 	return 0;
 }
@@ -2660,7 +2659,7 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
 	else
 		btrfs_node_key_to_cpu(eb, &block->key, 0);
 	free_extent_buffer(eb);
-	block->key_ready = 1;
+	block->key_ready = true;
 	return 0;
 }
 
@@ -2830,7 +2829,7 @@ out_free_blocks:
 
 static noinline_for_stack int prealloc_file_extent_cluster(
 				struct btrfs_inode *inode,
-				struct file_extent_cluster *cluster)
+				const struct file_extent_cluster *cluster)
 {
 	u64 alloc_hint = 0;
 	u64 start;
@@ -2965,7 +2964,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod
 /*
  * Allow error injection to test balance/relocation cancellation
  */
-noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
+noinline int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info)
 {
 	return atomic_read(&fs_info->balance_cancel_req) ||
 		atomic_read(&fs_info->reloc_cancel_req) ||
@@ -2973,7 +2972,7 @@ noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
 }
 ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
 
-static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster,
+static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster,
 				    int cluster_nr)
 {
 	/* Last extent, use cluster end directly */
@@ -2985,7 +2984,7 @@ static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster,
 }
 
 static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
-			     struct file_extent_cluster *cluster,
+			     const struct file_extent_cluster *cluster,
 			     int *cluster_nr, unsigned long page_index)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -3120,7 +3119,7 @@ release_page:
 }
 
 static int relocate_file_extent_cluster(struct inode *inode,
-					struct file_extent_cluster *cluster)
+					const struct file_extent_cluster *cluster)
 {
 	u64 offset = BTRFS_I(inode)->index_cnt;
 	unsigned long index;
@@ -3158,11 +3157,12 @@ out:
 	return ret;
 }
 
-static noinline_for_stack
-int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
-			 struct file_extent_cluster *cluster)
+static noinline_for_stack int relocate_data_extent(struct inode *inode,
+				const struct btrfs_key *extent_key,
+				struct file_extent_cluster *cluster)
 {
 	int ret;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 
 	if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
 		ret = relocate_file_extent_cluster(inode, cluster);
@@ -3171,8 +3171,38 @@ int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
 		cluster->nr = 0;
 	}
 
-	if (!cluster->nr)
+	/*
+	 * Under simple quotas, we set root->relocation_src_root when we find
+	 * the extent. If adjacent extents have different owners, we can't merge
+	 * them while relocating. Handle this by storing the owning root that
+	 * started a cluster and, on seeing an extent from a different root,
+	 * breaking cluster formation (just like non-adjacent extents above).
+	 *
+	 * Without simple quotas, relocation_src_root is always 0, so we should
+	 * never see a mismatch, and it should have no effect on relocation
+	 * clusters.
+	 */
+	if (cluster->nr > 0 && cluster->owning_root != root->relocation_src_root) {
+		u64 tmp = root->relocation_src_root;
+
+		/*
+		 * root->relocation_src_root is the state that actually affects
+		 * the preallocation we do here, so set it to the root owning
+		 * the cluster we need to relocate.
+		 */
+		root->relocation_src_root = cluster->owning_root;
+		ret = relocate_file_extent_cluster(inode, cluster);
+		if (ret)
+			return ret;
+		cluster->nr = 0;
+		/* And reset it back for the current extent's owning root. */
+		root->relocation_src_root = tmp;
+	}
+
+	if (!cluster->nr) {
 		cluster->start = extent_key->objectid;
+		cluster->owning_root = root->relocation_src_root;
+	}
 	else
 		BUG_ON(cluster->nr >= MAX_EXTENTS);
 	cluster->end = extent_key->objectid + extent_key->offset - 1;
@@ -3193,7 +3223,7 @@ int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
  * the major work is getting the generation and level of the block
  */
 static int add_tree_block(struct reloc_control *rc,
-			  struct btrfs_key *extent_key,
+			  const struct btrfs_key *extent_key,
 			  struct btrfs_path *path,
 			  struct rb_root *blocks)
 {
@@ -3278,7 +3308,7 @@ static int add_tree_block(struct reloc_control *rc,
 	block->key.objectid = rc->extent_root->fs_info->nodesize;
 	block->key.offset = generation;
 	block->level = level;
-	block->key_ready = 0;
+	block->key_ready = false;
 	block->owner = owner;
 
 	rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
@@ -3444,11 +3474,10 @@ static int delete_v1_space_cache(struct extent_buffer *leaf,
 /*
  * helper to find all tree blocks that reference a given data extent
  */
-static noinline_for_stack
-int add_data_references(struct reloc_control *rc,
-			struct btrfs_key *extent_key,
-			struct btrfs_path *path,
-			struct rb_root *blocks)
+static noinline_for_stack int add_data_references(struct reloc_control *rc,
+						  const struct btrfs_key *extent_key,
+						  struct btrfs_path *path,
+						  struct rb_root *blocks)
 {
 	struct btrfs_backref_walk_ctx ctx = { 0 };
 	struct ulist_iterator leaf_uiter;
@@ -3622,7 +3651,7 @@ int prepare_to_relocate(struct reloc_control *rc)
 	if (ret)
 		return ret;
 
-	rc->create_reloc_tree = 1;
+	rc->create_reloc_tree = true;
 	set_reloc_control(rc);
 
 	trans = btrfs_join_transaction(rc->extent_root);
@@ -3702,6 +3731,21 @@ restart:
 				    struct btrfs_extent_item);
 		flags = btrfs_extent_flags(path->nodes[0], ei);
 
+		/*
+		 * If we are relocating a simple quota owned extent item, we
+		 * need to note the owner on the reloc data root so that when
+		 * we allocate the replacement item, we can attribute it to the
+		 * correct eventual owner (rather than the reloc data root).
+		 */
+		if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) {
+			struct btrfs_root *root = BTRFS_I(rc->data_inode)->root;
+			u64 owning_root_id = btrfs_get_extent_owner_root(fs_info,
+								 path->nodes[0],
+								 path->slots[0]);
+
+			root->relocation_src_root = owning_root_id;
+		}
+
 		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 			ret = add_tree_block(rc, &key, path, &blocks);
 		} else if (rc->stage == UPDATE_DATA_PTRS &&
@@ -3734,7 +3778,7 @@ restart:
 
 		if (rc->stage == MOVE_DATA_EXTENTS &&
 		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
-			rc->found_file_extent = 1;
+			rc->found_file_extent = true;
 			ret = relocate_data_extent(rc->data_inode,
 						   &key, &rc->cluster);
 			if (ret < 0) {
@@ -3771,7 +3815,7 @@ restart:
 			err = ret;
 	}
 
-	rc->create_reloc_tree = 0;
+	rc->create_reloc_tree = false;
 	set_reloc_control(rc);
 
 	btrfs_backref_release_cache(&rc->backref_cache);
@@ -3789,7 +3833,7 @@ restart:
 
 	merge_reloc_roots(rc);
 
-	rc->merge_reloc_tree = 0;
+	rc->merge_reloc_tree = false;
 	unset_reloc_control(rc);
 	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
 
@@ -3835,7 +3879,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
 	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
 					  BTRFS_INODE_PREALLOC);
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 out:
 	btrfs_free_path(path);
 	return ret;
@@ -3874,9 +3918,9 @@ out:
  * helper to create inode for data relocation.
  * the inode is in data relocation tree and its link count is 0
  */
-static noinline_for_stack
-struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
-				 struct btrfs_block_group *group)
+static noinline_for_stack struct inode *create_reloc_inode(
+					struct btrfs_fs_info *fs_info,
+					const struct btrfs_block_group *group)
 {
 	struct inode *inode = NULL;
 	struct btrfs_trans_handle *trans;
@@ -3971,8 +4015,9 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
 
 	INIT_LIST_HEAD(&rc->reloc_roots);
 	INIT_LIST_HEAD(&rc->dirty_subvol_roots);
-	btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1);
-	mapping_tree_init(&rc->reloc_root_tree);
+	btrfs_backref_init_cache(fs_info, &rc->backref_cache, true);
+	rc->reloc_root_tree.rb_root = RB_ROOT;
+	spin_lock_init(&rc->reloc_root_tree.lock);
 	extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS);
 	return rc;
 }
@@ -4004,7 +4049,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info,
 		   block_group->start, buf);
 }
 
-static const char *stage_to_string(int stage)
+static const char *stage_to_string(enum reloc_stage stage)
 {
 	if (stage == MOVE_DATA_EXTENTS)
 		return "move data extents";
@@ -4120,7 +4165,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
 	WARN_ON(ret && ret != -EAGAIN);
 
 	while (1) {
-		int finishes_stage;
+		enum reloc_stage finishes_stage;
 
 		mutex_lock(&fs_info->cleaner_mutex);
 		ret = relocate_block_group(rc);
@@ -4303,7 +4348,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
 		goto out_unset;
 	}
 
-	rc->merge_reloc_tree = 1;
+	rc->merge_reloc_tree = true;
 
 	while (!list_empty(&reloc_roots)) {
 		reloc_root = list_entry(reloc_roots.next,
@@ -4422,7 +4467,8 @@ int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
 }
 
 int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, struct extent_buffer *buf,
+			  struct btrfs_root *root,
+			  const struct extent_buffer *buf,
 			  struct extent_buffer *cow)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4561,7 +4607,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
  *
  * Return U64_MAX if no running relocation.
  */
-u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info)
+u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info)
 {
 	u64 logical = U64_MAX;
 
diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
index 77d69f6ae967..5fb60f2deb53 100644
--- a/fs/btrfs/relocation.h
+++ b/fs/btrfs/relocation.h
@@ -10,15 +10,16 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
 int btrfs_recover_relocation(struct btrfs_fs_info *fs_info);
 int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered);
 int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, struct extent_buffer *buf,
+			  struct btrfs_root *root,
+			  const struct extent_buffer *buf,
 			  struct extent_buffer *cow);
 void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
 			      u64 *bytes_to_reserve);
 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
 			      struct btrfs_pending_snapshot *pending);
-int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info);
+int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info);
 struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr);
-int btrfs_should_ignore_reloc_root(struct btrfs_root *root);
-u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info);
+bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root);
+u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info);
 
 #endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 859874579456..603ad1459368 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -51,7 +51,8 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
 }
 
 /*
- * btrfs_find_root - lookup the root by the key.
+ * Lookup the root by the key.
+ *
  * root: the root of the root tree
  * search_key: the key to search
  * path: the path we search
@@ -191,7 +192,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
 
 	write_extent_buffer(l, item, ptr, sizeof(*item));
-	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
 out:
 	btrfs_free_path(path);
 	return ret;
@@ -438,7 +439,7 @@ again:
 	btrfs_set_root_ref_name_len(leaf, ref, name->len);
 	ptr = (unsigned long)(ref + 1);
 	write_extent_buffer(leaf, name->name, ptr, name->len);
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 
 	if (key.type == BTRFS_ROOT_BACKREF_KEY) {
 		btrfs_release_path(path);
@@ -485,7 +486,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
 }
 
 /*
- * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
+ * Reserve space for subvolume operation.
+ *
  * root: the root of the parent directory
  * rsv: block reservation
  * items: the number of items that we need do reservation
@@ -508,7 +510,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 
-	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+	if (btrfs_qgroup_enabled(fs_info)) {
 		/* One for parent inode, two for dir entries */
 		qgroup_num_bytes = 3 * fs_info->nodesize;
 		ret = btrfs_qgroup_reserve_meta_prealloc(root,
diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h
index cbbaca32126e..8b2c3859e464 100644
--- a/fs/btrfs/root-tree.h
+++ b/fs/btrfs/root-tree.h
@@ -3,6 +3,8 @@
 #ifndef BTRFS_ROOT_TREE_H
 #define BTRFS_ROOT_TREE_H
 
+struct fscrypt_str;
+
 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 				     struct btrfs_block_rsv *rsv,
 				     int nitems, bool use_global_rsv);
@@ -18,10 +20,8 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key
 int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		      const struct btrfs_key *key,
 		      struct btrfs_root_item *item);
-int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
-				   struct btrfs_root *root,
-				   struct btrfs_key *key,
-				   struct btrfs_root_item *item);
+int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		      struct btrfs_key *key, struct btrfs_root_item *item);
 int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key,
 		    struct btrfs_path *path, struct btrfs_root_item *root_item,
 		    struct btrfs_key *root_key);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b877203f1dc5..9ce5be21b036 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -16,7 +16,6 @@
 #include "backref.h"
 #include "extent_io.h"
 #include "dev-replace.h"
-#include "check-integrity.h"
 #include "raid56.h"
 #include "block-group.h"
 #include "zoned.h"
@@ -24,6 +23,7 @@
 #include "accessors.h"
 #include "file-item.h"
 #include "scrub.h"
+#include "raid-stripe-tree.h"
 
 /*
  * This is only the first step towards a full-features scrub. It reads all
@@ -897,7 +897,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
 		ASSERT(stripe->mirror_num >= 1);
 		ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
 				      stripe->logical, &mapped_len, &bioc,
-				      NULL, NULL, 1);
+				      NULL, NULL);
 		/*
 		 * If we failed, dev will be NULL, and later detailed reports
 		 * will just be skipped.
@@ -1635,6 +1635,71 @@ static void scrub_reset_stripe(struct scrub_stripe *stripe)
 	}
 }
 
+/*
+ * Read the sectors set in extent_sector_bitmap, batching adjacent ones into
+ * bios capped by stripe_len; sectors that fail to map are flagged as errors.
+ */
+static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
+					    struct scrub_stripe *stripe)
+{
+	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+	struct btrfs_bio *bbio = NULL;
+	u64 stripe_len = BTRFS_STRIPE_LEN;
+	int mirror = stripe->mirror_num;
+	int i;
+
+	atomic_inc(&stripe->pending_io);
+	for_each_set_bit(i, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
+		struct page *page = scrub_stripe_get_page(stripe, i);
+		unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i);
+
+		/* The current sector cannot be merged, submit the bio. */
+		if (bbio &&
+		    ((i > 0 && !test_bit(i - 1, &stripe->extent_sector_bitmap)) ||
+		     bbio->bio.bi_iter.bi_size >= stripe_len)) {
+			ASSERT(bbio->bio.bi_iter.bi_size);
+			atomic_inc(&stripe->pending_io);
+			btrfs_submit_bio(bbio, mirror);
+			bbio = NULL;
+		}
+
+		if (!bbio) {
+			struct btrfs_io_stripe io_stripe = { .is_scrub = true };
+			struct btrfs_io_context *bioc = NULL;
+			const u64 logical = stripe->logical +
+					    (i << fs_info->sectorsize_bits);
+			int err;
+
+			/* Map first, so failure never leaves an empty bio. */
+			err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
+					      &stripe_len, &bioc, &io_stripe, &mirror);
+			btrfs_put_bioc(bioc);
+			if (err) {
+				set_bit(i, &stripe->io_error_bitmap);
+				set_bit(i, &stripe->error_bitmap);
+				continue;
+			}
+			bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
+					       fs_info, scrub_read_endio, stripe);
+			bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
+		}
+
+		__bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
+	}
+
+	if (bbio) {
+		ASSERT(bbio->bio.bi_iter.bi_size);
+		atomic_inc(&stripe->pending_io);
+		btrfs_submit_bio(bbio, mirror);
+	}
+
+	if (atomic_dec_and_test(&stripe->pending_io)) {
+		wake_up(&stripe->io_wait);
+		INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
+		queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
+	}
+}
+
 static void scrub_submit_initial_read(struct scrub_ctx *sctx,
 				      struct scrub_stripe *stripe)
 {
@@ -1646,6 +1711,11 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
 	ASSERT(stripe->mirror_num > 0);
 	ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
 
+	if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) {
+		scrub_submit_extent_sector_read(sctx, stripe);
+		return;
+	}
+
 	bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
 			       scrub_read_endio, stripe);
 
@@ -1952,7 +2022,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
 
 	btrfs_bio_counter_inc_blocked(fs_info);
 	ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
-			      &length, &bioc, NULL, NULL, 1);
+			      &length, &bioc, NULL, NULL);
 	if (ret < 0) {
 		btrfs_put_bioc(bioc);
 		btrfs_bio_counter_dec(fs_info);
@@ -2717,7 +2787,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
 	if (scrub_dev->fs_devices != fs_info->fs_devices)
 		gen = scrub_dev->generation;
 	else
-		gen = fs_info->last_trans_committed;
+		gen = btrfs_get_last_trans_committed(fs_info);
 
 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 		bytenr = btrfs_sb_offset(i);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 3a566150c531..3b929f0e8f04 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -796,7 +796,7 @@ static int send_cmd(struct send_ctx *sctx)
 	put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len);
 	put_unaligned_le32(0, &hdr->crc);
 
-	crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
+	crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
 	put_unaligned_le32(crc, &hdr->crc);
 
 	ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
@@ -5669,8 +5669,8 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
 	hdr = (struct btrfs_cmd_header *)sctx->send_buf;
 	hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr));
 	hdr->crc = 0;
-	crc = btrfs_crc32c(0, sctx->send_buf, sctx->send_size);
-	crc = btrfs_crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes);
+	crc = crc32c(0, sctx->send_buf, sctx->send_size);
+	crc = crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes);
 	hdr->crc = cpu_to_le32(crc);
 
 	ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index d7e8cd4f140c..571bb13587d5 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -345,8 +345,10 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
 			  struct btrfs_space_info *space_info,
 			  enum btrfs_reserve_flush_enum flush)
 {
+	struct btrfs_space_info *data_sinfo;
 	u64 profile;
 	u64 avail;
+	u64 data_chunk_size;
 	int factor;
 
 	if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
@@ -364,6 +366,36 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
 	 */
 	factor = btrfs_bg_type_to_factor(profile);
 	avail = div_u64(avail, factor);
+	if (avail == 0)
+		return 0;
+
+	/*
+	 * Calculate the data_chunk_size.  space_info->chunk_size is the
+	 * "optimal" chunk size based on the fs size, but when we actually
+	 * allocate the chunk we strip this down further, making it no more
+	 * than 10% of the disk or 1G, whichever is smaller.
+	 */
+	data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
+	data_chunk_size = min(data_sinfo->chunk_size,
+			      mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
+	data_chunk_size = min_t(u64, data_chunk_size, SZ_1G);
+
+	/*
+	 * Data reservations are assumed to equal actual usage, so data
+	 * allocations consume block groups as soon as they are reserved.  We
+	 * could therefore overcommit and then immediately have that available
+	 * space used by a data allocation, which could put us in a bind when
+	 * we get close to filling the file system.
+	 *
+	 * To handle this simply remove the data_chunk_size from the available
+	 * space.  If we are relatively empty this won't affect our ability to
+	 * overcommit much, and if we're very close to full it'll keep us from
+	 * getting into a position where we've given ourselves very little
+	 * metadata wiggle room.
+	 */
+	if (avail <= data_chunk_size)
+		return 0;
+	avail -= data_chunk_size;
 
 	/*
 	 * If we aren't flushing all things, let us overcommit up to
@@ -556,18 +588,6 @@ static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info,
 	return nr;
 }
 
-static inline u64 calc_delayed_refs_nr(const struct btrfs_fs_info *fs_info,
-				       u64 to_reclaim)
-{
-	const u64 bytes = btrfs_calc_delayed_ref_bytes(fs_info, 1);
-	u64 nr;
-
-	nr = div64_u64(to_reclaim, bytes);
-	if (!nr)
-		nr = 1;
-	return nr;
-}
-
 #define EXTENT_SIZE_PER_ITEM	SZ_256K
 
 /*
@@ -749,10 +769,9 @@ static void flush_space(struct btrfs_fs_info *fs_info,
 			break;
 		}
 		if (state == FLUSH_DELAYED_REFS_NR)
-			nr = calc_delayed_refs_nr(fs_info, num_bytes);
+			btrfs_run_delayed_refs(trans, num_bytes);
 		else
-			nr = 0;
-		btrfs_run_delayed_refs(trans, nr);
+			btrfs_run_delayed_refs(trans, 0);
 		btrfs_end_transaction(trans);
 		break;
 	case ALLOC_CHUNK:
@@ -978,7 +997,8 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
 }
 
 /*
- * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
+ * We've exhausted our flushing, start failing tickets.
+ *
  * @fs_info - fs_info for this fs
  * @space_info - the space info we were flushing
  *
@@ -1742,7 +1762,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
  * Try to reserve metadata bytes from the block_rsv's space.
  *
  * @fs_info:    the filesystem
- * @block_rsv:  block_rsv we're allocating for
+ * @space_info: the space_info we're allocating for
  * @orig_bytes: number of bytes we want
  * @flush:      whether or not we can flush to make our reservation
  *
@@ -1754,21 +1774,19 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
  * space already.
  */
 int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
-				 struct btrfs_block_rsv *block_rsv,
+				 struct btrfs_space_info *space_info,
 				 u64 orig_bytes,
 				 enum btrfs_reserve_flush_enum flush)
 {
 	int ret;
 
-	ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
+	ret = __reserve_bytes(fs_info, space_info, orig_bytes, flush);
 	if (ret == -ENOSPC) {
 		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
-					      block_rsv->space_info->flags,
-					      orig_bytes, 1);
+					      space_info->flags, orig_bytes, 1);
 
 		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
-			btrfs_dump_space_info(fs_info, block_rsv->space_info,
-					      orig_bytes, 0);
+			btrfs_dump_space_info(fs_info, space_info, orig_bytes, 0);
 	}
 	return ret;
 }
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 0bb9d14e60a8..92c595fed1b0 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -3,6 +3,7 @@
 #ifndef BTRFS_SPACE_INFO_H
 #define BTRFS_SPACE_INFO_H
 
+#include <trace/events/btrfs.h>
 #include "volumes.h"
 
 /*
@@ -212,7 +213,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
 			   struct btrfs_space_info *info, u64 bytes,
 			   int dump_block_groups);
 int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
-				 struct btrfs_block_rsv *block_rsv,
+				 struct btrfs_space_info *space_info,
 				 u64 orig_bytes,
 				 enum btrfs_reserve_flush_enum flush);
 void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1a093ec0f7e3..6ecf78d09694 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -26,6 +26,7 @@
 #include <linux/ratelimit.h>
 #include <linux/crc32c.h>
 #include <linux/btrfs.h>
+#include <linux/security.h>
 #include "messages.h"
 #include "delayed-inode.h"
 #include "ctree.h"
@@ -129,9 +130,6 @@ enum {
 	Opt_inode_cache, Opt_noinode_cache,
 
 	/* Debugging options */
-	Opt_check_integrity,
-	Opt_check_integrity_including_extent_data,
-	Opt_check_integrity_print_mask,
 	Opt_enospc_debug, Opt_noenospc_debug,
 #ifdef CONFIG_BTRFS_DEBUG
 	Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
@@ -200,9 +198,6 @@ static const match_table_t tokens = {
 	{Opt_recovery, "recovery"},
 
 	/* Debugging options */
-	{Opt_check_integrity, "check_int"},
-	{Opt_check_integrity_including_extent_data, "check_int_data"},
-	{Opt_check_integrity_print_mask, "check_int_print_mask=%u"},
 	{Opt_enospc_debug, "enospc_debug"},
 	{Opt_noenospc_debug, "noenospc_debug"},
 #ifdef CONFIG_BTRFS_DEBUG
@@ -707,44 +702,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
 		case Opt_skip_balance:
 			btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
 			break;
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-		case Opt_check_integrity_including_extent_data:
-			btrfs_warn(info,
-	"integrity checker is deprecated and will be removed in 6.7");
-			btrfs_info(info,
-				   "enabling check integrity including extent data");
-			btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_DATA);
-			btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
-			break;
-		case Opt_check_integrity:
-			btrfs_warn(info,
-	"integrity checker is deprecated and will be removed in 6.7");
-			btrfs_info(info, "enabling check integrity");
-			btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
-			break;
-		case Opt_check_integrity_print_mask:
-			ret = match_int(&args[0], &intarg);
-			if (ret) {
-				btrfs_err(info,
-				"unrecognized check_integrity_print_mask value %s",
-					args[0].from);
-				goto out;
-			}
-			info->check_integrity_print_mask = intarg;
-			btrfs_warn(info,
-	"integrity checker is deprecated and will be removed in 6.7");
-			btrfs_info(info, "check_integrity_print_mask 0x%x",
-				   info->check_integrity_print_mask);
-			break;
-#else
-		case Opt_check_integrity_including_extent_data:
-		case Opt_check_integrity:
-		case Opt_check_integrity_print_mask:
-			btrfs_err(info,
-				  "support for check_integrity* not compiled in!");
-			ret = -EINVAL;
-			goto out;
-#endif
 		case Opt_fatal_errors:
 			if (strcmp(args[0].from, "panic") == 0) {
 				btrfs_set_opt(info->mount_opt,
@@ -889,7 +846,7 @@ static int btrfs_parse_device_options(const char *options, blk_mode_t flags)
 				error = -ENOMEM;
 				goto out;
 			}
-			device = btrfs_scan_one_device(device_name, flags);
+			device = btrfs_scan_one_device(device_name, flags, false);
 			kfree(device_name);
 			if (IS_ERR(device)) {
 				error = PTR_ERR(device);
@@ -1305,15 +1262,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 		seq_puts(seq, ",autodefrag");
 	if (btrfs_test_opt(info, SKIP_BALANCE))
 		seq_puts(seq, ",skip_balance");
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-	if (btrfs_test_opt(info, CHECK_INTEGRITY_DATA))
-		seq_puts(seq, ",check_int_data");
-	else if (btrfs_test_opt(info, CHECK_INTEGRITY))
-		seq_puts(seq, ",check_int");
-	if (info->check_integrity_print_mask)
-		seq_printf(seq, ",check_int_print_mask=%d",
-				info->check_integrity_print_mask);
-#endif
 	if (info->metadata_ratio)
 		seq_printf(seq, ",metadata_ratio=%u", info->metadata_ratio);
 	if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR))
@@ -1484,7 +1432,12 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
 		goto error_fs_info;
 	}
 
-	device = btrfs_scan_one_device(device_name, mode);
+	/*
+	 * With 'true' passed to btrfs_scan_one_device() (mount time) we expect
+	 * either a valid device or an error.
+	 */
+	device = btrfs_scan_one_device(device_name, mode, true);
+	ASSERT(device != NULL);
 	if (IS_ERR(device)) {
 		mutex_unlock(&uuid_mutex);
 		error = PTR_ERR(device);
@@ -2196,7 +2149,11 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 	switch (cmd) {
 	case BTRFS_IOC_SCAN_DEV:
 		mutex_lock(&uuid_mutex);
-		device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ);
+		/*
+		 * Scanning outside of mount can return NULL which would turn
+		 * into 0 error code.
+		 */
+		device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
 		ret = PTR_ERR_OR_ZERO(device);
 		mutex_unlock(&uuid_mutex);
 		break;
@@ -2210,8 +2167,12 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 		break;
 	case BTRFS_IOC_DEVICES_READY:
 		mutex_lock(&uuid_mutex);
-		device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ);
-		if (IS_ERR(device)) {
+		/*
+		 * Scanning outside of mount can return NULL; PTR_ERR(NULL) is
+		 * 0, so that case breaks out here with ret set to 0.
+		 */
+		device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
+		if (IS_ERR_OR_NULL(device)) {
 			mutex_unlock(&uuid_mutex);
 			ret = PTR_ERR(device);
 			break;
@@ -2256,6 +2217,7 @@ static int check_dev_super(struct btrfs_device *dev)
 {
 	struct btrfs_fs_info *fs_info = dev->fs_info;
 	struct btrfs_super_block *sb;
+	u64 last_trans;
 	u16 csum_type;
 	int ret = 0;
 
@@ -2291,10 +2253,10 @@ static int check_dev_super(struct btrfs_device *dev)
 	if (ret < 0)
 		goto out;
 
-	if (btrfs_super_generation(sb) != fs_info->last_trans_committed) {
+	last_trans = btrfs_get_last_trans_committed(fs_info);
+	if (btrfs_super_generation(sb) != last_trans) {
 		btrfs_err(fs_info, "transid mismatch, has %llu expect %llu",
-			btrfs_super_generation(sb),
-			fs_info->last_trans_committed);
+			  btrfs_super_generation(sb), last_trans);
 		ret = -EUCLEAN;
 		goto out;
 	}
@@ -2404,9 +2366,6 @@ static int __init btrfs_print_mod_info(void)
 #ifdef CONFIG_BTRFS_ASSERT
 			", assert=on"
 #endif
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-			", integrity-checker=on"
-#endif
 #ifdef CONFIG_BTRFS_FS_REF_VERIFY
 			", ref-verify=on"
 #endif
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index b1d1ac25237b..e6b51fb3ddc1 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -291,12 +291,15 @@ BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
 BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
 BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE);
 BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
+BTRFS_FEAT_ATTR_INCOMPAT(simple_quota, SIMPLE_QUOTA);
 #ifdef CONFIG_BLK_DEV_ZONED
 BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
 #endif
 #ifdef CONFIG_BTRFS_DEBUG
 /* Remove once support for extent tree v2 is feature complete */
 BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2);
+/* Remove once support for raid stripe tree is feature complete. */
+BTRFS_FEAT_ATTR_INCOMPAT(raid_stripe_tree, RAID_STRIPE_TREE);
 #endif
 #ifdef CONFIG_FS_VERITY
 BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
@@ -322,11 +325,13 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
 	BTRFS_FEAT_ATTR_PTR(free_space_tree),
 	BTRFS_FEAT_ATTR_PTR(raid1c34),
 	BTRFS_FEAT_ATTR_PTR(block_group_tree),
+	BTRFS_FEAT_ATTR_PTR(simple_quota),
 #ifdef CONFIG_BLK_DEV_ZONED
 	BTRFS_FEAT_ATTR_PTR(zoned),
 #endif
 #ifdef CONFIG_BTRFS_DEBUG
 	BTRFS_FEAT_ATTR_PTR(extent_tree_v2),
+	BTRFS_FEAT_ATTR_PTR(raid_stripe_tree),
 #endif
 #ifdef CONFIG_FS_VERITY
 	BTRFS_FEAT_ATTR_PTR(verity),
@@ -420,6 +425,13 @@ static ssize_t acl_show(struct kobject *kobj, struct kobj_attribute *a, char *bu
 }
 BTRFS_ATTR(static_feature, acl, acl_show);
 
+static ssize_t temp_fsid_supported_show(struct kobject *kobj,
+					struct kobj_attribute *a, char *buf)
+{
+	return sysfs_emit(buf, "0\n");
+}
+BTRFS_ATTR(static_feature, temp_fsid, temp_fsid_supported_show);
+
 /*
  * Features which only depend on kernel version.
  *
@@ -433,6 +445,7 @@ static struct attribute *btrfs_supported_static_feature_attrs[] = {
 	BTRFS_ATTR_PTR(static_feature, send_stream_version),
 	BTRFS_ATTR_PTR(static_feature, supported_rescue_options),
 	BTRFS_ATTR_PTR(static_feature, supported_sectorsizes),
+	BTRFS_ATTR_PTR(static_feature, temp_fsid),
 	NULL
 };
 
@@ -1196,10 +1209,19 @@ static ssize_t btrfs_generation_show(struct kobject *kobj,
 {
 	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
 
-	return sysfs_emit(buf, "%llu\n", fs_info->generation);
+	return sysfs_emit(buf, "%llu\n", btrfs_get_fs_generation(fs_info));
 }
 BTRFS_ATTR(, generation, btrfs_generation_show);
 
+static ssize_t btrfs_temp_fsid_show(struct kobject *kobj,
+				    struct kobj_attribute *a, char *buf)
+{
+	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+	return sysfs_emit(buf, "%d\n", fs_info->fs_devices->temp_fsid);
+}
+BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show);
+
 static const char * const btrfs_read_policy_name[] = { "pid" };
 
 static ssize_t btrfs_read_policy_show(struct kobject *kobj,
@@ -1302,6 +1324,7 @@ static const struct attribute *btrfs_attrs[] = {
 	BTRFS_ATTR_PTR(, read_policy),
 	BTRFS_ATTR_PTR(, bg_reclaim_threshold),
 	BTRFS_ATTR_PTR(, commit_stats),
+	BTRFS_ATTR_PTR(, temp_fsid),
 	NULL,
 };
 
@@ -2086,6 +2109,33 @@ static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj,
 }
 BTRFS_ATTR(qgroups, enabled, qgroup_enabled_show);
 
+/* Report the qgroup accounting mode: "qgroup" (full) or "squota" (simple). */
+static ssize_t qgroup_mode_show(struct kobject *qgroups_kobj,
+				struct kobj_attribute *a, char *buf)
+{
+	struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+	ssize_t ret = 0;	/* unknown mode: warn, emit nothing */
+
+	spin_lock(&fs_info->qgroup_lock);
+	ASSERT(btrfs_qgroup_enabled(fs_info));
+	switch (btrfs_qgroup_mode(fs_info)) {
+	case BTRFS_QGROUP_MODE_FULL:
+		ret = sysfs_emit(buf, "qgroup\n");
+		break;
+	case BTRFS_QGROUP_MODE_SIMPLE:
+		ret = sysfs_emit(buf, "squota\n");
+		break;
+	default:
+		btrfs_warn(fs_info, "unexpected qgroup mode %d",
+			   btrfs_qgroup_mode(fs_info));
+		break;
+	}
+	spin_unlock(&fs_info->qgroup_lock);
+
+	return ret;
+}
+BTRFS_ATTR(qgroups, mode, qgroup_mode_show);
+
 static ssize_t qgroup_inconsistent_show(struct kobject *qgroups_kobj,
 					struct kobj_attribute *a,
 					char *buf)
@@ -2148,6 +2198,7 @@ static struct attribute *qgroups_attrs[] = {
 	BTRFS_ATTR_PTR(qgroups, enabled),
 	BTRFS_ATTR_PTR(qgroups, inconsistent),
 	BTRFS_ATTR_PTR(qgroups, drop_subtree_threshold),
+	BTRFS_ATTR_PTR(qgroups, mode),
 	NULL
 };
 ATTRIBUTE_GROUPS(qgroups);
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index 5ef0b90e25c3..6a43a64ba55a 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -61,7 +61,11 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
 	key.type = BTRFS_EXTENT_CSUM_KEY;
 	key.offset = 0;
 
-	btrfs_setup_item_for_insert(root, path, &key, value_len);
+	/*
+	 * Passing a NULL trans handle is fine here, we have a dummy root eb
+	 * and the tree is a single node (level 0).
+	 */
+	btrfs_setup_item_for_insert(NULL, root, path, &key, value_len);
 	write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
 			    value_len);
 
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 05b03f5eab83..492d69d2fa73 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -34,7 +34,11 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
 	key.type = BTRFS_EXTENT_DATA_KEY;
 	key.offset = start;
 
-	btrfs_setup_item_for_insert(root, &path, &key, value_len);
+	/*
+	 * Passing a NULL trans handle is fine here, we have a dummy root eb
+	 * and the tree is a single node (level 0).
+	 */
+	btrfs_setup_item_for_insert(NULL, root, &path, &key, value_len);
 	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
 	btrfs_set_file_extent_generation(leaf, fi, 1);
 	btrfs_set_file_extent_type(leaf, fi, type);
@@ -64,7 +68,11 @@ static void insert_inode_item_key(struct btrfs_root *root)
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
 
-	btrfs_setup_item_for_insert(root, &path, &key, value_len);
+	/*
+	 * Passing a NULL trans handle is fine here, we have a dummy root eb
+	 * and the tree is a single node (level 0).
+	 */
+	btrfs_setup_item_for_insert(NULL, root, &path, &key, value_len);
 }
 
 /*
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c780d3729463..6e63816dddcb 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -386,7 +386,7 @@ loop:
 			IO_TREE_TRANS_DIRTY_PAGES);
 	extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
 			IO_TREE_FS_PINNED_EXTENTS);
-	fs_info->generation++;
+	btrfs_set_fs_generation(fs_info, fs_info->generation + 1);
 	cur_trans->transid = fs_info->generation;
 	fs_info->running_transaction = cur_trans;
 	cur_trans->aborted = 0;
@@ -561,6 +561,69 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root)
 	return true;
 }
 
+static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
+					enum btrfs_reserve_flush_enum flush,
+					u64 num_bytes,
+					u64 *delayed_refs_bytes)
+{
+	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+	struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info;
+	u64 extra_delayed_refs_bytes = 0;
+	u64 bytes;
+	int ret;
+
+	/*
+	 * If there's a gap between the size of the delayed refs reserve and
+	 * its reserved space, then some tasks have added delayed refs or bumped
+	 * its size otherwise (due to block group creation or removal, or block
+	 * group item update). Also try to allocate that gap in order to prevent
+	 * using (and possibly abusing) the global reserve when committing the
+	 * transaction.
+	 */
+	if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+	    !btrfs_block_rsv_full(delayed_refs_rsv)) {
+		spin_lock(&delayed_refs_rsv->lock);
+		if (delayed_refs_rsv->size > delayed_refs_rsv->reserved)
+			extra_delayed_refs_bytes = delayed_refs_rsv->size -
+				delayed_refs_rsv->reserved;
+		spin_unlock(&delayed_refs_rsv->lock);
+	}
+
+	bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes;
+
+	/*
+	 * We want to reserve all the bytes we may need all at once, so we only
+	 * do 1 enospc flushing cycle per transaction start.
+	 */
+	ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
+	if (ret == 0) {
+		if (extra_delayed_refs_bytes > 0)
+			btrfs_migrate_to_delayed_refs_rsv(fs_info,
+							  extra_delayed_refs_bytes);
+		return 0;
+	}
+
+	if (extra_delayed_refs_bytes > 0) {
+		bytes -= extra_delayed_refs_bytes;
+		ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
+		if (ret == 0)
+			return 0;
+	}
+
+	/*
+	 * If we are an emergency flush, which can steal from the global block
+	 * reserve, then attempt to not reserve space for the delayed refs, as
+	 * we will consume space for them from the global block reserve.
+	 */
+	if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
+		bytes -= *delayed_refs_bytes;
+		*delayed_refs_bytes = 0;
+		ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
+	}
+
+	return ret;
+}
+
 static struct btrfs_trans_handle *
 start_transaction(struct btrfs_root *root, unsigned int num_items,
 		  unsigned int type, enum btrfs_reserve_flush_enum flush,
@@ -568,10 +631,12 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+	struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
 	struct btrfs_trans_handle *h;
 	struct btrfs_transaction *cur_trans;
 	u64 num_bytes = 0;
 	u64 qgroup_reserved = 0;
+	u64 delayed_refs_bytes = 0;
 	bool reloc_reserved = false;
 	bool do_chunk_alloc = false;
 	int ret;
@@ -594,9 +659,6 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
 	 * the appropriate flushing if need be.
 	 */
 	if (num_items && root != fs_info->chunk_root) {
-		struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv;
-		u64 delayed_refs_bytes = 0;
-
 		qgroup_reserved = num_items * fs_info->nodesize;
 		/*
 		 * Use prealloc for now, as there might be a currently running
@@ -608,20 +670,16 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
 		if (ret)
 			return ERR_PTR(ret);
 
+		num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
 		/*
-		 * We want to reserve all the bytes we may need all at once, so
-		 * we only do 1 enospc flushing cycle per transaction start.  We
-		 * accomplish this by simply assuming we'll do num_items worth
-		 * of delayed refs updates in this trans handle, and refill that
-		 * amount for whatever is missing in the reserve.
+		 * If we plan to insert/update/delete "num_items" from a btree,
+		 * we will also generate delayed refs for extent buffers in the
+		 * respective btree paths, so reserve space for the delayed refs
+		 * that will be generated by the caller as it modifies btrees.
+		 * Try to reserve them to avoid excessive use of the global
+		 * block reserve.
 		 */
-		num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
-		if (flush == BTRFS_RESERVE_FLUSH_ALL &&
-		    !btrfs_block_rsv_full(delayed_refs_rsv)) {
-			delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info,
-									  num_items);
-			num_bytes += delayed_refs_bytes;
-		}
+		delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, num_items);
 
 		/*
 		 * Do the reservation for the relocation root creation
@@ -631,16 +689,14 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
 			reloc_reserved = true;
 		}
 
-		ret = btrfs_reserve_metadata_bytes(fs_info, rsv, num_bytes, flush);
+		ret = btrfs_reserve_trans_metadata(fs_info, flush, num_bytes,
+						   &delayed_refs_bytes);
 		if (ret)
 			goto reserve_fail;
-		if (delayed_refs_bytes) {
-			btrfs_migrate_to_delayed_refs_rsv(fs_info, delayed_refs_bytes);
-			num_bytes -= delayed_refs_bytes;
-		}
-		btrfs_block_rsv_add_bytes(rsv, num_bytes, true);
 
-		if (rsv->space_info->force_alloc)
+		btrfs_block_rsv_add_bytes(trans_rsv, num_bytes, true);
+
+		if (trans_rsv->space_info->force_alloc)
 			do_chunk_alloc = true;
 	} else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
 		   !btrfs_block_rsv_full(delayed_refs_rsv)) {
@@ -700,6 +756,7 @@ again:
 
 	h->type = type;
 	INIT_LIST_HEAD(&h->new_bgs);
+	btrfs_init_metadata_block_rsv(fs_info, &h->delayed_rsv, BTRFS_BLOCK_RSV_DELOPS);
 
 	smp_mb();
 	if (cur_trans->state >= TRANS_STATE_COMMIT_START &&
@@ -712,8 +769,17 @@ again:
 	if (num_bytes) {
 		trace_btrfs_space_reservation(fs_info, "transaction",
 					      h->transid, num_bytes, 1);
-		h->block_rsv = &fs_info->trans_block_rsv;
+		h->block_rsv = trans_rsv;
 		h->bytes_reserved = num_bytes;
+		if (delayed_refs_bytes > 0) {
+			trace_btrfs_space_reservation(fs_info,
+						      "local_delayed_refs_rsv",
+						      h->transid,
+						      delayed_refs_bytes, 1);
+			h->delayed_refs_bytes_reserved = delayed_refs_bytes;
+			btrfs_block_rsv_add_bytes(&h->delayed_rsv, delayed_refs_bytes, true);
+			delayed_refs_bytes = 0;
+		}
 		h->reloc_reserved = reloc_reserved;
 	}
 
@@ -769,8 +835,10 @@ join_fail:
 	kmem_cache_free(btrfs_trans_handle_cachep, h);
 alloc_fail:
 	if (num_bytes)
-		btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv,
-					num_bytes, NULL);
+		btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL);
+	if (delayed_refs_bytes)
+		btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info,
+						    delayed_refs_bytes);
 reserve_fail:
 	btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
 	return ERR_PTR(ret);
@@ -817,7 +885,7 @@ struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *roo
 }
 
 /*
- * btrfs_attach_transaction() - catch the running transaction
+ * Catch the running transaction.
  *
  * It is used when we want to commit the current the transaction, but
  * don't want to start a new one.
@@ -836,7 +904,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
 }
 
 /*
- * btrfs_attach_transaction_barrier() - catch the running transaction
+ * Catch the running transaction.
  *
  * It is similar to the above function, the difference is this one
  * will wait for all the inactive transactions until they fully
@@ -912,7 +980,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
 	int ret = 0;
 
 	if (transid) {
-		if (transid <= fs_info->last_trans_committed)
+		if (transid <= btrfs_get_last_trans_committed(fs_info))
 			goto out;
 
 		/* find specified transaction */
@@ -936,7 +1004,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
 		 * raced with btrfs_commit_transaction
 		 */
 		if (!cur_trans) {
-			if (transid > fs_info->last_trans_committed)
+			if (transid > btrfs_get_last_trans_committed(fs_info))
 				ret = -EINVAL;
 			goto out;
 		}
@@ -991,11 +1059,14 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
 
 	if (!trans->block_rsv) {
 		ASSERT(!trans->bytes_reserved);
+		ASSERT(!trans->delayed_refs_bytes_reserved);
 		return;
 	}
 
-	if (!trans->bytes_reserved)
+	if (!trans->bytes_reserved) {
+		ASSERT(!trans->delayed_refs_bytes_reserved);
 		return;
+	}
 
 	ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
 	trace_btrfs_space_reservation(fs_info, "transaction",
@@ -1003,6 +1074,16 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
 	btrfs_block_rsv_release(fs_info, trans->block_rsv,
 				trans->bytes_reserved, NULL);
 	trans->bytes_reserved = 0;
+
+	if (!trans->delayed_refs_bytes_reserved)
+		return;
+
+	trace_btrfs_space_reservation(fs_info, "local_delayed_refs_rsv",
+				      trans->transid,
+				      trans->delayed_refs_bytes_reserved, 0);
+	btrfs_block_rsv_release(fs_info, &trans->delayed_rsv,
+				trans->delayed_refs_bytes_reserved, NULL);
+	trans->delayed_refs_bytes_reserved = 0;
 }
 
 static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
@@ -1334,7 +1415,7 @@ again:
 	}
 
 	/* Now flush any delayed refs generated by updating all of the roots */
-	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+	ret = btrfs_run_delayed_refs(trans, U64_MAX);
 	if (ret)
 		return ret;
 
@@ -1349,7 +1430,7 @@ again:
 		 * so we want to keep this flushing in this loop to make sure
 		 * everything gets run.
 		 */
-		ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+		ret = btrfs_run_delayed_refs(trans, U64_MAX);
 		if (ret)
 			return ret;
 	}
@@ -1484,45 +1565,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
 }
 
 /*
- * defrag a given btree.
- * Every leaf in the btree is read and defragged.
- */
-int btrfs_defrag_root(struct btrfs_root *root)
-{
-	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_trans_handle *trans;
-	int ret;
-
-	if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
-		return 0;
-
-	while (1) {
-		trans = btrfs_start_transaction(root, 0);
-		if (IS_ERR(trans)) {
-			ret = PTR_ERR(trans);
-			break;
-		}
-
-		ret = btrfs_defrag_leaves(trans, root);
-
-		btrfs_end_transaction(trans);
-		btrfs_btree_balance_dirty(info);
-		cond_resched();
-
-		if (btrfs_fs_closing(info) || ret != -EAGAIN)
-			break;
-
-		if (btrfs_defrag_cancelled(info)) {
-			btrfs_debug(info, "defrag_root cancelled");
-			ret = -EAGAIN;
-			break;
-		}
-	}
-	clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
-	return ret;
-}
-
-/*
  * Do all special snapshot related qgroup dirty hack.
  *
  * Will do all needed qgroup inherit and dirty hack like switch commit
@@ -1539,11 +1581,10 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
 	int ret;
 
 	/*
-	 * Save some performance in the case that qgroups are not
-	 * enabled. If this check races with the ioctl, rescan will
-	 * kick in anyway.
+	 * Save some performance in the case that qgroups are not enabled. If
+	 * this check races with the ioctl, rescan will kick in anyway.
 	 */
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+	if (!btrfs_qgroup_full_accounting(fs_info))
 		return 0;
 
 	/*
@@ -1567,7 +1608,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
 	 * for now flush the delayed refs to narrow the race window where the
 	 * qgroup counters could end up wrong.
 	 */
-	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+	ret = btrfs_run_delayed_refs(trans, U64_MAX);
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
 		return ret;
@@ -1582,7 +1623,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
 
 	/* Now qgroup are all updated, we can inherit it to new qgroups */
 	ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid,
-				   inherit);
+				   parent->root_key.objectid, inherit);
 	if (ret < 0)
 		goto out;
 
@@ -1732,6 +1773,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	}
 	btrfs_release_path(path);
 
+	ret = btrfs_create_qgroup(trans, objectid);
+	if (ret) {
+		btrfs_abort_transaction(trans, ret);
+		goto fail;
+	}
+
 	/*
 	 * pull in the delayed directory update
 	 * and the delayed inode item
@@ -1843,8 +1890,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	 * To co-operate with that hack, we do hack again.
 	 * Or snapshot will be greatly slowed down by a subtree qgroup rescan
 	 */
-	ret = qgroup_account_snapshot(trans, root, parent_root,
-				      pending->inherit, objectid);
+	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL)
+		ret = qgroup_account_snapshot(trans, root, parent_root,
+					      pending->inherit, objectid);
+	else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+		ret = btrfs_qgroup_inherit(trans, root->root_key.objectid, objectid,
+					   parent_root->root_key.objectid, pending->inherit);
 	if (ret < 0)
 		goto fail;
 
@@ -1860,8 +1911,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 
 	btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
 						  fname.disk_name.len * 2);
-	parent_inode->i_mtime = inode_set_ctime_current(parent_inode);
-	ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode));
+	inode_set_mtime_to_ts(parent_inode,
+			      inode_set_ctime_current(parent_inode));
+	ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode));
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
 		goto fail;
@@ -2084,7 +2136,7 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
        struct btrfs_block_group *block_group, *tmp;
 
        list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
-               btrfs_delayed_refs_rsv_release(fs_info, 1);
+               btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
                list_del_init(&block_group->bg_list);
        }
 }
@@ -2403,7 +2455,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 	if (ret)
 		goto unlock_reloc;
 
-	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+	ret = btrfs_run_delayed_refs(trans, U64_MAX);
 	if (ret)
 		goto unlock_reloc;
 
@@ -2536,7 +2588,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
 		btrfs_clear_space_info_full(fs_info);
 
-	fs_info->last_trans_committed = cur_trans->transid;
+	btrfs_set_last_trans_committed(fs_info, cur_trans->transid);
 	/*
 	 * We needn't acquire the lock here because there is no other task
 	 * which can change it.
@@ -2654,18 +2706,18 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
  */
 void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
 				      const char *function,
-				      unsigned int line, int errno, bool first_hit)
+				      unsigned int line, int error, bool first_hit)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 
-	WRITE_ONCE(trans->aborted, errno);
-	WRITE_ONCE(trans->transaction->aborted, errno);
-	if (first_hit && errno == -ENOSPC)
+	WRITE_ONCE(trans->aborted, error);
+	WRITE_ONCE(trans->transaction->aborted, error);
+	if (first_hit && error == -ENOSPC)
 		btrfs_dump_space_info_for_trans_abort(fs_info);
 	/* Wake up anybody who may be waiting on this transaction */
 	wake_up(&fs_info->transaction_wait);
 	wake_up(&fs_info->transaction_blocked_wait);
-	__btrfs_handle_fs_error(fs_info, function, line, errno, NULL);
+	__btrfs_handle_fs_error(fs_info, function, line, error, NULL);
 }
 
 int __init btrfs_transaction_init(void)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 93869cda6af9..18c4f6e83b78 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -118,8 +118,10 @@ enum {
 struct btrfs_trans_handle {
 	u64 transid;
 	u64 bytes_reserved;
+	u64 delayed_refs_bytes_reserved;
 	u64 chunk_bytes_reserved;
 	unsigned long delayed_ref_updates;
+	unsigned long delayed_ref_csum_deletions;
 	struct btrfs_transaction *transaction;
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_block_rsv *orig_rsv;
@@ -139,6 +141,7 @@ struct btrfs_trans_handle {
 	bool in_fsync;
 	struct btrfs_fs_info *fs_info;
 	struct list_head new_bgs;
+	struct btrfs_block_rsv delayed_rsv;
 };
 
 /*
@@ -172,7 +175,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
 {
 	spin_lock(&inode->lock);
 	inode->last_trans = trans->transaction->transid;
-	inode->last_sub_trans = inode->root->log_transid;
+	inode->last_sub_trans = btrfs_get_root_log_transid(inode->root);
 	inode->last_log_commit = inode->last_sub_trans - 1;
 	spin_unlock(&inode->lock);
 }
@@ -200,32 +203,32 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
 	delayed_refs->qgroup_to_skip = 0;
 }
 
-bool __cold abort_should_print_stack(int errno);
+bool __cold abort_should_print_stack(int error);
 
 /*
  * Call btrfs_abort_transaction as early as possible when an error condition is
  * detected, that way the exact stack trace is reported for some errors.
  */
-#define btrfs_abort_transaction(trans, errno)		\
+#define btrfs_abort_transaction(trans, error)		\
 do {								\
 	bool first = false;					\
 	/* Report first abort since mount */			\
 	if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,	\
 			&((trans)->fs_info->fs_state))) {	\
 		first = true;					\
-		if (WARN(abort_should_print_stack(errno),	\
+		if (WARN(abort_should_print_stack(error),	\
 			KERN_ERR				\
 			"BTRFS: Transaction aborted (error %d)\n",	\
-			(errno))) {					\
+			(error))) {					\
 			/* Stack trace printed. */			\
 		} else {						\
 			btrfs_err((trans)->fs_info,			\
 				  "Transaction aborted (error %d)",	\
-				  (errno));			\
+				  (error));			\
 		}						\
 	}							\
 	__btrfs_abort_transaction((trans), __func__,		\
-				  __LINE__, (errno), first);	\
+				  __LINE__, (error), first);	\
 } while (0)
 
 int btrfs_end_transaction(struct btrfs_trans_handle *trans);
@@ -243,7 +246,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
 int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid);
 
 void btrfs_add_dead_root(struct btrfs_root *root);
-int btrfs_defrag_root(struct btrfs_root *root);
 void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info);
 int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
@@ -264,7 +266,7 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
 void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
 				      const char *function,
-				      unsigned int line, int errno, bool first_hit);
+				      unsigned int line, int error, bool first_hit);
 
 int __init btrfs_transaction_init(void);
 void __cold btrfs_transaction_exit(void);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index ab08a0b01311..a416cbea75d1 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -29,6 +29,8 @@
 #include "accessors.h"
 #include "file-item.h"
 #include "inode-item.h"
+#include "dir-item.h"
+#include "raid-stripe-tree.h"
 
 /*
  * Error message should follow the following format:
@@ -1465,6 +1467,9 @@ static int check_extent_item(struct extent_buffer *leaf,
 			}
 			inline_refs += btrfs_shared_data_ref_count(leaf, sref);
 			break;
+		case BTRFS_EXTENT_OWNER_REF_KEY:
+			WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+			break;
 		default:
 			extent_err(leaf, slot, "unknown inline ref type: %u",
 				   inline_type);
@@ -1631,6 +1636,44 @@ static int check_inode_ref(struct extent_buffer *leaf,
 	return 0;
 }
 
+static int check_raid_stripe_extent(const struct extent_buffer *leaf,
+				    const struct btrfs_key *key, int slot)
+{
+	struct btrfs_stripe_extent *stripe_extent =
+		btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
+
+	if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) {
+		generic_err(leaf, slot,
+"invalid key objectid for raid stripe extent, have %llu expect aligned to %u",
+			    key->objectid, leaf->fs_info->sectorsize);
+		return -EUCLEAN;
+	}
+
+	if (unlikely(!btrfs_fs_incompat(leaf->fs_info, RAID_STRIPE_TREE))) {
+		generic_err(leaf, slot,
+	"RAID_STRIPE_EXTENT present but RAID_STRIPE_TREE incompat bit unset");
+		return -EUCLEAN;
+	}
+
+	switch (btrfs_stripe_extent_encoding(leaf, stripe_extent)) {
+	case BTRFS_STRIPE_RAID0:
+	case BTRFS_STRIPE_RAID1:
+	case BTRFS_STRIPE_DUP:
+	case BTRFS_STRIPE_RAID10:
+	case BTRFS_STRIPE_RAID5:
+	case BTRFS_STRIPE_RAID6:
+	case BTRFS_STRIPE_RAID1C3:
+	case BTRFS_STRIPE_RAID1C4:
+		break;
+	default:
+		generic_err(leaf, slot, "invalid raid stripe encoding %u",
+			    btrfs_stripe_extent_encoding(leaf, stripe_extent));
+		return -EUCLEAN;
+	}
+
+	return 0;
+}
+
 /*
  * Common point to switch the item-specific validation.
  */
@@ -1685,6 +1728,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf,
 	case BTRFS_EXTENT_DATA_REF_KEY:
 		ret = check_extent_data_ref(leaf, key, slot);
 		break;
+	case BTRFS_RAID_STRIPE_KEY:
+		ret = check_raid_stripe_extent(leaf, key, slot);
+		break;
 	}
 
 	if (ret)
@@ -2005,7 +2051,7 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level,
 	 * So we only checks tree blocks which is read from disk, whose
 	 * generation <= fs_info->last_trans_committed.
 	 */
-	if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
+	if (btrfs_header_generation(eb) > btrfs_get_last_trans_committed(fs_info))
 		return 0;
 
 	/* We have @first_key, so this @eb must have at least one item */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index cbb17b542131..7d6729d9fd2f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -347,8 +347,7 @@ static int process_one_buffer(struct btrfs_root *log,
 	}
 
 	if (wc->pin) {
-		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
-						      eb->len);
+		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb);
 		if (ret)
 			return ret;
 
@@ -504,9 +503,9 @@ insert:
 		found_size = btrfs_item_size(path->nodes[0],
 						path->slots[0]);
 		if (found_size > item_size)
-			btrfs_truncate_item(path, item_size, 1);
+			btrfs_truncate_item(trans, path, item_size, 1);
 		else if (found_size < item_size)
-			btrfs_extend_item(path, item_size - found_size);
+			btrfs_extend_item(trans, path, item_size - found_size);
 	} else if (ret) {
 		return ret;
 	}
@@ -574,7 +573,7 @@ insert:
 		}
 	}
 no_copy:
-	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
 	btrfs_release_path(path);
 	return 0;
 }
@@ -767,7 +766,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 			} else if (ret == 0) {
 				btrfs_init_generic_ref(&ref,
 						BTRFS_ADD_DELAYED_REF,
-						ins.objectid, ins.offset, 0);
+						ins.objectid, ins.offset, 0,
+						root->root_key.objectid);
 				btrfs_init_data_ref(&ref,
 						root->root_key.objectid,
 						key->objectid, offset, 0, false);
@@ -890,7 +890,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 
 update_inode:
 	btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
-	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+	ret = btrfs_update_inode(trans, BTRFS_I(inode));
 out:
 	iput(inode);
 	return ret;
@@ -1445,7 +1445,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 			if (ret)
 				goto out;
 
-			ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+			ret = btrfs_update_inode(trans, BTRFS_I(inode));
 			if (ret)
 				goto out;
 		}
@@ -1483,8 +1483,7 @@ out:
 	return ret;
 }
 
-static int count_inode_extrefs(struct btrfs_root *root,
-		struct btrfs_inode *inode, struct btrfs_path *path)
+static int count_inode_extrefs(struct btrfs_inode *inode, struct btrfs_path *path)
 {
 	int ret = 0;
 	int name_len;
@@ -1498,8 +1497,8 @@ static int count_inode_extrefs(struct btrfs_root *root,
 	struct extent_buffer *leaf;
 
 	while (1) {
-		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
-					    &extref, &offset);
+		ret = btrfs_find_one_extref(inode->root, inode_objectid, offset,
+					    path, &extref, &offset);
 		if (ret)
 			break;
 
@@ -1527,8 +1526,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
 	return nlink;
 }
 
-static int count_inode_refs(struct btrfs_root *root,
-			struct btrfs_inode *inode, struct btrfs_path *path)
+static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path)
 {
 	int ret;
 	struct btrfs_key key;
@@ -1543,7 +1541,7 @@ static int count_inode_refs(struct btrfs_root *root,
 	key.offset = (u64)-1;
 
 	while (1) {
-		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0);
 		if (ret < 0)
 			break;
 		if (ret > 0) {
@@ -1595,9 +1593,9 @@ process_slot:
  * will free the inode.
  */
 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
-					   struct btrfs_root *root,
 					   struct inode *inode)
 {
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_path *path;
 	int ret;
 	u64 nlink = 0;
@@ -1607,13 +1605,13 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	ret = count_inode_refs(root, BTRFS_I(inode), path);
+	ret = count_inode_refs(BTRFS_I(inode), path);
 	if (ret < 0)
 		goto out;
 
 	nlink = ret;
 
-	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
+	ret = count_inode_extrefs(BTRFS_I(inode), path);
 	if (ret < 0)
 		goto out;
 
@@ -1623,7 +1621,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 
 	if (nlink != inode->i_nlink) {
 		set_nlink(inode, nlink);
-		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+		ret = btrfs_update_inode(trans, BTRFS_I(inode));
 		if (ret)
 			goto out;
 	}
@@ -1685,7 +1683,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
 			break;
 		}
 
-		ret = fixup_inode_link_count(trans, root, inode);
+		ret = fixup_inode_link_count(trans, inode);
 		iput(inode);
 		if (ret)
 			break;
@@ -1732,7 +1730,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 			set_nlink(inode, 1);
 		else
 			inc_nlink(inode);
-		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+		ret = btrfs_update_inode(trans, BTRFS_I(inode));
 	} else if (ret == -EEXIST) {
 		ret = 0;
 	}
@@ -1939,7 +1937,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 out:
 	if (!ret && update_size) {
 		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
-		ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
+		ret = btrfs_update_inode(trans, BTRFS_I(dir));
 	}
 	kfree(name.name);
 	iput(dir);
@@ -2483,7 +2481,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 							drop_args.bytes_found);
 					/* Update the inode's nbytes. */
 					ret = btrfs_update_inode(wc->trans,
-							root, BTRFS_I(inode));
+								 BTRFS_I(inode));
 				}
 				iput(inode);
 				if (ret)
@@ -2574,7 +2572,7 @@ static int clean_log_buffer(struct btrfs_trans_handle *trans,
 	btrfs_tree_unlock(eb);
 
 	if (trans) {
-		ret = btrfs_pin_reserved_extent(trans, eb->start, eb->len);
+		ret = btrfs_pin_reserved_extent(trans, eb);
 		if (ret)
 			return ret;
 		btrfs_redirty_list_add(trans->transaction, eb);
@@ -2848,10 +2846,9 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
 }
 
 /*
- * btrfs_sync_log does sends a given tree log down to the disk and
- * updates the super blocks to record it.  When this call is done,
- * you know that any inodes previously logged are safely on disk only
- * if it returns 0.
+ * Sends a given tree log down to the disk and updates the super blocks to
+ * record it.  When this call is done, you know that any inodes previously
+ * logged are safely on disk only if it returns 0.
  *
  * Any other return value means you need to call btrfs_commit_transaction.
  * Some of the edge cases for fsyncing directories that have had unlinks
@@ -2961,7 +2958,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	btrfs_set_root_node(&log->root_item, log->node);
 	memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
 
-	root->log_transid++;
+	btrfs_set_root_log_transid(root, root->log_transid + 1);
 	log->log_transid = root->log_transid;
 	root->log_start_pid = 0;
 	/*
@@ -2999,9 +2996,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 */
 	ret = update_log_root(trans, log, &new_root_item);
 	if (ret) {
-		if (!list_empty(&root_log_ctx.list))
-			list_del_init(&root_log_ctx.list);
-
+		list_del_init(&root_log_ctx.list);
 		blk_finish_plug(&plug);
 		btrfs_set_log_full_commit(trans);
 		if (ret != -ENOSPC)
@@ -3021,7 +3016,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
-	index2 = root_log_ctx.log_transid % 2;
 	if (atomic_read(&log_root_tree->log_commit[index2])) {
 		blk_finish_plug(&plug);
 		ret = btrfs_wait_tree_log_extents(log, mark);
@@ -3136,8 +3130,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 * someone else already started it. We use <= and not < because the
 	 * first log transaction has an ID of 0.
 	 */
-	ASSERT(root->last_log_commit <= log_transid);
-	root->last_log_commit = log_transid;
+	ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid);
+	btrfs_set_root_last_log_commit(root, log_transid);
 
 out_wake_log_root:
 	mutex_lock(&log_root_tree->log_mutex);
@@ -3211,8 +3205,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
 		}
 	}
 
-	clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
-			  EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
+	extent_io_tree_release(&log->dirty_log_pages);
 	extent_io_tree_release(&log->log_csum_range);
 
 	btrfs_put_root(log);
@@ -3530,7 +3523,7 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
 		last_offset = max(last_offset, curr_end);
 	}
 	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
-	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
 	btrfs_release_path(path);
 	return 0;
 }
@@ -4138,19 +4131,19 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
 
 	btrfs_set_token_timespec_sec(&token, &item->atime,
-				     inode->i_atime.tv_sec);
+				     inode_get_atime_sec(inode));
 	btrfs_set_token_timespec_nsec(&token, &item->atime,
-				      inode->i_atime.tv_nsec);
+				      inode_get_atime_nsec(inode));
 
 	btrfs_set_token_timespec_sec(&token, &item->mtime,
-				     inode->i_mtime.tv_sec);
+				     inode_get_mtime_sec(inode));
 	btrfs_set_token_timespec_nsec(&token, &item->mtime,
-				      inode->i_mtime.tv_nsec);
+				      inode_get_mtime_nsec(inode));
 
 	btrfs_set_token_timespec_sec(&token, &item->ctime,
-				     inode_get_ctime(inode).tv_sec);
+				     inode_get_ctime_sec(inode));
 	btrfs_set_token_timespec_nsec(&token, &item->ctime,
-				      inode_get_ctime(inode).tv_nsec);
+				      inode_get_ctime_nsec(inode));
 
 	/*
 	 * We do not need to set the nbytes field, in fact during a fast fsync
@@ -4488,7 +4481,7 @@ copy_item:
 		dst_index++;
 	}
 
-	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
+	btrfs_mark_buffer_dirty(trans, dst_path->nodes[0]);
 	btrfs_release_path(dst_path);
 out:
 	kfree(ins_data);
@@ -4693,7 +4686,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	write_extent_buffer(leaf, &fi,
 			    btrfs_item_ptr_offset(leaf, path->slots[0]),
 			    sizeof(fi));
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 
 	btrfs_release_path(path);
 
@@ -4921,12 +4914,12 @@ process:
 		set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
 
 		if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
-			spin_lock_irq(&inode->ordered_tree.lock);
+			spin_lock_irq(&inode->ordered_tree_lock);
 			if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
 				set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
 				atomic_inc(&trans->transaction->pending_ordered);
 			}
-			spin_unlock_irq(&inode->ordered_tree.lock);
+			spin_unlock_irq(&inode->ordered_tree_lock);
 		}
 		btrfs_put_ordered_extent(ordered);
 	}
@@ -7204,9 +7197,7 @@ again:
 			 * each subsequent pass.
 			 */
 			if (ret == -ENOENT)
-				ret = btrfs_pin_extent_for_log_replay(trans,
-							log->node->start,
-							log->node->len);
+				ret = btrfs_pin_extent_for_log_replay(trans, log->node);
 			btrfs_put_root(log);
 
 			if (!ret)
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 33606025513d..b4ac2b0cd235 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -223,7 +223,8 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
 }
 
 /*
- * ulist_del - delete one node from ulist
+ * Delete one node from ulist.
+ *
  * @ulist:	ulist to remove node from
  * @val:	value to delete
  * @aux:	aux to delete
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 7c7001f42b14..5be74f9e47eb 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -124,7 +124,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
 		 * An item with that type already exists.
 		 * Extend the item and store the new subid at the end.
 		 */
-		btrfs_extend_item(path, sizeof(subid_le));
+		btrfs_extend_item(trans, path, sizeof(subid_le));
 		eb = path->nodes[0];
 		slot = path->slots[0];
 		offset = btrfs_item_ptr_offset(eb, slot);
@@ -139,7 +139,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
 	ret = 0;
 	subid_le = cpu_to_le64(subid_cpu);
 	write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le));
-	btrfs_mark_buffer_dirty(eb);
+	btrfs_mark_buffer_dirty(trans, eb);
 
 out:
 	btrfs_free_path(path);
@@ -221,7 +221,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
 	move_src = offset + sizeof(subid);
 	move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot));
 	memmove_extent_buffer(eb, move_dst, move_src, move_len);
-	btrfs_truncate_item(path, item_size - sizeof(subid), 1);
+	btrfs_truncate_item(trans, path, item_size - sizeof(subid), 1);
 
 out:
 	btrfs_free_path(path);
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index 744f4f4d4c68..66e2270b0dae 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -487,7 +487,7 @@ static int rollback_verity(struct btrfs_inode *inode)
 	}
 	inode->ro_flags &= ~BTRFS_INODE_RO_VERITY;
 	btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
-	ret = btrfs_update_inode(trans, root, inode);
+	ret = btrfs_update_inode(trans, inode);
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
 		goto out;
@@ -554,7 +554,7 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc,
 	}
 	inode->ro_flags |= BTRFS_INODE_RO_VERITY;
 	btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
-	ret = btrfs_update_inode(trans, root, inode);
+	ret = btrfs_update_inode(trans, inode);
 	if (ret)
 		goto end_trans;
 	ret = del_orphan(trans, inode);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b9ef6f54635c..c87e18827a0a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -35,6 +35,7 @@
 #include "relocation.h"
 #include "scrub.h"
 #include "super.h"
+#include "raid-stripe-tree.h"
 
 #define BTRFS_BLOCK_GROUP_STRIPE_MASK	(BTRFS_BLOCK_GROUP_RAID0 | \
 					 BTRFS_BLOCK_GROUP_RAID10 | \
@@ -357,21 +358,19 @@ struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
 }
 
 /*
- * alloc_fs_devices - allocate struct btrfs_fs_devices
- * @fsid:		if not NULL, copy the UUID to fs_devices::fsid
- * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_fsid
+ * Allocate new btrfs_fs_devices structure identified by a fsid.
+ *
+ * @fsid:    if not NULL, copy the UUID to fs_devices::fsid and to
+ *           fs_devices::metadata_fsid
  *
  * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
  * The returned struct is not linked onto any lists and can be destroyed with
  * kfree() right away.
  */
-static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
-						 const u8 *metadata_fsid)
+static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
 {
 	struct btrfs_fs_devices *fs_devs;
 
-	ASSERT(fsid || !metadata_fsid);
-
 	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
 	if (!fs_devs)
 		return ERR_PTR(-ENOMEM);
@@ -385,8 +384,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
 
 	if (fsid) {
 		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
-		memcpy(fs_devs->metadata_uuid,
-		       metadata_fsid ?: fsid, BTRFS_FSID_SIZE);
+		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
 	}
 
 	return fs_devs;
@@ -457,91 +455,41 @@ static noinline struct btrfs_fs_devices *find_fsid(
 	return NULL;
 }
 
-/*
- * First check if the metadata_uuid is different from the fsid in the given
- * fs_devices. Then check if the given fsid is the same as the metadata_uuid
- * in the fs_devices. If it is, return true; otherwise, return false.
- */
-static inline bool check_fsid_changed(const struct btrfs_fs_devices *fs_devices,
-				      const u8 *fsid)
-{
-	return memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
-		      BTRFS_FSID_SIZE) != 0 &&
-	       memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE) == 0;
-}
-
-static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
-				struct btrfs_super_block *disk_super)
-{
-
-	struct btrfs_fs_devices *fs_devices;
-
-	/*
-	 * Handle scanned device having completed its fsid change but
-	 * belonging to a fs_devices that was created by first scanning
-	 * a device which didn't have its fsid/metadata_uuid changed
-	 * at all and the CHANGING_FSID_V2 flag set.
-	 */
-	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
-		if (!fs_devices->fsid_change)
-			continue;
-
-		if (match_fsid_fs_devices(fs_devices, disk_super->metadata_uuid,
-					  fs_devices->fsid))
-			return fs_devices;
-	}
-
-	/*
-	 * Handle scanned device having completed its fsid change but
-	 * belonging to a fs_devices that was created by a device that
-	 * has an outdated pair of fsid/metadata_uuid and
-	 * CHANGING_FSID_V2 flag set.
-	 */
-	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
-		if (!fs_devices->fsid_change)
-			continue;
-
-		if (check_fsid_changed(fs_devices, disk_super->metadata_uuid))
-			return fs_devices;
-	}
-
-	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
-}
-
-
 static int
 btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
-		      int flush, struct block_device **bdev,
+		      int flush, struct bdev_handle **bdev_handle,
 		      struct btrfs_super_block **disk_super)
 {
+	struct block_device *bdev;
 	int ret;
 
-	*bdev = blkdev_get_by_path(device_path, flags, holder, NULL);
+	*bdev_handle = bdev_open_by_path(device_path, flags, holder, NULL);
 
-	if (IS_ERR(*bdev)) {
-		ret = PTR_ERR(*bdev);
+	if (IS_ERR(*bdev_handle)) {
+		ret = PTR_ERR(*bdev_handle);
 		goto error;
 	}
+	bdev = (*bdev_handle)->bdev;
 
 	if (flush)
-		sync_blockdev(*bdev);
-	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
+		sync_blockdev(bdev);
+	ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE);
 	if (ret) {
-		blkdev_put(*bdev, holder);
+		bdev_release(*bdev_handle);
 		goto error;
 	}
-	invalidate_bdev(*bdev);
-	*disk_super = btrfs_read_dev_super(*bdev);
+	invalidate_bdev(bdev);
+	*disk_super = btrfs_read_dev_super(bdev);
 	if (IS_ERR(*disk_super)) {
 		ret = PTR_ERR(*disk_super);
-		blkdev_put(*bdev, holder);
+		bdev_release(*bdev_handle);
 		goto error;
 	}
 
 	return 0;
 
 error:
-	*bdev = NULL;
+	*bdev_handle = NULL;
 	return ret;
 }
 
@@ -562,13 +510,13 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
 {
 	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
 	struct btrfs_device *device, *tmp_device;
-	int ret = 0;
+	int ret;
+	bool freed = false;
 
 	lockdep_assert_held(&uuid_mutex);
 
-	if (devt)
-		ret = -ENOENT;
-
+	/* Return good status if there is no instance of devt. */
+	ret = 0;
 	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
 
 		mutex_lock(&fs_devices->device_list_mutex);
@@ -579,8 +527,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
 			if (devt && devt != device->devt)
 				continue;
 			if (fs_devices->opened) {
-				/* for an already deleted device return 0 */
-				if (devt && ret != 0)
+				if (devt)
 					ret = -EBUSY;
 				break;
 			}
@@ -590,7 +537,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
 			list_del(&device->dev_list);
 			btrfs_free_device(device);
 
-			ret = 0;
+			freed = true;
 		}
 		mutex_unlock(&fs_devices->device_list_mutex);
 
@@ -601,9 +548,81 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
 		}
 	}
 
+	/* If there is at least one freed device return 0. */
+	if (freed)
+		return 0;
+
 	return ret;
 }
 
+static struct btrfs_fs_devices *find_fsid_by_device(
+					struct btrfs_super_block *disk_super,
+					dev_t devt, bool *same_fsid_diff_dev)
+{
+	struct btrfs_fs_devices *fsid_fs_devices;
+	struct btrfs_fs_devices *devt_fs_devices;
+	const bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
+					BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
+	bool found_by_devt = false;
+
+	/* Look up by fsid, and by metadata_uuid when the incompat flag is set. */
+	fsid_fs_devices = find_fsid(disk_super->fsid,
+		    has_metadata_uuid ? disk_super->metadata_uuid : NULL);
+
+	/* The temp_fsid feature is supported only on single-device filesystems. */
+	if (btrfs_super_num_devices(disk_super) != 1)
+		return fsid_fs_devices;
+
+	/*
+	 * A seed device is an integral component of the sprout device, which
+	 * functions as a multi-device filesystem. So the temp-fsid feature is
+	 * not supported; return the fsid lookup result unchanged.
+	 */
+	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)
+		return fsid_fs_devices;
+
+	/* Try to find an fs_devices that already holds a device with this devt. */
+	list_for_each_entry(devt_fs_devices, &fs_uuids, fs_list) {
+		struct btrfs_device *device;
+
+		list_for_each_entry(device, &devt_fs_devices->devices, dev_list) {
+			if (device->devt == devt) {
+				found_by_devt = true;
+				break;
+			}
+		}
+		if (found_by_devt)
+			break;
+	}
+
+	if (found_by_devt) {
+		/* Existing device: devt_fs_devices is the entry that matched. */
+		if (fsid_fs_devices == NULL) {
+			if (devt_fs_devices->opened == 0) {
+				/* Stale device: the matched fs_devices is not opened. */
+				return NULL;
+			} else {
+				/* temp_fsid is mounting a subvol. */
+				return devt_fs_devices;
+			}
+		} else {
+			/* Regular or temp_fsid device mounting a subvol. */
+			return devt_fs_devices;
+		}
+	} else {
+		/* New device: no registered device has this devt. */
+		if (fsid_fs_devices == NULL) {
+			return NULL;
+		} else {
+			/* sb::fsid is already in use; flag it so the caller creates a temp_fsid. */
+			*same_fsid_diff_dev = true;
+			return NULL;
+		}
+	}
+
+	/* Not reached. */
+}
+
 /*
  * This is only used on mount, and we are protected from competing things
  * messing with our fs_devices by the uuid_mutex, thus we do not need the
@@ -613,7 +632,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 			struct btrfs_device *device, blk_mode_t flags,
 			void *holder)
 {
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	struct btrfs_super_block *disk_super;
 	u64 devid;
 	int ret;
@@ -624,7 +643,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 		return -EINVAL;
 
 	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
-				    &bdev, &disk_super);
+				    &bdev_handle, &disk_super);
 	if (ret)
 		return ret;
 
@@ -648,21 +667,21 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 		fs_devices->seeding = true;
 	} else {
-		if (bdev_read_only(bdev))
+		if (bdev_read_only(bdev_handle->bdev))
 			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 		else
 			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 	}
 
-	if (!bdev_nonrot(bdev))
+	if (!bdev_nonrot(bdev_handle->bdev))
 		fs_devices->rotating = true;
 
-	if (bdev_max_discard_sectors(bdev))
+	if (bdev_max_discard_sectors(bdev_handle->bdev))
 		fs_devices->discardable = true;
 
-	device->bdev = bdev;
+	device->bdev_handle = bdev_handle;
+	device->bdev = bdev_handle->bdev;
 	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
-	device->holder = holder;
 
 	fs_devices->open_devices++;
 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
@@ -676,7 +695,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 
 error_free_page:
 	btrfs_release_disk_super(disk_super);
-	blkdev_put(bdev, holder);
+	bdev_release(bdev_handle);
 
 	return -EINVAL;
 }
@@ -690,84 +709,6 @@ u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb)
 }
 
 /*
- * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
- * being created with a disk that has already completed its fsid change. Such
- * disk can belong to an fs which has its FSID changed or to one which doesn't.
- * Handle both cases here.
- */
-static struct btrfs_fs_devices *find_fsid_inprogress(
-					struct btrfs_super_block *disk_super)
-{
-	struct btrfs_fs_devices *fs_devices;
-
-	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
-		if (fs_devices->fsid_change)
-			continue;
-
-		if (check_fsid_changed(fs_devices,  disk_super->fsid))
-			return fs_devices;
-	}
-
-	return find_fsid(disk_super->fsid, NULL);
-}
-
-static struct btrfs_fs_devices *find_fsid_changed(
-					struct btrfs_super_block *disk_super)
-{
-	struct btrfs_fs_devices *fs_devices;
-
-	/*
-	 * Handles the case where scanned device is part of an fs that had
-	 * multiple successful changes of FSID but currently device didn't
-	 * observe it. Meaning our fsid will be different than theirs. We need
-	 * to handle two subcases :
-	 *  1 - The fs still continues to have different METADATA/FSID uuids.
-	 *  2 - The fs is switched back to its original FSID (METADATA/FSID
-	 *  are equal).
-	 */
-	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
-		/* Changed UUIDs */
-		if (check_fsid_changed(fs_devices, disk_super->metadata_uuid) &&
-		    memcmp(fs_devices->fsid, disk_super->fsid,
-			   BTRFS_FSID_SIZE) != 0)
-			return fs_devices;
-
-		/* Unchanged UUIDs */
-		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
-			   BTRFS_FSID_SIZE) == 0 &&
-		    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
-			   BTRFS_FSID_SIZE) == 0)
-			return fs_devices;
-	}
-
-	return NULL;
-}
-
-static struct btrfs_fs_devices *find_fsid_reverted_metadata(
-				struct btrfs_super_block *disk_super)
-{
-	struct btrfs_fs_devices *fs_devices;
-
-	/*
-	 * Handle the case where the scanned device is part of an fs whose last
-	 * metadata UUID change reverted it to the original FSID. At the same
-	 * time fs_devices was first created by another constituent device
-	 * which didn't fully observe the operation. This results in an
-	 * btrfs_fs_devices created with metadata/fsid different AND
-	 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
-	 * fs_devices equal to the FSID of the disk.
-	 */
-	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
-		if (!fs_devices->fsid_change)
-			continue;
-
-		if (check_fsid_changed(fs_devices, disk_super->fsid))
-			return fs_devices;
-	}
-
-	return NULL;
-}
-/*
  * Add new device to list of registered devices
  *
  * Returns:
@@ -785,10 +726,16 @@ static noinline struct btrfs_device *device_list_add(const char *path,
 	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
 	dev_t path_devt;
 	int error;
+	bool same_fsid_diff_dev = false;
 	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
 		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
-	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
-					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
+
+	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
+		btrfs_err(NULL,
+"device %s has incomplete metadata_uuid change, please use btrfstune to complete",
+			  path);
+		return ERR_PTR(-EAGAIN);
+	}
 
 	error = lookup_bdev(path, &path_devt);
 	if (error) {
@@ -797,27 +744,23 @@ static noinline struct btrfs_device *device_list_add(const char *path,
 		return ERR_PTR(error);
 	}
 
-	if (fsid_change_in_progress) {
-		if (!has_metadata_uuid)
-			fs_devices = find_fsid_inprogress(disk_super);
-		else
-			fs_devices = find_fsid_changed(disk_super);
-	} else if (has_metadata_uuid) {
-		fs_devices = find_fsid_with_metadata_uuid(disk_super);
-	} else {
-		fs_devices = find_fsid_reverted_metadata(disk_super);
-		if (!fs_devices)
-			fs_devices = find_fsid(disk_super->fsid, NULL);
-	}
-
+	fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev);
 
 	if (!fs_devices) {
-		fs_devices = alloc_fs_devices(disk_super->fsid,
-				has_metadata_uuid ? disk_super->metadata_uuid : NULL);
+		fs_devices = alloc_fs_devices(disk_super->fsid);
 		if (IS_ERR(fs_devices))
 			return ERR_CAST(fs_devices);
 
-		fs_devices->fsid_change = fsid_change_in_progress;
+		/* Copy only after the IS_ERR() check: fs_devices may be an ERR_PTR. */
+		if (has_metadata_uuid)
+			memcpy(fs_devices->metadata_uuid,
+			       disk_super->metadata_uuid, BTRFS_FSID_SIZE);
+		if (same_fsid_diff_dev) {
+			generate_random_uuid(fs_devices->fsid);
+			fs_devices->temp_fsid = true;
+			pr_info("BTRFS: device %s using temp-fsid %pU\n",
+				path, fs_devices->fsid);
+		}
 
 		mutex_lock(&fs_devices->device_list_mutex);
 		list_add(&fs_devices->fs_list, &fs_uuids);
@@ -832,18 +775,11 @@ static noinline struct btrfs_device *device_list_add(const char *path,
 		mutex_lock(&fs_devices->device_list_mutex);
 		device = btrfs_find_device(fs_devices, &args);
 
-		/*
-		 * If this disk has been pulled into an fs devices created by
-		 * a device which had the CHANGING_FSID_V2 flag then replace the
-		 * metadata_uuid/fsid values of the fs_devices.
-		 */
-		if (fs_devices->fsid_change &&
-		    found_transid > fs_devices->latest_generation) {
+		if (found_transid > fs_devices->latest_generation) {
 			memcpy(fs_devices->fsid, disk_super->fsid,
 					BTRFS_FSID_SIZE);
 			memcpy(fs_devices->metadata_uuid,
 			       btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE);
-			fs_devices->fsid_change = false;
 		}
 	}
 
@@ -997,7 +933,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 
 	lockdep_assert_held(&uuid_mutex);
 
-	fs_devices = alloc_fs_devices(orig->fsid, NULL);
+	fs_devices = alloc_fs_devices(orig->fsid);
 	if (IS_ERR(fs_devices))
 		return fs_devices;
 
@@ -1068,9 +1004,10 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
 		if (device->devid == BTRFS_DEV_REPLACE_DEVID)
 			continue;
 
-		if (device->bdev) {
-			blkdev_put(device->bdev, device->holder);
+		if (device->bdev_handle) {
+			bdev_release(device->bdev_handle);
 			device->bdev = NULL;
+			device->bdev_handle = NULL;
 			fs_devices->open_devices--;
 		}
 		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
@@ -1115,7 +1052,7 @@ static void btrfs_close_bdev(struct btrfs_device *device)
 		invalidate_bdev(device->bdev);
 	}
 
-	blkdev_put(device->bdev, device->holder);
+	bdev_release(device->bdev_handle);
 }
 
 static void btrfs_close_one_device(struct btrfs_device *device)
@@ -1356,14 +1293,19 @@ int btrfs_forget_devices(dev_t devt)
 /*
  * Look for a btrfs signature on a device. This may be called out of the mount path
  * and we are not allowed to call set_blocksize during the scan. The superblock
- * is read via pagecache
+ * is read via pagecache.
+ *
+ * When @mount_arg_dev is true this is a scan at mount time, which always
+ * registers the device or returns an error. Multi-device and seeding devices
+ * are registered in both cases.
  */
-struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
+struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
+					   bool mount_arg_dev)
 {
 	struct btrfs_super_block *disk_super;
 	bool new_device_added = false;
 	struct btrfs_device *device = NULL;
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	u64 bytenr, bytenr_orig;
 	int ret;
 
@@ -1386,31 +1328,49 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
 	 * values temporarily, as the device paths of the fsid are the only
 	 * required information for assembling the volume.
 	 */
-	bdev = blkdev_get_by_path(path, flags, NULL, NULL);
-	if (IS_ERR(bdev))
-		return ERR_CAST(bdev);
+	bdev_handle = bdev_open_by_path(path, flags, NULL, NULL);
+	if (IS_ERR(bdev_handle))
+		return ERR_CAST(bdev_handle);
 
 	bytenr_orig = btrfs_sb_offset(0);
-	ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
+	ret = btrfs_sb_log_location_bdev(bdev_handle->bdev, 0, READ, &bytenr);
 	if (ret) {
 		device = ERR_PTR(ret);
 		goto error_bdev_put;
 	}
 
-	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
+	disk_super = btrfs_read_disk_super(bdev_handle->bdev, bytenr,
+					   bytenr_orig);
 	if (IS_ERR(disk_super)) {
 		device = ERR_CAST(disk_super);
 		goto error_bdev_put;
 	}
 
+	if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
+	    !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) {
+		dev_t devt;
+
+		ret = lookup_bdev(path, &devt);
+		if (ret)
+			btrfs_warn(NULL, "lookup bdev failed for path %s: %d",
+				   path, ret);
+		else
+			btrfs_free_stale_devices(devt, NULL);
+
+		pr_debug("BTRFS: skip registering single non-seed device %s\n", path);
+		device = NULL;
+		goto free_disk_super;
+	}
+
 	device = device_list_add(path, disk_super, &new_device_added);
 	if (!IS_ERR(device) && new_device_added)
 		btrfs_free_stale_devices(device->devt, device);
 
+free_disk_super:
 	btrfs_release_disk_super(disk_super);
 
 error_bdev_put:
-	blkdev_put(bdev, NULL);
+	bdev_release(bdev_handle);
 
 	return device;
 }
@@ -1894,7 +1854,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
 	ptr = btrfs_device_fsid(dev_item);
 	write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
 			    ptr, BTRFS_FSID_SIZE);
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 
 	ret = 0;
 out:
@@ -2087,7 +2047,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
 
 int btrfs_rm_device(struct btrfs_fs_info *fs_info,
 		    struct btrfs_dev_lookup_args *args,
-		    struct block_device **bdev, void **holder)
+		    struct bdev_handle **bdev_handle)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_device *device;
@@ -2196,7 +2156,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
 
 	btrfs_assign_next_active_device(device, NULL);
 
-	if (device->bdev) {
+	if (device->bdev_handle) {
 		cur_devices->open_devices--;
 		/* remove sysfs entry */
 		btrfs_sysfs_remove_device(device);
@@ -2212,9 +2172,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
 	 * free the device.
 	 *
 	 * We cannot call btrfs_close_bdev() here because we're holding the sb
-	 * write lock, and blkdev_put() will pull in the ->open_mutex on the
-	 * block device and it's dependencies.  Instead just flush the device
-	 * and let the caller do the final blkdev_put.
+	 * write lock, and bdev_release() will pull in the ->open_mutex on
+	 * the block device and its dependencies.  Instead just flush the
+	 * device and let the caller do the final bdev_release.
 	 */
 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
 		btrfs_scratch_superblocks(fs_info, device->bdev,
@@ -2225,8 +2185,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
 		}
 	}
 
-	*bdev = device->bdev;
-	*holder = device->holder;
+	*bdev_handle = device->bdev_handle;
 	synchronize_rcu();
 	btrfs_free_device(device);
 
@@ -2363,7 +2322,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
 				 const char *path)
 {
 	struct btrfs_super_block *disk_super;
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	int ret;
 
 	if (!path || !path[0])
@@ -2381,7 +2340,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
 	}
 
 	ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0,
-				    &bdev, &disk_super);
+				    &bdev_handle, &disk_super);
 	if (ret) {
 		btrfs_put_dev_args_from_path(args);
 		return ret;
@@ -2394,7 +2353,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
 	else
 		memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
 	btrfs_release_disk_super(disk_super);
-	blkdev_put(bdev, NULL);
+	bdev_release(bdev_handle);
 	return 0;
 }
 
@@ -2451,7 +2410,7 @@ static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
 	 * Private copy of the seed devices, anchored at
 	 * fs_info->fs_devices->seed_list
 	 */
-	seed_devices = alloc_fs_devices(NULL, NULL);
+	seed_devices = alloc_fs_devices(NULL);
 	if (IS_ERR(seed_devices))
 		return seed_devices;
 
@@ -2597,7 +2556,7 @@ next_slot:
 		if (device->fs_devices->seeding) {
 			btrfs_set_device_generation(leaf, dev_item,
 						    device->generation);
-			btrfs_mark_buffer_dirty(leaf);
+			btrfs_mark_buffer_dirty(trans, leaf);
 		}
 
 		path->slots[0]++;
@@ -2614,7 +2573,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	struct btrfs_root *root = fs_info->dev_root;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_device *device;
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	struct super_block *sb = fs_info->sb;
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	struct btrfs_fs_devices *seed_devices = NULL;
@@ -2627,12 +2586,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	if (sb_rdonly(sb) && !fs_devices->seeding)
 		return -EROFS;
 
-	bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE,
-				  fs_info->bdev_holder, NULL);
-	if (IS_ERR(bdev))
-		return PTR_ERR(bdev);
+	bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE,
+					fs_info->bdev_holder, NULL);
+	if (IS_ERR(bdev_handle))
+		return PTR_ERR(bdev_handle);
 
-	if (!btrfs_check_device_zone_type(fs_info, bdev)) {
+	if (!btrfs_check_device_zone_type(fs_info, bdev_handle->bdev)) {
 		ret = -EINVAL;
 		goto error;
 	}
@@ -2644,11 +2603,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 		locked = true;
 	}
 
-	sync_blockdev(bdev);
+	sync_blockdev(bdev_handle->bdev);
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
-		if (device->bdev == bdev) {
+		if (device->bdev == bdev_handle->bdev) {
 			ret = -EEXIST;
 			rcu_read_unlock();
 			goto error;
@@ -2664,7 +2623,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	}
 
 	device->fs_info = fs_info;
-	device->bdev = bdev;
+	device->bdev_handle = bdev_handle;
+	device->bdev = bdev_handle->bdev;
 	ret = lookup_bdev(device_path, &device->devt);
 	if (ret)
 		goto error_free_device;
@@ -2685,12 +2645,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	device->io_align = fs_info->sectorsize;
 	device->sector_size = fs_info->sectorsize;
 	device->total_bytes =
-		round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
+		round_down(bdev_nr_bytes(device->bdev), fs_info->sectorsize);
 	device->disk_total_bytes = device->total_bytes;
 	device->commit_total_bytes = device->total_bytes;
 	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
-	device->holder = fs_info->bdev_holder;
 	device->dev_stats_valid = 1;
 	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
 
@@ -2726,7 +2685,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 
 	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
 
-	if (!bdev_nonrot(bdev))
+	if (!bdev_nonrot(device->bdev))
 		fs_devices->rotating = true;
 
 	orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
@@ -2848,7 +2807,7 @@ error_free_zone:
 error_free_device:
 	btrfs_free_device(device);
 error:
-	blkdev_put(bdev, fs_info->bdev_holder);
+	bdev_release(bdev_handle);
 	if (locked) {
 		mutex_unlock(&uuid_mutex);
 		up_write(&sb->s_umount);
@@ -2895,7 +2854,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
 				     btrfs_device_get_disk_total_bytes(device));
 	btrfs_set_device_bytes_used(leaf, dev_item,
 				    btrfs_device_get_bytes_used(device));
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 
 out:
 	btrfs_free_path(path);
@@ -2929,6 +2888,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
 	btrfs_set_super_total_bytes(super_copy,
 			round_down(old_total + diff, fs_info->sectorsize));
 	device->fs_devices->total_rw_bytes += diff;
+	atomic64_add(diff, &fs_info->free_chunk_space);
 
 	btrfs_device_set_total_bytes(device, new_size);
 	btrfs_device_set_disk_total_bytes(device, new_size);
@@ -3027,7 +2987,8 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
 }
 
 /*
- * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
+ * Find the mapping containing the given logical extent.
+ *
  * @logical: Logical block offset in bytes.
  * @length: Length of extent in bytes.
  *
@@ -3483,7 +3444,7 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info,
 
 	btrfs_set_balance_flags(leaf, item, bctl->flags);
 
-	btrfs_mark_buffer_dirty(leaf);
+	btrfs_mark_buffer_dirty(trans, leaf);
 out:
 	btrfs_free_path(path);
 	err = btrfs_commit_transaction(trans);
@@ -4838,6 +4799,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	u64 old_size = btrfs_device_get_total_bytes(device);
 	u64 diff;
 	u64 start;
+	u64 free_diff = 0;
 
 	new_size = round_down(new_size, fs_info->sectorsize);
 	start = new_size;
@@ -4863,7 +4825,19 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	btrfs_device_set_total_bytes(device, new_size);
 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
 		device->fs_devices->total_rw_bytes -= diff;
-		atomic64_sub(diff, &fs_info->free_chunk_space);
+
+		/*
+		 * The new free_chunk_space is new_size - used, so we have to
+		 * subtract the delta of the old free_chunk_space which included
+		 * old_size - used.  If used >= new_size then just subtract this
+		 * entire device's free space.
+		 */
+		if (device->bytes_used < new_size)
+			free_diff = (old_size - device->bytes_used) -
+				    (new_size - device->bytes_used);
+		else
+			free_diff = old_size - device->bytes_used;
+		atomic64_sub(free_diff, &fs_info->free_chunk_space);
 	}
 
 	/*
@@ -4998,9 +4972,10 @@ done:
 	if (ret) {
 		mutex_lock(&fs_info->chunk_mutex);
 		btrfs_device_set_total_bytes(device, old_size);
-		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
+		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
 			device->fs_devices->total_rw_bytes += diff;
-		atomic64_add(diff, &fs_info->free_chunk_space);
+			atomic64_add(free_diff, &fs_info->free_chunk_space);
+		}
 		mutex_unlock(&fs_info->chunk_mutex);
 	}
 	return ret;
@@ -5880,6 +5855,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
 }
 
 static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+						       u64 logical,
 						       u16 total_stripes)
 {
 	struct btrfs_io_context *bioc;
@@ -5899,6 +5875,7 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
 	bioc->fs_info = fs_info;
 	bioc->replace_stripe_src = -1;
 	bioc->full_stripe_logical = (u64)-1;
+	bioc->logical = logical;
 
 	return bioc;
 }
@@ -6203,12 +6180,20 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
 	return U64_MAX;
 }
 
-static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map,
-			  u32 stripe_index, u64 stripe_offset, u32 stripe_nr)
+static int set_io_stripe(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
+			 u64 logical, u64 *length, struct btrfs_io_stripe *dst,
+			 struct map_lookup *map, u32 stripe_index,
+			 u64 stripe_offset, u64 stripe_nr)
 {
 	dst->dev = map->stripes[stripe_index].dev;
+
+	if (op == BTRFS_MAP_READ && btrfs_need_stripe_tree_update(fs_info, map->type))
+		return btrfs_get_raid_extent_offset(fs_info, logical, length,
+						    map->type, stripe_index, dst);
+
 	dst->physical = map->stripes[stripe_index].physical +
 			stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
+	return 0;
 }
 
 /*
@@ -6245,16 +6230,11 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *
  *			For RAID6 profile, mirror > 2 means mark another
  *			data/P stripe error and rebuild from the remaining
  *			stripes..
- *
- * @need_raid_map:	(Used only for integrity checker) whether the map wants
- *                      a full stripe map (including all data and P/Q stripes)
- *                      for RAID56. Should always be 1 except integrity checker.
  */
 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 		    u64 logical, u64 *length,
 		    struct btrfs_io_context **bioc_ret,
-		    struct btrfs_io_stripe *smap, int *mirror_num_ret,
-		    int need_raid_map)
+		    struct btrfs_io_stripe *smap, int *mirror_num_ret)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
@@ -6349,8 +6329,10 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 		}
 
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-		if (need_raid_map && (op != BTRFS_MAP_READ || mirror_num > 1)) {
+		if (op != BTRFS_MAP_READ || mirror_num > 1) {
 			/*
+			 * Needs full stripe mapping.
+			 *
 			 * Push stripe_nr back to the start of the full stripe
 			 * For those cases needing a full stripe, @stripe_nr
 			 * is the full stripe number.
@@ -6373,19 +6355,14 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 			stripe_index = 0;
 			stripe_offset = 0;
 		} else {
-			/*
-			 * Mirror #0 or #1 means the original data block.
-			 * Mirror #2 is RAID5 parity block.
-			 * Mirror #3 is RAID6 Q block.
-			 */
+			ASSERT(mirror_num <= 1);
+			/* Just grab the data stripe directly. */
 			stripe_index = stripe_nr % data_stripes;
 			stripe_nr /= data_stripes;
-			if (mirror_num > 1)
-				stripe_index = data_stripes + mirror_num - 2;
 
 			/* We distribute the parity blocks across stripes */
 			stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
-			if (op == BTRFS_MAP_READ && mirror_num <= 1)
+			if (op == BTRFS_MAP_READ && mirror_num < 1)
 				mirror_num = 1;
 		}
 	} else {
@@ -6424,16 +6401,18 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	 * I/O context structure.
 	 */
 	if (smap && num_alloc_stripes == 1 &&
+	    !(btrfs_need_stripe_tree_update(fs_info, map->type) &&
+	      op != BTRFS_MAP_READ) &&
 	    !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) {
-		set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr);
+		ret = set_io_stripe(fs_info, op, logical, length, smap, map,
+				    stripe_index, stripe_offset, stripe_nr);
 		if (mirror_num_ret)
 			*mirror_num_ret = mirror_num;
 		*bioc_ret = NULL;
-		ret = 0;
 		goto out;
 	}
 
-	bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes);
+	bioc = alloc_btrfs_io_context(fs_info, logical, num_alloc_stripes);
 	if (!bioc) {
 		ret = -ENOMEM;
 		goto out;
@@ -6447,7 +6426,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	 *
 	 * It's still mostly the same as other profiles, just with extra rotation.
 	 */
-	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
+	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
 	    (op != BTRFS_MAP_READ || mirror_num > 1)) {
 		/*
 		 * For RAID56 @stripe_nr is already the number of full stripes
@@ -6459,22 +6438,35 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 		 */
 		bioc->full_stripe_logical = em->start +
 			btrfs_stripe_nr_to_offset(stripe_nr * data_stripes);
-		for (i = 0; i < num_stripes; i++)
-			set_io_stripe(&bioc->stripes[i], map,
-				      (i + stripe_nr) % num_stripes,
-				      stripe_offset, stripe_nr);
+		for (int i = 0; i < num_stripes; i++) {
+			ret = set_io_stripe(fs_info, op, logical, length,
+					    &bioc->stripes[i], map,
+					    (i + stripe_nr) % num_stripes,
+					    stripe_offset, stripe_nr);
+			if (ret < 0)
+				break;
+		}
 	} else {
 		/*
 		 * For all other non-RAID56 profiles, just copy the target
 		 * stripe into the bioc.
 		 */
 		for (i = 0; i < num_stripes; i++) {
-			set_io_stripe(&bioc->stripes[i], map, stripe_index,
-				      stripe_offset, stripe_nr);
+			ret = set_io_stripe(fs_info, op, logical, length,
+					    &bioc->stripes[i], map, stripe_index,
+					    stripe_offset, stripe_nr);
+			if (ret < 0)
+				break;
 			stripe_index++;
 		}
 	}
 
+	if (ret) {
+		*bioc_ret = NULL;
+		btrfs_put_bioc(bioc);
+		goto out;
+	}
+
 	if (op != BTRFS_MAP_READ)
 		max_errors = btrfs_chunk_max_errors(map);
 
@@ -6901,7 +6893,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
 		if (!btrfs_test_opt(fs_info, DEGRADED))
 			return ERR_PTR(-ENOENT);
 
-		fs_devices = alloc_fs_devices(fsid, NULL);
+		fs_devices = alloc_fs_devices(fsid);
 		if (IS_ERR(fs_devices))
 			return fs_devices;
 
@@ -7534,7 +7526,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
 	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
 		btrfs_set_dev_stats_value(eb, ptr, i,
 					  btrfs_dev_stat_read(device, i));
-	btrfs_mark_buffer_dirty(eb);
+	btrfs_mark_buffer_dirty(trans, eb);
 
 out:
 	btrfs_free_path(path);
@@ -8076,7 +8068,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
 	ASSERT(mirror_num > 0);
 
 	ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
-			      &bioc, smap, &mirror_ret, true);
+			      &bioc, smap, &mirror_ret);
 	if (ret < 0)
 		return ret;
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2128a032c3b7..9cc374864a79 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -90,13 +90,11 @@ struct btrfs_device {
 
 	u64 generation;
 
+	struct bdev_handle *bdev_handle;
 	struct block_device *bdev;
 
 	struct btrfs_zoned_device_info *zone_info;
 
-	/* block device holder for blkdev_get/put */
-	void *holder;
-
 	/*
 	 * Device's major-minor number. Must be set even if the device is not
 	 * opened (bdev == NULL), unless the device is missing.
@@ -290,6 +288,19 @@ struct btrfs_fs_devices {
 	 * - Following shall be true at all times:
 	 *   - metadata_uuid == btrfs_header::fsid
 	 *   - metadata_uuid == btrfs_dev_item::fsid
+	 *
+	 * - Relations between fsid and metadata_uuid in sb and fs_devices:
+	 *   - Normal:
+	 *       fs_devices->fsid == fs_devices->metadata_uuid == sb->fsid
+	 *       sb->metadata_uuid == 0
+	 *
+	 *   - When the BTRFS_FEATURE_INCOMPAT_METADATA_UUID flag is set:
+	 *       fs_devices->fsid == sb->fsid
+	 *       fs_devices->metadata_uuid == sb->metadata_uuid
+	 *
+	 *   - When in-memory fs_devices->temp_fsid is true:
+	 *       fs_devices->fsid is randomly generated
+	 *       fs_devices->metadata_uuid == sb->fsid
 	 */
 	u8 metadata_uuid[BTRFS_FSID_SIZE];
 
@@ -353,9 +364,10 @@ struct btrfs_fs_devices {
 	bool rotating;
 	/* Devices support TRIM/discard commands. */
 	bool discardable;
-	bool fsid_change;
 	/* The filesystem is a seed filesystem. */
 	bool seeding;
+	/* The mount needs to use a randomly generated fsid. */
+	bool temp_fsid;
 
 	struct btrfs_fs_info *fs_info;
 	/* sysfs kobjects */
@@ -381,12 +393,12 @@ struct btrfs_fs_devices {
 
 struct btrfs_io_stripe {
 	struct btrfs_device *dev;
-	union {
-		/* Block mapping */
-		u64 physical;
-		/* For the endio handler */
-		struct btrfs_io_context *bioc;
-	};
+	/* Block mapping. */
+	u64 physical;
+	u64 length;
+	bool is_scrub;
+	/* For the endio handler. */
+	struct btrfs_io_context *bioc;
 };
 
 struct btrfs_discard_stripe {
@@ -419,6 +431,11 @@ struct btrfs_io_context {
 	atomic_t error;
 	u16 max_errors;
 
+	u64 logical;
+	u64 size;
+	/* Raid stripe tree ordered entry. */
+	struct list_head rst_ordered_entry;
+
 	/*
 	 * The total number of stripes, including the extra duplicated
 	 * stripe for replace.
@@ -596,8 +613,7 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc);
 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 		    u64 logical, u64 *length,
 		    struct btrfs_io_context **bioc_ret,
-		    struct btrfs_io_stripe *smap, int *mirror_num_ret,
-		    int need_raid_map);
+		    struct btrfs_io_stripe *smap, int *mirror_num_ret);
 int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
 			   struct btrfs_io_stripe *smap, u64 logical,
 			   u32 length, int mirror_num);
@@ -611,7 +627,8 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
 void btrfs_mapping_tree_free(struct extent_map_tree *tree);
 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		       blk_mode_t flags, void *holder);
-struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags);
+struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
+					   bool mount_arg_dev);
 int btrfs_forget_devices(dev_t devt);
 void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices);
@@ -629,7 +646,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
 void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args);
 int btrfs_rm_device(struct btrfs_fs_info *fs_info,
 		    struct btrfs_dev_lookup_args *args,
-		    struct block_device **bdev, void **holder);
+		    struct bdev_handle **bdev_handle);
 void __exit btrfs_cleanup_fs_uuids(void);
 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
 int btrfs_grow_device(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 96828a13dd43..3cf236fb40a4 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -188,15 +188,15 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
 		if (old_data_len + name_len + sizeof(*di) == item_size) {
 			/* No other xattrs packed in the same leaf item. */
 			if (size > old_data_len)
-				btrfs_extend_item(path, size - old_data_len);
+				btrfs_extend_item(trans, path, size - old_data_len);
 			else if (size < old_data_len)
-				btrfs_truncate_item(path, data_size, 1);
+				btrfs_truncate_item(trans, path, data_size, 1);
 		} else {
 			/* There are other xattrs packed in the same item. */
 			ret = btrfs_delete_one_dir_name(trans, root, path, di);
 			if (ret)
 				goto out;
-			btrfs_extend_item(path, data_size);
+			btrfs_extend_item(trans, path, data_size);
 		}
 
 		ptr = btrfs_item_ptr(leaf, slot, char);
@@ -205,7 +205,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
 		btrfs_set_dir_data_len(leaf, di, size);
 		data_ptr = ((unsigned long)(di + 1)) + name_len;
 		write_extent_buffer(leaf, value, data_ptr, size);
-		btrfs_mark_buffer_dirty(leaf);
+		btrfs_mark_buffer_dirty(trans, leaf);
 	} else {
 		/*
 		 * Insert, and we had space for the xattr, so path->slots[0] is
@@ -265,7 +265,7 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name,
 
 	inode_inc_iversion(inode);
 	inode_set_ctime_current(inode);
-	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+	ret = btrfs_update_inode(trans, BTRFS_I(inode));
 	if (ret)
 		btrfs_abort_transaction(trans, ret);
 out:
@@ -408,7 +408,7 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
 	if (!ret) {
 		inode_inc_iversion(inode);
 		inode_set_ctime_current(inode);
-		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+		ret = btrfs_update_inode(trans, BTRFS_I(inode));
 		if (ret)
 			btrfs_abort_transaction(trans, ret);
 	}
@@ -442,7 +442,7 @@ static const struct xattr_handler btrfs_btrfs_xattr_handler = {
 	.set = btrfs_xattr_handler_set_prop,
 };
 
-const struct xattr_handler *btrfs_xattr_handlers[] = {
+const struct xattr_handler * const btrfs_xattr_handlers[] = {
 	&btrfs_security_xattr_handler,
 	&btrfs_trusted_xattr_handler,
 	&btrfs_user_xattr_handler,
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 1cd3fc0a8f17..118118ca3e1d 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -8,7 +8,7 @@
 
 #include <linux/xattr.h>
 
-extern const struct xattr_handler *btrfs_xattr_handlers[];
+extern const struct xattr_handler * const btrfs_xattr_handlers[];
 
 int btrfs_getxattr(struct inode *inode, const char *name,
 		void *buffer, size_t size);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 09bc325d075d..3504ade30cb0 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1282,21 +1282,284 @@ out:
 	return ret;
 }
 
+struct zone_info {
+	u64 physical;
+	u64 capacity;
+	u64 alloc_offset;
+};
+
+static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
+				struct zone_info *info, unsigned long *active,
+				struct map_lookup *map)
+{
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	struct btrfs_device *device = map->stripes[zone_idx].dev;
+	int dev_replace_is_ongoing = 0;
+	unsigned int nofs_flag;
+	struct blk_zone zone;
+	int ret;
+
+	info->physical = map->stripes[zone_idx].physical;
+
+	if (!device->bdev) {
+		info->alloc_offset = WP_MISSING_DEV;
+		return 0;
+	}
+
+	/* Consider a zone as active if we can allow any number of active zones. */
+	if (!device->zone_info->max_active_zones)
+		__set_bit(zone_idx, active);
+
+	if (!btrfs_dev_is_sequential(device, info->physical)) {
+		info->alloc_offset = WP_CONVENTIONAL;
+		return 0;
+	}
+
+	/* This zone will be used for allocation, so mark this zone non-empty. */
+	btrfs_dev_clear_zone_empty(device, info->physical);
+
+	down_read(&dev_replace->rwsem);
+	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
+		btrfs_dev_clear_zone_empty(dev_replace->tgtdev, info->physical);
+	up_read(&dev_replace->rwsem);
+
+	/*
+	 * The group is mapped to a sequential zone. Get the zone write pointer
+	 * to determine the allocation offset within the zone.
+	 */
+	WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size));
+	nofs_flag = memalloc_nofs_save();
+	ret = btrfs_get_dev_zone(device, info->physical, &zone);
+	memalloc_nofs_restore(nofs_flag);
+	if (ret) {
+		if (ret != -EIO && ret != -EOPNOTSUPP)
+			return ret;
+		info->alloc_offset = WP_MISSING_DEV;
+		return 0;
+	}
+
+	if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
+		btrfs_err_in_rcu(fs_info,
+		"zoned: unexpected conventional zone %llu on device %s (devid %llu)",
+			zone.start << SECTOR_SHIFT, rcu_str_deref(device->name),
+			device->devid);
+		return -EIO;
+	}
+
+	info->capacity = (zone.capacity << SECTOR_SHIFT);
+
+	switch (zone.cond) {
+	case BLK_ZONE_COND_OFFLINE:
+	case BLK_ZONE_COND_READONLY:
+		btrfs_err(fs_info,
+		"zoned: offline/readonly zone %llu on device %s (devid %llu)",
+			  (info->physical >> device->zone_info->zone_size_shift),
+			  rcu_str_deref(device->name), device->devid);
+		info->alloc_offset = WP_MISSING_DEV;
+		break;
+	case BLK_ZONE_COND_EMPTY:
+		info->alloc_offset = 0;
+		break;
+	case BLK_ZONE_COND_FULL:
+		info->alloc_offset = info->capacity;
+		break;
+	default:
+		/* Partially used zone. */
+		info->alloc_offset = ((zone.wp - zone.start) << SECTOR_SHIFT);
+		__set_bit(zone_idx, active);
+		break;
+	}
+
+	return 0;
+}
+
+static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
+					 struct zone_info *info,
+					 unsigned long *active)
+{
+	if (info->alloc_offset == WP_MISSING_DEV) {
+		btrfs_err(bg->fs_info,
+			"zoned: cannot recover write pointer for zone %llu",
+			info->physical);
+		return -EIO;
+	}
+
+	bg->alloc_offset = info->alloc_offset;
+	bg->zone_capacity = info->capacity;
+	if (test_bit(0, active))
+		set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+	return 0;
+}
+
+static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
+				      struct map_lookup *map,
+				      struct zone_info *zone_info,
+				      unsigned long *active)
+{
+	struct btrfs_fs_info *fs_info = bg->fs_info;
+
+	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+		btrfs_err(fs_info, "zoned: data DUP profile needs raid-stripe-tree");
+		return -EINVAL;
+	}
+
+	if (zone_info[0].alloc_offset == WP_MISSING_DEV) {
+		btrfs_err(bg->fs_info,
+			  "zoned: cannot recover write pointer for zone %llu",
+			  zone_info[0].physical);
+		return -EIO;
+	}
+	if (zone_info[1].alloc_offset == WP_MISSING_DEV) {
+		btrfs_err(bg->fs_info,
+			  "zoned: cannot recover write pointer for zone %llu",
+			  zone_info[1].physical);
+		return -EIO;
+	}
+	if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) {
+		btrfs_err(bg->fs_info,
+			  "zoned: write pointer offset mismatch of zones in DUP profile");
+		return -EIO;
+	}
+
+	if (test_bit(0, active) != test_bit(1, active)) {
+		if (!btrfs_zone_activate(bg))
+			return -EIO;
+	} else if (test_bit(0, active)) {
+		set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+	}
+
+	bg->alloc_offset = zone_info[0].alloc_offset;
+	bg->zone_capacity = min(zone_info[0].capacity, zone_info[1].capacity);
+	return 0;
+}
+
+static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
+					struct map_lookup *map,
+					struct zone_info *zone_info,
+					unsigned long *active)
+{
+	struct btrfs_fs_info *fs_info = bg->fs_info;
+	int i;
+
+	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+			  btrfs_bg_type_to_raid_name(map->type));
+		return -EINVAL;
+	}
+
+	for (i = 0; i < map->num_stripes; i++) {
+		if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+		    zone_info[i].alloc_offset == WP_CONVENTIONAL)
+			continue;
+
+		if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
+		    !btrfs_test_opt(fs_info, DEGRADED)) {
+			btrfs_err(fs_info,
+			"zoned: write pointer offset mismatch of zones in %s profile",
+				  btrfs_bg_type_to_raid_name(map->type));
+			return -EIO;
+		}
+		if (test_bit(0, active) != test_bit(i, active)) {
+			if (!btrfs_test_opt(fs_info, DEGRADED) &&
+			    !btrfs_zone_activate(bg)) {
+				return -EIO;
+			}
+		} else {
+			if (test_bit(0, active))
+				set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+		}
+		/* In case a device is missing we have a cap of 0, so don't use it. */
+		bg->zone_capacity = min_not_zero(zone_info[0].capacity,
+						 zone_info[1].capacity);
+	}
+
+	if (zone_info[0].alloc_offset != WP_MISSING_DEV)
+		bg->alloc_offset = zone_info[0].alloc_offset;
+	else
+		bg->alloc_offset = zone_info[i - 1].alloc_offset;
+
+	return 0;
+}
+
+static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
+					struct map_lookup *map,
+					struct zone_info *zone_info,
+					unsigned long *active)
+{
+	struct btrfs_fs_info *fs_info = bg->fs_info;
+
+	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+			  btrfs_bg_type_to_raid_name(map->type));
+		return -EINVAL;
+	}
+
+	for (int i = 0; i < map->num_stripes; i++) {
+		if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+		    zone_info[i].alloc_offset == WP_CONVENTIONAL)
+			continue;
+
+		if (test_bit(0, active) != test_bit(i, active)) {
+			if (!btrfs_zone_activate(bg))
+				return -EIO;
+		} else {
+			if (test_bit(0, active))
+				set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+		}
+		bg->zone_capacity += zone_info[i].capacity;
+		bg->alloc_offset += zone_info[i].alloc_offset;
+	}
+
+	return 0;
+}
+
+static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
+					 struct map_lookup *map,
+					 struct zone_info *zone_info,
+					 unsigned long *active)
+{
+	struct btrfs_fs_info *fs_info = bg->fs_info;
+
+	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+			  btrfs_bg_type_to_raid_name(map->type));
+		return -EINVAL;
+	}
+
+	for (int i = 0; i < map->num_stripes; i++) {
+		if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+		    zone_info[i].alloc_offset == WP_CONVENTIONAL)
+			continue;
+
+		if (test_bit(0, active) != test_bit(i, active)) {
+			if (!btrfs_zone_activate(bg))
+				return -EIO;
+		} else {
+			if (test_bit(0, active))
+				set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+		}
+
+		if ((i % map->sub_stripes) == 0) {
+			bg->zone_capacity += zone_info[i].capacity;
+			bg->alloc_offset += zone_info[i].alloc_offset;
+		}
+	}
+
+	return 0;
+}
+
 int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
 {
 	struct btrfs_fs_info *fs_info = cache->fs_info;
 	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
 	struct extent_map *em;
 	struct map_lookup *map;
-	struct btrfs_device *device;
 	u64 logical = cache->start;
 	u64 length = cache->length;
+	struct zone_info *zone_info = NULL;
 	int ret;
 	int i;
-	unsigned int nofs_flag;
-	u64 *alloc_offsets = NULL;
-	u64 *caps = NULL;
-	u64 *physical = NULL;
 	unsigned long *active = NULL;
 	u64 last_alloc = 0;
 	u32 num_sequential = 0, num_conventional = 0;
@@ -1328,20 +1591,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
 		goto out;
 	}
 
-	alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
-	if (!alloc_offsets) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS);
-	if (!caps) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS);
-	if (!physical) {
+	zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS);
+	if (!zone_info) {
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -1353,98 +1604,14 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
 	}
 
 	for (i = 0; i < map->num_stripes; i++) {
-		bool is_sequential;
-		struct blk_zone zone;
-		struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
-		int dev_replace_is_ongoing = 0;
-
-		device = map->stripes[i].dev;
-		physical[i] = map->stripes[i].physical;
-
-		if (device->bdev == NULL) {
-			alloc_offsets[i] = WP_MISSING_DEV;
-			continue;
-		}
-
-		is_sequential = btrfs_dev_is_sequential(device, physical[i]);
-		if (is_sequential)
-			num_sequential++;
-		else
-			num_conventional++;
-
-		/*
-		 * Consider a zone as active if we can allow any number of
-		 * active zones.
-		 */
-		if (!device->zone_info->max_active_zones)
-			__set_bit(i, active);
-
-		if (!is_sequential) {
-			alloc_offsets[i] = WP_CONVENTIONAL;
-			continue;
-		}
-
-		/*
-		 * This zone will be used for allocation, so mark this zone
-		 * non-empty.
-		 */
-		btrfs_dev_clear_zone_empty(device, physical[i]);
-
-		down_read(&dev_replace->rwsem);
-		dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
-		if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
-			btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]);
-		up_read(&dev_replace->rwsem);
-
-		/*
-		 * The group is mapped to a sequential zone. Get the zone write
-		 * pointer to determine the allocation offset within the zone.
-		 */
-		WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size));
-		nofs_flag = memalloc_nofs_save();
-		ret = btrfs_get_dev_zone(device, physical[i], &zone);
-		memalloc_nofs_restore(nofs_flag);
-		if (ret == -EIO || ret == -EOPNOTSUPP) {
-			ret = 0;
-			alloc_offsets[i] = WP_MISSING_DEV;
-			continue;
-		} else if (ret) {
-			goto out;
-		}
-
-		if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
-			btrfs_err_in_rcu(fs_info,
-	"zoned: unexpected conventional zone %llu on device %s (devid %llu)",
-				zone.start << SECTOR_SHIFT,
-				rcu_str_deref(device->name), device->devid);
-			ret = -EIO;
+		ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map);
+		if (ret)
 			goto out;
-		}
 
-		caps[i] = (zone.capacity << SECTOR_SHIFT);
-
-		switch (zone.cond) {
-		case BLK_ZONE_COND_OFFLINE:
-		case BLK_ZONE_COND_READONLY:
-			btrfs_err(fs_info,
-		"zoned: offline/readonly zone %llu on device %s (devid %llu)",
-				  physical[i] >> device->zone_info->zone_size_shift,
-				  rcu_str_deref(device->name), device->devid);
-			alloc_offsets[i] = WP_MISSING_DEV;
-			break;
-		case BLK_ZONE_COND_EMPTY:
-			alloc_offsets[i] = 0;
-			break;
-		case BLK_ZONE_COND_FULL:
-			alloc_offsets[i] = caps[i];
-			break;
-		default:
-			/* Partially used zone */
-			alloc_offsets[i] =
-					((zone.wp - zone.start) << SECTOR_SHIFT);
-			__set_bit(i, active);
-			break;
-		}
+		if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
+			num_conventional++;
+		else
+			num_sequential++;
 	}
 
 	if (num_sequential > 0)
@@ -1468,63 +1635,24 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
 
 	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
 	case 0: /* single */
-		if (alloc_offsets[0] == WP_MISSING_DEV) {
-			btrfs_err(fs_info,
-			"zoned: cannot recover write pointer for zone %llu",
-				physical[0]);
-			ret = -EIO;
-			goto out;
-		}
-		cache->alloc_offset = alloc_offsets[0];
-		cache->zone_capacity = caps[0];
-		if (test_bit(0, active))
-			set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
+		ret = btrfs_load_block_group_single(cache, &zone_info[0], active);
 		break;
 	case BTRFS_BLOCK_GROUP_DUP:
-		if (map->type & BTRFS_BLOCK_GROUP_DATA) {
-			btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg");
-			ret = -EINVAL;
-			goto out;
-		}
-		if (alloc_offsets[0] == WP_MISSING_DEV) {
-			btrfs_err(fs_info,
-			"zoned: cannot recover write pointer for zone %llu",
-				physical[0]);
-			ret = -EIO;
-			goto out;
-		}
-		if (alloc_offsets[1] == WP_MISSING_DEV) {
-			btrfs_err(fs_info,
-			"zoned: cannot recover write pointer for zone %llu",
-				physical[1]);
-			ret = -EIO;
-			goto out;
-		}
-		if (alloc_offsets[0] != alloc_offsets[1]) {
-			btrfs_err(fs_info,
-			"zoned: write pointer offset mismatch of zones in DUP profile");
-			ret = -EIO;
-			goto out;
-		}
-		if (test_bit(0, active) != test_bit(1, active)) {
-			if (!btrfs_zone_activate(cache)) {
-				ret = -EIO;
-				goto out;
-			}
-		} else {
-			if (test_bit(0, active))
-				set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
-					&cache->runtime_flags);
-		}
-		cache->alloc_offset = alloc_offsets[0];
-		cache->zone_capacity = min(caps[0], caps[1]);
+		ret = btrfs_load_block_group_dup(cache, map, zone_info, active);
 		break;
 	case BTRFS_BLOCK_GROUP_RAID1:
+	case BTRFS_BLOCK_GROUP_RAID1C3:
+	case BTRFS_BLOCK_GROUP_RAID1C4:
+		ret = btrfs_load_block_group_raid1(cache, map, zone_info, active);
+		break;
 	case BTRFS_BLOCK_GROUP_RAID0:
+		ret = btrfs_load_block_group_raid0(cache, map, zone_info, active);
+		break;
 	case BTRFS_BLOCK_GROUP_RAID10:
+		ret = btrfs_load_block_group_raid10(cache, map, zone_info, active);
+		break;
 	case BTRFS_BLOCK_GROUP_RAID5:
 	case BTRFS_BLOCK_GROUP_RAID6:
-		/* non-single profiles are not supported yet */
 	default:
 		btrfs_err(fs_info, "zoned: profile %s not yet supported",
 			  btrfs_bg_type_to_raid_name(map->type));
@@ -1570,9 +1698,7 @@ out:
 		cache->physical_map = NULL;
 	}
 	bitmap_free(active);
-	kfree(physical);
-	kfree(caps);
-	kfree(alloc_offsets);
+	kfree(zone_info);
 	free_extent_map(em);
 
 	return ret;
@@ -1609,7 +1735,7 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans,
 	set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
 	set_extent_buffer_dirty(eb);
 	set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1,
-			EXTENT_DIRTY | EXTENT_NOWAIT, NULL);
+			EXTENT_DIRTY, NULL);
 }
 
 bool btrfs_use_zone_append(struct btrfs_bio *bbio)
@@ -1887,7 +2013,7 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
 	int i, ret;
 
 	ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
-			      &mapped_length, &bioc, NULL, NULL, 1);
+			      &mapped_length, &bioc, NULL, NULL);
 	if (ret || !bioc || mapped_length < PAGE_SIZE) {
 		ret = -EIO;
 		goto out_put_bioc;
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index e7ac4ec809a4..5511766485cd 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -145,7 +145,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
 }
 
 /*
- * zstd_calc_ws_mem_sizes - calculate monotonic memory bounds
+ * Calculate monotonic memory bounds.
  *
  * It is possible based on the level configurations that a higher level
  * workspace uses less memory than a lower level workspace.  In order to reuse
@@ -218,7 +218,8 @@ void zstd_cleanup_workspace_manager(void)
 }
 
 /*
- * zstd_find_workspace - find workspace
+ * Find workspace for given level.
+ *
  * @level: compression level
  *
  * This iterates over the set bits in the active_map beginning at the requested
@@ -256,7 +257,8 @@ static struct list_head *zstd_find_workspace(unsigned int level)
 }
 
 /*
- * zstd_get_workspace - zstd's get_workspace
+ * Zstd get_workspace for level.
+ *
  * @level: compression level
  *
  * If @level is 0, then any compression level can be used.  Therefore, we begin
@@ -296,7 +298,8 @@ again:
 }
 
 /*
- * zstd_put_workspace - zstd put_workspace
+ * Zstd put_workspace.
+ *
  * @ws: list_head for the workspace
  *
  * When putting back a workspace, we only need to update the LRU if we are of
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index f4863078f7fe..936b9e0b351d 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -750,7 +750,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	dout("writepage %llu~%llu (%llu bytes, %sencrypted)\n",
 	     page_off, len, wlen, IS_ENCRYPTED(inode) ? "" : "not ");
 
-	req->r_mtime = inode->i_mtime;
+	req->r_mtime = inode_get_mtime(inode);
 	ceph_osdc_start_request(osdc, req);
 	err = ceph_osdc_wait_request(osdc, req);
 
@@ -1327,7 +1327,7 @@ new_request:
 			pages = NULL;
 		}
 
-		req->r_mtime = inode->i_mtime;
+		req->r_mtime = inode_get_mtime(inode);
 		ceph_osdc_start_request(&fsc->client->osdc, req);
 		req = NULL;
 
@@ -1875,7 +1875,7 @@ int ceph_uninline_data(struct file *file)
 		goto out_unlock;
 	}
 
-	req->r_mtime = inode->i_mtime;
+	req->r_mtime = inode_get_mtime(inode);
 	ceph_osdc_start_request(&fsc->client->osdc, req);
 	err = ceph_osdc_wait_request(&fsc->client->osdc, req);
 	ceph_osdc_put_request(req);
@@ -1917,7 +1917,7 @@ int ceph_uninline_data(struct file *file)
 			goto out_put_req;
 	}
 
-	req->r_mtime = inode->i_mtime;
+	req->r_mtime = inode_get_mtime(inode);
 	ceph_osdc_start_request(&fsc->client->osdc, req);
 	err = ceph_osdc_wait_request(&fsc->client->osdc, req);
 
@@ -2092,7 +2092,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
 				     0, false, true);
 	ceph_osdc_start_request(&fsc->client->osdc, rd_req);
 
-	wr_req->r_mtime = ci->netfs.inode.i_mtime;
+	wr_req->r_mtime = inode_get_mtime(&ci->netfs.inode);
 	ceph_osdc_start_request(&fsc->client->osdc, wr_req);
 
 	err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 14215ec646f7..a104669fcf4c 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1421,8 +1421,8 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
 		arg->old_xattr_buf = NULL;
 	}
 
-	arg->mtime = inode->i_mtime;
-	arg->atime = inode->i_atime;
+	arg->mtime = inode_get_mtime(inode);
+	arg->atime = inode_get_atime(inode);
 	arg->ctime = inode_get_ctime(inode);
 	arg->btime = ci->i_btime;
 	arg->change_attr = inode_peek_iversion_raw(inode);
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index 5b5112c78462..e3b1c3fab412 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -133,6 +133,7 @@ static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb)
 }
 
 static struct fscrypt_operations ceph_fscrypt_ops = {
+	.needs_bounce_pages	= 1,
 	.get_context		= ceph_crypt_get_context,
 	.set_context		= ceph_crypt_set_context,
 	.get_dummy_policy	= ceph_get_dummy_policy,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index b5f8038065d7..649600d0a7b6 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -2489,7 +2489,7 @@ static int ceph_zero_partial_object(struct inode *inode,
 		goto out;
 	}
 
-	req->r_mtime = inode->i_mtime;
+	req->r_mtime = inode_get_mtime(inode);
 	ceph_osdc_start_request(&fsc->client->osdc, req);
 	ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
 	if (ret == -ENOENT)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index b79100f720b3..2e2a303b9e64 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -185,9 +185,9 @@ struct inode *ceph_get_snapdir(struct inode *parent)
 	inode->i_mode = parent->i_mode;
 	inode->i_uid = parent->i_uid;
 	inode->i_gid = parent->i_gid;
-	inode->i_mtime = parent->i_mtime;
+	inode_set_mtime_to_ts(inode, inode_get_mtime(parent));
 	inode_set_ctime_to_ts(inode, inode_get_ctime(parent));
-	inode->i_atime = parent->i_atime;
+	inode_set_atime_to_ts(inode, inode_get_atime(parent));
 	ci->i_rbytes = 0;
 	ci->i_btime = ceph_inode(parent)->i_btime;
 
@@ -835,28 +835,31 @@ void ceph_fill_file_time(struct inode *inode, int issued,
 			/* the MDS did a utimes() */
 			dout("mtime %lld.%09ld -> %lld.%09ld "
 			     "tw %d -> %d\n",
-			     inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
+			     inode_get_mtime_sec(inode),
+			     inode_get_mtime_nsec(inode),
 			     mtime->tv_sec, mtime->tv_nsec,
 			     ci->i_time_warp_seq, (int)time_warp_seq);
 
-			inode->i_mtime = *mtime;
-			inode->i_atime = *atime;
+			inode_set_mtime_to_ts(inode, *mtime);
+			inode_set_atime_to_ts(inode, *atime);
 			ci->i_time_warp_seq = time_warp_seq;
 		} else if (time_warp_seq == ci->i_time_warp_seq) {
+			struct timespec64	ts;
+
 			/* nobody did utimes(); take the max */
-			if (timespec64_compare(mtime, &inode->i_mtime) > 0) {
+			ts = inode_get_mtime(inode);
+			if (timespec64_compare(mtime, &ts) > 0) {
 				dout("mtime %lld.%09ld -> %lld.%09ld inc\n",
-				     inode->i_mtime.tv_sec,
-				     inode->i_mtime.tv_nsec,
+				     ts.tv_sec, ts.tv_nsec,
 				     mtime->tv_sec, mtime->tv_nsec);
-				inode->i_mtime = *mtime;
+				inode_set_mtime_to_ts(inode, *mtime);
 			}
-			if (timespec64_compare(atime, &inode->i_atime) > 0) {
+			ts = inode_get_atime(inode);
+			if (timespec64_compare(atime, &ts) > 0) {
 				dout("atime %lld.%09ld -> %lld.%09ld inc\n",
-				     inode->i_atime.tv_sec,
-				     inode->i_atime.tv_nsec,
+				     ts.tv_sec, ts.tv_nsec,
 				     atime->tv_sec, atime->tv_nsec);
-				inode->i_atime = *atime;
+				inode_set_atime_to_ts(inode, *atime);
 			}
 		} else if (issued & CEPH_CAP_FILE_EXCL) {
 			/* we did a utimes(); ignore mds values */
@@ -867,8 +870,8 @@ void ceph_fill_file_time(struct inode *inode, int issued,
 		/* we have no write|excl caps; whatever the MDS says is true */
 		if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
 			inode_set_ctime_to_ts(inode, *ctime);
-			inode->i_mtime = *mtime;
-			inode->i_atime = *atime;
+			inode_set_mtime_to_ts(inode, *mtime);
+			inode_set_atime_to_ts(inode, *atime);
 			ci->i_time_warp_seq = time_warp_seq;
 		} else {
 			warn = 1;
@@ -2551,20 +2554,22 @@ retry:
 	}
 
 	if (ia_valid & ATTR_ATIME) {
+		struct timespec64 atime = inode_get_atime(inode);
+
 		dout("setattr %p atime %lld.%ld -> %lld.%ld\n", inode,
-		     inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
+		     atime.tv_sec, atime.tv_nsec,
 		     attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
 		if (issued & CEPH_CAP_FILE_EXCL) {
 			ci->i_time_warp_seq++;
-			inode->i_atime = attr->ia_atime;
+			inode_set_atime_to_ts(inode, attr->ia_atime);
 			dirtied |= CEPH_CAP_FILE_EXCL;
 		} else if ((issued & CEPH_CAP_FILE_WR) &&
-			   timespec64_compare(&inode->i_atime,
-					    &attr->ia_atime) < 0) {
-			inode->i_atime = attr->ia_atime;
+			   timespec64_compare(&atime,
+					      &attr->ia_atime) < 0) {
+			inode_set_atime_to_ts(inode, attr->ia_atime);
 			dirtied |= CEPH_CAP_FILE_WR;
 		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
-			   !timespec64_equal(&inode->i_atime, &attr->ia_atime)) {
+			   !timespec64_equal(&atime, &attr->ia_atime)) {
 			ceph_encode_timespec64(&req->r_args.setattr.atime,
 					       &attr->ia_atime);
 			mask |= CEPH_SETATTR_ATIME;
@@ -2624,20 +2629,21 @@ retry:
 		}
 	}
 	if (ia_valid & ATTR_MTIME) {
+		struct timespec64 mtime = inode_get_mtime(inode);
+
 		dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode,
-		     inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
+		     mtime.tv_sec, mtime.tv_nsec,
 		     attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
 		if (issued & CEPH_CAP_FILE_EXCL) {
 			ci->i_time_warp_seq++;
-			inode->i_mtime = attr->ia_mtime;
+			inode_set_mtime_to_ts(inode, attr->ia_mtime);
 			dirtied |= CEPH_CAP_FILE_EXCL;
 		} else if ((issued & CEPH_CAP_FILE_WR) &&
-			   timespec64_compare(&inode->i_mtime,
-					    &attr->ia_mtime) < 0) {
-			inode->i_mtime = attr->ia_mtime;
+			   timespec64_compare(&mtime, &attr->ia_mtime) < 0) {
+			inode_set_mtime_to_ts(inode, attr->ia_mtime);
 			dirtied |= CEPH_CAP_FILE_WR;
 		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
-			   !timespec64_equal(&inode->i_mtime, &attr->ia_mtime)) {
+			   !timespec64_equal(&mtime, &attr->ia_mtime)) {
 			ceph_encode_timespec64(&req->r_args.setattr.mtime,
 					       &attr->ia_mtime);
 			mask |= CEPH_SETATTR_MTIME;
@@ -2651,8 +2657,8 @@ retry:
 		bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
 					 ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
 		dout("setattr %p ctime %lld.%ld -> %lld.%ld (%s)\n", inode,
-		     inode_get_ctime(inode).tv_sec,
-		     inode_get_ctime(inode).tv_nsec,
+		     inode_get_ctime_sec(inode),
+		     inode_get_ctime_nsec(inode),
 		     attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
 		     only ? "ctime only" : "ignored");
 		if (only) {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 615db141b6c4..de798444bb97 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -861,8 +861,8 @@ int ceph_wait_on_conflict_unlink(struct dentry *dentry)
 		if (!d_same_name(udentry, pdentry, &dname))
 			goto next;
 
+		found = dget_dlock(udentry);
 		spin_unlock(&udentry->d_lock);
-		found = dget(udentry);
 		break;
 next:
 		spin_unlock(&udentry->d_lock);
@@ -4353,12 +4353,16 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
 		rec.v2.flock_len = (__force __le32)
 			((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
 	} else {
+		struct timespec64 ts;
+
 		rec.v1.cap_id = cpu_to_le64(cap->cap_id);
 		rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
 		rec.v1.issued = cpu_to_le32(cap->issued);
 		rec.v1.size = cpu_to_le64(i_size_read(inode));
-		ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
-		ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
+		ts = inode_get_mtime(inode);
+		ceph_encode_timespec64(&rec.v1.mtime, &ts);
+		ts = inode_get_atime(inode);
+		ceph_encode_timespec64(&rec.v1.atime, &ts);
 		rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
 		rec.v1.pathbase = cpu_to_le64(pathbase);
 	}
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 813f21add992..6732e1ea97d9 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -658,8 +658,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
 
 	BUG_ON(capsnap->writing);
 	capsnap->size = i_size_read(inode);
-	capsnap->mtime = inode->i_mtime;
-	capsnap->atime = inode->i_atime;
+	capsnap->mtime = inode_get_mtime(inode);
+	capsnap->atime = inode_get_atime(inode);
 	capsnap->ctime = inode_get_ctime(inode);
 	capsnap->btime = ci->i_btime;
 	capsnap->change_attr = inode_peek_iversion_raw(inode);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 51c7f2b14f6f..98844fc8a2f7 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1119,7 +1119,7 @@ ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
 extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
 extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
 extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
-extern const struct xattr_handler *ceph_xattr_handlers[];
+extern const struct xattr_handler * const ceph_xattr_handlers[];
 
 struct ceph_acl_sec_ctx {
 #ifdef CONFIG_CEPH_FS_POSIX_ACL
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 0deae4a0f5f1..097ce7f74073 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1446,7 +1446,7 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx)
  * List of handlers for synthetic system.* attributes. Other
  * attributes are handled directly.
  */
-const struct xattr_handler *ceph_xattr_handlers[] = {
+const struct xattr_handler * const ceph_xattr_handlers[] = {
 	&ceph_other_xattr_handler,
 	NULL,
 };
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 950b6919fb87..6ba032442b39 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -350,7 +350,7 @@ static struct kobject *cdev_get(struct cdev *p)
 	struct module *owner = p->owner;
 	struct kobject *kobj;
 
-	if (owner && !try_module_get(owner))
+	if (!try_module_get(owner))
 		return NULL;
 	kobj = kobject_get_unless_zero(&p->kobj);
 	if (!kobj)
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index ae023853a98f..1d2dac95f86a 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -123,9 +123,11 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
 	if (attr->va_size != -1)
 		inode->i_blocks = (attr->va_size + 511) >> 9;
 	if (attr->va_atime.tv_sec != -1) 
-		inode->i_atime = coda_to_timespec64(attr->va_atime);
+		inode_set_atime_to_ts(inode,
+				      coda_to_timespec64(attr->va_atime));
 	if (attr->va_mtime.tv_sec != -1)
-		inode->i_mtime = coda_to_timespec64(attr->va_mtime);
+		inode_set_mtime_to_ts(inode,
+				      coda_to_timespec64(attr->va_mtime));
         if (attr->va_ctime.tv_sec != -1)
 		inode_set_ctime_to_ts(inode,
 				      coda_to_timespec64(attr->va_ctime));
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index cb512b10473b..4e552ba7bd43 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -111,7 +111,7 @@ static inline void coda_dir_update_mtime(struct inode *dir)
 	/* optimistically we can also act as if our nose bleeds. The
 	 * granularity of the mtime is coarse anyways so we might actually be
 	 * right most of the time. Note: we only do this for directories. */
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 #endif
 }
 
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 42346618b4ed..16acc58311ea 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -84,7 +84,7 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to)
 	ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0);
 	coda_inode->i_size = file_inode(host_file)->i_size;
 	coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9;
-	coda_inode->i_mtime = inode_set_ctime_current(coda_inode);
+	inode_set_mtime_to_ts(coda_inode, inode_set_ctime_current(coda_inode));
 	inode_unlock(coda_inode);
 	file_end_write(host_file);
 
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index fbdcb3582926..dcc22f593e43 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -88,7 +88,7 @@ int configfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 static inline void set_default_inode_attr(struct inode * inode, umode_t mode)
 {
 	inode->i_mode = mode;
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 }
 
 static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
@@ -96,8 +96,8 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
 	inode->i_mode = iattr->ia_mode;
 	inode->i_uid = iattr->ia_uid;
 	inode->i_gid = iattr->ia_gid;
-	inode->i_atime = iattr->ia_atime;
-	inode->i_mtime = iattr->ia_mtime;
+	inode_set_atime_to_ts(inode, iattr->ia_atime);
+	inode_set_mtime_to_ts(inode, iattr->ia_mtime);
 	inode_set_ctime_to_ts(inode, iattr->ia_ctime);
 }
 
@@ -171,7 +171,7 @@ struct inode *configfs_create(struct dentry *dentry, umode_t mode)
 		return ERR_PTR(-ENOMEM);
 
 	p_inode = d_inode(dentry->d_parent);
-	p_inode->i_mtime = inode_set_ctime_current(p_inode);
+	inode_set_mtime_to_ts(p_inode, inode_set_ctime_current(p_inode));
 	configfs_set_inode_lock_class(sd, inode);
 	return inode;
 }
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 5ee7d7bbb361..60dbfa0f8805 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -133,8 +133,8 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
 	}
 
 	/* Struct copy intentional */
-	inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode,
-								zerotime);
+	inode_set_mtime_to_ts(inode,
+			      inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, zerotime)));
 	/* inode->i_nlink is left 1 - arguably wrong for directories,
 	   but it's the best we can do without reading the directory
 	   contents.  1 yields the right result in GNU find, even
@@ -495,7 +495,7 @@ static void cramfs_kill_sb(struct super_block *sb)
 		sb->s_mtd = NULL;
 	} else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) {
 		sync_blockdev(sb->s_bdev);
-		blkdev_put(sb->s_bdev, sb);
+		bdev_release(sb->s_bdev_handle);
 	}
 	kfree(sbi);
 }
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 62e1a3dd8357..0ad8c30b8fa5 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -111,10 +111,14 @@ out:
 int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
 			  sector_t pblk, unsigned int len)
 {
-	const unsigned int blockbits = inode->i_blkbits;
-	const unsigned int blocksize = 1 << blockbits;
-	const unsigned int blocks_per_page_bits = PAGE_SHIFT - blockbits;
-	const unsigned int blocks_per_page = 1 << blocks_per_page_bits;
+	const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+	const unsigned int du_bits = ci->ci_data_unit_bits;
+	const unsigned int du_size = 1U << du_bits;
+	const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits;
+	const unsigned int du_per_page = 1U << du_per_page_bits;
+	u64 du_index = (u64)lblk << (inode->i_blkbits - du_bits);
+	u64 du_remaining = (u64)len << (inode->i_blkbits - du_bits);
+	sector_t sector = pblk << (inode->i_blkbits - SECTOR_SHIFT);
 	struct page *pages[16]; /* write up to 16 pages at a time */
 	unsigned int nr_pages;
 	unsigned int i;
@@ -130,8 +134,8 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
 							  len);
 
 	BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_VECS);
-	nr_pages = min_t(unsigned int, ARRAY_SIZE(pages),
-			 (len + blocks_per_page - 1) >> blocks_per_page_bits);
+	nr_pages = min_t(u64, ARRAY_SIZE(pages),
+			 (du_remaining + du_per_page - 1) >> du_per_page_bits);
 
 	/*
 	 * We need at least one page for ciphertext.  Allocate the first one
@@ -154,21 +158,22 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
 	bio = bio_alloc(inode->i_sb->s_bdev, nr_pages, REQ_OP_WRITE, GFP_NOFS);
 
 	do {
-		bio->bi_iter.bi_sector = pblk << (blockbits - 9);
+		bio->bi_iter.bi_sector = sector;
 
 		i = 0;
 		offset = 0;
 		do {
-			err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk,
-						  ZERO_PAGE(0), pages[i],
-						  blocksize, offset, GFP_NOFS);
+			err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, du_index,
+						      ZERO_PAGE(0), pages[i],
+						      du_size, offset,
+						      GFP_NOFS);
 			if (err)
 				goto out;
-			lblk++;
-			pblk++;
-			len--;
-			offset += blocksize;
-			if (offset == PAGE_SIZE || len == 0) {
+			du_index++;
+			sector += 1U << (du_bits - SECTOR_SHIFT);
+			du_remaining--;
+			offset += du_size;
+			if (offset == PAGE_SIZE || du_remaining == 0) {
 				ret = bio_add_page(bio, pages[i++], offset, 0);
 				if (WARN_ON_ONCE(ret != offset)) {
 					err = -EIO;
@@ -176,13 +181,13 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
 				}
 				offset = 0;
 			}
-		} while (i != nr_pages && len != 0);
+		} while (i != nr_pages && du_remaining != 0);
 
 		err = submit_bio_wait(bio);
 		if (err)
 			goto out;
 		bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE);
-	} while (len != 0);
+	} while (du_remaining != 0);
 	err = 0;
 out:
 	bio_put(bio);
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 6a837e4b80dc..328470d40dec 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -39,7 +39,7 @@ static mempool_t *fscrypt_bounce_page_pool = NULL;
 static struct workqueue_struct *fscrypt_read_workqueue;
 static DEFINE_MUTEX(fscrypt_init_mutex);
 
-struct kmem_cache *fscrypt_info_cachep;
+struct kmem_cache *fscrypt_inode_info_cachep;
 
 void fscrypt_enqueue_decrypt_work(struct work_struct *work)
 {
@@ -49,6 +49,13 @@ EXPORT_SYMBOL(fscrypt_enqueue_decrypt_work);
 
 struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags)
 {
+	if (WARN_ON_ONCE(!fscrypt_bounce_page_pool)) {
+		/*
+		 * Oops, the filesystem called a function that uses the bounce
+		 * page pool, but it didn't set needs_bounce_pages.
+		 */
+		return NULL;
+	}
 	return mempool_alloc(fscrypt_bounce_page_pool, gfp_flags);
 }
 
@@ -70,44 +77,44 @@ void fscrypt_free_bounce_page(struct page *bounce_page)
 EXPORT_SYMBOL(fscrypt_free_bounce_page);
 
 /*
- * Generate the IV for the given logical block number within the given file.
- * For filenames encryption, lblk_num == 0.
+ * Generate the IV for the given data unit index within the given file.
+ * For filenames encryption, index == 0.
  *
  * Keep this in sync with fscrypt_limit_io_blocks().  fscrypt_limit_io_blocks()
  * needs to know about any IV generation methods where the low bits of IV don't
- * simply contain the lblk_num (e.g., IV_INO_LBLK_32).
+ * simply contain the data unit index (e.g., IV_INO_LBLK_32).
  */
-void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
-			 const struct fscrypt_info *ci)
+void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index,
+			 const struct fscrypt_inode_info *ci)
 {
 	u8 flags = fscrypt_policy_flags(&ci->ci_policy);
 
 	memset(iv, 0, ci->ci_mode->ivsize);
 
 	if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) {
-		WARN_ON_ONCE(lblk_num > U32_MAX);
+		WARN_ON_ONCE(index > U32_MAX);
 		WARN_ON_ONCE(ci->ci_inode->i_ino > U32_MAX);
-		lblk_num |= (u64)ci->ci_inode->i_ino << 32;
+		index |= (u64)ci->ci_inode->i_ino << 32;
 	} else if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) {
-		WARN_ON_ONCE(lblk_num > U32_MAX);
-		lblk_num = (u32)(ci->ci_hashed_ino + lblk_num);
+		WARN_ON_ONCE(index > U32_MAX);
+		index = (u32)(ci->ci_hashed_ino + index);
 	} else if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
 		memcpy(iv->nonce, ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE);
 	}
-	iv->lblk_num = cpu_to_le64(lblk_num);
+	iv->index = cpu_to_le64(index);
 }
 
-/* Encrypt or decrypt a single filesystem block of file contents */
-int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
-			u64 lblk_num, struct page *src_page,
-			struct page *dest_page, unsigned int len,
-			unsigned int offs, gfp_t gfp_flags)
+/* Encrypt or decrypt a single "data unit" of file contents. */
+int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
+			    fscrypt_direction_t rw, u64 index,
+			    struct page *src_page, struct page *dest_page,
+			    unsigned int len, unsigned int offs,
+			    gfp_t gfp_flags)
 {
 	union fscrypt_iv iv;
 	struct skcipher_request *req = NULL;
 	DECLARE_CRYPTO_WAIT(wait);
 	struct scatterlist dst, src;
-	struct fscrypt_info *ci = inode->i_crypt_info;
 	struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
 	int res = 0;
 
@@ -116,7 +123,7 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
 	if (WARN_ON_ONCE(len % FSCRYPT_CONTENTS_ALIGNMENT != 0))
 		return -EINVAL;
 
-	fscrypt_generate_iv(&iv, lblk_num, ci);
+	fscrypt_generate_iv(&iv, index, ci);
 
 	req = skcipher_request_alloc(tfm, gfp_flags);
 	if (!req)
@@ -137,28 +144,29 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
 		res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
 	skcipher_request_free(req);
 	if (res) {
-		fscrypt_err(inode, "%scryption failed for block %llu: %d",
-			    (rw == FS_DECRYPT ? "De" : "En"), lblk_num, res);
+		fscrypt_err(ci->ci_inode,
+			    "%scryption failed for data unit %llu: %d",
+			    (rw == FS_DECRYPT ? "De" : "En"), index, res);
 		return res;
 	}
 	return 0;
 }
 
 /**
- * fscrypt_encrypt_pagecache_blocks() - Encrypt filesystem blocks from a
- *					pagecache page
- * @page:      The locked pagecache page containing the block(s) to encrypt
- * @len:       Total size of the block(s) to encrypt.  Must be a nonzero
- *		multiple of the filesystem's block size.
- * @offs:      Byte offset within @page of the first block to encrypt.  Must be
- *		a multiple of the filesystem's block size.
- * @gfp_flags: Memory allocation flags.  See details below.
+ * fscrypt_encrypt_pagecache_blocks() - Encrypt data from a pagecache page
+ * @page: the locked pagecache page containing the data to encrypt
+ * @len: size of the data to encrypt, in bytes
+ * @offs: offset within @page of the data to encrypt, in bytes
+ * @gfp_flags: memory allocation flags; see details below
+ *
+ * This allocates a new bounce page and encrypts the given data into it.  The
+ * length and offset of the data must be aligned to the file's crypto data unit
+ * size.  Alignment to the filesystem block size fulfills this requirement, as
+ * the filesystem block size is always a multiple of the data unit size.
  *
- * A new bounce page is allocated, and the specified block(s) are encrypted into
- * it.  In the bounce page, the ciphertext block(s) will be located at the same
- * offsets at which the plaintext block(s) were located in the source page; any
- * other parts of the bounce page will be left uninitialized.  However, normally
- * blocksize == PAGE_SIZE and the whole page is encrypted at once.
+ * In the bounce page, the ciphertext data will be located at the same offset at
+ * which the plaintext data was located in the source page.  Any other parts of
+ * the bounce page will be left uninitialized.
  *
  * This is for use by the filesystem's ->writepages() method.
  *
@@ -176,28 +184,29 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
 
 {
 	const struct inode *inode = page->mapping->host;
-	const unsigned int blockbits = inode->i_blkbits;
-	const unsigned int blocksize = 1 << blockbits;
+	const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+	const unsigned int du_bits = ci->ci_data_unit_bits;
+	const unsigned int du_size = 1U << du_bits;
 	struct page *ciphertext_page;
-	u64 lblk_num = ((u64)page->index << (PAGE_SHIFT - blockbits)) +
-		       (offs >> blockbits);
+	u64 index = ((u64)page->index << (PAGE_SHIFT - du_bits)) +
+		    (offs >> du_bits);
 	unsigned int i;
 	int err;
 
 	if (WARN_ON_ONCE(!PageLocked(page)))
 		return ERR_PTR(-EINVAL);
 
-	if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize)))
+	if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size)))
 		return ERR_PTR(-EINVAL);
 
 	ciphertext_page = fscrypt_alloc_bounce_page(gfp_flags);
 	if (!ciphertext_page)
 		return ERR_PTR(-ENOMEM);
 
-	for (i = offs; i < offs + len; i += blocksize, lblk_num++) {
-		err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk_num,
-					  page, ciphertext_page,
-					  blocksize, i, gfp_flags);
+	for (i = offs; i < offs + len; i += du_size, index++) {
+		err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, index,
+					      page, ciphertext_page,
+					      du_size, i, gfp_flags);
 		if (err) {
 			fscrypt_free_bounce_page(ciphertext_page);
 			return ERR_PTR(err);
@@ -224,30 +233,33 @@ EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks);
  * arbitrary page, not necessarily in the original pagecache page.  The @inode
  * and @lblk_num must be specified, as they can't be determined from @page.
  *
+ * This is not compatible with fscrypt_operations::supports_subblock_data_units.
+ *
  * Return: 0 on success; -errno on failure
  */
 int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
 				  unsigned int len, unsigned int offs,
 				  u64 lblk_num, gfp_t gfp_flags)
 {
-	return fscrypt_crypt_block(inode, FS_ENCRYPT, lblk_num, page, page,
-				   len, offs, gfp_flags);
+	if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units))
+		return -EOPNOTSUPP;
+	return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_ENCRYPT,
+				       lblk_num, page, page, len, offs,
+				       gfp_flags);
 }
 EXPORT_SYMBOL(fscrypt_encrypt_block_inplace);
 
 /**
- * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a
- *					pagecache folio
- * @folio:     The locked pagecache folio containing the block(s) to decrypt
- * @len:       Total size of the block(s) to decrypt.  Must be a nonzero
- *		multiple of the filesystem's block size.
- * @offs:      Byte offset within @folio of the first block to decrypt.  Must be
- *		a multiple of the filesystem's block size.
+ * fscrypt_decrypt_pagecache_blocks() - Decrypt data from a pagecache folio
+ * @folio: the pagecache folio containing the data to decrypt
+ * @len: size of the data to decrypt, in bytes
+ * @offs: offset within @folio of the data to decrypt, in bytes
  *
- * The specified block(s) are decrypted in-place within the pagecache folio,
- * which must still be locked and not uptodate.
- *
- * This is for use by the filesystem's ->readahead() method.
+ * Decrypt data that has just been read from an encrypted file.  The data must
+ * be located in a pagecache folio that is still locked and not yet uptodate.
+ * The length and offset of the data must be aligned to the file's crypto data
+ * unit size.  Alignment to the filesystem block size fulfills this requirement,
+ * as the filesystem block size is always a multiple of the data unit size.
  *
  * Return: 0 on success; -errno on failure
  */
@@ -255,25 +267,26 @@ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len,
 				     size_t offs)
 {
 	const struct inode *inode = folio->mapping->host;
-	const unsigned int blockbits = inode->i_blkbits;
-	const unsigned int blocksize = 1 << blockbits;
-	u64 lblk_num = ((u64)folio->index << (PAGE_SHIFT - blockbits)) +
-		       (offs >> blockbits);
+	const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+	const unsigned int du_bits = ci->ci_data_unit_bits;
+	const unsigned int du_size = 1U << du_bits;
+	u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) +
+		    (offs >> du_bits);
 	size_t i;
 	int err;
 
 	if (WARN_ON_ONCE(!folio_test_locked(folio)))
 		return -EINVAL;
 
-	if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize)))
+	if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size)))
 		return -EINVAL;
 
-	for (i = offs; i < offs + len; i += blocksize, lblk_num++) {
+	for (i = offs; i < offs + len; i += du_size, index++) {
 		struct page *page = folio_page(folio, i >> PAGE_SHIFT);
 
-		err = fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page,
-					  page, blocksize, i & ~PAGE_MASK,
-					  GFP_NOFS);
+		err = fscrypt_crypt_data_unit(ci, FS_DECRYPT, index, page,
+					      page, du_size, i & ~PAGE_MASK,
+					      GFP_NOFS);
 		if (err)
 			return err;
 	}
@@ -295,14 +308,19 @@ EXPORT_SYMBOL(fscrypt_decrypt_pagecache_blocks);
  * arbitrary page, not necessarily in the original pagecache page.  The @inode
  * and @lblk_num must be specified, as they can't be determined from @page.
  *
+ * This is not compatible with fscrypt_operations::supports_subblock_data_units.
+ *
  * Return: 0 on success; -errno on failure
  */
 int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page,
 				  unsigned int len, unsigned int offs,
 				  u64 lblk_num)
 {
-	return fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page, page,
-				   len, offs, GFP_NOFS);
+	if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units))
+		return -EOPNOTSUPP;
+	return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_DECRYPT,
+				       lblk_num, page, page, len, offs,
+				       GFP_NOFS);
 }
 EXPORT_SYMBOL(fscrypt_decrypt_block_inplace);
 
@@ -325,7 +343,7 @@ int fscrypt_initialize(struct super_block *sb)
 		return 0;
 
 	/* No need to allocate a bounce page pool if this FS won't use it. */
-	if (sb->s_cop->flags & FS_CFLG_OWN_PAGES)
+	if (!sb->s_cop->needs_bounce_pages)
 		return 0;
 
 	mutex_lock(&fscrypt_init_mutex);
@@ -391,18 +409,19 @@ static int __init fscrypt_init(void)
 	if (!fscrypt_read_workqueue)
 		goto fail;
 
-	fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT);
-	if (!fscrypt_info_cachep)
+	fscrypt_inode_info_cachep = KMEM_CACHE(fscrypt_inode_info,
+					       SLAB_RECLAIM_ACCOUNT);
+	if (!fscrypt_inode_info_cachep)
 		goto fail_free_queue;
 
 	err = fscrypt_init_keyring();
 	if (err)
-		goto fail_free_info;
+		goto fail_free_inode_info;
 
 	return 0;
 
-fail_free_info:
-	kmem_cache_destroy(fscrypt_info_cachep);
+fail_free_inode_info:
+	kmem_cache_destroy(fscrypt_inode_info_cachep);
 fail_free_queue:
 	destroy_workqueue(fscrypt_read_workqueue);
 fail:
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 6eae3f12ad50..7b3fc189593a 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -100,7 +100,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
 {
 	struct skcipher_request *req = NULL;
 	DECLARE_CRYPTO_WAIT(wait);
-	const struct fscrypt_info *ci = inode->i_crypt_info;
+	const struct fscrypt_inode_info *ci = inode->i_crypt_info;
 	struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
 	union fscrypt_iv iv;
 	struct scatterlist sg;
@@ -157,7 +157,7 @@ static int fname_decrypt(const struct inode *inode,
 	struct skcipher_request *req = NULL;
 	DECLARE_CRYPTO_WAIT(wait);
 	struct scatterlist src_sg, dst_sg;
-	const struct fscrypt_info *ci = inode->i_crypt_info;
+	const struct fscrypt_inode_info *ci = inode->i_crypt_info;
 	struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
 	union fscrypt_iv iv;
 	int res;
@@ -568,7 +568,7 @@ EXPORT_SYMBOL_GPL(fscrypt_match_name);
  */
 u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name)
 {
-	const struct fscrypt_info *ci = dir->i_crypt_info;
+	const struct fscrypt_inode_info *ci = dir->i_crypt_info;
 
 	WARN_ON_ONCE(!ci->ci_dirhash_key_initialized);
 
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 2d63da48635a..1892356cf924 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -47,7 +47,8 @@ struct fscrypt_context_v2 {
 	u8 contents_encryption_mode;
 	u8 filenames_encryption_mode;
 	u8 flags;
-	u8 __reserved[4];
+	u8 log2_data_unit_size;
+	u8 __reserved[3];
 	u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE];
 	u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
 };
@@ -165,6 +166,26 @@ fscrypt_policy_flags(const union fscrypt_policy *policy)
 	BUG();
 }
 
+static inline int
+fscrypt_policy_v2_du_bits(const struct fscrypt_policy_v2 *policy,
+			  const struct inode *inode)
+{
+	return policy->log2_data_unit_size ?: inode->i_blkbits;
+}
+
+static inline int
+fscrypt_policy_du_bits(const union fscrypt_policy *policy,
+		       const struct inode *inode)
+{
+	switch (policy->version) {
+	case FSCRYPT_POLICY_V1:
+		return inode->i_blkbits;
+	case FSCRYPT_POLICY_V2:
+		return fscrypt_policy_v2_du_bits(&policy->v2, inode);
+	}
+	BUG();
+}
+
 /*
  * For encrypted symlinks, the ciphertext length is stored at the beginning
  * of the string in little-endian format.
@@ -189,18 +210,18 @@ struct fscrypt_prepared_key {
 };
 
 /*
- * fscrypt_info - the "encryption key" for an inode
+ * fscrypt_inode_info - the "encryption key" for an inode
  *
  * When an encrypted file's key is made available, an instance of this struct is
  * allocated and stored in ->i_crypt_info.  Once created, it remains until the
  * inode is evicted.
  */
-struct fscrypt_info {
+struct fscrypt_inode_info {
 
 	/* The key in a form prepared for actual encryption/decryption */
 	struct fscrypt_prepared_key ci_enc_key;
 
-	/* True if ci_enc_key should be freed when this fscrypt_info is freed */
+	/* True if ci_enc_key should be freed when this struct is freed */
 	bool ci_owns_key;
 
 #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
@@ -212,6 +233,16 @@ struct fscrypt_info {
 #endif
 
 	/*
+	 * log2 of the data unit size (granularity of contents encryption) of
+	 * this file.  This is computable from ci_policy and ci_inode but is
+	 * cached here for efficiency.  Only used for regular files.
+	 */
+	u8 ci_data_unit_bits;
+
+	/* Cached value: log2 of number of data units per FS block */
+	u8 ci_data_units_per_block_bits;
+
+	/*
 	 * Encryption mode used for this inode.  It corresponds to either the
 	 * contents or filenames encryption mode, depending on the inode type.
 	 */
@@ -263,12 +294,13 @@ typedef enum {
 } fscrypt_direction_t;
 
 /* crypto.c */
-extern struct kmem_cache *fscrypt_info_cachep;
+extern struct kmem_cache *fscrypt_inode_info_cachep;
 int fscrypt_initialize(struct super_block *sb);
-int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
-			u64 lblk_num, struct page *src_page,
-			struct page *dest_page, unsigned int len,
-			unsigned int offs, gfp_t gfp_flags);
+int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
+			    fscrypt_direction_t rw, u64 index,
+			    struct page *src_page, struct page *dest_page,
+			    unsigned int len, unsigned int offs,
+			    gfp_t gfp_flags);
 struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags);
 
 void __printf(3, 4) __cold
@@ -283,8 +315,8 @@ fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...);
 
 union fscrypt_iv {
 	struct {
-		/* logical block number within the file */
-		__le64 lblk_num;
+		/* zero-based index of data unit within the file */
+		__le64 index;
 
 		/* per-file nonce; only set in DIRECT_KEY mode */
 		u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
@@ -293,8 +325,18 @@ union fscrypt_iv {
 	__le64 dun[FSCRYPT_MAX_IV_SIZE / sizeof(__le64)];
 };
 
-void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
-			 const struct fscrypt_info *ci);
+void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index,
+			 const struct fscrypt_inode_info *ci);
+
+/*
+ * Return the number of bits used by the maximum file data unit index that is
+ * possible on the given filesystem, using the given log2 data unit size.
+ */
+static inline int
+fscrypt_max_file_dun_bits(const struct super_block *sb, int du_bits)
+{
+	return fls64(sb->s_maxbytes - 1) - du_bits;
+}
 
 /* fname.c */
 bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
@@ -332,17 +374,17 @@ void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf);
 
 /* inline_crypt.c */
 #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
-int fscrypt_select_encryption_impl(struct fscrypt_info *ci);
+int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci);
 
 static inline bool
-fscrypt_using_inline_encryption(const struct fscrypt_info *ci)
+fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci)
 {
 	return ci->ci_inlinecrypt;
 }
 
 int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
 				     const u8 *raw_key,
-				     const struct fscrypt_info *ci);
+				     const struct fscrypt_inode_info *ci);
 
 void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
 				      struct fscrypt_prepared_key *prep_key);
@@ -353,7 +395,7 @@ void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
  */
 static inline bool
 fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
-			const struct fscrypt_info *ci)
+			const struct fscrypt_inode_info *ci)
 {
 	/*
 	 * The two smp_load_acquire()'s here pair with the smp_store_release()'s
@@ -370,13 +412,13 @@ fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
 
 #else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */
 
-static inline int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
+static inline int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci)
 {
 	return 0;
 }
 
 static inline bool
-fscrypt_using_inline_encryption(const struct fscrypt_info *ci)
+fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci)
 {
 	return false;
 }
@@ -384,7 +426,7 @@ fscrypt_using_inline_encryption(const struct fscrypt_info *ci)
 static inline int
 fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
 				 const u8 *raw_key,
-				 const struct fscrypt_info *ci)
+				 const struct fscrypt_inode_info *ci)
 {
 	WARN_ON_ONCE(1);
 	return -EOPNOTSUPP;
@@ -398,7 +440,7 @@ fscrypt_destroy_inline_crypt_key(struct super_block *sb,
 
 static inline bool
 fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
-			const struct fscrypt_info *ci)
+			const struct fscrypt_inode_info *ci)
 {
 	return smp_load_acquire(&prep_key->tfm) != NULL;
 }
@@ -433,8 +475,28 @@ struct fscrypt_master_key_secret {
  * fscrypt_master_key - an in-use master key
  *
  * This represents a master encryption key which has been added to the
- * filesystem and can be used to "unlock" the encrypted files which were
- * encrypted with it.
+ * filesystem.  There are three high-level states that a key can be in:
+ *
+ * FSCRYPT_KEY_STATUS_PRESENT
+ *	Key is fully usable; it can be used to unlock inodes that are encrypted
+ *	with it (this includes being able to create new inodes).  ->mk_present
+ *	indicates whether the key is in this state.  ->mk_secret exists, the key
+ *	is in the keyring, and ->mk_active_refs > 0 due to ->mk_present.
+ *
+ * FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED
+ *	Removal of this key has been initiated, but some inodes that were
+ *	unlocked with it are still in use.  Like ABSENT, ->mk_secret is wiped,
+ *	and the key can no longer be used to unlock inodes.  Unlike ABSENT, the
+ *	key is still in the keyring; ->mk_decrypted_inodes is nonempty; and
+ *	->mk_active_refs > 0, being equal to the size of ->mk_decrypted_inodes.
+ *
+ *	This state transitions to ABSENT if ->mk_decrypted_inodes becomes empty,
+ *	or to PRESENT if FS_IOC_ADD_ENCRYPTION_KEY is called again for this key.
+ *
+ * FSCRYPT_KEY_STATUS_ABSENT
+ *	Key is fully removed.  The key is no longer in the keyring,
+ *	->mk_decrypted_inodes is empty, ->mk_active_refs == 0, ->mk_secret is
+ *	wiped, and the key can no longer be used to unlock inodes.
  */
 struct fscrypt_master_key {
 
@@ -444,7 +506,7 @@ struct fscrypt_master_key {
 	 */
 	struct hlist_node			mk_node;
 
-	/* Semaphore that protects ->mk_secret and ->mk_users */
+	/* Semaphore that protects ->mk_secret, ->mk_users, and ->mk_present */
 	struct rw_semaphore			mk_sem;
 
 	/*
@@ -454,8 +516,8 @@ struct fscrypt_master_key {
 	 * ->mk_direct_keys) that have been prepared continue to exist.
 	 * A structural ref only guarantees that the struct continues to exist.
 	 *
-	 * There is one active ref associated with ->mk_secret being present,
-	 * and one active ref for each inode in ->mk_decrypted_inodes.
+	 * There is one active ref associated with ->mk_present being true, and
+	 * one active ref for each inode in ->mk_decrypted_inodes.
 	 *
 	 * There is one structural ref associated with the active refcount being
 	 * nonzero.  Finding a key in the keyring also takes a structural ref,
@@ -467,17 +529,10 @@ struct fscrypt_master_key {
 	struct rcu_head				mk_rcu_head;
 
 	/*
-	 * The secret key material.  After FS_IOC_REMOVE_ENCRYPTION_KEY is
-	 * executed, this is wiped and no new inodes can be unlocked with this
-	 * key; however, there may still be inodes in ->mk_decrypted_inodes
-	 * which could not be evicted.  As long as some inodes still remain,
-	 * FS_IOC_REMOVE_ENCRYPTION_KEY can be retried, or
-	 * FS_IOC_ADD_ENCRYPTION_KEY can add the secret again.
+	 * The secret key material.  Wiped as soon as it is no longer needed;
+	 * for details, see the fscrypt_master_key struct comment.
 	 *
-	 * While ->mk_secret is present, one ref in ->mk_active_refs is held.
-	 *
-	 * Locking: protected by ->mk_sem.  The manipulation of ->mk_active_refs
-	 *	    associated with this field is protected by ->mk_sem as well.
+	 * Locking: protected by ->mk_sem.
 	 */
 	struct fscrypt_master_key_secret	mk_secret;
 
@@ -500,7 +555,7 @@ struct fscrypt_master_key {
 	 *
 	 * Locking: protected by ->mk_sem.  (We don't just rely on the keyrings
 	 * subsystem semaphore ->mk_users->sem, as we need support for atomic
-	 * search+insert along with proper synchronization with ->mk_secret.)
+	 * search+insert along with proper synchronization with other fields.)
 	 */
 	struct key		*mk_users;
 
@@ -523,20 +578,17 @@ struct fscrypt_master_key {
 	siphash_key_t		mk_ino_hash_key;
 	bool			mk_ino_hash_key_initialized;
 
-} __randomize_layout;
-
-static inline bool
-is_master_key_secret_present(const struct fscrypt_master_key_secret *secret)
-{
 	/*
-	 * The READ_ONCE() is only necessary for fscrypt_drop_inode().
-	 * fscrypt_drop_inode() runs in atomic context, so it can't take the key
-	 * semaphore and thus 'secret' can change concurrently which would be a
-	 * data race.  But fscrypt_drop_inode() only need to know whether the
-	 * secret *was* present at the time of check, so READ_ONCE() suffices.
+	 * Whether this key is in the "present" state, i.e. fully usable.  For
+	 * details, see the fscrypt_master_key struct comment.
+	 *
+	 * Locking: protected by ->mk_sem, but can be read locklessly using
+	 * READ_ONCE().  Writers must use WRITE_ONCE() when concurrent readers
+	 * are possible.
 	 */
-	return READ_ONCE(secret->size) != 0;
-}
+	bool			mk_present;
+
+} __randomize_layout;
 
 static inline const char *master_key_spec_type(
 				const struct fscrypt_key_specifier *spec)
@@ -598,17 +650,18 @@ struct fscrypt_mode {
 extern struct fscrypt_mode fscrypt_modes[];
 
 int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
-			const u8 *raw_key, const struct fscrypt_info *ci);
+			const u8 *raw_key, const struct fscrypt_inode_info *ci);
 
 void fscrypt_destroy_prepared_key(struct super_block *sb,
 				  struct fscrypt_prepared_key *prep_key);
 
-int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key);
+int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci,
+				 const u8 *raw_key);
 
-int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
+int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
 			       const struct fscrypt_master_key *mk);
 
-void fscrypt_hash_inode_number(struct fscrypt_info *ci,
+void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci,
 			       const struct fscrypt_master_key *mk);
 
 int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported);
@@ -643,10 +696,11 @@ static inline int fscrypt_require_key(struct inode *inode)
 
 void fscrypt_put_direct_key(struct fscrypt_direct_key *dk);
 
-int fscrypt_setup_v1_file_key(struct fscrypt_info *ci,
+int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci,
 			      const u8 *raw_master_key);
 
-int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci);
+int fscrypt_setup_v1_file_key_via_subscribed_keyrings(
+				struct fscrypt_inode_info *ci);
 
 /* policy.c */
 
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 6238dbcadcad..52504dd478d3 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -169,7 +169,7 @@ EXPORT_SYMBOL_GPL(__fscrypt_prepare_setattr);
 int fscrypt_prepare_setflags(struct inode *inode,
 			     unsigned int oldflags, unsigned int flags)
 {
-	struct fscrypt_info *ci;
+	struct fscrypt_inode_info *ci;
 	struct fscrypt_master_key *mk;
 	int err;
 
@@ -187,7 +187,7 @@ int fscrypt_prepare_setflags(struct inode *inode,
 			return -EINVAL;
 		mk = ci->ci_master_key;
 		down_read(&mk->mk_sem);
-		if (is_master_key_secret_present(&mk->mk_secret))
+		if (mk->mk_present)
 			err = fscrypt_derive_dirhash_key(ci, mk);
 		else
 			err = -ENOKEY;
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index 8bfb3ce86476..b4002aea7cdb 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -39,11 +39,11 @@ static struct block_device **fscrypt_get_devices(struct super_block *sb,
 	return devs;
 }
 
-static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci)
+static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_inode_info *ci)
 {
-	struct super_block *sb = ci->ci_inode->i_sb;
+	const struct super_block *sb = ci->ci_inode->i_sb;
 	unsigned int flags = fscrypt_policy_flags(&ci->ci_policy);
-	int ino_bits = 64, lblk_bits = 64;
+	int dun_bits;
 
 	if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY)
 		return offsetofend(union fscrypt_iv, nonce);
@@ -54,10 +54,9 @@ static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci)
 	if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)
 		return sizeof(__le32);
 
-	/* Default case: IVs are just the file logical block number */
-	if (sb->s_cop->get_ino_and_lblk_bits)
-		sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits);
-	return DIV_ROUND_UP(lblk_bits, 8);
+	/* Default case: IVs are just the file data unit index */
+	dun_bits = fscrypt_max_file_dun_bits(sb, ci->ci_data_unit_bits);
+	return DIV_ROUND_UP(dun_bits, 8);
 }
 
 /*
@@ -90,7 +89,7 @@ static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode,
 }
 
 /* Enable inline encryption for this file if supported. */
-int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
+int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci)
 {
 	const struct inode *inode = ci->ci_inode;
 	struct super_block *sb = inode->i_sb;
@@ -129,7 +128,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
 	 * crypto configuration that the file would use.
 	 */
 	crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode;
-	crypto_cfg.data_unit_size = sb->s_blocksize;
+	crypto_cfg.data_unit_size = 1U << ci->ci_data_unit_bits;
 	crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci);
 
 	devs = fscrypt_get_devices(sb, &num_devs);
@@ -152,7 +151,7 @@ out_free_devs:
 
 int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
 				     const u8 *raw_key,
-				     const struct fscrypt_info *ci)
+				     const struct fscrypt_inode_info *ci)
 {
 	const struct inode *inode = ci->ci_inode;
 	struct super_block *sb = inode->i_sb;
@@ -168,7 +167,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
 		return -ENOMEM;
 
 	err = blk_crypto_init_key(blk_key, raw_key, crypto_mode,
-				  fscrypt_get_dun_bytes(ci), sb->s_blocksize);
+				  fscrypt_get_dun_bytes(ci),
+				  1U << ci->ci_data_unit_bits);
 	if (err) {
 		fscrypt_err(inode, "error %d initializing blk-crypto key", err);
 		goto fail;
@@ -232,13 +232,15 @@ bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode)
 }
 EXPORT_SYMBOL_GPL(__fscrypt_inode_uses_inline_crypto);
 
-static void fscrypt_generate_dun(const struct fscrypt_info *ci, u64 lblk_num,
+static void fscrypt_generate_dun(const struct fscrypt_inode_info *ci,
+				 u64 lblk_num,
 				 u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE])
 {
+	u64 index = lblk_num << ci->ci_data_units_per_block_bits;
 	union fscrypt_iv iv;
 	int i;
 
-	fscrypt_generate_iv(&iv, lblk_num, ci);
+	fscrypt_generate_iv(&iv, index, ci);
 
 	BUILD_BUG_ON(FSCRYPT_MAX_IV_SIZE > BLK_CRYPTO_MAX_IV_SIZE);
 	memset(dun, 0, BLK_CRYPTO_MAX_IV_SIZE);
@@ -265,7 +267,7 @@ static void fscrypt_generate_dun(const struct fscrypt_info *ci, u64 lblk_num,
 void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode,
 			       u64 first_lblk, gfp_t gfp_mask)
 {
-	const struct fscrypt_info *ci;
+	const struct fscrypt_inode_info *ci;
 	u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
 
 	if (!fscrypt_inode_uses_inline_crypto(inode))
@@ -456,7 +458,7 @@ EXPORT_SYMBOL_GPL(fscrypt_dio_supported);
  */
 u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks)
 {
-	const struct fscrypt_info *ci;
+	const struct fscrypt_inode_info *ci;
 	u32 dun;
 
 	if (!fscrypt_inode_uses_inline_crypto(inode))
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 7cbb1fd872ac..f34a9b0b9e92 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -99,10 +99,10 @@ void fscrypt_put_master_key_activeref(struct super_block *sb,
 	spin_unlock(&sb->s_master_keys->lock);
 
 	/*
-	 * ->mk_active_refs == 0 implies that ->mk_secret is not present and
-	 * that ->mk_decrypted_inodes is empty.
+	 * ->mk_active_refs == 0 implies that ->mk_present is false and
+	 * ->mk_decrypted_inodes is empty.
 	 */
-	WARN_ON_ONCE(is_master_key_secret_present(&mk->mk_secret));
+	WARN_ON_ONCE(mk->mk_present);
 	WARN_ON_ONCE(!list_empty(&mk->mk_decrypted_inodes));
 
 	for (i = 0; i <= FSCRYPT_MODE_MAX; i++) {
@@ -121,6 +121,18 @@ void fscrypt_put_master_key_activeref(struct super_block *sb,
 	fscrypt_put_master_key(mk);
 }
 
+/*
+ * This transitions the key state from present to incompletely removed, and then
+ * potentially to absent (depending on whether inodes remain).
+ */
+static void fscrypt_initiate_key_removal(struct super_block *sb,
+					 struct fscrypt_master_key *mk)
+{
+	WRITE_ONCE(mk->mk_present, false);
+	wipe_master_key_secret(&mk->mk_secret);
+	fscrypt_put_master_key_activeref(sb, mk);
+}
+
 static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec)
 {
 	if (spec->__reserved)
@@ -234,14 +246,13 @@ void fscrypt_destroy_keyring(struct super_block *sb)
 			 * evicted, every key remaining in the keyring should
 			 * have an empty inode list, and should only still be in
 			 * the keyring due to the single active ref associated
-			 * with ->mk_secret.  There should be no structural refs
-			 * beyond the one associated with the active ref.
+			 * with ->mk_present.  There should be no structural
+			 * refs beyond the one associated with the active ref.
 			 */
 			WARN_ON_ONCE(refcount_read(&mk->mk_active_refs) != 1);
 			WARN_ON_ONCE(refcount_read(&mk->mk_struct_refs) != 1);
-			WARN_ON_ONCE(!is_master_key_secret_present(&mk->mk_secret));
-			wipe_master_key_secret(&mk->mk_secret);
-			fscrypt_put_master_key_activeref(sb, mk);
+			WARN_ON_ONCE(!mk->mk_present);
+			fscrypt_initiate_key_removal(sb, mk);
 		}
 	}
 	kfree_sensitive(keyring);
@@ -439,7 +450,8 @@ static int add_new_master_key(struct super_block *sb,
 	}
 
 	move_master_key_secret(&mk->mk_secret, secret);
-	refcount_set(&mk->mk_active_refs, 1); /* ->mk_secret is present */
+	mk->mk_present = true;
+	refcount_set(&mk->mk_active_refs, 1); /* ->mk_present is true */
 
 	spin_lock(&keyring->lock);
 	hlist_add_head_rcu(&mk->mk_node,
@@ -478,11 +490,18 @@ static int add_existing_master_key(struct fscrypt_master_key *mk,
 			return err;
 	}
 
-	/* Re-add the secret if needed. */
-	if (!is_master_key_secret_present(&mk->mk_secret)) {
-		if (!refcount_inc_not_zero(&mk->mk_active_refs))
+	/* If the key is incompletely removed, make it present again. */
+	if (!mk->mk_present) {
+		if (!refcount_inc_not_zero(&mk->mk_active_refs)) {
+			/*
+			 * Raced with the last active ref being dropped, so the
+			 * key has become, or is about to become, "absent".
+			 * Therefore, we need to allocate a new key struct.
+			 */
 			return KEY_DEAD;
+		}
 		move_master_key_secret(&mk->mk_secret, secret);
+		WRITE_ONCE(mk->mk_present, true);
 	}
 
 	return 0;
@@ -506,8 +525,8 @@ static int do_add_master_key(struct super_block *sb,
 			err = add_new_master_key(sb, secret, mk_spec);
 	} else {
 		/*
-		 * Found the key in ->s_master_keys.  Re-add the secret if
-		 * needed, and add the user to ->mk_users if needed.
+		 * Found the key in ->s_master_keys.  Add the user to ->mk_users
+		 * if needed, and make the key "present" again if possible.
 		 */
 		down_write(&mk->mk_sem);
 		err = add_existing_master_key(mk, secret);
@@ -867,7 +886,7 @@ static void shrink_dcache_inode(struct inode *inode)
 
 static void evict_dentries_for_decrypted_inodes(struct fscrypt_master_key *mk)
 {
-	struct fscrypt_info *ci;
+	struct fscrypt_inode_info *ci;
 	struct inode *inode;
 	struct inode *toput_inode = NULL;
 
@@ -917,7 +936,7 @@ static int check_for_busy_inodes(struct super_block *sb,
 		/* select an example file to show for debugging purposes */
 		struct inode *inode =
 			list_first_entry(&mk->mk_decrypted_inodes,
-					 struct fscrypt_info,
+					 struct fscrypt_inode_info,
 					 ci_master_key_link)->ci_inode;
 		ino = inode->i_ino;
 	}
@@ -989,9 +1008,8 @@ static int try_to_lock_encrypted_files(struct super_block *sb,
  *
  * If all inodes were evicted, then we unlink the fscrypt_master_key from the
  * keyring.  Otherwise it remains in the keyring in the "incompletely removed"
- * state (without the actual secret key) where it tracks the list of remaining
- * inodes.  Userspace can execute the ioctl again later to retry eviction, or
- * alternatively can re-add the secret key again.
+ * state where it tracks the list of remaining inodes.  Userspace can execute
+ * the ioctl again later to retry eviction, or alternatively can re-add the key.
  *
  * For more details, see the "Removing keys" section of
  * Documentation/filesystems/fscrypt.rst.
@@ -1053,11 +1071,10 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
 		}
 	}
 
-	/* No user claims remaining.  Go ahead and wipe the secret. */
+	/* No user claims remaining.  Initiate removal of the key. */
 	err = -ENOKEY;
-	if (is_master_key_secret_present(&mk->mk_secret)) {
-		wipe_master_key_secret(&mk->mk_secret);
-		fscrypt_put_master_key_activeref(sb, mk);
+	if (mk->mk_present) {
+		fscrypt_initiate_key_removal(sb, mk);
 		err = 0;
 	}
 	inodes_remain = refcount_read(&mk->mk_active_refs) > 0;
@@ -1074,9 +1091,9 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
 	}
 	/*
 	 * We return 0 if we successfully did something: removed a claim to the
-	 * key, wiped the secret, or tried locking the files again.  Users need
-	 * to check the informational status flags if they care whether the key
-	 * has been fully removed including all files locked.
+	 * key, initiated removal of the key, or tried locking the files again.
+	 * Users need to check the informational status flags if they care
+	 * whether the key has been fully removed including all files locked.
 	 */
 out_put_key:
 	fscrypt_put_master_key(mk);
@@ -1103,12 +1120,11 @@ EXPORT_SYMBOL_GPL(fscrypt_ioctl_remove_key_all_users);
  * Retrieve the status of an fscrypt master encryption key.
  *
  * We set ->status to indicate whether the key is absent, present, or
- * incompletely removed.  "Incompletely removed" means that the master key
- * secret has been removed, but some files which had been unlocked with it are
- * still in use.  This field allows applications to easily determine the state
- * of an encrypted directory without using a hack such as trying to open a
- * regular file in it (which can confuse the "incompletely removed" state with
- * absent or present).
+ * incompletely removed.  (For an explanation of what these statuses mean and
+ * how they are represented internally, see struct fscrypt_master_key.)  This
+ * field allows applications to easily determine the status of an encrypted
+ * directory without using a hack such as trying to open a regular file in it
+ * (which can confuse the "incompletely removed" status with absent or present).
  *
  * In addition, for v2 policy keys we allow applications to determine, via
  * ->status_flags and ->user_count, whether the key has been added by the
@@ -1150,7 +1166,7 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg)
 	}
 	down_read(&mk->mk_sem);
 
-	if (!is_master_key_secret_present(&mk->mk_secret)) {
+	if (!mk->mk_present) {
 		arg.status = refcount_read(&mk->mk_active_refs) > 0 ?
 			FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED :
 			FSCRYPT_KEY_STATUS_ABSENT /* raced with full removal */;
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index 361f41ef46c7..d71f7c799e79 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -148,7 +148,7 @@ err_free_tfm:
  * and IV generation method (@ci->ci_policy.flags).
  */
 int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
-			const u8 *raw_key, const struct fscrypt_info *ci)
+			const u8 *raw_key, const struct fscrypt_inode_info *ci)
 {
 	struct crypto_skcipher *tfm;
 
@@ -178,13 +178,14 @@ void fscrypt_destroy_prepared_key(struct super_block *sb,
 }
 
 /* Given a per-file encryption key, set up the file's crypto transform object */
-int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key)
+int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci,
+				 const u8 *raw_key)
 {
 	ci->ci_owns_key = true;
 	return fscrypt_prepare_key(&ci->ci_enc_key, raw_key, ci);
 }
 
-static int setup_per_mode_enc_key(struct fscrypt_info *ci,
+static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci,
 				  struct fscrypt_master_key *mk,
 				  struct fscrypt_prepared_key *keys,
 				  u8 hkdf_context, bool include_fs_uuid)
@@ -265,7 +266,7 @@ static int fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk,
 	return 0;
 }
 
-int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
+int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
 			       const struct fscrypt_master_key *mk)
 {
 	int err;
@@ -279,7 +280,7 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
 	return 0;
 }
 
-void fscrypt_hash_inode_number(struct fscrypt_info *ci,
+void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci,
 			       const struct fscrypt_master_key *mk)
 {
 	WARN_ON_ONCE(ci->ci_inode->i_ino == 0);
@@ -289,7 +290,7 @@ void fscrypt_hash_inode_number(struct fscrypt_info *ci,
 					      &mk->mk_ino_hash_key);
 }
 
-static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci,
+static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_inode_info *ci,
 					    struct fscrypt_master_key *mk)
 {
 	int err;
@@ -329,7 +330,7 @@ unlock:
 	return 0;
 }
 
-static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
+static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci,
 				     struct fscrypt_master_key *mk,
 				     bool need_dirhash_key)
 {
@@ -404,7 +405,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
  * still allow 512-bit master keys if the user chooses to use them, though.)
  */
 static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk,
-					  const struct fscrypt_info *ci)
+					  const struct fscrypt_inode_info *ci)
 {
 	unsigned int min_keysize;
 
@@ -430,11 +431,12 @@ static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk,
  *
  * If the master key is found in the filesystem-level keyring, then it is
  * returned in *mk_ret with its semaphore read-locked.  This is needed to ensure
- * that only one task links the fscrypt_info into ->mk_decrypted_inodes (as
- * multiple tasks may race to create an fscrypt_info for the same inode), and to
- * synchronize the master key being removed with a new inode starting to use it.
+ * that only one task links the fscrypt_inode_info into ->mk_decrypted_inodes
+ * (as multiple tasks may race to create an fscrypt_inode_info for the same
+ * inode), and to synchronize the master key being removed with a new inode
+ * starting to use it.
  */
-static int setup_file_encryption_key(struct fscrypt_info *ci,
+static int setup_file_encryption_key(struct fscrypt_inode_info *ci,
 				     bool need_dirhash_key,
 				     struct fscrypt_master_key **mk_ret)
 {
@@ -484,8 +486,8 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
 	}
 	down_read(&mk->mk_sem);
 
-	/* Has the secret been removed (via FS_IOC_REMOVE_ENCRYPTION_KEY)? */
-	if (!is_master_key_secret_present(&mk->mk_secret)) {
+	if (!mk->mk_present) {
+		/* FS_IOC_REMOVE_ENCRYPTION_KEY has been executed on this key */
 		err = -ENOKEY;
 		goto out_release_key;
 	}
@@ -519,7 +521,7 @@ out_release_key:
 	return err;
 }
 
-static void put_crypt_info(struct fscrypt_info *ci)
+static void put_crypt_info(struct fscrypt_inode_info *ci)
 {
 	struct fscrypt_master_key *mk;
 
@@ -537,8 +539,8 @@ static void put_crypt_info(struct fscrypt_info *ci)
 		/*
 		 * Remove this inode from the list of inodes that were unlocked
 		 * with the master key.  In addition, if we're removing the last
-		 * inode from a master key struct that already had its secret
-		 * removed, then complete the full removal of the struct.
+		 * inode from an incompletely removed key, then complete the
+		 * full removal of the key.
 		 */
 		spin_lock(&mk->mk_decrypted_inodes_lock);
 		list_del(&ci->ci_master_key_link);
@@ -546,7 +548,7 @@ static void put_crypt_info(struct fscrypt_info *ci)
 		fscrypt_put_master_key_activeref(ci->ci_inode->i_sb, mk);
 	}
 	memzero_explicit(ci, sizeof(*ci));
-	kmem_cache_free(fscrypt_info_cachep, ci);
+	kmem_cache_free(fscrypt_inode_info_cachep, ci);
 }
 
 static int
@@ -555,7 +557,7 @@ fscrypt_setup_encryption_info(struct inode *inode,
 			      const u8 nonce[FSCRYPT_FILE_NONCE_SIZE],
 			      bool need_dirhash_key)
 {
-	struct fscrypt_info *crypt_info;
+	struct fscrypt_inode_info *crypt_info;
 	struct fscrypt_mode *mode;
 	struct fscrypt_master_key *mk = NULL;
 	int res;
@@ -564,7 +566,7 @@ fscrypt_setup_encryption_info(struct inode *inode,
 	if (res)
 		return res;
 
-	crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_KERNEL);
+	crypt_info = kmem_cache_zalloc(fscrypt_inode_info_cachep, GFP_KERNEL);
 	if (!crypt_info)
 		return -ENOMEM;
 
@@ -580,6 +582,11 @@ fscrypt_setup_encryption_info(struct inode *inode,
 	WARN_ON_ONCE(mode->ivsize > FSCRYPT_MAX_IV_SIZE);
 	crypt_info->ci_mode = mode;
 
+	crypt_info->ci_data_unit_bits =
+		fscrypt_policy_du_bits(&crypt_info->ci_policy, inode);
+	crypt_info->ci_data_units_per_block_bits =
+		inode->i_blkbits - crypt_info->ci_data_unit_bits;
+
 	res = setup_file_encryption_key(crypt_info, need_dirhash_key, &mk);
 	if (res)
 		goto out;
@@ -587,8 +594,8 @@ fscrypt_setup_encryption_info(struct inode *inode,
 	/*
 	 * For existing inodes, multiple tasks may race to set ->i_crypt_info.
 	 * So use cmpxchg_release().  This pairs with the smp_load_acquire() in
-	 * fscrypt_get_info().  I.e., here we publish ->i_crypt_info with a
-	 * RELEASE barrier so that other tasks can ACQUIRE it.
+	 * fscrypt_get_inode_info().  I.e., here we publish ->i_crypt_info with
+	 * a RELEASE barrier so that other tasks can ACQUIRE it.
 	 */
 	if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) {
 		/*
@@ -735,8 +742,8 @@ EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode);
  * fscrypt_put_encryption_info() - free most of an inode's fscrypt data
  * @inode: an inode being evicted
  *
- * Free the inode's fscrypt_info.  Filesystems must call this when the inode is
- * being evicted.  An RCU grace period need not have elapsed yet.
+ * Free the inode's fscrypt_inode_info.  Filesystems must call this when the
+ * inode is being evicted.  An RCU grace period need not have elapsed yet.
  */
 void fscrypt_put_encryption_info(struct inode *inode)
 {
@@ -773,7 +780,7 @@ EXPORT_SYMBOL(fscrypt_free_inode);
  */
 int fscrypt_drop_inode(struct inode *inode)
 {
-	const struct fscrypt_info *ci = fscrypt_get_info(inode);
+	const struct fscrypt_inode_info *ci = fscrypt_get_inode_info(inode);
 
 	/*
 	 * If ci is NULL, then the inode doesn't have an encryption key set up
@@ -794,13 +801,14 @@ int fscrypt_drop_inode(struct inode *inode)
 		return 0;
 
 	/*
-	 * Note: since we aren't holding the key semaphore, the result here can
+	 * We can't take ->mk_sem here, since this runs in atomic context.
+	 * Therefore, ->mk_present can change concurrently, and our result may
 	 * immediately become outdated.  But there's no correctness problem with
 	 * unnecessarily evicting.  Nor is there a correctness problem with not
 	 * evicting while iput() is racing with the key being removed, since
 	 * then the thread removing the key will either evict the inode itself
 	 * or will correctly detect that it wasn't evicted due to the race.
 	 */
-	return !is_master_key_secret_present(&ci->ci_master_key->mk_secret);
+	return !READ_ONCE(ci->ci_master_key->mk_present);
 }
 EXPORT_SYMBOL_GPL(fscrypt_drop_inode);
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
index 75dabd9b27f9..a10710bc8123 100644
--- a/fs/crypto/keysetup_v1.c
+++ b/fs/crypto/keysetup_v1.c
@@ -178,7 +178,8 @@ void fscrypt_put_direct_key(struct fscrypt_direct_key *dk)
  */
 static struct fscrypt_direct_key *
 find_or_insert_direct_key(struct fscrypt_direct_key *to_insert,
-			  const u8 *raw_key, const struct fscrypt_info *ci)
+			  const u8 *raw_key,
+			  const struct fscrypt_inode_info *ci)
 {
 	unsigned long hash_key;
 	struct fscrypt_direct_key *dk;
@@ -218,7 +219,7 @@ find_or_insert_direct_key(struct fscrypt_direct_key *to_insert,
 
 /* Prepare to encrypt directly using the master key in the given mode */
 static struct fscrypt_direct_key *
-fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key)
+fscrypt_get_direct_key(const struct fscrypt_inode_info *ci, const u8 *raw_key)
 {
 	struct fscrypt_direct_key *dk;
 	int err;
@@ -250,7 +251,7 @@ err_free_dk:
 }
 
 /* v1 policy, DIRECT_KEY: use the master key directly */
-static int setup_v1_file_key_direct(struct fscrypt_info *ci,
+static int setup_v1_file_key_direct(struct fscrypt_inode_info *ci,
 				    const u8 *raw_master_key)
 {
 	struct fscrypt_direct_key *dk;
@@ -264,7 +265,7 @@ static int setup_v1_file_key_direct(struct fscrypt_info *ci,
 }
 
 /* v1 policy, !DIRECT_KEY: derive the file's encryption key */
-static int setup_v1_file_key_derived(struct fscrypt_info *ci,
+static int setup_v1_file_key_derived(struct fscrypt_inode_info *ci,
 				     const u8 *raw_master_key)
 {
 	u8 *derived_key;
@@ -289,7 +290,8 @@ out:
 	return err;
 }
 
-int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, const u8 *raw_master_key)
+int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci,
+			      const u8 *raw_master_key)
 {
 	if (ci->ci_policy.v1.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY)
 		return setup_v1_file_key_direct(ci, raw_master_key);
@@ -297,8 +299,10 @@ int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, const u8 *raw_master_key)
 		return setup_v1_file_key_derived(ci, raw_master_key);
 }
 
-int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci)
+int
+fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_inode_info *ci)
 {
+	const struct super_block *sb = ci->ci_inode->i_sb;
 	struct key *key;
 	const struct fscrypt_key *payload;
 	int err;
@@ -306,8 +310,8 @@ int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci)
 	key = find_and_lock_process_key(FSCRYPT_KEY_DESC_PREFIX,
 					ci->ci_policy.v1.master_key_descriptor,
 					ci->ci_mode->keysize, &payload);
-	if (key == ERR_PTR(-ENOKEY) && ci->ci_inode->i_sb->s_cop->key_prefix) {
-		key = find_and_lock_process_key(ci->ci_inode->i_sb->s_cop->key_prefix,
+	if (key == ERR_PTR(-ENOKEY) && sb->s_cop->legacy_key_prefix) {
+		key = find_and_lock_process_key(sb->s_cop->legacy_key_prefix,
 						ci->ci_policy.v1.master_key_descriptor,
 						ci->ci_mode->keysize, &payload);
 	}
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index f4456ecb3f87..701259991277 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -118,12 +118,11 @@ static bool supported_direct_key_modes(const struct inode *inode,
 }
 
 static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy,
-					 const struct inode *inode,
-					 const char *type,
-					 int max_ino_bits, int max_lblk_bits)
+					 const struct inode *inode)
 {
+	const char *type = (policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64)
+				? "IV_INO_LBLK_64" : "IV_INO_LBLK_32";
 	struct super_block *sb = inode->i_sb;
-	int ino_bits = 64, lblk_bits = 64;
 
 	/*
 	 * IV_INO_LBLK_* exist only because of hardware limitations, and
@@ -150,17 +149,29 @@ static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy,
 			     type, sb->s_id);
 		return false;
 	}
-	if (sb->s_cop->get_ino_and_lblk_bits)
-		sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits);
-	if (ino_bits > max_ino_bits) {
+
+	/*
+	 * IV_INO_LBLK_64 and IV_INO_LBLK_32 both require that inode numbers fit
+	 * in 32 bits.  In principle, IV_INO_LBLK_32 could support longer inode
+	 * numbers because it hashes the inode number; however, currently the
+	 * inode number is gotten from inode::i_ino which is 'unsigned long'.
+	 * So for now the implementation limit is 32 bits.
+	 */
+	if (!sb->s_cop->has_32bit_inodes) {
 		fscrypt_warn(inode,
 			     "Can't use %s policy on filesystem '%s' because its inode numbers are too long",
 			     type, sb->s_id);
 		return false;
 	}
-	if (lblk_bits > max_lblk_bits) {
+
+	/*
+	 * IV_INO_LBLK_64 and IV_INO_LBLK_32 both require that file data unit
+	 * indices fit in 32 bits.
+	 */
+	if (fscrypt_max_file_dun_bits(sb,
+			fscrypt_policy_v2_du_bits(policy, inode)) > 32) {
 		fscrypt_warn(inode,
-			     "Can't use %s policy on filesystem '%s' because its block numbers are too long",
+			     "Can't use %s policy on filesystem '%s' because its maximum file size is too large",
 			     type, sb->s_id);
 		return false;
 	}
@@ -233,25 +244,39 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy,
 		return false;
 	}
 
+	if (policy->log2_data_unit_size) {
+		if (!inode->i_sb->s_cop->supports_subblock_data_units) {
+			fscrypt_warn(inode,
+				     "Filesystem does not support configuring crypto data unit size");
+			return false;
+		}
+		if (policy->log2_data_unit_size > inode->i_blkbits ||
+		    policy->log2_data_unit_size < SECTOR_SHIFT /* 9 */) {
+			fscrypt_warn(inode,
+				     "Unsupported log2_data_unit_size in encryption policy: %d",
+				     policy->log2_data_unit_size);
+			return false;
+		}
+		if (policy->log2_data_unit_size != inode->i_blkbits &&
+		    (policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) {
+			/*
+			 * Not safe to enable yet, as we need to ensure that DUN
+			 * wraparound can only occur on a FS block boundary.
+			 */
+			fscrypt_warn(inode,
+				     "Sub-block data units not yet supported with IV_INO_LBLK_32");
+			return false;
+		}
+	}
+
 	if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) &&
 	    !supported_direct_key_modes(inode, policy->contents_encryption_mode,
 					policy->filenames_encryption_mode))
 		return false;
 
-	if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) &&
-	    !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_64",
-					  32, 32))
-		return false;
-
-	/*
-	 * IV_INO_LBLK_32 hashes the inode number, so in principle it can
-	 * support any ino_bits.  However, currently the inode number is gotten
-	 * from inode::i_ino which is 'unsigned long'.  So for now the
-	 * implementation limit is 32 bits.
-	 */
-	if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) &&
-	    !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_32",
-					  32, 32))
+	if ((policy->flags & (FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 |
+			      FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) &&
+	    !supported_iv_ino_lblk_policy(policy, inode))
 		return false;
 
 	if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) {
@@ -330,6 +355,7 @@ static int fscrypt_new_context(union fscrypt_context *ctx_u,
 		ctx->filenames_encryption_mode =
 			policy->filenames_encryption_mode;
 		ctx->flags = policy->flags;
+		ctx->log2_data_unit_size = policy->log2_data_unit_size;
 		memcpy(ctx->master_key_identifier,
 		       policy->master_key_identifier,
 		       sizeof(ctx->master_key_identifier));
@@ -390,6 +416,7 @@ int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
 		policy->filenames_encryption_mode =
 			ctx->filenames_encryption_mode;
 		policy->flags = ctx->flags;
+		policy->log2_data_unit_size = ctx->log2_data_unit_size;
 		memcpy(policy->__reserved, ctx->__reserved,
 		       sizeof(policy->__reserved));
 		memcpy(policy->master_key_identifier,
@@ -405,11 +432,11 @@ int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
 /* Retrieve an inode's encryption policy */
 static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy)
 {
-	const struct fscrypt_info *ci;
+	const struct fscrypt_inode_info *ci;
 	union fscrypt_context ctx;
 	int ret;
 
-	ci = fscrypt_get_info(inode);
+	ci = fscrypt_get_inode_info(inode);
 	if (ci) {
 		/* key available, use the cached policy */
 		*policy = ci->ci_policy;
@@ -647,7 +674,7 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
 
 	/*
 	 * Both parent and child are encrypted, so verify they use the same
-	 * encryption policy.  Compare the fscrypt_info structs if the keys are
+	 * encryption policy.  Compare the cached policies if the keys are
 	 * available, otherwise retrieve and compare the fscrypt_contexts.
 	 *
 	 * Note that the fscrypt_context retrieval will be required frequently
@@ -717,7 +744,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
  */
 int fscrypt_context_for_new_inode(void *ctx, struct inode *inode)
 {
-	struct fscrypt_info *ci = inode->i_crypt_info;
+	struct fscrypt_inode_info *ci = inode->i_crypt_info;
 
 	BUILD_BUG_ON(sizeof(union fscrypt_context) !=
 			FSCRYPT_SET_CONTEXT_MAX_SIZE);
@@ -742,7 +769,7 @@ EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode);
  */
 int fscrypt_set_context(struct inode *inode, void *fs_data)
 {
-	struct fscrypt_info *ci = inode->i_crypt_info;
+	struct fscrypt_inode_info *ci = inode->i_crypt_info;
 	union fscrypt_context ctx;
 	int ctxsize;
 
diff --git a/fs/dcache.c b/fs/dcache.c
index 25ac74d30bff..796e23761ba0 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3246,11 +3246,10 @@ void d_genocide(struct dentry *parent)
 	d_walk(parent, parent, d_genocide_kill);
 }
 
-void d_tmpfile(struct file *file, struct inode *inode)
+void d_mark_tmpfile(struct file *file, struct inode *inode)
 {
 	struct dentry *dentry = file->f_path.dentry;
 
-	inode_dec_link_count(inode);
 	BUG_ON(dentry->d_name.name != dentry->d_iname ||
 		!hlist_unhashed(&dentry->d_u.d_alias) ||
 		!d_unlinked(dentry));
@@ -3260,6 +3259,15 @@ void d_tmpfile(struct file *file, struct inode *inode)
 				(unsigned long long)inode->i_ino);
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dentry->d_parent->d_lock);
+}
+EXPORT_SYMBOL(d_mark_tmpfile);
+
+void d_tmpfile(struct file *file, struct inode *inode)
+{
+	struct dentry *dentry = file->f_path.dentry;
+
+	inode_dec_link_count(inode);
+	d_mark_tmpfile(file, inode);
 	d_instantiate(dentry, inode);
 }
 EXPORT_SYMBOL(d_tmpfile);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 83e57e9f9fa0..5d41765e0c77 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -72,7 +72,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb)
 	struct inode *inode = new_inode(sb);
 	if (inode) {
 		inode->i_ino = get_next_ino();
-		inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+		simple_inode_init_ts(inode);
 	}
 	return inode;
 }
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 299c295a27a0..c830261aa883 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -338,7 +338,7 @@ static int mknod_ptmx(struct super_block *sb)
 	}
 
 	inode->i_ino = 2;
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 
 	mode = S_IFCHR|opts->ptmxmode;
 	init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
@@ -451,7 +451,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
 	if (!inode)
 		goto fail;
 	inode->i_ino = 1;
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
 	inode->i_op = &simple_dir_inode_operations;
 	inode->i_fop = &simple_dir_operations;
@@ -560,7 +560,7 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv)
 	inode->i_ino = index + 3;
 	inode->i_uid = opts->setuid ? opts->uid : current_fsuid();
 	inode->i_gid = opts->setgid ? opts->gid : current_fsgid();
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	init_special_inode(inode, S_IFCHR|opts->mode, MKDEV(UNIX98_PTY_SLAVE_MAJOR, index));
 
 	sprintf(s, "%d", index);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index f2ed0c0266cb..c586c5db18b5 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -702,6 +702,6 @@ int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
 int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
 		       loff_t offset);
 
-extern const struct xattr_handler *ecryptfs_xattr_handlers[];
+extern const struct xattr_handler * const ecryptfs_xattr_handlers[];
 
 #endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 992d9c7e64ae..a25dd3d20008 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -1210,7 +1210,7 @@ static const struct xattr_handler ecryptfs_xattr_handler = {
 	.set = ecryptfs_xattr_set,
 };
 
-const struct xattr_handler *ecryptfs_xattr_handlers[] = {
+const struct xattr_handler * const ecryptfs_xattr_handlers[] = {
 	&ecryptfs_xattr_handler,
 	NULL
 };
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 59b52718a3a2..7e9961639802 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -51,7 +51,7 @@ static ssize_t efivarfs_file_write(struct file *file,
 	} else {
 		inode_lock(inode);
 		i_size_write(inode, datasize + sizeof(attributes));
-		inode->i_mtime = inode_set_ctime_current(inode);
+		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 		inode_unlock(inode);
 	}
 
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index db9231f0e77b..76dd3c7295d9 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -25,7 +25,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb,
 	if (inode) {
 		inode->i_ino = get_next_ino();
 		inode->i_mode = mode;
-		inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+		simple_inode_init_ts(inode);
 		inode->i_flags = is_removable ? 0 : S_IMMUTABLE;
 		switch (mode & S_IFMT) {
 		case S_IFREG:
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 3789d22ba501..7844ab24b813 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -103,10 +103,9 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
 	i_uid_write(inode, (uid_t)be16_to_cpu(efs_inode->di_uid));
 	i_gid_write(inode, (gid_t)be16_to_cpu(efs_inode->di_gid));
 	inode->i_size  = be32_to_cpu(efs_inode->di_size);
-	inode->i_atime.tv_sec = be32_to_cpu(efs_inode->di_atime);
-	inode->i_mtime.tv_sec = be32_to_cpu(efs_inode->di_mtime);
+	inode_set_atime(inode, be32_to_cpu(efs_inode->di_atime), 0);
+	inode_set_mtime(inode, be32_to_cpu(efs_inode->di_mtime), 0);
 	inode_set_ctime(inode, be32_to_cpu(efs_inode->di_ctime), 0);
-	inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
 
 	/* this is the number of blocks in the file */
 	if (inode->i_size == 0) {
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 0c2c99c58b5e..f6a0a1748521 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -222,7 +222,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
 			up_read(&devs->rwsem);
 			return 0;
 		}
-		map->m_bdev = dif->bdev;
+		map->m_bdev = dif->bdev_handle->bdev;
 		map->m_daxdev = dif->dax_dev;
 		map->m_dax_part_off = dif->dax_part_off;
 		map->m_fscache = dif->fscache;
@@ -240,7 +240,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
 			if (map->m_pa >= startoff &&
 			    map->m_pa < startoff + length) {
 				map->m_pa -= startoff;
-				map->m_bdev = dif->bdev;
+				map->m_bdev = dif->bdev_handle->bdev;
 				map->m_daxdev = dif->dax_dev;
 				map->m_dax_part_off = dif->dax_part_off;
 				map->m_fscache = dif->fscache;
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index edc8ec7581b8..b8ad05b4509d 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -175,7 +175,8 @@ static void *erofs_read_inode(struct erofs_buf *buf,
 		vi->chunkbits = sb->s_blocksize_bits +
 			(vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK);
 	}
-	inode->i_mtime = inode->i_atime = inode_get_ctime(inode);
+	inode_set_mtime_to_ts(inode,
+			      inode_set_atime_to_ts(inode, inode_get_ctime(inode)));
 
 	inode->i_flags &= ~S_DAX;
 	if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 4ff88d0dd980..cf04e21bfda2 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -47,7 +47,7 @@ typedef u32 erofs_blk_t;
 struct erofs_device_info {
 	char *path;
 	struct erofs_fscache *fscache;
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	struct dax_device *dax_dev;
 	u64 dax_part_off;
 
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 3700af9ee173..6fd04781fec5 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -227,7 +227,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
 	struct erofs_sb_info *sbi = EROFS_SB(sb);
 	struct erofs_fscache *fscache;
 	struct erofs_deviceslot *dis;
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	void *ptr;
 
 	ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP);
@@ -251,13 +251,13 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
 			return PTR_ERR(fscache);
 		dif->fscache = fscache;
 	} else if (!sbi->devs->flatdev) {
-		bdev = blkdev_get_by_path(dif->path, BLK_OPEN_READ, sb->s_type,
-					  NULL);
-		if (IS_ERR(bdev))
-			return PTR_ERR(bdev);
-		dif->bdev = bdev;
-		dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off,
-						  NULL, NULL);
+		bdev_handle = bdev_open_by_path(dif->path, BLK_OPEN_READ,
+						sb->s_type, NULL);
+		if (IS_ERR(bdev_handle))
+			return PTR_ERR(bdev_handle);
+		dif->bdev_handle = bdev_handle;
+		dif->dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev,
+				&dif->dax_part_off, NULL, NULL);
 	}
 
 	dif->blocks = le32_to_cpu(dis->blocks);
@@ -806,8 +806,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
 	struct erofs_device_info *dif = ptr;
 
 	fs_put_dax(dif->dax_dev, NULL);
-	if (dif->bdev)
-		blkdev_put(dif->bdev, &erofs_fs_type);
+	if (dif->bdev_handle)
+		bdev_release(dif->bdev_handle);
 	erofs_fscache_unregister_cookie(dif->fscache);
 	dif->fscache = NULL;
 	kfree(dif->path);
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 09d341675e89..b58316b49a43 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -168,7 +168,7 @@ const struct xattr_handler __maybe_unused erofs_xattr_security_handler = {
 };
 #endif
 
-const struct xattr_handler *erofs_xattr_handlers[] = {
+const struct xattr_handler * const erofs_xattr_handlers[] = {
 	&erofs_xattr_user_handler,
 	&erofs_xattr_trusted_handler,
 #ifdef CONFIG_EROFS_FS_SECURITY
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
index f16283cb8c93..b246cd0e135e 100644
--- a/fs/erofs/xattr.h
+++ b/fs/erofs/xattr.h
@@ -23,7 +23,7 @@ static inline const char *erofs_xattr_prefix(unsigned int idx,
 {
 	const struct xattr_handler *handler = NULL;
 
-	static const struct xattr_handler *xattr_handler_map[] = {
+	static const struct xattr_handler * const xattr_handler_map[] = {
 		[EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler,
 #ifdef CONFIG_EROFS_FS_POSIX_ACL
 		[EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access,
@@ -44,7 +44,7 @@ static inline const char *erofs_xattr_prefix(unsigned int idx,
 	return xattr_prefix(handler);
 }
 
-extern const struct xattr_handler *erofs_xattr_handlers[];
+extern const struct xattr_handler * const erofs_xattr_handlers[];
 
 int erofs_xattr_prefixes_init(struct super_block *sb);
 void erofs_xattr_prefixes_cleanup(struct super_block *sb);
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index f55498e5c23d..f78b614f44dc 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -549,6 +549,7 @@ void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
 void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
 		u8 tz, __le16 time, __le16 date, u8 time_cs);
 void exfat_truncate_atime(struct timespec64 *ts);
+void exfat_truncate_inode_atime(struct inode *inode);
 void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
 		u8 *tz, __le16 *time, __le16 *date, u8 *time_cs);
 u16 exfat_calc_chksum16(void *data, int len, u16 chksum, int type);
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 32395ef686a2..30ee2c8d36a5 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -22,7 +22,7 @@ static int exfat_cont_expand(struct inode *inode, loff_t size)
 	if (err)
 		return err;
 
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	mark_inode_dirty(inode);
 
 	if (!IS_SYNC(inode))
@@ -290,10 +290,11 @@ int exfat_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 	}
 
 	if (attr->ia_valid & ATTR_SIZE)
-		inode->i_mtime = inode_set_ctime_current(inode);
+		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 
+	/* Apply the requested attributes; atime is truncated afterwards. */
 	setattr_copy(&nop_mnt_idmap, inode, attr);
-	exfat_truncate_atime(&inode->i_atime);
+	exfat_truncate_inode_atime(inode);
 
 	if (attr->ia_valid & ATTR_SIZE) {
 		error = exfat_block_truncate_page(inode, attr->ia_size);
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 13329baeafbc..a2185e6f0548 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -26,6 +26,7 @@ int __exfat_write_inode(struct inode *inode, int sync)
 	struct exfat_sb_info *sbi = EXFAT_SB(sb);
 	struct exfat_inode_info *ei = EXFAT_I(inode);
 	bool is_dir = (ei->type == TYPE_DIR) ? true : false;
+	struct timespec64 ts;
 
 	if (inode->i_ino == EXFAT_ROOT_INO)
 		return 0;
@@ -55,16 +56,18 @@ int __exfat_write_inode(struct inode *inode, int sync)
 			&ep->dentry.file.create_time,
 			&ep->dentry.file.create_date,
 			&ep->dentry.file.create_time_cs);
-	exfat_set_entry_time(sbi, &inode->i_mtime,
-			&ep->dentry.file.modify_tz,
-			&ep->dentry.file.modify_time,
-			&ep->dentry.file.modify_date,
-			&ep->dentry.file.modify_time_cs);
-	exfat_set_entry_time(sbi, &inode->i_atime,
-			&ep->dentry.file.access_tz,
-			&ep->dentry.file.access_time,
-			&ep->dentry.file.access_date,
-			NULL);
+	/* Encode the inode's current mtime into the directory entry. */
+	ts = inode_get_mtime(inode);
+	exfat_set_entry_time(sbi, &ts, &ep->dentry.file.modify_tz,
+			     &ep->dentry.file.modify_time,
+			     &ep->dentry.file.modify_date,
+			     &ep->dentry.file.modify_time_cs);
+	/* Likewise for atime; no time_cs field is passed for it. */
+	ts = inode_get_atime(inode);
+	exfat_set_entry_time(sbi, &ts, &ep->dentry.file.access_tz,
+			     &ep->dentry.file.access_time,
+			     &ep->dentry.file.access_date,
+			     NULL);
 
 	/* File size should be zero if there is no cluster allocated */
 	on_disk_size = i_size_read(inode);
@@ -355,7 +358,7 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to)
 
 	if (to > i_size_read(inode)) {
 		truncate_pagecache(inode, i_size_read(inode));
-		inode->i_mtime = inode_set_ctime_current(inode);
+		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 		exfat_truncate(inode);
 	}
 }
@@ -398,7 +401,7 @@ static int exfat_write_end(struct file *file, struct address_space *mapping,
 		exfat_write_failed(mapping, pos+len);
 
 	if (!(err < 0) && !(ei->attr & ATTR_ARCHIVE)) {
-		inode->i_mtime = inode_set_ctime_current(inode);
+		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 		ei->attr |= ATTR_ARCHIVE;
 		mark_inode_dirty(inode);
 	}
@@ -576,10 +579,10 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
 	exfat_save_attr(inode, info->attr);
 
 	inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9;
-	inode->i_mtime = info->mtime;
+	inode_set_mtime_to_ts(inode, info->mtime);
 	inode_set_ctime_to_ts(inode, info->mtime);
 	ei->i_crtime = info->crtime;
-	inode->i_atime = info->atime;
+	inode_set_atime_to_ts(inode, info->atime);
 
 	return 0;
 }
diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c
index 2e1a1a6b1021..fa8459828046 100644
--- a/fs/exfat/misc.c
+++ b/fs/exfat/misc.c
@@ -126,6 +126,14 @@ void exfat_truncate_atime(struct timespec64 *ts)
 	ts->tv_nsec = 0;
 }
 
+void exfat_truncate_inode_atime(struct inode *inode)
+{
+	struct timespec64 atime = inode_get_atime(inode);
+
+	exfat_truncate_atime(&atime);
+	inode_set_atime_to_ts(inode, atime);
+}
+
 u16 exfat_calc_chksum16(void *data, int len, u16 chksum, int type)
 {
 	int i;
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 1b9f587f6cca..b92e46916dea 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -569,7 +569,7 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir,
 		goto unlock;
 
 	inode_inc_iversion(dir);
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	if (IS_DIRSYNC(dir))
 		exfat_sync_inode(dir);
 	else
@@ -582,8 +582,9 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir,
 		goto unlock;
 
 	inode_inc_iversion(inode);
-	inode->i_mtime = inode->i_atime = EXFAT_I(inode)->i_crtime = inode_set_ctime_current(inode);
-	exfat_truncate_atime(&inode->i_atime);
+	EXFAT_I(inode)->i_crtime = simple_inode_init_ts(inode);
+	exfat_truncate_inode_atime(inode);
+
 	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
 
 	d_instantiate(dentry, inode);
@@ -816,16 +817,16 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
 	ei->dir.dir = DIR_DELETED;
 
 	inode_inc_iversion(dir);
-	dir->i_mtime = dir->i_atime = inode_set_ctime_current(dir);
-	exfat_truncate_atime(&dir->i_atime);
+	simple_inode_init_ts(dir);
+	exfat_truncate_inode_atime(dir);
 	if (IS_DIRSYNC(dir))
 		exfat_sync_inode(dir);
 	else
 		mark_inode_dirty(dir);
 
 	clear_nlink(inode);
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
-	exfat_truncate_atime(&inode->i_atime);
+	simple_inode_init_ts(inode);
+	exfat_truncate_inode_atime(inode);
 	exfat_unhash_inode(inode);
 	exfat_d_version_set(dentry, inode_query_iversion(dir));
 unlock:
@@ -851,7 +852,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 		goto unlock;
 
 	inode_inc_iversion(dir);
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	if (IS_DIRSYNC(dir))
 		exfat_sync_inode(dir);
 	else
@@ -865,8 +866,8 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 		goto unlock;
 
 	inode_inc_iversion(inode);
-	inode->i_mtime = inode->i_atime = EXFAT_I(inode)->i_crtime = inode_set_ctime_current(inode);
-	exfat_truncate_atime(&inode->i_atime);
+	EXFAT_I(inode)->i_crtime = simple_inode_init_ts(inode);
+	exfat_truncate_inode_atime(inode);
 	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
 
 	d_instantiate(dentry, inode);
@@ -977,8 +978,8 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
 	ei->dir.dir = DIR_DELETED;
 
 	inode_inc_iversion(dir);
-	dir->i_mtime = dir->i_atime = inode_set_ctime_current(dir);
-	exfat_truncate_atime(&dir->i_atime);
+	simple_inode_init_ts(dir);
+	exfat_truncate_inode_atime(dir);
 	if (IS_DIRSYNC(dir))
 		exfat_sync_inode(dir);
 	else
@@ -986,8 +987,8 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
 	drop_nlink(dir);
 
 	clear_nlink(inode);
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
-	exfat_truncate_atime(&inode->i_atime);
+	simple_inode_init_ts(inode);
+	exfat_truncate_inode_atime(inode);
 	exfat_unhash_inode(inode);
 	exfat_d_version_set(dentry, inode_query_iversion(dir));
 unlock:
@@ -1312,7 +1313,7 @@ static int exfat_rename(struct mnt_idmap *idmap,
 	inode_inc_iversion(new_dir);
 	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
 	EXFAT_I(new_dir)->i_crtime = current_time(new_dir);
-	exfat_truncate_atime(&new_dir->i_atime);
+	exfat_truncate_inode_atime(new_dir);
 	if (IS_DIRSYNC(new_dir))
 		exfat_sync_inode(new_dir);
 	else
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 2778bd9b631e..e919a68bf4a1 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -370,8 +370,8 @@ static int exfat_read_root(struct inode *inode)
 	ei->i_size_ondisk = i_size_read(inode);
 
 	exfat_save_attr(inode, ATTR_SUBDIR);
-	inode->i_mtime = inode->i_atime = ei->i_crtime = inode_set_ctime_current(inode);
-	exfat_truncate_atime(&inode->i_atime);
+	ei->i_crtime = simple_inode_init_ts(inode);
+	exfat_truncate_inode_atime(inode);
 	return 0;
 }
 
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index b335f17f682f..c7900868171b 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -468,7 +468,7 @@ int ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
 	ext2_set_de_type(de, inode);
 	ext2_commit_chunk(page, pos, len);
 	if (update_times)
-		dir->i_mtime = inode_set_ctime_current(dir);
+		inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
 	mark_inode_dirty(dir);
 	return ext2_handle_dirsync(dir);
@@ -555,7 +555,7 @@ got_it:
 	de->inode = cpu_to_le32(inode->i_ino);
 	ext2_set_de_type (de, inode);
 	ext2_commit_chunk(page, pos, rec_len);
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
 	mark_inode_dirty(dir);
 	err = ext2_handle_dirsync(dir);
@@ -606,7 +606,7 @@ int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page)
 		pde->rec_len = ext2_rec_len_to_disk(to - from);
 	dir->inode = 0;
 	ext2_commit_chunk(page, pos, to - from);
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL;
 	mark_inode_dirty(inode);
 	return ext2_handle_dirsync(inode);
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index c24d0de95a83..fdf63e9c6e7c 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -546,7 +546,7 @@ got:
 
 	inode->i_ino = ino;
 	inode->i_blocks = 0;
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	memset(ei->i_data, 0, sizeof(ei->i_data));
 	ei->i_flags =
 		ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 314b415ee518..464faf6c217e 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1291,7 +1291,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
 	__ext2_truncate_blocks(inode, newsize);
 	filemap_invalidate_unlock(inode->i_mapping);
 
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	if (inode_needs_sync(inode)) {
 		sync_mapping_buffers(inode->i_mapping);
 		sync_inode_metadata(inode, 1);
@@ -1412,10 +1412,9 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
 	i_gid_write(inode, i_gid);
 	set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
 	inode->i_size = le32_to_cpu(raw_inode->i_size);
-	inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
+	inode_set_atime(inode, (signed)le32_to_cpu(raw_inode->i_atime), 0);
 	inode_set_ctime(inode, (signed)le32_to_cpu(raw_inode->i_ctime), 0);
-	inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
-	inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
+	inode_set_mtime(inode, (signed)le32_to_cpu(raw_inode->i_mtime), 0);
 	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
 	/* We now have enough fields to check if the inode was active or not.
 	 * This is needed because nfsd might try to access dead inodes
@@ -1544,9 +1543,9 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
 	}
 	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
 	raw_inode->i_size = cpu_to_le32(inode->i_size);
-	raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
-	raw_inode->i_ctime = cpu_to_le32(inode_get_ctime(inode).tv_sec);
-	raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+	raw_inode->i_atime = cpu_to_le32(inode_get_atime_sec(inode));
+	raw_inode->i_ctime = cpu_to_le32(inode_get_ctime_sec(inode));
+	raw_inode->i_mtime = cpu_to_le32(inode_get_mtime_sec(inode));
 
 	raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
 	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index aaf3e3e88cb2..645ee6142f69 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1572,7 +1572,7 @@ out:
 	if (inode->i_size < off+len-towrite)
 		i_size_write(inode, off+len-towrite);
 	inode_inc_iversion(inode);
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	mark_inode_dirty(inode);
 	return len - towrite;
 }
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 20f741184673..e849241ebb8f 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -98,7 +98,7 @@ static struct buffer_head *ext2_xattr_cache_find(struct inode *,
 static void ext2_xattr_rehash(struct ext2_xattr_header *,
 			      struct ext2_xattr_entry *);
 
-static const struct xattr_handler *ext2_xattr_handler_map[] = {
+static const struct xattr_handler * const ext2_xattr_handler_map[] = {
 	[EXT2_XATTR_INDEX_USER]		     = &ext2_xattr_user_handler,
 #ifdef CONFIG_EXT2_FS_POSIX_ACL
 	[EXT2_XATTR_INDEX_POSIX_ACL_ACCESS]  = &nop_posix_acl_access,
@@ -110,7 +110,7 @@ static const struct xattr_handler *ext2_xattr_handler_map[] = {
 #endif
 };
 
-const struct xattr_handler *ext2_xattr_handlers[] = {
+const struct xattr_handler * const ext2_xattr_handlers[] = {
 	&ext2_xattr_user_handler,
 	&ext2_xattr_trusted_handler,
 #ifdef CONFIG_EXT2_FS_SECURITY
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index 7925f596e8e2..6a4966949047 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -72,7 +72,7 @@ extern void ext2_xattr_delete_inode(struct inode *);
 extern struct mb_cache *ext2_xattr_create_cache(void);
 extern void ext2_xattr_destroy_cache(struct mb_cache *cache);
 
-extern const struct xattr_handler *ext2_xattr_handlers[];
+extern const struct xattr_handler * const ext2_xattr_handlers[];
 
 # else  /* CONFIG_EXT2_FS_XATTR */
 
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 453d4da5de52..7ae0b61258a7 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -232,19 +232,14 @@ static bool ext4_has_stable_inodes(struct super_block *sb)
 	return ext4_has_feature_stable_inodes(sb);
 }
 
-static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
-				       int *ino_bits_ret, int *lblk_bits_ret)
-{
-	*ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
-	*lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
-}
-
 const struct fscrypt_operations ext4_cryptops = {
-	.key_prefix		= "ext4:",
+	.needs_bounce_pages	= 1,
+	.has_32bit_inodes	= 1,
+	.supports_subblock_data_units = 1,
+	.legacy_key_prefix	= "ext4:",
 	.get_context		= ext4_get_context,
 	.set_context		= ext4_set_context,
 	.get_dummy_policy	= ext4_get_dummy_policy,
 	.empty_dir		= ext4_empty_dir,
 	.has_stable_inodes	= ext4_has_stable_inodes,
-	.get_ino_and_lblk_bits	= ext4_get_ino_and_lblk_bits,
 };
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9418359b1d9d..8da5fb680210 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -891,10 +891,13 @@ do {										\
 		(raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX));	\
 } while (0)
 
-#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode)				\
-	EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, (inode)->xtime)
+#define EXT4_INODE_SET_ATIME(inode, raw_inode)						\
+	EXT4_INODE_SET_XTIME_VAL(i_atime, inode, raw_inode, inode_get_atime(inode))
 
-#define EXT4_INODE_SET_CTIME(inode, raw_inode)					\
+#define EXT4_INODE_SET_MTIME(inode, raw_inode)						\
+	EXT4_INODE_SET_XTIME_VAL(i_mtime, inode, raw_inode, inode_get_mtime(inode))
+
+#define EXT4_INODE_SET_CTIME(inode, raw_inode)						\
 	EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode))
 
 #define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode)				\
@@ -910,9 +913,16 @@ do {										\
 			.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime)	\
 		})
 
-#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode)				\
+#define EXT4_INODE_GET_ATIME(inode, raw_inode)					\
+do {										\
+	inode_set_atime_to_ts(inode,						\
+		EXT4_INODE_GET_XTIME_VAL(i_atime, inode, raw_inode));		\
+} while (0)
+
+#define EXT4_INODE_GET_MTIME(inode, raw_inode)					\
 do {										\
-	(inode)->xtime = EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode);	\
+	inode_set_mtime_to_ts(inode,						\
+		EXT4_INODE_GET_XTIME_VAL(i_mtime, inode, raw_inode));		\
 } while (0)
 
 #define EXT4_INODE_GET_CTIME(inode, raw_inode)					\
@@ -1537,7 +1547,7 @@ struct ext4_sb_info {
 	unsigned long s_commit_interval;
 	u32 s_max_batch_time;
 	u32 s_min_batch_time;
-	struct block_device *s_journal_bdev;
+	struct bdev_handle *s_journal_bdev_handle;
 #ifdef CONFIG_QUOTA
 	/* Names of quota files with journalled quota */
 	char __rcu *s_qf_names[EXT4_MAXQUOTAS];
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 202c76996b62..4c4176ee1749 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4481,7 +4481,8 @@ retry:
 			if (epos > new_size)
 				epos = new_size;
 			if (ext4_update_inode_size(inode, epos) & 0x1)
-				inode->i_mtime = inode_get_ctime(inode);
+				inode_set_mtime_to_ts(inode,
+						      inode_get_ctime(inode));
 		}
 		ret2 = ext4_mark_inode_dirty(handle, inode);
 		ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -4617,7 +4618,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 
 		/* Now release the pages and zero block aligned part of pages */
 		truncate_pagecache_range(inode, start, end - 1);
-		inode->i_mtime = inode_set_ctime_current(inode);
+		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 
 		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
 					     flags);
@@ -4642,7 +4643,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 		goto out_mutex;
 	}
 
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	if (new_size)
 		ext4_update_inode_size(inode, new_size);
 	ret = ext4_mark_inode_dirty(handle, inode);
@@ -5378,7 +5379,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
 	up_write(&EXT4_I(inode)->i_data_sem);
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	ret = ext4_mark_inode_dirty(handle, inode);
 	ext4_update_inode_fsync_trans(handle, inode, 1);
 
@@ -5488,7 +5489,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
 	/* Expand file to avoid data loss if there is error while shifting */
 	inode->i_size += len;
 	EXT4_I(inode)->i_disksize += len;
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	ret = ext4_mark_inode_dirty(handle, inode);
 	if (ret)
 		goto out_stop;
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index cdf9bfe10137..11e6f33677a2 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -576,8 +576,9 @@ static bool ext4_getfsmap_is_valid_device(struct super_block *sb,
 	if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
 	    fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev))
 		return true;
-	if (EXT4_SB(sb)->s_journal_bdev &&
-	    fm->fmr_device == new_encode_dev(EXT4_SB(sb)->s_journal_bdev->bd_dev))
+	if (EXT4_SB(sb)->s_journal_bdev_handle &&
+	    fm->fmr_device ==
+	    new_encode_dev(EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev))
 		return true;
 	return false;
 }
@@ -647,9 +648,9 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head,
 	memset(handlers, 0, sizeof(handlers));
 	handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev);
 	handlers[0].gfd_fn = ext4_getfsmap_datadev;
-	if (EXT4_SB(sb)->s_journal_bdev) {
+	if (EXT4_SB(sb)->s_journal_bdev_handle) {
 		handlers[1].gfd_dev = new_encode_dev(
-				EXT4_SB(sb)->s_journal_bdev->bd_dev);
+			EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev);
 		handlers[1].gfd_fn = ext4_getfsmap_logdev;
 	}
 
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index b65058d972f9..e9bbb1da2d0a 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1250,8 +1250,8 @@ got:
 	inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
 	/* This is the optimal IO size (for stat), not the fs block size */
 	inode->i_blocks = 0;
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
-	ei->i_crtime = inode->i_mtime;
+	simple_inode_init_ts(inode);
+	ei->i_crtime = inode_get_mtime(inode);
 
 	memset(ei->i_data, 0, sizeof(ei->i_data));
 	ei->i_dir_start_lookup = 0;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 012d9259ff53..9a84a5f9fef4 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1037,7 +1037,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
 	 * happen is that the times are slightly out of date
 	 * and/or different from the directory change time.
 	 */
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	ext4_update_dx_flag(dir);
 	inode_inc_iversion(dir);
 	return 1;
@@ -1991,7 +1991,7 @@ out:
 		ext4_orphan_del(handle, inode);
 
 	if (err == 0) {
-		inode->i_mtime = inode_set_ctime_current(inode);
+		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 		err = ext4_mark_inode_dirty(handle, inode);
 		if (IS_SYNC(inode))
 			ext4_handle_sync(handle);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4ce35f1c8b0a..08cb5c0e0d51 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4020,7 +4020,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
 
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	ret2 = ext4_mark_inode_dirty(handle, inode);
 	if (unlikely(ret2))
 		ret = ret2;
@@ -4180,7 +4180,7 @@ out_stop:
 	if (inode->i_nlink)
 		ext4_orphan_del(handle, inode);
 
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	err2 = ext4_mark_inode_dirty(handle, inode);
 	if (unlikely(err2 && !err))
 		err = err2;
@@ -4284,8 +4284,8 @@ static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode
 	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
 
 	EXT4_INODE_SET_CTIME(inode, raw_inode);
-	EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
-	EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+	EXT4_INODE_SET_MTIME(inode, raw_inode);
+	EXT4_INODE_SET_ATIME(inode, raw_inode);
 	EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
 
 	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
@@ -4893,8 +4893,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
 	}
 
 	EXT4_INODE_GET_CTIME(inode, raw_inode);
-	EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
-	EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
+	EXT4_INODE_GET_ATIME(inode, raw_inode);
+	EXT4_INODE_GET_MTIME(inode, raw_inode);
 	EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
 
 	if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
@@ -5019,8 +5019,8 @@ static void __ext4_update_other_inode_time(struct super_block *sb,
 
 		spin_lock(&ei->i_raw_lock);
 		EXT4_INODE_SET_CTIME(inode, raw_inode);
-		EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
-		EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+		EXT4_INODE_SET_MTIME(inode, raw_inode);
+		EXT4_INODE_SET_ATIME(inode, raw_inode);
 		ext4_inode_csum_set(inode, raw_inode, ei);
 		spin_unlock(&ei->i_raw_lock);
 		trace_ext4_other_inode_update_time(inode, orig_ino);
@@ -5413,7 +5413,8 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 			 * update c/mtime in shrink case below
 			 */
 			if (!shrink)
-				inode->i_mtime = inode_set_ctime_current(inode);
+				inode_set_mtime_to_ts(inode,
+						      inode_set_ctime_current(inode));
 
 			if (shrink)
 				ext4_fc_track_range(handle, inode,
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 0bfe2ce589e2..4f931f80cb34 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -312,13 +312,22 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
 	struct ext4_inode_info *ei1;
 	struct ext4_inode_info *ei2;
 	unsigned long tmp;
+	struct timespec64 ts1, ts2;
 
 	ei1 = EXT4_I(inode1);
 	ei2 = EXT4_I(inode2);
 
 	swap(inode1->i_version, inode2->i_version);
-	swap(inode1->i_atime, inode2->i_atime);
-	swap(inode1->i_mtime, inode2->i_mtime);
+
+	ts1 = inode_get_atime(inode1);
+	ts2 = inode_get_atime(inode2);
+	inode_set_atime_to_ts(inode1, ts2);
+	inode_set_atime_to_ts(inode2, ts1);
+
+	ts1 = inode_get_mtime(inode1);
+	ts2 = inode_get_mtime(inode2);
+	inode_set_mtime_to_ts(inode1, ts2);
+	inode_set_mtime_to_ts(inode2, ts1);
 
 	memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
 	tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index bbda587f76b8..057d74467293 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2207,7 +2207,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
 	 * happen is that the times are slightly out of date
 	 * and/or different from the directory change time.
 	 */
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	ext4_update_dx_flag(dir);
 	inode_inc_iversion(dir);
 	err2 = ext4_mark_inode_dirty(handle, dir);
@@ -3202,7 +3202,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
 	 * recovery. */
 	inode->i_size = 0;
 	ext4_orphan_add(handle, inode);
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	inode_set_ctime_current(inode);
 	retval = ext4_mark_inode_dirty(handle, inode);
 	if (retval)
@@ -3277,7 +3277,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
 		retval = ext4_delete_entry(handle, dir, de, bh);
 		if (retval)
 			goto out_handle;
-		dir->i_mtime = inode_set_ctime_current(dir);
+		inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 		ext4_update_dx_flag(dir);
 		retval = ext4_mark_inode_dirty(handle, dir);
 		if (retval)
@@ -3648,7 +3648,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
 	if (ext4_has_feature_filetype(ent->dir->i_sb))
 		ent->de->file_type = file_type;
 	inode_inc_iversion(ent->dir);
-	ent->dir->i_mtime = inode_set_ctime_current(ent->dir);
+	inode_set_mtime_to_ts(ent->dir, inode_set_ctime_current(ent->dir));
 	retval = ext4_mark_inode_dirty(handle, ent->dir);
 	BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
 	if (!ent->inlined) {
@@ -3963,7 +3963,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 		ext4_dec_count(new.inode);
 		inode_set_ctime_current(new.inode);
 	}
-	old.dir->i_mtime = inode_set_ctime_current(old.dir);
+	inode_set_mtime_to_ts(old.dir, inode_set_ctime_current(old.dir));
 	ext4_update_dx_flag(old.dir);
 	if (old.dir_bh) {
 		retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dbebd8b3127e..42a44990d99c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1351,14 +1351,14 @@ static void ext4_put_super(struct super_block *sb)
 
 	sync_blockdev(sb->s_bdev);
 	invalidate_bdev(sb->s_bdev);
-	if (sbi->s_journal_bdev) {
+	if (sbi->s_journal_bdev_handle) {
 		/*
 		 * Invalidate the journal device's buffers.  We don't want them
 		 * floating about in memory - the physical journal device may
 		 * hotswapped, and it breaks the `ro-after' testing code.
 		 */
-		sync_blockdev(sbi->s_journal_bdev);
-		invalidate_bdev(sbi->s_journal_bdev);
+		sync_blockdev(sbi->s_journal_bdev_handle->bdev);
+		invalidate_bdev(sbi->s_journal_bdev_handle->bdev);
 	}
 
 	ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
@@ -4233,7 +4233,7 @@ int ext4_calculate_overhead(struct super_block *sb)
 	 * Add the internal journal blocks whether the journal has been
 	 * loaded or not
 	 */
-	if (sbi->s_journal && !sbi->s_journal_bdev)
+	if (sbi->s_journal && !sbi->s_journal_bdev_handle)
 		overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len);
 	else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
 		/* j_inum for internal journal is non-zero */
@@ -5670,9 +5670,9 @@ failed_mount:
 #endif
 	fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
 	brelse(sbi->s_sbh);
-	if (sbi->s_journal_bdev) {
-		invalidate_bdev(sbi->s_journal_bdev);
-		blkdev_put(sbi->s_journal_bdev, sb);
+	if (sbi->s_journal_bdev_handle) {
+		invalidate_bdev(sbi->s_journal_bdev_handle->bdev);
+		bdev_release(sbi->s_journal_bdev_handle);
 	}
 out_fail:
 	invalidate_bdev(sb->s_bdev);
@@ -5842,12 +5842,13 @@ static journal_t *ext4_open_inode_journal(struct super_block *sb,
 	return journal;
 }
 
-static struct block_device *ext4_get_journal_blkdev(struct super_block *sb,
+static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb,
 					dev_t j_dev, ext4_fsblk_t *j_start,
 					ext4_fsblk_t *j_len)
 {
 	struct buffer_head *bh;
 	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	int hblock, blocksize;
 	ext4_fsblk_t sb_block;
 	unsigned long offset;
@@ -5856,16 +5857,17 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb,
 
 	/* see get_tree_bdev why this is needed and safe */
 	up_write(&sb->s_umount);
-	bdev = blkdev_get_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb,
-				 &fs_holder_ops);
+	bdev_handle = bdev_open_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
+				       sb, &fs_holder_ops);
 	down_write(&sb->s_umount);
-	if (IS_ERR(bdev)) {
+	if (IS_ERR(bdev_handle)) {
 		ext4_msg(sb, KERN_ERR,
 			 "failed to open journal device unknown-block(%u,%u) %ld",
-			 MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev));
-		return ERR_CAST(bdev);
+			 MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_handle));
+		return bdev_handle;
 	}
 
+	bdev = bdev_handle->bdev;
 	blocksize = sb->s_blocksize;
 	hblock = bdev_logical_block_size(bdev);
 	if (blocksize < hblock) {
@@ -5912,12 +5914,12 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb,
 	*j_start = sb_block + 1;
 	*j_len = ext4_blocks_count(es);
 	brelse(bh);
-	return bdev;
+	return bdev_handle;
 
 out_bh:
 	brelse(bh);
 out_bdev:
-	blkdev_put(bdev, sb);
+	bdev_release(bdev_handle);
 	return ERR_PTR(errno);
 }
 
@@ -5927,14 +5929,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb,
 	journal_t *journal;
 	ext4_fsblk_t j_start;
 	ext4_fsblk_t j_len;
-	struct block_device *journal_bdev;
+	struct bdev_handle *bdev_handle;
 	int errno = 0;
 
-	journal_bdev = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len);
-	if (IS_ERR(journal_bdev))
-		return ERR_CAST(journal_bdev);
+	bdev_handle = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len);
+	if (IS_ERR(bdev_handle))
+		return ERR_CAST(bdev_handle);
 
-	journal = jbd2_journal_init_dev(journal_bdev, sb->s_bdev, j_start,
+	journal = jbd2_journal_init_dev(bdev_handle->bdev, sb->s_bdev, j_start,
 					j_len, sb->s_blocksize);
 	if (IS_ERR(journal)) {
 		ext4_msg(sb, KERN_ERR, "failed to create device journal");
@@ -5949,14 +5951,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb,
 		goto out_journal;
 	}
 	journal->j_private = sb;
-	EXT4_SB(sb)->s_journal_bdev = journal_bdev;
+	EXT4_SB(sb)->s_journal_bdev_handle = bdev_handle;
 	ext4_init_journal_params(sb, journal);
 	return journal;
 
 out_journal:
 	jbd2_journal_destroy(journal);
 out_bdev:
-	blkdev_put(journal_bdev, sb);
+	bdev_release(bdev_handle);
 	return ERR_PTR(errno);
 }
 
@@ -7127,7 +7129,7 @@ static int ext4_quota_off(struct super_block *sb, int type)
 	}
 	EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
 	inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	err = ext4_mark_inode_dirty(handle, inode);
 	ext4_journal_stop(handle);
 out_unlock:
@@ -7300,12 +7302,12 @@ static inline int ext3_feature_set_ok(struct super_block *sb)
 static void ext4_kill_sb(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct block_device *journal_bdev = sbi ? sbi->s_journal_bdev : NULL;
+	struct bdev_handle *handle = sbi ? sbi->s_journal_bdev_handle : NULL;
 
 	kill_block_super(sb);
 
-	if (journal_bdev)
-		blkdev_put(journal_bdev, sb);
+	if (handle)
+		bdev_release(handle);
 }
 
 static struct file_system_type ext4_fs_type = {
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 92ba28cebac6..82dc5e673d5c 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -98,7 +98,7 @@ static const struct xattr_handler * const ext4_xattr_handler_map[] = {
 	[EXT4_XATTR_INDEX_HURD]		     = &ext4_xattr_hurd_handler,
 };
 
-const struct xattr_handler *ext4_xattr_handlers[] = {
+const struct xattr_handler * const ext4_xattr_handlers[] = {
 	&ext4_xattr_user_handler,
 	&ext4_xattr_trusted_handler,
 #ifdef CONFIG_EXT4_FS_SECURITY
@@ -356,7 +356,7 @@ ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
 
 static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
 {
-	return ((u64) inode_get_ctime(ea_inode).tv_sec << 32) |
+	return ((u64) inode_get_ctime_sec(ea_inode) << 32) |
 		(u32) inode_peek_iversion_raw(ea_inode);
 }
 
@@ -368,12 +368,12 @@ static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count)
 
 static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode)
 {
-	return (u32)ea_inode->i_atime.tv_sec;
+	return (u32) inode_get_atime_sec(ea_inode);
 }
 
 static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash)
 {
-	ea_inode->i_atime.tv_sec = hash;
+	inode_set_atime(ea_inode, hash, 0);
 }
 
 /*
@@ -418,7 +418,7 @@ free_bhs:
 	return ret;
 }
 
-#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec)
+#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode_get_mtime_sec(inode)))
 
 static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
 				 u32 ea_inode_hash, struct inode **ea_inode)
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 824faf0b15a8..bd97c4aa8177 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -193,7 +193,7 @@ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 			    struct ext4_inode *raw_inode, handle_t *handle);
 extern void ext4_evict_ea_inode(struct inode *inode);
 
-extern const struct xattr_handler *ext4_xattr_handlers[];
+extern const struct xattr_handler * const ext4_xattr_handlers[];
 
 extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
 				 struct ext4_xattr_ibody_find *is);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 8aa29fe2e87b..042593aed1ec 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -455,7 +455,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
 	de->file_type = fs_umode_to_ftype(inode->i_mode);
 	set_page_dirty(page);
 
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	f2fs_mark_inode_dirty_sync(dir, false);
 	f2fs_put_page(page, 1);
 }
@@ -609,7 +609,7 @@ void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode,
 			f2fs_i_links_write(dir, true);
 		clear_inode_flag(inode, FI_NEW_INODE);
 	}
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	f2fs_mark_inode_dirty_sync(dir, false);
 
 	if (F2FS_I(dir)->i_current_depth != current_depth)
@@ -919,7 +919,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 	}
 	f2fs_put_page(page, 1);
 
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	f2fs_mark_inode_dirty_sync(dir, false);
 
 	if (inode)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 6d688e42d89c..9043cedfa12b 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1234,6 +1234,7 @@ struct f2fs_bio_info {
 #define FDEV(i)				(sbi->devs[i])
 #define RDEV(i)				(raw_super->devs[i])
 struct f2fs_dev_info {
+	struct bdev_handle *bdev_handle;
 	struct block_device *bdev;
 	char path[MAX_PATH_LEN];
 	unsigned int total_segments;
@@ -3317,13 +3318,15 @@ static inline void clear_file(struct inode *inode, int type)
 
 static inline bool f2fs_is_time_consistent(struct inode *inode)
 {
-	struct timespec64 ctime = inode_get_ctime(inode);
+	struct timespec64 ts = inode_get_atime(inode);
 
-	if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime))
+	if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &ts))
 		return false;
-	if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &ctime))
+	ts = inode_get_ctime(inode);
+	if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &ts))
 		return false;
-	if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime))
+	ts = inode_get_mtime(inode);
+	if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &ts))
 		return false;
 	return true;
 }
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index ca5904129b16..dd99abbb7186 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -798,7 +798,7 @@ int f2fs_truncate(struct inode *inode)
 	if (err)
 		return err;
 
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	f2fs_mark_inode_dirty_sync(inode, false);
 	return 0;
 }
@@ -905,9 +905,9 @@ static void __setattr_copy(struct mnt_idmap *idmap,
 	i_uid_update(idmap, attr, inode);
 	i_gid_update(idmap, attr, inode);
 	if (ia_valid & ATTR_ATIME)
-		inode->i_atime = attr->ia_atime;
+		inode_set_atime_to_ts(inode, attr->ia_atime);
 	if (ia_valid & ATTR_MTIME)
-		inode->i_mtime = attr->ia_mtime;
+		inode_set_mtime_to_ts(inode, attr->ia_mtime);
 	if (ia_valid & ATTR_CTIME)
 		inode_set_ctime_to_ts(inode, attr->ia_ctime);
 	if (ia_valid & ATTR_MODE) {
@@ -1012,7 +1012,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 			return err;
 
 		spin_lock(&F2FS_I(inode)->i_size_lock);
-		inode->i_mtime = inode_set_ctime_current(inode);
+		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 		F2FS_I(inode)->last_disk_size = i_size_read(inode);
 		spin_unlock(&F2FS_I(inode)->i_size_lock);
 	}
@@ -1840,7 +1840,7 @@ static long f2fs_fallocate(struct file *file, int mode,
 	}
 
 	if (!ret) {
-		inode->i_mtime = inode_set_ctime_current(inode);
+		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 		f2fs_mark_inode_dirty_sync(inode, false);
 		f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
 	}
@@ -2888,10 +2888,10 @@ out_src:
 	if (ret)
 		goto out_unlock;
 
-	src->i_mtime = inode_set_ctime_current(src);
+	inode_set_mtime_to_ts(src, inode_set_ctime_current(src));
 	f2fs_mark_inode_dirty_sync(src, false);
 	if (src != dst) {
-		dst->i_mtime = inode_set_ctime_current(dst);
+		inode_set_mtime_to_ts(dst, inode_set_ctime_current(dst));
 		f2fs_mark_inode_dirty_sync(dst, false);
 	}
 	f2fs_update_time(sbi, REQ_TIME);
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 2fe25619ccb5..ac00423f117b 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -699,7 +699,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
 	set_page_dirty(page);
 	f2fs_put_page(page, 1);
 
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	f2fs_mark_inode_dirty_sync(dir, false);
 
 	if (inode)
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index cde243840abd..5779c7edd49b 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -386,9 +386,9 @@ static void init_idisk_time(struct inode *inode)
 {
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 
-	fi->i_disk_time[0] = inode->i_atime;
+	fi->i_disk_time[0] = inode_get_atime(inode);
 	fi->i_disk_time[1] = inode_get_ctime(inode);
-	fi->i_disk_time[2] = inode->i_mtime;
+	fi->i_disk_time[2] = inode_get_mtime(inode);
 }
 
 static int do_read_inode(struct inode *inode)
@@ -417,12 +417,12 @@ static int do_read_inode(struct inode *inode)
 	inode->i_size = le64_to_cpu(ri->i_size);
 	inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks) - 1);
 
-	inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime);
+	inode_set_atime(inode, le64_to_cpu(ri->i_atime),
+			le32_to_cpu(ri->i_atime_nsec));
 	inode_set_ctime(inode, le64_to_cpu(ri->i_ctime),
 			le32_to_cpu(ri->i_ctime_nsec));
-	inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime);
-	inode->i_atime.tv_nsec = le32_to_cpu(ri->i_atime_nsec);
-	inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
+	inode_set_mtime(inode, le64_to_cpu(ri->i_mtime),
+			le32_to_cpu(ri->i_mtime_nsec));
 	inode->i_generation = le32_to_cpu(ri->i_generation);
 	if (S_ISDIR(inode->i_mode))
 		fi->i_current_depth = le32_to_cpu(ri->i_current_depth);
@@ -698,12 +698,12 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
 	}
 	set_raw_inline(inode, ri);
 
-	ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
-	ri->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
-	ri->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
-	ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
-	ri->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
-	ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+	ri->i_atime = cpu_to_le64(inode_get_atime_sec(inode));
+	ri->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+	ri->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode));
+	ri->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode));
+	ri->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
+	ri->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
 	if (S_ISDIR(inode->i_mode))
 		ri->i_current_depth =
 			cpu_to_le32(F2FS_I(inode)->i_current_depth);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 193b22a2d6bf..d0053b0284d8 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -243,8 +243,8 @@ static struct inode *f2fs_new_inode(struct mnt_idmap *idmap,
 
 	inode->i_ino = ino;
 	inode->i_blocks = 0;
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
-	F2FS_I(inode)->i_crtime = inode->i_mtime;
+	simple_inode_init_ts(inode);
+	F2FS_I(inode)->i_crtime = inode_get_mtime(inode);
 	inode->i_generation = get_random_u32();
 
 	if (S_ISDIR(inode->i_mode))
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 7be60df277a5..b56d0f1078a7 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -320,12 +320,12 @@ static int recover_inode(struct inode *inode, struct page *page)
 	}
 
 	f2fs_i_size_write(inode, le64_to_cpu(raw->i_size));
-	inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime);
+	inode_set_atime(inode, le64_to_cpu(raw->i_atime),
+			le32_to_cpu(raw->i_atime_nsec));
 	inode_set_ctime(inode, le64_to_cpu(raw->i_ctime),
 			le32_to_cpu(raw->i_ctime_nsec));
-	inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime);
-	inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec);
-	inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
+	inode_set_mtime(inode, le64_to_cpu(raw->i_mtime),
+			le32_to_cpu(raw->i_mtime_nsec));
 
 	F2FS_I(inode)->i_advise = raw->i_advise;
 	F2FS_I(inode)->i_flags = le32_to_cpu(raw->i_flags);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index a8c8232852bb..be17d77513d5 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1562,7 +1562,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi)
 
 	for (i = 0; i < sbi->s_ndevs; i++) {
 		if (i > 0)
-			blkdev_put(FDEV(i).bdev, sbi->sb);
+			bdev_release(FDEV(i).bdev_handle);
 #ifdef CONFIG_BLK_DEV_ZONED
 		kvfree(FDEV(i).blkz_seq);
 #endif
@@ -2710,7 +2710,7 @@ retry:
 
 	if (len == towrite)
 		return err;
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	f2fs_mark_inode_dirty_sync(inode, false);
 	return len - towrite;
 }
@@ -3203,13 +3203,6 @@ static bool f2fs_has_stable_inodes(struct super_block *sb)
 	return true;
 }
 
-static void f2fs_get_ino_and_lblk_bits(struct super_block *sb,
-				       int *ino_bits_ret, int *lblk_bits_ret)
-{
-	*ino_bits_ret = 8 * sizeof(nid_t);
-	*lblk_bits_ret = 8 * sizeof(block_t);
-}
-
 static struct block_device **f2fs_get_devices(struct super_block *sb,
 					      unsigned int *num_devs)
 {
@@ -3231,13 +3224,15 @@ static struct block_device **f2fs_get_devices(struct super_block *sb,
 }
 
 static const struct fscrypt_operations f2fs_cryptops = {
-	.key_prefix		= "f2fs:",
+	.needs_bounce_pages	= 1,
+	.has_32bit_inodes	= 1,
+	.supports_subblock_data_units = 1,
+	.legacy_key_prefix	= "f2fs:",
 	.get_context		= f2fs_get_context,
 	.set_context		= f2fs_set_context,
 	.get_dummy_policy	= f2fs_get_dummy_policy,
 	.empty_dir		= f2fs_empty_dir,
 	.has_stable_inodes	= f2fs_has_stable_inodes,
-	.get_ino_and_lblk_bits	= f2fs_get_ino_and_lblk_bits,
 	.get_devices		= f2fs_get_devices,
 };
 #endif
@@ -4198,7 +4193,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
 
 	for (i = 0; i < max_devices; i++) {
 		if (i == 0)
-			FDEV(0).bdev = sbi->sb->s_bdev;
+			FDEV(0).bdev_handle = sbi->sb->s_bdev_handle;
 		else if (!RDEV(i).path[0])
 			break;
 
@@ -4218,13 +4213,14 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
 				FDEV(i).end_blk = FDEV(i).start_blk +
 					(FDEV(i).total_segments <<
 					sbi->log_blocks_per_seg) - 1;
-				FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path,
-					mode, sbi->sb, NULL);
+				FDEV(i).bdev_handle = bdev_open_by_path(
+					FDEV(i).path, mode, sbi->sb, NULL);
 			}
 		}
-		if (IS_ERR(FDEV(i).bdev))
-			return PTR_ERR(FDEV(i).bdev);
+		if (IS_ERR(FDEV(i).bdev_handle))
+			return PTR_ERR(FDEV(i).bdev_handle);
 
+		FDEV(i).bdev = FDEV(i).bdev_handle->bdev;
 		/* to release errored devices */
 		sbi->s_ndevs = i + 1;
 
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index a657284faee3..4314456854f6 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -189,7 +189,7 @@ const struct xattr_handler f2fs_xattr_security_handler = {
 	.set	= f2fs_xattr_generic_set,
 };
 
-static const struct xattr_handler *f2fs_xattr_handler_map[] = {
+static const struct xattr_handler * const f2fs_xattr_handler_map[] = {
 	[F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler,
 #ifdef CONFIG_F2FS_FS_POSIX_ACL
 	[F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access,
@@ -202,7 +202,7 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = {
 	[F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler,
 };
 
-const struct xattr_handler *f2fs_xattr_handlers[] = {
+const struct xattr_handler * const f2fs_xattr_handlers[] = {
 	&f2fs_xattr_user_handler,
 	&f2fs_xattr_trusted_handler,
 #ifdef CONFIG_F2FS_FS_SECURITY
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index b1811c392e6f..a005ffdcf717 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -125,7 +125,7 @@ extern const struct xattr_handler f2fs_xattr_trusted_handler;
 extern const struct xattr_handler f2fs_xattr_advise_handler;
 extern const struct xattr_handler f2fs_xattr_security_handler;
 
-extern const struct xattr_handler *f2fs_xattr_handlers[];
+extern const struct xattr_handler * const f2fs_xattr_handlers[];
 
 extern int f2fs_setxattr(struct inode *, int, const char *,
 				const void *, size_t, struct page *, int);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index cdd39b6020f3..1fac3dabf130 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -512,6 +512,7 @@ static int fat_validate_dir(struct inode *dir)
 int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+	struct timespec64 mtime;
 	int error;
 
 	MSDOS_I(inode)->i_pos = 0;
@@ -561,14 +562,18 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 	inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
 			   & ~((loff_t)sbi->cluster_size - 1)) >> 9;
 
-	fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0);
-	inode_set_ctime_to_ts(inode, inode->i_mtime);
+	fat_time_fat2unix(sbi, &mtime, de->time, de->date, 0);
+	inode_set_mtime_to_ts(inode, mtime);
+	inode_set_ctime_to_ts(inode, mtime);
 	if (sbi->options.isvfat) {
-		fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0);
+		struct timespec64 atime;
+
+		fat_time_fat2unix(sbi, &atime, 0, de->adate, 0);
+		inode_set_atime_to_ts(inode, atime);
 		fat_time_fat2unix(sbi, &MSDOS_I(inode)->i_crtime, de->ctime,
 				  de->cdate, de->ctime_cs);
 	} else
-		inode->i_atime = fat_truncate_atime(sbi, &inode->i_mtime);
+		inode_set_atime_to_ts(inode, fat_truncate_atime(sbi, &mtime));
 
 	return 0;
 }
@@ -849,6 +854,7 @@ static int __fat_write_inode(struct inode *inode, int wait)
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 	struct buffer_head *bh;
 	struct msdos_dir_entry *raw_entry;
+	struct timespec64 mtime;
 	loff_t i_pos;
 	sector_t blocknr;
 	int err, offset;
@@ -882,12 +888,14 @@ retry:
 		raw_entry->size = cpu_to_le32(inode->i_size);
 	raw_entry->attr = fat_make_attrs(inode);
 	fat_set_start(raw_entry, MSDOS_I(inode)->i_logstart);
-	fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time,
+	mtime = inode_get_mtime(inode);
+	fat_time_unix2fat(sbi, &mtime, &raw_entry->time,
 			  &raw_entry->date, NULL);
 	if (sbi->options.isvfat) {
+		struct timespec64 ts = inode_get_atime(inode);
 		__le16 atime;
-		fat_time_unix2fat(sbi, &inode->i_atime, &atime,
-				  &raw_entry->adate, NULL);
+
+		fat_time_unix2fat(sbi, &ts, &atime, &raw_entry->adate, NULL);
 		fat_time_unix2fat(sbi, &MSDOS_I(inode)->i_crtime, &raw_entry->ctime,
 				  &raw_entry->cdate, &raw_entry->ctime_cs);
 	}
@@ -1407,7 +1415,8 @@ static int fat_read_root(struct inode *inode)
 	MSDOS_I(inode)->mmu_private = inode->i_size;
 
 	fat_save_attrs(inode, ATTR_DIR);
-	inode->i_mtime = inode->i_atime = inode_set_ctime(inode, 0, 0);
+	inode_set_mtime_to_ts(inode,
+			      inode_set_atime_to_ts(inode, inode_set_ctime(inode, 0, 0)));
 	set_nlink(inode, fat_subdirs(inode)+2);
 
 	return 0;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index f2304a1054aa..c7a2d27120ba 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -325,15 +325,15 @@ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags)
 	}
 
 	if (flags & S_ATIME)
-		inode->i_atime = fat_truncate_atime(sbi, now);
+		inode_set_atime_to_ts(inode, fat_truncate_atime(sbi, now));
 	/*
 	 * ctime and mtime share the same on-disk field, and should be
 	 * identical in memory. all mtime updates will be applied to ctime,
 	 * but ctime updates are ignored.
 	 */
 	if (flags & S_MTIME)
-		inode->i_mtime = inode_set_ctime_to_ts(inode,
-						       fat_truncate_mtime(sbi, now));
+		inode_set_mtime_to_ts(inode,
+				      inode_set_ctime_to_ts(inode, fat_truncate_mtime(sbi, now)));
 
 	return 0;
 }
diff --git a/fs/file.c b/fs/file.c
index 3e4a4dfa38fc..5fb0b146e79e 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -604,6 +604,9 @@ void fd_install(unsigned int fd, struct file *file)
 	struct files_struct *files = current->files;
 	struct fdtable *fdt;
 
+	if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING)))
+		return;
+
 	rcu_read_lock_sched();
 
 	if (unlikely(files->resize_in_progress)) {
@@ -853,8 +856,104 @@ void do_close_on_exec(struct files_struct *files)
 	spin_unlock(&files->file_lock);
 }
 
+static struct file *__get_file_rcu(struct file __rcu **f)
+{
+	struct file __rcu *file;
+	struct file __rcu *file_reloaded;
+	struct file __rcu *file_reloaded_cmp;
+
+	file = rcu_dereference_raw(*f);
+	if (!file)
+		return NULL;
+
+	if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+		return ERR_PTR(-EAGAIN);
+
+	file_reloaded = rcu_dereference_raw(*f);
+
+	/*
+	 * Ensure that all accesses have a dependency on the load from
+	 * rcu_dereference_raw() above so we get correct ordering
+	 * between reuse/allocation and the pointer check below.
+	 */
+	file_reloaded_cmp = file_reloaded;
+	OPTIMIZER_HIDE_VAR(file_reloaded_cmp);
+
+	/*
+	 * atomic_long_inc_not_zero() above provided a full memory
+	 * barrier when we acquired a reference.
+	 *
+	 * This is paired with the write barrier from assigning to the
+	 * __rcu protected file pointer so that if that pointer still
+	 * matches the current file, we know we have successfully
+	 * acquired a reference to the right file.
+	 *
+	 * If the pointers don't match the file has been reallocated by
+	 * SLAB_TYPESAFE_BY_RCU.
+	 */
+	if (file == file_reloaded_cmp)
+		return file_reloaded;
+
+	fput(file);
+	return ERR_PTR(-EAGAIN);
+}
+
+/**
+ * get_file_rcu - try to get a reference to a file under rcu
+ * @f: the file to get a reference on
+ *
+ * This function tries to get a reference on @f carefully verifying that
+ * @f hasn't been reused.
+ *
+ * This function should rarely have to be used and only by users who
+ * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
+ *
+ * Return: The file @f points to with its refcount increased, or NULL.
+ */
+struct file *get_file_rcu(struct file __rcu **f)
+{
+	for (;;) {
+		struct file __rcu *file;
+
+		file = __get_file_rcu(f);
+		if (unlikely(!file))
+			return NULL;
+
+		if (unlikely(IS_ERR(file)))
+			continue;
+
+		return file;
+	}
+}
+EXPORT_SYMBOL_GPL(get_file_rcu);
+
+/**
+ * get_file_active - try to get a reference to a file
+ * @f: the file to get a reference on
+ *
+ * In contrast to get_file_rcu() the pointer itself isn't part of the
+ * reference counting.
+ *
+ * This function should rarely have to be used and only by users who
+ * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
+ *
+ * Return: The file @f points to with its refcount increased, or NULL.
+ */
+struct file *get_file_active(struct file **f)
+{
+	struct file __rcu *file;
+
+	rcu_read_lock();
+	file = __get_file_rcu(f);
+	rcu_read_unlock();
+	if (IS_ERR(file))
+		file = NULL;
+	return file;
+}
+EXPORT_SYMBOL_GPL(get_file_active);
+
 static inline struct file *__fget_files_rcu(struct files_struct *files,
-	unsigned int fd, fmode_t mask)
+       unsigned int fd, fmode_t mask)
 {
 	for (;;) {
 		struct file *file;
@@ -865,12 +964,6 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
 			return NULL;
 
 		fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
-		file = rcu_dereference_raw(*fdentry);
-		if (unlikely(!file))
-			return NULL;
-
-		if (unlikely(file->f_mode & mask))
-			return NULL;
 
 		/*
 		 * Ok, we have a file pointer. However, because we do
@@ -879,10 +972,15 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
 		 *
 		 * Such a race can take two forms:
 		 *
-		 *  (a) the file ref already went down to zero,
-		 *      and get_file_rcu() fails. Just try again:
+		 *  (a) the file ref already went down to zero and the
+		 *      file hasn't been reused yet or the file count
+		 *      isn't zero but the file has already been reused.
 		 */
-		if (unlikely(!get_file_rcu(file)))
+		file = __get_file_rcu(fdentry);
+		if (unlikely(!file))
+			return NULL;
+
+		if (unlikely(IS_ERR(file)))
 			continue;
 
 		/*
@@ -893,13 +991,21 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
 		 *
 		 * If so, we need to put our ref and try again.
 		 */
-		if (unlikely(rcu_dereference_raw(files->fdt) != fdt) ||
-		    unlikely(rcu_dereference_raw(*fdentry) != file)) {
+		if (unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
 			fput(file);
 			continue;
 		}
 
 		/*
+		 * This isn't the file we're looking for or we're not
+		 * allowed to get a reference to it.
+		 */
+		if (unlikely(file->f_mode & mask)) {
+			fput(file);
+			return NULL;
+		}
+
+		/*
 		 * Ok, we have a ref to the file, and checked that it
 		 * still exists.
 		 */
@@ -948,7 +1054,14 @@ struct file *fget_task(struct task_struct *task, unsigned int fd)
 	return file;
 }
 
-struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
+struct file *lookup_fdget_rcu(unsigned int fd)
+{
+	return __fget_files_rcu(current->files, fd, 0);
+
+}
+EXPORT_SYMBOL_GPL(lookup_fdget_rcu);
+
+struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd)
 {
 	/* Must be called with rcu_read_lock held */
 	struct files_struct *files;
@@ -957,13 +1070,13 @@ struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
 	task_lock(task);
 	files = task->files;
 	if (files)
-		file = files_lookup_fd_rcu(files, fd);
+		file = __fget_files_rcu(files, fd, 0);
 	task_unlock(task);
 
 	return file;
 }
 
-struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd)
+struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd)
 {
 	/* Must be called with rcu_read_lock held */
 	struct files_struct *files;
@@ -974,7 +1087,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
 	files = task->files;
 	if (files) {
 		for (; fd < files_fdtable(files)->max_fds; fd++) {
-			file = files_lookup_fd_rcu(files, fd);
+			file = __fget_files_rcu(files, fd, 0);
 			if (file)
 				break;
 		}
@@ -983,7 +1096,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
 	*ret_fd = fd;
 	return file;
 }
-EXPORT_SYMBOL(task_lookup_next_fd_rcu);
+EXPORT_SYMBOL(task_lookup_next_fdget_rcu);
 
 /*
  * Lightweight file lookup - no refcnt increment if fd table isn't shared.
@@ -1272,12 +1385,16 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
 {
 	if (unlikely(newfd == oldfd)) { /* corner case */
 		struct files_struct *files = current->files;
+		struct file *f;
 		int retval = oldfd;
 
 		rcu_read_lock();
-		if (!files_lookup_fd_rcu(files, oldfd))
+		f = __fget_files_rcu(files, oldfd, 0);
+		if (!f)
 			retval = -EBADF;
 		rcu_read_unlock();
+		if (f)
+			fput(f);
 		return retval;
 	}
 	return ksys_dup3(oldfd, newfd, 0);
diff --git a/fs/file_table.c b/fs/file_table.c
index ee21b3da9d08..fa92743ba6a9 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -44,10 +44,10 @@ static struct kmem_cache *filp_cachep __read_mostly;
 
 static struct percpu_counter nr_files __cacheline_aligned_in_smp;
 
-/* Container for backing file with optional real path */
+/* Container for backing file with optional user path */
 struct backing_file {
 	struct file file;
-	struct path real_path;
+	struct path user_path;
 };
 
 static inline struct backing_file *backing_file(struct file *f)
@@ -55,31 +55,36 @@ static inline struct backing_file *backing_file(struct file *f)
 	return container_of(f, struct backing_file, file);
 }
 
-struct path *backing_file_real_path(struct file *f)
+struct path *backing_file_user_path(struct file *f)
 {
-	return &backing_file(f)->real_path;
+	return &backing_file(f)->user_path;
 }
-EXPORT_SYMBOL_GPL(backing_file_real_path);
+EXPORT_SYMBOL_GPL(backing_file_user_path);
 
-static void file_free_rcu(struct rcu_head *head)
+static inline void file_free(struct file *f)
 {
-	struct file *f = container_of(head, struct file, f_rcuhead);
-
+	security_file_free(f);
+	if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
+		percpu_counter_dec(&nr_files);
 	put_cred(f->f_cred);
-	if (unlikely(f->f_mode & FMODE_BACKING))
+	if (unlikely(f->f_mode & FMODE_BACKING)) {
+		path_put(backing_file_user_path(f));
 		kfree(backing_file(f));
-	else
+	} else {
 		kmem_cache_free(filp_cachep, f);
+	}
 }
 
-static inline void file_free(struct file *f)
+void release_empty_file(struct file *f)
 {
-	security_file_free(f);
-	if (unlikely(f->f_mode & FMODE_BACKING))
-		path_put(backing_file_real_path(f));
-	if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
-		percpu_counter_dec(&nr_files);
-	call_rcu(&f->f_rcuhead, file_free_rcu);
+	WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED));
+	if (atomic_long_dec_and_test(&f->f_count)) {
+		security_file_free(f);
+		put_cred(f->f_cred);
+		if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
+			percpu_counter_dec(&nr_files);
+		kmem_cache_free(filp_cachep, f);
+	}
 }
 
 /*
@@ -164,7 +169,6 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
 		return error;
 	}
 
-	atomic_long_set(&f->f_count, 1);
 	rwlock_init(&f->f_owner.lock);
 	spin_lock_init(&f->f_lock);
 	mutex_init(&f->f_pos_lock);
@@ -172,6 +176,12 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
 	f->f_mode = OPEN_FMODE(flags);
 	/* f->f_version: 0 */
 
+	/*
+	 * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
+	 * fget-rcu pattern users need to be able to handle spurious
+	 * refcount bumps we should reinitialize the reused file first.
+	 */
+	atomic_long_set(&f->f_count, 1);
 	return 0;
 }
 
@@ -471,7 +481,8 @@ EXPORT_SYMBOL(__fput_sync);
 void __init files_init(void)
 {
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
-			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL);
+				SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN |
+				SLAB_PANIC | SLAB_ACCOUNT, NULL);
 	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
 }
 
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index ac5d43b164b5..20600e9ea202 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -109,11 +109,9 @@ static inline void dip2vip_cpy(struct vxfs_sb_info *sbi,
 	set_nlink(inode, vip->vii_nlink);
 	inode->i_size = vip->vii_size;
 
-	inode->i_atime.tv_sec = vip->vii_atime;
+	inode_set_atime(inode, vip->vii_atime, 0);
 	inode_set_ctime(inode, vip->vii_ctime, 0);
-	inode->i_mtime.tv_sec = vip->vii_mtime;
-	inode->i_atime.tv_nsec = 0;
-	inode->i_mtime.tv_nsec = 0;
+	inode_set_mtime(inode, vip->vii_mtime, 0);
 
 	inode->i_blocks = vip->vii_blocks;
 	inode->i_generation = vip->vii_gen;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c1af01b2c42d..1767493dffda 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -613,6 +613,24 @@ out_free:
 	kfree(isw);
 }
 
+static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
+				   struct list_head *list, int *nr)
+{
+	struct inode *inode;
+
+	list_for_each_entry(inode, list, i_io_list) {
+		if (!inode_prepare_wbs_switch(inode, isw->new_wb))
+			continue;
+
+		isw->inodes[*nr] = inode;
+		(*nr)++;
+
+		if (*nr >= WB_MAX_INODES_PER_ISW - 1)
+			return true;
+	}
+	return false;
+}
+
 /**
  * cleanup_offline_cgwb - detach associated inodes
  * @wb: target wb
@@ -625,7 +643,6 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
 {
 	struct cgroup_subsys_state *memcg_css;
 	struct inode_switch_wbs_context *isw;
-	struct inode *inode;
 	int nr;
 	bool restart = false;
 
@@ -647,17 +664,17 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
 
 	nr = 0;
 	spin_lock(&wb->list_lock);
-	list_for_each_entry(inode, &wb->b_attached, i_io_list) {
-		if (!inode_prepare_wbs_switch(inode, isw->new_wb))
-			continue;
-
-		isw->inodes[nr++] = inode;
-
-		if (nr >= WB_MAX_INODES_PER_ISW - 1) {
-			restart = true;
-			break;
-		}
-	}
+	/*
+	 * In addition to the inodes that have completed writeback, also switch
+	 * cgwbs for those inodes that have only dirty timestamps. Otherwise,
+	 * those inodes won't be written back for a long time when lazytime is
+	 * enabled, thus pinning the dying cgwbs. This won't break the
+	 * bandwidth restrictions, as writeback of inode metadata is not
+	 * accounted for.
+	 */
+	restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr);
+	if (!restart)
+		restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
 	spin_unlock(&wb->list_lock);
 
 	/* no attached inodes? bail out */
diff --git a/fs/fsopen.c b/fs/fsopen.c
index ce03f6521c88..6593ae518115 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -465,6 +465,7 @@ SYSCALL_DEFINE5(fsconfig,
 		param.file = fget(aux);
 		if (!param.file)
 			goto out_key;
+		param.dirfd = aux;
 		break;
 	default:
 		break;
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index ab62e4624256..284a35006462 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -235,7 +235,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
 	inode->i_mode = mode;
 	inode->i_uid = fc->user_id;
 	inode->i_gid = fc->group_id;
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	/* setting ->i_op to NULL is not allowed */
 	if (iop)
 		inode->i_op = iop;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index d707e6987da9..d19cbf34c634 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1812,12 +1812,12 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
 	memset(&outarg, 0, sizeof(outarg));
 
 	inarg.valid = FATTR_MTIME;
-	inarg.mtime = inode->i_mtime.tv_sec;
-	inarg.mtimensec = inode->i_mtime.tv_nsec;
+	inarg.mtime = inode_get_mtime_sec(inode);
+	inarg.mtimensec = inode_get_mtime_nsec(inode);
 	if (fm->fc->minor >= 23) {
 		inarg.valid |= FATTR_CTIME;
-		inarg.ctime = inode_get_ctime(inode).tv_sec;
-		inarg.ctimensec = inode_get_ctime(inode).tv_nsec;
+		inarg.ctime = inode_get_ctime_sec(inode);
+		inarg.ctimensec = inode_get_ctime_nsec(inode);
 	}
 	if (ff) {
 		inarg.valid |= FATTR_FH;
@@ -1956,7 +1956,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
 	/* the kernel maintains i_mtime locally */
 	if (trust_local_cmtime) {
 		if (attr->ia_valid & ATTR_MTIME)
-			inode->i_mtime = attr->ia_mtime;
+			inode_set_mtime_to_ts(inode, attr->ia_mtime);
 		if (attr->ia_valid & ATTR_CTIME)
 			inode_set_ctime_to_ts(inode, attr->ia_ctime);
 		/* FIXME: clear I_DIRTY_SYNC? */
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index bf0b85d0b95c..6e6e721f421b 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1284,7 +1284,7 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
 		      size_t size);
 ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size);
 int fuse_removexattr(struct inode *inode, const char *name);
-extern const struct xattr_handler *fuse_xattr_handlers[];
+extern const struct xattr_handler * const fuse_xattr_handlers[];
 
 struct posix_acl;
 struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2e4eb7cf26fb..caa8121ad99c 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -188,12 +188,10 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
 	attr->mtimensec = min_t(u32, attr->mtimensec, NSEC_PER_SEC - 1);
 	attr->ctimensec = min_t(u32, attr->ctimensec, NSEC_PER_SEC - 1);
 
-	inode->i_atime.tv_sec   = attr->atime;
-	inode->i_atime.tv_nsec  = attr->atimensec;
+	inode_set_atime(inode, attr->atime, attr->atimensec);
 	/* mtime from server may be stale due to local buffered write */
 	if (!(cache_mask & STATX_MTIME)) {
-		inode->i_mtime.tv_sec   = attr->mtime;
-		inode->i_mtime.tv_nsec  = attr->mtimensec;
+		inode_set_mtime(inode, attr->mtime, attr->mtimensec);
 	}
 	if (!(cache_mask & STATX_CTIME)) {
 		inode_set_ctime(inode, attr->ctime, attr->ctimensec);
@@ -276,12 +274,12 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
 		attr->size = i_size_read(inode);
 
 	if (cache_mask & STATX_MTIME) {
-		attr->mtime = inode->i_mtime.tv_sec;
-		attr->mtimensec = inode->i_mtime.tv_nsec;
+		attr->mtime = inode_get_mtime_sec(inode);
+		attr->mtimensec = inode_get_mtime_nsec(inode);
 	}
 	if (cache_mask & STATX_CTIME) {
-		attr->ctime = inode_get_ctime(inode).tv_sec;
-		attr->ctimensec = inode_get_ctime(inode).tv_nsec;
+		attr->ctime = inode_get_ctime_sec(inode);
+		attr->ctimensec = inode_get_ctime_nsec(inode);
 	}
 
 	if ((attr_version != 0 && fi->attr_version > attr_version) ||
@@ -290,7 +288,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
 		return;
 	}
 
-	old_mtime = inode->i_mtime;
+	old_mtime = inode_get_mtime(inode);
 	fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask);
 
 	oldsize = inode->i_size;
@@ -337,8 +335,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr,
 {
 	inode->i_mode = attr->mode & S_IFMT;
 	inode->i_size = attr->size;
-	inode->i_mtime.tv_sec  = attr->mtime;
-	inode->i_mtime.tv_nsec = attr->mtimensec;
+	inode_set_mtime(inode, attr->mtime, attr->mtimensec);
 	inode_set_ctime(inode, attr->ctime, attr->ctimensec);
 	if (S_ISREG(inode->i_mode)) {
 		fuse_init_common(inode);
@@ -1423,17 +1420,19 @@ EXPORT_SYMBOL_GPL(fuse_dev_free);
 static void fuse_fill_attr_from_inode(struct fuse_attr *attr,
 				      const struct fuse_inode *fi)
 {
+	struct timespec64 atime = inode_get_atime(&fi->inode);
+	struct timespec64 mtime = inode_get_mtime(&fi->inode);
 	struct timespec64 ctime = inode_get_ctime(&fi->inode);
 
 	*attr = (struct fuse_attr){
 		.ino		= fi->inode.i_ino,
 		.size		= fi->inode.i_size,
 		.blocks		= fi->inode.i_blocks,
-		.atime		= fi->inode.i_atime.tv_sec,
-		.mtime		= fi->inode.i_mtime.tv_sec,
+		.atime		= atime.tv_sec,
+		.mtime		= mtime.tv_sec,
 		.ctime		= ctime.tv_sec,
-		.atimensec	= fi->inode.i_atime.tv_nsec,
-		.mtimensec	= fi->inode.i_mtime.tv_nsec,
+		.atimensec	= atime.tv_nsec,
+		.mtimensec	= mtime.tv_nsec,
 		.ctimensec	= ctime.tv_nsec,
 		.mode		= fi->inode.i_mode,
 		.nlink		= fi->inode.i_nlink,
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index 9e6d587b3e67..c66a54d6c7d3 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -476,7 +476,7 @@ retry_locked:
 	if (!fi->rdc.cached) {
 		/* Starting cache? Set cache mtime. */
 		if (!ctx->pos && !fi->rdc.size) {
-			fi->rdc.mtime = inode->i_mtime;
+			fi->rdc.mtime = inode_get_mtime(inode);
 			fi->rdc.iversion = inode_query_iversion(inode);
 		}
 		spin_unlock(&fi->rdc.lock);
@@ -488,8 +488,10 @@ retry_locked:
 	 * changed, and reset the cache if so.
 	 */
 	if (!ctx->pos) {
+		struct timespec64 mtime = inode_get_mtime(inode);
+
 		if (inode_peek_iversion(inode) != fi->rdc.iversion ||
-		    !timespec64_equal(&fi->rdc.mtime, &inode->i_mtime)) {
+		    !timespec64_equal(&fi->rdc.mtime, &mtime)) {
 			fuse_rdc_reset(inode);
 			goto retry_locked;
 		}
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index 49c01559580f..5b423fdbb13f 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -209,7 +209,7 @@ static const struct xattr_handler fuse_xattr_handler = {
 	.set    = fuse_xattr_set,
 };
 
-const struct xattr_handler *fuse_xattr_handlers[] = {
+const struct xattr_handler * const fuse_xattr_handlers[] = {
 	&fuse_xattr_handler,
 	NULL
 };
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index ef7017fb6951..011cd992e0e6 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1386,7 +1386,7 @@ static int trunc_start(struct inode *inode, u64 newsize)
 		ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
 
 	i_size_write(inode, newsize);
-	ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+	inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
 	gfs2_dinode_out(ip, dibh->b_data);
 
 	if (journaled)
@@ -1583,7 +1583,7 @@ out_unlock:
 
 			/* Every transaction boundary, we rewrite the dinode
 			   to keep its di_blocks current in case of failure. */
-			ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+			inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
 			gfs2_trans_add_meta(ip->i_gl, dibh);
 			gfs2_dinode_out(ip, dibh->b_data);
 			brelse(dibh);
@@ -1949,7 +1949,7 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
 		gfs2_statfs_change(sdp, 0, +btotal, 0);
 		gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
 				  ip->i_inode.i_gid);
-		ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+		inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
 		gfs2_trans_add_meta(ip->i_gl, dibh);
 		gfs2_dinode_out(ip, dibh->b_data);
 		up_write(&ip->i_rw_mutex);
@@ -1992,7 +1992,7 @@ static int trunc_end(struct gfs2_inode *ip)
 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 		gfs2_ordered_del_inode(ip);
 	}
-	ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+	inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
 	ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
 
 	gfs2_trans_add_meta(ip->i_gl, dibh);
@@ -2093,7 +2093,7 @@ static int do_grow(struct inode *inode, u64 size)
 		goto do_end_trans;
 
 	truncate_setsize(inode, size);
-	ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+	inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
 	gfs2_trans_add_meta(ip->i_gl, dibh);
 	gfs2_dinode_out(ip, dibh->b_data);
 	brelse(dibh);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 1a2afa88f8be..61ddd03ea111 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -130,7 +130,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
 	memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
 	if (ip->i_inode.i_size < offset + size)
 		i_size_write(&ip->i_inode, offset + size);
-	ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+	inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
 	gfs2_dinode_out(ip, dibh->b_data);
 
 	brelse(dibh);
@@ -227,7 +227,7 @@ out:
 
 	if (ip->i_inode.i_size < offset + copied)
 		i_size_write(&ip->i_inode, offset + copied);
-	ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+	inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
 
 	gfs2_trans_add_meta(ip->i_gl, dibh);
 	gfs2_dinode_out(ip, dibh->b_data);
@@ -1825,7 +1825,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
 			da->bh = NULL;
 			brelse(bh);
 			ip->i_entries++;
-			ip->i_inode.i_mtime = tv;
+			inode_set_mtime_to_ts(&ip->i_inode, tv);
 			if (S_ISDIR(nip->i_inode.i_mode))
 				inc_nlink(&ip->i_inode);
 			mark_inode_dirty(inode);
@@ -1911,7 +1911,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
 	if (!dip->i_entries)
 		gfs2_consist_inode(dip);
 	dip->i_entries--;
-	dip->i_inode.i_mtime =  tv;
+	inode_set_mtime_to_ts(&dip->i_inode, tv);
 	if (d_is_dir(dentry))
 		drop_nlink(&dip->i_inode);
 	mark_inode_dirty(&dip->i_inode);
@@ -1952,7 +1952,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
 	dent->de_type = cpu_to_be16(new_type);
 	brelse(bh);
 
-	dip->i_inode.i_mtime = inode_set_ctime_current(&dip->i_inode);
+	inode_set_mtime_to_ts(&dip->i_inode, inode_set_ctime_current(&dip->i_inode));
 	mark_inode_dirty_sync(&dip->i_inode);
 	return 0;
 }
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 4a280be229a6..3772a5d9e85c 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -2719,16 +2719,19 @@ static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i)
 	for(;; i->fd++) {
 		struct inode *inode;
 
-		i->file = task_lookup_next_fd_rcu(i->task, &i->fd);
+		i->file = task_lookup_next_fdget_rcu(i->task, &i->fd);
 		if (!i->file) {
 			i->fd = 0;
 			break;
 		}
+
 		inode = file_inode(i->file);
-		if (inode->i_sb != i->sb)
-			continue;
-		if (get_file_rcu(i->file))
+		if (inode->i_sb == i->sb)
 			break;
+
+		rcu_read_unlock();
+		fput(i->file);
+		rcu_read_lock();
 	}
 	rcu_read_unlock();
 	return i->file;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index f41ca89d216b..e7d334c277a1 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -403,7 +403,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	const struct gfs2_dinode *str = buf;
-	struct timespec64 atime;
+	struct timespec64 atime, iatime;
 	u16 height, depth;
 	umode_t mode = be32_to_cpu(str->di_mode);
 	struct inode *inode = &ip->i_inode;
@@ -433,10 +433,11 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	gfs2_set_inode_blocks(inode, be64_to_cpu(str->di_blocks));
 	atime.tv_sec = be64_to_cpu(str->di_atime);
 	atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
-	if (timespec64_compare(&inode->i_atime, &atime) < 0)
-		inode->i_atime = atime;
-	inode->i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
-	inode->i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
+	iatime = inode_get_atime(inode);
+	if (timespec64_compare(&iatime, &atime) < 0)
+		inode_set_atime_to_ts(inode, atime);
+	inode_set_mtime(inode, be64_to_cpu(str->di_mtime),
+			be32_to_cpu(str->di_mtime_nsec));
 	inode_set_ctime(inode, be64_to_cpu(str->di_ctime),
 			be32_to_cpu(str->di_ctime_nsec));
 
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 0eac04507904..7fe77bc771e5 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -185,8 +185,9 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
 		set_bit(GLF_INSTANTIATE_NEEDED, &ip->i_gl->gl_flags);
 
 		/* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */
-		inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1);
-		inode->i_atime.tv_nsec = 0;
+		inode_set_atime(inode,
+				1LL << (8 * sizeof(inode_get_atime_sec(inode)) - 1),
+				0);
 
 		glock_set_object(ip->i_gl, ip);
 
@@ -696,7 +697,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	set_nlink(inode, S_ISDIR(mode) ? 2 : 1);
 	inode->i_rdev = dev;
 	inode->i_size = size;
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	munge_mode_uid_gid(dip, inode);
 	check_and_update_goal(dip);
 	ip->i_goal = dip->i_goal;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 171b2713d2e5..d9854aece15b 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -886,7 +886,7 @@ static int gfs2_adjust_quota(struct gfs2_sbd *sdp, loff_t loc,
 		size = loc + sizeof(struct gfs2_quota);
 		if (size > inode->i_size)
 			i_size_write(inode, size);
-		inode->i_mtime = inode_set_ctime_current(inode);
+		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 		mark_inode_dirty(inode);
 		set_bit(QDF_REFRESH, &qd->qd_flags);
 	}
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 02d93da21b2b..52a878fa7139 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -410,9 +410,9 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 	str->di_nlink = cpu_to_be32(inode->i_nlink);
 	str->di_size = cpu_to_be64(i_size_read(inode));
 	str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(inode));
-	str->di_atime = cpu_to_be64(inode->i_atime.tv_sec);
-	str->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec);
-	str->di_ctime = cpu_to_be64(inode_get_ctime(inode).tv_sec);
+	str->di_atime = cpu_to_be64(inode_get_atime_sec(inode));
+	str->di_mtime = cpu_to_be64(inode_get_mtime_sec(inode));
+	str->di_ctime = cpu_to_be64(inode_get_ctime_sec(inode));
 
 	str->di_goal_meta = cpu_to_be64(ip->i_goal);
 	str->di_goal_data = cpu_to_be64(ip->i_goal);
@@ -427,9 +427,9 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 	str->di_entries = cpu_to_be32(ip->i_entries);
 
 	str->di_eattr = cpu_to_be64(ip->i_eattr);
-	str->di_atime_nsec = cpu_to_be32(inode->i_atime.tv_nsec);
-	str->di_mtime_nsec = cpu_to_be32(inode->i_mtime.tv_nsec);
-	str->di_ctime_nsec = cpu_to_be32(inode_get_ctime(inode).tv_nsec);
+	str->di_atime_nsec = cpu_to_be32(inode_get_atime_nsec(inode));
+	str->di_mtime_nsec = cpu_to_be32(inode_get_mtime_nsec(inode));
+	str->di_ctime_nsec = cpu_to_be32(inode_get_ctime_nsec(inode));
 }
 
 /**
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index ab9c83106932..b4ddf6244586 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -60,8 +60,8 @@ extern const struct export_operations gfs2_export_ops;
 extern const struct super_operations gfs2_super_ops;
 extern const struct dentry_operations gfs2_dops;
 
-extern const struct xattr_handler *gfs2_xattr_handlers_max[];
-extern const struct xattr_handler **gfs2_xattr_handlers_min;
+extern const struct xattr_handler * const gfs2_xattr_handlers_max[];
+extern const struct xattr_handler * const *gfs2_xattr_handlers_min;
 
 #endif /* __SUPER_DOT_H__ */
 
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 4fea70c0fe3d..79d5c5559512 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1494,7 +1494,7 @@ static const struct xattr_handler gfs2_xattr_trusted_handler = {
 	.set    = gfs2_xattr_set,
 };
 
-const struct xattr_handler *gfs2_xattr_handlers_max[] = {
+const struct xattr_handler * const gfs2_xattr_handlers_max[] = {
 	/* GFS2_FS_FORMAT_MAX */
 	&gfs2_xattr_trusted_handler,
 
@@ -1504,4 +1504,4 @@ const struct xattr_handler *gfs2_xattr_handlers_max[] = {
 	NULL,
 };
 
-const struct xattr_handler **gfs2_xattr_handlers_min = gfs2_xattr_handlers_max + 1;
+const struct xattr_handler * const *gfs2_xattr_handlers_min = gfs2_xattr_handlers_max + 1;
diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c
index 6341bb248247..f8395cdd1adf 100644
--- a/fs/hfs/attr.c
+++ b/fs/hfs/attr.c
@@ -146,7 +146,7 @@ static const struct xattr_handler hfs_type_handler = {
 	.set = hfs_xattr_set,
 };
 
-const struct xattr_handler *hfs_xattr_handlers[] = {
+const struct xattr_handler * const hfs_xattr_handlers[] = {
 	&hfs_creator_handler,
 	&hfs_type_handler,
 	NULL
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index 632c226a3972..d63880e7d9d6 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -133,7 +133,7 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i
 		goto err1;
 
 	dir->i_size++;
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	mark_inode_dirty(dir);
 	hfs_find_exit(&fd);
 	return 0;
@@ -269,7 +269,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str)
 	}
 
 	dir->i_size--;
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	mark_inode_dirty(dir);
 	res = 0;
 out:
@@ -337,7 +337,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name,
 	if (err)
 		goto out;
 	dst_dir->i_size++;
-	dst_dir->i_mtime = inode_set_ctime_current(dst_dir);
+	inode_set_mtime_to_ts(dst_dir, inode_set_ctime_current(dst_dir));
 	mark_inode_dirty(dst_dir);
 
 	/* finally remove the old entry */
@@ -349,7 +349,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name,
 	if (err)
 		goto out;
 	src_dir->i_size--;
-	src_dir->i_mtime = inode_set_ctime_current(src_dir);
+	inode_set_mtime_to_ts(src_dir, inode_set_ctime_current(src_dir));
 	mark_inode_dirty(src_dir);
 
 	type = entry.type;
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 49d02524e667..b5a6ad5df357 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -215,7 +215,7 @@ extern void hfs_evict_inode(struct inode *);
 extern void hfs_delete_inode(struct inode *);
 
 /* attr.c */
-extern const struct xattr_handler *hfs_xattr_handlers[];
+extern const struct xattr_handler * const hfs_xattr_handlers[];
 
 /* mdb.c */
 extern int hfs_mdb_get(struct super_block *);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index ee349b72cfb3..a7bc4690a780 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -200,7 +200,7 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
 	set_nlink(inode, 1);
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	HFS_I(inode)->flags = 0;
 	HFS_I(inode)->rsrc_inode = NULL;
 	HFS_I(inode)->fs_blocks = 0;
@@ -355,8 +355,8 @@ static int hfs_read_inode(struct inode *inode, void *data)
 			inode->i_mode |= S_IWUGO;
 		inode->i_mode &= ~hsb->s_file_umask;
 		inode->i_mode |= S_IFREG;
-		inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode,
-									hfs_m_to_utime(rec->file.MdDat));
+		inode_set_mtime_to_ts(inode,
+				      inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, hfs_m_to_utime(rec->file.MdDat))));
 		inode->i_op = &hfs_file_inode_operations;
 		inode->i_fop = &hfs_file_operations;
 		inode->i_mapping->a_ops = &hfs_aops;
@@ -366,8 +366,8 @@ static int hfs_read_inode(struct inode *inode, void *data)
 		inode->i_size = be16_to_cpu(rec->dir.Val) + 2;
 		HFS_I(inode)->fs_blocks = 0;
 		inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask);
-		inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode,
-									hfs_m_to_utime(rec->dir.MdDat));
+		inode_set_mtime_to_ts(inode,
+				      inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, hfs_m_to_utime(rec->dir.MdDat))));
 		inode->i_op = &hfs_dir_inode_operations;
 		inode->i_fop = &hfs_dir_operations;
 		break;
@@ -474,7 +474,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 		    be32_to_cpu(rec.dir.DirID) != inode->i_ino) {
 		}
 
-		rec.dir.MdDat = hfs_u_to_mtime(inode->i_mtime);
+		rec.dir.MdDat = hfs_u_to_mtime(inode_get_mtime(inode));
 		rec.dir.Val = cpu_to_be16(inode->i_size - 2);
 
 		hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
@@ -502,7 +502,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 		else
 			rec.file.Flags |= HFS_FIL_LOCK;
 		hfs_inode_write_fork(inode, rec.file.ExtRec, &rec.file.LgLen, &rec.file.PyLen);
-		rec.file.MdDat = hfs_u_to_mtime(inode->i_mtime);
+		rec.file.MdDat = hfs_u_to_mtime(inode_get_mtime(inode));
 
 		hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
 			    sizeof(struct hfs_cat_file));
@@ -654,7 +654,7 @@ int hfs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 
 		truncate_setsize(inode, attr->ia_size);
 		hfs_file_truncate(inode);
-		inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+		simple_inode_init_ts(inode);
 	}
 
 	setattr_copy(&nop_mnt_idmap, inode, attr);
diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c
index dc27d418fbcd..76fa02e3835b 100644
--- a/fs/hfs/sysdep.c
+++ b/fs/hfs/sysdep.c
@@ -28,11 +28,13 @@ static int hfs_revalidate_dentry(struct dentry *dentry, unsigned int flags)
 	/* fix up inode on a timezone change */
 	diff = sys_tz.tz_minuteswest * 60 - HFS_I(inode)->tz_secondswest;
 	if (diff) {
-		struct timespec64 ctime = inode_get_ctime(inode);
+		struct timespec64 ts = inode_get_ctime(inode);
 
-		inode_set_ctime(inode, ctime.tv_sec + diff, ctime.tv_nsec);
-		inode->i_atime.tv_sec += diff;
-		inode->i_mtime.tv_sec += diff;
+		inode_set_ctime(inode, ts.tv_sec + diff, ts.tv_nsec);
+		ts = inode_get_atime(inode);
+		inode_set_atime(inode, ts.tv_sec + diff, ts.tv_nsec);
+		ts = inode_get_mtime(inode);
+		inode_set_mtime(inode, ts.tv_sec + diff, ts.tv_nsec);
 		HFS_I(inode)->tz_secondswest += diff;
 	}
 	return 1;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index e71ae2537eaa..1995bafee839 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -312,7 +312,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
 	dir->i_size++;
 	if (S_ISDIR(inode->i_mode))
 		hfsplus_subfolders_inc(dir);
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
 
 	hfs_find_exit(&fd);
@@ -417,7 +417,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str)
 	dir->i_size--;
 	if (type == HFSPLUS_FOLDER)
 		hfsplus_subfolders_dec(dir);
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
 
 	if (type == HFSPLUS_FILE || type == HFSPLUS_FOLDER) {
@@ -494,7 +494,7 @@ int hfsplus_rename_cat(u32 cnid,
 	dst_dir->i_size++;
 	if (type == HFSPLUS_FOLDER)
 		hfsplus_subfolders_inc(dst_dir);
-	dst_dir->i_mtime = inode_set_ctime_current(dst_dir);
+	inode_set_mtime_to_ts(dst_dir, inode_set_ctime_current(dst_dir));
 
 	/* finally remove the old entry */
 	err = hfsplus_cat_build_key(sb, src_fd.search_key,
@@ -511,7 +511,7 @@ int hfsplus_rename_cat(u32 cnid,
 	src_dir->i_size--;
 	if (type == HFSPLUS_FOLDER)
 		hfsplus_subfolders_dec(src_dir);
-	src_dir->i_mtime = inode_set_ctime_current(src_dir);
+	inode_set_mtime_to_ts(src_dir, inode_set_ctime_current(src_dir));
 
 	/* remove old thread entry */
 	hfsplus_cat_build_key_with_cnid(sb, src_fd.search_key, cnid);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index c65c8c4b03dd..702a0663b1d8 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -267,7 +267,7 @@ static int hfsplus_setattr(struct mnt_idmap *idmap,
 		}
 		truncate_setsize(inode, attr->ia_size);
 		hfsplus_file_truncate(inode);
-		inode->i_mtime = inode_set_ctime_current(inode);
+		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	}
 
 	setattr_copy(&nop_mnt_idmap, inode, attr);
@@ -392,7 +392,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir,
 	inode->i_ino = sbi->next_cnid++;
 	inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
 	set_nlink(inode, 1);
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 
 	hip = HFSPLUS_I(inode);
 	INIT_LIST_HEAD(&hip->open_dir_list);
@@ -521,8 +521,9 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
 		hfsplus_get_perms(inode, &folder->permissions, 1);
 		set_nlink(inode, 1);
 		inode->i_size = 2 + be32_to_cpu(folder->valence);
-		inode->i_atime = hfsp_mt2ut(folder->access_date);
-		inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
+		inode_set_atime_to_ts(inode, hfsp_mt2ut(folder->access_date));
+		inode_set_mtime_to_ts(inode,
+				      hfsp_mt2ut(folder->content_mod_date));
 		inode_set_ctime_to_ts(inode,
 				      hfsp_mt2ut(folder->attribute_mod_date));
 		HFSPLUS_I(inode)->create_date = folder->create_date;
@@ -563,8 +564,9 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
 			init_special_inode(inode, inode->i_mode,
 					   be32_to_cpu(file->permissions.dev));
 		}
-		inode->i_atime = hfsp_mt2ut(file->access_date);
-		inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
+		inode_set_atime_to_ts(inode, hfsp_mt2ut(file->access_date));
+		inode_set_mtime_to_ts(inode,
+				      hfsp_mt2ut(file->content_mod_date));
 		inode_set_ctime_to_ts(inode,
 				      hfsp_mt2ut(file->attribute_mod_date));
 		HFSPLUS_I(inode)->create_date = file->create_date;
@@ -609,8 +611,8 @@ int hfsplus_cat_write_inode(struct inode *inode)
 					sizeof(struct hfsplus_cat_folder));
 		/* simple node checks? */
 		hfsplus_cat_set_perms(inode, &folder->permissions);
-		folder->access_date = hfsp_ut2mt(inode->i_atime);
-		folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
+		folder->access_date = hfsp_ut2mt(inode_get_atime(inode));
+		folder->content_mod_date = hfsp_ut2mt(inode_get_mtime(inode));
 		folder->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode));
 		folder->valence = cpu_to_be32(inode->i_size - 2);
 		if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) {
@@ -644,8 +646,8 @@ int hfsplus_cat_write_inode(struct inode *inode)
 			file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
 		else
 			file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED);
-		file->access_date = hfsp_ut2mt(inode->i_atime);
-		file->content_mod_date = hfsp_ut2mt(inode->i_mtime);
+		file->access_date = hfsp_ut2mt(inode_get_atime(inode));
+		file->content_mod_date = hfsp_ut2mt(inode_get_mtime(inode));
 		file->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode));
 		hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
 					 sizeof(struct hfsplus_cat_file));
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 58021e73c00b..9c9ff6b8c6f7 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -13,7 +13,7 @@
 
 static int hfsplus_removexattr(struct inode *inode, const char *name);
 
-const struct xattr_handler *hfsplus_xattr_handlers[] = {
+const struct xattr_handler * const hfsplus_xattr_handlers[] = {
 	&hfsplus_xattr_osx_handler,
 	&hfsplus_xattr_user_handler,
 	&hfsplus_xattr_trusted_handler,
diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h
index d14e362b3eba..15cc55e41410 100644
--- a/fs/hfsplus/xattr.h
+++ b/fs/hfsplus/xattr.h
@@ -17,7 +17,7 @@ extern const struct xattr_handler hfsplus_xattr_user_handler;
 extern const struct xattr_handler hfsplus_xattr_trusted_handler;
 extern const struct xattr_handler hfsplus_xattr_security_handler;
 
-extern const struct xattr_handler *hfsplus_xattr_handlers[];
+extern const struct xattr_handler * const hfsplus_xattr_handlers[];
 
 int __hfsplus_setxattr(struct inode *inode, const char *name,
 			const void *value, size_t size, int flags);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index dc5a5cea5fae..ea87f24c6c3f 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -513,10 +513,14 @@ static int hostfs_inode_update(struct inode *ino, const struct hostfs_stat *st)
 	set_nlink(ino, st->nlink);
 	i_uid_write(ino, st->uid);
 	i_gid_write(ino, st->gid);
-	ino->i_atime =
-		(struct timespec64){ st->atime.tv_sec, st->atime.tv_nsec };
-	ino->i_mtime =
-		(struct timespec64){ st->mtime.tv_sec, st->mtime.tv_nsec };
+	inode_set_atime_to_ts(ino, (struct timespec64){
+			st->atime.tv_sec,
+			st->atime.tv_nsec,
+		});
+	inode_set_mtime_to_ts(ino, (struct timespec64){
+			st->mtime.tv_sec,
+			st->mtime.tv_nsec,
+		});
 	inode_set_ctime(ino, st->ctime.tv_sec, st->ctime.tv_nsec);
 	ino->i_size = st->size;
 	ino->i_blocks = st->blocks;
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index f36566d61215..49dd585c2b17 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -277,14 +277,16 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned in
 	 * inode.
 	 */
 
-	if (!inode_get_ctime(result).tv_sec) {
+	if (!inode_get_ctime_sec(result)) {
 		time64_t csec = local_to_gmt(dir->i_sb, le32_to_cpu(de->creation_date));
 
 		inode_set_ctime(result, csec ? csec : 1, 0);
-		result->i_mtime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->write_date));
-		result->i_mtime.tv_nsec = 0;
-		result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->read_date));
-		result->i_atime.tv_nsec = 0;
+		inode_set_mtime(result,
+				local_to_gmt(dir->i_sb, le32_to_cpu(de->write_date)),
+				0);
+		inode_set_atime(result,
+				local_to_gmt(dir->i_sb, le32_to_cpu(de->read_date)),
+				0);
 		hpfs_result->i_ea_size = le32_to_cpu(de->ea_size);
 		if (!hpfs_result->i_ea_mode && de->read_only)
 			result->i_mode &= ~0222;
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 479166378bae..a59e8fa630db 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -37,8 +37,8 @@ void hpfs_init_inode(struct inode *i)
 	hpfs_inode->i_dirty = 0;
 
 	inode_set_ctime(i, 0, 0);
-	i->i_mtime.tv_sec = i->i_mtime.tv_nsec = 0;
-	i->i_atime.tv_sec = i->i_atime.tv_nsec = 0;
+	inode_set_mtime(i, 0, 0);
+	inode_set_atime(i, 0, 0);
 }
 
 void hpfs_read_inode(struct inode *i)
@@ -230,9 +230,9 @@ void hpfs_write_inode_nolock(struct inode *i)
 	}
 	hpfs_write_inode_ea(i, fnode);
 	if (de) {
-		de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec));
-		de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec));
-		de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime(i).tv_sec));
+		de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_mtime_sec(i)));
+		de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_atime_sec(i)));
+		de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime_sec(i)));
 		de->read_only = !(i->i_mode & 0222);
 		de->ea_size = cpu_to_le32(hpfs_inode->i_ea_size);
 		hpfs_mark_4buffers_dirty(&qbh);
@@ -240,9 +240,9 @@ void hpfs_write_inode_nolock(struct inode *i)
 	}
 	if (S_ISDIR(i->i_mode)) {
 		if ((de = map_dirent(i, hpfs_inode->i_dno, "\001\001", 2, NULL, &qbh))) {
-			de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec));
-			de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec));
-			de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime(i).tv_sec));
+			de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_mtime_sec(i)));
+			de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_atime_sec(i)));
+			de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime_sec(i)));
 			de->read_only = !(i->i_mode & 0222);
 			de->ea_size = cpu_to_le32(/*hpfs_inode->i_ea_size*/0);
 			de->file_size = cpu_to_le32(0);
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index f4eb8d6f5989..9184b4584b01 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -12,10 +12,10 @@
 static void hpfs_update_directory_times(struct inode *dir)
 {
 	time64_t t = local_to_gmt(dir->i_sb, local_get_seconds(dir->i_sb));
-	if (t == dir->i_mtime.tv_sec &&
-	    t == inode_get_ctime(dir).tv_sec)
+	if (t == inode_get_mtime_sec(dir) &&
+	    t == inode_get_ctime_sec(dir))
 		return;
-	dir->i_mtime = inode_set_ctime(dir, t, 0);
+	inode_set_mtime_to_ts(dir, inode_set_ctime(dir, t, 0));
 	hpfs_write_inode_nolock(dir);
 }
 
@@ -58,8 +58,8 @@ static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 	result->i_ino = fno;
 	hpfs_i(result)->i_parent_dir = dir->i_ino;
 	hpfs_i(result)->i_dno = dno;
-	result->i_mtime = result->i_atime =
-		inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
+	inode_set_mtime_to_ts(result,
+			      inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0)));
 	hpfs_i(result)->i_ea_size = 0;
 	result->i_mode |= S_IFDIR;
 	result->i_op = &hpfs_dir_iops;
@@ -164,8 +164,8 @@ static int hpfs_create(struct mnt_idmap *idmap, struct inode *dir,
 	result->i_fop = &hpfs_file_ops;
 	set_nlink(result, 1);
 	hpfs_i(result)->i_parent_dir = dir->i_ino;
-	result->i_mtime = result->i_atime =
-		inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
+	inode_set_mtime_to_ts(result,
+			      inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0)));
 	hpfs_i(result)->i_ea_size = 0;
 	if (dee.read_only)
 		result->i_mode &= ~0222;
@@ -245,8 +245,8 @@ static int hpfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 	hpfs_init_inode(result);
 	result->i_ino = fno;
 	hpfs_i(result)->i_parent_dir = dir->i_ino;
-	result->i_mtime = result->i_atime =
-		inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
+	inode_set_mtime_to_ts(result,
+			      inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0)));
 	hpfs_i(result)->i_ea_size = 0;
 	result->i_uid = current_fsuid();
 	result->i_gid = current_fsgid();
@@ -319,8 +319,8 @@ static int hpfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 	result->i_ino = fno;
 	hpfs_init_inode(result);
 	hpfs_i(result)->i_parent_dir = dir->i_ino;
-	result->i_mtime = result->i_atime =
-		inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
+	inode_set_mtime_to_ts(result,
+			      inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0)));
 	hpfs_i(result)->i_ea_size = 0;
 	result->i_mode = S_IFLNK | 0777;
 	result->i_uid = current_fsuid();
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 758a51564124..6b0ba3c1efba 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -725,10 +725,12 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
 	if (!de)
 		hpfs_error(s, "unable to find root dir");
 	else {
-		root->i_atime.tv_sec = local_to_gmt(s, le32_to_cpu(de->read_date));
-		root->i_atime.tv_nsec = 0;
-		root->i_mtime.tv_sec = local_to_gmt(s, le32_to_cpu(de->write_date));
-		root->i_mtime.tv_nsec = 0;
+		inode_set_atime(root,
+				local_to_gmt(s, le32_to_cpu(de->read_date)),
+				0);
+		inode_set_mtime(root,
+				local_to_gmt(s, le32_to_cpu(de->write_date)),
+				0);
 		inode_set_ctime(root,
 				local_to_gmt(s, le32_to_cpu(de->creation_date)),
 				0);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 316c4cebd3f3..da217eaba102 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -980,7 +980,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
 		inode->i_mode = S_IFDIR | ctx->mode;
 		inode->i_uid = ctx->uid;
 		inode->i_gid = ctx->gid;
-		inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+		simple_inode_init_ts(inode);
 		inode->i_op = &hugetlbfs_dir_inode_operations;
 		inode->i_fop = &simple_dir_operations;
 		/* directory inodes start off with i_nlink == 2 (for "." entry) */
@@ -1024,7 +1024,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 		lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
 				&hugetlbfs_i_mmap_rwsem_key);
 		inode->i_mapping->a_ops = &hugetlbfs_aops;
-		inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+		simple_inode_init_ts(inode);
 		inode->i_mapping->private_data = resv_map;
 		info->seals = F_SEAL_SEAL;
 		switch (mode & S_IFMT) {
@@ -1067,7 +1067,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 	inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
 	if (!inode)
 		return -ENOSPC;
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	d_instantiate(dentry, inode);
 	dget(dentry);/* Extra count - pin the dentry in core */
 	return 0;
@@ -1099,7 +1099,7 @@ static int hugetlbfs_tmpfile(struct mnt_idmap *idmap,
 	inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0);
 	if (!inode)
 		return -ENOSPC;
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	d_tmpfile(file, inode);
 	return finish_open_simple(file, 0);
 }
@@ -1121,7 +1121,7 @@ static int hugetlbfs_symlink(struct mnt_idmap *idmap,
 		} else
 			iput(inode);
 	}
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 
 	return error;
 }
diff --git a/fs/init.c b/fs/init.c
index 9684406a8416..e9387b6c4f30 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -153,8 +153,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev)
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
-	if (!IS_POSIXACL(path.dentry->d_inode))
-		mode &= ~current_umask();
+	mode = mode_strip_umask(d_inode(path.dentry), mode);
 	error = security_path_mknod(&path, dentry, mode, dev);
 	if (!error)
 		error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode,
@@ -229,8 +228,7 @@ int __init init_mkdir(const char *pathname, umode_t mode)
 	dentry = kern_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY);
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
-	if (!IS_POSIXACL(path.dentry->d_inode))
-		mode &= ~current_umask();
+	mode = mode_strip_umask(d_inode(path.dentry), mode);
 	error = security_path_mkdir(&path, dentry, mode);
 	if (!error)
 		error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
diff --git a/fs/inode.c b/fs/inode.c
index 84bc3c76e5cc..4f8984b97df0 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1837,27 +1837,29 @@ EXPORT_SYMBOL(bmap);
 static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
 			     struct timespec64 now)
 {
-	struct timespec64 ctime;
+	struct timespec64 atime, mtime, ctime;
 
 	if (!(mnt->mnt_flags & MNT_RELATIME))
 		return 1;
 	/*
 	 * Is mtime younger than or equal to atime? If yes, update atime:
 	 */
-	if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0)
+	atime = inode_get_atime(inode);
+	mtime = inode_get_mtime(inode);
+	if (timespec64_compare(&mtime, &atime) >= 0)
 		return 1;
 	/*
 	 * Is ctime younger than or equal to atime? If yes, update atime:
 	 */
 	ctime = inode_get_ctime(inode);
-	if (timespec64_compare(&ctime, &inode->i_atime) >= 0)
+	if (timespec64_compare(&ctime, &atime) >= 0)
 		return 1;
 
 	/*
 	 * Is the previous atime value older than a day? If yes,
 	 * update atime:
 	 */
-	if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60)
+	if ((long)(now.tv_sec - atime.tv_sec) >= 24*60*60)
 		return 1;
 	/*
 	 * Good, we can skip the atime update:
@@ -1888,12 +1890,13 @@ int inode_update_timestamps(struct inode *inode, int flags)
 
 	if (flags & (S_MTIME|S_CTIME|S_VERSION)) {
 		struct timespec64 ctime = inode_get_ctime(inode);
+		struct timespec64 mtime = inode_get_mtime(inode);
 
 		now = inode_set_ctime_current(inode);
 		if (!timespec64_equal(&now, &ctime))
 			updated |= S_CTIME;
-		if (!timespec64_equal(&now, &inode->i_mtime)) {
-			inode->i_mtime = now;
+		if (!timespec64_equal(&now, &mtime)) {
+			inode_set_mtime_to_ts(inode, now);
 			updated |= S_MTIME;
 		}
 		if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, updated))
@@ -1903,8 +1906,10 @@ int inode_update_timestamps(struct inode *inode, int flags)
 	}
 
 	if (flags & S_ATIME) {
-		if (!timespec64_equal(&now, &inode->i_atime)) {
-			inode->i_atime = now;
+		struct timespec64 atime = inode_get_atime(inode);
+
+		if (!timespec64_equal(&now, &atime)) {
+			inode_set_atime_to_ts(inode, now);
 			updated |= S_ATIME;
 		}
 	}
@@ -1963,7 +1968,7 @@ EXPORT_SYMBOL(inode_update_time);
 bool atime_needs_update(const struct path *path, struct inode *inode)
 {
 	struct vfsmount *mnt = path->mnt;
-	struct timespec64 now;
+	struct timespec64 now, atime;
 
 	if (inode->i_flags & S_NOATIME)
 		return false;
@@ -1989,7 +1994,8 @@ bool atime_needs_update(const struct path *path, struct inode *inode)
 	if (!relatime_need_update(mnt, inode, now))
 		return false;
 
-	if (timespec64_equal(&inode->i_atime, &now))
+	atime = inode_get_atime(inode);
+	if (timespec64_equal(&atime, &now))
 		return false;
 
 	return true;
@@ -2006,7 +2012,7 @@ void touch_atime(const struct path *path)
 	if (!sb_start_write_trylock(inode->i_sb))
 		return;
 
-	if (__mnt_want_write(mnt) != 0)
+	if (mnt_get_write_access(mnt) != 0)
 		goto skip_update;
 	/*
 	 * File systems can error out when updating inodes if they need to
@@ -2018,7 +2024,7 @@ void touch_atime(const struct path *path)
 	 * of the fs read only, e.g. subvolumes in Btrfs.
 	 */
 	inode_update_time(inode, S_ATIME);
-	__mnt_drop_write(mnt);
+	mnt_put_write_access(mnt);
 skip_update:
 	sb_end_write(inode->i_sb);
 }
@@ -2106,17 +2112,18 @@ static int inode_needs_update_time(struct inode *inode)
 {
 	int sync_it = 0;
 	struct timespec64 now = current_time(inode);
-	struct timespec64 ctime;
+	struct timespec64 ts;
 
 	/* First try to exhaust all avenues to not sync */
 	if (IS_NOCMTIME(inode))
 		return 0;
 
-	if (!timespec64_equal(&inode->i_mtime, &now))
+	ts = inode_get_mtime(inode);
+	if (!timespec64_equal(&ts, &now))
 		sync_it = S_MTIME;
 
-	ctime = inode_get_ctime(inode);
-	if (!timespec64_equal(&ctime, &now))
+	ts = inode_get_ctime(inode);
+	if (!timespec64_equal(&ts, &now))
 		sync_it |= S_CTIME;
 
 	if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
@@ -2131,9 +2138,9 @@ static int __file_update_time(struct file *file, int sync_mode)
 	struct inode *inode = file_inode(file);
 
 	/* try to update time settings */
-	if (!__mnt_want_write_file(file)) {
+	if (!mnt_get_write_access_file(file)) {
 		ret = inode_update_time(inode, sync_mode);
-		__mnt_drop_write_file(file);
+		mnt_put_write_access_file(file);
 	}
 
 	return ret;
diff --git a/fs/internal.h b/fs/internal.h
index d64ae03998cc..58e43341aebf 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -73,8 +73,8 @@ extern int sb_prepare_remount_readonly(struct super_block *);
 
 extern void __init mnt_init(void);
 
-extern int __mnt_want_write_file(struct file *);
-extern void __mnt_drop_write_file(struct file *);
+int mnt_get_write_access_file(struct file *file);
+void mnt_put_write_access_file(struct file *file);
 
 extern void dissolve_on_fput(struct vfsmount *);
 extern bool may_mount(void);
@@ -94,14 +94,22 @@ extern void chroot_fs_refs(const struct path *, const struct path *);
 struct file *alloc_empty_file(int flags, const struct cred *cred);
 struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
 struct file *alloc_empty_backing_file(int flags, const struct cred *cred);
+void release_empty_file(struct file *f);
+
+static inline void file_put_write_access(struct file *file)
+{
+	put_write_access(file->f_inode);
+	mnt_put_write_access(file->f_path.mnt);
+	if (unlikely(file->f_mode & FMODE_BACKING))
+		mnt_put_write_access(backing_file_user_path(file)->mnt);
+}
 
 static inline void put_file_access(struct file *file)
 {
 	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
 		i_readcount_dec(file->f_inode);
 	} else if (file->f_mode & FMODE_WRITER) {
-		put_write_access(file->f_inode);
-		__mnt_drop_write(file->f_path.mnt);
+		file_put_write_access(file);
 	}
 }
 
@@ -130,9 +138,9 @@ static inline void sb_start_ro_state_change(struct super_block *sb)
 	 * mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY
 	 * cleared, it will see s_readonly_remount set.
 	 * For RW->RO transition, the barrier pairs with the barrier in
-	 * __mnt_want_write() before the mnt_is_readonly() check. The barrier
-	 * makes sure if __mnt_want_write() sees MNT_WRITE_HOLD already
-	 * cleared, it will see s_readonly_remount set.
+	 * mnt_get_write_access() before the mnt_is_readonly() check.
+	 * The barrier makes sure if mnt_get_write_access() sees MNT_WRITE_HOLD
+	 * already cleared, it will see s_readonly_remount set.
 	 */
 	smp_wmb();
 }
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 2ee21286ac8f..3e4d53e26f94 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1422,8 +1422,8 @@ static int isofs_read_inode(struct inode *inode, int relocated)
 			inode->i_ino, de->flags[-high_sierra]);
 	}
 #endif
-	inode->i_mtime = inode->i_atime =
-		inode_set_ctime(inode, iso_date(de->date, high_sierra), 0);
+	inode_set_mtime_to_ts(inode,
+			      inode_set_atime_to_ts(inode, inode_set_ctime(inode, iso_date(de->date, high_sierra), 0)));
 
 	ei->i_first_extent = (isonum_733(de->extent) +
 			isonum_711(de->ext_attr_length));
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 348783a70f57..d6c17ad69dee 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -426,16 +426,14 @@ repeat:
 						0);
 			}
 			if (rr->u.TF.flags & TF_MODIFY) {
-				inode->i_mtime.tv_sec =
-				    iso_date(rr->u.TF.times[cnt++].time,
-					     0);
-				inode->i_mtime.tv_nsec = 0;
+				inode_set_mtime(inode,
+						iso_date(rr->u.TF.times[cnt++].time, 0),
+						0);
 			}
 			if (rr->u.TF.flags & TF_ACCESS) {
-				inode->i_atime.tv_sec =
-				    iso_date(rr->u.TF.times[cnt++].time,
-					     0);
-				inode->i_atime.tv_nsec = 0;
+				inode_set_atime(inode,
+						iso_date(rr->u.TF.times[cnt++].time, 0),
+						0);
 			}
 			if (rr->u.TF.flags & TF_ATTRIBUTES) {
 				inode_set_ctime(inode,
@@ -531,9 +529,9 @@ repeat:
 			inode->i_rdev = reloc->i_rdev;
 			inode->i_size = reloc->i_size;
 			inode->i_blocks = reloc->i_blocks;
-			inode->i_atime = reloc->i_atime;
+			inode_set_atime_to_ts(inode, inode_get_atime(reloc));
 			inode_set_ctime_to_ts(inode, inode_get_ctime(reloc));
-			inode->i_mtime = reloc->i_mtime;
+			inode_set_mtime_to_ts(inode, inode_get_mtime(reloc));
 			iput(reloc);
 			break;
 #ifdef CONFIG_ZISOFS
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 091ab0eaabbe..2b2938970da3 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -204,8 +204,8 @@ static int jffs2_create(struct mnt_idmap *idmap, struct inode *dir_i,
 	if (ret)
 		goto fail;
 
-	dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
-					       ITIME(je32_to_cpu(ri->ctime)));
+	inode_set_mtime_to_ts(dir_i,
+			      inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(ri->ctime))));
 
 	jffs2_free_raw_inode(ri);
 
@@ -238,7 +238,8 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry)
 	if (dead_f->inocache)
 		set_nlink(d_inode(dentry), dead_f->inocache->pino_nlink);
 	if (!ret)
-		dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now));
+		inode_set_mtime_to_ts(dir_i,
+				      inode_set_ctime_to_ts(dir_i, ITIME(now)));
 	return ret;
 }
 /***********************************************************************/
@@ -272,7 +273,8 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
 		set_nlink(d_inode(old_dentry), ++f->inocache->pino_nlink);
 		mutex_unlock(&f->sem);
 		d_instantiate(dentry, d_inode(old_dentry));
-		dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now));
+		inode_set_mtime_to_ts(dir_i,
+				      inode_set_ctime_to_ts(dir_i, ITIME(now)));
 		ihold(d_inode(old_dentry));
 	}
 	return ret;
@@ -423,8 +425,8 @@ static int jffs2_symlink (struct mnt_idmap *idmap, struct inode *dir_i,
 		goto fail;
 	}
 
-	dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
-					       ITIME(je32_to_cpu(rd->mctime)));
+	inode_set_mtime_to_ts(dir_i,
+			      inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(rd->mctime))));
 
 	jffs2_free_raw_dirent(rd);
 
@@ -568,8 +570,8 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
 		goto fail;
 	}
 
-	dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
-					       ITIME(je32_to_cpu(rd->mctime)));
+	inode_set_mtime_to_ts(dir_i,
+			      inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(rd->mctime))));
 	inc_nlink(dir_i);
 
 	jffs2_free_raw_dirent(rd);
@@ -610,7 +612,8 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
 	ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name,
 			      dentry->d_name.len, f, now);
 	if (!ret) {
-		dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now));
+		inode_set_mtime_to_ts(dir_i,
+				      inode_set_ctime_to_ts(dir_i, ITIME(now)));
 		clear_nlink(d_inode(dentry));
 		drop_nlink(dir_i);
 	}
@@ -746,8 +749,8 @@ static int jffs2_mknod (struct mnt_idmap *idmap, struct inode *dir_i,
 		goto fail;
 	}
 
-	dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
-					       ITIME(je32_to_cpu(rd->mctime)));
+	inode_set_mtime_to_ts(dir_i,
+			      inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(rd->mctime))));
 
 	jffs2_free_raw_dirent(rd);
 
@@ -868,16 +871,18 @@ static int jffs2_rename (struct mnt_idmap *idmap,
 		 * caller won't do it on its own since we are returning an error.
 		 */
 		d_invalidate(new_dentry);
-		new_dir_i->i_mtime = inode_set_ctime_to_ts(new_dir_i,
-							   ITIME(now));
+		inode_set_mtime_to_ts(new_dir_i,
+				      inode_set_ctime_to_ts(new_dir_i, ITIME(now)));
 		return ret;
 	}
 
 	if (d_is_dir(old_dentry))
 		drop_nlink(old_dir_i);
 
-	old_dir_i->i_mtime = inode_set_ctime_to_ts(old_dir_i, ITIME(now));
-	new_dir_i->i_mtime = inode_set_ctime_to_ts(new_dir_i, ITIME(now));
+	inode_set_mtime_to_ts(old_dir_i,
+			      inode_set_ctime_to_ts(old_dir_i, ITIME(now)));
+	inode_set_mtime_to_ts(new_dir_i,
+			      inode_set_ctime_to_ts(new_dir_i, ITIME(now)));
 
 	return 0;
 }
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 11c66793960e..62ea76da7fdf 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -317,8 +317,8 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
 			inode->i_size = pos + writtenlen;
 			inode->i_blocks = (inode->i_size + 511) >> 9;
 
-			inode->i_mtime = inode_set_ctime_to_ts(inode,
-							       ITIME(je32_to_cpu(ri->ctime)));
+			inode_set_mtime_to_ts(inode,
+					      inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(ri->ctime))));
 		}
 	}
 
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 0403efab4089..d175cccb7c55 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -113,8 +113,8 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
 
 
 	ri->isize = cpu_to_je32((ivalid & ATTR_SIZE)?iattr->ia_size:inode->i_size);
-	ri->atime = cpu_to_je32(I_SEC((ivalid & ATTR_ATIME)?iattr->ia_atime:inode->i_atime));
-	ri->mtime = cpu_to_je32(I_SEC((ivalid & ATTR_MTIME)?iattr->ia_mtime:inode->i_mtime));
+	ri->atime = cpu_to_je32(I_SEC((ivalid & ATTR_ATIME)?iattr->ia_atime:inode_get_atime(inode)));
+	ri->mtime = cpu_to_je32(I_SEC((ivalid & ATTR_MTIME)?iattr->ia_mtime:inode_get_mtime(inode)));
 	ri->ctime = cpu_to_je32(I_SEC((ivalid & ATTR_CTIME)?iattr->ia_ctime:inode_get_ctime(inode)));
 
 	ri->offset = cpu_to_je32(0);
@@ -147,9 +147,9 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
 		return PTR_ERR(new_metadata);
 	}
 	/* It worked. Update the inode */
-	inode->i_atime = ITIME(je32_to_cpu(ri->atime));
+	inode_set_atime_to_ts(inode, ITIME(je32_to_cpu(ri->atime)));
 	inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(ri->ctime)));
-	inode->i_mtime = ITIME(je32_to_cpu(ri->mtime));
+	inode_set_mtime_to_ts(inode, ITIME(je32_to_cpu(ri->mtime)));
 	inode->i_mode = jemode_to_cpu(ri->mode);
 	i_uid_write(inode, je16_to_cpu(ri->uid));
 	i_gid_write(inode, je16_to_cpu(ri->gid));
@@ -282,8 +282,8 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
 	i_uid_write(inode, je16_to_cpu(latest_node.uid));
 	i_gid_write(inode, je16_to_cpu(latest_node.gid));
 	inode->i_size = je32_to_cpu(latest_node.isize);
-	inode->i_atime = ITIME(je32_to_cpu(latest_node.atime));
-	inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime));
+	inode_set_atime_to_ts(inode, ITIME(je32_to_cpu(latest_node.atime)));
+	inode_set_mtime_to_ts(inode, ITIME(je32_to_cpu(latest_node.mtime)));
 	inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(latest_node.ctime)));
 
 	set_nlink(inode, f->inocache->pino_nlink);
@@ -386,8 +386,8 @@ void jffs2_dirty_inode(struct inode *inode, int flags)
 	iattr.ia_mode = inode->i_mode;
 	iattr.ia_uid = inode->i_uid;
 	iattr.ia_gid = inode->i_gid;
-	iattr.ia_atime = inode->i_atime;
-	iattr.ia_mtime = inode->i_mtime;
+	iattr.ia_atime = inode_get_atime(inode);
+	iattr.ia_mtime = inode_get_mtime(inode);
 	iattr.ia_ctime = inode_get_ctime(inode);
 
 	jffs2_do_setattr(inode, &iattr);
@@ -475,8 +475,8 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
 	inode->i_mode = jemode_to_cpu(ri->mode);
 	i_gid_write(inode, je16_to_cpu(ri->gid));
 	i_uid_write(inode, je16_to_cpu(ri->uid));
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
-	ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime));
+	simple_inode_init_ts(inode);
+	ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode_get_mtime(inode)));
 
 	inode->i_blocks = 0;
 	inode->i_size = 0;
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 50727a1ff931..86ab014a349c 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -36,8 +36,8 @@ struct kvec;
 #define JFFS2_NOW() JFFS2_CLAMP_TIME(ktime_get_real_seconds())
 #define I_SEC(tv) JFFS2_CLAMP_TIME((tv).tv_sec)
 #define JFFS2_F_I_CTIME(f) I_SEC(inode_get_ctime(OFNI_EDONI_2SFFJ(f)))
-#define JFFS2_F_I_MTIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_mtime)
-#define JFFS2_F_I_ATIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_atime)
+#define JFFS2_F_I_MTIME(f) I_SEC(inode_get_mtime(OFNI_EDONI_2SFFJ(f)))
+#define JFFS2_F_I_ATIME(f) I_SEC(inode_get_atime(OFNI_EDONI_2SFFJ(f)))
 #define sleep_on_spinunlock(wq, s)				\
 	do {							\
 		DECLARE_WAITQUEUE(__wait, current);		\
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 3b6bdc9a49e1..00224f3a8d6e 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -920,7 +920,7 @@ struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
  * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags)
  *   is an implementation of setxattr handler on jffs2.
  * -------------------------------------------------- */
-const struct xattr_handler *jffs2_xattr_handlers[] = {
+const struct xattr_handler * const jffs2_xattr_handlers[] = {
 	&jffs2_user_xattr_handler,
 #ifdef CONFIG_JFFS2_FS_SECURITY
 	&jffs2_security_xattr_handler,
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 1b5030a3349d..7e7de093ec0a 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -94,7 +94,7 @@ extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname
 extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 			     const char *buffer, size_t size, int flags);
 
-extern const struct xattr_handler *jffs2_xattr_handlers[];
+extern const struct xattr_handler * const jffs2_xattr_handlers[];
 extern const struct xattr_handler jffs2_user_xattr_handler;
 extern const struct xattr_handler jffs2_trusted_xattr_handler;
 
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 920d58a1566b..1a6b5921d17a 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -393,7 +393,7 @@ void jfs_truncate_nolock(struct inode *ip, loff_t length)
 			break;
 		}
 
-		ip->i_mtime = inode_set_ctime_current(ip);
+		inode_set_mtime_to_ts(ip, inode_set_ctime_current(ip));
 		mark_inode_dirty(ip);
 
 		txCommit(tid, 1, &ip, 0);
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 923a58422c46..8e87264e56ce 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -3061,10 +3061,10 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip)
 	}
 
 	ip->i_size = le64_to_cpu(dip->di_size);
-	ip->i_atime.tv_sec = le32_to_cpu(dip->di_atime.tv_sec);
-	ip->i_atime.tv_nsec = le32_to_cpu(dip->di_atime.tv_nsec);
-	ip->i_mtime.tv_sec = le32_to_cpu(dip->di_mtime.tv_sec);
-	ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec);
+	inode_set_atime(ip, le32_to_cpu(dip->di_atime.tv_sec),
+			le32_to_cpu(dip->di_atime.tv_nsec));
+	inode_set_mtime(ip, le32_to_cpu(dip->di_mtime.tv_sec),
+			le32_to_cpu(dip->di_mtime.tv_nsec));
 	inode_set_ctime(ip, le32_to_cpu(dip->di_ctime.tv_sec),
 			le32_to_cpu(dip->di_ctime.tv_nsec));
 	ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks));
@@ -3138,12 +3138,12 @@ static void copy_to_dinode(struct dinode * dip, struct inode *ip)
 	else /* Leave the original permissions alone */
 		dip->di_mode = cpu_to_le32(jfs_ip->mode2);
 
-	dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime.tv_sec);
-	dip->di_atime.tv_nsec = cpu_to_le32(ip->i_atime.tv_nsec);
-	dip->di_ctime.tv_sec = cpu_to_le32(inode_get_ctime(ip).tv_sec);
-	dip->di_ctime.tv_nsec = cpu_to_le32(inode_get_ctime(ip).tv_nsec);
-	dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime.tv_sec);
-	dip->di_mtime.tv_nsec = cpu_to_le32(ip->i_mtime.tv_nsec);
+	dip->di_atime.tv_sec = cpu_to_le32(inode_get_atime_sec(ip));
+	dip->di_atime.tv_nsec = cpu_to_le32(inode_get_atime_nsec(ip));
+	dip->di_ctime.tv_sec = cpu_to_le32(inode_get_ctime_sec(ip));
+	dip->di_ctime.tv_nsec = cpu_to_le32(inode_get_ctime_nsec(ip));
+	dip->di_mtime.tv_sec = cpu_to_le32(inode_get_mtime_sec(ip));
+	dip->di_mtime.tv_nsec = cpu_to_le32(inode_get_mtime_nsec(ip));
 	dip->di_ixpxd = jfs_ip->ixpxd;	/* in-memory pxd's are little-endian */
 	dip->di_acl = jfs_ip->acl;	/* as are dxd's */
 	dip->di_ea = jfs_ip->ea;
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 87594efa7f7c..f10f295d1502 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -97,8 +97,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 	jfs_inode->mode2 |= inode->i_mode;
 
 	inode->i_blocks = 0;
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
-	jfs_inode->otime = inode_get_ctime(inode).tv_sec;
+	simple_inode_init_ts(inode);
+	jfs_inode->otime = inode_get_ctime_sec(inode);
 	inode->i_generation = JFS_SBI(sb)->gengen++;
 
 	jfs_inode->cflag = 0;
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index e855b8fde76c..cb6d1fda66a7 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1058,7 +1058,7 @@ void jfs_syncpt(struct jfs_log *log, int hard_sync)
 int lmLogOpen(struct super_block *sb)
 {
 	int rc;
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	struct jfs_log *log;
 	struct jfs_sb_info *sbi = JFS_SBI(sb);
 
@@ -1070,7 +1070,7 @@ int lmLogOpen(struct super_block *sb)
 
 	mutex_lock(&jfs_log_mutex);
 	list_for_each_entry(log, &jfs_external_logs, journal_list) {
-		if (log->bdev->bd_dev == sbi->logdev) {
+		if (log->bdev_handle->bdev->bd_dev == sbi->logdev) {
 			if (!uuid_equal(&log->uuid, &sbi->loguuid)) {
 				jfs_warn("wrong uuid on JFS journal");
 				mutex_unlock(&jfs_log_mutex);
@@ -1100,14 +1100,14 @@ int lmLogOpen(struct super_block *sb)
 	 * file systems to log may have n-to-1 relationship;
 	 */
 
-	bdev = blkdev_get_by_dev(sbi->logdev, BLK_OPEN_READ | BLK_OPEN_WRITE,
-				 log, NULL);
-	if (IS_ERR(bdev)) {
-		rc = PTR_ERR(bdev);
+	bdev_handle = bdev_open_by_dev(sbi->logdev,
+			BLK_OPEN_READ | BLK_OPEN_WRITE, log, NULL);
+	if (IS_ERR(bdev_handle)) {
+		rc = PTR_ERR(bdev_handle);
 		goto free;
 	}
 
-	log->bdev = bdev;
+	log->bdev_handle = bdev_handle;
 	uuid_copy(&log->uuid, &sbi->loguuid);
 
 	/*
@@ -1141,7 +1141,7 @@ journal_found:
 	lbmLogShutdown(log);
 
       close:		/* close external log device */
-	blkdev_put(bdev, log);
+	bdev_release(bdev_handle);
 
       free:		/* free log descriptor */
 	mutex_unlock(&jfs_log_mutex);
@@ -1162,7 +1162,7 @@ static int open_inline_log(struct super_block *sb)
 	init_waitqueue_head(&log->syncwait);
 
 	set_bit(log_INLINELOG, &log->flag);
-	log->bdev = sb->s_bdev;
+	log->bdev_handle = sb->s_bdev_handle;
 	log->base = addressPXD(&JFS_SBI(sb)->logpxd);
 	log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >>
 	    (L2LOGPSIZE - sb->s_blocksize_bits);
@@ -1436,7 +1436,7 @@ int lmLogClose(struct super_block *sb)
 {
 	struct jfs_sb_info *sbi = JFS_SBI(sb);
 	struct jfs_log *log = sbi->log;
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	int rc = 0;
 
 	jfs_info("lmLogClose: log:0x%p", log);
@@ -1482,10 +1482,10 @@ int lmLogClose(struct super_block *sb)
 	 *	external log as separate logical volume
 	 */
 	list_del(&log->journal_list);
-	bdev = log->bdev;
+	bdev_handle = log->bdev_handle;
 	rc = lmLogShutdown(log);
 
-	blkdev_put(bdev, log);
+	bdev_release(bdev_handle);
 
 	kfree(log);
 
@@ -1972,7 +1972,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
 
 	bp->l_flag |= lbmREAD;
 
-	bio = bio_alloc(log->bdev, 1, REQ_OP_READ, GFP_NOFS);
+	bio = bio_alloc(log->bdev_handle->bdev, 1, REQ_OP_READ, GFP_NOFS);
 	bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
 	__bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset);
 	BUG_ON(bio->bi_iter.bi_size != LOGPSIZE);
@@ -2110,10 +2110,15 @@ static void lbmStartIO(struct lbuf * bp)
 {
 	struct bio *bio;
 	struct jfs_log *log = bp->l_log;
+	struct block_device *bdev = NULL;
 
 	jfs_info("lbmStartIO");
 
-	bio = bio_alloc(log->bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS);
+	if (!log->no_integrity)
+		bdev = log->bdev_handle->bdev;
+
+	bio = bio_alloc(bdev, 1, REQ_OP_WRITE | REQ_SYNC,
+			GFP_NOFS);
 	bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
 	__bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset);
 	BUG_ON(bio->bi_iter.bi_size != LOGPSIZE);
diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h
index 805877ce5020..84aa2d253907 100644
--- a/fs/jfs/jfs_logmgr.h
+++ b/fs/jfs/jfs_logmgr.h
@@ -356,7 +356,7 @@ struct jfs_log {
 				 *    before writing syncpt.
 				 */
 	struct list_head journal_list; /* Global list */
-	struct block_device *bdev; /* 4: log lv pointer */
+	struct bdev_handle *bdev_handle; /* 4: log lv pointer */
 	int serial;		/* 4: log mount serial number */
 
 	s64 base;		/* @8: log extent address (inline log ) */
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index b83aae56a1f2..415eb65a36ff 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -430,7 +430,8 @@ int updateSuper(struct super_block *sb, uint state)
 
 	if (state == FM_MOUNT) {
 		/* record log's dev_t and mount serial number */
-		j_sb->s_logdev = cpu_to_le32(new_encode_dev(sbi->log->bdev->bd_dev));
+		j_sb->s_logdev = cpu_to_le32(
+			new_encode_dev(sbi->log->bdev_handle->bdev->bd_dev));
 		j_sb->s_logserial = cpu_to_le32(sbi->log->serial);
 	} else if (state == FM_CLEAN) {
 		/*
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
index 0d33816d251d..ec67d8554d2c 100644
--- a/fs/jfs/jfs_xattr.h
+++ b/fs/jfs/jfs_xattr.h
@@ -46,7 +46,7 @@ extern int __jfs_setxattr(tid_t, struct inode *, const char *, const void *,
 extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t);
 extern ssize_t jfs_listxattr(struct dentry *, char *, size_t);
 
-extern const struct xattr_handler *jfs_xattr_handlers[];
+extern const struct xattr_handler * const jfs_xattr_handlers[];
 
 #ifdef CONFIG_JFS_SECURITY
 extern int jfs_init_security(tid_t, struct inode *, struct inode *,
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 57d7a4300210..d68a4e6ac345 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -149,7 +149,7 @@ static int jfs_create(struct mnt_idmap *idmap, struct inode *dip,
 
 	mark_inode_dirty(ip);
 
-	dip->i_mtime = inode_set_ctime_current(dip);
+	inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip));
 
 	mark_inode_dirty(dip);
 
@@ -284,7 +284,7 @@ static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip,
 
 	/* update parent directory inode */
 	inc_nlink(dip);		/* for '..' from child directory */
-	dip->i_mtime = inode_set_ctime_current(dip);
+	inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip));
 	mark_inode_dirty(dip);
 
 	rc = txCommit(tid, 2, &iplist[0], 0);
@@ -390,7 +390,7 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
 	/* update parent directory's link count corresponding
 	 * to ".." entry of the target directory deleted
 	 */
-	dip->i_mtime = inode_set_ctime_current(dip);
+	inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip));
 	inode_dec_link_count(dip);
 
 	/*
@@ -512,7 +512,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
 
 	ASSERT(ip->i_nlink);
 
-	dip->i_mtime = inode_set_ctime_to_ts(dip, inode_set_ctime_current(ip));
+	inode_set_mtime_to_ts(dip,
+			      inode_set_ctime_to_ts(dip, inode_set_ctime_current(ip)));
 	mark_inode_dirty(dip);
 
 	/* update target's inode */
@@ -828,7 +829,7 @@ static int jfs_link(struct dentry *old_dentry,
 	/* update object inode */
 	inc_nlink(ip);		/* for new link */
 	inode_set_ctime_current(ip);
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	mark_inode_dirty(dir);
 	ihold(ip);
 
@@ -1028,7 +1029,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip,
 
 	mark_inode_dirty(ip);
 
-	dip->i_mtime = inode_set_ctime_current(dip);
+	inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip));
 	mark_inode_dirty(dip);
 	/*
 	 * commit update of parent directory and link object
@@ -1271,7 +1272,7 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 	inode_set_ctime_current(old_ip);
 	mark_inode_dirty(old_ip);
 
-	new_dir->i_mtime = inode_set_ctime_current(new_dir);
+	inode_set_mtime_to_ts(new_dir, inode_set_ctime_current(new_dir));
 	mark_inode_dirty(new_dir);
 
 	/* Build list of inodes modified by this transaction */
@@ -1283,7 +1284,8 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 
 	if (old_dir != new_dir) {
 		iplist[ipcount++] = new_dir;
-		old_dir->i_mtime = inode_set_ctime_current(old_dir);
+		inode_set_mtime_to_ts(old_dir,
+				      inode_set_ctime_current(old_dir));
 		mark_inode_dirty(old_dir);
 	}
 
@@ -1416,7 +1418,7 @@ static int jfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 
 	mark_inode_dirty(ip);
 
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 
 	mark_inode_dirty(dir);
 
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 2e2f7f6d36a0..966826c394ee 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -818,7 +818,7 @@ out:
 	}
 	if (inode->i_size < off+len-towrite)
 		i_size_write(inode, off+len-towrite);
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	mark_inode_dirty(inode);
 	inode_unlock(inode);
 	return len - towrite;
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 8577ad494e05..0fb7afac298e 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -985,7 +985,7 @@ static const struct xattr_handler jfs_trusted_xattr_handler = {
 	.set = jfs_xattr_set,
 };
 
-const struct xattr_handler *jfs_xattr_handlers[] = {
+const struct xattr_handler * const jfs_xattr_handlers[] = {
 	&jfs_os2_xattr_handler,
 	&jfs_user_xattr_handler,
 	&jfs_security_xattr_handler,
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 922719a343a7..b83054da68b3 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -151,7 +151,7 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
 static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
 {
 	inode->i_mode = mode;
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 }
 
 static inline void set_inode_attr(struct inode *inode,
@@ -159,8 +159,8 @@ static inline void set_inode_attr(struct inode *inode,
 {
 	inode->i_uid = attrs->ia_uid;
 	inode->i_gid = attrs->ia_gid;
-	inode->i_atime = attrs->ia_atime;
-	inode->i_mtime = attrs->ia_mtime;
+	inode_set_atime_to_ts(inode, attrs->ia_atime);
+	inode_set_mtime_to_ts(inode, attrs->ia_mtime);
 	inode_set_ctime_to_ts(inode, attrs->ia_ctime);
 }
 
@@ -445,7 +445,7 @@ static const struct xattr_handler kernfs_user_xattr_handler = {
 	.set = kernfs_vfs_user_xattr_set,
 };
 
-const struct xattr_handler *kernfs_xattr_handlers[] = {
+const struct xattr_handler * const kernfs_xattr_handlers[] = {
 	&kernfs_trusted_xattr_handler,
 	&kernfs_security_xattr_handler,
 	&kernfs_user_xattr_handler,
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index a9b854cdfdb5..237f2764b941 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -127,7 +127,7 @@ extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
 /*
  * inode.c
  */
-extern const struct xattr_handler *kernfs_xattr_handlers[];
+extern const struct xattr_handler * const kernfs_xattr_handlers[];
 void kernfs_evict_inode(struct inode *inode);
 int kernfs_iop_permission(struct mnt_idmap *idmap,
 			  struct inode *inode, int mask);
diff --git a/fs/libfs.c b/fs/libfs.c
index 37f2d34ee090..abe2b5a40ba1 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -541,7 +541,8 @@ void simple_recursive_removal(struct dentry *dentry,
 				dput(victim);		// unpin it
 			}
 			if (victim == dentry) {
-				inode->i_mtime = inode_set_ctime_current(inode);
+				inode_set_mtime_to_ts(inode,
+						      inode_set_ctime_current(inode));
 				if (d_is_dir(dentry))
 					drop_nlink(inode);
 				inode_unlock(inode);
@@ -582,7 +583,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
 	 */
 	root->i_ino = 1;
 	root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
-	root->i_atime = root->i_mtime = inode_set_ctime_current(root);
+	simple_inode_init_ts(root);
 	s->s_root = d_make_root(root);
 	if (!s->s_root)
 		return -ENOMEM;
@@ -638,8 +639,8 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
 {
 	struct inode *inode = d_inode(old_dentry);
 
-	dir->i_mtime = inode_set_ctime_to_ts(dir,
-					     inode_set_ctime_current(inode));
+	inode_set_mtime_to_ts(dir,
+			      inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
 	inc_nlink(inode);
 	ihold(inode);
 	dget(dentry);
@@ -673,8 +674,8 @@ int simple_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = d_inode(dentry);
 
-	dir->i_mtime = inode_set_ctime_to_ts(dir,
-					     inode_set_ctime_current(inode));
+	inode_set_mtime_to_ts(dir,
+			      inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
 	drop_nlink(inode);
 	dput(dentry);
 	return 0;
@@ -709,9 +710,10 @@ void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry,
 {
 	struct inode *newino = d_inode(new_dentry);
 
-	old_dir->i_mtime = inode_set_ctime_current(old_dir);
+	inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
 	if (new_dir != old_dir)
-		new_dir->i_mtime = inode_set_ctime_current(new_dir);
+		inode_set_mtime_to_ts(new_dir,
+				      inode_set_ctime_current(new_dir));
 	inode_set_ctime_current(d_inode(old_dentry));
 	if (newino)
 		inode_set_ctime_current(newino);
@@ -926,7 +928,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
 	 */
 	inode->i_ino = 1;
 	inode->i_mode = S_IFDIR | 0755;
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	inode->i_op = &simple_dir_inode_operations;
 	inode->i_fop = &simple_dir_operations;
 	set_nlink(inode, 2);
@@ -952,7 +954,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
 			goto out;
 		}
 		inode->i_mode = S_IFREG | files->mode;
-		inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+		simple_inode_init_ts(inode);
 		inode->i_fop = files->ops;
 		inode->i_ino = i;
 		d_add(dentry, inode);
@@ -1520,7 +1522,7 @@ struct inode *alloc_anon_inode(struct super_block *s)
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
 	inode->i_flags |= S_PRIVATE;
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	return inode;
 }
 EXPORT_SYMBOL(alloc_anon_inode);
@@ -1912,3 +1914,20 @@ ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter,
 	return direct_written + buffered_written;
 }
 EXPORT_SYMBOL_GPL(direct_write_fallback);
+
+/**
+ * simple_inode_init_ts - initialize the timestamps for a new inode
+ * @inode: inode to be initialized
+ *
+ * Call inode_set_ctime_current() and copy its result into atime and mtime.
+ * Return: the timestamp that was applied.
+ */
+struct timespec64 simple_inode_init_ts(struct inode *inode)
+{
+	struct timespec64 ts = inode_set_ctime_current(inode);
+
+	inode_set_atime_to_ts(inode, ts);
+	inode_set_mtime_to_ts(inode, ts);
+	return ts;
+}
+EXPORT_SYMBOL(simple_inode_init_ts);
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 6579948070a4..81be07c1d3d1 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -24,7 +24,6 @@
 #include <linux/uio.h>
 #include <linux/smp.h>
 #include <linux/mutex.h>
-#include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/inetdevice.h>
 
@@ -135,11 +134,11 @@ lockd(void *vrqstp)
 	 * The main request loop. We don't terminate until the last
 	 * NFS mount or NFS daemon has gone away.
 	 */
-	while (!kthread_should_stop()) {
+	while (!svc_thread_should_stop(rqstp)) {
 		/* update sv_maxconn if it has changed */
 		rqstp->rq_server->sv_maxconn = nlm_max_connections;
 
-		nlmsvc_retry_blocked();
+		nlmsvc_retry_blocked(rqstp);
 		svc_recv(rqstp);
 	}
 	if (nlmsvc_ops)
@@ -373,7 +372,9 @@ static void lockd_put(void)
 	unregister_inet6addr_notifier(&lockd_inet6addr_notifier);
 #endif
 
+	svc_get(nlmsvc_serv);
 	svc_set_num_threads(nlmsvc_serv, NULL, 0);
+	svc_put(nlmsvc_serv);
 	timer_delete_sync(&nlmsvc_retry);
 	nlmsvc_serv = NULL;
 	dprintk("lockd_down: service destroyed\n");
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 43aeba9de55c..2dc10900ad1c 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -30,7 +30,6 @@
 #include <linux/sunrpc/svc_xprt.h>
 #include <linux/lockd/nlm.h>
 #include <linux/lockd/lockd.h>
-#include <linux/kthread.h>
 #include <linux/exportfs.h>
 
 #define NLMDBG_FACILITY		NLMDBG_SVCLOCK
@@ -481,9 +480,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 	    struct nlm_host *host, struct nlm_lock *lock, int wait,
 	    struct nlm_cookie *cookie, int reclaim)
 {
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 	struct inode		*inode = nlmsvc_file_inode(file);
-#endif
 	struct nlm_block	*block = NULL;
 	int			error;
 	int			mode;
@@ -497,7 +494,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 				(long long)lock->fl.fl_end,
 				wait);
 
-	if (nlmsvc_file_file(file)->f_op->lock) {
+	if (!exportfs_lock_op_is_async(inode->i_sb->s_export_op)) {
 		async_block = wait;
 		wait = 0;
 	}
@@ -543,6 +540,25 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 		goto out;
 	}
 
+	spin_lock(&nlm_blocked_lock);
+	/*
+	 * If this is a lock request for an already pending
+	 * lock request we return nlm_lck_blocked without calling
+	 * vfs_lock_file() again. Otherwise we have two pending
+	 * requests on the underlying ->lock() implementation but
+	 * only one nlm_block to be granted by lm_grant().
+	 */
+	if (exportfs_lock_op_is_async(inode->i_sb->s_export_op) &&
+	    !list_empty(&block->b_list)) {
+		spin_unlock(&nlm_blocked_lock);
+		ret = nlm_lck_blocked;
+		goto out;
+	}
+
+	/* Append to list of blocked */
+	nlmsvc_insert_block_locked(block, NLM_NEVER);
+	spin_unlock(&nlm_blocked_lock);
+
 	if (!wait)
 		lock->fl.fl_flags &= ~FL_SLEEP;
 	mode = lock_to_openmode(&lock->fl);
@@ -552,16 +568,12 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 	dprintk("lockd: vfs_lock_file returned %d\n", error);
 	switch (error) {
 		case 0:
+			nlmsvc_remove_block(block);
 			ret = nlm_granted;
 			goto out;
 		case -EAGAIN:
-			/*
-			 * If this is a blocking request for an
-			 * already pending lock request then we need
-			 * to put it back on lockd's block list
-			 */
-			if (wait)
-				break;
+			if (!wait)
+				nlmsvc_remove_block(block);
 			ret = async_block ? nlm_lck_blocked : nlm_lck_denied;
 			goto out;
 		case FILE_LOCK_DEFERRED:
@@ -572,17 +584,16 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 			ret = nlmsvc_defer_lock_rqst(rqstp, block);
 			goto out;
 		case -EDEADLK:
+			nlmsvc_remove_block(block);
 			ret = nlm_deadlock;
 			goto out;
 		default:			/* includes ENOLCK */
+			nlmsvc_remove_block(block);
 			ret = nlm_lck_denied_nolocks;
 			goto out;
 	}
 
 	ret = nlm_lck_blocked;
-
-	/* Append to list of blocked */
-	nlmsvc_insert_block(block, NLM_NEVER);
 out:
 	mutex_unlock(&file->f_mutex);
 	nlmsvc_release_block(block);
@@ -1020,13 +1031,13 @@ retry_deferred_block(struct nlm_block *block)
  * be retransmitted.
  */
 void
-nlmsvc_retry_blocked(void)
+nlmsvc_retry_blocked(struct svc_rqst *rqstp)
 {
 	unsigned long	timeout = MAX_SCHEDULE_TIMEOUT;
 	struct nlm_block *block;
 
 	spin_lock(&nlm_blocked_lock);
-	while (!list_empty(&nlm_blocked) && !kthread_should_stop()) {
+	while (!list_empty(&nlm_blocked) && !svc_thread_should_stop(rqstp)) {
 		block = list_entry(nlm_blocked.next, struct nlm_block, b_list);
 
 		if (block->b_when == NLM_NEVER)
diff --git a/fs/locks.c b/fs/locks.c
index 76ad05f8070a..d4e49a990a8d 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2264,11 +2264,13 @@ out:
  * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX
  * locks, the ->lock() interface may return asynchronously, before the lock has
  * been granted or denied by the underlying filesystem, if (and only if)
- * lm_grant is set. Callers expecting ->lock() to return asynchronously
- * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if)
- * the request is for a blocking lock. When ->lock() does return asynchronously,
- * it must return FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock
- * request completes.
+ * lm_grant is set. Additionally, the EXPORT_OP_ASYNC_LOCK flag must be set
+ * in the export_operations flags.
+ *
+ * Callers expecting ->lock() to return asynchronously will only use F_SETLK,
+ * not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a
+ * blocking lock. When ->lock() does return asynchronously, it must return
+ * FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock request completes.
  * If the request is for non-blocking lock the file system should return
  * FILE_LOCK_DEFERRED then try to get the lock and call the callback routine
  * with the result. If the request timed out the callback routine will return a
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 25c08fbfcb9d..7da66ca184f4 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -251,7 +251,7 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode)
 	}
 	inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
 	inode->i_ino = j;
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	inode->i_blocks = 0;
 	memset(&minix_i(inode)->u, 0, sizeof(minix_i(inode)->u));
 	insert_inode_hash(inode);
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 20f23e6e58ad..62c313fc9a49 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -281,7 +281,7 @@ got_it:
 		de->inode = inode->i_ino;
 	}
 	dir_commit_chunk(page, pos, sbi->s_dirsize);
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	mark_inode_dirty(dir);
 	err = minix_handle_dirsync(dir);
 out_put:
@@ -313,7 +313,7 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
 	else
 		de->inode = 0;
 	dir_commit_chunk(page, pos, len);
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	mark_inode_dirty(inode);
 	return minix_handle_dirsync(inode);
 }
@@ -436,7 +436,7 @@ int minix_set_link(struct minix_dir_entry *de, struct page *page,
 	else
 		de->inode = inode->i_ino;
 	dir_commit_chunk(page, pos, sbi->s_dirsize);
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	mark_inode_dirty(dir);
 	return minix_handle_dirsync(dir);
 }
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index df575473c1cc..f8af6c3ae336 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -501,7 +501,8 @@ static struct inode *V1_minix_iget(struct inode *inode)
 	i_gid_write(inode, raw_inode->i_gid);
 	set_nlink(inode, raw_inode->i_nlinks);
 	inode->i_size = raw_inode->i_size;
-	inode->i_mtime = inode->i_atime = inode_set_ctime(inode, raw_inode->i_time, 0);
+	inode_set_mtime_to_ts(inode,
+			      inode_set_atime_to_ts(inode, inode_set_ctime(inode, raw_inode->i_time, 0)));
 	inode->i_blocks = 0;
 	for (i = 0; i < 9; i++)
 		minix_inode->u.i1_data[i] = raw_inode->i_zone[i];
@@ -538,11 +539,9 @@ static struct inode *V2_minix_iget(struct inode *inode)
 	i_gid_write(inode, raw_inode->i_gid);
 	set_nlink(inode, raw_inode->i_nlinks);
 	inode->i_size = raw_inode->i_size;
-	inode->i_mtime.tv_sec = raw_inode->i_mtime;
-	inode->i_atime.tv_sec = raw_inode->i_atime;
+	inode_set_mtime(inode, raw_inode->i_mtime, 0);
+	inode_set_atime(inode, raw_inode->i_atime, 0);
 	inode_set_ctime(inode, raw_inode->i_ctime, 0);
-	inode->i_mtime.tv_nsec = 0;
-	inode->i_atime.tv_nsec = 0;
 	inode->i_blocks = 0;
 	for (i = 0; i < 10; i++)
 		minix_inode->u.i2_data[i] = raw_inode->i_zone[i];
@@ -589,7 +588,7 @@ static struct buffer_head * V1_minix_update_inode(struct inode * inode)
 	raw_inode->i_gid = fs_high2lowgid(i_gid_read(inode));
 	raw_inode->i_nlinks = inode->i_nlink;
 	raw_inode->i_size = inode->i_size;
-	raw_inode->i_time = inode->i_mtime.tv_sec;
+	raw_inode->i_time = inode_get_mtime_sec(inode);
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
 		raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev);
 	else for (i = 0; i < 9; i++)
@@ -616,9 +615,9 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode)
 	raw_inode->i_gid = fs_high2lowgid(i_gid_read(inode));
 	raw_inode->i_nlinks = inode->i_nlink;
 	raw_inode->i_size = inode->i_size;
-	raw_inode->i_mtime = inode->i_mtime.tv_sec;
-	raw_inode->i_atime = inode->i_atime.tv_sec;
-	raw_inode->i_ctime = inode_get_ctime(inode).tv_sec;
+	raw_inode->i_mtime = inode_get_mtime_sec(inode);
+	raw_inode->i_atime = inode_get_atime_sec(inode);
+	raw_inode->i_ctime = inode_get_ctime_sec(inode);
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
 		raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev);
 	else for (i = 0; i < 10; i++)
diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c
index ce18ae37c29d..dad131e30c05 100644
--- a/fs/minix/itree_common.c
+++ b/fs/minix/itree_common.c
@@ -350,7 +350,7 @@ do_indirects:
 		}
 		first_whole++;
 	}
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	mark_inode_dirty(inode);
 }
 
diff --git a/fs/namei.c b/fs/namei.c
index 94565bd7e73f..71c13b2990b4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3105,25 +3105,6 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
 EXPORT_SYMBOL(unlock_rename);
 
 /**
- * mode_strip_umask - handle vfs umask stripping
- * @dir:	parent directory of the new inode
- * @mode:	mode of the new inode to be created in @dir
- *
- * Umask stripping depends on whether or not the filesystem supports POSIX
- * ACLs. If the filesystem doesn't support it umask stripping is done directly
- * in here. If the filesystem does support POSIX ACLs umask stripping is
- * deferred until the filesystem calls posix_acl_create().
- *
- * Returns: mode
- */
-static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode)
-{
-	if (!IS_POSIXACL(dir))
-		mode &= ~current_umask();
-	return mode;
-}
-
-/**
  * vfs_prepare_mode - prepare the mode to be used for a new inode
  * @idmap:	idmap of the mount the inode was found from
  * @dir:	parent directory of the new inode
@@ -3536,7 +3517,8 @@ static const char *open_last_lookups(struct nameidata *nd,
 		if (likely(dentry))
 			goto finish_lookup;
 
-		BUG_ON(nd->flags & LOOKUP_RCU);
+		if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
+			return ERR_PTR(-ECHILD);
 	} else {
 		/* create side of things */
 		if (nd->flags & LOOKUP_RCU) {
@@ -3803,7 +3785,10 @@ static struct file *path_openat(struct nameidata *nd,
 		WARN_ON(1);
 		error = -EINVAL;
 	}
-	fput(file);
+	if (unlikely(file->f_mode & FMODE_OPENED))
+		fput(file);
+	else
+		release_empty_file(file);
 	if (error == -EOPENSTALE) {
 		if (flags & LOOKUP_RCU)
 			error = -ECHILD;
@@ -4387,11 +4372,9 @@ retry_deleg:
 	if (!IS_ERR(dentry)) {
 
 		/* Why not before? Because we want correct error value */
-		if (last.name[last.len])
+		if (last.name[last.len] || d_is_negative(dentry))
 			goto slashes;
 		inode = dentry->d_inode;
-		if (d_is_negative(dentry))
-			goto slashes;
 		ihold(inode);
 		error = security_path_unlink(&path, dentry);
 		if (error)
diff --git a/fs/namespace.c b/fs/namespace.c
index e157efc54023..6bde71735efa 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -330,16 +330,16 @@ static int mnt_is_readonly(struct vfsmount *mnt)
  * can determine when writes are able to occur to a filesystem.
  */
 /**
- * __mnt_want_write - get write access to a mount without freeze protection
+ * mnt_get_write_access - get write access to a mount without freeze protection
  * @m: the mount on which to take a write
  *
  * This tells the low-level filesystem that a write is about to be performed to
  * it, and makes sure that writes are allowed (mnt it read-write) before
  * returning success. This operation does not protect against filesystem being
- * frozen. When the write operation is finished, __mnt_drop_write() must be
+ * frozen. When the write operation is finished, mnt_put_write_access() must be
  * called. This is effectively a refcount.
  */
-int __mnt_want_write(struct vfsmount *m)
+int mnt_get_write_access(struct vfsmount *m)
 {
 	struct mount *mnt = real_mount(m);
 	int ret = 0;
@@ -386,6 +386,7 @@ int __mnt_want_write(struct vfsmount *m)
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(mnt_get_write_access);
 
 /**
  * mnt_want_write - get write access to a mount
@@ -401,7 +402,7 @@ int mnt_want_write(struct vfsmount *m)
 	int ret;
 
 	sb_start_write(m->mnt_sb);
-	ret = __mnt_want_write(m);
+	ret = mnt_get_write_access(m);
 	if (ret)
 		sb_end_write(m->mnt_sb);
 	return ret;
@@ -409,15 +410,15 @@ int mnt_want_write(struct vfsmount *m)
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
 /**
- * __mnt_want_write_file - get write access to a file's mount
+ * mnt_get_write_access_file - get write access to a file's mount
  * @file: the file who's mount on which to take a write
  *
- * This is like __mnt_want_write, but if the file is already open for writing it
+ * This is like mnt_get_write_access, but if @file is already open for write it
  * skips incrementing mnt_writers (since the open file already has a reference)
  * and instead only does the check for emergency r/o remounts.  This must be
- * paired with __mnt_drop_write_file.
+ * paired with mnt_put_write_access_file.
  */
-int __mnt_want_write_file(struct file *file)
+int mnt_get_write_access_file(struct file *file)
 {
 	if (file->f_mode & FMODE_WRITER) {
 		/*
@@ -428,7 +429,7 @@ int __mnt_want_write_file(struct file *file)
 			return -EROFS;
 		return 0;
 	}
-	return __mnt_want_write(file->f_path.mnt);
+	return mnt_get_write_access(file->f_path.mnt);
 }
 
 /**
@@ -445,7 +446,7 @@ int mnt_want_write_file(struct file *file)
 	int ret;
 
 	sb_start_write(file_inode(file)->i_sb);
-	ret = __mnt_want_write_file(file);
+	ret = mnt_get_write_access_file(file);
 	if (ret)
 		sb_end_write(file_inode(file)->i_sb);
 	return ret;
@@ -453,19 +454,20 @@ int mnt_want_write_file(struct file *file)
 EXPORT_SYMBOL_GPL(mnt_want_write_file);
 
 /**
- * __mnt_drop_write - give up write access to a mount
+ * mnt_put_write_access - give up write access to a mount
  * @mnt: the mount on which to give up write access
  *
  * Tells the low-level filesystem that we are done
  * performing writes to it.  Must be matched with
- * __mnt_want_write() call above.
+ * mnt_get_write_access() call above.
  */
-void __mnt_drop_write(struct vfsmount *mnt)
+void mnt_put_write_access(struct vfsmount *mnt)
 {
 	preempt_disable();
 	mnt_dec_writers(real_mount(mnt));
 	preempt_enable();
 }
+EXPORT_SYMBOL_GPL(mnt_put_write_access);
 
 /**
  * mnt_drop_write - give up write access to a mount
@@ -477,20 +479,20 @@ void __mnt_drop_write(struct vfsmount *mnt)
  */
 void mnt_drop_write(struct vfsmount *mnt)
 {
-	__mnt_drop_write(mnt);
+	mnt_put_write_access(mnt);
 	sb_end_write(mnt->mnt_sb);
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
-void __mnt_drop_write_file(struct file *file)
+void mnt_put_write_access_file(struct file *file)
 {
 	if (!(file->f_mode & FMODE_WRITER))
-		__mnt_drop_write(file->f_path.mnt);
+		mnt_put_write_access(file->f_path.mnt);
 }
 
 void mnt_drop_write_file(struct file *file)
 {
-	__mnt_drop_write_file(file);
+	mnt_put_write_access_file(file);
 	sb_end_write(file_inode(file)->i_sb);
 }
 EXPORT_SYMBOL(mnt_drop_write_file);
@@ -1344,9 +1346,9 @@ void mntput(struct vfsmount *mnt)
 {
 	if (mnt) {
 		struct mount *m = real_mount(mnt);
-		/* avoid cacheline pingpong, hope gcc doesn't get "smart" */
+		/* avoid cacheline pingpong */
 		if (unlikely(m->mnt_expiry_mark))
-			m->mnt_expiry_mark = 0;
+			WRITE_ONCE(m->mnt_expiry_mark, 0);
 		mntput_no_expire(m);
 	}
 }
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 716bc75e9ed2..b4294a8aa2d4 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -108,7 +108,7 @@ struct pnfs_block_dev {
 	struct pnfs_block_dev		*children;
 	u64				chunk_size;
 
-	struct block_device		*bdev;
+	struct bdev_handle		*bdev_handle;
 	u64				disk_offset;
 
 	u64				pr_key;
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index 65cbb5607a5f..f318a05a80e1 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -25,17 +25,17 @@ bl_free_device(struct pnfs_block_dev *dev)
 	} else {
 		if (dev->pr_registered) {
 			const struct pr_ops *ops =
-				dev->bdev->bd_disk->fops->pr_ops;
+				dev->bdev_handle->bdev->bd_disk->fops->pr_ops;
 			int error;
 
-			error = ops->pr_register(dev->bdev, dev->pr_key, 0,
-				false);
+			error = ops->pr_register(dev->bdev_handle->bdev,
+				dev->pr_key, 0, false);
 			if (error)
 				pr_err("failed to unregister PR key.\n");
 		}
 
-		if (dev->bdev)
-			blkdev_put(dev->bdev, NULL);
+		if (dev->bdev_handle)
+			bdev_release(dev->bdev_handle);
 	}
 }
 
@@ -169,7 +169,7 @@ static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
 	map->start = dev->start;
 	map->len = dev->len;
 	map->disk_offset = dev->disk_offset;
-	map->bdev = dev->bdev;
+	map->bdev = dev->bdev_handle->bdev;
 	return true;
 }
 
@@ -236,28 +236,26 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
 		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
 {
 	struct pnfs_block_volume *v = &volumes[idx];
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	dev_t dev;
 
 	dev = bl_resolve_deviceid(server, v, gfp_mask);
 	if (!dev)
 		return -EIO;
 
-	bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL,
-				 NULL);
-	if (IS_ERR(bdev)) {
+	bdev_handle = bdev_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
+				       NULL, NULL);
+	if (IS_ERR(bdev_handle)) {
 		printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
-			MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
-		return PTR_ERR(bdev);
+			MAJOR(dev), MINOR(dev), PTR_ERR(bdev_handle));
+		return PTR_ERR(bdev_handle);
 	}
-	d->bdev = bdev;
-
-
-	d->len = bdev_nr_bytes(d->bdev);
+	d->bdev_handle = bdev_handle;
+	d->len = bdev_nr_bytes(bdev_handle->bdev);
 	d->map = bl_map_simple;
 
 	printk(KERN_INFO "pNFS: using block device %s\n",
-		d->bdev->bd_disk->disk_name);
+		bdev_handle->bdev->bd_disk->disk_name);
 	return 0;
 }
 
@@ -302,10 +300,10 @@ bl_validate_designator(struct pnfs_block_volume *v)
 	}
 }
 
-static struct block_device *
+static struct bdev_handle *
 bl_open_path(struct pnfs_block_volume *v, const char *prefix)
 {
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	const char *devname;
 
 	devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/%s%*phN",
@@ -313,15 +311,15 @@ bl_open_path(struct pnfs_block_volume *v, const char *prefix)
 	if (!devname)
 		return ERR_PTR(-ENOMEM);
 
-	bdev = blkdev_get_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL,
-				  NULL);
-	if (IS_ERR(bdev)) {
+	bdev_handle = bdev_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE,
+					NULL, NULL);
+	if (IS_ERR(bdev_handle)) {
 		pr_warn("pNFS: failed to open device %s (%ld)\n",
-			devname, PTR_ERR(bdev));
+			devname, PTR_ERR(bdev_handle));
 	}
 
 	kfree(devname);
-	return bdev;
+	return bdev_handle;
 }
 
 static int
@@ -329,7 +327,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
 		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
 {
 	struct pnfs_block_volume *v = &volumes[idx];
-	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	const struct pr_ops *ops;
 	int error;
 
@@ -342,32 +340,32 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
 	 * On other distributions like Debian, the default SCSI by-id path will
 	 * point to the dm-multipath device if one exists.
 	 */
-	bdev = bl_open_path(v, "dm-uuid-mpath-0x");
-	if (IS_ERR(bdev))
-		bdev = bl_open_path(v, "wwn-0x");
-	if (IS_ERR(bdev))
-		return PTR_ERR(bdev);
-	d->bdev = bdev;
-
-	d->len = bdev_nr_bytes(d->bdev);
+	bdev_handle = bl_open_path(v, "dm-uuid-mpath-0x");
+	if (IS_ERR(bdev_handle))
+		bdev_handle = bl_open_path(v, "wwn-0x");
+	if (IS_ERR(bdev_handle))
+		return PTR_ERR(bdev_handle);
+	d->bdev_handle = bdev_handle;
+
+	d->len = bdev_nr_bytes(d->bdev_handle->bdev);
 	d->map = bl_map_simple;
 	d->pr_key = v->scsi.pr_key;
 
 	pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
-		d->bdev->bd_disk->disk_name, d->pr_key);
+		d->bdev_handle->bdev->bd_disk->disk_name, d->pr_key);
 
-	ops = d->bdev->bd_disk->fops->pr_ops;
+	ops = d->bdev_handle->bdev->bd_disk->fops->pr_ops;
 	if (!ops) {
 		pr_err("pNFS: block device %s does not support reservations.",
-				d->bdev->bd_disk->disk_name);
+				d->bdev_handle->bdev->bd_disk->disk_name);
 		error = -EINVAL;
 		goto out_blkdev_put;
 	}
 
-	error = ops->pr_register(d->bdev, 0, d->pr_key, true);
+	error = ops->pr_register(d->bdev_handle->bdev, 0, d->pr_key, true);
 	if (error) {
 		pr_err("pNFS: failed to register key for block device %s.",
-				d->bdev->bd_disk->disk_name);
+				d->bdev_handle->bdev->bd_disk->disk_name);
 		goto out_blkdev_put;
 	}
 
@@ -375,7 +373,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
 	return 0;
 
 out_blkdev_put:
-	blkdev_put(d->bdev, NULL);
+	bdev_release(d->bdev_handle);
 	return error;
 }
 
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 466ebf1d41b2..4ffa1f469e90 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -78,7 +78,7 @@ nfs4_callback_svc(void *vrqstp)
 
 	set_freezable();
 
-	while (!kthread_freezable_should_stop(NULL))
+	while (!svc_thread_should_stop(rqstp))
 		svc_recv(rqstp);
 
 	svc_exit_thread(rqstp);
@@ -86,45 +86,6 @@ nfs4_callback_svc(void *vrqstp)
 }
 
 #if defined(CONFIG_NFS_V4_1)
-/*
- * The callback service for NFSv4.1 callbacks
- */
-static int
-nfs41_callback_svc(void *vrqstp)
-{
-	struct svc_rqst *rqstp = vrqstp;
-	struct svc_serv *serv = rqstp->rq_server;
-	struct rpc_rqst *req;
-	int error;
-	DEFINE_WAIT(wq);
-
-	set_freezable();
-
-	while (!kthread_freezable_should_stop(NULL)) {
-		prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_IDLE);
-		spin_lock_bh(&serv->sv_cb_lock);
-		if (!list_empty(&serv->sv_cb_list)) {
-			req = list_first_entry(&serv->sv_cb_list,
-					struct rpc_rqst, rq_bc_list);
-			list_del(&req->rq_bc_list);
-			spin_unlock_bh(&serv->sv_cb_lock);
-			finish_wait(&serv->sv_cb_waitq, &wq);
-			dprintk("Invoking bc_svc_process()\n");
-			error = bc_svc_process(serv, req, rqstp);
-			dprintk("bc_svc_process() returned w/ error code= %d\n",
-				error);
-		} else {
-			spin_unlock_bh(&serv->sv_cb_lock);
-			if (!kthread_should_stop())
-				schedule();
-			finish_wait(&serv->sv_cb_waitq, &wq);
-		}
-	}
-
-	svc_exit_thread(rqstp);
-	return 0;
-}
-
 static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
 		struct svc_serv *serv)
 {
@@ -237,10 +198,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
 			cb_info->users);
 
 	threadfn = nfs4_callback_svc;
-#if defined(CONFIG_NFS_V4_1)
-	if (minorversion)
-		threadfn = nfs41_callback_svc;
-#else
+#if !defined(CONFIG_NFS_V4_1)
 	if (minorversion)
 		return ERR_PTR(-ENOTSUPP);
 #endif
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 6bed1394d748..96a4923080ae 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -60,7 +60,7 @@ __be32 nfs4_callback_getattr(void *argp, void *resp,
 	if (nfs_have_writebacks(inode))
 		res->change_attr++;
 	res->ctime = inode_get_ctime(inode);
-	res->mtime = inode->i_mtime;
+	res->mtime = inode_get_mtime(inode);
 	res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) &
 		args->bitmap[0];
 	res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) &
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
index aed0748fd6ec..c7bb5da93307 100644
--- a/fs/nfs/filelayout/filelayout.h
+++ b/fs/nfs/filelayout/filelayout.h
@@ -51,7 +51,7 @@ struct nfs4_file_layout_dsaddr {
 	u32				stripe_count;
 	u8				*stripe_indices;
 	u32				ds_num;
-	struct nfs4_pnfs_ds		*ds_list[];
+	struct nfs4_pnfs_ds		*ds_list[] __counted_by(ds_num);
 };
 
 struct nfs4_filelayout_segment {
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 354a031c69b1..f84b3fb0dddd 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -99,7 +99,7 @@ struct nfs4_ff_layout_segment {
 	u64				stripe_unit;
 	u32				flags;
 	u32				mirror_array_cnt;
-	struct nfs4_ff_layout_mirror	*mirror_array[];
+	struct nfs4_ff_layout_mirror	*mirror_array[] __counted_by(mirror_array_cnt);
 };
 
 struct nfs4_flexfile_layout {
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 2dc64454492b..5407ab8c8783 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -114,8 +114,8 @@ static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *
 					      struct inode *inode)
 {
 	memset(auxdata, 0, sizeof(*auxdata));
-	auxdata->mtime_sec  = inode->i_mtime.tv_sec;
-	auxdata->mtime_nsec = inode->i_mtime.tv_nsec;
+	auxdata->mtime_sec  = inode_get_mtime(inode).tv_sec;
+	auxdata->mtime_nsec = inode_get_mtime(inode).tv_nsec;
 	auxdata->ctime_sec  = inode_get_ctime(inode).tv_sec;
 	auxdata->ctime_nsec = inode_get_ctime(inode).tv_nsec;
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e21c073158e5..ebb8d60e1152 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -512,8 +512,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		} else
 			init_special_inode(inode, inode->i_mode, fattr->rdev);
 
-		memset(&inode->i_atime, 0, sizeof(inode->i_atime));
-		memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
+		inode_set_atime(inode, 0, 0);
+		inode_set_mtime(inode, 0, 0);
 		inode_set_ctime(inode, 0, 0);
 		inode_set_iversion_raw(inode, 0);
 		inode->i_size = 0;
@@ -527,11 +527,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		nfsi->read_cache_jiffies = fattr->time_start;
 		nfsi->attr_gencount = fattr->gencount;
 		if (fattr->valid & NFS_ATTR_FATTR_ATIME)
-			inode->i_atime = fattr->atime;
+			inode_set_atime_to_ts(inode, fattr->atime);
 		else if (fattr_supported & NFS_ATTR_FATTR_ATIME)
 			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
 		if (fattr->valid & NFS_ATTR_FATTR_MTIME)
-			inode->i_mtime = fattr->mtime;
+			inode_set_mtime_to_ts(inode, fattr->mtime);
 		else if (fattr_supported & NFS_ATTR_FATTR_MTIME)
 			nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
 		if (fattr->valid & NFS_ATTR_FATTR_CTIME)
@@ -742,9 +742,9 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
 		NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_ATIME
 				| NFS_INO_INVALID_CTIME);
 		if (fattr->valid & NFS_ATTR_FATTR_ATIME)
-			inode->i_atime = fattr->atime;
+			inode_set_atime_to_ts(inode, fattr->atime);
 		else if (attr->ia_valid & ATTR_ATIME_SET)
-			inode->i_atime = attr->ia_atime;
+			inode_set_atime_to_ts(inode, attr->ia_atime);
 		else
 			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
 
@@ -758,9 +758,9 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
 		NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_MTIME
 				| NFS_INO_INVALID_CTIME);
 		if (fattr->valid & NFS_ATTR_FATTR_MTIME)
-			inode->i_mtime = fattr->mtime;
+			inode_set_mtime_to_ts(inode, fattr->mtime);
 		else if (attr->ia_valid & ATTR_MTIME_SET)
-			inode->i_mtime = attr->ia_mtime;
+			inode_set_mtime_to_ts(inode, attr->ia_mtime);
 		else
 			nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
 
@@ -1451,11 +1451,11 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		inode_set_ctime_to_ts(inode, fattr->ctime);
 	}
 
-	ts = inode->i_mtime;
+	ts = inode_get_mtime(inode);
 	if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
 			&& (fattr->valid & NFS_ATTR_FATTR_MTIME)
 			&& timespec64_equal(&ts, &fattr->pre_mtime)) {
-		inode->i_mtime = fattr->mtime;
+		inode_set_mtime_to_ts(inode, fattr->mtime);
 	}
 	if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
 			&& (fattr->valid & NFS_ATTR_FATTR_SIZE)
@@ -1506,7 +1506,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 		if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr))
 			invalid |= NFS_INO_INVALID_CHANGE;
 
-		ts = inode->i_mtime;
+		ts = inode_get_mtime(inode);
 		if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec64_equal(&ts, &fattr->mtime))
 			invalid |= NFS_INO_INVALID_MTIME;
 
@@ -1534,7 +1534,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 	if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink)
 		invalid |= NFS_INO_INVALID_NLINK;
 
-	ts = inode->i_atime;
+	ts = inode_get_atime(inode);
 	if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec64_equal(&ts, &fattr->atime))
 		invalid |= NFS_INO_INVALID_ATIME;
 
@@ -2002,7 +2002,7 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa
 	}
 	if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 &&
 			(fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) {
-		fattr->pre_mtime = inode->i_mtime;
+		fattr->pre_mtime = inode_get_mtime(inode);
 		fattr->valid |= NFS_ATTR_FATTR_PREMTIME;
 	}
 	if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 &&
@@ -2184,7 +2184,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	}
 
 	if (fattr->valid & NFS_ATTR_FATTR_MTIME)
-		inode->i_mtime = fattr->mtime;
+		inode_set_mtime_to_ts(inode, fattr->mtime);
 	else if (fattr_supported & NFS_ATTR_FATTR_MTIME)
 		nfsi->cache_validity |=
 			save_cache_validity & NFS_INO_INVALID_MTIME;
@@ -2220,7 +2220,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			save_cache_validity & NFS_INO_INVALID_SIZE;
 
 	if (fattr->valid & NFS_ATTR_FATTR_ATIME)
-		inode->i_atime = fattr->atime;
+		inode_set_atime_to_ts(inode, fattr->atime);
 	else if (fattr_supported & NFS_ATTR_FATTR_ATIME)
 		nfsi->cache_validity |=
 			save_cache_validity & NFS_INO_INVALID_ATIME;
diff --git a/fs/nfs/nfs.h b/fs/nfs/nfs.h
index 5ba00610aede..0d3ce0460e35 100644
--- a/fs/nfs/nfs.h
+++ b/fs/nfs/nfs.h
@@ -18,7 +18,7 @@ struct nfs_subversion {
 	const struct rpc_version *rpc_vers;	/* NFS version information */
 	const struct nfs_rpc_ops *rpc_ops;	/* NFS operations */
 	const struct super_operations *sops;	/* NFS Super operations */
-	const struct xattr_handler **xattr;	/* NFS xattr handlers */
+	const struct xattr_handler * const *xattr;	/* NFS xattr handlers */
 	struct list_head list;		/* List of NFS versions */
 };
 
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 47c5c1f86d66..827d00e2f094 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -315,7 +315,7 @@ extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *,
 						    struct nfs_fh *,
 						    struct nfs_fattr *);
 extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
-extern const struct xattr_handler *nfs4_xattr_handlers[];
+extern const struct xattr_handler * const nfs4_xattr_handlers[];
 extern int nfs4_set_rw_stateid(nfs4_stateid *stateid,
 		const struct nfs_open_context *ctx,
 		const struct nfs_lock_context *l_ctx,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5ee283eb9660..a654d7234f51 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -10737,7 +10737,7 @@ static const struct xattr_handler nfs4_xattr_nfs4_user_handler = {
 };
 #endif
 
-const struct xattr_handler *nfs4_xattr_handlers[] = {
+const struct xattr_handler * const nfs4_xattr_handlers[] = {
 	&nfs4_xattr_nfs4_acl_handler,
 #if defined(CONFIG_NFS_V4_1)
 	&nfs4_xattr_nfs4_dacl_handler,
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0d6473cb00cb..9b1cfca8112a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1071,7 +1071,7 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx)
 		sb->s_export_op = &nfs_export_ops;
 		break;
 	case 4:
-		sb->s_flags |= SB_POSIXACL;
+		sb->s_iflags |= SB_I_NOUMASK;
 		sb->s_time_gran = 1;
 		sb->s_time_min = S64_MIN;
 		sb->s_time_max = S64_MAX;
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 6fffc8f03f74..b8736a82e57c 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -12,7 +12,8 @@ nfsd-y			+= trace.o
 
 nfsd-y 			+= nfssvc.o nfsctl.o nfsfh.o vfs.o \
 			   export.o auth.o lockd.o nfscache.o \
-			   stats.o filecache.o nfs3proc.o nfs3xdr.o
+			   stats.o filecache.o nfs3proc.o nfs3xdr.o \
+			   netlink.o
 nfsd-$(CONFIG_NFSD_V2) += nfsproc.o nfsxdr.o
 nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
 nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 01d7fd108cf3..46fd74d91ea9 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -117,12 +117,13 @@ static __be32
 nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp,
 		struct iomap *iomaps, int nr_iomaps)
 {
+	struct timespec64 mtime = inode_get_mtime(inode);
 	loff_t new_size = lcp->lc_last_wr + 1;
 	struct iattr iattr = { .ia_valid = 0 };
 	int error;
 
 	if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
-	    timespec64_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
+	    timespec64_compare(&lcp->lc_mtime, &mtime) < 0)
 		lcp->lc_mtime = current_time(inode);
 	iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME;
 	iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime;
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 1ed2f691ebb9..ce78f74715ee 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -16,9 +16,9 @@
 
 __be32
 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
-		struct nfsd4_layoutget *lgp)
+		const struct nfsd4_layoutget *lgp)
 {
-	struct pnfs_block_extent *b = lgp->lg_content;
+	const struct pnfs_block_extent *b = lgp->lg_content;
 	int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32);
 	__be32 *p;
 
@@ -77,7 +77,7 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
 
 __be32
 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
-		struct nfsd4_getdeviceinfo *gdp)
+		const struct nfsd4_getdeviceinfo *gdp)
 {
 	struct pnfs_block_deviceaddr *dev = gdp->gd_device;
 	int len = sizeof(__be32), ret, i;
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index bc5166bfe46b..b0361e8aa9a7 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -51,9 +51,9 @@ struct pnfs_block_deviceaddr {
 };
 
 __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
-		struct nfsd4_getdeviceinfo *gdp);
+		const struct nfsd4_getdeviceinfo *gdp);
 __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
-		struct nfsd4_layoutget *lgp);
+		const struct nfsd4_layoutget *lgp);
 int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
 		u32 block_size);
 int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 11a0eaa2f914..b7da17e53007 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -339,12 +339,16 @@ static int export_stats_init(struct export_stats *stats)
 
 static void export_stats_reset(struct export_stats *stats)
 {
+	if (!stats)
+		return;
 	nfsd_percpu_counters_reset(stats->counter, EXP_STATS_COUNTERS_NUM);
 }
 
 static void export_stats_destroy(struct export_stats *stats)
 {
+	if (!stats)
+		return;
 	nfsd_percpu_counters_destroy(stats->counter, EXP_STATS_COUNTERS_NUM);
 }
 
 static void svc_export_put(struct kref *ref)
@@ -353,7 +357,8 @@ static void svc_export_put(struct kref *ref)
 	path_put(&exp->ex_path);
 	auth_domain_put(exp->ex_client);
 	nfsd4_fslocs_free(&exp->ex_fslocs);
-	export_stats_destroy(&exp->ex_stats);
+	export_stats_destroy(exp->ex_stats);
+	kfree(exp->ex_stats);
 	kfree(exp->ex_uuid);
 	kfree_rcu(exp, ex_rcu);
 }
@@ -767,13 +772,15 @@ static int svc_export_show(struct seq_file *m,
 	seq_putc(m, '\t');
 	seq_escape(m, exp->ex_client->name, " \t\n\\");
 	if (export_stats) {
-		seq_printf(m, "\t%lld\n", exp->ex_stats.start_time);
+		struct percpu_counter *counter = exp->ex_stats->counter;
+
+		seq_printf(m, "\t%lld\n", exp->ex_stats->start_time);
 		seq_printf(m, "\tfh_stale: %lld\n",
-			   percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_FH_STALE]));
+			   percpu_counter_sum_positive(&counter[EXP_STATS_FH_STALE]));
 		seq_printf(m, "\tio_read: %lld\n",
-			   percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_READ]));
+			   percpu_counter_sum_positive(&counter[EXP_STATS_IO_READ]));
 		seq_printf(m, "\tio_write: %lld\n",
-			   percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_WRITE]));
+			   percpu_counter_sum_positive(&counter[EXP_STATS_IO_WRITE]));
 		seq_putc(m, '\n');
 		return 0;
 	}
@@ -819,7 +826,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
 	new->ex_layout_types = 0;
 	new->ex_uuid = NULL;
 	new->cd = item->cd;
-	export_stats_reset(&new->ex_stats);
+	export_stats_reset(new->ex_stats);
 }
 
 static void export_update(struct cache_head *cnew, struct cache_head *citem)
@@ -856,7 +863,14 @@ static struct cache_head *svc_export_alloc(void)
 	if (!i)
 		return NULL;
 
-	if (export_stats_init(&i->ex_stats)) {
+	i->ex_stats = kmalloc(sizeof(*(i->ex_stats)), GFP_KERNEL);
+	if (!i->ex_stats) {
+		kfree(i);
+		return NULL;
+	}
+
+	if (export_stats_init(i->ex_stats)) {
+		kfree(i->ex_stats);
 		kfree(i);
 		return NULL;
 	}
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index 2df8ae25aad3..ca9dc230ae3d 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -64,10 +64,10 @@ struct svc_export {
 	struct cache_head	h;
 	struct auth_domain *	ex_client;
 	int			ex_flags;
+	int			ex_fsid;
 	struct path		ex_path;
 	kuid_t			ex_anon_uid;
 	kgid_t			ex_anon_gid;
-	int			ex_fsid;
 	unsigned char *		ex_uuid; /* 16 byte fsid */
 	struct nfsd4_fs_locations ex_fslocs;
 	uint32_t		ex_nflavors;
@@ -76,8 +76,8 @@ struct svc_export {
 	struct nfsd4_deviceid_map *ex_devid_map;
 	struct cache_detail	*cd;
 	struct rcu_head		ex_rcu;
-	struct export_stats	ex_stats;
 	unsigned long		ex_xprtsec_modes;
+	struct export_stats	*ex_stats;
 };
 
 /* an "export key" (expkey) maps a filehandlefragement to an
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index ee9c923192e0..07bf219f9ae4 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -989,22 +989,21 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	unsigned char need = may_flags & NFSD_FILE_MAY_MASK;
 	struct net *net = SVC_NET(rqstp);
 	struct nfsd_file *new, *nf;
-	const struct cred *cred;
+	bool stale_retry = true;
 	bool open_retry = true;
 	struct inode *inode;
 	__be32 status;
 	int ret;
 
+retry:
 	status = fh_verify(rqstp, fhp, S_IFREG,
 				may_flags|NFSD_MAY_OWNER_OVERRIDE);
 	if (status != nfs_ok)
 		return status;
 	inode = d_inode(fhp->fh_dentry);
-	cred = get_current_cred();
 
-retry:
 	rcu_read_lock();
-	nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc);
+	nf = nfsd_file_lookup_locked(net, current_cred(), inode, need, want_gc);
 	rcu_read_unlock();
 
 	if (nf) {
@@ -1026,7 +1025,7 @@ retry:
 
 	rcu_read_lock();
 	spin_lock(&inode->i_lock);
-	nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc);
+	nf = nfsd_file_lookup_locked(net, current_cred(), inode, need, want_gc);
 	if (unlikely(nf)) {
 		spin_unlock(&inode->i_lock);
 		rcu_read_unlock();
@@ -1058,6 +1057,7 @@ wait_for_construction:
 			goto construction_err;
 		}
 		open_retry = false;
+		fh_put(fhp);
 		goto retry;
 	}
 	this_cpu_inc(nfsd_file_cache_hits);
@@ -1074,7 +1074,6 @@ out:
 		nfsd_file_check_write_error(nf);
 		*pnf = nf;
 	}
-	put_cred(cred);
 	trace_nfsd_file_acquire(rqstp, inode, may_flags, nf, status);
 	return status;
 
@@ -1088,8 +1087,20 @@ open_file:
 			status = nfs_ok;
 			trace_nfsd_file_opened(nf, status);
 		} else {
-			status = nfsd_open_verified(rqstp, fhp, may_flags,
-						    &nf->nf_file);
+			ret = nfsd_open_verified(rqstp, fhp, may_flags,
+						 &nf->nf_file);
+			if (ret == -EOPENSTALE && stale_retry) {
+				stale_retry = false;
+				nfsd_file_unhash(nf);
+				clear_and_wake_up_bit(NFSD_FILE_PENDING,
+						      &nf->nf_flags);
+				if (refcount_dec_and_test(&nf->nf_ref))
+					nfsd_file_free(nf);
+				nf = NULL;
+				fh_put(fhp);
+				goto retry;
+			}
+			status = nfserrno(ret);
 			trace_nfsd_file_open(nf, status);
 		}
 	} else
diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c
index bb205328e043..aeb71c10ff1b 100644
--- a/fs/nfsd/flexfilelayoutxdr.c
+++ b/fs/nfsd/flexfilelayoutxdr.c
@@ -17,9 +17,9 @@ struct ff_idmap {
 
 __be32
 nfsd4_ff_encode_layoutget(struct xdr_stream *xdr,
-		struct nfsd4_layoutget *lgp)
+		const struct nfsd4_layoutget *lgp)
 {
-	struct pnfs_ff_layout *fl = lgp->lg_content;
+	const struct pnfs_ff_layout *fl = lgp->lg_content;
 	int len, mirror_len, ds_len, fh_len;
 	__be32 *p;
 
@@ -77,7 +77,7 @@ nfsd4_ff_encode_layoutget(struct xdr_stream *xdr,
 
 __be32
 nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr,
-		struct nfsd4_getdeviceinfo *gdp)
+		const struct nfsd4_getdeviceinfo *gdp)
 {
 	struct pnfs_ff_device_addr *da = gdp->gd_device;
 	int len;
diff --git a/fs/nfsd/flexfilelayoutxdr.h b/fs/nfsd/flexfilelayoutxdr.h
index 8e195aeca023..6d5a1066a903 100644
--- a/fs/nfsd/flexfilelayoutxdr.h
+++ b/fs/nfsd/flexfilelayoutxdr.h
@@ -43,8 +43,8 @@ struct pnfs_ff_layout {
 };
 
 __be32 nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr,
-		struct nfsd4_getdeviceinfo *gdp);
+		const struct nfsd4_getdeviceinfo *gdp);
 __be32 nfsd4_ff_encode_layoutget(struct xdr_stream *xdr,
-		struct nfsd4_layoutget *lgp);
+		const struct nfsd4_layoutget *lgp);
 
 #endif /* _NFSD_FLEXFILELAYOUTXDR_H */
diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c
new file mode 100644
index 000000000000..0e1d635ec5f9
--- /dev/null
+++ b/fs/nfsd/netlink.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/nfsd.yaml */
+/* YNL-GEN kernel source */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "netlink.h"
+
+#include <uapi/linux/nfsd_netlink.h>
+
+/* Ops table for nfsd */
+static const struct genl_split_ops nfsd_nl_ops[] = {
+	{
+		.cmd	= NFSD_CMD_RPC_STATUS_GET,
+		.start	= nfsd_nl_rpc_status_get_start,
+		.dumpit	= nfsd_nl_rpc_status_get_dumpit,
+		.done	= nfsd_nl_rpc_status_get_done,
+		.flags	= GENL_CMD_CAP_DUMP,
+	},
+};
+
+struct genl_family nfsd_nl_family __ro_after_init = {
+	.name		= NFSD_FAMILY_NAME,
+	.version	= NFSD_FAMILY_VERSION,
+	.netnsok	= true,
+	.parallel_ops	= true,
+	.module		= THIS_MODULE,
+	.split_ops	= nfsd_nl_ops,
+	.n_split_ops	= ARRAY_SIZE(nfsd_nl_ops),
+};
diff --git a/fs/nfsd/netlink.h b/fs/nfsd/netlink.h
new file mode 100644
index 000000000000..d83dd6bdee92
--- /dev/null
+++ b/fs/nfsd/netlink.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/nfsd.yaml */
+/* YNL-GEN kernel header */
+
+#ifndef _LINUX_NFSD_GEN_H
+#define _LINUX_NFSD_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/nfsd_netlink.h>
+
+int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb);
+int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb);
+
+int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb,
+				  struct netlink_callback *cb);
+
+extern struct genl_family nfsd_nl_family;
+
+#endif /* _LINUX_NFSD_GEN_H */
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 268ef57751c4..b78eceebd945 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -171,7 +171,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
 	 * + 1 (xdr opaque byte count) = 26
 	 */
 	resp->count = argp->count;
-	svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
+	svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3) << 2) +
+			 resp->count + 4);
 
 	fh_copy(&resp->fh, &argp->fh);
 	resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
@@ -194,7 +195,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
 				SVCFH_fmt(&argp->fh),
 				argp->len,
 				(unsigned long long) argp->offset,
-				argp->stable? " stable" : "");
+				argp->stable ? " stable" : "");
 
 	resp->status = nfserr_fbig;
 	if (argp->offset > (u64)OFFSET_MAX ||
@@ -294,8 +295,8 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
 			status = nfserr_exist;
 			break;
 		case NFS3_CREATE_EXCLUSIVE:
-			if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
-			    d_inode(child)->i_atime.tv_sec == v_atime &&
+			if (inode_get_mtime_sec(d_inode(child)) == v_mtime &&
+			    inode_get_atime_sec(d_inode(child)) == v_atime &&
 			    d_inode(child)->i_size == 0) {
 				break;
 			}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4039ffcf90ba..92bc109dabe6 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -84,7 +84,21 @@ static void encode_uint32(struct xdr_stream *xdr, u32 n)
 static void encode_bitmap4(struct xdr_stream *xdr, const __u32 *bitmap,
 			   size_t len)
 {
-	WARN_ON_ONCE(xdr_stream_encode_uint32_array(xdr, bitmap, len) < 0);
+	xdr_stream_encode_uint32_array(xdr, bitmap, len);
+}
+
+static int decode_cb_fattr4(struct xdr_stream *xdr, uint32_t *bitmap,
+				struct nfs4_cb_fattr *fattr)
+{
+	fattr->ncf_cb_change = 0;
+	fattr->ncf_cb_fsize = 0;
+	if ((bitmap[0] & FATTR4_WORD0_CHANGE) &&
+	    xdr_stream_decode_u64(xdr, &fattr->ncf_cb_change) < 0)
+		return -NFSERR_BAD_XDR;
+	if ((bitmap[0] & FATTR4_WORD0_SIZE) &&
+	    xdr_stream_decode_u64(xdr, &fattr->ncf_cb_fsize) < 0)
+		return -NFSERR_BAD_XDR;
+	return 0;
 }
 
 /*
@@ -358,6 +372,30 @@ encode_cb_recallany4args(struct xdr_stream *xdr,
 }
 
 /*
+ * CB_GETATTR4args
+ *	struct CB_GETATTR4args {
+ *	   nfs_fh4 fh;
+ *	   bitmap4 attr_request;
+ *	};
+ *
+ * The size and change attributes are the only ones
+ * guaranteed to be serviced by the client.
+ */
+static void
+encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
+			struct nfs4_cb_fattr *fattr)
+{
+	struct nfs4_delegation *dp =
+		container_of(fattr, struct nfs4_delegation, dl_cb_fattr);
+	struct knfsd_fh *fh = &dp->dl_stid.sc_file->fi_fhandle;
+
+	encode_nfs_cb_opnum4(xdr, OP_CB_GETATTR);
+	encode_nfs_fh4(xdr, fh);
+	encode_bitmap4(xdr, fattr->ncf_cb_bmap, ARRAY_SIZE(fattr->ncf_cb_bmap));
+	hdr->nops++;
+}
+
+/*
  * CB_SEQUENCE4args
  *
  *	struct CB_SEQUENCE4args {
@@ -493,6 +531,26 @@ static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
 }
 
 /*
+ * 20.1.  Operation 3: CB_GETATTR - Get Attributes
+ */
+static void nfs4_xdr_enc_cb_getattr(struct rpc_rqst *req,
+		struct xdr_stream *xdr, const void *data)
+{
+	const struct nfsd4_callback *cb = data;
+	struct nfs4_cb_fattr *ncf =
+		container_of(cb, struct nfs4_cb_fattr, ncf_getattr);
+	struct nfs4_cb_compound_hdr hdr = {
+		.ident = cb->cb_clp->cl_cb_ident,
+		.minorversion = cb->cb_clp->cl_minorversion,
+	};
+
+	encode_cb_compound4args(xdr, &hdr);
+	encode_cb_sequence4args(xdr, cb, &hdr);
+	encode_cb_getattr4args(xdr, &hdr, ncf);
+	encode_cb_nops(&hdr);
+}
+
+/*
  * 20.2. Operation 4: CB_RECALL - Recall a Delegation
  */
 static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
@@ -548,6 +606,42 @@ static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
 }
 
 /*
+ * 20.1.  Operation 3: CB_GETATTR - Get Attributes
+ */
+static int nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp,
+				  struct xdr_stream *xdr,
+				  void *data)
+{
+	struct nfsd4_callback *cb = data;
+	struct nfs4_cb_fattr *ncf =
+		container_of(cb, struct nfs4_cb_fattr, ncf_getattr);
+	struct nfs4_cb_compound_hdr hdr;
+	u32 bitmap[3] = {0};
+	u32 attrlen;
+	int status;
+
+	status = decode_cb_compound4res(xdr, &hdr);
+	if (unlikely(status))
+		return status;
+
+	status = decode_cb_sequence4res(xdr, cb);
+	if (unlikely(status || cb->cb_seq_status))
+		return status;
+
+	status = decode_cb_op_status(xdr, OP_CB_GETATTR, &cb->cb_status);
+	if (status)
+		return status;
+	if (xdr_stream_decode_uint32_array(xdr, bitmap, ARRAY_SIZE(bitmap)) < 0)
+		return -NFSERR_BAD_XDR;
+	if (xdr_stream_decode_u32(xdr, &attrlen) < 0)
+		return -NFSERR_BAD_XDR;
+	/* The reply can carry at most the two 64-bit attributes decoded below */
+	if (attrlen > sizeof(ncf->ncf_cb_change) + sizeof(ncf->ncf_cb_fsize))
+		return -NFSERR_BAD_XDR;
+	return decode_cb_fattr4(xdr, bitmap, ncf);
+}
+
+/*
  * 20.2. Operation 4: CB_RECALL - Recall a Delegation
  */
 static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
@@ -855,6 +949,7 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = {
 	PROC(CB_NOTIFY_LOCK,	COMPOUND,	cb_notify_lock,	cb_notify_lock),
 	PROC(CB_OFFLOAD,	COMPOUND,	cb_offload,	cb_offload),
 	PROC(CB_RECALL_ANY,	COMPOUND,	cb_recall_any,	cb_recall_any),
+	PROC(CB_GETATTR,	COMPOUND,	cb_getattr,	cb_getattr),
 };
 
 static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)];
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index e8a80052cb1b..5e8096bc5eaa 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -515,11 +515,11 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
 	if (!list_empty(&ls->ls_layouts)) {
 		if (found)
 			nfs4_inc_and_copy_stateid(&lrp->lr_sid, &ls->ls_stid);
-		lrp->lrs_present = 1;
+		lrp->lrs_present = true;
 	} else {
 		trace_nfsd_layoutstate_unhash(&ls->ls_stid.sc_stateid);
 		nfs4_unhash_stid(&ls->ls_stid);
-		lrp->lrs_present = 0;
+		lrp->lrs_present = false;
 	}
 	spin_unlock(&ls->ls_lock);
 
@@ -539,7 +539,7 @@ nfsd4_return_client_layouts(struct svc_rqst *rqstp,
 	struct nfs4_layout *lp, *t;
 	LIST_HEAD(reaplist);
 
-	lrp->lrs_present = 0;
+	lrp->lrs_present = false;
 
 	spin_lock(&clp->cl_lock);
 	list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) {
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 4199ede0583c..6f2d4aa4970d 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -322,8 +322,8 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
 			status = nfserr_exist;
 			break;
 		case NFS4_CREATE_EXCLUSIVE:
-			if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
-			    d_inode(child)->i_atime.tv_sec == v_atime &&
+			if (inode_get_mtime_sec(d_inode(child)) == v_mtime &&
+			    inode_get_atime_sec(d_inode(child)) == v_atime &&
 			    d_inode(child)->i_size == 0) {
 				open->op_created = true;
 				break;		/* subtle */
@@ -331,8 +331,8 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
 			status = nfserr_exist;
 			break;
 		case NFS4_CREATE_EXCLUSIVE4_1:
-			if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
-			    d_inode(child)->i_atime.tv_sec == v_atime &&
+			if (inode_get_mtime_sec(d_inode(child)) == v_mtime &&
+			    inode_get_atime_sec(d_inode(child)) == v_atime &&
 			    d_inode(child)->i_size == 0) {
 				open->op_created = true;
 				goto set_attr;	/* subtle */
@@ -1329,7 +1329,8 @@ extern void nfs_sb_deactive(struct super_block *sb);
  * setup a work entry in the ssc delayed unmount list.
  */
 static __be32 nfsd4_ssc_setup_dul(struct nfsd_net *nn, char *ipaddr,
-				  struct nfsd4_ssc_umount_item **nsui)
+				  struct nfsd4_ssc_umount_item **nsui,
+				  struct svc_rqst *rqstp)
 {
 	struct nfsd4_ssc_umount_item *ni = NULL;
 	struct nfsd4_ssc_umount_item *work = NULL;
@@ -1351,7 +1352,7 @@ try_again:
 			spin_unlock(&nn->nfsd_ssc_lock);
 
 			/* allow 20secs for mount/unmount for now - revisit */
-			if (kthread_should_stop() ||
+			if (svc_thread_should_stop(rqstp) ||
 					(schedule_timeout(20*HZ) == 0)) {
 				finish_wait(&nn->nfsd_ssc_waitq, &wait);
 				kfree(work);
@@ -1467,7 +1468,7 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
 		goto out_free_rawdata;
 	snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep);
 
-	status = nfsd4_ssc_setup_dul(nn, ipaddr, nsui);
+	status = nfsd4_ssc_setup_dul(nn, ipaddr, nsui, rqstp);
 	if (status)
 		goto out_free_devname;
 	if ((*nsui)->nsui_vfsmount)
@@ -1642,6 +1643,7 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy,
 	if (bytes_total == 0)
 		bytes_total = ULLONG_MAX;
 	do {
+		/* Only async copies can be stopped here */
 		if (kthread_should_stop())
 			break;
 		bytes_copied = nfsd_copy_file_range(src, src_pos, dst, dst_pos,
@@ -1760,6 +1762,7 @@ static int nfsd4_do_async_copy(void *data)
 	struct nfsd4_copy *copy = (struct nfsd4_copy *)data;
 	__be32 nfserr;
 
+	trace_nfsd_copy_do_async(copy);
 	if (nfsd4_ssc_is_inter(copy)) {
 		struct file *filp;
 
@@ -1798,21 +1801,27 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	__be32 status;
 	struct nfsd4_copy *async_copy = NULL;
 
+	copy->cp_clp = cstate->clp;
 	if (nfsd4_ssc_is_inter(copy)) {
+		trace_nfsd_copy_inter(copy);
 		if (!inter_copy_offload_enable || nfsd4_copy_is_sync(copy)) {
 			status = nfserr_notsupp;
 			goto out;
 		}
 		status = nfsd4_setup_inter_ssc(rqstp, cstate, copy);
-		if (status)
+		if (status) {
+			trace_nfsd_copy_done(copy, status);
 			return nfserr_offload_denied;
+		}
 	} else {
+		trace_nfsd_copy_intra(copy);
 		status = nfsd4_setup_intra_ssc(rqstp, cstate, copy);
-		if (status)
+		if (status) {
+			trace_nfsd_copy_done(copy, status);
 			return status;
+		}
 	}
 
-	copy->cp_clp = cstate->clp;
 	memcpy(&copy->fh, &cstate->current_fh.fh_handle,
 		sizeof(struct knfsd_fh));
 	if (nfsd4_copy_is_async(copy)) {
@@ -1847,6 +1856,7 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 				       copy->nf_dst->nf_file, true);
 	}
 out:
+	trace_nfsd_copy_done(copy, status);
 	release_copy_files(copy);
 	return status;
 out_err:
@@ -1929,8 +1939,8 @@ nfsd4_copy_notify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (status)
 		return status;
 
-	cn->cpn_sec = nn->nfsd4_lease;
-	cn->cpn_nsec = 0;
+	cn->cpn_lease_time.tv_sec = nn->nfsd4_lease;
+	cn->cpn_lease_time.tv_nsec = 0;
 
 	status = nfserrno(-ENOMEM);
 	cps = nfs4_alloc_init_cpntf_state(nn, stid);
@@ -2347,10 +2357,10 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp,
 	mutex_unlock(&ls->ls_mutex);
 
 	if (new_size > i_size_read(inode)) {
-		lcp->lc_size_chg = 1;
+		lcp->lc_size_chg = true;
 		lcp->lc_newsize = new_size;
 	} else {
-		lcp->lc_size_chg = 0;
+		lcp->lc_size_chg = false;
 	}
 
 	nfserr = ops->proc_layoutcommit(inode, lcp);
@@ -3200,6 +3210,7 @@ static const struct nfsd4_operation nfsd4_ops[] = {
 	},
 	[OP_LOCK] = {
 		.op_func = nfsd4_lock,
+		.op_release = nfsd4_lock_release,
 		.op_flags = OP_MODIFIES_SOMETHING |
 				OP_NONTRIVIAL_ERROR_ENCODE,
 		.op_name = "OP_LOCK",
@@ -3208,6 +3219,7 @@ static const struct nfsd4_operation nfsd4_ops[] = {
 	},
 	[OP_LOCKT] = {
 		.op_func = nfsd4_lockt,
+		.op_release = nfsd4_lockt_release,
 		.op_flags = OP_NONTRIVIAL_ERROR_ENCODE,
 		.op_name = "OP_LOCKT",
 		.op_rsize_bop = nfsd4_lock_rsize,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 8534693eb6a4..65fd5510323a 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -59,7 +59,7 @@
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
-#define all_ones {{~0,~0},~0}
+#define all_ones {{ ~0, ~0}, ~0}
 static const stateid_t one_stateid = {
 	.si_generation = ~0,
 	.si_opaque = all_ones,
@@ -127,6 +127,7 @@ static void free_session(struct nfsd4_session *);
 
 static const struct nfsd4_callback_ops nfsd4_cb_recall_ops;
 static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops;
+static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops;
 
 static struct workqueue_struct *laundry_wq;
 
@@ -297,7 +298,7 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
 
 	nbl = find_blocked_lock(lo, fh, nn);
 	if (!nbl) {
-		nbl= kmalloc(sizeof(*nbl), GFP_KERNEL);
+		nbl = kmalloc(sizeof(*nbl), GFP_KERNEL);
 		if (nbl) {
 			INIT_LIST_HEAD(&nbl->nbl_list);
 			INIT_LIST_HEAD(&nbl->nbl_lru);
@@ -1159,6 +1160,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
 		 struct nfs4_clnt_odstate *odstate, u32 dl_type)
 {
 	struct nfs4_delegation *dp;
+	struct nfs4_stid *stid;
 	long n;
 
 	dprintk("NFSD alloc_init_deleg\n");
@@ -1167,9 +1169,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
 		goto out_dec;
 	if (delegation_blocked(&fp->fi_fhandle))
 		goto out_dec;
-	dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg));
-	if (dp == NULL)
+	stid = nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg);
+	if (stid == NULL)
 		goto out_dec;
+	dp = delegstateid(stid);
 
 	/*
 	 * delegation seqid's are never incremented.  The 4.1 special
@@ -1187,6 +1190,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
 	dp->dl_recalled = false;
 	nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
 		      &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
+	nfsd4_init_cb(&dp->dl_cb_fattr.ncf_getattr, dp->dl_stid.sc_client,
+			&nfsd4_cb_getattr_ops, NFSPROC4_CLNT_CB_GETATTR);
+	dp->dl_cb_fattr.ncf_file_modified = false;
+	dp->dl_cb_fattr.ncf_cb_bmap[0] = FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE;
 	get_nfs4_file(fp);
 	dp->dl_stid.sc_file = fp;
 	return dp;
@@ -2894,11 +2901,56 @@ nfsd4_cb_recall_any_release(struct nfsd4_callback *cb)
 	spin_unlock(&nn->client_lock);
 }
 
+static int
+nfsd4_cb_getattr_done(struct nfsd4_callback *cb, struct rpc_task *task)
+{
+	struct nfs4_cb_fattr *ncf =
+			container_of(cb, struct nfs4_cb_fattr, ncf_getattr);
+	int rpc_status = task->tk_status;
+
+	/* Save the status; only NFS4ERR_DELAY delays the task and returns 0 */
+	ncf->ncf_cb_status = rpc_status;
+	if (rpc_status != -NFS4ERR_DELAY)
+		return 1;
+
+	rpc_delay(task, 2 * HZ);
+	return 0;
+}
+
+static void
+nfsd4_cb_getattr_release(struct nfsd4_callback *cb)
+{
+	struct nfs4_cb_fattr *ncf =
+			container_of(cb, struct nfs4_cb_fattr, ncf_getattr);
+	struct nfs4_delegation *dp =
+			container_of(ncf, struct nfs4_delegation, dl_cb_fattr);
+
+	/* ncf lives inside dp; touch it before the reference is dropped */
+	clear_and_wake_up_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags);
+	nfs4_put_stid(&dp->dl_stid);
+}
+
 static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = {
 	.done		= nfsd4_cb_recall_any_done,
 	.release	= nfsd4_cb_recall_any_release,
 };
 
+static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops = {
+	.done		= nfsd4_cb_getattr_done,
+	.release	= nfsd4_cb_getattr_release,
+};
+
+void nfs4_cb_getattr(struct nfs4_cb_fattr *ncf)
+{
+	struct nfs4_delegation *dp =
+			container_of(ncf, struct nfs4_delegation, dl_cb_fattr);
+
+	if (test_and_set_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags))
+		return;
+	refcount_inc(&dp->dl_stid.sc_count);
+	nfsd4_run_cb(&ncf->ncf_getattr);
+}
+
 static struct nfs4_client *create_client(struct xdr_netobj name,
 		struct svc_rqst *rqstp, nfs4_verifier *verf)
 {
@@ -5634,13 +5686,15 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
 	struct svc_fh *parent = NULL;
 	int cb_up;
 	int status = 0;
+	struct kstat stat;
+	struct path path;
 
 	cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client);
-	open->op_recall = 0;
+	open->op_recall = false;
 	switch (open->op_claim_type) {
 		case NFS4_OPEN_CLAIM_PREVIOUS:
 			if (!cb_up)
-				open->op_recall = 1;
+				open->op_recall = true;
 			break;
 		case NFS4_OPEN_CLAIM_NULL:
 			parent = currentfh;
@@ -5671,6 +5725,18 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
 	if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) {
 		open->op_delegate_type = NFS4_OPEN_DELEGATE_WRITE;
 		trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid);
+		path.mnt = currentfh->fh_export->ex_path.mnt;
+		path.dentry = currentfh->fh_dentry;
+		if (vfs_getattr(&path, &stat,
+				(STATX_SIZE | STATX_CTIME | STATX_CHANGE_COOKIE),
+				AT_STATX_SYNC_AS_STAT)) {
+			nfs4_put_stid(&dp->dl_stid);
+			destroy_delegation(dp);
+			goto out_no_deleg;
+		}
+		dp->dl_cb_fattr.ncf_cur_fsize = stat.size;
+		dp->dl_cb_fattr.ncf_initial_cinfo =
+			nfsd4_change_attribute(&stat, d_inode(currentfh->fh_dentry));
 	} else {
 		open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
 		trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid);
@@ -5682,7 +5748,7 @@ out_no_deleg:
 	if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
 	    open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) {
 		dprintk("NFSD: WARNING: refusing delegation reclaim\n");
-		open->op_recall = 1;
+		open->op_recall = true;
 	}
 
 	/* 4.1 client asking for a delegation? */
@@ -7487,6 +7553,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct nfsd4_blocked_lock *nbl = NULL;
 	struct file_lock *file_lock = NULL;
 	struct file_lock *conflock = NULL;
+	struct super_block *sb;
 	__be32 status = 0;
 	int lkflg;
 	int err;
@@ -7508,6 +7575,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		dprintk("NFSD: nfsd4_lock: permission denied!\n");
 		return status;
 	}
+	sb = cstate->current_fh.fh_dentry->d_sb;
 
 	if (lock->lk_is_new) {
 		if (nfsd4_has_session(cstate))
@@ -7559,7 +7627,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	fp = lock_stp->st_stid.sc_file;
 	switch (lock->lk_type) {
 		case NFS4_READW_LT:
-			if (nfsd4_has_session(cstate))
+			if (nfsd4_has_session(cstate) ||
+			    exportfs_lock_op_is_async(sb->s_export_op))
 				fl_flags |= FL_SLEEP;
 			fallthrough;
 		case NFS4_READ_LT:
@@ -7571,7 +7640,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			fl_type = F_RDLCK;
 			break;
 		case NFS4_WRITEW_LT:
-			if (nfsd4_has_session(cstate))
+			if (nfsd4_has_session(cstate) ||
+			    exportfs_lock_op_is_async(sb->s_export_op))
 				fl_flags |= FL_SLEEP;
 			fallthrough;
 		case NFS4_WRITE_LT:
@@ -7599,7 +7669,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	 * for file locks), so don't attempt blocking lock notifications
 	 * on those filesystems:
 	 */
-	if (nf->nf_file->f_op->lock)
+	if (!exportfs_lock_op_is_async(sb->s_export_op))
 		fl_flags &= ~FL_SLEEP;
 
 	nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn);
@@ -7705,6 +7775,14 @@ out:
 	return status;
 }
 
+void nfsd4_lock_release(union nfsd4_op_u *u)
+{
+	struct nfsd4_lock *lock = &u->lock;
+	struct nfsd4_lock_denied *deny = &lock->lk_denied;
+
+	kfree(deny->ld_owner.data);
+}
+
 /*
  * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN,
  * so we do a temporary open here just to get an open file to pass to
@@ -7810,6 +7888,14 @@ out:
 	return status;
 }
 
+void nfsd4_lockt_release(union nfsd4_op_u *u)
+{
+	struct nfsd4_lockt *lockt = &u->lockt;
+	struct nfsd4_lock_denied *deny = &lockt->lt_denied;
+
+	kfree(deny->ld_owner.data);
+}
+
 __be32
 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	    union nfsd4_op_u *u)
@@ -8403,6 +8489,8 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
  * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict
  * @rqstp: RPC transaction context
  * @inode: file to be checked for a conflict
+ * @modified: output; set to true if the file was modified, else false
+ * @size: output; new size of the file, written only if @modified is true
  *
  * This function is called when there is a conflict between a write
  * delegation and a change/size GETATTR from another client. The server
@@ -8411,21 +8499,23 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
  * delegation before replying to the GETATTR. See RFC 8881 section
  * 18.7.4.
  *
- * The current implementation does not support CB_GETATTR yet. However
- * this can avoid recalling the delegation could be added in follow up
- * work.
- *
  * Returns 0 if there is no conflict; otherwise an nfs_stat
  * code is returned.
  */
 __be32
-nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode)
+nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode,
+			     bool *modified, u64 *size)
 {
-	__be32 status;
 	struct file_lock_context *ctx;
-	struct file_lock *fl;
 	struct nfs4_delegation *dp;
+	struct nfs4_cb_fattr *ncf;
+	struct file_lock *fl;
+	struct iattr attrs;
+	__be32 status;
 
+	might_sleep();
+
+	*modified = false;
 	ctx = locks_inode_context(inode);
 	if (!ctx)
 		return 0;
@@ -8452,10 +8542,34 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode)
 break_lease:
 			spin_unlock(&ctx->flc_lock);
 			nfsd_stats_wdeleg_getattr_inc();
-			status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
-			if (status != nfserr_jukebox ||
-					!nfsd_wait_for_delegreturn(rqstp, inode))
-				return status;
+
+			dp = fl->fl_owner;
+			ncf = &dp->dl_cb_fattr;
+			nfs4_cb_getattr(&dp->dl_cb_fattr);
+			wait_on_bit(&ncf->ncf_cb_flags, CB_GETATTR_BUSY, TASK_INTERRUPTIBLE);
+			if (ncf->ncf_cb_status) {
+				status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
+				if (status != nfserr_jukebox ||
+						!nfsd_wait_for_delegreturn(rqstp, inode))
+					return status;
+			}
+			if (!ncf->ncf_file_modified &&
+					(ncf->ncf_initial_cinfo != ncf->ncf_cb_change ||
+					ncf->ncf_cur_fsize != ncf->ncf_cb_fsize))
+				ncf->ncf_file_modified = true;
+			if (ncf->ncf_file_modified) {
+				/*
+				 * The inode size is not updated with the client's
+				 * size; only mtime and ctime are refreshed here.
+				 */
+				attrs.ia_mtime = attrs.ia_ctime = current_time(inode);
+				attrs.ia_valid = ATTR_MTIME | ATTR_CTIME;
+				setattr_copy(&nop_mnt_idmap, inode, &attrs);
+				mark_inode_dirty(inode);
+				ncf->ncf_cur_fsize = ncf->ncf_cb_fsize;
+				*size = ncf->ncf_cur_fsize;
+				*modified = true;
+			}
 			return 0;
 		}
 		break;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 92c7dde148a4..ec4ed6206df1 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2530,66 +2530,62 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 	return true;
 }
 
-static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
-			     struct svc_export *exp)
+static __be32 nfsd4_encode_nfs_fh4(struct xdr_stream *xdr,
+				   struct knfsd_fh *fh_handle)
 {
-	if (exp->ex_flags & NFSEXP_V4ROOT) {
-		*p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
-		*p++ = 0;
-	} else
-		p = xdr_encode_hyper(p, nfsd4_change_attribute(stat, inode));
-	return p;
+	return nfsd4_encode_opaque(xdr, fh_handle->fh_raw, fh_handle->fh_size);
 }
 
+/* This is a frequently-encoded type; open-coded for speed */
 static __be32 nfsd4_encode_nfstime4(struct xdr_stream *xdr,
-				    struct timespec64 *tv)
+				    const struct timespec64 *tv)
 {
 	__be32 *p;
 
 	p = xdr_reserve_space(xdr, XDR_UNIT * 3);
 	if (!p)
 		return nfserr_resource;
-
-	p = xdr_encode_hyper(p, (s64)tv->tv_sec);
+	p = xdr_encode_hyper(p, tv->tv_sec);
 	*p = cpu_to_be32(tv->tv_nsec);
 	return nfs_ok;
 }
 
-/*
- * ctime (in NFSv4, time_metadata) is not writeable, and the client
- * doesn't really care what resolution could theoretically be stored by
- * the filesystem.
- *
- * The client cares how close together changes can be while still
- * guaranteeing ctime changes.  For most filesystems (which have
- * timestamps with nanosecond fields) that is limited by the resolution
- * of the time returned from current_time() (which I'm assuming to be
- * 1/HZ).
- */
-static __be32 *encode_time_delta(__be32 *p, struct inode *inode)
+static __be32 nfsd4_encode_specdata4(struct xdr_stream *xdr,
+				     unsigned int major, unsigned int minor)
 {
-	struct timespec64 ts;
-	u32 ns;
+	__be32 status;
 
-	ns = max_t(u32, NSEC_PER_SEC/HZ, inode->i_sb->s_time_gran);
-	ts = ns_to_timespec64(ns);
+	status = nfsd4_encode_uint32_t(xdr, major);
+	if (status != nfs_ok)
+		return status;
+	return nfsd4_encode_uint32_t(xdr, minor);
+}
 
-	p = xdr_encode_hyper(p, ts.tv_sec);
-	*p++ = cpu_to_be32(ts.tv_nsec);
+static __be32
+nfsd4_encode_change_info4(struct xdr_stream *xdr, const struct nfsd4_change_info *c)
+{
+	__be32 status;
 
-	return p;
+	status = nfsd4_encode_bool(xdr, c->atomic);
+	if (status != nfs_ok)
+		return status;
+	status = nfsd4_encode_changeid4(xdr, c->before_change);
+	if (status != nfs_ok)
+		return status;
+	return nfsd4_encode_changeid4(xdr, c->after_change);
 }
 
-static __be32
-nfsd4_encode_change_info4(struct xdr_stream *xdr, struct nfsd4_change_info *c)
+static __be32 nfsd4_encode_netaddr4(struct xdr_stream *xdr,
+				    const struct nfs42_netaddr *addr)
 {
-	if (xdr_stream_encode_bool(xdr, c->atomic) < 0)
-		return nfserr_resource;
-	if (xdr_stream_encode_u64(xdr, c->before_change) < 0)
-		return nfserr_resource;
-	if (xdr_stream_encode_u64(xdr, c->after_change) < 0)
-		return nfserr_resource;
-	return nfs_ok;
+	__be32 status;
+
+	/* na_r_netid */
+	status = nfsd4_encode_opaque(xdr, addr->netid, addr->netid_len);
+	if (status != nfs_ok)
+		return status;
+	/* na_r_addr */
+	return nfsd4_encode_opaque(xdr, addr->addr, addr->addr_len);
 }
 
 /* Encode as an array of strings the string given with components
@@ -2661,9 +2657,6 @@ static __be32 nfsd4_encode_components(struct xdr_stream *xdr, char sep,
 	return nfsd4_encode_components_esc(xdr, sep, components, 0, 0);
 }
 
-/*
- * encode a location element of a fs_locations structure
- */
 static __be32 nfsd4_encode_fs_location4(struct xdr_stream *xdr,
 					struct nfsd4_fs_location *location)
 {
@@ -2676,15 +2669,12 @@ static __be32 nfsd4_encode_fs_location4(struct xdr_stream *xdr,
 	status = nfsd4_encode_components(xdr, '/', location->path);
 	if (status)
 		return status;
-	return 0;
+	return nfs_ok;
 }
 
-/*
- * Encode a path in RFC3530 'pathname4' format
- */
-static __be32 nfsd4_encode_path(struct xdr_stream *xdr,
-				const struct path *root,
-				const struct path *path)
+static __be32 nfsd4_encode_pathname4(struct xdr_stream *xdr,
+				     const struct path *root,
+				     const struct path *path)
 {
 	struct path cur = *path;
 	__be32 *p;
@@ -2752,89 +2742,59 @@ out_free:
 	return err;
 }
 
-static __be32 nfsd4_encode_fsloc_fsroot(struct xdr_stream *xdr,
-			struct svc_rqst *rqstp, const struct path *path)
+static __be32 nfsd4_encode_fs_locations4(struct xdr_stream *xdr,
+					 struct svc_rqst *rqstp,
+					 struct svc_export *exp)
 {
+	struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs;
 	struct svc_export *exp_ps;
-	__be32 res;
+	unsigned int i;
+	__be32 status;
 
+	/* fs_root */
 	exp_ps = rqst_find_fsidzero_export(rqstp);
 	if (IS_ERR(exp_ps))
 		return nfserrno(PTR_ERR(exp_ps));
-	res = nfsd4_encode_path(xdr, &exp_ps->ex_path, path);
+	status = nfsd4_encode_pathname4(xdr, &exp_ps->ex_path, &exp->ex_path);
 	exp_put(exp_ps);
-	return res;
-}
-
-/*
- *  encode a fs_locations structure
- */
-static __be32 nfsd4_encode_fs_locations(struct xdr_stream *xdr,
-			struct svc_rqst *rqstp, struct svc_export *exp)
-{
-	__be32 status;
-	int i;
-	__be32 *p;
-	struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs;
-
-	status = nfsd4_encode_fsloc_fsroot(xdr, rqstp, &exp->ex_path);
-	if (status)
+	if (status != nfs_ok)
 		return status;
-	p = xdr_reserve_space(xdr, 4);
-	if (!p)
+
+	/* locations<> */
+	if (xdr_stream_encode_u32(xdr, fslocs->locations_count) != XDR_UNIT)
 		return nfserr_resource;
-	*p++ = cpu_to_be32(fslocs->locations_count);
-	for (i=0; i<fslocs->locations_count; i++) {
+	for (i = 0; i < fslocs->locations_count; i++) {
 		status = nfsd4_encode_fs_location4(xdr, &fslocs->locations[i]);
-		if (status)
+		if (status != nfs_ok)
 			return status;
 	}
-	return 0;
-}
 
-static u32 nfs4_file_type(umode_t mode)
-{
-	switch (mode & S_IFMT) {
-	case S_IFIFO:	return NF4FIFO;
-	case S_IFCHR:	return NF4CHR;
-	case S_IFDIR:	return NF4DIR;
-	case S_IFBLK:	return NF4BLK;
-	case S_IFLNK:	return NF4LNK;
-	case S_IFREG:	return NF4REG;
-	case S_IFSOCK:	return NF4SOCK;
-	default:	return NF4BAD;
-	}
+	return nfs_ok;
 }
 
-static inline __be32
-nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp,
-		     struct nfs4_ace *ace)
+static __be32 nfsd4_encode_nfsace4(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+				   struct nfs4_ace *ace)
 {
+	__be32 status;
+
+	/* type */
+	status = nfsd4_encode_acetype4(xdr, ace->type);
+	if (status != nfs_ok)
+		return nfserr_resource;
+	/* flag */
+	status = nfsd4_encode_aceflag4(xdr, ace->flag);
+	if (status != nfs_ok)
+		return nfserr_resource;
+	/* access mask */
+	status = nfsd4_encode_acemask4(xdr, ace->access_mask & NFS4_ACE_MASK_ALL);
+	if (status != nfs_ok)
+		return nfserr_resource;
+	/* who */
 	if (ace->whotype != NFS4_ACL_WHO_NAMED)
 		return nfs4_acl_write_who(xdr, ace->whotype);
-	else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
+	if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
 		return nfsd4_encode_group(xdr, rqstp, ace->who_gid);
-	else
-		return nfsd4_encode_user(xdr, rqstp, ace->who_uid);
-}
-
-static inline __be32
-nfsd4_encode_layout_types(struct xdr_stream *xdr, u32 layout_types)
-{
-	__be32		*p;
-	unsigned long	i = hweight_long(layout_types);
-
-	p = xdr_reserve_space(xdr, 4 + 4 * i);
-	if (!p)
-		return nfserr_resource;
-
-	*p++ = cpu_to_be32(i);
-
-	for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i)
-		if (layout_types & (1 << i))
-			*p++ = cpu_to_be32(i);
-
-	return 0;
+	return nfsd4_encode_user(xdr, rqstp, ace->who_uid);
 }
 
 #define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \
@@ -2906,12 +2866,12 @@ static int nfsd4_get_mounted_on_ino(struct svc_export *exp, u64 *pino)
 }
 
 static __be32
-nfsd4_encode_bitmap(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2)
+nfsd4_encode_bitmap4(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2)
 {
 	__be32 *p;
 
 	if (bmval2) {
-		p = xdr_reserve_space(xdr, 16);
+		p = xdr_reserve_space(xdr, XDR_UNIT * 4);
 		if (!p)
 			goto out_resource;
 		*p++ = cpu_to_be32(3);
@@ -2919,94 +2879,687 @@ nfsd4_encode_bitmap(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2)
 		*p++ = cpu_to_be32(bmval1);
 		*p++ = cpu_to_be32(bmval2);
 	} else if (bmval1) {
-		p = xdr_reserve_space(xdr, 12);
+		p = xdr_reserve_space(xdr, XDR_UNIT * 3);
 		if (!p)
 			goto out_resource;
 		*p++ = cpu_to_be32(2);
 		*p++ = cpu_to_be32(bmval0);
 		*p++ = cpu_to_be32(bmval1);
 	} else {
-		p = xdr_reserve_space(xdr, 8);
+		p = xdr_reserve_space(xdr, XDR_UNIT * 2);
 		if (!p)
 			goto out_resource;
 		*p++ = cpu_to_be32(1);
 		*p++ = cpu_to_be32(bmval0);
 	}
 
-	return 0;
+	return nfs_ok;
 out_resource:
 	return nfserr_resource;
 }
 
+struct nfsd4_fattr_args {
+	struct svc_rqst		*rqstp;
+	struct svc_fh		*fhp;
+	struct svc_export	*exp;
+	struct dentry		*dentry;
+	struct kstat		stat;
+	struct kstatfs		statfs;
+	struct nfs4_acl		*acl;
+	u64			size;
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+	void			*context;
+	int			contextlen;
+#endif
+	u32			rdattr_err;
+	bool			contextsupport;
+	bool			ignore_crossmnt;
+};
+
+typedef __be32(*nfsd4_enc_attr)(struct xdr_stream *xdr,
+				const struct nfsd4_fattr_args *args);
+
+static __be32 nfsd4_encode_fattr4__noop(struct xdr_stream *xdr,
+					const struct nfsd4_fattr_args *args)
+{
+	return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4__true(struct xdr_stream *xdr,
+					const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_bool(xdr, true);
+}
+
+static __be32 nfsd4_encode_fattr4__false(struct xdr_stream *xdr,
+					 const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_bool(xdr, false);
+}
+
+static __be32 nfsd4_encode_fattr4_supported_attrs(struct xdr_stream *xdr,
+						  const struct nfsd4_fattr_args *args)
+{
+	struct nfsd4_compoundres *resp = args->rqstp->rq_resp;
+	u32 minorversion = resp->cstate.minorversion;
+	u32 supp[3];
+
+	memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp));
+	if (!IS_POSIXACL(d_inode(args->dentry)))
+		supp[0] &= ~FATTR4_WORD0_ACL;
+	if (!args->contextsupport)
+		supp[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+
+	return nfsd4_encode_bitmap4(xdr, supp[0], supp[1], supp[2]);
+}
+
+static __be32 nfsd4_encode_fattr4_type(struct xdr_stream *xdr,
+				       const struct nfsd4_fattr_args *args)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, XDR_UNIT);
+	if (!p)
+		return nfserr_resource;
+
+	switch (args->stat.mode & S_IFMT) {
+	case S_IFIFO:
+		*p = cpu_to_be32(NF4FIFO);
+		break;
+	case S_IFCHR:
+		*p = cpu_to_be32(NF4CHR);
+		break;
+	case S_IFDIR:
+		*p = cpu_to_be32(NF4DIR);
+		break;
+	case S_IFBLK:
+		*p = cpu_to_be32(NF4BLK);
+		break;
+	case S_IFLNK:
+		*p = cpu_to_be32(NF4LNK);
+		break;
+	case S_IFREG:
+		*p = cpu_to_be32(NF4REG);
+		break;
+	case S_IFSOCK:
+		*p = cpu_to_be32(NF4SOCK);
+		break;
+	default:
+		return nfserr_serverfault;
+	}
+
+	return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4_fh_expire_type(struct xdr_stream *xdr,
+						 const struct nfsd4_fattr_args *args)
+{
+	u32 mask;
+
+	mask = NFS4_FH_PERSISTENT;
+	if (!(args->exp->ex_flags & NFSEXP_NOSUBTREECHECK))
+		mask |= NFS4_FH_VOL_RENAME;
+	return nfsd4_encode_uint32_t(xdr, mask);
+}
+
+static __be32 nfsd4_encode_fattr4_change(struct xdr_stream *xdr,
+					 const struct nfsd4_fattr_args *args)
+{
+	const struct svc_export *exp = args->exp;
+	u64 c;
+
+	if (unlikely(exp->ex_flags & NFSEXP_V4ROOT)) {
+		u32 flush_time = convert_to_wallclock(exp->cd->flush_time);
+
+		if (xdr_stream_encode_u32(xdr, flush_time) != XDR_UNIT)
+			return nfserr_resource;
+		if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
+			return nfserr_resource;
+		return nfs_ok;
+	}
+
+	c = nfsd4_change_attribute(&args->stat, d_inode(args->dentry));
+	return nfsd4_encode_changeid4(xdr, c);
+}
+
+static __be32 nfsd4_encode_fattr4_size(struct xdr_stream *xdr,
+				       const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_uint64_t(xdr, args->size);
+}
+
+static __be32 nfsd4_encode_fattr4_fsid(struct xdr_stream *xdr,
+				       const struct nfsd4_fattr_args *args)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, XDR_UNIT * 2 + XDR_UNIT * 2);
+	if (!p)
+		return nfserr_resource;
+
+	if (unlikely(args->exp->ex_fslocs.migrated)) {
+		p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MAJOR);
+		xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MINOR);
+		return nfs_ok;
+	}
+	switch (fsid_source(args->fhp)) {
+	case FSIDSOURCE_FSID:
+		p = xdr_encode_hyper(p, (u64)args->exp->ex_fsid);
+		xdr_encode_hyper(p, (u64)0);
+		break;
+	case FSIDSOURCE_DEV:
+		*p++ = xdr_zero;
+		*p++ = cpu_to_be32(MAJOR(args->stat.dev));
+		*p++ = xdr_zero;
+		*p   = cpu_to_be32(MINOR(args->stat.dev));
+		break;
+	case FSIDSOURCE_UUID:
+		xdr_encode_opaque_fixed(p, args->exp->ex_uuid, EX_UUID_LEN);
+		break;
+	}
+
+	return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4_lease_time(struct xdr_stream *xdr,
+					     const struct nfsd4_fattr_args *args)
+{
+	struct nfsd_net *nn = net_generic(SVC_NET(args->rqstp), nfsd_net_id);
+
+	return nfsd4_encode_nfs_lease4(xdr, nn->nfsd4_lease);
+}
+
+static __be32 nfsd4_encode_fattr4_rdattr_error(struct xdr_stream *xdr,
+					       const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_uint32_t(xdr, args->rdattr_err);
+}
+
+static __be32 nfsd4_encode_fattr4_aclsupport(struct xdr_stream *xdr,
+					     const struct nfsd4_fattr_args *args)
+{
+	u32 mask;
+
+	mask = 0;
+	if (IS_POSIXACL(d_inode(args->dentry)))
+		mask = ACL4_SUPPORT_ALLOW_ACL | ACL4_SUPPORT_DENY_ACL;
+	return nfsd4_encode_uint32_t(xdr, mask);
+}
+
+static __be32 nfsd4_encode_fattr4_acl(struct xdr_stream *xdr,
+				      const struct nfsd4_fattr_args *args)
+{
+	struct nfs4_acl *acl = args->acl;
+	struct nfs4_ace *ace;
+	__be32 status;
+
+	/* nfsace4<> */
+	if (!acl) {
+		if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
+			return nfserr_resource;
+	} else {
+		if (xdr_stream_encode_u32(xdr, acl->naces) != XDR_UNIT)
+			return nfserr_resource;
+		for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) {
+			status = nfsd4_encode_nfsace4(xdr, args->rqstp, ace);
+			if (status != nfs_ok)
+				return status;
+		}
+	}
+	return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4_filehandle(struct xdr_stream *xdr,
+					     const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_nfs_fh4(xdr, &args->fhp->fh_handle);
+}
+
+static __be32 nfsd4_encode_fattr4_fileid(struct xdr_stream *xdr,
+					 const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_uint64_t(xdr, args->stat.ino);
+}
+
+static __be32 nfsd4_encode_fattr4_files_avail(struct xdr_stream *xdr,
+					      const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_uint64_t(xdr, args->statfs.f_ffree);
+}
+
+static __be32 nfsd4_encode_fattr4_files_free(struct xdr_stream *xdr,
+					     const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_uint64_t(xdr, args->statfs.f_ffree);
+}
+
+static __be32 nfsd4_encode_fattr4_files_total(struct xdr_stream *xdr,
+					      const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_uint64_t(xdr, args->statfs.f_files);
+}
+
+static __be32 nfsd4_encode_fattr4_fs_locations(struct xdr_stream *xdr,
+					       const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_fs_locations4(xdr, args->rqstp, args->exp);
+}
+
+static __be32 nfsd4_encode_fattr4_maxfilesize(struct xdr_stream *xdr,
+					      const struct nfsd4_fattr_args *args)
+{
+	struct super_block *sb = args->exp->ex_path.mnt->mnt_sb;
+
+	return nfsd4_encode_uint64_t(xdr, sb->s_maxbytes);
+}
+
+static __be32 nfsd4_encode_fattr4_maxlink(struct xdr_stream *xdr,
+					  const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_uint32_t(xdr, 255);
+}
+
+static __be32 nfsd4_encode_fattr4_maxname(struct xdr_stream *xdr,
+					  const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_uint32_t(xdr, args->statfs.f_namelen);
+}
+
+static __be32 nfsd4_encode_fattr4_maxread(struct xdr_stream *xdr,
+					  const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_uint64_t(xdr, svc_max_payload(args->rqstp));
+}
+
+static __be32 nfsd4_encode_fattr4_maxwrite(struct xdr_stream *xdr,
+					   const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_uint64_t(xdr, svc_max_payload(args->rqstp));
+}
+
+static __be32 nfsd4_encode_fattr4_mode(struct xdr_stream *xdr,
+				       const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_mode4(xdr, args->stat.mode & S_IALLUGO);
+}
+
+static __be32 nfsd4_encode_fattr4_numlinks(struct xdr_stream *xdr,
+					   const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_uint32_t(xdr, args->stat.nlink);
+}
+
+static __be32 nfsd4_encode_fattr4_owner(struct xdr_stream *xdr,
+					const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_user(xdr, args->rqstp, args->stat.uid);
+}
+
+static __be32 nfsd4_encode_fattr4_owner_group(struct xdr_stream *xdr,
+					      const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_group(xdr, args->rqstp, args->stat.gid);
+}
+
+static __be32 nfsd4_encode_fattr4_rawdev(struct xdr_stream *xdr,
+					 const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_specdata4(xdr, MAJOR(args->stat.rdev),
+				      MINOR(args->stat.rdev));
+}
+
+static __be32 nfsd4_encode_fattr4_space_avail(struct xdr_stream *xdr,
+					      const struct nfsd4_fattr_args *args)
+{
+	u64 avail = (u64)args->statfs.f_bavail * (u64)args->statfs.f_bsize;
+
+	return nfsd4_encode_uint64_t(xdr, avail);
+}
+
+static __be32 nfsd4_encode_fattr4_space_free(struct xdr_stream *xdr,
+					     const struct nfsd4_fattr_args *args)
+{
+	u64 free = (u64)args->statfs.f_bfree * (u64)args->statfs.f_bsize;
+
+	return nfsd4_encode_uint64_t(xdr, free);
+}
+
+static __be32 nfsd4_encode_fattr4_space_total(struct xdr_stream *xdr,
+					      const struct nfsd4_fattr_args *args)
+{
+	u64 total = (u64)args->statfs.f_blocks * (u64)args->statfs.f_bsize;
+
+	return nfsd4_encode_uint64_t(xdr, total);
+}
+
+static __be32 nfsd4_encode_fattr4_space_used(struct xdr_stream *xdr,
+					     const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_uint64_t(xdr, (u64)args->stat.blocks << 9);
+}
+
+static __be32 nfsd4_encode_fattr4_time_access(struct xdr_stream *xdr,
+					      const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_nfstime4(xdr, &args->stat.atime);
+}
+
+static __be32 nfsd4_encode_fattr4_time_create(struct xdr_stream *xdr,
+					      const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_nfstime4(xdr, &args->stat.btime);
+}
+
+/*
+ * ctime (in NFSv4, time_metadata) is not writeable, and the client
+ * doesn't really care what resolution could theoretically be stored by
+ * the filesystem.
+ *
+ * The client cares how close together changes can be while still
+ * guaranteeing ctime changes.  For most filesystems (which have
+ * timestamps with nanosecond fields) that is limited by the resolution
+ * of the time returned from current_time() (which I'm assuming to be
+ * 1/HZ).
+ */
+static __be32 nfsd4_encode_fattr4_time_delta(struct xdr_stream *xdr,
+					     const struct nfsd4_fattr_args *args)
+{
+	const struct inode *inode = d_inode(args->dentry);
+	u32 ns = max_t(u32, NSEC_PER_SEC/HZ, inode->i_sb->s_time_gran);
+	struct timespec64 ts = ns_to_timespec64(ns);
+
+	return nfsd4_encode_nfstime4(xdr, &ts);
+}
+
+static __be32 nfsd4_encode_fattr4_time_metadata(struct xdr_stream *xdr,
+						const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_nfstime4(xdr, &args->stat.ctime);
+}
+
+static __be32 nfsd4_encode_fattr4_time_modify(struct xdr_stream *xdr,
+					      const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_nfstime4(xdr, &args->stat.mtime);
+}
+
+static __be32 nfsd4_encode_fattr4_mounted_on_fileid(struct xdr_stream *xdr,
+						    const struct nfsd4_fattr_args *args)
+{
+	u64 ino;
+	int err;
+
+	if (!args->ignore_crossmnt &&
+	    args->dentry == args->exp->ex_path.mnt->mnt_root) {
+		err = nfsd4_get_mounted_on_ino(args->exp, &ino);
+		if (err)
+			return nfserrno(err);
+	} else
+		ino = args->stat.ino;
+
+	return nfsd4_encode_uint64_t(xdr, ino);
+}
+
+#ifdef CONFIG_NFSD_PNFS
+
+static __be32 nfsd4_encode_fattr4_fs_layout_types(struct xdr_stream *xdr,
+						  const struct nfsd4_fattr_args *args)
+{
+	unsigned long mask = args->exp->ex_layout_types;
+	int i;
+
+	/* Hamming weight of @mask is the number of layout types to return */
+	if (xdr_stream_encode_u32(xdr, hweight_long(mask)) != XDR_UNIT)
+		return nfserr_resource;
+	for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i)
+		if (mask & BIT(i)) {
+			/* layouttype4 */
+			if (xdr_stream_encode_u32(xdr, i) != XDR_UNIT)
+				return nfserr_resource;
+		}
+	return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4_layout_types(struct xdr_stream *xdr,
+					       const struct nfsd4_fattr_args *args)
+{
+	unsigned long mask = args->exp->ex_layout_types;
+	int i;
+
+	/* Hamming weight of @mask is the number of layout types to return */
+	if (xdr_stream_encode_u32(xdr, hweight_long(mask)) != XDR_UNIT)
+		return nfserr_resource;
+	for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i)
+		if (mask & BIT(i)) {
+			/* layouttype4 */
+			if (xdr_stream_encode_u32(xdr, i) != XDR_UNIT)
+				return nfserr_resource;
+		}
+	return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4_layout_blksize(struct xdr_stream *xdr,
+						 const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_uint32_t(xdr, args->stat.blksize);
+}
+
+#endif
+
+static __be32 nfsd4_encode_fattr4_suppattr_exclcreat(struct xdr_stream *xdr,
+						     const struct nfsd4_fattr_args *args)
+{
+	struct nfsd4_compoundres *resp = args->rqstp->rq_resp;
+	u32 supp[3];
+
+	memcpy(supp, nfsd_suppattrs[resp->cstate.minorversion], sizeof(supp));
+	supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0;
+	supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1;
+	supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2;
+
+	return nfsd4_encode_bitmap4(xdr, supp[0], supp[1], supp[2]);
+}
+
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+static __be32 nfsd4_encode_fattr4_sec_label(struct xdr_stream *xdr,
+					    const struct nfsd4_fattr_args *args)
+{
+	return nfsd4_encode_security_label(xdr, args->rqstp,
+					   args->context, args->contextlen);
+}
+#endif
+
+static __be32 nfsd4_encode_fattr4_xattr_support(struct xdr_stream *xdr,
+						const struct nfsd4_fattr_args *args)
+{
+	int err = xattr_supports_user_prefix(d_inode(args->dentry));
+
+	return nfsd4_encode_bool(xdr, err == 0);
+}
+
+static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = {
+	[FATTR4_SUPPORTED_ATTRS]	= nfsd4_encode_fattr4_supported_attrs,
+	[FATTR4_TYPE]			= nfsd4_encode_fattr4_type,
+	[FATTR4_FH_EXPIRE_TYPE]		= nfsd4_encode_fattr4_fh_expire_type,
+	[FATTR4_CHANGE]			= nfsd4_encode_fattr4_change,
+	[FATTR4_SIZE]			= nfsd4_encode_fattr4_size,
+	[FATTR4_LINK_SUPPORT]		= nfsd4_encode_fattr4__true,
+	[FATTR4_SYMLINK_SUPPORT]	= nfsd4_encode_fattr4__true,
+	[FATTR4_NAMED_ATTR]		= nfsd4_encode_fattr4__false,
+	[FATTR4_FSID]			= nfsd4_encode_fattr4_fsid,
+	[FATTR4_UNIQUE_HANDLES]		= nfsd4_encode_fattr4__true,
+	[FATTR4_LEASE_TIME]		= nfsd4_encode_fattr4_lease_time,
+	[FATTR4_RDATTR_ERROR]		= nfsd4_encode_fattr4_rdattr_error,
+	[FATTR4_ACL]			= nfsd4_encode_fattr4_acl,
+	[FATTR4_ACLSUPPORT]		= nfsd4_encode_fattr4_aclsupport,
+	[FATTR4_ARCHIVE]		= nfsd4_encode_fattr4__noop,
+	[FATTR4_CANSETTIME]		= nfsd4_encode_fattr4__true,
+	[FATTR4_CASE_INSENSITIVE]	= nfsd4_encode_fattr4__false,
+	[FATTR4_CASE_PRESERVING]	= nfsd4_encode_fattr4__true,
+	[FATTR4_CHOWN_RESTRICTED]	= nfsd4_encode_fattr4__true,
+	[FATTR4_FILEHANDLE]		= nfsd4_encode_fattr4_filehandle,
+	[FATTR4_FILEID]			= nfsd4_encode_fattr4_fileid,
+	[FATTR4_FILES_AVAIL]		= nfsd4_encode_fattr4_files_avail,
+	[FATTR4_FILES_FREE]		= nfsd4_encode_fattr4_files_free,
+	[FATTR4_FILES_TOTAL]		= nfsd4_encode_fattr4_files_total,
+	[FATTR4_FS_LOCATIONS]		= nfsd4_encode_fattr4_fs_locations,
+	[FATTR4_HIDDEN]			= nfsd4_encode_fattr4__noop,
+	[FATTR4_HOMOGENEOUS]		= nfsd4_encode_fattr4__true,
+	[FATTR4_MAXFILESIZE]		= nfsd4_encode_fattr4_maxfilesize,
+	[FATTR4_MAXLINK]		= nfsd4_encode_fattr4_maxlink,
+	[FATTR4_MAXNAME]		= nfsd4_encode_fattr4_maxname,
+	[FATTR4_MAXREAD]		= nfsd4_encode_fattr4_maxread,
+	[FATTR4_MAXWRITE]		= nfsd4_encode_fattr4_maxwrite,
+	[FATTR4_MIMETYPE]		= nfsd4_encode_fattr4__noop,
+	[FATTR4_MODE]			= nfsd4_encode_fattr4_mode,
+	[FATTR4_NO_TRUNC]		= nfsd4_encode_fattr4__true,
+	[FATTR4_NUMLINKS]		= nfsd4_encode_fattr4_numlinks,
+	[FATTR4_OWNER]			= nfsd4_encode_fattr4_owner,
+	[FATTR4_OWNER_GROUP]		= nfsd4_encode_fattr4_owner_group,
+	[FATTR4_QUOTA_AVAIL_HARD]	= nfsd4_encode_fattr4__noop,
+	[FATTR4_QUOTA_AVAIL_SOFT]	= nfsd4_encode_fattr4__noop,
+	[FATTR4_QUOTA_USED]		= nfsd4_encode_fattr4__noop,
+	[FATTR4_RAWDEV]			= nfsd4_encode_fattr4_rawdev,
+	[FATTR4_SPACE_AVAIL]		= nfsd4_encode_fattr4_space_avail,
+	[FATTR4_SPACE_FREE]		= nfsd4_encode_fattr4_space_free,
+	[FATTR4_SPACE_TOTAL]		= nfsd4_encode_fattr4_space_total,
+	[FATTR4_SPACE_USED]		= nfsd4_encode_fattr4_space_used,
+	[FATTR4_SYSTEM]			= nfsd4_encode_fattr4__noop,
+	[FATTR4_TIME_ACCESS]		= nfsd4_encode_fattr4_time_access,
+	[FATTR4_TIME_ACCESS_SET]	= nfsd4_encode_fattr4__noop,
+	[FATTR4_TIME_BACKUP]		= nfsd4_encode_fattr4__noop,
+	[FATTR4_TIME_CREATE]		= nfsd4_encode_fattr4_time_create,
+	[FATTR4_TIME_DELTA]		= nfsd4_encode_fattr4_time_delta,
+	[FATTR4_TIME_METADATA]		= nfsd4_encode_fattr4_time_metadata,
+	[FATTR4_TIME_MODIFY]		= nfsd4_encode_fattr4_time_modify,
+	[FATTR4_TIME_MODIFY_SET]	= nfsd4_encode_fattr4__noop,
+	[FATTR4_MOUNTED_ON_FILEID]	= nfsd4_encode_fattr4_mounted_on_fileid,
+	[FATTR4_DIR_NOTIF_DELAY]	= nfsd4_encode_fattr4__noop,
+	[FATTR4_DIRENT_NOTIF_DELAY]	= nfsd4_encode_fattr4__noop,
+	[FATTR4_DACL]			= nfsd4_encode_fattr4__noop,
+	[FATTR4_SACL]			= nfsd4_encode_fattr4__noop,
+	[FATTR4_CHANGE_POLICY]		= nfsd4_encode_fattr4__noop,
+	[FATTR4_FS_STATUS]		= nfsd4_encode_fattr4__noop,
+
+#ifdef CONFIG_NFSD_PNFS
+	[FATTR4_FS_LAYOUT_TYPES]	= nfsd4_encode_fattr4_fs_layout_types,
+	[FATTR4_LAYOUT_HINT]		= nfsd4_encode_fattr4__noop,
+	[FATTR4_LAYOUT_TYPES]		= nfsd4_encode_fattr4_layout_types,
+	[FATTR4_LAYOUT_BLKSIZE]		= nfsd4_encode_fattr4_layout_blksize,
+	[FATTR4_LAYOUT_ALIGNMENT]	= nfsd4_encode_fattr4__noop,
+#else
+	[FATTR4_FS_LAYOUT_TYPES]	= nfsd4_encode_fattr4__noop,
+	[FATTR4_LAYOUT_HINT]		= nfsd4_encode_fattr4__noop,
+	[FATTR4_LAYOUT_TYPES]		= nfsd4_encode_fattr4__noop,
+	[FATTR4_LAYOUT_BLKSIZE]		= nfsd4_encode_fattr4__noop,
+	[FATTR4_LAYOUT_ALIGNMENT]	= nfsd4_encode_fattr4__noop,
+#endif
+
+	[FATTR4_FS_LOCATIONS_INFO]	= nfsd4_encode_fattr4__noop,
+	[FATTR4_MDSTHRESHOLD]		= nfsd4_encode_fattr4__noop,
+	[FATTR4_RETENTION_GET]		= nfsd4_encode_fattr4__noop,
+	[FATTR4_RETENTION_SET]		= nfsd4_encode_fattr4__noop,
+	[FATTR4_RETENTEVT_GET]		= nfsd4_encode_fattr4__noop,
+	[FATTR4_RETENTEVT_SET]		= nfsd4_encode_fattr4__noop,
+	[FATTR4_RETENTION_HOLD]		= nfsd4_encode_fattr4__noop,
+	[FATTR4_MODE_SET_MASKED]	= nfsd4_encode_fattr4__noop,
+	[FATTR4_SUPPATTR_EXCLCREAT]	= nfsd4_encode_fattr4_suppattr_exclcreat,
+	[FATTR4_FS_CHARSET_CAP]		= nfsd4_encode_fattr4__noop,
+	[FATTR4_CLONE_BLKSIZE]		= nfsd4_encode_fattr4__noop,
+	[FATTR4_SPACE_FREED]		= nfsd4_encode_fattr4__noop,
+	[FATTR4_CHANGE_ATTR_TYPE]	= nfsd4_encode_fattr4__noop,
+
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+	[FATTR4_SEC_LABEL]		= nfsd4_encode_fattr4_sec_label,
+#else
+	[FATTR4_SEC_LABEL]		= nfsd4_encode_fattr4__noop,
+#endif
+
+	[FATTR4_MODE_UMASK]		= nfsd4_encode_fattr4__noop,
+	[FATTR4_XATTR_SUPPORT]		= nfsd4_encode_fattr4_xattr_support,
+};
+
 /*
  * Note: @fhp can be NULL; in this case, we might have to compose the filehandle
  * ourselves.
  */
 static __be32
-nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
-		struct svc_export *exp,
-		struct dentry *dentry, u32 *bmval,
-		struct svc_rqst *rqstp, int ignore_crossmnt)
+nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
+		    struct svc_fh *fhp, struct svc_export *exp,
+		    struct dentry *dentry, const u32 *bmval,
+		    int ignore_crossmnt)
 {
-	u32 bmval0 = bmval[0];
-	u32 bmval1 = bmval[1];
-	u32 bmval2 = bmval[2];
-	struct kstat stat;
+	struct nfsd4_fattr_args args = {};	/* out: frees acl, context */
 	struct svc_fh *tempfh = NULL;
-	struct kstatfs statfs;
-	__be32 *p, *attrlen_p;
 	int starting_len = xdr->buf->len;
+	__be32 *attrlen_p, status;
 	int attrlen_offset;
-	u32 dummy;
-	u64 dummy64;
-	u32 rdattr_err = 0;
-	__be32 status;
 	int err;
-	struct nfs4_acl *acl = NULL;
-#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
-	void *context = NULL;
-	int contextlen;
-#endif
-	bool contextsupport = false;
 	struct nfsd4_compoundres *resp = rqstp->rq_resp;
 	u32 minorversion = resp->cstate.minorversion;
 	struct path path = {
 		.mnt	= exp->ex_path.mnt,
 		.dentry	= dentry,
 	};
-	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+	union {
+		u32		attrmask[3];
+		unsigned long	mask[2];
+	} u;
+	bool file_modified = false;	/* only the deleg check may set it */
+	unsigned long bit;
+	u64 size = 0;
+
+	WARN_ON_ONCE(bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1);
+	WARN_ON_ONCE(!nfsd_attrs_supported(minorversion, bmval));
+
+	args.rqstp = rqstp;
+	args.exp = exp;
+	args.dentry = dentry;
+	args.ignore_crossmnt = (ignore_crossmnt != 0);
 
-	BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
-	BUG_ON(!nfsd_attrs_supported(minorversion, bmval));
+	/*
+	 * Make a local copy of the attribute bitmap that can be modified.
+	 */
+	memset(&u, 0, sizeof(u));
+	u.attrmask[0] = bmval[0];
+	u.attrmask[1] = bmval[1];
+	u.attrmask[2] = bmval[2];
 
+	args.rdattr_err = 0;
 	if (exp->ex_fslocs.migrated) {
-		status = fattr_handle_absent_fs(&bmval0, &bmval1, &bmval2, &rdattr_err);
+		status = fattr_handle_absent_fs(&u.attrmask[0], &u.attrmask[1],
+						&u.attrmask[2], &args.rdattr_err);
 		if (status)
 			goto out;
 	}
-	if (bmval0 & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) {
-		status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry));
+	args.size = 0;
+	if (u.attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) {
+		status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry),
+						      &file_modified, &size);
 		if (status)
 			goto out;
 	}
 
-	err = vfs_getattr(&path, &stat,
+	err = vfs_getattr(&path, &args.stat,
 			  STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE,
 			  AT_STATX_SYNC_AS_STAT);
 	if (err)
 		goto out_nfserr;
-	if (!(stat.result_mask & STATX_BTIME))
+	args.size = file_modified ? size : args.stat.size;
+
+	if (!(args.stat.result_mask & STATX_BTIME))
 		/* underlying FS does not offer btime so we can't share it */
-		bmval1 &= ~FATTR4_WORD1_TIME_CREATE;
-	if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
+		u.attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE;
+	if ((u.attrmask[0] & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
 			FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) ||
-	    (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
+	    (u.attrmask[1] & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
 		       FATTR4_WORD1_SPACE_TOTAL))) {
-		err = vfs_statfs(&path, &statfs);
+		err = vfs_statfs(&path, &args.statfs);
 		if (err)
 			goto out_nfserr;
 	}
-	if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) {
+	if ((u.attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) &&
+	    !fhp) {
 		tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
 		status = nfserr_jukebox;
 		if (!tempfh)
@@ -3015,12 +3568,15 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
 		status = fh_compose(tempfh, exp, dentry, NULL);
 		if (status)
 			goto out;
-		fhp = tempfh;
-	}
-	if (bmval0 & FATTR4_WORD0_ACL) {
-		err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl);
+		args.fhp = tempfh;
+	} else
+		args.fhp = fhp;
+
+	args.acl = NULL;
+	if (u.attrmask[0] & FATTR4_WORD0_ACL) {
+		err = nfsd4_get_nfs4_acl(rqstp, dentry, &args.acl);
 		if (err == -EOPNOTSUPP)
-			bmval0 &= ~FATTR4_WORD0_ACL;
+			u.attrmask[0] &= ~FATTR4_WORD0_ACL;
 		else if (err == -EINVAL) {
 			status = nfserr_attrnotsupp;
 			goto out;
@@ -3028,452 +3584,53 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
 			goto out_nfserr;
 	}
 
+	args.contextsupport = false;
+
 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
-	if ((bmval2 & FATTR4_WORD2_SECURITY_LABEL) ||
-	     bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
+	args.context = NULL;
+	if ((u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) ||
+	     u.attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) {
 		if (exp->ex_flags & NFSEXP_SECURITY_LABEL)
 			err = security_inode_getsecctx(d_inode(dentry),
-						&context, &contextlen);
+						&args.context, &args.contextlen);
 		else
 			err = -EOPNOTSUPP;
-		contextsupport = (err == 0);
-		if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
+		args.contextsupport = (err == 0);
+		if (u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) {
 			if (err == -EOPNOTSUPP)
-				bmval2 &= ~FATTR4_WORD2_SECURITY_LABEL;
+				u.attrmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
 			else if (err)
 				goto out_nfserr;
 		}
 	}
 #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
 
-	status = nfsd4_encode_bitmap(xdr, bmval0, bmval1, bmval2);
+	/* attrmask */
+	status = nfsd4_encode_bitmap4(xdr, u.attrmask[0],
+				      u.attrmask[1], u.attrmask[2]);
 	if (status)
 		goto out;
 
+	/* attr_vals */
 	attrlen_offset = xdr->buf->len;
 	attrlen_p = xdr_reserve_space(xdr, XDR_UNIT);
 	if (!attrlen_p)
 		goto out_resource;
-
-	if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
-		u32 supp[3];
-
-		memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp));
-
-		if (!IS_POSIXACL(dentry->d_inode))
-			supp[0] &= ~FATTR4_WORD0_ACL;
-		if (!contextsupport)
-			supp[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
-		if (!supp[2]) {
-			p = xdr_reserve_space(xdr, 12);
-			if (!p)
-				goto out_resource;
-			*p++ = cpu_to_be32(2);
-			*p++ = cpu_to_be32(supp[0]);
-			*p++ = cpu_to_be32(supp[1]);
-		} else {
-			p = xdr_reserve_space(xdr, 16);
-			if (!p)
-				goto out_resource;
-			*p++ = cpu_to_be32(3);
-			*p++ = cpu_to_be32(supp[0]);
-			*p++ = cpu_to_be32(supp[1]);
-			*p++ = cpu_to_be32(supp[2]);
-		}
-	}
-	if (bmval0 & FATTR4_WORD0_TYPE) {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			goto out_resource;
-		dummy = nfs4_file_type(stat.mode);
-		if (dummy == NF4BAD) {
-			status = nfserr_serverfault;
+	for (bit = 0; bit < ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops); bit++) {
+		status = !(u.attrmask[bit / 32] & (1U << (bit % 32))) ? nfs_ok :
+			 nfsd4_enc_fattr4_encode_ops[bit](xdr, &args);
+		if (status != nfs_ok)
 			goto out;
-		}
-		*p++ = cpu_to_be32(dummy);
-	}
-	if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			goto out_resource;
-		if (exp->ex_flags & NFSEXP_NOSUBTREECHECK)
-			*p++ = cpu_to_be32(NFS4_FH_PERSISTENT);
-		else
-			*p++ = cpu_to_be32(NFS4_FH_PERSISTENT|
-						NFS4_FH_VOL_RENAME);
 	}
-	if (bmval0 & FATTR4_WORD0_CHANGE) {
-		p = xdr_reserve_space(xdr, 8);
-		if (!p)
-			goto out_resource;
-		p = encode_change(p, &stat, d_inode(dentry), exp);
-	}
-	if (bmval0 & FATTR4_WORD0_SIZE) {
-		p = xdr_reserve_space(xdr, 8);
-		if (!p)
-			goto out_resource;
-		p = xdr_encode_hyper(p, stat.size);
-	}
-	if (bmval0 & FATTR4_WORD0_LINK_SUPPORT) {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			goto out_resource;
-		*p++ = cpu_to_be32(1);
-	}
-	if (bmval0 & FATTR4_WORD0_SYMLINK_SUPPORT) {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			goto out_resource;
-		*p++ = cpu_to_be32(1);
-	}
-	if (bmval0 & FATTR4_WORD0_NAMED_ATTR) {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			goto out_resource;
-		*p++ = cpu_to_be32(0);
-	}
-	if (bmval0 & FATTR4_WORD0_FSID) {
-		p = xdr_reserve_space(xdr, 16);
-		if (!p)
-			goto out_resource;
-		if (exp->ex_fslocs.migrated) {
-			p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MAJOR);
-			p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MINOR);
-		} else switch(fsid_source(fhp)) {
-		case FSIDSOURCE_FSID:
-			p = xdr_encode_hyper(p, (u64)exp->ex_fsid);
-			p = xdr_encode_hyper(p, (u64)0);
-			break;
-		case FSIDSOURCE_DEV:
-			*p++ = cpu_to_be32(0);
-			*p++ = cpu_to_be32(MAJOR(stat.dev));
-			*p++ = cpu_to_be32(0);
-			*p++ = cpu_to_be32(MINOR(stat.dev));
-			break;
-		case FSIDSOURCE_UUID:
-			p = xdr_encode_opaque_fixed(p, exp->ex_uuid,
-								EX_UUID_LEN);
-			break;
-		}
-	}
-	if (bmval0 & FATTR4_WORD0_UNIQUE_HANDLES) {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			goto out_resource;
-		*p++ = cpu_to_be32(0);
-	}
-	if (bmval0 & FATTR4_WORD0_LEASE_TIME) {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			goto out_resource;
-		*p++ = cpu_to_be32(nn->nfsd4_lease);
-	}
-	if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			goto out_resource;
-		*p++ = cpu_to_be32(rdattr_err);
-	}
-	if (bmval0 & FATTR4_WORD0_ACL) {
-		struct nfs4_ace *ace;
-
-		if (acl == NULL) {
-			p = xdr_reserve_space(xdr, 4);
-			if (!p)
-				goto out_resource;
-
-			*p++ = cpu_to_be32(0);
-			goto out_acl;
-		}
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			goto out_resource;
-		*p++ = cpu_to_be32(acl->naces);
-
-		for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) {
-			p = xdr_reserve_space(xdr, 4*3);
-			if (!p)
-				goto out_resource;
-			*p++ = cpu_to_be32(ace->type);
-			*p++ = cpu_to_be32(ace->flag);
-			*p++ = cpu_to_be32(ace->access_mask &
-							NFS4_ACE_MASK_ALL);
-			status = nfsd4_encode_aclname(xdr, rqstp, ace);
-			if (status)
-				goto out;
-		}
-	}
-out_acl:
-	if (bmval0 & FATTR4_WORD0_ACLSUPPORT) {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			goto out_resource;
-		*p++ = cpu_to_be32(IS_POSIXACL(dentry->d_inode) ?
-			ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0);
-	}
-	if (bmval0 & FATTR4_WORD0_CANSETTIME) {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			goto out_resource;
-		*p++ = cpu_to_be32(1);
-	}
-	if (bmval0 & FATTR4_WORD0_CASE_INSENSITIVE) {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			goto out_resource;
-		*p++ = cpu_to_be32(0);
-	}
-	if (bmval0 & FATTR4_WORD0_CASE_PRESERVING) {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			goto out_resource;
-		*p++ = cpu_to_be32(1);
-	}
-	if (bmval0 & FATTR4_WORD0_CHOWN_RESTRICTED) {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			goto out_resource;
-		*p++ = cpu_to_be32(1);
-	}
-	if (bmval0 & FATTR4_WORD0_FILEHANDLE) {
-		p = xdr_reserve_space(xdr, fhp->fh_handle.fh_size + 4);
-		if (!p)
-			goto out_resource;
-		p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw,
-					fhp->fh_handle.fh_size);
-	}
-	if (bmval0 & FATTR4_WORD0_FILEID) {
-		p = xdr_reserve_space(xdr, 8);
-		if (!p)
-			goto out_resource;
-		p = xdr_encode_hyper(p, stat.ino);
-	}
-	if (bmval0 & FATTR4_WORD0_FILES_AVAIL) {
-		p = xdr_reserve_space(xdr, 8);
-		if (!p)
-			goto out_resource;
-		p = xdr_encode_hyper(p, (u64) statfs.f_ffree);
-	}
-	if (bmval0 & FATTR4_WORD0_FILES_FREE) {
-		p = xdr_reserve_space(xdr, 8);
-		if (!p)
-			goto out_resource;
-		p = xdr_encode_hyper(p, (u64) statfs.f_ffree);
-	}
-	if (bmval0 & FATTR4_WORD0_FILES_TOTAL) {
-		p = xdr_reserve_space(xdr, 8);
-		if (!p)
-			goto out_resource;
-		p = xdr_encode_hyper(p, (u64) statfs.f_files);
-	}
-	if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) {
-		status = nfsd4_encode_fs_locations(xdr, rqstp, exp);
-		if (status)
-			goto out;
-	}
-	if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			goto out_resource;
-		*p++ = cpu_to_be32(1);
-	}
-	if (bmval0 & FATTR4_WORD0_MAXFILESIZE) {
-		p = xdr_reserve_space(xdr, 8);
-		if (!p)
-			goto out_resource;
-		p = xdr_encode_hyper(p, exp->ex_path.mnt->mnt_sb->s_maxbytes);
-	}
-	if (bmval0 & FATTR4_WORD0_MAXLINK) {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			goto out_resource;
-		*p++ = cpu_to_be32(255);
-	}
-	if (bmval0 & FATTR4_WORD0_MAXNAME) {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			goto out_resource;
-		*p++ = cpu_to_be32(statfs.f_namelen);
-	}
-	if (bmval0 & FATTR4_WORD0_MAXREAD) {
-		p = xdr_reserve_space(xdr, 8);
-		if (!p)
-			goto out_resource;
-		p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp));
-	}
-	if (bmval0 & FATTR4_WORD0_MAXWRITE) {
-		p = xdr_reserve_space(xdr, 8);
-		if (!p)
-			goto out_resource;
-		p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp));
-	}
-	if (bmval1 & FATTR4_WORD1_MODE) {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			goto out_resource;
-		*p++ = cpu_to_be32(stat.mode & S_IALLUGO);
-	}
-	if (bmval1 & FATTR4_WORD1_NO_TRUNC) {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			goto out_resource;
-		*p++ = cpu_to_be32(1);
-	}
-	if (bmval1 & FATTR4_WORD1_NUMLINKS) {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			goto out_resource;
-		*p++ = cpu_to_be32(stat.nlink);
-	}
-	if (bmval1 & FATTR4_WORD1_OWNER) {
-		status = nfsd4_encode_user(xdr, rqstp, stat.uid);
-		if (status)
-			goto out;
-	}
-	if (bmval1 & FATTR4_WORD1_OWNER_GROUP) {
-		status = nfsd4_encode_group(xdr, rqstp, stat.gid);
-		if (status)
-			goto out;
-	}
-	if (bmval1 & FATTR4_WORD1_RAWDEV) {
-		p = xdr_reserve_space(xdr, 8);
-		if (!p)
-			goto out_resource;
-		*p++ = cpu_to_be32((u32) MAJOR(stat.rdev));
-		*p++ = cpu_to_be32((u32) MINOR(stat.rdev));
-	}
-	if (bmval1 & FATTR4_WORD1_SPACE_AVAIL) {
-		p = xdr_reserve_space(xdr, 8);
-		if (!p)
-			goto out_resource;
-		dummy64 = (u64)statfs.f_bavail * (u64)statfs.f_bsize;
-		p = xdr_encode_hyper(p, dummy64);
-	}
-	if (bmval1 & FATTR4_WORD1_SPACE_FREE) {
-		p = xdr_reserve_space(xdr, 8);
-		if (!p)
-			goto out_resource;
-		dummy64 = (u64)statfs.f_bfree * (u64)statfs.f_bsize;
-		p = xdr_encode_hyper(p, dummy64);
-	}
-	if (bmval1 & FATTR4_WORD1_SPACE_TOTAL) {
-		p = xdr_reserve_space(xdr, 8);
-		if (!p)
-			goto out_resource;
-		dummy64 = (u64)statfs.f_blocks * (u64)statfs.f_bsize;
-		p = xdr_encode_hyper(p, dummy64);
-	}
-	if (bmval1 & FATTR4_WORD1_SPACE_USED) {
-		p = xdr_reserve_space(xdr, 8);
-		if (!p)
-			goto out_resource;
-		dummy64 = (u64)stat.blocks << 9;
-		p = xdr_encode_hyper(p, dummy64);
-	}
-	if (bmval1 & FATTR4_WORD1_TIME_ACCESS) {
-		status = nfsd4_encode_nfstime4(xdr, &stat.atime);
-		if (status)
-			goto out;
-	}
-	if (bmval1 & FATTR4_WORD1_TIME_CREATE) {
-		status = nfsd4_encode_nfstime4(xdr, &stat.btime);
-		if (status)
-			goto out;
-	}
-	if (bmval1 & FATTR4_WORD1_TIME_DELTA) {
-		p = xdr_reserve_space(xdr, 12);
-		if (!p)
-			goto out_resource;
-		p = encode_time_delta(p, d_inode(dentry));
-	}
-	if (bmval1 & FATTR4_WORD1_TIME_METADATA) {
-		status = nfsd4_encode_nfstime4(xdr, &stat.ctime);
-		if (status)
-			goto out;
-	}
-	if (bmval1 & FATTR4_WORD1_TIME_MODIFY) {
-		status = nfsd4_encode_nfstime4(xdr, &stat.mtime);
-		if (status)
-			goto out;
-	}
-	if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
-		u64 ino = stat.ino;
-
-		p = xdr_reserve_space(xdr, 8);
-		if (!p)
-                	goto out_resource;
-		/*
-		 * Get ino of mountpoint in parent filesystem, if not ignoring
-		 * crossmount and this is the root of a cross-mounted
-		 * filesystem.
-		 */
-		if (ignore_crossmnt == 0 &&
-		    dentry == exp->ex_path.mnt->mnt_root) {
-			err = nfsd4_get_mounted_on_ino(exp, &ino);
-			if (err)
-				goto out_nfserr;
-		}
-		p = xdr_encode_hyper(p, ino);
-	}
-#ifdef CONFIG_NFSD_PNFS
-	if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) {
-		status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types);
-		if (status)
-			goto out;
-	}
-
-	if (bmval2 & FATTR4_WORD2_LAYOUT_TYPES) {
-		status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types);
-		if (status)
-			goto out;
-	}
-
-	if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			goto out_resource;
-		*p++ = cpu_to_be32(stat.blksize);
-	}
-#endif /* CONFIG_NFSD_PNFS */
-	if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
-		u32 supp[3];
-
-		memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp));
-		supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0;
-		supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1;
-		supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2;
-
-		status = nfsd4_encode_bitmap(xdr, supp[0], supp[1], supp[2]);
-		if (status)
-			goto out;
-	}
-
-#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
-	if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
-		status = nfsd4_encode_security_label(xdr, rqstp, context,
-								contextlen);
-		if (status)
-			goto out;
-	}
-#endif
-
-	if (bmval2 & FATTR4_WORD2_XATTR_SUPPORT) {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			goto out_resource;
-		err = xattr_supports_user_prefix(d_inode(dentry));
-		*p++ = cpu_to_be32(err == 0);
-	}
-
 	*attrlen_p = cpu_to_be32(xdr->buf->len - attrlen_offset - XDR_UNIT);
 	status = nfs_ok;
 
 out:
 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
-	if (context)
-		security_release_secctx(context, contextlen);
+	if (args.context)
+		security_release_secctx(args.context, args.contextlen);
 #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
-	kfree(acl);
+	kfree(args.acl);
 	if (tempfh) {
 		fh_put(tempfh);
 		kfree(tempfh);
@@ -3514,12 +3671,28 @@ __be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words,
 	__be32 ret;
 
 	svcxdr_init_encode_from_buffer(&xdr, &dummy, *p, words << 2);
-	ret = nfsd4_encode_fattr(&xdr, fhp, exp, dentry, bmval, rqstp,
-							ignore_crossmnt);
+	ret = nfsd4_encode_fattr4(rqstp, &xdr, fhp, exp, dentry, bmval,
+				  ignore_crossmnt);
 	*p = xdr.p;
 	return ret;
 }
 
+/*
+ * The buffer space for this field was reserved during a previous
+ * call to nfsd4_encode_entry4().
+ */
+static void nfsd4_encode_entry4_nfs_cookie4(const struct nfsd4_readdir *readdir,
+					    u64 offset)
+{
+	__be64 cookie = cpu_to_be64(offset);
+	struct xdr_stream *xdr = readdir->xdr;
+
+	if (!readdir->cookie_offset)
+		return;
+	write_bytes_to_xdr_buf(xdr->buf, readdir->cookie_offset, &cookie,
+			       sizeof(cookie));
+}
+
 static inline int attributes_need_mount(u32 *bmval)
 {
 	if (bmval[0] & ~(FATTR4_WORD0_RDATTR_ERROR | FATTR4_WORD0_LEASE_TIME))
@@ -3530,8 +3703,8 @@ static inline int attributes_need_mount(u32 *bmval)
 }
 
 static __be32
-nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
-			const char *name, int namlen)
+nfsd4_encode_entry4_fattr(struct nfsd4_readdir *cd, const char *name,
+			  int namlen)
 {
 	struct svc_export *exp = cd->rd_fhp->fh_export;
 	struct dentry *dentry;
@@ -3574,33 +3747,34 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
 
 	}
 out_encode:
-	nfserr = nfsd4_encode_fattr(xdr, NULL, exp, dentry, cd->rd_bmval,
-					cd->rd_rqstp, ignore_crossmnt);
+	nfserr = nfsd4_encode_fattr4(cd->rd_rqstp, cd->xdr, NULL, exp, dentry,
+				     cd->rd_bmval, ignore_crossmnt);
 out_put:
 	dput(dentry);
 	exp_put(exp);
 	return nfserr;
 }
 
-static __be32 *
-nfsd4_encode_rdattr_error(struct xdr_stream *xdr, __be32 nfserr)
+static __be32
+nfsd4_encode_entry4_rdattr_error(struct xdr_stream *xdr, __be32 nfserr)
 {
-	__be32 *p;
-
-	p = xdr_reserve_space(xdr, 20);
-	if (!p)
-		return NULL;
-	*p++ = htonl(2);
-	*p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */
-	*p++ = htonl(0);			 /* bmval1 */
+	__be32 status;
 
-	*p++ = htonl(4);     /* attribute length */
-	*p++ = nfserr;       /* no htonl */
-	return p;
+	/* attrmask */
+	status = nfsd4_encode_bitmap4(xdr, FATTR4_WORD0_RDATTR_ERROR, 0, 0);
+	if (status != nfs_ok)
+		return status;
+	/* attr_vals */
+	if (xdr_stream_encode_u32(xdr, XDR_UNIT) != XDR_UNIT)
+		return nfserr_resource;
+	/* rdattr_error */
+	if (xdr_stream_encode_be32(xdr, nfserr) != XDR_UNIT)
+		return nfserr_resource;
+	return nfs_ok;
 }
 
 static int
-nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
+nfsd4_encode_entry4(void *ccdv, const char *name, int namlen,
 		    loff_t offset, u64 ino, unsigned int d_type)
 {
 	struct readdir_cd *ccd = ccdv;
@@ -3611,8 +3785,6 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
 	u32 name_and_cookie;
 	int entry_bytes;
 	__be32 nfserr = nfserr_toosmall;
-	__be64 wire_offset;
-	__be32 *p;
 
 	/* In nfsv4, "." and ".." never make it onto the wire.. */
 	if (name && isdotent(name, namlen)) {
@@ -3620,24 +3792,19 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
 		return 0;
 	}
 
-	if (cd->cookie_offset) {
-		wire_offset = cpu_to_be64(offset);
-		write_bytes_to_xdr_buf(xdr->buf, cd->cookie_offset,
-							&wire_offset, 8);
-	}
+	/* Encode the previous entry's cookie value */
+	nfsd4_encode_entry4_nfs_cookie4(cd, offset);
 
-	p = xdr_reserve_space(xdr, 4);
-	if (!p)
+	if (xdr_stream_encode_item_present(xdr) != XDR_UNIT)
 		goto fail;
-	*p++ = xdr_one;                             /* mark entry present */
+
+	/* Reserve send buffer space for this entry's cookie value. */
 	cookie_offset = xdr->buf->len;
-	p = xdr_reserve_space(xdr, 3*4 + namlen);
-	if (!p)
+	if (nfsd4_encode_nfs_cookie4(xdr, OFFSET_MAX) != nfs_ok)
 		goto fail;
-	p = xdr_encode_hyper(p, OFFSET_MAX);        /* offset of next entry */
-	p = xdr_encode_array(p, name, namlen);      /* name length & name */
-
-	nfserr = nfsd4_encode_dirent_fattr(xdr, cd, name, namlen);
+	if (nfsd4_encode_component4(xdr, name, namlen) != nfs_ok)
+		goto fail;
+	nfserr = nfsd4_encode_entry4_fattr(cd, name, namlen);
 	switch (nfserr) {
 	case nfs_ok:
 		break;
@@ -3668,8 +3835,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
 		 */
 		if (!(cd->rd_bmval[0] & FATTR4_WORD0_RDATTR_ERROR))
 			goto fail;
-		p = nfsd4_encode_rdattr_error(xdr, nfserr);
-		if (p == NULL) {
+		if (nfsd4_encode_entry4_rdattr_error(xdr, nfserr)) {
 			nfserr = nfserr_toosmall;
 			goto fail;
 		}
@@ -3727,18 +3893,26 @@ nfsd4_encode_clientid4(struct xdr_stream *xdr, const clientid_t *clientid)
 	return nfs_ok;
 }
 
+/* This is a frequently-encoded item; open-coded for speed */
 static __be32
-nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
+nfsd4_encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid)
 {
 	__be32 *p;
 
-	p = xdr_reserve_space(xdr, sizeof(stateid_t));
+	p = xdr_reserve_space(xdr, NFS4_STATEID_SIZE);
 	if (!p)
 		return nfserr_resource;
 	*p++ = cpu_to_be32(sid->si_generation);
-	p = xdr_encode_opaque_fixed(p, &sid->si_opaque,
-					sizeof(stateid_opaque_t));
-	return 0;
+	memcpy(p, &sid->si_opaque, sizeof(sid->si_opaque));
+	return nfs_ok;
+}
+
+static __be32
+nfsd4_encode_sessionid4(struct xdr_stream *xdr,
+			const struct nfs4_sessionid *sessionid)
+{
+	return nfsd4_encode_opaque_fixed(xdr, sessionid->data,
+					 NFS4_MAX_SESSIONID_LEN);
 }
 
 static __be32
@@ -3747,14 +3921,14 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr,
 {
 	struct nfsd4_access *access = &u->access;
 	struct xdr_stream *xdr = resp->xdr;
-	__be32 *p;
+	__be32 status;
 
-	p = xdr_reserve_space(xdr, 8);
-	if (!p)
-		return nfserr_resource;
-	*p++ = cpu_to_be32(access->ac_supported);
-	*p++ = cpu_to_be32(access->ac_resp_access);
-	return 0;
+	/* supported */
+	status = nfsd4_encode_uint32_t(xdr, access->ac_supported);
+	if (status != nfs_ok)
+		return status;
+	/* access */
+	return nfsd4_encode_uint32_t(xdr, access->ac_resp_access);
 }
 
 static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr,
@@ -3762,17 +3936,16 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp,
 {
 	struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session;
 	struct xdr_stream *xdr = resp->xdr;
-	__be32 *p;
 
-	p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 8);
-	if (!p)
+	/* bctsr_sessid */
+	nfserr = nfsd4_encode_sessionid4(xdr, &bcts->sessionid);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	/* bctsr_dir */
+	if (xdr_stream_encode_u32(xdr, bcts->dir) != XDR_UNIT)
 		return nfserr_resource;
-	p = xdr_encode_opaque_fixed(p, bcts->sessionid.data,
-					NFS4_MAX_SESSIONID_LEN);
-	*p++ = cpu_to_be32(bcts->dir);
-	/* Upshifting from TCP to RDMA is not supported */
-	*p++ = cpu_to_be32(0);
-	return 0;
+	/* bctsr_use_conn_in_rdma_mode */
+	return nfsd4_encode_bool(xdr, false);
 }
 
 static __be32
@@ -3782,7 +3955,8 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr,
 	struct nfsd4_close *close = &u->close;
 	struct xdr_stream *xdr = resp->xdr;
 
-	return nfsd4_encode_stateid(xdr, &close->cl_stateid);
+	/* open_stateid */
+	return nfsd4_encode_stateid4(xdr, &close->cl_stateid);
 }
 
 
@@ -3802,11 +3976,13 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr,
 	struct nfsd4_create *create = &u->create;
 	struct xdr_stream *xdr = resp->xdr;
 
+	/* cinfo */
 	nfserr = nfsd4_encode_change_info4(xdr, &create->cr_cinfo);
 	if (nfserr)
 		return nfserr;
-	return nfsd4_encode_bitmap(xdr, create->cr_bmval[0],
-			create->cr_bmval[1], create->cr_bmval[2]);
+	/* attrset */
+	return nfsd4_encode_bitmap4(xdr, create->cr_bmval[0],
+				    create->cr_bmval[1], create->cr_bmval[2]);
 }
 
 static __be32
@@ -3817,65 +3993,56 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr,
 	struct svc_fh *fhp = getattr->ga_fhp;
 	struct xdr_stream *xdr = resp->xdr;
 
-	return nfsd4_encode_fattr(xdr, fhp, fhp->fh_export, fhp->fh_dentry,
-				    getattr->ga_bmval, resp->rqstp, 0);
+	/* obj_attributes */
+	return nfsd4_encode_fattr4(resp->rqstp, xdr, fhp, fhp->fh_export,
+				   fhp->fh_dentry, getattr->ga_bmval, 0);
 }
 
 static __be32
 nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr,
 		   union nfsd4_op_u *u)
 {
-	struct svc_fh **fhpp = &u->getfh;
 	struct xdr_stream *xdr = resp->xdr;
-	struct svc_fh *fhp = *fhpp;
-	unsigned int len;
-	__be32 *p;
+	struct svc_fh *fhp = u->getfh;
 
-	len = fhp->fh_handle.fh_size;
-	p = xdr_reserve_space(xdr, len + 4);
-	if (!p)
-		return nfserr_resource;
-	p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw, len);
-	return 0;
+	/* object */
+	return nfsd4_encode_nfs_fh4(xdr, &fhp->fh_handle);
 }
 
-/*
-* Including all fields other than the name, a LOCK4denied structure requires
-*   8(clientid) + 4(namelen) + 8(offset) + 8(length) + 4(type) = 32 bytes.
-*/
 static __be32
-nfsd4_encode_lock_denied(struct xdr_stream *xdr, struct nfsd4_lock_denied *ld)
+nfsd4_encode_lock_owner4(struct xdr_stream *xdr, const clientid_t *clientid,
+			 const struct xdr_netobj *owner)
 {
-	struct xdr_netobj *conf = &ld->ld_owner;
-	__be32 *p;
+	__be32 status;
 
-again:
-	p = xdr_reserve_space(xdr, 32 + XDR_LEN(conf->len));
-	if (!p) {
-		/*
-		 * Don't fail to return the result just because we can't
-		 * return the conflicting open:
-		 */
-		if (conf->len) {
-			kfree(conf->data);
-			conf->len = 0;
-			conf->data = NULL;
-			goto again;
-		}
+	/* clientid */
+	status = nfsd4_encode_clientid4(xdr, clientid);
+	if (status != nfs_ok)
+		return status;
+	/* owner */
+	return nfsd4_encode_opaque(xdr, owner->data, owner->len);
+}
+
+static __be32
+nfsd4_encode_lock4denied(struct xdr_stream *xdr,
+			 const struct nfsd4_lock_denied *ld)
+{
+	__be32 status;
+
+	/* offset */
+	status = nfsd4_encode_offset4(xdr, ld->ld_start);
+	if (status != nfs_ok)
+		return status;
+	/* length */
+	status = nfsd4_encode_length4(xdr, ld->ld_length);
+	if (status != nfs_ok)
+		return status;
+	/* locktype */
+	if (xdr_stream_encode_u32(xdr, ld->ld_type) != XDR_UNIT)
 		return nfserr_resource;
-	}
-	p = xdr_encode_hyper(p, ld->ld_start);
-	p = xdr_encode_hyper(p, ld->ld_length);
-	*p++ = cpu_to_be32(ld->ld_type);
-	if (conf->len) {
-		p = xdr_encode_opaque_fixed(p, &ld->ld_clientid, 8);
-		p = xdr_encode_opaque(p, conf->data, conf->len);
-		kfree(conf->data);
-	}  else {  /* non - nfsv4 lock in conflict, no clientid nor owner */
-		p = xdr_encode_hyper(p, (u64)0); /* clientid */
-		*p++ = cpu_to_be32(0); /* length of owner name */
-	}
-	return nfserr_denied;
+	/* owner */
+	return nfsd4_encode_lock_owner4(xdr, &ld->ld_clientid,
+					&ld->ld_owner);
 }
 
 static __be32
@@ -3884,13 +4051,21 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr,
 {
 	struct nfsd4_lock *lock = &u->lock;
 	struct xdr_stream *xdr = resp->xdr;
+	__be32 status;
 
-	if (!nfserr)
-		nfserr = nfsd4_encode_stateid(xdr, &lock->lk_resp_stateid);
-	else if (nfserr == nfserr_denied)
-		nfserr = nfsd4_encode_lock_denied(xdr, &lock->lk_denied);
-
-	return nfserr;
+	switch (nfserr) {
+	case nfs_ok:
+		/* resok4 */
+		status = nfsd4_encode_stateid4(xdr, &lock->lk_resp_stateid);
+		break;
+	case nfserr_denied:
+		/* denied */
+		status = nfsd4_encode_lock4denied(xdr, &lock->lk_denied);
+		break;
+	default:
+		return nfserr;
+	}
+	return status != nfs_ok ? status : nfserr;
 }
 
 static __be32
@@ -3899,9 +4074,14 @@ nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr,
 {
 	struct nfsd4_lockt *lockt = &u->lockt;
 	struct xdr_stream *xdr = resp->xdr;
+	__be32 status;
 
-	if (nfserr == nfserr_denied)
-		nfsd4_encode_lock_denied(xdr, &lockt->lt_denied);
+	if (nfserr == nfserr_denied) {
+		/* denied */
+		status = nfsd4_encode_lock4denied(xdr, &lockt->lt_denied);
+		if (status != nfs_ok)
+			return status;
+	}
 	return nfserr;
 }
 
@@ -3912,7 +4092,8 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr,
 	struct nfsd4_locku *locku = &u->locku;
 	struct xdr_stream *xdr = resp->xdr;
 
-	return nfsd4_encode_stateid(xdr, &locku->lu_stateid);
+	/* lock_stateid */
+	return nfsd4_encode_stateid4(xdr, &locku->lu_stateid);
 }
 
 
@@ -3926,104 +4107,159 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr,
 	return nfsd4_encode_change_info4(xdr, &link->li_cinfo);
 }
 
+/*
+ * This implementation does not yet support returning an ACE in an
+ * OPEN that offers a delegation.
+ */
+static __be32
+nfsd4_encode_open_nfsace4(struct xdr_stream *xdr)
+{
+	__be32 status;
+
+	/* type */
+	status = nfsd4_encode_acetype4(xdr, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
+	if (status != nfs_ok)
+		return nfserr_resource;
+	/* flag */
+	status = nfsd4_encode_aceflag4(xdr, 0);
+	if (status != nfs_ok)
+		return nfserr_resource;
+	/* access mask */
+	status = nfsd4_encode_acemask4(xdr, 0);
+	if (status != nfs_ok)
+		return nfserr_resource;
+	/* who - empty for now */
+	if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
+		return nfserr_resource;
+	return nfs_ok;
+}
 
 static __be32
-nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr,
-		  union nfsd4_op_u *u)
+nfsd4_encode_open_read_delegation4(struct xdr_stream *xdr, struct nfsd4_open *open)
 {
-	struct nfsd4_open *open = &u->open;
-	struct xdr_stream *xdr = resp->xdr;
-	__be32 *p;
+	__be32 status;
 
-	nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid);
-	if (nfserr)
-		return nfserr;
-	nfserr = nfsd4_encode_change_info4(xdr, &open->op_cinfo);
-	if (nfserr)
-		return nfserr;
-	if (xdr_stream_encode_u32(xdr, open->op_rflags) < 0)
+	/* stateid */
+	status = nfsd4_encode_stateid4(xdr, &open->op_delegate_stateid);
+	if (status != nfs_ok)
+		return status;
+	/* recall */
+	status = nfsd4_encode_bool(xdr, open->op_recall);
+	if (status != nfs_ok)
+		return status;
+	/* permissions */
+	return nfsd4_encode_open_nfsace4(xdr);
+}
+
+static __be32
+nfsd4_encode_nfs_space_limit4(struct xdr_stream *xdr, u64 filesize)
+{
+	/* limitby */
+	if (xdr_stream_encode_u32(xdr, NFS4_LIMIT_SIZE) != XDR_UNIT)
 		return nfserr_resource;
+	/* filesize */
+	return nfsd4_encode_uint64_t(xdr, filesize);
+}
 
-	nfserr = nfsd4_encode_bitmap(xdr, open->op_bmval[0], open->op_bmval[1],
-					open->op_bmval[2]);
-	if (nfserr)
-		return nfserr;
+static __be32
+nfsd4_encode_open_write_delegation4(struct xdr_stream *xdr,
+				    struct nfsd4_open *open)
+{
+	__be32 status;
 
-	p = xdr_reserve_space(xdr, 4);
-	if (!p)
+	/* stateid */
+	status = nfsd4_encode_stateid4(xdr, &open->op_delegate_stateid);
+	if (status != nfs_ok)
+		return status;
+	/* recall */
+	status = nfsd4_encode_bool(xdr, open->op_recall);
+	if (status != nfs_ok)
+		return status;
+	/* space_limit */
+	status = nfsd4_encode_nfs_space_limit4(xdr, 0);
+	if (status != nfs_ok)
+		return status;
+	return nfsd4_encode_open_nfsace4(xdr);
+}
+
+static __be32
+nfsd4_encode_open_none_delegation4(struct xdr_stream *xdr,
+				   struct nfsd4_open *open)
+{
+	__be32 status = nfs_ok;
+
+	/* ond_why */
+	if (xdr_stream_encode_u32(xdr, open->op_why_no_deleg) != XDR_UNIT)
 		return nfserr_resource;
+	switch (open->op_why_no_deleg) {
+	case WND4_CONTENTION:
+		/* ond_server_will_push_deleg */
+		status = nfsd4_encode_bool(xdr, false);
+		break;
+	case WND4_RESOURCE:
+		/* ond_server_will_signal_avail */
+		status = nfsd4_encode_bool(xdr, false);
+	}
+	return status;
+}
+
+static __be32
+nfsd4_encode_open_delegation4(struct xdr_stream *xdr, struct nfsd4_open *open)
+{
+	__be32 status;
 
-	*p++ = cpu_to_be32(open->op_delegate_type);
+	/* delegation_type */
+	if (xdr_stream_encode_u32(xdr, open->op_delegate_type) != XDR_UNIT)
+		return nfserr_resource;
 	switch (open->op_delegate_type) {
 	case NFS4_OPEN_DELEGATE_NONE:
+		status = nfs_ok;
 		break;
 	case NFS4_OPEN_DELEGATE_READ:
-		nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid);
-		if (nfserr)
-			return nfserr;
-		p = xdr_reserve_space(xdr, 20);
-		if (!p)
-			return nfserr_resource;
-		*p++ = cpu_to_be32(open->op_recall);
-
-		/*
-		 * TODO: ACE's in delegations
-		 */
-		*p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
-		*p++ = cpu_to_be32(0);
-		*p++ = cpu_to_be32(0);
-		*p++ = cpu_to_be32(0);   /* XXX: is NULL principal ok? */
+		/* read */
+		status = nfsd4_encode_open_read_delegation4(xdr, open);
 		break;
 	case NFS4_OPEN_DELEGATE_WRITE:
-		nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid);
-		if (nfserr)
-			return nfserr;
-
-		p = xdr_reserve_space(xdr, XDR_UNIT * 8);
-		if (!p)
-			return nfserr_resource;
-		*p++ = cpu_to_be32(open->op_recall);
-
-		/*
-		 * Always flush on close
-		 *
-		 * TODO: space_limit's in delegations
-		 */
-		*p++ = cpu_to_be32(NFS4_LIMIT_SIZE);
-		*p++ = xdr_zero;
-		*p++ = xdr_zero;
-
-		/*
-		 * TODO: ACE's in delegations
-		 */
-		*p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
-		*p++ = cpu_to_be32(0);
-		*p++ = cpu_to_be32(0);
-		*p++ = cpu_to_be32(0);   /* XXX: is NULL principal ok? */
+		/* write */
+		status = nfsd4_encode_open_write_delegation4(xdr, open);
 		break;
-	case NFS4_OPEN_DELEGATE_NONE_EXT: /* 4.1 */
-		switch (open->op_why_no_deleg) {
-		case WND4_CONTENTION:
-		case WND4_RESOURCE:
-			p = xdr_reserve_space(xdr, 8);
-			if (!p)
-				return nfserr_resource;
-			*p++ = cpu_to_be32(open->op_why_no_deleg);
-			/* deleg signaling not supported yet: */
-			*p++ = cpu_to_be32(0);
-			break;
-		default:
-			p = xdr_reserve_space(xdr, 4);
-			if (!p)
-				return nfserr_resource;
-			*p++ = cpu_to_be32(open->op_why_no_deleg);
-		}
+	case NFS4_OPEN_DELEGATE_NONE_EXT:
+		/* od_whynone */
+		status = nfsd4_encode_open_none_delegation4(xdr, open);
 		break;
 	default:
-		BUG();
+		status = nfserr_serverfault;
 	}
-	/* XXX save filehandle here */
-	return 0;
+
+	return status;
+}
+
+static __be32
+nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr,
+		  union nfsd4_op_u *u)
+{
+	struct nfsd4_open *open = &u->open;
+	struct xdr_stream *xdr = resp->xdr;
+
+	/* stateid */
+	nfserr = nfsd4_encode_stateid4(xdr, &open->op_stateid);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	/* cinfo */
+	nfserr = nfsd4_encode_change_info4(xdr, &open->op_cinfo);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	/* rflags */
+	nfserr = nfsd4_encode_uint32_t(xdr, open->op_rflags);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	/* attrset */
+	nfserr = nfsd4_encode_bitmap4(xdr, open->op_bmval[0],
+				      open->op_bmval[1], open->op_bmval[2]);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	/* delegation */
+	return nfsd4_encode_open_delegation4(xdr, open);
 }
 
 static __be32
@@ -4033,7 +4269,8 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr,
 	struct nfsd4_open_confirm *oc = &u->open_confirm;
 	struct xdr_stream *xdr = resp->xdr;
 
-	return nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid);
+	/* open_stateid */
+	return nfsd4_encode_stateid4(xdr, &oc->oc_resp_stateid);
 }
 
 static __be32
@@ -4043,7 +4280,8 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr,
 	struct nfsd4_open_downgrade *od = &u->open_downgrade;
 	struct xdr_stream *xdr = resp->xdr;
 
-	return nfsd4_encode_stateid(xdr, &od->od_stateid);
+	/* open_stateid */
+	return nfsd4_encode_stateid4(xdr, &od->od_stateid);
 }
 
 /*
@@ -4227,90 +4465,83 @@ out_err:
 	return nfserr;
 }
 
-static __be32
-nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr,
-		     union nfsd4_op_u *u)
+static __be32 nfsd4_encode_dirlist4(struct xdr_stream *xdr,
+				    struct nfsd4_readdir *readdir,
+				    u32 max_payload)
 {
-	struct nfsd4_readdir *readdir = &u->readdir;
-	int maxcount;
-	int bytes_left;
+	int bytes_left, maxcount, starting_len = xdr->buf->len;
 	loff_t offset;
-	__be64 wire_offset;
-	struct xdr_stream *xdr = resp->xdr;
-	int starting_len = xdr->buf->len;
-	__be32 *p;
-
-	nfserr = nfsd4_encode_verifier4(xdr, &readdir->rd_verf);
-	if (nfserr != nfs_ok)
-		return nfserr;
+	__be32 status;
 
 	/*
 	 * Number of bytes left for directory entries allowing for the
-	 * final 8 bytes of the readdir and a following failed op:
+	 * final 8 bytes of the readdir and a following failed op.
 	 */
-	bytes_left = xdr->buf->buflen - xdr->buf->len
-			- COMPOUND_ERR_SLACK_SPACE - 8;
-	if (bytes_left < 0) {
-		nfserr = nfserr_resource;
-		goto err_no_verf;
-	}
-	maxcount = svc_max_payload(resp->rqstp);
-	maxcount = min_t(u32, readdir->rd_maxcount, maxcount);
+	bytes_left = xdr->buf->buflen - xdr->buf->len -
+		COMPOUND_ERR_SLACK_SPACE - XDR_UNIT * 2;
+	if (bytes_left < 0)
+		return nfserr_resource;
+	maxcount = min_t(u32, readdir->rd_maxcount, max_payload);
+
 	/*
-	 * Note the rfc defines rd_maxcount as the size of the
-	 * READDIR4resok structure, which includes the verifier above
-	 * and the 8 bytes encoded at the end of this function:
+	 * The RFC defines rd_maxcount as the size of the
+	 * READDIR4resok structure, which includes the verifier
+	 * and the 8 bytes encoded at the end of this function.
 	 */
-	if (maxcount < 16) {
-		nfserr = nfserr_toosmall;
-		goto err_no_verf;
-	}
-	maxcount = min_t(int, maxcount-16, bytes_left);
+	if (maxcount < XDR_UNIT * 4)
+		return nfserr_toosmall;
+	maxcount = min_t(int, maxcount - XDR_UNIT * 4, bytes_left);
 
-	/* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */
+	/* RFC 3530 14.2.24 allows us to ignore dircount when it's 0 */
 	if (!readdir->rd_dircount)
-		readdir->rd_dircount = svc_max_payload(resp->rqstp);
+		readdir->rd_dircount = max_payload;
 
+	/* *entries */
 	readdir->xdr = xdr;
 	readdir->rd_maxcount = maxcount;
 	readdir->common.err = 0;
 	readdir->cookie_offset = 0;
-
 	offset = readdir->rd_cookie;
-	nfserr = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp,
-			      &offset,
-			      &readdir->common, nfsd4_encode_dirent);
-	if (nfserr == nfs_ok &&
-	    readdir->common.err == nfserr_toosmall &&
-	    xdr->buf->len == starting_len + 8) {
-		/* nothing encoded; which limit did we hit?: */
-		if (maxcount - 16 < bytes_left)
-			/* It was the fault of rd_maxcount: */
-			nfserr = nfserr_toosmall;
-		else
-			/* We ran out of buffer space: */
-			nfserr = nfserr_resource;
+	status = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp, &offset,
+			      &readdir->common, nfsd4_encode_entry4);
+	if (status)
+		return status;
+	if (readdir->common.err == nfserr_toosmall &&
+	    xdr->buf->len == starting_len) {
+		/* No entries were encoded. Which limit did we hit? */
+		if (maxcount - XDR_UNIT * 4 < bytes_left)
+			/* It was the fault of rd_maxcount */
+			return nfserr_toosmall;
+		/* We ran out of buffer space */
+		return nfserr_resource;
 	}
-	if (nfserr)
-		goto err_no_verf;
+	/* Encode the final entry's cookie value */
+	nfsd4_encode_entry4_nfs_cookie4(readdir, offset);
+	/* No entries follow */
+	if (xdr_stream_encode_item_absent(xdr) != XDR_UNIT)
+		return nfserr_resource;
 
-	if (readdir->cookie_offset) {
-		wire_offset = cpu_to_be64(offset);
-		write_bytes_to_xdr_buf(xdr->buf, readdir->cookie_offset,
-							&wire_offset, 8);
-	}
+	/* eof */
+	return nfsd4_encode_bool(xdr, readdir->common.err == nfserr_eof);
+}
 
-	p = xdr_reserve_space(xdr, 8);
-	if (!p) {
-		WARN_ON_ONCE(1);
-		goto err_no_verf;
-	}
-	*p++ = 0;	/* no more entries */
-	*p++ = htonl(readdir->common.err == nfserr_eof);
+static __be32
+nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr,
+		     union nfsd4_op_u *u)
+{
+	struct nfsd4_readdir *readdir = &u->readdir;
+	struct xdr_stream *xdr = resp->xdr;
+	int starting_len = xdr->buf->len;
 
-	return 0;
-err_no_verf:
-	xdr_truncate_encode(xdr, starting_len);
+	/* cookieverf */
+	nfserr = nfsd4_encode_verifier4(xdr, &readdir->rd_verf);
+	if (nfserr != nfs_ok)
+		return nfserr;
+
+	/* reply */
+	nfserr = nfsd4_encode_dirlist4(xdr, readdir, svc_max_payload(resp->rqstp));
+	if (nfserr != nfs_ok)
+		xdr_truncate_encode(xdr, starting_len);
 	return nfserr;
 }
 
@@ -4338,13 +4569,34 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr,
 }
 
 static __be32
+nfsd4_encode_rpcsec_gss_info(struct xdr_stream *xdr,
+			     struct rpcsec_gss_info *info)
+{
+	__be32 status;
+
+	/* oid */
+	if (xdr_stream_encode_opaque(xdr, info->oid.data, info->oid.len) < 0)
+		return nfserr_resource;
+	/* qop */
+	status = nfsd4_encode_qop4(xdr, info->qop);
+	if (status != nfs_ok)
+		return status;
+	/* service */
+	if (xdr_stream_encode_u32(xdr, info->service) != XDR_UNIT)
+		return nfserr_resource;
+
+	return nfs_ok;
+}
+
+static __be32
 nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp)
 {
 	u32 i, nflavs, supported;
 	struct exp_flavor_info *flavs;
 	struct exp_flavor_info def_flavs[2];
-	__be32 *p, *flavorsp;
 	static bool report = true;
+	__be32 *flavorsp;
+	__be32 status;
 
 	if (exp->ex_nflavors) {
 		flavs = exp->ex_flavors;
@@ -4367,10 +4619,9 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp)
 	}
 
 	supported = 0;
-	p = xdr_reserve_space(xdr, 4);
-	if (!p)
+	flavorsp = xdr_reserve_space(xdr, XDR_UNIT);
+	if (!flavorsp)
 		return nfserr_resource;
-	flavorsp = p++;		/* to be backfilled later */
 
 	for (i = 0; i < nflavs; i++) {
 		rpc_authflavor_t pf = flavs[i].pseudoflavor;
@@ -4378,20 +4629,22 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp)
 
 		if (rpcauth_get_gssinfo(pf, &info) == 0) {
 			supported++;
-			p = xdr_reserve_space(xdr, 4 + 4 +
-					      XDR_LEN(info.oid.len) + 4 + 4);
-			if (!p)
-				return nfserr_resource;
-			*p++ = cpu_to_be32(RPC_AUTH_GSS);
-			p = xdr_encode_opaque(p,  info.oid.data, info.oid.len);
-			*p++ = cpu_to_be32(info.qop);
-			*p++ = cpu_to_be32(info.service);
+
+			/* flavor */
+			status = nfsd4_encode_uint32_t(xdr, RPC_AUTH_GSS);
+			if (status != nfs_ok)
+				return status;
+			/* flavor_info */
+			status = nfsd4_encode_rpcsec_gss_info(xdr, &info);
+			if (status != nfs_ok)
+				return status;
 		} else if (pf < RPC_AUTH_MAXFLAVOR) {
 			supported++;
-			p = xdr_reserve_space(xdr, 4);
-			if (!p)
-				return nfserr_resource;
-			*p++ = cpu_to_be32(pf);
+
+			/* flavor */
+			status = nfsd4_encode_uint32_t(xdr, pf);
+			if (status != nfs_ok)
+				return status;
 		} else {
 			if (report)
 				pr_warn("NFS: SECINFO: security flavor %u "
@@ -4401,7 +4654,7 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp)
 
 	if (nflavs != supported)
 		report = false;
-	*flavorsp = htonl(supported);
+	*flavorsp = cpu_to_be32(supported);
 	return 0;
 }
 
@@ -4425,34 +4678,25 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
 	return nfsd4_do_encode_secinfo(xdr, secinfo->sin_exp);
 }
 
-/*
- * The SETATTR encode routine is special -- it always encodes a bitmap,
- * regardless of the error status.
- */
 static __be32
 nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr,
 		     union nfsd4_op_u *u)
 {
 	struct nfsd4_setattr *setattr = &u->setattr;
-	struct xdr_stream *xdr = resp->xdr;
-	__be32 *p;
+	__be32 status;
 
-	p = xdr_reserve_space(xdr, 16);
-	if (!p)
-		return nfserr_resource;
-	if (nfserr) {
-		*p++ = cpu_to_be32(3);
-		*p++ = cpu_to_be32(0);
-		*p++ = cpu_to_be32(0);
-		*p++ = cpu_to_be32(0);
-	}
-	else {
-		*p++ = cpu_to_be32(3);
-		*p++ = cpu_to_be32(setattr->sa_bmval[0]);
-		*p++ = cpu_to_be32(setattr->sa_bmval[1]);
-		*p++ = cpu_to_be32(setattr->sa_bmval[2]);
+	switch (nfserr) {
+	case nfs_ok:
+		/* attrsset */
+		status = nfsd4_encode_bitmap4(resp->xdr, setattr->sa_bmval[0],
+					      setattr->sa_bmval[1],
+					      setattr->sa_bmval[2]);
+		break;
+	default:
+		/* attrsset */
+		status = nfsd4_encode_bitmap4(resp->xdr, 0, 0, 0);
 	}
-	return nfserr;
+	return status != nfs_ok ? status : nfserr;
 }
 
 static __be32
@@ -4488,86 +4732,148 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr,
 		   union nfsd4_op_u *u)
 {
 	struct nfsd4_write *write = &u->write;
+	struct xdr_stream *xdr = resp->xdr;
 
-	if (xdr_stream_encode_u32(resp->xdr, write->wr_bytes_written) < 0)
-		return nfserr_resource;
-	if (xdr_stream_encode_u32(resp->xdr, write->wr_how_written) < 0)
+	/* count */
+	nfserr = nfsd4_encode_count4(xdr, write->wr_bytes_written);
+	if (nfserr)
+		return nfserr;
+	/* committed */
+	if (xdr_stream_encode_u32(xdr, write->wr_how_written) != XDR_UNIT)
 		return nfserr_resource;
-	return nfsd4_encode_verifier4(resp->xdr, &write->wr_verifier);
+	/* writeverf */
+	return nfsd4_encode_verifier4(xdr, &write->wr_verifier);
 }
 
 static __be32
-nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
-			 union nfsd4_op_u *u)
+nfsd4_encode_state_protect_ops4(struct xdr_stream *xdr,
+				struct nfsd4_exchange_id *exid)
 {
-	struct nfsd4_exchange_id *exid = &u->exchange_id;
-	struct xdr_stream *xdr = resp->xdr;
-	__be32 *p;
-	char *major_id;
-	char *server_scope;
-	int major_id_sz;
-	int server_scope_sz;
-	uint64_t minor_id = 0;
-	struct nfsd_net *nn = net_generic(SVC_NET(resp->rqstp), nfsd_net_id);
+	__be32 status;
 
-	major_id = nn->nfsd_name;
-	major_id_sz = strlen(nn->nfsd_name);
-	server_scope = nn->nfsd_name;
-	server_scope_sz = strlen(nn->nfsd_name);
+	/* spo_must_enforce */
+	status = nfsd4_encode_bitmap4(xdr, exid->spo_must_enforce[0],
+				      exid->spo_must_enforce[1],
+				      exid->spo_must_enforce[2]);
+	if (status != nfs_ok)
+		return status;
+	/* spo_must_allow */
+	return nfsd4_encode_bitmap4(xdr, exid->spo_must_allow[0],
+				    exid->spo_must_allow[1],
+				    exid->spo_must_allow[2]);
+}
 
-	if (nfsd4_encode_clientid4(xdr, &exid->clientid) != nfs_ok)
-		return nfserr_resource;
-	if (xdr_stream_encode_u32(xdr, exid->seqid) < 0)
-		return nfserr_resource;
-	if (xdr_stream_encode_u32(xdr, exid->flags) < 0)
-		return nfserr_resource;
+static __be32
+nfsd4_encode_state_protect4_r(struct xdr_stream *xdr, struct nfsd4_exchange_id *exid)
+{
+	__be32 status;
 
-	if (xdr_stream_encode_u32(xdr, exid->spa_how) < 0)
+	if (xdr_stream_encode_u32(xdr, exid->spa_how) != XDR_UNIT)
 		return nfserr_resource;
 	switch (exid->spa_how) {
 	case SP4_NONE:
+		status = nfs_ok;
 		break;
 	case SP4_MACH_CRED:
-		/* spo_must_enforce bitmap: */
-		nfserr = nfsd4_encode_bitmap(xdr,
-					exid->spo_must_enforce[0],
-					exid->spo_must_enforce[1],
-					exid->spo_must_enforce[2]);
-		if (nfserr)
-			return nfserr;
-		/* spo_must_allow bitmap: */
-		nfserr = nfsd4_encode_bitmap(xdr,
-					exid->spo_must_allow[0],
-					exid->spo_must_allow[1],
-					exid->spo_must_allow[2]);
-		if (nfserr)
-			return nfserr;
+		/* spr_mach_ops */
+		status = nfsd4_encode_state_protect_ops4(xdr, exid);
 		break;
 	default:
-		WARN_ON_ONCE(1);
+		status = nfserr_serverfault;
 	}
+	return status;
+}
 
-	p = xdr_reserve_space(xdr,
-		8 /* so_minor_id */ +
-		4 /* so_major_id.len */ +
-		(XDR_QUADLEN(major_id_sz) * 4) +
-		4 /* eir_server_scope.len */ +
-		(XDR_QUADLEN(server_scope_sz) * 4) +
-		4 /* eir_server_impl_id.count (0) */);
-	if (!p)
+static __be32
+nfsd4_encode_server_owner4(struct xdr_stream *xdr, struct svc_rqst *rqstp)
+{
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+	__be32 status;
+
+	/* so_minor_id */
+	status = nfsd4_encode_uint64_t(xdr, 0);
+	if (status != nfs_ok)
+		return status;
+	/* so_major_id */
+	return nfsd4_encode_opaque(xdr, nn->nfsd_name, strlen(nn->nfsd_name));
+}
+
+static __be32
+nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
+			 union nfsd4_op_u *u)
+{
+	struct nfsd_net *nn = net_generic(SVC_NET(resp->rqstp), nfsd_net_id);
+	struct nfsd4_exchange_id *exid = &u->exchange_id;
+	struct xdr_stream *xdr = resp->xdr;
+
+	/* eir_clientid */
+	nfserr = nfsd4_encode_clientid4(xdr, &exid->clientid);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	/* eir_sequenceid */
+	nfserr = nfsd4_encode_sequenceid4(xdr, exid->seqid);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	/* eir_flags */
+	nfserr = nfsd4_encode_uint32_t(xdr, exid->flags);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	/* eir_state_protect */
+	nfserr = nfsd4_encode_state_protect4_r(xdr, exid);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	/* eir_server_owner */
+	nfserr = nfsd4_encode_server_owner4(xdr, resp->rqstp);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	/* eir_server_scope */
+	nfserr = nfsd4_encode_opaque(xdr, nn->nfsd_name,
+				     strlen(nn->nfsd_name));
+	if (nfserr != nfs_ok)
+		return nfserr;
+	/* eir_server_impl_id<1> */
+	if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
 		return nfserr_resource;
 
-	/* The server_owner struct */
-	p = xdr_encode_hyper(p, minor_id);      /* Minor id */
-	/* major id */
-	p = xdr_encode_opaque(p, major_id, major_id_sz);
+	return nfs_ok;
+}
 
-	/* Server scope */
-	p = xdr_encode_opaque(p, server_scope, server_scope_sz);
+static __be32
+nfsd4_encode_channel_attrs4(struct xdr_stream *xdr,
+			    const struct nfsd4_channel_attrs *attrs)
+{
+	__be32 status;
 
-	/* Implementation id */
-	*p++ = cpu_to_be32(0);	/* zero length nfs_impl_id4 array */
-	return 0;
+	/* ca_headerpadsize */
+	status = nfsd4_encode_count4(xdr, 0);
+	if (status != nfs_ok)
+		return status;
+	/* ca_maxrequestsize */
+	status = nfsd4_encode_count4(xdr, attrs->maxreq_sz);
+	if (status != nfs_ok)
+		return status;
+	/* ca_maxresponsesize */
+	status = nfsd4_encode_count4(xdr, attrs->maxresp_sz);
+	if (status != nfs_ok)
+		return status;
+	/* ca_maxresponsesize_cached */
+	status = nfsd4_encode_count4(xdr, attrs->maxresp_cached);
+	if (status != nfs_ok)
+		return status;
+	/* ca_maxoperations */
+	status = nfsd4_encode_count4(xdr, attrs->maxops);
+	if (status != nfs_ok)
+		return status;
+	/* ca_maxrequests */
+	status = nfsd4_encode_count4(xdr, attrs->maxreqs);
+	if (status != nfs_ok)
+		return status;
+	/* ca_rdma_ird<1> */
+	if (xdr_stream_encode_u32(xdr, attrs->nr_rdma_attrs) != XDR_UNIT)
+		return nfserr_resource;
+	if (attrs->nr_rdma_attrs)
+		return nfsd4_encode_uint32_t(xdr, attrs->rdma_attrs);
+	return nfs_ok;
 }
 
 static __be32
@@ -4576,52 +4882,25 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr,
 {
 	struct nfsd4_create_session *sess = &u->create_session;
 	struct xdr_stream *xdr = resp->xdr;
-	__be32 *p;
-
-	p = xdr_reserve_space(xdr, 24);
-	if (!p)
-		return nfserr_resource;
-	p = xdr_encode_opaque_fixed(p, sess->sessionid.data,
-					NFS4_MAX_SESSIONID_LEN);
-	*p++ = cpu_to_be32(sess->seqid);
-	*p++ = cpu_to_be32(sess->flags);
 
-	p = xdr_reserve_space(xdr, 28);
-	if (!p)
-		return nfserr_resource;
-	*p++ = cpu_to_be32(0); /* headerpadsz */
-	*p++ = cpu_to_be32(sess->fore_channel.maxreq_sz);
-	*p++ = cpu_to_be32(sess->fore_channel.maxresp_sz);
-	*p++ = cpu_to_be32(sess->fore_channel.maxresp_cached);
-	*p++ = cpu_to_be32(sess->fore_channel.maxops);
-	*p++ = cpu_to_be32(sess->fore_channel.maxreqs);
-	*p++ = cpu_to_be32(sess->fore_channel.nr_rdma_attrs);
-
-	if (sess->fore_channel.nr_rdma_attrs) {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			return nfserr_resource;
-		*p++ = cpu_to_be32(sess->fore_channel.rdma_attrs);
-	}
-
-	p = xdr_reserve_space(xdr, 28);
-	if (!p)
-		return nfserr_resource;
-	*p++ = cpu_to_be32(0); /* headerpadsz */
-	*p++ = cpu_to_be32(sess->back_channel.maxreq_sz);
-	*p++ = cpu_to_be32(sess->back_channel.maxresp_sz);
-	*p++ = cpu_to_be32(sess->back_channel.maxresp_cached);
-	*p++ = cpu_to_be32(sess->back_channel.maxops);
-	*p++ = cpu_to_be32(sess->back_channel.maxreqs);
-	*p++ = cpu_to_be32(sess->back_channel.nr_rdma_attrs);
-
-	if (sess->back_channel.nr_rdma_attrs) {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			return nfserr_resource;
-		*p++ = cpu_to_be32(sess->back_channel.rdma_attrs);
-	}
-	return 0;
+	/* csr_sessionid */
+	nfserr = nfsd4_encode_sessionid4(xdr, &sess->sessionid);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	/* csr_sequence */
+	nfserr = nfsd4_encode_sequenceid4(xdr, sess->seqid);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	/* csr_flags */
+	nfserr = nfsd4_encode_uint32_t(xdr, sess->flags);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	/* csr_fore_chan_attrs */
+	nfserr = nfsd4_encode_channel_attrs4(xdr, &sess->fore_channel);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	/* csr_back_chan_attrs */
+	return nfsd4_encode_channel_attrs4(xdr, &sess->back_channel);
 }
 
 static __be32
@@ -4630,22 +4909,35 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
 {
 	struct nfsd4_sequence *seq = &u->sequence;
 	struct xdr_stream *xdr = resp->xdr;
-	__be32 *p;
 
-	p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 20);
-	if (!p)
-		return nfserr_resource;
-	p = xdr_encode_opaque_fixed(p, seq->sessionid.data,
-					NFS4_MAX_SESSIONID_LEN);
-	*p++ = cpu_to_be32(seq->seqid);
-	*p++ = cpu_to_be32(seq->slotid);
+	/* sr_sessionid */
+	nfserr = nfsd4_encode_sessionid4(xdr, &seq->sessionid);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	/* sr_sequenceid */
+	nfserr = nfsd4_encode_sequenceid4(xdr, seq->seqid);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	/* sr_slotid */
+	nfserr = nfsd4_encode_slotid4(xdr, seq->slotid);
+	if (nfserr != nfs_ok)
+		return nfserr;
 	/* Note slotid's are numbered from zero: */
-	*p++ = cpu_to_be32(seq->maxslots - 1); /* sr_highest_slotid */
-	*p++ = cpu_to_be32(seq->maxslots - 1); /* sr_target_highest_slotid */
-	*p++ = cpu_to_be32(seq->status_flags);
+	/* sr_highest_slotid */
+	nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots - 1);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	/* sr_target_highest_slotid */
+	nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots - 1);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	/* sr_status_flags */
+	nfserr = nfsd4_encode_uint32_t(xdr, seq->status_flags);
+	if (nfserr != nfs_ok)
+		return nfserr;
 
 	resp->cstate.data_offset = xdr->buf->len; /* DRC cache data pointer */
-	return 0;
+	return nfs_ok;
 }
 
 static __be32
@@ -4653,125 +4945,132 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
 			  union nfsd4_op_u *u)
 {
 	struct nfsd4_test_stateid *test_stateid = &u->test_stateid;
-	struct xdr_stream *xdr = resp->xdr;
 	struct nfsd4_test_stateid_id *stateid, *next;
-	__be32 *p;
+	struct xdr_stream *xdr = resp->xdr;
 
-	p = xdr_reserve_space(xdr, 4 + (4 * test_stateid->ts_num_ids));
-	if (!p)
+	/* tsr_status_codes<> */
+	if (xdr_stream_encode_u32(xdr, test_stateid->ts_num_ids) != XDR_UNIT)
 		return nfserr_resource;
-	*p++ = htonl(test_stateid->ts_num_ids);
-
-	list_for_each_entry_safe(stateid, next, &test_stateid->ts_stateid_list, ts_id_list) {
-		*p++ = stateid->ts_id_status;
+	list_for_each_entry_safe(stateid, next,
+				 &test_stateid->ts_stateid_list, ts_id_list) {
+		if (xdr_stream_encode_be32(xdr, stateid->ts_id_status) != XDR_UNIT)
+			return nfserr_resource;
 	}
-
-	return 0;
+	return nfs_ok;
 }
 
 #ifdef CONFIG_NFSD_PNFS
 static __be32
-nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
-		union nfsd4_op_u *u)
+nfsd4_encode_device_addr4(struct xdr_stream *xdr,
+			  const struct nfsd4_getdeviceinfo *gdev)
 {
-	struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo;
-	struct xdr_stream *xdr = resp->xdr;
+	u32 needed_len, starting_len = xdr->buf->len;
 	const struct nfsd4_layout_ops *ops;
-	u32 starting_len = xdr->buf->len, needed_len;
-	__be32 *p;
+	__be32 status;
 
-	p = xdr_reserve_space(xdr, 4);
-	if (!p)
+	/* da_layout_type */
+	if (xdr_stream_encode_u32(xdr, gdev->gd_layout_type) != XDR_UNIT)
 		return nfserr_resource;
-
-	*p++ = cpu_to_be32(gdev->gd_layout_type);
-
+	/* da_addr_body */
 	ops = nfsd4_layout_ops[gdev->gd_layout_type];
-	nfserr = ops->encode_getdeviceinfo(xdr, gdev);
-	if (nfserr) {
+	status = ops->encode_getdeviceinfo(xdr, gdev);
+	if (status != nfs_ok) {
 		/*
-		 * We don't bother to burden the layout drivers with
-		 * enforcing gd_maxcount, just tell the client to
-		 * come back with a bigger buffer if it's not enough.
+		 * Don't burden the layout drivers with enforcing
+		 * gd_maxcount. Just tell the client to come back
+		 * with a bigger buffer if it's not enough.
 		 */
-		if (xdr->buf->len + 4 > gdev->gd_maxcount)
+		if (xdr->buf->len + XDR_UNIT > gdev->gd_maxcount)
 			goto toosmall;
-		return nfserr;
+		return status;
 	}
 
-	if (gdev->gd_notify_types) {
-		p = xdr_reserve_space(xdr, 4 + 4);
-		if (!p)
-			return nfserr_resource;
-		*p++ = cpu_to_be32(1);			/* bitmap length */
-		*p++ = cpu_to_be32(gdev->gd_notify_types);
-	} else {
-		p = xdr_reserve_space(xdr, 4);
-		if (!p)
-			return nfserr_resource;
-		*p++ = 0;
-	}
+	return nfs_ok;
 
-	return 0;
 toosmall:
-	dprintk("%s: maxcount too small\n", __func__);
-	needed_len = xdr->buf->len + 4 /* notifications */;
+	needed_len = xdr->buf->len + XDR_UNIT;	/* notifications */
 	xdr_truncate_encode(xdr, starting_len);
-	p = xdr_reserve_space(xdr, 4);
-	if (!p)
-		return nfserr_resource;
-	*p++ = cpu_to_be32(needed_len);
+
+	status = nfsd4_encode_count4(xdr, needed_len);
+	if (status != nfs_ok)
+		return status;
 	return nfserr_toosmall;
 }
 
 static __be32
-nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
+nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
 		union nfsd4_op_u *u)
 {
-	struct nfsd4_layoutget *lgp = &u->layoutget;
+	struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo;
 	struct xdr_stream *xdr = resp->xdr;
-	const struct nfsd4_layout_ops *ops;
-	__be32 *p;
-
-	p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t));
-	if (!p)
-		return nfserr_resource;
 
-	*p++ = cpu_to_be32(1);	/* we always set return-on-close */
-	*p++ = cpu_to_be32(lgp->lg_sid.si_generation);
-	p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque,
-				    sizeof(stateid_opaque_t));
+	/* gdir_device_addr */
+	nfserr = nfsd4_encode_device_addr4(xdr, gdev);
+	if (nfserr)
+		return nfserr;
+	/* gdir_notification */
+	return nfsd4_encode_bitmap4(xdr, gdev->gd_notify_types, 0, 0);
+}
 
-	*p++ = cpu_to_be32(1);	/* we always return a single layout */
-	p = xdr_encode_hyper(p, lgp->lg_seg.offset);
-	p = xdr_encode_hyper(p, lgp->lg_seg.length);
-	*p++ = cpu_to_be32(lgp->lg_seg.iomode);
-	*p++ = cpu_to_be32(lgp->lg_layout_type);
+static __be32
+nfsd4_encode_layout4(struct xdr_stream *xdr, const struct nfsd4_layoutget *lgp)
+{
+	const struct nfsd4_layout_ops *ops = nfsd4_layout_ops[lgp->lg_layout_type];
+	__be32 status;
 
-	ops = nfsd4_layout_ops[lgp->lg_layout_type];
+	/* lo_offset */
+	status = nfsd4_encode_offset4(xdr, lgp->lg_seg.offset);
+	if (status != nfs_ok)
+		return status;
+	/* lo_length */
+	status = nfsd4_encode_length4(xdr, lgp->lg_seg.length);
+	if (status != nfs_ok)
+		return status;
+	/* lo_iomode */
+	if (xdr_stream_encode_u32(xdr, lgp->lg_seg.iomode) != XDR_UNIT)
+		return nfserr_resource;
+	/* lo_content */
+	if (xdr_stream_encode_u32(xdr, lgp->lg_layout_type) != XDR_UNIT)
+		return nfserr_resource;
 	return ops->encode_layoutget(xdr, lgp);
 }
 
 static __be32
+nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
+		union nfsd4_op_u *u)
+{
+	struct nfsd4_layoutget *lgp = &u->layoutget;
+	struct xdr_stream *xdr = resp->xdr;
+
+	/* logr_return_on_close */
+	nfserr = nfsd4_encode_bool(xdr, true);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	/* logr_stateid */
+	nfserr = nfsd4_encode_stateid4(xdr, &lgp->lg_sid);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	/* logr_layout<> */
+	if (xdr_stream_encode_u32(xdr, 1) != XDR_UNIT)
+		return nfserr_resource;
+	return nfsd4_encode_layout4(xdr, lgp);
+}
+
+static __be32
 nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
 			  union nfsd4_op_u *u)
 {
 	struct nfsd4_layoutcommit *lcp = &u->layoutcommit;
 	struct xdr_stream *xdr = resp->xdr;
-	__be32 *p;
 
-	p = xdr_reserve_space(xdr, 4);
-	if (!p)
-		return nfserr_resource;
-	*p++ = cpu_to_be32(lcp->lc_size_chg);
-	if (lcp->lc_size_chg) {
-		p = xdr_reserve_space(xdr, 8);
-		if (!p)
-			return nfserr_resource;
-		p = xdr_encode_hyper(p, lcp->lc_newsize);
-	}
-
-	return 0;
+	/* ns_sizechanged */
+	nfserr = nfsd4_encode_bool(xdr, lcp->lc_size_chg);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	if (lcp->lc_size_chg)
+		/* ns_size */
+		return nfsd4_encode_length4(xdr, lcp->lc_newsize);
+	return nfs_ok;
 }
 
 static __be32
@@ -4780,103 +5079,108 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
 {
 	struct nfsd4_layoutreturn *lrp = &u->layoutreturn;
 	struct xdr_stream *xdr = resp->xdr;
-	__be32 *p;
 
-	p = xdr_reserve_space(xdr, 4);
-	if (!p)
-		return nfserr_resource;
-	*p++ = cpu_to_be32(lrp->lrs_present);
+	/* lrs_present */
+	nfserr = nfsd4_encode_bool(xdr, lrp->lrs_present);
+	if (nfserr != nfs_ok)
+		return nfserr;
 	if (lrp->lrs_present)
-		return nfsd4_encode_stateid(xdr, &lrp->lr_sid);
-	return 0;
+		/* lrs_stateid */
+		return nfsd4_encode_stateid4(xdr, &lrp->lr_sid);
+	return nfs_ok;
 }
 #endif /* CONFIG_NFSD_PNFS */
 
 static __be32
-nfsd42_encode_write_res(struct nfsd4_compoundres *resp,
-		struct nfsd42_write_res *write, bool sync)
+nfsd4_encode_write_response4(struct xdr_stream *xdr,
+			     const struct nfsd4_copy *copy)
 {
-	__be32 *p;
-	p = xdr_reserve_space(resp->xdr, 4);
-	if (!p)
-		return nfserr_resource;
+	const struct nfsd42_write_res *write = &copy->cp_res;
+	u32 count = nfsd4_copy_is_sync(copy) ? 0 : 1;
+	__be32 status;
 
-	if (sync)
-		*p++ = cpu_to_be32(0);
-	else {
-		__be32 nfserr;
-		*p++ = cpu_to_be32(1);
-		nfserr = nfsd4_encode_stateid(resp->xdr, &write->cb_stateid);
-		if (nfserr)
-			return nfserr;
+	/* wr_callback_id<1> */
+	if (xdr_stream_encode_u32(xdr, count) != XDR_UNIT)
+		return nfserr_resource;
+	if (count) {
+		status = nfsd4_encode_stateid4(xdr, &write->cb_stateid);
+		if (status != nfs_ok)
+			return status;
 	}
-	p = xdr_reserve_space(resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE);
-	if (!p)
+
+	/* wr_count */
+	status = nfsd4_encode_length4(xdr, write->wr_bytes_written);
+	if (status != nfs_ok)
+		return status;
+	/* wr_committed */
+	if (xdr_stream_encode_u32(xdr, write->wr_stable_how) != XDR_UNIT)
 		return nfserr_resource;
+	/* wr_writeverf */
+	return nfsd4_encode_verifier4(xdr, &write->wr_verifier);
+}
 
-	p = xdr_encode_hyper(p, write->wr_bytes_written);
-	*p++ = cpu_to_be32(write->wr_stable_how);
-	p = xdr_encode_opaque_fixed(p, write->wr_verifier.data,
-				    NFS4_VERIFIER_SIZE);
-	return nfs_ok;
+static __be32 nfsd4_encode_copy_requirements4(struct xdr_stream *xdr,
+					      const struct nfsd4_copy *copy)
+{
+	__be32 status;
+
+	/* cr_consecutive */
+	status = nfsd4_encode_bool(xdr, true);
+	if (status != nfs_ok)
+		return status;
+	/* cr_synchronous */
+	return nfsd4_encode_bool(xdr, nfsd4_copy_is_sync(copy));
 }
 
 static __be32
-nfsd42_encode_nl4_server(struct nfsd4_compoundres *resp, struct nl4_server *ns)
+nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
+		  union nfsd4_op_u *u)
 {
-	struct xdr_stream *xdr = resp->xdr;
-	struct nfs42_netaddr *addr;
-	__be32 *p;
+	struct nfsd4_copy *copy = &u->copy;
 
-	p = xdr_reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(ns->nl4_type);
+	nfserr = nfsd4_encode_write_response4(resp->xdr, copy);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	return nfsd4_encode_copy_requirements4(resp->xdr, copy);
+}
 
+static __be32
+nfsd4_encode_netloc4(struct xdr_stream *xdr, const struct nl4_server *ns)
+{
+	__be32 status;
+
+	if (xdr_stream_encode_u32(xdr, ns->nl4_type) != XDR_UNIT)
+		return nfserr_resource;
 	switch (ns->nl4_type) {
 	case NL4_NETADDR:
-		addr = &ns->u.nl4_addr;
-
-		/* netid_len, netid, uaddr_len, uaddr (port included
-		 * in RPCBIND_MAXUADDRLEN)
-		 */
-		p = xdr_reserve_space(xdr,
-			4 /* netid len */ +
-			(XDR_QUADLEN(addr->netid_len) * 4) +
-			4 /* uaddr len */ +
-			(XDR_QUADLEN(addr->addr_len) * 4));
-		if (!p)
-			return nfserr_resource;
-
-		*p++ = cpu_to_be32(addr->netid_len);
-		p = xdr_encode_opaque_fixed(p, addr->netid,
-					    addr->netid_len);
-		*p++ = cpu_to_be32(addr->addr_len);
-		p = xdr_encode_opaque_fixed(p, addr->addr,
-					addr->addr_len);
+		/* nl_addr */
+		status = nfsd4_encode_netaddr4(xdr, &ns->u.nl4_addr);
 		break;
 	default:
-		WARN_ON_ONCE(ns->nl4_type != NL4_NETADDR);
-		return nfserr_inval;
+		status = nfserr_serverfault;
 	}
-
-	return 0;
+	return status;
 }
 
 static __be32
-nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
-		  union nfsd4_op_u *u)
+nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr,
+			 union nfsd4_op_u *u)
 {
-	struct nfsd4_copy *copy = &u->copy;
-	__be32 *p;
+	struct nfsd4_copy_notify *cn = &u->copy_notify;
+	struct xdr_stream *xdr = resp->xdr;
 
-	nfserr = nfsd42_encode_write_res(resp, &copy->cp_res,
-					 nfsd4_copy_is_sync(copy));
+	/* cnr_lease_time */
+	nfserr = nfsd4_encode_nfstime4(xdr, &cn->cpn_lease_time);
 	if (nfserr)
 		return nfserr;
-
-	p = xdr_reserve_space(resp->xdr, 4 + 4);
-	*p++ = xdr_one; /* cr_consecutive */
-	*p = nfsd4_copy_is_sync(copy) ? xdr_one : xdr_zero;
-	return 0;
+	/* cnr_stateid */
+	nfserr = nfsd4_encode_stateid4(xdr, &cn->cpn_cnr_stateid);
+	if (nfserr)
+		return nfserr;
+	/* cnr_source_server<> */
+	if (xdr_stream_encode_u32(xdr, 1) != XDR_UNIT)
+		return nfserr_resource;
+	return nfsd4_encode_netloc4(xdr, cn->cpn_src);
 }
 
 static __be32
@@ -4885,14 +5189,15 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
 {
 	struct nfsd4_offload_status *os = &u->offload_status;
 	struct xdr_stream *xdr = resp->xdr;
-	__be32 *p;
 
-	p = xdr_reserve_space(xdr, 8 + 4);
-	if (!p)
+	/* osr_count */
+	nfserr = nfsd4_encode_length4(xdr, os->count);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	/* osr_complete<1> */
+	if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
 		return nfserr_resource;
-	p = xdr_encode_hyper(p, os->count);
-	*p++ = cpu_to_be32(0);
-	return nfserr;
+	return nfs_ok;
 }
 
 static __be32
@@ -4970,53 +5275,18 @@ out:
 }
 
 static __be32
-nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr,
-			 union nfsd4_op_u *u)
-{
-	struct nfsd4_copy_notify *cn = &u->copy_notify;
-	struct xdr_stream *xdr = resp->xdr;
-	__be32 *p;
-
-	if (nfserr)
-		return nfserr;
-
-	/* 8 sec, 4 nsec */
-	p = xdr_reserve_space(xdr, 12);
-	if (!p)
-		return nfserr_resource;
-
-	/* cnr_lease_time */
-	p = xdr_encode_hyper(p, cn->cpn_sec);
-	*p++ = cpu_to_be32(cn->cpn_nsec);
-
-	/* cnr_stateid */
-	nfserr = nfsd4_encode_stateid(xdr, &cn->cpn_cnr_stateid);
-	if (nfserr)
-		return nfserr;
-
-	/* cnr_src.nl_nsvr */
-	p = xdr_reserve_space(xdr, 4);
-	if (!p)
-		return nfserr_resource;
-
-	*p++ = cpu_to_be32(1);
-
-	nfserr = nfsd42_encode_nl4_server(resp, cn->cpn_src);
-	return nfserr;
-}
-
-static __be32
 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
 		  union nfsd4_op_u *u)
 {
 	struct nfsd4_seek *seek = &u->seek;
-	__be32 *p;
-
-	p = xdr_reserve_space(resp->xdr, 4 + 8);
-	*p++ = cpu_to_be32(seek->seek_eof);
-	p = xdr_encode_hyper(p, seek->seek_pos);
+	struct xdr_stream *xdr = resp->xdr;
 
-	return 0;
+	/* sr_eof */
+	nfserr = nfsd4_encode_bool(xdr, seek->seek_eof);
+	if (nfserr != nfs_ok)
+		return nfserr;
+	/* sr_offset */
+	return nfsd4_encode_offset4(xdr, seek->seek_pos);
 }
 
 static __be32
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 7ed02fb88a36..3e15b72f421d 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -26,6 +26,7 @@
 #include "pnfs.h"
 #include "filecache.h"
 #include "trace.h"
+#include "netlink.h"
 
 /*
  *	We have a single directory with several nodes in it.
@@ -1132,7 +1133,7 @@ static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode)
 	/* Following advice from simple_fill_super documentation: */
 	inode->i_ino = iunique(sb, NFSD_MaxReserved);
 	inode->i_mode = mode;
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	switch (mode & S_IFMT) {
 	case S_IFDIR:
 		inode->i_fop = &simple_dir_operations;
@@ -1496,6 +1497,203 @@ static int create_proc_exports_entry(void)
 unsigned int nfsd_net_id;
 
 /**
+ * nfsd_nl_rpc_status_get_start - Prepare rpc_status_get dumpit
+ * @cb: netlink metadata and command arguments
+ *
+ * Return values:
+ *   %0: The rpc_status_get command may proceed
+ *   %-ENODEV: There is no NFSD running in this namespace
+ */
+int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb)
+{
+	struct net *net = sock_net(cb->skb->sk);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	struct svc_serv *serv;
+
+	mutex_lock(&nfsd_mutex);
+	serv = nn->nfsd_serv;
+	if (serv)
+		svc_get(serv);	/* only when a server exists in this netns */
+	mutex_unlock(&nfsd_mutex);
+
+	return serv ? 0 : -ENODEV;
+}
+
+/* Add one RPC_STATUS_GET message describing @rqstp to @skb; 0 or -ENOBUFS. */
+static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb,
+					    struct netlink_callback *cb,
+					    struct nfsd_genl_rqstp *rqstp)
+{
+	void *hdr;
+	u32 i;
+
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+			  &nfsd_nl_family, 0, NFSD_CMD_RPC_STATUS_GET);
+	if (!hdr)
+		return -ENOBUFS;
+
+	if (nla_put_be32(skb, NFSD_A_RPC_STATUS_XID, rqstp->rq_xid) ||
+	    nla_put_u32(skb, NFSD_A_RPC_STATUS_FLAGS, rqstp->rq_flags) ||
+	    nla_put_u32(skb, NFSD_A_RPC_STATUS_PROG, rqstp->rq_prog) ||
+	    nla_put_u32(skb, NFSD_A_RPC_STATUS_PROC, rqstp->rq_proc) ||
+	    nla_put_u8(skb, NFSD_A_RPC_STATUS_VERSION, rqstp->rq_vers) ||
+	    nla_put_s64(skb, NFSD_A_RPC_STATUS_SERVICE_TIME,
+			ktime_to_us(rqstp->rq_stime), NFSD_A_RPC_STATUS_PAD))
+		goto out_cancel;
+
+	switch (rqstp->rq_saddr.sa_family) {
+	case AF_INET: {
+		const struct sockaddr_in *s_in, *d_in;
+
+		s_in = (const struct sockaddr_in *)&rqstp->rq_saddr;
+		d_in = (const struct sockaddr_in *)&rqstp->rq_daddr;
+		if (nla_put_in_addr(skb, NFSD_A_RPC_STATUS_SADDR4,
+				    s_in->sin_addr.s_addr) ||
+		    nla_put_in_addr(skb, NFSD_A_RPC_STATUS_DADDR4,
+				    d_in->sin_addr.s_addr) ||
+		    nla_put_be16(skb, NFSD_A_RPC_STATUS_SPORT, s_in->sin_port) ||
+		    nla_put_be16(skb, NFSD_A_RPC_STATUS_DPORT, d_in->sin_port))
+			goto out_cancel;
+		break;
+	}
+	case AF_INET6: {
+		const struct sockaddr_in6 *s_in, *d_in;
+
+		s_in = (const struct sockaddr_in6 *)&rqstp->rq_saddr;
+		d_in = (const struct sockaddr_in6 *)&rqstp->rq_daddr;
+		if (nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_SADDR6,
+				     &s_in->sin6_addr) ||
+		    nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_DADDR6,
+				     &d_in->sin6_addr) ||
+		    nla_put_be16(skb, NFSD_A_RPC_STATUS_SPORT, s_in->sin6_port) ||
+		    nla_put_be16(skb, NFSD_A_RPC_STATUS_DPORT, d_in->sin6_port))
+			goto out_cancel;
+		break;
+	}
+	}
+
+	for (i = 0; i < rqstp->rq_opcnt; i++)
+		if (nla_put_u32(skb, NFSD_A_RPC_STATUS_COMPOUND_OPS,
+				rqstp->rq_opnum[i]))
+			goto out_cancel;
+
+	genlmsg_end(skb, hdr);
+	return 0;
+out_cancel:
+	/* Drop the half-built message so a truncated record is never sent. */
+	genlmsg_cancel(skb, hdr);
+	return -ENOBUFS;
+}
+
+/**
+ * nfsd_nl_rpc_status_get_dumpit - Handle rpc_status_get dumpit
+ * @skb: reply buffer
+ * @cb: netlink metadata and command arguments
+ *
+ * Returns the size of the reply or a negative errno.
+ */
+int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb,
+				  struct netlink_callback *cb)
+{
+	struct nfsd_net *nn = net_generic(sock_net(skb->sk), nfsd_net_id);
+	int i, ret, rqstp_index = 0;
+
+	rcu_read_lock();
+
+	for (i = 0; i < nn->nfsd_serv->sv_nrpools; i++) {
+		struct svc_rqst *rqstp;
+
+		if (i < cb->args[0]) /* already consumed */
+			continue;
+
+		rqstp_index = 0;
+		list_for_each_entry_rcu(rqstp,
+				&nn->nfsd_serv->sv_pools[i].sp_all_threads,
+				rq_all) {
+			struct nfsd_genl_rqstp genl_rqstp;
+			unsigned int status_counter;
+
+			if (rqstp_index++ < cb->args[1]) /* already consumed */
+				continue;
+			/*
+			 * Acquire rq_status_counter before parsing the rqst
+			 * fields. rq_status_counter is set to an odd value in
+			 * order to notify the consumers the rqstp fields are
+			 * meaningful.
+			 */
+			status_counter =
+				smp_load_acquire(&rqstp->rq_status_counter);
+			if (!(status_counter & 1))
+				continue;
+
+			genl_rqstp.rq_xid = rqstp->rq_xid;
+			genl_rqstp.rq_flags = rqstp->rq_flags;
+			genl_rqstp.rq_vers = rqstp->rq_vers;
+			genl_rqstp.rq_prog = rqstp->rq_prog;
+			genl_rqstp.rq_proc = rqstp->rq_proc;
+			genl_rqstp.rq_stime = rqstp->rq_stime;
+			genl_rqstp.rq_opcnt = 0;
+			memcpy(&genl_rqstp.rq_daddr, svc_daddr(rqstp),
+			       sizeof(struct sockaddr));
+			memcpy(&genl_rqstp.rq_saddr, svc_addr(rqstp),
+			       sizeof(struct sockaddr));
+
+#ifdef CONFIG_NFSD_V4
+			if (rqstp->rq_vers == NFS4_VERSION &&
+			    rqstp->rq_proc == NFSPROC4_COMPOUND) {
+				/* NFSv4 compound: clamp opcnt to rq_opnum[] */
+				struct nfsd4_compoundargs *args;
+				int j;
+
+				args = rqstp->rq_argp;
+				genl_rqstp.rq_opcnt = min_t(u32, args->opcnt,
+					ARRAY_SIZE(genl_rqstp.rq_opnum));
+				for (j = 0; j < genl_rqstp.rq_opcnt; j++)
+					genl_rqstp.rq_opnum[j] = args->ops[j].opnum;
+			}
+#endif /* CONFIG_NFSD_V4 */
+
+			/*
+			 * Re-read rq_status_counter: if it changed, the copy
+			 * above may be inconsistent, so skip this request.
+			 */
+			if (smp_load_acquire(&rqstp->rq_status_counter) !=
+			    status_counter)
+				continue;
+
+			ret = nfsd_genl_rpc_status_compose_msg(skb, cb,
+							       &genl_rqstp);
+			if (ret)
+				goto out;
+		}
+	}
+
+	cb->args[0] = i;
+	cb->args[1] = rqstp_index;
+	ret = skb->len;
+out:
+	rcu_read_unlock();
+
+	return ret;
+}
+
+/**
+ * nfsd_nl_rpc_status_get_done - rpc_status_get dumpit post-processing
+ * @cb: netlink metadata and command arguments
+ *
+ * Return values:
+ *   %0: Always (this handler has no failure path)
+ */
+int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb)
+{
+	mutex_lock(&nfsd_mutex);
+	nfsd_put(sock_net(cb->skb->sk)); /* presumably pairs with _start's svc_get() */
+	mutex_unlock(&nfsd_mutex);
+
+	return 0;
+}
+
+/**
  * nfsd_net_init - Prepare the nfsd_net portion of a new net namespace
  * @net: a freshly-created network namespace
  *
@@ -1589,6 +1787,10 @@ static int __init init_nfsd(void)
 	retval = register_filesystem(&nfsd_fs_type);
 	if (retval)
 		goto out_free_all;
+	retval = genl_register_family(&nfsd_nl_family);
+	if (retval)
+		goto out_free_all;
+
 	return 0;
 out_free_all:
 	nfsd4_destroy_laundry_wq();
@@ -1613,6 +1815,7 @@ out_free_slabs:
 
 static void __exit exit_nfsd(void)
 {
+	genl_unregister_family(&nfsd_nl_family);
 	unregister_filesystem(&nfsd_fs_type);
 	nfsd4_destroy_laundry_wq();
 	unregister_cld_notifier();
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 11c14faa6c67..f5ff42f41ee7 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -62,6 +62,23 @@ struct readdir_cd {
 	__be32			err;	/* 0, nfserr, or nfserr_eof */
 };
 
+/* Maximum number of operations per session compound */
+#define NFSD_MAX_OPS_PER_COMPOUND	50
+
+struct nfsd_genl_rqstp {	/* snapshot of one svc_rqst for rpc_status_get */
+	struct sockaddr		rq_daddr; /* NOTE(review): too small for IPv6? */
+	struct sockaddr		rq_saddr; /* NOTE(review): too small for IPv6? */
+	unsigned long		rq_flags;
+	ktime_t			rq_stime; /* reported in microseconds */
+	__be32			rq_xid;
+	u32			rq_vers;
+	u32			rq_prog;
+	u32			rq_proc;
+
+	/* NFSv4 compound */
+	u32			rq_opcnt; /* entries used in rq_opnum[] */
+	u32			rq_opnum[NFSD_MAX_OPS_PER_COMPOUND];
+};
 
 extern struct svc_program	nfsd_program;
 extern const struct svc_version	nfsd_version2, nfsd_version3, nfsd_version4;
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 355bf0db3235..dbfa0ac13564 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -771,7 +771,7 @@ enum fsid_source fsid_source(const struct svc_fh *fhp)
  * assume that the new change attr is always logged to stable storage in some
  * fashion before the results can be seen.
  */
-u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode)
+u64 nfsd4_change_attribute(const struct kstat *stat, const struct inode *inode)
 {
 	u64 chattr;
 
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 40426f899e76..6ebdf7ea27bf 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -293,7 +293,8 @@ static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp)
 	fhp->fh_pre_saved = false;
 }
 
-u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode);
+u64 nfsd4_change_attribute(const struct kstat *stat,
+			   const struct inode *inode);
 __be32 __must_check fh_fill_pre_attrs(struct svc_fh *fhp);
 __be32 fh_fill_post_attrs(struct svc_fh *fhp);
 __be32 __must_check fh_fill_both_attrs(struct svc_fh *fhp);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index c7af1095f6b5..d6122bb2d167 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -572,7 +572,6 @@ static void nfsd_last_thread(struct net *net)
 		return;
 
 	nfsd_shutdown_net(net);
-	pr_info("nfsd: last server has exited, flushing export cache\n");
 	nfsd_export_flush(net);
 }
 
@@ -713,14 +712,13 @@ int nfsd_nrpools(struct net *net)
 
 int nfsd_get_nrthreads(int n, int *nthreads, struct net *net)
 {
-	int i = 0;
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	struct svc_serv *serv = nn->nfsd_serv;
+	int i;
 
-	if (nn->nfsd_serv != NULL) {
-		for (i = 0; i < nn->nfsd_serv->sv_nrpools && i < n; i++)
-			nthreads[i] = nn->nfsd_serv->sv_pools[i].sp_nrthreads;
-	}
-
+	if (serv)
+		for (i = 0; i < serv->sv_nrpools && i < n; i++)
+			nthreads[i] = atomic_read(&serv->sv_pools[i].sp_nrthreads);
 	return 0;
 }
 
@@ -787,7 +785,6 @@ int
 nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
 {
 	int	error;
-	bool	nfsd_up_before;
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 	struct svc_serv *serv;
 
@@ -807,8 +804,6 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
 	error = nfsd_create_serv(net);
 	if (error)
 		goto out;
-
-	nfsd_up_before = nn->nfsd_net_up;
 	serv = nn->nfsd_serv;
 
 	error = nfsd_startup_net(net, cred);
@@ -816,17 +811,15 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
 		goto out_put;
 	error = svc_set_num_threads(serv, NULL, nrservs);
 	if (error)
-		goto out_shutdown;
+		goto out_put;
 	error = serv->sv_nrthreads;
-	if (error == 0)
-		nfsd_last_thread(net);
-out_shutdown:
-	if (error < 0 && !nfsd_up_before)
-		nfsd_shutdown_net(net);
 out_put:
 	/* Threads now hold service active */
 	if (xchg(&nn->keep_active, 0))
 		svc_put(serv);
+
+	if (serv->sv_nrthreads == 0)
+		nfsd_last_thread(net);
 	svc_put(serv);
 out:
 	mutex_unlock(&nfsd_mutex);
@@ -957,7 +950,7 @@ nfsd(void *vrqstp)
 	/*
 	 * The main request loop
 	 */
-	while (!kthread_should_stop()) {
+	while (!svc_thread_should_stop(rqstp)) {
 		/* Update sv_maxconn if it has changed */
 		rqstp->rq_server->sv_maxconn = nn->max_connections;
 
@@ -998,6 +991,15 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
 	if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream))
 		goto out_decode_err;
 
+	/*
+	 * Release rq_status_counter setting it to an odd value after the rpc
+	 * request has been properly parsed. rq_status_counter is used to
+	 * notify the consumers if the rqstp fields are stable
+	 * (rq_status_counter is odd) or not meaningful (rq_status_counter
+	 * is even).
+	 */
+	smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1);
+
 	rp = NULL;
 	switch (nfsd_cache_lookup(rqstp, &rp)) {
 	case RC_DOIT:
@@ -1015,6 +1017,12 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
 	if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream))
 		goto out_encode_err;
 
+	/*
+	 * Release rq_status_counter setting it to an even value after the rpc
+	 * request has been properly processed.
+	 */
+	smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1);
+
 	nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1);
 out_cached_reply:
 	return 1;
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
index 4f4282d4eeca..de1e0dfed06a 100644
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -27,12 +27,12 @@ struct nfsd4_layout_ops {
 			struct nfs4_client *clp,
 			struct nfsd4_getdeviceinfo *gdevp);
 	__be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
-			struct nfsd4_getdeviceinfo *gdevp);
+			const struct nfsd4_getdeviceinfo *gdevp);
 
 	__be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp,
 			struct nfsd4_layoutget *lgp);
-	__be32 (*encode_layoutget)(struct xdr_stream *,
-			struct nfsd4_layoutget *lgp);
+	__be32 (*encode_layoutget)(struct xdr_stream *xdr,
+			const struct nfsd4_layoutget *lgp);
 
 	__be32 (*proc_layoutcommit)(struct inode *inode,
 			struct nfsd4_layoutcommit *lcp);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index cbddcf484dba..f96eaa8e9413 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -117,6 +117,24 @@ struct nfs4_cpntf_state {
 	time64_t		cpntf_time;	/* last time stateid used */
 };
 
+struct nfs4_cb_fattr {	/* CB_GETATTR state embedded in nfs4_delegation */
+	struct nfsd4_callback ncf_getattr; /* presumably the CB_GETATTR call */
+	u32 ncf_cb_status;
+	u32 ncf_cb_bmap[1]; /* one bitmap word; presumably requested attrs */
+
+	/* from CB_GETATTR reply */
+	u64 ncf_cb_change;
+	u64 ncf_cb_fsize;
+
+	unsigned long ncf_cb_flags; /* bit numbers: CB_GETATTR_BUSY */
+	bool ncf_file_modified;
+	u64 ncf_initial_cinfo;
+	u64 ncf_cur_fsize;
+};
+
+/* bits for ncf_cb_flags */
+#define	CB_GETATTR_BUSY		0
+
 /*
  * Represents a delegation stateid. The nfs4_client holds references to these
  * and they are put when it is being destroyed or when the delegation is
@@ -150,6 +168,9 @@ struct nfs4_delegation {
 	int			dl_retries;
 	struct nfsd4_callback	dl_recall;
 	bool			dl_recalled;
+
+	/* for CB_GETATTR */
+	struct nfs4_cb_fattr    dl_cb_fattr;
 };
 
 #define cb_to_delegation(cb) \
@@ -174,8 +195,6 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
 
 /* Maximum number of slots per session. 160 is useful for long haul TCP */
 #define NFSD_MAX_SLOTS_PER_SESSION     160
-/* Maximum number of operations per session compound */
-#define NFSD_MAX_OPS_PER_COMPOUND	50
 /* Maximum  session per slot cache size */
 #define NFSD_SLOT_CACHE_SIZE		2048
 /* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */
@@ -642,6 +661,7 @@ enum nfsd4_cb_op {
 	NFSPROC4_CLNT_CB_SEQUENCE,
 	NFSPROC4_CLNT_CB_NOTIFY_LOCK,
 	NFSPROC4_CLNT_CB_RECALL_ANY,
+	NFSPROC4_CLNT_CB_GETATTR,
 };
 
 /* Returns true iff a is later than b: */
@@ -734,5 +754,6 @@ static inline bool try_to_expire_client(struct nfs4_client *clp)
 }
 
 extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp,
-				struct inode *inode);
+		struct inode *inode, bool *file_modified, u64 *size);
+extern void nfs4_cb_getattr(struct nfs4_cb_fattr *ncf);
 #endif   /* NFSD4_STATE_H */
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 63797635e1c3..12d79f5d4eb1 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -60,7 +60,7 @@ static int nfsd_show(struct seq_file *seq, void *v)
 #ifdef CONFIG_NFSD_V4
 	/* Show count for individual nfsv4 operations */
 	/* Writing operation numbers 0 1 2 also for maintaining uniformity */
-	seq_printf(seq,"proc4ops %u", LAST_NFS4_OP + 1);
+	seq_printf(seq, "proc4ops %u", LAST_NFS4_OP + 1);
 	for (i = 0; i <= LAST_NFS4_OP; i++) {
 		seq_printf(seq, " %lld",
 			   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_NFS4_OP(i)]));
@@ -76,7 +76,7 @@ static int nfsd_show(struct seq_file *seq, void *v)
 
 DEFINE_PROC_SHOW_ATTRIBUTE(nfsd);
 
-int nfsd_percpu_counters_init(struct percpu_counter counters[], int num)
+int nfsd_percpu_counters_init(struct percpu_counter *counters, int num)
 {
 	int i, err = 0;
 
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
index cf5524e7ca06..14f50c660b61 100644
--- a/fs/nfsd/stats.h
+++ b/fs/nfsd/stats.h
@@ -37,9 +37,9 @@ extern struct nfsd_stats	nfsdstats;
 
 extern struct svc_stat		nfsd_svcstats;
 
-int nfsd_percpu_counters_init(struct percpu_counter counters[], int num);
-void nfsd_percpu_counters_reset(struct percpu_counter counters[], int num);
-void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num);
+int nfsd_percpu_counters_init(struct percpu_counter *counters, int num);
+void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num);
+void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num);
 int nfsd_stat_init(void);
 void nfsd_stat_shutdown(void);
 
@@ -61,22 +61,22 @@ static inline void nfsd_stats_rc_nocache_inc(void)
 static inline void nfsd_stats_fh_stale_inc(struct svc_export *exp)
 {
 	percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_FH_STALE]);
-	if (exp)
-		percpu_counter_inc(&exp->ex_stats.counter[EXP_STATS_FH_STALE]);
+	if (exp && exp->ex_stats)
+		percpu_counter_inc(&exp->ex_stats->counter[EXP_STATS_FH_STALE]);
 }
 
 static inline void nfsd_stats_io_read_add(struct svc_export *exp, s64 amount)
 {
 	percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_READ], amount);
-	if (exp)
-		percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_READ], amount);
+	if (exp && exp->ex_stats)
+		percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_READ], amount);
 }
 
 static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount)
 {
 	percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_WRITE], amount);
-	if (exp)
-		percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_WRITE], amount);
+	if (exp && exp->ex_stats)
+		percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_WRITE], amount);
 }
 
 static inline void nfsd_stats_payload_misses_inc(struct nfsd_net *nn)
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 803904348871..fbc0ccb40424 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -1863,6 +1863,93 @@ TRACE_EVENT(nfsd_end_grace,
 	)
 );
 
+DECLARE_EVENT_CLASS(nfsd_copy_class, /* template for the nfsd_copy_* events */
+	TP_PROTO(
+		const struct nfsd4_copy *copy
+	),
+	TP_ARGS(copy),
+	TP_STRUCT__entry(
+		__field(bool, intra) /* NFSD4_COPY_F_INTRA is set */
+		__field(bool, async) /* NFSD4_COPY_F_SYNCHRONOUS is clear */
+		__field(u32, src_cl_boot)
+		__field(u32, src_cl_id)
+		__field(u32, src_so_id)
+		__field(u32, src_si_generation)
+		__field(u32, dst_cl_boot)
+		__field(u32, dst_cl_id)
+		__field(u32, dst_so_id)
+		__field(u32, dst_si_generation)
+		__field(u64, src_cp_pos)
+		__field(u64, dst_cp_pos)
+		__field(u64, cp_count)
+		__sockaddr(addr, sizeof(struct sockaddr_in6))
+	),
+	TP_fast_assign(
+		const stateid_t *src_stp = &copy->cp_src_stateid;
+		const stateid_t *dst_stp = &copy->cp_dst_stateid;
+
+		__entry->intra = test_bit(NFSD4_COPY_F_INTRA, &copy->cp_flags);
+		__entry->async = !test_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
+		__entry->src_cl_boot = src_stp->si_opaque.so_clid.cl_boot;
+		__entry->src_cl_id = src_stp->si_opaque.so_clid.cl_id;
+		__entry->src_so_id = src_stp->si_opaque.so_id;
+		__entry->src_si_generation = src_stp->si_generation;
+		__entry->dst_cl_boot = dst_stp->si_opaque.so_clid.cl_boot;
+		__entry->dst_cl_id = dst_stp->si_opaque.so_clid.cl_id;
+		__entry->dst_so_id = dst_stp->si_opaque.so_id;
+		__entry->dst_si_generation = dst_stp->si_generation;
+		__entry->src_cp_pos = copy->cp_src_pos;
+		__entry->dst_cp_pos = copy->cp_dst_pos;
+		__entry->cp_count = copy->cp_count;
+		__assign_sockaddr(addr, &copy->cp_clp->cl_addr, /* needs cp_clp set */
+				sizeof(struct sockaddr_in6));
+	),
+	TP_printk("client=%pISpc intra=%d async=%d "
+		"src_stateid[si_generation:0x%x cl_boot:0x%x cl_id:0x%x so_id:0x%x] "
+		"dst_stateid[si_generation:0x%x cl_boot:0x%x cl_id:0x%x so_id:0x%x] "
+		"cp_src_pos=%llu cp_dst_pos=%llu cp_count=%llu",
+		__get_sockaddr(addr), __entry->intra, __entry->async,
+		__entry->src_si_generation, __entry->src_cl_boot,
+		__entry->src_cl_id, __entry->src_so_id,
+		__entry->dst_si_generation, __entry->dst_cl_boot,
+		__entry->dst_cl_id, __entry->dst_so_id,
+		__entry->src_cp_pos, __entry->dst_cp_pos, __entry->cp_count
+	)
+);
+
+#define DEFINE_COPY_EVENT(name)				\
+DEFINE_EVENT(nfsd_copy_class, nfsd_copy_##name,	\
+	TP_PROTO(const struct nfsd4_copy *copy),	\
+	TP_ARGS(copy))
+
+DEFINE_COPY_EVENT(inter);	/* event nfsd_copy_inter */
+DEFINE_COPY_EVENT(intra);	/* event nfsd_copy_intra */
+DEFINE_COPY_EVENT(do_async);	/* event nfsd_copy_do_async */
+
+TRACE_EVENT(nfsd_copy_done, /* records the final status of a copy */
+	TP_PROTO(
+		const struct nfsd4_copy *copy,
+		__be32 status
+	),
+	TP_ARGS(copy, status),
+	TP_STRUCT__entry(
+		__field(int, status)
+		__field(bool, intra)
+		__field(bool, async)
+		__sockaddr(addr, sizeof(struct sockaddr_in6))
+	),
+	TP_fast_assign(
+		__entry->status = be32_to_cpu(status); /* stored in host byte order */
+		__entry->intra = test_bit(NFSD4_COPY_F_INTRA, &copy->cp_flags);
+		__entry->async = !test_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
+		__assign_sockaddr(addr, &copy->cp_clp->cl_addr, /* needs cp_clp set */
+				sizeof(struct sockaddr_in6));
+	),
+	TP_printk("addr=%pISpc status=%d intra=%d async=%d ",
+		__get_sockaddr(addr), __entry->status, __entry->intra, __entry->async
+	)
+);
+
 #endif /* _NFSD_TRACE_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 02f5fcaad03f..fbbea7498f02 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -337,6 +337,24 @@ out:
 	return err;
 }
 
+static void
+commit_reset_write_verifier(struct nfsd_net *nn, struct svc_rqst *rqstp,
+			    int err)
+{
+	/*
+	 * -EAGAIN and -ESTALE are not the result of a problem with
+	 * durable storage, so they must not reset the write verifier.
+	 * Every other error resets it and records the cause in a
+	 * tracepoint.
+	 */
+	if (err == -EAGAIN || err == -ESTALE)
+		return;
+
+	/* Reset first, then log the triggering error. */
+	nfsd_reset_write_verifier(nn);
+	trace_nfsd_writeverf_reset(nn, rqstp, err);
+}
+
 /*
  * Commit metadata changes to stable storage.
  */
@@ -520,7 +538,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
 
 	nfsd_sanitize_attrs(inode, iap);
 
-	if (check_guard && guardtime != inode_get_ctime(inode).tv_sec)
+	if (check_guard && guardtime != inode_get_ctime_sec(inode))
 		return nfserr_notsync;
 
 	/*
@@ -647,8 +665,7 @@ __be32 nfsd4_clone_file_range(struct svc_rqst *rqstp,
 					&nfsd4_get_cstate(rqstp)->current_fh,
 					dst_pos,
 					count, status);
-			nfsd_reset_write_verifier(nn);
-			trace_nfsd_writeverf_reset(nn, rqstp, status);
+			commit_reset_write_verifier(nn, rqstp, status);
 			ret = nfserrno(status);
 		}
 	}
@@ -823,7 +840,7 @@ int nfsd_open_break_lease(struct inode *inode, int access)
  * and additional flags.
  * N.B. After this call fhp needs an fh_put
  */
-static __be32
+static int
 __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 			int may_flags, struct file **filp)
 {
@@ -831,14 +848,12 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 	struct inode	*inode;
 	struct file	*file;
 	int		flags = O_RDONLY|O_LARGEFILE;
-	__be32		err;
-	int		host_err = 0;
+	int		host_err = -EPERM;
 
 	path.mnt = fhp->fh_export->ex_path.mnt;
 	path.dentry = fhp->fh_dentry;
 	inode = d_inode(path.dentry);
 
-	err = nfserr_perm;
 	if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE))
 		goto out;
 
@@ -847,7 +862,7 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 
 	host_err = nfsd_open_break_lease(inode, may_flags);
 	if (host_err) /* NOMEM or WOULDBLOCK */
-		goto out_nfserr;
+		goto out;
 
 	if (may_flags & NFSD_MAY_WRITE) {
 		if (may_flags & NFSD_MAY_READ)
@@ -859,13 +874,13 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 	file = dentry_open(&path, flags, current_cred());
 	if (IS_ERR(file)) {
 		host_err = PTR_ERR(file);
-		goto out_nfserr;
+		goto out;
 	}
 
 	host_err = ima_file_check(file, may_flags);
 	if (host_err) {
 		fput(file);
-		goto out_nfserr;
+		goto out;
 	}
 
 	if (may_flags & NFSD_MAY_64BIT_COOKIE)
@@ -874,10 +889,8 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 		file->f_mode |= FMODE_32BITHASH;
 
 	*filp = file;
-out_nfserr:
-	err = nfserrno(host_err);
 out:
-	return err;
+	return host_err;
 }
 
 __be32
@@ -885,6 +898,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 		int may_flags, struct file **filp)
 {
 	__be32 err;
+	int host_err;
 	bool retried = false;
 
 	validate_process_creds();
@@ -904,12 +918,13 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 retry:
 	err = fh_verify(rqstp, fhp, type, may_flags);
 	if (!err) {
-		err = __nfsd_open(rqstp, fhp, type, may_flags, filp);
-		if (err == nfserr_stale && !retried) {
+		host_err = __nfsd_open(rqstp, fhp, type, may_flags, filp);
+		if (host_err == -EOPENSTALE && !retried) {
 			retried = true;
 			fh_put(fhp);
 			goto retry;
 		}
+		err = nfserrno(host_err);
 	}
 	validate_process_creds();
 	return err;
@@ -922,13 +937,13 @@ retry:
  * @may_flags: internal permission flags
  * @filp: OUT: open "struct file *"
  *
- * Returns an nfsstat value in network byte order.
+ * Returns zero on success, or a negative errno value.
  */
-__be32
+int
 nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags,
 		   struct file **filp)
 {
-	__be32 err;
+	int err;
 
 	validate_process_creds();
 	err = __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp);
@@ -1172,8 +1187,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
 	host_err = vfs_iter_write(file, &iter, &pos, flags);
 	file_end_write(file);
 	if (host_err < 0) {
-		nfsd_reset_write_verifier(nn);
-		trace_nfsd_writeverf_reset(nn, rqstp, host_err);
+		commit_reset_write_verifier(nn, rqstp, host_err);
 		goto out_nfserr;
 	}
 	*cnt = host_err;
@@ -1185,10 +1199,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
 
 	if (stable && use_wgather) {
 		host_err = wait_for_concurrent_writes(file);
-		if (host_err < 0) {
-			nfsd_reset_write_verifier(nn);
-			trace_nfsd_writeverf_reset(nn, rqstp, host_err);
-		}
+		if (host_err < 0)
+			commit_reset_write_verifier(nn, rqstp, host_err);
 	}
 
 out_nfserr:
@@ -1331,8 +1343,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
 			err = nfserr_notsupp;
 			break;
 		default:
-			nfsd_reset_write_verifier(nn);
-			trace_nfsd_writeverf_reset(nn, rqstp, err2);
+			commit_reset_write_verifier(nn, rqstp, err2);
 			err = nfserrno(err2);
 		}
 	} else
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index a6890ea7b765..e3c29596f4df 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -104,8 +104,8 @@ __be32		nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
 int 		nfsd_open_break_lease(struct inode *, int);
 __be32		nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t,
 				int, struct file **);
-__be32		nfsd_open_verified(struct svc_rqst *, struct svc_fh *,
-				int, struct file **);
+int		nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp,
+				   int may_flags, struct file **filp);
 __be32		nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
 				struct file *file, loff_t offset,
 				unsigned long *count,
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 9d918a79dc16..80e859dc84d8 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -50,6 +50,134 @@
 #define HAS_CSTATE_FLAG(c, f) ((c)->sid_flags & (f))
 #define CLEAR_CSTATE_FLAG(c, f) ((c)->sid_flags &= ~(f))
 
+/**
+ * nfsd4_encode_bool - Encode an XDR bool type result
+ * @xdr: target XDR stream
+ * @val: boolean value to encode
+ *
+ * Return values:
+ *    %nfs_ok: @val encoded; @xdr advanced to next position
+ *    %nfserr_resource: stream buffer space exhausted
+ */
+static __always_inline __be32
+nfsd4_encode_bool(struct xdr_stream *xdr, bool val)
+{
+	__be32 *p = xdr_reserve_space(xdr, XDR_UNIT);
+
+	if (unlikely(p == NULL))
+		return nfserr_resource;
+	*p = val ? xdr_one : xdr_zero;
+	return nfs_ok;
+}
+
+/**
+ * nfsd4_encode_uint32_t - Encode an XDR uint32_t type result
+ * @xdr: target XDR stream
+ * @val: integer value to encode
+ *
+ * Return values:
+ *    %nfs_ok: @val encoded; @xdr advanced to next position
+ *    %nfserr_resource: stream buffer space exhausted
+ */
+static __always_inline __be32
+nfsd4_encode_uint32_t(struct xdr_stream *xdr, u32 val)
+{
+	__be32 *p = xdr_reserve_space(xdr, XDR_UNIT);
+
+	if (unlikely(p == NULL))
+		return nfserr_resource;
+	*p = cpu_to_be32(val);
+	return nfs_ok;
+}
+
+#define nfsd4_encode_aceflag4(x, v)	nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_acemask4(x, v)	nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_acetype4(x, v)	nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_count4(x, v)	nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_mode4(x, v)	nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_nfs_lease4(x, v)	nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_qop4(x, v)		nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_sequenceid4(x, v)	nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_slotid4(x, v)	nfsd4_encode_uint32_t(x, v)
+
+/**
+ * nfsd4_encode_uint64_t - Encode an XDR uint64_t type result
+ * @xdr: target XDR stream
+ * @val: integer value to encode
+ *
+ * Return values:
+ *    %nfs_ok: @val encoded; @xdr advanced to next position
+ *    %nfserr_resource: stream buffer space exhausted
+ */
+static __always_inline __be32
+nfsd4_encode_uint64_t(struct xdr_stream *xdr, u64 val)
+{
+	__be32 *p = xdr_reserve_space(xdr, XDR_UNIT * 2);
+
+	if (unlikely(p == NULL))
+		return nfserr_resource;
+	put_unaligned_be64(val, p);
+	return nfs_ok;
+}
+
+#define nfsd4_encode_changeid4(x, v)	nfsd4_encode_uint64_t(x, v)
+#define nfsd4_encode_nfs_cookie4(x, v)	nfsd4_encode_uint64_t(x, v)
+#define nfsd4_encode_length4(x, v)	nfsd4_encode_uint64_t(x, v)
+#define nfsd4_encode_offset4(x, v)	nfsd4_encode_uint64_t(x, v)
+
+/**
+ * nfsd4_encode_opaque_fixed - Encode a fixed-length XDR opaque type result
+ * @xdr: target XDR stream
+ * @data: pointer to data
+ * @size: length of data in bytes
+ *
+ * Return values:
+ *    %nfs_ok: @data encoded; @xdr advanced to next position
+ *    %nfserr_resource: stream buffer space exhausted
+ */
+static __always_inline __be32
+nfsd4_encode_opaque_fixed(struct xdr_stream *xdr, const void *data,
+			  size_t size)
+{
+	__be32 *p = xdr_reserve_space(xdr, xdr_align_size(size));
+	size_t pad = xdr_pad_size(size);
+
+	if (unlikely(p == NULL))
+		return nfserr_resource;
+	memcpy(p, data, size);
+	if (pad)
+		memset((char *)p + size, 0, pad);
+	return nfs_ok;
+}
+
+/**
+ * nfsd4_encode_opaque - Encode a variable-length XDR opaque type result
+ * @xdr: target XDR stream
+ * @data: pointer to data
+ * @size: length of data in bytes
+ *
+ * Return values:
+ *    %nfs_ok: @data encoded; @xdr advanced to next position
+ *    %nfserr_resource: stream buffer space exhausted
+ */
+static __always_inline __be32
+nfsd4_encode_opaque(struct xdr_stream *xdr, const void *data, size_t size)
+{
+	size_t pad = xdr_pad_size(size);
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, XDR_UNIT + xdr_align_size(size));
+	if (unlikely(p == NULL))
+		return nfserr_resource;
+	*p++ = cpu_to_be32(size);
+	memcpy(p, data, size);
+	if (pad)
+		memset((char *)p + size, 0, pad);
+	return nfs_ok;
+}
+
+#define nfsd4_encode_component4(x, d, s)	nfsd4_encode_opaque(x, d, s)
+
 struct nfsd4_compound_state {
 	struct svc_fh		current_fh;
 	struct svc_fh		save_fh;
@@ -170,12 +298,8 @@ struct nfsd4_lock {
 	} v;
 
 	/* response */
-	union {
-		struct {
-			stateid_t               stateid;
-		} ok;
-		struct nfsd4_lock_denied        denied;
-	} u;
+	stateid_t			lk_resp_stateid;
+	struct nfsd4_lock_denied        lk_denied;
 };
 #define lk_new_open_seqid       v.new.open_seqid
 #define lk_new_open_stateid     v.new.open_stateid
@@ -185,20 +309,15 @@ struct nfsd4_lock {
 #define lk_old_lock_stateid     v.old.lock_stateid
 #define lk_old_lock_seqid       v.old.lock_seqid
 
-#define lk_resp_stateid u.ok.stateid
-#define lk_denied       u.denied
-
-
 struct nfsd4_lockt {
 	u32				lt_type;
 	clientid_t			lt_clientid;
 	struct xdr_netobj		lt_owner;
 	u64				lt_offset;
 	u64				lt_length;
-	struct nfsd4_lock_denied  	lt_denied;
+	struct nfsd4_lock_denied	lt_denied;
 };
 
- 
 struct nfsd4_locku {
 	u32             lu_type;
 	u32             lu_seqid;
@@ -267,9 +386,9 @@ struct nfsd4_open {
 	u32		op_deleg_want;      /* request */
 	stateid_t	op_stateid;         /* response */
 	__be32		op_xdr_error;       /* see nfsd4_open_omfg() */
-	u32		op_recall;          /* recall */
 	struct nfsd4_change_info  op_cinfo; /* response */
 	u32		op_rflags;          /* response */
+	bool		op_recall;          /* response */
 	bool		op_truncate;        /* used during processing */
 	bool		op_created;         /* used during processing */
 	struct nfs4_openowner *op_openowner; /* used during processing */
@@ -496,7 +615,7 @@ struct nfsd4_layoutcommit {
 	u32			lc_layout_type;	/* request */
 	u32			lc_up_len;	/* layout length */
 	void			*lc_up_layout;	/* decoded by callback */
-	u32			lc_size_chg;	/* boolean for response */
+	bool			lc_size_chg;	/* response */
 	u64			lc_newsize;	/* response */
 };
 
@@ -508,7 +627,7 @@ struct nfsd4_layoutreturn {
 	u32			lrf_body_len;	/* request */
 	void			*lrf_body;	/* request */
 	stateid_t		lr_sid;		/* request/response */
-	u32			lrs_present;	/* response */
+	bool			lrs_present;	/* response */
 };
 
 struct nfsd4_fallocate {
@@ -626,8 +745,7 @@ struct nfsd4_copy_notify {
 
 	/* response */
 	stateid_t		cpn_cnr_stateid;
-	u64			cpn_sec;
-	u32			cpn_nsec;
+	struct timespec64	cpn_lease_time;
 	struct nl4_server	*cpn_src;
 };
 
@@ -820,8 +938,10 @@ extern __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp,
 		struct nfsd4_compound_state *, union nfsd4_op_u *u);
 extern __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *,
 		union nfsd4_op_u *u);
+extern void nfsd4_lock_release(union nfsd4_op_u *u);
 extern __be32 nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *,
 		union nfsd4_op_u *u);
+extern void nfsd4_lockt_release(union nfsd4_op_u *u);
 extern __be32 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *,
 		union nfsd4_op_u *u);
 extern __be32
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index 0d39af1b00a0..e8b00309c449 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -54,3 +54,21 @@
 #define NFS4_dec_cb_recall_any_sz	(cb_compound_dec_hdr_sz  +      \
 					cb_sequence_dec_sz +            \
 					op_dec_sz)
+
+/*
+ * 1: CB_GETATTR opcode (32-bit)
+ * N: file_handle
+ * 1: number of entries in attribute array (32-bit)
+ * 1: entry 0 in attribute array (32-bit)
+ */
+#define NFS4_enc_cb_getattr_sz		(cb_compound_enc_hdr_sz +       \
+					cb_sequence_enc_sz +            \
+					1 + enc_nfs4_fh_sz + 1 + 1)
+/*
+ * 4: fattr_bitmap_maxsz
+ * 1: attribute array len
+ * 2: change attr (64-bit)
+ * 2: size (64-bit)
+ */
+#define NFS4_dec_cb_getattr_sz		(cb_compound_dec_hdr_sz  +      \
+			cb_sequence_dec_sz + 4 + 1 + 2 + 2 + op_dec_sz)
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index bce734b68f08..de2073c47651 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -429,7 +429,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
 	nilfs_set_de_type(de, inode);
 	nilfs_commit_chunk(page, mapping, from, to);
 	nilfs_put_page(page);
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 }
 
 /*
@@ -519,7 +519,7 @@ got_it:
 	de->inode = cpu_to_le64(inode->i_ino);
 	nilfs_set_de_type(de, inode);
 	nilfs_commit_chunk(page, page->mapping, from, to);
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	nilfs_mark_inode_dirty(dir);
 	/* OFFSET_CACHE */
 out_put:
@@ -567,7 +567,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
 		pde->rec_len = nilfs_rec_len_to_disk(to - from);
 	dir->inode = 0;
 	nilfs_commit_chunk(page, mapping, from, to);
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 out:
 	nilfs_put_page(page);
 	return err;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 1a8bd5993476..f861f3a0bf5c 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -366,7 +366,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
 	atomic64_inc(&root->inodes_count);
 	inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
 	inode->i_ino = ino;
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 
 	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
 		err = nilfs_bmap_read(ii->i_bmap, NULL);
@@ -449,12 +449,12 @@ int nilfs_read_inode_common(struct inode *inode,
 	i_gid_write(inode, le32_to_cpu(raw_inode->i_gid));
 	set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
 	inode->i_size = le64_to_cpu(raw_inode->i_size);
-	inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
+	inode_set_atime(inode, le64_to_cpu(raw_inode->i_mtime),
+			le32_to_cpu(raw_inode->i_mtime_nsec));
 	inode_set_ctime(inode, le64_to_cpu(raw_inode->i_ctime),
 			le32_to_cpu(raw_inode->i_ctime_nsec));
-	inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
-	inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
-	inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
+	inode_set_mtime(inode, le64_to_cpu(raw_inode->i_mtime),
+			le32_to_cpu(raw_inode->i_mtime_nsec));
 	if (nilfs_is_metadata_file_inode(inode) && !S_ISREG(inode->i_mode))
 		return -EIO; /* this inode is for metadata and corrupted */
 	if (inode->i_nlink == 0)
@@ -768,10 +768,10 @@ void nilfs_write_inode_common(struct inode *inode,
 	raw_inode->i_gid = cpu_to_le32(i_gid_read(inode));
 	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
 	raw_inode->i_size = cpu_to_le64(inode->i_size);
-	raw_inode->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
-	raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
-	raw_inode->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
-	raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+	raw_inode->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+	raw_inode->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode));
+	raw_inode->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
+	raw_inode->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
 	raw_inode->i_blocks = cpu_to_le64(inode->i_blocks);
 
 	raw_inode->i_flags = cpu_to_le32(ii->i_flags);
@@ -875,7 +875,7 @@ void nilfs_truncate(struct inode *inode)
 
 	nilfs_truncate_bmap(ii, blkoff);
 
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	if (IS_SYNC(inode))
 		nilfs_set_transaction_flag(NILFS_TI_SYNC);
 
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index ebdcc25df0f7..869b016014d2 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -265,7 +265,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
 	struct dnotify_struct *dn;
 	struct inode *inode;
 	fl_owner_t id = current->files;
-	struct file *f;
+	struct file *f = NULL;
 	int destroy = 0, error = 0;
 	__u32 mask;
 
@@ -345,7 +345,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
 	}
 
 	rcu_read_lock();
-	f = lookup_fd_rcu(fd);
+	f = lookup_fdget_rcu(fd);
 	rcu_read_unlock();
 
 	/* if (f != filp) means that we lost a race and another task/thread
@@ -392,6 +392,8 @@ out_err:
 		fsnotify_put_mark(new_fsn_mark);
 	if (dn)
 		kmem_cache_free(dnotify_struct_cache, dn);
+	if (f)
+		fput(f);
 	return error;
 }
 
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 647a22433bd8..9a4b228d42fa 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -84,7 +84,7 @@ slow:
 		return -ENOMEM;
 	}
 	inode->i_ino = ns->inum;
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	inode->i_flags |= S_IMMUTABLE;
 	inode->i_mode = S_IFREG | S_IRUGO;
 	inode->i_fop = &ns_file_operations;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 99ac6ea277c4..aba1e22db4e9 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -648,7 +648,7 @@ static int ntfs_read_locked_inode(struct inode *vi)
 	 * mtime is the last change of the data within the file. Not changed
 	 * when only metadata is changed, e.g. a rename doesn't affect mtime.
 	 */
-	vi->i_mtime = ntfs2utc(si->last_data_change_time);
+	inode_set_mtime_to_ts(vi, ntfs2utc(si->last_data_change_time));
 	/*
 	 * ctime is the last change of the metadata of the file. This obviously
 	 * always changes, when mtime is changed. ctime can be changed on its
@@ -659,7 +659,7 @@ static int ntfs_read_locked_inode(struct inode *vi)
 	 * Last access to the data within the file. Not changed during a rename
 	 * for example but changed whenever the file is written to.
 	 */
-	vi->i_atime = ntfs2utc(si->last_access_time);
+	inode_set_atime_to_ts(vi, ntfs2utc(si->last_access_time));
 
 	/* Find the attribute list attribute if present. */
 	ntfs_attr_reinit_search_ctx(ctx);
@@ -1217,9 +1217,9 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 	vi->i_uid	= base_vi->i_uid;
 	vi->i_gid	= base_vi->i_gid;
 	set_nlink(vi, base_vi->i_nlink);
-	vi->i_mtime	= base_vi->i_mtime;
+	inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi));
 	inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi));
-	vi->i_atime	= base_vi->i_atime;
+	inode_set_atime_to_ts(vi, inode_get_atime(base_vi));
 	vi->i_generation = ni->seq_no = base_ni->seq_no;
 
 	/* Set inode type to zero but preserve permissions. */
@@ -1483,9 +1483,9 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
 	vi->i_uid	= base_vi->i_uid;
 	vi->i_gid	= base_vi->i_gid;
 	set_nlink(vi, base_vi->i_nlink);
-	vi->i_mtime	= base_vi->i_mtime;
+	inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi));
 	inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi));
-	vi->i_atime	= base_vi->i_atime;
+	inode_set_atime_to_ts(vi, inode_get_atime(base_vi));
 	vi->i_generation = ni->seq_no = base_ni->seq_no;
 	/* Set inode type to zero but preserve permissions. */
 	vi->i_mode	= base_vi->i_mode & ~S_IFMT;
@@ -2805,13 +2805,14 @@ done:
 	if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) {
 		struct timespec64 now = current_time(VFS_I(base_ni));
 		struct timespec64 ctime = inode_get_ctime(VFS_I(base_ni));
+		struct timespec64 mtime = inode_get_mtime(VFS_I(base_ni));
 		int sync_it = 0;
 
-		if (!timespec64_equal(&VFS_I(base_ni)->i_mtime, &now) ||
+		if (!timespec64_equal(&mtime, &now) ||
 		    !timespec64_equal(&ctime, &now))
 			sync_it = 1;
 		inode_set_ctime_to_ts(VFS_I(base_ni), now);
-		VFS_I(base_ni)->i_mtime = now;
+		inode_set_mtime_to_ts(VFS_I(base_ni), now);
 
 		if (sync_it)
 			mark_inode_dirty_sync(VFS_I(base_ni));
@@ -2925,9 +2926,9 @@ int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 		}
 	}
 	if (ia_valid & ATTR_ATIME)
-		vi->i_atime = attr->ia_atime;
+		inode_set_atime_to_ts(vi, attr->ia_atime);
 	if (ia_valid & ATTR_MTIME)
-		vi->i_mtime = attr->ia_mtime;
+		inode_set_mtime_to_ts(vi, attr->ia_mtime);
 	if (ia_valid & ATTR_CTIME)
 		inode_set_ctime_to_ts(vi, attr->ia_ctime);
 	mark_inode_dirty(vi);
@@ -2996,7 +2997,7 @@ int __ntfs_write_inode(struct inode *vi, int sync)
 	si = (STANDARD_INFORMATION*)((u8*)ctx->attr +
 			le16_to_cpu(ctx->attr->data.resident.value_offset));
 	/* Update the access times if they have changed. */
-	nt = utc2ntfs(vi->i_mtime);
+	nt = utc2ntfs(inode_get_mtime(vi));
 	if (si->last_data_change_time != nt) {
 		ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, "
 				"new = 0x%llx", vi->i_ino, (long long)
@@ -3014,7 +3015,7 @@ int __ntfs_write_inode(struct inode *vi, int sync)
 		si->last_mft_change_time = nt;
 		modified = true;
 	}
-	nt = utc2ntfs(vi->i_atime);
+	nt = utc2ntfs(inode_get_atime(vi));
 	if (si->last_access_time != nt) {
 		ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, "
 				"new = 0x%llx", vi->i_ino,
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index ad1a8f72da22..6fd1dc4b08c8 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -2682,7 +2682,7 @@ mft_rec_already_initialized:
 			vi->i_mode &= ~S_IWUGO;
 
 		/* Set the inode times to the current time. */
-		vi->i_atime = vi->i_mtime = inode_set_ctime_current(vi);
+		simple_inode_init_ts(vi);
 		/*
 		 * Set the file size to 0, the ntfs inode sizes are set to 0 by
 		 * the call to ntfs_init_big_inode() below.
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 1f7a194983c5..ad4a70b5d432 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -342,7 +342,7 @@ static int ntfs_extend(struct inode *inode, loff_t pos, size_t count,
 		err = 0;
 	}
 
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	mark_inode_dirty(inode);
 
 	if (IS_SYNC(inode)) {
@@ -400,7 +400,7 @@ static int ntfs_truncate(struct inode *inode, loff_t new_size)
 	ni_unlock(ni);
 
 	ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE;
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	if (!IS_DIRSYNC(inode)) {
 		dirty = 1;
 	} else {
@@ -642,7 +642,7 @@ out:
 		filemap_invalidate_unlock(mapping);
 
 	if (!err) {
-		inode->i_mtime = inode_set_ctime_current(inode);
+		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 		mark_inode_dirty(inode);
 	}
 
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index dad976a68985..3df2d9e34b91 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -3271,7 +3271,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
 	if (is_rec_inuse(ni->mi.mrec) &&
 	    !(sbi->flags & NTFS_FLAGS_LOG_REPLAYING) && inode->i_nlink) {
 		bool modified = false;
-		struct timespec64 ctime = inode_get_ctime(inode);
+		struct timespec64 ts;
 
 		/* Update times in standard attribute. */
 		std = ni_std(ni);
@@ -3281,19 +3281,22 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
 		}
 
 		/* Update the access times if they have changed. */
-		dup.m_time = kernel2nt(&inode->i_mtime);
+		ts = inode_get_mtime(inode);
+		dup.m_time = kernel2nt(&ts);
 		if (std->m_time != dup.m_time) {
 			std->m_time = dup.m_time;
 			modified = true;
 		}
 
-		dup.c_time = kernel2nt(&ctime);
+		ts = inode_get_ctime(inode);
+		dup.c_time = kernel2nt(&ts);
 		if (std->c_time != dup.c_time) {
 			std->c_time = dup.c_time;
 			modified = true;
 		}
 
-		dup.a_time = kernel2nt(&inode->i_atime);
+		ts = inode_get_atime(inode);
+		dup.a_time = kernel2nt(&ts);
 		if (std->a_time != dup.a_time) {
 			std->a_time = dup.a_time;
 			modified = true;
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index d6d021e19aaa..5e3d71374918 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -44,7 +44,7 @@ static struct inode *ntfs_read_mft(struct inode *inode,
 	u64 t64;
 	struct MFT_REC *rec;
 	struct runs_tree *run;
-	struct timespec64 ctime;
+	struct timespec64 ts;
 
 	inode->i_op = NULL;
 	/* Setup 'uid' and 'gid' */
@@ -169,10 +169,12 @@ next_attr:
 #ifdef STATX_BTIME
 		nt2kernel(std5->cr_time, &ni->i_crtime);
 #endif
-		nt2kernel(std5->a_time, &inode->i_atime);
-		nt2kernel(std5->c_time, &ctime);
-		inode_set_ctime_to_ts(inode, ctime);
-		nt2kernel(std5->m_time, &inode->i_mtime);
+		nt2kernel(std5->a_time, &ts);
+		inode_set_atime_to_ts(inode, ts);
+		nt2kernel(std5->c_time, &ts);
+		inode_set_ctime_to_ts(inode, ts);
+		nt2kernel(std5->m_time, &ts);
+		inode_set_mtime_to_ts(inode, ts);
 
 		ni->std_fa = std5->fa;
 
@@ -960,7 +962,8 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
 
 	if (err >= 0) {
 		if (!(ni->std_fa & FILE_ATTRIBUTE_ARCHIVE)) {
-			inode->i_mtime = inode_set_ctime_current(inode);
+			inode_set_mtime_to_ts(inode,
+					      inode_set_ctime_current(inode));
 			ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE;
 			dirty = true;
 		}
@@ -1660,9 +1663,11 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
 	d_instantiate(dentry, inode);
 
 	/* Set original time. inode times (i_ctime) may be changed in ntfs_init_acl. */
-	inode->i_atime = inode->i_mtime =
-		inode_set_ctime_to_ts(inode, ni->i_crtime);
-	dir->i_mtime = inode_set_ctime_to_ts(dir, ni->i_crtime);
+	inode_set_atime_to_ts(inode, ni->i_crtime);
+	inode_set_ctime_to_ts(inode, ni->i_crtime);
+	inode_set_mtime_to_ts(inode, ni->i_crtime);
+	inode_set_mtime_to_ts(dir, ni->i_crtime);
+	inode_set_ctime_to_ts(dir, ni->i_crtime);
 
 	mark_inode_dirty(dir);
 	mark_inode_dirty(inode);
@@ -1768,7 +1773,7 @@ int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry)
 
 	if (!err) {
 		drop_nlink(inode);
-		dir->i_mtime = inode_set_ctime_current(dir);
+		inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 		mark_inode_dirty(dir);
 		inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
 		if (inode->i_nlink)
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index eedacf94edd8..ee3093be5170 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -156,8 +156,8 @@ static int ntfs_link(struct dentry *ode, struct inode *dir, struct dentry *de)
 	err = ntfs_link_inode(inode, de);
 
 	if (!err) {
-		dir->i_mtime = inode_set_ctime_to_ts(
-			inode, inode_set_ctime_current(dir));
+		inode_set_ctime_current(inode);
+		inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 		mark_inode_dirty(inode);
 		mark_inode_dirty(dir);
 		d_instantiate(de, inode);
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index 0e6a2777870c..f6706143d14b 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -872,7 +872,7 @@ int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode,
 
 int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry);
 ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
-extern const struct xattr_handler *ntfs_xattr_handlers[];
+extern const struct xattr_handler * const ntfs_xattr_handlers[];
 
 int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size);
 void ntfs_get_wsl_perm(struct inode *inode);
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index 4920548192a0..4274b6f31cfa 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -1021,7 +1021,7 @@ static const struct xattr_handler ntfs_other_xattr_handler = {
 	.list	= ntfs_xattr_user_list,
 };
 
-const struct xattr_handler *ntfs_xattr_handlers[] = {
+const struct xattr_handler * const ntfs_xattr_handlers[] = {
 	&ntfs_other_xattr_handler,
 	NULL,
 };
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index e75137a8e7cb..62464d194da3 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -193,8 +193,8 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
 	inode->i_mode = new_mode;
 	inode_set_ctime_current(inode);
 	di->i_mode = cpu_to_le16(inode->i_mode);
-	di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
-	di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+	di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+	di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
 	ocfs2_update_inode_fsync_trans(handle, inode, 0);
 
 	ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index aef58f1395c8..f0937902f7b4 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7436,10 +7436,10 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
 	}
 
 	inode->i_blocks = ocfs2_inode_sector_count(inode);
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 
-	di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
-	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+	di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode));
+	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
 
 	ocfs2_update_inode_fsync_trans(handle, inode, 1);
 	ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0fdba30740ab..6ab03494fc6e 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2048,9 +2048,9 @@ out_write_size:
 		}
 		inode->i_blocks = ocfs2_inode_sector_count(inode);
 		di->i_size = cpu_to_le64((u64)i_size_read(inode));
-		inode->i_mtime = inode_set_ctime_current(inode);
-		di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
-		di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+		di->i_mtime = di->i_ctime = cpu_to_le64(inode_get_mtime_sec(inode));
+		di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
 		if (handle)
 			ocfs2_update_inode_fsync_trans(handle, inode, 1);
 	}
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 21472e3ed182..4d7efefa98c5 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -213,7 +213,7 @@ struct o2hb_region {
 	unsigned int		hr_num_pages;
 
 	struct page             **hr_slot_data;
-	struct block_device	*hr_bdev;
+	struct bdev_handle	*hr_bdev_handle;
 	struct o2hb_disk_slot	*hr_slots;
 
 	/* live node map of this region */
@@ -261,6 +261,11 @@ struct o2hb_region {
 	int			hr_last_hb_status;
 };
 
+static inline struct block_device *reg_bdev(struct o2hb_region *reg)
+{
+	return reg->hr_bdev_handle ? reg->hr_bdev_handle->bdev : NULL;
+}
+
 struct o2hb_bio_wait_ctxt {
 	atomic_t          wc_num_reqs;
 	struct completion wc_io_complete;
@@ -286,7 +291,7 @@ static void o2hb_write_timeout(struct work_struct *work)
 			     hr_write_timeout_work.work);
 
 	mlog(ML_ERROR, "Heartbeat write timeout to device %pg after %u "
-	     "milliseconds\n", reg->hr_bdev,
+	     "milliseconds\n", reg_bdev(reg),
 	     jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
 
 	if (o2hb_global_heartbeat_active()) {
@@ -383,7 +388,7 @@ static void o2hb_nego_timeout(struct work_struct *work)
 		if (!test_bit(master_node, reg->hr_nego_node_bitmap)) {
 			printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg).\n",
 				o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000,
-				config_item_name(&reg->hr_item), reg->hr_bdev);
+				config_item_name(&reg->hr_item), reg_bdev(reg));
 			set_bit(master_node, reg->hr_nego_node_bitmap);
 		}
 		if (!bitmap_equal(reg->hr_nego_node_bitmap, live_node_bitmap,
@@ -398,7 +403,8 @@ static void o2hb_nego_timeout(struct work_struct *work)
 		}
 
 		printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%pg) is down.\n",
-			config_item_name(&reg->hr_item), reg->hr_bdev);
+			config_item_name(&reg->hr_item),
+			reg_bdev(reg));
 		/* approve negotiate timeout request. */
 		o2hb_arm_timeout(reg);
 
@@ -419,7 +425,7 @@ static void o2hb_nego_timeout(struct work_struct *work)
 		/* negotiate timeout with master node. */
 		printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg), negotiate timeout with node %d.\n",
 			o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(&reg->hr_item),
-			reg->hr_bdev, master_node);
+			reg_bdev(reg), master_node);
 		ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG,
 				master_node);
 		if (ret)
@@ -436,7 +442,8 @@ static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data,
 
 	nego_msg = (struct o2hb_nego_msg *)msg->buf;
 	printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%pg).\n",
-		nego_msg->node_num, config_item_name(&reg->hr_item), reg->hr_bdev);
+		nego_msg->node_num, config_item_name(&reg->hr_item),
+		reg_bdev(reg));
 	if (nego_msg->node_num < O2NM_MAX_NODES)
 		set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap);
 	else
@@ -451,7 +458,7 @@ static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data,
 	struct o2hb_region *reg = data;
 
 	printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%pg).\n",
-		config_item_name(&reg->hr_item), reg->hr_bdev);
+		config_item_name(&reg->hr_item), reg_bdev(reg));
 	o2hb_arm_timeout(reg);
 	return 0;
 }
@@ -515,7 +522,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
 	 * GFP_KERNEL that the local node can get fenced. It would be
 	 * nicest if we could pre-allocate these bios and avoid this
 	 * all together. */
-	bio = bio_alloc(reg->hr_bdev, 16, opf, GFP_ATOMIC);
+	bio = bio_alloc(reg_bdev(reg), 16, opf, GFP_ATOMIC);
 	if (!bio) {
 		mlog(ML_ERROR, "Could not alloc slots BIO!\n");
 		bio = ERR_PTR(-ENOMEM);
@@ -687,7 +694,7 @@ static int o2hb_check_own_slot(struct o2hb_region *reg)
 		errstr = ERRSTR3;
 
 	mlog(ML_ERROR, "%s (%pg): expected(%u:0x%llx, 0x%llx), "
-	     "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_bdev,
+	     "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg_bdev(reg),
 	     slot->ds_node_num, (unsigned long long)slot->ds_last_generation,
 	     (unsigned long long)slot->ds_last_time, hb_block->hb_node,
 	     (unsigned long long)le64_to_cpu(hb_block->hb_generation),
@@ -861,7 +868,7 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg)
 		goto unlock;
 
 	printk(KERN_NOTICE "o2hb: Region %s (%pg) is now a quorum device\n",
-	       config_item_name(&reg->hr_item), reg->hr_bdev);
+	       config_item_name(&reg->hr_item), reg_bdev(reg));
 
 	set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
 
@@ -920,7 +927,7 @@ static int o2hb_check_slot(struct o2hb_region *reg,
 		 * consider it a transient miss but don't populate any
 		 * other values as they may be junk. */
 		mlog(ML_ERROR, "Node %d has written a bad crc to %pg\n",
-		     slot->ds_node_num, reg->hr_bdev);
+		     slot->ds_node_num, reg_bdev(reg));
 		o2hb_dump_slot(hb_block);
 
 		slot->ds_equal_samples++;
@@ -1003,8 +1010,8 @@ fire_callbacks:
 			     "of %u ms, but our count is %u ms.\n"
 			     "Please double check your configuration values "
 			     "for 'O2CB_HEARTBEAT_THRESHOLD'\n",
-			     slot->ds_node_num, reg->hr_bdev, slot_dead_ms,
-			     dead_ms);
+			     slot->ds_node_num, reg_bdev(reg),
+			     slot_dead_ms, dead_ms);
 		}
 		goto out;
 	}
@@ -1143,7 +1150,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
 		 * can't be sure that the new block ever made it to
 		 * disk */
 		mlog(ML_ERROR, "Write error %d on device \"%pg\"\n",
-		     write_wc.wc_error, reg->hr_bdev);
+		     write_wc.wc_error, reg_bdev(reg));
 		ret = write_wc.wc_error;
 		goto bail;
 	}
@@ -1169,7 +1176,7 @@ bail:
 			printk(KERN_NOTICE "o2hb: Unable to stabilize "
 			       "heartbeat on region %s (%pg)\n",
 			       config_item_name(&reg->hr_item),
-			       reg->hr_bdev);
+			       reg_bdev(reg));
 			atomic_set(&reg->hr_steady_iterations, 0);
 			reg->hr_aborted_start = 1;
 			wake_up(&o2hb_steady_queue);
@@ -1489,7 +1496,7 @@ static void o2hb_region_release(struct config_item *item)
 	struct page *page;
 	struct o2hb_region *reg = to_o2hb_region(item);
 
-	mlog(ML_HEARTBEAT, "hb region release (%pg)\n", reg->hr_bdev);
+	mlog(ML_HEARTBEAT, "hb region release (%pg)\n", reg_bdev(reg));
 
 	kfree(reg->hr_tmp_block);
 
@@ -1502,8 +1509,8 @@ static void o2hb_region_release(struct config_item *item)
 		kfree(reg->hr_slot_data);
 	}
 
-	if (reg->hr_bdev)
-		blkdev_put(reg->hr_bdev, NULL);
+	if (reg->hr_bdev_handle)
+		bdev_release(reg->hr_bdev_handle);
 
 	kfree(reg->hr_slots);
 
@@ -1562,7 +1569,7 @@ static ssize_t o2hb_region_block_bytes_store(struct config_item *item,
 	unsigned long block_bytes;
 	unsigned int block_bits;
 
-	if (reg->hr_bdev)
+	if (reg->hr_bdev_handle)
 		return -EINVAL;
 
 	status = o2hb_read_block_input(reg, page, &block_bytes,
@@ -1591,7 +1598,7 @@ static ssize_t o2hb_region_start_block_store(struct config_item *item,
 	char *p = (char *)page;
 	ssize_t ret;
 
-	if (reg->hr_bdev)
+	if (reg->hr_bdev_handle)
 		return -EINVAL;
 
 	ret = kstrtoull(p, 0, &tmp);
@@ -1616,7 +1623,7 @@ static ssize_t o2hb_region_blocks_store(struct config_item *item,
 	unsigned long tmp;
 	char *p = (char *)page;
 
-	if (reg->hr_bdev)
+	if (reg->hr_bdev_handle)
 		return -EINVAL;
 
 	tmp = simple_strtoul(p, &p, 0);
@@ -1635,8 +1642,8 @@ static ssize_t o2hb_region_dev_show(struct config_item *item, char *page)
 {
 	unsigned int ret = 0;
 
-	if (to_o2hb_region(item)->hr_bdev)
-		ret = sprintf(page, "%pg\n", to_o2hb_region(item)->hr_bdev);
+	if (to_o2hb_region(item)->hr_bdev_handle)
+		ret = sprintf(page, "%pg\n", reg_bdev(to_o2hb_region(item)));
 
 	return ret;
 }
@@ -1745,7 +1752,10 @@ out:
 	return ret;
 }
 
-/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */
+/*
+ * this is acting as commit; we set up all of hr_bdev_handle and hr_task or
+ * nothing
+ */
 static ssize_t o2hb_region_dev_store(struct config_item *item,
 				     const char *page,
 				     size_t count)
@@ -1759,7 +1769,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
 	ssize_t ret = -EINVAL;
 	int live_threshold;
 
-	if (reg->hr_bdev)
+	if (reg->hr_bdev_handle)
 		goto out;
 
 	/* We can't heartbeat without having had our node number
@@ -1785,16 +1795,15 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
 	if (!S_ISBLK(f.file->f_mapping->host->i_mode))
 		goto out2;
 
-	reg->hr_bdev = blkdev_get_by_dev(f.file->f_mapping->host->i_rdev,
-					 BLK_OPEN_WRITE | BLK_OPEN_READ, NULL,
-					 NULL);
-	if (IS_ERR(reg->hr_bdev)) {
-		ret = PTR_ERR(reg->hr_bdev);
-		reg->hr_bdev = NULL;
+	reg->hr_bdev_handle = bdev_open_by_dev(f.file->f_mapping->host->i_rdev,
+			BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL);
+	if (IS_ERR(reg->hr_bdev_handle)) {
+		ret = PTR_ERR(reg->hr_bdev_handle);
+		reg->hr_bdev_handle = NULL;
 		goto out2;
 	}
 
-	sectsize = bdev_logical_block_size(reg->hr_bdev);
+	sectsize = bdev_logical_block_size(reg_bdev(reg));
 	if (sectsize != reg->hr_block_bytes) {
 		mlog(ML_ERROR,
 		     "blocksize %u incorrect for device, expected %d",
@@ -1890,12 +1899,12 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
 
 	if (hb_task && o2hb_global_heartbeat_active())
 		printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%pg)\n",
-		       config_item_name(&reg->hr_item), reg->hr_bdev);
+		       config_item_name(&reg->hr_item), reg_bdev(reg));
 
 out3:
 	if (ret < 0) {
-		blkdev_put(reg->hr_bdev, NULL);
-		reg->hr_bdev = NULL;
+		bdev_release(reg->hr_bdev_handle);
+		reg->hr_bdev_handle = NULL;
 	}
 out2:
 	fdput(f);
@@ -2085,7 +2094,7 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
 		printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%pg)\n",
 		       ((atomic_read(&reg->hr_steady_iterations) == 0) ?
 			"stopped" : "start aborted"), config_item_name(item),
-		       reg->hr_bdev);
+		       reg_bdev(reg));
 	}
 
 	/*
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 8b123d543e6e..a14c8fee6ee5 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1658,7 +1658,8 @@ int __ocfs2_add_entry(handle_t *handle,
 				offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
 
 		if (ocfs2_dirent_would_fit(de, rec_len)) {
-			dir->i_mtime = inode_set_ctime_current(dir);
+			inode_set_mtime_to_ts(dir,
+					      inode_set_ctime_current(dir));
 			retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
 			if (retval < 0) {
 				mlog_errno(retval);
@@ -2962,11 +2963,11 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	ocfs2_dinode_new_extent_list(dir, di);
 
 	i_size_write(dir, sb->s_blocksize);
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 
 	di->i_size = cpu_to_le64(sb->s_blocksize);
-	di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(dir).tv_sec);
-	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(dir).tv_nsec);
+	di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(dir));
+	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(dir));
 	ocfs2_update_inode_fsync_trans(handle, dir, 1);
 
 	/*
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 81265123ce6c..9b57d012fd5c 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -337,7 +337,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
 	if (inode) {
 		inode->i_ino = get_next_ino();
 		inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
-		inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+		simple_inode_init_ts(inode);
 		inc_nlink(inode);
 
 		inode->i_fop = &simple_dir_operations;
@@ -360,7 +360,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
 
 	inode->i_ino = get_next_ino();
 	inode_init_owner(&nop_mnt_idmap, inode, parent, mode);
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 
 	ip = DLMFS_I(inode);
 	ip->ip_conn = DLMFS_I(parent)->ip_conn;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index c3e2961ee5db..64a6ef638495 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2162,7 +2162,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
 	struct ocfs2_meta_lvb *lvb;
-	struct timespec64 ctime = inode_get_ctime(inode);
+	struct timespec64 ts;
 
 	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
 
@@ -2183,12 +2183,12 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
 	lvb->lvb_igid      = cpu_to_be32(i_gid_read(inode));
 	lvb->lvb_imode     = cpu_to_be16(inode->i_mode);
 	lvb->lvb_inlink    = cpu_to_be16(inode->i_nlink);
-	lvb->lvb_iatime_packed  =
-		cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
-	lvb->lvb_ictime_packed =
-		cpu_to_be64(ocfs2_pack_timespec(&ctime));
-	lvb->lvb_imtime_packed =
-		cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
+	ts = inode_get_atime(inode);
+	lvb->lvb_iatime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts));
+	ts = inode_get_ctime(inode);
+	lvb->lvb_ictime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts));
+	ts = inode_get_mtime(inode);
+	lvb->lvb_imtime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts));
 	lvb->lvb_iattr    = cpu_to_be32(oi->ip_attr);
 	lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features);
 	lvb->lvb_igeneration = cpu_to_be32(inode->i_generation);
@@ -2209,7 +2209,7 @@ static int ocfs2_refresh_inode_from_lvb(struct inode *inode)
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
 	struct ocfs2_meta_lvb *lvb;
-	struct timespec64 ctime;
+	struct timespec64 ts;
 
 	mlog_meta_lvb(0, lockres);
 
@@ -2236,13 +2236,12 @@ static int ocfs2_refresh_inode_from_lvb(struct inode *inode)
 	i_gid_write(inode, be32_to_cpu(lvb->lvb_igid));
 	inode->i_mode    = be16_to_cpu(lvb->lvb_imode);
 	set_nlink(inode, be16_to_cpu(lvb->lvb_inlink));
-	ocfs2_unpack_timespec(&inode->i_atime,
-			      be64_to_cpu(lvb->lvb_iatime_packed));
-	ocfs2_unpack_timespec(&inode->i_mtime,
-			      be64_to_cpu(lvb->lvb_imtime_packed));
-	ocfs2_unpack_timespec(&ctime,
-			      be64_to_cpu(lvb->lvb_ictime_packed));
-	inode_set_ctime_to_ts(inode, ctime);
+	ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_iatime_packed));
+	inode_set_atime_to_ts(inode, ts);
+	ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_imtime_packed));
+	inode_set_mtime_to_ts(inode, ts);
+	ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_ictime_packed));
+	inode_set_ctime_to_ts(inode, ts);
 	spin_unlock(&oi->ip_lock);
 	return 0;
 }
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index c45596c25c66..94e2a1244442 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -233,16 +233,18 @@ int ocfs2_should_update_atime(struct inode *inode,
 
 	if (vfsmnt->mnt_flags & MNT_RELATIME) {
 		struct timespec64 ctime = inode_get_ctime(inode);
+		struct timespec64 atime = inode_get_atime(inode);
+		struct timespec64 mtime = inode_get_mtime(inode);
 
-		if ((timespec64_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
-		    (timespec64_compare(&inode->i_atime, &ctime) <= 0))
+		if ((timespec64_compare(&atime, &mtime) <= 0) ||
+		    (timespec64_compare(&atime, &ctime) <= 0))
 			return 1;
 
 		return 0;
 	}
 
 	now = current_time(inode);
-	if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
+	if ((now.tv_sec - inode_get_atime_sec(inode) <= osb->s_atime_quantum))
 		return 0;
 	else
 		return 1;
@@ -275,9 +277,9 @@ int ocfs2_update_inode_atime(struct inode *inode,
 	 * have i_rwsem to guard against concurrent changes to other
 	 * inode fields.
 	 */
-	inode->i_atime = current_time(inode);
-	di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
-	di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+	inode_set_atime_to_ts(inode, current_time(inode));
+	di->i_atime = cpu_to_le64(inode_get_atime_sec(inode));
+	di->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode));
 	ocfs2_update_inode_fsync_trans(handle, inode, 0);
 	ocfs2_journal_dirty(handle, bh);
 
@@ -296,7 +298,7 @@ int ocfs2_set_inode_size(handle_t *handle,
 
 	i_size_write(inode, new_i_size);
 	inode->i_blocks = ocfs2_inode_sector_count(inode);
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 
 	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
 	if (status < 0) {
@@ -417,12 +419,12 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 	}
 
 	i_size_write(inode, new_i_size);
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 
 	di = (struct ocfs2_dinode *) fe_bh->b_data;
 	di->i_size = cpu_to_le64(new_i_size);
-	di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
-	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+	di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode));
+	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
 	ocfs2_update_inode_fsync_trans(handle, inode, 0);
 
 	ocfs2_journal_dirty(handle, fe_bh);
@@ -821,9 +823,9 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 	i_size_write(inode, abs_to);
 	inode->i_blocks = ocfs2_inode_sector_count(inode);
 	di->i_size = cpu_to_le64((u64)i_size_read(inode));
-	inode->i_mtime = inode_set_ctime_current(inode);
-	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
-	di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+	di->i_mtime = di->i_ctime = cpu_to_le64(inode_get_mtime_sec(inode));
+	di->i_ctime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
 	di->i_mtime_nsec = di->i_ctime_nsec;
 	if (handle) {
 		ocfs2_journal_dirty(handle, di_bh);
@@ -2040,7 +2042,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
 		goto out_inode_unlock;
 	}
 
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
 	if (ret < 0)
 		mlog_errno(ret);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index e8771600b930..999111bfc271 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -302,10 +302,10 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 		inode->i_blocks = ocfs2_inode_sector_count(inode);
 		inode->i_mapping->a_ops = &ocfs2_aops;
 	}
-	inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
-	inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
-	inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
-	inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
+	inode_set_atime(inode, le64_to_cpu(fe->i_atime),
+		        le32_to_cpu(fe->i_atime_nsec));
+	inode_set_mtime(inode, le64_to_cpu(fe->i_mtime),
+		        le32_to_cpu(fe->i_mtime_nsec));
 	inode_set_ctime(inode, le64_to_cpu(fe->i_ctime),
 		        le32_to_cpu(fe->i_ctime_nsec));
 
@@ -1312,12 +1312,12 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 	fe->i_uid = cpu_to_le32(i_uid_read(inode));
 	fe->i_gid = cpu_to_le32(i_gid_read(inode));
 	fe->i_mode = cpu_to_le16(inode->i_mode);
-	fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
-	fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
-	fe->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
-	fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
-	fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
-	fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+	fe->i_atime = cpu_to_le64(inode_get_atime_sec(inode));
+	fe->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode));
+	fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+	fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
+	fe->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode));
+	fe->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
 
 	ocfs2_journal_dirty(handle, bh);
 	ocfs2_update_inode_fsync_trans(handle, inode, 1);
@@ -1348,10 +1348,10 @@ void ocfs2_refresh_inode(struct inode *inode,
 		inode->i_blocks = 0;
 	else
 		inode->i_blocks = ocfs2_inode_sector_count(inode);
-	inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
-	inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
-	inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
-	inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
+	inode_set_atime(inode, le64_to_cpu(fe->i_atime),
+			le32_to_cpu(fe->i_atime_nsec));
+	inode_set_mtime(inode, le64_to_cpu(fe->i_mtime),
+			le32_to_cpu(fe->i_mtime_nsec));
 	inode_set_ctime(inode, le64_to_cpu(fe->i_ctime),
 			le32_to_cpu(fe->i_ctime_nsec));
 
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 05d67968a3a9..1f9ed117e78b 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -951,8 +951,8 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
 
 	di = (struct ocfs2_dinode *)di_bh->b_data;
 	inode_set_ctime_current(inode);
-	di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
-	di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+	di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+	di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
 	ocfs2_update_inode_fsync_trans(handle, inode, 0);
 
 	ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 5cd6d7771cea..681e9501cdd3 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -795,8 +795,8 @@ static int ocfs2_link(struct dentry *old_dentry,
 	inc_nlink(inode);
 	inode_set_ctime_current(inode);
 	ocfs2_set_links_count(fe, inode->i_nlink);
-	fe->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
-	fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+	fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+	fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
 	ocfs2_journal_dirty(handle, fe_bh);
 
 	err = ocfs2_add_entry(handle, dentry, inode,
@@ -995,7 +995,7 @@ static int ocfs2_unlink(struct inode *dir,
 	ocfs2_set_links_count(fe, inode->i_nlink);
 	ocfs2_journal_dirty(handle, fe_bh);
 
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	if (S_ISDIR(inode->i_mode))
 		drop_nlink(dir);
 
@@ -1550,8 +1550,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
 	if (status >= 0) {
 		old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
 
-		old_di->i_ctime = cpu_to_le64(inode_get_ctime(old_inode).tv_sec);
-		old_di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(old_inode).tv_nsec);
+		old_di->i_ctime = cpu_to_le64(inode_get_ctime_sec(old_inode));
+		old_di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(old_inode));
 		ocfs2_journal_dirty(handle, old_inode_bh);
 	} else
 		mlog_errno(status);
@@ -1592,7 +1592,7 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
 		drop_nlink(new_inode);
 		inode_set_ctime_current(new_inode);
 	}
-	old_dir->i_mtime = inode_set_ctime_current(old_dir);
+	inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
 
 	if (update_dot_dot) {
 		status = ocfs2_update_entry(old_inode, handle,
@@ -1614,8 +1614,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
 
 	if (old_dir != new_dir) {
 		/* Keep the same times on both directories.*/
-		new_dir->i_mtime = inode_set_ctime_to_ts(new_dir,
-							 inode_get_ctime(old_dir));
+		inode_set_mtime_to_ts(new_dir,
+				      inode_set_ctime_to_ts(new_dir, inode_get_ctime(old_dir)));
 
 		/*
 		 * This will also pick up the i_nlink change from the
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 25c8ec3c8c3a..3f80a56d0d60 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3751,8 +3751,8 @@ static int ocfs2_change_ctime(struct inode *inode,
 	}
 
 	inode_set_ctime_current(inode);
-	di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
-	di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+	di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+	di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
 
 	ocfs2_journal_dirty(handle, di_bh);
 
@@ -4075,10 +4075,10 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
 		 */
 		inode_set_ctime_current(t_inode);
 
-		di->i_ctime = cpu_to_le64(inode_get_ctime(t_inode).tv_sec);
-		di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(t_inode).tv_nsec);
+		di->i_ctime = cpu_to_le64(inode_get_ctime_sec(t_inode));
+		di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(t_inode));
 
-		t_inode->i_mtime = s_inode->i_mtime;
+		inode_set_mtime_to_ts(t_inode, inode_get_mtime(s_inode));
 		di->i_mtime = s_di->i_mtime;
 		di->i_mtime_nsec = s_di->i_mtime_nsec;
 	}
@@ -4456,7 +4456,7 @@ int ocfs2_reflink_update_dest(struct inode *dest,
 	if (newlen > i_size_read(dest))
 		i_size_write(dest, newlen);
 	spin_unlock(&OCFS2_I(dest)->ip_lock);
-	dest->i_mtime = inode_set_ctime_current(dest);
+	inode_set_mtime_to_ts(dest, inode_set_ctime_current(dest));
 
 	ret = ocfs2_mark_inode_dirty(handle, dest, d_bh);
 	if (ret) {
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index da7718cef735..e544c704b583 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -37,7 +37,7 @@ struct ocfs2_slot_info {
 	unsigned int si_blocks;
 	struct buffer_head **si_bh;
 	unsigned int si_num_slots;
-	struct ocfs2_slot si_slots[];
+	struct ocfs2_slot si_slots[] __counted_by(si_num_slots);
 };
 
 
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 6510ad783c91..3b81213ed7b8 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -87,14 +87,14 @@ static struct ocfs2_xattr_def_value_root def_xv = {
 	.xv.xr_list.l_count = cpu_to_le16(1),
 };
 
-const struct xattr_handler *ocfs2_xattr_handlers[] = {
+const struct xattr_handler * const ocfs2_xattr_handlers[] = {
 	&ocfs2_xattr_user_handler,
 	&ocfs2_xattr_trusted_handler,
 	&ocfs2_xattr_security_handler,
 	NULL
 };
 
-static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
+static const struct xattr_handler * const ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
 	[OCFS2_XATTR_INDEX_USER]		= &ocfs2_xattr_user_handler,
 	[OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]	= &nop_posix_acl_access,
 	[OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]	= &nop_posix_acl_default,
@@ -3422,8 +3422,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 		}
 
 		inode_set_ctime_current(inode);
-		di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
-		di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+		di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+		di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
 		ocfs2_journal_dirty(ctxt->handle, xis->inode_bh);
 	}
 out:
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 00308b57f64f..65e9aa743919 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -30,7 +30,7 @@ struct ocfs2_security_xattr_info {
 extern const struct xattr_handler ocfs2_xattr_user_handler;
 extern const struct xattr_handler ocfs2_xattr_trusted_handler;
 extern const struct xattr_handler ocfs2_xattr_security_handler;
-extern const struct xattr_handler *ocfs2_xattr_handlers[];
+extern const struct xattr_handler * const ocfs2_xattr_handlers[];
 
 ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
 int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 2f8c1882f45c..d6cd81163030 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -51,7 +51,7 @@ struct inode *omfs_new_inode(struct inode *dir, umode_t mode)
 	inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
 	inode->i_mapping->a_ops = &omfs_aops;
 
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	switch (mode & S_IFMT) {
 	case S_IFDIR:
 		inode->i_op = &omfs_dir_inops;
@@ -134,8 +134,8 @@ static int __omfs_write_inode(struct inode *inode, int wait)
 	oi->i_head.h_magic = OMFS_IMAGIC;
 	oi->i_size = cpu_to_be64(inode->i_size);
 
-	ctime = inode_get_ctime(inode).tv_sec * 1000LL +
-		((inode_get_ctime(inode).tv_nsec + 999)/1000);
+	ctime = inode_get_ctime_sec(inode) * 1000LL +
+		((inode_get_ctime_nsec(inode) + 999)/1000);
 	oi->i_ctime = cpu_to_be64(ctime);
 
 	omfs_update_checksums(oi);
@@ -230,11 +230,9 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino)
 	ctime = be64_to_cpu(oi->i_ctime);
 	nsecs = do_div(ctime, 1000) * 1000L;
 
-	inode->i_atime.tv_sec = ctime;
-	inode->i_mtime.tv_sec = ctime;
+	inode_set_atime(inode, ctime, nsecs);
+	inode_set_mtime(inode, ctime, nsecs);
 	inode_set_ctime(inode, ctime, nsecs);
-	inode->i_atime.tv_nsec = nsecs;
-	inode->i_mtime.tv_nsec = nsecs;
 
 	inode->i_mapping->a_ops = &omfs_aops;
 
diff --git a/fs/open.c b/fs/open.c
index 98f6601fbac6..02dc608d40d8 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -870,6 +870,30 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
 	return ksys_fchown(fd, user, group);
 }
 
+static inline int file_get_write_access(struct file *f)
+{
+	int error;
+
+	error = get_write_access(f->f_inode);
+	if (unlikely(error))
+		return error;
+	error = mnt_get_write_access(f->f_path.mnt);
+	if (unlikely(error))
+		goto cleanup_inode;
+	if (unlikely(f->f_mode & FMODE_BACKING)) {
+		error = mnt_get_write_access(backing_file_user_path(f)->mnt);
+		if (unlikely(error))
+			goto cleanup_mnt;
+	}
+	return 0;
+
+cleanup_mnt:
+	mnt_put_write_access(f->f_path.mnt);
+cleanup_inode:
+	put_write_access(f->f_inode);
+	return error;
+}
+
 static int do_dentry_open(struct file *f,
 			  struct inode *inode,
 			  int (*open)(struct inode *, struct file *))
@@ -892,14 +916,9 @@ static int do_dentry_open(struct file *f,
 	if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
 		i_readcount_inc(inode);
 	} else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
-		error = get_write_access(inode);
+		error = file_get_write_access(f);
 		if (unlikely(error))
 			goto cleanup_file;
-		error = __mnt_want_write(f->f_path.mnt);
-		if (unlikely(error)) {
-			put_write_access(inode);
-			goto cleanup_file;
-		}
 		f->f_mode |= FMODE_WRITER;
 	}
 
@@ -1163,20 +1182,19 @@ EXPORT_SYMBOL_GPL(kernel_file_open);
 
 /**
  * backing_file_open - open a backing file for kernel internal use
- * @path:	path of the file to open
+ * @user_path:	path that the user requested to open
  * @flags:	open flags
  * @real_path:	path of the backing file
  * @cred:	credentials for open
  *
  * Open a backing file for a stackable filesystem (e.g., overlayfs).
- * @path may be on the stackable filesystem and backing inode on the
- * underlying filesystem. In this case, we want to be able to return
- * the @real_path of the backing inode. This is done by embedding the
- * returned file into a container structure that also stores the path of
- * the backing inode on the underlying filesystem, which can be
- * retrieved using backing_file_real_path().
+ * @user_path may be on the stackable filesystem and @real_path on the
+ * underlying filesystem.  In this case, we want to be able to return the
+ * @user_path of the stackable filesystem. This is done by embedding the
+ * returned file into a container structure that also stores the stacked
+ * file's path, which can be retrieved using backing_file_user_path().
  */
-struct file *backing_file_open(const struct path *path, int flags,
+struct file *backing_file_open(const struct path *user_path, int flags,
 			       const struct path *real_path,
 			       const struct cred *cred)
 {
@@ -1187,9 +1205,9 @@ struct file *backing_file_open(const struct path *path, int flags,
 	if (IS_ERR(f))
 		return f;
 
-	f->f_path = *path;
-	path_get(real_path);
-	*backing_file_real_path(f) = *real_path;
+	path_get(user_path);
+	*backing_file_user_path(f) = *user_path;
+	f->f_path = *real_path;
 	error = do_dentry_open(f, d_inode(real_path->dentry), NULL);
 	if (error) {
 		fput(f);
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index b2457cb97fa0..c4b65a6d41cc 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -237,7 +237,7 @@ found:
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 	if (inode->i_state & I_NEW) {
-		inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+		simple_inode_init_ts(inode);
 		ent_oi = OP_I(inode);
 		ent_oi->type = ent_type;
 		ent_oi->u = ent_data;
@@ -387,7 +387,7 @@ static int openprom_fill_super(struct super_block *s, struct fs_context *fc)
 		goto out_no_root;
 	}
 
-	root_inode->i_mtime = root_inode->i_atime = inode_set_ctime_current(root_inode);
+	simple_inode_init_ts(root_inode);
 	root_inode->i_op = &openprom_inode_operations;
 	root_inode->i_fop = &openprom_operations;
 	root_inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index b711654ca18a..926d9c0a428a 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -103,7 +103,7 @@ enum orangefs_vfs_op_states {
 #define ORANGEFS_CACHE_CREATE_FLAGS 0
 #endif
 
-extern const struct xattr_handler *orangefs_xattr_handlers[];
+extern const struct xattr_handler * const orangefs_xattr_handlers[];
 
 extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu);
 extern int orangefs_set_acl(struct mnt_idmap *idmap,
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index 0a9fcfdf552f..0fdceb00ca07 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -155,14 +155,14 @@ static inline void copy_attributes_from_inode(struct inode *inode,
 	if (orangefs_inode->attr_valid & ATTR_ATIME) {
 		attrs->mask |= ORANGEFS_ATTR_SYS_ATIME;
 		if (orangefs_inode->attr_valid & ATTR_ATIME_SET) {
-			attrs->atime = (time64_t)inode->i_atime.tv_sec;
+			attrs->atime = (time64_t) inode_get_atime_sec(inode);
 			attrs->mask |= ORANGEFS_ATTR_SYS_ATIME_SET;
 		}
 	}
 	if (orangefs_inode->attr_valid & ATTR_MTIME) {
 		attrs->mask |= ORANGEFS_ATTR_SYS_MTIME;
 		if (orangefs_inode->attr_valid & ATTR_MTIME_SET) {
-			attrs->mtime = (time64_t)inode->i_mtime.tv_sec;
+			attrs->mtime = (time64_t) inode_get_mtime_sec(inode);
 			attrs->mask |= ORANGEFS_ATTR_SYS_MTIME_SET;
 		}
 	}
@@ -357,15 +357,15 @@ again2:
 	    downcall.resp.getattr.attributes.owner);
 	inode->i_gid = make_kgid(&init_user_ns, new_op->
 	    downcall.resp.getattr.attributes.group);
-	inode->i_atime.tv_sec = (time64_t)new_op->
-	    downcall.resp.getattr.attributes.atime;
-	inode->i_mtime.tv_sec = (time64_t)new_op->
-	    downcall.resp.getattr.attributes.mtime;
+	inode_set_atime(inode,
+			(time64_t)new_op->downcall.resp.getattr.attributes.atime,
+			0);
+	inode_set_mtime(inode,
+			(time64_t)new_op->downcall.resp.getattr.attributes.mtime,
+			0);
 	inode_set_ctime(inode,
 			(time64_t)new_op->downcall.resp.getattr.attributes.ctime,
 			0);
-	inode->i_atime.tv_nsec = 0;
-	inode->i_mtime.tv_nsec = 0;
 
 	/* special case: mark the root inode as sticky */
 	inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) |
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
index 68b62689a63e..74ef75586f38 100644
--- a/fs/orangefs/xattr.c
+++ b/fs/orangefs/xattr.c
@@ -554,7 +554,7 @@ static const struct xattr_handler orangefs_xattr_default_handler = {
 	.set = orangefs_xattr_set_default,
 };
 
-const struct xattr_handler *orangefs_xattr_handlers[] = {
+const struct xattr_handler * const orangefs_xattr_handlers[] = {
 	&orangefs_xattr_default_handler,
 	NULL
 };
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 8be4dc050d1e..ec3671ca140c 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -239,6 +239,7 @@ static void ovl_file_accessed(struct file *file)
 {
 	struct inode *inode, *upperinode;
 	struct timespec64 ctime, uctime;
+	struct timespec64 mtime, umtime;
 
 	if (file->f_flags & O_NOATIME)
 		return;
@@ -251,9 +252,11 @@ static void ovl_file_accessed(struct file *file)
 
 	ctime = inode_get_ctime(inode);
 	uctime = inode_get_ctime(upperinode);
-	if ((!timespec64_equal(&inode->i_mtime, &upperinode->i_mtime) ||
-	     !timespec64_equal(&ctime, &uctime))) {
-		inode->i_mtime = upperinode->i_mtime;
+	mtime = inode_get_mtime(inode);
+	umtime = inode_get_mtime(upperinode);
+	if ((!timespec64_equal(&mtime, &umtime)) ||
+	     !timespec64_equal(&ctime, &uctime)) {
+		inode_set_mtime_to_ts(inode, inode_get_mtime(upperinode));
 		inode_set_ctime_to_ts(inode, uctime);
 	}
 
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 83ef66644c21..b6e98a7d36ce 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -704,7 +704,8 @@ int ovl_update_time(struct inode *inode, int flags)
 
 		if (upperpath.dentry) {
 			touch_atime(&upperpath);
-			inode->i_atime = d_inode(upperpath.dentry)->i_atime;
+			inode_set_atime_to_ts(inode,
+					      inode_get_atime(d_inode(upperpath.dentry)));
 		}
 	}
 	return 0;
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 3fa2416264a4..6cd949c59fed 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -34,14 +34,22 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
 	struct dentry *real = NULL, *lower;
 	int err;
 
-	/* It's an overlay file */
+	/*
+	 * vfs is only expected to call d_real() with NULL from d_real_inode()
+	 * and with overlay inode from file_dentry() on an overlay file.
+	 *
+	 * TODO: remove @inode argument from d_real() API, remove code in this
+	 * function that deals with non-NULL @inode and remove d_real() call
+	 * from file_dentry().
+	 */
 	if (inode && d_inode(dentry) == inode)
 		return dentry;
+	else if (inode)
+		goto bug;
 
 	if (!d_is_reg(dentry)) {
-		if (!inode || inode == d_inode(dentry))
-			return dentry;
-		goto bug;
+		/* d_real_inode() is only relevant for regular files */
+		return dentry;
 	}
 
 	real = ovl_dentry_upper(dentry);
@@ -487,13 +495,13 @@ static const struct xattr_handler ovl_other_xattr_handler = {
 	.set = ovl_other_xattr_set,
 };
 
-static const struct xattr_handler *ovl_trusted_xattr_handlers[] = {
+static const struct xattr_handler * const ovl_trusted_xattr_handlers[] = {
 	&ovl_own_trusted_xattr_handler,
 	&ovl_other_xattr_handler,
 	NULL
 };
 
-static const struct xattr_handler *ovl_user_xattr_handlers[] = {
+static const struct xattr_handler * const ovl_user_xattr_handlers[] = {
 	&ovl_own_user_xattr_handler,
 	&ovl_other_xattr_handler,
 	NULL
@@ -1488,8 +1496,16 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_xattr = ofs->config.userxattr ? ovl_user_xattr_handlers :
 		ovl_trusted_xattr_handlers;
 	sb->s_fs_info = ofs;
+#ifdef CONFIG_FS_POSIX_ACL
 	sb->s_flags |= SB_POSIXACL;
+#endif
 	sb->s_iflags |= SB_I_SKIP_SYNC | SB_I_IMA_UNVERIFIABLE_SIGNATURE;
+	/*
+	 * Ensure that umask handling is done by the filesystems used
+	 * for the upper layer instead of overlayfs as that would
+	 * lead to unexpected results.
+	 */
+	sb->s_iflags |= SB_I_NOUMASK;
 
 	err = -ENOMEM;
 	root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe);
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 89e0d60d35b6..868afd8834c3 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -1409,8 +1409,8 @@ void ovl_copyattr(struct inode *inode)
 	inode->i_uid = vfsuid_into_kuid(vfsuid);
 	inode->i_gid = vfsgid_into_kgid(vfsgid);
 	inode->i_mode = realinode->i_mode;
-	inode->i_atime = realinode->i_atime;
-	inode->i_mtime = realinode->i_mtime;
+	inode_set_atime_to_ts(inode, inode_get_atime(realinode));
+	inode_set_mtime_to_ts(inode, inode_get_mtime(realinode));
 	inode_set_ctime_to_ts(inode, inode_get_ctime(realinode));
 	i_size_write(inode, i_size_read(realinode));
 }
diff --git a/fs/pipe.c b/fs/pipe.c
index 139190165a1c..8916c455a469 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -227,6 +227,36 @@ static inline bool pipe_readable(const struct pipe_inode_info *pipe)
 	return !pipe_empty(head, tail) || !writers;
 }
 
+static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
+					    struct pipe_buffer *buf,
+					    unsigned int tail)
+{
+	pipe_buf_release(pipe, buf);
+
+	/*
+	 * If the pipe has a watch_queue, we need additional protection
+	 * by the spinlock because notifications get posted with only
+	 * this spinlock, no mutex
+	 */
+	if (pipe_has_watch_queue(pipe)) {
+		spin_lock_irq(&pipe->rd_wait.lock);
+#ifdef CONFIG_WATCH_QUEUE
+		if (buf->flags & PIPE_BUF_FLAG_LOSS)
+			pipe->note_loss = true;
+#endif
+		pipe->tail = ++tail;
+		spin_unlock_irq(&pipe->rd_wait.lock);
+		return tail;
+	}
+
+	/*
+	 * Without a watch_queue, we can simply increment the tail
+	 * without the spinlock - the mutex is enough.
+	 */
+	pipe->tail = ++tail;
+	return tail;
+}
+
 static ssize_t
 pipe_read(struct kiocb *iocb, struct iov_iter *to)
 {
@@ -320,17 +350,8 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
 				buf->len = 0;
 			}
 
-			if (!buf->len) {
-				pipe_buf_release(pipe, buf);
-				spin_lock_irq(&pipe->rd_wait.lock);
-#ifdef CONFIG_WATCH_QUEUE
-				if (buf->flags & PIPE_BUF_FLAG_LOSS)
-					pipe->note_loss = true;
-#endif
-				tail++;
-				pipe->tail = tail;
-				spin_unlock_irq(&pipe->rd_wait.lock);
-			}
+			if (!buf->len)
+				tail = pipe_update_tail(pipe, buf, tail);
 			total_len -= chars;
 			if (!total_len)
 				break;	/* common path: read succeeded */
@@ -437,12 +458,10 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 		goto out;
 	}
 
-#ifdef CONFIG_WATCH_QUEUE
-	if (pipe->watch_queue) {
+	if (pipe_has_watch_queue(pipe)) {
 		ret = -EXDEV;
 		goto out;
 	}
-#endif
 
 	/*
 	 * If it wasn't empty we try to merge new data into
@@ -507,16 +526,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 			 * it, either the reader will consume it or it'll still
 			 * be there for the next write.
 			 */
-			spin_lock_irq(&pipe->rd_wait.lock);
-
-			head = pipe->head;
-			if (pipe_full(head, pipe->tail, pipe->max_usage)) {
-				spin_unlock_irq(&pipe->rd_wait.lock);
-				continue;
-			}
-
 			pipe->head = head + 1;
-			spin_unlock_irq(&pipe->rd_wait.lock);
 
 			/* Insert it into the buffer array */
 			buf = &pipe->bufs[head & mask];
@@ -898,7 +908,7 @@ static struct inode * get_pipe_inode(void)
 	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 
 	return inode;
 
@@ -1324,10 +1334,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg)
 	unsigned int nr_slots, size;
 	long ret = 0;
 
-#ifdef CONFIG_WATCH_QUEUE
-	if (pipe->watch_queue)
+	if (pipe_has_watch_queue(pipe))
 		return -EBUSY;
-#endif
 
 	size = round_pipe_size(arg);
 	nr_slots = size >> PAGE_SHIFT;
@@ -1379,10 +1387,8 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
 
 	if (file->f_op != &pipefifo_fops || !pipe)
 		return NULL;
-#ifdef CONFIG_WATCH_QUEUE
-	if (for_splice && pipe->watch_queue)
+	if (for_splice && pipe_has_watch_queue(pipe))
 		return NULL;
-#endif
 	return pipe;
 }
 
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ffd54617c354..83396ab14998 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1902,7 +1902,7 @@ struct inode *proc_pid_make_inode(struct super_block *sb,
 	ei = PROC_I(inode);
 	inode->i_mode = mode;
 	inode->i_ino = get_next_ino();
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	inode->i_op = &proc_def_inode_operations;
 
 	/*
@@ -2218,7 +2218,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path)
 	rc = -ENOENT;
 	vma = find_exact_vma(mm, vm_start, vm_end);
 	if (vma && vma->vm_file) {
-		*path = vma->vm_file->f_path;
+		*path = *file_user_path(vma->vm_file);
 		path_get(path);
 		rc = 0;
 	}
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 6276b3938842..6e72e5ad42bc 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -113,10 +113,12 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
 	struct file *file;
 
 	rcu_read_lock();
-	file = task_lookup_fd_rcu(task, fd);
-	if (file)
-		*mode = file->f_mode;
+	file = task_lookup_fdget_rcu(task, fd);
 	rcu_read_unlock();
+	if (file) {
+		*mode = file->f_mode;
+		fput(file);
+	}
 	return !!file;
 }
 
@@ -259,12 +261,13 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
 		char name[10 + 1];
 		unsigned int len;
 
-		f = task_lookup_next_fd_rcu(p, &fd);
+		f = task_lookup_next_fdget_rcu(p, &fd);
 		ctx->pos = fd + 2LL;
 		if (!f)
 			break;
 		data.mode = f->f_mode;
 		rcu_read_unlock();
+		fput(f);
 		data.fd = fd;
 
 		len = snprintf(name, sizeof(name), "%u", fd);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 532dc9d240f7..592ed2516f47 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -660,7 +660,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
 
 	inode->i_private = de->data;
 	inode->i_ino = de->low_ino;
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	PROC_I(inode)->pde = de;
 	if (is_empty_pde(de)) {
 		make_empty_dir_inode(inode);
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 4d3493579458..c6e7ebc63756 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -58,7 +58,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 
 	if (file) {
 		seq_pad(m, ' ');
-		seq_file_path(m, file, "");
+		seq_path(m, file_user_path(file), "");
 	}
 
 	seq_putc(m, '\n');
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index c88854df0b62..bc9a2db89cfa 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -465,7 +465,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 	head->count++;
 	spin_unlock(&sysctl_lock);
 
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	inode->i_mode = table->mode;
 	if (!S_ISDIR(table->mode)) {
 		inode->i_mode |= S_IFREG;
diff --git a/fs/proc/self.c b/fs/proc/self.c
index ecc4da8d265e..b46fbfd22681 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -46,7 +46,7 @@ int proc_setup_self(struct super_block *s)
 		struct inode *inode = new_inode(s);
 		if (inode) {
 			inode->i_ino = self_inum;
-			inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+			simple_inode_init_ts(inode);
 			inode->i_mode = S_IFLNK | S_IRWXUGO;
 			inode->i_uid = GLOBAL_ROOT_UID;
 			inode->i_gid = GLOBAL_ROOT_GID;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3dd5be96691b..1593940ca01e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -296,7 +296,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 		if (anon_name)
 			seq_printf(m, "[anon_shmem:%s]", anon_name->name);
 		else
-			seq_file_path(m, file, "\n");
+			seq_path(m, file_user_path(file), "\n");
 		goto done;
 	}
 
@@ -1967,7 +1967,7 @@ static int show_numa_map(struct seq_file *m, void *v)
 
 	if (file) {
 		seq_puts(m, " file=");
-		seq_file_path(m, file, "\n\t= ");
+		seq_path(m, file_user_path(file), "\n\t= ");
 	} else if (vma_is_initial_heap(vma)) {
 		seq_puts(m, " heap");
 	} else if (vma_is_initial_stack(vma)) {
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 7cebd397cc26..bce674533000 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -157,7 +157,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 
 	if (file) {
 		seq_pad(m, ' ');
-		seq_file_path(m, file, "");
+		seq_path(m, file_user_path(file), "");
 	} else if (mm && vma_is_initial_stack(vma)) {
 		seq_pad(m, ' ');
 		seq_puts(m, "[stack]");
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index 63ac1f93289f..0e5050d6ab64 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -46,7 +46,7 @@ int proc_setup_thread_self(struct super_block *s)
 		struct inode *inode = new_inode(s);
 		if (inode) {
 			inode->i_ino = thread_self_inum;
-			inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+			simple_inode_init_ts(inode);
 			inode->i_mode = S_IFLNK | S_IRWXUGO;
 			inode->i_uid = GLOBAL_ROOT_UID;
 			inode->i_gid = GLOBAL_ROOT_GID;
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 585360706b33..d41c20d1b5e8 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -223,7 +223,7 @@ static struct inode *pstore_get_inode(struct super_block *sb)
 	struct inode *inode = new_inode(sb);
 	if (inode) {
 		inode->i_ino = get_next_ino();
-		inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+		simple_inode_init_ts(inode);
 	}
 	return inode;
 }
@@ -390,7 +390,8 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record)
 	inode->i_private = private;
 
 	if (record->time.tv_sec)
-		inode->i_mtime = inode_set_ctime_to_ts(inode, record->time);
+		inode_set_mtime_to_ts(inode,
+				      inode_set_ctime_to_ts(inode, record->time));
 
 	d_add(dentry, inode);
 
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index e5bca9a004cc..03425928d2fb 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -464,6 +464,8 @@ out:
  */
 int pstore_register(struct pstore_info *psi)
 {
+	char *new_backend;
+
 	if (backend && strcmp(backend, psi->name)) {
 		pr_warn("backend '%s' already in use: ignoring '%s'\n",
 			backend, psi->name);
@@ -484,11 +486,16 @@ int pstore_register(struct pstore_info *psi)
 		return -EINVAL;
 	}
 
+	new_backend = kstrdup(psi->name, GFP_KERNEL);
+	if (!new_backend)
+		return -ENOMEM;
+
 	mutex_lock(&psinfo_lock);
 	if (psinfo) {
 		pr_warn("backend '%s' already loaded: ignoring '%s'\n",
 			psinfo->name, psi->name);
 		mutex_unlock(&psinfo_lock);
+		kfree(new_backend);
 		return -EBUSY;
 	}
 
@@ -521,7 +528,7 @@ int pstore_register(struct pstore_info *psi)
 	 * Update the module parameter backend, so it is visible
 	 * through /sys/module/pstore/parameters/backend
 	 */
-	backend = kstrdup(psi->name, GFP_KERNEL);
+	backend = new_backend;
 
 	pr_info("Registered %s as persistent store backend\n", psi->name);
 
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index a7171f5532a1..6eb9bb369b57 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -301,10 +301,8 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
 	i_gid_write(inode, (gid_t)le16_to_cpu(raw_inode->di_gid));
 	set_nlink(inode, le16_to_cpu(raw_inode->di_nlink));
 	inode->i_size    = le32_to_cpu(raw_inode->di_size);
-	inode->i_mtime.tv_sec   = le32_to_cpu(raw_inode->di_mtime);
-	inode->i_mtime.tv_nsec = 0;
-	inode->i_atime.tv_sec   = le32_to_cpu(raw_inode->di_atime);
-	inode->i_atime.tv_nsec = 0;
+	inode_set_mtime(inode, le32_to_cpu(raw_inode->di_mtime), 0);
+	inode_set_atime(inode, le32_to_cpu(raw_inode->di_atime), 0);
 	inode_set_ctime(inode, le32_to_cpu(raw_inode->di_ctime), 0);
 	inode->i_blocks  = le32_to_cpu(raw_inode->di_first_xtnt.xtnt_size);
 
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 21f90d519f1a..a286c545717f 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -558,10 +558,8 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
 	i_uid_write(inode, (uid_t)fs32_to_cpu(sbi, raw_inode->di_uid));
 	i_gid_write(inode, (gid_t)fs32_to_cpu(sbi, raw_inode->di_gid));
 	inode->i_size    = fs64_to_cpu(sbi, raw_inode->di_size);
-	inode->i_mtime.tv_sec   = fs32_to_cpu(sbi, raw_inode->di_mtime);
-	inode->i_mtime.tv_nsec = 0;
-	inode->i_atime.tv_sec   = fs32_to_cpu(sbi, raw_inode->di_atime);
-	inode->i_atime.tv_nsec = 0;
+	inode_set_mtime(inode, fs32_to_cpu(sbi, raw_inode->di_mtime), 0);
+	inode_set_atime(inode, fs32_to_cpu(sbi, raw_inode->di_atime), 0);
 	inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->di_ctime), 0);
 
 	/* calc blocks based on 512 byte blocksize */
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 18e8387cab41..4ac05a9e25bc 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -65,7 +65,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
 		inode->i_mapping->a_ops = &ram_aops;
 		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
 		mapping_set_unevictable(inode->i_mapping);
-		inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+		simple_inode_init_ts(inode);
 		switch (mode & S_IFMT) {
 		default:
 			init_special_inode(inode, mode, dev);
@@ -105,7 +105,7 @@ ramfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 		d_instantiate(dentry, inode);
 		dget(dentry);	/* Extra count - pin the dentry in core */
 		error = 0;
-		dir->i_mtime = inode_set_ctime_current(dir);
+		inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	}
 	return error;
 }
@@ -138,7 +138,8 @@ static int ramfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 		if (!error) {
 			d_instantiate(dentry, inode);
 			dget(dentry);
-			dir->i_mtime = inode_set_ctime_current(dir);
+			inode_set_mtime_to_ts(dir,
+					      inode_set_ctime_current(dir));
 		} else
 			iput(inode);
 	}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 86e55d4bb10d..c8572346556f 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1257,11 +1257,9 @@ static void init_inode(struct inode *inode, struct treepath *path)
 		i_uid_write(inode, sd_v1_uid(sd));
 		i_gid_write(inode, sd_v1_gid(sd));
 		inode->i_size = sd_v1_size(sd);
-		inode->i_atime.tv_sec = sd_v1_atime(sd);
-		inode->i_mtime.tv_sec = sd_v1_mtime(sd);
+		inode_set_atime(inode, sd_v1_atime(sd), 0);
+		inode_set_mtime(inode, sd_v1_mtime(sd), 0);
 		inode_set_ctime(inode, sd_v1_ctime(sd), 0);
-		inode->i_atime.tv_nsec = 0;
-		inode->i_mtime.tv_nsec = 0;
 
 		inode->i_blocks = sd_v1_blocks(sd);
 		inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
@@ -1311,11 +1309,9 @@ static void init_inode(struct inode *inode, struct treepath *path)
 		i_uid_write(inode, sd_v2_uid(sd));
 		inode->i_size = sd_v2_size(sd);
 		i_gid_write(inode, sd_v2_gid(sd));
-		inode->i_mtime.tv_sec = sd_v2_mtime(sd);
-		inode->i_atime.tv_sec = sd_v2_atime(sd);
+		inode_set_mtime(inode, sd_v2_mtime(sd), 0);
+		inode_set_atime(inode, sd_v2_atime(sd), 0);
 		inode_set_ctime(inode, sd_v2_ctime(sd), 0);
-		inode->i_mtime.tv_nsec = 0;
-		inode->i_atime.tv_nsec = 0;
 		inode->i_blocks = sd_v2_blocks(sd);
 		rdev = sd_v2_rdev(sd);
 		if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
@@ -1370,9 +1366,9 @@ static void inode2sd(void *sd, struct inode *inode, loff_t size)
 	set_sd_v2_uid(sd_v2, i_uid_read(inode));
 	set_sd_v2_size(sd_v2, size);
 	set_sd_v2_gid(sd_v2, i_gid_read(inode));
-	set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec);
-	set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec);
-	set_sd_v2_ctime(sd_v2, inode_get_ctime(inode).tv_sec);
+	set_sd_v2_mtime(sd_v2, inode_get_mtime_sec(inode));
+	set_sd_v2_atime(sd_v2, inode_get_atime_sec(inode));
+	set_sd_v2_ctime(sd_v2, inode_get_ctime_sec(inode));
 	set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
 		set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
@@ -1391,9 +1387,9 @@ static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
 	set_sd_v1_gid(sd_v1, i_gid_read(inode));
 	set_sd_v1_nlink(sd_v1, inode->i_nlink);
 	set_sd_v1_size(sd_v1, size);
-	set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec);
-	set_sd_v1_ctime(sd_v1, inode_get_ctime(inode).tv_sec);
-	set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec);
+	set_sd_v1_atime(sd_v1, inode_get_atime_sec(inode));
+	set_sd_v1_ctime(sd_v1, inode_get_ctime_sec(inode));
+	set_sd_v1_mtime(sd_v1, inode_get_mtime_sec(inode));
 
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
 		set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
@@ -1984,7 +1980,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 
 	/* uid and gid must already be set by the caller for quota init */
 
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	inode->i_size = i_size;
 	inode->i_blocks = 0;
 	inode->i_bytes = 0;
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 015bfe4e4524..171c912af50f 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -90,8 +90,7 @@ static int flush_commit_list(struct super_block *s,
 static int can_dirty(struct reiserfs_journal_cnode *cn);
 static int journal_join(struct reiserfs_transaction_handle *th,
 			struct super_block *sb);
-static void release_journal_dev(struct super_block *super,
-			       struct reiserfs_journal *journal);
+static void release_journal_dev(struct reiserfs_journal *journal);
 static void dirty_one_transaction(struct super_block *s,
 				 struct reiserfs_journal_list *jl);
 static void flush_async_commits(struct work_struct *work);
@@ -1893,7 +1892,7 @@ static void free_journal_ram(struct super_block *sb)
 	 * j_header_bh is on the journal dev, make sure
 	 * not to release the journal dev until we brelse j_header_bh
 	 */
-	release_journal_dev(sb, journal);
+	release_journal_dev(journal);
 	vfree(journal);
 }
 
@@ -2387,7 +2386,7 @@ static int journal_read(struct super_block *sb)
 
 	cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
 	reiserfs_info(sb, "checking transaction log (%pg)\n",
-		      journal->j_dev_bd);
+		      journal->j_bdev_handle->bdev);
 	start = ktime_get_seconds();
 
 	/*
@@ -2448,7 +2447,7 @@ static int journal_read(struct super_block *sb)
 		 * device and journal device to be the same
 		 */
 		d_bh =
-		    reiserfs_breada(journal->j_dev_bd, cur_dblock,
+		    reiserfs_breada(journal->j_bdev_handle->bdev, cur_dblock,
 				    sb->s_blocksize,
 				    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
 				    SB_ONDISK_JOURNAL_SIZE(sb));
@@ -2587,17 +2586,11 @@ static void journal_list_init(struct super_block *sb)
 	SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb);
 }
 
-static void release_journal_dev(struct super_block *super,
-			       struct reiserfs_journal *journal)
+static void release_journal_dev(struct reiserfs_journal *journal)
 {
-	if (journal->j_dev_bd != NULL) {
-		void *holder = NULL;
-
-		if (journal->j_dev_bd->bd_dev != super->s_dev)
-			holder = journal;
-
-		blkdev_put(journal->j_dev_bd, holder);
-		journal->j_dev_bd = NULL;
+	if (journal->j_bdev_handle) {
+		bdev_release(journal->j_bdev_handle);
+		journal->j_bdev_handle = NULL;
 	}
 }
 
@@ -2612,7 +2605,7 @@ static int journal_init_dev(struct super_block *super,
 
 	result = 0;
 
-	journal->j_dev_bd = NULL;
+	journal->j_bdev_handle = NULL;
 	jdev = SB_ONDISK_JOURNAL_DEVICE(super) ?
 	    new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev;
 
@@ -2623,36 +2616,37 @@ static int journal_init_dev(struct super_block *super,
 	if ((!jdev_name || !jdev_name[0])) {
 		if (jdev == super->s_dev)
 			holder = NULL;
-		journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode, holder,
-						      NULL);
-		if (IS_ERR(journal->j_dev_bd)) {
-			result = PTR_ERR(journal->j_dev_bd);
-			journal->j_dev_bd = NULL;
+		journal->j_bdev_handle = bdev_open_by_dev(jdev, blkdev_mode,
+							  holder, NULL);
+		if (IS_ERR(journal->j_bdev_handle)) {
+			result = PTR_ERR(journal->j_bdev_handle);
+			journal->j_bdev_handle = NULL;
 			reiserfs_warning(super, "sh-458",
 					 "cannot init journal device unknown-block(%u,%u): %i",
 					 MAJOR(jdev), MINOR(jdev), result);
 			return result;
 		} else if (jdev != super->s_dev)
-			set_blocksize(journal->j_dev_bd, super->s_blocksize);
+			set_blocksize(journal->j_bdev_handle->bdev,
+				      super->s_blocksize);
 
 		return 0;
 	}
 
-	journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, holder,
-					       NULL);
-	if (IS_ERR(journal->j_dev_bd)) {
-		result = PTR_ERR(journal->j_dev_bd);
-		journal->j_dev_bd = NULL;
+	journal->j_bdev_handle = bdev_open_by_path(jdev_name, blkdev_mode,
+						   holder, NULL);
+	if (IS_ERR(journal->j_bdev_handle)) {
+		result = PTR_ERR(journal->j_bdev_handle);
+		journal->j_bdev_handle = NULL;
 		reiserfs_warning(super, "sh-457",
 				 "journal_init_dev: Cannot open '%s': %i",
 				 jdev_name, result);
 		return result;
 	}
 
-	set_blocksize(journal->j_dev_bd, super->s_blocksize);
+	set_blocksize(journal->j_bdev_handle->bdev, super->s_blocksize);
 	reiserfs_info(super,
 		      "journal_init_dev: journal device: %pg\n",
-		      journal->j_dev_bd);
+		      journal->j_bdev_handle->bdev);
 	return 0;
 }
 
@@ -2810,7 +2804,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
 				 "journal header magic %x (device %pg) does "
 				 "not match to magic found in super block %x",
 				 jh->jh_journal.jp_journal_magic,
-				 journal->j_dev_bd,
+				 journal->j_bdev_handle->bdev,
 				 sb_jp_journal_magic(rs));
 		brelse(bhjh);
 		goto free_and_return;
@@ -2834,7 +2828,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
 	reiserfs_info(sb, "journal params: device %pg, size %u, "
 		      "journal first block %u, max trans len %u, max batch %u, "
 		      "max commit age %u, max trans age %u\n",
-		      journal->j_dev_bd,
+		      journal->j_bdev_handle->bdev,
 		      SB_ONDISK_JOURNAL_SIZE(sb),
 		      SB_ONDISK_JOURNAL_1st_BLOCK(sb),
 		      journal->j_trans_max,
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 9c5704be2435..994d6e6995ab 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -572,7 +572,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
 	}
 
 	dir->i_size += paste_size;
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	if (!S_ISDIR(inode->i_mode) && visible)
 		/* reiserfs_mkdir or reiserfs_rename will do that by itself */
 		reiserfs_update_sd(th, dir);
@@ -966,8 +966,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
 			       inode->i_nlink);
 
 	clear_nlink(inode);
-	dir->i_mtime = inode_set_ctime_to_ts(dir,
-					     inode_set_ctime_current(inode));
+	inode_set_mtime_to_ts(dir,
+			      inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
 	reiserfs_update_sd(&th, inode);
 
 	DEC_DIR_INODE_NLINK(dir)
@@ -1075,7 +1075,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
 	reiserfs_update_sd(&th, inode);
 
 	dir->i_size -= (de.de_entrylen + DEH_SIZE);
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	reiserfs_update_sd(&th, dir);
 
 	if (!savelink)
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 3dba8acf4e83..83cb9402e0f9 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -354,7 +354,7 @@ static int show_journal(struct seq_file *m, void *unused)
 		   "prepare: \t%12lu\n"
 		   "prepare_retry: \t%12lu\n",
 		   DJP(jp_journal_1st_block),
-		   SB_JOURNAL(sb)->j_dev_bd,
+		   SB_JOURNAL(sb)->j_bdev_handle->bdev,
 		   DJP(jp_journal_dev),
 		   DJP(jp_journal_size),
 		   DJP(jp_journal_trans_max),
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 7d12b8c5b2fa..725667880e62 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -299,7 +299,7 @@ struct reiserfs_journal {
 	/* oldest journal block.  start here for traverse */
 	struct reiserfs_journal_cnode *j_first;
 
-	struct block_device *j_dev_bd;
+	struct bdev_handle *j_bdev_handle;
 
 	/* first block on s_dev of reserved area journal */
 	int j_1st_reserved_block;
@@ -1165,7 +1165,7 @@ static inline int bmap_would_wrap(unsigned bmap_nr)
 	return bmap_nr > ((1LL << 16) - 1);
 }
 
-extern const struct xattr_handler *reiserfs_xattr_handlers[];
+extern const struct xattr_handler * const reiserfs_xattr_handlers[];
 
 /*
  * this says about version of key of all items (but stat data) the
@@ -2809,9 +2809,12 @@ struct reiserfs_journal_header {
 #define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK])
 
 /* We need these to make journal.c code more readable */
-#define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
-#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
-#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
+#define journal_find_get_block(s, block) __find_get_block(\
+		SB_JOURNAL(s)->j_bdev_handle->bdev, block, s->s_blocksize)
+#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_bdev_handle->bdev,\
+		block, s->s_blocksize)
+#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_bdev_handle->bdev,\
+		block, s->s_blocksize)
 
 enum reiserfs_bh_state_bits {
 	BH_JDirty = BH_PrivateStart,	/* buffer is in current transaction */
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 3676e02a0232..2138ee7d271d 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -2003,7 +2003,8 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
 			pathrelse(&s_search_path);
 
 			if (update_timestamps) {
-				inode->i_mtime = current_time(inode);
+				inode_set_mtime_to_ts(inode,
+						      current_time(inode));
 				inode_set_ctime_current(inode);
 			}
 			reiserfs_update_sd(th, inode);
@@ -2028,7 +2029,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
 update_and_out:
 	if (update_timestamps) {
 		/* this is truncate, not file closing */
-		inode->i_mtime = current_time(inode);
+		inode_set_mtime_to_ts(inode, current_time(inode));
 		inode_set_ctime_current(inode);
 	}
 	reiserfs_update_sd(th, inode);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 7eaf36b3de12..67b5510beded 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2587,7 +2587,7 @@ out:
 		return err;
 	if (inode->i_size < off + len - towrite)
 		i_size_write(inode, off + len - towrite);
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	mark_inode_dirty(inode);
 	return len - towrite;
 }
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 6000964c2b80..998035a6388e 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -780,7 +780,7 @@ static inline bool reiserfs_posix_acl_list(const char *name,
 }
 
 /* This is the implementation for the xattr plugin infrastructure */
-static inline bool reiserfs_xattr_list(const struct xattr_handler **handlers,
+static inline bool reiserfs_xattr_list(const struct xattr_handler * const *handlers,
 				       const char *name, struct dentry *dentry)
 {
 	if (handlers) {
@@ -911,7 +911,7 @@ static int create_privroot(struct dentry *dentry) { return 0; }
 #endif
 
 /* Actual operations that are exported to VFS-land */
-const struct xattr_handler *reiserfs_xattr_handlers[] = {
+const struct xattr_handler * const reiserfs_xattr_handlers[] = {
 #ifdef CONFIG_REISERFS_FS_XATTR
 	&reiserfs_xattr_user_handler,
 	&reiserfs_xattr_trusted_handler,
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 5c35f6c76037..545ad44f96b8 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -322,7 +322,8 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
 
 	set_nlink(i, 1);		/* Hard to decide.. */
 	i->i_size = be32_to_cpu(ri.size);
-	i->i_mtime = i->i_atime = inode_set_ctime(i, 0, 0);
+	inode_set_mtime_to_ts(i,
+			      inode_set_atime_to_ts(i, inode_set_ctime(i, 0, 0)));
 
 	/* set up mode and ops */
 	mode = romfs_modemap[nextfh & ROMFH_TYPE];
@@ -593,7 +594,7 @@ static void romfs_kill_sb(struct super_block *sb)
 #ifdef CONFIG_ROMFS_ON_BLOCK
 	if (sb->s_bdev) {
 		sync_blockdev(sb->s_bdev);
-		blkdev_put(sb->s_bdev, sb);
+		bdev_release(sb->s_bdev_handle);
 	}
 #endif
 }
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index 41daebd220ff..8ca3d7606bb4 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -127,7 +127,7 @@ extern int cifs_symlink(struct mnt_idmap *idmap, struct inode *inode,
 			struct dentry *direntry, const char *symname);
 
 #ifdef CONFIG_CIFS_XATTR
-extern const struct xattr_handler *cifs_xattr_handlers[];
+extern const struct xattr_handler * const cifs_xattr_handlers[];
 extern ssize_t	cifs_listxattr(struct dentry *, char *, size_t);
 #else
 # define cifs_xattr_handlers NULL
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 2108b3b40ce9..cf17e3dd703e 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -1085,7 +1085,8 @@ int cifs_close(struct inode *inode, struct file *file)
 		    !test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags) &&
 		    dclose) {
 			if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) {
-				inode->i_mtime = inode_set_ctime_current(inode);
+				inode_set_mtime_to_ts(inode,
+						      inode_set_ctime_current(inode));
 			}
 			spin_lock(&cinode->deferred_lock);
 			cifs_add_deferred_close(cfile, dclose);
@@ -2596,7 +2597,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 					   write_data, to - from, &offset);
 		cifsFileInfo_put(open_file);
 		/* Does mm or vfs already set times? */
-		inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+		simple_inode_init_ts(inode);
 		if ((bytes_written > 0) && (offset))
 			rc = 0;
 		else if (bytes_written < 0)
@@ -4647,11 +4648,13 @@ static void cifs_readahead(struct readahead_control *ractl)
 static int cifs_readpage_worker(struct file *file, struct page *page,
 	loff_t *poffset)
 {
+	struct inode *inode = file_inode(file);
+	struct timespec64 atime, mtime;
 	char *read_data;
 	int rc;
 
 	/* Is the page cached? */
-	rc = cifs_readpage_from_fscache(file_inode(file), page);
+	rc = cifs_readpage_from_fscache(inode, page);
 	if (rc == 0)
 		goto read_complete;
 
@@ -4666,11 +4669,10 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
 		cifs_dbg(FYI, "Bytes read %d\n", rc);
 
 	/* we do not want atime to be less than mtime, it broke some apps */
-	file_inode(file)->i_atime = current_time(file_inode(file));
-	if (timespec64_compare(&(file_inode(file)->i_atime), &(file_inode(file)->i_mtime)))
-		file_inode(file)->i_atime = file_inode(file)->i_mtime;
-	else
-		file_inode(file)->i_atime = current_time(file_inode(file));
+	atime = inode_set_atime_to_ts(inode, current_time(inode));
+	mtime = inode_get_mtime(inode);
+	if (timespec64_compare(&atime, &mtime) < 0)
+		inode_set_atime_to_ts(inode, inode_get_mtime(inode));
 
 	if (PAGE_SIZE > rc)
 		memset(read_data + rc, 0, PAGE_SIZE - rc);
diff --git a/fs/smb/client/fscache.h b/fs/smb/client/fscache.h
index 84f3b09367d2..a3d73720914f 100644
--- a/fs/smb/client/fscache.h
+++ b/fs/smb/client/fscache.h
@@ -49,12 +49,12 @@ static inline
 void cifs_fscache_fill_coherency(struct inode *inode,
 				 struct cifs_fscache_inode_coherency_data *cd)
 {
-	struct cifsInodeInfo *cifsi = CIFS_I(inode);
 	struct timespec64 ctime = inode_get_ctime(inode);
+	struct timespec64 mtime = inode_get_mtime(inode);
 
 	memset(cd, 0, sizeof(*cd));
-	cd->last_write_time_sec   = cpu_to_le64(cifsi->netfs.inode.i_mtime.tv_sec);
-	cd->last_write_time_nsec  = cpu_to_le32(cifsi->netfs.inode.i_mtime.tv_nsec);
+	cd->last_write_time_sec   = cpu_to_le64(mtime.tv_sec);
+	cd->last_write_time_nsec  = cpu_to_le32(mtime.tv_nsec);
 	cd->last_change_time_sec  = cpu_to_le64(ctime.tv_sec);
 	cd->last_change_time_nsec = cpu_to_le32(ctime.tv_nsec);
 }
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index d7c302442c1e..3abfe77bfa46 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -82,6 +82,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
 {
 	struct cifs_fscache_inode_coherency_data cd;
 	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+	struct timespec64 mtime;
 
 	cifs_dbg(FYI, "%s: revalidating inode %llu\n",
 		 __func__, cifs_i->uniqueid);
@@ -101,7 +102,8 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
 
 	 /* revalidate if mtime or size have changed */
 	fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode);
-	if (timespec64_equal(&inode->i_mtime, &fattr->cf_mtime) &&
+	mtime = inode_get_mtime(inode);
+	if (timespec64_equal(&mtime, &fattr->cf_mtime) &&
 	    cifs_i->server_eof == fattr->cf_eof) {
 		cifs_dbg(FYI, "%s: inode %llu is unchanged\n",
 			 __func__, cifs_i->uniqueid);
@@ -164,10 +166,10 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	fattr->cf_ctime = timestamp_truncate(fattr->cf_ctime, inode);
 	/* we do not want atime to be less than mtime, it broke some apps */
 	if (timespec64_compare(&fattr->cf_atime, &fattr->cf_mtime) < 0)
-		inode->i_atime = fattr->cf_mtime;
+		inode_set_atime_to_ts(inode, fattr->cf_mtime);
 	else
-		inode->i_atime = fattr->cf_atime;
-	inode->i_mtime = fattr->cf_mtime;
+		inode_set_atime_to_ts(inode, fattr->cf_atime);
+	inode_set_mtime_to_ts(inode, fattr->cf_mtime);
 	inode_set_ctime_to_ts(inode, fattr->cf_ctime);
 	inode->i_rdev = fattr->cf_rdev;
 	cifs_nlink_fattr_to_inode(inode, fattr);
@@ -1816,7 +1818,7 @@ out_reval:
 					   when needed */
 		inode_set_ctime_current(inode);
 	}
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	cifs_inode = CIFS_I(dir);
 	CIFS_I(dir)->time = 0;	/* force revalidate of dir as well */
 unlink_out:
@@ -2131,7 +2133,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	cifsInode->time = 0;
 
 	inode_set_ctime_current(d_inode(direntry));
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 
 rmdir_exit:
 	free_dentry_path(page);
@@ -2337,9 +2339,6 @@ unlink_target:
 	/* force revalidate to go get info when needed */
 	CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0;
 
-	source_dir->i_mtime = target_dir->i_mtime = inode_set_ctime_to_ts(source_dir,
-									  inode_set_ctime_current(target_dir));
-
 cifs_rename_exit:
 	kfree(info_buf_source);
 	free_dentry_path(page2);
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 9aeecee6b91b..f4849a8ad40b 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -1403,12 +1403,14 @@ smb2_close_getattr(const unsigned int xid, struct cifs_tcon *tcon,
 
 	/* Creation time should not need to be updated on close */
 	if (file_inf.LastWriteTime)
-		inode->i_mtime = cifs_NTtimeToUnix(file_inf.LastWriteTime);
+		inode_set_mtime_to_ts(inode,
+				      cifs_NTtimeToUnix(file_inf.LastWriteTime));
 	if (file_inf.ChangeTime)
 		inode_set_ctime_to_ts(inode,
 				      cifs_NTtimeToUnix(file_inf.ChangeTime));
 	if (file_inf.LastAccessTime)
-		inode->i_atime = cifs_NTtimeToUnix(file_inf.LastAccessTime);
+		inode_set_atime_to_ts(inode,
+				      cifs_NTtimeToUnix(file_inf.LastAccessTime));
 
 	/*
 	 * i_blocks is not related to (i_size / i_blksize),
diff --git a/fs/smb/client/xattr.c b/fs/smb/client/xattr.c
index 4ad5531686d8..ac199160bce6 100644
--- a/fs/smb/client/xattr.c
+++ b/fs/smb/client/xattr.c
@@ -478,7 +478,7 @@ static const struct xattr_handler smb3_ntsd_full_xattr_handler = {
 	.set = cifs_xattr_set,
 };
 
-const struct xattr_handler *cifs_xattr_handlers[] = {
+const struct xattr_handler * const cifs_xattr_handlers[] = {
 	&cifs_user_xattr_handler,
 	&cifs_os2_xattr_handler,
 	&cifs_cifs_acl_xattr_handler,
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 93262ca3f58a..658209839729 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -4834,9 +4834,9 @@ static void find_file_posix_info(struct smb2_query_info_rsp *rsp,
 
 	file_info = (struct smb311_posix_qinfo *)rsp->Buffer;
 	file_info->CreationTime = cpu_to_le64(fp->create_time);
-	time = ksmbd_UnixTimeToNT(inode->i_atime);
+	time = ksmbd_UnixTimeToNT(inode_get_atime(inode));
 	file_info->LastAccessTime = cpu_to_le64(time);
-	time = ksmbd_UnixTimeToNT(inode->i_mtime);
+	time = ksmbd_UnixTimeToNT(inode_get_mtime(inode));
 	file_info->LastWriteTime = cpu_to_le64(time);
 	time = ksmbd_UnixTimeToNT(inode_get_ctime(inode));
 	file_info->ChangeTime = cpu_to_le64(time);
@@ -5443,9 +5443,9 @@ int smb2_close(struct ksmbd_work *work)
 		rsp->EndOfFile = cpu_to_le64(inode->i_size);
 		rsp->Attributes = fp->f_ci->m_fattr;
 		rsp->CreationTime = cpu_to_le64(fp->create_time);
-		time = ksmbd_UnixTimeToNT(inode->i_atime);
+		time = ksmbd_UnixTimeToNT(inode_get_atime(inode));
 		rsp->LastAccessTime = cpu_to_le64(time);
-		time = ksmbd_UnixTimeToNT(inode->i_mtime);
+		time = ksmbd_UnixTimeToNT(inode_get_mtime(inode));
 		rsp->LastWriteTime = cpu_to_le64(time);
 		time = ksmbd_UnixTimeToNT(inode_get_ctime(inode));
 		rsp->ChangeTime = cpu_to_le64(time);
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index c6e626b00546..aa3411354e66 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -59,9 +59,9 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
 	i_uid_write(inode, i_uid);
 	i_gid_write(inode, i_gid);
 	inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
-	inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime);
-	inode->i_atime.tv_sec = inode->i_mtime.tv_sec;
-	inode_set_ctime(inode, inode->i_mtime.tv_sec, 0);
+	inode_set_mtime(inode, le32_to_cpu(sqsh_ino->mtime), 0);
+	inode_set_atime(inode, inode_get_mtime_sec(inode), 0);
+	inode_set_ctime(inode, inode_get_mtime_sec(inode), 0);
 	inode->i_mode = le16_to_cpu(sqsh_ino->mode);
 	inode->i_size = 0;
 
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index a6164fdf9435..5a756e6790b5 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -111,4 +111,4 @@ extern const struct address_space_operations squashfs_symlink_aops;
 extern const struct inode_operations squashfs_symlink_inode_ops;
 
 /* xattr.c */
-extern const struct xattr_handler *squashfs_xattr_handlers[];
+extern const struct xattr_handler * const squashfs_xattr_handlers[];
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index e1e3f3dd5a06..ce6608cabd49 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -262,7 +262,7 @@ static const struct xattr_handler *squashfs_xattr_handler(int type)
 	}
 }
 
-const struct xattr_handler *squashfs_xattr_handlers[] = {
+const struct xattr_handler * const squashfs_xattr_handlers[] = {
 	&squashfs_xattr_user_handler,
 	&squashfs_xattr_trusted_handler,
 	&squashfs_xattr_security_handler,
diff --git a/fs/stack.c b/fs/stack.c
index b5e01bdb5f5f..f18920119944 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -66,8 +66,8 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src)
 	dest->i_uid = src->i_uid;
 	dest->i_gid = src->i_gid;
 	dest->i_rdev = src->i_rdev;
-	dest->i_atime = src->i_atime;
-	dest->i_mtime = src->i_mtime;
+	inode_set_atime_to_ts(dest, inode_get_atime(src));
+	inode_set_mtime_to_ts(dest, inode_get_mtime(src));
 	inode_set_ctime_to_ts(dest, inode_get_ctime(src));
 	dest->i_blkbits = src->i_blkbits;
 	dest->i_flags = src->i_flags;
diff --git a/fs/stat.c b/fs/stat.c
index d43a5cc1bfa4..24bb0209e459 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -57,8 +57,8 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask,
 	stat->gid = vfsgid_into_kgid(vfsgid);
 	stat->rdev = inode->i_rdev;
 	stat->size = i_size_read(inode);
-	stat->atime = inode->i_atime;
-	stat->mtime = inode->i_mtime;
+	stat->atime = inode_get_atime(inode);
+	stat->mtime = inode_get_mtime(inode);
 	stat->ctime = inode_get_ctime(inode);
 	stat->blksize = i_blocksize(inode);
 	stat->blocks = inode->i_blocks;
diff --git a/fs/super.c b/fs/super.c
index 2d762ce67f6e..c7b452e12e4c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1419,32 +1419,48 @@ EXPORT_SYMBOL(sget_dev);
 
 #ifdef CONFIG_BLOCK
 /*
- * Lock a super block that the callers holds a reference to.
+ * Lock the superblock that is the holder of the bdev. Returns the superblock
+ * pointer if we successfully locked the superblock and it is alive. Otherwise
+ * we return NULL and just unlock bdev->bd_holder_lock.
  *
- * The caller needs to ensure that the super_block isn't being freed while
- * calling this function, e.g. by holding a lock over the call to this function
- * and the place that clears the pointer to the superblock used by this function
- * before freeing the superblock.
+ * The function must be called with bdev->bd_holder_lock held and releases it.
  */
-static bool super_lock_shared_active(struct super_block *sb)
+static struct super_block *bdev_super_lock_shared(struct block_device *bdev)
+	__releases(&bdev->bd_holder_lock)
 {
-	bool born = super_lock_shared(sb);
+	struct super_block *sb = bdev->bd_holder;
+	bool born;
+
+	lockdep_assert_held(&bdev->bd_holder_lock);
+	lockdep_assert_not_held(&sb->s_umount);
+	lockdep_assert_not_held(&bdev->bd_disk->open_mutex);
+
+	/* Make sure sb doesn't go away from under us */
+	spin_lock(&sb_lock);
+	sb->s_count++;
+	spin_unlock(&sb_lock);
+	mutex_unlock(&bdev->bd_holder_lock);
 
+	born = super_lock_shared(sb);
 	if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) {
 		super_unlock_shared(sb);
-		return false;
+		put_super(sb);
+		return NULL;
 	}
-	return true;
+	/*
+	 * The superblock is active and we hold s_umount, we can drop our
+	 * temporary reference now.
+	 */
+	put_super(sb);
+	return sb;
 }
 
 static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
 {
-	struct super_block *sb = bdev->bd_holder;
-
-	/* bd_holder_lock ensures that the sb isn't freed */
-	lockdep_assert_held(&bdev->bd_holder_lock);
+	struct super_block *sb;
 
-	if (!super_lock_shared_active(sb))
+	sb = bdev_super_lock_shared(bdev);
+	if (!sb)
 		return;
 
 	if (!surprise)
@@ -1459,11 +1475,10 @@ static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
 
 static void fs_bdev_sync(struct block_device *bdev)
 {
-	struct super_block *sb = bdev->bd_holder;
-
-	lockdep_assert_held(&bdev->bd_holder_lock);
+	struct super_block *sb;
 
-	if (!super_lock_shared_active(sb))
+	sb = bdev_super_lock_shared(bdev);
+	if (!sb)
 		return;
 	sync_filesystem(sb);
 	super_unlock_shared(sb);
@@ -1479,14 +1494,16 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
 		struct fs_context *fc)
 {
 	blk_mode_t mode = sb_open_mode(sb_flags);
+	struct bdev_handle *bdev_handle;
 	struct block_device *bdev;
 
-	bdev = blkdev_get_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
-	if (IS_ERR(bdev)) {
+	bdev_handle = bdev_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
+	if (IS_ERR(bdev_handle)) {
 		if (fc)
 			errorf(fc, "%s: Can't open blockdev", fc->source);
-		return PTR_ERR(bdev);
+		return PTR_ERR(bdev_handle);
 	}
+	bdev = bdev_handle->bdev;
 
 	/*
 	 * This really should be in blkdev_get_by_dev, but right now can't due
@@ -1494,7 +1511,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
 	 * writable from userspace even for a read-only block device.
 	 */
 	if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
-		blkdev_put(bdev, sb);
+		bdev_release(bdev_handle);
 		return -EACCES;
 	}
 
@@ -1510,10 +1527,11 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
 		mutex_unlock(&bdev->bd_fsfreeze_mutex);
 		if (fc)
 			warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
-		blkdev_put(bdev, sb);
+		bdev_release(bdev_handle);
 		return -EBUSY;
 	}
 	spin_lock(&sb_lock);
+	sb->s_bdev_handle = bdev_handle;
 	sb->s_bdev = bdev;
 	sb->s_bdi = bdi_get(bdev->bd_disk->bdi);
 	if (bdev_stable_writes(bdev))
@@ -1646,7 +1664,7 @@ void kill_block_super(struct super_block *sb)
 	generic_shutdown_super(sb);
 	if (bdev) {
 		sync_blockdev(bdev);
-		blkdev_put(bdev, sb);
+		bdev_release(sb->s_bdev_handle);
 	}
 }
 
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 2f5ead88d00b..2e126d72d619 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -224,7 +224,7 @@ got_it:
 	memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2);
 	de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
 	dir_commit_chunk(page, pos, SYSV_DIRSIZE);
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	mark_inode_dirty(dir);
 	err = sysv_handle_dirsync(dir);
 out_page:
@@ -249,7 +249,7 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page)
 	}
 	de->inode = 0;
 	dir_commit_chunk(page, pos, SYSV_DIRSIZE);
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	mark_inode_dirty(inode);
 	return sysv_handle_dirsync(inode);
 }
@@ -346,7 +346,7 @@ int sysv_set_link(struct sysv_dir_entry *de, struct page *page,
 	}
 	de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
 	dir_commit_chunk(page, pos, SYSV_DIRSIZE);
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	mark_inode_dirty(dir);
 	return sysv_handle_dirsync(inode);
 }
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 6719da5889d9..269df6d49815 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -165,7 +165,7 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode)
 	dirty_sb(sb);
 	inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
 	inode->i_ino = fs16_to_cpu(sbi, ino);
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	inode->i_blocks = 0;
 	memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data));
 	SYSV_I(inode)->i_dir_start_lookup = 0;
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 0aa3827d8178..5a915b2e68f5 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -200,11 +200,9 @@ struct inode *sysv_iget(struct super_block *sb, unsigned int ino)
 	i_gid_write(inode, (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid));
 	set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink));
 	inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size);
-	inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime);
-	inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_mtime);
+	inode_set_atime(inode, fs32_to_cpu(sbi, raw_inode->i_atime), 0);
+	inode_set_mtime(inode, fs32_to_cpu(sbi, raw_inode->i_mtime), 0);
 	inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->i_ctime), 0);
-	inode->i_atime.tv_nsec = 0;
-	inode->i_mtime.tv_nsec = 0;
 	inode->i_blocks = 0;
 
 	si = SYSV_I(inode);
@@ -253,9 +251,9 @@ static int __sysv_write_inode(struct inode *inode, int wait)
 	raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(i_gid_read(inode)));
 	raw_inode->i_nlink = cpu_to_fs16(sbi, inode->i_nlink);
 	raw_inode->i_size = cpu_to_fs32(sbi, inode->i_size);
-	raw_inode->i_atime = cpu_to_fs32(sbi, inode->i_atime.tv_sec);
-	raw_inode->i_mtime = cpu_to_fs32(sbi, inode->i_mtime.tv_sec);
-	raw_inode->i_ctime = cpu_to_fs32(sbi, inode_get_ctime(inode).tv_sec);
+	raw_inode->i_atime = cpu_to_fs32(sbi, inode_get_atime_sec(inode));
+	raw_inode->i_mtime = cpu_to_fs32(sbi, inode_get_mtime_sec(inode));
+	raw_inode->i_ctime = cpu_to_fs32(sbi, inode_get_ctime_sec(inode));
 
 	si = SYSV_I(inode);
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index edb94e55de8e..725981474e5f 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -423,7 +423,7 @@ do_indirects:
 		}
 		n++;
 	}
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	if (IS_SYNC(inode))
 		sysv_sync_inode (inode);
 	else
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 891653ba9cf3..429603d865a9 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -152,7 +152,7 @@ struct inode *tracefs_get_inode(struct super_block *sb)
 	struct inode *inode = new_inode(sb);
 	if (inode) {
 		inode->i_ino = get_next_ino();
-		inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+		simple_inode_init_ts(inode);
 	}
 	return inode;
 }
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index 3125e76376ee..921f9033d0d2 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -88,8 +88,7 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn,
 }
 
 const struct fscrypt_operations ubifs_crypt_operations = {
-	.flags			= FS_CFLG_OWN_PAGES,
-	.key_prefix		= "ubifs:",
+	.legacy_key_prefix	= "ubifs:",
 	.get_context		= ubifs_crypt_get_context,
 	.set_context		= ubifs_crypt_set_context,
 	.empty_dir		= ubifs_crypt_empty_dir,
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index eef9e527d9ff..d013c5b3f1ed 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -237,14 +237,14 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode)
 	pr_err("\tuid            %u\n", (unsigned int)i_uid_read(inode));
 	pr_err("\tgid            %u\n", (unsigned int)i_gid_read(inode));
 	pr_err("\tatime          %u.%u\n",
-	       (unsigned int)inode->i_atime.tv_sec,
-	       (unsigned int)inode->i_atime.tv_nsec);
+	       (unsigned int) inode_get_atime_sec(inode),
+	       (unsigned int) inode_get_atime_nsec(inode));
 	pr_err("\tmtime          %u.%u\n",
-	       (unsigned int)inode->i_mtime.tv_sec,
-	       (unsigned int)inode->i_mtime.tv_nsec);
+	       (unsigned int) inode_get_mtime_sec(inode),
+	       (unsigned int) inode_get_mtime_nsec(inode));
 	pr_err("\tctime          %u.%u\n",
-	       (unsigned int) inode_get_ctime(inode).tv_sec,
-	       (unsigned int) inode_get_ctime(inode).tv_nsec);
+	       (unsigned int) inode_get_ctime_sec(inode),
+	       (unsigned int) inode_get_ctime_nsec(inode));
 	pr_err("\tcreat_sqnum    %llu\n", ui->creat_sqnum);
 	pr_err("\txattr_size     %u\n", ui->xattr_size);
 	pr_err("\txattr_cnt      %u\n", ui->xattr_cnt);
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 2f48c58d47cd..7af442de44c3 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -96,7 +96,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
 	inode->i_flags |= S_NOCMTIME;
 
 	inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	inode->i_mapping->nrpages = 0;
 
 	if (!is_xattr) {
@@ -324,7 +324,8 @@ static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir,
 	mutex_lock(&dir_ui->ui_mutex);
 	dir->i_size += sz_change;
 	dir_ui->ui_size = dir->i_size;
-	dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+	inode_set_mtime_to_ts(dir,
+			      inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
 	err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
 	if (err)
 		goto out_cancel;
@@ -767,7 +768,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
 	inode_set_ctime_current(inode);
 	dir->i_size += sz_change;
 	dir_ui->ui_size = dir->i_size;
-	dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+	inode_set_mtime_to_ts(dir,
+			      inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
 	err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
 	if (err)
 		goto out_cancel;
@@ -841,7 +843,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
 	drop_nlink(inode);
 	dir->i_size -= sz_change;
 	dir_ui->ui_size = dir->i_size;
-	dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+	inode_set_mtime_to_ts(dir,
+			      inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
 	err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
 	if (err)
 		goto out_cancel;
@@ -944,7 +947,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
 	drop_nlink(dir);
 	dir->i_size -= sz_change;
 	dir_ui->ui_size = dir->i_size;
-	dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+	inode_set_mtime_to_ts(dir,
+			      inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
 	err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
 	if (err)
 		goto out_cancel;
@@ -1018,7 +1022,8 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 	inc_nlink(dir);
 	dir->i_size += sz_change;
 	dir_ui->ui_size = dir->i_size;
-	dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+	inode_set_mtime_to_ts(dir,
+			      inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
 	err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
 	if (err) {
 		ubifs_err(c, "cannot create directory, error %d", err);
@@ -1109,7 +1114,8 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 	mutex_lock(&dir_ui->ui_mutex);
 	dir->i_size += sz_change;
 	dir_ui->ui_size = dir->i_size;
-	dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+	inode_set_mtime_to_ts(dir,
+			      inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
 	err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
 	if (err)
 		goto out_cancel;
@@ -1209,7 +1215,8 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 	mutex_lock(&dir_ui->ui_mutex);
 	dir->i_size += sz_change;
 	dir_ui->ui_size = dir->i_size;
-	dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+	inode_set_mtime_to_ts(dir,
+			      inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
 	err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
 	if (err)
 		goto out_cancel;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index e5382f0b2587..2e65fd2dbdc3 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1088,9 +1088,9 @@ static void do_attr_changes(struct inode *inode, const struct iattr *attr)
 	if (attr->ia_valid & ATTR_GID)
 		inode->i_gid = attr->ia_gid;
 	if (attr->ia_valid & ATTR_ATIME)
-		inode->i_atime = attr->ia_atime;
+		inode_set_atime_to_ts(inode, attr->ia_atime);
 	if (attr->ia_valid & ATTR_MTIME)
-		inode->i_mtime = attr->ia_mtime;
+		inode_set_mtime_to_ts(inode, attr->ia_mtime);
 	if (attr->ia_valid & ATTR_CTIME)
 		inode_set_ctime_to_ts(inode, attr->ia_ctime);
 	if (attr->ia_valid & ATTR_MODE) {
@@ -1192,7 +1192,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
 	mutex_lock(&ui->ui_mutex);
 	ui->ui_size = inode->i_size;
 	/* Truncation changes inode [mc]time */
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	/* Other attributes may be changed at the same time as well */
 	do_attr_changes(inode, attr);
 	err = ubifs_jnl_truncate(c, inode, old_size, new_size);
@@ -1239,7 +1239,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
 	mutex_lock(&ui->ui_mutex);
 	if (attr->ia_valid & ATTR_SIZE) {
 		/* Truncation changes inode [mc]time */
-		inode->i_mtime = inode_set_ctime_current(inode);
+		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 		/* 'truncate_setsize()' changed @i_size, update @ui_size */
 		ui->ui_size = inode->i_size;
 	}
@@ -1365,9 +1365,9 @@ static inline int mctime_update_needed(const struct inode *inode,
 				       const struct timespec64 *now)
 {
 	struct timespec64 ctime = inode_get_ctime(inode);
+	struct timespec64 mtime = inode_get_mtime(inode);
 
-	if (!timespec64_equal(&inode->i_mtime, now) ||
-	    !timespec64_equal(&ctime, now))
+	if (!timespec64_equal(&mtime, now) || !timespec64_equal(&ctime, now))
 		return 1;
 	return 0;
 }
@@ -1429,7 +1429,7 @@ static int update_mctime(struct inode *inode)
 			return err;
 
 		mutex_lock(&ui->ui_mutex);
-		inode->i_mtime = inode_set_ctime_current(inode);
+		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 		release = ui->dirty;
 		mark_inode_dirty_sync(inode);
 		mutex_unlock(&ui->ui_mutex);
@@ -1567,7 +1567,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
 		struct ubifs_inode *ui = ubifs_inode(inode);
 
 		mutex_lock(&ui->ui_mutex);
-		inode->i_mtime = inode_set_ctime_current(inode);
+		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 		release = ui->dirty;
 		mark_inode_dirty_sync(inode);
 		mutex_unlock(&ui->ui_mutex);
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index ffc9beee7be6..d69d2154645b 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -452,12 +452,12 @@ static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino,
 	ino->ch.node_type = UBIFS_INO_NODE;
 	ino_key_init_flash(c, &ino->key, inode->i_ino);
 	ino->creat_sqnum = cpu_to_le64(ui->creat_sqnum);
-	ino->atime_sec  = cpu_to_le64(inode->i_atime.tv_sec);
-	ino->atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
-	ino->ctime_sec  = cpu_to_le64(inode_get_ctime(inode).tv_sec);
-	ino->ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
-	ino->mtime_sec  = cpu_to_le64(inode->i_mtime.tv_sec);
-	ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+	ino->atime_sec  = cpu_to_le64(inode_get_atime_sec(inode));
+	ino->atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode));
+	ino->ctime_sec  = cpu_to_le64(inode_get_ctime_sec(inode));
+	ino->ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
+	ino->mtime_sec  = cpu_to_le64(inode_get_mtime_sec(inode));
+	ino->mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
 	ino->uid   = cpu_to_le32(i_uid_read(inode));
 	ino->gid   = cpu_to_le32(i_gid_read(inode));
 	ino->mode  = cpu_to_le32(inode->i_mode);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index b08fb28d16b5..366941d4a18a 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -142,10 +142,10 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
 	set_nlink(inode, le32_to_cpu(ino->nlink));
 	i_uid_write(inode, le32_to_cpu(ino->uid));
 	i_gid_write(inode, le32_to_cpu(ino->gid));
-	inode->i_atime.tv_sec  = (int64_t)le64_to_cpu(ino->atime_sec);
-	inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec);
-	inode->i_mtime.tv_sec  = (int64_t)le64_to_cpu(ino->mtime_sec);
-	inode->i_mtime.tv_nsec = le32_to_cpu(ino->mtime_nsec);
+	inode_set_atime(inode, (int64_t)le64_to_cpu(ino->atime_sec),
+			le32_to_cpu(ino->atime_nsec));
+	inode_set_mtime(inode, (int64_t)le64_to_cpu(ino->mtime_sec),
+			le32_to_cpu(ino->mtime_nsec));
 	inode_set_ctime(inode, (int64_t)le64_to_cpu(ino->ctime_sec),
 			le32_to_cpu(ino->ctime_nsec));
 	inode->i_mode = le32_to_cpu(ino->mode);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index ebb3ad6b5e7e..62633816d7d0 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -2043,7 +2043,7 @@ ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf,
 			size_t size);
 
 #ifdef CONFIG_UBIFS_FS_XATTR
-extern const struct xattr_handler *ubifs_xattr_handlers[];
+extern const struct xattr_handler * const ubifs_xattr_handlers[];
 ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size);
 void ubifs_evict_xattr_inode(struct ubifs_info *c, ino_t xattr_inum);
 int ubifs_purge_xattrs(struct inode *host);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 406c82eab513..0847db521984 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -735,7 +735,7 @@ static const struct xattr_handler ubifs_security_xattr_handler = {
 };
 #endif
 
-const struct xattr_handler *ubifs_xattr_handlers[] = {
+const struct xattr_handler * const ubifs_xattr_handlers[] = {
 	&ubifs_user_xattr_handler,
 	&ubifs_trusted_xattr_handler,
 #ifdef CONFIG_UBIFS_FS_SECURITY
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 6b558cbbeb6b..5f1f969f4134 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -100,8 +100,8 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode)
 		iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
 	else
 		iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
-	iinfo->i_crtime = inode->i_mtime;
+	simple_inode_init_ts(inode);
+	iinfo->i_crtime = inode_get_mtime(inode);
 	if (unlikely(insert_inode_locked(inode) < 0)) {
 		make_bad_inode(inode);
 		iput(inode);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index a17a6184cc39..d8493449d4c5 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1296,7 +1296,7 @@ set_size:
 			goto out_unlock;
 	}
 update_time:
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	if (IS_SYNC(inode))
 		udf_sync_inode(inode);
 	else
@@ -1327,7 +1327,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode)
 	int bs = inode->i_sb->s_blocksize;
 	int ret = -EIO;
 	uint32_t uid, gid;
-	struct timespec64 ctime;
+	struct timespec64 ts;
 
 reread:
 	if (iloc->partitionReferenceNum >= sbi->s_partitions) {
@@ -1504,10 +1504,12 @@ reread:
 		inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
 			(inode->i_sb->s_blocksize_bits - 9);
 
-		udf_disk_stamp_to_time(&inode->i_atime, fe->accessTime);
-		udf_disk_stamp_to_time(&inode->i_mtime, fe->modificationTime);
-		udf_disk_stamp_to_time(&ctime, fe->attrTime);
-		inode_set_ctime_to_ts(inode, ctime);
+		udf_disk_stamp_to_time(&ts, fe->accessTime);
+		inode_set_atime_to_ts(inode, ts);
+		udf_disk_stamp_to_time(&ts, fe->modificationTime);
+		inode_set_mtime_to_ts(inode, ts);
+		udf_disk_stamp_to_time(&ts, fe->attrTime);
+		inode_set_ctime_to_ts(inode, ts);
 
 		iinfo->i_unique = le64_to_cpu(fe->uniqueID);
 		iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr);
@@ -1519,11 +1521,13 @@ reread:
 		inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) <<
 		    (inode->i_sb->s_blocksize_bits - 9);
 
-		udf_disk_stamp_to_time(&inode->i_atime, efe->accessTime);
-		udf_disk_stamp_to_time(&inode->i_mtime, efe->modificationTime);
+		udf_disk_stamp_to_time(&ts, efe->accessTime);
+		inode_set_atime_to_ts(inode, ts);
+		udf_disk_stamp_to_time(&ts, efe->modificationTime);
+		inode_set_mtime_to_ts(inode, ts);
+		udf_disk_stamp_to_time(&ts, efe->attrTime);
+		inode_set_ctime_to_ts(inode, ts);
 		udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime);
-		udf_disk_stamp_to_time(&ctime, efe->attrTime);
-		inode_set_ctime_to_ts(inode, ctime);
 
 		iinfo->i_unique = le64_to_cpu(efe->uniqueID);
 		iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr);
@@ -1798,8 +1802,8 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 		       inode->i_sb->s_blocksize - sizeof(struct fileEntry));
 		fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded);
 
-		udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime);
-		udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime);
+		udf_time_to_disk_stamp(&fe->accessTime, inode_get_atime(inode));
+		udf_time_to_disk_stamp(&fe->modificationTime, inode_get_mtime(inode));
 		udf_time_to_disk_stamp(&fe->attrTime, inode_get_ctime(inode));
 		memset(&(fe->impIdent), 0, sizeof(struct regid));
 		strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER);
@@ -1829,12 +1833,14 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 				cpu_to_le32(inode->i_sb->s_blocksize);
 		}
 
-		udf_adjust_time(iinfo, inode->i_atime);
-		udf_adjust_time(iinfo, inode->i_mtime);
+		udf_adjust_time(iinfo, inode_get_atime(inode));
+		udf_adjust_time(iinfo, inode_get_mtime(inode));
 		udf_adjust_time(iinfo, inode_get_ctime(inode));
 
-		udf_time_to_disk_stamp(&efe->accessTime, inode->i_atime);
-		udf_time_to_disk_stamp(&efe->modificationTime, inode->i_mtime);
+		udf_time_to_disk_stamp(&efe->accessTime,
+				       inode_get_atime(inode));
+		udf_time_to_disk_stamp(&efe->modificationTime,
+				       inode_get_mtime(inode));
 		udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime);
 		udf_time_to_disk_stamp(&efe->attrTime, inode_get_ctime(inode));
 
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index ae55ab8859b6..3508ac484da3 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -365,7 +365,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
 	*(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse =
 		cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL);
 	udf_fiiter_write_fi(&iter, NULL);
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	mark_inode_dirty(dir);
 	udf_fiiter_release(&iter);
 	udf_add_fid_counter(dir->i_sb, false, 1);
@@ -471,7 +471,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 	udf_fiiter_release(&iter);
 	udf_add_fid_counter(dir->i_sb, true, 1);
 	inc_nlink(dir);
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	mark_inode_dirty(dir);
 	d_instantiate_new(dentry, inode);
 
@@ -523,8 +523,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
 	inode->i_size = 0;
 	inode_dec_link_count(dir);
 	udf_add_fid_counter(dir->i_sb, true, -1);
-	dir->i_mtime = inode_set_ctime_to_ts(dir,
-					     inode_set_ctime_current(inode));
+	inode_set_mtime_to_ts(dir,
+			      inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
 	mark_inode_dirty(dir);
 	ret = 0;
 end_rmdir:
@@ -555,7 +555,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
 		set_nlink(inode, 1);
 	}
 	udf_fiiter_delete_entry(&iter);
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	mark_inode_dirty(dir);
 	inode_dec_link_count(inode);
 	udf_add_fid_counter(dir->i_sb, false, -1);
@@ -748,7 +748,7 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
 	udf_add_fid_counter(dir->i_sb, false, 1);
 	inode_set_ctime_current(inode);
 	mark_inode_dirty(inode);
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	mark_inode_dirty(dir);
 	ihold(inode);
 	d_instantiate(dentry, inode);
@@ -866,8 +866,8 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 		udf_add_fid_counter(old_dir->i_sb, S_ISDIR(new_inode->i_mode),
 				    -1);
 	}
-	old_dir->i_mtime = inode_set_ctime_current(old_dir);
-	new_dir->i_mtime = inode_set_ctime_current(new_dir);
+	inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
+	inode_set_mtime_to_ts(new_dir, inode_set_ctime_current(new_dir));
 	mark_inode_dirty(old_dir);
 	mark_inode_dirty(new_dir);
 
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index fd57f03b6c93..27c85d92d1dc 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -107,7 +107,7 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
 	ufs_commit_chunk(page, pos, len);
 	ufs_put_page(page);
 	if (update_times)
-		dir->i_mtime = inode_set_ctime_current(dir);
+		inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	mark_inode_dirty(dir);
 	ufs_handle_dirsync(dir);
 }
@@ -397,7 +397,7 @@ got_it:
 	ufs_set_de_type(sb, de, inode->i_mode);
 
 	ufs_commit_chunk(page, pos, rec_len);
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 
 	mark_inode_dirty(dir);
 	err = ufs_handle_dirsync(dir);
@@ -539,7 +539,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
 		pde->d_reclen = cpu_to_fs16(sb, to - from);
 	dir->d_ino = 0;
 	ufs_commit_chunk(page, pos, to - from);
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	mark_inode_dirty(inode);
 	err = ufs_handle_dirsync(inode);
 out:
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index a1e7bd9d1f98..73531827ecee 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -292,7 +292,7 @@ cg_found:
 	inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
 	inode->i_blocks = 0;
 	inode->i_generation = 0;
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	ufsi->i_flags = UFS_I(dir)->i_flags;
 	ufsi->i_lastfrag = 0;
 	ufsi->i_shadow = 0;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 21a4779a2de5..338e4b97312f 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -579,13 +579,15 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
 	i_gid_write(inode, ufs_get_inode_gid(sb, ufs_inode));
 
 	inode->i_size = fs64_to_cpu(sb, ufs_inode->ui_size);
-	inode->i_atime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec);
+	inode_set_atime(inode,
+			(signed)fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec),
+			0);
 	inode_set_ctime(inode,
 			(signed)fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec),
 			0);
-	inode->i_mtime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec);
-	inode->i_mtime.tv_nsec = 0;
-	inode->i_atime.tv_nsec = 0;
+	inode_set_mtime(inode,
+			(signed)fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec),
+			0);
 	inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks);
 	inode->i_generation = fs32_to_cpu(sb, ufs_inode->ui_gen);
 	ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags);
@@ -626,12 +628,12 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
 	i_gid_write(inode, fs32_to_cpu(sb, ufs2_inode->ui_gid));
 
 	inode->i_size = fs64_to_cpu(sb, ufs2_inode->ui_size);
-	inode->i_atime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_atime);
+	inode_set_atime(inode, fs64_to_cpu(sb, ufs2_inode->ui_atime),
+			fs32_to_cpu(sb, ufs2_inode->ui_atimensec));
 	inode_set_ctime(inode, fs64_to_cpu(sb, ufs2_inode->ui_ctime),
 			fs32_to_cpu(sb, ufs2_inode->ui_ctimensec));
-	inode->i_mtime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_mtime);
-	inode->i_atime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_atimensec);
-	inode->i_mtime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_mtimensec);
+	inode_set_mtime(inode, fs64_to_cpu(sb, ufs2_inode->ui_mtime),
+			fs32_to_cpu(sb, ufs2_inode->ui_mtimensec));
 	inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks);
 	inode->i_generation = fs32_to_cpu(sb, ufs2_inode->ui_gen);
 	ufsi->i_flags = fs32_to_cpu(sb, ufs2_inode->ui_flags);
@@ -725,12 +727,14 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode)
 	ufs_set_inode_gid(sb, ufs_inode, i_gid_read(inode));
 
 	ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size);
-	ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec);
+	ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb,
+						 inode_get_atime_sec(inode));
 	ufs_inode->ui_atime.tv_usec = 0;
 	ufs_inode->ui_ctime.tv_sec = cpu_to_fs32(sb,
-						 inode_get_ctime(inode).tv_sec);
+						 inode_get_ctime_sec(inode));
 	ufs_inode->ui_ctime.tv_usec = 0;
-	ufs_inode->ui_mtime.tv_sec = cpu_to_fs32(sb, inode->i_mtime.tv_sec);
+	ufs_inode->ui_mtime.tv_sec = cpu_to_fs32(sb,
+						 inode_get_mtime_sec(inode));
 	ufs_inode->ui_mtime.tv_usec = 0;
 	ufs_inode->ui_blocks = cpu_to_fs32(sb, inode->i_blocks);
 	ufs_inode->ui_flags = cpu_to_fs32(sb, ufsi->i_flags);
@@ -770,13 +774,15 @@ static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode)
 	ufs_inode->ui_gid = cpu_to_fs32(sb, i_gid_read(inode));
 
 	ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size);
-	ufs_inode->ui_atime = cpu_to_fs64(sb, inode->i_atime.tv_sec);
-	ufs_inode->ui_atimensec = cpu_to_fs32(sb, inode->i_atime.tv_nsec);
-	ufs_inode->ui_ctime = cpu_to_fs64(sb, inode_get_ctime(inode).tv_sec);
+	ufs_inode->ui_atime = cpu_to_fs64(sb, inode_get_atime_sec(inode));
+	ufs_inode->ui_atimensec = cpu_to_fs32(sb,
+					      inode_get_atime_nsec(inode));
+	ufs_inode->ui_ctime = cpu_to_fs64(sb, inode_get_ctime_sec(inode));
 	ufs_inode->ui_ctimensec = cpu_to_fs32(sb,
-					      inode_get_ctime(inode).tv_nsec);
-	ufs_inode->ui_mtime = cpu_to_fs64(sb, inode->i_mtime.tv_sec);
-	ufs_inode->ui_mtimensec = cpu_to_fs32(sb, inode->i_mtime.tv_nsec);
+					      inode_get_ctime_nsec(inode));
+	ufs_inode->ui_mtime = cpu_to_fs64(sb, inode_get_mtime_sec(inode));
+	ufs_inode->ui_mtimensec = cpu_to_fs32(sb,
+					      inode_get_mtime_nsec(inode));
 
 	ufs_inode->ui_blocks = cpu_to_fs64(sb, inode->i_blocks);
 	ufs_inode->ui_flags = cpu_to_fs32(sb, ufsi->i_flags);
@@ -1208,7 +1214,7 @@ static int ufs_truncate(struct inode *inode, loff_t size)
 	truncate_setsize(inode, size);
 
 	ufs_truncate_blocks(inode);
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	mark_inode_dirty(inode);
 out:
 	UFSD("EXIT: err %d\n", err);
diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c
index 83f20dd15522..72ac9320e6a3 100644
--- a/fs/vboxsf/utils.c
+++ b/fs/vboxsf/utils.c
@@ -126,12 +126,12 @@ int vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode,
 	do_div(allocated, 512);
 	inode->i_blocks = allocated;
 
-	inode->i_atime = ns_to_timespec64(
-				 info->access_time.ns_relative_to_unix_epoch);
+	inode_set_atime_to_ts(inode,
+			      ns_to_timespec64(info->access_time.ns_relative_to_unix_epoch));
 	inode_set_ctime_to_ts(inode,
 			      ns_to_timespec64(info->change_time.ns_relative_to_unix_epoch));
-	inode->i_mtime = ns_to_timespec64(
-			   info->modification_time.ns_relative_to_unix_epoch);
+	inode_set_mtime_to_ts(inode,
+			      ns_to_timespec64(info->modification_time.ns_relative_to_unix_epoch));
 	return 0;
 }
 
@@ -194,7 +194,7 @@ int vboxsf_inode_revalidate(struct dentry *dentry)
 	struct vboxsf_sbi *sbi;
 	struct vboxsf_inode *sf_i;
 	struct shfl_fsobjinfo info;
-	struct timespec64 prev_mtime;
+	struct timespec64 mtime, prev_mtime;
 	struct inode *inode;
 	int err;
 
@@ -202,7 +202,7 @@ int vboxsf_inode_revalidate(struct dentry *dentry)
 		return -EINVAL;
 
 	inode = d_inode(dentry);
-	prev_mtime = inode->i_mtime;
+	prev_mtime = inode_get_mtime(inode);
 	sf_i = VBOXSF_I(inode);
 	sbi = VBOXSF_SBI(dentry->d_sb);
 	if (!sf_i->force_restat) {
@@ -225,7 +225,8 @@ int vboxsf_inode_revalidate(struct dentry *dentry)
 	 * page-cache for it.  Note this also gets triggered by our own writes,
 	 * this is unavoidable.
 	 */
-	if (timespec64_compare(&inode->i_mtime, &prev_mtime) > 0)
+	mtime = inode_get_mtime(inode);
+	if (timespec64_compare(&mtime, &prev_mtime) > 0)
 		invalidate_inode_pages2(inode->i_mapping);
 
 	return 0;
diff --git a/fs/xattr.c b/fs/xattr.c
index efd4736bc94b..09d927603433 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -56,7 +56,7 @@ strcmp_prefix(const char *a, const char *a_prefix)
 static const struct xattr_handler *
 xattr_resolve_name(struct inode *inode, const char **name)
 {
-	const struct xattr_handler **handlers = inode->i_sb->s_xattr;
+	const struct xattr_handler * const *handlers = inode->i_sb->s_xattr;
 	const struct xattr_handler *handler;
 
 	if (!(inode->i_opflags & IOP_XATTR)) {
@@ -162,7 +162,7 @@ xattr_permission(struct mnt_idmap *idmap, struct inode *inode,
 int
 xattr_supports_user_prefix(struct inode *inode)
 {
-	const struct xattr_handler **handlers = inode->i_sb->s_xattr;
+	const struct xattr_handler * const *handlers = inode->i_sb->s_xattr;
 	const struct xattr_handler *handler;
 
 	if (!(inode->i_opflags & IOP_XATTR)) {
@@ -999,7 +999,7 @@ int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name)
 ssize_t
 generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
-	const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
+	const struct xattr_handler *handler, * const *handlers = dentry->d_sb->s_xattr;
 	ssize_t remaining_size = buffer_size;
 	int err = 0;
 
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index a35781577cad..543f3748c2a3 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -220,8 +220,10 @@ xfs_inode_from_disk(
 	 * a time before epoch is converted to a time long after epoch
 	 * on 64 bit systems.
 	 */
-	inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime);
-	inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime);
+	inode_set_atime_to_ts(inode,
+			      xfs_inode_from_disk_ts(from, from->di_atime));
+	inode_set_mtime_to_ts(inode,
+			      xfs_inode_from_disk_ts(from, from->di_mtime));
 	inode_set_ctime_to_ts(inode,
 			      xfs_inode_from_disk_ts(from, from->di_ctime));
 
@@ -315,8 +317,8 @@ xfs_inode_to_disk(
 	to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff);
 	to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16);
 
-	to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime);
-	to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime);
+	to->di_atime = xfs_inode_to_disk_ts(ip, inode_get_atime(inode));
+	to->di_mtime = xfs_inode_to_disk_ts(ip, inode_get_mtime(inode));
 	to->di_ctime = xfs_inode_to_disk_ts(ip, inode_get_ctime(inode));
 	to->di_nlink = cpu_to_be32(inode->i_nlink);
 	to->di_gen = cpu_to_be32(inode->i_generation);
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index fa180ab66b73..396648acb5be 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -970,6 +970,7 @@ xfs_rtfree_extent(
 	xfs_mount_t	*mp;		/* file system mount structure */
 	xfs_fsblock_t	sb;		/* summary file block number */
 	struct xfs_buf	*sumbp = NULL;	/* summary file block buffer */
+	struct timespec64 atime;
 
 	mp = tp->t_mountp;
 
@@ -999,7 +1000,10 @@ xfs_rtfree_extent(
 	    mp->m_sb.sb_rextents) {
 		if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM))
 			mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
-		*(uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0;
+		/* Reset the sequence number kept in atime.tv_sec; keep tv_nsec. */
+		atime = inode_get_atime(VFS_I(mp->m_rbmip));
+		atime.tv_sec = 0;
+		inode_set_atime_to_ts(VFS_I(mp->m_rbmip), atime);
 		xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
 	}
 	return 0;
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 6b2296ff248a..70e97ea6eee7 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -65,7 +65,7 @@ xfs_trans_ichgtime(
 	tv = current_time(inode);
 
 	if (flags & XFS_ICHGTIME_MOD)
-		inode->i_mtime = tv;
+		inode_set_mtime_to_ts(inode, tv);
 	if (flags & XFS_ICHGTIME_CHG)
 		inode_set_ctime_to_ts(inode, tv);
 	if (flags & XFS_ICHGTIME_CREATE)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index fcefab687285..40e0a1f1f753 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1644,7 +1644,7 @@ xfs_swap_extents(
 	uint64_t		f;
 	int			resblks = 0;
 	unsigned int		flags = 0;
-	struct timespec64	ctime;
+	struct timespec64	ctime, mtime;
 
 	/*
 	 * Lock the inodes against other IO, page faults and truncate to
@@ -1758,10 +1758,11 @@ xfs_swap_extents(
 	 * under it.
 	 */
 	ctime = inode_get_ctime(VFS_I(ip));
+	mtime = inode_get_mtime(VFS_I(ip));
 	if ((sbp->bs_ctime.tv_sec != ctime.tv_sec) ||
 	    (sbp->bs_ctime.tv_nsec != ctime.tv_nsec) ||
-	    (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
-	    (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
+	    (sbp->bs_mtime.tv_sec != mtime.tv_sec) ||
+	    (sbp->bs_mtime.tv_nsec != mtime.tv_nsec)) {
 		error = -EBUSY;
 		goto out_trans_cancel;
 	}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index c1ece4a08ff4..003e157241da 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1945,8 +1945,6 @@ void
 xfs_free_buftarg(
 	struct xfs_buftarg	*btp)
 {
-	struct block_device	*bdev = btp->bt_bdev;
-
 	unregister_shrinker(&btp->bt_shrinker);
 	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
 	percpu_counter_destroy(&btp->bt_io_count);
@@ -1954,8 +1952,8 @@ xfs_free_buftarg(
 
 	fs_put_dax(btp->bt_daxdev, btp->bt_mount);
 	/* the main block device is closed by kill_block_super */
-	if (bdev != btp->bt_mount->m_super->s_bdev)
-		blkdev_put(bdev, btp->bt_mount->m_super);
+	if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev)
+		bdev_release(btp->bt_bdev_handle);
 
 	kmem_free(btp);
 }
@@ -1990,16 +1988,15 @@ xfs_setsize_buftarg(
  */
 STATIC int
 xfs_setsize_buftarg_early(
-	xfs_buftarg_t		*btp,
-	struct block_device	*bdev)
+	xfs_buftarg_t		*btp)
 {
-	return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
+	return xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev));
 }
 
 struct xfs_buftarg *
 xfs_alloc_buftarg(
 	struct xfs_mount	*mp,
-	struct block_device	*bdev)
+	struct bdev_handle	*bdev_handle)
 {
 	xfs_buftarg_t		*btp;
 	const struct dax_holder_operations *ops = NULL;
@@ -2010,9 +2007,10 @@ xfs_alloc_buftarg(
 	btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
 
 	btp->bt_mount = mp;
-	btp->bt_dev =  bdev->bd_dev;
-	btp->bt_bdev = bdev;
-	btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off,
+	btp->bt_bdev_handle = bdev_handle;
+	btp->bt_dev = bdev_handle->bdev->bd_dev;
+	btp->bt_bdev = bdev_handle->bdev;
+	btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
 					    mp, ops);
 
 	/*
@@ -2022,7 +2020,7 @@ xfs_alloc_buftarg(
 	ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
 			     DEFAULT_RATELIMIT_BURST);
 
-	if (xfs_setsize_buftarg_early(btp, bdev))
+	if (xfs_setsize_buftarg_early(btp))
 		goto error_free;
 
 	if (list_lru_init(&btp->bt_lru))
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index df8f47953bb4..ada9d310b7d3 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -98,6 +98,7 @@ typedef unsigned int xfs_buf_flags_t;
  */
 typedef struct xfs_buftarg {
 	dev_t			bt_dev;
+	struct bdev_handle	*bt_bdev_handle;
 	struct block_device	*bt_bdev;
 	struct dax_device	*bt_daxdev;
 	u64			bt_dax_part_off;
@@ -364,7 +365,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
  *	Handling of buftargs.
  */
 struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp,
-		struct block_device *bdev);
+		struct bdev_handle *bdev_handle);
 extern void xfs_free_buftarg(struct xfs_buftarg *);
 extern void xfs_buftarg_wait(struct xfs_buftarg *);
 extern void xfs_buftarg_drain(struct xfs_buftarg *);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 4d55f58d99b7..36f5cf802c07 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -844,8 +844,8 @@ xfs_init_new_inode(
 	ASSERT(ip->i_nblocks == 0);
 
 	tv = inode_set_ctime_current(inode);
-	inode->i_mtime = tv;
-	inode->i_atime = tv;
+	inode_set_mtime_to_ts(inode, tv);
+	inode_set_atime_to_ts(inode, tv);
 
 	ip->i_extsize = 0;
 	ip->i_diflags = 0;
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 127b2410eb20..17c51804f9c6 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -526,8 +526,8 @@ xfs_inode_to_log_dinode(
 	to->di_projid_hi = ip->i_projid >> 16;
 
 	memset(to->di_pad3, 0, sizeof(to->di_pad3));
-	to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime);
-	to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime);
+	to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode_get_atime(inode));
+	to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode_get_mtime(inode));
 	to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode_get_ctime(inode));
 	to->di_nlink = inode->i_nlink;
 	to->di_gen = inode->i_generation;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 2b3b05c28e9e..fdfda4fba12b 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -572,8 +572,8 @@ xfs_vn_getattr(
 	stat->uid = vfsuid_into_kuid(vfsuid);
 	stat->gid = vfsgid_into_kgid(vfsgid);
 	stat->ino = ip->i_ino;
-	stat->atime = inode->i_atime;
-	stat->mtime = inode->i_mtime;
+	stat->atime = inode_get_atime(inode);
+	stat->mtime = inode_get_mtime(inode);
 	stat->ctime = inode_get_ctime(inode);
 	stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks);
 
@@ -1067,9 +1067,9 @@ xfs_vn_update_time(
 		now = current_time(inode);
 
 	if (flags & S_MTIME)
-		inode->i_mtime = now;
+		inode_set_mtime_to_ts(inode, now);
 	if (flags & S_ATIME)
-		inode->i_atime = now;
+		inode_set_atime_to_ts(inode, now);
 
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 	xfs_trans_log_inode(tp, ip, log_flags);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f5377ba5967a..14462614fcc8 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -107,12 +107,12 @@ xfs_bulkstat_one_int(
 	buf->bs_size = ip->i_disk_size;
 
 	buf->bs_nlink = inode->i_nlink;
-	buf->bs_atime = inode->i_atime.tv_sec;
-	buf->bs_atime_nsec = inode->i_atime.tv_nsec;
-	buf->bs_mtime = inode->i_mtime.tv_sec;
-	buf->bs_mtime_nsec = inode->i_mtime.tv_nsec;
-	buf->bs_ctime = inode_get_ctime(inode).tv_sec;
-	buf->bs_ctime_nsec = inode_get_ctime(inode).tv_nsec;
+	buf->bs_atime = inode_get_atime_sec(inode);
+	buf->bs_atime_nsec = inode_get_atime_nsec(inode);
+	buf->bs_mtime = inode_get_mtime_sec(inode);
+	buf->bs_mtime_nsec = inode_get_mtime_nsec(inode);
+	buf->bs_ctime = inode_get_ctime_sec(inode);
+	buf->bs_ctime_nsec = inode_get_ctime_nsec(inode);
 	buf->bs_gen = inode->i_generation;
 	buf->bs_mode = inode->i_mode;
 
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 16534e9873f6..2e1a4e5cd03d 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1420,25 +1420,26 @@ xfs_rtunmount_inodes(
  */
 int					/* error */
 xfs_rtpick_extent(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_extlen_t	len,		/* allocation length (rtextents) */
-	xfs_rtblock_t	*pick)		/* result rt extent */
+	xfs_mount_t		*mp,		/* file system mount point */
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_extlen_t		len,		/* allocation length (rtextents) */
+	xfs_rtblock_t		*pick)		/* result rt extent */
 {
-	xfs_rtblock_t	b;		/* result block */
-	int		log2;		/* log of sequence number */
-	uint64_t	resid;		/* residual after log removed */
-	uint64_t	seq;		/* sequence number of file creation */
-	uint64_t	*seqp;		/* pointer to seqno in inode */
+	xfs_rtblock_t		b;		/* result block */
+	int			log2;		/* log of sequence number */
+	uint64_t		resid;		/* residual after log removed */
+	uint64_t		seq;		/* sequence number of file creation */
+	struct timespec64	ts;		/* rbmip atime; tv_sec is the seqno */
 
 	ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
 
-	seqp = (uint64_t *)&VFS_I(mp->m_rbmip)->i_atime;
+	ts = inode_get_atime(VFS_I(mp->m_rbmip));
 	if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) {
 		mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
-		*seqp = 0;
+		seq = 0;
+	} else {
+		seq = (uint64_t)ts.tv_sec;
 	}
-	seq = *seqp;
 	if ((log2 = xfs_highbit64(seq)) == -1)
 		b = 0;
 	else {
@@ -1450,7 +1451,8 @@ xfs_rtpick_extent(
 		if (b + len > mp->m_sb.sb_rextents)
 			b = mp->m_sb.sb_rextents - len;
 	}
-	*seqp = seq + 1;
+	ts.tv_sec = (time64_t)seq + 1;
+	inode_set_atime_to_ts(VFS_I(mp->m_rbmip), ts);
 	xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
 	*pick = b;
 	return 0;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 819a3568b28f..f0ae07828153 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -361,14 +361,15 @@ STATIC int
 xfs_blkdev_get(
 	xfs_mount_t		*mp,
 	const char		*name,
-	struct block_device	**bdevp)
+	struct bdev_handle	**handlep)
 {
 	int			error = 0;
 
-	*bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE,
-				    mp->m_super, &fs_holder_ops);
-	if (IS_ERR(*bdevp)) {
-		error = PTR_ERR(*bdevp);
+	*handlep = bdev_open_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE,
+				     mp->m_super, &fs_holder_ops);
+	if (IS_ERR(*handlep)) {
+		error = PTR_ERR(*handlep);
+		*handlep = NULL;
 		xfs_warn(mp, "Invalid device [%s], error=%d", name, error);
 	}
 
@@ -433,7 +434,7 @@ xfs_open_devices(
 {
 	struct super_block	*sb = mp->m_super;
 	struct block_device	*ddev = sb->s_bdev;
-	struct block_device	*logdev = NULL, *rtdev = NULL;
+	struct bdev_handle	*logdev_handle = NULL, *rtdev_handle = NULL;
 	int			error;
 
 	/*
@@ -446,17 +447,19 @@ xfs_open_devices(
 	 * Open real time and log devices - order is important.
 	 */
 	if (mp->m_logname) {
-		error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
+		error = xfs_blkdev_get(mp, mp->m_logname, &logdev_handle);
 		if (error)
 			goto out_relock;
 	}
 
 	if (mp->m_rtname) {
-		error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev);
+		error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_handle);
 		if (error)
 			goto out_close_logdev;
 
-		if (rtdev == ddev || rtdev == logdev) {
+		if (rtdev_handle->bdev == ddev ||
+		    (logdev_handle &&
+		     rtdev_handle->bdev == logdev_handle->bdev)) {
 			xfs_warn(mp,
 	"Cannot mount filesystem with identical rtdev and ddev/logdev.");
 			error = -EINVAL;
@@ -468,22 +471,25 @@ xfs_open_devices(
 	 * Setup xfs_mount buffer target pointers
 	 */
 	error = -ENOMEM;
-	mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
+	mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_handle);
 	if (!mp->m_ddev_targp)
 		goto out_close_rtdev;
 
-	if (rtdev) {
-		mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev);
+	if (rtdev_handle) {
+		mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_handle);
 		if (!mp->m_rtdev_targp)
 			goto out_free_ddev_targ;
 	}
 
-	if (logdev && logdev != ddev) {
-		mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev);
+	if (logdev_handle && logdev_handle->bdev != ddev) {
+		mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_handle);
 		if (!mp->m_logdev_targp)
 			goto out_free_rtdev_targ;
 	} else {
 		mp->m_logdev_targp = mp->m_ddev_targp;
+		/* Handle won't be used, drop it */
+		if (logdev_handle)
+			bdev_release(logdev_handle);
 	}
 
 	error = 0;
@@ -497,11 +503,11 @@ out_relock:
  out_free_ddev_targ:
 	xfs_free_buftarg(mp->m_ddev_targp);
  out_close_rtdev:
-	 if (rtdev)
-		 blkdev_put(rtdev, sb);
+	if (rtdev_handle)
+		bdev_release(rtdev_handle);
  out_close_logdev:
-	if (logdev && logdev != ddev)
-		blkdev_put(logdev, sb);
+	if (logdev_handle)
+		bdev_release(logdev_handle);
 	goto out_relock;
 }
 
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index a3975f325f4e..987843f84d03 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -186,7 +186,7 @@ static const struct xattr_handler xfs_xattr_security_handler = {
 	.set	= xfs_xattr_set,
 };
 
-const struct xattr_handler *xfs_xattr_handlers[] = {
+const struct xattr_handler * const xfs_xattr_handlers[] = {
 	&xfs_xattr_user_handler,
 	&xfs_xattr_trusted_handler,
 	&xfs_xattr_security_handler,
diff --git a/fs/xfs/xfs_xattr.h b/fs/xfs/xfs_xattr.h
index 2b09133b1b9b..cec766cad26c 100644
--- a/fs/xfs/xfs_xattr.h
+++ b/fs/xfs/xfs_xattr.h
@@ -8,6 +8,6 @@
 
 int xfs_attr_change(struct xfs_da_args *args);
 
-extern const struct xattr_handler *xfs_xattr_handlers[];
+extern const struct xattr_handler * const xfs_xattr_handlers[];
 
 #endif /* __XFS_XATTR_H__ */
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 9d1a9808fbbb..e6a75401677d 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -658,8 +658,8 @@ static struct inode *zonefs_get_file_inode(struct inode *dir,
 
 	inode->i_ino = ino;
 	inode->i_mode = z->z_mode;
-	inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode,
-								inode_get_ctime(dir));
+	inode_set_mtime_to_ts(inode,
+			      inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, inode_get_ctime(dir))));
 	inode->i_uid = z->z_uid;
 	inode->i_gid = z->z_gid;
 	inode->i_size = z->z_wpoffset;
@@ -695,8 +695,8 @@ static struct inode *zonefs_get_zgroup_inode(struct super_block *sb,
 	inode->i_ino = ino;
 	inode_init_owner(&nop_mnt_idmap, inode, root, S_IFDIR | 0555);
 	inode->i_size = sbi->s_zgroup[ztype].g_nr_zones;
-	inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode,
-								inode_get_ctime(root));
+	inode_set_mtime_to_ts(inode,
+			      inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, inode_get_ctime(root))));
 	inode->i_private = &sbi->s_zgroup[ztype];
 	set_nlink(inode, 2);
 
@@ -1319,7 +1319,7 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
 
 	inode->i_ino = bdev_nr_zones(sb->s_bdev);
 	inode->i_mode = S_IFDIR | 0555;
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	inode->i_op = &zonefs_dir_inode_operations;
 	inode->i_fop = &zonefs_dir_operations;
 	inode->i_size = 2;
diff --git a/include/asm-generic/spinlock.h b/include/asm-generic/spinlock.h
index fdfebcb050f4..90803a826ba0 100644
--- a/include/asm-generic/spinlock.h
+++ b/include/asm-generic/spinlock.h
@@ -68,11 +68,18 @@ static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
 	smp_store_release(ptr, (u16)val + 1);
 }
 
+static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
+{
+	u32 val = lock.counter;
+
+	return ((val >> 16) == (val & 0xffff));
+}
+
 static __always_inline int arch_spin_is_locked(arch_spinlock_t *lock)
 {
-	u32 val = atomic_read(lock);
+	arch_spinlock_t val = READ_ONCE(*lock);
 
-	return ((val >> 16) != (val & 0xffff));
+	return !arch_spin_value_unlocked(val);
 }
 
 static __always_inline int arch_spin_is_contended(arch_spinlock_t *lock)
@@ -82,11 +89,6 @@ static __always_inline int arch_spin_is_contended(arch_spinlock_t *lock)
 	return (s16)((val >> 16) - (val & 0xffff)) > 1;
 }
 
-static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
-{
-	return !arch_spin_is_locked(&lock);
-}
-
 #include <asm/qrwlock.h>
 
 #endif /* __ASM_GENERIC_SPINLOCK_H */
diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h
index b83ef19da13d..5e95faa959c4 100644
--- a/include/linux/atomic/atomic-arch-fallback.h
+++ b/include/linux/atomic/atomic-arch-fallback.h
@@ -428,6 +428,19 @@ extern void raw_cmpxchg128_relaxed_not_implemented(void);
 
 #define raw_sync_cmpxchg arch_sync_cmpxchg
 
+#ifdef arch_sync_try_cmpxchg
+#define raw_sync_try_cmpxchg arch_sync_try_cmpxchg
+#else
+#define raw_sync_try_cmpxchg(_ptr, _oldp, _new) \
+({ \
+	typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
+	___r = raw_sync_cmpxchg((_ptr), ___o, (_new)); \
+	if (unlikely(___r != ___o)) \
+		*___op = ___r; \
+	likely(___r == ___o); \
+})
+#endif
+
 /**
  * raw_atomic_read() - atomic load with relaxed ordering
  * @v: pointer to atomic_t
@@ -4649,4 +4662,4 @@ raw_atomic64_dec_if_positive(atomic64_t *v)
 }
 
 #endif /* _LINUX_ATOMIC_FALLBACK_H */
-// 2fdd6702823fa842f9cea57a002e6e4476ae780c
+// eec048affea735b8464f58e6d96992101f8f85f1
diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h
index d401b406ef7c..54d7bbe0aeaa 100644
--- a/include/linux/atomic/atomic-instrumented.h
+++ b/include/linux/atomic/atomic-instrumented.h
@@ -4998,6 +4998,14 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 	raw_try_cmpxchg128_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \
 })
 
+#define sync_try_cmpxchg(ptr, ...) \
+({ \
+	typeof(ptr) __ai_ptr = (ptr); \
+	kcsan_mb(); \
+	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
+	raw_sync_try_cmpxchg(__ai_ptr, __VA_ARGS__); \
+})
+
 
 #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
-// 1568f875fef72097413caab8339120c065a39aa4
+// 2cc4bc990fef44d3836ec108f11b610f3f438184
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 8d51f69f9f5e..70f97f685bff 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -90,6 +90,16 @@ struct linux_binfmt {
 #endif
 } __randomize_layout;
 
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+struct binfmt_misc {
+	struct list_head entries;
+	rwlock_t entries_lock;
+	bool enabled;
+} __randomize_layout;
+
+extern struct binfmt_misc init_binfmt_misc;
+#endif
+
 extern void __register_binfmt(struct linux_binfmt *fmt, int insert);
 
 /* Registration of default binfmt handlers */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index eef450f25982..51fa7ffdee83 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1479,14 +1479,25 @@ extern const struct blk_holder_ops fs_holder_ops;
 #define sb_open_mode(flags) \
 	(BLK_OPEN_READ | (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE))
 
+struct bdev_handle {
+	struct block_device *bdev;
+	void *holder;
+	blk_mode_t mode;
+};
+
 struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
 		const struct blk_holder_ops *hops);
 struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
 		void *holder, const struct blk_holder_ops *hops);
+struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
+		const struct blk_holder_ops *hops);
+struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
+		void *holder, const struct blk_holder_ops *hops);
 int bd_prepare_to_claim(struct block_device *bdev, void *holder,
 		const struct blk_holder_ops *hops);
 void bd_abort_claiming(struct block_device *bdev, void *holder);
 void blkdev_put(struct block_device *bdev, void *holder);
+void bdev_release(struct bdev_handle *handle);
 
 /* just for blk-cgroup, don't use elsewhere */
 struct block_device *blkdev_get_no_open(dev_t dev);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index bf9823956758..b8610e9d2471 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -278,7 +278,7 @@ struct ceph_osd_request {
 	int r_attempts;
 	u32 r_map_dne_bound;
 
-	struct ceph_osd_req_op r_ops[];
+	struct ceph_osd_req_op r_ops[] __counted_by(r_num_ops);
 };
 
 struct ceph_request_redirect {
diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index 53f1a7a932b0..9f1a9c455b68 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -7,8 +7,9 @@
 /*
  * DEFINE_FREE(name, type, free):
  *	simple helper macro that defines the required wrapper for a __free()
- *	based cleanup function. @free is an expression using '_T' to access
- *	the variable.
+ *	based cleanup function. @free is an expression using '_T' to access the
+ *	variable. @free should typically include a NULL test before calling a
+ *	function, see the example below.
  *
  * __free(name):
  *	variable attribute to add a scoped based cleanup to the variable.
@@ -17,6 +18,9 @@
  *	like a non-atomic xchg(var, NULL), such that the cleanup function will
  *	be inhibited -- provided it sanely deals with a NULL value.
  *
+ *	NOTE: this has __must_check semantics so that it is harder to accidentally
+ *	leak the resource.
+ *
  * return_ptr(p):
  *	returns p while inhibiting the __free().
  *
@@ -24,6 +28,8 @@
  *
  * DEFINE_FREE(kfree, void *, if (_T) kfree(_T))
  *
+ * void *alloc_obj(...)
+ * {
  *	struct obj *p __free(kfree) = kmalloc(...);
  *	if (!p)
  *		return NULL;
@@ -32,6 +38,24 @@
  *		return NULL;
  *
  *	return_ptr(p);
+ * }
+ *
+ * NOTE: the DEFINE_FREE()'s @free expression includes a NULL test even though
+ * kfree() is fine to be called with a NULL value. This is on purpose. This way
+ * the compiler sees the end of our alloc_obj() function as:
+ *
+ *	tmp = p;
+ *	p = NULL;
+ *	if (p)
+ *		kfree(p);
+ *	return tmp;
+ *
+ * And through the magic of value-propagation and dead-code-elimination, it
+ * eliminates the actual cleanup call and compiles into:
+ *
+ *	return p;
+ *
+ * Without the NULL test it turns into a mess and the compiler can't help us.
  */
 
 #define DEFINE_FREE(_name, _type, _free) \
@@ -39,8 +63,17 @@
 
 #define __free(_name)	__cleanup(__free_##_name)
 
+#define __get_and_null_ptr(p) \
+	({ __auto_type __ptr = &(p); \
+	   __auto_type __val = *__ptr; \
+	   *__ptr = NULL;  __val; })
+
+static inline __must_check
+const volatile void * __must_check_fn(const volatile void *val)
+{ return val; }
+
 #define no_free_ptr(p) \
-	({ __auto_type __ptr = (p); (p) = NULL; __ptr; })
+	((typeof(p)) __must_check_fn(__get_and_null_ptr(p)))
 
 #define return_ptr(p)	return no_free_ptr(p)
 
diff --git a/drivers/md/bcache/closure.h b/include/linux/closure.h
index c88cdc4ae4ec..722a586bb224 100644
--- a/drivers/md/bcache/closure.h
+++ b/include/linux/closure.h
@@ -155,7 +155,7 @@ struct closure {
 
 	atomic_t		remaining;
 
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+#ifdef CONFIG_DEBUG_CLOSURES
 #define CLOSURE_MAGIC_DEAD	0xc054dead
 #define CLOSURE_MAGIC_ALIVE	0xc054a11e
 
@@ -172,6 +172,11 @@ void __closure_wake_up(struct closure_waitlist *list);
 bool closure_wait(struct closure_waitlist *list, struct closure *cl);
 void __closure_sync(struct closure *cl);
 
+static inline unsigned closure_nr_remaining(struct closure *cl)
+{
+	return atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK;
+}
+
 /**
  * closure_sync - sleep until a closure a closure has nothing left to wait on
  *
@@ -180,19 +185,17 @@ void __closure_sync(struct closure *cl);
  */
 static inline void closure_sync(struct closure *cl)
 {
-	if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1)
+	if (closure_nr_remaining(cl) != 1)
 		__closure_sync(cl);
 }
 
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+#ifdef CONFIG_DEBUG_CLOSURES
 
-void closure_debug_init(void);
 void closure_debug_create(struct closure *cl);
 void closure_debug_destroy(struct closure *cl);
 
 #else
 
-static inline void closure_debug_init(void) {}
 static inline void closure_debug_create(struct closure *cl) {}
 static inline void closure_debug_destroy(struct closure *cl) {}
 
@@ -200,21 +203,21 @@ static inline void closure_debug_destroy(struct closure *cl) {}
 
 static inline void closure_set_ip(struct closure *cl)
 {
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+#ifdef CONFIG_DEBUG_CLOSURES
 	cl->ip = _THIS_IP_;
 #endif
 }
 
 static inline void closure_set_ret_ip(struct closure *cl)
 {
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+#ifdef CONFIG_DEBUG_CLOSURES
 	cl->ip = _RET_IP_;
 #endif
 }
 
 static inline void closure_set_waiting(struct closure *cl, unsigned long f)
 {
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+#ifdef CONFIG_DEBUG_CLOSURES
 	cl->waiting_on = f;
 #endif
 }
@@ -243,6 +246,7 @@ static inline void closure_queue(struct closure *cl)
 	 */
 	BUILD_BUG_ON(offsetof(struct closure, fn)
 		     != offsetof(struct work_struct, func));
+
 	if (wq) {
 		INIT_WORK(&cl->work, cl->work.func);
 		BUG_ON(!queue_work(wq, &cl->work));
@@ -255,7 +259,7 @@ static inline void closure_queue(struct closure *cl)
  */
 static inline void closure_get(struct closure *cl)
 {
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+#ifdef CONFIG_DEBUG_CLOSURES
 	BUG_ON((atomic_inc_return(&cl->remaining) &
 		CLOSURE_REMAINING_MASK) <= 1);
 #else
@@ -271,7 +275,7 @@ static inline void closure_get(struct closure *cl)
  */
 static inline void closure_init(struct closure *cl, struct closure *parent)
 {
-	memset(cl, 0, sizeof(struct closure));
+	cl->fn = NULL;
 	cl->parent = parent;
 	if (parent)
 		closure_get(parent);
@@ -375,4 +379,26 @@ static inline void closure_call(struct closure *cl, closure_fn fn,
 	continue_at_nobarrier(cl, fn, wq);
 }
 
+#define __closure_wait_event(waitlist, _cond)				\
+do {									\
+	struct closure cl;						\
+									\
+	closure_init_stack(&cl);					\
+									\
+	while (1) {							\
+		closure_wait(waitlist, &cl);				\
+		if (_cond)						\
+			break;						\
+		closure_sync(&cl);					\
+	}								\
+	closure_wake_up(waitlist);					\
+	closure_sync(&cl);						\
+} while (0)
+
+#define closure_wait_event(waitlist, _cond)				\
+do {									\
+	if (!(_cond))							\
+		__closure_wait_event(waitlist, _cond);			\
+} while (0)
+
 #endif /* _LINUX_CLOSURE_H */
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index eb768a866fe3..fc8094419084 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -155,6 +155,8 @@ static inline int remove_cpu(unsigned int cpu) { return -EPERM; }
 static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { }
 #endif	/* !CONFIG_HOTPLUG_CPU */
 
+DEFINE_LOCK_GUARD_0(cpus_read_lock, cpus_read_lock(), cpus_read_unlock())
+
 #ifdef CONFIG_PM_SLEEP_SMP
 extern int freeze_secondary_cpus(int primary);
 extern void thaw_secondary_cpus(void);
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 068f7738be22..d246d325918a 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -204,7 +204,6 @@ enum cpuhp_state {
 	CPUHP_AP_KVM_ONLINE,
 	CPUHP_AP_SCHED_WAIT_EMPTY,
 	CPUHP_AP_SMPBOOT_THREADS,
-	CPUHP_AP_X86_VDSO_VMA_ONLINE,
 	CPUHP_AP_IRQ_AFFINITY_ONLINE,
 	CPUHP_AP_BLK_MQ_ONLINE,
 	CPUHP_AP_ARM_MVEBU_SYNC_CLOCKS,
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 0c06561bf5ff..08704c29fdb4 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -92,7 +92,7 @@ int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
 struct crash_mem {
 	unsigned int max_nr_ranges;
 	unsigned int nr_ranges;
-	struct range ranges[];
+	struct range ranges[] __counted_by(max_nr_ranges);
 };
 
 extern int crash_exclude_mem_range(struct crash_mem *mem,
diff --git a/include/linux/cred.h b/include/linux/cred.h
index f923528d5cc4..af8d353a4b86 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/key.h>
 #include <linux/atomic.h>
+#include <linux/refcount.h>
 #include <linux/uidgid.h>
 #include <linux/sched.h>
 #include <linux/sched/user.h>
@@ -23,7 +24,7 @@ struct inode;
  * COW Supplementary groups list
  */
 struct group_info {
-	atomic_t	usage;
+	refcount_t	usage;
 	int		ngroups;
 	kgid_t		gid[];
 } __randomize_layout;
@@ -39,7 +40,7 @@ struct group_info {
  */
 static inline struct group_info *get_group_info(struct group_info *gi)
 {
-	atomic_inc(&gi->usage);
+	refcount_inc(&gi->usage);
 	return gi;
 }
 
@@ -49,7 +50,7 @@ static inline struct group_info *get_group_info(struct group_info *gi)
  */
 #define put_group_info(group_info)			\
 do {							\
-	if (atomic_dec_and_test(&(group_info)->usage))	\
+	if (refcount_dec_and_test(&(group_info)->usage))	\
 		groups_free(group_info);		\
 } while (0)
 
@@ -219,6 +220,20 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred)
 }
 
 /**
+ * get_new_cred_many - Get references on a new set of credentials
+ * @cred: The new credentials to reference
+ * @nr: Number of references to acquire
+ *
+ * Get references on the specified set of new credentials.  The caller must
+ * release all acquired references.
+ */
+static inline struct cred *get_new_cred_many(struct cred *cred, int nr)
+{
+	atomic_add(nr, &cred->usage);
+	return cred;
+}
+
+/**
  * get_new_cred - Get a reference on a new set of credentials
  * @cred: The new credentials to reference
  *
@@ -227,16 +242,16 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred)
  */
 static inline struct cred *get_new_cred(struct cred *cred)
 {
-	atomic_inc(&cred->usage);
-	return cred;
+	return get_new_cred_many(cred, 1);
 }
 
 /**
- * get_cred - Get a reference on a set of credentials
+ * get_cred_many - Get references on a set of credentials
  * @cred: The credentials to reference
+ * @nr: Number of references to acquire
  *
- * Get a reference on the specified set of credentials.  The caller must
- * release the reference.  If %NULL is passed, it is returned with no action.
+ * Get references on the specified set of credentials.  The caller must release
+ * all acquired references.  If %NULL is passed, it is returned with no action.
  *
  * This is used to deal with a committed set of credentials.  Although the
  * pointer is const, this will temporarily discard the const and increment the
@@ -244,14 +259,28 @@ static inline struct cred *get_new_cred(struct cred *cred)
  * accidental alteration of a set of credentials that should be considered
  * immutable.
  */
-static inline const struct cred *get_cred(const struct cred *cred)
+static inline const struct cred *get_cred_many(const struct cred *cred, int nr)
 {
 	struct cred *nonconst_cred = (struct cred *) cred;
 	if (!cred)
 		return cred;
 	validate_creds(cred);
 	nonconst_cred->non_rcu = 0;
-	return get_new_cred(nonconst_cred);
+	return get_new_cred_many(nonconst_cred, nr);
+}
+
+/*
+ * get_cred - Get a reference on a set of credentials
+ * @cred: The credentials to reference
+ *
+ * Get a reference on the specified set of credentials.  The caller must
+ * release the reference.  If %NULL is passed, it is returned with no action.
+ *
+ * This is used to deal with a committed set of credentials.
+ */
+static inline const struct cred *get_cred(const struct cred *cred)
+{
+	return get_cred_many(cred, 1);
 }
 
 static inline const struct cred *get_cred_rcu(const struct cred *cred)
@@ -269,6 +298,7 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred)
 /**
  * put_cred - Release a reference to a set of credentials
  * @cred: The credentials to release
+ * @nr: Number of references to release
  *
  * Release a reference to a set of credentials, deleting them when the last ref
  * is released.  If %NULL is passed, nothing is done.
@@ -277,17 +307,29 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred)
  * on task_struct are attached by const pointers to prevent accidental
  * alteration of otherwise immutable credential sets.
  */
-static inline void put_cred(const struct cred *_cred)
+static inline void put_cred_many(const struct cred *_cred, int nr)
 {
 	struct cred *cred = (struct cred *) _cred;
 
 	if (cred) {
 		validate_creds(cred);
-		if (atomic_dec_and_test(&(cred)->usage))
+		if (atomic_sub_and_test(nr, &cred->usage))
 			__put_cred(cred);
 	}
 }
 
+/*
+ * put_cred - Release a reference to a set of credentials
+ * @cred: The credentials to release
+ *
+ * Release a reference to a set of credentials, deleting them when the last ref
+ * is released.  If %NULL is passed, nothing is done.
+ */
+static inline void put_cred(const struct cred *cred)
+{
+	put_cred_many(cred, 1);
+}
+
 /**
  * current_cred - Access the current task's subjective credentials
  *
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 6b351e009f59..3da2f0545d5d 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -251,6 +251,7 @@ extern struct dentry * d_make_root(struct inode *);
 /* <clickety>-<click> the ramfs-type tree */
 extern void d_genocide(struct dentry *);
 
+extern void d_mark_tmpfile(struct file *, struct inode *);
 extern void d_tmpfile(struct file *, struct inode *);
 
 extern struct dentry *d_find_alias(struct inode *);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 69d0435c7ebb..772ab4d74d94 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -165,6 +165,7 @@ void dm_error(const char *message);
 
 struct dm_dev {
 	struct block_device *bdev;
+	struct bdev_handle *bdev_handle;
 	struct dax_device *dax_dev;
 	blk_mode_t mode;
 	char name[16];
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 11fbd0ee1370..0388e8c20f52 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -105,6 +105,12 @@ enum fid_type {
 	FILEID_LUSTRE = 0x97,
 
 	/*
+	 * 64 bit inode number, 32 bit subvolume, 32 bit generation number:
+	 */
+	FILEID_BCACHEFS_WITHOUT_PARENT = 0xb1,
+	FILEID_BCACHEFS_WITH_PARENT = 0xb2,
+
+	/*
 	 * 64 bit unique kernfs id
 	 */
 	FILEID_KERNFS = 0xfe,
@@ -224,9 +230,23 @@ struct export_operations {
 						  atomic attribute updates
 						*/
 #define EXPORT_OP_FLUSH_ON_CLOSE	(0x20) /* fs flushes file data on close */
+#define EXPORT_OP_ASYNC_LOCK		(0x40) /* fs can do async lock request */
 	unsigned long	flags;
 };
 
+/**
+ * exportfs_lock_op_is_async() - export op supports async lock operation
+ * @export_ops:	the nfs export operations to check
+ *
+ * Returns true if the nfs export_operations structure has
+ * EXPORT_OP_ASYNC_LOCK set in its flags.
+ */
+static inline bool
+exportfs_lock_op_is_async(const struct export_operations *export_ops)
+{
+	return export_ops->flags & EXPORT_OP_ASYNC_LOCK;
+}
+
 extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
 				    int *max_len, struct inode *parent,
 				    int flags);
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index e066816f3519..bc4c3287a65e 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -98,20 +98,9 @@ static inline struct file *files_lookup_fd_locked(struct files_struct *files, un
 	return files_lookup_fd_raw(files, fd);
 }
 
-static inline struct file *files_lookup_fd_rcu(struct files_struct *files, unsigned int fd)
-{
-	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
-			   "suspicious rcu_dereference_check() usage");
-	return files_lookup_fd_raw(files, fd);
-}
-
-static inline struct file *lookup_fd_rcu(unsigned int fd)
-{
-	return files_lookup_fd_rcu(current->files, fd);
-}
-
-struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd);
-struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *fd);
+struct file *lookup_fdget_rcu(unsigned int fd);
+struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd);
+struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *fd);
 
 struct task_struct;
 
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index e8b12ec8b060..d1ea3898564c 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -100,6 +100,18 @@
 #define SD_ITAPDLY	0xFF180314
 #define SD_OTAPDLYSEL	0xFF180318
 
+/**
+ * XPM_EVENT_ERROR_MASK_DDRMC_CR: Error event mask for DDRMC MC Correctable ECC Error.
+ */
+#define XPM_EVENT_ERROR_MASK_DDRMC_CR		BIT(18)
+
+/**
+ * XPM_EVENT_ERROR_MASK_DDRMC_NCR: Error event mask for DDRMC MC Non-Correctable ECC Error.
+ */
+#define XPM_EVENT_ERROR_MASK_DDRMC_NCR		BIT(19)
+#define XPM_EVENT_ERROR_MASK_NOC_NCR		BIT(13)
+#define XPM_EVENT_ERROR_MASK_NOC_CR		BIT(12)
+
 enum pm_api_cb_id {
 	PM_INIT_SUSPEND_CB = 30,
 	PM_ACKNOWLEDGE_CB = 31,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4a40823c3c67..c27c324ba58a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -67,7 +67,7 @@ struct swap_info_struct;
 struct seq_file;
 struct workqueue_struct;
 struct iov_iter;
-struct fscrypt_info;
+struct fscrypt_inode_info;
 struct fscrypt_operations;
 struct fsverity_info;
 struct fsverity_operations;
@@ -671,8 +671,8 @@ struct inode {
 	};
 	dev_t			i_rdev;
 	loff_t			i_size;
-	struct timespec64	i_atime;
-	struct timespec64	i_mtime;
+	struct timespec64	__i_atime;
+	struct timespec64	__i_mtime;
 	struct timespec64	__i_ctime; /* use inode_*_ctime accessors! */
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
 	unsigned short          i_bytes;
@@ -738,7 +738,7 @@ struct inode {
 #endif
 
 #ifdef CONFIG_FS_ENCRYPTION
-	struct fscrypt_info	*i_crypt_info;
+	struct fscrypt_inode_info	*i_crypt_info;
 #endif
 
 #ifdef CONFIG_FS_VERITY
@@ -1042,7 +1042,10 @@ static inline struct file *get_file(struct file *f)
 	atomic_long_inc(&f->f_count);
 	return f;
 }
-#define get_file_rcu(x) atomic_long_inc_not_zero(&(x)->f_count)
+
+struct file *get_file_rcu(struct file __rcu **f);
+struct file *get_file_active(struct file **f);
+
 #define file_count(x)	atomic_long_read(&(x)->f_count)
 
 #define	MAX_NON_LFS	((1UL<<31) - 1)
@@ -1119,7 +1122,7 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_NOATIME      BIT(10)	/* Do not update access times. */
 #define SB_NODIRATIME   BIT(11)	/* Do not update directory access times */
 #define SB_SILENT       BIT(15)
-#define SB_POSIXACL     BIT(16)	/* VFS does not apply the umask */
+#define SB_POSIXACL     BIT(16)	/* Supports POSIX ACLs */
 #define SB_INLINECRYPT  BIT(17)	/* Use blk-crypto for encrypted files */
 #define SB_KERNMOUNT    BIT(22)	/* this is a kern_mount call */
 #define SB_I_VERSION    BIT(23)	/* Update inode I_version field */
@@ -1166,6 +1169,7 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_I_PERSB_BDI	0x00000200	/* has a per-sb bdi */
 #define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */
 #define SB_I_RETIRED	0x00000800	/* superblock shouldn't be reused */
+#define SB_I_NOUMASK	0x00001000	/* VFS does not apply umask */
 
 /* Possible states of 'frozen' field */
 enum {
@@ -1206,7 +1210,7 @@ struct super_block {
 #ifdef CONFIG_SECURITY
 	void                    *s_security;
 #endif
-	const struct xattr_handler **s_xattr;
+	const struct xattr_handler * const *s_xattr;
 #ifdef CONFIG_FS_ENCRYPTION
 	const struct fscrypt_operations	*s_cop;
 	struct fscrypt_keyring	*s_master_keys; /* master crypto keys in use */
@@ -1221,6 +1225,7 @@ struct super_block {
 	struct hlist_bl_head	s_roots;	/* alternate root dentries for NFS */
 	struct list_head	s_mounts;	/* list of mounts; _not_ for fs use */
 	struct block_device	*s_bdev;
+	struct bdev_handle	*s_bdev_handle;
 	struct backing_dev_info *s_bdi;
 	struct mtd_info		*s_mtd;
 	struct hlist_node	s_instances;
@@ -1511,24 +1516,81 @@ static inline bool fsuidgid_has_mapping(struct super_block *sb,
 struct timespec64 current_time(struct inode *inode);
 struct timespec64 inode_set_ctime_current(struct inode *inode);
 
-/**
- * inode_get_ctime - fetch the current ctime from the inode
- * @inode: inode from which to fetch ctime
- *
- * Grab the current ctime from the inode and return it.
- */
+static inline time64_t inode_get_atime_sec(const struct inode *inode)
+{
+	return inode->__i_atime.tv_sec;
+}
+
+static inline long inode_get_atime_nsec(const struct inode *inode)
+{
+	return inode->__i_atime.tv_nsec;
+}
+
+static inline struct timespec64 inode_get_atime(const struct inode *inode)
+{
+	return inode->__i_atime;
+}
+
+static inline struct timespec64 inode_set_atime_to_ts(struct inode *inode,
+						      struct timespec64 ts)
+{
+	inode->__i_atime = ts;
+	return ts;
+}
+
+static inline struct timespec64 inode_set_atime(struct inode *inode,
+						time64_t sec, long nsec)
+{
+	struct timespec64 ts = { .tv_sec  = sec,
+				 .tv_nsec = nsec };
+	return inode_set_atime_to_ts(inode, ts);
+}
+
+static inline time64_t inode_get_mtime_sec(const struct inode *inode)
+{
+	return inode->__i_mtime.tv_sec;
+}
+
+static inline long inode_get_mtime_nsec(const struct inode *inode)
+{
+	return inode->__i_mtime.tv_nsec;
+}
+
+static inline struct timespec64 inode_get_mtime(const struct inode *inode)
+{
+	return inode->__i_mtime;
+}
+
+static inline struct timespec64 inode_set_mtime_to_ts(struct inode *inode,
+						      struct timespec64 ts)
+{
+	inode->__i_mtime = ts;
+	return ts;
+}
+
+static inline struct timespec64 inode_set_mtime(struct inode *inode,
+						time64_t sec, long nsec)
+{
+	struct timespec64 ts = { .tv_sec  = sec,
+				 .tv_nsec = nsec };
+	return inode_set_mtime_to_ts(inode, ts);
+}
+
+static inline time64_t inode_get_ctime_sec(const struct inode *inode)
+{
+	return inode->__i_ctime.tv_sec;
+}
+
+static inline long inode_get_ctime_nsec(const struct inode *inode)
+{
+	return inode->__i_ctime.tv_nsec;
+}
+
 static inline struct timespec64 inode_get_ctime(const struct inode *inode)
 {
 	return inode->__i_ctime;
 }
 
-/**
- * inode_set_ctime_to_ts - set the ctime in the inode
- * @inode: inode in which to set the ctime
- * @ts: value to set in the ctime field
- *
- * Set the ctime in @inode to @ts
- */
 static inline struct timespec64 inode_set_ctime_to_ts(struct inode *inode,
 						      struct timespec64 ts)
 {
@@ -1553,6 +1615,8 @@ static inline struct timespec64 inode_set_ctime(struct inode *inode,
 	return inode_set_ctime_to_ts(inode, ts);
 }
 
+struct timespec64 simple_inode_init_ts(struct inode *inode);
+
 /*
  * Snapshotting support.
  */
@@ -2081,7 +2145,12 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags
 #define IS_NOQUOTA(inode)	((inode)->i_flags & S_NOQUOTA)
 #define IS_APPEND(inode)	((inode)->i_flags & S_APPEND)
 #define IS_IMMUTABLE(inode)	((inode)->i_flags & S_IMMUTABLE)
+
+#ifdef CONFIG_FS_POSIX_ACL
 #define IS_POSIXACL(inode)	__IS_FLG(inode, SB_POSIXACL)
+#else
+#define IS_POSIXACL(inode)	0
+#endif
 
 #define IS_DEADDIR(inode)	((inode)->i_flags & S_DEAD)
 #define IS_NOCMTIME(inode)	((inode)->i_flags & S_NOCMTIME)
@@ -2409,7 +2478,7 @@ struct filename {
 };
 static_assert(offsetof(struct filename, iname) % sizeof(long) == 0);
 
-static inline struct mnt_idmap *file_mnt_idmap(struct file *file)
+static inline struct mnt_idmap *file_mnt_idmap(const struct file *file)
 {
 	return mnt_idmap(file->f_path.mnt);
 }
@@ -2448,24 +2517,24 @@ struct file *dentry_open(const struct path *path, int flags,
 			 const struct cred *creds);
 struct file *dentry_create(const struct path *path, int flags, umode_t mode,
 			   const struct cred *cred);
-struct file *backing_file_open(const struct path *path, int flags,
+struct file *backing_file_open(const struct path *user_path, int flags,
 			       const struct path *real_path,
 			       const struct cred *cred);
-struct path *backing_file_real_path(struct file *f);
+struct path *backing_file_user_path(struct file *f);
 
 /*
- * file_real_path - get the path corresponding to f_inode
+ * file_user_path - get the path to display for memory mapped file
  *
- * When opening a backing file for a stackable filesystem (e.g.,
- * overlayfs) f_path may be on the stackable filesystem and f_inode on
- * the underlying filesystem.  When the path associated with f_inode is
- * needed, this helper should be used instead of accessing f_path
- * directly.
-*/
-static inline const struct path *file_real_path(struct file *f)
+ * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file
+ * stored in ->vm_file is a backing file whose f_inode is on the underlying
+ * filesystem.  When the mapped file path is displayed to the user (e.g. via
+ * /proc/<pid>/maps), this helper should be used to get the path to display
+ * to the user, which is the path of the fd that the user has requested to map.
+ */
+static inline const struct path *file_user_path(struct file *f)
 {
 	if (unlikely(f->f_mode & FMODE_BACKING))
-		return backing_file_real_path(f);
+		return backing_file_user_path(f);
 	return &f->f_path;
 }
 
diff --git a/include/linux/fs_stack.h b/include/linux/fs_stack.h
index 010d39d0dc1c..2b1f74b24070 100644
--- a/include/linux/fs_stack.h
+++ b/include/linux/fs_stack.h
@@ -16,14 +16,14 @@ extern void fsstack_copy_inode_size(struct inode *dst, struct inode *src);
 static inline void fsstack_copy_attr_atime(struct inode *dest,
 					   const struct inode *src)
 {
-	dest->i_atime = src->i_atime;
+	inode_set_atime_to_ts(dest, inode_get_atime(src));
 }
 
 static inline void fsstack_copy_attr_times(struct inode *dest,
 					   const struct inode *src)
 {
-	dest->i_atime = src->i_atime;
-	dest->i_mtime = src->i_mtime;
+	inode_set_atime_to_ts(dest, inode_get_atime(src));
+	inode_set_mtime_to_ts(dest, inode_get_mtime(src));
 	inode_set_ctime_to_ts(dest, inode_get_ctime(src));
 }
 
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index c895b12737a1..12f9e455d569 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -31,7 +31,7 @@
 #define FSCRYPT_CONTENTS_ALIGNMENT 16
 
 union fscrypt_policy;
-struct fscrypt_info;
+struct fscrypt_inode_info;
 struct fs_parameter;
 struct seq_file;
 
@@ -59,26 +59,55 @@ struct fscrypt_name {
 
 #ifdef CONFIG_FS_ENCRYPTION
 
-/*
- * If set, the fscrypt bounce page pool won't be allocated (unless another
- * filesystem needs it).  Set this if the filesystem always uses its own bounce
- * pages for writes and therefore won't need the fscrypt bounce page pool.
- */
-#define FS_CFLG_OWN_PAGES (1U << 1)
-
 /* Crypto operations for filesystems */
 struct fscrypt_operations {
 
-	/* Set of optional flags; see above for allowed flags */
-	unsigned int flags;
+	/*
+	 * If set, then fs/crypto/ will allocate a global bounce page pool the
+	 * first time an encryption key is set up for a file.  The bounce page
+	 * pool is required by the following functions:
+	 *
+	 * - fscrypt_encrypt_pagecache_blocks()
+	 * - fscrypt_zeroout_range() for files not using inline crypto
+	 *
+	 * If the filesystem doesn't use those, it doesn't need to set this.
+	 */
+	unsigned int needs_bounce_pages : 1;
 
 	/*
-	 * If set, this is a filesystem-specific key description prefix that
-	 * will be accepted for "logon" keys for v1 fscrypt policies, in
-	 * addition to the generic prefix "fscrypt:".  This functionality is
-	 * deprecated, so new filesystems shouldn't set this field.
+	 * If set, then fs/crypto/ will allow the use of encryption settings
+	 * that assume inode numbers fit in 32 bits (i.e.
+	 * FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64}), provided that the other
+	 * prerequisites for these settings are also met.  This is only useful
+	 * if the filesystem wants to support inline encryption hardware that is
+	 * limited to 32-bit or 64-bit data unit numbers and where programming
+	 * keyslots is very slow.
 	 */
-	const char *key_prefix;
+	unsigned int has_32bit_inodes : 1;
+
+	/*
+	 * If set, then fs/crypto/ will allow users to select a crypto data unit
+	 * size that is less than the filesystem block size.  This is done via
+	 * the log2_data_unit_size field of the fscrypt policy.  This flag is
+	 * not compatible with filesystems that encrypt variable-length blocks
+	 * (i.e. blocks that aren't all equal to filesystem's block size), for
+	 * example as a result of compression.  It's also not compatible with
+	 * the fscrypt_encrypt_block_inplace() and
+	 * fscrypt_decrypt_block_inplace() functions.
+	 */
+	unsigned int supports_subblock_data_units : 1;
+
+	/*
+	 * This field exists only for backwards compatibility reasons and should
+	 * only be set by the filesystems that are setting it already.  It
+	 * contains the filesystem-specific key description prefix that is
+	 * accepted for "logon" keys for v1 fscrypt policies.  This
+	 * functionality is deprecated in favor of the generic prefix
+	 * "fscrypt:", which itself is deprecated in favor of the filesystem
+	 * keyring ioctls such as FS_IOC_ADD_ENCRYPTION_KEY.  Filesystems that
+	 * are newly adding fscrypt support should not set this field.
+	 */
+	const char *legacy_key_prefix;
 
 	/*
 	 * Get the fscrypt context of the given inode.
@@ -146,21 +175,6 @@ struct fscrypt_operations {
 	bool (*has_stable_inodes)(struct super_block *sb);
 
 	/*
-	 * Get the number of bits that the filesystem uses to represent inode
-	 * numbers and file logical block numbers.
-	 *
-	 * By default, both of these are assumed to be 64-bit.  This function
-	 * can be implemented to declare that either or both of these numbers is
-	 * shorter, which may allow the use of the
-	 * FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64} flags and/or the use of
-	 * inline crypto hardware whose maximum DUN length is less than 64 bits
-	 * (e.g., eMMC v5.2 spec compliant hardware).  This function only needs
-	 * to be implemented if support for one of these features is needed.
-	 */
-	void (*get_ino_and_lblk_bits)(struct super_block *sb,
-				      int *ino_bits_ret, int *lblk_bits_ret);
-
-	/*
 	 * Return an array of pointers to the block devices to which the
 	 * filesystem may write encrypted file contents, NULL if the filesystem
 	 * only has a single such block device, or an ERR_PTR() on error.
@@ -178,7 +192,8 @@ struct fscrypt_operations {
 					     unsigned int *num_devs);
 };
 
-static inline struct fscrypt_info *fscrypt_get_info(const struct inode *inode)
+static inline struct fscrypt_inode_info *
+fscrypt_get_inode_info(const struct inode *inode)
 {
 	/*
 	 * Pairs with the cmpxchg_release() in fscrypt_setup_encryption_info().
@@ -390,7 +405,8 @@ static inline void fscrypt_set_ops(struct super_block *sb,
 }
 #else  /* !CONFIG_FS_ENCRYPTION */
 
-static inline struct fscrypt_info *fscrypt_get_info(const struct inode *inode)
+static inline struct fscrypt_inode_info *
+fscrypt_get_inode_info(const struct inode *inode)
 {
 	return NULL;
 }
@@ -868,7 +884,7 @@ static inline bool fscrypt_inode_uses_fs_layer_crypto(const struct inode *inode)
  */
 static inline bool fscrypt_has_encryption_key(const struct inode *inode)
 {
-	return fscrypt_get_info(inode) != NULL;
+	return fscrypt_get_inode_info(inode) != NULL;
 }
 
 /**
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index ed48e4f1e755..bcb6609b54b3 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -96,8 +96,7 @@ static inline int fsnotify_file(struct file *file, __u32 mask)
 	if (file->f_mode & FMODE_NONOTIFY)
 		return 0;
 
-	/* Overlayfs internal files have fake f_path */
-	path = file_real_path(file);
+	path = &file->f_path;
 	return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH);
 }
 
diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
index 107613f7d792..847413164738 100644
--- a/include/linux/generic-radix-tree.h
+++ b/include/linux/generic-radix-tree.h
@@ -38,6 +38,7 @@
 
 #include <asm/page.h>
 #include <linux/bug.h>
+#include <linux/limits.h>
 #include <linux/log2.h>
 #include <linux/math.h>
 #include <linux/types.h>
@@ -116,6 +117,11 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
 
 #define __genradix_cast(_radix)		(typeof((_radix)->type[0]) *)
 #define __genradix_obj_size(_radix)	sizeof((_radix)->type[0])
+#define __genradix_objs_per_page(_radix)			\
+	(PAGE_SIZE / sizeof((_radix)->type[0]))
+#define __genradix_page_remainder(_radix)			\
+	(PAGE_SIZE % sizeof((_radix)->type[0]))
+
 #define __genradix_idx_to_offset(_radix, _idx)			\
 	__idx_to_offset(_idx, __genradix_obj_size(_radix))
 
@@ -179,11 +185,35 @@ void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t);
 #define genradix_iter_peek(_iter, _radix)			\
 	(__genradix_cast(_radix)				\
 	 __genradix_iter_peek(_iter, &(_radix)->tree,		\
-			      PAGE_SIZE / __genradix_obj_size(_radix)))
+			__genradix_objs_per_page(_radix)))
+
+void *__genradix_iter_peek_prev(struct genradix_iter *, struct __genradix *,
+				size_t, size_t);
+
+/**
+ * genradix_iter_peek_prev - get first entry at or below iterator's current
+ *			     position
+ * @_iter:	a genradix_iter
+ * @_radix:	genradix being iterated over
+ *
+ * If no more entries exist at or below @_iter's current position, returns NULL
+ */
+#define genradix_iter_peek_prev(_iter, _radix)			\
+	(__genradix_cast(_radix)				\
+	 __genradix_iter_peek_prev(_iter, &(_radix)->tree,	\
+			__genradix_objs_per_page(_radix),	\
+			__genradix_obj_size(_radix) +		\
+			__genradix_page_remainder(_radix)))
 
 static inline void __genradix_iter_advance(struct genradix_iter *iter,
 					   size_t obj_size)
 {
+	if (iter->offset + obj_size < iter->offset) {
+		iter->offset	= SIZE_MAX;
+		iter->pos	= SIZE_MAX;
+		return;
+	}
+
 	iter->offset += obj_size;
 
 	if (!is_power_of_2(obj_size) &&
@@ -196,6 +226,25 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter,
 #define genradix_iter_advance(_iter, _radix)			\
 	__genradix_iter_advance(_iter, __genradix_obj_size(_radix))
 
+static inline void __genradix_iter_rewind(struct genradix_iter *iter,
+					  size_t obj_size)
+{
+	if (iter->offset == 0 ||
+	    iter->offset == SIZE_MAX) {
+		iter->offset = SIZE_MAX;
+		return;
+	}
+
+	if ((iter->offset & (PAGE_SIZE - 1)) == 0)
+		iter->offset -= PAGE_SIZE % obj_size;
+
+	iter->offset -= obj_size;
+	iter->pos--;
+}
+
+#define genradix_iter_rewind(_iter, _radix)			\
+	__genradix_iter_rewind(_iter, __genradix_obj_size(_radix))
+
 #define genradix_for_each_from(_radix, _iter, _p, _start)	\
 	for (_iter = genradix_iter_init(_radix, _start);	\
 	     (_p = genradix_iter_peek(&_iter, _radix)) != NULL;	\
@@ -213,6 +262,23 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter,
 #define genradix_for_each(_radix, _iter, _p)			\
 	genradix_for_each_from(_radix, _iter, _p, 0)
 
+#define genradix_last_pos(_radix)				\
+	(SIZE_MAX / PAGE_SIZE * __genradix_objs_per_page(_radix) - 1)
+
+/**
+ * genradix_for_each_reverse - iterate over entry in a genradix, reverse order
+ * @_radix:	genradix to iterate over
+ * @_iter:	a genradix_iter to track current position
+ * @_p:		pointer to genradix entry type
+ *
+ * On every iteration, @_p will point to the current entry, and @_iter.pos
+ * will be the current entry's index.
+ */
+#define genradix_for_each_reverse(_radix, _iter, _p)		\
+	for (_iter = genradix_iter_init(_radix,	genradix_last_pos(_radix));\
+	     (_p = genradix_iter_peek_prev(&_iter, _radix)) != NULL;\
+	     genradix_iter_rewind(&_iter, _radix))
+
 int __genradix_prealloc(struct __genradix *, size_t, gfp_t);
 
 /**
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 4a1dc88ddbff..76121c2bb4f8 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -566,7 +566,7 @@ enum
  *
  * _ RCU:
  * 	1) rcutree_migrate_callbacks() migrates the queue.
- * 	2) rcu_report_dead() reports the final quiescent states.
+ * 	2) rcutree_report_cpu_dead() reports the final quiescent states.
  *
  * _ IRQ_POLL: irq_poll_cpu_dead() migrates the queue
  *
diff --git a/include/linux/iov_iter.h b/include/linux/iov_iter.h
new file mode 100644
index 000000000000..270454a6703d
--- /dev/null
+++ b/include/linux/iov_iter.h
@@ -0,0 +1,274 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* I/O iterator iteration building functions.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#ifndef _LINUX_IOV_ITER_H
+#define _LINUX_IOV_ITER_H
+
+#include <linux/uio.h>
+#include <linux/bvec.h>
+
+typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len,
+			     void *priv, void *priv2);
+typedef size_t (*iov_ustep_f)(void __user *iter_base, size_t progress, size_t len,
+			      void *priv, void *priv2);
+
+/*
+ * Handle ITER_UBUF.
+ */
+static __always_inline
+size_t iterate_ubuf(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+		    iov_ustep_f step)
+{
+	void __user *base = iter->ubuf;
+	size_t progress = 0, remain;
+
+	remain = step(base + iter->iov_offset, 0, len, priv, priv2);
+	progress = len - remain;
+	iter->iov_offset += progress;
+	iter->count -= progress;
+	return progress;
+}
+
+/*
+ * Handle ITER_IOVEC.
+ */
+static __always_inline
+size_t iterate_iovec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+		     iov_ustep_f step)
+{
+	const struct iovec *p = iter->__iov;
+	size_t progress = 0, skip = iter->iov_offset;
+
+	do {
+		size_t remain, consumed;
+		size_t part = min(len, p->iov_len - skip);
+
+		if (likely(part)) {
+			remain = step(p->iov_base + skip, progress, part, priv, priv2);
+			consumed = part - remain;
+			progress += consumed;
+			skip += consumed;
+			len -= consumed;
+			if (skip < p->iov_len)
+				break;
+		}
+		p++;
+		skip = 0;
+	} while (len);
+
+	iter->nr_segs -= p - iter->__iov;
+	iter->__iov = p;
+	iter->iov_offset = skip;
+	iter->count -= progress;
+	return progress;
+}
+
+/*
+ * Handle ITER_KVEC.
+ */
+static __always_inline
+size_t iterate_kvec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+		    iov_step_f step)
+{
+	const struct kvec *p = iter->kvec;
+	size_t progress = 0, skip = iter->iov_offset;
+
+	do {
+		size_t remain, consumed;
+		size_t part = min(len, p->iov_len - skip);
+
+		if (likely(part)) {
+			remain = step(p->iov_base + skip, progress, part, priv, priv2);
+			consumed = part - remain;
+			progress += consumed;
+			skip += consumed;
+			len -= consumed;
+			if (skip < p->iov_len)
+				break;
+		}
+		p++;
+		skip = 0;
+	} while (len);
+
+	iter->nr_segs -= p - iter->kvec;
+	iter->kvec = p;
+	iter->iov_offset = skip;
+	iter->count -= progress;
+	return progress;
+}
+
+/*
+ * Handle ITER_BVEC.
+ */
+static __always_inline
+size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+		    iov_step_f step)
+{
+	const struct bio_vec *p = iter->bvec;
+	size_t progress = 0, skip = iter->iov_offset;
+
+	do {
+		size_t remain, consumed;
+		size_t offset = p->bv_offset + skip, part;
+		void *kaddr = kmap_local_page(p->bv_page + offset / PAGE_SIZE);
+
+		part = min3(len,
+			   (size_t)(p->bv_len - skip),
+			   (size_t)(PAGE_SIZE - offset % PAGE_SIZE));
+		remain = step(kaddr + offset % PAGE_SIZE, progress, part, priv, priv2);
+		kunmap_local(kaddr);
+		consumed = part - remain;
+		len -= consumed;
+		progress += consumed;
+		skip += consumed;
+		if (skip >= p->bv_len) {
+			skip = 0;
+			p++;
+		}
+		if (remain)
+			break;
+	} while (len);
+
+	iter->nr_segs -= p - iter->bvec;
+	iter->bvec = p;
+	iter->iov_offset = skip;
+	iter->count -= progress;
+	return progress;
+}
+
+/*
+ * Handle ITER_XARRAY.
+ */
+static __always_inline
+size_t iterate_xarray(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+		      iov_step_f step)
+{
+	struct folio *folio;
+	size_t progress = 0;
+	loff_t start = iter->xarray_start + iter->iov_offset;
+	pgoff_t index = start / PAGE_SIZE;
+	XA_STATE(xas, iter->xarray, index);
+
+	rcu_read_lock();
+	xas_for_each(&xas, folio, ULONG_MAX) {
+		size_t remain, consumed, offset, part, flen;
+
+		if (xas_retry(&xas, folio))
+			continue;
+		if (WARN_ON(xa_is_value(folio)))
+			break;
+		if (WARN_ON(folio_test_hugetlb(folio)))
+			break;
+
+		offset = offset_in_folio(folio, start + progress);
+		flen = min(folio_size(folio) - offset, len);
+
+		while (flen) {
+			void *base = kmap_local_folio(folio, offset);
+
+			part = min_t(size_t, flen,
+				     PAGE_SIZE - offset_in_page(offset));
+			remain = step(base, progress, part, priv, priv2);
+			kunmap_local(base);
+
+			consumed = part - remain;
+			progress += consumed;
+			len -= consumed;
+
+			if (remain || len == 0)
+				goto out;
+			flen -= consumed;
+			offset += consumed;
+		}
+	}
+
+out:
+	rcu_read_unlock();
+	iter->iov_offset += progress;
+	iter->count -= progress;
+	return progress;
+}
+
+/*
+ * Handle ITER_DISCARD.
+ */
+static __always_inline
+size_t iterate_discard(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+		      iov_step_f step)
+{
+	size_t progress = len;
+
+	iter->count -= progress;
+	return progress;
+}
+
+/**
+ * iterate_and_advance2 - Iterate over an iterator
+ * @iter: The iterator to iterate over.
+ * @len: The amount to iterate over.
+ * @priv: Data for the step functions.
+ * @priv2: More data for the step functions.
+ * @ustep: Function for UBUF/IOVEC iterators; given __user addresses.
+ * @step: Function for other iterators; given kernel addresses.
+ *
+ * Iterate over the next part of an iterator, up to the specified length.  The
+ * buffer is presented in segments, which for kernel iteration are broken up by
+ * physical pages and mapped, with the mapped address being presented.
+ *
+ * Two step functions, @step and @ustep, must be provided, one for handling
+ * mapped kernel addresses and the other is given user addresses which have the
+ * potential to fault since no pinning is performed.
+ *
+ * The step functions are passed the address and length of the segment, @priv,
+ * @priv2 and the amount of data so far iterated over (which can, for example,
+ * be added to @priv to point to the right part of a second buffer).  The step
+ * functions should return the amount of the segment they didn't process (ie. 0
+ * indicates complete processing).
+ *
+ * This function returns the amount of data processed (ie. 0 means nothing was
+ * processed and the value of @len means processed to completion).
+ */
+static __always_inline
+size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv,
+			    void *priv2, iov_ustep_f ustep, iov_step_f step)
+{
+	if (unlikely(iter->count < len))
+		len = iter->count;
+	if (unlikely(!len))
+		return 0;
+
+	if (likely(iter_is_ubuf(iter)))
+		return iterate_ubuf(iter, len, priv, priv2, ustep);
+	if (likely(iter_is_iovec(iter)))
+		return iterate_iovec(iter, len, priv, priv2, ustep);
+	if (iov_iter_is_bvec(iter))
+		return iterate_bvec(iter, len, priv, priv2, step);
+	if (iov_iter_is_kvec(iter))
+		return iterate_kvec(iter, len, priv, priv2, step);
+	if (iov_iter_is_xarray(iter))
+		return iterate_xarray(iter, len, priv, priv2, step);
+	return iterate_discard(iter, len, priv, priv2, step);
+}
+
+/**
+ * iterate_and_advance - Iterate over an iterator
+ * @iter: The iterator to iterate over.
+ * @len: The amount to iterate over.
+ * @priv: Data for the step functions.
+ * @ustep: Function for UBUF/IOVEC iterators; given __user addresses.
+ * @step: Function for other iterators; given kernel addresses.
+ *
+ * As iterate_and_advance2(), but priv2 is always NULL.
+ */
+static __always_inline
+size_t iterate_and_advance(struct iov_iter *iter, size_t len, void *priv,
+			   iov_ustep_f ustep, iov_step_f step)
+{
+	return iterate_and_advance2(iter, len, priv, NULL, ustep, step);
+}
+
+#endif /* _LINUX_IOV_ITER_H */
diff --git a/include/linux/irq.h b/include/linux/irq.h
index d8a6fdce9373..90081afa10ce 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -215,8 +215,6 @@ struct irq_data {
  * IRQD_SINGLE_TARGET		- IRQ allows only a single affinity target
  * IRQD_DEFAULT_TRIGGER_SET	- Expected trigger already been set
  * IRQD_CAN_RESERVE		- Can use reservation mode
- * IRQD_MSI_NOMASK_QUIRK	- Non-maskable MSI quirk for affinity change
- *				  required
  * IRQD_HANDLE_ENFORCE_IRQCTX	- Enforce that handle_irq_*() is only invoked
  *				  from actual interrupt context.
  * IRQD_AFFINITY_ON_ACTIVATE	- Affinity is set on activation. Don't call
@@ -247,11 +245,10 @@ enum {
 	IRQD_SINGLE_TARGET		= BIT(24),
 	IRQD_DEFAULT_TRIGGER_SET	= BIT(25),
 	IRQD_CAN_RESERVE		= BIT(26),
-	IRQD_MSI_NOMASK_QUIRK		= BIT(27),
-	IRQD_HANDLE_ENFORCE_IRQCTX	= BIT(28),
-	IRQD_AFFINITY_ON_ACTIVATE	= BIT(29),
-	IRQD_IRQ_ENABLED_ON_SUSPEND	= BIT(30),
-	IRQD_RESEND_WHEN_IN_PROGRESS    = BIT(31),
+	IRQD_HANDLE_ENFORCE_IRQCTX	= BIT(27),
+	IRQD_AFFINITY_ON_ACTIVATE	= BIT(28),
+	IRQD_IRQ_ENABLED_ON_SUSPEND	= BIT(29),
+	IRQD_RESEND_WHEN_IN_PROGRESS    = BIT(30),
 };
 
 #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
@@ -426,21 +423,6 @@ static inline bool irqd_can_reserve(struct irq_data *d)
 	return __irqd_to_state(d) & IRQD_CAN_RESERVE;
 }
 
-static inline void irqd_set_msi_nomask_quirk(struct irq_data *d)
-{
-	__irqd_to_state(d) |= IRQD_MSI_NOMASK_QUIRK;
-}
-
-static inline void irqd_clr_msi_nomask_quirk(struct irq_data *d)
-{
-	__irqd_to_state(d) &= ~IRQD_MSI_NOMASK_QUIRK;
-}
-
-static inline bool irqd_msi_nomask_quirk(struct irq_data *d)
-{
-	return __irqd_to_state(d) & IRQD_MSI_NOMASK_QUIRK;
-}
-
 static inline void irqd_set_affinity_on_activate(struct irq_data *d)
 {
 	__irqd_to_state(d) |= IRQD_AFFINITY_ON_ACTIVATE;
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 51c254b7fec2..ee0a82c60508 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -174,7 +174,7 @@ struct irq_domain {
 	irq_hw_number_t			hwirq_max;
 	unsigned int			revmap_size;
 	struct radix_tree_root		revmap_tree;
-	struct irq_data __rcu		*revmap[];
+	struct irq_data __rcu		*revmap[] __counted_by(revmap_size);
 };
 
 /* Irq domain flags */
diff --git a/include/linux/iversion.h b/include/linux/iversion.h
index f174ff1b59ee..8f972eaca2ed 100644
--- a/include/linux/iversion.h
+++ b/include/linux/iversion.h
@@ -256,7 +256,7 @@ inode_peek_iversion(const struct inode *inode)
  * For filesystems without any sort of change attribute, the best we can
  * do is fake one up from the ctime:
  */
-static inline u64 time_to_chattr(struct timespec64 *t)
+static inline u64 time_to_chattr(const struct timespec64 *t)
 {
 	u64 chattr = t->tv_sec;
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index fb6c6109fdca..4944136efaa2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -664,7 +664,7 @@ struct kvm_irq_routing_table {
 	 * Array indexed by gsi. Each entry contains list of irq chips
 	 * the gsi is connected to.
 	 */
-	struct hlist_head map[];
+	struct hlist_head map[] __counted_by(nr_rt_entries);
 };
 #endif
 
diff --git a/include/linux/list.h b/include/linux/list.h
index 164b4d0e9d2a..1837caedf723 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -687,6 +687,14 @@ static inline void list_splice_tail_init(struct list_head *list,
 	for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next)
 
 /**
+ * list_for_each_reverse - iterate backwards over a list
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @head:	the head for your list.
+ */
+#define list_for_each_reverse(pos, head) \
+	for (pos = (head)->prev; pos != (head); pos = pos->prev)
+
+/**
  * list_for_each_rcu - Iterate over a list in an RCU-safe fashion
  * @pos:	the &struct list_head to use as a loop cursor.
  * @head:	the head for your list.
diff --git a/include/linux/llist.h b/include/linux/llist.h
index 85bda2d02d65..2c982ff7475a 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -74,6 +74,33 @@ static inline void init_llist_head(struct llist_head *list)
 }
 
 /**
+ * init_llist_node - initialize lock-less list node
+ * @node:	the node to be initialised
+ *
+ * In cases where there is a need to test if a node is on
+ * a list or not, this initialises the node to clearly
+ * not be on any list.
+ */
+static inline void init_llist_node(struct llist_node *node)
+{
+	node->next = node;
+}
+
+/**
+ * llist_on_list - test if a lock-less list node is on a list
+ * @node:	the node to test
+ *
+ * When a node is on a list the ->next pointer will be NULL or
+ * some other node.  It can never point to itself.  We use that
+ * in init_llist_node() to record that a node is not on any list,
+ * and here to test whether it is on any list.
+ */
+static inline bool llist_on_list(const struct llist_node *node)
+{
+	return node->next != node;
+}
+
+/**
  * llist_entry - get the struct of this entry
  * @ptr:	the &struct llist_node pointer.
  * @type:	the type of the struct this is embedded in.
@@ -249,6 +276,25 @@ static inline struct llist_node *__llist_del_all(struct llist_head *head)
 
 extern struct llist_node *llist_del_first(struct llist_head *head);
 
+/**
+ * llist_del_first_init - delete first entry from lock-less list and mark it as being off-list
+ * @head:	the head of lock-less list to delete from.
+ *
+ * This behaves the same as llist_del_first() except that init_llist_node() is called
+ * on the returned node so that llist_on_list() will report false for the node.
+ */
+static inline struct llist_node *llist_del_first_init(struct llist_head *head)
+{
+	struct llist_node *n = llist_del_first(head);
+
+	if (n)
+		init_llist_node(n);
+	return n;
+}
+
+extern bool llist_del_first_this(struct llist_head *head,
+				 struct llist_node *this);
+
 struct llist_node *llist_reverse_order(struct llist_node *head);
 
 #endif /* LLIST_H */
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 0f016d69c996..9f565416d186 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -282,7 +282,7 @@ __be32		  nlmsvc_testlock(struct svc_rqst *, struct nlm_file *,
 			struct nlm_host *, struct nlm_lock *,
 			struct nlm_lock *, struct nlm_cookie *);
 __be32		  nlmsvc_cancel_blocked(struct net *net, struct nlm_file *, struct nlm_lock *);
-void		  nlmsvc_retry_blocked(void);
+void		  nlmsvc_retry_blocked(struct svc_rqst *rqstp);
 void		  nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *,
 					nlm_host_match_fn_t match);
 void		  nlmsvc_grant_reply(struct nlm_cookie *, __be32);
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index ac962c4cb44b..99b8176c3738 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -43,17 +43,17 @@ LSM_HOOK(int, 0, capset, struct cred *new, const struct cred *old,
 	 const kernel_cap_t *permitted)
 LSM_HOOK(int, 0, capable, const struct cred *cred, struct user_namespace *ns,
 	 int cap, unsigned int opts)
-LSM_HOOK(int, 0, quotactl, int cmds, int type, int id, struct super_block *sb)
+LSM_HOOK(int, 0, quotactl, int cmds, int type, int id, const struct super_block *sb)
 LSM_HOOK(int, 0, quota_on, struct dentry *dentry)
 LSM_HOOK(int, 0, syslog, int type)
 LSM_HOOK(int, 0, settime, const struct timespec64 *ts,
 	 const struct timezone *tz)
 LSM_HOOK(int, 0, vm_enough_memory, struct mm_struct *mm, long pages)
 LSM_HOOK(int, 0, bprm_creds_for_exec, struct linux_binprm *bprm)
-LSM_HOOK(int, 0, bprm_creds_from_file, struct linux_binprm *bprm, struct file *file)
+LSM_HOOK(int, 0, bprm_creds_from_file, struct linux_binprm *bprm, const struct file *file)
 LSM_HOOK(int, 0, bprm_check_security, struct linux_binprm *bprm)
-LSM_HOOK(void, LSM_RET_VOID, bprm_committing_creds, struct linux_binprm *bprm)
-LSM_HOOK(void, LSM_RET_VOID, bprm_committed_creds, struct linux_binprm *bprm)
+LSM_HOOK(void, LSM_RET_VOID, bprm_committing_creds, const struct linux_binprm *bprm)
+LSM_HOOK(void, LSM_RET_VOID, bprm_committed_creds, const struct linux_binprm *bprm)
 LSM_HOOK(int, 0, fs_context_submount, struct fs_context *fc, struct super_block *reference)
 LSM_HOOK(int, 0, fs_context_dup, struct fs_context *fc,
 	 struct fs_context *src_sc)
@@ -66,7 +66,7 @@ LSM_HOOK(void, LSM_RET_VOID, sb_free_mnt_opts, void *mnt_opts)
 LSM_HOOK(int, 0, sb_eat_lsm_opts, char *orig, void **mnt_opts)
 LSM_HOOK(int, 0, sb_mnt_opts_compat, struct super_block *sb, void *mnt_opts)
 LSM_HOOK(int, 0, sb_remount, struct super_block *sb, void *mnt_opts)
-LSM_HOOK(int, 0, sb_kern_mount, struct super_block *sb)
+LSM_HOOK(int, 0, sb_kern_mount, const struct super_block *sb)
 LSM_HOOK(int, 0, sb_show_options, struct seq_file *m, struct super_block *sb)
 LSM_HOOK(int, 0, sb_statfs, struct dentry *dentry)
 LSM_HOOK(int, 0, sb_mount, const char *dev_name, const struct path *path,
diff --git a/include/linux/lwq.h b/include/linux/lwq.h
new file mode 100644
index 000000000000..d081d5cf8e33
--- /dev/null
+++ b/include/linux/lwq.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef LWQ_H
+#define LWQ_H
+/*
+ * Light-weight single-linked queue built from llist
+ *
+ * Entries can be enqueued from any context with no locking.
+ * Entries can be dequeued from process context with integrated locking.
+ *
+ * This is particularly suitable when work items are queued in
+ * BH or IRQ context, and where work items are handled one at a time
+ * by dedicated threads.
+ */
+#include <linux/container_of.h>
+#include <linux/spinlock.h>
+#include <linux/llist.h>
+
+struct lwq_node {
+	struct llist_node node;
+};
+
+struct lwq {
+	spinlock_t		lock;
+	struct llist_node	*ready;		/* entries to be dequeued */
+	struct llist_head	new;		/* entries being enqueued */
+};
+
+/**
+ * lwq_init - initialise a lwq
+ * @q:	the lwq object
+ */
+static inline void lwq_init(struct lwq *q)
+{
+	spin_lock_init(&q->lock);
+	q->ready = NULL;
+	init_llist_head(&q->new);
+}
+
+/**
+ * lwq_empty - test if lwq contains any entry
+ * @q:	the lwq object
+ *
+ * This empty test contains an acquire barrier so that if a wakeup
+ * is sent when lwq_enqueue() returns true, it is safe to go to sleep after
+ * a test on lwq_empty().
+ */
+static inline bool lwq_empty(struct lwq *q)
+{
+	/* acquire ensures ordering wrt lwq_enqueue() */
+	return smp_load_acquire(&q->ready) == NULL && llist_empty(&q->new);
+}
+
+struct llist_node *__lwq_dequeue(struct lwq *q);
+/**
+ * lwq_dequeue - dequeue first (oldest) entry from lwq
+ * @q:		the queue to dequeue from
+ * @type:	the type of object to return
+ * @member:	the member in returned object which is an lwq_node.
+ *
+ * Remove a single object from the lwq and return it.  This will take
+ * a spinlock and so must always be called in the same context, typically
+ * process context.
+ */
+#define lwq_dequeue(q, type, member)					\
+	({ struct llist_node *_n = __lwq_dequeue(q);			\
+	  _n ? container_of(_n, type, member.node) : NULL; })
+
+struct llist_node *lwq_dequeue_all(struct lwq *q);
+
+/**
+ * lwq_for_each_safe - iterate over detached queue allowing deletion
+ * @_n:		iterator variable
+ * @_t1:	temporary struct llist_node **
+ * @_t2:	temporary struct llist_node *
+ * @_l:		address of llist_node pointer from lwq_dequeue_all()
+ * @_member:	member in _n where lwq_node is found.
+ *
+ * Iterate over members in a dequeued list.  If the iterator variable
+ * is set to NULL, the iterator removes that entry from the queue.
+ */
+#define lwq_for_each_safe(_n, _t1, _t2, _l, _member)			\
+	for (_t1 = (_l);						\
+	     *(_t1) ? (_n = container_of(*(_t1), typeof(*(_n)), _member.node),\
+		       _t2 = ((*_t1)->next),				\
+		       true)						\
+	     : false;							\
+	     (_n) ? (_t1 = &(_n)->_member.node.next, 0)			\
+	     : ((*(_t1) = (_t2)),  0))
+
+/**
+ * lwq_enqueue - add a new item to the end of the queue
+ * @n:	the lwq_node embedded in the item to be added
+ * @q:	the lwq to append to.
+ *
+ * No locking is needed to append to the queue so this can
+ * be called from any context.
+ * Return %true if the list may have previously been empty.
+ */
+static inline bool lwq_enqueue(struct lwq_node *n, struct lwq *q)
+{
+	/* acquire ensures ordering wrt lwq_dequeue */
+	return llist_add(&n->node, &q->new) &&
+		smp_load_acquire(&q->ready) == NULL;
+}
+
+/**
+ * lwq_enqueue_batch - add a list of new items to the end of the queue
+ * @n:	the lwq_node embedded in the first item to be added
+ * @q:	the lwq to append to.
+ *
+ * No locking is needed to append to the queue so this can
+ * be called from any context.
+ * Return %true if the list may have previously been empty.
+ */
+static inline bool lwq_enqueue_batch(struct llist_node *n, struct lwq *q)
+{
+	struct llist_node *e = n;
+
+	/* acquire ensures ordering wrt lwq_dequeue */
+	return llist_add_batch(llist_reverse_order(n), e, &q->new) &&
+		smp_load_acquire(&q->ready) == NULL;
+}
+#endif /* LWQ_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf5d0b1b16f4..116c28c51468 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1726,8 +1726,8 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 	unsigned int pid_bit;
 
 	pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
-	if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) {
-		__set_bit(pid_bit, &vma->numab_state->access_pids[1]);
+	if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) {
+		__set_bit(pid_bit, &vma->numab_state->pids_active[1]);
 	}
 }
 #else /* !CONFIG_NUMA_BALANCING */
@@ -3308,8 +3308,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len)
 static inline void mm_populate(unsigned long addr, unsigned long len) {}
 #endif
 
-/* These take the mm semaphore themselves */
-extern int __must_check vm_brk(unsigned long, unsigned long);
+/* This takes the mm semaphore itself */
 extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long);
 extern int vm_munmap(unsigned long, size_t);
 extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 74b49c4c7a52..4be8e310b189 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -540,9 +540,36 @@ struct vma_lock {
 };
 
 struct vma_numab_state {
+	/*
+	 * Initialised as time in 'jiffies' after which VMA
+	 * should be scanned.  Delays first scan of new VMA by at
+	 * least sysctl_numa_balancing_scan_delay:
+	 */
 	unsigned long next_scan;
-	unsigned long next_pid_reset;
-	unsigned long access_pids[2];
+
+	/*
+	 * Time in jiffies when pids_active[] is reset to
+	 * detect phase change behaviour:
+	 */
+	unsigned long pids_active_reset;
+
+	/*
+	 * Approximate tracking of PIDs that trapped a NUMA hinting
+	 * fault. May produce false positives due to hash collisions.
+	 *
+	 *   [0] Previous PID tracking
+	 *   [1] Current PID tracking
+	 *
+	 * Window moves after pids_active_reset has expired approximately
+	 * every VMA_PID_RESET_PERIOD jiffies:
+	 */
+	unsigned long pids_active[2];
+
+	/*
+	 * MM scan sequence ID when the VMA was last completely scanned.
+	 * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq
+	 */
+	int prev_scan_seq;
 };
 
 /*
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 4f40b40306d0..ac3dd2876197 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -92,8 +92,8 @@ extern bool __mnt_is_readonly(struct vfsmount *mnt);
 extern bool mnt_may_suid(struct vfsmount *mnt);
 
 extern struct vfsmount *clone_private_mount(const struct path *path);
-extern int __mnt_want_write(struct vfsmount *);
-extern void __mnt_drop_write(struct vfsmount *);
+int mnt_get_write_access(struct vfsmount *mnt);
+void mnt_put_write_access(struct vfsmount *mnt);
 
 extern struct vfsmount *fc_mount(struct fs_context *fc);
 extern struct vfsmount *vfs_create_mount(struct fs_context *fc);
diff --git a/include/linux/msi.h b/include/linux/msi.h
index a50ea79522f8..ddace8c34dcf 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -547,12 +547,6 @@ enum {
 	MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS	= (1 << 5),
 	/* Free MSI descriptors */
 	MSI_FLAG_FREE_MSI_DESCS		= (1 << 6),
-	/*
-	 * Quirk to handle MSI implementations which do not provide
-	 * masking. Currently known to affect x86, but has to be partially
-	 * handled in the core MSI code.
-	 */
-	MSI_FLAG_NOMASK_QUIRK		= (1 << 7),
 
 	/* Mask for the generic functionality */
 	MSI_GENERIC_FLAGS_MASK		= GENMASK(15, 0),
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 1463cbda4888..3100371b5e32 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -92,6 +92,30 @@ extern struct dentry *lock_rename(struct dentry *, struct dentry *);
 extern struct dentry *lock_rename_child(struct dentry *, struct dentry *);
 extern void unlock_rename(struct dentry *, struct dentry *);
 
+/**
+ * mode_strip_umask - handle vfs umask stripping
+ * @dir:	parent directory of the new inode
+ * @mode:	mode of the new inode to be created in @dir
+ *
+ * In most filesystems, umask stripping depends on whether or not the
+ * filesystem supports POSIX ACLs. If the filesystem doesn't support it umask
+ * stripping is done directly in here. If the filesystem does support POSIX
+ * ACLs umask stripping is deferred until the filesystem calls
+ * posix_acl_create().
+ *
+ * Some filesystems (like NFSv4) also want to avoid umask stripping by the
+ * VFS, but don't support POSIX ACLs. Those filesystems can set SB_I_NOUMASK
+ * to get this effect without declaring that they support POSIX ACLs.
+ *
+ * Returns: mode
+ */
+static inline umode_t __must_check mode_strip_umask(const struct inode *dir, umode_t mode)
+{
+	if (!IS_POSIXACL(dir) && !(dir->i_sb->s_iflags & SB_I_NOUMASK))
+		mode &= ~current_umask();
+	return mode;
+}
+
 extern int __must_check nd_jump_link(const struct path *path);
 
 static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
@@ -112,7 +136,7 @@ static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
 static inline bool
 retry_estale(const long error, const unsigned int flags)
 {
-	return error == -ESTALE && !(flags & LOOKUP_REVAL);
+	return unlikely(error == -ESTALE && !(flags & LOOKUP_REVAL));
 }
 
 #endif /* _LINUX_NAMEI_H */
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 730003c4f4af..c11c4db34639 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -150,7 +150,7 @@ enum nfs_opnum4 {
 	OP_WRITE_SAME = 70,
 	OP_CLONE = 71,
 
-	/* xattr support (RFC8726) */
+	/* xattr support (RFC8276) */
 	OP_GETXATTR                = 72,
 	OP_SETXATTR                = 73,
 	OP_LISTXATTRS              = 74,
@@ -389,79 +389,203 @@ enum lock_type4 {
 	NFS4_WRITEW_LT = 4
 };
 
+/*
+ * Symbol names and values are from RFC 7531 Section 2.
+ * "XDR Description of NFSv4.0"
+ */
+enum {
+	FATTR4_SUPPORTED_ATTRS		= 0,
+	FATTR4_TYPE			= 1,
+	FATTR4_FH_EXPIRE_TYPE		= 2,
+	FATTR4_CHANGE			= 3,
+	FATTR4_SIZE			= 4,
+	FATTR4_LINK_SUPPORT		= 5,
+	FATTR4_SYMLINK_SUPPORT		= 6,
+	FATTR4_NAMED_ATTR		= 7,
+	FATTR4_FSID			= 8,
+	FATTR4_UNIQUE_HANDLES		= 9,
+	FATTR4_LEASE_TIME		= 10,
+	FATTR4_RDATTR_ERROR		= 11,
+	FATTR4_ACL			= 12,
+	FATTR4_ACLSUPPORT		= 13,
+	FATTR4_ARCHIVE			= 14,
+	FATTR4_CANSETTIME		= 15,
+	FATTR4_CASE_INSENSITIVE		= 16,
+	FATTR4_CASE_PRESERVING		= 17,
+	FATTR4_CHOWN_RESTRICTED		= 18,
+	FATTR4_FILEHANDLE		= 19,
+	FATTR4_FILEID			= 20,
+	FATTR4_FILES_AVAIL		= 21,
+	FATTR4_FILES_FREE		= 22,
+	FATTR4_FILES_TOTAL		= 23,
+	FATTR4_FS_LOCATIONS		= 24,
+	FATTR4_HIDDEN			= 25,
+	FATTR4_HOMOGENEOUS		= 26,
+	FATTR4_MAXFILESIZE		= 27,
+	FATTR4_MAXLINK			= 28,
+	FATTR4_MAXNAME			= 29,
+	FATTR4_MAXREAD			= 30,
+	FATTR4_MAXWRITE			= 31,
+	FATTR4_MIMETYPE			= 32,
+	FATTR4_MODE			= 33,
+	FATTR4_NO_TRUNC			= 34,
+	FATTR4_NUMLINKS			= 35,
+	FATTR4_OWNER			= 36,
+	FATTR4_OWNER_GROUP		= 37,
+	FATTR4_QUOTA_AVAIL_HARD		= 38,
+	FATTR4_QUOTA_AVAIL_SOFT		= 39,
+	FATTR4_QUOTA_USED		= 40,
+	FATTR4_RAWDEV			= 41,
+	FATTR4_SPACE_AVAIL		= 42,
+	FATTR4_SPACE_FREE		= 43,
+	FATTR4_SPACE_TOTAL		= 44,
+	FATTR4_SPACE_USED		= 45,
+	FATTR4_SYSTEM			= 46,
+	FATTR4_TIME_ACCESS		= 47,
+	FATTR4_TIME_ACCESS_SET		= 48,
+	FATTR4_TIME_BACKUP		= 49,
+	FATTR4_TIME_CREATE		= 50,
+	FATTR4_TIME_DELTA		= 51,
+	FATTR4_TIME_METADATA		= 52,
+	FATTR4_TIME_MODIFY		= 53,
+	FATTR4_TIME_MODIFY_SET		= 54,
+	FATTR4_MOUNTED_ON_FILEID	= 55,
+};
+
+/*
+ * Symbol names and values are from RFC 5662 Section 2.
+ * "XDR Description of NFSv4.1"
+ */
+enum {
+	FATTR4_DIR_NOTIF_DELAY		= 56,
+	FATTR4_DIRENT_NOTIF_DELAY	= 57,
+	FATTR4_DACL			= 58,
+	FATTR4_SACL			= 59,
+	FATTR4_CHANGE_POLICY		= 60,
+	FATTR4_FS_STATUS		= 61,
+	FATTR4_FS_LAYOUT_TYPES		= 62,
+	FATTR4_LAYOUT_HINT		= 63,
+	FATTR4_LAYOUT_TYPES		= 64,
+	FATTR4_LAYOUT_BLKSIZE		= 65,
+	FATTR4_LAYOUT_ALIGNMENT		= 66,
+	FATTR4_FS_LOCATIONS_INFO	= 67,
+	FATTR4_MDSTHRESHOLD		= 68,
+	FATTR4_RETENTION_GET		= 69,
+	FATTR4_RETENTION_SET		= 70,
+	FATTR4_RETENTEVT_GET		= 71,
+	FATTR4_RETENTEVT_SET		= 72,
+	FATTR4_RETENTION_HOLD		= 73,
+	FATTR4_MODE_SET_MASKED		= 74,
+	FATTR4_SUPPATTR_EXCLCREAT	= 75,
+	FATTR4_FS_CHARSET_CAP		= 76,
+};
+
+/*
+ * Symbol names and values are from RFC 7863 Section 2.
+ * "XDR Description of NFSv4.2"
+ */
+enum {
+	FATTR4_CLONE_BLKSIZE		= 77,
+	FATTR4_SPACE_FREED		= 78,
+	FATTR4_CHANGE_ATTR_TYPE		= 79,
+	FATTR4_SEC_LABEL		= 80,
+};
+
+/*
+ * Symbol names and values are from RFC 8275 Section 5.
+ * "The mode_umask Attribute"
+ */
+enum {
+	FATTR4_MODE_UMASK		= 81,
+};
+
+/*
+ * Symbol names and values are from RFC 8276 Section 8.6.
+ * "Numeric Values Assigned to Protocol Extensions"
+ */
+enum {
+	FATTR4_XATTR_SUPPORT		= 82,
+};
+
+/*
+ * The following internal definitions enable processing the above
+ * attribute bits within 32-bit word boundaries.
+ */
 
 /* Mandatory Attributes */
-#define FATTR4_WORD0_SUPPORTED_ATTRS    (1UL << 0)
-#define FATTR4_WORD0_TYPE               (1UL << 1)
-#define FATTR4_WORD0_FH_EXPIRE_TYPE     (1UL << 2)
-#define FATTR4_WORD0_CHANGE             (1UL << 3)
-#define FATTR4_WORD0_SIZE               (1UL << 4)
-#define FATTR4_WORD0_LINK_SUPPORT       (1UL << 5)
-#define FATTR4_WORD0_SYMLINK_SUPPORT    (1UL << 6)
-#define FATTR4_WORD0_NAMED_ATTR         (1UL << 7)
-#define FATTR4_WORD0_FSID               (1UL << 8)
-#define FATTR4_WORD0_UNIQUE_HANDLES     (1UL << 9)
-#define FATTR4_WORD0_LEASE_TIME         (1UL << 10)
-#define FATTR4_WORD0_RDATTR_ERROR       (1UL << 11)
+#define FATTR4_WORD0_SUPPORTED_ATTRS    BIT(FATTR4_SUPPORTED_ATTRS)
+#define FATTR4_WORD0_TYPE               BIT(FATTR4_TYPE)
+#define FATTR4_WORD0_FH_EXPIRE_TYPE     BIT(FATTR4_FH_EXPIRE_TYPE)
+#define FATTR4_WORD0_CHANGE             BIT(FATTR4_CHANGE)
+#define FATTR4_WORD0_SIZE               BIT(FATTR4_SIZE)
+#define FATTR4_WORD0_LINK_SUPPORT       BIT(FATTR4_LINK_SUPPORT)
+#define FATTR4_WORD0_SYMLINK_SUPPORT    BIT(FATTR4_SYMLINK_SUPPORT)
+#define FATTR4_WORD0_NAMED_ATTR         BIT(FATTR4_NAMED_ATTR)
+#define FATTR4_WORD0_FSID               BIT(FATTR4_FSID)
+#define FATTR4_WORD0_UNIQUE_HANDLES     BIT(FATTR4_UNIQUE_HANDLES)
+#define FATTR4_WORD0_LEASE_TIME         BIT(FATTR4_LEASE_TIME)
+#define FATTR4_WORD0_RDATTR_ERROR       BIT(FATTR4_RDATTR_ERROR)
 /* Mandatory in NFSv4.1 */
-#define FATTR4_WORD2_SUPPATTR_EXCLCREAT (1UL << 11)
+#define FATTR4_WORD2_SUPPATTR_EXCLCREAT BIT(FATTR4_SUPPATTR_EXCLCREAT - 64)
 
 /* Recommended Attributes */
-#define FATTR4_WORD0_ACL                (1UL << 12)
-#define FATTR4_WORD0_ACLSUPPORT         (1UL << 13)
-#define FATTR4_WORD0_ARCHIVE            (1UL << 14)
-#define FATTR4_WORD0_CANSETTIME         (1UL << 15)
-#define FATTR4_WORD0_CASE_INSENSITIVE   (1UL << 16)
-#define FATTR4_WORD0_CASE_PRESERVING    (1UL << 17)
-#define FATTR4_WORD0_CHOWN_RESTRICTED   (1UL << 18)
-#define FATTR4_WORD0_FILEHANDLE         (1UL << 19)
-#define FATTR4_WORD0_FILEID             (1UL << 20)
-#define FATTR4_WORD0_FILES_AVAIL        (1UL << 21)
-#define FATTR4_WORD0_FILES_FREE         (1UL << 22)
-#define FATTR4_WORD0_FILES_TOTAL        (1UL << 23)
-#define FATTR4_WORD0_FS_LOCATIONS       (1UL << 24)
-#define FATTR4_WORD0_HIDDEN             (1UL << 25)
-#define FATTR4_WORD0_HOMOGENEOUS        (1UL << 26)
-#define FATTR4_WORD0_MAXFILESIZE        (1UL << 27)
-#define FATTR4_WORD0_MAXLINK            (1UL << 28)
-#define FATTR4_WORD0_MAXNAME            (1UL << 29)
-#define FATTR4_WORD0_MAXREAD            (1UL << 30)
-#define FATTR4_WORD0_MAXWRITE           (1UL << 31)
-#define FATTR4_WORD1_MIMETYPE           (1UL << 0)
-#define FATTR4_WORD1_MODE               (1UL << 1)
-#define FATTR4_WORD1_NO_TRUNC           (1UL << 2)
-#define FATTR4_WORD1_NUMLINKS           (1UL << 3)
-#define FATTR4_WORD1_OWNER              (1UL << 4)
-#define FATTR4_WORD1_OWNER_GROUP        (1UL << 5)
-#define FATTR4_WORD1_QUOTA_HARD         (1UL << 6)
-#define FATTR4_WORD1_QUOTA_SOFT         (1UL << 7)
-#define FATTR4_WORD1_QUOTA_USED         (1UL << 8)
-#define FATTR4_WORD1_RAWDEV             (1UL << 9)
-#define FATTR4_WORD1_SPACE_AVAIL        (1UL << 10)
-#define FATTR4_WORD1_SPACE_FREE         (1UL << 11)
-#define FATTR4_WORD1_SPACE_TOTAL        (1UL << 12)
-#define FATTR4_WORD1_SPACE_USED         (1UL << 13)
-#define FATTR4_WORD1_SYSTEM             (1UL << 14)
-#define FATTR4_WORD1_TIME_ACCESS        (1UL << 15)
-#define FATTR4_WORD1_TIME_ACCESS_SET    (1UL << 16)
-#define FATTR4_WORD1_TIME_BACKUP        (1UL << 17)
-#define FATTR4_WORD1_TIME_CREATE        (1UL << 18)
-#define FATTR4_WORD1_TIME_DELTA         (1UL << 19)
-#define FATTR4_WORD1_TIME_METADATA      (1UL << 20)
-#define FATTR4_WORD1_TIME_MODIFY        (1UL << 21)
-#define FATTR4_WORD1_TIME_MODIFY_SET    (1UL << 22)
-#define FATTR4_WORD1_MOUNTED_ON_FILEID  (1UL << 23)
-#define FATTR4_WORD1_DACL               (1UL << 26)
-#define FATTR4_WORD1_SACL               (1UL << 27)
-#define FATTR4_WORD1_FS_LAYOUT_TYPES    (1UL << 30)
-#define FATTR4_WORD2_LAYOUT_TYPES       (1UL << 0)
-#define FATTR4_WORD2_LAYOUT_BLKSIZE     (1UL << 1)
-#define FATTR4_WORD2_MDSTHRESHOLD       (1UL << 4)
-#define FATTR4_WORD2_CLONE_BLKSIZE	(1UL << 13)
-#define FATTR4_WORD2_CHANGE_ATTR_TYPE	(1UL << 15)
-#define FATTR4_WORD2_SECURITY_LABEL     (1UL << 16)
-#define FATTR4_WORD2_MODE_UMASK		(1UL << 17)
-#define FATTR4_WORD2_XATTR_SUPPORT	(1UL << 18)
+#define FATTR4_WORD0_ACL                BIT(FATTR4_ACL)
+#define FATTR4_WORD0_ACLSUPPORT         BIT(FATTR4_ACLSUPPORT)
+#define FATTR4_WORD0_ARCHIVE            BIT(FATTR4_ARCHIVE)
+#define FATTR4_WORD0_CANSETTIME         BIT(FATTR4_CANSETTIME)
+#define FATTR4_WORD0_CASE_INSENSITIVE   BIT(FATTR4_CASE_INSENSITIVE)
+#define FATTR4_WORD0_CASE_PRESERVING    BIT(FATTR4_CASE_PRESERVING)
+#define FATTR4_WORD0_CHOWN_RESTRICTED   BIT(FATTR4_CHOWN_RESTRICTED)
+#define FATTR4_WORD0_FILEHANDLE         BIT(FATTR4_FILEHANDLE)
+#define FATTR4_WORD0_FILEID             BIT(FATTR4_FILEID)
+#define FATTR4_WORD0_FILES_AVAIL        BIT(FATTR4_FILES_AVAIL)
+#define FATTR4_WORD0_FILES_FREE         BIT(FATTR4_FILES_FREE)
+#define FATTR4_WORD0_FILES_TOTAL        BIT(FATTR4_FILES_TOTAL)
+#define FATTR4_WORD0_FS_LOCATIONS       BIT(FATTR4_FS_LOCATIONS)
+#define FATTR4_WORD0_HIDDEN             BIT(FATTR4_HIDDEN)
+#define FATTR4_WORD0_HOMOGENEOUS        BIT(FATTR4_HOMOGENEOUS)
+#define FATTR4_WORD0_MAXFILESIZE        BIT(FATTR4_MAXFILESIZE)
+#define FATTR4_WORD0_MAXLINK            BIT(FATTR4_MAXLINK)
+#define FATTR4_WORD0_MAXNAME            BIT(FATTR4_MAXNAME)
+#define FATTR4_WORD0_MAXREAD            BIT(FATTR4_MAXREAD)
+#define FATTR4_WORD0_MAXWRITE           BIT(FATTR4_MAXWRITE)
+
+#define FATTR4_WORD1_MIMETYPE           BIT(FATTR4_MIMETYPE - 32)
+#define FATTR4_WORD1_MODE               BIT(FATTR4_MODE	- 32)
+#define FATTR4_WORD1_NO_TRUNC           BIT(FATTR4_NO_TRUNC - 32)
+#define FATTR4_WORD1_NUMLINKS           BIT(FATTR4_NUMLINKS - 32)
+#define FATTR4_WORD1_OWNER              BIT(FATTR4_OWNER - 32)
+#define FATTR4_WORD1_OWNER_GROUP        BIT(FATTR4_OWNER_GROUP - 32)
+#define FATTR4_WORD1_QUOTA_HARD         BIT(FATTR4_QUOTA_AVAIL_HARD - 32)
+#define FATTR4_WORD1_QUOTA_SOFT         BIT(FATTR4_QUOTA_AVAIL_SOFT - 32)
+#define FATTR4_WORD1_QUOTA_USED         BIT(FATTR4_QUOTA_USED - 32)
+#define FATTR4_WORD1_RAWDEV             BIT(FATTR4_RAWDEV - 32)
+#define FATTR4_WORD1_SPACE_AVAIL        BIT(FATTR4_SPACE_AVAIL - 32)
+#define FATTR4_WORD1_SPACE_FREE         BIT(FATTR4_SPACE_FREE - 32)
+#define FATTR4_WORD1_SPACE_TOTAL        BIT(FATTR4_SPACE_TOTAL - 32)
+#define FATTR4_WORD1_SPACE_USED         BIT(FATTR4_SPACE_USED - 32)
+#define FATTR4_WORD1_SYSTEM             BIT(FATTR4_SYSTEM - 32)
+#define FATTR4_WORD1_TIME_ACCESS        BIT(FATTR4_TIME_ACCESS - 32)
+#define FATTR4_WORD1_TIME_ACCESS_SET    BIT(FATTR4_TIME_ACCESS_SET - 32)
+#define FATTR4_WORD1_TIME_BACKUP        BIT(FATTR4_TIME_BACKUP - 32)
+#define FATTR4_WORD1_TIME_CREATE        BIT(FATTR4_TIME_CREATE - 32)
+#define FATTR4_WORD1_TIME_DELTA         BIT(FATTR4_TIME_DELTA - 32)
+#define FATTR4_WORD1_TIME_METADATA      BIT(FATTR4_TIME_METADATA - 32)
+#define FATTR4_WORD1_TIME_MODIFY        BIT(FATTR4_TIME_MODIFY - 32)
+#define FATTR4_WORD1_TIME_MODIFY_SET    BIT(FATTR4_TIME_MODIFY_SET - 32)
+#define FATTR4_WORD1_MOUNTED_ON_FILEID  BIT(FATTR4_MOUNTED_ON_FILEID - 32)
+#define FATTR4_WORD1_DACL               BIT(FATTR4_DACL	- 32)
+#define FATTR4_WORD1_SACL               BIT(FATTR4_SACL	- 32)
+#define FATTR4_WORD1_FS_LAYOUT_TYPES    BIT(FATTR4_FS_LAYOUT_TYPES - 32)
+
+#define FATTR4_WORD2_LAYOUT_TYPES       BIT(FATTR4_LAYOUT_TYPES - 64)
+#define FATTR4_WORD2_LAYOUT_BLKSIZE     BIT(FATTR4_LAYOUT_BLKSIZE - 64)
+#define FATTR4_WORD2_MDSTHRESHOLD       BIT(FATTR4_MDSTHRESHOLD	- 64)
+#define FATTR4_WORD2_CLONE_BLKSIZE	BIT(FATTR4_CLONE_BLKSIZE - 64)
+#define FATTR4_WORD2_CHANGE_ATTR_TYPE	BIT(FATTR4_CHANGE_ATTR_TYPE - 64)
+#define FATTR4_WORD2_SECURITY_LABEL     BIT(FATTR4_SEC_LABEL - 64)
+#define FATTR4_WORD2_MODE_UMASK		BIT(FATTR4_MODE_UMASK - 64)
+#define FATTR4_WORD2_XATTR_SUPPORT	BIT(FATTR4_XATTR_SUPPORT - 64)
 
 /* MDS threshold bitmap bits */
 #define THRESHOLD_RD                    (1UL << 0)
diff --git a/include/linux/numa.h b/include/linux/numa.h
index 59df211d051f..a904861de800 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -12,6 +12,7 @@
 #define MAX_NUMNODES    (1 << NODES_SHIFT)
 
 #define	NUMA_NO_NODE	(-1)
+#define	NUMA_NO_MEMBLK	(-1)
 
 /* optionally keep NUMA memory info available post init */
 #ifdef CONFIG_NUMA_KEEP_MEMINFO
@@ -25,7 +26,7 @@
 #include <asm/sparsemem.h>
 
 /* Generic implementation available */
-int numa_map_to_online_node(int node);
+int numa_nearest_node(int node, unsigned int state);
 
 #ifndef memory_add_physaddr_to_nid
 static inline int memory_add_physaddr_to_nid(u64 start)
@@ -43,11 +44,18 @@ static inline int phys_to_target_node(u64 start)
 	return 0;
 }
 #endif
+#ifndef numa_fill_memblks
+static inline int __init numa_fill_memblks(u64 start, u64 end)
+{
+	return NUMA_NO_MEMBLK;
+}
+#endif
 #else /* !CONFIG_NUMA */
-static inline int numa_map_to_online_node(int node)
+static inline int numa_nearest_node(int node, unsigned int state)
 {
 	return NUMA_NO_NODE;
 }
+
 static inline int memory_add_physaddr_to_nid(u64 start)
 {
 	return 0;
@@ -58,6 +66,8 @@ static inline int phys_to_target_node(u64 start)
 }
 #endif
 
+#define numa_map_to_online_node(node) numa_nearest_node(node, N_ONLINE)
+
 #ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP
 extern const struct attribute_group arch_node_dev_group;
 #endif
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 03f82c2c2ebf..33212e93f4a6 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -48,13 +48,13 @@
 #define ANNOTATE_NOENDBR					\
 	"986: \n\t"						\
 	".pushsection .discard.noendbr\n\t"			\
-	".long 986b - .\n\t"					\
+	".long 986b\n\t"					\
 	".popsection\n\t"
 
 #define ASM_REACHABLE							\
 	"998:\n\t"							\
 	".pushsection .discard.reachable\n\t"				\
-	".long 998b - .\n\t"						\
+	".long 998b\n\t"						\
 	".popsection\n\t"
 
 #else /* __ASSEMBLY__ */
@@ -66,7 +66,7 @@
 #define ANNOTATE_INTRA_FUNCTION_CALL				\
 	999:							\
 	.pushsection .discard.intra_function_calls;		\
-	.long 999b - .;						\
+	.long 999b;						\
 	.popsection;
 
 /*
@@ -118,7 +118,7 @@
 .macro ANNOTATE_NOENDBR
 .Lhere_\@:
 	.pushsection .discard.noendbr
-	.long	.Lhere_\@ - .
+	.long	.Lhere_\@
 	.popsection
 .endm
 
@@ -130,7 +130,8 @@
  * it will be ignored.
  */
 .macro VALIDATE_UNRET_BEGIN
-#if defined(CONFIG_NOINSTR_VALIDATION) && defined(CONFIG_CPU_UNRET_ENTRY)
+#if defined(CONFIG_NOINSTR_VALIDATION) && \
+	(defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO))
 .Lhere_\@:
 	.pushsection .discard.validate_unret
 	.long	.Lhere_\@ - .
@@ -141,7 +142,7 @@
 .macro REACHABLE
 .Lhere_\@:
 	.pushsection .discard.reachable
-	.long	.Lhere_\@ - .
+	.long	.Lhere_\@
 	.popsection
 .endm
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8c7c2c3c6c65..b56417276042 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1624,6 +1624,8 @@ struct msix_entry {
 	u16	entry;	/* Driver uses to specify entry, OS writes */
 };
 
+struct msi_domain_template;
+
 #ifdef CONFIG_PCI_MSI
 int pci_msi_vec_count(struct pci_dev *dev);
 void pci_disable_msi(struct pci_dev *dev);
@@ -1656,6 +1658,11 @@ void pci_msix_free_irq(struct pci_dev *pdev, struct msi_map map);
 void pci_free_irq_vectors(struct pci_dev *dev);
 int pci_irq_vector(struct pci_dev *dev, unsigned int nr);
 const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev, int vec);
+bool pci_create_ims_domain(struct pci_dev *pdev, const struct msi_domain_template *template,
+			   unsigned int hwsize, void *data);
+struct msi_map pci_ims_alloc_irq(struct pci_dev *pdev, union msi_instance_cookie *icookie,
+				 const struct irq_affinity_desc *affdesc);
+void pci_ims_free_irq(struct pci_dev *pdev, struct msi_map map);
 
 #else
 static inline int pci_msi_vec_count(struct pci_dev *dev) { return -ENOSYS; }
@@ -1719,6 +1726,25 @@ static inline const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev,
 {
 	return cpu_possible_mask;
 }
+
+static inline bool pci_create_ims_domain(struct pci_dev *pdev,
+					 const struct msi_domain_template *template,
+					 unsigned int hwsize, void *data)
+{ return false; }
+
+static inline struct msi_map pci_ims_alloc_irq(struct pci_dev *pdev,
+					       union msi_instance_cookie *icookie,
+					       const struct irq_affinity_desc *affdesc)
+{
+	struct msi_map map = { .index = -ENOSYS, };
+
+	return map;
+}
+
+static inline void pci_ims_free_irq(struct pci_dev *pdev, struct msi_map map)
+{
+}
+
 #endif
 
 /**
@@ -2616,14 +2642,6 @@ static inline bool pci_is_thunderbolt_attached(struct pci_dev *pdev)
 void pci_uevent_ers(struct pci_dev *pdev, enum  pci_ers_result err_type);
 #endif
 
-struct msi_domain_template;
-
-bool pci_create_ims_domain(struct pci_dev *pdev, const struct msi_domain_template *template,
-			   unsigned int hwsize, void *data);
-struct msi_map pci_ims_alloc_irq(struct pci_dev *pdev, union msi_instance_cookie *icookie,
-				 const struct irq_affinity_desc *affdesc);
-void pci_ims_free_irq(struct pci_dev *pdev, struct msi_map map);
-
 #include <linux/dma-mapping.h>
 
 #define pci_printk(level, pdev, fmt, arg...) \
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 0328a4c01e0b..91c1f6d5b44f 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -581,6 +581,7 @@
 #define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3 0x12c3
 #define PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3 0x16fb
 #define PCI_DEVICE_ID_AMD_MI200_DF_F3	0x14d3
+#define PCI_DEVICE_ID_AMD_MI300_DF_F3	0x152b
 #define PCI_DEVICE_ID_AMD_CNB17H_F3	0x1703
 #define PCI_DEVICE_ID_AMD_LANCE		0x2000
 #define PCI_DEVICE_ID_AMD_LANCE_HOME	0x2001
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 7b5406e3288d..afb028c54f33 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -879,6 +879,7 @@ struct perf_event_pmu_context {
 	unsigned int			embedded : 1;
 
 	unsigned int			nr_events;
+	unsigned int			nr_cgroups;
 
 	atomic_t			refcount; /* event <-> epc */
 	struct rcu_head			rcu_head;
@@ -1574,7 +1575,7 @@ extern int sysctl_perf_cpu_time_max_percent;
 
 extern void perf_sample_event_took(u64 sample_len_ns);
 
-int perf_proc_update_handler(struct ctl_table *table, int write,
+int perf_event_max_sample_rate_handler(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos);
 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos);
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 608a9eb86bff..8ff23bf5a819 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -62,9 +62,6 @@ struct pipe_inode_info {
 	unsigned int tail;
 	unsigned int max_usage;
 	unsigned int ring_size;
-#ifdef CONFIG_WATCH_QUEUE
-	bool note_loss;
-#endif
 	unsigned int nr_accounted;
 	unsigned int readers;
 	unsigned int writers;
@@ -72,6 +69,9 @@ struct pipe_inode_info {
 	unsigned int r_counter;
 	unsigned int w_counter;
 	bool poll_usage;
+#ifdef CONFIG_WATCH_QUEUE
+	bool note_loss;
+#endif
 	struct page *tmp_page;
 	struct fasync_struct *fasync_readers;
 	struct fasync_struct *fasync_writers;
@@ -125,6 +125,22 @@ struct pipe_buf_operations {
 };
 
 /**
+ * pipe_has_watch_queue - Check whether the pipe is a watch_queue,
+ * i.e. it was created with O_NOTIFICATION_PIPE
+ * @pipe: The pipe to check
+ *
+ * Return: true if pipe is a watch queue, false otherwise.
+ */
+static inline bool pipe_has_watch_queue(const struct pipe_inode_info *pipe)
+{
+#ifdef CONFIG_WATCH_QUEUE
+	return pipe->watch_queue != NULL;
+#else
+	return false;
+#endif
+}
+
+/**
  * pipe_empty - Return true if the pipe is empty
  * @head: The pipe ring head pointer
  * @tail: The pipe ring tail pointer
diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h
index 80cb00db42a4..79594aeb160d 100644
--- a/include/linux/pktcdvd.h
+++ b/include/linux/pktcdvd.h
@@ -154,7 +154,9 @@ struct packet_stacked_data
 
 struct pktcdvd_device
 {
-	struct block_device	*bdev;		/* dev attached */
+	struct bdev_handle	*bdev_handle;	/* dev attached */
+	/* handle acquired for bdev during pkt_open_dev() */
+	struct bdev_handle	*open_bdev_handle;
 	dev_t			pkt_dev;	/* our dev */
 	struct packet_settings	settings;
 	struct packet_stats	stats;
diff --git a/include/linux/platform_data/gsc_hwmon.h b/include/linux/platform_data/gsc_hwmon.h
index f2781aa7eff8..70e8a6bec0f6 100644
--- a/include/linux/platform_data/gsc_hwmon.h
+++ b/include/linux/platform_data/gsc_hwmon.h
@@ -40,6 +40,6 @@ struct gsc_hwmon_platform_data {
 	unsigned int resolution;
 	unsigned int vreference;
 	unsigned int fan_base;
-	struct gsc_hwmon_channel channels[];
+	struct gsc_hwmon_channel channels[] __counted_by(nchannels);
 };
 #endif
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 1424670df161..9aa6358a1a16 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -99,14 +99,21 @@ static __always_inline unsigned char interrupt_context_level(void)
 	return level;
 }
 
+/*
+ * These macro definitions avoid redundant invocations of preempt_count()
+ * because such invocations would result in redundant loads given that
+ * preempt_count() is commonly implemented with READ_ONCE().
+ */
+
 #define nmi_count()	(preempt_count() & NMI_MASK)
 #define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
 #ifdef CONFIG_PREEMPT_RT
 # define softirq_count()	(current->softirq_disable_cnt & SOFTIRQ_MASK)
+# define irq_count()		((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count())
 #else
 # define softirq_count()	(preempt_count() & SOFTIRQ_MASK)
+# define irq_count()		(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK))
 #endif
-#define irq_count()	(nmi_count() | hardirq_count() | softirq_count())
 
 /*
  * Macros to retrieve the current execution context:
@@ -119,7 +126,11 @@ static __always_inline unsigned char interrupt_context_level(void)
 #define in_nmi()		(nmi_count())
 #define in_hardirq()		(hardirq_count())
 #define in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)
-#define in_task()		(!(in_nmi() | in_hardirq() | in_serving_softirq()))
+#ifdef CONFIG_PREEMPT_RT
+# define in_task()		(!((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | in_serving_softirq()))
+#else
+# define in_task()		(!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
+#endif
 
 /*
  * The following macros are deprecated and should not be used in new code:
diff --git a/include/linux/pseudo_fs.h b/include/linux/pseudo_fs.h
index eceda1d1407a..730f77381d55 100644
--- a/include/linux/pseudo_fs.h
+++ b/include/linux/pseudo_fs.h
@@ -5,7 +5,7 @@
 
 struct pseudo_fs_context {
 	const struct super_operations *ops;
-	const struct xattr_handler **xattr;
+	const struct xattr_handler * const *xattr;
 	const struct dentry_operations *dops;
 	unsigned long magic;
 };
diff --git a/include/linux/rcu_notifier.h b/include/linux/rcu_notifier.h
new file mode 100644
index 000000000000..ebf371364581
--- /dev/null
+++ b/include/linux/rcu_notifier.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Read-Copy Update notifiers, initially RCU CPU stall notifier.
+ * Separate from rcupdate.h to avoid #include loops.
+ *
+ * Copyright (C) 2023 Paul E. McKenney.
+ */
+
+#ifndef __LINUX_RCU_NOTIFIER_H
+#define __LINUX_RCU_NOTIFIER_H
+
+// Actions for RCU CPU stall notifier calls.
+#define RCU_STALL_NOTIFY_NORM	1
+#define RCU_STALL_NOTIFY_EXP	2
+
+#ifdef CONFIG_RCU_STALL_COMMON
+
+#include <linux/notifier.h>
+#include <linux/types.h>
+
+int rcu_stall_chain_notifier_register(struct notifier_block *n);
+int rcu_stall_chain_notifier_unregister(struct notifier_block *n);
+
+#else // #ifdef CONFIG_RCU_STALL_COMMON
+
+// No RCU CPU stall warnings in Tiny RCU.
+static inline int rcu_stall_chain_notifier_register(struct notifier_block *n) { return -EEXIST; }
+static inline int rcu_stall_chain_notifier_unregister(struct notifier_block *n) { return -ENOENT; }
+
+#endif // #else // #ifdef CONFIG_RCU_STALL_COMMON
+
+#endif /* __LINUX_RCU_NOTIFIER_H */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 5e5f920ade90..f7206b2623c9 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -122,8 +122,6 @@ static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
 void rcu_init(void);
 extern int rcu_scheduler_active;
 void rcu_sched_clock_irq(int user);
-void rcu_report_dead(unsigned int cpu);
-void rcutree_migrate_callbacks(int cpu);
 
 #ifdef CONFIG_TASKS_RCU_GENERIC
 void rcu_init_tasks_generic(void);
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 7b949292908a..d9ac7b136aea 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -171,6 +171,6 @@ static inline void rcu_all_qs(void) { barrier(); }
 #define rcutree_offline_cpu      NULL
 #define rcutree_dead_cpu         NULL
 #define rcutree_dying_cpu        NULL
-static inline void rcu_cpu_starting(unsigned int cpu) { }
+static inline void rcutree_report_cpu_starting(unsigned int cpu) { }
 
 #endif /* __LINUX_RCUTINY_H */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 126f6b418f6a..254244202ea9 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -37,7 +37,6 @@ void synchronize_rcu_expedited(void);
 void kvfree_call_rcu(struct rcu_head *head, void *ptr);
 
 void rcu_barrier(void);
-bool rcu_eqs_special_set(int cpu);
 void rcu_momentary_dyntick_idle(void);
 void kfree_rcu_scheduler_running(void);
 bool rcu_gp_might_be_stalled(void);
@@ -111,9 +110,21 @@ void rcu_all_qs(void);
 /* RCUtree hotplug events */
 int rcutree_prepare_cpu(unsigned int cpu);
 int rcutree_online_cpu(unsigned int cpu);
-int rcutree_offline_cpu(unsigned int cpu);
+void rcutree_report_cpu_starting(unsigned int cpu);
+
+#ifdef CONFIG_HOTPLUG_CPU
 int rcutree_dead_cpu(unsigned int cpu);
 int rcutree_dying_cpu(unsigned int cpu);
-void rcu_cpu_starting(unsigned int cpu);
+int rcutree_offline_cpu(unsigned int cpu);
+#else
+#define rcutree_dead_cpu NULL
+#define rcutree_dying_cpu NULL
+#define rcutree_offline_cpu NULL
+#endif
+
+void rcutree_migrate_callbacks(int cpu);
+
+/* Called from hotplug and also arm64 early secondary boot failure */
+void rcutree_report_cpu_dead(void);
 
 #endif /* __LINUX_RCUTREE_H */
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 8334eeacfec5..66942d7fba7f 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -94,7 +94,7 @@ struct rdt_domain {
  *			zero CBM.
  * @shareable_bits:	Bitmask of shareable resource with other
  *			executing entities
- * @arch_has_sparse_bitmaps:	True if a bitmap like f00f is valid.
+ * @arch_has_sparse_bitmasks:	True if a bitmask like f00f is valid.
  * @arch_has_per_cpu_cfg:	True if QOS_CFG register for this cache
  *				level has CPU scope.
  */
@@ -102,7 +102,7 @@ struct resctrl_cache {
 	unsigned int	cbm_len;
 	unsigned int	min_cbm_bits;
 	unsigned int	shareable_bits;
-	bool		arch_has_sparse_bitmaps;
+	bool		arch_has_sparse_bitmasks;
 	bool		arch_has_per_cpu_cfg;
 };
 
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 4c0bcbeb1f00..5f8e438a0312 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -225,6 +225,23 @@ static inline bool is_leap_year(unsigned int year)
 	return (!(year % 4) && (year % 100)) || !(year % 400);
 }
 
+/**
+ * rtc_bound_alarmtime() - Return alarm time bound by rtc limit
+ * @rtc: Pointer to rtc device structure
+ * @requested: Requested alarm timeout
+ *
+ * Return: @requested, capped at @rtc->alarm_offset_max seconds when non-zero.
+ */
+static inline ktime_t rtc_bound_alarmtime(struct rtc_device *rtc,
+					  ktime_t requested)
+{
+	if (rtc->alarm_offset_max &&
+	    rtc->alarm_offset_max * MSEC_PER_SEC < ktime_to_ms(requested))
+		return ms_to_ktime(rtc->alarm_offset_max * MSEC_PER_SEC);
+
+	return requested;
+}
+
 #define devm_rtc_register_device(device) \
 	__devm_rtc_register_device(THIS_MODULE, device)
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f01ac385f7..12ec109ce8c9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -63,7 +63,6 @@ struct robust_list_head;
 struct root_domain;
 struct rq;
 struct sched_attr;
-struct sched_param;
 struct seq_file;
 struct sighand_struct;
 struct signal_struct;
@@ -370,6 +369,10 @@ extern struct root_domain def_root_domain;
 extern struct mutex sched_domains_mutex;
 #endif
 
+struct sched_param {
+	int sched_priority;
+};
+
 struct sched_info {
 #ifdef CONFIG_SCHED_INFO
 	/* Cumulative counters: */
@@ -750,10 +753,8 @@ struct task_struct {
 #endif
 	unsigned int			__state;
 
-#ifdef CONFIG_PREEMPT_RT
 	/* saved state for "spinlock sleepers" */
 	unsigned int			saved_state;
-#endif
 
 	/*
 	 * This begins the randomizable portion of task_struct. Only
@@ -875,6 +876,7 @@ struct task_struct {
 
 	struct mm_struct		*mm;
 	struct mm_struct		*active_mm;
+	struct address_space		*faults_disabled_mapping;
 
 	int				exit_state;
 	int				exit_code;
@@ -911,6 +913,9 @@ struct task_struct {
 	 * ->sched_remote_wakeup gets used, so it can be in this word.
 	 */
 	unsigned			sched_remote_wakeup:1;
+#ifdef CONFIG_RT_MUTEXES
+	unsigned			sched_rt_mutex:1;
+#endif
 
 	/* Bit to tell LSMs we're in execve(): */
 	unsigned			in_execve:1;
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 7c83d4d5a971..df3aca89d4f5 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -1,4 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_DEADLINE_H
+#define _LINUX_SCHED_DEADLINE_H
 
 /*
  * SCHED_DEADLINE tasks has negative priorities, reflecting
@@ -34,3 +36,5 @@ extern void dl_add_task_root_domain(struct task_struct *p);
 extern void dl_clear_root_domain(struct root_domain *rd);
 
 #endif /* CONFIG_SMP */
+
+#endif /* _LINUX_SCHED_DEADLINE_H */
diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index 3988762efe15..b69afb8630db 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -15,6 +15,16 @@
 #define TNF_FAULT_LOCAL	0x08
 #define TNF_MIGRATE_FAIL 0x10
 
+enum numa_vmaskip_reason {
+	NUMAB_SKIP_UNSUITABLE,
+	NUMAB_SKIP_SHARED_RO,
+	NUMAB_SKIP_INACCESSIBLE,
+	NUMAB_SKIP_SCAN_DELAY,
+	NUMAB_SKIP_PID_INACTIVE,
+	NUMAB_SKIP_IGNORE_PID,
+	NUMAB_SKIP_SEQ_COMPLETED,
+};
+
 #ifdef CONFIG_NUMA_BALANCING
 extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index 994c25640e15..b2b9e6eb9683 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -30,6 +30,10 @@ static inline bool task_is_realtime(struct task_struct *tsk)
 }
 
 #ifdef CONFIG_RT_MUTEXES
+extern void rt_mutex_pre_schedule(void);
+extern void rt_mutex_schedule(void);
+extern void rt_mutex_post_schedule(void);
+
 /*
  * Must hold either p->pi_lock or task_rq(p)->lock.
  */
diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index fad77b5172e2..a8b28647aafc 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -110,6 +110,13 @@ SD_FLAG(SD_ASYM_CPUCAPACITY_FULL, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
 SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
 
 /*
+ * Domain members share CPU cluster (LLC tags or L2 cache)
+ *
+ * NEEDS_GROUPS: Clusters are shared between groups.
+ */
+SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS)
+
+/*
  * Domain members share CPU package resources (i.e. caches)
  *
  * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 0014d3adaf84..9610bad018a3 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -656,7 +656,8 @@ extern bool current_is_single_threaded(void);
 	while ((t = next_thread(t)) != g)
 
 #define __for_each_thread(signal, t)	\
-	list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)
+	list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \
+		lockdep_is_held(&tasklist_lock))
 
 #define for_each_thread(p, t)		\
 	__for_each_thread((p)->signal, t)
diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h
index 59d3736c454c..fb1e295e7e63 100644
--- a/include/linux/sched/smt.h
+++ b/include/linux/sched/smt.h
@@ -17,4 +17,4 @@ static inline bool sched_smt_active(void) { return false; }
 
 void arch_smt_update(void);
 
-#endif
+#endif /* _LINUX_SCHED_SMT_H */
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 67b573d5bf28..de545ba85218 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -45,7 +45,7 @@ static inline int cpu_smt_flags(void)
 #ifdef CONFIG_SCHED_CLUSTER
 static inline int cpu_cluster_flags(void)
 {
-	return SD_SHARE_PKG_RESOURCES;
+	return SD_CLUSTER | SD_SHARE_PKG_RESOURCES;
 }
 #endif
 
@@ -109,8 +109,6 @@ struct sched_domain {
 	u64 max_newidle_lb_cost;
 	unsigned long last_decay_max_lb_cost;
 
-	u64 avg_scan_cost;		/* select_idle_sibling */
-
 #ifdef CONFIG_SCHEDSTATS
 	/* load_balance() stats */
 	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
@@ -179,6 +177,7 @@ cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
 
 bool cpus_share_cache(int this_cpu, int that_cpu);
+bool cpus_share_resources(int this_cpu, int that_cpu);
 
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 typedef int (*sched_domain_flags_f)(void);
@@ -232,6 +231,11 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
 	return true;
 }
 
+static inline bool cpus_share_resources(int this_cpu, int that_cpu)
+{
+	return true;
+}
+
 #endif	/* !CONFIG_SMP */
 
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
diff --git a/include/linux/sched/types.h b/include/linux/sched/types.h
index 3c3e049224ae..969aaf5ef9d6 100644
--- a/include/linux/sched/types.h
+++ b/include/linux/sched/types.h
@@ -20,4 +20,4 @@ struct task_cputime {
 	unsigned long long		sum_exec_runtime;
 };
 
-#endif
+#endif /* _LINUX_SCHED_TYPES_H */
diff --git a/include/linux/sched/vhost_task.h b/include/linux/sched/vhost_task.h
index 837a23624a66..bc60243d43b3 100644
--- a/include/linux/sched/vhost_task.h
+++ b/include/linux/sched/vhost_task.h
@@ -1,7 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_VHOST_TASK_H
-#define _LINUX_VHOST_TASK_H
-
+#ifndef _LINUX_SCHED_VHOST_TASK_H
+#define _LINUX_SCHED_VHOST_TASK_H
 
 struct vhost_task;
 
@@ -11,4 +10,4 @@ void vhost_task_start(struct vhost_task *vtsk);
 void vhost_task_stop(struct vhost_task *vtsk);
 void vhost_task_wake(struct vhost_task *vtsk);
 
-#endif
+#endif /* _LINUX_SCHED_VHOST_TASK_H */
diff --git a/include/linux/security.h b/include/linux/security.h
index 5f16eecde00b..1d1df326c881 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -151,7 +151,7 @@ extern int cap_capset(struct cred *new, const struct cred *old,
 		      const kernel_cap_t *effective,
 		      const kernel_cap_t *inheritable,
 		      const kernel_cap_t *permitted);
-extern int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file);
+extern int cap_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file);
 int cap_inode_setxattr(struct dentry *dentry, const char *name,
 		       const void *value, size_t size, int flags);
 int cap_inode_removexattr(struct mnt_idmap *idmap,
@@ -284,16 +284,16 @@ int security_capable(const struct cred *cred,
 		       struct user_namespace *ns,
 		       int cap,
 		       unsigned int opts);
-int security_quotactl(int cmds, int type, int id, struct super_block *sb);
+int security_quotactl(int cmds, int type, int id, const struct super_block *sb);
 int security_quota_on(struct dentry *dentry);
 int security_syslog(int type);
 int security_settime64(const struct timespec64 *ts, const struct timezone *tz);
 int security_vm_enough_memory_mm(struct mm_struct *mm, long pages);
 int security_bprm_creds_for_exec(struct linux_binprm *bprm);
-int security_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file);
+int security_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file);
 int security_bprm_check(struct linux_binprm *bprm);
-void security_bprm_committing_creds(struct linux_binprm *bprm);
-void security_bprm_committed_creds(struct linux_binprm *bprm);
+void security_bprm_committing_creds(const struct linux_binprm *bprm);
+void security_bprm_committed_creds(const struct linux_binprm *bprm);
 int security_fs_context_submount(struct fs_context *fc, struct super_block *reference);
 int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc);
 int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param);
@@ -304,7 +304,7 @@ void security_free_mnt_opts(void **mnt_opts);
 int security_sb_eat_lsm_opts(char *options, void **mnt_opts);
 int security_sb_mnt_opts_compat(struct super_block *sb, void *mnt_opts);
 int security_sb_remount(struct super_block *sb, void *mnt_opts);
-int security_sb_kern_mount(struct super_block *sb);
+int security_sb_kern_mount(const struct super_block *sb);
 int security_sb_show_options(struct seq_file *m, struct super_block *sb);
 int security_sb_statfs(struct dentry *dentry);
 int security_sb_mount(const char *dev_name, const struct path *path,
@@ -581,7 +581,7 @@ static inline int security_capable(const struct cred *cred,
 }
 
 static inline int security_quotactl(int cmds, int type, int id,
-				     struct super_block *sb)
+				     const struct super_block *sb)
 {
 	return 0;
 }
@@ -613,7 +613,7 @@ static inline int security_bprm_creds_for_exec(struct linux_binprm *bprm)
 }
 
 static inline int security_bprm_creds_from_file(struct linux_binprm *bprm,
-						struct file *file)
+						const struct file *file)
 {
 	return cap_bprm_creds_from_file(bprm, file);
 }
@@ -623,11 +623,11 @@ static inline int security_bprm_check(struct linux_binprm *bprm)
 	return 0;
 }
 
-static inline void security_bprm_committing_creds(struct linux_binprm *bprm)
+static inline void security_bprm_committing_creds(const struct linux_binprm *bprm)
 {
 }
 
-static inline void security_bprm_committed_creds(struct linux_binprm *bprm)
+static inline void security_bprm_committed_creds(const struct linux_binprm *bprm)
 {
 }
 
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index e9bd2f65d7f4..e92f9d5577ba 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -191,11 +191,9 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
  * @lockname:		"LOCKNAME" part of seqcount_LOCKNAME_t
  * @locktype:		LOCKNAME canonical C data type
  * @preemptible:	preemptibility of above locktype
- * @lockmember:		argument for lockdep_assert_held()
- * @lockbase:		associated lock release function (prefix only)
- * @lock_acquire:	associated lock acquisition function (full call)
+ * @lockbase:		prefix for associated lock/unlock
  */
-#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockmember, lockbase, lock_acquire) \
+#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockbase)	\
 typedef struct seqcount_##lockname {					\
 	seqcount_t		seqcount;				\
 	__SEQ_LOCK(locktype	*lock);					\
@@ -207,6 +205,12 @@ __seqprop_##lockname##_ptr(seqcount_##lockname##_t *s)			\
 	return &s->seqcount;						\
 }									\
 									\
+static __always_inline const seqcount_t *				\
+__seqprop_##lockname##_const_ptr(const seqcount_##lockname##_t *s)	\
+{									\
+	return &s->seqcount;						\
+}									\
+									\
 static __always_inline unsigned						\
 __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s)	\
 {									\
@@ -216,7 +220,7 @@ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s)	\
 		return seq;						\
 									\
 	if (preemptible && unlikely(seq & 1)) {				\
-		__SEQ_LOCK(lock_acquire);				\
+		__SEQ_LOCK(lockbase##_lock(s->lock));			\
 		__SEQ_LOCK(lockbase##_unlock(s->lock));			\
 									\
 		/*							\
@@ -242,7 +246,7 @@ __seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s)	\
 static __always_inline void						\
 __seqprop_##lockname##_assert(const seqcount_##lockname##_t *s)		\
 {									\
-	__SEQ_LOCK(lockdep_assert_held(lockmember));			\
+	__SEQ_LOCK(lockdep_assert_held(s->lock));			\
 }
 
 /*
@@ -254,6 +258,11 @@ static inline seqcount_t *__seqprop_ptr(seqcount_t *s)
 	return s;
 }
 
+static inline const seqcount_t *__seqprop_const_ptr(const seqcount_t *s)
+{
+	return s;
+}
+
 static inline unsigned __seqprop_sequence(const seqcount_t *s)
 {
 	return READ_ONCE(s->sequence);
@@ -271,10 +280,10 @@ static inline void __seqprop_assert(const seqcount_t *s)
 
 #define __SEQ_RT	IS_ENABLED(CONFIG_PREEMPT_RT)
 
-SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t,  false,    s->lock,        raw_spin, raw_spin_lock(s->lock))
-SEQCOUNT_LOCKNAME(spinlock,     spinlock_t,      __SEQ_RT, s->lock,        spin,     spin_lock(s->lock))
-SEQCOUNT_LOCKNAME(rwlock,       rwlock_t,        __SEQ_RT, s->lock,        read,     read_lock(s->lock))
-SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     s->lock,        mutex,    mutex_lock(s->lock))
+SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t,  false,    raw_spin)
+SEQCOUNT_LOCKNAME(spinlock,     spinlock_t,      __SEQ_RT, spin)
+SEQCOUNT_LOCKNAME(rwlock,       rwlock_t,        __SEQ_RT, read)
+SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
 
 /*
  * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t
@@ -294,19 +303,20 @@ SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     s->lock,        mutex
 #define SEQCNT_WW_MUTEX_ZERO(name, lock) 	SEQCOUNT_LOCKNAME_ZERO(name, lock)
 
 #define __seqprop_case(s, lockname, prop)				\
-	seqcount_##lockname##_t: __seqprop_##lockname##_##prop((void *)(s))
+	seqcount_##lockname##_t: __seqprop_##lockname##_##prop
 
 #define __seqprop(s, prop) _Generic(*(s),				\
-	seqcount_t:		__seqprop_##prop((void *)(s)),		\
+	seqcount_t:		__seqprop_##prop,			\
 	__seqprop_case((s),	raw_spinlock,	prop),			\
 	__seqprop_case((s),	spinlock,	prop),			\
 	__seqprop_case((s),	rwlock,		prop),			\
 	__seqprop_case((s),	mutex,		prop))
 
-#define seqprop_ptr(s)			__seqprop(s, ptr)
-#define seqprop_sequence(s)		__seqprop(s, sequence)
-#define seqprop_preemptible(s)		__seqprop(s, preemptible)
-#define seqprop_assert(s)		__seqprop(s, assert)
+#define seqprop_ptr(s)			__seqprop(s, ptr)(s)
+#define seqprop_const_ptr(s)		__seqprop(s, const_ptr)(s)
+#define seqprop_sequence(s)		__seqprop(s, sequence)(s)
+#define seqprop_preemptible(s)		__seqprop(s, preemptible)(s)
+#define seqprop_assert(s)		__seqprop(s, assert)(s)
 
 /**
  * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier
@@ -355,7 +365,7 @@ SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     s->lock,        mutex
  */
 #define read_seqcount_begin(s)						\
 ({									\
-	seqcount_lockdep_reader_access(seqprop_ptr(s));			\
+	seqcount_lockdep_reader_access(seqprop_const_ptr(s));		\
 	raw_read_seqcount_begin(s);					\
 })
 
@@ -421,7 +431,7 @@ SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     s->lock,        mutex
  * Return: true if a read section retry is required, else false
  */
 #define __read_seqcount_retry(s, start)					\
-	do___read_seqcount_retry(seqprop_ptr(s), start)
+	do___read_seqcount_retry(seqprop_const_ptr(s), start)
 
 static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start)
 {
@@ -441,7 +451,7 @@ static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start)
  * Return: true if a read section retry is required, else false
  */
 #define read_seqcount_retry(s, start)					\
-	do_read_seqcount_retry(seqprop_ptr(s), start)
+	do_read_seqcount_retry(seqprop_const_ptr(s), start)
 
 static inline int do_read_seqcount_retry(const seqcount_t *s, unsigned start)
 {
@@ -574,7 +584,7 @@ static inline void do_write_seqcount_end(seqcount_t *s)
  * via WRITE_ONCE): a) to ensure the writes become visible to other threads
  * atomically, avoiding compiler optimizations; b) to document which writes are
  * meant to propagate to the reader critical section. This is necessary because
- * neither writes before and after the barrier are enclosed in a seq-writer
+ * neither writes before nor after the barrier are enclosed in a seq-writer
  * critical section that would ensure readers are aware of ongoing writes::
  *
  *	seqcount_t seq;
@@ -864,7 +874,7 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
 }
 
 /*
- * For all seqlock_t write side functions, use the the internal
+ * For all seqlock_t write side functions, use the internal
  * do_write_seqcount_begin() instead of generic write_seqcount_begin().
  * This way, no redundant lockdep_assert_held() checks are added.
  */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 97bfef071255..27998f73183e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3679,6 +3679,9 @@ static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int l
 	return __skb_put_padto(skb, len, true);
 }
 
+bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i)
+	__must_check;
+
 static inline int skb_add_data(struct sk_buff *skb,
 			       struct iov_iter *from, int copy)
 {
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 8228d1276a2f..ff56ab804bf6 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -245,8 +245,9 @@ DEFINE_FREE(kfree, void *, if (_T) kfree(_T))
 size_t ksize(const void *objp);
 
 #ifdef CONFIG_PRINTK
-bool kmem_valid_obj(void *object);
-void kmem_dump_obj(void *object);
+bool kmem_dump_obj(void *object);
+#else
+static inline bool kmem_dump_obj(void *object) { return false; }
 #endif
 
 /*
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 91ea4a67f8ca..e87520dc2959 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -53,7 +53,7 @@ int smp_call_function_single(int cpuid, smp_call_func_t func, void *info,
 void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
 			   void *info, bool wait, const struct cpumask *mask);
 
-int smp_call_function_single_async(int cpu, struct __call_single_data *csd);
+int smp_call_function_single_async(int cpu, call_single_data_t *csd);
 
 /*
  * Cpus stopping functions in panic. All have default weak definitions.
diff --git a/include/linux/string.h b/include/linux/string.h
index dbfc66400050..9e3cb6923b0e 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -277,10 +277,12 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count,
  */
 #define strtomem_pad(dest, src, pad)	do {				\
 	const size_t _dest_len = __builtin_object_size(dest, 1);	\
+	const size_t _src_len = __builtin_object_size(src, 1);		\
 									\
 	BUILD_BUG_ON(!__builtin_constant_p(_dest_len) ||		\
 		     _dest_len == (size_t)-1);				\
-	memcpy_and_pad(dest, _dest_len, src, strnlen(src, _dest_len), pad); \
+	memcpy_and_pad(dest, _dest_len, src,				\
+		       strnlen(src, min(_src_len, _dest_len)), pad);	\
 } while (0)
 
 /**
@@ -298,10 +300,11 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count,
  */
 #define strtomem(dest, src)	do {					\
 	const size_t _dest_len = __builtin_object_size(dest, 1);	\
+	const size_t _src_len = __builtin_object_size(src, 1);		\
 									\
 	BUILD_BUG_ON(!__builtin_constant_p(_dest_len) ||		\
 		     _dest_len == (size_t)-1);				\
-	memcpy(dest, src, min(_dest_len, strnlen(src, _dest_len)));	\
+	memcpy(dest, src, strnlen(src, min(_src_len, _dest_len)));	\
 } while (0)
 
 /**
diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index 9d1f5bb74dd5..58fb1f90eda5 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -24,8 +24,8 @@ enum string_size_units {
 	STRING_UNITS_2,		/* use binary powers of 2^10 */
 };
 
-void string_get_size(u64 size, u64 blk_size, enum string_size_units units,
-		     char *buf, int len);
+int string_get_size(u64 size, u64 blk_size, enum string_size_units units,
+		    char *buf, int len);
 
 int parse_int_array_user(const char __user *from, size_t count, int **array);
 
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index dbf5b21feafe..b10f987509cc 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -17,6 +17,7 @@
 #include <linux/sunrpc/xdr.h>
 #include <linux/sunrpc/auth.h>
 #include <linux/sunrpc/svcauth.h>
+#include <linux/lwq.h>
 #include <linux/wait.h>
 #include <linux/mm.h>
 #include <linux/pagevec.h>
@@ -33,10 +34,10 @@
  */
 struct svc_pool {
 	unsigned int		sp_id;	    	/* pool id; also node id on NUMA */
-	spinlock_t		sp_lock;	/* protects all fields */
-	struct list_head	sp_sockets;	/* pending sockets */
-	unsigned int		sp_nrthreads;	/* # of threads in pool */
+	struct lwq		sp_xprts;	/* pending transports */
+	atomic_t		sp_nrthreads;	/* # of threads in pool */
 	struct list_head	sp_all_threads;	/* all server threads */
+	struct llist_head	sp_idle_threads; /* idle server threads */
 
 	/* statistics on pool operation */
 	struct percpu_counter	sp_messages_arrived;
@@ -49,7 +50,8 @@ struct svc_pool {
 /* bits for sp_flags */
 enum {
 	SP_TASK_PENDING,	/* still work to do even if no xprt is queued */
-	SP_CONGESTED,		/* all threads are busy, none idle */
+	SP_NEED_VICTIM,		/* One thread needs to agree to exit */
+	SP_VICTIM_REMAINS,	/* One thread needs to actually exit */
 };
 
 
@@ -88,12 +90,9 @@ struct svc_serv {
 	int			(*sv_threadfn)(void *data);
 
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
-	struct list_head	sv_cb_list;	/* queue for callback requests
+	struct lwq		sv_cb_list;	/* queue for callback requests
 						 * that arrive over the same
 						 * connection */
-	spinlock_t		sv_cb_lock;	/* protects the svc_cb_list */
-	wait_queue_head_t	sv_cb_waitq;	/* sleep here if there are no
-						 * entries in the svc_cb_list */
 	bool			sv_bc_enabled;	/* service uses backchannel */
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 };
@@ -186,6 +185,7 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp);
  */
 struct svc_rqst {
 	struct list_head	rq_all;		/* all threads list */
+	struct llist_node	rq_idle;	/* On the idle list */
 	struct rcu_head		rq_rcu_head;	/* for RCU deferred kfree */
 	struct svc_xprt *	rq_xprt;	/* transport ptr */
 
@@ -251,6 +251,7 @@ struct svc_rqst {
 						 * net namespace
 						 */
 	void **			rq_lease_breaker; /* The v4 client breaking a lease */
+	unsigned int		rq_status_counter; /* RPC processing counter */
 };
 
 /* bits for rq_flags */
@@ -261,8 +262,7 @@ enum {
 	RQ_DROPME,		/* drop current reply */
 	RQ_SPLICE_OK,		/* turned off in gss privacy to prevent
 				 * encrypting page cache pages */
-	RQ_VICTIM,		/* about to be shut down */
-	RQ_BUSY,		/* request is busy */
+	RQ_VICTIM,		/* Have agreed to shut down */
 	RQ_DATA,		/* request has data */
 };
 
@@ -301,6 +301,28 @@ static inline struct sockaddr *svc_daddr(const struct svc_rqst *rqst)
 	return (struct sockaddr *) &rqst->rq_daddr;
 }
 
+/**
+ * svc_thread_should_stop - check if this thread should stop
+ * @rqstp: the thread that might need to stop
+ *
+ * To stop an svc thread, the pool flags SP_NEED_VICTIM and SP_VICTIM_REMAINS
+ * are set.  The first thread to call this function while SP_NEED_VICTIM is
+ * set clears that flag and becomes the victim.  It should then promptly call
+ * svc_exit_thread() to complete the process, clearing SP_VICTIM_REMAINS
+ * so the task waiting for a thread to exit can wake and continue.
+ *
+ * Return values:
+ *   %true: caller should invoke svc_exit_thread()
+ *   %false: caller should do nothing
+ */
+static inline bool svc_thread_should_stop(struct svc_rqst *rqstp)
+{
+	if (test_and_clear_bit(SP_NEED_VICTIM, &rqstp->rq_pool->sp_flags))
+		set_bit(RQ_VICTIM, &rqstp->rq_flags);
+
+	return test_bit(RQ_VICTIM, &rqstp->rq_flags);
+}
+
 struct svc_deferred_req {
 	u32			prot;	/* protocol (UDP or TCP) */
 	struct svc_xprt		*xprt;
@@ -413,8 +435,7 @@ struct svc_serv *  svc_create_pooled(struct svc_program *, unsigned int,
 int		   svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
 int		   svc_pool_stats_open(struct svc_serv *serv, struct file *file);
 void		   svc_process(struct svc_rqst *rqstp);
-int		   bc_svc_process(struct svc_serv *, struct rpc_rqst *,
-			struct svc_rqst *);
+void		   svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp);
 int		   svc_register(const struct svc_serv *, struct net *, const int,
 				const unsigned short, const unsigned short);
 
diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index fa55d12dc765..8e20cd60e2e7 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -54,7 +54,7 @@ struct svc_xprt {
 	const struct svc_xprt_ops *xpt_ops;
 	struct kref		xpt_ref;
 	struct list_head	xpt_list;
-	struct list_head	xpt_ready;
+	struct lwq_node		xpt_ready;
 	unsigned long		xpt_flags;
 
 	struct svc_serv		*xpt_server;	/* service for transport */
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 4ecc89301eb7..f85d3a0daca2 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -57,6 +57,7 @@ struct xprt_class;
 struct seq_file;
 struct svc_serv;
 struct net;
+#include <linux/lwq.h>
 
 /*
  * This describes a complete RPC request
@@ -121,7 +122,7 @@ struct rpc_rqst {
 	int			rq_ntrans;
 
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
-	struct list_head	rq_bc_list;	/* Callback service list */
+	struct lwq_node		rq_bc_list;	/* Callback service list */
 	unsigned long		rq_bc_pa_state;	/* Backchannel prealloc state */
 	struct list_head	rq_bc_pa_list;	/* Backchannel prealloc list */
 #endif /* CONFIG_SUNRPC_BACKCHANEL */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 493487ed7c38..f6dd6575b905 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -298,6 +298,7 @@ struct swap_info_struct {
 	unsigned int __percpu *cluster_next_cpu; /*percpu index for next allocation */
 	struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
 	struct rb_root swap_extent_root;/* root of the swap extent rbtree */
+	struct bdev_handle *bdev_handle;/* open handle of the bdev */
 	struct block_device *bdev;	/* swap device or bdev of swap file */
 	struct file *swap_file;		/* seldom referenced */
 	unsigned int old_block_size;	/* seldom referenced */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 22bc6bc147f8..0901af60d971 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -549,6 +549,16 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 asmlinkage long sys_futex_waitv(struct futex_waitv *waiters,
 				unsigned int nr_futexes, unsigned int flags,
 				struct __kernel_timespec __user *timeout, clockid_t clockid);
+
+asmlinkage long sys_futex_wake(void __user *uaddr, unsigned long mask, int nr, unsigned int flags);
+
+asmlinkage long sys_futex_wait(void __user *uaddr, unsigned long val, unsigned long mask,
+			       unsigned int flags, struct __kernel_timespec __user *timespec,
+			       clockid_t clockid);
+
+asmlinkage long sys_futex_requeue(struct futex_waitv __user *waiters,
+				  unsigned int flags, int nr_wake, int nr_requeue);
+
 asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp,
 			      struct __kernel_timespec __user *rmtp);
 asmlinkage long sys_nanosleep_time32(struct old_timespec32 __user *rqtp,
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 9459fef5b857..716d17f31c45 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -140,14 +140,6 @@ extern unsigned long tick_nohz_get_idle_calls(void);
 extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu);
 extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
 extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
-
-static inline void tick_nohz_idle_stop_tick_protected(void)
-{
-	local_irq_disable();
-	tick_nohz_idle_stop_tick();
-	local_irq_enable();
-}
-
 #else /* !CONFIG_NO_HZ_COMMON */
 #define tick_nohz_enabled (0)
 static inline int tick_nohz_tick_stopped(void) { return 0; }
@@ -170,8 +162,6 @@ static inline ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
 }
 static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
 static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
-
-static inline void tick_nohz_idle_stop_tick_protected(void) { }
 #endif /* !CONFIG_NO_HZ_COMMON */
 
 #ifdef CONFIG_NO_HZ_FULL
diff --git a/include/linux/topology.h b/include/linux/topology.h
index fea32377f7c7..52f5850730b3 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -251,7 +251,7 @@ extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int
 #else
 static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
 {
-	return cpumask_nth(cpu, cpus);
+	return cpumask_nth_and(cpu, cpus, cpu_online_mask);
 }
 
 static inline const struct cpumask *
diff --git a/include/linux/torture.h b/include/linux/torture.h
index bb466eec01e4..c98d0c83d117 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -81,7 +81,8 @@ static inline void torture_random_init(struct torture_random_state *trsp)
 }
 
 /* Definitions for high-resolution-timer sleeps. */
-int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_state *trsp);
+int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, const enum hrtimer_mode mode,
+			 struct torture_random_state *trsp);
 int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state *trsp);
 int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state *trsp);
 int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp);
@@ -120,10 +121,15 @@ void _torture_stop_kthread(char *m, struct task_struct **tp);
 #define torture_stop_kthread(n, tp) \
 	_torture_stop_kthread("Stopping " #n " task", &(tp))
 
+/* Scheduler-related definitions. */
 #ifdef CONFIG_PREEMPTION
 #define torture_preempt_schedule() __preempt_schedule()
 #else
 #define torture_preempt_schedule()	do { } while (0)
 #endif
 
+#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST)
+long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask);
+#endif
+
 #endif /* __LINUX_TORTURE_H */
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 42bce38a8e87..b6214cbf2a43 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -21,12 +21,12 @@ struct kvec {
 
 enum iter_type {
 	/* iter types */
+	ITER_UBUF,
 	ITER_IOVEC,
-	ITER_KVEC,
 	ITER_BVEC,
+	ITER_KVEC,
 	ITER_XARRAY,
 	ITER_DISCARD,
-	ITER_UBUF,
 };
 
 #define ITER_SOURCE	1	// == WRITE
@@ -43,11 +43,7 @@ struct iov_iter {
 	bool copy_mc;
 	bool nofault;
 	bool data_source;
-	bool user_backed;
-	union {
-		size_t iov_offset;
-		int last_offset;
-	};
+	size_t iov_offset;
 	/*
 	 * Hack alert: overlay ubuf_iovec with iovec + count, so
 	 * that the members resolve correctly regardless of the type
@@ -143,7 +139,7 @@ static inline unsigned char iov_iter_rw(const struct iov_iter *i)
 
 static inline bool user_backed_iter(const struct iov_iter *i)
 {
-	return i->user_backed;
+	return iter_is_ubuf(i) || iter_is_iovec(i);
 }
 
 /*
@@ -342,27 +338,6 @@ iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes)
 	return npages;
 }
 
-struct csum_state {
-	__wsum csum;
-	size_t off;
-};
-
-size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csstate, struct iov_iter *i);
-size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
-
-static __always_inline __must_check
-bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
-				  __wsum *csum, struct iov_iter *i)
-{
-	size_t copied = csum_and_copy_from_iter(addr, bytes, csum, i);
-	if (likely(copied == bytes))
-		return true;
-	iov_iter_revert(i, copied);
-	return false;
-}
-size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
-		struct iov_iter *i);
-
 struct iovec *iovec_from_user(const struct iovec __user *uvector,
 		unsigned long nr_segs, unsigned long fast_segs,
 		struct iovec *fast_iov, bool compat);
@@ -383,7 +358,6 @@ static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
 	*i = (struct iov_iter) {
 		.iter_type = ITER_UBUF,
 		.copy_mc = false,
-		.user_backed = true,
 		.data_source = direction,
 		.ubuf = buf,
 		.count = count,
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 45f09bec02c4..6030a8235617 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -65,6 +65,10 @@ enum rlimit_type {
 	UCOUNT_RLIMIT_COUNTS,
 };
 
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+struct binfmt_misc;
+#endif
+
 struct user_namespace {
 	struct uid_gid_map	uid_map;
 	struct uid_gid_map	gid_map;
@@ -102,6 +106,10 @@ struct user_namespace {
 	struct ucounts		*ucounts;
 	long ucount_max[UCOUNT_COUNTS];
 	long rlimit_max[UCOUNT_RLIMIT_COUNTS];
+
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+	struct binfmt_misc *binfmt_misc;
+#endif
 } __randomize_layout;
 
 struct ucounts {
diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h
index 45cd42f55d49..429c7b6afead 100644
--- a/include/linux/watch_queue.h
+++ b/include/linux/watch_queue.h
@@ -32,7 +32,7 @@ struct watch_filter {
 		DECLARE_BITMAP(type_filter, WATCH_TYPE__NR);
 	};
 	u32			nr_filters;	/* Number of filters */
-	struct watch_type_filter filters[];
+	struct watch_type_filter filters[] __counted_by(nr_filters);
 };
 
 struct watch_queue {
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 1c1d06804d45..24b1e5070f4d 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -274,18 +274,16 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; }
  * to generate better code.
  */
 #ifdef CONFIG_LOCKDEP
-#define __INIT_WORK(_work, _func, _onstack)				\
+#define __INIT_WORK_KEY(_work, _func, _onstack, _key)			\
 	do {								\
-		static struct lock_class_key __key;			\
-									\
 		__init_work((_work), _onstack);				\
 		(_work)->data = (atomic_long_t) WORK_DATA_INIT();	\
-		lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, &__key, 0); \
+		lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, (_key), 0); \
 		INIT_LIST_HEAD(&(_work)->entry);			\
 		(_work)->func = (_func);				\
 	} while (0)
 #else
-#define __INIT_WORK(_work, _func, _onstack)				\
+#define __INIT_WORK_KEY(_work, _func, _onstack, _key)			\
 	do {								\
 		__init_work((_work), _onstack);				\
 		(_work)->data = (atomic_long_t) WORK_DATA_INIT();	\
@@ -294,12 +292,22 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; }
 	} while (0)
 #endif
 
+#define __INIT_WORK(_work, _func, _onstack)				\
+	do {								\
+		static __maybe_unused struct lock_class_key __key;	\
+									\
+		__INIT_WORK_KEY(_work, _func, _onstack, &__key);	\
+	} while (0)
+
 #define INIT_WORK(_work, _func)						\
 	__INIT_WORK((_work), (_func), 0)
 
 #define INIT_WORK_ONSTACK(_work, _func)					\
 	__INIT_WORK((_work), (_func), 1)
 
+#define INIT_WORK_ONSTACK_KEY(_work, _func, _key)			\
+	__INIT_WORK_KEY((_work), (_func), 1, _key)
+
 #define __INIT_DELAYED_WORK(_work, _func, _tflags)			\
 	do {								\
 		INIT_WORK(&(_work)->work, (_func));			\
@@ -693,8 +701,32 @@ static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
 	return fn(arg);
 }
 #else
-long work_on_cpu(int cpu, long (*fn)(void *), void *arg);
-long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg);
+long work_on_cpu_key(int cpu, long (*fn)(void *),
+		     void *arg, struct lock_class_key *key);
+/*
+ * A new key is defined for each caller to make sure the work
+ * associated with the function doesn't share its locking class.
+ */
+#define work_on_cpu(_cpu, _fn, _arg)			\
+({							\
+	static struct lock_class_key __key;		\
+							\
+	work_on_cpu_key(_cpu, _fn, _arg, &__key);	\
+})
+
+long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
+			  void *arg, struct lock_class_key *key);
+
+/*
+ * A new key is defined for each caller to make sure the work
+ * associated with the function doesn't share its locking class.
+ */
+#define work_on_cpu_safe(_cpu, _fn, _arg)		\
+({							\
+	static struct lock_class_key __key;		\
+							\
+	work_on_cpu_safe_key(_cpu, _fn, _arg, &__key);	\
+})
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FREEZER
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index fd41fdac0a8e..65e49fae8da7 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -162,8 +162,24 @@ struct scsi_device {
 				 * core. */
 	unsigned int eh_timeout; /* Error handling timeout */
 
-	bool manage_system_start_stop; /* Let HLD (sd) manage system start/stop */
-	bool manage_runtime_start_stop; /* Let HLD (sd) manage runtime start/stop */
+	/*
+	 * If true, let the high-level device driver (sd) manage the device
+	 * power state for system suspend/resume (suspend to RAM and
+	 * hibernation) operations.
+	 */
+	bool manage_system_start_stop;
+
+	/*
+	 * If true, let the high-level device driver (sd) manage the device
+	 * power state for runtime device suspend and resume operations.
+	 */
+	bool manage_runtime_start_stop;
+
+	/*
+	 * If true, let the high-level device driver (sd) manage the device
+	 * power state for system shutdown (power off) operations.
+	 */
+	bool manage_shutdown;
 
 	unsigned removable:1;
 	unsigned changed:1;	/* Data invalid due to media change */
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index b2db2c2f1c57..279a7a0c90c0 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -1561,7 +1561,6 @@ DECLARE_EVENT_CLASS(btrfs__work,
 		__field(	const void *,	wq			)
 		__field(	const void *,	func			)
 		__field(	const void *,	ordered_func		)
-		__field(	const void *,	ordered_free		)
 		__field(	const void *,	normal_work		)
 	),
 
@@ -1570,14 +1569,12 @@ DECLARE_EVENT_CLASS(btrfs__work,
 		__entry->wq		= work->wq;
 		__entry->func		= work->func;
 		__entry->ordered_func	= work->ordered_func;
-		__entry->ordered_free	= work->ordered_free;
 		__entry->normal_work	= &work->normal_work;
 	),
 
-	TP_printk_btrfs("work=%p (normal_work=%p) wq=%p func=%ps ordered_func=%p "
-		  "ordered_free=%p",
+	TP_printk_btrfs("work=%p (normal_work=%p) wq=%p func=%ps ordered_func=%p",
 		  __entry->work, __entry->normal_work, __entry->wq,
-		   __entry->func, __entry->ordered_func, __entry->ordered_free)
+		   __entry->func, __entry->ordered_func)
 );
 
 /*
@@ -2497,6 +2494,82 @@ DEFINE_EVENT(btrfs_raid56_bio, raid56_write,
 	TP_ARGS(rbio, bio, trace_info)
 );
 
+TRACE_EVENT(btrfs_insert_one_raid_extent,
+
+	TP_PROTO(const struct btrfs_fs_info *fs_info, u64 logical, u64 length,
+		 int num_stripes),
+
+	TP_ARGS(fs_info, logical, length, num_stripes),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,	logical		)
+		__field(	u64,	length		)
+		__field(	int,	num_stripes	)
+	),
+
+	TP_fast_assign_btrfs(fs_info,
+		__entry->logical	= logical;
+		__entry->length		= length;
+		__entry->num_stripes	= num_stripes;
+	),
+
+	TP_printk_btrfs("logical=%llu length=%llu num_stripes=%d",
+			__entry->logical, __entry->length,
+			__entry->num_stripes)
+);
+
+TRACE_EVENT(btrfs_raid_extent_delete,
+
+	TP_PROTO(const struct btrfs_fs_info *fs_info, u64 start, u64 end,
+		 u64 found_start, u64 found_end),
+
+	TP_ARGS(fs_info, start, end, found_start, found_end),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,	start		)
+		__field(	u64,	end		)
+		__field(	u64,	found_start	)
+		__field(	u64,	found_end	)
+	),
+
+	TP_fast_assign_btrfs(fs_info,
+		__entry->start		= start;
+		__entry->end		= end;
+		__entry->found_start	= found_start;
+		__entry->found_end	= found_end;
+	),
+
+	TP_printk_btrfs("start=%llu end=%llu found_start=%llu found_end=%llu",
+			__entry->start, __entry->end, __entry->found_start,
+			__entry->found_end)
+);
+
+TRACE_EVENT(btrfs_get_raid_extent_offset,
+
+	TP_PROTO(const struct btrfs_fs_info *fs_info, u64 logical, u64 length,
+		 u64 physical, u64 devid),
+
+	TP_ARGS(fs_info, logical, length, physical, devid),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,	logical		)
+		__field(	u64,	length		)
+		__field(	u64,	physical	)
+		__field(	u64,	devid		)
+	),
+
+	TP_fast_assign_btrfs(fs_info,
+		__entry->logical	= logical;
+		__entry->length		= length;
+		__entry->physical	= physical;
+		__entry->devid		= devid;
+	),
+
+	TP_printk_btrfs("logical=%llu length=%llu physical=%llu devid=%llu",
+			__entry->logical, __entry->length, __entry->physical,
+			__entry->devid)
+);
+
 #endif /* _TRACE_BTRFS_H */
 
 /* This part must be outside protection */
diff --git a/include/trace/events/csd.h b/include/trace/events/csd.h
index 67e9d01f80c2..58cc83b99c34 100644
--- a/include/trace/events/csd.h
+++ b/include/trace/events/csd.h
@@ -12,7 +12,7 @@ TRACE_EVENT(csd_queue_cpu,
 	TP_PROTO(const unsigned int cpu,
 		unsigned long callsite,
 		smp_call_func_t func,
-		struct __call_single_data *csd),
+		call_single_data_t *csd),
 
 	TP_ARGS(cpu, callsite, func, csd),
 
@@ -39,7 +39,7 @@ TRACE_EVENT(csd_queue_cpu,
  */
 DECLARE_EVENT_CLASS(csd_function,
 
-	TP_PROTO(smp_call_func_t func, struct __call_single_data *csd),
+	TP_PROTO(smp_call_func_t func, call_single_data_t *csd),
 
 	TP_ARGS(func, csd),
 
@@ -57,12 +57,12 @@ DECLARE_EVENT_CLASS(csd_function,
 );
 
 DEFINE_EVENT(csd_function, csd_function_entry,
-	TP_PROTO(smp_call_func_t func, struct __call_single_data *csd),
+	TP_PROTO(smp_call_func_t func, call_single_data_t *csd),
 	TP_ARGS(func, csd)
 );
 
 DEFINE_EVENT(csd_function, csd_function_exit,
-	TP_PROTO(smp_call_func_t func, struct __call_single_data *csd),
+	TP_PROTO(smp_call_func_t func, call_single_data_t *csd),
 	TP_ARGS(func, csd)
 );
 
diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index f8069ef2ee0f..718df1d9b834 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -1667,7 +1667,7 @@ TRACE_EVENT(svcrdma_encode_wseg,
 		__entry->offset = offset;
 	),
 
-	TP_printk("cq_id=%u cid=%d segno=%u %u@0x%016llx:0x%08x",
+	TP_printk("cq.id=%u cid=%d segno=%u %u@0x%016llx:0x%08x",
 		__entry->cq_id, __entry->completion_id,
 		__entry->segno, __entry->length,
 		(unsigned long long)__entry->offset, __entry->handle
@@ -1703,7 +1703,7 @@ TRACE_EVENT(svcrdma_decode_rseg,
 		__entry->offset = segment->rs_offset;
 	),
 
-	TP_printk("cq_id=%u cid=%d segno=%u position=%u %u@0x%016llx:0x%08x",
+	TP_printk("cq.id=%u cid=%d segno=%u position=%u %u@0x%016llx:0x%08x",
 		__entry->cq_id, __entry->completion_id,
 		__entry->segno, __entry->position, __entry->length,
 		(unsigned long long)__entry->offset, __entry->handle
@@ -1740,7 +1740,7 @@ TRACE_EVENT(svcrdma_decode_wseg,
 		__entry->offset = segment->rs_offset;
 	),
 
-	TP_printk("cq_id=%u cid=%d segno=%u %u@0x%016llx:0x%08x",
+	TP_printk("cq.id=%u cid=%d segno=%u %u@0x%016llx:0x%08x",
 		__entry->cq_id, __entry->completion_id,
 		__entry->segno, __entry->length,
 		(unsigned long long)__entry->offset, __entry->handle
@@ -1959,7 +1959,7 @@ TRACE_EVENT(svcrdma_send_pullup,
 		__entry->msglen = msglen;
 	),
 
-	TP_printk("cq_id=%u cid=%d hdr=%u msg=%u (total %u)",
+	TP_printk("cq.id=%u cid=%d hdr=%u msg=%u (total %u)",
 		__entry->cq_id, __entry->completion_id,
 		__entry->hdrlen, __entry->msglen,
 		__entry->hdrlen + __entry->msglen)
@@ -2014,7 +2014,7 @@ TRACE_EVENT(svcrdma_post_send,
 					wr->ex.invalidate_rkey : 0;
 	),
 
-	TP_printk("cq_id=%u cid=%d num_sge=%u inv_rkey=0x%08x",
+	TP_printk("cq.id=%u cid=%d num_sge=%u inv_rkey=0x%08x",
 		__entry->cq_id, __entry->completion_id,
 		__entry->num_sge, __entry->inv_rkey
 	)
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index fbb99a61f714..6188ad0d9e0d 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -664,6 +664,58 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa,
 	TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu)
 );
 
+#ifdef CONFIG_NUMA_BALANCING
+#define NUMAB_SKIP_REASON					\
+	EM( NUMAB_SKIP_UNSUITABLE,		"unsuitable" )	\
+	EM( NUMAB_SKIP_SHARED_RO,		"shared_ro" )	\
+	EM( NUMAB_SKIP_INACCESSIBLE,		"inaccessible" )	\
+	EM( NUMAB_SKIP_SCAN_DELAY,		"scan_delay" )	\
+	EM( NUMAB_SKIP_PID_INACTIVE,		"pid_inactive" )	\
+	EM( NUMAB_SKIP_IGNORE_PID,		"ignore_pid_inactive" )		\
+	EMe(NUMAB_SKIP_SEQ_COMPLETED,		"seq_completed" )
+
+/* Redefine for export. */
+#undef EM
+#undef EMe
+#define EM(a, b)	TRACE_DEFINE_ENUM(a);
+#define EMe(a, b)	TRACE_DEFINE_ENUM(a);
+
+NUMAB_SKIP_REASON
+
+/* Redefine for symbolic printing. */
+#undef EM
+#undef EMe
+#define EM(a, b)	{ a, b },
+#define EMe(a, b)	{ a, b }
+
+TRACE_EVENT(sched_skip_vma_numa,
+
+	TP_PROTO(struct mm_struct *mm, struct vm_area_struct *vma,
+		 enum numa_vmaskip_reason reason),
+
+	TP_ARGS(mm, vma, reason),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, numa_scan_offset)
+		__field(unsigned long, vm_start)
+		__field(unsigned long, vm_end)
+		__field(enum numa_vmaskip_reason, reason)
+	),
+
+	TP_fast_assign(
+		__entry->numa_scan_offset	= mm->numa_scan_offset;
+		__entry->vm_start		= vma->vm_start;
+		__entry->vm_end			= vma->vm_end;
+		__entry->reason			= reason;
+	),
+
+	TP_printk("numa_scan_offset=%lX vm_start=%lX vm_end=%lX reason=%s",
+		  __entry->numa_scan_offset,
+		  __entry->vm_start,
+		  __entry->vm_end,
+		  __print_symbolic(__entry->reason, NUMAB_SKIP_REASON))
+);
+#endif /* CONFIG_NUMA_BALANCING */
 
 /*
  * Tracepoint for waking a polling cpu without an IPI.
@@ -735,6 +787,11 @@ DECLARE_TRACE(sched_update_nr_running_tp,
 	TP_PROTO(struct rq *rq, int change),
 	TP_ARGS(rq, change));
 
+DECLARE_TRACE(sched_compute_energy_tp,
+	TP_PROTO(struct task_struct *p, int dst_cpu, unsigned long energy,
+		 unsigned long max_util, unsigned long busy_time),
+	TP_ARGS(p, dst_cpu, energy, max_util, busy_time));
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 6beb38c1dcb5..337c90787fb1 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -1677,7 +1677,6 @@ DEFINE_SVCXDRBUF_EVENT(sendto);
 	svc_rqst_flag(DROPME)						\
 	svc_rqst_flag(SPLICE_OK)					\
 	svc_rqst_flag(VICTIM)						\
-	svc_rqst_flag(BUSY)						\
 	svc_rqst_flag_end(DATA)
 
 #undef svc_rqst_flag
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index abe087c53b4b..d9e9cd13e577 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -822,9 +822,15 @@ __SYSCALL(__NR_cachestat, sys_cachestat)
 
 #define __NR_fchmodat2 452
 __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
+#define __NR_futex_wake 454
+__SYSCALL(__NR_futex_wake, sys_futex_wake)
+#define __NR_futex_wait 455
+__SYSCALL(__NR_futex_wait, sys_futex_wait)
+#define __NR_futex_requeue 456
+__SYSCALL(__NR_futex_requeue, sys_futex_requeue)
 
 #undef __NR_syscalls
-#define __NR_syscalls 453
+#define __NR_syscalls 457
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index dbb8b96da50d..7c29d82db9ee 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -333,6 +333,8 @@ struct btrfs_ioctl_fs_info_args {
 #define BTRFS_FEATURE_INCOMPAT_RAID1C34		(1ULL << 11)
 #define BTRFS_FEATURE_INCOMPAT_ZONED		(1ULL << 12)
 #define BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2	(1ULL << 13)
+#define BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE	(1ULL << 14)
+#define BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA	(1ULL << 16)
 
 struct btrfs_ioctl_feature_flags {
 	__u64 compat_flags;
@@ -753,6 +755,7 @@ struct btrfs_ioctl_get_dev_stats {
 #define BTRFS_QUOTA_CTL_ENABLE	1
 #define BTRFS_QUOTA_CTL_DISABLE	2
 #define BTRFS_QUOTA_CTL_RESCAN__NOTUSED	3
+#define BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA 4
 struct btrfs_ioctl_quota_ctl_args {
 	__u64 cmd;
 	__u64 status;
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index fc3c32186d7e..c25fc9614594 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -73,6 +73,9 @@
 /* Holds the block group items for extent tree v2. */
 #define BTRFS_BLOCK_GROUP_TREE_OBJECTID 11ULL
 
+/* Tracks RAID stripes in block groups. */
+#define BTRFS_RAID_STRIPE_TREE_OBJECTID 12ULL
+
 /* device stats in the device tree */
 #define BTRFS_DEV_STATS_OBJECTID 0ULL
 
@@ -231,6 +234,14 @@
 #define BTRFS_SHARED_DATA_REF_KEY	184
 
 /*
+ * Special inline ref key which stores the id of the subvolume which originally
+ * created the extent. This subvolume owns the extent permanently from the
+ * perspective of simple quotas. Needed to know which subvolume to free quota
+ * usage from when the extent is deleted.
+ */
+#define BTRFS_EXTENT_OWNER_REF_KEY	188
+
+/*
  * block groups give us hints into the extent allocation trees.  Which
  * blocks are free etc etc
  */
@@ -261,6 +272,8 @@
 #define BTRFS_DEV_ITEM_KEY	216
 #define BTRFS_CHUNK_ITEM_KEY	228
 
+#define BTRFS_RAID_STRIPE_KEY	230
+
 /*
  * Records the overall state of the qgroups.
  * There's only one instance of this key present,
@@ -719,6 +732,30 @@ struct btrfs_free_space_header {
 	__le64 num_bitmaps;
 } __attribute__ ((__packed__));
 
+struct btrfs_raid_stride {
+	/* The id of the device this raid extent lives on. */
+	__le64 devid;
+	/* The physical location on disk. */
+	__le64 physical;
+} __attribute__ ((__packed__));
+
+/* The stripe_extent::encoding, 1:1 mapping of enum btrfs_raid_types. */
+#define BTRFS_STRIPE_RAID0	1
+#define BTRFS_STRIPE_RAID1	2
+#define BTRFS_STRIPE_DUP	3
+#define BTRFS_STRIPE_RAID10	4
+#define BTRFS_STRIPE_RAID5	5
+#define BTRFS_STRIPE_RAID6	6
+#define BTRFS_STRIPE_RAID1C3	7
+#define BTRFS_STRIPE_RAID1C4	8
+
+struct btrfs_stripe_extent {
+	__u8 encoding;
+	__u8 reserved[7];
+	/* An array of raid strides this stripe is composed of. */
+	struct btrfs_raid_stride strides[];
+} __attribute__ ((__packed__));
+
 #define BTRFS_HEADER_FLAG_WRITTEN	(1ULL << 0)
 #define BTRFS_HEADER_FLAG_RELOC		(1ULL << 1)
 
@@ -787,6 +824,10 @@ struct btrfs_shared_data_ref {
 	__le32 count;
 } __attribute__ ((__packed__));
 
+struct btrfs_extent_owner_ref {
+	__le64 root_id;
+} __attribute__ ((__packed__));
+
 struct btrfs_extent_inline_ref {
 	__u8 type;
 	__le64 offset;
@@ -1204,9 +1245,17 @@ static inline __u16 btrfs_qgroup_level(__u64 qgroupid)
  */
 #define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT	(1ULL << 2)
 
+/*
+ * Whether or not this filesystem is using simple quotas.  Not exactly the
+ * incompat bit, because we support using simple quotas, disabling it, then
+ * going back to full qgroup quotas.
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE	(1ULL << 3)
+
 #define BTRFS_QGROUP_STATUS_FLAGS_MASK	(BTRFS_QGROUP_STATUS_FLAG_ON |		\
 					 BTRFS_QGROUP_STATUS_FLAG_RESCAN |	\
-					 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)
+					 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT | \
+					 BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE)
 
 #define BTRFS_QGROUP_STATUS_VERSION        1
 
@@ -1228,6 +1277,15 @@ struct btrfs_qgroup_status_item {
 	 * of the scan. It contains a logical address
 	 */
 	__le64 rescan;
+
+	/*
+	 * The generation when quotas were last enabled. Used by simple quotas to
+	 * avoid decrementing when freeing an extent that was written before
+	 * enable.
+	 *
+	 * Set only if flags contain BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE.
+	 */
+	__le64 enable_gen;
 } __attribute__ ((__packed__));
 
 struct btrfs_qgroup_info_item {
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index 9b731976ce2f..9417309b7230 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -140,7 +140,7 @@ typedef __s64	Elf64_Sxword;
 #define ELF64_ST_BIND(x)	ELF_ST_BIND(x)
 #define ELF64_ST_TYPE(x)	ELF_ST_TYPE(x)
 
-typedef struct dynamic {
+typedef struct {
   Elf32_Sword d_tag;
   union {
     Elf32_Sword	d_val;
diff --git a/include/uapi/linux/fscrypt.h b/include/uapi/linux/fscrypt.h
index fd1fb0d5389d..7a8f4c290187 100644
--- a/include/uapi/linux/fscrypt.h
+++ b/include/uapi/linux/fscrypt.h
@@ -71,7 +71,8 @@ struct fscrypt_policy_v2 {
 	__u8 contents_encryption_mode;
 	__u8 filenames_encryption_mode;
 	__u8 flags;
-	__u8 __reserved[4];
+	__u8 log2_data_unit_size;
+	__u8 __reserved[3];
 	__u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE];
 };
 
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index 71a5df8d2689..d2ee625ea189 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -44,10 +44,35 @@
 					 FUTEX_PRIVATE_FLAG)
 
 /*
- * Flags to specify the bit length of the futex word for futex2 syscalls.
- * Currently, only 32 is supported.
+ * Flags for futex2 syscalls.
+ *
+ * NOTE: these are not pure flags, they can also be seen as:
+ *
+ *   union {
+ *     u32  flags;
+ *     struct {
+ *       u32 size    : 2,
+ *           numa    : 1,
+ *                   : 4,
+ *           private : 1;
+ *     };
+ *   };
  */
-#define FUTEX_32		2
+#define FUTEX2_SIZE_U8		0x00
+#define FUTEX2_SIZE_U16		0x01
+#define FUTEX2_SIZE_U32		0x02
+#define FUTEX2_SIZE_U64		0x03
+#define FUTEX2_NUMA		0x04
+			/*	0x08 */
+			/*	0x10 */
+			/*	0x20 */
+			/*	0x40 */
+#define FUTEX2_PRIVATE		FUTEX_PRIVATE_FLAG
+
+#define FUTEX2_SIZE_MASK	0x03
+
+/* do not use */
+#define FUTEX_32		FUTEX2_SIZE_U32 /* historical accident :-( */
 
 /*
  * Max numbers of elements in a futex_waitv array
diff --git a/include/uapi/linux/nfsd_netlink.h b/include/uapi/linux/nfsd_netlink.h
new file mode 100644
index 000000000000..c8ae72466ee6
--- /dev/null
+++ b/include/uapi/linux/nfsd_netlink.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/nfsd.yaml */
+/* YNL-GEN uapi header */
+
+#ifndef _UAPI_LINUX_NFSD_H
+#define _UAPI_LINUX_NFSD_H
+
+#define NFSD_FAMILY_NAME	"nfsd"
+#define NFSD_FAMILY_VERSION	1
+
+enum {
+	NFSD_A_RPC_STATUS_XID = 1,
+	NFSD_A_RPC_STATUS_FLAGS,
+	NFSD_A_RPC_STATUS_PROG,
+	NFSD_A_RPC_STATUS_VERSION,
+	NFSD_A_RPC_STATUS_PROC,
+	NFSD_A_RPC_STATUS_SERVICE_TIME,
+	NFSD_A_RPC_STATUS_PAD,
+	NFSD_A_RPC_STATUS_SADDR4,
+	NFSD_A_RPC_STATUS_DADDR4,
+	NFSD_A_RPC_STATUS_SADDR6,
+	NFSD_A_RPC_STATUS_DADDR6,
+	NFSD_A_RPC_STATUS_SPORT,
+	NFSD_A_RPC_STATUS_DPORT,
+	NFSD_A_RPC_STATUS_COMPOUND_OPS,
+
+	__NFSD_A_RPC_STATUS_MAX,
+	NFSD_A_RPC_STATUS_MAX = (__NFSD_A_RPC_STATUS_MAX - 1)
+};
+
+enum {
+	NFSD_CMD_RPC_STATUS_GET = 1,
+
+	__NFSD_CMD_MAX,
+	NFSD_CMD_MAX = (__NFSD_CMD_MAX - 1)
+};
+
+#endif /* _UAPI_LINUX_NFSD_H */
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
index f2c4589d4dbf..90662385689b 100644
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -4,10 +4,6 @@
 
 #include <linux/types.h>
 
-struct sched_param {
-	int sched_priority;
-};
-
 #define SCHED_ATTR_SIZE_VER0	48	/* sizeof first published struct */
 #define SCHED_ATTR_SIZE_VER1	56	/* add: util_{min,max} */
 
diff --git a/init/Makefile b/init/Makefile
index ec557ada3c12..cbac576c57d6 100644
--- a/init/Makefile
+++ b/init/Makefile
@@ -60,4 +60,5 @@ include/generated/utsversion.h: FORCE
 $(obj)/version-timestamp.o: include/generated/utsversion.h
 CFLAGS_version-timestamp.o := -include include/generated/utsversion.h
 KASAN_SANITIZE_version-timestamp.o := n
+KCSAN_SANITIZE_version-timestamp.o := n
 GCOV_PROFILE_version-timestamp.o := n
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 5dfd30b13f48..5fdef94f0864 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -244,7 +244,7 @@ retry:
 	for (i = 0, p = fs_names; i < num_fs; i++, p += strlen(p)+1)
 		printk(" %s", p);
 	printk("\n");
-	panic("VFS: Unable to mount root fs on %s", b);
+	panic("VFS: Unable to mount root fs on \"%s\" or %s", pretty_name, b);
 out:
 	put_page(page);
 }
diff --git a/init/init_task.c b/init/init_task.c
index ff6c4b9bfe6b..f703116e0523 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -85,6 +85,7 @@ struct task_struct init_task
 	.nr_cpus_allowed= NR_CPUS,
 	.mm		= NULL,
 	.active_mm	= &init_mm,
+	.faults_disabled_mapping = NULL,
 	.restart_block	= {
 		.fn = do_no_restart_syscall,
 	},
diff --git a/init/version.c b/init/version.c
index f117921811b4..94c96f6fbfe6 100644
--- a/init/version.c
+++ b/init/version.c
@@ -21,10 +21,10 @@ static int __init early_hostname(char *arg)
 {
 	size_t bufsize = sizeof(init_uts_ns.name.nodename);
 	size_t maxlen  = bufsize - 1;
-	size_t arglen;
+	ssize_t arglen;
 
-	arglen = strlcpy(init_uts_ns.name.nodename, arg, bufsize);
-	if (arglen > maxlen) {
+	arglen = strscpy(init_uts_ns.name.nodename, arg, bufsize);
+	if (arglen < 0) {
 		pr_warn("hostname parameter exceeds %zd characters and will be truncated",
 			maxlen);
 	}
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index c53678875416..f04a43044d91 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -53,7 +53,6 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
 __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
 {
 	struct io_ring_ctx *ctx = f->private_data;
-	struct io_sq_data *sq = NULL;
 	struct io_overflow_cqe *ocqe;
 	struct io_rings *r = ctx->rings;
 	unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
@@ -64,6 +63,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
 	unsigned int cq_shift = 0;
 	unsigned int sq_shift = 0;
 	unsigned int sq_entries, cq_entries;
+	int sq_pid = -1, sq_cpu = -1;
 	bool has_lock;
 	unsigned int i;
 
@@ -143,13 +143,19 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
 	has_lock = mutex_trylock(&ctx->uring_lock);
 
 	if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
-		sq = ctx->sq_data;
-		if (!sq->thread)
-			sq = NULL;
+		struct io_sq_data *sq = ctx->sq_data;
+
+		if (mutex_trylock(&sq->lock)) {
+			if (sq->thread) {
+				sq_pid = task_pid_nr(sq->thread);
+				sq_cpu = task_cpu(sq->thread);
+			}
+			mutex_unlock(&sq->lock);
+		}
 	}
 
-	seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
-	seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
+	seq_printf(m, "SqThread:\t%d\n", sq_pid);
+	seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu);
 	seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
 	for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
 		struct file *f = io_file_from_index(&ctx->file_table, i);
diff --git a/io_uring/openclose.c b/io_uring/openclose.c
index e3fae26e025d..fb73adb89067 100644
--- a/io_uring/openclose.c
+++ b/io_uring/openclose.c
@@ -220,7 +220,6 @@ int io_close(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct files_struct *files = current->files;
 	struct io_close *close = io_kiocb_to_cmd(req, struct io_close);
-	struct fdtable *fdt;
 	struct file *file;
 	int ret = -EBADF;
 
@@ -230,13 +229,7 @@ int io_close(struct io_kiocb *req, unsigned int issue_flags)
 	}
 
 	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
-	if (close->fd >= fdt->max_fds) {
-		spin_unlock(&files->file_lock);
-		goto err;
-	}
-	file = rcu_dereference_protected(fdt->fd[close->fd],
-			lockdep_is_held(&files->file_lock));
+	file = files_lookup_fd_locked(files, close->fd);
 	if (!file || io_is_uring_fops(file)) {
 		spin_unlock(&files->file_lock);
 		goto err;
diff --git a/io_uring/rw.c b/io_uring/rw.c
index c8c822fa7980..8f68d5ad4564 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -339,7 +339,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	unsigned final_ret = io_fixup_rw_res(req, ret);
 
-	if (req->flags & REQ_F_CUR_POS)
+	if (ret >= 0 && req->flags & REQ_F_CUR_POS)
 		req->file->f_pos = rw->kiocb.ki_pos;
 	if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) {
 		if (!__io_complete_rw_common(req, ret)) {
@@ -913,15 +913,6 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
 		kiocb_start_write(kiocb);
 	kiocb->ki_flags |= IOCB_WRITE;
 
-	/*
-	 * For non-polled IO, set IOCB_DIO_CALLER_COMP, stating that our handler
-	 * groks deferring the completion to task context. This isn't
-	 * necessary and useful for polled IO as that can always complete
-	 * directly.
-	 */
-	if (!(kiocb->ki_flags & IOCB_HIPRI))
-		kiocb->ki_flags |= IOCB_DIO_CALLER_COMP;
-
 	if (likely(req->file->f_op->write_iter))
 		ret2 = call_write_iter(req->file, kiocb, &s->iter);
 	else if (req->file->f_op->write)
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index ba8215ed663a..5eea4dc0509e 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -302,7 +302,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
 	inode->i_mode = mode;
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
-	inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 
 	if (S_ISREG(mode)) {
 		struct mqueue_inode_info *info;
@@ -596,7 +596,7 @@ static int mqueue_create_attr(struct dentry *dentry, umode_t mode, void *arg)
 
 	put_ipc_ns(ipc_ns);
 	dir->i_size += DIRENT_SIZE;
-	dir->i_mtime = dir->i_atime = inode_set_ctime_current(dir);
+	simple_inode_init_ts(dir);
 
 	d_instantiate(dentry, inode);
 	dget(dentry);
@@ -618,7 +618,7 @@ static int mqueue_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = d_inode(dentry);
 
-	dir->i_mtime = dir->i_atime = inode_set_ctime_current(dir);
+	simple_inode_init_ts(dir);
 	dir->i_size -= DIRENT_SIZE;
 	drop_nlink(inode);
 	dput(dentry);
@@ -657,7 +657,7 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
 	if (ret <= 0)
 		return ret;
 
-	inode->i_atime = inode_set_ctime_current(inode);
+	inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
 	return ret;
 }
 
@@ -1163,7 +1163,7 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
 				goto out_unlock;
 			__do_notify(info);
 		}
-		inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+		simple_inode_init_ts(inode);
 	}
 out_unlock:
 	spin_unlock(&info->lock);
@@ -1257,7 +1257,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
 
 		msg_ptr = msg_get(info);
 
-		inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+		simple_inode_init_ts(inode);
 
 		/* There is now free space in queue. */
 		pipelined_receive(&wake_q, info);
@@ -1395,7 +1395,8 @@ retry:
 	if (notification == NULL) {
 		if (info->notify_owner == task_tgid(current)) {
 			remove_notification(info);
-			inode->i_atime = inode_set_ctime_current(inode);
+			inode_set_atime_to_ts(inode,
+					      inode_set_ctime_current(inode));
 		}
 	} else if (info->notify_owner != NULL) {
 		ret = -EBUSY;
@@ -1421,7 +1422,7 @@ retry:
 
 		info->notify_owner = get_pid(task_tgid(current));
 		info->notify_user_ns = get_user_ns(current_user_ns());
-		inode->i_atime = inode_set_ctime_current(inode);
+		inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
 	}
 	spin_unlock(&info->lock);
 out_fput:
@@ -1484,7 +1485,7 @@ static int do_mq_getsetattr(int mqdes, struct mq_attr *new, struct mq_attr *old)
 			f.file->f_flags &= ~O_NONBLOCK;
 		spin_unlock(&f.file->f_lock);
 
-		inode->i_atime = inode_set_ctime_current(inode);
+		inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
 	}
 
 	spin_unlock(&info->lock);
diff --git a/kernel/acct.c b/kernel/acct.c
index 1a9f929fe629..986c8214dabf 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -246,7 +246,7 @@ static int acct_on(struct filename *pathname)
 		filp_close(file, NULL);
 		return PTR_ERR(internal);
 	}
-	err = __mnt_want_write(internal);
+	err = mnt_get_write_access(internal);
 	if (err) {
 		mntput(internal);
 		kfree(acct);
@@ -271,7 +271,7 @@ static int acct_on(struct filename *pathname)
 	old = xchg(&ns->bacct, &acct->pin);
 	mutex_unlock(&acct->lock);
 	pin_kill(old);
-	__mnt_drop_write(mnt);
+	mnt_put_write_access(mnt);
 	mntput(mnt);
 	return 0;
 }
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e867c17d3f84..85a5b306733b 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -34,7 +34,7 @@ struct audit_chunk {
 		struct list_head list;
 		struct audit_tree *owner;
 		unsigned index;		/* index; upper bit indicates 'will prune' */
-	} owners[];
+	} owners[] __counted_by(count);
 };
 
 struct audit_tree_mark {
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 65075f1e4ac8..91e82e34b51e 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -527,11 +527,18 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
 	unsigned long ino;
 	dev_t dev;
 
-	exe_file = get_task_exe_file(tsk);
+	/* only do exe filtering if we are recording @current events/records */
+	if (tsk != current)
+		return 0;
+
+	if (WARN_ON_ONCE(!current->mm))
+		return 0;
+	exe_file = get_mm_exe_file(current->mm);
 	if (!exe_file)
 		return 0;
 	ino = file_inode(exe_file)->i_ino;
 	dev = file_inode(exe_file)->i_sb->s_dev;
 	fput(exe_file);
+
 	return audit_mark_compare(mark, ino, dev);
 }
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 99d0625b6c82..1aafb2ff2e95 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -118,8 +118,7 @@ static struct inode *bpf_get_inode(struct super_block *sb,
 		return ERR_PTR(-ENOSPC);
 
 	inode->i_ino = get_next_ino();
-	inode->i_atime = inode_set_ctime_current(inode);
-	inode->i_mtime = inode->i_atime;
+	simple_inode_init_ts(inode);
 
 	inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
 
@@ -147,7 +146,7 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
 	d_instantiate(dentry, inode);
 	dget(dentry);
 
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 }
 
 static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 59e747938bdb..654601dd6b49 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -290,11 +290,9 @@ again:
 	rcu_read_lock();
 	for (;; curr_fd++) {
 		struct file *f;
-		f = task_lookup_next_fd_rcu(curr_task, &curr_fd);
+		f = task_lookup_next_fdget_rcu(curr_task, &curr_fd);
 		if (!f)
 			break;
-		if (!get_file_rcu(f))
-			continue;
 
 		/* set info->fd */
 		info->fd = curr_fd;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index b6d64f3b8888..484adb375b15 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -207,6 +207,8 @@ static u16 have_exit_callback __read_mostly;
 static u16 have_release_callback __read_mostly;
 static u16 have_canfork_callback __read_mostly;
 
+static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);
+
 /* cgroup namespace for init task */
 struct cgroup_namespace init_cgroup_ns = {
 	.ns.count	= REFCOUNT_INIT(2),
@@ -1350,7 +1352,9 @@ static void cgroup_destroy_root(struct cgroup_root *root)
 		cgroup_root_count--;
 	}
 
-	cgroup_favor_dynmods(root, false);
+	if (!have_favordynmods)
+		cgroup_favor_dynmods(root, false);
+
 	cgroup_exit_root_id(root);
 
 	cgroup_unlock();
@@ -1719,20 +1723,22 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
 
 	if (!css->ss) {
 		if (cgroup_on_dfl(cgrp)) {
-			ret = cgroup_addrm_files(&cgrp->self, cgrp,
+			ret = cgroup_addrm_files(css, cgrp,
 						 cgroup_base_files, true);
 			if (ret < 0)
 				return ret;
 
 			if (cgroup_psi_enabled()) {
-				ret = cgroup_addrm_files(&cgrp->self, cgrp,
+				ret = cgroup_addrm_files(css, cgrp,
 							 cgroup_psi_files, true);
 				if (ret < 0)
 					return ret;
 			}
 		} else {
-			cgroup_addrm_files(css, cgrp,
-					   cgroup1_base_files, true);
+			ret = cgroup_addrm_files(css, cgrp,
+						 cgroup1_base_files, true);
+			if (ret < 0)
+				return ret;
 		}
 	} else {
 		list_for_each_entry(cfts, &css->ss->cfts, node) {
@@ -2243,9 +2249,9 @@ static int cgroup_init_fs_context(struct fs_context *fc)
 	fc->user_ns = get_user_ns(ctx->ns->user_ns);
 	fc->global = true;
 
-#ifdef CONFIG_CGROUP_FAVOR_DYNMODS
-	ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
-#endif
+	if (have_favordynmods)
+		ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+
 	return 0;
 }
 
@@ -6127,7 +6133,7 @@ int __init cgroup_init(void)
 
 		if (cgroup1_ssid_disabled(ssid))
 			pr_info("Disabling %s control group subsystem in v1 mounts\n",
-				ss->name);
+				ss->legacy_name);
 
 		cgrp_dfl_root.subsys_mask |= 1 << ss->id;
 
@@ -6770,6 +6776,12 @@ static int __init enable_cgroup_debug(char *str)
 }
 __setup("cgroup_debug", enable_cgroup_debug);
 
+static int __init cgroup_favordynmods_setup(char *str)
+{
+	return (kstrtobool(str, &have_favordynmods) == 0);
+}
+__setup("cgroup_favordynmods=", cgroup_favordynmods_setup);
+
 /**
  * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
  * @dentry: directory dentry of interest
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 58ec88efa4f8..615daaf87f1f 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -75,16 +75,18 @@ enum prs_errcode {
 	PERR_NOCPUS,
 	PERR_HOTPLUG,
 	PERR_CPUSEMPTY,
+	PERR_HKEEPING,
 };
 
 static const char * const perr_strings[] = {
-	[PERR_INVCPUS]   = "Invalid cpu list in cpuset.cpus",
+	[PERR_INVCPUS]   = "Invalid cpu list in cpuset.cpus.exclusive",
 	[PERR_INVPARENT] = "Parent is an invalid partition root",
 	[PERR_NOTPART]   = "Parent is not a partition root",
 	[PERR_NOTEXCL]   = "Cpu list in cpuset.cpus not exclusive",
 	[PERR_NOCPUS]    = "Parent unable to distribute cpu downstream",
 	[PERR_HOTPLUG]   = "No cpu available due to hotplug",
 	[PERR_CPUSEMPTY] = "cpuset.cpus is empty",
+	[PERR_HKEEPING]  = "partition config conflicts with housekeeping setup",
 };
 
 struct cpuset {
@@ -121,14 +123,23 @@ struct cpuset {
 	nodemask_t effective_mems;
 
 	/*
-	 * CPUs allocated to child sub-partitions (default hierarchy only)
-	 * - CPUs granted by the parent = effective_cpus U subparts_cpus
-	 * - effective_cpus and subparts_cpus are mutually exclusive.
+	 * Exclusive CPUs dedicated to current cgroup (default hierarchy only)
 	 *
-	 * effective_cpus contains only onlined CPUs, but subparts_cpus
-	 * may have offlined ones.
+	 * These exclusive CPUs must be a subset of cpus_allowed. A parent
+	 * cgroup can only grant exclusive CPUs to one of its children.
+	 *
+	 * When the cgroup becomes a valid partition root, effective_xcpus
+	 * defaults to cpus_allowed if not set. The effective_cpus of a valid
+	 * partition root comes solely from its effective_xcpus and some of the
+	 * effective_xcpus may be distributed to sub-partitions below & hence
+	 * excluded from its effective_cpus.
+	 */
+	cpumask_var_t effective_xcpus;
+
+	/*
+	 * Exclusive CPUs as requested by the user (default hierarchy only)
 	 */
-	cpumask_var_t subparts_cpus;
+	cpumask_var_t exclusive_cpus;
 
 	/*
 	 * This is old Memory Nodes tasks took on.
@@ -156,8 +167,8 @@ struct cpuset {
 	/* for custom sched domain */
 	int relax_domain_level;
 
-	/* number of CPUs in subparts_cpus */
-	int nr_subparts_cpus;
+	/* number of valid sub-partitions */
+	int nr_subparts;
 
 	/* partition root state */
 	int partition_root_state;
@@ -183,9 +194,20 @@ struct cpuset {
 
 	/* Handle for cpuset.cpus.partition */
 	struct cgroup_file partition_file;
+
+	/* Remote partition sibling list anchored at remote_children */
+	struct list_head remote_sibling;
 };
 
 /*
+ * Exclusive CPUs distributed out to sub-partitions of top_cpuset
+ */
+static cpumask_var_t	subpartitions_cpus;
+
+/* List of remote partition root children */
+static struct list_head remote_children;
+
+/*
  * Partition root states:
  *
  *   0 - member (not a partition root)
@@ -312,7 +334,7 @@ static inline int is_partition_invalid(const struct cpuset *cs)
  */
 static inline void make_partition_invalid(struct cpuset *cs)
 {
-	if (is_partition_valid(cs))
+	if (cs->partition_root_state > 0)
 		cs->partition_root_state = -cs->partition_root_state;
 }
 
@@ -334,6 +356,7 @@ static struct cpuset top_cpuset = {
 	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
 		  (1 << CS_MEM_EXCLUSIVE)),
 	.partition_root_state = PRS_ROOT,
+	.remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling),
 };
 
 /**
@@ -469,7 +492,7 @@ static inline bool partition_is_populated(struct cpuset *cs,
 
 	if (cs->css.cgroup->nr_populated_csets)
 		return true;
-	if (!excluded_child && !cs->nr_subparts_cpus)
+	if (!excluded_child && !cs->nr_subparts)
 		return cgroup_is_populated(cs->css.cgroup);
 
 	rcu_read_lock();
@@ -596,16 +619,18 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
  */
 static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 {
-	cpumask_var_t *pmask1, *pmask2, *pmask3;
+	cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4;
 
 	if (cs) {
 		pmask1 = &cs->cpus_allowed;
 		pmask2 = &cs->effective_cpus;
-		pmask3 = &cs->subparts_cpus;
+		pmask3 = &cs->effective_xcpus;
+		pmask4 = &cs->exclusive_cpus;
 	} else {
 		pmask1 = &tmp->new_cpus;
 		pmask2 = &tmp->addmask;
 		pmask3 = &tmp->delmask;
+		pmask4 = NULL;
 	}
 
 	if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
@@ -617,8 +642,14 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 	if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
 		goto free_two;
 
+	if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL))
+		goto free_three;
+
+
 	return 0;
 
+free_three:
+	free_cpumask_var(*pmask3);
 free_two:
 	free_cpumask_var(*pmask2);
 free_one:
@@ -636,7 +667,8 @@ static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 	if (cs) {
 		free_cpumask_var(cs->cpus_allowed);
 		free_cpumask_var(cs->effective_cpus);
-		free_cpumask_var(cs->subparts_cpus);
+		free_cpumask_var(cs->effective_xcpus);
+		free_cpumask_var(cs->exclusive_cpus);
 	}
 	if (tmp) {
 		free_cpumask_var(tmp->new_cpus);
@@ -664,6 +696,8 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
 
 	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
 	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
+	cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
+	cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
 	return trial;
 }
 
@@ -677,6 +711,28 @@ static inline void free_cpuset(struct cpuset *cs)
 	kfree(cs);
 }
 
+static inline struct cpumask *fetch_xcpus(struct cpuset *cs)
+{
+	return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus :
+	       cpumask_empty(cs->effective_xcpus) ? cs->cpus_allowed
+						  : cs->effective_xcpus;
+}
+
+/*
+ * cpusets_are_exclusive() - check if two cpusets are exclusive
+ *
+ * Return true if exclusive, false if not
+ */
+static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
+{
+	struct cpumask *xcpus1 = fetch_xcpus(cs1);
+	struct cpumask *xcpus2 = fetch_xcpus(cs2);
+
+	if (cpumask_intersects(xcpus1, xcpus2))
+		return false;
+	return true;
+}
+
 /*
  * validate_change_legacy() - Validate conditions specific to legacy (v1)
  *                            behavior.
@@ -776,9 +832,10 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 	ret = -EINVAL;
 	cpuset_for_each_child(c, css, par) {
 		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
-		    c != cur &&
-		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
-			goto out;
+		    c != cur) {
+			if (!cpusets_are_exclusive(trial, c))
+				goto out;
+		}
 		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
 		    c != cur &&
 		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
@@ -908,7 +965,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	csa = NULL;
 
 	/* Special case for the 99% of systems with one, full, sched domain */
-	if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
+	if (root_load_balance && !top_cpuset.nr_subparts) {
 		ndoms = 1;
 		doms = alloc_sched_domains(ndoms);
 		if (!doms)
@@ -1159,7 +1216,7 @@ static void rebuild_sched_domains_locked(void)
 	 * should be the same as the active CPUs, so checking only top_cpuset
 	 * is enough to detect racing CPU offlines.
 	 */
-	if (!top_cpuset.nr_subparts_cpus &&
+	if (cpumask_empty(subpartitions_cpus) &&
 	    !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
 		return;
 
@@ -1168,7 +1225,7 @@ static void rebuild_sched_domains_locked(void)
 	 * root should be only a subset of the active CPUs.  Since a CPU in any
 	 * partition root could be offlined, all must be checked.
 	 */
-	if (top_cpuset.nr_subparts_cpus) {
+	if (top_cpuset.nr_subparts) {
 		rcu_read_lock();
 		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
 			if (!is_partition_valid(cs)) {
@@ -1232,7 +1289,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
 			 */
 			if (kthread_is_per_cpu(task))
 				continue;
-			cpumask_andnot(new_cpus, possible_mask, cs->subparts_cpus);
+			cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
 		} else {
 			cpumask_and(new_cpus, possible_mask, cs->effective_cpus);
 		}
@@ -1247,32 +1304,22 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
  * @cs: the cpuset the need to recompute the new effective_cpus mask
  * @parent: the parent cpuset
  *
- * If the parent has subpartition CPUs, include them in the list of
- * allowable CPUs in computing the new effective_cpus mask. Since offlined
- * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
- * to mask those out.
+ * The result is valid only if the given cpuset isn't a partition root.
  */
 static void compute_effective_cpumask(struct cpumask *new_cpus,
 				      struct cpuset *cs, struct cpuset *parent)
 {
-	if (parent->nr_subparts_cpus && is_partition_valid(cs)) {
-		cpumask_or(new_cpus, parent->effective_cpus,
-			   parent->subparts_cpus);
-		cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
-		cpumask_and(new_cpus, new_cpus, cpu_active_mask);
-	} else {
-		cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
-	}
+	cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
 }
 
 /*
- * Commands for update_parent_subparts_cpumask
+ * Commands for update_parent_effective_cpumask
  */
-enum subparts_cmd {
-	partcmd_enable,		/* Enable partition root	 */
-	partcmd_disable,	/* Disable partition root	 */
-	partcmd_update,		/* Update parent's subparts_cpus */
-	partcmd_invalidate,	/* Make partition invalid	 */
+enum partition_cmd {
+	partcmd_enable,		/* Enable partition root	  */
+	partcmd_disable,	/* Disable partition root	  */
+	partcmd_update,		/* Update parent's effective_cpus */
+	partcmd_invalidate,	/* Make partition invalid	  */
 };
 
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1304,13 +1351,23 @@ static int update_partition_exclusive(struct cpuset *cs, int new_prs)
  *
  * Changing load balance flag will automatically call
  * rebuild_sched_domains_locked().
+ * This function is for cgroup v2 only.
  */
 static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
 {
 	int new_prs = cs->partition_root_state;
-	bool new_lb = (new_prs != PRS_ISOLATED);
 	bool rebuild_domains = (new_prs > 0) || (old_prs > 0);
+	bool new_lb;
 
+	/*
+	 * If cs is not a valid partition root, the load balance state
+	 * will follow its parent.
+	 */
+	if (new_prs > 0) {
+		new_lb = (new_prs != PRS_ISOLATED);
+	} else {
+		new_lb = is_sched_load_balance(parent_cs(cs));
+	}
 	if (new_lb != !!is_sched_load_balance(cs)) {
 		rebuild_domains = true;
 		if (new_lb)
@@ -1323,8 +1380,296 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
 		rebuild_sched_domains_locked();
 }
 
+/*
+ * tasks_nocpu_error - Return true if tasks will have no effective_cpus
+ */
+static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs,
+			      struct cpumask *xcpus)
+{
+	/*
+	 * A populated partition (cs or parent) can't have empty effective_cpus
+	 */
+	return (cpumask_subset(parent->effective_cpus, xcpus) &&
+		partition_is_populated(parent, cs)) ||
+	       (!cpumask_intersects(xcpus, cpu_active_mask) &&
+		partition_is_populated(cs, NULL));
+}
+
+static void reset_partition_data(struct cpuset *cs)
+{
+	struct cpuset *parent = parent_cs(cs);
+
+	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+		return;
+
+	lockdep_assert_held(&callback_lock);
+
+	cs->nr_subparts = 0;
+	if (cpumask_empty(cs->exclusive_cpus)) {
+		cpumask_clear(cs->effective_xcpus);
+		if (is_cpu_exclusive(cs))
+			clear_bit(CS_CPU_EXCLUSIVE, &cs->flags);
+	}
+	if (!cpumask_and(cs->effective_cpus,
+			 parent->effective_cpus, cs->cpus_allowed)) {
+		cs->use_parent_ecpus = true;
+		parent->child_ecpus_count++;
+		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
+	}
+}
+
+/*
+ * compute_effective_exclusive_cpumask - compute effective exclusive CPUs
+ * @cs: cpuset
+ * @xcpus: effective exclusive CPUs value to be set
+ * Return: true if xcpus is not empty, false otherwise.
+ *
+ * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set),
+ * it must be a subset of cpus_allowed and parent's effective_xcpus.
+ */
+static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
+						struct cpumask *xcpus)
+{
+	struct cpuset *parent = parent_cs(cs);
+
+	if (!xcpus)
+		xcpus = cs->effective_xcpus;
+
+	if (!cpumask_empty(cs->exclusive_cpus))
+		cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed);
+	else
+		cpumask_copy(xcpus, cs->cpus_allowed);
+
+	return cpumask_and(xcpus, xcpus, parent->effective_xcpus);
+}
+
+static inline bool is_remote_partition(struct cpuset *cs)
+{
+	return !list_empty(&cs->remote_sibling);
+}
+
+static inline bool is_local_partition(struct cpuset *cs)
+{
+	return is_partition_valid(cs) && !is_remote_partition(cs);
+}
+
+/*
+ * remote_partition_enable - Enable current cpuset as a remote partition root
+ * @cs: the cpuset to update
+ * @tmp: temporary masks
+ * Return: 1 if successful, 0 if error
+ *
+ * Enable the current cpuset to become a remote partition root taking CPUs
+ * directly from the top cpuset. cpuset_mutex must be held by the caller.
+ */
+static int remote_partition_enable(struct cpuset *cs, struct tmpmasks *tmp)
+{
+	/*
+	 * The user must have sysadmin privilege.
+	 */
+	if (!capable(CAP_SYS_ADMIN))
+		return 0;
+
+	/*
+	 * The requested exclusive_cpus must not be allocated to other
+	 * partitions and it can't use up all the root's effective_cpus.
+	 *
+	 * Note that if there is any local partition root above it or
+	 * remote partition root underneath it, its exclusive_cpus must
+	 * have overlapped with subpartitions_cpus.
+	 */
+	compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
+	if (cpumask_empty(tmp->new_cpus) ||
+	    cpumask_intersects(tmp->new_cpus, subpartitions_cpus) ||
+	    cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
+		return 0;
+
+	spin_lock_irq(&callback_lock);
+	cpumask_andnot(top_cpuset.effective_cpus,
+		       top_cpuset.effective_cpus, tmp->new_cpus);
+	cpumask_or(subpartitions_cpus,
+		   subpartitions_cpus, tmp->new_cpus);
+
+	if (cs->use_parent_ecpus) {
+		struct cpuset *parent = parent_cs(cs);
+
+		cs->use_parent_ecpus = false;
+		parent->child_ecpus_count--;
+	}
+	list_add(&cs->remote_sibling, &remote_children);
+	spin_unlock_irq(&callback_lock);
+
+	/*
+	 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
+	 */
+	update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
+	update_sibling_cpumasks(&top_cpuset, NULL, tmp);
+
+	return 1;
+}
+
+/*
+ * remote_partition_disable - Remove current cpuset from remote partition list
+ * @cs: the cpuset to update
+ * @tmp: temporary masks
+ *
+ * The effective_cpus is also updated.
+ *
+ * cpuset_mutex must be held by the caller.
+ */
+static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
+{
+	compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
+	WARN_ON_ONCE(!is_remote_partition(cs));
+	WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus));
+
+	spin_lock_irq(&callback_lock);
+	cpumask_andnot(subpartitions_cpus,
+		       subpartitions_cpus, tmp->new_cpus);
+	cpumask_and(tmp->new_cpus,
+		    tmp->new_cpus, cpu_active_mask);
+	cpumask_or(top_cpuset.effective_cpus,
+		   top_cpuset.effective_cpus, tmp->new_cpus);
+	list_del_init(&cs->remote_sibling);
+	cs->partition_root_state = -cs->partition_root_state;
+	if (!cs->prs_err)
+		cs->prs_err = PERR_INVCPUS;
+	reset_partition_data(cs);
+	spin_unlock_irq(&callback_lock);
+
+	/*
+	 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
+	 */
+	update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
+	update_sibling_cpumasks(&top_cpuset, NULL, tmp);
+}
+
+/*
+ * remote_cpus_update - cpus_exclusive change of remote partition
+ * @cs: the cpuset to be updated
+ * @newmask: the new effective_xcpus mask
+ * @tmp: temporary masks
+ *
+ * top_cpuset and subpartitions_cpus will be updated or partition can be
+ * invalidated.
+ */
+static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
+			       struct tmpmasks *tmp)
+{
+	bool adding, deleting;
+
+	if (WARN_ON_ONCE(!is_remote_partition(cs)))
+		return;
+
+	WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));
+
+	if (cpumask_empty(newmask))
+		goto invalidate;
+
+	adding   = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus);
+	deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask);
+
+	/*
+	 * Addition of remote CPUs is only allowed if those CPUs are
+	 * not allocated to other partitions and there are effective_cpus
+	 * left in the top cpuset.
+	 */
+	if (adding && (!capable(CAP_SYS_ADMIN) ||
+		       cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
+		       cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)))
+		goto invalidate;
+
+	spin_lock_irq(&callback_lock);
+	if (adding) {
+		cpumask_or(subpartitions_cpus,
+			   subpartitions_cpus, tmp->addmask);
+		cpumask_andnot(top_cpuset.effective_cpus,
+			       top_cpuset.effective_cpus, tmp->addmask);
+	}
+	if (deleting) {
+		cpumask_andnot(subpartitions_cpus,
+			       subpartitions_cpus, tmp->delmask);
+		cpumask_and(tmp->delmask,
+			    tmp->delmask, cpu_active_mask);
+		cpumask_or(top_cpuset.effective_cpus,
+			   top_cpuset.effective_cpus, tmp->delmask);
+	}
+	spin_unlock_irq(&callback_lock);
+
+	/*
+	 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
+	 */
+	update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
+	update_sibling_cpumasks(&top_cpuset, NULL, tmp);
+	return;
+
+invalidate:
+	remote_partition_disable(cs, tmp);
+}
+
+/*
+ * remote_partition_check - check if a child remote partition needs update
+ * @cs: the cpuset to be updated
+ * @newmask: the new effective_xcpus mask
+ * @delmask: temporary mask for deletion (not in tmp)
+ * @tmp: temporary masks
+ *
+ * This should be called before the given cs has updated its cpus_allowed
+ * and/or effective_xcpus.
+ */
+static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
+				   struct cpumask *delmask, struct tmpmasks *tmp)
+{
+	struct cpuset *child, *next;
+	int disable_cnt = 0;
+
+	/*
+	 * Compute the effective exclusive CPUs that will be deleted.
+	 */
+	if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) ||
+	    !cpumask_intersects(delmask, subpartitions_cpus))
+		return;	/* No deletion of exclusive CPUs in partitions */
+
+	/*
+	 * Search the remote children list for those that will
+	 * be impacted by the deletion of exclusive CPUs.
+	 *
+	 * Since a cpuset must be removed from the remote children list
+	 * before it can go offline and holding cpuset_mutex will prevent
+	 * any change in cpuset status, RCU read lock isn't needed.
+	 */
+	lockdep_assert_held(&cpuset_mutex);
+	list_for_each_entry_safe(child, next, &remote_children, remote_sibling)
+		if (cpumask_intersects(child->effective_cpus, delmask)) {
+			remote_partition_disable(child, tmp);
+			disable_cnt++;
+		}
+	if (disable_cnt)
+		rebuild_sched_domains_locked();
+}
+
+/*
+ * prstate_housekeeping_conflict - check for partition & housekeeping conflicts
+ * @prstate: partition root state to be checked
+ * @new_cpus: cpu mask
+ * Return: true if there is conflict, false otherwise
+ *
+ * CPUs outside of housekeeping_cpumask(HK_TYPE_DOMAIN) can only be used in
+ * an isolated partition.
+ */
+static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
+{
+	const struct cpumask *hk_domain = housekeeping_cpumask(HK_TYPE_DOMAIN);
+	bool all_in_hk = cpumask_subset(new_cpus, hk_domain);
+
+	if (!all_in_hk && (prstate != PRS_ISOLATED))
+		return true;
+
+	return false;
+}
+
 /**
- * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
+ * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset
  * @cs:      The cpuset that requests change in partition root state
  * @cmd:     Partition root state change command
  * @newmask: Optional new cpumask for partcmd_update
@@ -1332,21 +1677,20 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
  * Return:   0 or a partition root state error code
  *
  * For partcmd_enable, the cpuset is being transformed from a non-partition
- * root to a partition root. The cpus_allowed mask of the given cpuset will
- * be put into parent's subparts_cpus and taken away from parent's
+ * root to a partition root. The effective_xcpus (cpus_allowed if effective_xcpus
+ * not set) mask of the given cpuset will be taken away from parent's
  * effective_cpus. The function will return 0 if all the CPUs listed in
- * cpus_allowed can be granted or an error code will be returned.
+ * effective_xcpus can be granted or an error code will be returned.
  *
  * For partcmd_disable, the cpuset is being transformed from a partition
- * root back to a non-partition root. Any CPUs in cpus_allowed that are in
- * parent's subparts_cpus will be taken away from that cpumask and put back
- * into parent's effective_cpus. 0 will always be returned.
+ * root back to a non-partition root. Any CPUs in effective_xcpus will be
+ * given back to parent's effective_cpus. 0 will always be returned.
  *
  * For partcmd_update, if the optional newmask is specified, the cpu list is
- * to be changed from cpus_allowed to newmask. Otherwise, cpus_allowed is
+ * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is
  * assumed to remain the same. The cpuset should either be a valid or invalid
  * partition root. The partition root state may change from valid to invalid
- * or vice versa. An error code will only be returned if transitioning from
+ * or vice versa. An error code will be returned if transitioning from
  * invalid to valid violates the exclusivity rule.
  *
  * For partcmd_invalidate, the current partition will be made invalid.
@@ -1361,19 +1705,48 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
  * check for error and so partition_root_state and prs_error will be updated
  * directly.
  */
-static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
-					  struct cpumask *newmask,
-					  struct tmpmasks *tmp)
+static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
+					   struct cpumask *newmask,
+					   struct tmpmasks *tmp)
 {
 	struct cpuset *parent = parent_cs(cs);
-	int adding;	/* Moving cpus from effective_cpus to subparts_cpus */
-	int deleting;	/* Moving cpus from subparts_cpus to effective_cpus */
+	int adding;	/* Adding cpus to parent's effective_cpus	*/
+	int deleting;	/* Deleting cpus from parent's effective_cpus	*/
 	int old_prs, new_prs;
 	int part_error = PERR_NONE;	/* Partition error? */
+	int subparts_delta = 0;
+	struct cpumask *xcpus;		/* cs effective_xcpus */
+	bool nocpu;
 
 	lockdep_assert_held(&cpuset_mutex);
 
 	/*
+	 * new_prs will only be changed for the partcmd_update and
+	 * partcmd_invalidate commands.
+	 */
+	adding = deleting = false;
+	old_prs = new_prs = cs->partition_root_state;
+	xcpus = !cpumask_empty(cs->exclusive_cpus)
+		? cs->effective_xcpus : cs->cpus_allowed;
+
+	if (cmd == partcmd_invalidate) {
+		if (is_prs_invalid(old_prs))
+			return 0;
+
+		/*
+		 * Make the current partition invalid.
+		 */
+		if (is_partition_valid(parent))
+			adding = cpumask_and(tmp->addmask,
+					     xcpus, parent->effective_xcpus);
+		if (old_prs > 0) {
+			new_prs = -old_prs;
+			subparts_delta--;
+		}
+		goto write_error;
+	}
+
+	/*
 	 * The parent must be a partition root.
 	 * The new cpumask, if present, or the current cpus_allowed must
 	 * not be empty.
@@ -1385,124 +1758,138 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
 	if (!newmask && cpumask_empty(cs->cpus_allowed))
 		return PERR_CPUSEMPTY;
 
-	/*
-	 * new_prs will only be changed for the partcmd_update and
-	 * partcmd_invalidate commands.
-	 */
-	adding = deleting = false;
-	old_prs = new_prs = cs->partition_root_state;
+	nocpu = tasks_nocpu_error(parent, cs, xcpus);
+
 	if (cmd == partcmd_enable) {
 		/*
-		 * Enabling partition root is not allowed if cpus_allowed
-		 * doesn't overlap parent's cpus_allowed.
+		 * Enabling partition root is not allowed if its
+		 * effective_xcpus is empty or doesn't overlap with
+		 * parent's effective_xcpus.
 		 */
-		if (!cpumask_intersects(cs->cpus_allowed, parent->cpus_allowed))
+		if (cpumask_empty(xcpus) ||
+		    !cpumask_intersects(xcpus, parent->effective_xcpus))
 			return PERR_INVCPUS;
 
+		if (prstate_housekeeping_conflict(new_prs, xcpus))
+			return PERR_HKEEPING;
+
 		/*
 		 * A parent can be left with no CPU as long as there is no
 		 * task directly associated with the parent partition.
 		 */
-		if (cpumask_subset(parent->effective_cpus, cs->cpus_allowed) &&
-		    partition_is_populated(parent, cs))
+		if (nocpu)
 			return PERR_NOCPUS;
 
-		cpumask_copy(tmp->addmask, cs->cpus_allowed);
-		adding = true;
+		cpumask_copy(tmp->delmask, xcpus);
+		deleting = true;
+		subparts_delta++;
 	} else if (cmd == partcmd_disable) {
 		/*
-		 * Need to remove cpus from parent's subparts_cpus for valid
-		 * partition root.
+		 * May need to add cpus to parent's effective_cpus for
+		 * valid partition root.
 		 */
-		deleting = !is_prs_invalid(old_prs) &&
-			   cpumask_and(tmp->delmask, cs->cpus_allowed,
-				       parent->subparts_cpus);
-	} else if (cmd == partcmd_invalidate) {
-		if (is_prs_invalid(old_prs))
-			return 0;
-
+		adding = !is_prs_invalid(old_prs) &&
+			  cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus);
+		if (adding)
+			subparts_delta--;
+	} else if (newmask) {
 		/*
-		 * Make the current partition invalid. It is assumed that
-		 * invalidation is caused by violating cpu exclusivity rule.
+		 * Empty cpumask is not allowed
 		 */
-		deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
-				       parent->subparts_cpus);
-		if (old_prs > 0) {
-			new_prs = -old_prs;
-			part_error = PERR_NOTEXCL;
+		if (cpumask_empty(newmask)) {
+			part_error = PERR_CPUSEMPTY;
+			goto write_error;
 		}
-	} else if (newmask) {
+
 		/*
 		 * partcmd_update with newmask:
 		 *
-		 * Compute add/delete mask to/from subparts_cpus
+		 * Compute add/delete mask to/from effective_cpus
+		 *
+		 * For valid partition:
+		 *   addmask = exclusive_cpus & ~newmask
+		 *			      & parent->effective_xcpus
+		 *   delmask = newmask & ~exclusive_cpus
+		 *		       & parent->effective_xcpus
 		 *
-		 * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
-		 * addmask = newmask & parent->cpus_allowed
-		 *		     & ~parent->subparts_cpus
+		 * For invalid partition:
+		 *   delmask = newmask & parent->effective_xcpus
 		 */
-		cpumask_andnot(tmp->delmask, cs->cpus_allowed, newmask);
-		deleting = cpumask_and(tmp->delmask, tmp->delmask,
-				       parent->subparts_cpus);
+		if (is_prs_invalid(old_prs)) {
+			adding = false;
+			deleting = cpumask_and(tmp->delmask,
+					newmask, parent->effective_xcpus);
+		} else {
+			cpumask_andnot(tmp->addmask, xcpus, newmask);
+			adding = cpumask_and(tmp->addmask, tmp->addmask,
+					     parent->effective_xcpus);
 
-		cpumask_and(tmp->addmask, newmask, parent->cpus_allowed);
-		adding = cpumask_andnot(tmp->addmask, tmp->addmask,
-					parent->subparts_cpus);
-		/*
-		 * Empty cpumask is not allowed
-		 */
-		if (cpumask_empty(newmask)) {
-			part_error = PERR_CPUSEMPTY;
+			cpumask_andnot(tmp->delmask, newmask, xcpus);
+			deleting = cpumask_and(tmp->delmask, tmp->delmask,
+					       parent->effective_xcpus);
+		}
 		/*
 		 * Make partition invalid if parent's effective_cpus could
 		 * become empty and there are tasks in the parent.
 		 */
-		} else if (adding &&
-		    cpumask_subset(parent->effective_cpus, tmp->addmask) &&
-		    !cpumask_intersects(tmp->delmask, cpu_active_mask) &&
-		    partition_is_populated(parent, cs)) {
+		if (nocpu && (!adding ||
+		    !cpumask_intersects(tmp->addmask, cpu_active_mask))) {
 			part_error = PERR_NOCPUS;
-			adding = false;
-			deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
-					       parent->subparts_cpus);
+			deleting = false;
+			adding = cpumask_and(tmp->addmask,
+					     xcpus, parent->effective_xcpus);
 		}
 	} else {
 		/*
-		 * partcmd_update w/o newmask:
+		 * partcmd_update w/o newmask
+		 *
+		 * delmask = effective_xcpus & parent->effective_cpus
+		 *
+		 * This can be called from:
+		 * 1) update_cpumasks_hier()
+		 * 2) cpuset_hotplug_update_tasks()
 		 *
-		 * delmask = cpus_allowed & parent->subparts_cpus
-		 * addmask = cpus_allowed & parent->cpus_allowed
-		 *			  & ~parent->subparts_cpus
+		 * Check to see if it can be transitioned from valid to
+		 * invalid partition or vice versa.
 		 *
-		 * This gets invoked either due to a hotplug event or from
-		 * update_cpumasks_hier(). This can cause the state of a
-		 * partition root to transition from valid to invalid or vice
-		 * versa. So we still need to compute the addmask and delmask.
-
-		 * A partition error happens when:
-		 * 1) Cpuset is valid partition, but parent does not distribute
-		 *    out any CPUs.
-		 * 2) Parent has tasks and all its effective CPUs will have
-		 *    to be distributed out.
+		 * A partition error happens when parent has tasks and all
+		 * its effective CPUs will have to be distributed out.
 		 */
-		cpumask_and(tmp->addmask, cs->cpus_allowed,
-					  parent->cpus_allowed);
-		adding = cpumask_andnot(tmp->addmask, tmp->addmask,
-					parent->subparts_cpus);
-
-		if ((is_partition_valid(cs) && !parent->nr_subparts_cpus) ||
-		    (adding &&
-		     cpumask_subset(parent->effective_cpus, tmp->addmask) &&
-		     partition_is_populated(parent, cs))) {
+		WARN_ON_ONCE(!is_partition_valid(parent));
+		if (nocpu) {
 			part_error = PERR_NOCPUS;
-			adding = false;
-		}
+			if (is_partition_valid(cs))
+				adding = cpumask_and(tmp->addmask,
+						xcpus, parent->effective_xcpus);
+		} else if (is_partition_invalid(cs) &&
+			   cpumask_subset(xcpus, parent->effective_xcpus)) {
+			struct cgroup_subsys_state *css;
+			struct cpuset *child;
+			bool exclusive = true;
 
-		if (part_error && is_partition_valid(cs) &&
-		    parent->nr_subparts_cpus)
-			deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
-					       parent->subparts_cpus);
+			/*
+			 * Converting an invalid partition to valid has to
+			 * pass the cpu exclusivity test.
+			 */
+			rcu_read_lock();
+			cpuset_for_each_child(child, css, parent) {
+				if (child == cs)
+					continue;
+				if (!cpusets_are_exclusive(cs, child)) {
+					exclusive = false;
+					break;
+				}
+			}
+			rcu_read_unlock();
+			if (exclusive)
+				deleting = cpumask_and(tmp->delmask,
+						xcpus, parent->effective_cpus);
+			else
+				part_error = PERR_NOTEXCL;
+		}
 	}
+
+write_error:
 	if (part_error)
 		WRITE_ONCE(cs->prs_err, part_error);
 
@@ -1514,13 +1901,17 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
 		switch (cs->partition_root_state) {
 		case PRS_ROOT:
 		case PRS_ISOLATED:
-			if (part_error)
+			if (part_error) {
 				new_prs = -old_prs;
+				subparts_delta--;
+			}
 			break;
 		case PRS_INVALID_ROOT:
 		case PRS_INVALID_ISOLATED:
-			if (!part_error)
+			if (!part_error) {
 				new_prs = -old_prs;
+				subparts_delta++;
+			}
 			break;
 		}
 	}
@@ -1530,9 +1921,11 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
 
 	/*
 	 * Transitioning between invalid to valid or vice versa may require
-	 * changing CS_CPU_EXCLUSIVE.
+	 * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update,
+	 * validate_change() has already been successfully called and
+	 * CPU lists in cs haven't been updated yet. So defer it to later.
 	 */
-	if (old_prs != new_prs) {
+	if ((old_prs != new_prs) && (cmd != partcmd_update))  {
 		int err = update_partition_exclusive(cs, new_prs);
 
 		if (err)
@@ -1540,39 +1933,52 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
 	}
 
 	/*
-	 * Change the parent's subparts_cpus.
+	 * Change the parent's effective_cpus & effective_xcpus (top cpuset
+	 * only).
+	 *
 	 * Newly added CPUs will be removed from effective_cpus and
 	 * newly deleted ones will be added back to effective_cpus.
 	 */
 	spin_lock_irq(&callback_lock);
 	if (adding) {
-		cpumask_or(parent->subparts_cpus,
-			   parent->subparts_cpus, tmp->addmask);
-		cpumask_andnot(parent->effective_cpus,
-			       parent->effective_cpus, tmp->addmask);
-	}
-	if (deleting) {
-		cpumask_andnot(parent->subparts_cpus,
-			       parent->subparts_cpus, tmp->delmask);
+		if (parent == &top_cpuset)
+			cpumask_andnot(subpartitions_cpus,
+				       subpartitions_cpus, tmp->addmask);
 		/*
-		 * Some of the CPUs in subparts_cpus might have been offlined.
+		 * Some of the CPUs in effective_xcpus might have been offlined.
 		 */
-		cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
 		cpumask_or(parent->effective_cpus,
-			   parent->effective_cpus, tmp->delmask);
+			   parent->effective_cpus, tmp->addmask);
+		cpumask_and(parent->effective_cpus,
+			    parent->effective_cpus, cpu_active_mask);
+	}
+	if (deleting) {
+		if (parent == &top_cpuset)
+			cpumask_or(subpartitions_cpus,
+				   subpartitions_cpus, tmp->delmask);
+		cpumask_andnot(parent->effective_cpus,
+			       parent->effective_cpus, tmp->delmask);
 	}
 
-	parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
+	if (is_partition_valid(parent)) {
+		parent->nr_subparts += subparts_delta;
+		WARN_ON_ONCE(parent->nr_subparts < 0);
+	}
 
-	if (old_prs != new_prs)
+	if (old_prs != new_prs) {
 		cs->partition_root_state = new_prs;
+		if (new_prs <= 0)
+			cs->nr_subparts = 0;
+	}
 
 	spin_unlock_irq(&callback_lock);
 
+	if ((old_prs != new_prs) && (cmd == partcmd_update))
+		update_partition_exclusive(cs, new_prs);
+
 	if (adding || deleting) {
 		update_tasks_cpumask(parent, tmp->addmask);
-		if (parent->child_ecpus_count)
-			update_sibling_cpumasks(parent, cs, tmp);
+		update_sibling_cpumasks(parent, cs, tmp);
 	}
 
 	/*
@@ -1590,6 +1996,73 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
 	return 0;
 }
 
+/**
+ * compute_partition_effective_cpumask - compute effective_cpus for partition
+ * @cs: partition root cpuset
+ * @new_ecpus: previously computed effective_cpus to be updated
+ *
+ * Compute the effective_cpus of a partition root by scanning effective_xcpus
+ * of child partition roots and excluding their effective_xcpus.
+ *
+ * This has the side effect of invalidating valid child partition roots,
+ * if necessary. Since it is called from either cpuset_hotplug_update_tasks()
+ * or update_cpumasks_hier() where parent and children are modified
+ * successively, we don't need to call update_parent_effective_cpumask()
+ * and the child's effective_cpus will be updated in later iterations.
+ *
+ * Note that rcu_read_lock() is assumed to be held.
+ */
+static void compute_partition_effective_cpumask(struct cpuset *cs,
+						struct cpumask *new_ecpus)
+{
+	struct cgroup_subsys_state *css;
+	struct cpuset *child;
+	bool populated = partition_is_populated(cs, NULL);
+
+	/*
+	 * Check child partition roots to see if they should be
+	 * invalidated when
+	 *  1) child effective_xcpus not a subset of new
+	 *     exclusive_cpus
+	 *  2) All the effective_cpus will be used up and cs
+	 *     has tasks
+	 */
+	compute_effective_exclusive_cpumask(cs, new_ecpus);
+	cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);
+
+	rcu_read_lock();
+	cpuset_for_each_child(child, css, cs) {
+		if (!is_partition_valid(child))
+			continue;
+
+		child->prs_err = 0;
+		if (!cpumask_subset(child->effective_xcpus,
+				    cs->effective_xcpus))
+			child->prs_err = PERR_INVCPUS;
+		else if (populated &&
+			 cpumask_subset(new_ecpus, child->effective_xcpus))
+			child->prs_err = PERR_NOCPUS;
+
+		if (child->prs_err) {
+			int old_prs = child->partition_root_state;
+
+			/*
+			 * Invalidate child partition
+			 */
+			spin_lock_irq(&callback_lock);
+			make_partition_invalid(child);
+			cs->nr_subparts--;
+			child->nr_subparts = 0;
+			spin_unlock_irq(&callback_lock);
+			notify_partition_change(child, old_prs);
+			continue;
+		}
+		cpumask_andnot(new_ecpus, new_ecpus,
+			       child->effective_xcpus);
+	}
+	rcu_read_unlock();
+}
+
 /*
  * update_cpumasks_hier() flags
  */
@@ -1620,9 +2093,44 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 	rcu_read_lock();
 	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
 		struct cpuset *parent = parent_cs(cp);
+		bool remote = is_remote_partition(cp);
 		bool update_parent = false;
 
-		compute_effective_cpumask(tmp->new_cpus, cp, parent);
+		/*
+		 * Skip a descendant remote partition that acquires CPUs
+		 * directly from top cpuset unless it is cs.
+		 */
+		if (remote && (cp != cs)) {
+			pos_css = css_rightmost_descendant(pos_css);
+			continue;
+		}
+
+		/*
+		 * Update effective_xcpus if exclusive_cpus set.
+		 * The case when exclusive_cpus isn't set is handled later.
+		 */
+		if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) {
+			spin_lock_irq(&callback_lock);
+			compute_effective_exclusive_cpumask(cp, NULL);
+			spin_unlock_irq(&callback_lock);
+		}
+
+		old_prs = new_prs = cp->partition_root_state;
+		if (remote || (is_partition_valid(parent) &&
+			       is_partition_valid(cp)))
+			compute_partition_effective_cpumask(cp, tmp->new_cpus);
+		else
+			compute_effective_cpumask(tmp->new_cpus, cp, parent);
+
+		/*
+		 * A partition with no effective_cpus is allowed as long as
+		 * there is no task associated with it. Call
+		 * update_parent_effective_cpumask() to check it.
+		 */
+		if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) {
+			update_parent = true;
+			goto update_parent_effective;
+		}
 
 		/*
 		 * If it becomes empty, inherit the effective mask of the
@@ -1630,11 +2138,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 		 * it is a partition root that has explicitly distributed
 		 * out all its CPUs.
 		 */
-		if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
-			if (is_partition_valid(cp) &&
-			    cpumask_equal(cp->cpus_allowed, cp->subparts_cpus))
-				goto update_parent_subparts;
-
+		if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) {
 			cpumask_copy(tmp->new_cpus, parent->effective_cpus);
 			if (!cp->use_parent_ecpus) {
 				cp->use_parent_ecpus = true;
@@ -1646,6 +2150,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 			parent->child_ecpus_count--;
 		}
 
+		if (remote)
+			goto get_css;
+
 		/*
 		 * Skip the whole subtree if
 		 * 1) the cpumask remains the same,
@@ -1661,14 +2168,13 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 			continue;
 		}
 
-update_parent_subparts:
+update_parent_effective:
 		/*
-		 * update_parent_subparts_cpumask() should have been called
+		 * update_parent_effective_cpumask() should have been called
 		 * for cs already in update_cpumask(). We should also call
 		 * update_tasks_cpumask() again for tasks in the parent
-		 * cpuset if the parent's subparts_cpus changes.
+		 * cpuset if the parent's effective_cpus changes.
 		 */
-		old_prs = new_prs = cp->partition_root_state;
 		if ((cp != cs) && old_prs) {
 			switch (parent->partition_root_state) {
 			case PRS_ROOT:
@@ -1690,14 +2196,13 @@ update_parent_subparts:
 				break;
 			}
 		}
-
+get_css:
 		if (!css_tryget_online(&cp->css))
 			continue;
 		rcu_read_unlock();
 
 		if (update_parent) {
-			update_parent_subparts_cpumask(cp, partcmd_update, NULL,
-						       tmp);
+			update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp);
 			/*
 			 * The cpuset partition_root_state may become
 			 * invalid. Capture it.
@@ -1706,30 +2211,17 @@ update_parent_subparts:
 		}
 
 		spin_lock_irq(&callback_lock);
-
-		if (cp->nr_subparts_cpus && !is_partition_valid(cp)) {
-			/*
-			 * Put all active subparts_cpus back to effective_cpus.
-			 */
-			cpumask_or(tmp->new_cpus, tmp->new_cpus,
-				   cp->subparts_cpus);
-			cpumask_and(tmp->new_cpus, tmp->new_cpus,
-				   cpu_active_mask);
-			cp->nr_subparts_cpus = 0;
-			cpumask_clear(cp->subparts_cpus);
-		}
-
 		cpumask_copy(cp->effective_cpus, tmp->new_cpus);
-		if (cp->nr_subparts_cpus) {
-			/*
-			 * Make sure that effective_cpus & subparts_cpus
-			 * are mutually exclusive.
-			 */
-			cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
-				       cp->subparts_cpus);
-		}
-
 		cp->partition_root_state = new_prs;
+		/*
+		 * Make sure effective_xcpus is properly set for a valid
+		 * partition root.
+		 */
+		if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus))
+			cpumask_and(cp->effective_xcpus,
+				    cp->cpus_allowed, parent->effective_xcpus);
+		else if (new_prs < 0)
+			reset_partition_data(cp);
 		spin_unlock_irq(&callback_lock);
 
 		notify_partition_change(cp, old_prs);
@@ -1737,7 +2229,7 @@ update_parent_subparts:
 		WARN_ON(!is_in_v2_mode() &&
 			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
 
-		update_tasks_cpumask(cp, tmp->new_cpus);
+		update_tasks_cpumask(cp, cp->effective_cpus);
 
 		/*
 		 * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
@@ -1790,8 +2282,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
 
 	/*
 	 * Check all its siblings and call update_cpumasks_hier()
-	 * if their use_parent_ecpus flag is set in order for them
-	 * to use the right effective_cpus value.
+	 * if their effective_cpus will need to be changed.
+	 *
+	 * With the addition of effective_xcpus which is a subset of
+	 * cpus_allowed, it is possible a change in parent's effective_cpus
+	 * due to a change in a child partition's effective_xcpus will impact
+	 * its siblings even if they do not inherit parent's effective_cpus
+	 * directly.
 	 *
 	 * The update_cpumasks_hier() function may sleep. So we have to
 	 * release the RCU read lock before calling it. HIER_NO_SD_REBUILD
@@ -1802,8 +2299,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
 	cpuset_for_each_child(sibling, pos_css, parent) {
 		if (sibling == cs)
 			continue;
-		if (!sibling->use_parent_ecpus)
-			continue;
+		if (!sibling->use_parent_ecpus &&
+		    !is_partition_valid(sibling)) {
+			compute_effective_cpumask(tmp->new_cpus, sibling,
+						  parent);
+			if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
+				continue;
+		}
 		if (!css_tryget_online(&sibling->css))
 			continue;
 
@@ -1826,7 +2328,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 {
 	int retval;
 	struct tmpmasks tmp;
+	struct cpuset *parent = parent_cs(cs);
 	bool invalidate = false;
+	int hier_flags = 0;
 	int old_prs = cs->partition_root_state;
 
 	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
@@ -1841,6 +2345,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	 */
 	if (!*buf) {
 		cpumask_clear(trialcs->cpus_allowed);
+		cpumask_clear(trialcs->effective_xcpus);
 	} else {
 		retval = cpulist_parse(buf, trialcs->cpus_allowed);
 		if (retval < 0)
@@ -1849,6 +2354,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 		if (!cpumask_subset(trialcs->cpus_allowed,
 				    top_cpuset.cpus_allowed))
 			return -EINVAL;
+
+		/*
+		 * When exclusive_cpus isn't explicitly set, it is constrained
+		 * by cpus_allowed and parent's effective_xcpus. Otherwise,
+		 * trialcs->effective_xcpus is used as a temporary cpumask
+		 * for checking validity of the partition root.
+		 */
+		if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs))
+			compute_effective_exclusive_cpumask(trialcs, NULL);
 	}
 
 	/* Nothing to do if the cpus didn't change */
@@ -1858,11 +2372,32 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	if (alloc_cpumasks(NULL, &tmp))
 		return -ENOMEM;
 
+	if (old_prs) {
+		if (is_partition_valid(cs) &&
+		    cpumask_empty(trialcs->effective_xcpus)) {
+			invalidate = true;
+			cs->prs_err = PERR_INVCPUS;
+		} else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
+			invalidate = true;
+			cs->prs_err = PERR_HKEEPING;
+		} else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
+			invalidate = true;
+			cs->prs_err = PERR_NOCPUS;
+		}
+	}
+
+	/*
+	 * Check all the descendants in update_cpumasks_hier() if
+	 * effective_xcpus is to be changed.
+	 */
+	if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
+		hier_flags = HIER_CHECKALL;
+
 	retval = validate_change(cs, trialcs);
 
 	if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
-		struct cpuset *cp, *parent;
 		struct cgroup_subsys_state *css;
+		struct cpuset *cp;
 
 		/*
 		 * The -EINVAL error code indicates that partition sibling
@@ -1873,70 +2408,168 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 		 */
 		invalidate = true;
 		rcu_read_lock();
-		parent = parent_cs(cs);
-		cpuset_for_each_child(cp, css, parent)
+		cpuset_for_each_child(cp, css, parent) {
+			struct cpumask *xcpus = fetch_xcpus(trialcs);
+
 			if (is_partition_valid(cp) &&
-			    cpumask_intersects(trialcs->cpus_allowed, cp->cpus_allowed)) {
+			    cpumask_intersects(xcpus, cp->effective_xcpus)) {
 				rcu_read_unlock();
-				update_parent_subparts_cpumask(cp, partcmd_invalidate, NULL, &tmp);
+				update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp);
 				rcu_read_lock();
 			}
+		}
 		rcu_read_unlock();
 		retval = 0;
 	}
+
 	if (retval < 0)
 		goto out_free;
 
-	if (cs->partition_root_state) {
-		if (invalidate)
-			update_parent_subparts_cpumask(cs, partcmd_invalidate,
-						       NULL, &tmp);
+	if (is_partition_valid(cs) ||
+	   (is_partition_invalid(cs) && !invalidate)) {
+		struct cpumask *xcpus = trialcs->effective_xcpus;
+
+		if (cpumask_empty(xcpus) && is_partition_invalid(cs))
+			xcpus = trialcs->cpus_allowed;
+
+		/*
+		 * Call remote_cpus_update() to handle valid remote partition
+		 */
+		if (is_remote_partition(cs))
+			remote_cpus_update(cs, xcpus, &tmp);
+		else if (invalidate)
+			update_parent_effective_cpumask(cs, partcmd_invalidate,
+							NULL, &tmp);
 		else
-			update_parent_subparts_cpumask(cs, partcmd_update,
-						trialcs->cpus_allowed, &tmp);
+			update_parent_effective_cpumask(cs, partcmd_update,
+							xcpus, &tmp);
+	} else if (!cpumask_empty(cs->exclusive_cpus)) {
+		/*
+		 * Use trialcs->effective_cpus as a temp cpumask
+		 */
+		remote_partition_check(cs, trialcs->effective_xcpus,
+				       trialcs->effective_cpus, &tmp);
 	}
 
-	compute_effective_cpumask(trialcs->effective_cpus, trialcs,
-				  parent_cs(cs));
 	spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
+	cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
+	if ((old_prs > 0) && !is_partition_valid(cs))
+		reset_partition_data(cs);
+	spin_unlock_irq(&callback_lock);
+
+	/* effective_cpus/effective_xcpus will be updated here */
+	update_cpumasks_hier(cs, &tmp, hier_flags);
+
+	/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
+	if (cs->partition_root_state)
+		update_partition_sd_lb(cs, old_prs);
+out_free:
+	free_cpumasks(NULL, &tmp);
+	return 0;
+}
+
+/**
+ * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset
+ * @cs: the cpuset to consider
+ * @trialcs: trial cpuset
+ * @buf: buffer of cpu numbers written to this cpuset
+ *
+ * The tasks' cpumask will be updated if cs is a valid partition root.
+ */
+static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+				    const char *buf)
+{
+	int retval;
+	struct tmpmasks tmp;
+	struct cpuset *parent = parent_cs(cs);
+	bool invalidate = false;
+	int hier_flags = 0;
+	int old_prs = cs->partition_root_state;
+
+	if (!*buf) {
+		cpumask_clear(trialcs->exclusive_cpus);
+		cpumask_clear(trialcs->effective_xcpus);
+	} else {
+		retval = cpulist_parse(buf, trialcs->exclusive_cpus);
+		if (retval < 0)
+			return retval;
+		if (!is_cpu_exclusive(cs))
+			set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags);
+	}
+
+	/* Nothing to do if the CPUs didn't change */
+	if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
+		return 0;
+
+	if (alloc_cpumasks(NULL, &tmp))
+		return -ENOMEM;
+
+	if (*buf)
+		compute_effective_exclusive_cpumask(trialcs, NULL);
 
 	/*
-	 * Make sure that subparts_cpus, if not empty, is a subset of
-	 * cpus_allowed. Clear subparts_cpus if partition not valid or
-	 * empty effective cpus with tasks.
+	 * Check all the descendants in update_cpumasks_hier() if
+	 * effective_xcpus is to be changed.
 	 */
-	if (cs->nr_subparts_cpus) {
-		if (!is_partition_valid(cs) ||
-		   (cpumask_subset(trialcs->effective_cpus, cs->subparts_cpus) &&
-		    partition_is_populated(cs, NULL))) {
-			cs->nr_subparts_cpus = 0;
-			cpumask_clear(cs->subparts_cpus);
+	if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
+		hier_flags = HIER_CHECKALL;
+
+	retval = validate_change(cs, trialcs);
+	if (retval)
+		return retval;
+
+	if (old_prs) {
+		if (cpumask_empty(trialcs->effective_xcpus)) {
+			invalidate = true;
+			cs->prs_err = PERR_INVCPUS;
+		} else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
+			invalidate = true;
+			cs->prs_err = PERR_HKEEPING;
+		} else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
+			invalidate = true;
+			cs->prs_err = PERR_NOCPUS;
+		}
+
+		if (is_remote_partition(cs)) {
+			if (invalidate)
+				remote_partition_disable(cs, &tmp);
+			else
+				remote_cpus_update(cs, trialcs->effective_xcpus,
+						   &tmp);
+		} else if (invalidate) {
+			update_parent_effective_cpumask(cs, partcmd_invalidate,
+							NULL, &tmp);
 		} else {
-			cpumask_and(cs->subparts_cpus, cs->subparts_cpus,
-				    cs->cpus_allowed);
-			cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
+			update_parent_effective_cpumask(cs, partcmd_update,
+						trialcs->effective_xcpus, &tmp);
 		}
+	} else if (!cpumask_empty(trialcs->exclusive_cpus)) {
+		/*
+		 * Use trialcs->effective_cpus as a temp cpumask
+		 */
+		remote_partition_check(cs, trialcs->effective_xcpus,
+				       trialcs->effective_cpus, &tmp);
 	}
+	spin_lock_irq(&callback_lock);
+	cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
+	cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
+	if ((old_prs > 0) && !is_partition_valid(cs))
+		reset_partition_data(cs);
 	spin_unlock_irq(&callback_lock);
 
-	/* effective_cpus will be updated here */
-	update_cpumasks_hier(cs, &tmp, 0);
-
-	if (cs->partition_root_state) {
-		struct cpuset *parent = parent_cs(cs);
-
-		/*
-		 * For partition root, update the cpumasks of sibling
-		 * cpusets if they use parent's effective_cpus.
-		 */
-		if (parent->child_ecpus_count)
-			update_sibling_cpumasks(parent, cs, &tmp);
+	/*
+	 * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus
+	 * of the subtree when it is a valid partition root or effective_xcpus
+	 * is updated.
+	 */
+	if (is_partition_valid(cs) || hier_flags)
+		update_cpumasks_hier(cs, &tmp, hier_flags);
 
-		/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains */
+	/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
+	if (cs->partition_root_state)
 		update_partition_sd_lb(cs, old_prs);
-	}
-out_free:
+
 	free_cpumasks(NULL, &tmp);
 	return 0;
 }
@@ -2320,17 +2953,25 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 		return 0;
 
 	/*
-	 * For a previously invalid partition root, leave it at being
-	 * invalid if new_prs is not "member".
+	 * Treat a previously invalid partition root as if it is a "member".
 	 */
-	if (new_prs && is_prs_invalid(old_prs)) {
-		cs->partition_root_state = -new_prs;
-		return 0;
-	}
+	if (new_prs && is_prs_invalid(old_prs))
+		old_prs = PRS_MEMBER;
 
 	if (alloc_cpumasks(NULL, &tmpmask))
 		return -ENOMEM;
 
+	/*
+	 * Set up effective_xcpus if not properly set yet; it will be cleared
+	 * later if partition becomes invalid.
+	 */
+	if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) {
+		spin_lock_irq(&callback_lock);
+		cpumask_and(cs->effective_xcpus,
+			    cs->cpus_allowed, parent->effective_xcpus);
+		spin_unlock_irq(&callback_lock);
+	}
+
 	err = update_partition_exclusive(cs, new_prs);
 	if (err)
 		goto out;
@@ -2344,8 +2985,14 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 			goto out;
 		}
 
-		err = update_parent_subparts_cpumask(cs, partcmd_enable,
-						     NULL, &tmpmask);
+		err = update_parent_effective_cpumask(cs, partcmd_enable,
+						      NULL, &tmpmask);
+		/*
+		 * If an attempt to become local partition root fails,
+		 * try to become a remote partition root instead.
+		 */
+		if (err && remote_partition_enable(cs, &tmpmask))
+			err = 0;
 	} else if (old_prs && new_prs) {
 		/*
 		 * A change in load balance state only, no change in cpumasks.
@@ -2356,19 +3003,16 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 		 * Switching back to member is always allowed even if it
 		 * disables child partitions.
 		 */
-		update_parent_subparts_cpumask(cs, partcmd_disable, NULL,
-					       &tmpmask);
+		if (is_remote_partition(cs))
+			remote_partition_disable(cs, &tmpmask);
+		else
+			update_parent_effective_cpumask(cs, partcmd_disable,
+							NULL, &tmpmask);
 
 		/*
-		 * If there are child partitions, they will all become invalid.
+		 * Invalidation of child partitions will be done in
+		 * update_cpumasks_hier().
 		 */
-		if (unlikely(cs->nr_subparts_cpus)) {
-			spin_lock_irq(&callback_lock);
-			cs->nr_subparts_cpus = 0;
-			cpumask_clear(cs->subparts_cpus);
-			compute_effective_cpumask(cs->effective_cpus, cs, parent);
-			spin_unlock_irq(&callback_lock);
-		}
 	}
 out:
 	/*
@@ -2383,14 +3027,12 @@ out:
 	spin_lock_irq(&callback_lock);
 	cs->partition_root_state = new_prs;
 	WRITE_ONCE(cs->prs_err, err);
+	if (!is_partition_valid(cs))
+		reset_partition_data(cs);
 	spin_unlock_irq(&callback_lock);
 
-	/*
-	 * Update child cpusets, if present.
-	 * Force update if switching back to member.
-	 */
-	if (!list_empty(&cs->css.children))
-		update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
+	/* Force update if switching back to member */
+	update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
 
 	/* Update sched domains and load balance flag */
 	update_partition_sd_lb(cs, old_prs);
@@ -2639,7 +3281,7 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
 		guarantee_online_cpus(task, cpus_attach);
 	else
 		cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
-			       cs->subparts_cpus);
+			       subpartitions_cpus);
 	/*
 	 * can_attach beforehand should guarantee that this doesn't
 	 * fail.  TODO: have a better way to handle failure here
@@ -2742,6 +3384,8 @@ typedef enum {
 	FILE_EFFECTIVE_CPULIST,
 	FILE_EFFECTIVE_MEMLIST,
 	FILE_SUBPARTS_CPULIST,
+	FILE_EXCLUSIVE_CPULIST,
+	FILE_EFFECTIVE_XCPULIST,
 	FILE_CPU_EXCLUSIVE,
 	FILE_MEM_EXCLUSIVE,
 	FILE_MEM_HARDWALL,
@@ -2879,6 +3523,9 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	case FILE_CPULIST:
 		retval = update_cpumask(cs, trialcs, buf);
 		break;
+	case FILE_EXCLUSIVE_CPULIST:
+		retval = update_exclusive_cpumask(cs, trialcs, buf);
+		break;
 	case FILE_MEMLIST:
 		retval = update_nodemask(cs, trialcs, buf);
 		break;
@@ -2926,8 +3573,14 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
 	case FILE_EFFECTIVE_MEMLIST:
 		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
 		break;
+	case FILE_EXCLUSIVE_CPULIST:
+		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus));
+		break;
+	case FILE_EFFECTIVE_XCPULIST:
+		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus));
+		break;
 	case FILE_SUBPARTS_CPULIST:
-		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
+		seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus));
 		break;
 	default:
 		ret = -EINVAL;
@@ -3200,10 +3853,26 @@ static struct cftype dfl_files[] = {
 	},
 
 	{
+		.name = "cpus.exclusive",
+		.seq_show = cpuset_common_seq_show,
+		.write = cpuset_write_resmask,
+		.max_write_len = (100U + 6 * NR_CPUS),
+		.private = FILE_EXCLUSIVE_CPULIST,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+
+	{
+		.name = "cpus.exclusive.effective",
+		.seq_show = cpuset_common_seq_show,
+		.private = FILE_EFFECTIVE_XCPULIST,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+
+	{
 		.name = "cpus.subpartitions",
 		.seq_show = cpuset_common_seq_show,
 		.private = FILE_SUBPARTS_CPULIST,
-		.flags = CFTYPE_DEBUG,
+		.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,
 	},
 
 	{ }	/* terminate */
@@ -3241,6 +3910,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
 	nodes_clear(cs->effective_mems);
 	fmeter_init(&cs->fmeter);
 	cs->relax_domain_level = -1;
+	INIT_LIST_HEAD(&cs->remote_sibling);
 
 	/* Set CS_MEMORY_MIGRATE for default hierarchy */
 	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
@@ -3276,6 +3946,11 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 		cs->effective_mems = parent->effective_mems;
 		cs->use_parent_ecpus = true;
 		parent->child_ecpus_count++;
+		/*
+		 * Clear CS_SCHED_LOAD_BALANCE if parent is isolated
+		 */
+		if (!is_sched_load_balance(parent))
+			clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 	}
 
 	/*
@@ -3377,6 +4052,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
 
 	if (is_in_v2_mode()) {
 		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
+		cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask);
 		top_cpuset.mems_allowed = node_possible_map;
 	} else {
 		cpumask_copy(top_cpuset.cpus_allowed,
@@ -3515,16 +4191,21 @@ int __init cpuset_init(void)
 {
 	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
 	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
-	BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
+	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL));
+	BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));
+	BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
 
 	cpumask_setall(top_cpuset.cpus_allowed);
 	nodes_setall(top_cpuset.mems_allowed);
 	cpumask_setall(top_cpuset.effective_cpus);
+	cpumask_setall(top_cpuset.effective_xcpus);
+	cpumask_setall(top_cpuset.exclusive_cpus);
 	nodes_setall(top_cpuset.effective_mems);
 
 	fmeter_init(&top_cpuset.fmeter);
 	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
 	top_cpuset.relax_domain_level = -1;
+	INIT_LIST_HEAD(&remote_children);
 
 	BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
 
@@ -3640,6 +4321,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 	static nodemask_t new_mems;
 	bool cpus_updated;
 	bool mems_updated;
+	bool remote;
 	struct cpuset *parent;
 retry:
 	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
@@ -3659,29 +4341,23 @@ retry:
 	compute_effective_cpumask(&new_cpus, cs, parent);
 	nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
 
-	if (cs->nr_subparts_cpus)
-		/*
-		 * Make sure that CPUs allocated to child partitions
-		 * do not show up in effective_cpus.
-		 */
-		cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
-
 	if (!tmp || !cs->partition_root_state)
 		goto update_tasks;
 
 	/*
-	 * In the unlikely event that a partition root has empty
-	 * effective_cpus with tasks, we will have to invalidate child
-	 * partitions, if present, by setting nr_subparts_cpus to 0 to
-	 * reclaim their cpus.
+	 * Compute effective_cpus for valid partition root, may invalidate
+	 * child partition roots if necessary.
 	 */
-	if (cs->nr_subparts_cpus && is_partition_valid(cs) &&
-	    cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) {
-		spin_lock_irq(&callback_lock);
-		cs->nr_subparts_cpus = 0;
-		cpumask_clear(cs->subparts_cpus);
-		spin_unlock_irq(&callback_lock);
+	remote = is_remote_partition(cs);
+	if (remote || (is_partition_valid(cs) && is_partition_valid(parent)))
+		compute_partition_effective_cpumask(cs, &new_cpus);
+
+	if (remote && cpumask_empty(&new_cpus) &&
+	    partition_is_populated(cs, NULL)) {
+		remote_partition_disable(cs, tmp);
 		compute_effective_cpumask(&new_cpus, cs, parent);
+		remote = false;
+		cpuset_force_rebuild();
 	}
 
 	/*
@@ -3691,44 +4367,22 @@ retry:
 	 * 2) parent is invalid or doesn't grant any cpus to child
 	 *    partitions.
 	 */
-	if (is_partition_valid(cs) && (!parent->nr_subparts_cpus ||
-	   (cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)))) {
-		int old_prs, parent_prs;
-
-		update_parent_subparts_cpumask(cs, partcmd_disable, NULL, tmp);
-		if (cs->nr_subparts_cpus) {
-			spin_lock_irq(&callback_lock);
-			cs->nr_subparts_cpus = 0;
-			cpumask_clear(cs->subparts_cpus);
-			spin_unlock_irq(&callback_lock);
-			compute_effective_cpumask(&new_cpus, cs, parent);
-		}
-
-		old_prs = cs->partition_root_state;
-		parent_prs = parent->partition_root_state;
-		if (is_partition_valid(cs)) {
-			spin_lock_irq(&callback_lock);
-			make_partition_invalid(cs);
-			spin_unlock_irq(&callback_lock);
-			if (is_prs_invalid(parent_prs))
-				WRITE_ONCE(cs->prs_err, PERR_INVPARENT);
-			else if (!parent_prs)
-				WRITE_ONCE(cs->prs_err, PERR_NOTPART);
-			else
-				WRITE_ONCE(cs->prs_err, PERR_HOTPLUG);
-			notify_partition_change(cs, old_prs);
-		}
+	if (is_local_partition(cs) && (!is_partition_valid(parent) ||
+				tasks_nocpu_error(parent, cs, &new_cpus))) {
+		update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, tmp);
+		compute_effective_cpumask(&new_cpus, cs, parent);
 		cpuset_force_rebuild();
 	}
-
 	/*
 	 * On the other hand, an invalid partition root may be transitioned
 	 * back to a regular one.
 	 */
 	else if (is_partition_valid(parent) && is_partition_invalid(cs)) {
-		update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp);
-		if (is_partition_valid(cs))
+		update_parent_effective_cpumask(cs, partcmd_update, NULL, tmp);
+		if (is_partition_valid(cs)) {
+			compute_partition_effective_cpumask(cs, &new_cpus);
 			cpuset_force_rebuild();
+		}
 	}
 
 update_tasks:
@@ -3786,21 +4440,22 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	new_mems = node_states[N_MEMORY];
 
 	/*
-	 * If subparts_cpus is populated, it is likely that the check below
-	 * will produce a false positive on cpus_updated when the cpu list
-	 * isn't changed. It is extra work, but it is better to be safe.
+	 * If subpartitions_cpus is populated, it is likely that the check
+	 * below will produce a false positive on cpus_updated when the cpu
+	 * list isn't changed. It is extra work, but it is better to be safe.
 	 */
-	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
+	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) ||
+		       !cpumask_empty(subpartitions_cpus);
 	mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
 
 	/*
-	 * In the rare case that hotplug removes all the cpus in subparts_cpus,
-	 * we assumed that cpus are updated.
+	 * In the rare case that hotplug removes all the cpus in
+	 * subpartitions_cpus, we assume that cpus are updated.
 	 */
-	if (!cpus_updated && top_cpuset.nr_subparts_cpus)
+	if (!cpus_updated && top_cpuset.nr_subparts)
 		cpus_updated = true;
 
-	/* synchronize cpus_allowed to cpu_active_mask */
+	/* For v1, synchronize cpus_allowed to cpu_active_mask */
 	if (cpus_updated) {
 		spin_lock_irq(&callback_lock);
 		if (!on_dfl)
@@ -3808,17 +4463,16 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 		/*
 		 * Make sure that CPUs allocated to child partitions
 		 * do not show up in effective_cpus. If no CPU is left,
-		 * we clear the subparts_cpus & let the child partitions
+		 * we clear the subpartitions_cpus & let the child partitions
 		 * fight for the CPUs again.
 		 */
-		if (top_cpuset.nr_subparts_cpus) {
-			if (cpumask_subset(&new_cpus,
-					   top_cpuset.subparts_cpus)) {
-				top_cpuset.nr_subparts_cpus = 0;
-				cpumask_clear(top_cpuset.subparts_cpus);
+		if (!cpumask_empty(subpartitions_cpus)) {
+			if (cpumask_subset(&new_cpus, subpartitions_cpus)) {
+				top_cpuset.nr_subparts = 0;
+				cpumask_clear(subpartitions_cpus);
 			} else {
 				cpumask_andnot(&new_cpus, &new_cpus,
-					       top_cpuset.subparts_cpus);
+					       subpartitions_cpus);
 			}
 		}
 		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
@@ -3950,7 +4604,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 		 * We first exclude cpus allocated to partitions. If there is no
 		 * allowable online cpu left, we fall back to all possible cpus.
 		 */
-		cpumask_andnot(pmask, possible_mask, top_cpuset.subparts_cpus);
+		cpumask_andnot(pmask, possible_mask, subpartitions_cpus);
 		if (!cpumask_intersects(pmask, cpu_online_mask))
 			cpumask_copy(pmask, possible_mask);
 	}
diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config
new file mode 100644
index 000000000000..95a400f042b1
--- /dev/null
+++ b/kernel/configs/hardening.config
@@ -0,0 +1,98 @@
+# Help: Basic kernel hardening options
+#
+# These are considered the basic kernel hardening, self-protection, and
+# attack surface reduction options. They are expected to have low (or
+# no) performance impact on most workloads, and have a reasonable level
+# of legacy API removals.
+
+# Make sure reporting of various hardening actions is possible.
+CONFIG_BUG=y
+
+# Basic kernel memory permission enforcement.
+CONFIG_STRICT_KERNEL_RWX=y
+CONFIG_STRICT_MODULE_RWX=y
+CONFIG_VMAP_STACK=y
+
+# Kernel image and memory ASLR.
+CONFIG_RANDOMIZE_BASE=y
+CONFIG_RANDOMIZE_MEMORY=y
+
+# Randomize allocator freelists, harden metadata.
+CONFIG_SLAB_FREELIST_RANDOM=y
+CONFIG_SLAB_FREELIST_HARDENED=y
+CONFIG_SHUFFLE_PAGE_ALLOCATOR=y
+CONFIG_RANDOM_KMALLOC_CACHES=y
+
+# Randomize kernel stack offset on syscall entry.
+CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT=y
+
+# Basic stack frame overflow protection.
+CONFIG_STACKPROTECTOR=y
+CONFIG_STACKPROTECTOR_STRONG=y
+
+# Basic buffer length bounds checking.
+CONFIG_HARDENED_USERCOPY=y
+CONFIG_FORTIFY_SOURCE=y
+
+# Basic array index bounds checking.
+CONFIG_UBSAN=y
+CONFIG_UBSAN_TRAP=y
+CONFIG_UBSAN_BOUNDS=y
+# CONFIG_UBSAN_SHIFT is not set
+# CONFIG_UBSAN_DIV_ZERO is not set
+# CONFIG_UBSAN_UNREACHABLE is not set
+# CONFIG_UBSAN_BOOL is not set
+# CONFIG_UBSAN_ENUM is not set
+# CONFIG_UBSAN_ALIGNMENT is not set
+CONFIG_UBSAN_SANITIZE_ALL=y
+
+# Linked list integrity checking.
+CONFIG_LIST_HARDENED=y
+
+# Initialize all heap variables to zero on allocation.
+CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y
+
+# Initialize all stack variables to zero on function entry.
+CONFIG_INIT_STACK_ALL_ZERO=y
+
+# Wipe RAM at reboot via EFI. For more details, see:
+# https://trustedcomputinggroup.org/resource/pc-client-work-group-platform-reset-attack-mitigation-specification/
+# https://bugzilla.redhat.com/show_bug.cgi?id=1532058
+CONFIG_RESET_ATTACK_MITIGATION=y
+
+# Disable DMA between EFI hand-off and the kernel's IOMMU setup.
+CONFIG_EFI_DISABLE_PCI_DMA=y
+
+# Force IOMMU TLB invalidation so devices will never be able to access stale
+# data content.
+CONFIG_IOMMU_SUPPORT=y
+CONFIG_IOMMU_DEFAULT_DMA_STRICT=y
+
+# Do not allow direct physical memory access to non-device memory.
+CONFIG_STRICT_DEVMEM=y
+CONFIG_IO_STRICT_DEVMEM=y
+
+# Provide userspace with seccomp BPF API for syscall attack surface reduction.
+CONFIG_SECCOMP=y
+CONFIG_SECCOMP_FILTER=y
+
+# Provides some protections against SYN flooding.
+CONFIG_SYN_COOKIES=y
+
+# Attack surface reduction: do not autoload TTY line disciplines.
+# CONFIG_LDISC_AUTOLOAD is not set
+
+# Dangerous; enabling this disables userspace brk ASLR.
+# CONFIG_COMPAT_BRK is not set
+
+# Dangerous; exposes kernel text image layout.
+# CONFIG_PROC_KCORE is not set
+
+# Dangerous; enabling this disables userspace VDSO ASLR.
+# CONFIG_COMPAT_VDSO is not set
+
+# Attack surface reduction: Use the modern PTY interface (devpts) only.
+# CONFIG_LEGACY_PTYS is not set
+
+# Attack surface reduction: Use only modesetting video drivers.
+# CONFIG_DRM_LEGACY is not set
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6de7c6bb74ee..69e92ddef5dd 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -659,11 +659,19 @@ static inline bool cpu_smt_thread_allowed(unsigned int cpu)
 #endif
 }
 
-static inline bool cpu_smt_allowed(unsigned int cpu)
+static inline bool cpu_bootable(unsigned int cpu)
 {
 	if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu))
 		return true;
 
+	/* All CPUs are bootable if controls are not configured */
+	if (cpu_smt_control == CPU_SMT_NOT_IMPLEMENTED)
+		return true;
+
+	/* All CPUs are bootable if CPU is not SMT capable */
+	if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
+		return true;
+
 	if (topology_is_primary_thread(cpu))
 		return true;
 
@@ -685,7 +693,7 @@ bool cpu_smt_possible(void)
 EXPORT_SYMBOL_GPL(cpu_smt_possible);
 
 #else
-static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
+static inline bool cpu_bootable(unsigned int cpu) { return true; }
 #endif
 
 static inline enum cpuhp_state
@@ -788,10 +796,10 @@ static int bringup_wait_for_ap_online(unsigned int cpu)
 	 * SMT soft disabling on X86 requires to bring the CPU out of the
 	 * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit.  The
 	 * CPU marked itself as booted_once in notify_cpu_starting() so the
-	 * cpu_smt_allowed() check will now return false if this is not the
+	 * cpu_bootable() check will now return false if this is not the
 	 * primary sibling.
 	 */
-	if (!cpu_smt_allowed(cpu))
+	if (!cpu_bootable(cpu))
 		return -ECANCELED;
 	return 0;
 }
@@ -1372,7 +1380,14 @@ static int takedown_cpu(unsigned int cpu)
 	cpuhp_bp_sync_dead(cpu);
 
 	tick_cleanup_dead_cpu(cpu);
+
+	/*
+	 * Callbacks must be re-integrated right away to the RCU state machine.
+	 * Otherwise an RCU callback could block a further teardown function
+	 * waiting for its completion.
+	 */
 	rcutree_migrate_callbacks(cpu);
+
 	return 0;
 }
 
@@ -1388,10 +1403,10 @@ void cpuhp_report_idle_dead(void)
 	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
 
 	BUG_ON(st->state != CPUHP_AP_OFFLINE);
-	rcu_report_dead(smp_processor_id());
+	rcutree_report_cpu_dead();
 	st->state = CPUHP_AP_IDLE_DEAD;
 	/*
-	 * We cannot call complete after rcu_report_dead() so we delegate it
+	 * We cannot call complete after rcutree_report_cpu_dead() so we delegate it
 	 * to an online cpu.
 	 */
 	smp_call_function_single(cpumask_first(cpu_online_mask),
@@ -1515,11 +1530,14 @@ static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
 	/*
 	 * Ensure that the control task does not run on the to be offlined
 	 * CPU to prevent a deadlock against cfs_b->period_timer.
+	 * Also keep at least one housekeeping cpu online to avoid generating
+	 * an empty sched_domain span.
 	 */
-	cpu = cpumask_any_but(cpu_online_mask, cpu);
-	if (cpu >= nr_cpu_ids)
-		return -EBUSY;
-	return work_on_cpu(cpu, __cpu_down_maps_locked, &work);
+	for_each_cpu_and(cpu, cpu_online_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) {
+		if (cpu != work.cpu)
+			return work_on_cpu(cpu, __cpu_down_maps_locked, &work);
+	}
+	return -EBUSY;
 }
 
 static int cpu_down(unsigned int cpu, enum cpuhp_state target)
@@ -1617,7 +1635,7 @@ void notify_cpu_starting(unsigned int cpu)
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 	enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
 
-	rcu_cpu_starting(cpu);	/* Enables RCU usage on this CPU. */
+	rcutree_report_cpu_starting(cpu);	/* Enables RCU usage on this CPU. */
 	cpumask_set_cpu(cpu, &cpus_booted_once_mask);
 
 	/*
@@ -1741,7 +1759,7 @@ static int cpu_up(unsigned int cpu, enum cpuhp_state target)
 		err = -EBUSY;
 		goto out;
 	}
-	if (!cpu_smt_allowed(cpu)) {
+	if (!cpu_bootable(cpu)) {
 		err = -EPERM;
 		goto out;
 	}
diff --git a/kernel/cred.c b/kernel/cred.c
index 98cb4eca23fb..3c714cb31660 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -36,7 +36,7 @@ do {									\
 static struct kmem_cache *cred_jar;
 
 /* init to 2 - one for init_task, one to ensure it is never freed */
-static struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
+static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) };
 
 /*
  * The initial credentials for the initial task
@@ -162,23 +162,29 @@ EXPORT_SYMBOL(__put_cred);
  */
 void exit_creds(struct task_struct *tsk)
 {
-	struct cred *cred;
+	struct cred *real_cred, *cred;
 
 	kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
 	       atomic_read(&tsk->cred->usage),
 	       read_cred_subscribers(tsk->cred));
 
-	cred = (struct cred *) tsk->real_cred;
+	real_cred = (struct cred *) tsk->real_cred;
 	tsk->real_cred = NULL;
-	validate_creds(cred);
-	alter_cred_subscribers(cred, -1);
-	put_cred(cred);
 
 	cred = (struct cred *) tsk->cred;
 	tsk->cred = NULL;
+
 	validate_creds(cred);
-	alter_cred_subscribers(cred, -1);
-	put_cred(cred);
+	if (real_cred == cred) {
+		alter_cred_subscribers(cred, -2);
+		put_cred_many(cred, 2);
+	} else {
+		validate_creds(real_cred);
+		alter_cred_subscribers(real_cred, -1);
+		put_cred(real_cred);
+		alter_cred_subscribers(cred, -1);
+		put_cred(cred);
+	}
 
 #ifdef CONFIG_KEYS_REQUEST_CACHE
 	key_put(tsk->cached_requested_key);
@@ -355,8 +361,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 #endif
 		clone_flags & CLONE_THREAD
 	    ) {
-		p->real_cred = get_cred(p->cred);
-		get_cred(p->cred);
+		p->real_cred = get_cred_many(p->cred, 2);
 		alter_cred_subscribers(p->cred, 2);
 		kdebug("share_creds(%p{%d,%d})",
 		       p->cred, atomic_read(&p->cred->usage),
@@ -520,8 +525,7 @@ int commit_creds(struct cred *new)
 		proc_id_connector(task, PROC_EVENT_GID);
 
 	/* release the old obj and subj refs both */
-	put_cred(old);
-	put_cred(old);
+	put_cred_many(old, 2);
 	return 0;
 }
 EXPORT_SYMBOL(commit_creds);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 01637677736f..dff067bd56b1 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -678,6 +678,11 @@ static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev,
 	size_t pool_size;
 	size_t tlb_size;
 
+	if (nslabs > SLABS_PER_PAGE << MAX_ORDER) {
+		nslabs = SLABS_PER_PAGE << MAX_ORDER;
+		nareas = limit_nareas(nareas, nslabs);
+	}
+
 	pool_size = sizeof(*pool) + array_size(sizeof(*pool->areas), nareas);
 	pool = kzalloc(pool_size, gfp);
 	if (!pool)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d0663b9324e7..683dc086ef10 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -375,6 +375,7 @@ enum event_type_t {
 	EVENT_TIME = 0x4,
 	/* see ctx_resched() for details */
 	EVENT_CPU = 0x8,
+	EVENT_CGROUP = 0x10,
 	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 };
 
@@ -449,8 +450,8 @@ static void update_perf_cpu_limits(void)
 
 static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
 
-int perf_proc_update_handler(struct ctl_table *table, int write,
-		void *buffer, size_t *lenp, loff_t *ppos)
+int perf_event_max_sample_rate_handler(struct ctl_table *table, int write,
+				       void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret;
 	int perf_cpu = sysctl_perf_cpu_time_max_percent;
@@ -684,20 +685,26 @@ do {									\
 	___p;								\
 })
 
-static void perf_ctx_disable(struct perf_event_context *ctx)
+static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
 {
 	struct perf_event_pmu_context *pmu_ctx;
 
-	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+		if (cgroup && !pmu_ctx->nr_cgroups)
+			continue;
 		perf_pmu_disable(pmu_ctx->pmu);
+	}
 }
 
-static void perf_ctx_enable(struct perf_event_context *ctx)
+static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
 {
 	struct perf_event_pmu_context *pmu_ctx;
 
-	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+		if (cgroup && !pmu_ctx->nr_cgroups)
+			continue;
 		perf_pmu_enable(pmu_ctx->pmu);
+	}
 }
 
 static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
@@ -856,9 +863,9 @@ static void perf_cgroup_switch(struct task_struct *task)
 		return;
 
 	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-	perf_ctx_disable(&cpuctx->ctx);
+	perf_ctx_disable(&cpuctx->ctx, true);
 
-	ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
+	ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
 	/*
 	 * must not be done before ctxswout due
 	 * to update_cgrp_time_from_cpuctx() in
@@ -870,9 +877,9 @@ static void perf_cgroup_switch(struct task_struct *task)
 	 * perf_cgroup_set_timestamp() in ctx_sched_in()
 	 * to not have to pass task around
 	 */
-	ctx_sched_in(&cpuctx->ctx, EVENT_ALL);
+	ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
 
-	perf_ctx_enable(&cpuctx->ctx);
+	perf_ctx_enable(&cpuctx->ctx, true);
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
@@ -965,6 +972,8 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct
 	if (!is_cgroup_event(event))
 		return;
 
+	event->pmu_ctx->nr_cgroups++;
+
 	/*
 	 * Because cgroup events are always per-cpu events,
 	 * @ctx == &cpuctx->ctx.
@@ -985,6 +994,8 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
 	if (!is_cgroup_event(event))
 		return;
 
+	event->pmu_ctx->nr_cgroups--;
+
 	/*
 	 * Because cgroup events are always per-cpu events,
 	 * @ctx == &cpuctx->ctx.
@@ -2679,9 +2690,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
 
 	event_type &= EVENT_ALL;
 
-	perf_ctx_disable(&cpuctx->ctx);
+	perf_ctx_disable(&cpuctx->ctx, false);
 	if (task_ctx) {
-		perf_ctx_disable(task_ctx);
+		perf_ctx_disable(task_ctx, false);
 		task_ctx_sched_out(task_ctx, event_type);
 	}
 
@@ -2699,9 +2710,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
 
 	perf_event_sched_in(cpuctx, task_ctx);
 
-	perf_ctx_enable(&cpuctx->ctx);
+	perf_ctx_enable(&cpuctx->ctx, false);
 	if (task_ctx)
-		perf_ctx_enable(task_ctx);
+		perf_ctx_enable(task_ctx, false);
 }
 
 void perf_pmu_resched(struct pmu *pmu)
@@ -3246,6 +3257,9 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
 	struct perf_event_pmu_context *pmu_ctx;
 	int is_active = ctx->is_active;
+	bool cgroup = event_type & EVENT_CGROUP;
+
+	event_type &= ~EVENT_CGROUP;
 
 	lockdep_assert_held(&ctx->lock);
 
@@ -3292,8 +3306,11 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
 
 	is_active ^= ctx->is_active; /* changed bits */
 
-	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+		if (cgroup && !pmu_ctx->nr_cgroups)
+			continue;
 		__pmu_ctx_sched_out(pmu_ctx, is_active);
+	}
 }
 
 /*
@@ -3484,7 +3501,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
 		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
 		if (context_equiv(ctx, next_ctx)) {
 
-			perf_ctx_disable(ctx);
+			perf_ctx_disable(ctx, false);
 
 			/* PMIs are disabled; ctx->nr_pending is stable. */
 			if (local_read(&ctx->nr_pending) ||
@@ -3504,7 +3521,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
 			perf_ctx_sched_task_cb(ctx, false);
 			perf_event_swap_task_ctx_data(ctx, next_ctx);
 
-			perf_ctx_enable(ctx);
+			perf_ctx_enable(ctx, false);
 
 			/*
 			 * RCU_INIT_POINTER here is safe because we've not
@@ -3528,13 +3545,13 @@ unlock:
 
 	if (do_switch) {
 		raw_spin_lock(&ctx->lock);
-		perf_ctx_disable(ctx);
+		perf_ctx_disable(ctx, false);
 
 inside_switch:
 		perf_ctx_sched_task_cb(ctx, false);
 		task_ctx_sched_out(ctx, EVENT_ALL);
 
-		perf_ctx_enable(ctx);
+		perf_ctx_enable(ctx, false);
 		raw_spin_unlock(&ctx->lock);
 	}
 }
@@ -3820,47 +3837,32 @@ static int merge_sched_in(struct perf_event *event, void *data)
 	return 0;
 }
 
-static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void pmu_groups_sched_in(struct perf_event_context *ctx,
+				struct perf_event_groups *groups,
+				struct pmu *pmu)
 {
-	struct perf_event_pmu_context *pmu_ctx;
 	int can_add_hw = 1;
-
-	if (pmu) {
-		visit_groups_merge(ctx, &ctx->pinned_groups,
-				   smp_processor_id(), pmu,
-				   merge_sched_in, &can_add_hw);
-	} else {
-		list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-			can_add_hw = 1;
-			visit_groups_merge(ctx, &ctx->pinned_groups,
-					   smp_processor_id(), pmu_ctx->pmu,
-					   merge_sched_in, &can_add_hw);
-		}
-	}
+	visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
+			   merge_sched_in, &can_add_hw);
 }
 
-static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void ctx_groups_sched_in(struct perf_event_context *ctx,
+				struct perf_event_groups *groups,
+				bool cgroup)
 {
 	struct perf_event_pmu_context *pmu_ctx;
-	int can_add_hw = 1;
 
-	if (pmu) {
-		visit_groups_merge(ctx, &ctx->flexible_groups,
-				   smp_processor_id(), pmu,
-				   merge_sched_in, &can_add_hw);
-	} else {
-		list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-			can_add_hw = 1;
-			visit_groups_merge(ctx, &ctx->flexible_groups,
-					   smp_processor_id(), pmu_ctx->pmu,
-					   merge_sched_in, &can_add_hw);
-		}
+	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+		if (cgroup && !pmu_ctx->nr_cgroups)
+			continue;
+		pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
 	}
 }
 
-static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
+			       struct pmu *pmu)
 {
-	ctx_flexible_sched_in(ctx, pmu);
+	pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
 }
 
 static void
@@ -3868,6 +3870,9 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
 	int is_active = ctx->is_active;
+	bool cgroup = event_type & EVENT_CGROUP;
+
+	event_type &= ~EVENT_CGROUP;
 
 	lockdep_assert_held(&ctx->lock);
 
@@ -3900,11 +3905,11 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
 	 * in order to give them the best chance of going on.
 	 */
 	if (is_active & EVENT_PINNED)
-		ctx_pinned_sched_in(ctx, NULL);
+		ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
 
 	/* Then walk through the lower prio flexible groups */
 	if (is_active & EVENT_FLEXIBLE)
-		ctx_flexible_sched_in(ctx, NULL);
+		ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
 }
 
 static void perf_event_context_sched_in(struct task_struct *task)
@@ -3919,11 +3924,11 @@ static void perf_event_context_sched_in(struct task_struct *task)
 
 	if (cpuctx->task_ctx == ctx) {
 		perf_ctx_lock(cpuctx, ctx);
-		perf_ctx_disable(ctx);
+		perf_ctx_disable(ctx, false);
 
 		perf_ctx_sched_task_cb(ctx, true);
 
-		perf_ctx_enable(ctx);
+		perf_ctx_enable(ctx, false);
 		perf_ctx_unlock(cpuctx, ctx);
 		goto rcu_unlock;
 	}
@@ -3936,7 +3941,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
 	if (!ctx->nr_events)
 		goto unlock;
 
-	perf_ctx_disable(ctx);
+	perf_ctx_disable(ctx, false);
 	/*
 	 * We want to keep the following priority order:
 	 * cpu pinned (that don't need to move), task pinned,
@@ -3946,7 +3951,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
 	 * events, no need to flip the cpuctx's events around.
 	 */
 	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
-		perf_ctx_disable(&cpuctx->ctx);
+		perf_ctx_disable(&cpuctx->ctx, false);
 		ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
 	}
 
@@ -3955,9 +3960,9 @@ static void perf_event_context_sched_in(struct task_struct *task)
 	perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
 
 	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
-		perf_ctx_enable(&cpuctx->ctx);
+		perf_ctx_enable(&cpuctx->ctx, false);
 
-	perf_ctx_enable(ctx);
+	perf_ctx_enable(ctx, false);
 
 unlock:
 	perf_ctx_unlock(cpuctx, ctx);
@@ -4427,6 +4432,9 @@ static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
 {
 	u16 local_pkg, event_pkg;
 
+	if ((unsigned)event_cpu >= nr_cpu_ids)
+		return event_cpu;
+
 	if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
 		int local_cpu = smp_processor_id();
 
@@ -4529,6 +4537,8 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
 			  u64 *enabled, u64 *running)
 {
 	unsigned long flags;
+	int event_oncpu;
+	int event_cpu;
 	int ret = 0;
 
 	/*
@@ -4553,15 +4563,22 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
 		goto out;
 	}
 
+	/*
+	 * Get the event CPU numbers, and adjust them to local if the event is
+	 * a per-package event that can be read locally
+	 */
+	event_oncpu = __perf_event_read_cpu(event, event->oncpu);
+	event_cpu = __perf_event_read_cpu(event, event->cpu);
+
 	/* If this is a per-CPU event, it must be for this CPU */
 	if (!(event->attach_state & PERF_ATTACH_TASK) &&
-	    event->cpu != smp_processor_id()) {
+	    event_cpu != smp_processor_id()) {
 		ret = -EINVAL;
 		goto out;
 	}
 
 	/* If this is a pinned event it must be running on this CPU */
-	if (event->attr.pinned && event->oncpu != smp_processor_id()) {
+	if (event->attr.pinned && event_oncpu != smp_processor_id()) {
 		ret = -EBUSY;
 		goto out;
 	}
@@ -4571,7 +4588,7 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
 	 * or local to this CPU. Furthermore it means its ACTIVE (otherwise
 	 * oncpu == -1).
 	 */
-	if (event->oncpu == smp_processor_id())
+	if (event_oncpu == smp_processor_id())
 		event->pmu->read(event);
 
 	*value = local64_read(&event->count);
@@ -13372,7 +13389,8 @@ static int inherit_group(struct perf_event *parent_event,
 		    !perf_get_aux_event(child_ctr, leader))
 			return -EINVAL;
 	}
-	leader->group_generation = parent_event->group_generation;
+	if (leader)
+		leader->group_generation = parent_event->group_generation;
 	return 0;
 }
 
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index fb1e180b5f0a..e8d82c2f07d0 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -700,6 +700,12 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
 		watermark = 0;
 	}
 
+	/*
+	 * kcalloc_node() is unable to allocate a buffer if the size is larger
+	 * than PAGE_SIZE << MAX_ORDER; bail out directly in this case.
+	 */
+	if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_ORDER)
+		return -ENOMEM;
 	rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
 				     node);
 	if (!rb->aux_pages)
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b6d20dfb9a8..640123767726 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1492,9 +1492,7 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
 	struct file *exe_file;
 
 	rcu_read_lock();
-	exe_file = rcu_dereference(mm->exe_file);
-	if (exe_file && !get_file_rcu(exe_file))
-		exe_file = NULL;
+	exe_file = get_file_rcu(&mm->exe_file);
 	rcu_read_unlock();
 	return exe_file;
 }
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 4fad0e6fca64..c450fa8b8b5e 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -71,7 +71,11 @@ bool __refrigerator(bool check_kthr_stop)
 	for (;;) {
 		bool freeze;
 
+		raw_spin_lock_irq(&current->pi_lock);
 		set_current_state(TASK_FROZEN);
+		/* unstale saved_state so that __thaw_task() will wake us up */
+		current->saved_state = TASK_RUNNING;
+		raw_spin_unlock_irq(&current->pi_lock);
 
 		spin_lock_irq(&freezer_lock);
 		freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop());
@@ -129,6 +133,7 @@ static int __set_task_frozen(struct task_struct *p, void *arg)
 		WARN_ON_ONCE(debug_locks && p->lockdep_depth);
 #endif
 
+	p->saved_state = p->__state;
 	WRITE_ONCE(p->__state, TASK_FROZEN);
 	return TASK_FROZEN;
 }
@@ -170,42 +175,34 @@ bool freeze_task(struct task_struct *p)
 }
 
 /*
- * The special task states (TASK_STOPPED, TASK_TRACED) keep their canonical
- * state in p->jobctl. If either of them got a wakeup that was missed because
- * TASK_FROZEN, then their canonical state reflects that and the below will
- * refuse to restore the special state and instead issue the wakeup.
+ * Restore the state the task had before it entered the freezer. For a typical
+ * task in __refrigerator(), saved_state == TASK_RUNNING so nothing happens
+ * here. For tasks which were TASK_NORMAL | TASK_FREEZABLE, their initial state
+ * is restored unless they got an expected wakeup (see ttwu_state_match()).
+ * Returns 1 if the task state was restored.
  */
-static int __set_task_special(struct task_struct *p, void *arg)
+static int __restore_freezer_state(struct task_struct *p, void *arg)
 {
-	unsigned int state = 0;
+	unsigned int state = p->saved_state;
 
-	if (p->jobctl & JOBCTL_TRACED)
-		state = TASK_TRACED;
-
-	else if (p->jobctl & JOBCTL_STOPPED)
-		state = TASK_STOPPED;
-
-	if (state)
+	if (state != TASK_RUNNING) {
 		WRITE_ONCE(p->__state, state);
+		return 1;
+	}
 
-	return state;
+	return 0;
 }
 
 void __thaw_task(struct task_struct *p)
 {
-	unsigned long flags, flags2;
+	unsigned long flags;
 
 	spin_lock_irqsave(&freezer_lock, flags);
 	if (WARN_ON_ONCE(freezing(p)))
 		goto unlock;
 
-	if (lock_task_sighand(p, &flags2)) {
-		/* TASK_FROZEN -> TASK_{STOPPED,TRACED} */
-		bool ret = task_call_func(p, __set_task_special, NULL);
-		unlock_task_sighand(p, &flags2);
-		if (ret)
-			goto unlock;
-	}
+	if (task_call_func(p, __restore_freezer_state, NULL))
+		goto unlock;
 
 	wake_up_state(p, TASK_FROZEN);
 unlock:
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index f10587d1d481..52695c59d041 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -193,7 +193,7 @@ static u64 get_inode_sequence_number(struct inode *inode)
 /**
  * get_futex_key() - Get parameters which are the keys for a futex
  * @uaddr:	virtual address of the futex
- * @fshared:	false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
+ * @flags:	FLAGS_*
  * @key:	address where result is stored.
  * @rw:		mapping needs to be read/write (values: FUTEX_READ,
  *              FUTEX_WRITE)
@@ -217,14 +217,18 @@ static u64 get_inode_sequence_number(struct inode *inode)
  *
  * lock_page() might sleep, the caller should not hold a spinlock.
  */
-int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
+int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
 		  enum futex_access rw)
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
-	struct page *page, *tail;
+	struct page *page;
+	struct folio *folio;
 	struct address_space *mapping;
 	int err, ro = 0;
+	bool fshared;
+
+	fshared = flags & FLAGS_SHARED;
 
 	/*
 	 * The futex address must be "naturally" aligned.
@@ -248,7 +252,17 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
 	 *        but access_ok() should be faster than find_vma()
 	 */
 	if (!fshared) {
-		key->private.mm = mm;
+		/*
+		 * On no-MMU, shared futexes are treated as private, therefore
+		 * we must not include the current process in the key. Since
+		 * there is only one address space, the address is a unique key
+		 * on its own.
+		 */
+		if (IS_ENABLED(CONFIG_MMU))
+			key->private.mm = mm;
+		else
+			key->private.mm = NULL;
+
 		key->private.address = address;
 		return 0;
 	}
@@ -273,54 +287,52 @@ again:
 		err = 0;
 
 	/*
-	 * The treatment of mapping from this point on is critical. The page
-	 * lock protects many things but in this context the page lock
+	 * The treatment of mapping from this point on is critical. The folio
+	 * lock protects many things but in this context the folio lock
 	 * stabilizes mapping, prevents inode freeing in the shared
 	 * file-backed region case and guards against movement to swap cache.
 	 *
-	 * Strictly speaking the page lock is not needed in all cases being
-	 * considered here and page lock forces unnecessarily serialization
+	 * Strictly speaking the folio lock is not needed in all cases being
+	 * considered here and folio lock forces unnecessary serialization.
 	 * From this point on, mapping will be re-verified if necessary and
-	 * page lock will be acquired only if it is unavoidable
+	 * folio lock will be acquired only if it is unavoidable
 	 *
-	 * Mapping checks require the head page for any compound page so the
-	 * head page and mapping is looked up now. For anonymous pages, it
-	 * does not matter if the page splits in the future as the key is
-	 * based on the address. For filesystem-backed pages, the tail is
-	 * required as the index of the page determines the key. For
-	 * base pages, there is no tail page and tail == page.
+	 * Mapping checks require the folio so it is looked up now. For
+	 * anonymous pages, it does not matter if the folio is split
+	 * in the future as the key is based on the address. For
+	 * filesystem-backed pages, the precise page is required as the
+	 * index of the page determines the key.
 	 */
-	tail = page;
-	page = compound_head(page);
-	mapping = READ_ONCE(page->mapping);
+	folio = page_folio(page);
+	mapping = READ_ONCE(folio->mapping);
 
 	/*
-	 * If page->mapping is NULL, then it cannot be a PageAnon
+	 * If folio->mapping is NULL, then it cannot be an anonymous
 	 * page; but it might be the ZERO_PAGE or in the gate area or
 	 * in a special mapping (all cases which we are happy to fail);
 	 * or it may have been a good file page when get_user_pages_fast
 	 * found it, but truncated or holepunched or subjected to
-	 * invalidate_complete_page2 before we got the page lock (also
+	 * invalidate_complete_page2 before we got the folio lock (also
 	 * cases which we are happy to fail).  And we hold a reference,
 	 * so refcount care in invalidate_inode_page's remove_mapping
 	 * prevents drop_caches from setting mapping to NULL beneath us.
 	 *
 	 * The case we do have to guard against is when memory pressure made
 	 * shmem_writepage move it from filecache to swapcache beneath us:
-	 * an unlikely race, but we do need to retry for page->mapping.
+	 * an unlikely race, but we do need to retry for folio->mapping.
 	 */
 	if (unlikely(!mapping)) {
 		int shmem_swizzled;
 
 		/*
-		 * Page lock is required to identify which special case above
-		 * applies. If this is really a shmem page then the page lock
+		 * Folio lock is required to identify which special case above
+		 * applies. If this is really a shmem page then the folio lock
 		 * will prevent unexpected transitions.
 		 */
-		lock_page(page);
-		shmem_swizzled = PageSwapCache(page) || page->mapping;
-		unlock_page(page);
-		put_page(page);
+		folio_lock(folio);
+		shmem_swizzled = folio_test_swapcache(folio) || folio->mapping;
+		folio_unlock(folio);
+		folio_put(folio);
 
 		if (shmem_swizzled)
 			goto again;
@@ -331,14 +343,14 @@ again:
 	/*
 	 * Private mappings are handled in a simple way.
 	 *
-	 * If the futex key is stored on an anonymous page, then the associated
+	 * If the futex key is stored in anonymous memory, then the associated
 	 * object is the mm which is implicitly pinned by the calling process.
 	 *
 	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
 	 * it's a read-only handle, it's expected that futexes attach to
 	 * the object not the particular process.
 	 */
-	if (PageAnon(page)) {
+	if (folio_test_anon(folio)) {
 		/*
 		 * A RO anonymous page will never change and thus doesn't make
 		 * sense for futex operations.
@@ -357,10 +369,10 @@ again:
 
 		/*
 		 * The associated futex object in this case is the inode and
-		 * the page->mapping must be traversed. Ordinarily this should
-		 * be stabilised under page lock but it's not strictly
+		 * the folio->mapping must be traversed. Ordinarily this should
+		 * be stabilised under folio lock but it's not strictly
 		 * necessary in this case as we just want to pin the inode, not
-		 * update the radix tree or anything like that.
+		 * update i_pages or anything like that.
 		 *
 		 * The RCU read lock is taken as the inode is finally freed
 		 * under RCU. If the mapping still matches expectations then the
@@ -368,9 +380,9 @@ again:
 		 */
 		rcu_read_lock();
 
-		if (READ_ONCE(page->mapping) != mapping) {
+		if (READ_ONCE(folio->mapping) != mapping) {
 			rcu_read_unlock();
-			put_page(page);
+			folio_put(folio);
 
 			goto again;
 		}
@@ -378,19 +390,19 @@ again:
 		inode = READ_ONCE(mapping->host);
 		if (!inode) {
 			rcu_read_unlock();
-			put_page(page);
+			folio_put(folio);
 
 			goto again;
 		}
 
 		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
 		key->shared.i_seq = get_inode_sequence_number(inode);
-		key->shared.pgoff = page_to_pgoff(tail);
+		key->shared.pgoff = folio->index + folio_page_idx(folio, page);
 		rcu_read_unlock();
 	}
 
 out:
-	put_page(page);
+	folio_put(folio);
 	return err;
 }
 
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index b5379c0e6d6d..a06030a1a27b 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -5,6 +5,7 @@
 #include <linux/futex.h>
 #include <linux/rtmutex.h>
 #include <linux/sched/wake_q.h>
+#include <linux/compat.h>
 
 #ifdef CONFIG_PREEMPT_RT
 #include <linux/rcuwait.h>
@@ -16,17 +17,84 @@
  * Futex flags used to encode options to functions and preserve them across
  * restarts.
  */
+#define FLAGS_SIZE_8		0x0000
+#define FLAGS_SIZE_16		0x0001
+#define FLAGS_SIZE_32		0x0002
+#define FLAGS_SIZE_64		0x0003
+
+#define FLAGS_SIZE_MASK		0x0003
+
 #ifdef CONFIG_MMU
-# define FLAGS_SHARED		0x01
+# define FLAGS_SHARED		0x0010
 #else
 /*
  * NOMMU does not have per process address space. Let the compiler optimize
  * code away.
  */
-# define FLAGS_SHARED		0x00
+# define FLAGS_SHARED		0x0000
 #endif
-#define FLAGS_CLOCKRT		0x02
-#define FLAGS_HAS_TIMEOUT	0x04
+#define FLAGS_CLOCKRT		0x0020
+#define FLAGS_HAS_TIMEOUT	0x0040
+#define FLAGS_NUMA		0x0080
+#define FLAGS_STRICT		0x0100
+
+/* FUTEX_ to FLAGS_ */
+static inline unsigned int futex_to_flags(unsigned int op)
+{
+	unsigned int flags = FLAGS_SIZE_32;
+
+	if (!(op & FUTEX_PRIVATE_FLAG))
+		flags |= FLAGS_SHARED;
+
+	if (op & FUTEX_CLOCK_REALTIME)
+		flags |= FLAGS_CLOCKRT;
+
+	return flags;
+}
+
+/* FUTEX2_ to FLAGS_ */
+static inline unsigned int futex2_to_flags(unsigned int flags2)
+{
+	unsigned int flags = flags2 & FUTEX2_SIZE_MASK;
+
+	if (!(flags2 & FUTEX2_PRIVATE))
+		flags |= FLAGS_SHARED;
+
+	if (flags2 & FUTEX2_NUMA)
+		flags |= FLAGS_NUMA;
+
+	return flags;
+}
+
+static inline unsigned int futex_size(unsigned int flags)
+{
+	return 1 << (flags & FLAGS_SIZE_MASK);
+}
+
+static inline bool futex_flags_valid(unsigned int flags)
+{
+	/* Only 64bit futexes for 64bit code */
+	if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) {
+		if ((flags & FLAGS_SIZE_MASK) == FLAGS_SIZE_64)
+			return false;
+	}
+
+	/* Only 32bit futexes are implemented -- for now */
+	if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32)
+		return false;
+
+	return true;
+}
+
+static inline bool futex_validate_input(unsigned int flags, u64 val)
+{
+	int bits = 8 * futex_size(flags);
+
+	if (bits < 64 && (val >> bits))
+		return false;
+
+	return true;
+}
 
 #ifdef CONFIG_FAIL_FUTEX
 extern bool should_fail_futex(bool fshared);
@@ -116,7 +184,7 @@ enum futex_access {
 	FUTEX_WRITE
 };
 
-extern int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
+extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
 			 enum futex_access rw);
 
 extern struct hrtimer_sleeper *
@@ -260,10 +328,14 @@ extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32
 				 val, ktime_t *abs_time, u32 bitset, u32 __user
 				 *uaddr2);
 
-extern int futex_requeue(u32 __user *uaddr1, unsigned int flags,
-			 u32 __user *uaddr2, int nr_wake, int nr_requeue,
+extern int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
+			 u32 __user *uaddr2, unsigned int flags2,
+			 int nr_wake, int nr_requeue,
 			 u32 *cmpval, int requeue_pi);
 
+extern int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+			struct hrtimer_sleeper *to, u32 bitset);
+
 extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
 		      ktime_t *abs_time, u32 bitset);
 
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index ce2889f12375..90e5197f4e56 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include <linux/slab.h>
+#include <linux/sched/rt.h>
 #include <linux/sched/task.h>
 
 #include "futex.h"
@@ -610,29 +611,16 @@ int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
 /*
  * Caller must hold a reference on @pi_state.
  */
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
+static int wake_futex_pi(u32 __user *uaddr, u32 uval,
+			 struct futex_pi_state *pi_state,
+			 struct rt_mutex_waiter *top_waiter)
 {
-	struct rt_mutex_waiter *top_waiter;
 	struct task_struct *new_owner;
 	bool postunlock = false;
 	DEFINE_RT_WAKE_Q(wqh);
 	u32 curval, newval;
 	int ret = 0;
 
-	top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
-	if (WARN_ON_ONCE(!top_waiter)) {
-		/*
-		 * As per the comment in futex_unlock_pi() this should not happen.
-		 *
-		 * When this happens, give up our locks and try again, giving
-		 * the futex_lock_pi() instance time to complete, either by
-		 * waiting on the rtmutex or removing itself from the futex
-		 * queue.
-		 */
-		ret = -EAGAIN;
-		goto out_unlock;
-	}
-
 	new_owner = top_waiter->task;
 
 	/*
@@ -945,7 +933,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
 	to = futex_setup_timer(time, &timeout, flags, 0);
 
 retry:
-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
+	ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -1002,6 +990,12 @@ retry_private:
 		goto no_block;
 	}
 
+	/*
+	 * Must be done before we enqueue the waiter, here is unfortunately
+	 * under the hb lock, but that *should* work because it does nothing.
+	 */
+	rt_mutex_pre_schedule();
+
 	rt_mutex_init_waiter(&rt_waiter);
 
 	/*
@@ -1039,19 +1033,37 @@ retry_private:
 	ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
 
 cleanup:
-	spin_lock(q.lock_ptr);
 	/*
 	 * If we failed to acquire the lock (deadlock/signal/timeout), we must
-	 * first acquire the hb->lock before removing the lock from the
-	 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
-	 * lists consistent.
+	 * unwind the above, however we cannot lock hb->lock because
+	 * rt_mutex already has a waiter enqueued and hb->lock can itself try
+	 * and enqueue an rt_waiter through rtlock.
+	 *
+	 * Doing the cleanup without holding hb->lock can cause inconsistent
+	 * state between hb and pi_state, but only in the direction of not
+	 * seeing a waiter that is leaving.
+	 *
+	 * See futex_unlock_pi(), it deals with this inconsistency.
+	 *
+	 * There be dragons here, since we must deal with the inconsistency on
+	 * the way out (here), it is impossible to detect/warn about the race
+	 * the other way around (missing an incoming waiter).
 	 *
-	 * In particular; it is important that futex_unlock_pi() can not
-	 * observe this inconsistency.
+	 * What could possibly go wrong...
 	 */
 	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
 		ret = 0;
 
+	/*
+	 * Now that the rt_waiter has been dequeued, it is safe to use
+	 * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
+	 * the pi_state.
+	 */
+	spin_lock(q.lock_ptr);
+	/*
+	 * Waiter is unqueued.
+	 */
+	rt_mutex_post_schedule();
 no_block:
 	/*
 	 * Fixup the pi_state owner and possibly acquire the lock if we
@@ -1117,7 +1129,7 @@ retry:
 	if ((uval & FUTEX_TID_MASK) != vpid)
 		return -EPERM;
 
-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
+	ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
 	if (ret)
 		return ret;
 
@@ -1132,6 +1144,7 @@ retry:
 	top_waiter = futex_top_waiter(hb, &key);
 	if (top_waiter) {
 		struct futex_pi_state *pi_state = top_waiter->pi_state;
+		struct rt_mutex_waiter *rt_waiter;
 
 		ret = -EINVAL;
 		if (!pi_state)
@@ -1144,22 +1157,39 @@ retry:
 		if (pi_state->owner != current)
 			goto out_unlock;
 
-		get_pi_state(pi_state);
 		/*
 		 * By taking wait_lock while still holding hb->lock, we ensure
-		 * there is no point where we hold neither; and therefore
-		 * wake_futex_p() must observe a state consistent with what we
-		 * observed.
+		 * there is no point where we hold neither; and thereby
+		 * wake_futex_pi() must observe any new waiters.
+		 *
+		 * Since the cleanup: case in futex_lock_pi() removes the
+		 * rt_waiter without holding hb->lock, it is possible for
+		 * wake_futex_pi() to not find a waiter while the above does,
+		 * in this case the waiter is on the way out and it can be
+		 * ignored.
 		 *
 		 * In particular; this forces __rt_mutex_start_proxy() to
 		 * complete such that we're guaranteed to observe the
-		 * rt_waiter. Also see the WARN in wake_futex_pi().
+		 * rt_waiter.
 		 */
 		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+		/*
+		 * Futex vs rt_mutex waiter state -- if there are no rt_mutex
+		 * waiters even though futex thinks there are, then the waiter
+		 * is leaving and the uncontended path is safe to take.
+		 */
+		rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
+		if (!rt_waiter) {
+			raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+			goto do_uncontended;
+		}
+
+		get_pi_state(pi_state);
 		spin_unlock(&hb->lock);
 
 		/* drops pi_state->pi_mutex.wait_lock */
-		ret = wake_futex_pi(uaddr, uval, pi_state);
+		ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter);
 
 		put_pi_state(pi_state);
 
@@ -1187,6 +1217,7 @@ retry:
 		return ret;
 	}
 
+do_uncontended:
 	/*
 	 * We have no kernel internal state, i.e. no waiters in the
 	 * kernel. Waiters which are about to queue themselves are stuck
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index cba8b1a6a4cc..16a3645bd786 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -269,7 +269,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
 			   union futex_key *key2, struct futex_pi_state **ps,
 			   struct task_struct **exiting, int set_waiters)
 {
-	struct futex_q *top_waiter = NULL;
+	struct futex_q *top_waiter;
 	u32 curval;
 	int ret;
 
@@ -346,8 +346,9 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
 /**
  * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
  * @uaddr1:	source futex user address
- * @flags:	futex flags (FLAGS_SHARED, etc.)
+ * @flags1:	futex flags (FLAGS_SHARED, etc.)
  * @uaddr2:	target futex user address
+ * @flags2:	futex flags (FLAGS_SHARED, etc.)
  * @nr_wake:	number of waiters to wake (must be 1 for requeue_pi)
  * @nr_requeue:	number of waiters to requeue (0-INT_MAX)
  * @cmpval:	@uaddr1 expected value (or %NULL)
@@ -361,7 +362,8 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
  *  - >=0 - on success, the number of tasks requeued or woken;
  *  -  <0 - on error
  */
-int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
+int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
+		  u32 __user *uaddr2, unsigned int flags2,
 		  int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi)
 {
 	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
@@ -424,10 +426,10 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 	}
 
 retry:
-	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+	ret = get_futex_key(uaddr1, flags1, &key1, FUTEX_READ);
 	if (unlikely(ret != 0))
 		return ret;
-	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
+	ret = get_futex_key(uaddr2, flags2, &key2,
 			    requeue_pi ? FUTEX_WRITE : FUTEX_READ);
 	if (unlikely(ret != 0))
 		return ret;
@@ -459,7 +461,7 @@ retry_private:
 			if (ret)
 				return ret;
 
-			if (!(flags & FLAGS_SHARED))
+			if (!(flags1 & FLAGS_SHARED))
 				goto retry_private;
 
 			goto retry;
@@ -789,7 +791,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	 */
 	rt_mutex_init_waiter(&rt_waiter);
 
-	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+	ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -850,11 +852,13 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 		pi_mutex = &q.pi_state->pi_mutex;
 		ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
 
-		/* Current is not longer pi_blocked_on */
-		spin_lock(q.lock_ptr);
+		/*
+		 * See futex_lock_pi()'s cleanup: comment.
+		 */
 		if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
 			ret = 0;
 
+		spin_lock(q.lock_ptr);
 		debug_rt_mutex_free_waiter(&rt_waiter);
 		/*
 		 * Fixup the pi_state owner and possibly acquire the lock if we
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index a8074079b09e..8200d86d30e1 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -1,6 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
-#include <linux/compat.h>
 #include <linux/syscalls.h>
 #include <linux/time_namespace.h>
 
@@ -85,15 +84,12 @@ err_unlock:
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		u32 __user *uaddr2, u32 val2, u32 val3)
 {
+	unsigned int flags = futex_to_flags(op);
 	int cmd = op & FUTEX_CMD_MASK;
-	unsigned int flags = 0;
 
-	if (!(op & FUTEX_PRIVATE_FLAG))
-		flags |= FLAGS_SHARED;
-
-	if (op & FUTEX_CLOCK_REALTIME) {
-		flags |= FLAGS_CLOCKRT;
-		if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI &&
+	if (flags & FLAGS_CLOCKRT) {
+		if (cmd != FUTEX_WAIT_BITSET &&
+		    cmd != FUTEX_WAIT_REQUEUE_PI &&
 		    cmd != FUTEX_LOCK_PI2)
 			return -ENOSYS;
 	}
@@ -110,9 +106,9 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 	case FUTEX_WAKE_BITSET:
 		return futex_wake(uaddr, flags, val, val3);
 	case FUTEX_REQUEUE:
-		return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
+		return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0);
 	case FUTEX_CMP_REQUEUE:
-		return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
+		return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0);
 	case FUTEX_WAKE_OP:
 		return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
 	case FUTEX_LOCK_PI:
@@ -129,7 +125,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
 					     uaddr2);
 	case FUTEX_CMP_REQUEUE_PI:
-		return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
+		return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 1);
 	}
 	return -ENOSYS;
 }
@@ -183,8 +179,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 	return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
 }
 
-/* Mask of available flags for each futex in futex_waitv list */
-#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG)
+#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE)
 
 /**
  * futex_parse_waitv - Parse a waitv array from userspace
@@ -202,16 +197,22 @@ static int futex_parse_waitv(struct futex_vector *futexv,
 	unsigned int i;
 
 	for (i = 0; i < nr_futexes; i++) {
+		unsigned int flags;
+
 		if (copy_from_user(&aux, &uwaitv[i], sizeof(aux)))
 			return -EFAULT;
 
-		if ((aux.flags & ~FUTEXV_WAITER_MASK) || aux.__reserved)
+		if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved)
 			return -EINVAL;
 
-		if (!(aux.flags & FUTEX_32))
+		flags = futex2_to_flags(aux.flags);
+		if (!futex_flags_valid(flags))
 			return -EINVAL;
 
-		futexv[i].w.flags = aux.flags;
+		if (!futex_validate_input(flags, aux.val))
+			return -EINVAL;
+
+		futexv[i].w.flags = flags;
 		futexv[i].w.val = aux.val;
 		futexv[i].w.uaddr = aux.uaddr;
 		futexv[i].q = futex_q_init;
@@ -220,6 +221,46 @@ static int futex_parse_waitv(struct futex_vector *futexv,
 	return 0;
 }
 
+static int futex2_setup_timeout(struct __kernel_timespec __user *timeout,
+				clockid_t clockid, struct hrtimer_sleeper *to)
+{
+	int flag_clkid = 0, flag_init = 0;
+	struct timespec64 ts;
+	ktime_t time;
+	int ret;
+
+	if (!timeout)
+		return 0;
+
+	if (clockid == CLOCK_REALTIME) {
+		flag_clkid = FLAGS_CLOCKRT;
+		flag_init = FUTEX_CLOCK_REALTIME;
+	}
+
+	if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
+		return -EINVAL;
+
+	if (get_timespec64(&ts, timeout))
+		return -EFAULT;
+
+	/*
+	 * Since there's no opcode for futex_waitv, use
+	 * FUTEX_WAIT_BITSET that uses absolute timeout as well
+	 */
+	ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
+	if (ret)
+		return ret;
+
+	futex_setup_timer(&time, to, flag_clkid, 0);
+	return 0;
+}
+
+static inline void futex2_destroy_timeout(struct hrtimer_sleeper *to)
+{
+	hrtimer_cancel(&to->timer);
+	destroy_hrtimer_on_stack(&to->timer);
+}
+
 /**
  * sys_futex_waitv - Wait on a list of futexes
  * @waiters:    List of futexes to wait on
@@ -249,8 +290,6 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
 {
 	struct hrtimer_sleeper to;
 	struct futex_vector *futexv;
-	struct timespec64 ts;
-	ktime_t time;
 	int ret;
 
 	/* This syscall supports no flags for now */
@@ -260,30 +299,8 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
 	if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
 		return -EINVAL;
 
-	if (timeout) {
-		int flag_clkid = 0, flag_init = 0;
-
-		if (clockid == CLOCK_REALTIME) {
-			flag_clkid = FLAGS_CLOCKRT;
-			flag_init = FUTEX_CLOCK_REALTIME;
-		}
-
-		if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
-			return -EINVAL;
-
-		if (get_timespec64(&ts, timeout))
-			return -EFAULT;
-
-		/*
-		 * Since there's no opcode for futex_waitv, use
-		 * FUTEX_WAIT_BITSET that uses absolute timeout as well
-		 */
-		ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
-		if (ret)
-			return ret;
-
-		futex_setup_timer(&time, &to, flag_clkid, 0);
-	}
+	if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
+		return ret;
 
 	futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL);
 	if (!futexv) {
@@ -298,13 +315,125 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
 	kfree(futexv);
 
 destroy_timer:
-	if (timeout) {
-		hrtimer_cancel(&to.timer);
-		destroy_hrtimer_on_stack(&to.timer);
-	}
+	if (timeout)
+		futex2_destroy_timeout(&to);
+	return ret;
+}
+
+/*
+ * sys_futex_wake - Wake a number of futexes
+ * @uaddr:	Address of the futex(es) to wake
+ * @mask:	bitmask
+ * @nr:		Number of the futexes to wake
+ * @flags:	FUTEX2 flags
+ *
+ * Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the
+ * futex2 family of calls.
+ */
+
+SYSCALL_DEFINE4(futex_wake,
+		void __user *, uaddr,
+		unsigned long, mask,
+		int, nr,
+		unsigned int, flags)
+{
+	if (flags & ~FUTEX2_VALID_MASK)
+		return -EINVAL;
+
+	flags = futex2_to_flags(flags);
+	if (!futex_flags_valid(flags))
+		return -EINVAL;
+
+	if (!futex_validate_input(flags, mask))
+		return -EINVAL;
+
+	return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask);
+}
+
+/*
+ * sys_futex_wait - Wait on a futex
+ * @uaddr:	Address of the futex to wait on
+ * @val:	Value of @uaddr
+ * @mask:	bitmask
+ * @flags:	FUTEX2 flags
+ * @timeout:	Optional absolute timeout
+ * @clockid:	Clock to be used for the timeout, realtime or monotonic
+ *
+ * Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the
+ * futex2 family of calls.
+ */
+
+SYSCALL_DEFINE6(futex_wait,
+		void __user *, uaddr,
+		unsigned long, val,
+		unsigned long, mask,
+		unsigned int, flags,
+		struct __kernel_timespec __user *, timeout,
+		clockid_t, clockid)
+{
+	struct hrtimer_sleeper to;
+	int ret;
+
+	if (flags & ~FUTEX2_VALID_MASK)
+		return -EINVAL;
+
+	flags = futex2_to_flags(flags);
+	if (!futex_flags_valid(flags))
+		return -EINVAL;
+
+	if (!futex_validate_input(flags, val) ||
+	    !futex_validate_input(flags, mask))
+		return -EINVAL;
+
+	if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
+		return ret;
+
+	ret = __futex_wait(uaddr, flags, val, timeout ? &to : NULL, mask);
+
+	if (timeout)
+		futex2_destroy_timeout(&to);
+
 	return ret;
 }
 
+/*
+ * sys_futex_requeue - Requeue a waiter from one futex to another
+ * @waiters:	array describing the source and destination futex
+ * @flags:	unused
+ * @nr_wake:	number of futexes to wake
+ * @nr_requeue:	number of futexes to requeue
+ *
+ * Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the
+ * futex2 family of calls.
+ */
+
+SYSCALL_DEFINE4(futex_requeue,
+		struct futex_waitv __user *, waiters,
+		unsigned int, flags,
+		int, nr_wake,
+		int, nr_requeue)
+{
+	struct futex_vector futexes[2];
+	u32 cmpval;
+	int ret;
+
+	if (flags)
+		return -EINVAL;
+
+	if (!waiters)
+		return -EINVAL;
+
+	ret = futex_parse_waitv(futexes, waiters, 2);
+	if (ret)
+		return ret;
+
+	cmpval = futexes[0].w.val;
+
+	return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags,
+			     u64_to_user_ptr(futexes[1].w.uaddr), futexes[1].w.flags,
+			     nr_wake, nr_requeue, &cmpval, 0);
+}
+
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE2(set_robust_list,
 		struct compat_robust_list_head __user *, head,
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index ba01b9408203..37860f794bf7 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -145,16 +145,19 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
 	union futex_key key = FUTEX_KEY_INIT;
-	int ret;
 	DEFINE_WAKE_Q(wake_q);
+	int ret;
 
 	if (!bitset)
 		return -EINVAL;
 
-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
+	ret = get_futex_key(uaddr, flags, &key, FUTEX_READ);
 	if (unlikely(ret != 0))
 		return ret;
 
+	if ((flags & FLAGS_STRICT) && !nr_wake)
+		return 0;
+
 	hb = futex_hash(&key);
 
 	/* Make sure we really have tasks to wakeup */
@@ -245,10 +248,10 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 	DEFINE_WAKE_Q(wake_q);
 
 retry:
-	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+	ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ);
 	if (unlikely(ret != 0))
 		return ret;
-	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+	ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
 	if (unlikely(ret != 0))
 		return ret;
 
@@ -419,11 +422,11 @@ static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *wo
 	 */
 retry:
 	for (i = 0; i < count; i++) {
-		if ((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry)
+		if (!(vs[i].w.flags & FLAGS_SHARED) && retry)
 			continue;
 
 		ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr),
-				    !(vs[i].w.flags & FUTEX_PRIVATE_FLAG),
+				    vs[i].w.flags,
 				    &vs[i].q.key, FUTEX_READ);
 
 		if (unlikely(ret))
@@ -435,7 +438,7 @@ retry:
 	for (i = 0; i < count; i++) {
 		u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr;
 		struct futex_q *q = &vs[i].q;
-		u32 val = (u32)vs[i].w.val;
+		u32 val = vs[i].w.val;
 
 		hb = futex_q_lock(q);
 		ret = futex_get_value_locked(&uval, uaddr);
@@ -599,7 +602,7 @@ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
 	 * while the syscall executes.
 	 */
 retry:
-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
+	ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ);
 	if (unlikely(ret != 0))
 		return ret;
 
@@ -629,20 +632,18 @@ retry_private:
 	return ret;
 }
 
-int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
+int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+		 struct hrtimer_sleeper *to, u32 bitset)
 {
-	struct hrtimer_sleeper timeout, *to;
-	struct restart_block *restart;
-	struct futex_hash_bucket *hb;
 	struct futex_q q = futex_q_init;
+	struct futex_hash_bucket *hb;
 	int ret;
 
 	if (!bitset)
 		return -EINVAL;
+
 	q.bitset = bitset;
 
-	to = futex_setup_timer(abs_time, &timeout, flags,
-			       current->timer_slack_ns);
 retry:
 	/*
 	 * Prepare to wait on uaddr. On success, it holds hb->lock and q
@@ -650,18 +651,17 @@ retry:
 	 */
 	ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
 	if (ret)
-		goto out;
+		return ret;
 
 	/* futex_queue and wait for wakeup, timeout, or a signal. */
 	futex_wait_queue(hb, &q, to);
 
 	/* If we were woken (and unqueued), we succeeded, whatever. */
-	ret = 0;
 	if (!futex_unqueue(&q))
-		goto out;
-	ret = -ETIMEDOUT;
+		return 0;
+
 	if (to && !to->task)
-		goto out;
+		return -ETIMEDOUT;
 
 	/*
 	 * We expect signal_pending(current), but we might be the
@@ -670,24 +670,38 @@ retry:
 	if (!signal_pending(current))
 		goto retry;
 
-	ret = -ERESTARTSYS;
-	if (!abs_time)
-		goto out;
+	return -ERESTARTSYS;
+}
 
-	restart = &current->restart_block;
-	restart->futex.uaddr = uaddr;
-	restart->futex.val = val;
-	restart->futex.time = *abs_time;
-	restart->futex.bitset = bitset;
-	restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
+int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
+{
+	struct hrtimer_sleeper timeout, *to;
+	struct restart_block *restart;
+	int ret;
 
-	ret = set_restart_fn(restart, futex_wait_restart);
+	to = futex_setup_timer(abs_time, &timeout, flags,
+			       current->timer_slack_ns);
 
-out:
-	if (to) {
-		hrtimer_cancel(&to->timer);
-		destroy_hrtimer_on_stack(&to->timer);
+	ret = __futex_wait(uaddr, flags, val, to, bitset);
+
+	/* No timeout, nothing to clean up. */
+	if (!to)
+		return ret;
+
+	hrtimer_cancel(&to->timer);
+	destroy_hrtimer_on_stack(&to->timer);
+
+	if (ret == -ERESTARTSYS) {
+		restart = &current->restart_block;
+		restart->futex.uaddr = uaddr;
+		restart->futex.val = val;
+		restart->futex.time = *abs_time;
+		restart->futex.bitset = bitset;
+		restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
+
+		return set_restart_fn(restart, futex_wait_restart);
 	}
+
 	return ret;
 }
 
diff --git a/kernel/groups.c b/kernel/groups.c
index 9aaed2a31073..9b43da22647d 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -19,7 +19,7 @@ struct group_info *groups_alloc(int gidsetsize)
 	if (!gi)
 		return NULL;
 
-	atomic_set(&gi->usage, 1);
+	refcount_set(&gi->usage, 1);
 	gi->ngroups = gidsetsize;
 	return gi;
 }
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 5971a66be034..aae0402507ed 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -121,7 +121,6 @@ static const struct irq_bit_descr irqdata_states[] = {
 	BIT_MASK_DESCR(IRQD_AFFINITY_ON_ACTIVATE),
 	BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
 	BIT_MASK_DESCR(IRQD_CAN_RESERVE),
-	BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK),
 
 	BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
 
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index c653cd31548d..d39a40bc542b 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -219,11 +219,15 @@ void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
 			   int num_ct, unsigned int irq_base,
 			   void __iomem *reg_base, irq_flow_handler_t handler)
 {
+	struct irq_chip_type *ct = gc->chip_types;
+	int i;
+
 	raw_spin_lock_init(&gc->lock);
 	gc->num_ct = num_ct;
 	gc->irq_base = irq_base;
 	gc->reg_base = reg_base;
-	gc->chip_types->chip.name = name;
+	for (i = 0; i < num_ct; i++)
+		ct[i].chip.name = name;
 	gc->chip_types->handler = handler;
 }
 
@@ -544,21 +548,34 @@ EXPORT_SYMBOL_GPL(irq_setup_alt_chip);
 void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
 			     unsigned int clr, unsigned int set)
 {
-	unsigned int i = gc->irq_base;
+	unsigned int i, virq;
 
 	raw_spin_lock(&gc_lock);
 	list_del(&gc->list);
 	raw_spin_unlock(&gc_lock);
 
-	for (; msk; msk >>= 1, i++) {
+	for (i = 0; msk; msk >>= 1, i++) {
 		if (!(msk & 0x01))
 			continue;
 
+		/*
+		 * Interrupt domain based chips store the base hardware
+		 * interrupt number in gc::irq_base. Otherwise gc::irq_base
+		 * contains the base Linux interrupt number.
+		 */
+		if (gc->domain) {
+			virq = irq_find_mapping(gc->domain, gc->irq_base + i);
+			if (!virq)
+				continue;
+		} else {
+			virq = gc->irq_base + i;
+		}
+
 		/* Remove handler first. That will mask the irq line */
-		irq_set_handler(i, NULL);
-		irq_set_chip(i, &no_irq_chip);
-		irq_set_chip_data(i, NULL);
-		irq_modify_status(i, clr, set);
+		irq_set_handler(virq, NULL);
+		irq_set_chip(virq, &no_irq_chip);
+		irq_set_chip_data(virq, NULL);
+		irq_modify_status(virq, clr, set);
 	}
 }
 EXPORT_SYMBOL_GPL(irq_remove_generic_chip);
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 1698e77645ac..75d0ae490e29 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -466,16 +466,16 @@ unsigned int irq_matrix_reserved(struct irq_matrix *m)
 }
 
 /**
- * irq_matrix_allocated - Get the number of allocated irqs on the local cpu
+ * irq_matrix_allocated - Get the number of allocated non-managed irqs on the local CPU
  * @m:		Pointer to the matrix to search
  *
- * This returns number of allocated irqs
+ * This returns number of allocated non-managed interrupts.
  */
 unsigned int irq_matrix_allocated(struct irq_matrix *m)
 {
 	struct cpumap *cm = this_cpu_ptr(m->maps);
 
-	return cm->allocated;
+	return cm->allocated - cm->managed_allocated;
 }
 
 #ifdef CONFIG_GENERIC_IRQ_DEBUGFS
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index b4c31a5c1147..79b4a58ba9c3 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -1204,7 +1204,6 @@ static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc,
 
 #define VIRQ_CAN_RESERVE	0x01
 #define VIRQ_ACTIVATE		0x02
-#define VIRQ_NOMASK_QUIRK	0x04
 
 static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflags)
 {
@@ -1213,8 +1212,6 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag
 
 	if (!(vflags & VIRQ_CAN_RESERVE)) {
 		irqd_clr_can_reserve(irqd);
-		if (vflags & VIRQ_NOMASK_QUIRK)
-			irqd_set_msi_nomask_quirk(irqd);
 
 		/*
 		 * If the interrupt is managed but no CPU is available to
@@ -1275,15 +1272,8 @@ static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain
 	 * Interrupt can use a reserved vector and will not occupy
 	 * a real device vector until the interrupt is requested.
 	 */
-	if (msi_check_reservation_mode(domain, info, dev)) {
+	if (msi_check_reservation_mode(domain, info, dev))
 		vflags |= VIRQ_CAN_RESERVE;
-		/*
-		 * MSI affinity setting requires a special quirk (X86) when
-		 * reservation mode is active.
-		 */
-		if (info->flags & MSI_FLAG_NOMASK_QUIRK)
-			vflags |= VIRQ_NOMASK_QUIRK;
-	}
 
 	xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
 		if (!msi_desc_match(desc, MSI_DESC_NOTASSOCIATED))
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 5353edfad8e1..b0639f21041f 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -64,8 +64,10 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx)
 	struct file *file;
 
 	rcu_read_lock();
-	file = task_lookup_fd_rcu(task, idx);
+	file = task_lookup_fdget_rcu(task, idx);
 	rcu_read_unlock();
+	if (file)
+		fput(file);
 
 	return file;
 }
diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c
index fa2c2f951c6b..e68d82099558 100644
--- a/kernel/locking/lock_events.c
+++ b/kernel/locking/lock_events.c
@@ -146,7 +146,7 @@ static int __init init_lockevent_counts(void)
 	struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL);
 	int i;
 
-	if (!d_counts)
+	if (IS_ERR(d_counts))
 		goto out;
 
 	/*
@@ -159,14 +159,14 @@ static int __init init_lockevent_counts(void)
 	for (i = 0; i < lockevent_num; i++) {
 		if (skip_lockevent(lockevent_names[i]))
 			continue;
-		if (!debugfs_create_file(lockevent_names[i], 0400, d_counts,
-					 (void *)(long)i, &fops_lockevent))
+		if (IS_ERR(debugfs_create_file(lockevent_names[i], 0400, d_counts,
+					 (void *)(long)i, &fops_lockevent)))
 			goto fail_undo;
 	}
 
-	if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200,
+	if (IS_ERR(debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200,
 				 d_counts, (void *)(long)LOCKEVENT_reset_cnts,
-				 &fops_lockevent))
+				 &fops_lockevent)))
 		goto fail_undo;
 
 	return 0;
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 15fdc7fa5c68..e2bfb1db589d 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -440,7 +440,7 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr)
 
 static void seq_time(struct seq_file *m, s64 time)
 {
-	char num[15];
+	char num[22];
 
 	snprint_time(num, sizeof(num), time);
 	seq_printf(m, " %14s", num);
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 270c7f80ce84..69d3cd2cfc3b 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -33,21 +33,23 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
 
-torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads");
-torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads");
+torture_param(int, acq_writer_lim, 0, "Write_acquisition time limit (jiffies).");
+torture_param(int, call_rcu_chains, 0, "Self-propagate call_rcu() chains during test (0=disable).");
 torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable");
+torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)");
+torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads");
+torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads");
 torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
 torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable");
+torture_param(int, rt_boost, 2,
+		   "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types.");
+torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens.");
 torture_param(int, shuffle_interval, 3, "Number of jiffies between shuffles, 0=disable");
 torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable.");
 torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s");
 torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
-torture_param(int, rt_boost, 2,
-		   "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types.");
-torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens.");
-torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority");
 torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
-torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)");
+torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority");
 /* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */
 #define MAX_NESTED_LOCKS 8
 
@@ -56,6 +58,55 @@ module_param(torture_type, charp, 0444);
 MODULE_PARM_DESC(torture_type,
 		 "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)");
 
+static cpumask_var_t bind_readers; // Bind the readers to the specified set of CPUs.
+static cpumask_var_t bind_writers; // Bind the writers to the specified set of CPUs.
+
+// Parse a cpumask kernel parameter.  If there are more users later on,
+// this might need to go to a more central location.  On failure the
+// tasks end up unbound: a bad CPU list sets every CPU in the mask, and
+// an allocation failure leaves the mask unavailable to cpumask_nonempty().
+static int param_set_cpumask(const char *val, const struct kernel_param *kp)
+{
+	cpumask_var_t *cm_bind = kp->arg;
+	int ret;
+
+	if (!alloc_cpumask_var(cm_bind, GFP_KERNEL)) {
+		// No usable mask here, so it must not reach cpumask_setall().
+		pr_warn("%s: Out of memory, all CPUs set\n", kp->name);
+		return -ENOMEM;
+	}
+	ret = cpulist_parse(val, *cm_bind);
+	if (!ret)
+		return ret;
+	// Unparseable list: fall back to all CPUs, but still report the error.
+	pr_warn("%s: Bad CPU range, all CPUs set\n", kp->name);
+	cpumask_setall(*cm_bind);
+	return ret;
+}
+
+// Output a cpumask kernel parameter into buffer in CPU-list format.
+static int param_get_cpumask(char *buffer, const struct kernel_param *kp)
+{
+	cpumask_var_t *cm_bind = kp->arg;
+
+	return sprintf(buffer, "%*pbl", cpumask_pr_args(*cm_bind)); // Returns sprintf()'s length.
+}
+
+static bool cpumask_nonempty(cpumask_var_t mask)
+{
+	return cpumask_available(mask) && !cpumask_empty(mask); // Usable and has a CPU set.
+}
+
+static const struct kernel_param_ops lt_bind_ops = {
+	.set = param_set_cpumask,
+	.get = param_get_cpumask,
+};
+
+module_param_cb(bind_readers, &lt_bind_ops, &bind_readers, 0644);
+module_param_cb(bind_writers, &lt_bind_ops, &bind_writers, 0644);
+
+long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask);
+
 static struct task_struct *stats_task;
 static struct task_struct **writer_tasks;
 static struct task_struct **reader_tasks;
@@ -69,6 +120,12 @@ struct lock_stress_stats {
 	long n_lock_acquired;
 };
 
+struct call_rcu_chain {
+	struct rcu_head crc_rh;	// Callback head that call_rcu_chain_cb() re-posts.
+	bool crc_stop;		// Set to true to make the callback stop re-posting.
+};
+static struct call_rcu_chain *call_rcu_chain; // Array of call_rcu_chains entries.
+
 /* Forward reference. */
 static void lock_torture_cleanup(void);
 
@@ -116,12 +173,9 @@ static int torture_lock_busted_write_lock(int tid __maybe_unused)
 
 static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
 {
-	const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
-
 	/* We want a long delay occasionally to force massive contention.  */
-	if (!(torture_random(trsp) %
-	      (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
-		mdelay(longdelay_ms);
+	if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+		mdelay(long_hold);
 	if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
 		torture_preempt_schedule();  /* Allow test to be preempted. */
 }
@@ -194,15 +248,14 @@ __acquires(torture_spinlock)
 static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
 {
 	const unsigned long shortdelay_us = 2;
-	const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
 	unsigned long j;
 
 	/* We want a short delay mostly to emulate likely code, and
 	 * we want a long delay occasionally to force massive contention.
 	 */
-	if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) {
+	if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) {
 		j = jiffies;
-		mdelay(longdelay_ms);
+		mdelay(long_hold);
 		pr_alert("%s: delay = %lu jiffies.\n", __func__, jiffies - j);
 	}
 	if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 200 * shortdelay_us)))
@@ -320,14 +373,12 @@ __acquires(torture_rwlock)
 static void torture_rwlock_write_delay(struct torture_random_state *trsp)
 {
 	const unsigned long shortdelay_us = 2;
-	const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
 
 	/* We want a short delay mostly to emulate likely code, and
 	 * we want a long delay occasionally to force massive contention.
 	 */
-	if (!(torture_random(trsp) %
-	      (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
-		mdelay(longdelay_ms);
+	if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+		mdelay(long_hold);
 	else
 		udelay(shortdelay_us);
 }
@@ -348,14 +399,12 @@ __acquires(torture_rwlock)
 static void torture_rwlock_read_delay(struct torture_random_state *trsp)
 {
 	const unsigned long shortdelay_us = 10;
-	const unsigned long longdelay_ms = 100;
 
 	/* We want a short delay mostly to emulate likely code, and
 	 * we want a long delay occasionally to force massive contention.
 	 */
-	if (!(torture_random(trsp) %
-	      (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
-		mdelay(longdelay_ms);
+	if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold)))
+		mdelay(long_hold);
 	else
 		udelay(shortdelay_us);
 }
@@ -453,12 +502,9 @@ __acquires(torture_mutex)
 
 static void torture_mutex_delay(struct torture_random_state *trsp)
 {
-	const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
-
 	/* We want a long delay occasionally to force massive contention.  */
-	if (!(torture_random(trsp) %
-	      (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
-		mdelay(longdelay_ms * 5);
+	if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+		mdelay(long_hold * 5);
 	if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
 		torture_preempt_schedule();  /* Allow test to be preempted. */
 }
@@ -626,15 +672,13 @@ __acquires(torture_rtmutex)
 static void torture_rtmutex_delay(struct torture_random_state *trsp)
 {
 	const unsigned long shortdelay_us = 2;
-	const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
 
 	/*
 	 * We want a short delay mostly to emulate likely code, and
 	 * we want a long delay occasionally to force massive contention.
 	 */
-	if (!(torture_random(trsp) %
-	      (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
-		mdelay(longdelay_ms);
+	if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+		mdelay(long_hold);
 	if (!(torture_random(trsp) %
 	      (cxt.nrealwriters_stress * 200 * shortdelay_us)))
 		udelay(shortdelay_us);
@@ -691,12 +735,9 @@ __acquires(torture_rwsem)
 
 static void torture_rwsem_write_delay(struct torture_random_state *trsp)
 {
-	const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
-
 	/* We want a long delay occasionally to force massive contention.  */
-	if (!(torture_random(trsp) %
-	      (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
-		mdelay(longdelay_ms * 10);
+	if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+		mdelay(long_hold * 10);
 	if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
 		torture_preempt_schedule();  /* Allow test to be preempted. */
 }
@@ -716,14 +757,11 @@ __acquires(torture_rwsem)
 
 static void torture_rwsem_read_delay(struct torture_random_state *trsp)
 {
-	const unsigned long longdelay_ms = 100;
-
 	/* We want a long delay occasionally to force massive contention.  */
-	if (!(torture_random(trsp) %
-	      (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
-		mdelay(longdelay_ms * 2);
+	if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold)))
+		mdelay(long_hold * 2);
 	else
-		mdelay(longdelay_ms / 2);
+		mdelay(long_hold / 2);
 	if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000)))
 		torture_preempt_schedule();  /* Allow test to be preempted. */
 }
@@ -803,11 +841,13 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = {
  */
 static int lock_torture_writer(void *arg)
 {
+	unsigned long j;
+	unsigned long j1;
+	u32 lockset_mask;
 	struct lock_stress_stats *lwsp = arg;
-	int tid = lwsp - cxt.lwsa;
 	DEFINE_TORTURE_RANDOM(rand);
-	u32 lockset_mask;
 	bool skip_main_lock;
+	int tid = lwsp - cxt.lwsa;
 
 	VERBOSE_TOROUT_STRING("lock_torture_writer task started");
 	if (!rt_task(current))
@@ -834,17 +874,24 @@ static int lock_torture_writer(void *arg)
 			cxt.cur_ops->nested_lock(tid, lockset_mask);
 
 		if (!skip_main_lock) {
+			if (acq_writer_lim > 0)
+				j = jiffies;
 			cxt.cur_ops->writelock(tid);
 			if (WARN_ON_ONCE(lock_is_write_held))
 				lwsp->n_lock_fail++;
 			lock_is_write_held = true;
 			if (WARN_ON_ONCE(atomic_read(&lock_is_read_held)))
 				lwsp->n_lock_fail++; /* rare, but... */
-
+			if (acq_writer_lim > 0) {
+				j1 = jiffies;
+				WARN_ONCE(time_after(j1, j + acq_writer_lim),
+					  "%s: Lock acquisition took %lu jiffies.\n",
+					  __func__, j1 - j);
+			}
 			lwsp->n_lock_acquired++;
-		}
-		if (!skip_main_lock) {
+
 			cxt.cur_ops->write_delay(&rand);
+
 			lock_is_write_held = false;
 			WRITE_ONCE(last_lock_release, jiffies);
 			cxt.cur_ops->writeunlock(tid);
@@ -986,16 +1033,69 @@ static int lock_torture_stats(void *arg)
 	return 0;
 }
 
+
 static inline void
 lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
 				const char *tag)
 {
+	static cpumask_t cpumask_all;
+	cpumask_t *rcmp = cpumask_nonempty(bind_readers) ? bind_readers : &cpumask_all;
+	cpumask_t *wcmp = cpumask_nonempty(bind_writers) ? bind_writers : &cpumask_all;
+
+	cpumask_setall(&cpumask_all);
 	pr_alert("%s" TORTURE_FLAG
-		 "--- %s%s: nwriters_stress=%d nreaders_stress=%d nested_locks=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
+		 "--- %s%s: acq_writer_lim=%d bind_readers=%*pbl bind_writers=%*pbl call_rcu_chains=%d long_hold=%d nested_locks=%d nreaders_stress=%d nwriters_stress=%d onoff_holdoff=%d onoff_interval=%d rt_boost=%d rt_boost_factor=%d shuffle_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d verbose=%d writer_fifo=%d\n",
 		 torture_type, tag, cxt.debug_lock ? " [debug]": "",
-		 cxt.nrealwriters_stress, cxt.nrealreaders_stress,
-		 nested_locks, stat_interval, verbose, shuffle_interval,
-		 stutter, shutdown_secs, onoff_interval, onoff_holdoff);
+		 acq_writer_lim, cpumask_pr_args(rcmp), cpumask_pr_args(wcmp),
+		 call_rcu_chains, long_hold, nested_locks, cxt.nrealreaders_stress,
+		 cxt.nrealwriters_stress, onoff_holdoff, onoff_interval, rt_boost,
+		 rt_boost_factor, shuffle_interval, shutdown_secs, stat_interval, stutter,
+		 verbose, writer_fifo);
+}
+
+// If requested, maintain call_rcu() chains to keep a grace period always
+// in flight.  These increase the probability of getting an RCU CPU stall
+// warning and associated diagnostics when a locking primitive stalls.
+
+static void call_rcu_chain_cb(struct rcu_head *rhp)
+{
+	struct call_rcu_chain *crcp = container_of(rhp, struct call_rcu_chain, crc_rh);
+
+	if (!smp_load_acquire(&crcp->crc_stop)) { // Pairs with store-release in cleanup.
+		(void)start_poll_synchronize_rcu(); // Start one grace period...
+		call_rcu(&crcp->crc_rh, call_rcu_chain_cb); // ... and later start another.
+	}
+}
+
+// Start the requested call_rcu() chains; returns 0, or -ENOMEM if allocation fails.
+static int call_rcu_chain_init(void)
+{
+	int i;
+
+	if (call_rcu_chains <= 0) // Chains not requested: nothing to set up.
+		return 0;
+	call_rcu_chain = kcalloc(call_rcu_chains, sizeof(*call_rcu_chain), GFP_KERNEL);
+	if (!call_rcu_chain)
+		return -ENOMEM;
+	for (i = 0; i < call_rcu_chains; i++) {
+		call_rcu_chain[i].crc_stop = false;
+		call_rcu(&call_rcu_chain[i].crc_rh, call_rcu_chain_cb); // Post the first link.
+	}
+	return 0;
+}
+
+// Stop all of the call_rcu() chains and free them; no-op if none were started.
+static void call_rcu_chain_cleanup(void)
+{
+	int i;
+
+	if (!call_rcu_chain)
+		return;
+	for (i = 0; i < call_rcu_chains; i++)
+		smp_store_release(&call_rcu_chain[i].crc_stop, true); // Stop re-posting.
+	rcu_barrier(); // Wait for already-posted callbacks before freeing their rcu_heads.
+	kfree(call_rcu_chain);
+	call_rcu_chain = NULL;
+}
 
 static void lock_torture_cleanup(void)
@@ -1048,6 +1148,8 @@ static void lock_torture_cleanup(void)
 	kfree(cxt.lrsa);
 	cxt.lrsa = NULL;
 
+	call_rcu_chain_cleanup();
+
 end:
 	if (cxt.init_called) {
 		if (cxt.cur_ops->exit)
@@ -1177,6 +1279,10 @@ static int __init lock_torture_init(void)
 		}
 	}
 
+	firsterr = call_rcu_chain_init();
+	if (torture_init_error(firsterr))
+		goto unwind;
+
 	lock_torture_print_module_parms(cxt.cur_ops, "Start of test");
 
 	/* Prepare torture context. */
@@ -1250,6 +1356,8 @@ static int __init lock_torture_init(void)
 						     writer_fifo ? sched_set_fifo : NULL);
 		if (torture_init_error(firsterr))
 			goto unwind;
+		if (cpumask_nonempty(bind_writers))
+			torture_sched_setaffinity(writer_tasks[i]->pid, bind_writers);
 
 	create_reader:
 		if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress))
@@ -1259,6 +1367,8 @@ static int __init lock_torture_init(void)
 						  reader_tasks[j]);
 		if (torture_init_error(firsterr))
 			goto unwind;
+		if (cpumask_nonempty(bind_readers))
+			torture_sched_setaffinity(reader_tasks[j]->pid, bind_readers);
 	}
 	if (stat_interval > 0) {
 		firsterr = torture_create_kthread(lock_torture_stats, NULL,
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index d973fe6041bf..2deeeca3e71b 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -1126,6 +1126,9 @@ EXPORT_SYMBOL(ww_mutex_lock_interruptible);
 #endif /* !CONFIG_DEBUG_LOCK_ALLOC */
 #endif /* !CONFIG_PREEMPT_RT */
 
+EXPORT_TRACEPOINT_SYMBOL_GPL(contention_begin);
+EXPORT_TRACEPOINT_SYMBOL_GPL(contention_end);
+
 /**
  * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
  * @cnt: the atomic which we are to dec
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 21db0df0eb00..4a10e8c16fd2 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -218,6 +218,11 @@ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
 	return try_cmpxchg_acquire(&lock->owner, &old, new);
 }
 
+static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
+{
+	return rt_mutex_cmpxchg_acquire(lock, NULL, current); /* owner: NULL -> current */
+}
+
 static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
 						     struct task_struct *old,
 						     struct task_struct *new)
@@ -297,6 +302,20 @@ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
 
 }
 
+static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock);
+
+static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
+{
+	/*
+	 * With debug enabled the rt_mutex_cmpxchg_acquire() trylock always fails.
+	 *
+	 * Avoid unconditionally taking the slow path by using
+	 * rt_mutex_slowtrylock() which is covered by the debug code and can
+	 * acquire a non-contended rtmutex.
+	 */
+	return rt_mutex_slowtrylock(lock);
+}
+
 static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
 						     struct task_struct *old,
 						     struct task_struct *new)
@@ -1613,7 +1632,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
 		raw_spin_unlock_irq(&lock->wait_lock);
 
 		if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
-			schedule();
+			rt_mutex_schedule();
 
 		raw_spin_lock_irq(&lock->wait_lock);
 		set_current_state(state);
@@ -1642,7 +1661,7 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
 	WARN(1, "rtmutex deadlock detected\n");
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		schedule();
+		rt_mutex_schedule();
 	}
 }
 
@@ -1738,6 +1757,15 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
 	int ret;
 
 	/*
+	 * Do all pre-schedule work here, before we queue a waiter and invoke
+	 * PI -- any such work that trips on rtlock (PREEMPT_RT spinlock) would
+	 * otherwise recurse back into task_blocks_on_rt_mutex() through
+	 * rtlock_slowlock() and will then enqueue a second waiter for this
+	 * same task and things get really confusing real fast.
+	 */
+	rt_mutex_pre_schedule();
+
+	/*
 	 * Technically we could use raw_spin_[un]lock_irq() here, but this can
 	 * be called in early boot if the cmpxchg() fast path is disabled
 	 * (debug, no architecture support). In this case we will acquire the
@@ -1748,6 +1776,7 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
 	raw_spin_lock_irqsave(&lock->wait_lock, flags);
 	ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
 	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+	rt_mutex_post_schedule();
 
 	return ret;
 }
@@ -1755,7 +1784,9 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
 static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock,
 					   unsigned int state)
 {
-	if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+	lockdep_assert(!current->pi_blocked_on);
+
+	if (likely(rt_mutex_try_acquire(lock)))
 		return 0;
 
 	return rt_mutex_slowlock(lock, NULL, state);
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index 25ec0239477c..34a59569db6b 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -71,6 +71,7 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
 	struct rt_mutex_base *rtm = &rwb->rtmutex;
 	int ret;
 
+	rwbase_pre_schedule();
 	raw_spin_lock_irq(&rtm->wait_lock);
 
 	/*
@@ -125,12 +126,15 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
 		rwbase_rtmutex_unlock(rtm);
 
 	trace_contention_end(rwb, ret);
+	rwbase_post_schedule();
 	return ret;
 }
 
 static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb,
 					    unsigned int state)
 {
+	lockdep_assert(!current->pi_blocked_on);
+
 	if (rwbase_read_trylock(rwb))
 		return 0;
 
@@ -237,6 +241,8 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
 	/* Force readers into slow path */
 	atomic_sub(READER_BIAS, &rwb->readers);
 
+	rwbase_pre_schedule();
+
 	raw_spin_lock_irqsave(&rtm->wait_lock, flags);
 	if (__rwbase_write_trylock(rwb))
 		goto out_unlock;
@@ -248,6 +254,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
 		if (rwbase_signal_pending_state(state, current)) {
 			rwbase_restore_current_state();
 			__rwbase_write_unlock(rwb, 0, flags);
+			rwbase_post_schedule();
 			trace_contention_end(rwb, -EINTR);
 			return -EINTR;
 		}
@@ -266,6 +273,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
 
 out_unlock:
 	raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+	rwbase_post_schedule();
 	return 0;
 }
 
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 9eabd585ce7a..2340b6d90ec6 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1427,8 +1427,14 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
 #define rwbase_signal_pending_state(state, current)	\
 	signal_pending_state(state, current)
 
+#define rwbase_pre_schedule()				\
+	rt_mutex_pre_schedule()
+
 #define rwbase_schedule()				\
-	schedule()
+	rt_mutex_schedule()
+
+#define rwbase_post_schedule()				\
+	rt_mutex_post_schedule()
 
 #include "rwbase_rt.c"
 
diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c
index 48a19ed8486d..38e292454fcc 100644
--- a/kernel/locking/spinlock_rt.c
+++ b/kernel/locking/spinlock_rt.c
@@ -37,6 +37,8 @@
 
 static __always_inline void rtlock_lock(struct rt_mutex_base *rtm)
 {
+	lockdep_assert(!current->pi_blocked_on);
+
 	if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
 		rtlock_slowlock(rtm);
 }
@@ -184,9 +186,13 @@ static __always_inline int  rwbase_rtmutex_trylock(struct rt_mutex_base *rtm)
 
 #define rwbase_signal_pending_state(state, current)	(0)
 
+#define rwbase_pre_schedule()
+
 #define rwbase_schedule()				\
 	schedule_rtlock()
 
+#define rwbase_post_schedule()
+
 #include "rwbase_rt.c"
 /*
  * The common functions which get wrapped into the rwlock API.
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 93cca6e69860..78719e1ef1b1 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -9,7 +9,7 @@
 #include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/module.h>
-#include <linux/random.h>
+#include <linux/prandom.h>
 #include <linux/slab.h>
 #include <linux/ww_mutex.h>
 
@@ -386,6 +386,19 @@ struct stress {
 	int nlocks;
 };
 
+static struct rnd_state rng;		/* Shared PRNG state, seeded in test_ww_mutex_init(). */
+static DEFINE_SPINLOCK(rng_lock);	/* Serializes updates of rng. */
+/* Return a pseudo-random value in [0, ceil) drawn from the shared rng. */
+static inline u32 prandom_u32_below(u32 ceil)
+{
+	u32 ret;
+
+	spin_lock(&rng_lock);
+	ret = prandom_u32_state(&rng) % ceil;	/* ceil must be non-zero. */
+	spin_unlock(&rng_lock);
+	return ret;
+}
+
 static int *get_random_order(int count)
 {
 	int *order;
@@ -399,7 +412,7 @@ static int *get_random_order(int count)
 		order[n] = n;
 
 	for (n = count - 1; n > 1; n--) {
-		r = get_random_u32_below(n + 1);
+		r = prandom_u32_below(n + 1);
 		if (r != n) {
 			tmp = order[n];
 			order[n] = order[r];
@@ -452,21 +465,21 @@ retry:
 			ww_mutex_unlock(&locks[order[n]]);
 
 		if (err == -EDEADLK) {
-			ww_mutex_lock_slow(&locks[order[contended]], &ctx);
-			goto retry;
+			if (!time_after(jiffies, stress->timeout)) {
+				ww_mutex_lock_slow(&locks[order[contended]], &ctx);
+				goto retry;
+			}
 		}
 
+		ww_acquire_fini(&ctx);
 		if (err) {
 			pr_err_once("stress (%s) failed with %d\n",
 				    __func__, err);
 			break;
 		}
-
-		ww_acquire_fini(&ctx);
 	} while (!time_after(jiffies, stress->timeout));
 
 	kfree(order);
-	kfree(stress);
 }
 
 struct reorder_lock {
@@ -531,7 +544,6 @@ out:
 	list_for_each_entry_safe(ll, ln, &locks, link)
 		kfree(ll);
 	kfree(order);
-	kfree(stress);
 }
 
 static void stress_one_work(struct work_struct *work)
@@ -552,8 +564,6 @@ static void stress_one_work(struct work_struct *work)
 			break;
 		}
 	} while (!time_after(jiffies, stress->timeout));
-
-	kfree(stress);
 }
 
 #define STRESS_INORDER BIT(0)
@@ -564,15 +574,24 @@ static void stress_one_work(struct work_struct *work)
 static int stress(int nlocks, int nthreads, unsigned int flags)
 {
 	struct ww_mutex *locks;
-	int n;
+	struct stress *stress_array;
+	int n, count;
 
 	locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL);
 	if (!locks)
 		return -ENOMEM;
 
+	stress_array = kmalloc_array(nthreads, sizeof(*stress_array),
+				     GFP_KERNEL);
+	if (!stress_array) {
+		kfree(locks);
+		return -ENOMEM;
+	}
+
 	for (n = 0; n < nlocks; n++)
 		ww_mutex_init(&locks[n], &ww_class);
 
+	count = 0;
 	for (n = 0; nthreads; n++) {
 		struct stress *stress;
 		void (*fn)(struct work_struct *work);
@@ -596,9 +615,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
 		if (!fn)
 			continue;
 
-		stress = kmalloc(sizeof(*stress), GFP_KERNEL);
-		if (!stress)
-			break;
+		stress = &stress_array[count++];
 
 		INIT_WORK(&stress->work, fn);
 		stress->locks = locks;
@@ -613,6 +630,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
 
 	for (n = 0; n < nlocks; n++)
 		ww_mutex_destroy(&locks[n]);
+	kfree(stress_array);
 	kfree(locks);
 
 	return 0;
@@ -625,6 +643,8 @@ static int __init test_ww_mutex_init(void)
 
 	printk(KERN_INFO "Beginning ww mutex selftests\n");
 
+	prandom_seed_state(&rng, get_random_u64());
+
 	wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0);
 	if (!wq)
 		return -ENOMEM;
diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c
index d1473c624105..c7196de838ed 100644
--- a/kernel/locking/ww_rt_mutex.c
+++ b/kernel/locking/ww_rt_mutex.c
@@ -62,7 +62,7 @@ __ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx,
 	}
 	mutex_acquire_nest(&rtm->dep_map, 0, 0, nest_lock, ip);
 
-	if (likely(rt_mutex_cmpxchg_acquire(&rtm->rtmutex, NULL, current))) {
+	if (likely(rt_mutex_try_acquire(&rtm->rtmutex))) {
 		if (ww_ctx)
 			ww_mutex_set_context_fastpath(lock, ww_ctx);
 		return 0;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8d35b9f9aaa3..dee341ae4ace 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -684,7 +684,7 @@ static void power_down(void)
 		cpu_relax();
 }
 
-static int load_image_and_restore(bool snapshot_test)
+static int load_image_and_restore(void)
 {
 	int error;
 	unsigned int flags;
@@ -694,12 +694,12 @@ static int load_image_and_restore(bool snapshot_test)
 	lock_device_hotplug();
 	error = create_basic_memory_bitmaps();
 	if (error) {
-		swsusp_close(snapshot_test);
+		swsusp_close();
 		goto Unlock;
 	}
 
 	error = swsusp_read(&flags);
-	swsusp_close(snapshot_test);
+	swsusp_close();
 	if (!error)
 		error = hibernation_restore(flags & SF_PLATFORM_MODE);
 
@@ -788,7 +788,7 @@ int hibernate(void)
 		pm_pr_dbg("Checking hibernation image\n");
 		error = swsusp_check(false);
 		if (!error)
-			error = load_image_and_restore(false);
+			error = load_image_and_restore();
 	}
 	thaw_processes();
 
@@ -952,7 +952,7 @@ static int software_resume(void)
 	/* The snapshot device should not be opened while we're running */
 	if (!hibernate_acquire()) {
 		error = -EBUSY;
-		swsusp_close(true);
+		swsusp_close();
 		goto Unlock;
 	}
 
@@ -973,7 +973,7 @@ static int software_resume(void)
 		goto Close_Finish;
 	}
 
-	error = load_image_and_restore(true);
+	error = load_image_and_restore();
 	thaw_processes();
  Finish:
 	pm_notifier_call_chain(PM_POST_RESTORE);
@@ -987,7 +987,7 @@ static int software_resume(void)
 	pm_pr_dbg("Hibernation image not present or could not be loaded.\n");
 	return error;
  Close_Finish:
-	swsusp_close(true);
+	swsusp_close();
 	goto Finish;
 }
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index a98f95e309a3..17fd9aaaf084 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -172,7 +172,7 @@ int swsusp_check(bool exclusive);
 extern void swsusp_free(void);
 extern int swsusp_read(unsigned int *flags_p);
 extern int swsusp_write(unsigned int flags);
-void swsusp_close(bool exclusive);
+void swsusp_close(void);
 #ifdef CONFIG_SUSPEND
 extern int swsusp_unmark(void);
 #endif
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 74edbce2320b..68a5c2f06957 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -222,7 +222,7 @@ int swsusp_swap_in_use(void)
  */
 
 static unsigned short root_swap = 0xffff;
-static struct block_device *hib_resume_bdev;
+static struct bdev_handle *hib_resume_bdev_handle;
 
 struct hib_bio_batch {
 	atomic_t		count;
@@ -276,7 +276,8 @@ static int hib_submit_io(blk_opf_t opf, pgoff_t page_off, void *addr,
 	struct bio *bio;
 	int error = 0;
 
-	bio = bio_alloc(hib_resume_bdev, 1, opf, GFP_NOIO | __GFP_HIGH);
+	bio = bio_alloc(hib_resume_bdev_handle->bdev, 1, opf,
+			GFP_NOIO | __GFP_HIGH);
 	bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
 
 	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
@@ -356,14 +357,14 @@ static int swsusp_swap_check(void)
 		return res;
 	root_swap = res;
 
-	hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
+	hib_resume_bdev_handle = bdev_open_by_dev(swsusp_resume_device,
 			BLK_OPEN_WRITE, NULL, NULL);
-	if (IS_ERR(hib_resume_bdev))
-		return PTR_ERR(hib_resume_bdev);
+	if (IS_ERR(hib_resume_bdev_handle))
+		return PTR_ERR(hib_resume_bdev_handle);
 
-	res = set_blocksize(hib_resume_bdev, PAGE_SIZE);
+	res = set_blocksize(hib_resume_bdev_handle->bdev, PAGE_SIZE);
 	if (res < 0)
-		blkdev_put(hib_resume_bdev, NULL);
+		bdev_release(hib_resume_bdev_handle);
 
 	return res;
 }
@@ -443,7 +444,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
 err_rel:
 	release_swap_writer(handle);
 err_close:
-	swsusp_close(false);
+	swsusp_close();
 	return ret;
 }
 
@@ -508,7 +509,7 @@ static int swap_writer_finish(struct swap_map_handle *handle,
 	if (error)
 		free_all_swap_pages(root_swap);
 	release_swap_writer(handle);
-	swsusp_close(false);
+	swsusp_close();
 
 	return error;
 }
@@ -1522,10 +1523,10 @@ int swsusp_check(bool exclusive)
 	void *holder = exclusive ? &swsusp_holder : NULL;
 	int error;
 
-	hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, BLK_OPEN_READ,
-					    holder, NULL);
-	if (!IS_ERR(hib_resume_bdev)) {
-		set_blocksize(hib_resume_bdev, PAGE_SIZE);
+	hib_resume_bdev_handle = bdev_open_by_dev(swsusp_resume_device,
+				BLK_OPEN_READ, holder, NULL);
+	if (!IS_ERR(hib_resume_bdev_handle)) {
+		set_blocksize(hib_resume_bdev_handle->bdev, PAGE_SIZE);
 		clear_page(swsusp_header);
 		error = hib_submit_io(REQ_OP_READ, swsusp_resume_block,
 					swsusp_header, NULL);
@@ -1550,11 +1551,11 @@ int swsusp_check(bool exclusive)
 
 put:
 		if (error)
-			blkdev_put(hib_resume_bdev, holder);
+			bdev_release(hib_resume_bdev_handle);
 		else
 			pr_debug("Image signature found, resuming\n");
 	} else {
-		error = PTR_ERR(hib_resume_bdev);
+		error = PTR_ERR(hib_resume_bdev_handle);
 	}
 
 	if (error)
@@ -1568,14 +1569,14 @@ put:
  * @exclusive: Close the resume device which is exclusively opened.
  */
 
-void swsusp_close(bool exclusive)
+void swsusp_close(void)
 {
-	if (IS_ERR(hib_resume_bdev)) {
+	if (IS_ERR(hib_resume_bdev_handle)) {
 		pr_debug("Image device not initialised\n");
 		return;
 	}
 
-	blkdev_put(hib_resume_bdev, exclusive ? &swsusp_holder : NULL);
+	bdev_release(hib_resume_bdev_handle);
 }
 
 /**
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 98e13be411af..0d866eaa4cc8 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -10,6 +10,7 @@
 #ifndef __LINUX_RCU_H
 #define __LINUX_RCU_H
 
+#include <linux/slab.h>
 #include <trace/events/rcu.h>
 
 /*
@@ -248,6 +249,12 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
 }
 #endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
 
+static inline void debug_rcu_head_callback(struct rcu_head *rhp)
+{
+	if (unlikely(!rhp->func))
+		kmem_dump_obj(rhp);
+}
+
 extern int rcu_cpu_stall_suppress_at_boot;
 
 static inline bool rcu_stall_is_suppressed_at_boot(void)
@@ -568,10 +575,6 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
 static inline void rcu_gp_set_torture_wait(int duration) { }
 #endif
 
-#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
-long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask);
-#endif
-
 #ifdef CONFIG_TINY_SRCU
 
 static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
@@ -654,4 +657,10 @@ static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; }
 bool rcu_cpu_beenfullyonline(int cpu);
 #endif
 
+#ifdef CONFIG_RCU_STALL_COMMON
+int rcu_stall_notifier_call_chain(unsigned long val, void *v);
+#else // #ifdef CONFIG_RCU_STALL_COMMON
+static inline int rcu_stall_notifier_call_chain(unsigned long val, void *v) { return NOTIFY_DONE; }
+#endif // #else // #ifdef CONFIG_RCU_STALL_COMMON
+
 #endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index f71fac422c8f..1693ea22ef1b 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -368,7 +368,7 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
 	smp_mb(); /* Ensure counts are updated before callback is entrained. */
 	rhp->next = NULL;
 	for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)
-		if (rsclp->tails[i] != rsclp->tails[i - 1])
+		if (!rcu_segcblist_segempty(rsclp, i))
 			break;
 	rcu_segcblist_inc_seglen(rsclp, i);
 	WRITE_ONCE(*rsclp->tails[i], rhp);
@@ -551,7 +551,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
 	 * as their ->gp_seq[] grace-period completion sequence number.
 	 */
 	for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--)
-		if (rsclp->tails[i] != rsclp->tails[i - 1] &&
+		if (!rcu_segcblist_segempty(rsclp, i) &&
 		    ULONG_CMP_LT(rsclp->gp_seq[i], seq))
 			break;
 
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index ade42d6a9d9b..30fc9d34e329 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -21,6 +21,7 @@
 #include <linux/spinlock.h>
 #include <linux/smp.h>
 #include <linux/rcupdate_wait.h>
+#include <linux/rcu_notifier.h>
 #include <linux/interrupt.h>
 #include <linux/sched/signal.h>
 #include <uapi/linux/sched/types.h>
@@ -810,7 +811,7 @@ static void synchronize_rcu_trivial(void)
 	int cpu;
 
 	for_each_online_cpu(cpu) {
-		rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu));
+		torture_sched_setaffinity(current->pid, cpumask_of(cpu));
 		WARN_ON_ONCE(raw_smp_processor_id() != cpu);
 	}
 }
@@ -1149,7 +1150,7 @@ static int rcu_torture_boost(void *arg)
 				mutex_unlock(&boost_mutex);
 				break;
 			}
-			schedule_timeout_uninterruptible(1);
+			schedule_timeout_uninterruptible(HZ / 20);
 		}
 
 		/* Go do the stutter. */
@@ -1160,7 +1161,7 @@ checkwait:	if (stutter_wait("rcu_torture_boost"))
 	/* Clean up and exit. */
 	while (!kthread_should_stop()) {
 		torture_shutdown_absorb("rcu_torture_boost");
-		schedule_timeout_uninterruptible(1);
+		schedule_timeout_uninterruptible(HZ / 20);
 	}
 	torture_kthread_stopping("rcu_torture_boost");
 	return 0;
@@ -1183,7 +1184,7 @@ rcu_torture_fqs(void *arg)
 		fqs_resume_time = jiffies + fqs_stutter * HZ;
 		while (time_before(jiffies, fqs_resume_time) &&
 		       !kthread_should_stop()) {
-			schedule_timeout_interruptible(1);
+			schedule_timeout_interruptible(HZ / 20);
 		}
 		fqs_burst_remaining = fqs_duration;
 		while (fqs_burst_remaining > 0 &&
@@ -2126,7 +2127,7 @@ static int rcu_nocb_toggle(void *arg)
 	VERBOSE_TOROUT_STRING("rcu_nocb_toggle task started");
 	while (!rcu_inkernel_boot_has_ended())
 		schedule_timeout_interruptible(HZ / 10);
-	for_each_online_cpu(cpu)
+	for_each_possible_cpu(cpu)
 		maxcpu = cpu;
 	WARN_ON(maxcpu < 0);
 	if (toggle_interval > ULONG_MAX)
@@ -2428,6 +2429,16 @@ static int rcutorture_booster_init(unsigned int cpu)
 	return 0;
 }
 
+static int rcu_torture_stall_nf(struct notifier_block *nb, unsigned long v, void *ptr)
+{
+	pr_info("%s: v=%lu, duration=%lu.\n", __func__, v, (unsigned long)ptr);
+	return NOTIFY_OK;
+}
+
+static struct notifier_block rcu_torture_stall_block = {
+	.notifier_call = rcu_torture_stall_nf,
+};
+
 /*
  * CPU-stall kthread.  It waits as specified by stall_cpu_holdoff, then
  * induces a CPU stall for the time specified by stall_cpu.
@@ -2435,9 +2446,14 @@ static int rcutorture_booster_init(unsigned int cpu)
 static int rcu_torture_stall(void *args)
 {
 	int idx;
+	int ret;
 	unsigned long stop_at;
 
 	VERBOSE_TOROUT_STRING("rcu_torture_stall task started");
+	ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block);
+	if (ret)
+		pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n",
+			__func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : "");
 	if (stall_cpu_holdoff > 0) {
 		VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff");
 		schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
@@ -2481,6 +2497,11 @@ static int rcu_torture_stall(void *args)
 		cur_ops->readunlock(idx);
 	}
 	pr_alert("%s end.\n", __func__);
+	if (!ret) {
+		ret = rcu_stall_chain_notifier_unregister(&rcu_torture_stall_block);
+		if (ret)
+			pr_info("%s: rcu_stall_chain_notifier_unregister() returned %d.\n", __func__, ret);
+	}
 	torture_shutdown_absorb("rcu_torture_stall");
 	while (!kthread_should_stop())
 		schedule_timeout_interruptible(10 * HZ);
@@ -2899,7 +2920,7 @@ static int rcu_torture_fwd_prog(void *args)
 			WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1);
 		} else {
 			while (READ_ONCE(rcu_fwd_seq) == oldseq && !torture_must_stop())
-				schedule_timeout_interruptible(1);
+				schedule_timeout_interruptible(HZ / 20);
 			oldseq = READ_ONCE(rcu_fwd_seq);
 		}
 		pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
@@ -3200,7 +3221,7 @@ static int rcu_torture_read_exit_child(void *trsp_in)
 	set_user_nice(current, MAX_NICE);
 	// Minimize time between reading and exiting.
 	while (!kthread_should_stop())
-		schedule_timeout_uninterruptible(1);
+		schedule_timeout_uninterruptible(HZ / 20);
 	(void)rcu_torture_one_read(trsp, -1);
 	return 0;
 }
@@ -3248,7 +3269,7 @@ static int rcu_torture_read_exit(void *unused)
 	smp_mb(); // Store before wakeup.
 	wake_up(&read_exit_wq);
 	while (!torture_must_stop())
-		schedule_timeout_uninterruptible(1);
+		schedule_timeout_uninterruptible(HZ / 20);
 	torture_kthread_stopping("rcu_torture_read_exit");
 	return 0;
 }
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 91a0fd0d4d9a..2c2648a3ad30 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -655,12 +655,12 @@ retry:
 			goto retry;
 		}
 		un_delay(udl, ndl);
+		b = READ_ONCE(rtsp->a);
 		// Remember, seqlock read-side release can fail.
 		if (!rts_release(rtsp, start)) {
 			rcu_read_unlock();
 			goto retry;
 		}
-		b = READ_ONCE(rtsp->a);
 		WARN_ONCE(a != b, "Re-read of ->a changed from %u to %u.\n", a, b);
 		b = rtsp->b;
 		rcu_read_unlock();
@@ -1025,8 +1025,8 @@ static void
 ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag)
 {
 	pr_alert("%s" SCALE_FLAG
-		 "--- %s:  verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
-		 verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay);
+		 "--- %s:  verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
+		 verbose, verbose_batched, shutdown, holdoff, lookup_instances, loops, nreaders, nruns, readdelay);
 }
 
 static void
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 336af24e0fe3..c38e5933a5d6 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -138,6 +138,7 @@ void srcu_drive_gp(struct work_struct *wp)
 	while (lh) {
 		rhp = lh;
 		lh = lh->next;
+		debug_rcu_head_callback(rhp);
 		local_bh_disable();
 		rhp->func(rhp);
 		local_bh_enable();
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 20d7a238d675..560e99ec5333 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -223,7 +223,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
 				snp->grplo = cpu;
 			snp->grphi = cpu;
 		}
-		sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
+		sdp->grpmask = 1UL << (cpu - sdp->mynode->grplo);
 	}
 	smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_WAIT_BARRIER);
 	return true;
@@ -255,29 +255,31 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 	ssp->srcu_sup->sda_is_static = is_static;
 	if (!is_static)
 		ssp->sda = alloc_percpu(struct srcu_data);
-	if (!ssp->sda) {
-		if (!is_static)
-			kfree(ssp->srcu_sup);
-		return -ENOMEM;
-	}
+	if (!ssp->sda)
+		goto err_free_sup;
 	init_srcu_struct_data(ssp);
 	ssp->srcu_sup->srcu_gp_seq_needed_exp = 0;
 	ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns();
 	if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
-		if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) {
-			if (!ssp->srcu_sup->sda_is_static) {
-				free_percpu(ssp->sda);
-				ssp->sda = NULL;
-				kfree(ssp->srcu_sup);
-				return -ENOMEM;
-			}
-		} else {
-			WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
-		}
+		if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC))
+			goto err_free_sda;
+		WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
 	}
 	ssp->srcu_sup->srcu_ssp = ssp;
 	smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */
 	return 0;
+
+err_free_sda:
+	if (!is_static) {
+		free_percpu(ssp->sda);
+		ssp->sda = NULL;
+	}
+err_free_sup:
+	if (!is_static) {
+		kfree(ssp->srcu_sup);
+		ssp->srcu_sup = NULL;
+	}
+	return -ENOMEM;
 }
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -782,8 +784,7 @@ static void srcu_gp_start(struct srcu_struct *ssp)
 	spin_lock_rcu_node(sdp);  /* Interrupts already disabled. */
 	rcu_segcblist_advance(&sdp->srcu_cblist,
 			      rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
-	(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
-				       rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq));
+	WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL));
 	spin_unlock_rcu_node(sdp);  /* Interrupts remain disabled. */
 	WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies);
 	WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, 0);
@@ -833,7 +834,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp
 	int cpu;
 
 	for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
-		if (!(mask & (1 << (cpu - snp->grplo))))
+		if (!(mask & (1UL << (cpu - snp->grplo))))
 			continue;
 		srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay);
 	}
@@ -1242,10 +1243,37 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
 	spin_lock_irqsave_sdp_contention(sdp, &flags);
 	if (rhp)
 		rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
+	/*
+	 * The snapshot for acceleration must be taken _before_ the read of the
+	 * current gp sequence used for advancing, otherwise advancing may fail
+	 * and acceleration may then fail too.
+	 *
+	 * This could happen if:
+	 *
+	 *  1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the
+	 *     RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8).
+	 *
+	 *  2) The grace period for RCU_WAIT_TAIL is seen as started but not
+	 *     completed so rcu_seq_current() returns X + SRCU_STATE_SCAN1.
+	 *
+	 *  3) This value is passed to rcu_segcblist_advance() which can't move
+	 *     any segment forward and fails.
+	 *
+	 *  4) srcu_gp_start_if_needed() still proceeds with callback acceleration.
+	 *     But then the call to rcu_seq_snap() observes the grace period for the
+	 *     RCU_WAIT_TAIL segment as completed and the subsequent one for the
+	 *     RCU_NEXT_READY_TAIL segment as started (ie: X + 4 + SRCU_STATE_SCAN1)
+	 *     so it returns a snapshot of the next grace period, which is X + 12.
+	 *
+	 *  5) The value of X + 12 is passed to rcu_segcblist_accelerate() but the
+	 *     freshly enqueued callback in RCU_NEXT_TAIL can't move to
+	 *     RCU_NEXT_READY_TAIL which already has callbacks for a previous grace
+	 *     period (gp_num = X + 8). So acceleration fails.
+	 */
+	s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
 	rcu_segcblist_advance(&sdp->srcu_cblist,
 			      rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
-	s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
-	(void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
+	WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s) && rhp);
 	if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
 		sdp->srcu_gp_seq_needed = s;
 		needgp = true;
@@ -1692,6 +1720,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
 	ssp = sdp->ssp;
 	rcu_cblist_init(&ready_cbs);
 	spin_lock_irq_rcu_node(sdp);
+	WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL));
 	rcu_segcblist_advance(&sdp->srcu_cblist,
 			      rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
 	if (sdp->srcu_cblist_invoking ||
@@ -1708,6 +1737,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
 	rhp = rcu_cblist_dequeue(&ready_cbs);
 	for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
 		debug_rcu_head_unqueue(rhp);
+		debug_rcu_head_callback(rhp);
 		local_bh_disable();
 		rhp->func(rhp);
 		local_bh_enable();
@@ -1720,8 +1750,6 @@ static void srcu_invoke_callbacks(struct work_struct *work)
 	 */
 	spin_lock_irq_rcu_node(sdp);
 	rcu_segcblist_add_len(&sdp->srcu_cblist, -len);
-	(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
-				       rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq));
 	sdp->srcu_cblist_invoking = false;
 	more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
 	spin_unlock_irq_rcu_node(sdp);
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 8d65f7d576a3..1fa631168594 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -432,6 +432,7 @@ static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp)
 static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
 {
 	int cpu;
+	int dequeue_limit;
 	unsigned long flags;
 	bool gpdone = poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq);
 	long n;
@@ -439,7 +440,8 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
 	long ncbsnz = 0;
 	int needgpcb = 0;
 
-	for (cpu = 0; cpu < smp_load_acquire(&rtp->percpu_dequeue_lim); cpu++) {
+	dequeue_limit = smp_load_acquire(&rtp->percpu_dequeue_lim);
+	for (cpu = 0; cpu < dequeue_limit; cpu++) {
 		struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
 
 		/* Advance and accelerate any new callbacks. */
@@ -538,6 +540,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
 	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
 	len = rcl.len;
 	for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) {
+		debug_rcu_head_callback(rhp);
 		local_bh_disable();
 		rhp->func(rhp);
 		local_bh_enable();
@@ -1084,7 +1087,7 @@ void rcu_barrier_tasks(void)
 }
 EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
 
-int rcu_tasks_lazy_ms = -1;
+static int rcu_tasks_lazy_ms = -1;
 module_param(rcu_tasks_lazy_ms, int, 0444);
 
 static int __init rcu_spawn_tasks_kthread(void)
@@ -1979,20 +1982,22 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp)
 
 static void rcu_tasks_initiate_self_tests(void)
 {
-	pr_info("Running RCU-tasks wait API self tests\n");
 #ifdef CONFIG_TASKS_RCU
+	pr_info("Running RCU Tasks wait API self tests\n");
 	tests[0].runstart = jiffies;
 	synchronize_rcu_tasks();
 	call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback);
 #endif
 
 #ifdef CONFIG_TASKS_RUDE_RCU
+	pr_info("Running RCU Tasks Rude wait API self tests\n");
 	tests[1].runstart = jiffies;
 	synchronize_rcu_tasks_rude();
 	call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback);
 #endif
 
 #ifdef CONFIG_TASKS_TRACE_RCU
+	pr_info("Running RCU Tasks Trace wait API self tests\n");
 	tests[2].runstart = jiffies;
 	synchronize_rcu_tasks_trace();
 	call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback);
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 42f7589e51e0..fec804b79080 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -97,6 +97,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head)
 
 	trace_rcu_invoke_callback("", head);
 	f = head->func;
+	debug_rcu_head_callback(head);
 	WRITE_ONCE(head->func, (rcu_callback_t)0L);
 	f(head);
 	rcu_lock_release(&rcu_callback_map);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index cb1caefa8bd0..700524726079 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -31,6 +31,7 @@
 #include <linux/bitops.h>
 #include <linux/export.h>
 #include <linux/completion.h>
+#include <linux/kmemleak.h>
 #include <linux/moduleparam.h>
 #include <linux/panic.h>
 #include <linux/panic_notifier.h>
@@ -1260,7 +1261,7 @@ EXPORT_SYMBOL_GPL(rcu_gp_slow_register);
 /* Unregister a counter, with NULL for not caring which. */
 void rcu_gp_slow_unregister(atomic_t *rgssp)
 {
-	WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress);
+	WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress && rcu_gp_slow_suppress != NULL);
 
 	WRITE_ONCE(rcu_gp_slow_suppress, NULL);
 }
@@ -1556,10 +1557,22 @@ static bool rcu_gp_fqs_check_wake(int *gfp)
  */
 static void rcu_gp_fqs(bool first_time)
 {
+	int nr_fqs = READ_ONCE(rcu_state.nr_fqs_jiffies_stall);
 	struct rcu_node *rnp = rcu_get_root();
 
 	WRITE_ONCE(rcu_state.gp_activity, jiffies);
 	WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1);
+
+	WARN_ON_ONCE(nr_fqs > 3);
+	/* Only count down nr_fqs for stall purposes if jiffies moves. */
+	if (nr_fqs) {
+		if (nr_fqs == 1) {
+			WRITE_ONCE(rcu_state.jiffies_stall,
+				   jiffies + rcu_jiffies_till_stall_check());
+		}
+		WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, --nr_fqs);
+	}
+
 	if (first_time) {
 		/* Collect dyntick-idle snapshots. */
 		force_qs_rnp(dyntick_save_progress_counter);
@@ -2135,6 +2148,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
 		trace_rcu_invoke_callback(rcu_state.name, rhp);
 
 		f = rhp->func;
+		debug_rcu_head_callback(rhp);
 		WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
 		f(rhp);
 
@@ -2713,7 +2727,7 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
  */
 void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
 {
-	return __call_rcu_common(head, func, false);
+	__call_rcu_common(head, func, false);
 }
 EXPORT_SYMBOL_GPL(call_rcu_hurry);
 #endif
@@ -2764,7 +2778,7 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry);
  */
 void call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
-	return __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY));
+	__call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY));
 }
 EXPORT_SYMBOL_GPL(call_rcu);
 
@@ -3388,6 +3402,14 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
 		success = true;
 	}
 
+	/*
+	 * The kvfree_rcu() caller considers the pointer freed at this point
+	 * and likely removes any references to it. Since the actual slab
+	 * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
+	 * this object (no scanning or false positives reporting).
+	 */
+	kmemleak_ignore(ptr);
+
 	// Set timer to drain after KFREE_DRAIN_JIFFIES.
 	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
 		schedule_delayed_monitor_work(krcp);
@@ -4083,6 +4105,82 @@ retry:
 }
 EXPORT_SYMBOL_GPL(rcu_barrier);
 
+static unsigned long rcu_barrier_last_throttle;
+
+/**
+ * rcu_barrier_throttled - Do rcu_barrier(), but limit to one per HZ/16 jiffies
+ *
+ * This can be thought of as guard rails around rcu_barrier() that
+ * permit unrestricted userspace use, at least assuming the hardware's
+ * try_cmpxchg() is robust.  There will be at most one call per sixteenth of
+ * a second to rcu_barrier() system-wide from use of this function, which
+ * means that callers might needlessly wait for one or more such intervals.
+ *
+ * This is intended for use by test suites to avoid OOM by flushing RCU
+ * callbacks from the previous test before starting the next.  See the
+ * rcutree.do_rcu_barrier module parameter for more information.
+ *
+ * Why not simply make rcu_barrier() more scalable?  That might be
+ * the eventual endpoint, but let's keep it simple for the time being.
+ * Note that the module parameter infrastructure serializes calls to a
+ * given .set() function, but should concurrent .set() invocation ever be
+ * possible, we are ready!
+ */
+static void rcu_barrier_throttled(void)
+{
+	unsigned long j = jiffies;
+	unsigned long old = READ_ONCE(rcu_barrier_last_throttle);
+	unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence);
+
+	while (time_in_range(j, old, old + HZ / 16) ||
+	       !try_cmpxchg(&rcu_barrier_last_throttle, &old, j)) {
+		schedule_timeout_idle(HZ / 16);
+		if (rcu_seq_done(&rcu_state.barrier_sequence, s)) {
+			smp_mb(); /* caller's subsequent code after above check. */
+			return;
+		}
+		j = jiffies;
+		old = READ_ONCE(rcu_barrier_last_throttle);
+	}
+	rcu_barrier();
+}
+
+/*
+ * Invoke rcu_barrier_throttled() when a rcutree.do_rcu_barrier
+ * request arrives.  We insist on a true value to allow for possible
+ * future expansion.
+ */
+static int param_set_do_rcu_barrier(const char *val, const struct kernel_param *kp)
+{
+	bool b;
+	int ret;
+
+	if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING)
+		return -EAGAIN;
+	ret = kstrtobool(val, &b);
+	if (!ret && b) {
+		atomic_inc((atomic_t *)kp->arg);
+		rcu_barrier_throttled();
+		atomic_dec((atomic_t *)kp->arg);
+	}
+	return ret;
+}
+
+/*
+ * Output the number of outstanding rcutree.do_rcu_barrier requests.
+ */
+static int param_get_do_rcu_barrier(char *buffer, const struct kernel_param *kp)
+{
+	return sprintf(buffer, "%d\n", atomic_read((atomic_t *)kp->arg));
+}
+
+static const struct kernel_param_ops do_rcu_barrier_ops = {
+	.set = param_set_do_rcu_barrier,
+	.get = param_get_do_rcu_barrier,
+};
+static atomic_t do_rcu_barrier;
+module_param_cb(do_rcu_barrier, &do_rcu_barrier_ops, &do_rcu_barrier, 0644);
+
 /*
  * Compute the mask of online CPUs for the specified rcu_node structure.
  * This will not be stable unless the rcu_node structure's ->lock is
@@ -4130,7 +4228,7 @@ bool rcu_lockdep_current_cpu_online(void)
 	rdp = this_cpu_ptr(&rcu_data);
 	/*
 	 * Strictly, we care here about the case where the current CPU is
-	 * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask
+	 * in rcutree_report_cpu_starting() and thus has an excuse for rdp->grpmask
 	 * not being up to date. So arch_spin_is_locked() might have a
 	 * false positive if it's held by some *other* CPU, but that's
 	 * OK because that just means a false *negative* on the warning.
@@ -4152,25 +4250,6 @@ static bool rcu_init_invoked(void)
 }
 
 /*
- * Near the end of the offline process.  Trace the fact that this CPU
- * is going offline.
- */
-int rcutree_dying_cpu(unsigned int cpu)
-{
-	bool blkd;
-	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
-	struct rcu_node *rnp = rdp->mynode;
-
-	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
-		return 0;
-
-	blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask);
-	trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
-			       blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
-	return 0;
-}
-
-/*
  * All CPUs for the specified rcu_node structure have gone offline,
  * and all tasks that were preempted within an RCU read-side critical
  * section while running on one of those CPUs have since exited their RCU
@@ -4216,23 +4295,6 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 }
 
 /*
- * The CPU has been completely removed, and some other CPU is reporting
- * this fact from process context.  Do the remainder of the cleanup.
- * There can only be one CPU hotplug operation at a time, so no need for
- * explicit locking.
- */
-int rcutree_dead_cpu(unsigned int cpu)
-{
-	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
-		return 0;
-
-	WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
-	// Stop-machine done, so allow nohz_full to disable tick.
-	tick_dep_clear(TICK_DEP_BIT_RCU);
-	return 0;
-}
-
-/*
  * Propagate ->qsinitmask bits up the rcu_node tree to account for the
  * first CPU in a given leaf rcu_node structure coming online.  The caller
  * must hold the corresponding leaf rcu_node ->lock with interrupts
@@ -4385,29 +4447,6 @@ int rcutree_online_cpu(unsigned int cpu)
 }
 
 /*
- * Near the beginning of the process.  The CPU is still very much alive
- * with pretty much all services enabled.
- */
-int rcutree_offline_cpu(unsigned int cpu)
-{
-	unsigned long flags;
-	struct rcu_data *rdp;
-	struct rcu_node *rnp;
-
-	rdp = per_cpu_ptr(&rcu_data, cpu);
-	rnp = rdp->mynode;
-	raw_spin_lock_irqsave_rcu_node(rnp, flags);
-	rnp->ffmask &= ~rdp->grpmask;
-	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-
-	rcutree_affinity_setting(cpu, cpu);
-
-	// nohz_full CPUs need the tick for stop-machine to work quickly
-	tick_dep_set(TICK_DEP_BIT_RCU);
-	return 0;
-}
-
-/*
  * Mark the specified CPU as being online so that subsequent grace periods
  * (both expedited and normal) will wait on it.  Note that this means that
  * incoming CPUs are not allowed to use RCU read-side critical sections
@@ -4418,8 +4457,10 @@ int rcutree_offline_cpu(unsigned int cpu)
  * from the incoming CPU rather than from the cpuhp_step mechanism.
  * This is because this function must be invoked at a precise location.
  * This incoming CPU must not have enabled interrupts yet.
+ *
+ * This mirrors the effects of rcutree_report_cpu_dead().
  */
-void rcu_cpu_starting(unsigned int cpu)
+void rcutree_report_cpu_starting(unsigned int cpu)
 {
 	unsigned long mask;
 	struct rcu_data *rdp;
@@ -4473,14 +4514,21 @@ void rcu_cpu_starting(unsigned int cpu)
  * Note that this function is special in that it is invoked directly
  * from the outgoing CPU rather than from the cpuhp_step mechanism.
  * This is because this function must be invoked at a precise location.
+ *
+ * This mirrors the effect of rcutree_report_cpu_starting().
  */
-void rcu_report_dead(unsigned int cpu)
+void rcutree_report_cpu_dead(void)
 {
-	unsigned long flags, seq_flags;
+	unsigned long flags;
 	unsigned long mask;
-	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
 
+	/*
+	 * IRQs must stay disabled from now until the CPU dies, or an interrupt may
+	 * start a new read-side critical section while the CPU is off the QS masks.
+	 */
+	lockdep_assert_irqs_disabled();
 	// Do any dangling deferred wakeups.
 	do_nocb_deferred_wakeup(rdp);
 
@@ -4488,7 +4536,6 @@ void rcu_report_dead(unsigned int cpu)
 
 	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
 	mask = rdp->grpmask;
-	local_irq_save(seq_flags);
 	arch_spin_lock(&rcu_state.ofl_lock);
 	raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
 	rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
@@ -4502,8 +4549,6 @@ void rcu_report_dead(unsigned int cpu)
 	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	arch_spin_unlock(&rcu_state.ofl_lock);
-	local_irq_restore(seq_flags);
-
 	rdp->cpu_started = false;
 }
 
@@ -4558,7 +4603,60 @@ void rcutree_migrate_callbacks(int cpu)
 		  cpu, rcu_segcblist_n_cbs(&rdp->cblist),
 		  rcu_segcblist_first_cb(&rdp->cblist));
 }
-#endif
+
+/*
+ * The CPU has been completely removed, and some other CPU is reporting
+ * this fact from process context.  Do the remainder of the cleanup.
+ * There can only be one CPU hotplug operation at a time, so no need for
+ * explicit locking.
+ */
+int rcutree_dead_cpu(unsigned int cpu)
+{
+	WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
+	// Stop-machine done, so allow nohz_full to disable tick.
+	tick_dep_clear(TICK_DEP_BIT_RCU);
+	return 0;
+}
+
+/*
+ * Near the end of the offline process.  Trace the fact that this CPU
+ * is going offline.
+ */
+int rcutree_dying_cpu(unsigned int cpu)
+{
+	bool blkd;
+	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+	struct rcu_node *rnp = rdp->mynode;
+
+	blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask);
+	trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
+			       blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
+	return 0;
+}
+
+/*
+ * Near the beginning of the process.  The CPU is still very much alive
+ * with pretty much all services enabled.
+ */
+int rcutree_offline_cpu(unsigned int cpu)
+{
+	unsigned long flags;
+	struct rcu_data *rdp;
+	struct rcu_node *rnp;
+
+	rdp = per_cpu_ptr(&rcu_data, cpu);
+	rnp = rdp->mynode;
+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
+	rnp->ffmask &= ~rdp->grpmask;
+	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+	rcutree_affinity_setting(cpu, cpu);
+
+	// nohz_full CPUs need the tick for stop-machine to work quickly
+	tick_dep_set(TICK_DEP_BIT_RCU);
+	return 0;
+}
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 
 /*
  * On non-huge systems, use expedited RCU grace periods to make suspend
@@ -4990,7 +5088,7 @@ void __init rcu_init(void)
 	pm_notifier(rcu_pm_notify, 0);
 	WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot.
 	rcutree_prepare_cpu(cpu);
-	rcu_cpu_starting(cpu);
+	rcutree_report_cpu_starting(cpu);
 	rcutree_online_cpu(cpu);
 
 	/* Create workqueue for Tree SRCU and for expedited GPs. */
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 192536916f9a..e9821a8422db 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -386,6 +386,10 @@ struct rcu_state {
 						/*  in jiffies. */
 	unsigned long jiffies_stall;		/* Time at which to check */
 						/*  for CPU stalls. */
+	int nr_fqs_jiffies_stall;		/* Number of fqs loops after
+						 * which to read jiffies and
+						 * set jiffies_stall. Stall
+						 * warnings disabled if !0. */
 	unsigned long jiffies_resched;		/* Time at which to resched */
 						/*  a reluctant CPU. */
 	unsigned long n_force_qs_gpstart;	/* Snapshot of n_force_qs at */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 8239b39d945b..6d7cea5d591f 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -621,10 +621,14 @@ static void synchronize_rcu_expedited_wait(void)
 	}
 
 	for (;;) {
+		unsigned long j;
+
 		if (synchronize_rcu_expedited_wait_once(jiffies_stall))
 			return;
 		if (rcu_stall_is_suppressed())
 			continue;
+		j = jiffies;
+		rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_EXP, (void *)(j - jiffies_start));
 		trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall"));
 		pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
 		       rcu_state.name);
@@ -647,7 +651,7 @@ static void synchronize_rcu_expedited_wait(void)
 			}
 		}
 		pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
-			jiffies - jiffies_start, rcu_state.expedited_sequence,
+			j - jiffies_start, rcu_state.expedited_sequence,
 			data_race(rnp_root->expmask),
 			".T"[!!data_race(rnp_root->exp_tasks)]);
 		if (ndetected) {
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 6f06dc12904a..ac8e86babe44 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -8,6 +8,7 @@
  */
 
 #include <linux/kvm_para.h>
+#include <linux/rcu_notifier.h>
 
 //////////////////////////////////////////////////////////////////////////////
 //
@@ -149,12 +150,17 @@ static void panic_on_rcu_stall(void)
 /**
  * rcu_cpu_stall_reset - restart stall-warning timeout for current grace period
  *
+ * To perform the reset request from the caller, disable stall detection until
+ * 3 fqs loops have passed. This is required to ensure a fresh jiffies is
+ * loaded.  It should be safe to do from the fqs loop as enough timer
+ * interrupts and context switches should have passed.
+ *
  * The caller must disable hard irqs.
  */
 void rcu_cpu_stall_reset(void)
 {
-	WRITE_ONCE(rcu_state.jiffies_stall,
-		   jiffies + rcu_jiffies_till_stall_check());
+	WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 3);
+	WRITE_ONCE(rcu_state.jiffies_stall, ULONG_MAX);
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -170,6 +176,7 @@ static void record_gp_stall_check_time(void)
 	WRITE_ONCE(rcu_state.gp_start, j);
 	j1 = rcu_jiffies_till_stall_check();
 	smp_mb(); // ->gp_start before ->jiffies_stall and caller's ->gp_seq.
+	WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 0);
 	WRITE_ONCE(rcu_state.jiffies_stall, j + j1);
 	rcu_state.jiffies_resched = j + j1 / 2;
 	rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs);
@@ -534,16 +541,16 @@ static void rcu_check_gp_kthread_starvation(void)
 		       data_race(READ_ONCE(rcu_state.gp_state)),
 		       gpk ? data_race(READ_ONCE(gpk->__state)) : ~0, cpu);
 		if (gpk) {
+			struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
 			pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name);
 			pr_err("RCU grace-period kthread stack dump:\n");
 			sched_show_task(gpk);
-			if (cpu >= 0) {
-				if (cpu_is_offline(cpu)) {
-					pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu);
-				} else  {
-					pr_err("Stack dump where RCU GP kthread last ran:\n");
-					dump_cpu_task(cpu);
-				}
+			if (cpu_is_offline(cpu)) {
+				pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu);
+			} else if (!(data_race(READ_ONCE(rdp->mynode->qsmask)) & rdp->grpmask)) {
+				pr_err("Stack dump where RCU GP kthread last ran:\n");
+				dump_cpu_task(cpu);
 			}
 			wake_up_process(gpk);
 		}
@@ -711,7 +718,7 @@ static void print_cpu_stall(unsigned long gps)
 
 static void check_cpu_stall(struct rcu_data *rdp)
 {
-	bool didstall = false;
+	bool self_detected;
 	unsigned long gs1;
 	unsigned long gs2;
 	unsigned long gps;
@@ -725,6 +732,16 @@ static void check_cpu_stall(struct rcu_data *rdp)
 	    !rcu_gp_in_progress())
 		return;
 	rcu_stall_kick_kthreads();
+
+	/*
+	 * Check if it was requested (via rcu_cpu_stall_reset()) that the FQS
+	 * loop has to set jiffies to ensure a non-stale jiffies value. This
+	 * is required to have good jiffies value after coming out of long
+	 * breaks of jiffies updates. Not doing so can cause false positives.
+	 */
+	if (READ_ONCE(rcu_state.nr_fqs_jiffies_stall) > 0)
+		return;
+
 	j = jiffies;
 
 	/*
@@ -758,10 +775,10 @@ static void check_cpu_stall(struct rcu_data *rdp)
 		return; /* No stall or GP completed since entering function. */
 	rnp = rdp->mynode;
 	jn = jiffies + ULONG_MAX / 2;
+	self_detected = READ_ONCE(rnp->qsmask) & rdp->grpmask;
 	if (rcu_gp_in_progress() &&
-	    (READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
+	    (self_detected || ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) &&
 	    cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
-
 		/*
 		 * If a virtual machine is stopped by the host it can look to
 		 * the watchdog like an RCU stall. Check to see if the host
@@ -770,39 +787,28 @@ static void check_cpu_stall(struct rcu_data *rdp)
 		if (kvm_check_and_clear_guest_paused())
 			return;
 
-		/* We haven't checked in, so go dump stack. */
-		print_cpu_stall(gps);
-		if (READ_ONCE(rcu_cpu_stall_ftrace_dump))
-			rcu_ftrace_dump(DUMP_ALL);
-		didstall = true;
-
-	} else if (rcu_gp_in_progress() &&
-		   ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
-		   cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
-
-		/*
-		 * If a virtual machine is stopped by the host it can look to
-		 * the watchdog like an RCU stall. Check to see if the host
-		 * stopped the vm.
-		 */
-		if (kvm_check_and_clear_guest_paused())
-			return;
+		rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps);
+		if (self_detected) {
+			/* We haven't checked in, so go dump stack. */
+			print_cpu_stall(gps);
+		} else {
+			/* They had a few time units to dump stack, so complain. */
+			print_other_cpu_stall(gs2, gps);
+		}
 
-		/* They had a few time units to dump stack, so complain. */
-		print_other_cpu_stall(gs2, gps);
 		if (READ_ONCE(rcu_cpu_stall_ftrace_dump))
 			rcu_ftrace_dump(DUMP_ALL);
-		didstall = true;
-	}
-	if (didstall && READ_ONCE(rcu_state.jiffies_stall) == jn) {
-		jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
-		WRITE_ONCE(rcu_state.jiffies_stall, jn);
+
+		if (READ_ONCE(rcu_state.jiffies_stall) == jn) {
+			jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+			WRITE_ONCE(rcu_state.jiffies_stall, jn);
+		}
 	}
 }
 
 //////////////////////////////////////////////////////////////////////////////
 //
-// RCU forward-progress mechanisms, including of callback invocation.
+// RCU forward-progress mechanisms, including for callback invocation.
 
 
 /*
@@ -1054,3 +1060,58 @@ static int __init rcu_sysrq_init(void)
 	return 0;
 }
 early_initcall(rcu_sysrq_init);
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// RCU CPU stall-warning notifiers
+
+static ATOMIC_NOTIFIER_HEAD(rcu_cpu_stall_notifier_list);
+
+/**
+ * rcu_stall_chain_notifier_register - Add an RCU CPU stall notifier
+ * @n: Entry to add.
+ *
+ * Adds an RCU CPU stall notifier to an atomic notifier chain.
+ * The @action passed to a notifier will be @RCU_STALL_NOTIFY_NORM or
+ * friends.  The @data will be the duration of the stalled grace period,
+ * in jiffies, coerced to a void* pointer.
+ *
+ * Returns 0 on success, %-EEXIST on error.
+ */
+int rcu_stall_chain_notifier_register(struct notifier_block *n)
+{
+	return atomic_notifier_chain_register(&rcu_cpu_stall_notifier_list, n);
+}
+EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_register);
+
+/**
+ * rcu_stall_chain_notifier_unregister - Remove an RCU CPU stall notifier
+ * @n: Entry to remove.
+ *
+ * Removes an RCU CPU stall notifier from an atomic notifier chain.
+ *
+ * Returns zero on success, %-ENOENT on failure.
+ */
+int rcu_stall_chain_notifier_unregister(struct notifier_block *n)
+{
+	return atomic_notifier_chain_unregister(&rcu_cpu_stall_notifier_list, n);
+}
+EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_unregister);
+
+/*
+ * rcu_stall_notifier_call_chain - Call functions in an RCU CPU stall notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ *
+ * Calls each function in the RCU CPU stall notifier chain in turn, which
+ * is an atomic call chain.  See atomic_notifier_call_chain() for more
+ * information.
+ *
+ * This is for use within RCU, hence the omission of the extra asterisk
+ * to indicate a non-kerneldoc format header comment.
+ */
+int rcu_stall_notifier_call_chain(unsigned long val, void *v)
+{
+	return atomic_notifier_call_chain(&rcu_cpu_stall_notifier_list, val, v);
+}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 19bf6fa3ee6a..c534d6806d3d 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -25,6 +25,7 @@
 #include <linux/interrupt.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/debug.h>
+#include <linux/torture.h>
 #include <linux/atomic.h>
 #include <linux/bitops.h>
 #include <linux/percpu.h>
@@ -524,17 +525,17 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
 	do { } while (0)
 #endif
 
-#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
+#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST)
 /* Get rcutorture access to sched_setaffinity(). */
-long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 {
 	int ret;
 
 	ret = sched_setaffinity(pid, in_mask);
-	WARN_ONCE(ret, "%s: sched_setaffinity() returned %d\n", __func__, ret);
+	WARN_ONCE(ret, "%s: sched_setaffinity(%d) returned %d\n", __func__, pid, ret);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity);
+EXPORT_SYMBOL_GPL(torture_sched_setaffinity);
 #endif
 
 #ifdef CONFIG_RCU_STALL_COMMON
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
index 99bdd96f454f..80a3df49ab47 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c
@@ -34,7 +34,6 @@
 #include <linux/nospec.h>
 #include <linux/proc_fs.h>
 #include <linux/psi.h>
-#include <linux/psi.h>
 #include <linux/ptrace_api.h>
 #include <linux/sched_clock.h>
 #include <linux/security.h>
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 802551e0009b..81885748871d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -85,7 +85,6 @@
 
 #include "sched.h"
 #include "stats.h"
-#include "autogroup.h"
 
 #include "autogroup.h"
 #include "pelt.h"
@@ -114,6 +113,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
@@ -919,14 +919,13 @@ static bool set_nr_if_polling(struct task_struct *p)
 	struct thread_info *ti = task_thread_info(p);
 	typeof(ti->flags) val = READ_ONCE(ti->flags);
 
-	for (;;) {
+	do {
 		if (!(val & _TIF_POLLING_NRFLAG))
 			return false;
 		if (val & _TIF_NEED_RESCHED)
 			return true;
-		if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED))
-			break;
-	}
+	} while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED));
+
 	return true;
 }
 
@@ -1480,16 +1479,12 @@ static void __uclamp_update_util_min_rt_default(struct task_struct *p)
 
 static void uclamp_update_util_min_rt_default(struct task_struct *p)
 {
-	struct rq_flags rf;
-	struct rq *rq;
-
 	if (!rt_task(p))
 		return;
 
 	/* Protect updates to p->uclamp_* */
-	rq = task_rq_lock(p, &rf);
+	guard(task_rq_lock)(p);
 	__uclamp_update_util_min_rt_default(p);
-	task_rq_unlock(rq, p, &rf);
 }
 
 static inline struct uclamp_se
@@ -1785,9 +1780,8 @@ static void uclamp_update_root_tg(void)
 	uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
 		      sysctl_sched_uclamp_util_max, false);
 
-	rcu_read_lock();
+	guard(rcu)();
 	cpu_util_update_eff(&root_task_group.css);
-	rcu_read_unlock();
 }
 #else
 static void uclamp_update_root_tg(void) { }
@@ -1814,10 +1808,9 @@ static void uclamp_sync_util_min_rt_default(void)
 	smp_mb__after_spinlock();
 	read_unlock(&tasklist_lock);
 
-	rcu_read_lock();
+	guard(rcu)();
 	for_each_process_thread(g, p)
 		uclamp_update_util_min_rt_default(p);
-	rcu_read_unlock();
 }
 
 static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
@@ -2218,10 +2211,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 		p->sched_class->prio_changed(rq, p, oldprio);
 }
 
-void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
 {
 	if (p->sched_class == rq->curr->sched_class)
-		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+		rq->curr->sched_class->wakeup_preempt(rq, p, flags);
 	else if (sched_class_above(p->sched_class, rq->curr->sched_class))
 		resched_curr(rq);
 
@@ -2239,31 +2232,21 @@ int __task_state_match(struct task_struct *p, unsigned int state)
 	if (READ_ONCE(p->__state) & state)
 		return 1;
 
-#ifdef CONFIG_PREEMPT_RT
 	if (READ_ONCE(p->saved_state) & state)
 		return -1;
-#endif
+
 	return 0;
 }
 
 static __always_inline
 int task_state_match(struct task_struct *p, unsigned int state)
 {
-#ifdef CONFIG_PREEMPT_RT
-	int match;
-
 	/*
-	 * Serialize against current_save_and_set_rtlock_wait_state() and
-	 * current_restore_rtlock_saved_state().
+	 * Serialize against current_save_and_set_rtlock_wait_state(),
+	 * current_restore_rtlock_saved_state(), and __refrigerator().
 	 */
-	raw_spin_lock_irq(&p->pi_lock);
-	match = __task_state_match(p, state);
-	raw_spin_unlock_irq(&p->pi_lock);
-
-	return match;
-#else
+	guard(raw_spinlock_irq)(&p->pi_lock);
 	return __task_state_match(p, state);
-#endif
 }
 
 /*
@@ -2417,10 +2400,9 @@ void migrate_disable(void)
 		return;
 	}
 
-	preempt_disable();
+	guard(preempt)();
 	this_rq()->nr_pinned++;
 	p->migration_disabled = 1;
-	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_disable);
 
@@ -2444,7 +2426,7 @@ void migrate_enable(void)
 	 * Ensure stop_task runs either before or after this, and that
 	 * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
 	 */
-	preempt_disable();
+	guard(preempt)();
 	if (p->cpus_ptr != &p->cpus_mask)
 		__set_cpus_allowed_ptr(p, &ac);
 	/*
@@ -2455,7 +2437,6 @@ void migrate_enable(void)
 	barrier();
 	p->migration_disabled = 0;
 	this_rq()->nr_pinned--;
-	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
 
@@ -2527,7 +2508,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
 	rq_lock(rq, rf);
 	WARN_ON_ONCE(task_cpu(p) != new_cpu);
 	activate_task(rq, p, 0);
-	check_preempt_curr(rq, p, 0);
+	wakeup_preempt(rq, p, 0);
 
 	return rq;
 }
@@ -2664,9 +2645,11 @@ static int migration_cpu_stop(void *data)
 		 * it.
 		 */
 		WARN_ON_ONCE(!pending->stop_pending);
+		preempt_disable();
 		task_rq_unlock(rq, p, &rf);
 		stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
 				    &pending->arg, &pending->stop_work);
+		preempt_enable();
 		return 0;
 	}
 out:
@@ -2986,12 +2969,13 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
 			complete = true;
 		}
 
+		preempt_disable();
 		task_rq_unlock(rq, p, rf);
-
 		if (push_task) {
 			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
 					    p, &rq->push_work);
 		}
+		preempt_enable();
 
 		if (complete)
 			complete_all(&pending->done);
@@ -3057,12 +3041,13 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
 		if (flags & SCA_MIGRATE_ENABLE)
 			p->migration_flags &= ~MDF_PUSH;
 
+		preempt_disable();
 		task_rq_unlock(rq, p, rf);
-
 		if (!stop_pending) {
 			stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
 					    &pending->arg, &pending->stop_work);
 		}
+		preempt_enable();
 
 		if (flags & SCA_MIGRATE_ENABLE)
 			return 0;
@@ -3409,7 +3394,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
 		deactivate_task(src_rq, p, 0);
 		set_task_cpu(p, cpu);
 		activate_task(dst_rq, p, 0);
-		check_preempt_curr(dst_rq, p, 0);
+		wakeup_preempt(dst_rq, p, 0);
 
 		rq_unpin_lock(dst_rq, &drf);
 		rq_unpin_lock(src_rq, &srf);
@@ -3516,13 +3501,11 @@ out:
  */
 void kick_process(struct task_struct *p)
 {
-	int cpu;
+	guard(preempt)();
+	int cpu = task_cpu(p);
 
-	preempt_disable();
-	cpu = task_cpu(p);
 	if ((cpu != smp_processor_id()) && task_curr(p))
 		smp_send_reschedule(cpu);
-	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(kick_process);
 
@@ -3785,7 +3768,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 	}
 
 	activate_task(rq, p, en_flags);
-	check_preempt_curr(rq, p, wake_flags);
+	wakeup_preempt(rq, p, wake_flags);
 
 	ttwu_do_wakeup(p);
 
@@ -3809,9 +3792,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 		if (rq->avg_idle > max)
 			rq->avg_idle = max;
 
-		rq->wake_stamp = jiffies;
-		rq->wake_avg_idle = rq->avg_idle / 2;
-
 		rq->idle_stamp = 0;
 	}
 #endif
@@ -3856,7 +3836,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
 			 * it should preempt the task that is current now.
 			 */
 			update_rq_clock(rq);
-			check_preempt_curr(rq, p, wake_flags);
+			wakeup_preempt(rq, p, wake_flags);
 		}
 		ttwu_do_wakeup(p);
 		ret = 1;
@@ -3956,6 +3936,18 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
 	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 }
 
+/*
+ * Whether CPUs share cache resources, which means LLC on non-cluster
+ * machines and LLC tag or L2 on machines with clusters.
+ */
+bool cpus_share_resources(int this_cpu, int that_cpu)
+{
+	if (this_cpu == that_cpu)
+		return true;
+
+	return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu);
+}
+
 static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
 {
 	/*
@@ -4036,13 +4028,17 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  * The caller holds p::pi_lock if p != current or has preemption
  * disabled when p == current.
  *
- * The rules of PREEMPT_RT saved_state:
+ * The rules of saved_state:
  *
  *   The related locking code always holds p::pi_lock when updating
  *   p::saved_state, which means the code is fully serialized in both cases.
  *
- *   The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
- *   bits set. This allows to distinguish all wakeup scenarios.
+ *   For PREEMPT_RT, the lock wait and lock wakeups happen via TASK_RTLOCK_WAIT.
+ *   No other bits set. This allows to distinguish all wakeup scenarios.
+ *
+ *   For FREEZER, the wakeup happens via TASK_FROZEN. No other bits set. This
+ *   allows us to prevent early wakeup of tasks before they can be run on
+ *   asymmetric ISA architectures (eg ARMv9).
  */
 static __always_inline
 bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
@@ -4056,13 +4052,13 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
 
 	*success = !!(match = __task_state_match(p, state));
 
-#ifdef CONFIG_PREEMPT_RT
 	/*
 	 * Saved state preserves the task state across blocking on
-	 * an RT lock.  If the state matches, set p::saved_state to
-	 * TASK_RUNNING, but do not wake the task because it waits
-	 * for a lock wakeup. Also indicate success because from
-	 * the regular waker's point of view this has succeeded.
+	 * an RT lock or TASK_FREEZABLE tasks.  If the state matches,
+	 * set p::saved_state to TASK_RUNNING, but do not wake the task
+	 * because it waits for a lock wakeup or __thaw_task(). Also
+	 * indicate success because from the regular waker's point of
+	 * view this has succeeded.
 	 *
 	 * After acquiring the lock the task will restore p::__state
 	 * from p::saved_state which ensures that the regular
@@ -4072,7 +4068,7 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
 	 */
 	if (match < 0)
 		p->saved_state = TASK_RUNNING;
-#endif
+
 	return match > 0;
 }
 
@@ -4254,7 +4250,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 		 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
 		 * __schedule().  See the comment for smp_mb__after_spinlock().
 		 *
-		 * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
+		 * A similar smp_rmb() lives in __task_needs_rq_lock().
 		 */
 		smp_rmb();
 		if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
@@ -4871,7 +4867,7 @@ void wake_up_new_task(struct task_struct *p)
 
 	activate_task(rq, p, ENQUEUE_NOCLOCK);
 	trace_sched_wakeup_new(p);
-	check_preempt_curr(rq, p, WF_FORK);
+	wakeup_preempt(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_woken) {
 		/*
@@ -5374,8 +5370,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	/* switch_mm_cid() requires the memory barriers above. */
 	switch_mm_cid(rq, prev, next);
 
-	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-
 	prepare_lock_switch(rq, next, rf);
 
 	/* Here we just switch the register state and the stack. */
@@ -5916,8 +5910,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
 	print_modules();
 	if (irqs_disabled())
 		print_irqtrace_events(prev);
-	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
-	    && in_atomic_preempt_off()) {
+	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
 		pr_err("Preemption disabled at:");
 		print_ip_sym(KERN_ERR, preempt_disable_ip);
 	}
@@ -6368,8 +6361,9 @@ static void sched_core_balance(struct rq *rq)
 	struct sched_domain *sd;
 	int cpu = cpu_of(rq);
 
-	preempt_disable();
-	rcu_read_lock();
+	guard(preempt)();
+	guard(rcu)();
+
 	raw_spin_rq_unlock_irq(rq);
 	for_each_domain(cpu, sd) {
 		if (need_resched())
@@ -6379,8 +6373,6 @@ static void sched_core_balance(struct rq *rq)
 			break;
 	}
 	raw_spin_rq_lock_irq(rq);
-	rcu_read_unlock();
-	preempt_enable();
 }
 
 static DEFINE_PER_CPU(struct balance_callback, core_balance_head);
@@ -6615,6 +6607,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 	/* Promote REQ to ACT */
 	rq->clock_update_flags <<= 1;
 	update_rq_clock(rq);
+	rq->clock_update_flags = RQCF_UPDATED;
 
 	switch_count = &prev->nivcsw;
 
@@ -6694,8 +6687,6 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 		/* Also unlocks the rq: */
 		rq = context_switch(rq, prev, next, &rf);
 	} else {
-		rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-
 		rq_unpin_lock(rq, &rf);
 		__balance_callbacks(rq);
 		raw_spin_rq_unlock_irq(rq);
@@ -6720,22 +6711,24 @@ void __noreturn do_task_dead(void)
 
 static inline void sched_submit_work(struct task_struct *tsk)
 {
+	static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG);
 	unsigned int task_flags;
 
-	if (task_is_running(tsk))
-		return;
+	/*
+	 * Establish LD_WAIT_CONFIG context to ensure none of the code called
+	 * will use a blocking primitive -- which would lead to recursion.
+	 */
+	lock_map_acquire_try(&sched_map);
 
 	task_flags = tsk->flags;
 	/*
 	 * If a worker goes to sleep, notify and ask workqueue whether it
 	 * wants to wake up a task to maintain concurrency.
 	 */
-	if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
-		if (task_flags & PF_WQ_WORKER)
-			wq_worker_sleeping(tsk);
-		else
-			io_wq_worker_sleeping(tsk);
-	}
+	if (task_flags & PF_WQ_WORKER)
+		wq_worker_sleeping(tsk);
+	else if (task_flags & PF_IO_WORKER)
+		io_wq_worker_sleeping(tsk);
 
 	/*
 	 * spinlock and rwlock must not flush block requests.  This will
@@ -6749,6 +6742,8 @@ static inline void sched_submit_work(struct task_struct *tsk)
 	 * make sure to submit it to avoid deadlocks.
 	 */
 	blk_flush_plug(tsk->plug, true);
+
+	lock_map_release(&sched_map);
 }
 
 static void sched_update_worker(struct task_struct *tsk)
@@ -6761,16 +6756,26 @@ static void sched_update_worker(struct task_struct *tsk)
 	}
 }
 
-asmlinkage __visible void __sched schedule(void)
+static __always_inline void __schedule_loop(unsigned int sched_mode)
 {
-	struct task_struct *tsk = current;
-
-	sched_submit_work(tsk);
 	do {
 		preempt_disable();
-		__schedule(SM_NONE);
+		__schedule(sched_mode);
 		sched_preempt_enable_no_resched();
 	} while (need_resched());
+}
+
+asmlinkage __visible void __sched schedule(void)
+{
+	struct task_struct *tsk = current;
+
+#ifdef CONFIG_RT_MUTEXES
+	lockdep_assert(!tsk->sched_rt_mutex);
+#endif
+
+	if (!task_is_running(tsk))
+		sched_submit_work(tsk);
+	__schedule_loop(SM_NONE);
 	sched_update_worker(tsk);
 }
 EXPORT_SYMBOL(schedule);
@@ -6834,11 +6839,7 @@ void __sched schedule_preempt_disabled(void)
 #ifdef CONFIG_PREEMPT_RT
 void __sched notrace schedule_rtlock(void)
 {
-	do {
-		preempt_disable();
-		__schedule(SM_RTLOCK_WAIT);
-		sched_preempt_enable_no_resched();
-	} while (need_resched());
+	__schedule_loop(SM_RTLOCK_WAIT);
 }
 NOKPROBE_SYMBOL(schedule_rtlock);
 #endif
@@ -7034,6 +7035,32 @@ static void __setscheduler_prio(struct task_struct *p, int prio)
 
 #ifdef CONFIG_RT_MUTEXES
 
+/*
+ * Would be more useful with typeof()/auto_type but they don't mix with
+ * bit-fields. Since it's a local thing, use int. Keep the generic sounding
+ * name such that if someone were to implement this function we get to compare
+ * notes.
+ */
+#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; })
+
+void rt_mutex_pre_schedule(void)
+{
+	lockdep_assert(!fetch_and_set(current->sched_rt_mutex, 1));
+	sched_submit_work(current);
+}
+
+void rt_mutex_schedule(void)
+{
+	lockdep_assert(current->sched_rt_mutex);
+	__schedule_loop(SM_NONE);
+}
+
+void rt_mutex_post_schedule(void)
+{
+	sched_update_worker(current);
+	lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0));
+}
+
 static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
 {
 	if (pi_task)
@@ -7187,9 +7214,8 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
 void set_user_nice(struct task_struct *p, long nice)
 {
 	bool queued, running;
-	int old_prio;
-	struct rq_flags rf;
 	struct rq *rq;
+	int old_prio;
 
 	if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
 		return;
@@ -7197,7 +7223,9 @@ void set_user_nice(struct task_struct *p, long nice)
 	 * We have to be careful, if called from sys_setpriority(),
 	 * the task might be in the middle of scheduling on another CPU.
 	 */
-	rq = task_rq_lock(p, &rf);
+	CLASS(task_rq_lock, rq_guard)(p);
+	rq = rq_guard.rq;
+
 	update_rq_clock(rq);
 
 	/*
@@ -7208,8 +7236,9 @@ void set_user_nice(struct task_struct *p, long nice)
 	 */
 	if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
 		p->static_prio = NICE_TO_PRIO(nice);
-		goto out_unlock;
+		return;
 	}
+
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
@@ -7232,9 +7261,6 @@ void set_user_nice(struct task_struct *p, long nice)
 	 * lowered its priority, then reschedule its CPU:
 	 */
 	p->sched_class->prio_changed(rq, p, old_prio);
-
-out_unlock:
-	task_rq_unlock(rq, p, &rf);
 }
 EXPORT_SYMBOL(set_user_nice);
 
@@ -7507,6 +7533,21 @@ static struct task_struct *find_process_by_pid(pid_t pid)
 	return pid ? find_task_by_vpid(pid) : current;
 }
 
+static struct task_struct *find_get_task(pid_t pid)
+{
+	struct task_struct *p;
+	guard(rcu)();
+
+	p = find_process_by_pid(pid);
+	if (likely(p))
+		get_task_struct(p);
+
+	return p;
+}
+
+DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T),
+	     find_get_task(pid), pid_t pid)
+
 /*
  * sched_setparam() passes in -1 for its policy, to let the functions
  * it calls know not to change it.
@@ -7544,14 +7585,11 @@ static void __setscheduler_params(struct task_struct *p,
 static bool check_same_owner(struct task_struct *p)
 {
 	const struct cred *cred = current_cred(), *pcred;
-	bool match;
+	guard(rcu)();
 
-	rcu_read_lock();
 	pcred = __task_cred(p);
-	match = (uid_eq(cred->euid, pcred->euid) ||
-		 uid_eq(cred->euid, pcred->uid));
-	rcu_read_unlock();
-	return match;
+	return (uid_eq(cred->euid, pcred->euid) ||
+		uid_eq(cred->euid, pcred->uid));
 }
 
 /*
@@ -7963,27 +8001,17 @@ static int
 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 {
 	struct sched_param lparam;
-	struct task_struct *p;
-	int retval;
 
 	if (!param || pid < 0)
 		return -EINVAL;
 	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
 		return -EFAULT;
 
-	rcu_read_lock();
-	retval = -ESRCH;
-	p = find_process_by_pid(pid);
-	if (likely(p))
-		get_task_struct(p);
-	rcu_read_unlock();
-
-	if (likely(p)) {
-		retval = sched_setscheduler(p, policy, &lparam);
-		put_task_struct(p);
-	}
+	CLASS(find_get_task, p)(pid);
+	if (!p)
+		return -ESRCH;
 
-	return retval;
+	return sched_setscheduler(p, policy, &lparam);
 }
 
 /*
@@ -8079,7 +8107,6 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
 			       unsigned int, flags)
 {
 	struct sched_attr attr;
-	struct task_struct *p;
 	int retval;
 
 	if (!uattr || pid < 0 || flags)
@@ -8094,21 +8121,14 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
 	if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
 		attr.sched_policy = SETPARAM_POLICY;
 
-	rcu_read_lock();
-	retval = -ESRCH;
-	p = find_process_by_pid(pid);
-	if (likely(p))
-		get_task_struct(p);
-	rcu_read_unlock();
+	CLASS(find_get_task, p)(pid);
+	if (!p)
+		return -ESRCH;
 
-	if (likely(p)) {
-		if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
-			get_params(p, &attr);
-		retval = sched_setattr(p, &attr);
-		put_task_struct(p);
-	}
+	if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
+		get_params(p, &attr);
 
-	return retval;
+	return sched_setattr(p, &attr);
 }
 
 /**
@@ -8126,16 +8146,17 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
 	if (pid < 0)
 		return -EINVAL;
 
-	retval = -ESRCH;
-	rcu_read_lock();
+	guard(rcu)();
 	p = find_process_by_pid(pid);
-	if (p) {
-		retval = security_task_getscheduler(p);
-		if (!retval)
-			retval = p->policy
-				| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
+	if (!p)
+		return -ESRCH;
+
+	retval = security_task_getscheduler(p);
+	if (!retval) {
+		retval = p->policy;
+		if (p->sched_reset_on_fork)
+			retval |= SCHED_RESET_ON_FORK;
 	}
-	rcu_read_unlock();
 	return retval;
 }
 
@@ -8156,30 +8177,23 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
 	if (!param || pid < 0)
 		return -EINVAL;
 
-	rcu_read_lock();
-	p = find_process_by_pid(pid);
-	retval = -ESRCH;
-	if (!p)
-		goto out_unlock;
+	scoped_guard (rcu) {
+		p = find_process_by_pid(pid);
+		if (!p)
+			return -ESRCH;
 
-	retval = security_task_getscheduler(p);
-	if (retval)
-		goto out_unlock;
+		retval = security_task_getscheduler(p);
+		if (retval)
+			return retval;
 
-	if (task_has_rt_policy(p))
-		lp.sched_priority = p->rt_priority;
-	rcu_read_unlock();
+		if (task_has_rt_policy(p))
+			lp.sched_priority = p->rt_priority;
+	}
 
 	/*
 	 * This one might sleep, we cannot do it with a spinlock held ...
 	 */
-	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
-
-	return retval;
-
-out_unlock:
-	rcu_read_unlock();
-	return retval;
+	return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
 }
 
 /*
@@ -8239,46 +8253,38 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 	    usize < SCHED_ATTR_SIZE_VER0 || flags)
 		return -EINVAL;
 
-	rcu_read_lock();
-	p = find_process_by_pid(pid);
-	retval = -ESRCH;
-	if (!p)
-		goto out_unlock;
+	scoped_guard (rcu) {
+		p = find_process_by_pid(pid);
+		if (!p)
+			return -ESRCH;
 
-	retval = security_task_getscheduler(p);
-	if (retval)
-		goto out_unlock;
+		retval = security_task_getscheduler(p);
+		if (retval)
+			return retval;
 
-	kattr.sched_policy = p->policy;
-	if (p->sched_reset_on_fork)
-		kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
-	get_params(p, &kattr);
-	kattr.sched_flags &= SCHED_FLAG_ALL;
+		kattr.sched_policy = p->policy;
+		if (p->sched_reset_on_fork)
+			kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+		get_params(p, &kattr);
+		kattr.sched_flags &= SCHED_FLAG_ALL;
 
 #ifdef CONFIG_UCLAMP_TASK
-	/*
-	 * This could race with another potential updater, but this is fine
-	 * because it'll correctly read the old or the new value. We don't need
-	 * to guarantee who wins the race as long as it doesn't return garbage.
-	 */
-	kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
-	kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+		/*
+		 * This could race with another potential updater, but this is fine
+		 * because it'll correctly read the old or the new value. We don't need
+		 * to guarantee who wins the race as long as it doesn't return garbage.
+		 */
+		kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+		kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
 #endif
-
-	rcu_read_unlock();
+	}
 
 	return sched_attr_copy_to_user(uattr, &kattr, usize);
-
-out_unlock:
-	rcu_read_unlock();
-	return retval;
 }
 
 #ifdef CONFIG_SMP
 int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
 {
-	int ret = 0;
-
 	/*
 	 * If the task isn't a deadline task or admission control is
 	 * disabled then we don't care about affinity changes.
@@ -8292,11 +8298,11 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
 	 * tasks allowed to run on all the CPUs in the task's
 	 * root_domain.
 	 */
-	rcu_read_lock();
+	guard(rcu)();
 	if (!cpumask_subset(task_rq(p)->rd->span, mask))
-		ret = -EBUSY;
-	rcu_read_unlock();
-	return ret;
+		return -EBUSY;
+
+	return 0;
 }
 #endif
 
@@ -8366,39 +8372,24 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 {
 	struct affinity_context ac;
 	struct cpumask *user_mask;
-	struct task_struct *p;
 	int retval;
 
-	rcu_read_lock();
-
-	p = find_process_by_pid(pid);
-	if (!p) {
-		rcu_read_unlock();
+	CLASS(find_get_task, p)(pid);
+	if (!p)
 		return -ESRCH;
-	}
-
-	/* Prevent p going away */
-	get_task_struct(p);
-	rcu_read_unlock();
 
-	if (p->flags & PF_NO_SETAFFINITY) {
-		retval = -EINVAL;
-		goto out_put_task;
-	}
+	if (p->flags & PF_NO_SETAFFINITY)
+		return -EINVAL;
 
 	if (!check_same_owner(p)) {
-		rcu_read_lock();
-		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
-			rcu_read_unlock();
-			retval = -EPERM;
-			goto out_put_task;
-		}
-		rcu_read_unlock();
+		guard(rcu)();
+		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
+			return -EPERM;
 	}
 
 	retval = security_task_setscheduler(p);
 	if (retval)
-		goto out_put_task;
+		return retval;
 
 	/*
 	 * With non-SMP configs, user_cpus_ptr/user_mask isn't used and
@@ -8408,8 +8399,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	if (user_mask) {
 		cpumask_copy(user_mask, in_mask);
 	} else if (IS_ENABLED(CONFIG_SMP)) {
-		retval = -ENOMEM;
-		goto out_put_task;
+		return -ENOMEM;
 	}
 
 	ac = (struct affinity_context){
@@ -8421,8 +8411,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	retval = __sched_setaffinity(p, &ac);
 	kfree(ac.user_mask);
 
-out_put_task:
-	put_task_struct(p);
 	return retval;
 }
 
@@ -8464,28 +8452,21 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
 long sched_getaffinity(pid_t pid, struct cpumask *mask)
 {
 	struct task_struct *p;
-	unsigned long flags;
 	int retval;
 
-	rcu_read_lock();
-
-	retval = -ESRCH;
+	guard(rcu)();
 	p = find_process_by_pid(pid);
 	if (!p)
-		goto out_unlock;
+		return -ESRCH;
 
 	retval = security_task_getscheduler(p);
 	if (retval)
-		goto out_unlock;
+		return retval;
 
-	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	guard(raw_spinlock_irqsave)(&p->pi_lock);
 	cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
-	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-
-out_unlock:
-	rcu_read_unlock();
 
-	return retval;
+	return 0;
 }
 
 /**
@@ -8932,55 +8913,46 @@ int __sched yield_to(struct task_struct *p, bool preempt)
 {
 	struct task_struct *curr = current;
 	struct rq *rq, *p_rq;
-	unsigned long flags;
 	int yielded = 0;
 
-	local_irq_save(flags);
-	rq = this_rq();
+	scoped_guard (irqsave) {
+		rq = this_rq();
 
 again:
-	p_rq = task_rq(p);
-	/*
-	 * If we're the only runnable task on the rq and target rq also
-	 * has only one task, there's absolutely no point in yielding.
-	 */
-	if (rq->nr_running == 1 && p_rq->nr_running == 1) {
-		yielded = -ESRCH;
-		goto out_irq;
-	}
+		p_rq = task_rq(p);
+		/*
+		 * If we're the only runnable task on the rq and target rq also
+		 * has only one task, there's absolutely no point in yielding.
+		 */
+		if (rq->nr_running == 1 && p_rq->nr_running == 1)
+			return -ESRCH;
 
-	double_rq_lock(rq, p_rq);
-	if (task_rq(p) != p_rq) {
-		double_rq_unlock(rq, p_rq);
-		goto again;
-	}
+		guard(double_rq_lock)(rq, p_rq);
+		if (task_rq(p) != p_rq)
+			goto again;
 
-	if (!curr->sched_class->yield_to_task)
-		goto out_unlock;
+		if (!curr->sched_class->yield_to_task)
+			return 0;
 
-	if (curr->sched_class != p->sched_class)
-		goto out_unlock;
+		if (curr->sched_class != p->sched_class)
+			return 0;
 
-	if (task_on_cpu(p_rq, p) || !task_is_running(p))
-		goto out_unlock;
+		if (task_on_cpu(p_rq, p) || !task_is_running(p))
+			return 0;
 
-	yielded = curr->sched_class->yield_to_task(rq, p);
-	if (yielded) {
-		schedstat_inc(rq->yld_count);
-		/*
-		 * Make p's CPU reschedule; pick_next_entity takes care of
-		 * fairness.
-		 */
-		if (preempt && rq != p_rq)
-			resched_curr(p_rq);
+		yielded = curr->sched_class->yield_to_task(rq, p);
+		if (yielded) {
+			schedstat_inc(rq->yld_count);
+			/*
+			 * Make p's CPU reschedule; pick_next_entity
+			 * takes care of fairness.
+			 */
+			if (preempt && rq != p_rq)
+				resched_curr(p_rq);
+		}
 	}
 
-out_unlock:
-	double_rq_unlock(rq, p_rq);
-out_irq:
-	local_irq_restore(flags);
-
-	if (yielded > 0)
+	if (yielded)
 		schedule();
 
 	return yielded;
@@ -9083,38 +9055,30 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
 
 static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
 {
-	struct task_struct *p;
-	unsigned int time_slice;
-	struct rq_flags rf;
-	struct rq *rq;
+	unsigned int time_slice = 0;
 	int retval;
 
 	if (pid < 0)
 		return -EINVAL;
 
-	retval = -ESRCH;
-	rcu_read_lock();
-	p = find_process_by_pid(pid);
-	if (!p)
-		goto out_unlock;
+	scoped_guard (rcu) {
+		struct task_struct *p = find_process_by_pid(pid);
+		if (!p)
+			return -ESRCH;
 
-	retval = security_task_getscheduler(p);
-	if (retval)
-		goto out_unlock;
+		retval = security_task_getscheduler(p);
+		if (retval)
+			return retval;
 
-	rq = task_rq_lock(p, &rf);
-	time_slice = 0;
-	if (p->sched_class->get_rr_interval)
-		time_slice = p->sched_class->get_rr_interval(rq, p);
-	task_rq_unlock(rq, p, &rf);
+		scoped_guard (task_rq_lock, p) {
+			struct rq *rq = scope.rq;
+			if (p->sched_class->get_rr_interval)
+				time_slice = p->sched_class->get_rr_interval(rq, p);
+		}
+	}
 
-	rcu_read_unlock();
 	jiffies_to_timespec64(time_slice, t);
 	return 0;
-
-out_unlock:
-	rcu_read_unlock();
-	return retval;
 }
 
 /**
@@ -9173,9 +9137,9 @@ void sched_show_task(struct task_struct *p)
 	if (pid_alive(p))
 		ppid = task_pid_nr(rcu_dereference(p->real_parent));
 	rcu_read_unlock();
-	pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n",
-		free, task_pid_nr(p), ppid,
-		read_task_thread_flags(p));
+	pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d flags:0x%08lx\n",
+		free, task_pid_nr(p), task_tgid_nr(p),
+		ppid, read_task_thread_flags(p));
 
 	print_worker_info(KERN_INFO, p);
 	print_stop_info(KERN_INFO, p);
@@ -9505,9 +9469,11 @@ static void balance_push(struct rq *rq)
 	 * Temporarily drop rq->lock such that we can wake-up the stop task.
 	 * Both preemption and IRQs are still disabled.
 	 */
+	preempt_disable();
 	raw_spin_rq_unlock(rq);
 	stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
 			    this_cpu_ptr(&push_work));
+	preempt_enable();
 	/*
 	 * At this point need_resched() is true and we'll take the loop in
 	 * schedule(). The next pick is obviously going to be the stop task
@@ -10013,7 +9979,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
 		rq->sd = NULL;
 		rq->rd = NULL;
-		rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
+		rq->cpu_capacity = SCHED_CAPACITY_SCALE;
 		rq->balance_callback = &balance_push_callback;
 		rq->active_balance = 0;
 		rq->next_balance = jiffies;
@@ -10022,8 +9988,6 @@ void __init sched_init(void)
 		rq->online = 0;
 		rq->idle_stamp = 0;
 		rq->avg_idle = 2*sysctl_sched_migration_cost;
-		rq->wake_stamp = jiffies;
-		rq->wake_avg_idle = rq->avg_idle;
 		rq->max_idle_balance_cost = sysctl_sched_migration_cost;
 
 		INIT_LIST_HEAD(&rq->cfs_tasks);
@@ -10498,17 +10462,18 @@ void sched_move_task(struct task_struct *tsk)
 	int queued, running, queue_flags =
 		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
 	struct task_group *group;
-	struct rq_flags rf;
 	struct rq *rq;
 
-	rq = task_rq_lock(tsk, &rf);
+	CLASS(task_rq_lock, rq_guard)(tsk);
+	rq = rq_guard.rq;
+
 	/*
 	 * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous
 	 * group changes.
 	 */
 	group = sched_get_task_group(tsk);
 	if (group == tsk->sched_task_group)
-		goto unlock;
+		return;
 
 	update_rq_clock(rq);
 
@@ -10533,9 +10498,6 @@ void sched_move_task(struct task_struct *tsk)
 		 */
 		resched_curr(rq);
 	}
-
-unlock:
-	task_rq_unlock(rq, tsk, &rf);
 }
 
 static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
@@ -10572,11 +10534,9 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
 
 #ifdef CONFIG_UCLAMP_TASK_GROUP
 	/* Propagate the effective uclamp value for the new group */
-	mutex_lock(&uclamp_mutex);
-	rcu_read_lock();
+	guard(mutex)(&uclamp_mutex);
+	guard(rcu)();
 	cpu_util_update_eff(css);
-	rcu_read_unlock();
-	mutex_unlock(&uclamp_mutex);
 #endif
 
 	return 0;
@@ -10727,8 +10687,8 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
 
 	static_branch_enable(&sched_uclamp_used);
 
-	mutex_lock(&uclamp_mutex);
-	rcu_read_lock();
+	guard(mutex)(&uclamp_mutex);
+	guard(rcu)();
 
 	tg = css_tg(of_css(of));
 	if (tg->uclamp_req[clamp_id].value != req.util)
@@ -10743,9 +10703,6 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
 	/* Update effective clamps to track the most restrictive value */
 	cpu_util_update_eff(of_css(of));
 
-	rcu_read_unlock();
-	mutex_unlock(&uclamp_mutex);
-
 	return nbytes;
 }
 
@@ -10771,10 +10728,10 @@ static inline void cpu_uclamp_print(struct seq_file *sf,
 	u64 percent;
 	u32 rem;
 
-	rcu_read_lock();
-	tg = css_tg(seq_css(sf));
-	util_clamp = tg->uclamp_req[clamp_id].value;
-	rcu_read_unlock();
+	scoped_guard (rcu) {
+		tg = css_tg(seq_css(sf));
+		util_clamp = tg->uclamp_req[clamp_id].value;
+	}
 
 	if (util_clamp == SCHED_CAPACITY_SCALE) {
 		seq_puts(sf, "max\n");
@@ -10865,11 +10822,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
 	 * Prevent race between setting of cfs_rq->runtime_enabled and
 	 * unthrottle_offline_cfs_rqs().
 	 */
-	cpus_read_lock();
-	mutex_lock(&cfs_constraints_mutex);
+	guard(cpus_read_lock)();
+	guard(mutex)(&cfs_constraints_mutex);
+
 	ret = __cfs_schedulable(tg, period, quota);
 	if (ret)
-		goto out_unlock;
+		return ret;
 
 	runtime_enabled = quota != RUNTIME_INF;
 	runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
@@ -10879,39 +10837,38 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
 	 */
 	if (runtime_enabled && !runtime_was_enabled)
 		cfs_bandwidth_usage_inc();
-	raw_spin_lock_irq(&cfs_b->lock);
-	cfs_b->period = ns_to_ktime(period);
-	cfs_b->quota = quota;
-	cfs_b->burst = burst;
 
-	__refill_cfs_bandwidth_runtime(cfs_b);
+	scoped_guard (raw_spinlock_irq, &cfs_b->lock) {
+		cfs_b->period = ns_to_ktime(period);
+		cfs_b->quota = quota;
+		cfs_b->burst = burst;
 
-	/* Restart the period timer (if active) to handle new period expiry: */
-	if (runtime_enabled)
-		start_cfs_bandwidth(cfs_b);
+		__refill_cfs_bandwidth_runtime(cfs_b);
 
-	raw_spin_unlock_irq(&cfs_b->lock);
+		/*
+		 * Restart the period timer (if active) to handle new
+		 * period expiry:
+		 */
+		if (runtime_enabled)
+			start_cfs_bandwidth(cfs_b);
+	}
 
 	for_each_online_cpu(i) {
 		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
 		struct rq *rq = cfs_rq->rq;
-		struct rq_flags rf;
 
-		rq_lock_irq(rq, &rf);
+		guard(rq_lock_irq)(rq);
 		cfs_rq->runtime_enabled = runtime_enabled;
 		cfs_rq->runtime_remaining = 0;
 
 		if (cfs_rq->throttled)
 			unthrottle_cfs_rq(cfs_rq);
-		rq_unlock_irq(rq, &rf);
 	}
+
 	if (runtime_was_enabled && !runtime_enabled)
 		cfs_bandwidth_usage_dec();
-out_unlock:
-	mutex_unlock(&cfs_constraints_mutex);
-	cpus_read_unlock();
 
-	return ret;
+	return 0;
 }
 
 static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
@@ -11096,7 +11053,6 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
 
 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
 {
-	int ret;
 	struct cfs_schedulable_data data = {
 		.tg = tg,
 		.period = period,
@@ -11108,11 +11064,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
 		do_div(data.quota, NSEC_PER_USEC);
 	}
 
-	rcu_read_lock();
-	ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
-	rcu_read_unlock();
-
-	return ret;
+	guard(rcu)();
+	return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
 }
 
 static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
@@ -11717,14 +11670,12 @@ int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
 	 * are not the last task to be migrated from this cpu for this mm, so
 	 * there is no need to move src_cid to the destination cpu.
 	 */
-	rcu_read_lock();
+	guard(rcu)();
 	src_task = rcu_dereference(src_rq->curr);
 	if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
-		rcu_read_unlock();
 		t->last_mm_cid = -1;
 		return -1;
 	}
-	rcu_read_unlock();
 
 	return src_cid;
 }
@@ -11768,18 +11719,17 @@ int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
 	 * the lazy-put flag, this task will be responsible for transitioning
 	 * from lazy-put flag set to MM_CID_UNSET.
 	 */
-	rcu_read_lock();
-	src_task = rcu_dereference(src_rq->curr);
-	if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
-		rcu_read_unlock();
-		/*
-		 * We observed an active task for this mm, there is therefore
-		 * no point in moving this cid to the destination cpu.
-		 */
-		t->last_mm_cid = -1;
-		return -1;
+	scoped_guard (rcu) {
+		src_task = rcu_dereference(src_rq->curr);
+		if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
+			/*
+			 * We observed an active task for this mm, there is therefore
+			 * no point in moving this cid to the destination cpu.
+			 */
+			t->last_mm_cid = -1;
+			return -1;
+		}
 	}
-	rcu_read_unlock();
 
 	/*
 	 * The src_cid is unused, so it can be unset.
@@ -11852,7 +11802,6 @@ static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_
 {
 	struct rq *rq = cpu_rq(cpu);
 	struct task_struct *t;
-	unsigned long flags;
 	int cid, lazy_cid;
 
 	cid = READ_ONCE(pcpu_cid->cid);
@@ -11887,23 +11836,21 @@ static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_
 	 * the lazy-put flag, that task will be responsible for transitioning
 	 * from lazy-put flag set to MM_CID_UNSET.
 	 */
-	rcu_read_lock();
-	t = rcu_dereference(rq->curr);
-	if (READ_ONCE(t->mm_cid_active) && t->mm == mm) {
-		rcu_read_unlock();
-		return;
+	scoped_guard (rcu) {
+		t = rcu_dereference(rq->curr);
+		if (READ_ONCE(t->mm_cid_active) && t->mm == mm)
+			return;
 	}
-	rcu_read_unlock();
 
 	/*
 	 * The cid is unused, so it can be unset.
 	 * Disable interrupts to keep the window of cid ownership without rq
 	 * lock small.
 	 */
-	local_irq_save(flags);
-	if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
-		__mm_cid_put(mm, cid);
-	local_irq_restore(flags);
+	scoped_guard (irqsave) {
+		if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
+			__mm_cid_put(mm, cid);
+	}
 }
 
 static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
@@ -11925,14 +11872,13 @@ static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
 	 * snapshot associated with this cid if an active task using the mm is
 	 * observed on this rq.
 	 */
-	rcu_read_lock();
-	curr = rcu_dereference(rq->curr);
-	if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
-		WRITE_ONCE(pcpu_cid->time, rq_clock);
-		rcu_read_unlock();
-		return;
+	scoped_guard (rcu) {
+		curr = rcu_dereference(rq->curr);
+		if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
+			WRITE_ONCE(pcpu_cid->time, rq_clock);
+			return;
+		}
 	}
-	rcu_read_unlock();
 
 	if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
 		return;
@@ -12026,7 +11972,6 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
 void sched_mm_cid_exit_signals(struct task_struct *t)
 {
 	struct mm_struct *mm = t->mm;
-	struct rq_flags rf;
 	struct rq *rq;
 
 	if (!mm)
@@ -12034,7 +11979,7 @@ void sched_mm_cid_exit_signals(struct task_struct *t)
 
 	preempt_disable();
 	rq = this_rq();
-	rq_lock_irqsave(rq, &rf);
+	guard(rq_lock_irqsave)(rq);
 	preempt_enable_no_resched();	/* holding spinlock */
 	WRITE_ONCE(t->mm_cid_active, 0);
 	/*
@@ -12044,13 +11989,11 @@ void sched_mm_cid_exit_signals(struct task_struct *t)
 	smp_mb();
 	mm_cid_put(mm);
 	t->last_mm_cid = t->mm_cid = -1;
-	rq_unlock_irqrestore(rq, &rf);
 }
 
 void sched_mm_cid_before_execve(struct task_struct *t)
 {
 	struct mm_struct *mm = t->mm;
-	struct rq_flags rf;
 	struct rq *rq;
 
 	if (!mm)
@@ -12058,7 +12001,7 @@ void sched_mm_cid_before_execve(struct task_struct *t)
 
 	preempt_disable();
 	rq = this_rq();
-	rq_lock_irqsave(rq, &rf);
+	guard(rq_lock_irqsave)(rq);
 	preempt_enable_no_resched();	/* holding spinlock */
 	WRITE_ONCE(t->mm_cid_active, 0);
 	/*
@@ -12068,13 +12011,11 @@ void sched_mm_cid_before_execve(struct task_struct *t)
 	smp_mb();
 	mm_cid_put(mm);
 	t->last_mm_cid = t->mm_cid = -1;
-	rq_unlock_irqrestore(rq, &rf);
 }
 
 void sched_mm_cid_after_execve(struct task_struct *t)
 {
 	struct mm_struct *mm = t->mm;
-	struct rq_flags rf;
 	struct rq *rq;
 
 	if (!mm)
@@ -12082,16 +12023,16 @@ void sched_mm_cid_after_execve(struct task_struct *t)
 
 	preempt_disable();
 	rq = this_rq();
-	rq_lock_irqsave(rq, &rf);
-	preempt_enable_no_resched();	/* holding spinlock */
-	WRITE_ONCE(t->mm_cid_active, 1);
-	/*
-	 * Store t->mm_cid_active before loading per-mm/cpu cid.
-	 * Matches barrier in sched_mm_cid_remote_clear_old().
-	 */
-	smp_mb();
-	t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
-	rq_unlock_irqrestore(rq, &rf);
+	scoped_guard (rq_lock_irqsave, rq) {
+		preempt_enable_no_resched();	/* holding spinlock */
+		WRITE_ONCE(t->mm_cid_active, 1);
+		/*
+		 * Store t->mm_cid_active before loading per-mm/cpu cid.
+		 * Matches barrier in sched_mm_cid_remote_clear_old().
+		 */
+		smp_mb();
+		t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
+	}
 	rseq_set_notify_resume(t);
 }
 
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 57c92d751bcd..95baa12a1029 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -131,7 +131,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
 			if (!dl_task_fits_capacity(p, cpu)) {
 				cpumask_clear_cpu(cpu, later_mask);
 
-				cap = capacity_orig_of(cpu);
+				cap = arch_scale_cpu_capacity(cpu);
 
 				if (cap > max_cap ||
 				    (cpu == task_cpu(p) && cap == max_cap)) {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 58b542bf2893..b28114478b82 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -132,7 +132,7 @@ static inline unsigned long __dl_bw_capacity(const struct cpumask *mask)
 	int i;
 
 	for_each_cpu_and(i, mask, cpu_active_mask)
-		cap += capacity_orig_of(i);
+		cap += arch_scale_cpu_capacity(i);
 
 	return cap;
 }
@@ -144,7 +144,7 @@ static inline unsigned long __dl_bw_capacity(const struct cpumask *mask)
 static inline unsigned long dl_bw_capacity(int i)
 {
 	if (!sched_asym_cpucap_active() &&
-	    capacity_orig_of(i) == SCHED_CAPACITY_SCALE) {
+	    arch_scale_cpu_capacity(i) == SCHED_CAPACITY_SCALE) {
 		return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT;
 	} else {
 		RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
@@ -509,7 +509,6 @@ void init_dl_rq(struct dl_rq *dl_rq)
 	/* zero means no -deadline tasks */
 	dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
 
-	dl_rq->dl_nr_migratory = 0;
 	dl_rq->overloaded = 0;
 	dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED;
 #else
@@ -553,39 +552,6 @@ static inline void dl_clear_overload(struct rq *rq)
 	cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask);
 }
 
-static void update_dl_migration(struct dl_rq *dl_rq)
-{
-	if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
-		if (!dl_rq->overloaded) {
-			dl_set_overload(rq_of_dl_rq(dl_rq));
-			dl_rq->overloaded = 1;
-		}
-	} else if (dl_rq->overloaded) {
-		dl_clear_overload(rq_of_dl_rq(dl_rq));
-		dl_rq->overloaded = 0;
-	}
-}
-
-static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
-{
-	struct task_struct *p = dl_task_of(dl_se);
-
-	if (p->nr_cpus_allowed > 1)
-		dl_rq->dl_nr_migratory++;
-
-	update_dl_migration(dl_rq);
-}
-
-static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
-{
-	struct task_struct *p = dl_task_of(dl_se);
-
-	if (p->nr_cpus_allowed > 1)
-		dl_rq->dl_nr_migratory--;
-
-	update_dl_migration(dl_rq);
-}
-
 #define __node_2_pdl(node) \
 	rb_entry((node), struct task_struct, pushable_dl_tasks)
 
@@ -594,6 +560,11 @@ static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b)
 	return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl);
 }
 
+static inline int has_pushable_dl_tasks(struct rq *rq)
+{
+	return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root);
+}
+
 /*
  * The list of pushable -deadline task is not a plist, like in
  * sched_rt.c, it is an rb-tree with tasks ordered by deadline.
@@ -609,6 +580,11 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
 				 __pushable_less);
 	if (leftmost)
 		rq->dl.earliest_dl.next = p->dl.deadline;
+
+	if (!rq->dl.overloaded) {
+		dl_set_overload(rq);
+		rq->dl.overloaded = 1;
+	}
 }
 
 static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
@@ -625,11 +601,11 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
 		dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline;
 
 	RB_CLEAR_NODE(&p->pushable_dl_tasks);
-}
 
-static inline int has_pushable_dl_tasks(struct rq *rq)
-{
-	return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root);
+	if (!has_pushable_dl_tasks(rq) && rq->dl.overloaded) {
+		dl_clear_overload(rq);
+		rq->dl.overloaded = 0;
+	}
 }
 
 static int push_dl_task(struct rq *rq);
@@ -763,7 +739,7 @@ static inline void deadline_queue_pull_task(struct rq *rq)
 
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
 static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
-static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags);
 
 static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
 					    struct rq *rq)
@@ -1175,7 +1151,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 
 	enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
 	if (dl_task(rq->curr))
-		check_preempt_curr_dl(rq, p, 0);
+		wakeup_preempt_dl(rq, p, 0);
 	else
 		resched_curr(rq);
 
@@ -1504,7 +1480,6 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 	add_nr_running(rq_of_dl_rq(dl_rq), 1);
 
 	inc_dl_deadline(dl_rq, deadline);
-	inc_dl_migration(dl_se, dl_rq);
 }
 
 static inline
@@ -1518,7 +1493,6 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 	sub_nr_running(rq_of_dl_rq(dl_rq), 1);
 
 	dec_dl_deadline(dl_rq, dl_se->deadline);
-	dec_dl_migration(dl_se, dl_rq);
 }
 
 static inline bool __dl_less(struct rb_node *a, const struct rb_node *b)
@@ -1939,7 +1913,7 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
  * Only called when both the current and waking task are -deadline
  * tasks.
  */
-static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
 				  int flags)
 {
 	if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
@@ -2291,9 +2265,6 @@ static int push_dl_task(struct rq *rq)
 	struct rq *later_rq;
 	int ret = 0;
 
-	if (!rq->dl.overloaded)
-		return 0;
-
 	next_task = pick_next_pushable_dl_task(rq);
 	if (!next_task)
 		return 0;
@@ -2449,9 +2420,11 @@ skip:
 		double_unlock_balance(this_rq, src_rq);
 
 		if (push_task) {
+			preempt_disable();
 			raw_spin_rq_unlock(this_rq);
 			stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
 					    push_task, &src_rq->push_work);
+			preempt_enable();
 			raw_spin_rq_lock(this_rq);
 		}
 	}
@@ -2652,7 +2625,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
 			deadline_queue_push_tasks(rq);
 #endif
 		if (dl_task(rq->curr))
-			check_preempt_curr_dl(rq, p, 0);
+			wakeup_preempt_dl(rq, p, 0);
 		else
 			resched_curr(rq);
 	} else {
@@ -2721,7 +2694,7 @@ DEFINE_SCHED_CLASS(dl) = {
 	.dequeue_task		= dequeue_task_dl,
 	.yield_task		= yield_task_dl,
 
-	.check_preempt_curr	= check_preempt_curr_dl,
+	.wakeup_preempt		= wakeup_preempt_dl,
 
 	.pick_next_task		= pick_next_task_dl,
 	.put_prev_task		= put_prev_task_dl,
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4c3d0d9f3db6..4580a450700e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -8,7 +8,7 @@
  */
 
 /*
- * This allows printing both to /proc/sched_debug and
+ * This allows printing both to /sys/kernel/debug/sched/debug and
  * to the console
  */
 #define SEQ_printf(m, x...)			\
@@ -724,9 +724,6 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
 
 	PU(rt_nr_running);
-#ifdef CONFIG_SMP
-	PU(rt_nr_migratory);
-#endif
 	P(rt_throttled);
 	PN(rt_time);
 	PN(rt_runtime);
@@ -748,7 +745,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
 
 	PU(dl_nr_running);
 #ifdef CONFIG_SMP
-	PU(dl_nr_migratory);
 	dl_bw = &cpu_rq(cpu)->rd->dl_bw;
 #else
 	dl_bw = &dl_rq->dl_bw;
@@ -864,7 +860,6 @@ static void sched_debug_header(struct seq_file *m)
 #define PN(x) \
 	SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
 	PN(sysctl_sched_base_slice);
-	P(sysctl_sched_child_runs_first);
 	P(sysctl_sched_features);
 #undef PN
 #undef P
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df348aa55d3c..8767988242ee 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -51,8 +51,6 @@
 
 #include <asm/switch_to.h>
 
-#include <linux/sched/cond_resched.h>
-
 #include "sched.h"
 #include "stats.h"
 #include "autogroup.h"
@@ -78,12 +76,6 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
 unsigned int sysctl_sched_base_slice			= 750000ULL;
 static unsigned int normalized_sysctl_sched_base_slice	= 750000ULL;
 
-/*
- * After fork, child runs first. If set to 0 (default) then
- * parent will (try to) run first.
- */
-unsigned int sysctl_sched_child_runs_first __read_mostly;
-
 const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;
 
 int sched_thermal_decay_shift;
@@ -145,13 +137,6 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
 
 #ifdef CONFIG_SYSCTL
 static struct ctl_table sched_fair_sysctls[] = {
-	{
-		.procname       = "sched_child_runs_first",
-		.data           = &sysctl_sched_child_runs_first,
-		.maxlen         = sizeof(unsigned int),
-		.mode           = 0644,
-		.proc_handler   = proc_dointvec,
-	},
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
 		.procname       = "sched_cfs_bandwidth_slice_us",
@@ -2899,19 +2884,7 @@ static void task_numa_placement(struct task_struct *p)
 	}
 
 	/* Cannot migrate task to CPU-less node */
-	if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) {
-		int near_nid = max_nid;
-		int distance, near_distance = INT_MAX;
-
-		for_each_node_state(nid, N_CPU) {
-			distance = node_distance(max_nid, nid);
-			if (distance < near_distance) {
-				near_nid = nid;
-				near_distance = distance;
-			}
-		}
-		max_nid = near_nid;
-	}
+	max_nid = numa_nearest_node(max_nid, N_CPU);
 
 	if (ng) {
 		numa_group_count_active_nodes(ng);
@@ -3182,7 +3155,7 @@ static void reset_ptenuma_scan(struct task_struct *p)
 	p->mm->numa_scan_offset = 0;
 }
 
-static bool vma_is_accessed(struct vm_area_struct *vma)
+static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
 {
 	unsigned long pids;
 	/*
@@ -3194,8 +3167,20 @@ static bool vma_is_accessed(struct vm_area_struct *vma)
 	if (READ_ONCE(current->mm->numa_scan_seq) < 2)
 		return true;
 
-	pids = vma->numab_state->access_pids[0] | vma->numab_state->access_pids[1];
-	return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids);
+	pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
+	if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
+		return true;
+
+	/*
+	 * Complete a scan that has already started regardless of PID access, or
+	 * some VMAs may never be scanned in multi-threaded applications:
+	 */
+	if (mm->numa_scan_offset > vma->vm_start) {
+		trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID);
+		return true;
+	}
+
+	return false;
 }
 
 #define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
@@ -3215,6 +3200,8 @@ static void task_numa_work(struct callback_head *work)
 	unsigned long nr_pte_updates = 0;
 	long pages, virtpages;
 	struct vma_iterator vmi;
+	bool vma_pids_skipped;
+	bool vma_pids_forced = false;
 
 	SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
 
@@ -3257,7 +3244,6 @@ static void task_numa_work(struct callback_head *work)
 	 */
 	p->node_stamp += 2 * TICK_NSEC;
 
-	start = mm->numa_scan_offset;
 	pages = sysctl_numa_balancing_scan_size;
 	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
 	virtpages = pages * 8;	   /* Scan up to this much virtual space */
@@ -3267,6 +3253,16 @@ static void task_numa_work(struct callback_head *work)
 
 	if (!mmap_read_trylock(mm))
 		return;
+
+	/*
+	 * VMAs are skipped if the current PID has not trapped a fault within
+	 * the VMA recently. Allow scanning to be forced if there is no
+	 * suitable VMA remaining.
+	 */
+	vma_pids_skipped = false;
+
+retry_pids:
+	start = mm->numa_scan_offset;
 	vma_iter_init(&vmi, mm, start);
 	vma = vma_next(&vmi);
 	if (!vma) {
@@ -3279,6 +3275,7 @@ static void task_numa_work(struct callback_head *work)
 	do {
 		if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
 			is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
+			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
 			continue;
 		}
 
@@ -3289,15 +3286,19 @@ static void task_numa_work(struct callback_head *work)
 		 * as migrating the pages will be of marginal benefit.
 		 */
 		if (!vma->vm_mm ||
-		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) {
+			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO);
 			continue;
+		}
 
 		/*
 		 * Skip inaccessible VMAs to avoid any confusion between
 		 * PROT_NONE and NUMA hinting ptes
 		 */
-		if (!vma_is_accessible(vma))
+		if (!vma_is_accessible(vma)) {
+			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
 			continue;
+		}
 
 		/* Initialise new per-VMA NUMAB state. */
 		if (!vma->numab_state) {
@@ -3310,8 +3311,15 @@ static void task_numa_work(struct callback_head *work)
 				msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
 
 			/* Reset happens after 4 times scan delay of scan start */
-			vma->numab_state->next_pid_reset =  vma->numab_state->next_scan +
+			vma->numab_state->pids_active_reset =  vma->numab_state->next_scan +
 				msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+
+			/*
+			 * Ensure prev_scan_seq does not match numa_scan_seq,
+			 * to prevent VMAs being skipped prematurely on the
+			 * first scan:
+			 */
+			 vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1;
 		}
 
 		/*
@@ -3319,23 +3327,35 @@ static void task_numa_work(struct callback_head *work)
 		 * delay the scan for new VMAs.
 		 */
 		if (mm->numa_scan_seq && time_before(jiffies,
-						vma->numab_state->next_scan))
+						vma->numab_state->next_scan)) {
+			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY);
 			continue;
+		}
 
-		/* Do not scan the VMA if task has not accessed */
-		if (!vma_is_accessed(vma))
+		/* RESET access PIDs regularly for old VMAs. */
+		if (mm->numa_scan_seq &&
+				time_after(jiffies, vma->numab_state->pids_active_reset)) {
+			vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
+				msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+			vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]);
+			vma->numab_state->pids_active[1] = 0;
+		}
+
+		/* Do not rescan VMAs twice within the same sequence. */
+		if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) {
+			mm->numa_scan_offset = vma->vm_end;
+			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED);
 			continue;
+		}
 
 		/*
-		 * RESET access PIDs regularly for old VMAs. Resetting after checking
-		 * vma for recent access to avoid clearing PID info before access..
+		 * Do not scan the VMA if task has not accessed it, unless no other
+		 * VMA candidate exists.
 		 */
-		if (mm->numa_scan_seq &&
-				time_after(jiffies, vma->numab_state->next_pid_reset)) {
-			vma->numab_state->next_pid_reset = vma->numab_state->next_pid_reset +
-				msecs_to_jiffies(VMA_PID_RESET_PERIOD);
-			vma->numab_state->access_pids[0] = READ_ONCE(vma->numab_state->access_pids[1]);
-			vma->numab_state->access_pids[1] = 0;
+		if (!vma_pids_forced && !vma_is_accessed(mm, vma)) {
+			vma_pids_skipped = true;
+			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
+			continue;
 		}
 
 		do {
@@ -3362,8 +3382,28 @@ static void task_numa_work(struct callback_head *work)
 
 			cond_resched();
 		} while (end != vma->vm_end);
+
+		/* VMA scan is complete, do not scan until next sequence. */
+		vma->numab_state->prev_scan_seq = mm->numa_scan_seq;
+
+		/*
+		 * Only force scan within one VMA at a time, to limit the
+		 * cost of scanning a potentially uninteresting VMA.
+		 */
+		if (vma_pids_forced)
+			break;
 	} for_each_vma(vmi, vma);
 
+	/*
+	 * If no VMAs are remaining and VMAs were skipped due to the PID
+	 * not accessing the VMA previously, then force a scan to ensure
+	 * forward progress:
+	 */
+	if (!vma && !vma_pids_forced && vma_pids_skipped) {
+		vma_pids_forced = true;
+		goto retry_pids;
+	}
+
 out:
 	/*
 	 * It is possible to reach the end of the VMA list but the last few
@@ -3942,7 +3982,8 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
  */
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 {
-	long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+	long delta;
+	u64 now;
 
 	/*
 	 * No need to update load_avg for root_task_group as it is not used.
@@ -3950,9 +3991,19 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 	if (cfs_rq->tg == &root_task_group)
 		return;
 
+	/*
+	 * For migration heavy workloads, access to tg->load_avg can be
+	 * unbounded. Limit the update rate to at most once per ms.
+	 */
+	now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+	if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC)
+		return;
+
+	delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
 	if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
 		atomic_long_add(delta, &cfs_rq->tg->load_avg);
 		cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
+		cfs_rq->last_update_tg_load_avg = now;
 	}
 }
 
@@ -4626,22 +4677,6 @@ static inline unsigned long task_util_est(struct task_struct *p)
 	return max(task_util(p), _task_util_est(p));
 }
 
-#ifdef CONFIG_UCLAMP_TASK
-static inline unsigned long uclamp_task_util(struct task_struct *p,
-					     unsigned long uclamp_min,
-					     unsigned long uclamp_max)
-{
-	return clamp(task_util_est(p), uclamp_min, uclamp_max);
-}
-#else
-static inline unsigned long uclamp_task_util(struct task_struct *p,
-					     unsigned long uclamp_min,
-					     unsigned long uclamp_max)
-{
-	return task_util_est(p);
-}
-#endif
-
 static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
 				    struct task_struct *p)
 {
@@ -4745,7 +4780,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 	 * To avoid overestimation of actual task utilization, skip updates if
 	 * we cannot grant there is idle time in this CPU.
 	 */
-	if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
+	if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
 		return;
 
 	/*
@@ -4793,14 +4828,14 @@ static inline int util_fits_cpu(unsigned long util,
 		return fits;
 
 	/*
-	 * We must use capacity_orig_of() for comparing against uclamp_min and
+	 * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and
 	 * uclamp_max. We only care about capacity pressure (by using
 	 * capacity_of()) for comparing against the real util.
 	 *
 	 * If a task is boosted to 1024 for example, we don't want a tiny
 	 * pressure to skew the check whether it fits a CPU or not.
 	 *
-	 * Similarly if a task is capped to capacity_orig_of(little_cpu), it
+	 * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
 	 * should fit a little cpu even if there's some pressure.
 	 *
 	 * Only exception is for thermal pressure since it has a direct impact
@@ -4812,7 +4847,7 @@ static inline int util_fits_cpu(unsigned long util,
 	 * For uclamp_max, we can tolerate a drop in performance level as the
 	 * goal is to cap the task. So it's okay if it's getting less.
 	 */
-	capacity_orig = capacity_orig_of(cpu);
+	capacity_orig = arch_scale_cpu_capacity(cpu);
 	capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
 
 	/*
@@ -4932,7 +4967,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
 
 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 {
-	return true;
+	return !cfs_rq->nr_running;
 }
 
 #define UPDATE_TG	0x0
@@ -5267,7 +5302,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * 4) do not run the "skip" process, if something else is available
  */
 static struct sched_entity *
-pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+pick_next_entity(struct cfs_rq *cfs_rq)
 {
 	/*
 	 * Enabling NEXT_BUDDY will affect latency but not fairness.
@@ -5811,13 +5846,13 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
 
 static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 {
-	struct cfs_rq *local_unthrottle = NULL;
 	int this_cpu = smp_processor_id();
 	u64 runtime, remaining = 1;
 	bool throttled = false;
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *tmp;
 	struct rq_flags rf;
 	struct rq *rq;
+	LIST_HEAD(local_unthrottle);
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -5833,11 +5868,9 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 		if (!cfs_rq_throttled(cfs_rq))
 			goto next;
 
-#ifdef CONFIG_SMP
 		/* Already queued for async unthrottle */
 		if (!list_empty(&cfs_rq->throttled_csd_list))
 			goto next;
-#endif
 
 		/* By the above checks, this should never be true */
 		SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
@@ -5854,11 +5887,17 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 
 		/* we check whether we're throttled above */
 		if (cfs_rq->runtime_remaining > 0) {
-			if (cpu_of(rq) != this_cpu ||
-			    SCHED_WARN_ON(local_unthrottle))
+			if (cpu_of(rq) != this_cpu) {
 				unthrottle_cfs_rq_async(cfs_rq);
-			else
-				local_unthrottle = cfs_rq;
+			} else {
+				/*
+				 * We currently only expect to be unthrottling
+				 * a single cfs_rq locally.
+				 */
+				SCHED_WARN_ON(!list_empty(&local_unthrottle));
+				list_add_tail(&cfs_rq->throttled_csd_list,
+					      &local_unthrottle);
+			}
 		} else {
 			throttled = true;
 		}
@@ -5866,15 +5905,23 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 next:
 		rq_unlock_irqrestore(rq, &rf);
 	}
-	rcu_read_unlock();
 
-	if (local_unthrottle) {
-		rq = cpu_rq(this_cpu);
+	list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle,
+				 throttled_csd_list) {
+		struct rq *rq = rq_of(cfs_rq);
+
 		rq_lock_irqsave(rq, &rf);
-		if (cfs_rq_throttled(local_unthrottle))
-			unthrottle_cfs_rq(local_unthrottle);
+
+		list_del_init(&cfs_rq->throttled_csd_list);
+
+		if (cfs_rq_throttled(cfs_rq))
+			unthrottle_cfs_rq(cfs_rq);
+
 		rq_unlock_irqrestore(rq, &rf);
 	}
+	SCHED_WARN_ON(!list_empty(&local_unthrottle));
+
+	rcu_read_unlock();
 
 	return throttled;
 }
@@ -6204,9 +6251,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->runtime_enabled = 0;
 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
-#ifdef CONFIG_SMP
 	INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
-#endif
 }
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -7164,45 +7209,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
 	int i, cpu, idle_cpu = -1, nr = INT_MAX;
 	struct sched_domain_shared *sd_share;
-	struct rq *this_rq = this_rq();
-	int this = smp_processor_id();
-	struct sched_domain *this_sd = NULL;
-	u64 time = 0;
 
 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
 
-	if (sched_feat(SIS_PROP) && !has_idle_core) {
-		u64 avg_cost, avg_idle, span_avg;
-		unsigned long now = jiffies;
-
-		this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
-		if (!this_sd)
-			return -1;
-
-		/*
-		 * If we're busy, the assumption that the last idle period
-		 * predicts the future is flawed; age away the remaining
-		 * predicted idle time.
-		 */
-		if (unlikely(this_rq->wake_stamp < now)) {
-			while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) {
-				this_rq->wake_stamp++;
-				this_rq->wake_avg_idle >>= 1;
-			}
-		}
-
-		avg_idle = this_rq->wake_avg_idle;
-		avg_cost = this_sd->avg_scan_cost + 1;
-
-		span_avg = sd->span_weight * avg_idle;
-		if (span_avg > 4*avg_cost)
-			nr = div_u64(span_avg, avg_cost);
-		else
-			nr = 4;
-
-		time = cpu_clock(this);
-	}
-
 	if (sched_feat(SIS_UTIL)) {
 		sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
 		if (sd_share) {
@@ -7214,6 +7223,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 		}
 	}
 
+	if (static_branch_unlikely(&sched_cluster_active)) {
+		struct sched_group *sg = sd->groups;
+
+		if (sg->flags & SD_CLUSTER) {
+			for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) {
+				if (!cpumask_test_cpu(cpu, cpus))
+					continue;
+
+				if (has_idle_core) {
+					i = select_idle_core(p, cpu, cpus, &idle_cpu);
+					if ((unsigned int)i < nr_cpumask_bits)
+						return i;
+				} else {
+					if (--nr <= 0)
+						return -1;
+					idle_cpu = __select_idle_cpu(cpu, p);
+					if ((unsigned int)idle_cpu < nr_cpumask_bits)
+						return idle_cpu;
+				}
+			}
+			cpumask_andnot(cpus, cpus, sched_group_span(sg));
+		}
+	}
+
 	for_each_cpu_wrap(cpu, cpus, target + 1) {
 		if (has_idle_core) {
 			i = select_idle_core(p, cpu, cpus, &idle_cpu);
@@ -7221,7 +7254,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 				return i;
 
 		} else {
-			if (!--nr)
+			if (--nr <= 0)
 				return -1;
 			idle_cpu = __select_idle_cpu(cpu, p);
 			if ((unsigned int)idle_cpu < nr_cpumask_bits)
@@ -7232,18 +7265,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 	if (has_idle_core)
 		set_idle_cores(target, false);
 
-	if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) {
-		time = cpu_clock(this) - time;
-
-		/*
-		 * Account for the scan cost of wakeups against the average
-		 * idle time.
-		 */
-		this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time);
-
-		update_avg(&this_sd->avg_scan_cost, time);
-	}
-
 	return idle_cpu;
 }
 
@@ -7283,7 +7304,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 		 * Look for the CPU with best capacity.
 		 */
 		else if (fits < 0)
-			cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));
+			cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
 
 		/*
 		 * First, select CPU which fits better (-1 being better than 0).
@@ -7323,7 +7344,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	bool has_idle_core = false;
 	struct sched_domain *sd;
 	unsigned long task_util, util_min, util_max;
-	int i, recent_used_cpu;
+	int i, recent_used_cpu, prev_aff = -1;
 
 	/*
 	 * On asymmetric system, update task utilization because we will check
@@ -7350,8 +7371,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	 */
 	if (prev != target && cpus_share_cache(prev, target) &&
 	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
-	    asym_fits_cpu(task_util, util_min, util_max, prev))
-		return prev;
+	    asym_fits_cpu(task_util, util_min, util_max, prev)) {
+
+		if (!static_branch_unlikely(&sched_cluster_active) ||
+		    cpus_share_resources(prev, target))
+			return prev;
+
+		prev_aff = prev;
+	}
 
 	/*
 	 * Allow a per-cpu kthread to stack with the wakee if the
@@ -7378,7 +7405,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
 	    cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
 	    asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
-		return recent_used_cpu;
+
+		if (!static_branch_unlikely(&sched_cluster_active) ||
+		    cpus_share_resources(recent_used_cpu, target))
+			return recent_used_cpu;
+
+	} else {
+		recent_used_cpu = -1;
 	}
 
 	/*
@@ -7419,6 +7452,17 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	if ((unsigned)i < nr_cpumask_bits)
 		return i;
 
+	/*
+	 * For cluster machines which have lower sharing cache like L2 or
+	 * LLC Tag, we tend to find an idle CPU in the target's cluster
+	 * first. But prev_cpu or recent_used_cpu may also be a good candidate;
+	 * use them if possible when no idle CPU is found in select_idle_cpu().
+	 */
+	if ((unsigned int)prev_aff < nr_cpumask_bits)
+		return prev_aff;
+	if ((unsigned int)recent_used_cpu < nr_cpumask_bits)
+		return recent_used_cpu;
+
 	return target;
 }
 
@@ -7525,7 +7569,7 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
 		util = max(util, util_est);
 	}
 
-	return min(util, capacity_orig_of(cpu));
+	return min(util, arch_scale_cpu_capacity(cpu));
 }
 
 unsigned long cpu_util_cfs(int cpu)
@@ -7677,11 +7721,16 @@ compute_energy(struct energy_env *eenv, struct perf_domain *pd,
 {
 	unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
 	unsigned long busy_time = eenv->pd_busy_time;
+	unsigned long energy;
 
 	if (dst_cpu >= 0)
 		busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
 
-	return em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
+	energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
+
+	trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time);
+
+	return energy;
 }
 
 /*
@@ -7756,7 +7805,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 	target = prev_cpu;
 
 	sync_entity_load_avg(&p->se);
-	if (!uclamp_task_util(p, p_util_min, p_util_max))
+	if (!task_util_est(p) && p_util_min == 0)
 		goto unlock;
 
 	eenv_task_busy_time(&eenv, p, prev_cpu);
@@ -7764,11 +7813,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 	for (; pd; pd = pd->next) {
 		unsigned long util_min = p_util_min, util_max = p_util_max;
 		unsigned long cpu_cap, cpu_thermal_cap, util;
-		unsigned long cur_delta, max_spare_cap = 0;
+		long prev_spare_cap = -1, max_spare_cap = -1;
 		unsigned long rq_util_min, rq_util_max;
-		unsigned long prev_spare_cap = 0;
+		unsigned long cur_delta, base_energy;
 		int max_spare_cap_cpu = -1;
-		unsigned long base_energy;
 		int fits, max_fits = -1;
 
 		cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
@@ -7831,7 +7879,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 				prev_spare_cap = cpu_cap;
 				prev_fits = fits;
 			} else if ((fits > max_fits) ||
-				   ((fits == max_fits) && (cpu_cap > max_spare_cap))) {
+				   ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) {
 				/*
 				 * Find the CPU with the maximum spare capacity
 				 * among the remaining CPUs in the performance
@@ -7843,7 +7891,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 			}
 		}
 
-		if (max_spare_cap_cpu < 0 && prev_spare_cap == 0)
+		if (max_spare_cap_cpu < 0 && prev_spare_cap < 0)
 			continue;
 
 		eenv_pd_busy_time(&eenv, cpus, p);
@@ -7851,7 +7899,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 		base_energy = compute_energy(&eenv, pd, cpus, p, -1);
 
 		/* Evaluate the energy impact of using prev_cpu. */
-		if (prev_spare_cap > 0) {
+		if (prev_spare_cap > -1) {
 			prev_delta = compute_energy(&eenv, pd, cpus, p,
 						    prev_cpu);
 			/* CPU utilization has changed */
@@ -8052,7 +8100,7 @@ static void set_next_buddy(struct sched_entity *se)
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
 {
 	struct task_struct *curr = rq->curr;
 	struct sched_entity *se = &curr->se, *pse = &p->se;
@@ -8065,7 +8113,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 
 	/*
 	 * This is possible from callers such as attach_tasks(), in which we
-	 * unconditionally check_preempt_curr() after an enqueue (which may have
+	 * unconditionally wakeup_preempt() after an enqueue (which may have
 	 * lead to a throttle).  This both saves work and prevents false
 	 * next-buddy nomination below.
 	 */
@@ -8157,7 +8205,7 @@ again:
 				goto again;
 		}
 
-		se = pick_next_entity(cfs_rq, curr);
+		se = pick_next_entity(cfs_rq);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 
@@ -8220,7 +8268,7 @@ again:
 			}
 		}
 
-		se = pick_next_entity(cfs_rq, curr);
+		se = pick_next_entity(cfs_rq);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 
@@ -8259,7 +8307,7 @@ simple:
 		put_prev_task(rq, prev);
 
 	do {
-		se = pick_next_entity(cfs_rq, NULL);
+		se = pick_next_entity(cfs_rq);
 		set_next_entity(cfs_rq, se);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
@@ -8972,7 +9020,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
 
 	WARN_ON_ONCE(task_rq(p) != rq);
 	activate_task(rq, p, ENQUEUE_NOCLOCK);
-	check_preempt_curr(rq, p, 0);
+	wakeup_preempt(rq, p, 0);
 }
 
 /*
@@ -9312,8 +9360,6 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 	unsigned long capacity = scale_rt_capacity(cpu);
 	struct sched_group *sdg = sd->groups;
 
-	cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
-
 	if (!capacity)
 		capacity = 1;
 
@@ -9389,7 +9435,7 @@ static inline int
 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
 {
 	return ((rq->cpu_capacity * sd->imbalance_pct) <
-				(rq->cpu_capacity_orig * 100));
+				(arch_scale_cpu_capacity(cpu_of(rq)) * 100));
 }
 
 /*
@@ -9400,7 +9446,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
 static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
 {
 	return rq->misfit_task_load &&
-		(rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
+		(arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
 		 check_cpu_capacity(rq, sd));
 }
 
@@ -9552,7 +9598,7 @@ static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
  * can only do it if @group is an SMT group and has exactly on busy CPU. Larger
  * imbalances in the number of CPUS are dealt with in find_busiest_group().
  *
- * If we are balancing load within an SMT core, or at DIE domain level, always
+ * If we are balancing load within an SMT core, or at PKG domain level, always
  * proceed.
  *
  * Return: true if @env::dst_cpu can do with asym_packing load balance. False
@@ -11251,13 +11297,15 @@ more_balance:
 				busiest->push_cpu = this_cpu;
 				active_balance = 1;
 			}
-			raw_spin_rq_unlock_irqrestore(busiest, flags);
 
+			preempt_disable();
+			raw_spin_rq_unlock_irqrestore(busiest, flags);
 			if (active_balance) {
 				stop_one_cpu_nowait(cpu_of(busiest),
 					active_load_balance_cpu_stop, busiest,
 					&busiest->active_balance_work);
 			}
+			preempt_enable();
 		}
 	} else {
 		sd->nr_balance_failed = 0;
@@ -11565,36 +11613,39 @@ static inline int on_null_domain(struct rq *rq)
 
 #ifdef CONFIG_NO_HZ_COMMON
 /*
- * idle load balancing details
- * - When one of the busy CPUs notice that there may be an idle rebalancing
+ * NOHZ idle load balancing (ILB) details:
+ *
+ * - When one of the busy CPUs notices that there may be an idle rebalancing
  *   needed, they will kick the idle load balancer, which then does idle
  *   load balancing for all the idle CPUs.
- * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set
+ *
+ * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set
  *   anywhere yet.
  */
-
 static inline int find_new_ilb(void)
 {
-	int ilb;
 	const struct cpumask *hk_mask;
+	int ilb_cpu;
 
 	hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
 
-	for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
+	for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
 
-		if (ilb == smp_processor_id())
+		if (ilb_cpu == smp_processor_id())
 			continue;
 
-		if (idle_cpu(ilb))
-			return ilb;
+		if (idle_cpu(ilb_cpu))
+			return ilb_cpu;
 	}
 
-	return nr_cpu_ids;
+	return -1;
 }
 
 /*
- * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
- * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
+ * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU
+ * SMP function call (IPI).
+ *
+ * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
  */
 static void kick_ilb(unsigned int flags)
 {
@@ -11608,8 +11659,7 @@ static void kick_ilb(unsigned int flags)
 		nohz.next_balance = jiffies+1;
 
 	ilb_cpu = find_new_ilb();
-
-	if (ilb_cpu >= nr_cpu_ids)
+	if (ilb_cpu < 0)
 		return;
 
 	/*
@@ -11622,7 +11672,7 @@ static void kick_ilb(unsigned int flags)
 
 	/*
 	 * This way we generate an IPI on the target CPU which
-	 * is idle. And the softirq performing nohz idle load balance
+	 * is idle, and the softirq performing NOHZ idle load balancing
 	 * will be run before returning from the IPI.
 	 */
 	smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
@@ -11651,7 +11701,7 @@ static void nohz_balancer_kick(struct rq *rq)
 
 	/*
 	 * None are in tickless mode and hence no need for NOHZ idle load
-	 * balancing.
+	 * balancing:
 	 */
 	if (likely(!atomic_read(&nohz.nr_cpus)))
 		return;
@@ -11673,9 +11723,8 @@ static void nohz_balancer_kick(struct rq *rq)
 	sd = rcu_dereference(rq->sd);
 	if (sd) {
 		/*
-		 * If there's a CFS task and the current CPU has reduced
-		 * capacity; kick the ILB to see if there's a better CPU to run
-		 * on.
+		 * If there's a runnable CFS task and the current CPU has reduced
+		 * capacity, kick the ILB to see if there's a better CPU to run on:
 		 */
 		if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
 			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
@@ -11727,11 +11776,11 @@ static void nohz_balancer_kick(struct rq *rq)
 	if (sds) {
 		/*
 		 * If there is an imbalance between LLC domains (IOW we could
-		 * increase the overall cache use), we need some less-loaded LLC
-		 * domain to pull some load. Likewise, we may need to spread
+		 * increase the overall cache utilization), we need a less-loaded LLC
+		 * domain to pull some load. Likewise, we may need to spread
 		 * load within the current LLC domain (e.g. packed SMT cores but
 		 * other CPUs are idle). We can't really know from here how busy
-		 * the others are - so just get a nohz balance going if it looks
+		 * the others are - so just get a NOHZ balance going if it looks
 		 * like this LLC domain has tasks we could move.
 		 */
 		nr_busy = atomic_read(&sds->nr_busy_cpus);
@@ -12001,8 +12050,19 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 }
 
 /*
- * Check if we need to run the ILB for updating blocked load before entering
- * idle state.
+ * Check if we need to directly run the ILB for updating blocked load before
+ * entering idle state. Here we run ILB directly without issuing IPIs.
+ *
+ * Note that when this function is called, the tick may not yet be stopped on
+ * this CPU. nohz.idle_cpus_mask is updated only when the tick is stopped and
+ * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates
+ * don't align with CPUs enter/exit idle to avoid bottlenecks due to high idle
+ * entry/exit rate (usec). So it is possible that _nohz_idle_balance() is
+ * called from this function on (this) CPU that's not yet in the mask. That's
+ * OK because the goal of nohz_run_idle_balance() is to run ILB only for
+ * updating the blocked load of already idle CPUs without waking up one of
+ * those idle CPUs and outside the preempt disable / irq off phase of the local
+ * CPU about to enter idle, because it can take a long time.
  */
 void nohz_run_idle_balance(int cpu)
 {
@@ -12447,7 +12507,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
 		if (p->prio > oldprio)
 			resched_curr(rq);
 	} else
-		check_preempt_curr(rq, p, 0);
+		wakeup_preempt(rq, p, 0);
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -12549,7 +12609,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
 		if (task_current(rq, p))
 			resched_curr(rq);
 		else
-			check_preempt_curr(rq, p, 0);
+			wakeup_preempt(rq, p, 0);
 	}
 }
 
@@ -12908,7 +12968,7 @@ DEFINE_SCHED_CLASS(fair) = {
 	.yield_task		= yield_task_fair,
 	.yield_to_task		= yield_to_task_fair,
 
-	.check_preempt_curr	= check_preempt_wakeup,
+	.wakeup_preempt		= check_preempt_wakeup_fair,
 
 	.pick_next_task		= __pick_next_task_fair,
 	.put_prev_task		= put_prev_task_fair,
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index f770168230ae..a3ddf84de430 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -49,7 +49,6 @@ SCHED_FEAT(TTWU_QUEUE, true)
 /*
  * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
  */
-SCHED_FEAT(SIS_PROP, false)
 SCHED_FEAT(SIS_UTIL, true)
 
 /*
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 5007b25c5bc6..565f8374ddbb 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -401,7 +401,7 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 /*
  * Idle tasks are unconditionally rescheduled:
  */
-static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
+static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
 {
 	resched_curr(rq);
 }
@@ -482,7 +482,7 @@ DEFINE_SCHED_CLASS(idle) = {
 	/* dequeue is not valid, we print a debug message there: */
 	.dequeue_task		= dequeue_task_idle,
 
-	.check_preempt_curr	= check_preempt_curr_idle,
+	.wakeup_preempt		= wakeup_preempt_idle,
 
 	.pick_next_task		= pick_next_task_idle,
 	.put_prev_task		= put_prev_task_idle,
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 0f310768260c..63b6cf898220 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Per Entity Load Tracking
+ * Per Entity Load Tracking (PELT)
  *
  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  *
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 1d0f634725a6..7b4aa5809c0f 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -434,14 +434,13 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
 	return growth;
 }
 
-static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
+static void update_triggers(struct psi_group *group, u64 now,
 						   enum psi_aggregators aggregator)
 {
 	struct psi_trigger *t;
 	u64 *total = group->total[aggregator];
 	struct list_head *triggers;
 	u64 *aggregator_total;
-	*update_total = false;
 
 	if (aggregator == PSI_AVGS) {
 		triggers = &group->avg_triggers;
@@ -471,14 +470,6 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
 		 * events without dropping any).
 		 */
 		if (new_stall) {
-			/*
-			 * Multiple triggers might be looking at the same state,
-			 * remember to update group->polling_total[] once we've
-			 * been through all of them. Also remember to extend the
-			 * polling time if we see new stall activity.
-			 */
-			*update_total = true;
-
 			/* Calculate growth since last update */
 			growth = window_update(&t->win, now, total[t->state]);
 			if (!t->pending_event) {
@@ -503,8 +494,6 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
 		/* Reset threshold breach flag once event got generated */
 		t->pending_event = false;
 	}
-
-	return now + group->rtpoll_min_period;
 }
 
 static u64 update_averages(struct psi_group *group, u64 now)
@@ -565,7 +554,6 @@ static void psi_avgs_work(struct work_struct *work)
 	struct delayed_work *dwork;
 	struct psi_group *group;
 	u32 changed_states;
-	bool update_total;
 	u64 now;
 
 	dwork = to_delayed_work(work);
@@ -584,7 +572,7 @@ static void psi_avgs_work(struct work_struct *work)
 	 * go - see calc_avgs() and missed_periods.
 	 */
 	if (now >= group->avg_next_update) {
-		update_triggers(group, now, &update_total, PSI_AVGS);
+		update_triggers(group, now, PSI_AVGS);
 		group->avg_next_update = update_averages(group, now);
 	}
 
@@ -608,7 +596,7 @@ static void init_rtpoll_triggers(struct psi_group *group, u64 now)
 	group->rtpoll_next_update = now + group->rtpoll_min_period;
 }
 
-/* Schedule polling if it's not already scheduled or forced. */
+/* Schedule rtpolling if it's not already scheduled or forced. */
 static void psi_schedule_rtpoll_work(struct psi_group *group, unsigned long delay,
 				   bool force)
 {
@@ -640,7 +628,6 @@ static void psi_rtpoll_work(struct psi_group *group)
 {
 	bool force_reschedule = false;
 	u32 changed_states;
-	bool update_total;
 	u64 now;
 
 	mutex_lock(&group->rtpoll_trigger_lock);
@@ -649,37 +636,37 @@ static void psi_rtpoll_work(struct psi_group *group)
 
 	if (now > group->rtpoll_until) {
 		/*
-		 * We are either about to start or might stop polling if no
-		 * state change was recorded. Resetting poll_scheduled leaves
+		 * We are either about to start or might stop rtpolling if no
+		 * state change was recorded. Resetting rtpoll_scheduled leaves
 		 * a small window for psi_group_change to sneak in and schedule
-		 * an immediate poll_work before we get to rescheduling. One
-		 * potential extra wakeup at the end of the polling window
-		 * should be negligible and polling_next_update still keeps
+		 * an immediate rtpoll_work before we get to rescheduling. One
+		 * potential extra wakeup at the end of the rtpolling window
+		 * should be negligible and rtpoll_next_update still keeps
 		 * updates correctly on schedule.
 		 */
 		atomic_set(&group->rtpoll_scheduled, 0);
 		/*
-		 * A task change can race with the poll worker that is supposed to
+		 * A task change can race with the rtpoll worker that is supposed to
 		 * report on it. To avoid missing events, ensure ordering between
-		 * poll_scheduled and the task state accesses, such that if the poll
-		 * worker misses the state update, the task change is guaranteed to
-		 * reschedule the poll worker:
+		 * rtpoll_scheduled and the task state accesses, such that if the
+		 * rtpoll worker misses the state update, the task change is
+		 * guaranteed to reschedule the rtpoll worker:
 		 *
-		 * poll worker:
-		 *   atomic_set(poll_scheduled, 0)
+		 * rtpoll worker:
+		 *   atomic_set(rtpoll_scheduled, 0)
 		 *   smp_mb()
 		 *   LOAD states
 		 *
 		 * task change:
 		 *   STORE states
-		 *   if atomic_xchg(poll_scheduled, 1) == 0:
-		 *     schedule poll worker
+		 *   if atomic_xchg(rtpoll_scheduled, 1) == 0:
+		 *     schedule rtpoll worker
 		 *
 		 * The atomic_xchg() implies a full barrier.
 		 */
 		smp_mb();
 	} else {
-		/* Polling window is not over, keep rescheduling */
+		/* The rtpolling window is not over, keep rescheduling */
 		force_reschedule = true;
 	}
 
@@ -687,7 +674,7 @@ static void psi_rtpoll_work(struct psi_group *group)
 	collect_percpu_times(group, PSI_POLL, &changed_states);
 
 	if (changed_states & group->rtpoll_states) {
-		/* Initialize trigger windows when entering polling mode */
+		/* Initialize trigger windows when entering rtpolling mode */
 		if (now > group->rtpoll_until)
 			init_rtpoll_triggers(group, now);
 
@@ -706,10 +693,12 @@ static void psi_rtpoll_work(struct psi_group *group)
 	}
 
 	if (now >= group->rtpoll_next_update) {
-		group->rtpoll_next_update = update_triggers(group, now, &update_total, PSI_POLL);
-		if (update_total)
+		if (changed_states & group->rtpoll_states) {
+			update_triggers(group, now, PSI_POLL);
 			memcpy(group->rtpoll_total, group->total[PSI_POLL],
 				   sizeof(group->rtpoll_total));
+		}
+		group->rtpoll_next_update = now + group->rtpoll_min_period;
 	}
 
 	psi_schedule_rtpoll_work(group,
@@ -1009,6 +998,9 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
 	struct psi_group_cpu *groupc;
 	u64 now;
 
+	if (static_branch_likely(&psi_disabled))
+		return;
+
 	if (!task->pid)
 		return;
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 0597ba0f85ff..6aaf0a3d6081 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -16,7 +16,7 @@ struct rt_bandwidth def_rt_bandwidth;
  * period over which we measure -rt task CPU usage in us.
  * default: 1s
  */
-unsigned int sysctl_sched_rt_period = 1000000;
+int sysctl_sched_rt_period = 1000000;
 
 /*
  * part of the period that we allow rt tasks to run in us.
@@ -34,9 +34,11 @@ static struct ctl_table sched_rt_sysctls[] = {
 	{
 		.procname       = "sched_rt_period_us",
 		.data           = &sysctl_sched_rt_period,
-		.maxlen         = sizeof(unsigned int),
+		.maxlen         = sizeof(int),
 		.mode           = 0644,
 		.proc_handler   = sched_rt_handler,
+		.extra1         = SYSCTL_ONE,
+		.extra2         = SYSCTL_INT_MAX,
 	},
 	{
 		.procname       = "sched_rt_runtime_us",
@@ -44,6 +46,8 @@ static struct ctl_table sched_rt_sysctls[] = {
 		.maxlen         = sizeof(int),
 		.mode           = 0644,
 		.proc_handler   = sched_rt_handler,
+		.extra1         = SYSCTL_NEG_ONE,
+		.extra2         = (void *)&sysctl_sched_rt_period,
 	},
 	{
 		.procname       = "sched_rr_timeslice_ms",
@@ -143,7 +147,6 @@ void init_rt_rq(struct rt_rq *rt_rq)
 #if defined CONFIG_SMP
 	rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
 	rt_rq->highest_prio.next = MAX_RT_PRIO-1;
-	rt_rq->rt_nr_migratory = 0;
 	rt_rq->overloaded = 0;
 	plist_head_init(&rt_rq->pushable_tasks);
 #endif /* CONFIG_SMP */
@@ -358,53 +361,6 @@ static inline void rt_clear_overload(struct rq *rq)
 	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
 }
 
-static void update_rt_migration(struct rt_rq *rt_rq)
-{
-	if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
-		if (!rt_rq->overloaded) {
-			rt_set_overload(rq_of_rt_rq(rt_rq));
-			rt_rq->overloaded = 1;
-		}
-	} else if (rt_rq->overloaded) {
-		rt_clear_overload(rq_of_rt_rq(rt_rq));
-		rt_rq->overloaded = 0;
-	}
-}
-
-static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-	struct task_struct *p;
-
-	if (!rt_entity_is_task(rt_se))
-		return;
-
-	p = rt_task_of(rt_se);
-	rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
-	rt_rq->rt_nr_total++;
-	if (p->nr_cpus_allowed > 1)
-		rt_rq->rt_nr_migratory++;
-
-	update_rt_migration(rt_rq);
-}
-
-static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-	struct task_struct *p;
-
-	if (!rt_entity_is_task(rt_se))
-		return;
-
-	p = rt_task_of(rt_se);
-	rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
-	rt_rq->rt_nr_total--;
-	if (p->nr_cpus_allowed > 1)
-		rt_rq->rt_nr_migratory--;
-
-	update_rt_migration(rt_rq);
-}
-
 static inline int has_pushable_tasks(struct rq *rq)
 {
 	return !plist_head_empty(&rq->rt.pushable_tasks);
@@ -438,6 +394,11 @@ static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 	/* Update the highest prio pushable task */
 	if (p->prio < rq->rt.highest_prio.next)
 		rq->rt.highest_prio.next = p->prio;
+
+	if (!rq->rt.overloaded) {
+		rt_set_overload(rq);
+		rq->rt.overloaded = 1;
+	}
 }
 
 static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -451,6 +412,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 		rq->rt.highest_prio.next = p->prio;
 	} else {
 		rq->rt.highest_prio.next = MAX_RT_PRIO-1;
+
+		if (rq->rt.overloaded) {
+			rt_clear_overload(rq);
+			rq->rt.overloaded = 0;
+		}
 	}
 }
 
@@ -464,16 +430,6 @@ static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 {
 }
 
-static inline
-void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-}
-
-static inline
-void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-}
-
 static inline void rt_queue_push_tasks(struct rq *rq)
 {
 }
@@ -515,7 +471,7 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
 	min_cap = uclamp_eff_value(p, UCLAMP_MIN);
 	max_cap = uclamp_eff_value(p, UCLAMP_MAX);
 
-	cpu_cap = capacity_orig_of(cpu);
+	cpu_cap = arch_scale_cpu_capacity(cpu);
 
 	return cpu_cap >= min(min_cap, max_cap);
 }
@@ -953,7 +909,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 
 				/*
 				 * When we're idle and a woken (rt) task is
-				 * throttled check_preempt_curr() will set
+				 * throttled wakeup_preempt() will set
 				 * skip_update and the time between the wakeup
 				 * and this unthrottle will get accounted as
 				 * 'runtime'.
@@ -1281,7 +1237,6 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
 
 	inc_rt_prio(rt_rq, prio);
-	inc_rt_migration(rt_se, rt_rq);
 	inc_rt_group(rt_se, rt_rq);
 }
 
@@ -1294,7 +1249,6 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
 
 	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
-	dec_rt_migration(rt_se, rt_rq);
 	dec_rt_group(rt_se, rt_rq);
 }
 
@@ -1715,7 +1669,7 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
+static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
 {
 	if (p->prio < rq->curr->prio) {
 		resched_curr(rq);
@@ -2109,9 +2063,11 @@ retry:
 		 */
 		push_task = get_push_task(rq);
 		if (push_task) {
+			preempt_disable();
 			raw_spin_rq_unlock(rq);
 			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
 					    push_task, &rq->push_work);
+			preempt_enable();
 			raw_spin_rq_lock(rq);
 		}
 
@@ -2448,9 +2404,11 @@ skip:
 		double_unlock_balance(this_rq, src_rq);
 
 		if (push_task) {
+			preempt_disable();
 			raw_spin_rq_unlock(this_rq);
 			stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
 					    push_task, &src_rq->push_work);
+			preempt_enable();
 			raw_spin_rq_lock(this_rq);
 		}
 	}
@@ -2702,7 +2660,7 @@ DEFINE_SCHED_CLASS(rt) = {
 	.dequeue_task		= dequeue_task_rt,
 	.yield_task		= yield_task_rt,
 
-	.check_preempt_curr	= check_preempt_curr_rt,
+	.wakeup_preempt		= wakeup_preempt_rt,
 
 	.pick_next_task		= pick_next_task_rt,
 	.put_prev_task		= put_prev_task_rt,
@@ -2985,9 +2943,6 @@ static int sched_rt_global_constraints(void)
 #ifdef CONFIG_SYSCTL
 static int sched_rt_global_validate(void)
 {
-	if (sysctl_sched_rt_period <= 0)
-		return -EINVAL;
-
 	if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
 		((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
 		 ((u64)sysctl_sched_rt_runtime *
@@ -3018,7 +2973,7 @@ static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
 	old_period = sysctl_sched_rt_period;
 	old_runtime = sysctl_sched_rt_runtime;
 
-	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
 	if (!ret && write) {
 		ret = sched_rt_global_validate();
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 04846272409c..2e5a95486a42 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -74,15 +74,6 @@
 
 #include "../workqueue_internal.h"
 
-#ifdef CONFIG_CGROUP_SCHED
-#include <linux/cgroup.h>
-#include <linux/psi.h>
-#endif
-
-#ifdef CONFIG_SCHED_DEBUG
-# include <linux/static_key.h>
-#endif
-
 #ifdef CONFIG_PARAVIRT
 # include <asm/paravirt.h>
 # include <asm/paravirt_api_clock.h>
@@ -109,14 +100,12 @@ extern __read_mostly int scheduler_running;
 extern unsigned long calc_load_update;
 extern atomic_long_t calc_load_tasks;
 
-extern unsigned int sysctl_sched_child_runs_first;
-
 extern void calc_global_load_tick(struct rq *this_rq);
 extern long calc_load_fold_active(struct rq *this_rq, long adjust);
 
 extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
 
-extern unsigned int sysctl_sched_rt_period;
+extern int sysctl_sched_rt_period;
 extern int sysctl_sched_rt_runtime;
 extern int sched_rr_timeslice;
 
@@ -594,6 +583,7 @@ struct cfs_rq {
 	} removed;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	u64			last_update_tg_load_avg;
 	unsigned long		tg_load_avg_contrib;
 	long			propagate;
 	long			prop_runnable_sum;
@@ -644,9 +634,7 @@ struct cfs_rq {
 	int			throttled;
 	int			throttle_count;
 	struct list_head	throttled_list;
-#ifdef CONFIG_SMP
 	struct list_head	throttled_csd_list;
-#endif
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 };
@@ -675,8 +663,6 @@ struct rt_rq {
 	} highest_prio;
 #endif
 #ifdef CONFIG_SMP
-	unsigned int		rt_nr_migratory;
-	unsigned int		rt_nr_total;
 	int			overloaded;
 	struct plist_head	pushable_tasks;
 
@@ -721,7 +707,6 @@ struct dl_rq {
 		u64		next;
 	} earliest_dl;
 
-	unsigned int		dl_nr_migratory;
 	int			overloaded;
 
 	/*
@@ -963,10 +948,6 @@ struct rq {
 	/* runqueue lock: */
 	raw_spinlock_t		__lock;
 
-	/*
-	 * nr_running and cpu_load should be in the same cacheline because
-	 * remote CPUs use both these fields when doing load calculation.
-	 */
 	unsigned int		nr_running;
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int		nr_numa_running;
@@ -1048,7 +1029,6 @@ struct rq {
 	struct sched_domain __rcu	*sd;
 
 	unsigned long		cpu_capacity;
-	unsigned long		cpu_capacity_orig;
 
 	struct balance_callback *balance_callback;
 
@@ -1079,9 +1059,6 @@ struct rq {
 	u64			idle_stamp;
 	u64			avg_idle;
 
-	unsigned long		wake_stamp;
-	u64			wake_avg_idle;
-
 	/* This is used to determine avg_idle's max value */
 	u64			max_idle_balance_cost;
 
@@ -1658,6 +1635,11 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
 	raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
 }
 
+DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
+		    _T->rq = task_rq_lock(_T->lock, &_T->rf),
+		    task_rq_unlock(_T->rq, _T->lock, &_T->rf),
+		    struct rq *rq; struct rq_flags rf)
+
 static inline void
 rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
 	__acquires(rq->lock)
@@ -1868,11 +1850,13 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(int, sd_share_id);
 DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
 extern struct static_key_false sched_asym_cpucapacity;
+extern struct static_key_false sched_cluster_active;
 
 static __always_inline bool sched_asym_cpucap_active(void)
 {
@@ -2239,7 +2223,7 @@ struct sched_class {
 	void (*yield_task)   (struct rq *rq);
 	bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
 
-	void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
+	void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
 
 	struct task_struct *(*pick_next_task)(struct rq *rq);
 
@@ -2513,7 +2497,7 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
 extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
 extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
 
-extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
+extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
 
 #ifdef CONFIG_PREEMPT_RT
 #define SCHED_NR_MIGRATE_BREAK 8
@@ -2977,11 +2961,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
 #endif
 
 #ifdef CONFIG_SMP
-static inline unsigned long capacity_orig_of(int cpu)
-{
-	return cpu_rq(cpu)->cpu_capacity_orig;
-}
-
 /**
  * enum cpu_util_type - CPU utilization type
  * @FREQUENCY_UTIL:	Utilization used to select frequency
@@ -3219,6 +3198,8 @@ static inline bool sched_energy_enabled(void)
 	return static_branch_unlikely(&sched_energy_present);
 }
 
+extern struct cpufreq_governor schedutil_gov;
+
 #else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
 
 #define perf_domain_span(pd) NULL
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 85590599b4d6..6cf7304e6449 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,7 +23,7 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 #endif /* CONFIG_SMP */
 
 static void
-check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
+wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags)
 {
 	/* we're never preempted */
 }
@@ -120,7 +120,7 @@ DEFINE_SCHED_CLASS(stop) = {
 	.dequeue_task		= dequeue_task_stop,
 	.yield_task		= yield_task_stop,
 
-	.check_preempt_curr	= check_preempt_curr_stop,
+	.wakeup_preempt		= wakeup_preempt_stop,
 
 	.pick_next_task		= pick_next_task_stop,
 	.put_prev_task		= put_prev_task_stop,
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 05a5bc678c08..10d1391e7416 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -212,6 +212,69 @@ static unsigned int sysctl_sched_energy_aware = 1;
 static DEFINE_MUTEX(sched_energy_mutex);
 static bool sched_energy_update;
 
+static bool sched_is_eas_possible(const struct cpumask *cpu_mask)
+{
+	bool any_asym_capacity = false;
+	struct cpufreq_policy *policy;
+	struct cpufreq_governor *gov;
+	int i;
+
+	/* EAS is enabled for asymmetric CPU capacity topologies. */
+	for_each_cpu(i, cpu_mask) {
+		if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, i))) {
+			any_asym_capacity = true;
+			break;
+		}
+	}
+	if (!any_asym_capacity) {
+		if (sched_debug()) {
+			pr_info("rd %*pbl: Checking EAS, CPUs do not have asymmetric capacities\n",
+				cpumask_pr_args(cpu_mask));
+		}
+		return false;
+	}
+
+	/* EAS definitely does *not* handle SMT */
+	if (sched_smt_active()) {
+		if (sched_debug()) {
+			pr_info("rd %*pbl: Checking EAS, SMT is not supported\n",
+				cpumask_pr_args(cpu_mask));
+		}
+		return false;
+	}
+
+	if (!arch_scale_freq_invariant()) {
+		if (sched_debug()) {
+			pr_info("rd %*pbl: Checking EAS: frequency-invariant load tracking not yet supported\n",
+				cpumask_pr_args(cpu_mask));
+		}
+		return false;
+	}
+
+	/* Do not attempt EAS if schedutil is not being used. */
+	for_each_cpu(i, cpu_mask) {
+		policy = cpufreq_cpu_get(i);
+		if (!policy) {
+			if (sched_debug()) {
+				pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d\n",
+					cpumask_pr_args(cpu_mask), i);
+			}
+			return false;
+		}
+		gov = policy->governor;
+		cpufreq_cpu_put(policy);
+		if (gov != &schedutil_gov) {
+			if (sched_debug()) {
+				pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n",
+					cpumask_pr_args(cpu_mask));
+			}
+			return false;
+		}
+	}
+
+	return true;
+}
+
 void rebuild_sched_domains_energy(void)
 {
 	mutex_lock(&sched_energy_mutex);
@@ -230,6 +293,15 @@ static int sched_energy_aware_handler(struct ctl_table *table, int write,
 	if (write && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	if (!sched_is_eas_possible(cpu_active_mask)) {
+		if (write) {
+			return -EOPNOTSUPP;
+		} else {
+			*lenp = 0;
+			return 0;
+		}
+	}
+
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (!ret && write) {
 		state = static_branch_unlikely(&sched_energy_present);
@@ -348,103 +420,33 @@ static void sched_energy_set(bool has_eas)
  *    1. an Energy Model (EM) is available;
  *    2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
  *    3. no SMT is detected.
- *    4. the EM complexity is low enough to keep scheduling overheads low;
- *    5. schedutil is driving the frequency of all CPUs of the rd;
- *    6. frequency invariance support is present;
- *
- * The complexity of the Energy Model is defined as:
- *
- *              C = nr_pd * (nr_cpus + nr_ps)
- *
- * with parameters defined as:
- *  - nr_pd:    the number of performance domains
- *  - nr_cpus:  the number of CPUs
- *  - nr_ps:    the sum of the number of performance states of all performance
- *              domains (for example, on a system with 2 performance domains,
- *              with 10 performance states each, nr_ps = 2 * 10 = 20).
- *
- * It is generally not a good idea to use such a model in the wake-up path on
- * very complex platforms because of the associated scheduling overheads. The
- * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
- * with per-CPU DVFS and less than 8 performance states each, for example.
+ *    4. schedutil is driving the frequency of all CPUs of the rd;
+ *    5. frequency invariance support is present;
  */
-#define EM_MAX_COMPLEXITY 2048
-
-extern struct cpufreq_governor schedutil_gov;
 static bool build_perf_domains(const struct cpumask *cpu_map)
 {
-	int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
+	int i;
 	struct perf_domain *pd = NULL, *tmp;
 	int cpu = cpumask_first(cpu_map);
 	struct root_domain *rd = cpu_rq(cpu)->rd;
-	struct cpufreq_policy *policy;
-	struct cpufreq_governor *gov;
 
 	if (!sysctl_sched_energy_aware)
 		goto free;
 
-	/* EAS is enabled for asymmetric CPU capacity topologies. */
-	if (!per_cpu(sd_asym_cpucapacity, cpu)) {
-		if (sched_debug()) {
-			pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
-					cpumask_pr_args(cpu_map));
-		}
+	if (!sched_is_eas_possible(cpu_map))
 		goto free;
-	}
-
-	/* EAS definitely does *not* handle SMT */
-	if (sched_smt_active()) {
-		pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
-			cpumask_pr_args(cpu_map));
-		goto free;
-	}
-
-	if (!arch_scale_freq_invariant()) {
-		if (sched_debug()) {
-			pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported",
-				cpumask_pr_args(cpu_map));
-		}
-		goto free;
-	}
 
 	for_each_cpu(i, cpu_map) {
 		/* Skip already covered CPUs. */
 		if (find_pd(pd, i))
 			continue;
 
-		/* Do not attempt EAS if schedutil is not being used. */
-		policy = cpufreq_cpu_get(i);
-		if (!policy)
-			goto free;
-		gov = policy->governor;
-		cpufreq_cpu_put(policy);
-		if (gov != &schedutil_gov) {
-			if (rd->pd)
-				pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
-						cpumask_pr_args(cpu_map));
-			goto free;
-		}
-
 		/* Create the new pd and add it to the local list. */
 		tmp = pd_init(i);
 		if (!tmp)
 			goto free;
 		tmp->next = pd;
 		pd = tmp;
-
-		/*
-		 * Count performance domains and performance states for the
-		 * complexity check.
-		 */
-		nr_pd++;
-		nr_ps += em_pd_nr_perf_states(pd->em_pd);
-	}
-
-	/* Bail out if the Energy Model complexity is too high. */
-	if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
-		WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
-						cpumask_pr_args(cpu_map));
-		goto free;
 	}
 
 	perf_domain_debug(cpu_map, pd);
@@ -666,11 +668,14 @@ static void destroy_sched_domains(struct sched_domain *sd)
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(int, sd_share_id);
 DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
+
 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
+DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
 
 static void update_top_cache_domain(int cpu)
 {
@@ -691,6 +696,17 @@ static void update_top_cache_domain(int cpu)
 	per_cpu(sd_llc_id, cpu) = id;
 	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
 
+	sd = lowest_flag_domain(cpu, SD_CLUSTER);
+	if (sd)
+		id = cpumask_first(sched_domain_span(sd));
+
+	/*
+	 * This assignment should be placed after the sd_llc_id one, as
+	 * we want this id to equal the cluster id on cluster machines
+	 * but the LLC id on non-cluster machines.
+	 */
+	per_cpu(sd_share_id, cpu) = id;
+
 	sd = lowest_flag_domain(cpu, SD_NUMA);
 	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
 
@@ -1117,7 +1133,7 @@ fail:
  *
  *  - Simultaneous multithreading (SMT)
  *  - Multi-Core Cache (MC)
- *  - Package (DIE)
+ *  - Package (PKG)
  *
  * Where the last one more or less denotes everything up to a NUMA node.
  *
@@ -1139,13 +1155,13 @@ fail:
  *
  * CPU   0   1   2   3   4   5   6   7
  *
- * DIE  [                             ]
+ * PKG  [                             ]
  * MC   [             ] [             ]
  * SMT  [     ] [     ] [     ] [     ]
  *
  *  - or -
  *
- * DIE  0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
+ * PKG  0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
  * MC	0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
  * SMT  0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
  *
@@ -1548,6 +1564,7 @@ static struct cpumask		***sched_domains_numa_masks;
  */
 #define TOPOLOGY_SD_FLAGS		\
 	(SD_SHARE_CPUCAPACITY	|	\
+	 SD_CLUSTER		|	\
 	 SD_SHARE_PKG_RESOURCES |	\
 	 SD_NUMA		|	\
 	 SD_ASYM_PACKING)
@@ -1679,7 +1696,7 @@ static struct sched_domain_topology_level default_topology[] = {
 #ifdef CONFIG_SCHED_MC
 	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
 #endif
-	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
 	{ NULL, },
 };
 
@@ -2112,22 +2129,31 @@ static int hop_cmp(const void *a, const void *b)
 	return -1;
 }
 
-/*
- * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth next cpu
- *                             closest to @cpu from @cpumask.
- * cpumask: cpumask to find a cpu from
- * cpu: Nth cpu to find
- *
- * returns: cpu, or nr_cpu_ids when nothing found.
+/**
+ * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth closest CPU
+ *                             from @cpus to @cpu, taking into account distance
+ *                             from a given @node.
+ * @cpus: cpumask to find a cpu from
+ * @cpu: CPU to start searching
+ * @node: NUMA node to order CPUs by distance
+ *
+ * Return: cpu, or nr_cpu_ids when nothing found.
  */
 int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
 {
-	struct __cmp_key k = { .cpus = cpus, .node = node, .cpu = cpu };
+	struct __cmp_key k = { .cpus = cpus, .cpu = cpu };
 	struct cpumask ***hop_masks;
 	int hop, ret = nr_cpu_ids;
 
+	if (node == NUMA_NO_NODE)
+		return cpumask_nth_and(cpu, cpus, cpu_online_mask);
+
 	rcu_read_lock();
 
+	/* CPU-less node entries are uninitialized in sched_domains_numa_masks */
+	node = numa_nearest_node(node, N_CPU);
+	k.node = node;
+
 	k.masks = rcu_dereference(sched_domains_numa_masks);
 	if (!k.masks)
 		goto unlock;
@@ -2362,6 +2388,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	struct rq *rq = NULL;
 	int i, ret = -ENOMEM;
 	bool has_asym = false;
+	bool has_cluster = false;
 
 	if (WARN_ON(cpumask_empty(cpu_map)))
 		goto error;
@@ -2479,20 +2506,29 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	/* Attach the domains */
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map) {
+		unsigned long capacity;
+
 		rq = cpu_rq(i);
 		sd = *per_cpu_ptr(d.sd, i);
 
+		capacity = arch_scale_cpu_capacity(i);
 		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
-		if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
-			WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+		if (capacity > READ_ONCE(d.rd->max_cpu_capacity))
+			WRITE_ONCE(d.rd->max_cpu_capacity, capacity);
 
 		cpu_attach_domain(sd, d.rd, i);
+
+		if (lowest_flag_domain(i, SD_CLUSTER))
+			has_cluster = true;
 	}
 	rcu_read_unlock();
 
 	if (has_asym)
 		static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
 
+	if (has_cluster)
+		static_branch_inc_cpuslocked(&sched_cluster_active);
+
 	if (rq && sched_debug_verbose) {
 		pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
 			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
@@ -2592,6 +2628,9 @@ static void detach_destroy_domains(const struct cpumask *cpu_map)
 	if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
 		static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
 
+	if (static_branch_unlikely(&sched_cluster_active))
+		static_branch_dec_cpuslocked(&sched_cluster_active);
+
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map)
 		cpu_attach_domain(NULL, &def_root_domain, i);
diff --git a/kernel/signal.c b/kernel/signal.c
index 09019017d669..f2a5578326ad 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2329,15 +2329,38 @@ static int ptrace_stop(int exit_code, int why, unsigned long message,
 		do_notify_parent_cldstop(current, false, why);
 
 	/*
-	 * Don't want to allow preemption here, because
-	 * sys_ptrace() needs this task to be inactive.
+	 * The previous do_notify_parent_cldstop() invocation woke the ptracer.
+	 * On a PREEMPTION kernel this can result in a preemption requirement
+	 * which will be fulfilled after read_unlock() and the ptracer will be
+	 * put on the CPU.
+	 * The ptracer is in wait_task_inactive(, __TASK_TRACED) waiting for
+	 * this task to wait in schedule(). If this task gets preempted then it
+	 * remains enqueued on the runqueue. The ptracer will observe this and
+	 * then sleep for a delay of one HZ tick. In the meantime this task
+	 * gets scheduled, enters schedule() and will wait for the ptracer.
 	 *
-	 * XXX: implement read_unlock_no_resched().
+	 * This preemption point is not bad from a correctness point of
+	 * view but extends the runtime by one HZ tick time due to the
+	 * ptracer's sleep.  The preempt-disable section ensures that there
+	 * will be no preemption between unlock and schedule() and so
+	 * improving the performance since the ptracer will observe that
+	 * the tracee is scheduled out once it gets on the CPU.
+	 *
+	 * On PREEMPT_RT locking tasklist_lock does not disable preemption.
+	 * Therefore the task can be preempted after do_notify_parent_cldstop()
+	 * before unlocking tasklist_lock so there is no benefit in doing this.
+	 *
+	 * In fact disabling preemption is harmful on PREEMPT_RT because
+	 * the spinlock_t in cgroup_enter_frozen() must not be acquired
+	 * with preemption disabled due to the 'sleeping' spinlock
+	 * substitution of RT.
 	 */
-	preempt_disable();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_disable();
 	read_unlock(&tasklist_lock);
 	cgroup_enter_frozen();
-	preempt_enable_no_resched();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_enable_no_resched();
 	schedule();
 	cgroup_leave_frozen(true);
 
diff --git a/kernel/smp.c b/kernel/smp.c
index 8455a53465af..f085ebcdf9e7 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -127,7 +127,7 @@ send_call_function_ipi_mask(struct cpumask *mask)
 }
 
 static __always_inline void
-csd_do_func(smp_call_func_t func, void *info, struct __call_single_data *csd)
+csd_do_func(smp_call_func_t func, void *info, call_single_data_t *csd)
 {
 	trace_csd_function_entry(func, csd);
 	func(info);
@@ -170,11 +170,13 @@ static DEFINE_PER_CPU(void *, cur_csd_info);
 
 static ulong csd_lock_timeout = 5000;  /* CSD lock timeout in milliseconds. */
 module_param(csd_lock_timeout, ulong, 0444);
+static int panic_on_ipistall;  /* CSD panic timeout in milliseconds, 300000 for five minutes. */
+module_param(panic_on_ipistall, int, 0444);
 
 static atomic_t csd_bug_count = ATOMIC_INIT(0);
 
 /* Record current CSD work for current CPU, NULL to erase. */
-static void __csd_lock_record(struct __call_single_data *csd)
+static void __csd_lock_record(call_single_data_t *csd)
 {
 	if (!csd) {
 		smp_mb(); /* NULL cur_csd after unlock. */
@@ -189,13 +191,13 @@ static void __csd_lock_record(struct __call_single_data *csd)
 		  /* Or before unlock, as the case may be. */
 }
 
-static __always_inline void csd_lock_record(struct __call_single_data *csd)
+static __always_inline void csd_lock_record(call_single_data_t *csd)
 {
 	if (static_branch_unlikely(&csdlock_debug_enabled))
 		__csd_lock_record(csd);
 }
 
-static int csd_lock_wait_getcpu(struct __call_single_data *csd)
+static int csd_lock_wait_getcpu(call_single_data_t *csd)
 {
 	unsigned int csd_type;
 
@@ -210,7 +212,7 @@ static int csd_lock_wait_getcpu(struct __call_single_data *csd)
  * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
  * so waiting on other types gets much less information.
  */
-static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *ts1, int *bug_id)
+static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id)
 {
 	int cpu = -1;
 	int cpux;
@@ -230,6 +232,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
 	}
 
 	ts2 = sched_clock();
+	/* How long since we last checked for a stuck CSD lock. */
 	ts_delta = ts2 - *ts1;
 	if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0))
 		return false;
@@ -243,9 +246,17 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
 	else
 		cpux = cpu;
 	cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */
+	/* How long since this CSD lock was stuck. */
+	ts_delta = ts2 - ts0;
 	pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n",
-		 firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts2 - ts0,
+		 firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts_delta,
 		 cpu, csd->func, csd->info);
+	/*
+	 * If the CSD lock is still stuck after 5 minutes, it is unlikely
+	 * to become unstuck. Use a signed comparison to avoid triggering
+	 * on underflows when the TSC is out of sync between sockets.
+	 */
+	BUG_ON(panic_on_ipistall > 0 && (s64)ts_delta > ((s64)panic_on_ipistall * NSEC_PER_MSEC));
 	if (cpu_cur_csd && csd != cpu_cur_csd) {
 		pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n",
 			 *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)),
@@ -276,7 +287,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
  * previous function call. For multi-cpu calls its even more interesting
  * as we'll have to ensure no other cpu is observing our csd.
  */
-static void __csd_lock_wait(struct __call_single_data *csd)
+static void __csd_lock_wait(call_single_data_t *csd)
 {
 	int bug_id = 0;
 	u64 ts0, ts1;
@@ -290,7 +301,7 @@ static void __csd_lock_wait(struct __call_single_data *csd)
 	smp_acquire__after_ctrl_dep();
 }
 
-static __always_inline void csd_lock_wait(struct __call_single_data *csd)
+static __always_inline void csd_lock_wait(call_single_data_t *csd)
 {
 	if (static_branch_unlikely(&csdlock_debug_enabled)) {
 		__csd_lock_wait(csd);
@@ -300,17 +311,17 @@ static __always_inline void csd_lock_wait(struct __call_single_data *csd)
 	smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
 }
 #else
-static void csd_lock_record(struct __call_single_data *csd)
+static void csd_lock_record(call_single_data_t *csd)
 {
 }
 
-static __always_inline void csd_lock_wait(struct __call_single_data *csd)
+static __always_inline void csd_lock_wait(call_single_data_t *csd)
 {
 	smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
 }
 #endif
 
-static __always_inline void csd_lock(struct __call_single_data *csd)
+static __always_inline void csd_lock(call_single_data_t *csd)
 {
 	csd_lock_wait(csd);
 	csd->node.u_flags |= CSD_FLAG_LOCK;
@@ -323,7 +334,7 @@ static __always_inline void csd_lock(struct __call_single_data *csd)
 	smp_wmb();
 }
 
-static __always_inline void csd_unlock(struct __call_single_data *csd)
+static __always_inline void csd_unlock(call_single_data_t *csd)
 {
 	WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK));
 
@@ -376,7 +387,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node)
  * for execution on the given CPU. data must already have
  * ->func, ->info, and ->flags set.
  */
-static int generic_exec_single(int cpu, struct __call_single_data *csd)
+static int generic_exec_single(int cpu, call_single_data_t *csd)
 {
 	if (cpu == smp_processor_id()) {
 		smp_call_func_t func = csd->func;
@@ -667,7 +678,7 @@ EXPORT_SYMBOL(smp_call_function_single);
  *
  * Return: %0 on success or negative errno value on error
  */
-int smp_call_function_single_async(int cpu, struct __call_single_data *csd)
+int smp_call_function_single_async(int cpu, call_single_data_t *csd)
 {
 	int err = 0;
 
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 9ed5ce989415..4f65824879ab 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -151,6 +151,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store,
 	put_task_stack(tsk);
 	return c.len;
 }
+EXPORT_SYMBOL_GPL(stack_trace_save_tsk);
 
 /**
  * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array
@@ -301,6 +302,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task,
 	save_stack_trace_tsk(task, &trace);
 	return trace.nr_entries;
 }
+EXPORT_SYMBOL_GPL(stack_trace_save_tsk);
 
 /**
  * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e137c1385c56..9db51ea373b0 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -87,6 +87,9 @@ COND_SYSCALL_COMPAT(set_robust_list);
 COND_SYSCALL(get_robust_list);
 COND_SYSCALL_COMPAT(get_robust_list);
 COND_SYSCALL(futex_waitv);
+COND_SYSCALL(futex_wake);
+COND_SYSCALL(futex_wait);
+COND_SYSCALL(futex_requeue);
 COND_SYSCALL(kexec_load);
 COND_SYSCALL_COMPAT(kexec_load);
 COND_SYSCALL(init_module);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 354a2d294f52..2b6585751891 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1983,7 +1983,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_perf_event_sample_rate,
 		.maxlen		= sizeof(sysctl_perf_event_sample_rate),
 		.mode		= 0644,
-		.proc_handler	= perf_proc_update_handler,
+		.proc_handler	= perf_event_max_sample_rate_handler,
 		.extra1		= SYSCTL_ONE,
 	},
 	{
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 8d9f13d847f0..4657cb8e8b1f 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -290,6 +290,17 @@ static int alarmtimer_suspend(struct device *dev)
 	rtc_timer_cancel(rtc, &rtctimer);
 	rtc_read_time(rtc, &tm);
 	now = rtc_tm_to_ktime(tm);
+
+	/*
+	 * If the RTC alarm timer only supports a limited time offset, set the
+	 * alarm time to the maximum supported value.
+	 * The system may wake up earlier (possibly much earlier) than expected
+	 * when the alarmtimer runs. This is the best the kernel can do if
+	 * the alarmtimer exceeds the time that the rtc device can be programmed
+	 * for.
+	 */
+	min = rtc_bound_alarmtime(rtc, min);
+
 	now = ktime_add(now, min);
 
 	/* Set alarm, if in the past reject suspend briefly to handle */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 87015e9deacc..be77b021e5d6 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -4,7 +4,7 @@
  *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
  *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
  *
- *  No idle tick implementation for low and high resolution timers
+ *  NOHZ implementation for low and high resolution timers
  *
  *  Started by: Thomas Gleixner and Ingo Molnar
  */
@@ -45,7 +45,7 @@ struct tick_sched *tick_get_tick_sched(int cpu)
 
 #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
 /*
- * The time, when the last jiffy update happened. Write access must hold
+ * The time when the last jiffy update happened. Write access must hold
  * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a
  * consistent view of jiffies and last_jiffies_update.
  */
@@ -60,13 +60,13 @@ static void tick_do_update_jiffies64(ktime_t now)
 	ktime_t delta, nextp;
 
 	/*
-	 * 64bit can do a quick check without holding jiffies lock and
+	 * 64-bit can do a quick check without holding the jiffies lock and
 	 * without looking at the sequence count. The smp_load_acquire()
 	 * pairs with the update done later in this function.
 	 *
-	 * 32bit cannot do that because the store of tick_next_period
-	 * consists of two 32bit stores and the first store could move it
-	 * to a random point in the future.
+	 * 32-bit cannot do that because the store of 'tick_next_period'
+	 * consists of two 32-bit stores, and the first store alone could
+	 * move its value to a random point in the future.
 	 */
 	if (IS_ENABLED(CONFIG_64BIT)) {
 		if (ktime_before(now, smp_load_acquire(&tick_next_period)))
@@ -75,7 +75,7 @@ static void tick_do_update_jiffies64(ktime_t now)
 		unsigned int seq;
 
 		/*
-		 * Avoid contention on jiffies_lock and protect the quick
+		 * Avoid contention on 'jiffies_lock' and protect the quick
 		 * check with the sequence count.
 		 */
 		do {
@@ -90,7 +90,7 @@ static void tick_do_update_jiffies64(ktime_t now)
 	/* Quick check failed, i.e. update is required. */
 	raw_spin_lock(&jiffies_lock);
 	/*
-	 * Reevaluate with the lock held. Another CPU might have done the
+	 * Re-evaluate with the lock held. Another CPU might have done the
 	 * update already.
 	 */
 	if (ktime_before(now, tick_next_period)) {
@@ -114,25 +114,23 @@ static void tick_do_update_jiffies64(ktime_t now)
 						   TICK_NSEC);
 	}
 
-	/* Advance jiffies to complete the jiffies_seq protected job */
+	/* Advance jiffies to complete the 'jiffies_seq' protected job */
 	jiffies_64 += ticks;
 
-	/*
-	 * Keep the tick_next_period variable up to date.
-	 */
+	/* Keep the tick_next_period variable up to date */
 	nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC);
 
 	if (IS_ENABLED(CONFIG_64BIT)) {
 		/*
 		 * Pairs with smp_load_acquire() in the lockless quick
-		 * check above and ensures that the update to jiffies_64 is
-		 * not reordered vs. the store to tick_next_period, neither
+		 * check above, and ensures that the update to 'jiffies_64' is
+		 * not reordered vs. the store to 'tick_next_period', neither
 		 * by the compiler nor by the CPU.
 		 */
 		smp_store_release(&tick_next_period, nextp);
 	} else {
 		/*
-		 * A plain store is good enough on 32bit as the quick check
+		 * A plain store is good enough on 32-bit, as the quick check
 		 * above is protected by the sequence count.
 		 */
 		tick_next_period = nextp;
@@ -140,7 +138,7 @@ static void tick_do_update_jiffies64(ktime_t now)
 
 	/*
 	 * Release the sequence count. calc_global_load() below is not
-	 * protected by it, but jiffies_lock needs to be held to prevent
+	 * protected by it, but 'jiffies_lock' needs to be held to prevent
 	 * concurrent invocations.
 	 */
 	write_seqcount_end(&jiffies_seq);
@@ -160,7 +158,8 @@ static ktime_t tick_init_jiffy_update(void)
 
 	raw_spin_lock(&jiffies_lock);
 	write_seqcount_begin(&jiffies_seq);
-	/* Did we start the jiffies update yet ? */
+
+	/* Have we started the jiffies update yet ? */
 	if (last_jiffies_update == 0) {
 		u32 rem;
 
@@ -175,8 +174,10 @@ static ktime_t tick_init_jiffy_update(void)
 		last_jiffies_update = tick_next_period;
 	}
 	period = last_jiffies_update;
+
 	write_seqcount_end(&jiffies_seq);
 	raw_spin_unlock(&jiffies_lock);
+
 	return period;
 }
 
@@ -192,10 +193,10 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
 	 * concurrency: This happens only when the CPU in charge went
 	 * into a long sleep. If two CPUs happen to assign themselves to
 	 * this duty, then the jiffies update is still serialized by
-	 * jiffies_lock.
+	 * 'jiffies_lock'.
 	 *
 	 * If nohz_full is enabled, this should not happen because the
-	 * tick_do_timer_cpu never relinquishes.
+	 * 'tick_do_timer_cpu' CPU never relinquishes.
 	 */
 	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) {
 #ifdef CONFIG_NO_HZ_FULL
@@ -205,12 +206,12 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
 	}
 #endif
 
-	/* Check, if the jiffies need an update */
+	/* Check if jiffies need an update */
 	if (tick_do_timer_cpu == cpu)
 		tick_do_update_jiffies64(now);
 
 	/*
-	 * If jiffies update stalled for too long (timekeeper in stop_machine()
+	 * If the jiffies update stalled for too long (timekeeper in stop_machine()
 	 * or VMEXIT'ed for several msecs), force an update.
 	 */
 	if (ts->last_tick_jiffies != jiffies) {
@@ -234,10 +235,10 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
 	/*
 	 * When we are idle and the tick is stopped, we have to touch
 	 * the watchdog as we might not schedule for a really long
-	 * time. This happens on complete idle SMP systems while
+	 * time. This happens on completely idle SMP systems while
 	 * waiting on the login prompt. We also increment the "start of
 	 * idle" jiffy stamp so the idle accounting adjustment we do
-	 * when we go busy again does not account too much ticks.
+	 * when we go busy again does not account too many ticks.
 	 */
 	if (ts->tick_stopped) {
 		touch_softlockup_watchdog_sched();
@@ -362,7 +363,7 @@ static void tick_nohz_kick_task(struct task_struct *tsk)
 
 	/*
 	 * If the task is not running, run_posix_cpu_timers()
-	 * has nothing to elapse, IPI can then be spared.
+	 * has nothing to elapse, and an IPI can then be optimized out.
 	 *
 	 * activate_task()                      STORE p->tick_dep_mask
 	 *   STORE p->on_rq
@@ -425,7 +426,7 @@ static void tick_nohz_dep_set_all(atomic_t *dep,
 
 /*
  * Set a global tick dependency. Used by perf events that rely on freq and
- * by unstable clock.
+ * by unstable clocks.
  */
 void tick_nohz_dep_set(enum tick_dep_bits bit)
 {
@@ -439,7 +440,7 @@ void tick_nohz_dep_clear(enum tick_dep_bits bit)
 
 /*
  * Set per-CPU tick dependency. Used by scheduler and perf events in order to
- * manage events throttling.
+ * manage event-throttling.
  */
 void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
 {
@@ -455,7 +456,7 @@ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
 		if (cpu == smp_processor_id()) {
 			tick_nohz_full_kick();
 		} else {
-			/* Remote irq work not NMI-safe */
+			/* Remote IRQ work not NMI-safe */
 			if (!WARN_ON_ONCE(in_nmi()))
 				tick_nohz_full_kick_cpu(cpu);
 		}
@@ -473,7 +474,7 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
 EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);
 
 /*
- * Set a per-task tick dependency. RCU need this. Also posix CPU timers
+ * Set a per-task tick dependency. RCU needs this. Also posix CPU timers
  * in order to elapse per task timers.
  */
 void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
@@ -546,7 +547,7 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)
 bool tick_nohz_cpu_hotpluggable(unsigned int cpu)
 {
 	/*
-	 * The tick_do_timer_cpu CPU handles housekeeping duty (unbound
+	 * The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound
 	 * timers, workqueues, timekeeping, ...) on behalf of full dynticks
 	 * CPUs. It must remain online when nohz full is enabled.
 	 */
@@ -568,12 +569,12 @@ void __init tick_nohz_init(void)
 		return;
 
 	/*
-	 * Full dynticks uses irq work to drive the tick rescheduling on safe
-	 * locking contexts. But then we need irq work to raise its own
-	 * interrupts to avoid circular dependency on the tick
+	 * Full dynticks uses IRQ work to drive the tick rescheduling on safe
+	 * locking contexts. But then we need IRQ work to raise its own
+	 * interrupts to avoid circular dependency on the tick.
 	 */
 	if (!arch_irq_work_has_interrupt()) {
-		pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n");
+		pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n");
 		cpumask_clear(tick_nohz_full_mask);
 		tick_nohz_full_running = false;
 		return;
@@ -643,7 +644,7 @@ bool tick_nohz_tick_stopped_cpu(int cpu)
  * In case the sched_tick was stopped on this CPU, we have to check if jiffies
  * must be updated. Otherwise an interrupt handler could use a stale jiffy
  * value. We do this unconditionally on any CPU, as we don't know whether the
- * CPU, which has the update task assigned is in a long sleep.
+ * CPU, which has the update task assigned, is in a long sleep.
  */
 static void tick_nohz_update_jiffies(ktime_t now)
 {
@@ -726,7 +727,7 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
  * counters if NULL.
  *
  * Return the cumulative idle time (since boot) for a given
- * CPU, in microseconds. Note this is partially broken due to
+ * CPU, in microseconds. Note that this is partially broken due to
  * the counter of iowait tasks that can be remotely updated without
  * any synchronization. Therefore it is possible to observe backward
  * values within two consecutive reads.
@@ -787,7 +788,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 	}
 
 	/*
-	 * Reset to make sure next tick stop doesn't get fooled by past
+	 * Reset to make sure the next tick stop doesn't get fooled by past
 	 * cached clock deadline.
 	 */
 	ts->next_tick = 0;
@@ -816,11 +817,11 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
 	/*
 	 * Keep the periodic tick, when RCU, architecture or irq_work
 	 * requests it.
-	 * Aside of that check whether the local timer softirq is
-	 * pending. If so its a bad idea to call get_next_timer_interrupt()
+	 * Aside from that, check whether the local timer softirq is
+	 * pending. If so, it's a bad idea to call get_next_timer_interrupt(),
 	 * because there is an already expired timer, so it will request
 	 * immediate expiry, which rearms the hardware timer with a
-	 * minimal delta which brings us back to this place
+	 * minimal delta, which brings us back to this place
 	 * immediately. Lather, rinse and repeat...
 	 */
 	if (rcu_needs_cpu() || arch_needs_cpu() ||
@@ -861,7 +862,7 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
 
 	/*
 	 * If this CPU is the one which had the do_timer() duty last, we limit
-	 * the sleep time to the timekeeping max_deferment value.
+	 * the sleep time to the timekeeping 'max_deferment' value.
 	 * Otherwise we can sleep as long as we want.
 	 */
 	delta = timekeeping_max_deferment();
@@ -895,8 +896,8 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
 	 * If this CPU is the one which updates jiffies, then give up
 	 * the assignment and let it be taken by the CPU which runs
 	 * the tick timer next, which might be this CPU as well. If we
-	 * don't drop this here the jiffies might be stale and
-	 * do_timer() never invoked. Keep track of the fact that it
+	 * don't drop this here, the jiffies might be stale and
+	 * do_timer() never gets invoked. Keep track of the fact that it
 	 * was the one which had the do_timer() duty last.
 	 */
 	if (cpu == tick_do_timer_cpu) {
@@ -906,7 +907,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
 		ts->do_timer_last = 0;
 	}
 
-	/* Skip reprogram of event if its not changed */
+	/* Skip reprogram of event if it's not changed */
 	if (ts->tick_stopped && (expires == ts->next_tick)) {
 		/* Sanity check: make sure clockevent is actually programmed */
 		if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
@@ -919,11 +920,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
 	}
 
 	/*
-	 * nohz_stop_sched_tick can be called several times before
-	 * the nohz_restart_sched_tick is called. This happens when
+	 * nohz_stop_sched_tick() can be called several times before
+	 * nohz_restart_sched_tick() is called. This happens when
 	 * interrupts arrive which do not cause a reschedule. In the
 	 * first call we save the current tick time, so we can restart
-	 * the scheduler tick in nohz_restart_sched_tick.
+	 * the scheduler tick in nohz_restart_sched_tick().
 	 */
 	if (!ts->tick_stopped) {
 		calc_load_nohz_start();
@@ -985,9 +986,8 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
 
 	calc_load_nohz_stop();
 	touch_softlockup_watchdog_sched();
-	/*
-	 * Cancel the scheduled timer and restore the tick
-	 */
+
+	/* Cancel the scheduled timer and restore the tick: */
 	ts->tick_stopped  = 0;
 	tick_nohz_restart(ts, now);
 }
@@ -1019,11 +1019,11 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
 /*
  * A pending softirq outside an IRQ (or softirq disabled section) context
  * should be waiting for ksoftirqd to handle it. Therefore we shouldn't
- * reach here due to the need_resched() early check in can_stop_idle_tick().
+ * reach this code due to the need_resched() early check in can_stop_idle_tick().
  *
  * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the
  * cpu_down() process, softirqs can still be raised while ksoftirqd is parked,
- * triggering the below since wakep_softirqd() is ignored.
+ * triggering the code below, since wakep_softirqd() is ignored.
  *
  */
 static bool report_idle_softirq(void)
@@ -1044,7 +1044,7 @@ static bool report_idle_softirq(void)
 	if (ratelimit >= 10)
 		return false;
 
-	/* On RT, softirqs handling may be waiting on some lock */
+	/* On RT, softirq handling may be waiting on some lock */
 	if (local_bh_blocked())
 		return false;
 
@@ -1061,8 +1061,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 	 * If this CPU is offline and it is the one which updates
 	 * jiffies, then give up the assignment and let it be taken by
 	 * the CPU which runs the tick timer next. If we don't drop
-	 * this here the jiffies might be stale and do_timer() never
-	 * invoked.
+	 * this here, the jiffies might be stale and do_timer() never
+	 * gets invoked.
 	 */
 	if (unlikely(!cpu_online(cpu))) {
 		if (cpu == tick_do_timer_cpu)
@@ -1175,12 +1175,23 @@ void tick_nohz_idle_enter(void)
 }
 
 /**
- * tick_nohz_irq_exit - update next tick event from interrupt exit
+ * tick_nohz_irq_exit - Notify the tick about IRQ exit
+ *
+ * A timer may have been added/modified/deleted either by the current IRQ,
+ * or by another place using this IRQ as a notification. This IRQ may have
+ * also updated the RCU callback list. These events may require a
+ * re-evaluation of the next tick. Depending on the context:
+ *
+ * 1) If the CPU is idle and no resched is pending, just proceed with idle
+ *    time accounting. The next tick will be re-evaluated on the next idle
+ *    loop iteration.
+ *
+ * 2) If the CPU is nohz_full:
  *
- * When an interrupt fires while we are idle and it doesn't cause
- * a reschedule, it may still add, modify or delete a timer, enqueue
- * an RCU callback, etc...
- * So we need to re-calculate and reprogram the next tick event.
+ *    2.1) If there is any tick dependency, restart the tick if stopped.
+ *
+ *    2.2) If there is no tick dependency, (re-)evaluate the next tick and
+ *         stop/update it accordingly.
  */
 void tick_nohz_irq_exit(void)
 {
@@ -1208,7 +1219,7 @@ bool tick_nohz_idle_got_tick(void)
 
 /**
  * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer
- * or the tick, whatever that expires first. Note that, if the tick has been
+ * or the tick, whichever expires first. Note that, if the tick has been
  * stopped, it returns the next hrtimer.
  *
  * Called from power state control code with interrupts disabled
@@ -1252,7 +1263,7 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
 		return *delta_next;
 
 	/*
-	 * If the next highres timer to expire is earlier than next_event, the
+	 * If the next highres timer to expire is earlier than 'next_event', the
 	 * idle governor needs to know that.
 	 */
 	next_event = min_t(u64, next_event,
@@ -1296,9 +1307,9 @@ static void tick_nohz_account_idle_time(struct tick_sched *ts,
 	if (vtime_accounting_enabled_this_cpu())
 		return;
 	/*
-	 * We stopped the tick in idle. Update process times would miss the
-	 * time we slept as update_process_times does only a 1 tick
-	 * accounting. Enforce that this is accounted to idle !
+	 * We stopped the tick in idle. update_process_times() would miss the
+	 * time we slept, as it does only a 1 tick accounting.
+	 * Enforce that this is accounted to idle !
 	 */
 	ticks = jiffies - ts->idle_jiffies;
 	/*
@@ -1330,11 +1341,20 @@ static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
 }
 
 /**
- * tick_nohz_idle_exit - restart the idle tick from the idle task
+ * tick_nohz_idle_exit - Update the tick upon idle task exit
+ *
+ * When the idle task exits, update the tick depending on the
+ * following situations:
+ *
+ * 1) If the CPU is not in nohz_full mode (most cases), then
+ *    restart the tick.
+ *
+ * 2) If the CPU is in nohz_full mode (corner case):
+ *   2.1) If the tick can be kept stopped (no tick dependencies)
+ *        then re-evaluate the next tick and try to keep it stopped
+ *        as long as possible.
+ *   2.2) If the tick has dependencies, restart the tick.
  *
- * Restart the idle tick when the CPU is woken up from idle
- * This also exit the RCU extended quiescent state. The CPU
- * can use RCU again after this function is called.
  */
 void tick_nohz_idle_exit(void)
 {
@@ -1364,9 +1384,15 @@ void tick_nohz_idle_exit(void)
 }
 
 /*
- * The nohz low res interrupt handler
+ * In low-resolution mode, the tick handler must be implemented directly
+ * at the clockevent level. hrtimer can't be used instead, because its
+ * infrastructure actually relies on the tick itself as a backend in
+ * low-resolution mode (see hrtimer_run_queues()).
+ *
+ * This low-resolution handler nevertheless makes use of some hrtimer APIs,
+ * for convenience with expiration calculation and forwarding.
  */
-static void tick_nohz_handler(struct clock_event_device *dev)
+static void tick_nohz_lowres_handler(struct clock_event_device *dev)
 {
 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
 	struct pt_regs *regs = get_irq_regs();
@@ -1377,18 +1403,16 @@ static void tick_nohz_handler(struct clock_event_device *dev)
 	tick_sched_do_timer(ts, now);
 	tick_sched_handle(ts, regs);
 
-	if (unlikely(ts->tick_stopped)) {
-		/*
-		 * The clockevent device is not reprogrammed, so change the
-		 * clock event device to ONESHOT_STOPPED to avoid spurious
-		 * interrupts on devices which might not be truly one shot.
-		 */
-		tick_program_event(KTIME_MAX, 1);
-		return;
+	/*
+	 * In dynticks mode, tick reprogram is deferred:
+	 * - to the idle task if in dynticks-idle
+	 * - to IRQ exit if in full-dynticks.
+	 */
+	if (likely(!ts->tick_stopped)) {
+		hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
+		tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
 	}
 
-	hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
-	tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
 }
 
 static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
@@ -1402,7 +1426,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
 }
 
 /**
- * tick_nohz_switch_to_nohz - switch to nohz mode
+ * tick_nohz_switch_to_nohz - switch to NOHZ mode
  */
 static void tick_nohz_switch_to_nohz(void)
 {
@@ -1412,12 +1436,12 @@ static void tick_nohz_switch_to_nohz(void)
 	if (!tick_nohz_enabled)
 		return;
 
-	if (tick_switch_to_oneshot(tick_nohz_handler))
+	if (tick_switch_to_oneshot(tick_nohz_lowres_handler))
 		return;
 
 	/*
-	 * Recycle the hrtimer in ts, so we can share the
-	 * hrtimer_forward with the highres code.
+	 * Recycle the hrtimer in 'ts', so we can share the
+	 * hrtimer_forward_now() function with the highres code.
 	 */
 	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
 	/* Get the next period */
@@ -1440,7 +1464,7 @@ static inline void tick_nohz_irq_enter(void)
 	if (ts->idle_active)
 		tick_nohz_stop_idle(ts, now);
 	/*
-	 * If all CPUs are idle. We may need to update a stale jiffies value.
+	 * If all CPUs are idle we may need to update a stale jiffies value.
 	 * Note nohz_full is a special case: a timekeeper is guaranteed to stay
 	 * alive but it might be busy looping with interrupts disabled in some
 	 * rare case (typically stop machine). So we must make sure we have a
@@ -1459,7 +1483,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { }
 #endif /* CONFIG_NO_HZ_COMMON */
 
 /*
- * Called from irq_enter to notify about the possible interruption of idle()
+ * Called from irq_enter() to notify about the possible interruption of idle()
  */
 void tick_irq_enter(void)
 {
@@ -1475,7 +1499,7 @@ void tick_irq_enter(void)
  * We rearm the timer until we get disabled by the idle code.
  * Called with interrupts disabled.
  */
-static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
+static enum hrtimer_restart tick_nohz_highres_handler(struct hrtimer *timer)
 {
 	struct tick_sched *ts =
 		container_of(timer, struct tick_sched, sched_timer);
@@ -1485,15 +1509,19 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
 	tick_sched_do_timer(ts, now);
 
 	/*
-	 * Do not call, when we are not in irq context and have
-	 * no valid regs pointer
+	 * Do not call when we are not in IRQ context and have
+	 * no valid 'regs' pointer
 	 */
 	if (regs)
 		tick_sched_handle(ts, regs);
 	else
 		ts->next_tick = 0;
 
-	/* No need to reprogram if we are in idle or full dynticks mode */
+	/*
+	 * In dynticks mode, tick reprogram is deferred:
+	 * - to the idle task if in dynticks-idle
+	 * - to IRQ exit if in full-dynticks.
+	 */
 	if (unlikely(ts->tick_stopped))
 		return HRTIMER_NORESTART;
 
@@ -1520,16 +1548,14 @@ void tick_setup_sched_timer(void)
 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
 	ktime_t now = ktime_get();
 
-	/*
-	 * Emulate tick processing via per-CPU hrtimers:
-	 */
+	/* Emulate tick processing via per-CPU hrtimers: */
 	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
-	ts->sched_timer.function = tick_sched_timer;
+	ts->sched_timer.function = tick_nohz_highres_handler;
 
 	/* Get the next period (per-CPU) */
 	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
 
-	/* Offset the tick to avert jiffies_lock contention. */
+	/* Offset the tick to avert 'jiffies_lock' contention. */
 	if (sched_skew_tick) {
 		u64 offset = TICK_NSEC >> 1;
 		do_div(offset, num_possible_cpus());
@@ -1579,10 +1605,10 @@ void tick_oneshot_notify(void)
 }
 
 /*
- * Check, if a change happened, which makes oneshot possible.
+ * Check if a change happened, which makes oneshot possible.
  *
- * Called cyclic from the hrtimer softirq (driven by the timer
- * softirq) allow_nohz signals, that we can switch into low-res nohz
+ * Called cyclically from the hrtimer softirq (driven by the timer
+ * softirq). 'allow_nohz' signals that we can switch into low-res NOHZ
  * mode, because high resolution timers are disabled (either compile
  * or runtime). Called with interrupts disabled.
  */
diff --git a/kernel/torture.c b/kernel/torture.c
index b28b05bbef02..c72ab2d251f4 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -87,14 +87,15 @@ EXPORT_SYMBOL_GPL(verbose_torout_sleep);
  * nanosecond random fuzz.  This function and its friends desynchronize
  * testing from the timer wheel.
  */
-int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_state *trsp)
+int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, const enum hrtimer_mode mode,
+			 struct torture_random_state *trsp)
 {
 	ktime_t hto = baset_ns;
 
 	if (trsp)
 		hto += torture_random(trsp) % fuzzt_ns;
 	set_current_state(TASK_IDLE);
-	return schedule_hrtimeout(&hto, HRTIMER_MODE_REL);
+	return schedule_hrtimeout(&hto, mode);
 }
 EXPORT_SYMBOL_GPL(torture_hrtimeout_ns);
 
@@ -106,7 +107,7 @@ int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state
 {
 	ktime_t baset_ns = baset_us * NSEC_PER_USEC;
 
-	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp);
+	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp);
 }
 EXPORT_SYMBOL_GPL(torture_hrtimeout_us);
 
@@ -123,7 +124,7 @@ int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state
 		fuzzt_ns = (u32)~0U;
 	else
 		fuzzt_ns = fuzzt_us * NSEC_PER_USEC;
-	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp);
+	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp);
 }
 EXPORT_SYMBOL_GPL(torture_hrtimeout_ms);
 
@@ -136,7 +137,7 @@ int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp)
 {
 	ktime_t baset_ns = jiffies_to_nsecs(baset_j);
 
-	return torture_hrtimeout_ns(baset_ns, jiffies_to_nsecs(1), trsp);
+	return torture_hrtimeout_ns(baset_ns, jiffies_to_nsecs(1), HRTIMER_MODE_REL, trsp);
 }
 EXPORT_SYMBOL_GPL(torture_hrtimeout_jiffies);
 
@@ -153,7 +154,7 @@ int torture_hrtimeout_s(u32 baset_s, u32 fuzzt_ms, struct torture_random_state *
 		fuzzt_ns = (u32)~0U;
 	else
 		fuzzt_ns = fuzzt_ms * NSEC_PER_MSEC;
-	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp);
+	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp);
 }
 EXPORT_SYMBOL_GPL(torture_hrtimeout_s);
 
@@ -520,9 +521,8 @@ static void torture_shuffle_task_unregister_all(void)
  * A special case is when shuffle_idle_cpu = -1, in which case we allow
  * the tasks to run on all CPUs.
  */
-static void torture_shuffle_tasks(void)
+static void torture_shuffle_tasks(struct torture_random_state *trp)
 {
-	DEFINE_TORTURE_RANDOM(rand);
 	struct shuffle_task *stp;
 
 	cpumask_setall(shuffle_tmp_mask);
@@ -543,7 +543,7 @@ static void torture_shuffle_tasks(void)
 
 	mutex_lock(&shuffle_task_mutex);
 	list_for_each_entry(stp, &shuffle_task_list, st_l) {
-		if (!random_shuffle || torture_random(&rand) & 0x1)
+		if (!random_shuffle || torture_random(trp) & 0x1)
 			set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask);
 	}
 	mutex_unlock(&shuffle_task_mutex);
@@ -562,7 +562,7 @@ static int torture_shuffle(void *arg)
 	VERBOSE_TOROUT_STRING("torture_shuffle task started");
 	do {
 		torture_hrtimeout_jiffies(shuffle_interval, &rand);
-		torture_shuffle_tasks();
+		torture_shuffle_tasks(&rand);
 		torture_shutdown_absorb("torture_shuffle");
 	} while (!torture_must_stop());
 	torture_kthread_stopping("torture_shuffle");
@@ -673,7 +673,7 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void))
 	if (ssecs > 0) {
 		shutdown_time = ktime_add(ktime_get(), ktime_set(ssecs, 0));
 		return torture_create_kthread(torture_shutdown, NULL,
-					     shutdown_task);
+					      shutdown_task);
 	}
 	return 0;
 }
@@ -720,7 +720,7 @@ static void torture_shutdown_cleanup(void)
  * suddenly applied to or removed from the system.
  */
 static struct task_struct *stutter_task;
-static int stutter_pause_test;
+static ktime_t stutter_till_abs_time;
 static int stutter;
 static int stutter_gap;
 
@@ -730,30 +730,16 @@ static int stutter_gap;
  */
 bool stutter_wait(const char *title)
 {
-	unsigned int i = 0;
 	bool ret = false;
-	int spt;
+	ktime_t till_ns;
 
 	cond_resched_tasks_rcu_qs();
-	spt = READ_ONCE(stutter_pause_test);
-	for (; spt; spt = READ_ONCE(stutter_pause_test)) {
-		if (!ret && !rt_task(current)) {
-			sched_set_normal(current, MAX_NICE);
-			ret = true;
-		}
-		if (spt == 1) {
-			torture_hrtimeout_jiffies(1, NULL);
-		} else if (spt == 2) {
-			while (READ_ONCE(stutter_pause_test)) {
-				if (!(i++ & 0xffff))
-					torture_hrtimeout_us(10, 0, NULL);
-				cond_resched();
-			}
-		} else {
-			torture_hrtimeout_jiffies(round_jiffies_relative(HZ), NULL);
-		}
-		torture_shutdown_absorb(title);
+	till_ns = READ_ONCE(stutter_till_abs_time);
+	if (till_ns && ktime_before(ktime_get(), till_ns)) {
+		torture_hrtimeout_ns(till_ns, 0, HRTIMER_MODE_ABS, NULL);
+		ret = true;
 	}
+	torture_shutdown_absorb(title);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(stutter_wait);
@@ -764,23 +750,16 @@ EXPORT_SYMBOL_GPL(stutter_wait);
  */
 static int torture_stutter(void *arg)
 {
-	DEFINE_TORTURE_RANDOM(rand);
-	int wtime;
+	ktime_t till_ns;
 
 	VERBOSE_TOROUT_STRING("torture_stutter task started");
 	do {
 		if (!torture_must_stop() && stutter > 1) {
-			wtime = stutter;
-			if (stutter > 2) {
-				WRITE_ONCE(stutter_pause_test, 1);
-				wtime = stutter - 3;
-				torture_hrtimeout_jiffies(wtime, &rand);
-				wtime = 2;
-			}
-			WRITE_ONCE(stutter_pause_test, 2);
-			torture_hrtimeout_jiffies(wtime, NULL);
+			till_ns = ktime_add_ns(ktime_get(),
+					       jiffies_to_nsecs(stutter));
+			WRITE_ONCE(stutter_till_abs_time, till_ns);
+			torture_hrtimeout_jiffies(stutter - 1, NULL);
 		}
-		WRITE_ONCE(stutter_pause_test, 0);
 		if (!torture_must_stop())
 			torture_hrtimeout_jiffies(stutter_gap, NULL);
 		torture_shutdown_absorb("torture_stutter");
@@ -812,6 +791,13 @@ static void torture_stutter_cleanup(void)
 	stutter_task = NULL;
 }
 
+static void
+torture_print_module_parms(void)
+{
+	pr_alert("torture module --- %s:  disable_onoff_at_boot=%d ftrace_dump_at_shutdown=%d verbose_sleep_frequency=%d verbose_sleep_duration=%d random_shuffle=%d\n",
+		 torture_type, disable_onoff_at_boot, ftrace_dump_at_shutdown, verbose_sleep_frequency, verbose_sleep_duration, random_shuffle);
+}
+
 /*
  * Initialize torture module.  Please note that this is -not- invoked via
  * the usual module_init() mechanism, but rather by an explicit call from
@@ -834,6 +820,7 @@ bool torture_init_begin(char *ttype, int v)
 	torture_type = ttype;
 	verbose = v;
 	fullstop = FULLSTOP_DONTSTOP;
+	torture_print_module_parms();
 	return true;
 }
 EXPORT_SYMBOL_GPL(torture_init_begin);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index effcaede4759..a3442db35670 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -714,14 +714,30 @@ static int count_symbols(void *data, unsigned long unused)
 	return 0;
 }
 
+struct sym_count_ctx {
+	unsigned int count;
+	const char *name;
+};
+
+static int count_mod_symbols(void *data, const char *name, unsigned long unused)
+{
+	struct sym_count_ctx *ctx = data;
+
+	if (strcmp(name, ctx->name) == 0)
+		ctx->count++;
+
+	return 0;
+}
+
 static unsigned int number_of_same_symbols(char *func_name)
 {
-	unsigned int count;
+	struct sym_count_ctx ctx = { .count = 0, .name = func_name };
+
+	kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count);
 
-	count = 0;
-	kallsyms_on_each_match_symbol(count_symbols, func_name, &count);
+	module_kallsyms_on_each_symbol(NULL, count_mod_symbols, &ctx);
 
-	return count;
+	return ctx.count;
 }
 
 static int __trace_kprobe_create(int argc, const char *argv[])
@@ -1007,7 +1023,7 @@ EXPORT_SYMBOL_GPL(kprobe_event_cmd_init);
  * @name: The name of the kprobe event
  * @loc: The location of the kprobe event
  * @kretprobe: Is this a return probe?
- * @args: Variable number of arg (pairs), one pair for each field
+ * @...: Variable number of arg (pairs), one pair for each field
  *
  * NOTE: Users normally won't want to call this function directly, but
  * rather use the kprobe_event_gen_cmd_start() wrapper, which automatically
@@ -1080,7 +1096,7 @@ EXPORT_SYMBOL_GPL(__kprobe_event_gen_cmd_start);
 /**
  * __kprobe_event_add_fields - Add probe fields to a kprobe command from arg list
  * @cmd: A pointer to the dynevent_cmd struct representing the new event
- * @args: Variable number of arg (pairs), one pair for each field
+ * @...: Variable number of arg (pairs), one pair for each field
  *
  * NOTE: Users normally won't want to call this function directly, but
  * rather use the kprobe_event_add_fields() wrapper, which
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index db575094c498..d8b302d01083 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -404,7 +404,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
 			vmstart = vma->vm_start;
 		}
 		if (file) {
-			ret = trace_seq_path(s, &file->f_path);
+			ret = trace_seq_path(s, file_user_path(file));
 			if (ret)
 				trace_seq_printf(s, "[+0x%lx]",
 						 ip - vmstart);
diff --git a/kernel/up.c b/kernel/up.c
index a38b8b095251..df50828cc2f0 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -25,7 +25,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 }
 EXPORT_SYMBOL(smp_call_function_single);
 
-int smp_call_function_single_async(int cpu, struct __call_single_data *csd)
+int smp_call_function_single_async(int cpu, call_single_data_t *csd)
 {
 	unsigned long flags;
 
diff --git a/kernel/user.c b/kernel/user.c
index d667debeafd6..03cedc366dc9 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -18,8 +18,18 @@
 #include <linux/interrupt.h>
 #include <linux/export.h>
 #include <linux/user_namespace.h>
+#include <linux/binfmts.h>
 #include <linux/proc_ns.h>
 
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+struct binfmt_misc init_binfmt_misc = {
+	.entries = LIST_HEAD_INIT(init_binfmt_misc.entries),
+	.enabled = true,
+	.entries_lock = __RW_LOCK_UNLOCKED(init_binfmt_misc.entries_lock),
+};
+EXPORT_SYMBOL_GPL(init_binfmt_misc);
+#endif
+
 /*
  * userns count is 1 for root user, 1 for init_uts_ns,
  * and 1 for... ?
@@ -67,6 +77,9 @@ struct user_namespace init_user_ns = {
 	.keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list),
 	.keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem),
 #endif
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+	.binfmt_misc = &init_binfmt_misc,
+#endif
 };
 EXPORT_SYMBOL_GPL(init_user_ns);
 
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 1d8e47bed3f1..d52a894ecf57 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -213,6 +213,9 @@ static void free_user_ns(struct work_struct *work)
 			kfree(ns->projid_map.forward);
 			kfree(ns->projid_map.reverse);
 		}
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+		kfree(ns->binfmt_misc);
+#endif
 		retire_userns_sysctls(ns);
 		key_free_user_ns(ns);
 		ns_free_inum(&ns->ns);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a3522b70218d..0f682da96e1c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5622,50 +5622,54 @@ static void work_for_cpu_fn(struct work_struct *work)
 }
 
 /**
- * work_on_cpu - run a function in thread context on a particular cpu
+ * work_on_cpu_key - run a function in thread context on a particular cpu
  * @cpu: the cpu to run on
  * @fn: the function to run
  * @arg: the function arg
+ * @key: The lock class key for lock debugging purposes
  *
  * It is up to the caller to ensure that the cpu doesn't go offline.
  * The caller must not hold any locks which would prevent @fn from completing.
  *
  * Return: The value @fn returns.
  */
-long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
+long work_on_cpu_key(int cpu, long (*fn)(void *),
+		     void *arg, struct lock_class_key *key)
 {
 	struct work_for_cpu wfc = { .fn = fn, .arg = arg };
 
-	INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
+	INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key);
 	schedule_work_on(cpu, &wfc.work);
 	flush_work(&wfc.work);
 	destroy_work_on_stack(&wfc.work);
 	return wfc.ret;
 }
-EXPORT_SYMBOL_GPL(work_on_cpu);
+EXPORT_SYMBOL_GPL(work_on_cpu_key);
 
 /**
- * work_on_cpu_safe - run a function in thread context on a particular cpu
+ * work_on_cpu_safe_key - run a function in thread context on a particular cpu
  * @cpu: the cpu to run on
  * @fn:  the function to run
  * @arg: the function argument
+ * @key: The lock class key for lock debugging purposes
  *
  * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
  * any locks which would prevent @fn from completing.
  *
  * Return: The value @fn returns.
  */
-long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
+long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
+			  void *arg, struct lock_class_key *key)
 {
 	long ret = -ENODEV;
 
 	cpus_read_lock();
 	if (cpu_online(cpu))
-		ret = work_on_cpu(cpu, fn, arg);
+		ret = work_on_cpu_key(cpu, fn, arg, key);
 	cpus_read_unlock();
 	return ret;
 }
-EXPORT_SYMBOL_GPL(work_on_cpu_safe);
+EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FREEZER
diff --git a/lib/Kconfig b/lib/Kconfig
index c686f4adc124..2d90935d5a21 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -506,6 +506,9 @@ config ASSOCIATIVE_ARRAY
 
 	  for more information.
 
+config CLOSURES
+	bool
+
 config HAS_IOMEM
 	bool
 	depends on !NO_IOMEM
@@ -729,6 +732,11 @@ config PARMAN
 config OBJAGG
 	tristate "objagg" if COMPILE_TEST
 
+config LWQ_TEST
+	bool "Boot-time test for lwq queuing"
+	help
+          Run boot-time test of light-weight queuing.
+
 endmenu
 
 config GENERIC_IOREMAP
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index fa307f93fa2e..ce3a4abf40f8 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1720,6 +1720,15 @@ config DEBUG_NOTIFIERS
 	  This is a relatively cheap check but if you care about maximum
 	  performance, say N.
 
+config DEBUG_CLOSURES
+	bool "Debug closures (bcache async widgets)"
+	depends on CLOSURES
+	select DEBUG_FS
+	help
+	  Keeps all active closures in a linked list and provides a debugfs
+	  interface to list them, which makes it possible to see asynchronous
+	  operations that get stuck.
+
 config DEBUG_MAPLE_TREE
 	bool "Debug maple trees"
 	depends on DEBUG_KERNEL
diff --git a/lib/Makefile b/lib/Makefile
index 740109b6e2c8..1b311c7fd32b 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -45,7 +45,7 @@ obj-y	+= lockref.o
 obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \
 	 bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
 	 list_sort.o uuid.o iov_iter.o clz_ctz.o \
-	 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
+	 bsearch.o find_bit.o llist.o lwq.o memweight.o kfifo.o \
 	 percpu-refcount.o rhashtable.o base64.o \
 	 once.o refcount.o rcuref.o usercopy.o errseq.o bucket_locks.o \
 	 generic-radix-tree.o
@@ -255,6 +255,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o
 
 obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o
 
+obj-$(CONFIG_CLOSURES) += closure.o
+
 obj-$(CONFIG_DQL) += dynamic_queue_limits.o
 
 obj-$(CONFIG_GLOB) += glob.o
diff --git a/drivers/md/bcache/closure.c b/lib/closure.c
index d8d9394a6beb..0855e698ced1 100644
--- a/drivers/md/bcache/closure.c
+++ b/lib/closure.c
@@ -6,13 +6,13 @@
  * Copyright 2012 Google, Inc.
  */
 
+#include <linux/closure.h>
 #include <linux/debugfs.h>
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/rcupdate.h>
 #include <linux/seq_file.h>
 #include <linux/sched/debug.h>
 
-#include "closure.h"
-
 static inline void closure_put_after_sub(struct closure *cl, int flags)
 {
 	int r = flags & CLOSURE_REMAINING_MASK;
@@ -45,6 +45,7 @@ void closure_sub(struct closure *cl, int v)
 {
 	closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
 }
+EXPORT_SYMBOL(closure_sub);
 
 /*
  * closure_put - decrement a closure's refcount
@@ -53,6 +54,7 @@ void closure_put(struct closure *cl)
 {
 	closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
 }
+EXPORT_SYMBOL(closure_put);
 
 /*
  * closure_wake_up - wake up all closures on a wait list, without memory barrier
@@ -74,6 +76,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list)
 		closure_sub(cl, CLOSURE_WAITING + 1);
 	}
 }
+EXPORT_SYMBOL(__closure_wake_up);
 
 /**
  * closure_wait - add a closure to a waitlist
@@ -93,6 +96,7 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
 
 	return true;
 }
+EXPORT_SYMBOL(closure_wait);
 
 struct closure_syncer {
 	struct task_struct	*task;
@@ -127,8 +131,9 @@ void __sched __closure_sync(struct closure *cl)
 
 	__set_current_state(TASK_RUNNING);
 }
+EXPORT_SYMBOL(__closure_sync);
 
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+#ifdef CONFIG_DEBUG_CLOSURES
 
 static LIST_HEAD(closure_list);
 static DEFINE_SPINLOCK(closure_list_lock);
@@ -144,6 +149,7 @@ void closure_debug_create(struct closure *cl)
 	list_add(&cl->all, &closure_list);
 	spin_unlock_irqrestore(&closure_list_lock, flags);
 }
+EXPORT_SYMBOL(closure_debug_create);
 
 void closure_debug_destroy(struct closure *cl)
 {
@@ -156,8 +162,7 @@ void closure_debug_destroy(struct closure *cl)
 	list_del(&cl->all);
 	spin_unlock_irqrestore(&closure_list_lock, flags);
 }
-
-static struct dentry *closure_debug;
+EXPORT_SYMBOL(closure_debug_destroy);
 
 static int debug_show(struct seq_file *f, void *data)
 {
@@ -181,7 +186,7 @@ static int debug_show(struct seq_file *f, void *data)
 			seq_printf(f, " W %pS\n",
 				   (void *) cl->waiting_on);
 
-		seq_printf(f, "\n");
+		seq_puts(f, "\n");
 	}
 
 	spin_unlock_irq(&closure_list_lock);
@@ -190,18 +195,11 @@ static int debug_show(struct seq_file *f, void *data)
 
 DEFINE_SHOW_ATTRIBUTE(debug);
 
-void  __init closure_debug_init(void)
+static int __init closure_debug_init(void)
 {
-	if (!IS_ERR_OR_NULL(bcache_debug))
-		/*
-		 * it is unnecessary to check return value of
-		 * debugfs_create_file(), we should not care
-		 * about this.
-		 */
-		closure_debug = debugfs_create_file(
-			"closures", 0400, bcache_debug, NULL, &debug_fops);
+	debugfs_create_file("closures", 0400, NULL, NULL, &debug_fops);
+	return 0;
 }
-#endif
+late_initcall(closure_debug_init);
 
-MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>");
-MODULE_LICENSE("GPL");
+#endif
diff --git a/lib/cpumask.c b/lib/cpumask.c
index a7fd02b5ae26..34335c1e7265 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -146,9 +146,7 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
 	/* Wrap: we always want a cpu. */
 	i %= num_online_cpus();
 
-	cpu = (node == NUMA_NO_NODE) ?
-		cpumask_nth(i, cpu_online_mask) :
-		sched_numa_find_nth_cpu(cpu_online_mask, i, node);
+	cpu = sched_numa_find_nth_cpu(cpu_online_mask, i, node);
 
 	WARN_ON(cpu >= nr_cpu_ids);
 	return cpu;
diff --git a/lib/errname.c b/lib/errname.c
index 67739b174a8c..dd1b998552cd 100644
--- a/lib/errname.c
+++ b/lib/errname.c
@@ -228,3 +228,4 @@ const char *errname(int err)
 
 	return err > 0 ? name + 1 : name;
 }
+EXPORT_SYMBOL(errname);
diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c
index f25eb111c051..41f1bcdc4488 100644
--- a/lib/generic-radix-tree.c
+++ b/lib/generic-radix-tree.c
@@ -1,4 +1,5 @@
 
+#include <linux/atomic.h>
 #include <linux/export.h>
 #include <linux/generic-radix-tree.h>
 #include <linux/gfp.h>
@@ -166,6 +167,10 @@ void *__genradix_iter_peek(struct genradix_iter *iter,
 	struct genradix_root *r;
 	struct genradix_node *n;
 	unsigned level, i;
+
+	if (iter->offset == SIZE_MAX)
+		return NULL;
+
 restart:
 	r = READ_ONCE(radix->root);
 	if (!r)
@@ -184,10 +189,17 @@ restart:
 			(GENRADIX_ARY - 1);
 
 		while (!n->children[i]) {
+			size_t objs_per_ptr = genradix_depth_size(level);
+
+			if (iter->offset + objs_per_ptr < iter->offset) {
+				iter->offset	= SIZE_MAX;
+				iter->pos	= SIZE_MAX;
+				return NULL;
+			}
+
 			i++;
-			iter->offset = round_down(iter->offset +
-					   genradix_depth_size(level),
-					   genradix_depth_size(level));
+			iter->offset = round_down(iter->offset + objs_per_ptr,
+						  objs_per_ptr);
 			iter->pos = (iter->offset >> PAGE_SHIFT) *
 				objs_per_page;
 			if (i == GENRADIX_ARY)
@@ -201,6 +213,64 @@ restart:
 }
 EXPORT_SYMBOL(__genradix_iter_peek);
 
+void *__genradix_iter_peek_prev(struct genradix_iter *iter,
+				struct __genradix *radix,
+				size_t objs_per_page,
+				size_t obj_size_plus_page_remainder)
+{
+	struct genradix_root *r;
+	struct genradix_node *n;
+	unsigned level, i;
+
+	if (iter->offset == SIZE_MAX)
+		return NULL;
+
+restart:
+	r = READ_ONCE(radix->root);
+	if (!r)
+		return NULL;
+
+	n	= genradix_root_to_node(r);
+	level	= genradix_root_to_depth(r);
+
+	if (ilog2(iter->offset) >= genradix_depth_shift(level)) {
+		iter->offset = genradix_depth_size(level);
+		iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page;
+
+		iter->offset -= obj_size_plus_page_remainder;
+		iter->pos--;
+	}
+
+	while (level) {
+		level--;
+
+		i = (iter->offset >> genradix_depth_shift(level)) &
+			(GENRADIX_ARY - 1);
+
+		while (!n->children[i]) {
+			size_t objs_per_ptr = genradix_depth_size(level);
+
+			iter->offset = round_down(iter->offset, objs_per_ptr);
+			iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page;
+
+			if (!iter->offset)
+				return NULL;
+
+			iter->offset -= obj_size_plus_page_remainder;
+			iter->pos--;
+
+			if (!i)
+				goto restart;
+			--i;
+		}
+
+		n = n->children[i];
+	}
+
+	return &n->data[iter->offset & (PAGE_SIZE - 1)];
+}
+EXPORT_SYMBOL(__genradix_iter_peek_prev);
+
 static void genradix_free_recurse(struct genradix_node *n, unsigned level)
 {
 	if (level) {
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 27234a820eeb..de7d11cf4c63 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1,5 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0-only
-#include <crypto/hash.h>
 #include <linux/export.h>
 #include <linux/bvec.h>
 #include <linux/fault-inject-usercopy.h>
@@ -10,192 +9,71 @@
 #include <linux/vmalloc.h>
 #include <linux/splice.h>
 #include <linux/compat.h>
-#include <net/checksum.h>
 #include <linux/scatterlist.h>
 #include <linux/instrumented.h>
+#include <linux/iov_iter.h>
 
-/* covers ubuf and kbuf alike */
-#define iterate_buf(i, n, base, len, off, __p, STEP) {		\
-	size_t __maybe_unused off = 0;				\
-	len = n;						\
-	base = __p + i->iov_offset;				\
-	len -= (STEP);						\
-	i->iov_offset += len;					\
-	n = len;						\
-}
-
-/* covers iovec and kvec alike */
-#define iterate_iovec(i, n, base, len, off, __p, STEP) {	\
-	size_t off = 0;						\
-	size_t skip = i->iov_offset;				\
-	do {							\
-		len = min(n, __p->iov_len - skip);		\
-		if (likely(len)) {				\
-			base = __p->iov_base + skip;		\
-			len -= (STEP);				\
-			off += len;				\
-			skip += len;				\
-			n -= len;				\
-			if (skip < __p->iov_len)		\
-				break;				\
-		}						\
-		__p++;						\
-		skip = 0;					\
-	} while (n);						\
-	i->iov_offset = skip;					\
-	n = off;						\
-}
-
-#define iterate_bvec(i, n, base, len, off, p, STEP) {		\
-	size_t off = 0;						\
-	unsigned skip = i->iov_offset;				\
-	while (n) {						\
-		unsigned offset = p->bv_offset + skip;		\
-		unsigned left;					\
-		void *kaddr = kmap_local_page(p->bv_page +	\
-					offset / PAGE_SIZE);	\
-		base = kaddr + offset % PAGE_SIZE;		\
-		len = min(min(n, (size_t)(p->bv_len - skip)),	\
-		     (size_t)(PAGE_SIZE - offset % PAGE_SIZE));	\
-		left = (STEP);					\
-		kunmap_local(kaddr);				\
-		len -= left;					\
-		off += len;					\
-		skip += len;					\
-		if (skip == p->bv_len) {			\
-			skip = 0;				\
-			p++;					\
-		}						\
-		n -= len;					\
-		if (left)					\
-			break;					\
-	}							\
-	i->iov_offset = skip;					\
-	n = off;						\
-}
-
-#define iterate_xarray(i, n, base, len, __off, STEP) {		\
-	__label__ __out;					\
-	size_t __off = 0;					\
-	struct folio *folio;					\
-	loff_t start = i->xarray_start + i->iov_offset;		\
-	pgoff_t index = start / PAGE_SIZE;			\
-	XA_STATE(xas, i->xarray, index);			\
-								\
-	len = PAGE_SIZE - offset_in_page(start);		\
-	rcu_read_lock();					\
-	xas_for_each(&xas, folio, ULONG_MAX) {			\
-		unsigned left;					\
-		size_t offset;					\
-		if (xas_retry(&xas, folio))			\
-			continue;				\
-		if (WARN_ON(xa_is_value(folio)))		\
-			break;					\
-		if (WARN_ON(folio_test_hugetlb(folio)))		\
-			break;					\
-		offset = offset_in_folio(folio, start + __off);	\
-		while (offset < folio_size(folio)) {		\
-			base = kmap_local_folio(folio, offset);	\
-			len = min(n, len);			\
-			left = (STEP);				\
-			kunmap_local(base);			\
-			len -= left;				\
-			__off += len;				\
-			n -= len;				\
-			if (left || n == 0)			\
-				goto __out;			\
-			offset += len;				\
-			len = PAGE_SIZE;			\
-		}						\
-	}							\
-__out:								\
-	rcu_read_unlock();					\
-	i->iov_offset += __off;					\
-	n = __off;						\
-}
-
-#define __iterate_and_advance(i, n, base, len, off, I, K) {	\
-	if (unlikely(i->count < n))				\
-		n = i->count;					\
-	if (likely(n)) {					\
-		if (likely(iter_is_ubuf(i))) {			\
-			void __user *base;			\
-			size_t len;				\
-			iterate_buf(i, n, base, len, off,	\
-						i->ubuf, (I)) 	\
-		} else if (likely(iter_is_iovec(i))) {		\
-			const struct iovec *iov = iter_iov(i);	\
-			void __user *base;			\
-			size_t len;				\
-			iterate_iovec(i, n, base, len, off,	\
-						iov, (I))	\
-			i->nr_segs -= iov - iter_iov(i);	\
-			i->__iov = iov;				\
-		} else if (iov_iter_is_bvec(i)) {		\
-			const struct bio_vec *bvec = i->bvec;	\
-			void *base;				\
-			size_t len;				\
-			iterate_bvec(i, n, base, len, off,	\
-						bvec, (K))	\
-			i->nr_segs -= bvec - i->bvec;		\
-			i->bvec = bvec;				\
-		} else if (iov_iter_is_kvec(i)) {		\
-			const struct kvec *kvec = i->kvec;	\
-			void *base;				\
-			size_t len;				\
-			iterate_iovec(i, n, base, len, off,	\
-						kvec, (K))	\
-			i->nr_segs -= kvec - i->kvec;		\
-			i->kvec = kvec;				\
-		} else if (iov_iter_is_xarray(i)) {		\
-			void *base;				\
-			size_t len;				\
-			iterate_xarray(i, n, base, len, off,	\
-							(K))	\
-		}						\
-		i->count -= n;					\
-	}							\
-}
-#define iterate_and_advance(i, n, base, len, off, I, K) \
-	__iterate_and_advance(i, n, base, len, off, I, ((void)(K),0))
-
-static int copyout(void __user *to, const void *from, size_t n)
+static __always_inline
+size_t copy_to_user_iter(void __user *iter_to, size_t progress,
+			 size_t len, void *from, void *priv2)
 {
 	if (should_fail_usercopy())
-		return n;
-	if (access_ok(to, n)) {
-		instrument_copy_to_user(to, from, n);
-		n = raw_copy_to_user(to, from, n);
+		return len;
+	if (access_ok(iter_to, len)) {
+		from += progress;
+		instrument_copy_to_user(iter_to, from, len);
+		len = raw_copy_to_user(iter_to, from, len);
 	}
-	return n;
+	return len;
 }
 
-static int copyout_nofault(void __user *to, const void *from, size_t n)
+static __always_inline
+size_t copy_to_user_iter_nofault(void __user *iter_to, size_t progress,
+				 size_t len, void *from, void *priv2)
 {
-	long res;
+	ssize_t res;
 
 	if (should_fail_usercopy())
-		return n;
-
-	res = copy_to_user_nofault(to, from, n);
+		return len;
 
-	return res < 0 ? n : res;
+	from += progress;
+	res = copy_to_user_nofault(iter_to, from, len);
+	return res < 0 ? len : res;
 }
 
-static int copyin(void *to, const void __user *from, size_t n)
+static __always_inline
+size_t copy_from_user_iter(void __user *iter_from, size_t progress,
+			   size_t len, void *to, void *priv2)
 {
-	size_t res = n;
+	size_t res = len;
 
 	if (should_fail_usercopy())
-		return n;
-	if (access_ok(from, n)) {
-		instrument_copy_from_user_before(to, from, n);
-		res = raw_copy_from_user(to, from, n);
-		instrument_copy_from_user_after(to, from, n, res);
+		return len;
+	if (access_ok(iter_from, len)) {
+		to += progress;
+		instrument_copy_from_user_before(to, iter_from, len);
+		res = raw_copy_from_user(to, iter_from, len);
+		instrument_copy_from_user_after(to, iter_from, len, res);
 	}
 	return res;
 }
 
+static __always_inline
+size_t memcpy_to_iter(void *iter_to, size_t progress,
+		      size_t len, void *from, void *priv2)
+{
+	memcpy(iter_to, from + progress, len);
+	return 0;
+}
+
+static __always_inline
+size_t memcpy_from_iter(void *iter_from, size_t progress,
+			size_t len, void *to, void *priv2)
+{
+	memcpy(to + progress, iter_from, len);
+	return 0;
+}
+
 /*
  * fault_in_iov_iter_readable - fault in iov iterator for reading
  * @i: iterator
@@ -290,7 +168,6 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
 		.iter_type = ITER_IOVEC,
 		.copy_mc = false,
 		.nofault = false,
-		.user_backed = true,
 		.data_source = direction,
 		.__iov = iov,
 		.nr_segs = nr_segs,
@@ -300,36 +177,35 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
 }
 EXPORT_SYMBOL(iov_iter_init);
 
-static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
-			      __wsum sum, size_t off)
-{
-	__wsum next = csum_partial_copy_nocheck(from, to, len);
-	return csum_block_add(sum, next, off);
-}
-
 size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 {
 	if (WARN_ON_ONCE(i->data_source))
 		return 0;
 	if (user_backed_iter(i))
 		might_fault();
-	iterate_and_advance(i, bytes, base, len, off,
-		copyout(base, addr + off, len),
-		memcpy(base, addr + off, len)
-	)
-
-	return bytes;
+	return iterate_and_advance(i, bytes, (void *)addr,
+				   copy_to_user_iter, memcpy_to_iter);
 }
 EXPORT_SYMBOL(_copy_to_iter);
 
 #ifdef CONFIG_ARCH_HAS_COPY_MC
-static int copyout_mc(void __user *to, const void *from, size_t n)
-{
-	if (access_ok(to, n)) {
-		instrument_copy_to_user(to, from, n);
-		n = copy_mc_to_user((__force void *) to, from, n);
+static __always_inline
+size_t copy_to_user_iter_mc(void __user *iter_to, size_t progress,
+			    size_t len, void *from, void *priv2)
+{
+	if (access_ok(iter_to, len)) {
+		from += progress;
+		instrument_copy_to_user(iter_to, from, len);
+		len = copy_mc_to_user(iter_to, from, len);
 	}
-	return n;
+	return len;
+}
+
+static __always_inline
+size_t memcpy_to_iter_mc(void *iter_to, size_t progress,
+			 size_t len, void *from, void *priv2)
+{
+	return copy_mc_to_kernel(iter_to, from + progress, len);
 }
 
 /**
@@ -362,22 +238,35 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 		return 0;
 	if (user_backed_iter(i))
 		might_fault();
-	__iterate_and_advance(i, bytes, base, len, off,
-		copyout_mc(base, addr + off, len),
-		copy_mc_to_kernel(base, addr + off, len)
-	)
-
-	return bytes;
+	return iterate_and_advance(i, bytes, (void *)addr,
+				   copy_to_user_iter_mc, memcpy_to_iter_mc);
 }
 EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
 #endif /* CONFIG_ARCH_HAS_COPY_MC */
 
-static void *memcpy_from_iter(struct iov_iter *i, void *to, const void *from,
-				 size_t size)
+static __always_inline
+size_t memcpy_from_iter_mc(void *iter_from, size_t progress,
+			   size_t len, void *to, void *priv2)
+{
+	return copy_mc_to_kernel(to + progress, iter_from, len);
+}
+
+static size_t __copy_from_iter_mc(void *addr, size_t bytes, struct iov_iter *i)
+{
+	if (unlikely(i->count < bytes))
+		bytes = i->count;
+	if (unlikely(!bytes))
+		return 0;
+	return iterate_bvec(i, bytes, addr, NULL, memcpy_from_iter_mc);
+}
+
+static __always_inline
+size_t __copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 {
-	if (iov_iter_is_copy_mc(i))
-		return (void *)copy_mc_to_kernel(to, from, size);
-	return memcpy(to, from, size);
+	if (unlikely(iov_iter_is_copy_mc(i)))
+		return __copy_from_iter_mc(addr, bytes, i);
+	return iterate_and_advance(i, bytes, addr,
+				   copy_from_user_iter, memcpy_from_iter);
 }
 
 size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
@@ -387,30 +276,44 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 
 	if (user_backed_iter(i))
 		might_fault();
-	iterate_and_advance(i, bytes, base, len, off,
-		copyin(addr + off, base, len),
-		memcpy_from_iter(i, addr + off, base, len)
-	)
-
-	return bytes;
+	return __copy_from_iter(addr, bytes, i);
 }
 EXPORT_SYMBOL(_copy_from_iter);
 
+static __always_inline
+size_t copy_from_user_iter_nocache(void __user *iter_from, size_t progress,
+				   size_t len, void *to, void *priv2)
+{
+	return __copy_from_user_inatomic_nocache(to + progress, iter_from, len);
+}
+
 size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
 {
 	if (WARN_ON_ONCE(!i->data_source))
 		return 0;
 
-	iterate_and_advance(i, bytes, base, len, off,
-		__copy_from_user_inatomic_nocache(addr + off, base, len),
-		memcpy(addr + off, base, len)
-	)
-
-	return bytes;
+	return iterate_and_advance(i, bytes, addr,
+				   copy_from_user_iter_nocache,
+				   memcpy_from_iter);
 }
 EXPORT_SYMBOL(_copy_from_iter_nocache);
 
 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
+static __always_inline
+size_t copy_from_user_iter_flushcache(void __user *iter_from, size_t progress,
+				      size_t len, void *to, void *priv2)
+{
+	return __copy_from_user_flushcache(to + progress, iter_from, len);
+}
+
+static __always_inline
+size_t memcpy_from_iter_flushcache(void *iter_from, size_t progress,
+				   size_t len, void *to, void *priv2)
+{
+	memcpy_flushcache(to + progress, iter_from, len);
+	return 0;
+}
+
 /**
  * _copy_from_iter_flushcache - write destination through cpu cache
  * @addr: destination kernel address
@@ -432,12 +335,9 @@ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
 	if (WARN_ON_ONCE(!i->data_source))
 		return 0;
 
-	iterate_and_advance(i, bytes, base, len, off,
-		__copy_from_user_flushcache(addr + off, base, len),
-		memcpy_flushcache(addr + off, base, len)
-	)
-
-	return bytes;
+	return iterate_and_advance(i, bytes, addr,
+				   copy_from_user_iter_flushcache,
+				   memcpy_from_iter_flushcache);
 }
 EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
 #endif
@@ -509,10 +409,9 @@ size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t byte
 		void *kaddr = kmap_local_page(page);
 		size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
 
-		iterate_and_advance(i, n, base, len, off,
-			copyout_nofault(base, kaddr + offset + off, len),
-			memcpy(base, kaddr + offset + off, len)
-		)
+		n = iterate_and_advance(i, n, kaddr + offset,
+					copy_to_user_iter_nofault,
+					memcpy_to_iter);
 		kunmap_local(kaddr);
 		res += n;
 		bytes -= n;
@@ -555,14 +454,25 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
 }
 EXPORT_SYMBOL(copy_page_from_iter);
 
-size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
+static __always_inline
+size_t zero_to_user_iter(void __user *iter_to, size_t progress,
+			 size_t len, void *priv, void *priv2)
 {
-	iterate_and_advance(i, bytes, base, len, count,
-		clear_user(base, len),
-		memset(base, 0, len)
-	)
+	return clear_user(iter_to, len);
+}
+
+static __always_inline
+size_t zero_to_iter(void *iter_to, size_t progress,
+		    size_t len, void *priv, void *priv2)
+{
+	memset(iter_to, 0, len);
+	return 0;
+}
 
-	return bytes;
+size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
+{
+	return iterate_and_advance(i, bytes, NULL,
+				   zero_to_user_iter, zero_to_iter);
 }
 EXPORT_SYMBOL(iov_iter_zero);
 
@@ -587,10 +497,7 @@ size_t copy_page_from_iter_atomic(struct page *page, size_t offset,
 		}
 
 		p = kmap_atomic(page) + offset;
-		iterate_and_advance(i, n, base, len, off,
-			copyin(p + off, base, len),
-			memcpy_from_iter(i, p + off, base, len)
-		)
+		n = __copy_from_iter(p, n, i);
 		kunmap_atomic(p);
 		copied += n;
 		offset += n;
@@ -1181,78 +1088,6 @@ ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
 }
 EXPORT_SYMBOL(iov_iter_get_pages_alloc2);
 
-size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
-			       struct iov_iter *i)
-{
-	__wsum sum, next;
-	sum = *csum;
-	if (WARN_ON_ONCE(!i->data_source))
-		return 0;
-
-	iterate_and_advance(i, bytes, base, len, off, ({
-		next = csum_and_copy_from_user(base, addr + off, len);
-		sum = csum_block_add(sum, next, off);
-		next ? 0 : len;
-	}), ({
-		sum = csum_and_memcpy(addr + off, base, len, sum, off);
-	})
-	)
-	*csum = sum;
-	return bytes;
-}
-EXPORT_SYMBOL(csum_and_copy_from_iter);
-
-size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
-			     struct iov_iter *i)
-{
-	struct csum_state *csstate = _csstate;
-	__wsum sum, next;
-
-	if (WARN_ON_ONCE(i->data_source))
-		return 0;
-	if (unlikely(iov_iter_is_discard(i))) {
-		// can't use csum_memcpy() for that one - data is not copied
-		csstate->csum = csum_block_add(csstate->csum,
-					       csum_partial(addr, bytes, 0),
-					       csstate->off);
-		csstate->off += bytes;
-		return bytes;
-	}
-
-	sum = csum_shift(csstate->csum, csstate->off);
-	iterate_and_advance(i, bytes, base, len, off, ({
-		next = csum_and_copy_to_user(addr + off, base, len);
-		sum = csum_block_add(sum, next, off);
-		next ? 0 : len;
-	}), ({
-		sum = csum_and_memcpy(base, addr + off, len, sum, off);
-	})
-	)
-	csstate->csum = csum_shift(sum, csstate->off);
-	csstate->off += bytes;
-	return bytes;
-}
-EXPORT_SYMBOL(csum_and_copy_to_iter);
-
-size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
-		struct iov_iter *i)
-{
-#ifdef CONFIG_CRYPTO_HASH
-	struct ahash_request *hash = hashp;
-	struct scatterlist sg;
-	size_t copied;
-
-	copied = copy_to_iter(addr, bytes, i);
-	sg_init_one(&sg, addr, copied);
-	ahash_request_set_crypt(hash, &sg, NULL, copied);
-	crypto_ahash_update(hash);
-	return copied;
-#else
-	return 0;
-#endif
-}
-EXPORT_SYMBOL(hash_and_copy_to_iter);
-
 static int iov_npages(const struct iov_iter *i, int maxpages)
 {
 	size_t skip = i->iov_offset, size = i->count;
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 7c44b7ae4c5c..fb9a2f06dd1e 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -254,10 +254,10 @@ static int init_uevent_argv(struct kobj_uevent_env *env, const char *subsystem)
 	int buffer_size = sizeof(env->buf) - env->buflen;
 	int len;
 
-	len = strlcpy(&env->buf[env->buflen], subsystem, buffer_size);
-	if (len >= buffer_size) {
-		pr_warn("init_uevent_argv: buffer size of %d too small, needed %d\n",
-			buffer_size, len);
+	len = strscpy(&env->buf[env->buflen], subsystem, buffer_size);
+	if (len < 0) {
+		pr_warn("%s: insufficient buffer space (%u left) for %s\n",
+			__func__, buffer_size, subsystem);
 		return -ENOMEM;
 	}
 
diff --git a/lib/llist.c b/lib/llist.c
index 6e668fa5a2c6..f21d0cfbbaaa 100644
--- a/lib/llist.c
+++ b/lib/llist.c
@@ -66,6 +66,34 @@ struct llist_node *llist_del_first(struct llist_head *head)
 EXPORT_SYMBOL_GPL(llist_del_first);
 
 /**
+ * llist_del_first_this - delete given entry of lock-less list if it is first
+ * @head:	the head for your lock-less list
+ * @this:	a list entry.
+ *
+ * If head of the list is given entry, delete and return %true else
+ * return %false.
+ *
+ * Multiple callers can safely call this concurrently with multiple
+ * llist_add() callers, providing all the callers offer a different @this.
+ */
+bool llist_del_first_this(struct llist_head *head,
+			  struct llist_node *this)
+{
+	struct llist_node *entry, *next;
+
+	/* acquire ensures ordering wrt try_cmpxchg() in llist_del_first() */
+	entry = smp_load_acquire(&head->first);
+	do {
+		if (entry != this)
+			return false;	/* @this is not the first entry */
+		next = READ_ONCE(entry->next);
+	} while (!try_cmpxchg(&head->first, &entry, next));
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(llist_del_first_this);
+
+/**
  * llist_reverse_order - reverse order of a llist chain
  * @head:	first item of the list to be reversed
  *
diff --git a/lib/lwq.c b/lib/lwq.c
new file mode 100644
index 000000000000..57d080a4d53d
--- /dev/null
+++ b/lib/lwq.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Light-weight single-linked queue.
+ *
+ * Entries are enqueued to the head of an llist, with no blocking.
+ * This can happen in any context.
+ *
+ * Entries are dequeued using a spinlock to protect against multiple
+ * access.  The llist is staged in reverse order, and refreshed
+ * from the llist when it exhausts.
+ *
+ * This is particularly suitable when work items are queued in BH or
+ * IRQ context, and where work items are handled one at a time by
+ * dedicated threads.
+ */
+#include <linux/rcupdate.h>
+#include <linux/lwq.h>
+
+struct llist_node *__lwq_dequeue(struct lwq *q)
+{
+	struct llist_node *this;
+
+	if (lwq_empty(q))	/* checked before taking q->lock */
+		return NULL;
+	spin_lock(&q->lock);
+	this = q->ready;
+	if (!this && !llist_empty(&q->new)) {
+		/* placeholder: queue must not transiently appear lwq_empty */
+		smp_store_release(&q->ready, (void *)1);
+		this = llist_reverse_order(llist_del_all(&q->new));
+		if (!this)
+			q->ready = NULL;	/* nothing moved: drop placeholder */
+	}
+	if (this)
+		q->ready = llist_next(this);	/* the rest stays staged */
+	spin_unlock(&q->lock);
+	return this;
+}
+EXPORT_SYMBOL_GPL(__lwq_dequeue);
+
+/**
+ * lwq_dequeue_all - dequeue all currently enqueued objects
+ * @q:	the queue to dequeue from
+ *
+ * Remove and return a linked list of llist_nodes of all the objects that were
+ * in the queue. The first on the list will be the object that was least
+ * recently enqueued.
+ */
+struct llist_node *lwq_dequeue_all(struct lwq *q)
+{
+	struct llist_node *r, *t, **ep;
+
+	if (lwq_empty(q))
+		return NULL;
+
+	spin_lock(&q->lock);
+	r = q->ready;			/* entries already staged */
+	q->ready = NULL;
+	t = llist_del_all(&q->new);	/* entries not yet staged */
+	spin_unlock(&q->lock);
+	ep = &r;			/* walk to the tail link of @r */
+	while (*ep)
+		ep = &(*ep)->next;
+	*ep = llist_reverse_order(t);	/* append the reversed new entries */
+	return r;
+}
+EXPORT_SYMBOL_GPL(lwq_dequeue_all);
+
+#if IS_ENABLED(CONFIG_LWQ_TEST)
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/wait_bit.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+struct tnode {
+	struct lwq_node n;	/* queue linkage */
+	int i;			/* id given by lwq_test(); -1 once deleted */
+	int c;			/* incremented by lwq_exercise() per dequeue */
+};
+
+static int lwq_exercise(void *qv)
+{
+	struct lwq *q = qv;
+	int cnt;
+	struct tnode *t;
+
+	for (cnt = 0; cnt < 10000; cnt++) {
+		wait_var_event(q, (t = lwq_dequeue(q, struct tnode, n)) != NULL);
+		t->c++;			/* record one more pass for this node */
+		if (lwq_enqueue(&t->n, q))
+			wake_up_var(q);
+	}
+	while (!kthread_should_stop())	/* idle until kthread_stop() */
+		schedule_timeout_idle(1);
+	return 0;
+}
+
+static int lwq_test(void)
+{
+	int i;
+	struct lwq q;
+	struct llist_node *l, **t1, *t2;
+	struct tnode *t;
+	struct task_struct *threads[8];
+
+	printk(KERN_INFO "testing lwq....\n");
+	lwq_init(&q);
+	printk(KERN_INFO " lwq: run some threads\n");
+	for (i = 0; i < ARRAY_SIZE(threads); i++)
+		threads[i] = kthread_run(lwq_exercise, &q, "lwq-test-%d", i);
+	for (i = 0; i < 100; i++) {
+		t = kmalloc(sizeof(*t), GFP_KERNEL);
+		if (!t)
+			break;	/* allocation failed: run with fewer nodes */
+		t->i = i;
+		t->c = 0;
+		if (lwq_enqueue(&t->n, &q))
+			wake_up_var(&q);
+	}
+	/* wait for threads to exit */
+	for (i = 0; i < ARRAY_SIZE(threads); i++)
+		if (!IS_ERR_OR_NULL(threads[i]))
+			kthread_stop(threads[i]);
+	printk(KERN_INFO " lwq: dequeue first 50:");
+	for (i = 0; i < 50 ; i++) {
+		if (i && (i % 10) == 0) {
+			printk(KERN_CONT "\n");
+			printk(KERN_INFO " lwq: ... ");
+		}
+		t = lwq_dequeue(&q, struct tnode, n);
+		if (t)
+			printk(KERN_CONT " %d(%d)", t->i, t->c);
+		kfree(t);	/* t may be NULL here; kfree(NULL) is a no-op */
+	}
+	printk(KERN_CONT "\n");
+	l = lwq_dequeue_all(&q);
+	printk(KERN_INFO " lwq: delete the multiples of 3 (test lwq_for_each_safe())\n");
+	lwq_for_each_safe(t, t1, t2, &l, n) {
+		if ((t->i % 3) == 0) {
+			t->i = -1;
+			kfree(t);
+			t = NULL;	/* presumably signals removal to the iterator */
+		}
+	}
+	if (l)
+		lwq_enqueue_batch(l, &q);	/* put the survivors back */
+	printk(KERN_INFO " lwq: dequeue remaining:");
+	while ((t = lwq_dequeue(&q, struct tnode, n)) != NULL) {
+		printk(KERN_CONT " %d", t->i);
+		kfree(t);
+	}
+	printk(KERN_CONT "\n");
+	return 0;
+}
+
+module_init(lwq_test);
+#endif /* CONFIG_LWQ_TEST*/
diff --git a/lib/rcuref.c b/lib/rcuref.c
index 5ec00a4a64d1..97f300eca927 100644
--- a/lib/rcuref.c
+++ b/lib/rcuref.c
@@ -248,7 +248,7 @@ bool rcuref_put_slowpath(rcuref_t *ref)
 		 * require a retry. If this fails the caller is not
 		 * allowed to deconstruct the object.
 		 */
-		if (atomic_cmpxchg_release(&ref->refcnt, RCUREF_NOREF, RCUREF_DEAD) != RCUREF_NOREF)
+		if (!atomic_try_cmpxchg_release(&ref->refcnt, &cnt, RCUREF_DEAD))
 			return false;
 
 		/*
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index 9982344cca34..7713f73e66b0 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -31,9 +31,11 @@
  * giving the size in the required units.  @buf should have room for
  * at least 9 bytes and will always be zero terminated.
  *
+ * Return value: number of characters of output that would have been written
+ * (which may be greater than len, if output was truncated).
  */
-void string_get_size(u64 size, u64 blk_size, const enum string_size_units units,
-		     char *buf, int len)
+int string_get_size(u64 size, u64 blk_size, const enum string_size_units units,
+		    char *buf, int len)
 {
 	static const char *const units_10[] = {
 		"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"
@@ -126,8 +128,8 @@ void string_get_size(u64 size, u64 blk_size, const enum string_size_units units,
 	else
 		unit = units_str[units][i];
 
-	snprintf(buf, len, "%u%s %s", (u32)size,
-		 tmp, unit);
+	return snprintf(buf, len, "%u%s %s", (u32)size,
+			tmp, unit);
 }
 EXPORT_SYMBOL(string_get_size);
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 29ebf1e7898c..e52e3a0b8f2e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -131,22 +131,26 @@ static struct mempolicy default_policy = {
 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 
 /**
- * numa_map_to_online_node - Find closest online node
+ * numa_nearest_node - Find nearest node by state
  * @node: Node id to start the search
+ * @state: State to filter the search
  *
- * Lookup the next closest node by distance if @nid is not online.
+ * Lookup the closest node by distance if @node is not in @state.
  *
- * Return: this @node if it is online, otherwise the closest node by distance
+ * Return: this @node if it is in state, otherwise the closest node by distance
  */
-int numa_map_to_online_node(int node)
+int numa_nearest_node(int node, unsigned int state)
 {
 	int min_dist = INT_MAX, dist, n, min_node;
 
-	if (node == NUMA_NO_NODE || node_online(node))
+	if (state >= NR_NODE_STATES)
+		return -EINVAL;
+
+	if (node == NUMA_NO_NODE || node_state(node, state))
 		return node;
 
 	min_node = node;
-	for_each_online_node(n) {
+	for_each_node_state(n, state) {
 		dist = node_distance(node, n);
 		if (dist < min_dist) {
 			min_dist = dist;
@@ -156,7 +160,7 @@ int numa_map_to_online_node(int node)
 
 	return min_node;
 }
-EXPORT_SYMBOL_GPL(numa_map_to_online_node);
+EXPORT_SYMBOL_GPL(numa_nearest_node);
 
 struct mempolicy *get_task_policy(struct task_struct *p)
 {
diff --git a/mm/mmap.c b/mm/mmap.c
index 9e018d8dd7d6..853489ca05ef 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3194,12 +3194,6 @@ limits_failed:
 }
 EXPORT_SYMBOL(vm_brk_flags);
 
-int vm_brk(unsigned long addr, unsigned long len)
-{
-	return vm_brk_flags(addr, len, 0);
-}
-EXPORT_SYMBOL(vm_brk);
-
 /* Release all mmaps. */
 void exit_mmap(struct mm_struct *mm)
 {
diff --git a/mm/nommu.c b/mm/nommu.c
index 7f9e9e5a0e12..23c43c208f2b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1531,11 +1531,6 @@ void exit_mmap(struct mm_struct *mm)
 	mmap_write_unlock(mm);
 }
 
-int vm_brk(unsigned long addr, unsigned long len)
-{
-	return -ENOMEM;
-}
-
 /*
  * expand (or shrink) an existing mapping, potentially moving it at the same
  * time (controlled by the MREMAP_MAYMOVE flag and available VM space)
diff --git a/mm/readahead.c b/mm/readahead.c
index e815c114de21..6925e6959fd3 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -735,7 +735,8 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
 	 */
 	ret = -EINVAL;
 	if (!f.file->f_mapping || !f.file->f_mapping->a_ops ||
-	    !S_ISREG(file_inode(f.file)->i_mode))
+	    (!S_ISREG(file_inode(f.file)->i_mode) &&
+	    !S_ISBLK(file_inode(f.file)->i_mode)))
 		goto out;
 
 	ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);
diff --git a/mm/shmem.c b/mm/shmem.c
index 69595d341882..6b102965d355 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1112,7 +1112,7 @@ whole_folios:
 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 {
 	shmem_undo_range(inode, lstart, lend, false);
-	inode->i_mtime = inode_set_ctime_current(inode);
+	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	inode_inc_iversion(inode);
 }
 EXPORT_SYMBOL_GPL(shmem_truncate_range);
@@ -1224,7 +1224,7 @@ static int shmem_setattr(struct mnt_idmap *idmap,
 	if (!error && update_ctime) {
 		inode_set_ctime_current(inode);
 		if (update_mtime)
-			inode->i_mtime = inode_get_ctime(inode);
+			inode_set_mtime_to_ts(inode, inode_get_ctime(inode));
 		inode_inc_iversion(inode);
 	}
 	return error;
@@ -2455,7 +2455,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
 	inode->i_ino = ino;
 	inode_init_owner(idmap, inode, dir, mode);
 	inode->i_blocks = 0;
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	inode->i_generation = get_random_u32();
 	info = SHMEM_I(inode);
 	memset(info, 0, (char *)inode - (char *)info);
@@ -2463,7 +2463,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
 	atomic_set(&info->stop_eviction, 0);
 	info->seals = F_SEAL_SEAL;
 	info->flags = flags & VM_NORESERVE;
-	info->i_crtime = inode->i_mtime;
+	info->i_crtime = inode_get_mtime(inode);
 	info->fsflags = (dir == NULL) ? 0 :
 		SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
 	if (info->fsflags)
@@ -3229,7 +3229,7 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
 		goto out_iput;
 
 	dir->i_size += BOGO_DIRENT_SIZE;
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	inode_inc_iversion(dir);
 	d_instantiate(dentry, inode);
 	dget(dentry); /* Extra count - pin the dentry in core */
@@ -3318,8 +3318,8 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
 	}
 
 	dir->i_size += BOGO_DIRENT_SIZE;
-	dir->i_mtime = inode_set_ctime_to_ts(dir,
-					     inode_set_ctime_current(inode));
+	inode_set_mtime_to_ts(dir,
+			      inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
 	inode_inc_iversion(dir);
 	inc_nlink(inode);
 	ihold(inode);	/* New dentry reference */
@@ -3339,8 +3339,8 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
 	simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
 
 	dir->i_size -= BOGO_DIRENT_SIZE;
-	dir->i_mtime = inode_set_ctime_to_ts(dir,
-					     inode_set_ctime_current(inode));
+	inode_set_mtime_to_ts(dir,
+			      inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
 	inode_inc_iversion(dir);
 	drop_nlink(inode);
 	dput(dentry);	/* Undo the count from "create" - this does all the work */
@@ -3488,7 +3488,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
 		folio_put(folio);
 	}
 	dir->i_size += BOGO_DIRENT_SIZE;
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	inode_inc_iversion(dir);
 	d_instantiate(dentry, inode);
 	dget(dentry);
@@ -3714,7 +3714,7 @@ static const struct xattr_handler shmem_user_xattr_handler = {
 	.set = shmem_xattr_handler_set,
 };
 
-static const struct xattr_handler *shmem_xattr_handlers[] = {
+static const struct xattr_handler * const shmem_xattr_handlers[] = {
 	&shmem_security_xattr_handler,
 	&shmem_trusted_xattr_handler,
 	&shmem_user_xattr_handler,
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 9bbffe82d65a..8d431193c273 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -528,26 +528,6 @@ bool slab_is_available(void)
 }
 
 #ifdef CONFIG_PRINTK
-/**
- * kmem_valid_obj - does the pointer reference a valid slab object?
- * @object: pointer to query.
- *
- * Return: %true if the pointer is to a not-yet-freed object from
- * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer
- * is to an already-freed object, and %false otherwise.
- */
-bool kmem_valid_obj(void *object)
-{
-	struct folio *folio;
-
-	/* Some arches consider ZERO_SIZE_PTR to be a valid address. */
-	if (object < (void *)PAGE_SIZE || !virt_addr_valid(object))
-		return false;
-	folio = virt_to_folio(object);
-	return folio_test_slab(folio);
-}
-EXPORT_SYMBOL_GPL(kmem_valid_obj);
-
 static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
 {
 	if (__kfence_obj_info(kpp, object, slab))
@@ -566,11 +546,11 @@ static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *
  * and, if available, the slab name, return address, and stack trace from
  * the allocation and last free path of that object.
  *
- * This function will splat if passed a pointer to a non-slab object.
- * If you are not sure what type of object you have, you should instead
- * use mem_dump_obj().
+ * Return: %true if the pointer is to a not-yet-freed object from
+ * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer
+ * is to an already-freed object, and %false otherwise.
  */
-void kmem_dump_obj(void *object)
+bool kmem_dump_obj(void *object)
 {
 	char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc";
 	int i;
@@ -578,13 +558,13 @@ void kmem_dump_obj(void *object)
 	unsigned long ptroffset;
 	struct kmem_obj_info kp = { };
 
-	if (WARN_ON_ONCE(!virt_addr_valid(object)))
-		return;
+	/* Some arches consider ZERO_SIZE_PTR to be a valid address. */
+	if (object < (void *)PAGE_SIZE || !virt_addr_valid(object))
+		return false;
 	slab = virt_to_slab(object);
-	if (WARN_ON_ONCE(!slab)) {
-		pr_cont(" non-slab memory.\n");
-		return;
-	}
+	if (!slab)
+		return false;
+
 	kmem_obj_info(&kp, object, slab);
 	if (kp.kp_slab_cache)
 		pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name);
@@ -621,6 +601,7 @@ void kmem_dump_obj(void *object)
 		pr_info("    %pS\n", kp.kp_free_stack[i]);
 	}
 
+	return true;
 }
 EXPORT_SYMBOL_GPL(kmem_dump_obj);
 #endif
diff --git a/mm/slub.c b/mm/slub.c
index f7940048138c..63d281dfacdb 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4110,17 +4110,12 @@ static unsigned int slub_min_objects;
  * the smallest order which will fit the object.
  */
 static inline unsigned int calc_slab_order(unsigned int size,
-		unsigned int min_objects, unsigned int max_order,
+		unsigned int min_order, unsigned int max_order,
 		unsigned int fract_leftover)
 {
-	unsigned int min_order = slub_min_order;
 	unsigned int order;
 
-	if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE)
-		return get_order(size * MAX_OBJS_PER_PAGE) - 1;
-
-	for (order = max(min_order, (unsigned int)get_order(min_objects * size));
-			order <= max_order; order++) {
+	for (order = min_order; order <= max_order; order++) {
 
 		unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
 		unsigned int rem;
@@ -4139,16 +4134,8 @@ static inline int calculate_order(unsigned int size)
 	unsigned int order;
 	unsigned int min_objects;
 	unsigned int max_objects;
-	unsigned int nr_cpus;
+	unsigned int min_order;
 
-	/*
-	 * Attempt to find best configuration for a slab. This
-	 * works by first attempting to generate a layout with
-	 * the best configuration and backing off gradually.
-	 *
-	 * First we increase the acceptable waste in a slab. Then
-	 * we reduce the minimum objects required in a slab.
-	 */
 	min_objects = slub_min_objects;
 	if (!min_objects) {
 		/*
@@ -4160,40 +4147,46 @@ static inline int calculate_order(unsigned int size)
 		 * order on systems that appear larger than they are, and too
 		 * low order on systems that appear smaller than they are.
 		 */
-		nr_cpus = num_present_cpus();
+		unsigned int nr_cpus = num_present_cpus();
 		if (nr_cpus <= 1)
 			nr_cpus = nr_cpu_ids;
 		min_objects = 4 * (fls(nr_cpus) + 1);
 	}
-	max_objects = order_objects(slub_max_order, size);
+	/* min_objects can't be 0 because get_order(0) is undefined */
+	max_objects = max(order_objects(slub_max_order, size), 1U);
 	min_objects = min(min_objects, max_objects);
 
-	while (min_objects > 1) {
-		unsigned int fraction;
-
-		fraction = 16;
-		while (fraction >= 4) {
-			order = calc_slab_order(size, min_objects,
-					slub_max_order, fraction);
-			if (order <= slub_max_order)
-				return order;
-			fraction /= 2;
-		}
-		min_objects--;
-	}
+	min_order = max_t(unsigned int, slub_min_order,
+			  get_order(min_objects * size));
+	if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE)
+		return get_order(size * MAX_OBJS_PER_PAGE) - 1;
 
 	/*
-	 * We were unable to place multiple objects in a slab. Now
-	 * lets see if we can place a single object there.
+	 * Attempt to find best configuration for a slab. This works by first
+	 * attempting to generate a layout with the best possible configuration
+	 * and backing off gradually.
+	 *
+	 * We start with accepting at most 1/16 waste and try to find the
+	 * smallest order from min_objects-derived/slub_min_order up to
+	 * slub_max_order that will satisfy the constraint. Note that increasing
+	 * the order can only result in same or less fractional waste, not more.
+	 *
+	 * If that fails, we increase the acceptable fraction of waste and try
+	 * again. The last iteration with fraction of 1/2 would effectively
+	 * accept any waste and give us the order determined by min_objects, as
+	 * long as at least single object fits within slub_max_order.
 	 */
-	order = calc_slab_order(size, 1, slub_max_order, 1);
-	if (order <= slub_max_order)
-		return order;
+	for (unsigned int fraction = 16; fraction > 1; fraction /= 2) {
+		order = calc_slab_order(size, min_order, slub_max_order,
+					fraction);
+		if (order <= slub_max_order)
+			return order;
+	}
 
 	/*
 	 * Doh this slab cannot be placed using slub_max_order.
 	 */
-	order = calc_slab_order(size, 1, MAX_ORDER, 1);
+	order = get_order(size);
 	if (order <= MAX_ORDER)
 		return order;
 	return -ENOSYS;
@@ -4711,6 +4704,9 @@ static int __init setup_slub_min_order(char *str)
 {
 	get_option(&str, (int *)&slub_min_order);
 
+	if (slub_min_order > slub_max_order)
+		slub_max_order = slub_min_order;
+
 	return 1;
 }
 
@@ -4721,6 +4717,9 @@ static int __init setup_slub_max_order(char *str)
 	get_option(&str, (int *)&slub_max_order);
 	slub_max_order = min_t(unsigned int, slub_max_order, MAX_ORDER);
 
+	if (slub_min_order > slub_max_order)
+		slub_min_order = slub_max_order;
+
 	return 1;
 }
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e52f486834eb..4bc70f459164 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2530,11 +2530,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	exit_swap_address_space(p->type);
 
 	inode = mapping->host;
-	if (S_ISBLK(inode->i_mode)) {
-		struct block_device *bdev = I_BDEV(inode);
-
-		set_blocksize(bdev, old_block_size);
-		blkdev_put(bdev, p);
+	if (p->bdev_handle) {
+		set_blocksize(p->bdev, old_block_size);
+		bdev_release(p->bdev_handle);
+		p->bdev_handle = NULL;
 	}
 
 	inode_lock(inode);
@@ -2764,13 +2763,14 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
 	int error;
 
 	if (S_ISBLK(inode->i_mode)) {
-		p->bdev = blkdev_get_by_dev(inode->i_rdev,
+		p->bdev_handle = bdev_open_by_dev(inode->i_rdev,
 				BLK_OPEN_READ | BLK_OPEN_WRITE, p, NULL);
-		if (IS_ERR(p->bdev)) {
-			error = PTR_ERR(p->bdev);
-			p->bdev = NULL;
+		if (IS_ERR(p->bdev_handle)) {
+			error = PTR_ERR(p->bdev_handle);
+			p->bdev_handle = NULL;
 			return error;
 		}
+		p->bdev = p->bdev_handle->bdev;
 		p->old_block_size = block_size(p->bdev);
 		error = set_blocksize(p->bdev, PAGE_SIZE);
 		if (error < 0)
@@ -3206,9 +3206,10 @@ bad_swap:
 	p->percpu_cluster = NULL;
 	free_percpu(p->cluster_next_cpu);
 	p->cluster_next_cpu = NULL;
-	if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
+	if (p->bdev_handle) {
 		set_blocksize(p->bdev, p->old_block_size);
-		blkdev_put(p->bdev, p);
+		bdev_release(p->bdev_handle);
+		p->bdev_handle = NULL;
 	}
 	inode = NULL;
 	destroy_swap_extents(p);
diff --git a/mm/util.c b/mm/util.c
index 8cbbfd3a3d59..6eddd891198e 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1060,10 +1060,8 @@ void mem_dump_obj(void *object)
 {
 	const char *type;
 
-	if (kmem_valid_obj(object)) {
-		kmem_dump_obj(object);
+	if (kmem_dump_obj(object))
 		return;
-	}
 
 	if (vmalloc_dump_obj(object))
 		return;
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 176eb5834746..103d46fa0eeb 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -50,7 +50,7 @@
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
-#include <linux/uio.h>
+#include <linux/iov_iter.h>
 #include <linux/indirect_call_wrapper.h>
 
 #include <net/protocol.h>
@@ -61,6 +61,7 @@
 #include <net/tcp_states.h>
 #include <trace/events/skb.h>
 #include <net/busy_poll.h>
+#include <crypto/hash.h>
 
 /*
  *	Is a socket 'connection oriented' ?
@@ -489,6 +490,24 @@ short_copy:
 	return 0;
 }
 
+static size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
+				    struct iov_iter *i)
+{
+#ifdef CONFIG_CRYPTO_HASH
+	struct ahash_request *hash = hashp;
+	struct scatterlist sg;
+	size_t copied;
+
+	copied = copy_to_iter(addr, bytes, i);
+	sg_init_one(&sg, addr, copied);	/* hash only what was copied */
+	ahash_request_set_crypt(hash, &sg, NULL, copied);
+	crypto_ahash_update(hash);	/* NOTE(review): result is ignored */
+	return copied;
+#else
+	return 0;	/* no CONFIG_CRYPTO_HASH: nothing copied or hashed */
+#endif
+}
+
 /**
  *	skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator
  *          and update a hash.
@@ -716,6 +735,60 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
 }
 EXPORT_SYMBOL(zerocopy_sg_from_iter);
 
+static __always_inline
+size_t copy_to_user_iter_csum(void __user *iter_to, size_t progress,
+			      size_t len, void *from, void *priv2)
+{
+	__wsum next, *csum = priv2;	/* priv2: running checksum */
+
+	next = csum_and_copy_to_user(from + progress, iter_to, len);
+	*csum = csum_block_add(*csum, next, progress);
+	return next ? 0 : len;	/* next == 0: presumably the copy faulted */
+}
+
+static __always_inline
+size_t memcpy_to_iter_csum(void *iter_to, size_t progress,
+			   size_t len, void *from, void *priv2)
+{
+	__wsum *csum = priv2;	/* @from is the buffer base: add @progress */
+	__wsum next = csum_partial_copy_nocheck(from + progress, iter_to, len);
+
+	*csum = csum_block_add(*csum, next, progress);
+	return 0;	/* whole segment consumed */
+}
+
+struct csum_state {
+	__wsum csum;	/* checksum accumulated so far */
+	size_t off;	/* bytes already folded into @csum */
+};
+
+static size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
+				    struct iov_iter *i)
+{
+	struct csum_state *csstate = _csstate;
+	__wsum sum;
+
+	if (WARN_ON_ONCE(i->data_source))
+		return 0;	/* @i must be a destination iterator */
+	if (unlikely(iov_iter_is_discard(i))) {
+		// discard iterator: nothing is copied, just checksum @addr
+		csstate->csum = csum_block_add(csstate->csum,
+					       csum_partial(addr, bytes, 0),
+					       csstate->off);
+		csstate->off += bytes;
+		return bytes;
+	}
+
+	sum = csum_shift(csstate->csum, csstate->off);
+
+	bytes = iterate_and_advance2(i, bytes, (void *)addr, &sum,
+				     copy_to_user_iter_csum,
+				     memcpy_to_iter_csum);
+	csstate->csum = csum_shift(sum, csstate->off);
+	csstate->off += bytes;	/* @bytes: what the iteration returned */
+	return bytes;
+}
+
 /**
  *	skb_copy_and_csum_datagram - Copy datagram to an iovec iterator
  *          and update a checksum.
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c52ddd6891d9..b157efea5dea 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -62,6 +62,7 @@
 #include <linux/if_vlan.h>
 #include <linux/mpls.h>
 #include <linux/kcov.h>
+#include <linux/iov_iter.h>
 
 #include <net/protocol.h>
 #include <net/dst.h>
@@ -6948,3 +6949,42 @@ out:
 	return spliced ?: ret;
 }
 EXPORT_SYMBOL(skb_splice_from_iter);
+
+static __always_inline
+size_t memcpy_from_iter_csum(void *iter_from, size_t progress,
+			     size_t len, void *to, void *priv2)
+{
+	__wsum *csum = priv2;	/* priv2: running checksum */
+	__wsum next = csum_partial_copy_nocheck(iter_from, to + progress, len);
+
+	*csum = csum_block_add(*csum, next, progress);
+	return 0;	/* kernel-memory variant always reports 0 */
+}
+
+static __always_inline
+size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress,
+				size_t len, void *to, void *priv2)
+{
+	__wsum next, *csum = priv2;	/* priv2: running checksum */
+
+	next = csum_and_copy_from_user(iter_from, to + progress, len);
+	*csum = csum_block_add(*csum, next, progress);
+	return next ? 0 : len;	/* next == 0: presumably the copy faulted */
+}
+
+bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
+				  __wsum *csum, struct iov_iter *i)
+{
+	size_t copied;
+
+	if (WARN_ON_ONCE(!i->data_source))
+		return false;	/* @i must be a source iterator */
+	copied = iterate_and_advance2(i, bytes, addr, csum,
+				      copy_from_user_iter_csum,
+				      memcpy_from_iter_csum);
+	if (likely(copied == bytes))
+		return true;
+	iov_iter_revert(i, copied);	/* short copy: undo the advance */
+	return false;
+}
+EXPORT_SYMBOL(csum_and_copy_from_iter_full);
diff --git a/net/socket.c b/net/socket.c
index c4a6f5532955..5740475e084c 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -403,7 +403,7 @@ static const struct xattr_handler sockfs_security_xattr_handler = {
 	.set = sockfs_security_xattr_set,
 };
 
-static const struct xattr_handler *sockfs_xattr_handlers[] = {
+static const struct xattr_handler * const sockfs_xattr_handlers[] = {
 	&sockfs_xattr_handler,
 	&sockfs_security_xattr_handler,
 	NULL
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 65a6c6429a53..caa94cf57123 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -83,7 +83,6 @@ static struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt)
 		return NULL;
 
 	req->rq_xprt = xprt;
-	INIT_LIST_HEAD(&req->rq_bc_list);
 
 	/* Preallocate one XDR receive buffer */
 	if (xprt_alloc_xdr_buf(&req->rq_rcv_buf, gfp_flags) < 0) {
@@ -349,10 +348,8 @@ found:
 }
 
 /*
- * Add callback request to callback list.  The callback
- * service sleeps on the sv_cb_waitq waiting for new
- * requests.  Wake it up after adding enqueing the
- * request.
+ * Add callback request to callback list.  Wake a thread
+ * on the first pool (usually the only pool) to handle it.
  */
 void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
 {
@@ -369,8 +366,6 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
 
 	dprintk("RPC:       add callback request to list\n");
 	xprt_get(xprt);
-	spin_lock(&bc_serv->sv_cb_lock);
-	list_add(&req->rq_bc_list, &bc_serv->sv_cb_list);
-	wake_up(&bc_serv->sv_cb_waitq);
-	spin_unlock(&bc_serv->sv_cb_lock);
+	lwq_enqueue(&req->rq_bc_list, &bc_serv->sv_cb_list);
+	svc_pool_wake_idle_thread(&bc_serv->sv_pools[0]);
 }
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index f420d8457345..dcc2b4f49e77 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -472,7 +472,7 @@ rpc_get_inode(struct super_block *sb, umode_t mode)
 		return NULL;
 	inode->i_ino = get_next_ino();
 	inode->i_mode = mode;
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	switch (mode & S_IFMT) {
 	case S_IFDIR:
 		inode->i_fop = &simple_dir_operations;
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 812fda9d45dd..3f2ea7a0496f 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -438,9 +438,7 @@ EXPORT_SYMBOL_GPL(svc_bind);
 static void
 __svc_init_bc(struct svc_serv *serv)
 {
-	INIT_LIST_HEAD(&serv->sv_cb_list);
-	spin_lock_init(&serv->sv_cb_lock);
-	init_waitqueue_head(&serv->sv_cb_waitq);
+	lwq_init(&serv->sv_cb_list);
 }
 #else
 static void
@@ -509,9 +507,9 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
 				i, serv->sv_name);
 
 		pool->sp_id = i;
-		INIT_LIST_HEAD(&pool->sp_sockets);
+		lwq_init(&pool->sp_xprts);
 		INIT_LIST_HEAD(&pool->sp_all_threads);
-		spin_lock_init(&pool->sp_lock);
+		init_llist_head(&pool->sp_idle_threads);
 
 		percpu_counter_init(&pool->sp_messages_arrived, 0, GFP_KERNEL);
 		percpu_counter_init(&pool->sp_sockets_queued, 0, GFP_KERNEL);
@@ -575,11 +573,12 @@ svc_destroy(struct kref *ref)
 	timer_shutdown_sync(&serv->sv_temptimer);
 
 	/*
-	 * The last user is gone and thus all sockets have to be destroyed to
-	 * the point. Check this.
+	 * Remaining transports at this point are not expected.
 	 */
-	BUG_ON(!list_empty(&serv->sv_permsocks));
-	BUG_ON(!list_empty(&serv->sv_tempsocks));
+	WARN_ONCE(!list_empty(&serv->sv_permsocks),
+		  "SVC: permsocks remain for %s\n", serv->sv_program->pg_name);
+	WARN_ONCE(!list_empty(&serv->sv_tempsocks),
+		  "SVC: tempsocks remain for %s\n", serv->sv_program->pg_name);
 
 	cache_clean_deferred(serv);
 
@@ -642,7 +641,6 @@ svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node)
 
 	folio_batch_init(&rqstp->rq_fbatch);
 
-	__set_bit(RQ_BUSY, &rqstp->rq_flags);
 	rqstp->rq_server = serv;
 	rqstp->rq_pool = pool;
 
@@ -682,10 +680,13 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
 	serv->sv_nrthreads += 1;
 	spin_unlock_bh(&serv->sv_lock);
 
-	spin_lock_bh(&pool->sp_lock);
-	pool->sp_nrthreads++;
+	atomic_inc(&pool->sp_nrthreads);
+
+	/* Protected by whatever lock the service uses when calling
+	 * svc_set_num_threads()
+	 */
 	list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads);
-	spin_unlock_bh(&pool->sp_lock);
+
 	return rqstp;
 }
 
@@ -701,23 +702,25 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
 void svc_pool_wake_idle_thread(struct svc_pool *pool)
 {
 	struct svc_rqst	*rqstp;
+	struct llist_node *ln;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) {
-		if (test_and_set_bit(RQ_BUSY, &rqstp->rq_flags))
-			continue;
-
+	ln = READ_ONCE(pool->sp_idle_threads.first);
+	if (ln) {
+		rqstp = llist_entry(ln, struct svc_rqst, rq_idle);
 		WRITE_ONCE(rqstp->rq_qtime, ktime_get());
-		wake_up_process(rqstp->rq_task);
+		if (!task_is_running(rqstp->rq_task)) {
+			wake_up_process(rqstp->rq_task);
+			trace_svc_wake_up(rqstp->rq_task->pid);
+			percpu_counter_inc(&pool->sp_threads_woken);
+		}
 		rcu_read_unlock();
-		percpu_counter_inc(&pool->sp_threads_woken);
-		trace_svc_wake_up(rqstp->rq_task->pid);
 		return;
 	}
 	rcu_read_unlock();
 
-	set_bit(SP_CONGESTED, &pool->sp_flags);
 }
+EXPORT_SYMBOL_GPL(svc_pool_wake_idle_thread);
 
 static struct svc_pool *
 svc_pool_next(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state)
@@ -725,36 +728,38 @@ svc_pool_next(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state)
 	return pool ? pool : &serv->sv_pools[(*state)++ % serv->sv_nrpools];
 }
 
-static struct task_struct *
-svc_pool_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state)
+static struct svc_pool *
+svc_pool_victim(struct svc_serv *serv, struct svc_pool *target_pool,
+		unsigned int *state)
 {
+	struct svc_pool *pool;
 	unsigned int i;
-	struct task_struct *task = NULL;
+
+retry:
+	pool = target_pool;
 
 	if (pool != NULL) {
-		spin_lock_bh(&pool->sp_lock);
+		if (atomic_inc_not_zero(&pool->sp_nrthreads))
+			goto found_pool;
+		return NULL;
 	} else {
 		for (i = 0; i < serv->sv_nrpools; i++) {
 			pool = &serv->sv_pools[--(*state) % serv->sv_nrpools];
-			spin_lock_bh(&pool->sp_lock);
-			if (!list_empty(&pool->sp_all_threads))
+			if (atomic_inc_not_zero(&pool->sp_nrthreads))
 				goto found_pool;
-			spin_unlock_bh(&pool->sp_lock);
 		}
 		return NULL;
 	}
 
 found_pool:
-	if (!list_empty(&pool->sp_all_threads)) {
-		struct svc_rqst *rqstp;
-
-		rqstp = list_entry(pool->sp_all_threads.next, struct svc_rqst, rq_all);
-		set_bit(RQ_VICTIM, &rqstp->rq_flags);
-		list_del_rcu(&rqstp->rq_all);
-		task = rqstp->rq_task;
-	}
-	spin_unlock_bh(&pool->sp_lock);
-	return task;
+	set_bit(SP_VICTIM_REMAINS, &pool->sp_flags);
+	set_bit(SP_NEED_VICTIM, &pool->sp_flags);
+	if (!atomic_dec_and_test(&pool->sp_nrthreads))
+		return pool;
+	/* Nothing left in this pool any more */
+	clear_bit(SP_NEED_VICTIM, &pool->sp_flags);
+	clear_bit(SP_VICTIM_REMAINS, &pool->sp_flags);
+	goto retry;
 }
 
 static int
@@ -795,18 +800,16 @@ svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 static int
 svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 {
-	struct svc_rqst	*rqstp;
-	struct task_struct *task;
 	unsigned int state = serv->sv_nrthreads-1;
+	struct svc_pool *victim;
 
 	do {
-		task = svc_pool_victim(serv, pool, &state);
-		if (task == NULL)
+		victim = svc_pool_victim(serv, pool, &state);
+		if (!victim)
 			break;
-		rqstp = kthread_data(task);
-		/* Did we lose a race to svo_function threadfn? */
-		if (kthread_stop(task) == -EINTR)
-			svc_exit_thread(rqstp);
+		svc_pool_wake_idle_thread(victim);
+		wait_on_bit(&victim->sp_flags, SP_VICTIM_REMAINS,
+			    TASK_IDLE);
 		nrservs++;
 	} while (nrservs < 0);
 	return 0;
@@ -832,13 +835,10 @@ svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 int
 svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 {
-	if (pool == NULL) {
+	if (!pool)
 		nrservs -= serv->sv_nrthreads;
-	} else {
-		spin_lock_bh(&pool->sp_lock);
-		nrservs -= pool->sp_nrthreads;
-		spin_unlock_bh(&pool->sp_lock);
-	}
+	else
+		nrservs -= atomic_read(&pool->sp_nrthreads);
 
 	if (nrservs > 0)
 		return svc_start_kthreads(serv, pool, nrservs);
@@ -924,11 +924,9 @@ svc_exit_thread(struct svc_rqst *rqstp)
 	struct svc_serv	*serv = rqstp->rq_server;
 	struct svc_pool	*pool = rqstp->rq_pool;
 
-	spin_lock_bh(&pool->sp_lock);
-	pool->sp_nrthreads--;
-	if (!test_and_set_bit(RQ_VICTIM, &rqstp->rq_flags))
-		list_del_rcu(&rqstp->rq_all);
-	spin_unlock_bh(&pool->sp_lock);
+	list_del_rcu(&rqstp->rq_all);
+
+	atomic_dec(&pool->sp_nrthreads);
 
 	spin_lock_bh(&serv->sv_lock);
 	serv->sv_nrthreads -= 1;
@@ -938,6 +936,11 @@ svc_exit_thread(struct svc_rqst *rqstp)
 	svc_rqst_free(rqstp);
 
 	svc_put(serv);
+	/* That svc_put() cannot be the last, because the thread
+	 * waiting for SP_VICTIM_REMAINS to clear must hold
+	 * a reference. So it is still safe to access pool.
+	 */
+	clear_and_wake_up_bit(SP_VICTIM_REMAINS, &pool->sp_flags);
 }
 EXPORT_SYMBOL_GPL(svc_exit_thread);
 
@@ -1544,24 +1547,20 @@ out_drop:
 }
 
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
-/*
- * Process a backchannel RPC request that arrived over an existing
- * outbound connection
+/**
+ * svc_process_bc - process a reverse-direction RPC request
+ * @req: RPC request to be used for client-side processing
+ * @rqstp: server-side execution context
+ *
  */
-int
-bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
-	       struct svc_rqst *rqstp)
+void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp)
 {
 	struct rpc_task *task;
 	int proc_error;
-	int error;
-
-	dprintk("svc: %s(%p)\n", __func__, req);
 
 	/* Build the svc_rqst used by the common processing routine */
 	rqstp->rq_xid = req->rq_xid;
 	rqstp->rq_prot = req->rq_xprt->prot;
-	rqstp->rq_server = serv;
 	rqstp->rq_bc_net = req->rq_xprt->xprt_net;
 
 	rqstp->rq_addrlen = sizeof(req->rq_xprt->addr);
@@ -1590,10 +1589,8 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 	 * been processed by the caller.
 	 */
 	svcxdr_init_decode(rqstp);
-	if (!xdr_inline_decode(&rqstp->rq_arg_stream, XDR_UNIT * 2)) {
-		error = -EINVAL;
-		goto out;
-	}
+	if (!xdr_inline_decode(&rqstp->rq_arg_stream, XDR_UNIT * 2))
+		return;
 
 	/* Parse and execute the bc call */
 	proc_error = svc_process_common(rqstp);
@@ -1602,26 +1599,18 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 	if (!proc_error) {
 		/* Processing error: drop the request */
 		xprt_free_bc_request(req);
-		error = -EINVAL;
-		goto out;
+		return;
 	}
 	/* Finally, send the reply synchronously */
 	memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf));
 	task = rpc_run_bc_task(req);
-	if (IS_ERR(task)) {
-		error = PTR_ERR(task);
-		goto out;
-	}
+	if (IS_ERR(task))
+		return;
 
 	WARN_ON_ONCE(atomic_read(&task->tk_count) != 1);
-	error = task->tk_status;
 	rpc_put_task(task);
-
-out:
-	dprintk("svc: %s(), error=%d\n", __func__, error);
-	return error;
 }
-EXPORT_SYMBOL_GPL(bc_svc_process);
+EXPORT_SYMBOL_GPL(svc_process_bc);
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 
 /**
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 4cfe9640df48..fee83d1024bc 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -9,7 +9,6 @@
 #include <linux/sched/mm.h>
 #include <linux/errno.h>
 #include <linux/freezer.h>
-#include <linux/kthread.h>
 #include <linux/slab.h>
 #include <net/sock.h>
 #include <linux/sunrpc/addr.h>
@@ -17,6 +16,7 @@
 #include <linux/sunrpc/svc_xprt.h>
 #include <linux/sunrpc/svcsock.h>
 #include <linux/sunrpc/xprt.h>
+#include <linux/sunrpc/bc_xprt.h>
 #include <linux/module.h>
 #include <linux/netdevice.h>
 #include <trace/events/sunrpc.h>
@@ -201,7 +201,6 @@ void svc_xprt_init(struct net *net, struct svc_xprt_class *xcl,
 	kref_init(&xprt->xpt_ref);
 	xprt->xpt_server = serv;
 	INIT_LIST_HEAD(&xprt->xpt_list);
-	INIT_LIST_HEAD(&xprt->xpt_ready);
 	INIT_LIST_HEAD(&xprt->xpt_deferred);
 	INIT_LIST_HEAD(&xprt->xpt_users);
 	mutex_init(&xprt->xpt_mutex);
@@ -472,9 +471,7 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
 	pool = svc_pool_for_cpu(xprt->xpt_server);
 
 	percpu_counter_inc(&pool->sp_sockets_queued);
-	spin_lock_bh(&pool->sp_lock);
-	list_add_tail(&xprt->xpt_ready, &pool->sp_sockets);
-	spin_unlock_bh(&pool->sp_lock);
+	lwq_enqueue(&xprt->xpt_ready, &pool->sp_xprts);
 
 	svc_pool_wake_idle_thread(pool);
 }
@@ -487,18 +484,9 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
 {
 	struct svc_xprt	*xprt = NULL;
 
-	if (list_empty(&pool->sp_sockets))
-		goto out;
-
-	spin_lock_bh(&pool->sp_lock);
-	if (likely(!list_empty(&pool->sp_sockets))) {
-		xprt = list_first_entry(&pool->sp_sockets,
-					struct svc_xprt, xpt_ready);
-		list_del_init(&xprt->xpt_ready);
+	xprt = lwq_dequeue(&pool->sp_xprts, struct svc_xprt, xpt_ready);
+	if (xprt)
 		svc_xprt_get(xprt);
-	}
-	spin_unlock_bh(&pool->sp_lock);
-out:
 	return xprt;
 }
 
@@ -674,7 +662,7 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp)
 			continue;
 
 		set_current_state(TASK_IDLE);
-		if (kthread_should_stop()) {
+		if (svc_thread_should_stop(rqstp)) {
 			set_current_state(TASK_RUNNING);
 			return false;
 		}
@@ -699,7 +687,7 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp)
 }
 
 static bool
-rqst_should_sleep(struct svc_rqst *rqstp)
+svc_thread_should_sleep(struct svc_rqst *rqstp)
 {
 	struct svc_pool		*pool = rqstp->rq_pool;
 
@@ -708,65 +696,51 @@ rqst_should_sleep(struct svc_rqst *rqstp)
 		return false;
 
 	/* was a socket queued? */
-	if (!list_empty(&pool->sp_sockets))
+	if (!lwq_empty(&pool->sp_xprts))
 		return false;
 
 	/* are we shutting down? */
-	if (kthread_should_stop())
+	if (svc_thread_should_stop(rqstp))
 		return false;
 
-	/* are we freezing? */
-	if (freezing(current))
-		return false;
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+	if (svc_is_backchannel(rqstp)) {
+		if (!lwq_empty(&rqstp->rq_server->sv_cb_list))
+			return false;
+	}
+#endif
 
 	return true;
 }
 
-static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp)
+static void svc_thread_wait_for_work(struct svc_rqst *rqstp)
 {
-	struct svc_pool		*pool = rqstp->rq_pool;
-
-	/* rq_xprt should be clear on entry */
-	WARN_ON_ONCE(rqstp->rq_xprt);
-
-	rqstp->rq_xprt = svc_xprt_dequeue(pool);
-	if (rqstp->rq_xprt)
-		goto out_found;
-
-	set_current_state(TASK_IDLE);
-	smp_mb__before_atomic();
-	clear_bit(SP_CONGESTED, &pool->sp_flags);
-	clear_bit(RQ_BUSY, &rqstp->rq_flags);
-	smp_mb__after_atomic();
-
-	if (likely(rqst_should_sleep(rqstp)))
-		schedule();
-	else
+	struct svc_pool *pool = rqstp->rq_pool;
+
+	if (svc_thread_should_sleep(rqstp)) {
+		set_current_state(TASK_IDLE | TASK_FREEZABLE);
+		llist_add(&rqstp->rq_idle, &pool->sp_idle_threads);
+		if (likely(svc_thread_should_sleep(rqstp)))
+			schedule();
+
+		while (!llist_del_first_this(&pool->sp_idle_threads,
+					     &rqstp->rq_idle)) {
+			/* Work just became available.  This thread can only
+			 * handle it after removing rqstp from the idle
+			 * list. If that attempt failed, some other thread
+			 * must have queued itself after finding no
+			 * work to do, so that thread has taken responsibility
+			 * for this new work.  This thread can safely sleep
+			 * until woken again.
+			 */
+			schedule();
+			set_current_state(TASK_IDLE | TASK_FREEZABLE);
+		}
 		__set_current_state(TASK_RUNNING);
-
+	} else {
+		cond_resched();
+	}
 	try_to_freeze();
-
-	set_bit(RQ_BUSY, &rqstp->rq_flags);
-	smp_mb__after_atomic();
-	clear_bit(SP_TASK_PENDING, &pool->sp_flags);
-	rqstp->rq_xprt = svc_xprt_dequeue(pool);
-	if (rqstp->rq_xprt)
-		goto out_found;
-
-	if (kthread_should_stop())
-		return NULL;
-	return NULL;
-out_found:
-	clear_bit(SP_TASK_PENDING, &pool->sp_flags);
-	/* Normally we will wait up to 5 seconds for any required
-	 * cache information to be provided.
-	 */
-	if (!test_bit(SP_CONGESTED, &pool->sp_flags))
-		rqstp->rq_chandle.thread_wait = 5*HZ;
-	else
-		rqstp->rq_chandle.thread_wait = 1*HZ;
-	trace_svc_xprt_dequeue(rqstp);
-	return rqstp->rq_xprt;
 }
 
 static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt)
@@ -785,7 +759,7 @@ static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt
 	svc_xprt_received(newxpt);
 }
 
-static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
+static void svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
 {
 	struct svc_serv *serv = rqstp->rq_server;
 	int len = 0;
@@ -826,11 +800,35 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
 			len = xprt->xpt_ops->xpo_recvfrom(rqstp);
 		rqstp->rq_reserved = serv->sv_max_mesg;
 		atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
+		if (len <= 0)
+			goto out;
+
+		trace_svc_xdr_recvfrom(&rqstp->rq_arg);
+
+		clear_bit(XPT_OLD, &xprt->xpt_flags);
+
+		rqstp->rq_chandle.defer = svc_defer;
+
+		if (serv->sv_stats)
+			serv->sv_stats->netcnt++;
+		percpu_counter_inc(&rqstp->rq_pool->sp_messages_arrived);
+		rqstp->rq_stime = ktime_get();
+		svc_process(rqstp);
 	} else
 		svc_xprt_received(xprt);
 
 out:
-	return len;
+	rqstp->rq_res.len = 0;
+	svc_xprt_release(rqstp);
+}
+
+static void svc_thread_wake_next(struct svc_rqst *rqstp)
+{
+	if (!svc_thread_should_sleep(rqstp))
+		/* More work pending after I dequeued some,
+		 * wake another worker
+		 */
+		svc_pool_wake_idle_thread(rqstp->rq_pool);
 }
 
 /**
@@ -843,44 +841,51 @@ out:
  */
 void svc_recv(struct svc_rqst *rqstp)
 {
-	struct svc_xprt		*xprt = NULL;
-	struct svc_serv		*serv = rqstp->rq_server;
-	int			len;
+	struct svc_pool *pool = rqstp->rq_pool;
 
 	if (!svc_alloc_arg(rqstp))
-		goto out;
+		return;
 
-	try_to_freeze();
-	cond_resched();
-	if (kthread_should_stop())
-		goto out;
+	svc_thread_wait_for_work(rqstp);
 
-	xprt = svc_get_next_xprt(rqstp);
-	if (!xprt)
-		goto out;
+	clear_bit(SP_TASK_PENDING, &pool->sp_flags);
 
-	len = svc_handle_xprt(rqstp, xprt);
+	if (svc_thread_should_stop(rqstp)) {
+		svc_thread_wake_next(rqstp);
+		return;
+	}
 
-	/* No data, incomplete (TCP) read, or accept() */
-	if (len <= 0)
-		goto out_release;
+	rqstp->rq_xprt = svc_xprt_dequeue(pool);
+	if (rqstp->rq_xprt) {
+		struct svc_xprt *xprt = rqstp->rq_xprt;
 
-	trace_svc_xdr_recvfrom(&rqstp->rq_arg);
+		svc_thread_wake_next(rqstp);
+		/* Normally we will wait up to 5 seconds for any required
+		 * cache information to be provided.  When there are no
+		 * idle threads, we reduce the wait time.
+		 */
+		if (pool->sp_idle_threads.first)
+			rqstp->rq_chandle.thread_wait = 5 * HZ;
+		else
+			rqstp->rq_chandle.thread_wait = 1 * HZ;
 
-	clear_bit(XPT_OLD, &xprt->xpt_flags);
+		trace_svc_xprt_dequeue(rqstp);
+		svc_handle_xprt(rqstp, xprt);
+	}
 
-	rqstp->rq_chandle.defer = svc_defer;
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+	if (svc_is_backchannel(rqstp)) {
+		struct svc_serv *serv = rqstp->rq_server;
+		struct rpc_rqst *req;
 
-	if (serv->sv_stats)
-		serv->sv_stats->netcnt++;
-	percpu_counter_inc(&rqstp->rq_pool->sp_messages_arrived);
-	rqstp->rq_stime = ktime_get();
-	svc_process(rqstp);
-out:
-	return;
-out_release:
-	rqstp->rq_res.len = 0;
-	svc_xprt_release(rqstp);
+		req = lwq_dequeue(&serv->sv_cb_list,
+				  struct rpc_rqst, rq_bc_list);
+		if (req) {
+			svc_thread_wake_next(rqstp);
+			svc_process_bc(req, rqstp);
+		}
+	}
+#endif
 }
 EXPORT_SYMBOL_GPL(svc_recv);
 
@@ -890,7 +895,6 @@ EXPORT_SYMBOL_GPL(svc_recv);
 void svc_drop(struct svc_rqst *rqstp)
 {
 	trace_svc_drop(rqstp);
-	svc_xprt_release(rqstp);
 }
 EXPORT_SYMBOL_GPL(svc_drop);
 
@@ -906,8 +910,6 @@ void svc_send(struct svc_rqst *rqstp)
 	int status;
 
 	xprt = rqstp->rq_xprt;
-	if (!xprt)
-		return;
 
 	/* calculate over-all length */
 	xb = &rqstp->rq_res;
@@ -920,7 +922,6 @@ void svc_send(struct svc_rqst *rqstp)
 	status = xprt->xpt_ops->xpo_sendto(rqstp);
 
 	trace_svc_send(rqstp, status);
-	svc_xprt_release(rqstp);
 }
 
 /*
@@ -1031,7 +1032,6 @@ static void svc_delete_xprt(struct svc_xprt *xprt)
 
 	spin_lock_bh(&serv->sv_lock);
 	list_del_init(&xprt->xpt_list);
-	WARN_ON_ONCE(!list_empty(&xprt->xpt_ready));
 	if (test_bit(XPT_TEMP, &xprt->xpt_flags))
 		serv->sv_tmpcnt--;
 	spin_unlock_bh(&serv->sv_lock);
@@ -1082,36 +1082,26 @@ static int svc_close_list(struct svc_serv *serv, struct list_head *xprt_list, st
 	return ret;
 }
 
-static struct svc_xprt *svc_dequeue_net(struct svc_serv *serv, struct net *net)
+static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net)
 {
-	struct svc_pool *pool;
 	struct svc_xprt *xprt;
-	struct svc_xprt *tmp;
 	int i;
 
 	for (i = 0; i < serv->sv_nrpools; i++) {
-		pool = &serv->sv_pools[i];
-
-		spin_lock_bh(&pool->sp_lock);
-		list_for_each_entry_safe(xprt, tmp, &pool->sp_sockets, xpt_ready) {
-			if (xprt->xpt_net != net)
-				continue;
-			list_del_init(&xprt->xpt_ready);
-			spin_unlock_bh(&pool->sp_lock);
-			return xprt;
+		struct svc_pool *pool = &serv->sv_pools[i];
+		struct llist_node *q, **t1, *t2;
+
+		q = lwq_dequeue_all(&pool->sp_xprts);
+		lwq_for_each_safe(xprt, t1, t2, &q, xpt_ready) {
+			if (xprt->xpt_net == net) {
+				set_bit(XPT_CLOSE, &xprt->xpt_flags);
+				svc_delete_xprt(xprt);
+				xprt = NULL;
+			}
 		}
-		spin_unlock_bh(&pool->sp_lock);
-	}
-	return NULL;
-}
-
-static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net)
-{
-	struct svc_xprt *xprt;
 
-	while ((xprt = svc_dequeue_net(serv, net))) {
-		set_bit(XPT_CLOSE, &xprt->xpt_flags);
-		svc_delete_xprt(xprt);
+		if (q)
+			lwq_enqueue_batch(q, &pool->sp_xprts);
 	}
 }
 
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index e4d84a13c566..8c817e755262 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -263,11 +263,9 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
 	/* Queue rqst for ULP's callback service */
 	bc_serv = xprt->bc_serv;
 	xprt_get(xprt);
-	spin_lock(&bc_serv->sv_cb_lock);
-	list_add(&rqst->rq_bc_list, &bc_serv->sv_cb_list);
-	spin_unlock(&bc_serv->sv_cb_lock);
+	lwq_enqueue(&rqst->rq_bc_list, &bc_serv->sv_cb_list);
 
-	wake_up(&bc_serv->sv_cb_waitq);
+	svc_pool_wake_idle_thread(&bc_serv->sv_pools[0]);
 
 	r_xprt->rx_stats.bcall_count++;
 	return;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 85c8bcaebb80..3b05f90a3e50 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -852,7 +852,8 @@ out_readfail:
 	if (ret == -EINVAL)
 		svc_rdma_send_error(rdma_xprt, ctxt, ret);
 	svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
-	return ret;
+	svc_xprt_deferred_close(xprt);
+	return -ENOTCONN;
 
 out_backchannel:
 	svc_rdma_handle_bc_reply(rqstp, ctxt);
diff --git a/rust/Makefile b/rust/Makefile
index 7dbf9abe0d01..28dd31cf1179 100644
--- a/rust/Makefile
+++ b/rust/Makefile
@@ -336,13 +336,13 @@ quiet_cmd_bindgen = BINDGEN $@
 		$(bindgen_target_cflags) $(bindgen_target_extra)
 
 $(obj)/bindings/bindings_generated.rs: private bindgen_target_flags = \
-    $(shell grep -v '^#\|^$$' $(srctree)/$(src)/bindgen_parameters)
+    $(shell grep -Ev '^#|^$$' $(srctree)/$(src)/bindgen_parameters)
 $(obj)/bindings/bindings_generated.rs: $(src)/bindings/bindings_helper.h \
     $(src)/bindgen_parameters FORCE
 	$(call if_changed_dep,bindgen)
 
 $(obj)/uapi/uapi_generated.rs: private bindgen_target_flags = \
-    $(shell grep -v '^#\|^$$' $(srctree)/$(src)/bindgen_parameters)
+    $(shell grep -Ev '^#|^$$' $(srctree)/$(src)/bindgen_parameters)
 $(obj)/uapi/uapi_generated.rs: $(src)/uapi/uapi_helper.h \
     $(src)/bindgen_parameters FORCE
 	$(call if_changed_dep,bindgen)
@@ -364,9 +364,7 @@ $(obj)/bindings/bindings_helpers_generated.rs: $(src)/helpers.c FORCE
 quiet_cmd_exports = EXPORTS $@
       cmd_exports = \
 	$(NM) -p --defined-only $< \
-		| grep -E ' (T|R|D) ' | cut -d ' ' -f 3 \
-		| xargs -Isymbol \
-		echo 'EXPORT_SYMBOL_RUST_GPL(symbol);' > $@
+		| awk '/ (T|R|D) / {printf "EXPORT_SYMBOL_RUST_GPL(%s);\n",$$3}' > $@
 
 $(obj)/exports_core_generated.h: $(obj)/core.o FORCE
 	$(call if_changed,exports)
diff --git a/rust/alloc/alloc.rs b/rust/alloc/alloc.rs
index 0b6bf5b6da43..8cb4a31cf6e5 100644
--- a/rust/alloc/alloc.rs
+++ b/rust/alloc/alloc.rs
@@ -6,9 +6,7 @@
 
 #[cfg(not(test))]
 use core::intrinsics;
-use core::intrinsics::{min_align_of_val, size_of_val};
 
-use core::ptr::Unique;
 #[cfg(not(test))]
 use core::ptr::{self, NonNull};
 
@@ -40,7 +38,6 @@ extern "Rust" {
     #[rustc_nounwind]
     fn __rust_alloc_zeroed(size: usize, align: usize) -> *mut u8;
 
-    #[cfg(not(bootstrap))]
     static __rust_no_alloc_shim_is_unstable: u8;
 }
 
@@ -98,7 +95,6 @@ pub unsafe fn alloc(layout: Layout) -> *mut u8 {
     unsafe {
         // Make sure we don't accidentally allow omitting the allocator shim in
         // stable code until it is actually stabilized.
-        #[cfg(not(bootstrap))]
         core::ptr::read_volatile(&__rust_no_alloc_shim_is_unstable);
 
         __rust_alloc(layout.size(), layout.align())
@@ -339,22 +335,6 @@ unsafe fn exchange_malloc(size: usize, align: usize) -> *mut u8 {
     }
 }
 
-#[cfg_attr(not(test), lang = "box_free")]
-#[inline]
-// This signature has to be the same as `Box`, otherwise an ICE will happen.
-// When an additional parameter to `Box` is added (like `A: Allocator`), this has to be added here as
-// well.
-// For example if `Box` is changed to  `struct Box<T: ?Sized, A: Allocator>(Unique<T>, A)`,
-// this function has to be changed to `fn box_free<T: ?Sized, A: Allocator>(Unique<T>, A)` as well.
-pub(crate) unsafe fn box_free<T: ?Sized, A: Allocator>(ptr: Unique<T>, alloc: A) {
-    unsafe {
-        let size = size_of_val(ptr.as_ref());
-        let align = min_align_of_val(ptr.as_ref());
-        let layout = Layout::from_size_align_unchecked(size, align);
-        alloc.deallocate(From::from(ptr.cast()), layout)
-    }
-}
-
 // # Allocation error handler
 
 #[cfg(not(no_global_oom_handling))]
@@ -414,7 +394,6 @@ pub mod __alloc_error_handler {
             static __rust_alloc_error_handler_should_panic: u8;
         }
 
-        #[allow(unused_unsafe)]
         if unsafe { __rust_alloc_error_handler_should_panic != 0 } {
             panic!("memory allocation of {size} bytes failed")
         } else {
diff --git a/rust/alloc/boxed.rs b/rust/alloc/boxed.rs
index c8173cea8317..9620eba17268 100644
--- a/rust/alloc/boxed.rs
+++ b/rust/alloc/boxed.rs
@@ -159,12 +159,12 @@ use core::hash::{Hash, Hasher};
 use core::iter::FusedIterator;
 use core::marker::Tuple;
 use core::marker::Unsize;
-use core::mem;
+use core::mem::{self, SizedTypeProperties};
 use core::ops::{
     CoerceUnsized, Deref, DerefMut, DispatchFromDyn, Generator, GeneratorState, Receiver,
 };
 use core::pin::Pin;
-use core::ptr::{self, Unique};
+use core::ptr::{self, NonNull, Unique};
 use core::task::{Context, Poll};
 
 #[cfg(not(no_global_oom_handling))]
@@ -483,8 +483,12 @@ impl<T, A: Allocator> Box<T, A> {
     where
         A: Allocator,
     {
-        let layout = Layout::new::<mem::MaybeUninit<T>>();
-        let ptr = alloc.allocate(layout)?.cast();
+        let ptr = if T::IS_ZST {
+            NonNull::dangling()
+        } else {
+            let layout = Layout::new::<mem::MaybeUninit<T>>();
+            alloc.allocate(layout)?.cast()
+        };
         unsafe { Ok(Box::from_raw_in(ptr.as_ptr(), alloc)) }
     }
 
@@ -553,8 +557,12 @@ impl<T, A: Allocator> Box<T, A> {
     where
         A: Allocator,
     {
-        let layout = Layout::new::<mem::MaybeUninit<T>>();
-        let ptr = alloc.allocate_zeroed(layout)?.cast();
+        let ptr = if T::IS_ZST {
+            NonNull::dangling()
+        } else {
+            let layout = Layout::new::<mem::MaybeUninit<T>>();
+            alloc.allocate_zeroed(layout)?.cast()
+        };
         unsafe { Ok(Box::from_raw_in(ptr.as_ptr(), alloc)) }
     }
 
@@ -679,14 +687,16 @@ impl<T> Box<[T]> {
     #[unstable(feature = "allocator_api", issue = "32838")]
     #[inline]
     pub fn try_new_uninit_slice(len: usize) -> Result<Box<[mem::MaybeUninit<T>]>, AllocError> {
-        unsafe {
+        let ptr = if T::IS_ZST || len == 0 {
+            NonNull::dangling()
+        } else {
             let layout = match Layout::array::<mem::MaybeUninit<T>>(len) {
                 Ok(l) => l,
                 Err(_) => return Err(AllocError),
             };
-            let ptr = Global.allocate(layout)?;
-            Ok(RawVec::from_raw_parts_in(ptr.as_mut_ptr() as *mut _, len, Global).into_box(len))
-        }
+            Global.allocate(layout)?.cast()
+        };
+        unsafe { Ok(RawVec::from_raw_parts_in(ptr.as_ptr(), len, Global).into_box(len)) }
     }
 
     /// Constructs a new boxed slice with uninitialized contents, with the memory
@@ -711,14 +721,16 @@ impl<T> Box<[T]> {
     #[unstable(feature = "allocator_api", issue = "32838")]
     #[inline]
     pub fn try_new_zeroed_slice(len: usize) -> Result<Box<[mem::MaybeUninit<T>]>, AllocError> {
-        unsafe {
+        let ptr = if T::IS_ZST || len == 0 {
+            NonNull::dangling()
+        } else {
             let layout = match Layout::array::<mem::MaybeUninit<T>>(len) {
                 Ok(l) => l,
                 Err(_) => return Err(AllocError),
             };
-            let ptr = Global.allocate_zeroed(layout)?;
-            Ok(RawVec::from_raw_parts_in(ptr.as_mut_ptr() as *mut _, len, Global).into_box(len))
-        }
+            Global.allocate_zeroed(layout)?.cast()
+        };
+        unsafe { Ok(RawVec::from_raw_parts_in(ptr.as_ptr(), len, Global).into_box(len)) }
     }
 }
 
@@ -1215,8 +1227,18 @@ impl<T: ?Sized, A: Allocator> Box<T, A> {
 
 #[stable(feature = "rust1", since = "1.0.0")]
 unsafe impl<#[may_dangle] T: ?Sized, A: Allocator> Drop for Box<T, A> {
+    #[inline]
     fn drop(&mut self) {
-        // FIXME: Do nothing, drop is currently performed by compiler.
+        // the T in the Box is dropped by the compiler before the destructor is run
+
+        let ptr = self.0;
+
+        unsafe {
+            let layout = Layout::for_value_raw(ptr.as_ptr());
+            if layout.size() != 0 {
+                self.1.deallocate(From::from(ptr.cast()), layout);
+            }
+        }
     }
 }
 
@@ -2165,7 +2187,7 @@ impl dyn Error + Send {
         let err: Box<dyn Error> = self;
         <dyn Error>::downcast(err).map_err(|s| unsafe {
             // Reapply the `Send` marker.
-            mem::transmute::<Box<dyn Error>, Box<dyn Error + Send>>(s)
+            Box::from_raw(Box::into_raw(s) as *mut (dyn Error + Send))
         })
     }
 }
@@ -2179,7 +2201,7 @@ impl dyn Error + Send + Sync {
         let err: Box<dyn Error> = self;
         <dyn Error>::downcast(err).map_err(|s| unsafe {
             // Reapply the `Send + Sync` marker.
-            mem::transmute::<Box<dyn Error>, Box<dyn Error + Send + Sync>>(s)
+            Box::from_raw(Box::into_raw(s) as *mut (dyn Error + Send + Sync))
         })
     }
 }
diff --git a/rust/alloc/lib.rs b/rust/alloc/lib.rs
index 85e91356ecb3..73b9ffd845d9 100644
--- a/rust/alloc/lib.rs
+++ b/rust/alloc/lib.rs
@@ -58,6 +58,11 @@
 //! [`Rc`]: rc
 //! [`RefCell`]: core::cell
 
+// To run alloc tests without x.py without ending up with two copies of alloc, Miri needs to be
+// able to "empty" this crate. See <https://github.com/rust-lang/miri-test-libstd/issues/4>.
+// rustc itself never sets the feature, so this line has no effect there.
+#![cfg(any(not(feature = "miri-test-libstd"), test, doctest))]
+//
 #![allow(unused_attributes)]
 #![stable(feature = "alloc", since = "1.36.0")]
 #![doc(
@@ -77,11 +82,6 @@
 ))]
 #![no_std]
 #![needs_allocator]
-// To run alloc tests without x.py without ending up with two copies of alloc, Miri needs to be
-// able to "empty" this crate. See <https://github.com/rust-lang/miri-test-libstd/issues/4>.
-// rustc itself never sets the feature, so this line has no affect there.
-#![cfg(any(not(feature = "miri-test-libstd"), test, doctest))]
-//
 // Lints:
 #![deny(unsafe_op_in_unsafe_fn)]
 #![deny(fuzzy_provenance_casts)]
@@ -90,6 +90,8 @@
 #![warn(missing_docs)]
 #![allow(explicit_outlives_requirements)]
 #![warn(multiple_supertrait_upcastable)]
+#![cfg_attr(not(bootstrap), allow(internal_features))]
+#![cfg_attr(not(bootstrap), allow(rustdoc::redundant_explicit_links))]
 //
 // Library features:
 // tidy-alphabetical-start
@@ -139,7 +141,6 @@
 #![feature(maybe_uninit_uninit_array_transpose)]
 #![feature(pattern)]
 #![feature(pointer_byte_offsets)]
-#![feature(provide_any)]
 #![feature(ptr_internals)]
 #![feature(ptr_metadata)]
 #![feature(ptr_sub_ptr)]
diff --git a/rust/alloc/raw_vec.rs b/rust/alloc/raw_vec.rs
index 65d5ce15828e..a7425582a323 100644
--- a/rust/alloc/raw_vec.rs
+++ b/rust/alloc/raw_vec.rs
@@ -471,16 +471,26 @@ impl<T, A: Allocator> RawVec<T, A> {
         let (ptr, layout) = if let Some(mem) = self.current_memory() { mem } else { return Ok(()) };
         // See current_memory() why this assert is here
         let _: () = const { assert!(mem::size_of::<T>() % mem::align_of::<T>() == 0) };
-        let ptr = unsafe {
-            // `Layout::array` cannot overflow here because it would have
-            // overflowed earlier when capacity was larger.
-            let new_size = mem::size_of::<T>().unchecked_mul(cap);
-            let new_layout = Layout::from_size_align_unchecked(new_size, layout.align());
-            self.alloc
-                .shrink(ptr, layout, new_layout)
-                .map_err(|_| AllocError { layout: new_layout, non_exhaustive: () })?
-        };
-        self.set_ptr_and_cap(ptr, cap);
+
+        // If shrinking to 0, deallocate the buffer. We don't reach this point
+        // for the T::IS_ZST case since current_memory() will have returned
+        // None.
+        if cap == 0 {
+            unsafe { self.alloc.deallocate(ptr, layout) };
+            self.ptr = Unique::dangling();
+            self.cap = 0;
+        } else {
+            let ptr = unsafe {
+                // `Layout::array` cannot overflow here because it would have
+                // overflowed earlier when capacity was larger.
+                let new_size = mem::size_of::<T>().unchecked_mul(cap);
+                let new_layout = Layout::from_size_align_unchecked(new_size, layout.align());
+                self.alloc
+                    .shrink(ptr, layout, new_layout)
+                    .map_err(|_| AllocError { layout: new_layout, non_exhaustive: () })?
+            };
+            self.set_ptr_and_cap(ptr, cap);
+        }
         Ok(())
     }
 }
diff --git a/rust/alloc/vec/drain_filter.rs b/rust/alloc/vec/drain_filter.rs
deleted file mode 100644
index 09efff090e42..000000000000
--- a/rust/alloc/vec/drain_filter.rs
+++ /dev/null
@@ -1,199 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT
-
-use crate::alloc::{Allocator, Global};
-use core::mem::{ManuallyDrop, SizedTypeProperties};
-use core::ptr;
-use core::slice;
-
-use super::Vec;
-
-/// An iterator which uses a closure to determine if an element should be removed.
-///
-/// This struct is created by [`Vec::drain_filter`].
-/// See its documentation for more.
-///
-/// # Example
-///
-/// ```
-/// #![feature(drain_filter)]
-///
-/// let mut v = vec![0, 1, 2];
-/// let iter: std::vec::DrainFilter<'_, _, _> = v.drain_filter(|x| *x % 2 == 0);
-/// ```
-#[unstable(feature = "drain_filter", reason = "recently added", issue = "43244")]
-#[derive(Debug)]
-pub struct DrainFilter<
-    'a,
-    T,
-    F,
-    #[unstable(feature = "allocator_api", issue = "32838")] A: Allocator = Global,
-> where
-    F: FnMut(&mut T) -> bool,
-{
-    pub(super) vec: &'a mut Vec<T, A>,
-    /// The index of the item that will be inspected by the next call to `next`.
-    pub(super) idx: usize,
-    /// The number of items that have been drained (removed) thus far.
-    pub(super) del: usize,
-    /// The original length of `vec` prior to draining.
-    pub(super) old_len: usize,
-    /// The filter test predicate.
-    pub(super) pred: F,
-    /// A flag that indicates a panic has occurred in the filter test predicate.
-    /// This is used as a hint in the drop implementation to prevent consumption
-    /// of the remainder of the `DrainFilter`. Any unprocessed items will be
-    /// backshifted in the `vec`, but no further items will be dropped or
-    /// tested by the filter predicate.
-    pub(super) panic_flag: bool,
-}
-
-impl<T, F, A: Allocator> DrainFilter<'_, T, F, A>
-where
-    F: FnMut(&mut T) -> bool,
-{
-    /// Returns a reference to the underlying allocator.
-    #[unstable(feature = "allocator_api", issue = "32838")]
-    #[inline]
-    pub fn allocator(&self) -> &A {
-        self.vec.allocator()
-    }
-
-    /// Keep unyielded elements in the source `Vec`.
-    ///
-    /// # Examples
-    ///
-    /// ```
-    /// #![feature(drain_filter)]
-    /// #![feature(drain_keep_rest)]
-    ///
-    /// let mut vec = vec!['a', 'b', 'c'];
-    /// let mut drain = vec.drain_filter(|_| true);
-    ///
-    /// assert_eq!(drain.next().unwrap(), 'a');
-    ///
-    /// // This call keeps 'b' and 'c' in the vec.
-    /// drain.keep_rest();
-    ///
-    /// // If we wouldn't call `keep_rest()`,
-    /// // `vec` would be empty.
-    /// assert_eq!(vec, ['b', 'c']);
-    /// ```
-    #[unstable(feature = "drain_keep_rest", issue = "101122")]
-    pub fn keep_rest(self) {
-        // At this moment layout looks like this:
-        //
-        //  _____________________/-- old_len
-        // /                     \
-        // [kept] [yielded] [tail]
-        //        \_______/ ^-- idx
-        //                \-- del
-        //
-        // Normally `Drop` impl would drop [tail] (via .for_each(drop), ie still calling `pred`)
-        //
-        // 1. Move [tail] after [kept]
-        // 2. Update length of the original vec to `old_len - del`
-        //    a. In case of ZST, this is the only thing we want to do
-        // 3. Do *not* drop self, as everything is put in a consistent state already, there is nothing to do
-        let mut this = ManuallyDrop::new(self);
-
-        unsafe {
-            // ZSTs have no identity, so we don't need to move them around.
-            if !T::IS_ZST && this.idx < this.old_len && this.del > 0 {
-                let ptr = this.vec.as_mut_ptr();
-                let src = ptr.add(this.idx);
-                let dst = src.sub(this.del);
-                let tail_len = this.old_len - this.idx;
-                src.copy_to(dst, tail_len);
-            }
-
-            let new_len = this.old_len - this.del;
-            this.vec.set_len(new_len);
-        }
-    }
-}
-
-#[unstable(feature = "drain_filter", reason = "recently added", issue = "43244")]
-impl<T, F, A: Allocator> Iterator for DrainFilter<'_, T, F, A>
-where
-    F: FnMut(&mut T) -> bool,
-{
-    type Item = T;
-
-    fn next(&mut self) -> Option<T> {
-        unsafe {
-            while self.idx < self.old_len {
-                let i = self.idx;
-                let v = slice::from_raw_parts_mut(self.vec.as_mut_ptr(), self.old_len);
-                self.panic_flag = true;
-                let drained = (self.pred)(&mut v[i]);
-                self.panic_flag = false;
-                // Update the index *after* the predicate is called. If the index
-                // is updated prior and the predicate panics, the element at this
-                // index would be leaked.
-                self.idx += 1;
-                if drained {
-                    self.del += 1;
-                    return Some(ptr::read(&v[i]));
-                } else if self.del > 0 {
-                    let del = self.del;
-                    let src: *const T = &v[i];
-                    let dst: *mut T = &mut v[i - del];
-                    ptr::copy_nonoverlapping(src, dst, 1);
-                }
-            }
-            None
-        }
-    }
-
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        (0, Some(self.old_len - self.idx))
-    }
-}
-
-#[unstable(feature = "drain_filter", reason = "recently added", issue = "43244")]
-impl<T, F, A: Allocator> Drop for DrainFilter<'_, T, F, A>
-where
-    F: FnMut(&mut T) -> bool,
-{
-    fn drop(&mut self) {
-        struct BackshiftOnDrop<'a, 'b, T, F, A: Allocator>
-        where
-            F: FnMut(&mut T) -> bool,
-        {
-            drain: &'b mut DrainFilter<'a, T, F, A>,
-        }
-
-        impl<'a, 'b, T, F, A: Allocator> Drop for BackshiftOnDrop<'a, 'b, T, F, A>
-        where
-            F: FnMut(&mut T) -> bool,
-        {
-            fn drop(&mut self) {
-                unsafe {
-                    if self.drain.idx < self.drain.old_len && self.drain.del > 0 {
-                        // This is a pretty messed up state, and there isn't really an
-                        // obviously right thing to do. We don't want to keep trying
-                        // to execute `pred`, so we just backshift all the unprocessed
-                        // elements and tell the vec that they still exist. The backshift
-                        // is required to prevent a double-drop of the last successfully
-                        // drained item prior to a panic in the predicate.
-                        let ptr = self.drain.vec.as_mut_ptr();
-                        let src = ptr.add(self.drain.idx);
-                        let dst = src.sub(self.drain.del);
-                        let tail_len = self.drain.old_len - self.drain.idx;
-                        src.copy_to(dst, tail_len);
-                    }
-                    self.drain.vec.set_len(self.drain.old_len - self.drain.del);
-                }
-            }
-        }
-
-        let backshift = BackshiftOnDrop { drain: self };
-
-        // Attempt to consume any remaining elements if the filter predicate
-        // has not yet panicked. We'll backshift any remaining elements
-        // whether we've already panicked or if the consumption here panics.
-        if !backshift.drain.panic_flag {
-            backshift.drain.for_each(drop);
-        }
-    }
-}
diff --git a/rust/alloc/vec/extract_if.rs b/rust/alloc/vec/extract_if.rs
new file mode 100644
index 000000000000..f314a51d4d3d
--- /dev/null
+++ b/rust/alloc/vec/extract_if.rs
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+use crate::alloc::{Allocator, Global};
+use core::ptr;
+use core::slice;
+
+use super::Vec;
+
+/// An iterator that removes and yields the elements for which a closure returns `true`.
+///
+/// This struct is created by [`Vec::extract_if`].
+/// See its documentation for more.
+///
+/// # Example
+///
+/// ```
+/// #![feature(extract_if)]
+///
+/// let mut v = vec![0, 1, 2];
+/// let iter: std::vec::ExtractIf<'_, _, _> = v.extract_if(|x| *x % 2 == 0);
+/// ```
+#[unstable(feature = "extract_if", reason = "recently added", issue = "43244")]
+#[derive(Debug)]
+#[must_use = "iterators are lazy and do nothing unless consumed"]
+pub struct ExtractIf<
+    'a,
+    T,
+    F,
+    #[unstable(feature = "allocator_api", issue = "32838")] A: Allocator = Global,
+> where
+    F: FnMut(&mut T) -> bool,
+{
+    pub(super) vec: &'a mut Vec<T, A>, // The vector whose elements are being extracted.
+    /// The index of the item that will be inspected by the next call to `next`.
+    pub(super) idx: usize,
+    /// The number of items that have been extracted (removed and yielded) thus far.
+    pub(super) del: usize,
+    /// The original length of `vec` prior to extraction.
+    pub(super) old_len: usize,
+    /// The filter test predicate; an element is extracted when it returns `true`.
+    pub(super) pred: F,
+}
+
+impl<T, F, A: Allocator> ExtractIf<'_, T, F, A>
+where
+    F: FnMut(&mut T) -> bool,
+{
+    /// Returns a reference to the allocator of the `Vec` that is being extracted from.
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    #[inline]
+    pub fn allocator(&self) -> &A {
+        self.vec.allocator()
+    }
+}
+
+#[unstable(feature = "extract_if", reason = "recently added", issue = "43244")]
+impl<T, F, A: Allocator> Iterator for ExtractIf<'_, T, F, A>
+where
+    F: FnMut(&mut T) -> bool,
+{
+    type Item = T;
+
+    fn next(&mut self) -> Option<T> {
+        unsafe {
+            while self.idx < self.old_len {
+                let i = self.idx;
+                let v = slice::from_raw_parts_mut(self.vec.as_mut_ptr(), self.old_len);
+                let drained = (self.pred)(&mut v[i]);
+                // Update the index *after* the predicate is called. If the index
+                // is updated prior and the predicate panics, the element at this
+                // index would be leaked.
+                self.idx += 1;
+                if drained {
+                    self.del += 1;
+                    return Some(ptr::read(&v[i])); // Move the element out of the buffer.
+                } else if self.del > 0 {
+                    let del = self.del; // Kept element: shift it left over the gap.
+                    let src: *const T = &v[i];
+                    let dst: *mut T = &mut v[i - del];
+                    ptr::copy_nonoverlapping(src, dst, 1);
+                }
+            }
+            None // Every element below `old_len` has been inspected.
+        }
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (0, Some(self.old_len - self.idx)) // At most all uninspected elements are yielded.
+    }
+}
+
+#[unstable(feature = "extract_if", reason = "recently added", issue = "43244")]
+impl<T, F, A: Allocator> Drop for ExtractIf<'_, T, F, A>
+where
+    F: FnMut(&mut T) -> bool,
+{
+    fn drop(&mut self) {
+        unsafe {
+            if self.idx < self.old_len && self.del > 0 {
+                // The iterator is being dropped before all elements were inspected
+                // and at least one element has already been extracted. We must not
+                // execute `pred` again, so we just backshift all the unprocessed
+                // elements and tell the vec that they still exist. The backshift
+                // closes the gap of `del` moved-out slots, which prevents a
+                // double-drop of the elements that were already yielded.
+                let ptr = self.vec.as_mut_ptr();
+                let src = ptr.add(self.idx);
+                let dst = src.sub(self.del);
+                let tail_len = self.old_len - self.idx;
+                src.copy_to(dst, tail_len);
+            }
+            self.vec.set_len(self.old_len - self.del); // `del` elements were moved out.
+        }
+    }
+}
diff --git a/rust/alloc/vec/mod.rs b/rust/alloc/vec/mod.rs
index 05c70de0227e..209a88cfe598 100644
--- a/rust/alloc/vec/mod.rs
+++ b/rust/alloc/vec/mod.rs
@@ -74,10 +74,10 @@ use crate::boxed::Box;
 use crate::collections::{TryReserveError, TryReserveErrorKind};
 use crate::raw_vec::RawVec;
 
-#[unstable(feature = "drain_filter", reason = "recently added", issue = "43244")]
-pub use self::drain_filter::DrainFilter;
+#[unstable(feature = "extract_if", reason = "recently added", issue = "43244")]
+pub use self::extract_if::ExtractIf;
 
-mod drain_filter;
+mod extract_if;
 
 #[cfg(not(no_global_oom_handling))]
 #[stable(feature = "vec_splice", since = "1.21.0")]
@@ -216,7 +216,7 @@ mod spec_extend;
 ///
 /// # Indexing
 ///
-/// The `Vec` type allows to access values by index, because it implements the
+/// The `Vec` type allows access to values by index, because it implements the
 /// [`Index`] trait. An example will be more explicit:
 ///
 /// ```
@@ -618,22 +618,20 @@ impl<T> Vec<T> {
     /// Using memory that was allocated elsewhere:
     ///
     /// ```rust
-    /// #![feature(allocator_api)]
-    ///
-    /// use std::alloc::{AllocError, Allocator, Global, Layout};
+    /// use std::alloc::{alloc, Layout};
     ///
     /// fn main() {
     ///     let layout = Layout::array::<u32>(16).expect("overflow cannot happen");
     ///
     ///     let vec = unsafe {
-    ///         let mem = match Global.allocate(layout) {
-    ///             Ok(mem) => mem.cast::<u32>().as_ptr(),
-    ///             Err(AllocError) => return,
-    ///         };
+    ///         let mem = alloc(layout).cast::<u32>();
+    ///         if mem.is_null() {
+    ///             return;
+    ///         }
     ///
     ///         mem.write(1_000_000);
     ///
-    ///         Vec::from_raw_parts_in(mem, 1, 16, Global)
+    ///         Vec::from_raw_parts(mem, 1, 16)
     ///     };
     ///
     ///     assert_eq!(vec, &[1_000_000]);
@@ -876,19 +874,22 @@ impl<T, A: Allocator> Vec<T, A> {
     /// Using memory that was allocated elsewhere:
     ///
     /// ```rust
-    /// use std::alloc::{alloc, Layout};
+    /// #![feature(allocator_api)]
+    ///
+    /// use std::alloc::{AllocError, Allocator, Global, Layout};
     ///
     /// fn main() {
     ///     let layout = Layout::array::<u32>(16).expect("overflow cannot happen");
+    ///
     ///     let vec = unsafe {
-    ///         let mem = alloc(layout).cast::<u32>();
-    ///         if mem.is_null() {
-    ///             return;
-    ///         }
+    ///         let mem = match Global.allocate(layout) {
+    ///             Ok(mem) => mem.cast::<u32>().as_ptr(),
+    ///             Err(AllocError) => return,
+    ///         };
     ///
     ///         mem.write(1_000_000);
     ///
-    ///         Vec::from_raw_parts(mem, 1, 16)
+    ///         Vec::from_raw_parts_in(mem, 1, 16, Global)
     ///     };
     ///
     ///     assert_eq!(vec, &[1_000_000]);
@@ -2507,7 +2508,7 @@ impl<T: Clone, A: Allocator> Vec<T, A> {
         let len = self.len();
 
         if new_len > len {
-            self.extend_with(new_len - len, ExtendElement(value))
+            self.extend_with(new_len - len, value)
         } else {
             self.truncate(new_len);
         }
@@ -2545,7 +2546,7 @@ impl<T: Clone, A: Allocator> Vec<T, A> {
         let len = self.len();
 
         if new_len > len {
-            self.try_extend_with(new_len - len, ExtendElement(value))
+            self.try_extend_with(new_len - len, value)
         } else {
             self.truncate(new_len);
             Ok(())
@@ -2684,26 +2685,10 @@ impl<T, A: Allocator, const N: usize> Vec<[T; N], A> {
     }
 }
 
-// This code generalizes `extend_with_{element,default}`.
-trait ExtendWith<T> {
-    fn next(&mut self) -> T;
-    fn last(self) -> T;
-}
-
-struct ExtendElement<T>(T);
-impl<T: Clone> ExtendWith<T> for ExtendElement<T> {
-    fn next(&mut self) -> T {
-        self.0.clone()
-    }
-    fn last(self) -> T {
-        self.0
-    }
-}
-
-impl<T, A: Allocator> Vec<T, A> {
+impl<T: Clone, A: Allocator> Vec<T, A> {
     #[cfg(not(no_global_oom_handling))]
-    /// Extend the vector by `n` values, using the given generator.
-    fn extend_with<E: ExtendWith<T>>(&mut self, n: usize, mut value: E) {
+    /// Extend the vector by `n` clones of value.
+    fn extend_with(&mut self, n: usize, value: T) {
         self.reserve(n);
 
         unsafe {
@@ -2715,15 +2700,15 @@ impl<T, A: Allocator> Vec<T, A> {
 
             // Write all elements except the last one
             for _ in 1..n {
-                ptr::write(ptr, value.next());
+                ptr::write(ptr, value.clone());
                 ptr = ptr.add(1);
-                // Increment the length in every step in case next() panics
+                // Increment the length in every step in case clone() panics
                 local_len.increment_len(1);
             }
 
             if n > 0 {
                 // We can write the last element directly without cloning needlessly
-                ptr::write(ptr, value.last());
+                ptr::write(ptr, value);
                 local_len.increment_len(1);
             }
 
@@ -2731,8 +2716,8 @@ impl<T, A: Allocator> Vec<T, A> {
         }
     }
 
-    /// Try to extend the vector by `n` values, using the given generator.
-    fn try_extend_with<E: ExtendWith<T>>(&mut self, n: usize, mut value: E) -> Result<(), TryReserveError> {
+    /// Try to extend the vector by `n` clones of value.
+    fn try_extend_with(&mut self, n: usize, value: T) -> Result<(), TryReserveError> {
         self.try_reserve(n)?;
 
         unsafe {
@@ -2744,15 +2729,15 @@ impl<T, A: Allocator> Vec<T, A> {
 
             // Write all elements except the last one
             for _ in 1..n {
-                ptr::write(ptr, value.next());
+                ptr::write(ptr, value.clone());
                 ptr = ptr.add(1);
-                // Increment the length in every step in case next() panics
+                // Increment the length in every step in case clone() panics
                 local_len.increment_len(1);
             }
 
             if n > 0 {
                 // We can write the last element directly without cloning needlessly
-                ptr::write(ptr, value.last());
+                ptr::write(ptr, value);
                 local_len.increment_len(1);
             }
 
@@ -3210,6 +3195,12 @@ impl<T, A: Allocator> Vec<T, A> {
     /// If the closure returns false, the element will remain in the vector and will not be yielded
     /// by the iterator.
     ///
+    /// If the returned `ExtractIf` is not exhausted, e.g. because it is dropped without iterating
+    /// or the iteration short-circuits, then the remaining elements will be retained.
+    /// Use [`retain`] with a negated predicate if you do not need the returned iterator.
+    ///
+    /// [`retain`]: Vec::retain
+    ///
     /// Using this method is equivalent to the following code:
     ///
     /// ```
@@ -3228,10 +3219,10 @@ impl<T, A: Allocator> Vec<T, A> {
     /// # assert_eq!(vec, vec![1, 4, 5]);
     /// ```
     ///
-    /// But `drain_filter` is easier to use. `drain_filter` is also more efficient,
+    /// But `extract_if` is easier to use. `extract_if` is also more efficient,
     /// because it can backshift the elements of the array in bulk.
     ///
-    /// Note that `drain_filter` also lets you mutate every element in the filter closure,
+    /// Note that `extract_if` also lets you mutate every element in the filter closure,
     /// regardless of whether you choose to keep or remove it.
     ///
     /// # Examples
@@ -3239,17 +3230,17 @@ impl<T, A: Allocator> Vec<T, A> {
     /// Splitting an array into evens and odds, reusing the original allocation:
     ///
     /// ```
-    /// #![feature(drain_filter)]
+    /// #![feature(extract_if)]
     /// let mut numbers = vec![1, 2, 3, 4, 5, 6, 8, 9, 11, 13, 14, 15];
     ///
-    /// let evens = numbers.drain_filter(|x| *x % 2 == 0).collect::<Vec<_>>();
+    /// let evens = numbers.extract_if(|x| *x % 2 == 0).collect::<Vec<_>>();
     /// let odds = numbers;
     ///
     /// assert_eq!(evens, vec![2, 4, 6, 8, 14]);
     /// assert_eq!(odds, vec![1, 3, 5, 9, 11, 13, 15]);
     /// ```
-    #[unstable(feature = "drain_filter", reason = "recently added", issue = "43244")]
-    pub fn drain_filter<F>(&mut self, filter: F) -> DrainFilter<'_, T, F, A>
+    #[unstable(feature = "extract_if", reason = "recently added", issue = "43244")]
+    pub fn extract_if<F>(&mut self, filter: F) -> ExtractIf<'_, T, F, A>
     where
         F: FnMut(&mut T) -> bool,
     {
@@ -3260,7 +3251,7 @@ impl<T, A: Allocator> Vec<T, A> {
             self.set_len(0);
         }
 
-        DrainFilter { vec: self, idx: 0, del: 0, old_len, pred: filter, panic_flag: false }
+        ExtractIf { vec: self, idx: 0, del: 0, old_len, pred: filter }
     }
 }
 
@@ -3272,7 +3263,7 @@ impl<T, A: Allocator> Vec<T, A> {
 /// [`copy_from_slice`]: slice::copy_from_slice
 #[cfg(not(no_global_oom_handling))]
 #[stable(feature = "extend_ref", since = "1.2.0")]
-impl<'a, T: Copy + 'a, A: Allocator + 'a> Extend<&'a T> for Vec<T, A> {
+impl<'a, T: Copy + 'a, A: Allocator> Extend<&'a T> for Vec<T, A> {
     fn extend<I: IntoIterator<Item = &'a T>>(&mut self, iter: I) {
         self.spec_extend(iter.into_iter())
     }
@@ -3290,9 +3281,14 @@ impl<'a, T: Copy + 'a, A: Allocator + 'a> Extend<&'a T> for Vec<T, A> {
 
 /// Implements comparison of vectors, [lexicographically](Ord#lexicographical-comparison).
 #[stable(feature = "rust1", since = "1.0.0")]
-impl<T: PartialOrd, A: Allocator> PartialOrd for Vec<T, A> {
+impl<T, A1, A2> PartialOrd<Vec<T, A2>> for Vec<T, A1>
+where
+    T: PartialOrd,
+    A1: Allocator,
+    A2: Allocator,
+{
     #[inline]
-    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+    fn partial_cmp(&self, other: &Vec<T, A2>) -> Option<Ordering> {
         PartialOrd::partial_cmp(&**self, &**other)
     }
 }
diff --git a/rust/alloc/vec/spec_extend.rs b/rust/alloc/vec/spec_extend.rs
index a6a735201e59..ada919537446 100644
--- a/rust/alloc/vec/spec_extend.rs
+++ b/rust/alloc/vec/spec_extend.rs
@@ -77,7 +77,7 @@ impl<T, A: Allocator> TrySpecExtend<T, IntoIter<T>> for Vec<T, A> {
 }
 
 #[cfg(not(no_global_oom_handling))]
-impl<'a, T: 'a, I, A: Allocator + 'a> SpecExtend<&'a T, I> for Vec<T, A>
+impl<'a, T: 'a, I, A: Allocator> SpecExtend<&'a T, I> for Vec<T, A>
 where
     I: Iterator<Item = &'a T>,
     T: Clone,
@@ -87,7 +87,7 @@ where
     }
 }
 
-impl<'a, T: 'a, I, A: Allocator + 'a> TrySpecExtend<&'a T, I> for Vec<T, A>
+impl<'a, T: 'a, I, A: Allocator> TrySpecExtend<&'a T, I> for Vec<T, A>
 where
     I: Iterator<Item = &'a T>,
     T: Clone,
@@ -98,7 +98,7 @@ where
 }
 
 #[cfg(not(no_global_oom_handling))]
-impl<'a, T: 'a, A: Allocator + 'a> SpecExtend<&'a T, slice::Iter<'a, T>> for Vec<T, A>
+impl<'a, T: 'a, A: Allocator> SpecExtend<&'a T, slice::Iter<'a, T>> for Vec<T, A>
 where
     T: Copy,
 {
@@ -108,7 +108,7 @@ where
     }
 }
 
-impl<'a, T: 'a, A: Allocator + 'a> TrySpecExtend<&'a T, slice::Iter<'a, T>> for Vec<T, A>
+impl<'a, T: 'a, A: Allocator> TrySpecExtend<&'a T, slice::Iter<'a, T>> for Vec<T, A>
 where
     T: Copy,
 {
diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h
index c91a3c24f607..85f013ed4ca4 100644
--- a/rust/bindings/bindings_helper.h
+++ b/rust/bindings/bindings_helper.h
@@ -12,6 +12,7 @@
 #include <linux/refcount.h>
 #include <linux/wait.h>
 #include <linux/sched.h>
+#include <linux/workqueue.h>
 
 /* `bindgen` gets confused at certain things. */
 const size_t BINDINGS_ARCH_SLAB_MINALIGN = ARCH_SLAB_MINALIGN;
diff --git a/rust/compiler_builtins.rs b/rust/compiler_builtins.rs
index fb8ac3f211de..bba2922c6ef7 100644
--- a/rust/compiler_builtins.rs
+++ b/rust/compiler_builtins.rs
@@ -19,6 +19,7 @@
 //! [`compiler_builtins`]: https://github.com/rust-lang/compiler-builtins
 //! [`compiler-rt`]: https://compiler-rt.llvm.org/
 
+#![allow(internal_features)]
 #![feature(compiler_builtins)]
 #![compiler_builtins]
 #![no_builtins]
diff --git a/rust/helpers.c b/rust/helpers.c
index 4c86fe4a7e05..70e59efd92bc 100644
--- a/rust/helpers.c
+++ b/rust/helpers.c
@@ -30,6 +30,7 @@
 #include <linux/sched/signal.h>
 #include <linux/spinlock.h>
 #include <linux/wait.h>
+#include <linux/workqueue.h>
 
 __noreturn void rust_helper_BUG(void)
 {
@@ -144,6 +145,18 @@ struct kunit *rust_helper_kunit_get_current_test(void)
 }
 EXPORT_SYMBOL_GPL(rust_helper_kunit_get_current_test);
 
+void rust_helper_init_work_with_key(struct work_struct *work, work_func_t func,
+				    bool onstack, const char *name,
+				    struct lock_class_key *key)
+{ /* Field-by-field init of @work; lockdep name/key come from the caller. */
+	__init_work(work, onstack); /* NOTE(review): same steps as __INIT_WORK()? confirm */
+	work->data = (atomic_long_t)WORK_DATA_INIT();
+	lockdep_init_map(&work->lockdep_map, name, key, 0);
+	INIT_LIST_HEAD(&work->entry);
+	work->func = func; /* callback recorded in the work item */
+}
+EXPORT_SYMBOL_GPL(rust_helper_init_work_with_key);
+
 /*
  * `bindgen` binds the C `size_t` type as the Rust `usize` type, so we can
  * use it in contexts where Rust expects a `usize` like slice (array) indices.
diff --git a/rust/kernel/init.rs b/rust/kernel/init.rs
index 4ebb6f23fc2e..65be9ae57b80 100644
--- a/rust/kernel/init.rs
+++ b/rust/kernel/init.rs
@@ -35,7 +35,7 @@
 //! that you need to write `<-` instead of `:` for fields that you want to initialize in-place.
 //!
 //! ```rust
-//! # #![allow(clippy::disallowed_names, clippy::new_ret_no_self)]
+//! # #![allow(clippy::disallowed_names)]
 //! use kernel::{prelude::*, sync::Mutex, new_mutex};
 //! # use core::pin::Pin;
 //! #[pin_data]
@@ -55,7 +55,7 @@
 //! (or just the stack) to actually initialize a `Foo`:
 //!
 //! ```rust
-//! # #![allow(clippy::disallowed_names, clippy::new_ret_no_self)]
+//! # #![allow(clippy::disallowed_names)]
 //! # use kernel::{prelude::*, sync::Mutex, new_mutex};
 //! # use core::pin::Pin;
 //! # #[pin_data]
@@ -86,7 +86,7 @@
 //! To declare an init macro/function you just return an [`impl PinInit<T, E>`]:
 //!
 //! ```rust
-//! # #![allow(clippy::disallowed_names, clippy::new_ret_no_self)]
+//! # #![allow(clippy::disallowed_names)]
 //! # use kernel::{sync::Mutex, prelude::*, new_mutex, init::PinInit, try_pin_init};
 //! #[pin_data]
 //! struct DriverData {
@@ -236,7 +236,7 @@ pub mod macros;
 /// # Examples
 ///
 /// ```rust
-/// # #![allow(clippy::disallowed_names, clippy::new_ret_no_self)]
+/// # #![allow(clippy::disallowed_names)]
 /// # use kernel::{init, macros::pin_data, pin_init, stack_pin_init, init::*, sync::Mutex, new_mutex};
 /// # use core::pin::Pin;
 /// #[pin_data]
@@ -288,7 +288,7 @@ macro_rules! stack_pin_init {
 /// # Examples
 ///
 /// ```rust,ignore
-/// # #![allow(clippy::disallowed_names, clippy::new_ret_no_self)]
+/// # #![allow(clippy::disallowed_names)]
 /// # use kernel::{init, pin_init, stack_try_pin_init, init::*, sync::Mutex, new_mutex};
 /// # use macros::pin_data;
 /// # use core::{alloc::AllocError, pin::Pin};
@@ -314,7 +314,7 @@ macro_rules! stack_pin_init {
 /// ```
 ///
 /// ```rust,ignore
-/// # #![allow(clippy::disallowed_names, clippy::new_ret_no_self)]
+/// # #![allow(clippy::disallowed_names)]
 /// # use kernel::{init, pin_init, stack_try_pin_init, init::*, sync::Mutex, new_mutex};
 /// # use macros::pin_data;
 /// # use core::{alloc::AllocError, pin::Pin};
@@ -366,7 +366,7 @@ macro_rules! stack_try_pin_init {
 /// The syntax is almost identical to that of a normal `struct` initializer:
 ///
 /// ```rust
-/// # #![allow(clippy::disallowed_names, clippy::new_ret_no_self)]
+/// # #![allow(clippy::disallowed_names)]
 /// # use kernel::{init, pin_init, macros::pin_data, init::*};
 /// # use core::pin::Pin;
 /// #[pin_data]
@@ -411,7 +411,7 @@ macro_rules! stack_try_pin_init {
 /// To create an initializer function, simply declare it like this:
 ///
 /// ```rust
-/// # #![allow(clippy::disallowed_names, clippy::new_ret_no_self)]
+/// # #![allow(clippy::disallowed_names)]
 /// # use kernel::{init, pin_init, prelude::*, init::*};
 /// # use core::pin::Pin;
 /// # #[pin_data]
@@ -438,7 +438,7 @@ macro_rules! stack_try_pin_init {
 /// Users of `Foo` can now create it like this:
 ///
 /// ```rust
-/// # #![allow(clippy::disallowed_names, clippy::new_ret_no_self)]
+/// # #![allow(clippy::disallowed_names)]
 /// # use kernel::{init, pin_init, macros::pin_data, init::*};
 /// # use core::pin::Pin;
 /// # #[pin_data]
@@ -466,7 +466,7 @@ macro_rules! stack_try_pin_init {
 /// They can also easily embed it into their own `struct`s:
 ///
 /// ```rust
-/// # #![allow(clippy::disallowed_names, clippy::new_ret_no_self)]
+/// # #![allow(clippy::disallowed_names)]
 /// # use kernel::{init, pin_init, macros::pin_data, init::*};
 /// # use core::pin::Pin;
 /// # #[pin_data]
diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs
index e8811700239a..e6aff80b521f 100644
--- a/rust/kernel/lib.rs
+++ b/rust/kernel/lib.rs
@@ -16,6 +16,8 @@
 #![feature(coerce_unsized)]
 #![feature(dispatch_from_dyn)]
 #![feature(new_uninit)]
+#![feature(offset_of)]
+#![feature(ptr_metadata)]
 #![feature(receiver_trait)]
 #![feature(unsize)]
 
@@ -45,6 +47,7 @@ pub mod str;
 pub mod sync;
 pub mod task;
 pub mod types;
+pub mod workqueue;
 
 #[doc(hidden)]
 pub use bindings;
diff --git a/rust/kernel/print.rs b/rust/kernel/print.rs
index 8009184bf6d7..f48926e3e9fe 100644
--- a/rust/kernel/print.rs
+++ b/rust/kernel/print.rs
@@ -399,6 +399,7 @@ macro_rules! pr_debug (
 /// Mimics the interface of [`std::print!`]. See [`core::fmt`] and
 /// `alloc::format!` for information about the formatting syntax.
 ///
+/// [`pr_info!`]: crate::pr_info!
 /// [`pr_cont`]: https://www.kernel.org/doc/html/latest/core-api/printk-basics.html#c.pr_cont
 /// [`std::print!`]: https://doc.rust-lang.org/std/macro.print.html
 ///
diff --git a/rust/kernel/sync/arc.rs b/rust/kernel/sync/arc.rs
index 3d496391a9bd..77cdbcf7bd2e 100644
--- a/rust/kernel/sync/arc.rs
+++ b/rust/kernel/sync/arc.rs
@@ -24,13 +24,13 @@ use crate::{
 };
 use alloc::boxed::Box;
 use core::{
-    alloc::AllocError,
+    alloc::{AllocError, Layout},
     fmt,
     marker::{PhantomData, Unsize},
     mem::{ManuallyDrop, MaybeUninit},
     ops::{Deref, DerefMut},
     pin::Pin,
-    ptr::NonNull,
+    ptr::{NonNull, Pointee},
 };
 use macros::pin_data;
 
@@ -215,6 +215,48 @@ impl<T: ?Sized> Arc<T> {
         }
     }
 
+    /// Convert the [`Arc`] into a raw pointer to the value it holds.
+    ///
+    /// The raw pointer takes over the refcount that this [`Arc`] owned; see [`Arc::from_raw`].
+    pub fn into_raw(self) -> *const T {
+        let ptr = self.ptr.as_ptr();
+        core::mem::forget(self); // Skip `Drop`: the refcount stays with the raw pointer.
+        // SAFETY: `ptr` was read from `self.ptr` above, so the pointer is valid.
+        unsafe { core::ptr::addr_of!((*ptr).data) }
+    }
+
+    /// Recreates an [`Arc`] instance previously deconstructed via [`Arc::into_raw`].
+    ///
+    /// # Safety
+    ///
+    /// `ptr` must have been returned by a previous call to [`Arc::into_raw`]. Additionally,
+    /// [`Arc::from_raw`] must be called at most once for each such call to [`Arc::into_raw`].
+    pub unsafe fn from_raw(ptr: *const T) -> Self {
+        let refcount_layout = Layout::new::<bindings::refcount_t>();
+        // SAFETY: The caller guarantees that the pointer is valid.
+        let val_layout = Layout::for_value(unsafe { &*ptr });
+        // SAFETY: We're computing the layout of a real struct that existed when compiling this
+        // binary, so its layout is not so large that it can trigger arithmetic overflow.
+        let val_offset = unsafe { refcount_layout.extend(val_layout).unwrap_unchecked().1 };
+
+        let metadata: <T as Pointee>::Metadata = core::ptr::metadata(ptr);
+        // SAFETY: The metadata of `T` and `ArcInner<T>` is the same because `ArcInner` is a struct
+        // with `T` as its last field.
+        //
+        // This is documented at:
+        // <https://doc.rust-lang.org/std/ptr/trait.Pointee.html>.
+        let metadata: <ArcInner<T> as Pointee>::Metadata =
+            unsafe { core::mem::transmute_copy(&metadata) };
+        // SAFETY: The pointer is in-bounds of an allocation both before and after offsetting the
+        // pointer, since it originates from a previous call to `Arc::into_raw` and is still valid.
+        let ptr = unsafe { (ptr as *mut u8).sub(val_offset) as *mut () };
+        let ptr = core::ptr::from_raw_parts_mut(ptr, metadata); // Now an `ArcInner<T>` pointer.
+
+        // SAFETY: By the safety requirements we know that `ptr` came from `Arc::into_raw`, so the
+        // reference count held then will be owned by the new `Arc` object.
+        unsafe { Self::from_inner(NonNull::new_unchecked(ptr)) }
+    }
+
     /// Returns an [`ArcBorrow`] from the given [`Arc`].
     ///
     /// This is useful when the argument of a function call is an [`ArcBorrow`] (e.g., in a method
@@ -302,7 +344,7 @@ impl<T: ?Sized> Drop for Arc<T> {
             // The count reached zero, we must free the memory.
             //
             // SAFETY: The pointer was initialised from the result of `Box::leak`.
-            unsafe { Box::from_raw(self.ptr.as_ptr()) };
+            unsafe { drop(Box::from_raw(self.ptr.as_ptr())) };
         }
     }
 }
diff --git a/rust/kernel/sync/condvar.rs b/rust/kernel/sync/condvar.rs
index ed353399c4e5..b679b6f6dbeb 100644
--- a/rust/kernel/sync/condvar.rs
+++ b/rust/kernel/sync/condvar.rs
@@ -91,7 +91,6 @@ unsafe impl Sync for CondVar {}
 
 impl CondVar {
     /// Constructs a new condvar initialiser.
-    #[allow(clippy::new_ret_no_self)]
     pub fn new(name: &'static CStr, key: &'static LockClassKey) -> impl PinInit<Self> {
         pin_init!(Self {
             _pin: PhantomPinned,
diff --git a/rust/kernel/sync/lock.rs b/rust/kernel/sync/lock.rs
index 70a785f04754..f12a684bc957 100644
--- a/rust/kernel/sync/lock.rs
+++ b/rust/kernel/sync/lock.rs
@@ -99,7 +99,6 @@ unsafe impl<T: ?Sized + Send, B: Backend> Sync for Lock<T, B> {}
 
 impl<T, B: Backend> Lock<T, B> {
     /// Constructs a new lock initialiser.
-    #[allow(clippy::new_ret_no_self)]
     pub fn new(t: T, name: &'static CStr, key: &'static LockClassKey) -> impl PinInit<Self> {
         pin_init!(Self {
             data: UnsafeCell::new(t),
diff --git a/rust/kernel/task.rs b/rust/kernel/task.rs
index 7eda15e5f1b3..b2299bc7ac1f 100644
--- a/rust/kernel/task.rs
+++ b/rust/kernel/task.rs
@@ -82,7 +82,7 @@ impl Task {
     /// Returns a task reference for the currently executing task/thread.
     ///
     /// The recommended way to get the current task/thread is to use the
-    /// [`current`](crate::current) macro because it is safe.
+    /// [`current`] macro because it is safe.
     ///
     /// # Safety
     ///
diff --git a/rust/kernel/workqueue.rs b/rust/kernel/workqueue.rs
new file mode 100644
index 000000000000..b67fb1ba168e
--- /dev/null
+++ b/rust/kernel/workqueue.rs
@@ -0,0 +1,679 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Work queues.
+//!
+//! This file has two components: The raw work item API, and the safe work item API.
+//!
+//! One pattern that is used in both APIs is the `ID` const generic, which exists to allow a single
+//! type to define multiple `work_struct` fields. This is done by choosing an id for each field,
+//! and using that id to specify which field you wish to use. (The actual value doesn't matter, as
+//! long as you use different values for different fields of the same struct.) Since these IDs are
+//! generic, they are used only at compile-time, so they shouldn't exist in the final binary.
+//!
+//! # The raw API
+//!
+//! The raw API consists of the `RawWorkItem` trait, where the work item needs to provide an
+//! arbitrary function that knows how to enqueue the work item. It should usually not be used
+//! directly, but if you want to, you can use it without using the pieces from the safe API.
+//!
+//! # The safe API
+//!
+//! The safe API is used via the `Work` struct and `WorkItem` traits. Furthermore, it also includes
+//! a trait called `WorkItemPointer`, which is usually not used directly by the user.
+//!
+//!  * The `Work` struct is the Rust wrapper for the C `work_struct` type.
+//!  * The `WorkItem` trait is implemented for structs that can be enqueued to a workqueue.
+//!  * The `WorkItemPointer` trait is implemented for the pointer type that points at something
+//!    that implements `WorkItem`.
+//!
+//! ## Example
+//!
+//! This example defines a struct that holds an integer and can be scheduled on the workqueue. When
+//! the struct is executed, it will print the integer. Since there is only one `work_struct` field,
+//! we do not need to specify ids for the fields.
+//!
+//! ```
+//! use kernel::prelude::*;
+//! use kernel::sync::Arc;
+//! use kernel::workqueue::{self, Work, WorkItem};
+//! use kernel::{impl_has_work, new_work};
+//!
+//! #[pin_data]
+//! struct MyStruct {
+//!     value: i32,
+//!     #[pin]
+//!     work: Work<MyStruct>,
+//! }
+//!
+//! impl_has_work! {
+//!     impl HasWork<Self> for MyStruct { self.work }
+//! }
+//!
+//! impl MyStruct {
+//!     fn new(value: i32) -> Result<Arc<Self>> {
+//!         Arc::pin_init(pin_init!(MyStruct {
+//!             value,
+//!             work <- new_work!("MyStruct::work"),
+//!         }))
+//!     }
+//! }
+//!
+//! impl WorkItem for MyStruct {
+//!     type Pointer = Arc<MyStruct>;
+//!
+//!     fn run(this: Arc<MyStruct>) {
+//!         pr_info!("The value is: {}", this.value);
+//!     }
+//! }
+//!
+//! /// This method will enqueue the struct for execution on the system workqueue, where its value
+//! /// will be printed.
+//! fn print_later(val: Arc<MyStruct>) {
+//!     let _ = workqueue::system().enqueue(val);
+//! }
+//! ```
+//!
+//! The following example shows how multiple `work_struct` fields can be used:
+//!
+//! ```
+//! use kernel::prelude::*;
+//! use kernel::sync::Arc;
+//! use kernel::workqueue::{self, Work, WorkItem};
+//! use kernel::{impl_has_work, new_work};
+//!
+//! #[pin_data]
+//! struct MyStruct {
+//!     value_1: i32,
+//!     value_2: i32,
+//!     #[pin]
+//!     work_1: Work<MyStruct, 1>,
+//!     #[pin]
+//!     work_2: Work<MyStruct, 2>,
+//! }
+//!
+//! impl_has_work! {
+//!     impl HasWork<Self, 1> for MyStruct { self.work_1 }
+//!     impl HasWork<Self, 2> for MyStruct { self.work_2 }
+//! }
+//!
+//! impl MyStruct {
+//!     fn new(value_1: i32, value_2: i32) -> Result<Arc<Self>> {
+//!         Arc::pin_init(pin_init!(MyStruct {
+//!             value_1,
+//!             value_2,
+//!             work_1 <- new_work!("MyStruct::work_1"),
+//!             work_2 <- new_work!("MyStruct::work_2"),
+//!         }))
+//!     }
+//! }
+//!
+//! impl WorkItem<1> for MyStruct {
+//!     type Pointer = Arc<MyStruct>;
+//!
+//!     fn run(this: Arc<MyStruct>) {
+//!         pr_info!("The value is: {}", this.value_1);
+//!     }
+//! }
+//!
+//! impl WorkItem<2> for MyStruct {
+//!     type Pointer = Arc<MyStruct>;
+//!
+//!     fn run(this: Arc<MyStruct>) {
+//!         pr_info!("The second value is: {}", this.value_2);
+//!     }
+//! }
+//!
+//! fn print_1_later(val: Arc<MyStruct>) {
+//!     let _ = workqueue::system().enqueue::<Arc<MyStruct>, 1>(val);
+//! }
+//!
+//! fn print_2_later(val: Arc<MyStruct>) {
+//!     let _ = workqueue::system().enqueue::<Arc<MyStruct>, 2>(val);
+//! }
+//! ```
+//!
+//! C header: [`include/linux/workqueue.h`](../../../../include/linux/workqueue.h)
+
+use crate::{bindings, prelude::*, sync::Arc, sync::LockClassKey, types::Opaque};
+use alloc::alloc::AllocError;
+use alloc::boxed::Box;
+use core::marker::PhantomData;
+use core::pin::Pin;
+
+/// Creates a [`Work`] initialiser with the given name and a newly-created lock class.
+#[macro_export]
+macro_rules! new_work {
+    ($($name:literal)?) => {
+        $crate::workqueue::Work::new($crate::optional_name!($($name)?), $crate::static_lock_class!())
+    };
+}
+
+/// A kernel work queue.
+///
+/// Wraps the kernel's C `struct workqueue_struct`.
+///
+/// It allows work items to be queued to run on thread pools managed by the kernel. Several are
+/// always available, for example, `system`, `system_highpri`, `system_long`, etc.
+#[repr(transparent)]
+pub struct Queue(Opaque<bindings::workqueue_struct>);
+
+// SAFETY: Accesses to workqueues used by [`Queue`] are thread-safe.
+unsafe impl Send for Queue {}
+// SAFETY: Accesses to workqueues used by [`Queue`] are thread-safe.
+unsafe impl Sync for Queue {}
+
+impl Queue {
+    /// Use the provided `struct workqueue_struct` with Rust.
+    ///
+    /// # Safety
+    ///
+    /// The caller must ensure that the provided raw pointer is not dangling, that it points at a
+    /// valid workqueue, and that it remains valid until the end of `'a`.
+    pub unsafe fn from_raw<'a>(ptr: *const bindings::workqueue_struct) -> &'a Queue {
+        // SAFETY: The `Queue` type is `#[repr(transparent)]`, so the pointer cast is valid. The
+        // caller promises that the pointer is not dangling.
+        unsafe { &*(ptr as *const Queue) }
+    }
+
+    /// Enqueues a work item.
+    ///
+    /// This may fail if the work item is already enqueued in a workqueue.
+    ///
+    /// The work item will be submitted using `WORK_CPU_UNBOUND`.
+    pub fn enqueue<W, const ID: u64>(&self, w: W) -> W::EnqueueOutput
+    where
+        W: RawWorkItem<ID> + Send + 'static,
+    {
+        let queue_ptr = self.0.get();
+
+        // SAFETY: We only return `false` if the `work_struct` is already in a workqueue. The other
+        // `__enqueue` requirements are not relevant since `W` is `Send` and static.
+        //
+        // The call to `bindings::queue_work_on` will dereference the provided raw pointer, which
+        // is ok because `__enqueue` guarantees that the pointer is valid for the duration of this
+        // closure.
+        //
+        // Furthermore, if the C workqueue code accesses the pointer after this call to
+        // `__enqueue`, then the work item was successfully enqueued, and `bindings::queue_work_on`
+        // will have returned true. In this case, `__enqueue` promises that the raw pointer will
+        // stay valid until we call the function pointer in the `work_struct`, so the access is ok.
+        unsafe {
+            w.__enqueue(move |work_ptr| {
+                bindings::queue_work_on(bindings::WORK_CPU_UNBOUND as _, queue_ptr, work_ptr)
+            })
+        }
+    }
+
+    /// Tries to spawn the given function or closure as a work item.
+    ///
+    /// This method can fail because it allocates memory to store the work item.
+    pub fn try_spawn<T: 'static + Send + FnOnce()>(&self, func: T) -> Result<(), AllocError> {
+        let init = pin_init!(ClosureWork {
+            work <- new_work!("Queue::try_spawn"),
+            func: Some(func),
+        });
+
+        self.enqueue(Box::pin_init(init).map_err(|_| AllocError)?);
+        Ok(())
+    }
+}
+
+/// A helper type used in `try_spawn`.
+#[pin_data]
+struct ClosureWork<T> {
+    #[pin]
+    work: Work<ClosureWork<T>>,
+    func: Option<T>,
+}
+
+impl<T> ClosureWork<T> {
+    fn project(self: Pin<&mut Self>) -> &mut Option<T> {
+        // SAFETY: The `func` field is not structurally pinned.
+        unsafe { &mut self.get_unchecked_mut().func }
+    }
+}
+
+impl<T: FnOnce()> WorkItem for ClosureWork<T> {
+    type Pointer = Pin<Box<Self>>;
+
+    fn run(mut this: Pin<Box<Self>>) {
+        if let Some(func) = this.as_mut().project().take() {
+            (func)()
+        }
+    }
+}
+
+/// A raw work item.
+///
+/// This is the low-level trait that is designed for being as general as possible.
+///
+/// The `ID` parameter to this trait exists so that a single type can provide multiple
+/// implementations of this trait. For example, if a struct has multiple `work_struct` fields, then
+/// you will implement this trait once for each field, using a different id for each field. The
+/// actual value of the id is not important as long as you use different ids for different fields
+/// of the same struct. (Fields of different structs need not use different ids.)
+///
+/// Note that the id is used only to select the right method to call during compilation. It won't
+/// be part of the final executable.
+///
+/// # Safety
+///
+/// Implementers must ensure that any pointers passed to a `queue_work_on` closure by `__enqueue`
+/// remain valid for the duration specified in the guarantees section of the documentation for
+/// `__enqueue`.
+pub unsafe trait RawWorkItem<const ID: u64> {
+    /// The return type of [`Queue::enqueue`].
+    type EnqueueOutput;
+
+    /// Enqueues this work item on a queue using the provided `queue_work_on` method.
+    ///
+    /// # Guarantees
+    ///
+    /// If this method calls the provided closure, then the raw pointer is guaranteed to point at a
+    /// valid `work_struct` for the duration of the call to the closure. If the closure returns
+    /// true, then it is further guaranteed that the pointer remains valid until someone calls the
+    /// function pointer stored in the `work_struct`.
+    ///
+    /// # Safety
+    ///
+    /// The provided closure may only return `false` if the `work_struct` is already in a workqueue.
+    ///
+    /// If the work item type is annotated with any lifetimes, then you must not call the function
+    /// pointer after any such lifetime expires. (Never calling the function pointer is okay.)
+    ///
+    /// If the work item type is not [`Send`], then the function pointer must be called on the same
+    /// thread as the call to `__enqueue`.
+    unsafe fn __enqueue<F>(self, queue_work_on: F) -> Self::EnqueueOutput
+    where
+        F: FnOnce(*mut bindings::work_struct) -> bool;
+}
+
+/// Defines the method that should be called directly when a work item is executed.
+///
+/// This trait is implemented by `Pin<Box<T>>` and `Arc<T>`, and is mainly intended to be
+/// implemented for smart pointer types. For your own structs, you would implement [`WorkItem`]
+/// instead. The `run` method on this trait will usually just perform the appropriate
+/// `container_of` translation and then call into the `run` method from the [`WorkItem`] trait.
+///
+/// This trait is used when the `work_struct` field is defined using the [`Work`] helper.
+///
+/// # Safety
+///
+/// Implementers must ensure that [`__enqueue`] uses a `work_struct` initialized with the [`run`]
+/// method of this trait as the function pointer.
+///
+/// [`__enqueue`]: RawWorkItem::__enqueue
+/// [`run`]: WorkItemPointer::run
+pub unsafe trait WorkItemPointer<const ID: u64>: RawWorkItem<ID> {
+    /// Run this work item.
+    ///
+    /// # Safety
+    ///
+    /// The provided `work_struct` pointer must originate from a previous call to `__enqueue` where
+    /// the `queue_work_on` closure returned true, and the pointer must still be valid.
+    unsafe extern "C" fn run(ptr: *mut bindings::work_struct);
+}
+
+/// Defines the method that should be called when this work item is executed.
+///
+/// This trait is used when the `work_struct` field is defined using the [`Work`] helper.
+pub trait WorkItem<const ID: u64 = 0> {
+    /// The pointer type that this struct is wrapped in. This will typically be `Arc<Self>` or
+    /// `Pin<Box<Self>>`.
+    type Pointer: WorkItemPointer<ID>;
+
+    /// The method that should be called when this work item is executed.
+    fn run(this: Self::Pointer);
+}
+
+/// Links for a work item.
+///
+/// This struct contains a function pointer to the `run` function from the [`WorkItemPointer`]
+/// trait, and defines the linked list pointers necessary to enqueue a work item in a workqueue.
+///
+/// Wraps the kernel's C `struct work_struct`.
+///
+/// This is a helper type used to associate a `work_struct` with the [`WorkItem`] that uses it.
+#[repr(transparent)]
+pub struct Work<T: ?Sized, const ID: u64 = 0> {
+    work: Opaque<bindings::work_struct>,
+    _inner: PhantomData<T>,
+}
+
+// SAFETY: Kernel work items are usable from any thread.
+//
+// We do not need to constrain `T` since the work item does not actually contain a `T`.
+unsafe impl<T: ?Sized, const ID: u64> Send for Work<T, ID> {}
+// SAFETY: Kernel work items are usable from any thread.
+//
+// We do not need to constrain `T` since the work item does not actually contain a `T`.
+unsafe impl<T: ?Sized, const ID: u64> Sync for Work<T, ID> {}
+
+impl<T: ?Sized, const ID: u64> Work<T, ID> {
+    /// Creates a new instance of [`Work`].
+    #[inline]
+    #[allow(clippy::new_ret_no_self)]
+    pub fn new(name: &'static CStr, key: &'static LockClassKey) -> impl PinInit<Self>
+    where
+        T: WorkItem<ID>,
+    {
+        // SAFETY: The `WorkItemPointer` implementation promises that `run` can be used as the work
+        // item function.
+        unsafe {
+            kernel::init::pin_init_from_closure(move |slot| {
+                let slot = Self::raw_get(slot);
+                bindings::init_work_with_key(
+                    slot,
+                    Some(T::Pointer::run),
+                    false,
+                    name.as_char_ptr(),
+                    key.as_ptr(),
+                );
+                Ok(())
+            })
+        }
+    }
+
+    /// Get a pointer to the inner `work_struct`.
+    ///
+    /// # Safety
+    ///
+    /// The provided pointer must not be dangling and must be properly aligned. (But the memory
+    /// need not be initialized.)
+    #[inline]
+    pub unsafe fn raw_get(ptr: *const Self) -> *mut bindings::work_struct {
+        // SAFETY: The caller promises that the pointer is aligned and not dangling.
+        //
+        // A pointer cast would also be ok due to `#[repr(transparent)]`. We use `addr_of!` so that
+        // the compiler does not complain that the `work` field is unused.
+        unsafe { Opaque::raw_get(core::ptr::addr_of!((*ptr).work)) }
+    }
+}
+
+/// Declares that a type has a [`Work<T, ID>`] field.
+///
+/// The intended way of using this trait is via the [`impl_has_work!`] macro. You can use the macro
+/// like this:
+///
+/// ```no_run
+/// use kernel::impl_has_work;
+/// use kernel::prelude::*;
+/// use kernel::workqueue::Work;
+///
+/// struct MyWorkItem {
+///     work_field: Work<MyWorkItem, 1>,
+/// }
+///
+/// impl_has_work! {
+///     impl HasWork<MyWorkItem, 1> for MyWorkItem { self.work_field }
+/// }
+/// ```
+///
+/// Note that since the `Work` type is annotated with an id, you can have several `work_struct`
+/// fields by using a different id for each one.
+///
+/// # Safety
+///
+/// The [`OFFSET`] constant must be the offset of a field in `Self` of type [`Work<T, ID>`]. The
+/// methods on this trait must have exactly the behavior that the definitions given below have.
+///
+/// [`Work<T, ID>`]: Work
+/// [`impl_has_work!`]: crate::impl_has_work
+/// [`OFFSET`]: HasWork::OFFSET
+pub unsafe trait HasWork<T, const ID: u64 = 0> {
+    /// The offset of the [`Work<T, ID>`] field.
+    ///
+    /// [`Work<T, ID>`]: Work
+    const OFFSET: usize;
+
+    /// Returns the offset of the [`Work<T, ID>`] field.
+    ///
+    /// This method exists because the [`OFFSET`] constant cannot be accessed if the type is not Sized.
+    ///
+    /// [`Work<T, ID>`]: Work
+    /// [`OFFSET`]: HasWork::OFFSET
+    #[inline]
+    fn get_work_offset(&self) -> usize {
+        Self::OFFSET
+    }
+
+    /// Returns a pointer to the [`Work<T, ID>`] field.
+    ///
+    /// # Safety
+    ///
+    /// The provided pointer must point at a valid struct of type `Self`.
+    ///
+    /// [`Work<T, ID>`]: Work
+    #[inline]
+    unsafe fn raw_get_work(ptr: *mut Self) -> *mut Work<T, ID> {
+        // SAFETY: The caller promises that the pointer is valid.
+        unsafe { (ptr as *mut u8).add(Self::OFFSET) as *mut Work<T, ID> }
+    }
+
+    /// Returns a pointer to the struct containing the [`Work<T, ID>`] field.
+    ///
+    /// # Safety
+    ///
+    /// The pointer must point at a [`Work<T, ID>`] field in a struct of type `Self`.
+    ///
+    /// [`Work<T, ID>`]: Work
+    #[inline]
+    unsafe fn work_container_of(ptr: *mut Work<T, ID>) -> *mut Self
+    where
+        Self: Sized,
+    {
+        // SAFETY: The caller promises that the pointer points at a field of the right type in the
+        // right kind of struct.
+        unsafe { (ptr as *mut u8).sub(Self::OFFSET) as *mut Self }
+    }
+}
+
+/// Used to safely implement the [`HasWork<T, ID>`] trait.
+///
+/// # Examples
+///
+/// ```
+/// use kernel::impl_has_work;
+/// use kernel::sync::Arc;
+/// use kernel::workqueue::{self, Work};
+///
+/// struct MyStruct {
+///     work_field: Work<MyStruct, 17>,
+/// }
+///
+/// impl_has_work! {
+///     impl HasWork<MyStruct, 17> for MyStruct { self.work_field }
+/// }
+/// ```
+///
+/// [`HasWork<T, ID>`]: HasWork
+#[macro_export]
+macro_rules! impl_has_work {
+    ($(impl$(<$($implarg:ident),*>)?
+       HasWork<$work_type:ty $(, $id:tt)?>
+       for $self:ident $(<$($selfarg:ident),*>)?
+       { self.$field:ident }
+    )*) => {$(
+        // SAFETY: The implementation of `raw_get_work` only compiles if the field has the right
+        // type.
+        unsafe impl$(<$($implarg),*>)? $crate::workqueue::HasWork<$work_type $(, $id)?> for $self $(<$($selfarg),*>)? {
+            const OFFSET: usize = ::core::mem::offset_of!(Self, $field) as usize;
+
+            #[inline]
+            unsafe fn raw_get_work(ptr: *mut Self) -> *mut $crate::workqueue::Work<$work_type $(, $id)?> {
+                // SAFETY: The caller promises that the pointer is not dangling.
+                unsafe {
+                    ::core::ptr::addr_of_mut!((*ptr).$field)
+                }
+            }
+        }
+    )*};
+}
+
+impl_has_work! {
+    impl<T> HasWork<Self> for ClosureWork<T> { self.work }
+}
+
+unsafe impl<T, const ID: u64> WorkItemPointer<ID> for Arc<T>
+where
+    T: WorkItem<ID, Pointer = Self>,
+    T: HasWork<T, ID>,
+{
+    unsafe extern "C" fn run(ptr: *mut bindings::work_struct) {
+        // The `__enqueue` method always uses a `work_struct` stored in a `Work<T, ID>`.
+        let ptr = ptr as *mut Work<T, ID>;
+        // SAFETY: This computes the pointer that `__enqueue` got from `Arc::into_raw`.
+        let ptr = unsafe { T::work_container_of(ptr) };
+        // SAFETY: This pointer comes from `Arc::into_raw` and we've been given back ownership.
+        let arc = unsafe { Arc::from_raw(ptr) };
+
+        T::run(arc)
+    }
+}
+
+unsafe impl<T, const ID: u64> RawWorkItem<ID> for Arc<T>
+where
+    T: WorkItem<ID, Pointer = Self>,
+    T: HasWork<T, ID>,
+{
+    type EnqueueOutput = Result<(), Self>;
+
+    unsafe fn __enqueue<F>(self, queue_work_on: F) -> Self::EnqueueOutput
+    where
+        F: FnOnce(*mut bindings::work_struct) -> bool,
+    {
+        // Casting between const and mut is not a problem as long as the pointer is a raw pointer.
+        let ptr = Arc::into_raw(self).cast_mut();
+
+        // SAFETY: Pointers into an `Arc` point at a valid value.
+        let work_ptr = unsafe { T::raw_get_work(ptr) };
+        // SAFETY: `raw_get_work` returns a pointer to a valid value.
+        let work_ptr = unsafe { Work::raw_get(work_ptr) };
+
+        if queue_work_on(work_ptr) {
+            Ok(())
+        } else {
+            // SAFETY: The work queue has not taken ownership of the pointer.
+            Err(unsafe { Arc::from_raw(ptr) })
+        }
+    }
+}
+
+unsafe impl<T, const ID: u64> WorkItemPointer<ID> for Pin<Box<T>>
+where
+    T: WorkItem<ID, Pointer = Self>,
+    T: HasWork<T, ID>,
+{
+    unsafe extern "C" fn run(ptr: *mut bindings::work_struct) {
+        // The `__enqueue` method always uses a `work_struct` stored in a `Work<T, ID>`.
+        let ptr = ptr as *mut Work<T, ID>;
+        // SAFETY: This computes the pointer that `__enqueue` got from `Box::into_raw`.
+        let ptr = unsafe { T::work_container_of(ptr) };
+        // SAFETY: This pointer comes from `Box::into_raw` and we've been given back ownership.
+        let boxed = unsafe { Box::from_raw(ptr) };
+        // SAFETY: The box was already pinned when it was enqueued.
+        let pinned = unsafe { Pin::new_unchecked(boxed) };
+
+        T::run(pinned)
+    }
+}
+
+unsafe impl<T, const ID: u64> RawWorkItem<ID> for Pin<Box<T>>
+where
+    T: WorkItem<ID, Pointer = Self>,
+    T: HasWork<T, ID>,
+{
+    type EnqueueOutput = ();
+
+    unsafe fn __enqueue<F>(self, queue_work_on: F) -> Self::EnqueueOutput
+    where
+        F: FnOnce(*mut bindings::work_struct) -> bool,
+    {
+        // SAFETY: We're not going to move `self` or any of its fields, so it's okay to temporarily
+        // remove the `Pin` wrapper.
+        let boxed = unsafe { Pin::into_inner_unchecked(self) };
+        let ptr = Box::into_raw(boxed);
+
+        // SAFETY: Pointers into a `Box` point at a valid value.
+        let work_ptr = unsafe { T::raw_get_work(ptr) };
+        // SAFETY: `raw_get_work` returns a pointer to a valid value.
+        let work_ptr = unsafe { Work::raw_get(work_ptr) };
+
+        if !queue_work_on(work_ptr) {
+            // SAFETY: This method requires exclusive ownership of the box, so it cannot be in a
+            // workqueue.
+            unsafe { ::core::hint::unreachable_unchecked() }
+        }
+    }
+}
+
+/// Returns the system work queue (`system_wq`).
+///
+/// It is the one used by `schedule[_delayed]_work[_on]()`. Multi-CPU multi-threaded. There are
+/// users which expect relatively short queue flush time.
+///
+/// Callers shouldn't queue work items which can run for too long.
+pub fn system() -> &'static Queue {
+    // SAFETY: `system_wq` is a C global, always available.
+    unsafe { Queue::from_raw(bindings::system_wq) }
+}
+
+/// Returns the system high-priority work queue (`system_highpri_wq`).
+///
+/// It is similar to the one returned by [`system`] but for work items which require higher
+/// scheduling priority.
+pub fn system_highpri() -> &'static Queue {
+    // SAFETY: `system_highpri_wq` is a C global, always available.
+    unsafe { Queue::from_raw(bindings::system_highpri_wq) }
+}
+
+/// Returns the system work queue for potentially long-running work items (`system_long_wq`).
+///
+/// It is similar to the one returned by [`system`] but may host long running work items. Queue
+/// flushing might take relatively long.
+pub fn system_long() -> &'static Queue {
+    // SAFETY: `system_long_wq` is a C global, always available.
+    unsafe { Queue::from_raw(bindings::system_long_wq) }
+}
+
+/// Returns the system unbound work queue (`system_unbound_wq`).
+///
+/// Workers are not bound to any specific CPU, not concurrency managed, and all queued work items
+/// are executed immediately as long as `max_active` limit is not reached and resources are
+/// available.
+pub fn system_unbound() -> &'static Queue {
+    // SAFETY: `system_unbound_wq` is a C global, always available.
+    unsafe { Queue::from_raw(bindings::system_unbound_wq) }
+}
+
+/// Returns the system freezable work queue (`system_freezable_wq`).
+///
+/// It is equivalent to the one returned by [`system`] except that it's freezable.
+///
+/// A freezable workqueue participates in the freeze phase of the system suspend operations. Work
+/// items on the workqueue are drained and no new work item starts execution until thawed.
+pub fn system_freezable() -> &'static Queue {
+    // SAFETY: `system_freezable_wq` is a C global, always available.
+    unsafe { Queue::from_raw(bindings::system_freezable_wq) }
+}
+
+/// Returns the system power-efficient work queue (`system_power_efficient_wq`).
+///
+/// It is inclined towards saving power and is converted to "unbound" variants if the
+/// `workqueue.power_efficient` kernel parameter is specified; otherwise, it is similar to the one
+/// returned by [`system`].
+pub fn system_power_efficient() -> &'static Queue {
+    // SAFETY: `system_power_efficient_wq` is a C global, always available.
+    unsafe { Queue::from_raw(bindings::system_power_efficient_wq) }
+}
+
+/// Returns the system freezable power-efficient work queue (`system_freezable_power_efficient_wq`).
+///
+/// It is similar to the one returned by [`system_power_efficient`] except that it is freezable.
+///
+/// A freezable workqueue participates in the freeze phase of the system suspend operations. Work
+/// items on the workqueue are drained and no new work item starts execution until thawed.
+pub fn system_freezable_power_efficient() -> &'static Queue {
+    // SAFETY: `system_freezable_power_efficient_wq` is a C global, always available.
+    unsafe { Queue::from_raw(bindings::system_freezable_power_efficient_wq) }
+}
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index 82e3fb19fdaf..da37bfa97211 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -262,7 +262,7 @@ $(obj)/%.lst: $(src)/%.c FORCE
 # Compile Rust sources (.rs)
 # ---------------------------------------------------------------------------
 
-rust_allowed_features := new_uninit
+rust_allowed_features := new_uninit,offset_of
 
 # `--out-dir` is required to avoid temporaries being created by `rustc` in the
 # current working directory, which may be not accessible in the out-of-tree
diff --git a/scripts/Makefile.vmlinux b/scripts/Makefile.vmlinux
index 3cd6ca15f390..c9f3e03124d7 100644
--- a/scripts/Makefile.vmlinux
+++ b/scripts/Makefile.vmlinux
@@ -19,6 +19,7 @@ quiet_cmd_cc_o_c = CC      $@
 
 ifdef CONFIG_MODULES
 KASAN_SANITIZE_.vmlinux.export.o := n
+KCSAN_SANITIZE_.vmlinux.export.o := n
 GCOV_PROFILE_.vmlinux.export.o := n
 targets += .vmlinux.export.o
 vmlinux: .vmlinux.export.o
diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o
index 0edfdb40364b..25b3b587d37c 100644
--- a/scripts/Makefile.vmlinux_o
+++ b/scripts/Makefile.vmlinux_o
@@ -37,7 +37,8 @@ objtool-enabled := $(or $(delay-objtool),$(CONFIG_NOINSTR_VALIDATION))
 
 vmlinux-objtool-args-$(delay-objtool)			+= $(objtool-args-y)
 vmlinux-objtool-args-$(CONFIG_GCOV_KERNEL)		+= --no-unreachable
-vmlinux-objtool-args-$(CONFIG_NOINSTR_VALIDATION)	+= --noinstr $(if $(CONFIG_CPU_UNRET_ENTRY), --unret)
+vmlinux-objtool-args-$(CONFIG_NOINSTR_VALIDATION)	+= --noinstr \
+							   $(if $(or $(CONFIG_CPU_UNRET_ENTRY),$(CONFIG_CPU_SRSO)), --unret)
 
 objtool-args = $(vmlinux-objtool-args-y) --link
 
diff --git a/scripts/atomic/gen-atomic-fallback.sh b/scripts/atomic/gen-atomic-fallback.sh
index a45154cefa48..f80d69cfeb1f 100755
--- a/scripts/atomic/gen-atomic-fallback.sh
+++ b/scripts/atomic/gen-atomic-fallback.sh
@@ -223,14 +223,15 @@ gen_xchg_fallbacks()
 
 gen_try_cmpxchg_fallback()
 {
+	local prefix="$1"; shift
 	local cmpxchg="$1"; shift;
-	local order="$1"; shift;
+	local suffix="$1"; shift;
 
 cat <<EOF
-#define raw_try_${cmpxchg}${order}(_ptr, _oldp, _new) \\
+#define raw_${prefix}try_${cmpxchg}${suffix}(_ptr, _oldp, _new) \\
 ({ \\
 	typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \\
-	___r = raw_${cmpxchg}${order}((_ptr), ___o, (_new)); \\
+	___r = raw_${prefix}${cmpxchg}${suffix}((_ptr), ___o, (_new)); \\
 	if (unlikely(___r != ___o)) \\
 		*___op = ___r; \\
 	likely(___r == ___o); \\
@@ -259,11 +260,11 @@ gen_try_cmpxchg_order_fallback()
 	fi
 
 	printf "#else\n"
-	gen_try_cmpxchg_fallback "${cmpxchg}" "${order}"
+	gen_try_cmpxchg_fallback "" "${cmpxchg}" "${order}"
 	printf "#endif\n\n"
 }
 
-gen_try_cmpxchg_fallbacks()
+gen_try_cmpxchg_order_fallbacks()
 {
 	local cmpxchg="$1"; shift;
 
@@ -272,15 +273,17 @@ gen_try_cmpxchg_fallbacks()
 	done
 }
 
-gen_cmpxchg_local_fallbacks()
+gen_def_and_try_cmpxchg_fallback()
 {
+	local prefix="$1"; shift
 	local cmpxchg="$1"; shift
+	local suffix="$1"; shift
 
-	printf "#define raw_${cmpxchg} arch_${cmpxchg}\n\n"
-	printf "#ifdef arch_try_${cmpxchg}\n"
-	printf "#define raw_try_${cmpxchg} arch_try_${cmpxchg}\n"
+	printf "#define raw_${prefix}${cmpxchg}${suffix} arch_${prefix}${cmpxchg}${suffix}\n\n"
+	printf "#ifdef arch_${prefix}try_${cmpxchg}${suffix}\n"
+	printf "#define raw_${prefix}try_${cmpxchg}${suffix} arch_${prefix}try_${cmpxchg}${suffix}\n"
 	printf "#else\n"
-	gen_try_cmpxchg_fallback "${cmpxchg}" ""
+	gen_try_cmpxchg_fallback "${prefix}" "${cmpxchg}" "${suffix}"
 	printf "#endif\n\n"
 }
 
@@ -302,15 +305,15 @@ for xchg in "xchg" "cmpxchg" "cmpxchg64" "cmpxchg128"; do
 done
 
 for cmpxchg in "cmpxchg" "cmpxchg64" "cmpxchg128"; do
-	gen_try_cmpxchg_fallbacks "${cmpxchg}"
+	gen_try_cmpxchg_order_fallbacks "${cmpxchg}"
 done
 
-for cmpxchg in "cmpxchg_local" "cmpxchg64_local" "cmpxchg128_local"; do
-	gen_cmpxchg_local_fallbacks "${cmpxchg}" ""
+for cmpxchg in "cmpxchg" "cmpxchg64" "cmpxchg128"; do
+	gen_def_and_try_cmpxchg_fallback "" "${cmpxchg}" "_local"
 done
 
-for cmpxchg in "sync_cmpxchg"; do
-	printf "#define raw_${cmpxchg} arch_${cmpxchg}\n\n"
+for cmpxchg in "cmpxchg"; do
+	gen_def_and_try_cmpxchg_fallback "sync_" "${cmpxchg}" ""
 done
 
 grep '^[a-z]' "$1" | while read name meta args; do
diff --git a/scripts/atomic/gen-atomic-instrumented.sh b/scripts/atomic/gen-atomic-instrumented.sh
index 8f8f8e3b20f9..592f3ec89b5f 100755
--- a/scripts/atomic/gen-atomic-instrumented.sh
+++ b/scripts/atomic/gen-atomic-instrumented.sh
@@ -169,7 +169,8 @@ for xchg in "xchg" "cmpxchg" "cmpxchg64" "cmpxchg128" "try_cmpxchg" "try_cmpxchg
 	done
 done
 
-for xchg in "cmpxchg_local" "cmpxchg64_local" "cmpxchg128_local" "sync_cmpxchg" "try_cmpxchg_local" "try_cmpxchg64_local" "try_cmpxchg128_local"; do
+for xchg in "cmpxchg_local" "cmpxchg64_local" "cmpxchg128_local" "sync_cmpxchg" \
+	    "try_cmpxchg_local" "try_cmpxchg64_local" "try_cmpxchg128_local" "sync_try_cmpxchg"; do
 	gen_xchg "${xchg}" ""
 	printf "\n"
 done
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 7d16f863edf1..25fdb7fda112 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -6427,15 +6427,6 @@ sub process {
 			}
 		}
 
-# check for soon-to-be-deprecated single-argument k[v]free_rcu() API
-		if ($line =~ /\bk[v]?free_rcu\s*\([^(]+\)/) {
-			if ($line =~ /\bk[v]?free_rcu\s*\([^,]+\)/) {
-				ERROR("DEPRECATED_API",
-				      "Single-argument k[v]free_rcu() API is deprecated, please pass rcu_head object or call k[v]free_rcu_mightsleep()." . $herecurr);
-			}
-		}
-
-
 # check for unnecessary "Out of Memory" messages
 		if ($line =~ /^\+.*\b$logFunctions\s*\(/ &&
 		    $prevline =~ /^[ \+]\s*if\s*\(\s*(\!\s*|NULL\s*==\s*)?($Lval)(\s*==\s*NULL\s*)?\s*\)/ &&
diff --git a/scripts/const_structs.checkpatch b/scripts/const_structs.checkpatch
index dc39d938ea77..188412aa2757 100644
--- a/scripts/const_structs.checkpatch
+++ b/scripts/const_structs.checkpatch
@@ -94,3 +94,4 @@ vm_operations_struct
 wacom_features
 watchdog_ops
 wd_ops
+xattr_handler
diff --git a/scripts/faddr2line b/scripts/faddr2line
index 0e73aca4f908..587415a52b6f 100755
--- a/scripts/faddr2line
+++ b/scripts/faddr2line
@@ -58,8 +58,21 @@ die() {
 	exit 1
 }
 
-READELF="${CROSS_COMPILE:-}readelf"
-ADDR2LINE="${CROSS_COMPILE:-}addr2line"
+UTIL_SUFFIX=""
+if [[ "${LLVM:-}" == "" ]]; then
+	UTIL_PREFIX=${CROSS_COMPILE:-}
+else
+	UTIL_PREFIX=llvm-
+
+	if [[ "${LLVM}" == *"/" ]]; then
+		UTIL_PREFIX=${LLVM}${UTIL_PREFIX}
+	elif [[ "${LLVM}" == "-"* ]]; then
+		UTIL_SUFFIX=${LLVM}
+	fi
+fi
+
+READELF="${UTIL_PREFIX}readelf${UTIL_SUFFIX}"
+ADDR2LINE="${UTIL_PREFIX}addr2line${UTIL_SUFFIX}"
 AWK="awk"
 GREP="grep"
 
@@ -166,6 +179,11 @@ __faddr2line() {
 			local cur_sym_elf_size=${fields[2]}
 			local cur_sym_name=${fields[7]:-}
 
+			# Skip mapping symbols: is_mapping_symbol(cur_sym_name)
+			if [[ ${cur_sym_name} =~ ^(\.L|L0|\$) ]]; then
+				continue
+			fi
+
 			if [[ $cur_sym_addr = $sym_addr ]] &&
 			   [[ $cur_sym_elf_size = $sym_elf_size ]] &&
 			   [[ $cur_sym_name = $sym_name ]]; then
@@ -260,7 +278,7 @@ __faddr2line() {
 
 		DONE=1
 
-	done < <(${READELF} --symbols --wide $objfile | sed 's/\[.*\]//' | ${AWK} -v fn=$sym_name '$4 == "FUNC" && $8 == fn')
+	done < <(${READELF} --symbols --wide $objfile | sed 's/\[.*\]//' | ${AWK} -v fn=$sym_name '$8 == fn')
 }
 
 [[ $# -lt 2 ]] && usage
diff --git a/scripts/gcc-plugins/randomize_layout_plugin.c b/scripts/gcc-plugins/randomize_layout_plugin.c
index 951b74ba1b24..366395cab490 100644
--- a/scripts/gcc-plugins/randomize_layout_plugin.c
+++ b/scripts/gcc-plugins/randomize_layout_plugin.c
@@ -191,12 +191,14 @@ static void partition_struct(tree *fields, unsigned long length, struct partitio
 
 static void performance_shuffle(tree *newtree, unsigned long length, ranctx *prng_state)
 {
-	unsigned long i, x;
+	unsigned long i, x, index;
 	struct partition_group size_group[length];
 	unsigned long num_groups = 0;
 	unsigned long randnum;
 
 	partition_struct(newtree, length, (struct partition_group *)&size_group, &num_groups);
+
+	/* FIXME: this group shuffle is currently a no-op. */
 	for (i = num_groups - 1; i > 0; i--) {
 		struct partition_group tmp;
 		randnum = ranval(prng_state) % (i + 1);
@@ -206,11 +208,14 @@ static void performance_shuffle(tree *newtree, unsigned long length, ranctx *prn
 	}
 
 	for (x = 0; x < num_groups; x++) {
-		for (i = size_group[x].start + size_group[x].length - 1; i > size_group[x].start; i--) {
+		for (index = size_group[x].length - 1; index > 0; index--) {
 			tree tmp;
+
+			i = size_group[x].start + index;
 			if (DECL_BIT_FIELD_TYPE(newtree[i]))
 				continue;
-			randnum = ranval(prng_state) % (i + 1);
+			randnum = ranval(prng_state) % (index + 1);
+			randnum += size_group[x].start;
 			// we could handle this case differently if desired
 			if (DECL_BIT_FIELD_TYPE(newtree[randnum]))
 				continue;
diff --git a/scripts/min-tool-version.sh b/scripts/min-tool-version.sh
index d65ab8bfeaf4..fd5ffdb81bab 100755
--- a/scripts/min-tool-version.sh
+++ b/scripts/min-tool-version.sh
@@ -31,7 +31,7 @@ llvm)
 	fi
 	;;
 rustc)
-	echo 1.71.1
+	echo 1.73.0
 	;;
 bindgen)
 	echo 0.65.1
diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c
index bd6a910f6528..53a0070ff5df 100644
--- a/security/apparmor/apparmorfs.c
+++ b/security/apparmor/apparmorfs.c
@@ -226,7 +226,7 @@ static int __aafs_setup_d_inode(struct inode *dir, struct dentry *dentry,
 
 	inode->i_ino = get_next_ino();
 	inode->i_mode = mode;
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	inode->i_private = data;
 	if (S_ISDIR(mode)) {
 		inode->i_op = iops ? iops : &simple_dir_inode_operations;
@@ -1557,7 +1557,8 @@ void __aafs_profile_migrate_dents(struct aa_profile *old,
 		if (new->dents[i]) {
 			struct inode *inode = d_inode(new->dents[i]);
 
-			inode->i_mtime = inode_set_ctime_current(inode);
+			inode_set_mtime_to_ts(inode,
+					      inode_set_ctime_current(inode));
 		}
 		old->dents[i] = NULL;
 	}
@@ -2543,7 +2544,7 @@ static int aa_mk_null_file(struct dentry *parent)
 
 	inode->i_ino = get_next_ino();
 	inode->i_mode = S_IFCHR | S_IRUGO | S_IWUGO;
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	init_special_inode(inode, S_IFCHR | S_IRUGO | S_IWUGO,
 			   MKDEV(MEM_MAJOR, 3));
 	d_instantiate(dentry, inode);
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index 108eccc5ada5..3fa325d5efac 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -734,7 +734,7 @@ fail:
  * apparmor_bprm_committing_creds - do task cleanup on committing new creds
  * @bprm: binprm for the exec  (NOT NULL)
  */
-static void apparmor_bprm_committing_creds(struct linux_binprm *bprm)
+static void apparmor_bprm_committing_creds(const struct linux_binprm *bprm)
 {
 	struct aa_label *label = aa_current_raw_label();
 	struct aa_label *new_label = cred_label(bprm->cred);
@@ -756,7 +756,7 @@ static void apparmor_bprm_committing_creds(struct linux_binprm *bprm)
  * apparmor_bprm_committed_creds() - do cleanup after new creds committed
  * @bprm: binprm for the exec  (NOT NULL)
  */
-static void apparmor_bprm_committed_creds(struct linux_binprm *bprm)
+static void apparmor_bprm_committed_creds(const struct linux_binprm *bprm)
 {
 	/* clear out temporary/transitional state from the context */
 	aa_clear_task_ctx_trans(task_ctx(current));
diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c
index 8b8846073e14..913ec8d0eb63 100644
--- a/security/apparmor/policy_unpack.c
+++ b/security/apparmor/policy_unpack.c
@@ -89,10 +89,10 @@ void __aa_loaddata_update(struct aa_loaddata *data, long revision)
 		struct inode *inode;
 
 		inode = d_inode(data->dents[AAFS_LOADDATA_DIR]);
-		inode->i_mtime = inode_set_ctime_current(inode);
+		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 
 		inode = d_inode(data->dents[AAFS_LOADDATA_REVISION]);
-		inode->i_mtime = inode_set_ctime_current(inode);
+		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
 	}
 }
 
diff --git a/security/commoncap.c b/security/commoncap.c
index bc0521104197..8e8c630ce204 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -720,7 +720,7 @@ int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
  * its xattrs and, if present, apply them to the proposed credentials being
  * constructed by execve().
  */
-static int get_file_caps(struct linux_binprm *bprm, struct file *file,
+static int get_file_caps(struct linux_binprm *bprm, const struct file *file,
 			 bool *effective, bool *has_fcap)
 {
 	int rc = 0;
@@ -882,7 +882,7 @@ static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old,
  *
  * Return: 0 if successful, -ve on error.
  */
-int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file)
+int cap_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file)
 {
 	/* Process setpcap binaries and capabilities for uid 0 */
 	const struct cred *old = current_cred();
diff --git a/security/inode.c b/security/inode.c
index 3aa75fffa8c9..9e7cde913667 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -145,7 +145,7 @@ static struct dentry *securityfs_create_dentry(const char *name, umode_t mode,
 
 	inode->i_ino = get_next_ino();
 	inode->i_mode = mode;
-	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+	simple_inode_init_ts(inode);
 	inode->i_private = data;
 	if (S_ISDIR(mode)) {
 		inode->i_op = &simple_dir_inode_operations;
diff --git a/security/integrity/Kconfig b/security/integrity/Kconfig
index 232191ee09e3..b6e074ac0227 100644
--- a/security/integrity/Kconfig
+++ b/security/integrity/Kconfig
@@ -68,8 +68,6 @@ config INTEGRITY_MACHINE_KEYRING
 	depends on INTEGRITY_ASYMMETRIC_KEYS
 	depends on SYSTEM_BLACKLIST_KEYRING
 	depends on LOAD_UEFI_KEYS || LOAD_PPC_KEYS
-	select INTEGRITY_CA_MACHINE_KEYRING if LOAD_PPC_KEYS
-	select INTEGRITY_CA_MACHINE_KEYRING_MAX if LOAD_PPC_KEYS
 	help
 	 If set, provide a keyring to which Machine Owner Keys (MOK) may
 	 be added. This keyring shall contain just MOK keys.  Unlike keys
diff --git a/security/integrity/ima/ima_modsig.c b/security/integrity/ima/ima_modsig.c
index 3e7bee30080f..3265d744d5ce 100644
--- a/security/integrity/ima/ima_modsig.c
+++ b/security/integrity/ima/ima_modsig.c
@@ -29,7 +29,7 @@ struct modsig {
 	 * storing the signature.
 	 */
 	int raw_pkcs7_len;
-	u8 raw_pkcs7[];
+	u8 raw_pkcs7[] __counted_by(raw_pkcs7_len);
 };
 
 /*
@@ -65,10 +65,11 @@ int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len,
 	buf_len -= sig_len + sizeof(*sig);
 
 	/* Allocate sig_len additional bytes to hold the raw PKCS#7 data. */
-	hdr = kzalloc(sizeof(*hdr) + sig_len, GFP_KERNEL);
+	hdr = kzalloc(struct_size(hdr, raw_pkcs7, sig_len), GFP_KERNEL);
 	if (!hdr)
 		return -ENOMEM;
 
+	hdr->raw_pkcs7_len = sig_len;
 	hdr->pkcs7_msg = pkcs7_parse_message(buf + buf_len, sig_len);
 	if (IS_ERR(hdr->pkcs7_msg)) {
 		rc = PTR_ERR(hdr->pkcs7_msg);
@@ -77,7 +78,6 @@ int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len,
 	}
 
 	memcpy(hdr->raw_pkcs7, buf + buf_len, sig_len);
-	hdr->raw_pkcs7_len = sig_len;
 
 	/* We don't know the hash algorithm yet. */
 	hdr->hash_algo = HASH_ALGO__LAST;
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 3c1e7122076b..471cf36dedc0 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -109,13 +109,6 @@ extern void __key_link_end(struct key *keyring,
 extern key_ref_t find_key_to_update(key_ref_t keyring_ref,
 				    const struct keyring_index_key *index_key);
 
-extern struct key *keyring_search_instkey(struct key *keyring,
-					  key_serial_t target_id);
-
-extern int iterate_over_keyring(const struct key *keyring,
-				int (*func)(const struct key *key, void *data),
-				void *data);
-
 struct keyring_search_context {
 	struct keyring_index_key index_key;
 	const struct cred	*cred;
diff --git a/security/keys/trusted-keys/trusted_core.c b/security/keys/trusted-keys/trusted_core.c
index 85fb5c22529a..fee1ab2c734d 100644
--- a/security/keys/trusted-keys/trusted_core.c
+++ b/security/keys/trusted-keys/trusted_core.c
@@ -358,17 +358,17 @@ static int __init init_trusted(void)
 		if (!get_random)
 			get_random = kernel_get_random;
 
-		static_call_update(trusted_key_seal,
-				   trusted_key_sources[i].ops->seal);
-		static_call_update(trusted_key_unseal,
-				   trusted_key_sources[i].ops->unseal);
-		static_call_update(trusted_key_get_random,
-				   get_random);
-		trusted_key_exit = trusted_key_sources[i].ops->exit;
-		migratable = trusted_key_sources[i].ops->migratable;
-
 		ret = trusted_key_sources[i].ops->init();
-		if (!ret)
+		if (!ret) {
+			static_call_update(trusted_key_seal, trusted_key_sources[i].ops->seal);
+			static_call_update(trusted_key_unseal, trusted_key_sources[i].ops->unseal);
+			static_call_update(trusted_key_get_random, get_random);
+
+			trusted_key_exit = trusted_key_sources[i].ops->exit;
+			migratable = trusted_key_sources[i].ops->migratable;
+		}
+
+		if (!ret || ret != -ENODEV)
 			break;
 	}
 
diff --git a/security/keys/trusted-keys/trusted_tee.c b/security/keys/trusted-keys/trusted_tee.c
index ac3e270ade69..aa3d477de6db 100644
--- a/security/keys/trusted-keys/trusted_tee.c
+++ b/security/keys/trusted-keys/trusted_tee.c
@@ -65,24 +65,16 @@ static int trusted_tee_seal(struct trusted_key_payload *p, char *datablob)
 	int ret;
 	struct tee_ioctl_invoke_arg inv_arg;
 	struct tee_param param[4];
-	struct tee_shm *reg_shm_in = NULL, *reg_shm_out = NULL;
+	struct tee_shm *reg_shm = NULL;
 
 	memset(&inv_arg, 0, sizeof(inv_arg));
 	memset(&param, 0, sizeof(param));
 
-	reg_shm_in = tee_shm_register_kernel_buf(pvt_data.ctx, p->key,
-						 p->key_len);
-	if (IS_ERR(reg_shm_in)) {
-		dev_err(pvt_data.dev, "key shm register failed\n");
-		return PTR_ERR(reg_shm_in);
-	}
-
-	reg_shm_out = tee_shm_register_kernel_buf(pvt_data.ctx, p->blob,
-						  sizeof(p->blob));
-	if (IS_ERR(reg_shm_out)) {
-		dev_err(pvt_data.dev, "blob shm register failed\n");
-		ret = PTR_ERR(reg_shm_out);
-		goto out;
+	reg_shm = tee_shm_register_kernel_buf(pvt_data.ctx, p->key,
+					      sizeof(p->key) + sizeof(p->blob));
+	if (IS_ERR(reg_shm)) {
+		dev_err(pvt_data.dev, "shm register failed\n");
+		return PTR_ERR(reg_shm);
 	}
 
 	inv_arg.func = TA_CMD_SEAL;
@@ -90,13 +82,13 @@ static int trusted_tee_seal(struct trusted_key_payload *p, char *datablob)
 	inv_arg.num_params = 4;
 
 	param[0].attr = TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT;
-	param[0].u.memref.shm = reg_shm_in;
+	param[0].u.memref.shm = reg_shm;
 	param[0].u.memref.size = p->key_len;
 	param[0].u.memref.shm_offs = 0;
 	param[1].attr = TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT;
-	param[1].u.memref.shm = reg_shm_out;
+	param[1].u.memref.shm = reg_shm;
 	param[1].u.memref.size = sizeof(p->blob);
-	param[1].u.memref.shm_offs = 0;
+	param[1].u.memref.shm_offs = sizeof(p->key);
 
 	ret = tee_client_invoke_func(pvt_data.ctx, &inv_arg, param);
 	if ((ret < 0) || (inv_arg.ret != 0)) {
@@ -107,11 +99,7 @@ static int trusted_tee_seal(struct trusted_key_payload *p, char *datablob)
 		p->blob_len = param[1].u.memref.size;
 	}
 
-out:
-	if (reg_shm_out)
-		tee_shm_free(reg_shm_out);
-	if (reg_shm_in)
-		tee_shm_free(reg_shm_in);
+	tee_shm_free(reg_shm);
 
 	return ret;
 }
@@ -124,24 +112,16 @@ static int trusted_tee_unseal(struct trusted_key_payload *p, char *datablob)
 	int ret;
 	struct tee_ioctl_invoke_arg inv_arg;
 	struct tee_param param[4];
-	struct tee_shm *reg_shm_in = NULL, *reg_shm_out = NULL;
+	struct tee_shm *reg_shm = NULL;
 
 	memset(&inv_arg, 0, sizeof(inv_arg));
 	memset(&param, 0, sizeof(param));
 
-	reg_shm_in = tee_shm_register_kernel_buf(pvt_data.ctx, p->blob,
-						 p->blob_len);
-	if (IS_ERR(reg_shm_in)) {
-		dev_err(pvt_data.dev, "blob shm register failed\n");
-		return PTR_ERR(reg_shm_in);
-	}
-
-	reg_shm_out = tee_shm_register_kernel_buf(pvt_data.ctx, p->key,
-						  sizeof(p->key));
-	if (IS_ERR(reg_shm_out)) {
-		dev_err(pvt_data.dev, "key shm register failed\n");
-		ret = PTR_ERR(reg_shm_out);
-		goto out;
+	reg_shm = tee_shm_register_kernel_buf(pvt_data.ctx, p->key,
+					      sizeof(p->key) + sizeof(p->blob));
+	if (IS_ERR(reg_shm)) {
+		dev_err(pvt_data.dev, "shm register failed\n");
+		return PTR_ERR(reg_shm);
 	}
 
 	inv_arg.func = TA_CMD_UNSEAL;
@@ -149,11 +129,11 @@ static int trusted_tee_unseal(struct trusted_key_payload *p, char *datablob)
 	inv_arg.num_params = 4;
 
 	param[0].attr = TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT;
-	param[0].u.memref.shm = reg_shm_in;
+	param[0].u.memref.shm = reg_shm;
 	param[0].u.memref.size = p->blob_len;
-	param[0].u.memref.shm_offs = 0;
+	param[0].u.memref.shm_offs = sizeof(p->key);
 	param[1].attr = TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT;
-	param[1].u.memref.shm = reg_shm_out;
+	param[1].u.memref.shm = reg_shm;
 	param[1].u.memref.size = sizeof(p->key);
 	param[1].u.memref.shm_offs = 0;
 
@@ -166,11 +146,7 @@ static int trusted_tee_unseal(struct trusted_key_payload *p, char *datablob)
 		p->key_len = param[1].u.memref.size;
 	}
 
-out:
-	if (reg_shm_out)
-		tee_shm_free(reg_shm_out);
-	if (reg_shm_in)
-		tee_shm_free(reg_shm_in);
+	tee_shm_free(reg_shm);
 
 	return ret;
 }
diff --git a/security/security.c b/security/security.c
index 23b129d482a7..dcb3e7014f9b 100644
--- a/security/security.c
+++ b/security/security.c
@@ -957,7 +957,7 @@ int security_capable(const struct cred *cred,
  *
  * Return: Returns 0 if permission is granted.
  */
-int security_quotactl(int cmds, int type, int id, struct super_block *sb)
+int security_quotactl(int cmds, int type, int id, const struct super_block *sb)
 {
 	return call_int_hook(quotactl, 0, cmds, type, id, sb);
 }
@@ -1079,7 +1079,7 @@ int security_bprm_creds_for_exec(struct linux_binprm *bprm)
  *
  * Return: Returns 0 if the hook is successful and permission is granted.
  */
-int security_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file)
+int security_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file)
 {
 	return call_int_hook(bprm_creds_from_file, 0, bprm, file);
 }
@@ -1118,7 +1118,7 @@ int security_bprm_check(struct linux_binprm *bprm)
  * open file descriptors to which access will no longer be granted when the
  * attributes are changed.  This is called immediately before commit_creds().
  */
-void security_bprm_committing_creds(struct linux_binprm *bprm)
+void security_bprm_committing_creds(const struct linux_binprm *bprm)
 {
 	call_void_hook(bprm_committing_creds, bprm);
 }
@@ -1134,7 +1134,7 @@ void security_bprm_committing_creds(struct linux_binprm *bprm)
  * process such as clearing out non-inheritable signal state.  This is called
  * immediately after commit_creds().
  */
-void security_bprm_committed_creds(struct linux_binprm *bprm)
+void security_bprm_committed_creds(const struct linux_binprm *bprm)
 {
 	call_void_hook(bprm_committed_creds, bprm);
 }
@@ -1319,7 +1319,7 @@ EXPORT_SYMBOL(security_sb_remount);
  *
  * Return: Returns 0 if permission is granted.
  */
-int security_sb_kern_mount(struct super_block *sb)
+int security_sb_kern_mount(const struct super_block *sb)
 {
 	return call_int_hook(sb_kern_mount, 0, sb);
 }
@@ -3957,7 +3957,7 @@ void security_inode_invalidate_secctx(struct inode *inode)
 EXPORT_SYMBOL(security_inode_invalidate_secctx);
 
 /**
- * security_inode_notifysecctx() - Nofify the LSM of an inode's security label
+ * security_inode_notifysecctx() - Notify the LSM of an inode's security label
  * @inode: inode
  * @ctx: secctx
  * @ctxlen: length of secctx
diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig
index d30348fbe0df..61abc1e094a8 100644
--- a/security/selinux/Kconfig
+++ b/security/selinux/Kconfig
@@ -77,3 +77,13 @@ config SECURITY_SELINUX_DEBUG
 	  This enables debugging code designed to help SELinux kernel
 	  developers, unless you know what this does in the kernel code you
 	  should leave this disabled.
+
+	  To fine-tune which messages are printed, enable
+	  CONFIG_DYNAMIC_DEBUG and see
+	  Documentation/admin-guide/dynamic-debug-howto.rst for additional
+	  information.
+
+	  Example usage:
+
+		echo -n 'file "security/selinux/*" +p' > \
+			/proc/dynamic_debug/control
diff --git a/security/selinux/Makefile b/security/selinux/Makefile
index 836379639058..c47519ed8156 100644
--- a/security/selinux/Makefile
+++ b/security/selinux/Makefile
@@ -12,6 +12,8 @@ obj-$(CONFIG_SECURITY_SELINUX) := selinux.o
 
 ccflags-y := -I$(srctree)/security/selinux -I$(srctree)/security/selinux/include
 
+ccflags-$(CONFIG_SECURITY_SELINUX_DEBUG) += -DDEBUG
+
 selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o \
 	     netnode.o netport.o status.o \
 	     ss/ebitmap.o ss/hashtab.o ss/symtab.o ss/sidtab.o ss/avtab.o \
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 2aa0e219d721..feda711c6b7b 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1937,7 +1937,7 @@ static inline int may_rename(struct inode *old_dir,
 
 /* Check whether a task can perform a filesystem operation. */
 static int superblock_has_perm(const struct cred *cred,
-			       struct super_block *sb,
+			       const struct super_block *sb,
 			       u32 perms,
 			       struct common_audit_data *ad)
 {
@@ -2139,7 +2139,7 @@ static int selinux_capable(const struct cred *cred, struct user_namespace *ns,
 	return cred_has_capability(cred, cap, opts, ns == &init_user_ns);
 }
 
-static int selinux_quotactl(int cmds, int type, int id, struct super_block *sb)
+static int selinux_quotactl(int cmds, int type, int id, const struct super_block *sb)
 {
 	const struct cred *cred = current_cred();
 	int rc = 0;
@@ -2455,7 +2455,7 @@ static inline void flush_unauthorized_files(const struct cred *cred,
 /*
  * Prepare a process for imminent new credential changes due to exec
  */
-static void selinux_bprm_committing_creds(struct linux_binprm *bprm)
+static void selinux_bprm_committing_creds(const struct linux_binprm *bprm)
 {
 	struct task_security_struct *new_tsec;
 	struct rlimit *rlim, *initrlim;
@@ -2501,7 +2501,7 @@ static void selinux_bprm_committing_creds(struct linux_binprm *bprm)
  * Clean up the process immediately after the installation of new credentials
  * due to exec
  */
-static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
+static void selinux_bprm_committed_creds(const struct linux_binprm *bprm)
 {
 	const struct task_security_struct *tsec = selinux_cred(current_cred());
 	u32 osid, sid;
@@ -2721,7 +2721,7 @@ out_bad_option:
 	return -EINVAL;
 }
 
-static int selinux_sb_kern_mount(struct super_block *sb)
+static int selinux_sb_kern_mount(const struct super_block *sb)
 {
 	const struct cred *cred = current_cred();
 	struct common_audit_data ad;
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 6fa640263216..6c596ae7fef9 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -1198,7 +1198,7 @@ static struct inode *sel_make_inode(struct super_block *sb, umode_t mode)
 
 	if (ret) {
 		ret->i_mode = mode;
-		ret->i_atime = ret->i_mtime = inode_set_ctime_current(ret);
+		simple_inode_init_ts(ret);
 	}
 	return ret;
 }
diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c
index 86d98a8e291b..8751a602ead2 100644
--- a/security/selinux/ss/avtab.c
+++ b/security/selinux/ss/avtab.c
@@ -17,6 +17,7 @@
  *	Tuned number of hash slots for avtab to reduce memory usage
  */
 
+#include <linux/bitops.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/errno.h>
@@ -66,8 +67,7 @@ static inline u32 avtab_hash(const struct avtab_key *keyp, u32 mask)
 }
 
 static struct avtab_node*
-avtab_insert_node(struct avtab *h, u32 hvalue,
-		  struct avtab_node *prev,
+avtab_insert_node(struct avtab *h, struct avtab_node **dst,
 		  const struct avtab_key *key, const struct avtab_datum *datum)
 {
 	struct avtab_node *newnode;
@@ -89,15 +89,8 @@ avtab_insert_node(struct avtab *h, u32 hvalue,
 		newnode->datum.u.data = datum->u.data;
 	}
 
-	if (prev) {
-		newnode->next = prev->next;
-		prev->next = newnode;
-	} else {
-		struct avtab_node **n = &h->htable[hvalue];
-
-		newnode->next = *n;
-		*n = newnode;
-	}
+	newnode->next = *dst;
+	*dst = newnode;
 
 	h->nel++;
 	return newnode;
@@ -137,7 +130,8 @@ static int avtab_insert(struct avtab *h, const struct avtab_key *key,
 			break;
 	}
 
-	newnode = avtab_insert_node(h, hvalue, prev, key, datum);
+	newnode = avtab_insert_node(h, prev ? &prev->next : &h->htable[hvalue],
+				    key, datum);
 	if (!newnode)
 		return -ENOMEM;
 
@@ -177,7 +171,8 @@ struct avtab_node *avtab_insert_nonunique(struct avtab *h,
 		    key->target_class < cur->key.target_class)
 			break;
 	}
-	return avtab_insert_node(h, hvalue, prev, key, datum);
+	return avtab_insert_node(h, prev ? &prev->next : &h->htable[hvalue],
+				 key, datum);
 }
 
 /* This search function returns a node pointer, and can be used in
@@ -298,13 +293,7 @@ int avtab_alloc(struct avtab *h, u32 nrules)
 	u32 nslot = 0;
 
 	if (nrules != 0) {
-		u32 shift = 1;
-		u32 work = nrules >> 3;
-		while (work) {
-			work >>= 1;
-			shift++;
-		}
-		nslot = 1 << shift;
+		nslot = nrules > 3 ? rounddown_pow_of_two(nrules / 2) : 2;
 		if (nslot > MAX_AVTAB_HASH_BUCKETS)
 			nslot = MAX_AVTAB_HASH_BUCKETS;
 
@@ -349,7 +338,7 @@ void avtab_hash_eval(struct avtab *h, const char *tag)
 	}
 
 	pr_debug("SELinux: %s:  %d entries and %d/%d buckets used, "
-	       "longest chain length %d sum of chain length^2 %llu\n",
+	       "longest chain length %d, sum of chain length^2 %llu\n",
 	       tag, h->nel, slots_used, h->nslot, max_chain_len,
 	       chain2_len_sum);
 }
@@ -477,11 +466,7 @@ int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol,
 		return -EINVAL;
 	}
 
-	set = 0;
-	for (i = 0; i < ARRAY_SIZE(spec_order); i++) {
-		if (key.specified & spec_order[i])
-			set++;
-	}
+	set = hweight16(key.specified & (AVTAB_XPERMS | AVTAB_TYPE | AVTAB_AV));
 	if (!set || set > 1) {
 		pr_err("SELinux:  avtab:  more than one specifier\n");
 		return -EINVAL;
diff --git a/security/selinux/ss/hashtab.c b/security/selinux/ss/hashtab.c
index ac5cdddfbf78..c05d8346a94a 100644
--- a/security/selinux/ss/hashtab.c
+++ b/security/selinux/ss/hashtab.c
@@ -107,10 +107,12 @@ int hashtab_map(struct hashtab *h,
 void hashtab_stat(struct hashtab *h, struct hashtab_info *info)
 {
 	u32 i, chain_len, slots_used, max_chain_len;
+	u64 chain2_len_sum;
 	struct hashtab_node *cur;
 
 	slots_used = 0;
 	max_chain_len = 0;
+	chain2_len_sum = 0;
 	for (i = 0; i < h->size; i++) {
 		cur = h->htable[i];
 		if (cur) {
@@ -123,11 +125,14 @@ void hashtab_stat(struct hashtab *h, struct hashtab_info *info)
 
 			if (chain_len > max_chain_len)
 				max_chain_len = chain_len;
+
+			chain2_len_sum += (u64)chain_len * chain_len;
 		}
 	}
 
 	info->slots_used = slots_used;
 	info->max_chain_len = max_chain_len;
+	info->chain2_len_sum = chain2_len_sum;
 }
 #endif /* CONFIG_SECURITY_SELINUX_DEBUG */
 
diff --git a/security/selinux/ss/hashtab.h b/security/selinux/ss/hashtab.h
index f9713b56d3d0..09b0a3744937 100644
--- a/security/selinux/ss/hashtab.h
+++ b/security/selinux/ss/hashtab.h
@@ -38,6 +38,7 @@ struct hashtab {
 struct hashtab_info {
 	u32 slots_used;
 	u32 max_chain_len;
+	u64 chain2_len_sum;
 };
 
 /*
diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c
index 2d528f699a22..595a435ea9c8 100644
--- a/security/selinux/ss/policydb.c
+++ b/security/selinux/ss/policydb.c
@@ -491,7 +491,7 @@ static u32 role_trans_hash(const void *k)
 {
 	const struct role_trans_key *key = k;
 
-	return key->role + (key->type << 3) + (key->tclass << 5);
+	return jhash_3words(key->role, key->type, (u32)key->tclass << 16 | key->tclass, 0);
 }
 
 static int role_trans_cmp(const void *k1, const void *k2)
@@ -684,9 +684,9 @@ static void hash_eval(struct hashtab *h, const char *hash_name)
 	struct hashtab_info info;
 
 	hashtab_stat(h, &info);
-	pr_debug("SELinux: %s:  %d entries and %d/%d buckets used, longest chain length %d\n",
+	pr_debug("SELinux: %s:  %d entries and %d/%d buckets used, longest chain length %d, sum of chain length^2 %llu\n",
 		 hash_name, h->nel, info.slots_used, h->size,
-		 info.max_chain_len);
+		 info.max_chain_len, info.chain2_len_sum);
 }
 
 static void symtab_hash_eval(struct symtab *s)
diff --git a/security/selinux/ss/sidtab.c b/security/selinux/ss/sidtab.c
index d8ead463b8df..732fd8e22a12 100644
--- a/security/selinux/ss/sidtab.c
+++ b/security/selinux/ss/sidtab.c
@@ -25,7 +25,7 @@ struct sidtab_str_cache {
 	struct list_head lru_member;
 	struct sidtab_entry *parent;
 	u32 len;
-	char str[];
+	char str[] __counted_by(len);
 };
 
 #define index_to_sid(index) ((index) + SECINITSID_NUM + 1)
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index 25006fddc964..255f1b470295 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -52,7 +52,7 @@ static int tomoyo_cred_prepare(struct cred *new, const struct cred *old,
  *
  * @bprm: Pointer to "struct linux_binprm".
  */
-static void tomoyo_bprm_committed_creds(struct linux_binprm *bprm)
+static void tomoyo_bprm_committed_creds(const struct linux_binprm *bprm)
 {
 	/* Clear old_domain_info saved by execve() request. */
 	struct tomoyo_task *s = tomoyo_task(current);
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index bd9ddf412b46..9a69236fa207 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -3527,7 +3527,7 @@ static ssize_t snd_pcm_readv(struct kiocb *iocb, struct iov_iter *to)
 	if (runtime->state == SNDRV_PCM_STATE_OPEN ||
 	    runtime->state == SNDRV_PCM_STATE_DISCONNECTED)
 		return -EBADFD;
-	if (!to->user_backed)
+	if (!user_backed_iter(to))
 		return -EINVAL;
 	if (to->nr_segs > 1024 || to->nr_segs != runtime->channels)
 		return -EINVAL;
@@ -3567,7 +3567,7 @@ static ssize_t snd_pcm_writev(struct kiocb *iocb, struct iov_iter *from)
 	if (runtime->state == SNDRV_PCM_STATE_OPEN ||
 	    runtime->state == SNDRV_PCM_STATE_DISCONNECTED)
 		return -EBADFD;
-	if (!from->user_backed)
+	if (!user_backed_iter(from))
 		return -EINVAL;
 	if (from->nr_segs > 128 || from->nr_segs != runtime->channels ||
 	    !frame_aligned(runtime, iov->iov_len))
diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile
index 909b6eb500fe..e69c26abe1ea 100644
--- a/tools/include/nolibc/Makefile
+++ b/tools/include/nolibc/Makefile
@@ -34,6 +34,7 @@ all_files := \
 		signal.h \
 		stackprotector.h \
 		std.h \
+		stdarg.h \
 		stdint.h \
 		stdlib.h \
 		string.h \
diff --git a/tools/include/nolibc/arch-aarch64.h b/tools/include/nolibc/arch-aarch64.h
index 6c33c46848e3..b23ac1f04035 100644
--- a/tools/include/nolibc/arch-aarch64.h
+++ b/tools/include/nolibc/arch-aarch64.h
@@ -20,10 +20,7 @@
  *   - the arguments are cast to long and assigned into the target registers
  *     which are then simply passed as registers to the asm code, so that we
  *     don't have to experience issues with register constraints.
- *
- * On aarch64, select() is not implemented so we have to use pselect6().
  */
-#define __ARCH_WANT_SYS_PSELECT6
 
 #define my_syscall0(num)                                                      \
 ({                                                                            \
diff --git a/tools/include/nolibc/arch-loongarch.h b/tools/include/nolibc/arch-loongarch.h
index bf98f6220195..3f8ef8f86c0f 100644
--- a/tools/include/nolibc/arch-loongarch.h
+++ b/tools/include/nolibc/arch-loongarch.h
@@ -19,10 +19,8 @@
  *   - the arguments are cast to long and assigned into the target
  *     registers which are then simply passed as registers to the asm code,
  *     so that we don't have to experience issues with register constraints.
- *
- * On LoongArch, select() is not implemented so we have to use pselect6().
  */
-#define __ARCH_WANT_SYS_PSELECT6
+
 #define _NOLIBC_SYSCALL_CLOBBERLIST \
 	"memory", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t8"
 
diff --git a/tools/include/nolibc/arch-riscv.h b/tools/include/nolibc/arch-riscv.h
index 950cc2283fd7..1927c643c739 100644
--- a/tools/include/nolibc/arch-riscv.h
+++ b/tools/include/nolibc/arch-riscv.h
@@ -19,10 +19,7 @@
  *   - the arguments are cast to long and assigned into the target
  *     registers which are then simply passed as registers to the asm code,
  *     so that we don't have to experience issues with register constraints.
- *
- * On riscv, select() is not implemented so we have to use pselect6().
  */
-#define __ARCH_WANT_SYS_PSELECT6
 
 #define my_syscall0(num)                                                      \
 ({                                                                            \
diff --git a/tools/include/nolibc/arch-x86_64.h b/tools/include/nolibc/arch-x86_64.h
index e5ccb926c903..68609f421934 100644
--- a/tools/include/nolibc/arch-x86_64.h
+++ b/tools/include/nolibc/arch-x86_64.h
@@ -173,4 +173,46 @@ void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_
 	__builtin_unreachable();
 }
 
+#define NOLIBC_ARCH_HAS_MEMMOVE
+void *memmove(void *dst, const void *src, size_t len);
+
+#define NOLIBC_ARCH_HAS_MEMCPY
+void *memcpy(void *dst, const void *src, size_t len);
+
+#define NOLIBC_ARCH_HAS_MEMSET
+void *memset(void *dst, int c, size_t len);
+
+__asm__ (
+/*
+ * memmove/memcpy/memset for x86-64. Register roles assume the SysV x86-64
+ * calling convention: arg1 in %rdi, arg2 in %rsi, arg3 in %rdx, result in %rax.
+ */
+".section .text.nolibc_memmove_memcpy\n"
+".weak memmove\n"
+".weak memcpy\n"
+"memmove:\n"				/* both symbols share one body */
+"memcpy:\n"
+	"movq %rdx, %rcx\n\t"		/* rcx = len, the count for rep */
+	"movq %rdi, %rax\n\t"		/* return value = dst */
+	"movq %rdi, %rdx\n\t"
+	"subq %rsi, %rdx\n\t"		/* rdx = dst - src */
+	"cmpq %rcx, %rdx\n\t"		/* unsigned (dst - src) < len? */
+	"jb   .Lbackward_copy\n\t"	/* yes: dst starts inside src range */
+	"rep movsb\n\t"			/* forward byte copy */
+	"retq\n"
+".Lbackward_copy:"
+	"leaq -1(%rdi, %rcx, 1), %rdi\n\t"	/* rdi = dst + len - 1 */
+	"leaq -1(%rsi, %rcx, 1), %rsi\n\t"	/* rsi = src + len - 1 */
+	"std\n\t"			/* DF=1: movsb walks downwards */
+	"rep movsb\n\t"
+	"cld\n\t"			/* clear DF again before returning */
+	"retq\n"
+
+".section .text.nolibc_memset\n"
+".weak memset\n"
+"memset:\n"
+	"xchgl %eax, %esi\n\t"		/* al = c, the byte stosb stores */
+	"movq  %rdx, %rcx\n\t"		/* rcx = len */
+	"pushq %rdi\n\t"		/* save dst: rep stosb advances rdi */
+	"rep stosb\n\t"
+	"popq  %rax\n\t"		/* return the original dst */
+	"retq\n"
+);
+
 #endif /* _NOLIBC_ARCH_X86_64_H */
diff --git a/tools/include/nolibc/crt.h b/tools/include/nolibc/crt.h
index a05655b4ce1d..43b551468c2a 100644
--- a/tools/include/nolibc/crt.h
+++ b/tools/include/nolibc/crt.h
@@ -13,12 +13,23 @@ const unsigned long *_auxv __attribute__((weak));
 static void __stack_chk_init(void);
 static void exit(int);
 
+extern void (*const __preinit_array_start[])(void) __attribute__((weak));
+extern void (*const __preinit_array_end[])(void) __attribute__((weak));
+
+extern void (*const __init_array_start[])(void) __attribute__((weak));
+extern void (*const __init_array_end[])(void) __attribute__((weak));
+
+extern void (*const __fini_array_start[])(void) __attribute__((weak));
+extern void (*const __fini_array_end[])(void) __attribute__((weak));
+
 __attribute__((weak))
 void _start_c(long *sp)
 {
 	long argc;
 	char **argv;
 	char **envp;
+	int exitcode;
+	void (* const *func)(void);
 	const unsigned long *auxv;
 	/* silence potential warning: conflicting types for 'main' */
 	int _nolibc_main(int, char **, char **) __asm__ ("main");
@@ -55,8 +66,18 @@ void _start_c(long *sp)
 		;
 	_auxv = auxv;
 
+	for (func = __preinit_array_start; func < __preinit_array_end; func++)
+		(*func)();
+	for (func = __init_array_start; func < __init_array_end; func++)
+		(*func)();
+
 	/* go to application */
-	exit(_nolibc_main(argc, argv, envp));
+	exitcode = _nolibc_main(argc, argv, envp);
+
+	for (func = __fini_array_end; func > __fini_array_start;)
+		(*--func)();
+
+	exit(exitcode);
 }
 
 #endif /* _NOLIBC_CRT_H */
diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h
index 1f8d821000ac..989e707263a4 100644
--- a/tools/include/nolibc/nolibc.h
+++ b/tools/include/nolibc/nolibc.h
@@ -74,10 +74,10 @@
  *            -I../nolibc -o hello hello.c -lgcc
  *
  * The available standard (but limited) include files are:
- *   ctype.h, errno.h, signal.h, stdio.h, stdlib.h, string.h, time.h
+ *   ctype.h, errno.h, signal.h, stdarg.h, stdio.h, stdlib.h, string.h, time.h
  *
  * In addition, the following ones are expected to be provided by the compiler:
- *   float.h, stdarg.h, stddef.h
+ *   float.h, stddef.h
  *
  * The following ones which are part to the C standard are not provided:
  *   assert.h, locale.h, math.h, setjmp.h, limits.h
diff --git a/tools/include/nolibc/stdarg.h b/tools/include/nolibc/stdarg.h
new file mode 100644
index 000000000000..c628b5783da6
--- /dev/null
+++ b/tools/include/nolibc/stdarg.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+/*
+ * Variadic argument support for NOLIBC
+ * Copyright (C) 2005-2020 Rich Felker, et al.
+ */
+
+#ifndef _NOLIBC_STDARG_H
+#define _NOLIBC_STDARG_H
+
+typedef __builtin_va_list va_list;
+#define va_start(v, l)   __builtin_va_start(v, l)
+#define va_end(v)        __builtin_va_end(v)
+#define va_arg(v, l)     __builtin_va_arg(v, l)
+#define va_copy(d, s)    __builtin_va_copy(d, s)
+
+#endif /* _NOLIBC_STDARG_H */
diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h
index cae402c11e57..d7ef43973916 100644
--- a/tools/include/nolibc/stdio.h
+++ b/tools/include/nolibc/stdio.h
@@ -7,13 +7,12 @@
 #ifndef _NOLIBC_STDIO_H
 #define _NOLIBC_STDIO_H
 
-#include <stdarg.h>
-
 #include "std.h"
 #include "arch.h"
 #include "errno.h"
 #include "types.h"
 #include "sys.h"
+#include "stdarg.h"
 #include "stdlib.h"
 #include "string.h"
 
diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h
index 0c2e06c7c477..a01c69dd495f 100644
--- a/tools/include/nolibc/string.h
+++ b/tools/include/nolibc/string.h
@@ -27,28 +27,7 @@ int memcmp(const void *s1, const void *s2, size_t n)
 	return c1;
 }
 
-static __attribute__((unused))
-void *_nolibc_memcpy_up(void *dst, const void *src, size_t len)
-{
-	size_t pos = 0;
-
-	while (pos < len) {
-		((char *)dst)[pos] = ((const char *)src)[pos];
-		pos++;
-	}
-	return dst;
-}
-
-static __attribute__((unused))
-void *_nolibc_memcpy_down(void *dst, const void *src, size_t len)
-{
-	while (len) {
-		len--;
-		((char *)dst)[len] = ((const char *)src)[len];
-	}
-	return dst;
-}
-
+#ifndef NOLIBC_ARCH_HAS_MEMMOVE
 /* might be ignored by the compiler without -ffreestanding, then found as
  * missing.
  */
@@ -72,14 +51,24 @@ void *memmove(void *dst, const void *src, size_t len)
 	}
 	return dst;
 }
+#endif /* #ifndef NOLIBC_ARCH_HAS_MEMMOVE */
 
+#ifndef NOLIBC_ARCH_HAS_MEMCPY
 /* must be exported, as it's used by libgcc on ARM */
 __attribute__((weak,unused,section(".text.nolibc_memcpy")))
 void *memcpy(void *dst, const void *src, size_t len)
 {
-	return _nolibc_memcpy_up(dst, src, len);
+	size_t pos = 0;
+
+	while (pos < len) {
+		((char *)dst)[pos] = ((const char *)src)[pos];
+		pos++;
+	}
+	return dst;
 }
+#endif /* #ifndef NOLIBC_ARCH_HAS_MEMCPY */
 
+#ifndef NOLIBC_ARCH_HAS_MEMSET
 /* might be ignored by the compiler without -ffreestanding, then found as
  * missing.
  */
@@ -95,6 +84,7 @@ void *memset(void *dst, int b, size_t len)
 	}
 	return dst;
 }
+#endif /* #ifndef NOLIBC_ARCH_HAS_MEMSET */
 
 static __attribute__((unused))
 char *strchr(const char *s, int c)
diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h
index fdb6bd6c0e2f..2f359cb03d10 100644
--- a/tools/include/nolibc/sys.h
+++ b/tools/include/nolibc/sys.h
@@ -7,7 +7,6 @@
 #ifndef _NOLIBC_SYS_H
 #define _NOLIBC_SYS_H
 
-#include <stdarg.h>
 #include "std.h"
 
 /* system includes */
@@ -25,6 +24,7 @@
 
 #include "arch.h"
 #include "errno.h"
+#include "stdarg.h"
 #include "types.h"
 
 
@@ -43,6 +43,16 @@
 		: __sysret_arg;                         /* return original value */ \
 })
 
+/* Syscall ENOSYS helper: the variadic tail accepts the caller's arguments so
+ * they count as used (no unused-parameter warnings). Call sites pass __func__
+ * as @syscall; it is ignored here and serves as a debugging hook.
+ */
+static __inline__ int __nolibc_enosys(const char *syscall, ...)
+{
+	(void)syscall;
+	return -ENOSYS;
+}
+
 
 /* Functions in this file only describe syscalls. They're declared static so
  * that the compiler usually decides to inline them while still being allowed
@@ -133,7 +143,7 @@ int sys_chmod(const char *path, mode_t mode)
 #elif defined(__NR_chmod)
 	return my_syscall2(__NR_chmod, path, mode);
 #else
-	return -ENOSYS;
+	return __nolibc_enosys(__func__, path, mode);
 #endif
 }
 
@@ -156,7 +166,7 @@ int sys_chown(const char *path, uid_t owner, gid_t group)
 #elif defined(__NR_chown)
 	return my_syscall3(__NR_chown, path, owner, group);
 #else
-	return -ENOSYS;
+	return __nolibc_enosys(__func__, path, owner, group);
 #endif
 }
 
@@ -230,7 +240,7 @@ int sys_dup2(int old, int new)
 #elif defined(__NR_dup2)
 	return my_syscall2(__NR_dup2, old, new);
 #else
-	return -ENOSYS;
+	return __nolibc_enosys(__func__, old, new);
 #endif
 }
 
@@ -312,7 +322,7 @@ pid_t sys_fork(void)
 #elif defined(__NR_fork)
 	return my_syscall0(__NR_fork);
 #else
-	return -ENOSYS;
+	return __nolibc_enosys(__func__);
 #endif
 }
 #endif
@@ -486,7 +496,7 @@ int sys_gettimeofday(struct timeval *tv, struct timezone *tz)
 #ifdef __NR_gettimeofday
 	return my_syscall2(__NR_gettimeofday, tv, tz);
 #else
-	return -ENOSYS;
+	return __nolibc_enosys(__func__, tv, tz);
 #endif
 }
 
@@ -563,7 +573,7 @@ int sys_link(const char *old, const char *new)
 #elif defined(__NR_link)
 	return my_syscall2(__NR_link, old, new);
 #else
-	return -ENOSYS;
+	return __nolibc_enosys(__func__, old, new);
 #endif
 }
 
@@ -584,7 +594,7 @@ off_t sys_lseek(int fd, off_t offset, int whence)
 #ifdef __NR_lseek
 	return my_syscall3(__NR_lseek, fd, offset, whence);
 #else
-	return -ENOSYS;
+	return __nolibc_enosys(__func__, fd, offset, whence);
 #endif
 }
 
@@ -607,7 +617,7 @@ int sys_mkdir(const char *path, mode_t mode)
 #elif defined(__NR_mkdir)
 	return my_syscall2(__NR_mkdir, path, mode);
 #else
-	return -ENOSYS;
+	return __nolibc_enosys(__func__, path, mode);
 #endif
 }
 
@@ -629,7 +639,7 @@ int sys_rmdir(const char *path)
 #elif defined(__NR_unlinkat)
 	return my_syscall3(__NR_unlinkat, AT_FDCWD, path, AT_REMOVEDIR);
 #else
-	return -ENOSYS;
+	return __nolibc_enosys(__func__, path);
 #endif
 }
 
@@ -652,7 +662,7 @@ long sys_mknod(const char *path, mode_t mode, dev_t dev)
 #elif defined(__NR_mknod)
 	return my_syscall3(__NR_mknod, path, mode, dev);
 #else
-	return -ENOSYS;
+	return __nolibc_enosys(__func__, path, mode, dev);
 #endif
 }
 
@@ -742,7 +752,7 @@ int sys_open(const char *path, int flags, mode_t mode)
 #elif defined(__NR_open)
 	return my_syscall3(__NR_open, path, flags, mode);
 #else
-	return -ENOSYS;
+	return __nolibc_enosys(__func__, path, flags, mode);
 #endif
 }
 
@@ -842,7 +852,7 @@ int sys_poll(struct pollfd *fds, int nfds, int timeout)
 #elif defined(__NR_poll)
 	return my_syscall3(__NR_poll, fds, nfds, timeout);
 #else
-	return -ENOSYS;
+	return __nolibc_enosys(__func__, fds, nfds, timeout);
 #endif
 }
 
@@ -920,7 +930,11 @@ int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeva
 		struct timeval *t;
 	} arg = { .n = nfds, .r = rfds, .w = wfds, .e = efds, .t = timeout };
 	return my_syscall1(__NR_select, &arg);
-#elif defined(__ARCH_WANT_SYS_PSELECT6) && defined(__NR_pselect6)
+#elif defined(__NR__newselect)
+	return my_syscall5(__NR__newselect, nfds, rfds, wfds, efds, timeout);
+#elif defined(__NR_select)
+	return my_syscall5(__NR_select, nfds, rfds, wfds, efds, timeout);
+#elif defined(__NR_pselect6)
 	struct timespec t;
 
 	if (timeout) {
@@ -928,13 +942,8 @@ int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeva
 		t.tv_nsec = timeout->tv_usec * 1000;
 	}
 	return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL);
-#elif defined(__NR__newselect) || defined(__NR_select)
-#ifndef __NR__newselect
-#define __NR__newselect __NR_select
-#endif
-	return my_syscall5(__NR__newselect, nfds, rfds, wfds, efds, timeout);
 #else
-	return -ENOSYS;
+	return __nolibc_enosys(__func__, nfds, rfds, wfds, efds, timeout);
 #endif
 }
 
@@ -989,7 +998,7 @@ int sys_statx(int fd, const char *path, int flags, unsigned int mask, struct sta
 #ifdef __NR_statx
 	return my_syscall5(__NR_statx, fd, path, flags, mask, buf);
 #else
-	return -ENOSYS;
+	return __nolibc_enosys(__func__, fd, path, flags, mask, buf);
 #endif
 }
 
@@ -1047,7 +1056,7 @@ int sys_symlink(const char *old, const char *new)
 #elif defined(__NR_symlink)
 	return my_syscall2(__NR_symlink, old, new);
 #else
-	return -ENOSYS;
+	return __nolibc_enosys(__func__, old, new);
 #endif
 }
 
@@ -1104,7 +1113,7 @@ int sys_unlink(const char *path)
 #elif defined(__NR_unlink)
 	return my_syscall1(__NR_unlink, path);
 #else
-	return -ENOSYS;
+	return __nolibc_enosys(__func__, path);
 #endif
 }
 
@@ -1127,7 +1136,7 @@ pid_t sys_wait4(pid_t pid, int *status, int options, struct rusage *rusage)
 #ifdef __NR_wait4
 	return my_syscall4(__NR_wait4, pid, status, options, rusage);
 #else
-	return -ENOSYS;
+	return __nolibc_enosys(__func__, pid, status, options, rusage);
 #endif
 }
 
diff --git a/tools/net/ynl/Makefile.deps b/tools/net/ynl/Makefile.deps
index f842bc66b967..64d139400db1 100644
--- a/tools/net/ynl/Makefile.deps
+++ b/tools/net/ynl/Makefile.deps
@@ -18,3 +18,4 @@ CFLAGS_devlink:=$(call get_hdr_inc,_LINUX_DEVLINK_H_,devlink.h)
 CFLAGS_ethtool:=$(call get_hdr_inc,_LINUX_ETHTOOL_NETLINK_H_,ethtool_netlink.h)
 CFLAGS_handshake:=$(call get_hdr_inc,_LINUX_HANDSHAKE_H,handshake.h)
 CFLAGS_netdev:=$(call get_hdr_inc,_LINUX_NETDEV_H,netdev.h)
+CFLAGS_nfsd:=$(call get_hdr_inc,_LINUX_NFSD_H,nfsd.h)
diff --git a/tools/net/ynl/generated/Makefile b/tools/net/ynl/generated/Makefile
index 2f47b9cac757..84cbabdd02a8 100644
--- a/tools/net/ynl/generated/Makefile
+++ b/tools/net/ynl/generated/Makefile
@@ -14,7 +14,7 @@ YNL_GEN_ARG_ethtool:=--user-header linux/ethtool_netlink.h \
 
 TOOL:=../ynl-gen-c.py
 
-GENS:=ethtool devlink handshake fou netdev
+GENS:=ethtool devlink handshake fou netdev nfsd
 SRCS=$(patsubst %,%-user.c,${GENS})
 HDRS=$(patsubst %,%-user.h,${GENS})
 OBJS=$(patsubst %,%-user.o,${GENS})
diff --git a/tools/net/ynl/generated/nfsd-user.c b/tools/net/ynl/generated/nfsd-user.c
new file mode 100644
index 000000000000..fec6828680ce
--- /dev/null
+++ b/tools/net/ynl/generated/nfsd-user.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/nfsd.yaml */
+/* YNL-GEN user source */
+
+#include <stdlib.h>
+#include <string.h>
+#include "nfsd-user.h"
+#include "ynl.h"
+#include <linux/nfsd_netlink.h>
+
+#include <libmnl/libmnl.h>
+#include <linux/genetlink.h>
+
+/* Enums */
+static const char * const nfsd_op_strmap[] = {
+	[NFSD_CMD_RPC_STATUS_GET] = "rpc-status-get",
+};
+
+const char *nfsd_op_str(int op)
+{
+	if (op < 0 || op >= (int)MNL_ARRAY_SIZE(nfsd_op_strmap))
+		return NULL;
+	return nfsd_op_strmap[op];
+}
+
+/* Policies */
+struct ynl_policy_attr nfsd_rpc_status_policy[NFSD_A_RPC_STATUS_MAX + 1] = {
+	[NFSD_A_RPC_STATUS_XID] = { .name = "xid", .type = YNL_PT_U32, },
+	[NFSD_A_RPC_STATUS_FLAGS] = { .name = "flags", .type = YNL_PT_U32, },
+	[NFSD_A_RPC_STATUS_PROG] = { .name = "prog", .type = YNL_PT_U32, },
+	[NFSD_A_RPC_STATUS_VERSION] = { .name = "version", .type = YNL_PT_U8, },
+	[NFSD_A_RPC_STATUS_PROC] = { .name = "proc", .type = YNL_PT_U32, },
+	[NFSD_A_RPC_STATUS_SERVICE_TIME] = { .name = "service_time", .type = YNL_PT_U64, },
+	[NFSD_A_RPC_STATUS_PAD] = { .name = "pad", .type = YNL_PT_IGNORE, },
+	[NFSD_A_RPC_STATUS_SADDR4] = { .name = "saddr4", .type = YNL_PT_U32, },
+	[NFSD_A_RPC_STATUS_DADDR4] = { .name = "daddr4", .type = YNL_PT_U32, },
+	[NFSD_A_RPC_STATUS_SADDR6] = { .name = "saddr6", .type = YNL_PT_BINARY,},
+	[NFSD_A_RPC_STATUS_DADDR6] = { .name = "daddr6", .type = YNL_PT_BINARY,},
+	[NFSD_A_RPC_STATUS_SPORT] = { .name = "sport", .type = YNL_PT_U16, },
+	[NFSD_A_RPC_STATUS_DPORT] = { .name = "dport", .type = YNL_PT_U16, },
+	[NFSD_A_RPC_STATUS_COMPOUND_OPS] = { .name = "compound-ops", .type = YNL_PT_U32, },
+};
+
+struct ynl_policy_nest nfsd_rpc_status_nest = {
+	.max_attr = NFSD_A_RPC_STATUS_MAX,
+	.table = nfsd_rpc_status_policy,
+};
+
+/* Common nested types */
+/* ============== NFSD_CMD_RPC_STATUS_GET ============== */
+/* NFSD_CMD_RPC_STATUS_GET - dump */
+void nfsd_rpc_status_get_list_free(struct nfsd_rpc_status_get_list *rsp)
+{
+	struct nfsd_rpc_status_get_list *next = rsp;
+
+	while ((void *)next != YNL_LIST_END) {
+		rsp = next;
+		next = rsp->next;
+
+		free(rsp->obj.saddr6);
+		free(rsp->obj.daddr6);
+		free(rsp->obj.compound_ops);
+		free(rsp);
+	}
+}
+
+struct nfsd_rpc_status_get_list *nfsd_rpc_status_get_dump(struct ynl_sock *ys)
+{
+	struct ynl_dump_state yds = {};
+	struct nlmsghdr *nlh;
+	int err;
+
+	yds.ys = ys;
+	yds.alloc_sz = sizeof(struct nfsd_rpc_status_get_list);
+	yds.cb = nfsd_rpc_status_get_rsp_parse;
+	yds.rsp_cmd = NFSD_CMD_RPC_STATUS_GET;
+	yds.rsp_policy = &nfsd_rpc_status_nest;
+
+	nlh = ynl_gemsg_start_dump(ys, ys->family_id, NFSD_CMD_RPC_STATUS_GET, 1);
+
+	err = ynl_exec_dump(ys, nlh, &yds);
+	if (err < 0)
+		goto free_list;
+
+	return yds.first;
+
+free_list:
+	nfsd_rpc_status_get_list_free(yds.first);
+	return NULL;
+}
+
+const struct ynl_family ynl_nfsd_family =  {
+	.name		= "nfsd",
+};
diff --git a/tools/net/ynl/generated/nfsd-user.h b/tools/net/ynl/generated/nfsd-user.h
new file mode 100644
index 000000000000..b6b69501031a
--- /dev/null
+++ b/tools/net/ynl/generated/nfsd-user.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/nfsd.yaml */
+/* YNL-GEN user header */
+
+#ifndef _LINUX_NFSD_GEN_H
+#define _LINUX_NFSD_GEN_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <linux/types.h>
+#include <linux/nfsd_netlink.h>
+
+struct ynl_sock;
+
+extern const struct ynl_family ynl_nfsd_family;
+
+/* Enums */
+const char *nfsd_op_str(int op);
+
+/* Common nested types */
+/* ============== NFSD_CMD_RPC_STATUS_GET ============== */
+/* NFSD_CMD_RPC_STATUS_GET - dump */
+struct nfsd_rpc_status_get_list {
+	struct nfsd_rpc_status_get_list *next;
+	struct nfsd_rpc_status_get_rsp obj __attribute__ ((aligned (8)));
+};
+
+void nfsd_rpc_status_get_list_free(struct nfsd_rpc_status_get_list *rsp);
+
+struct nfsd_rpc_status_get_list *nfsd_rpc_status_get_dump(struct ynl_sock *ys);
+
+#endif /* _LINUX_NFSD_GEN_H */
diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index c0f25d00181e..e327cd827135 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -291,7 +291,7 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 		switch (modrm_reg & 7) {
 		case 5:
 			imm = -imm;
-			/* fallthrough */
+			fallthrough;
 		case 0:
 			/* add/sub imm, %rsp */
 			ADD_OP(op) {
@@ -375,7 +375,7 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 			break;
 		}
 
-		/* fallthrough */
+		fallthrough;
 	case 0x88:
 		if (!rex_w)
 			break;
@@ -656,7 +656,7 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 			break;
 		}
 
-		/* fallthrough */
+		fallthrough;
 
 	case 0xca: /* retf */
 	case 0xcb: /* retf */
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index e308d1ba664e..e94756e09ca9 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -1611,6 +1611,22 @@ static int add_jump_destinations(struct objtool_file *file)
 		}
 
 		/*
+		 * An intra-TU jump in retpoline.o might not have a relocation
+		 * for its jump dest, in which case the above
+		 * add_{retpoline,return}_call() didn't happen.
+		 */
+		if (jump_dest->sym && jump_dest->offset == jump_dest->sym->offset) {
+			if (jump_dest->sym->retpoline_thunk) {
+				add_retpoline_call(file, insn);
+				continue;
+			}
+			if (jump_dest->sym->return_thunk) {
+				add_return_call(file, insn, true);
+				continue;
+			}
+		}
+
+		/*
 		 * Cross-function jump.
 		 */
 		if (insn_func(insn) && insn_func(jump_dest) &&
diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index 081befa4674b..3d27983dc908 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -22,8 +22,6 @@
 #include <objtool/elf.h>
 #include <objtool/warn.h>
 
-#define MAX_NAME_LEN 128
-
 static inline u32 str_hash(const char *str)
 {
 	return jhash(str, strlen(str), 0);
@@ -515,7 +513,7 @@ static int read_symbols(struct elf *elf)
 	/* Create parent/child links for any cold subfunctions */
 	list_for_each_entry(sec, &elf->sections, list) {
 		sec_for_each_sym(sec, sym) {
-			char pname[MAX_NAME_LEN + 1];
+			char *pname;
 			size_t pnamelen;
 			if (sym->type != STT_FUNC)
 				continue;
@@ -531,15 +529,15 @@ static int read_symbols(struct elf *elf)
 				continue;
 
 			pnamelen = coldstr - sym->name;
-			if (pnamelen > MAX_NAME_LEN) {
-				WARN("%s(): parent function name exceeds maximum length of %d characters",
-				     sym->name, MAX_NAME_LEN);
+			pname = strndup(sym->name, pnamelen);
+			if (!pname) {
+				WARN("%s(): failed to allocate memory",
+				     sym->name);
 				return -1;
 			}
 
-			strncpy(pname, sym->name, pnamelen);
-			pname[pnamelen] = '\0';
 			pfunc = find_symbol_by_name(elf, pname);
+			free(pname);
 
 			if (!pfunc) {
 				WARN("%s(): can't find parent function",
diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h
index e45c7cb1d5bc..e92f67383dde 100644
--- a/tools/objtool/noreturns.h
+++ b/tools/objtool/noreturns.h
@@ -14,6 +14,8 @@ NORETURN(__stack_chk_fail)
 NORETURN(__ubsan_handle_builtin_unreachable)
 NORETURN(arch_call_rest_init)
 NORETURN(arch_cpu_idle_dead)
+NORETURN(bch2_trans_in_restart_error)
+NORETURN(bch2_trans_restart_error)
 NORETURN(cpu_bringup_and_idle)
 NORETURN(cpu_startup_entry)
 NORETURN(do_exit)
diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c
index c54f7235c5d9..f40febdd6e36 100644
--- a/tools/objtool/objtool.c
+++ b/tools/objtool/objtool.c
@@ -146,7 +146,5 @@ int main(int argc, const char **argv)
 	exec_cmd_init("objtool", UNUSED, UNUSED, UNUSED);
 	pager_init(UNUSED);
 
-	objtool_run(argc, argv);
-
-	return 0;
+	return objtool_run(argc, argv);
 }
diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
index 4afb132e4e4f..a6e9848189d6 100755
--- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh
+++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
@@ -3,7 +3,7 @@
 #
 # Test for cpuset v2 partition root state (PRS)
 #
-# The sched verbose flag is set, if available, so that the console log
+# The sched verbose flag can be optionally set so that the console log
 # can be examined for the correct setting of scheduling domain.
 #
 
@@ -22,27 +22,27 @@ WAIT_INOTIFY=$(cd $(dirname $0); pwd)/wait_inotify
 # Find cgroup v2 mount point
 CGROUP2=$(mount -t cgroup2 | head -1 | awk -e '{print $3}')
 [[ -n "$CGROUP2" ]] || skip_test "Cgroup v2 mount point not found!"
+SUBPARTS_CPUS=$CGROUP2/.__DEBUG__.cpuset.cpus.subpartitions
+CPULIST=$(cat $CGROUP2/cpuset.cpus.effective)
 
-CPUS=$(lscpu | grep "^CPU(s):" | sed -e "s/.*:[[:space:]]*//")
-[[ $CPUS -lt 8 ]] && skip_test "Test needs at least 8 cpus available!"
+NR_CPUS=$(lscpu | grep "^CPU(s):" | sed -e "s/.*:[[:space:]]*//")
+[[ $NR_CPUS -lt 8 ]] && skip_test "Test needs at least 8 cpus available!"
 
 # Set verbose flag and delay factor
 PROG=$1
-VERBOSE=
+VERBOSE=0
 DELAY_FACTOR=1
 SCHED_DEBUG=
 while [[ "$1" = -* ]]
 do
 	case "$1" in
-		-v) VERBOSE=1
+		-v) ((VERBOSE++))
 		    # Enable sched/verbose can slow thing down
 		    [[ $DELAY_FACTOR -eq 1 ]] &&
 			DELAY_FACTOR=2
-		    break
 		    ;;
 		-d) DELAY_FACTOR=$2
 		    shift
-		    break
 		    ;;
 		*)  echo "Usage: $PROG [-v] [-d <delay-factor>"
 		    exit
@@ -52,7 +52,7 @@ do
 done
 
 # Set sched verbose flag if available when "-v" option is specified
-if [[ -n "$VERBOSE" && -d /sys/kernel/debug/sched ]]
+if [[ $VERBOSE -gt 0 && -d /sys/kernel/debug/sched ]]
 then
 	# Used to restore the original setting during cleanup
 	SCHED_DEBUG=$(cat /sys/kernel/debug/sched/verbose)
@@ -61,14 +61,26 @@ fi
 
 cd $CGROUP2
 echo +cpuset > cgroup.subtree_control
+
+#
+# If cpuset has been set up and used in child cgroups, we may not be able to
+# create a partition under the root cgroup because of the CPU exclusivity
+# rule. Skip the test if this is the case.
+#
 [[ -d test ]] || mkdir test
-cd test
+echo 0-6 > test/cpuset.cpus
+echo root > test/cpuset.cpus.partition
+cat test/cpuset.cpus.partition | grep -q invalid
+RESULT=$?
+echo member > test/cpuset.cpus.partition
+echo "" > test/cpuset.cpus
+[[ $RESULT -eq 0 ]] && skip_test "Child cgroups are using cpuset!"
 
 cleanup()
 {
 	online_cpus
+	cd $CGROUP2
 	rmdir A1/A2/A3 A1/A2 A1 B1 > /dev/null 2>&1
-	cd ..
 	rmdir test > /dev/null 2>&1
 	[[ -n "$SCHED_DEBUG" ]] &&
 		echo "$SCHED_DEBUG" > /sys/kernel/debug/sched/verbose
@@ -103,7 +115,7 @@ test_partition()
 	[[ $? -eq 0 ]] || exit 1
 	ACTUAL_VAL=$(cat cpuset.cpus.partition)
 	[[ $ACTUAL_VAL != $EXPECTED_VAL ]] && {
-		echo "cpuset.cpus.partition: expect $EXPECTED_VAL, found $EXPECTED_VAL"
+		echo "cpuset.cpus.partition: expect $EXPECTED_VAL, found $ACTUAL_VAL"
 		echo "Test FAILED"
 		exit 1
 	}
@@ -114,7 +126,7 @@ test_effective_cpus()
 	EXPECTED_VAL=$1
 	ACTUAL_VAL=$(cat cpuset.cpus.effective)
 	[[ "$ACTUAL_VAL" != "$EXPECTED_VAL" ]] && {
-		echo "cpuset.cpus.effective: expect '$EXPECTED_VAL', found '$EXPECTED_VAL'"
+		echo "cpuset.cpus.effective: expect '$EXPECTED_VAL', found '$ACTUAL_VAL'"
 		echo "Test FAILED"
 		exit 1
 	}
@@ -139,6 +151,7 @@ test_add_proc()
 #
 test_isolated()
 {
+	cd $CGROUP2/test
 	echo 2-3 > cpuset.cpus
 	TYPE=$(cat cpuset.cpus.partition)
 	[[ $TYPE = member ]] || echo member > cpuset.cpus.partition
@@ -203,125 +216,220 @@ test_isolated()
 #
 # Cgroup test hierarchy
 #
-# test -- A1 -- A2 -- A3
-#      \- B1
+# root -- A1 -- A2 -- A3
+#      +- B1
 #
-#  P<v> = set cpus.partition (0:member, 1:root, 2:isolated, -1:root invalid)
-#  C<l> = add cpu-list
+#  P<v> = set cpus.partition (0:member, 1:root, 2:isolated)
+#  C<l> = add cpu-list to cpuset.cpus
+#  X<l> = add cpu-list to cpuset.cpus.exclusive
 #  S<p> = use prefix in subtree_control
 #  T    = put a task into cgroup
-#  O<c>-<v> = Write <v> to CPU online file of <c>
+#  O<c>=<v> = Write <v> to CPU online file of <c>
 #
 SETUP_A123_PARTITIONS="C1-3:P1:S+ C2-3:P1:S+ C3:P1"
 TEST_MATRIX=(
-	# test  old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate
-	# ----  ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------
-	"  S+    C0-1     .      .    C2-3    S+    C4-5     .      .     0 A2:0-1"
-	"  S+    C0-1     .      .    C2-3    P1      .      .      .     0 "
-	"  S+    C0-1     .      .    C2-3   P1:S+ C0-1:P1   .      .     0 "
-	"  S+    C0-1     .      .    C2-3   P1:S+  C1:P1    .      .     0 "
-	"  S+   C0-1:S+   .      .    C2-3     .      .      .     P1     0 "
-	"  S+   C0-1:P1   .      .    C2-3    S+     C1      .      .     0 "
-	"  S+   C0-1:P1   .      .    C2-3    S+    C1:P1    .      .     0 "
-	"  S+   C0-1:P1   .      .    C2-3    S+    C1:P1    .     P1     0 "
-	"  S+   C0-1:P1   .      .    C2-3   C4-5     .      .      .     0 A1:4-5"
-	"  S+   C0-1:P1   .      .    C2-3  S+:C4-5   .      .      .     0 A1:4-5"
-	"  S+    C0-1     .      .   C2-3:P1   .      .      .     C2     0 "
-	"  S+    C0-1     .      .   C2-3:P1   .      .      .    C4-5    0 B1:4-5"
-	"  S+ C0-3:P1:S+ C2-3:P1 .      .      .      .      .      .     0 A1:0-1,A2:2-3"
-	"  S+ C0-3:P1:S+ C2-3:P1 .      .     C1-3    .      .      .     0 A1:1,A2:2-3"
-	"  S+ C2-3:P1:S+  C3:P1  .      .     C3      .      .      .     0 A1:,A2:3 A1:P1,A2:P1"
-	"  S+ C2-3:P1:S+  C3:P1  .      .     C3      P0     .      .     0 A1:3,A2:3 A1:P1,A2:P0"
-	"  S+ C2-3:P1:S+  C2:P1  .      .     C2-4    .      .      .     0 A1:3-4,A2:2"
-	"  S+ C2-3:P1:S+  C3:P1  .      .     C3      .      .     C0-2   0 A1:,B1:0-2 A1:P1,A2:P1"
-	"  S+ $SETUP_A123_PARTITIONS    .     C2-3    .      .      .     0 A1:,A2:2,A3:3 A1:P1,A2:P1,A3:P1"
+	#  old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS
+	#  ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ --------
+	"   C0-1     .      .    C2-3    S+    C4-5     .      .     0 A2:0-1"
+	"   C0-1     .      .    C2-3    P1      .      .      .     0 "
+	"   C0-1     .      .    C2-3   P1:S+ C0-1:P1   .      .     0 "
+	"   C0-1     .      .    C2-3   P1:S+  C1:P1    .      .     0 "
+	"  C0-1:S+   .      .    C2-3     .      .      .     P1     0 "
+	"  C0-1:P1   .      .    C2-3    S+     C1      .      .     0 "
+	"  C0-1:P1   .      .    C2-3    S+    C1:P1    .      .     0 "
+	"  C0-1:P1   .      .    C2-3    S+    C1:P1    .     P1     0 "
+	"  C0-1:P1   .      .    C2-3   C4-5     .      .      .     0 A1:4-5"
+	"  C0-1:P1   .      .    C2-3  S+:C4-5   .      .      .     0 A1:4-5"
+	"   C0-1     .      .   C2-3:P1   .      .      .     C2     0 "
+	"   C0-1     .      .   C2-3:P1   .      .      .    C4-5    0 B1:4-5"
+	"C0-3:P1:S+ C2-3:P1 .      .      .      .      .      .     0 A1:0-1,A2:2-3"
+	"C0-3:P1:S+ C2-3:P1 .      .     C1-3    .      .      .     0 A1:1,A2:2-3"
+	"C2-3:P1:S+  C3:P1  .      .     C3      .      .      .     0 A1:,A2:3 A1:P1,A2:P1"
+	"C2-3:P1:S+  C3:P1  .      .     C3      P0     .      .     0 A1:3,A2:3 A1:P1,A2:P0"
+	"C2-3:P1:S+  C2:P1  .      .     C2-4    .      .      .     0 A1:3-4,A2:2"
+	"C2-3:P1:S+  C3:P1  .      .     C3      .      .     C0-2   0 A1:,B1:0-2 A1:P1,A2:P1"
+	"$SETUP_A123_PARTITIONS    .     C2-3    .      .      .     0 A1:,A2:2,A3:3 A1:P1,A2:P1,A3:P1"
 
 	# CPU offlining cases:
-	"  S+    C0-1     .      .    C2-3    S+    C4-5     .     O2-0   0 A1:0-1,B1:3"
-	"  S+ C0-3:P1:S+ C2-3:P1 .      .     O2-0    .      .      .     0 A1:0-1,A2:3"
-	"  S+ C0-3:P1:S+ C2-3:P1 .      .     O2-0   O2-1    .      .     0 A1:0-1,A2:2-3"
-	"  S+ C0-3:P1:S+ C2-3:P1 .      .     O1-0    .      .      .     0 A1:0,A2:2-3"
-	"  S+ C0-3:P1:S+ C2-3:P1 .      .     O1-0   O1-1    .      .     0 A1:0-1,A2:2-3"
-	"  S+ C2-3:P1:S+  C3:P1  .      .     O3-0   O3-1    .      .     0 A1:2,A2:3 A1:P1,A2:P1"
-	"  S+ C2-3:P1:S+  C3:P2  .      .     O3-0   O3-1    .      .     0 A1:2,A2:3 A1:P1,A2:P2"
-	"  S+ C2-3:P1:S+  C3:P1  .      .     O2-0   O2-1    .      .     0 A1:2,A2:3 A1:P1,A2:P1"
-	"  S+ C2-3:P1:S+  C3:P2  .      .     O2-0   O2-1    .      .     0 A1:2,A2:3 A1:P1,A2:P2"
-	"  S+ C2-3:P1:S+  C3:P1  .      .     O2-0    .      .      .     0 A1:,A2:3 A1:P1,A2:P1"
-	"  S+ C2-3:P1:S+  C3:P1  .      .     O3-0    .      .      .     0 A1:2,A2: A1:P1,A2:P1"
-	"  S+ C2-3:P1:S+  C3:P1  .      .    T:O2-0   .      .      .     0 A1:3,A2:3 A1:P1,A2:P-1"
-	"  S+ C2-3:P1:S+  C3:P1  .      .      .    T:O3-0   .      .     0 A1:2,A2:2 A1:P1,A2:P-1"
-	"  S+ $SETUP_A123_PARTITIONS    .     O1-0    .      .      .     0 A1:,A2:2,A3:3 A1:P1,A2:P1,A3:P1"
-	"  S+ $SETUP_A123_PARTITIONS    .     O2-0    .      .      .     0 A1:1,A2:,A3:3 A1:P1,A2:P1,A3:P1"
-	"  S+ $SETUP_A123_PARTITIONS    .     O3-0    .      .      .     0 A1:1,A2:2,A3: A1:P1,A2:P1,A3:P1"
-	"  S+ $SETUP_A123_PARTITIONS    .    T:O1-0   .      .      .     0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P-1,A3:P-1"
-	"  S+ $SETUP_A123_PARTITIONS    .      .    T:O2-0   .      .     0 A1:1,A2:3,A3:3 A1:P1,A2:P1,A3:P-1"
-	"  S+ $SETUP_A123_PARTITIONS    .      .      .    T:O3-0   .     0 A1:1,A2:2,A3:2 A1:P1,A2:P1,A3:P-1"
-	"  S+ $SETUP_A123_PARTITIONS    .    T:O1-0  O1-1    .      .     0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1"
-	"  S+ $SETUP_A123_PARTITIONS    .      .    T:O2-0  O2-1    .     0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1"
-	"  S+ $SETUP_A123_PARTITIONS    .      .      .    T:O3-0  O3-1   0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1"
-	"  S+ $SETUP_A123_PARTITIONS    .    T:O1-0  O2-0   O1-1    .     0 A1:1,A2:,A3:3 A1:P1,A2:P1,A3:P1"
-	"  S+ $SETUP_A123_PARTITIONS    .    T:O1-0  O2-0   O2-1    .     0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P-1,A3:P-1"
-
-	# test  old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate
-	# ----  ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------
+	"   C0-1     .      .    C2-3    S+    C4-5     .     O2=0   0 A1:0-1,B1:3"
+	"C0-3:P1:S+ C2-3:P1 .      .     O2=0    .      .      .     0 A1:0-1,A2:3"
+	"C0-3:P1:S+ C2-3:P1 .      .     O2=0   O2=1    .      .     0 A1:0-1,A2:2-3"
+	"C0-3:P1:S+ C2-3:P1 .      .     O1=0    .      .      .     0 A1:0,A2:2-3"
+	"C0-3:P1:S+ C2-3:P1 .      .     O1=0   O1=1    .      .     0 A1:0-1,A2:2-3"
+	"C2-3:P1:S+  C3:P1  .      .     O3=0   O3=1    .      .     0 A1:2,A2:3 A1:P1,A2:P1"
+	"C2-3:P1:S+  C3:P2  .      .     O3=0   O3=1    .      .     0 A1:2,A2:3 A1:P1,A2:P2"
+	"C2-3:P1:S+  C3:P1  .      .     O2=0   O2=1    .      .     0 A1:2,A2:3 A1:P1,A2:P1"
+	"C2-3:P1:S+  C3:P2  .      .     O2=0   O2=1    .      .     0 A1:2,A2:3 A1:P1,A2:P2"
+	"C2-3:P1:S+  C3:P1  .      .     O2=0    .      .      .     0 A1:,A2:3 A1:P1,A2:P1"
+	"C2-3:P1:S+  C3:P1  .      .     O3=0    .      .      .     0 A1:2,A2: A1:P1,A2:P1"
+	"C2-3:P1:S+  C3:P1  .      .    T:O2=0   .      .      .     0 A1:3,A2:3 A1:P1,A2:P-1"
+	"C2-3:P1:S+  C3:P1  .      .      .    T:O3=0   .      .     0 A1:2,A2:2 A1:P1,A2:P-1"
+	"$SETUP_A123_PARTITIONS    .     O1=0    .      .      .     0 A1:,A2:2,A3:3 A1:P1,A2:P1,A3:P1"
+	"$SETUP_A123_PARTITIONS    .     O2=0    .      .      .     0 A1:1,A2:,A3:3 A1:P1,A2:P1,A3:P1"
+	"$SETUP_A123_PARTITIONS    .     O3=0    .      .      .     0 A1:1,A2:2,A3: A1:P1,A2:P1,A3:P1"
+	"$SETUP_A123_PARTITIONS    .    T:O1=0   .      .      .     0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P-1,A3:P-1"
+	"$SETUP_A123_PARTITIONS    .      .    T:O2=0   .      .     0 A1:1,A2:3,A3:3 A1:P1,A2:P1,A3:P-1"
+	"$SETUP_A123_PARTITIONS    .      .      .    T:O3=0   .     0 A1:1,A2:2,A3:2 A1:P1,A2:P1,A3:P-1"
+	"$SETUP_A123_PARTITIONS    .    T:O1=0  O1=1    .      .     0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1"
+	"$SETUP_A123_PARTITIONS    .      .    T:O2=0  O2=1    .     0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1"
+	"$SETUP_A123_PARTITIONS    .      .      .    T:O3=0  O3=1   0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1"
+	"$SETUP_A123_PARTITIONS    .    T:O1=0  O2=0   O1=1    .     0 A1:1,A2:,A3:3 A1:P1,A2:P1,A3:P1"
+	"$SETUP_A123_PARTITIONS    .    T:O1=0  O2=0   O2=1    .     0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P-1,A3:P-1"
+
+	#  old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS
+	#  ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ --------
+	#
+	# Remote partition and cpuset.cpus.exclusive tests
+	#
+	" C0-3:S+ C1-3:S+ C2-3     .    X2-3     .      .      .     0 A1:0-3,A2:1-3,A3:2-3,XA1:2-3"
+	" C0-3:S+ C1-3:S+ C2-3     .    X2-3  X2-3:P2   .      .     0 A1:0-1,A2:2-3,A3:2-3 A1:P0,A2:P2 2-3"
+	" C0-3:S+ C1-3:S+ C2-3     .    X2-3   X3:P2    .      .     0 A1:0-2,A2:3,A3:3 A1:P0,A2:P2 3"
+	" C0-3:S+ C1-3:S+ C2-3     .    X2-3   X2-3  X2-3:P2   .     0 A1:0-1,A2:1,A3:2-3 A1:P0,A3:P2 2-3"
+	" C0-3:S+ C1-3:S+ C2-3     .    X2-3   X2-3 X2-3:P2:C3 .     0 A1:0-2,A2:1-2,A3:3 A1:P0,A3:P2 3"
+	" C0-3:S+ C1-3:S+ C2-3   C2-3     .      .      .      P2    0 A1:0-3,A2:1-3,A3:2-3,B1:2-3 A1:P0,A3:P0,B1:P-2"
+	" C0-3:S+ C1-3:S+ C2-3   C4-5     .      .      .      P2    0 B1:4-5 B1:P2 4-5"
+	" C0-3:S+ C1-3:S+ C2-3    C4    X2-3   X2-3  X2-3:P2   P2    0 A3:2-3,B1:4 A3:P2,B1:P2 2-4"
+	" C0-3:S+ C1-3:S+ C2-3    C4    X2-3   X2-3 X2-3:P2:C1-3 P2  0 A3:2-3,B1:4 A3:P2,B1:P2 2-4"
+	" C0-3:S+ C1-3:S+ C2-3    C4    X1-3  X1-3:P2   P2     .     0 A2:1,A3:2-3 A2:P2,A3:P2 1-3"
+	" C0-3:S+ C1-3:S+ C2-3    C4    X2-3   X2-3  X2-3:P2 P2:C4-5 0 A3:2-3,B1:4-5 A3:P2,B1:P2 2-5"
+
+	# Nested remote/local partition tests
+	" C0-3:S+ C1-3:S+ C2-3   C4-5   X2-3  X2-3:P1   P2     P1    0 A1:0-1,A2:,A3:2-3,B1:4-5 \
+								       A1:P0,A2:P1,A3:P2,B1:P1 2-3"
+	" C0-3:S+ C1-3:S+ C2-3    C4    X2-3  X2-3:P1   P2     P1    0 A1:0-1,A2:,A3:2-3,B1:4 \
+								       A1:P0,A2:P1,A3:P2,B1:P1 2-4"
+	" C0-3:S+ C1-3:S+  C3     C4    X2-3  X2-3:P1   P2     P1    0 A1:0-1,A2:2,A3:3,B1:4 \
+								       A1:P0,A2:P1,A3:P2,B1:P1 2-4"
+	" C0-4:S+ C1-4:S+ C2-4     .    X2-4  X2-4:P2  X4:P1    .    0 A1:0-1,A2:2-3,A3:4 \
+								       A1:P0,A2:P2,A3:P1 2-4"
+	" C0-4:X2-4:S+ C1-4:X2-4:S+:P2 C2-4:X4:P1 \
+				   .      .      X5      .      .    0 A1:0-4,A2:1-4,A3:2-4 \
+								       A1:P0,A2:P-2,A3:P-1 ."
+	" C0-4:X2-4:S+ C1-4:X2-4:S+:P2 C2-4:X4:P1 \
+				   .      .      .      X1      .    0 A1:0-1,A2:2-4,A3:2-4 \
+								       A1:P0,A2:P2,A3:P-1 2-4"
+
+	# Remote partition offline tests
+	" C0-3:S+ C1-3:S+ C2-3     .    X2-3   X2-3 X2-3:P2:O2=0 .   0 A1:0-1,A2:1,A3:3 A1:P0,A3:P2 2-3"
+	" C0-3:S+ C1-3:S+ C2-3     .    X2-3   X2-3 X2-3:P2:O2=0 O2=1 0 A1:0-1,A2:1,A3:2-3 A1:P0,A3:P2 2-3"
+	" C0-3:S+ C1-3:S+  C3      .    X2-3   X2-3    P2:O3=0   .   0 A1:0-2,A2:1-2,A3: A1:P0,A3:P2 3"
+	" C0-3:S+ C1-3:S+  C3      .    X2-3   X2-3   T:P2:O3=0  .   0 A1:0-2,A2:1-2,A3:1-2 A1:P0,A3:P-2 3"
+
+	# An invalidated remote partition cannot self-recover from hotplug
+	" C0-3:S+ C1-3:S+  C2      .    X2-3   X2-3   T:P2:O2=0 O2=1 0 A1:0-3,A2:1-3,A3:2 A1:P0,A3:P-2"
+
+	# cpus.exclusive.effective clearing test
+	" C0-3:S+ C1-3:S+  C2      .   X2-3:X    .      .      .     0 A1:0-3,A2:1-3,A3:2,XA1:"
+
+	# Invalid to valid remote partition transition test
+	" C0-3:S+   C1-3    .      .      .    X3:P2    .      .     0 A1:0-3,A2:1-3,XA2: A2:P-2"
+	" C0-3:S+ C1-3:X3:P2 \
+			    .      .    X2-3    P2      .      .     0 A1:0-2,A2:3,XA2:3 A2:P2 3"
+
+	# Invalid to valid local partition direct transition tests
+	" C1-3:S+:P2 C2-3:X1:P2 .  .      .      .      .      .     0 A1:1-3,XA1:1-3,A2:2-3:XA2: A1:P2,A2:P-2 1-3"
+	" C1-3:S+:P2 C2-3:X1:P2 .  .      .    X3:P2    .      .     0 A1:1-2,XA1:1-3,A2:3:XA2:3 A1:P2,A2:P2 1-3"
+	"  C0-3:P2   .      .    C4-6   C0-4     .      .      .     0 A1:0-4,B1:4-6 A1:P-2,B1:P0"
+	"  C0-3:P2   .      .    C4-6 C0-4:C0-3  .      .      .     0 A1:0-3,B1:4-6 A1:P2,B1:P0 0-3"
+	"  C0-3:P2   .      .  C3-5:C4-5  .      .      .      .     0 A1:0-3,B1:4-5 A1:P2,B1:P0 0-3"
+
+	# Local partition invalidation tests
+	" C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \
+				   .      .      .      .      .     0 A1:1,A2:2,A3:3 A1:P2,A2:P2,A3:P2 1-3"
+	" C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \
+				   .      .     X4      .      .     0 A1:1-3,A2:1-3,A3:2-3,XA2:,XA3: A1:P2,A2:P-2,A3:P-2 1-3"
+	" C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \
+				   .      .     C4      .      .     0 A1:1-3,A2:1-3,A3:2-3,XA2:,XA3: A1:P2,A2:P-2,A3:P-2 1-3"
+	# Local partition CPU change tests
+	" C0-5:S+:P2 C4-5:S+:P1 .  .      .    C3-5     .      .     0 A1:0-2,A2:3-5 A1:P2,A2:P1 0-2"
+	" C0-5:S+:P2 C4-5:S+:P1 .  .    C1-5     .      .      .     0 A1:1-3,A2:4-5 A1:P2,A2:P1 1-3"
+
+	# cpus_allowed/exclusive_cpus update tests
+	" C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \
+				   .     C4      .      P2     .     0 A1:4,A2:4,XA2:,XA3:,A3:4 \
+								       A1:P0,A3:P-2 ."
+	" C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \
+				   .     X1      .      P2     .     0 A1:0-3,A2:1-3,XA1:1,XA2:,XA3:,A3:2-3 \
+								       A1:P0,A3:P-2 ."
+	" C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \
+				   .      .     C3      P2     .     0 A1:0-2,A2:0-2,XA2:3,XA3:3,A3:3 \
+								       A1:P0,A3:P2 3"
+	" C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \
+				   .      .     X3      P2     .     0 A1:0-2,A2:1-2,XA2:3,XA3:3,A3:3 \
+								       A1:P0,A3:P2 3"
+	" C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \
+				   .      .     X3      .      .     0 A1:0-3,A2:1-3,XA2:3,XA3:3,A3:2-3 \
+								       A1:P0,A3:P-2 ."
+	" C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \
+				   .      .     C3      .      .     0 A1:0-3,A2:3,XA2:3,XA3:3,A3:3 \
+								       A1:P0,A3:P-2 ."
+	" C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \
+				   .     C4      .      .      .     0 A1:4,A2:4,A3:4,XA1:,XA2:,XA3: \
+								       A1:P0,A3:P-2 ."
+
+	#  old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS
+	#  ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ --------
 	#
 	# Incorrect change to cpuset.cpus invalidates partition root
 	#
 	# Adding CPUs to partition root that are not in parent's
 	# cpuset.cpus is allowed, but those extra CPUs are ignored.
-	"  S+ C2-3:P1:S+ C3:P1   .      .      .     C2-4    .      .     0 A1:,A2:2-3 A1:P1,A2:P1"
+	"C2-3:P1:S+ C3:P1   .      .      .     C2-4    .      .     0 A1:,A2:2-3 A1:P1,A2:P1"
 
 	# Taking away all CPUs from parent or itself if there are tasks
 	# will make the partition invalid.
-	"  S+ C2-3:P1:S+  C3:P1  .      .      T     C2-3    .      .     0 A1:2-3,A2:2-3 A1:P1,A2:P-1"
-	"  S+  C3:P1:S+    C3    .      .      T      P1     .      .     0 A1:3,A2:3 A1:P1,A2:P-1"
-	"  S+ $SETUP_A123_PARTITIONS    .    T:C2-3   .      .      .     0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P-1,A3:P-1"
-	"  S+ $SETUP_A123_PARTITIONS    . T:C2-3:C1-3 .      .      .     0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1"
+	"C2-3:P1:S+  C3:P1  .      .      T     C2-3    .      .     0 A1:2-3,A2:2-3 A1:P1,A2:P-1"
+	" C3:P1:S+    C3    .      .      T      P1     .      .     0 A1:3,A2:3 A1:P1,A2:P-1"
+	"$SETUP_A123_PARTITIONS    .    T:C2-3   .      .      .     0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P-1,A3:P-1"
+	"$SETUP_A123_PARTITIONS    . T:C2-3:C1-3 .      .      .     0 A1:1,A2:2,A3:3 A1:P1,A2:P1,A3:P1"
 
 	# Changing a partition root to member makes child partitions invalid
-	"  S+ C2-3:P1:S+  C3:P1  .      .      P0     .      .      .     0 A1:2-3,A2:3 A1:P0,A2:P-1"
-	"  S+ $SETUP_A123_PARTITIONS    .     C2-3    P0     .      .     0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P0,A3:P-1"
+	"C2-3:P1:S+  C3:P1  .      .      P0     .      .      .     0 A1:2-3,A2:3 A1:P0,A2:P-1"
+	"$SETUP_A123_PARTITIONS    .     C2-3    P0     .      .     0 A1:2-3,A2:2-3,A3:3 A1:P1,A2:P0,A3:P-1"
 
 	# cpuset.cpus can contains cpus not in parent's cpuset.cpus as long
 	# as they overlap.
-	"  S+ C2-3:P1:S+  .      .      .      .   C3-4:P1   .      .     0 A1:2,A2:3 A1:P1,A2:P1"
+	"C2-3:P1:S+  .      .      .      .   C3-4:P1   .      .     0 A1:2,A2:3 A1:P1,A2:P1"
 
 	# Deletion of CPUs distributed to child cgroup is allowed.
-	"  S+ C0-1:P1:S+ C1      .    C2-3   C4-5     .      .      .     0 A1:4-5,A2:4-5"
+	"C0-1:P1:S+ C1      .    C2-3   C4-5     .      .      .     0 A1:4-5,A2:4-5"
 
 	# To become a valid partition root, cpuset.cpus must overlap parent's
 	# cpuset.cpus.
-	"  S+   C0-1:P1   .      .    C2-3    S+   C4-5:P1   .      .     0 A1:0-1,A2:0-1 A1:P1,A2:P-1"
+	"  C0-1:P1   .      .    C2-3    S+   C4-5:P1   .      .     0 A1:0-1,A2:0-1 A1:P1,A2:P-1"
 
 	# Enabling partition with child cpusets is allowed
-	"  S+   C0-1:S+  C1      .    C2-3    P1      .      .      .     0 A1:0-1,A2:1 A1:P1"
+	"  C0-1:S+  C1      .    C2-3    P1      .      .      .     0 A1:0-1,A2:1 A1:P1"
 
 	# A partition root with non-partition root parent is invalid, but it
 	# can be made valid if its parent becomes a partition root too.
-	"  S+   C0-1:S+  C1      .    C2-3     .      P2     .      .     0 A1:0-1,A2:1 A1:P0,A2:P-2"
-	"  S+   C0-1:S+ C1:P2    .    C2-3     P1     .      .      .     0 A1:0,A2:1 A1:P1,A2:P2"
+	"  C0-1:S+  C1      .    C2-3     .      P2     .      .     0 A1:0-1,A2:1 A1:P0,A2:P-2"
+	"  C0-1:S+ C1:P2    .    C2-3     P1     .      .      .     0 A1:0,A2:1 A1:P1,A2:P2"
 
 	# A non-exclusive cpuset.cpus change will invalidate partition and its siblings
-	"  S+   C0-1:P1   .      .    C2-3   C0-2     .      .      .     0 A1:0-2,B1:2-3 A1:P-1,B1:P0"
-	"  S+   C0-1:P1   .      .  P1:C2-3  C0-2   .      .      .     0 A1:0-2,B1:2-3 A1:P-1,B1:P-1"
-	"  S+    C0-1     .      .  P1:C2-3  C0-2   .      .      .     0 A1:0-2,B1:2-3 A1:P0,B1:P-1"
+	"  C0-1:P1   .      .    C2-3   C0-2     .      .      .     0 A1:0-2,B1:2-3 A1:P-1,B1:P0"
+	"  C0-1:P1   .      .  P1:C2-3  C0-2     .      .      .     0 A1:0-2,B1:2-3 A1:P-1,B1:P-1"
+	"   C0-1     .      .  P1:C2-3  C0-2     .      .      .     0 A1:0-2,B1:2-3 A1:P0,B1:P-1"
 
-	# test  old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate
-	# ----  ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------
+	#  old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS
+	#  ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ --------
 	# Failure cases:
 
 	# A task cannot be added to a partition with no cpu
-	"  S+ C2-3:P1:S+  C3:P1  .      .    O2-0:T   .      .      .     1 A1:,A2:3 A1:P1,A2:P1"
+	"C2-3:P1:S+  C3:P1  .      .    O2=0:T   .      .      .     1 A1:,A2:3 A1:P1,A2:P1"
+
+	# Changes to cpuset.cpus.exclusive that violate exclusivity rule are rejected
+	"   C0-3     .      .    C4-5   X0-3     .      .     X3-5   1 A1:0-3,B1:4-5"
 )
 
 #
 # Write to the cpu online file
-#  $1 - <c>-<v> where <c> = cpu number, <v> value to be written
+#  $1 - <c>=<v> where <c> = cpu number, <v> value to be written
 #
 write_cpu_online()
 {
-	CPU=${1%-*}
-	VAL=${1#*-}
+	CPU=${1%=*}
+	VAL=${1#*=}
 	CPUFILE=//sys/devices/system/cpu/cpu${CPU}/online
 	if [[ $VAL -eq 0 ]]
 	then
@@ -349,11 +457,12 @@ set_ctrl_state()
 	TMPMSG=/tmp/.msg_$$
 	CGRP=$1
 	STATE=$2
-	SHOWERR=${3}${VERBOSE}
+	SHOWERR=${3}
 	CTRL=${CTRL:=$CONTROLLER}
 	HASERR=0
 	REDIRECT="2> $TMPMSG"
 	[[ -z "$STATE" || "$STATE" = '.' ]] && return 0
+	[[ $VERBOSE -gt 0 ]] && SHOWERR=1
 
 	rm -f $TMPMSG
 	for CMD in $(echo $STATE | sed -e "s/:/ /g")
@@ -362,12 +471,18 @@ set_ctrl_state()
 		SFILE=$CGRP/cgroup.subtree_control
 		PFILE=$CGRP/cpuset.cpus.partition
 		CFILE=$CGRP/cpuset.cpus
+		XFILE=$CGRP/cpuset.cpus.exclusive
 		S=$(expr substr $CMD 1 1)
 		if [[ $S = S ]]
 		then
 			PREFIX=${CMD#?}
 			COMM="echo ${PREFIX}${CTRL} > $SFILE"
 			eval $COMM $REDIRECT
+		elif [[ $S = X ]]
+		then
+			CPUS=${CMD#?}
+			COMM="echo $CPUS > $XFILE"
+			eval $COMM $REDIRECT
 		elif [[ $S = C ]]
 		then
 			CPUS=${CMD#?}
@@ -430,7 +545,7 @@ online_cpus()
 	[[ -n "OFFLINE_CPUS" ]] && {
 		for C in $OFFLINE_CPUS
 		do
-			write_cpu_online ${C}-1
+			write_cpu_online ${C}=1
 		done
 	}
 }
@@ -443,18 +558,27 @@ reset_cgroup_states()
 	echo 0 > $CGROUP2/cgroup.procs
 	online_cpus
 	rmdir A1/A2/A3 A1/A2 A1 B1 > /dev/null 2>&1
-	set_ctrl_state . S-
+	pause 0.02
+	set_ctrl_state . R-
 	pause 0.01
 }
 
 dump_states()
 {
-	for DIR in A1 A1/A2 A1/A2/A3 B1
+	for DIR in . A1 A1/A2 A1/A2/A3 B1
 	do
+		CPUS=$DIR/cpuset.cpus
 		ECPUS=$DIR/cpuset.cpus.effective
+		XCPUS=$DIR/cpuset.cpus.exclusive
+		XECPUS=$DIR/cpuset.cpus.exclusive.effective
 		PRS=$DIR/cpuset.cpus.partition
-		[[ -e $ECPUS ]] && echo "$ECPUS: $(cat $ECPUS)"
-		[[ -e $PRS   ]] && echo "$PRS: $(cat $PRS)"
+		PCPUS=$DIR/.__DEBUG__.cpuset.cpus.subpartitions
+		[[ -e $CPUS   ]] && echo "$CPUS: $(cat $CPUS)"
+		[[ -e $XCPUS  ]] && echo "$XCPUS: $(cat $XCPUS)"
+		[[ -e $ECPUS  ]] && echo "$ECPUS: $(cat $ECPUS)"
+		[[ -e $XECPUS ]] && echo "$XECPUS: $(cat $XECPUS)"
+		[[ -e $PRS    ]] && echo "$PRS: $(cat $PRS)"
+		[[ -e $PCPUS  ]] && echo "$PCPUS: $(cat $PCPUS)"
 	done
 }
 
@@ -470,11 +594,17 @@ check_effective_cpus()
 		set -- $(echo $CHK | sed -e "s/:/ /g")
 		CGRP=$1
 		CPUS=$2
+		if [[ $CGRP = X* ]]
+		then
+			CGRP=${CGRP#X}
+			FILE=cpuset.cpus.exclusive.effective
+		else
+			FILE=cpuset.cpus.effective
+		fi
 		[[ $CGRP = A2 ]] && CGRP=A1/A2
 		[[ $CGRP = A3 ]] && CGRP=A1/A2/A3
-		FILE=$CGRP/cpuset.cpus.effective
-		[[ -e $FILE ]] || return 1
-		[[ $CPUS = $(cat $FILE) ]] || return 1
+		[[ -e $CGRP/$FILE ]] || return 1
+		[[ $CPUS = $(cat $CGRP/$FILE) ]] || return 1
 	done
 }
 
@@ -525,6 +655,65 @@ check_cgroup_states()
 }
 
 #
+# Get isolated (including offline) CPUs by looking at
+# /sys/kernel/debug/sched/domains and compare that with the expected value.
+#
+# Note that a sched domain of just 1 CPU will be considered isolated.
+#
+# $1 - expected isolated cpu list
+#
+check_isolcpus()	# builds the list in the global ISOLCPUS; status 0 on match
+{
+	EXPECT_VAL=$1
+	ISOLCPUS=
+	LASTISOLCPU=
+	SCHED_DOMAINS=/sys/kernel/debug/sched/domains
+	[[ -d $SCHED_DOMAINS ]] || return 0	# directory absent: skip the check and report success
+	[[ $EXPECT_VAL = . ]] && EXPECT_VAL=	# "." stands for an empty expected list
+
+	for ((CPU=0; CPU < $NR_CPUS; CPU++))	# NR_CPUS is defined outside this function
+	do
+		[[ -n "$(ls ${SCHED_DOMAINS}/cpu$CPU)" ]] && continue	# non-empty listing: CPU is not counted as isolated
+
+		if [[ -z "$LASTISOLCPU" ]]
+		then	# first isolated CPU found: it starts the list
+			ISOLCPUS=$CPU
+			LASTISOLCPU=$CPU
+		elif [[ "$LASTISOLCPU" -eq $((CPU - 1)) ]]
+		then	# CPU directly follows the previous isolated one: open or extend a range
+			echo $ISOLCPUS | grep -q "\<$LASTISOLCPU\$"	# does the list still end with the previous CPU number?
+			if [[ $? -eq 0 ]]
+			then	# yes: no range is open yet, so append the "-" that opens one
+				ISOLCPUS=${ISOLCPUS}-
+			fi
+			LASTISOLCPU=$CPU
+		else	# gap since the previous isolated CPU
+			if [[ $ISOLCPUS = *- ]]
+			then	# close the open range with its last CPU
+				ISOLCPUS=${ISOLCPUS}$LASTISOLCPU
+			fi
+			ISOLCPUS=${ISOLCPUS},$CPU
+			LASTISOLCPU=$CPU
+		fi
+	done
+	[[ "$ISOLCPUS" = *- ]] && ISOLCPUS=${ISOLCPUS}$LASTISOLCPU	# close a range left open at loop end
+	[[ "$EXPECT_VAL" = "$ISOLCPUS" ]]	# the comparison result is the function's exit status
+}
+
+test_fail()	# $1 - test number, $2 - name of the failed check, $3 - optional extra info
+{
+	TESTNUM=$1
+	TESTTYPE=$2
+	ADDINFO=$3
+	echo "Test $TEST[$TESTNUM] failed $TESTTYPE check!"	# TEST holds the matrix name, set by the caller
+	[[ -n "$ADDINFO" ]] && echo "*** $ADDINFO ***"
+	eval echo \${$TEST[$TESTNUM]}
+	echo
+	dump_states
+	exit 1
+}
+
+#
 # Run cpuset state transition test
 #  $1 - test matrix name
 #
@@ -536,88 +725,83 @@ run_state_test()
 {
 	TEST=$1
 	CONTROLLER=cpuset
-	CPULIST=0-6
 	I=0
 	eval CNT="\${#$TEST[@]}"
 
 	reset_cgroup_states
-	echo $CPULIST > cpuset.cpus
-	echo root > cpuset.cpus.partition
 	console_msg "Running state transition test ..."
 
 	while [[ $I -lt $CNT ]]
 	do
 		echo "Running test $I ..." > /dev/console
+		[[ $VERBOSE -gt 1 ]] && {
+			echo ""
+			eval echo \${$TEST[$I]}
+		}
 		eval set -- "\${$TEST[$I]}"
-		ROOT=$1
-		OLD_A1=$2
-		OLD_A2=$3
-		OLD_A3=$4
-		OLD_B1=$5
-		NEW_A1=$6
-		NEW_A2=$7
-		NEW_A3=$8
-		NEW_B1=$9
-		RESULT=${10}
-		ECPUS=${11}
-		STATES=${12}
-
-		set_ctrl_state_noerr .        $ROOT
+		OLD_A1=$1
+		OLD_A2=$2
+		OLD_A3=$3
+		OLD_B1=$4
+		NEW_A1=$5
+		NEW_A2=$6
+		NEW_A3=$7
+		NEW_B1=$8
+		RESULT=$9
+		ECPUS=${10}
+		STATES=${11}
+		ICPUS=${12}
+
+		set_ctrl_state_noerr B1       $OLD_B1
 		set_ctrl_state_noerr A1       $OLD_A1
 		set_ctrl_state_noerr A1/A2    $OLD_A2
 		set_ctrl_state_noerr A1/A2/A3 $OLD_A3
-		set_ctrl_state_noerr B1       $OLD_B1
 		RETVAL=0
 		set_ctrl_state A1       $NEW_A1; ((RETVAL += $?))
 		set_ctrl_state A1/A2    $NEW_A2; ((RETVAL += $?))
 		set_ctrl_state A1/A2/A3 $NEW_A3; ((RETVAL += $?))
 		set_ctrl_state B1       $NEW_B1; ((RETVAL += $?))
 
-		[[ $RETVAL -ne $RESULT ]] && {
-			echo "Test $TEST[$I] failed result check!"
-			eval echo \"\${$TEST[$I]}\"
-			dump_states
-			exit 1
-		}
+		[[ $RETVAL -ne $RESULT ]] && test_fail $I result
 
 		[[ -n "$ECPUS" && "$ECPUS" != . ]] && {
 			check_effective_cpus $ECPUS
-			[[ $? -ne 0 ]] && {
-				echo "Test $TEST[$I] failed effective CPU check!"
-				eval echo \"\${$TEST[$I]}\"
-				echo
-				dump_states
-				exit 1
-			}
+			[[ $? -ne 0 ]] && test_fail $I "effective CPU"
 		}
 
-		[[ -n "$STATES" ]] && {
+		[[ -n "$STATES" && "$STATES" != . ]] && {
 			check_cgroup_states $STATES
-			[[ $? -ne 0 ]] && {
-				echo "FAILED: Test $TEST[$I] failed states check!"
-				eval echo \"\${$TEST[$I]}\"
-				echo
-				dump_states
-				exit 1
-			}
+			[[ $? -ne 0 ]] && test_fail $I states
 		}
 
+		# Compare the expected isolated CPUs with the actual ones,
+		# if available
+		[[ -n "$ICPUS" ]] && {
+			check_isolcpus $ICPUS
+			[[ $? -ne 0 ]] && test_fail $I "isolated CPU" \
+				"Expect $ICPUS, get $ISOLCPUS instead"
+		}
 		reset_cgroup_states
 		#
 		# Check to see if effective cpu list changes
 		#
-		pause 0.05
 		NEWLIST=$(cat cpuset.cpus.effective)
+		RETRY=0
+		while [[ $NEWLIST != $CPULIST && $RETRY -lt 5 ]]
+		do
+			# Wait a bit longer & recheck a few times
+			pause 0.01
+			((RETRY++))
+			NEWLIST=$(cat cpuset.cpus.effective)
+		done
 		[[ $NEWLIST != $CPULIST ]] && {
 			echo "Effective cpus changed to $NEWLIST after test $I!"
 			exit 1
 		}
-		[[ -n "$VERBOSE" ]] && echo "Test $I done."
+		[[ $VERBOSE -gt 0 ]] && echo "Test $I done."
 		((I++))
 	done
 	echo "All $I tests of $TEST PASSED."
-
-	echo member > cpuset.cpus.partition
 }
 
 #
@@ -642,6 +826,7 @@ test_inotify()
 {
 	ERR=0
 	PRS=/tmp/.prs_$$
+	cd $CGROUP2/test
 	[[ -f $WAIT_INOTIFY ]] || {
 		echo "wait_inotify not found, inotify test SKIPPED."
 		return
@@ -655,7 +840,7 @@ test_inotify()
 	rm -f $PRS
 	wait_inotify $PWD/cpuset.cpus.partition $PRS &
 	pause 0.01
-	set_ctrl_state . "O1-0"
+	set_ctrl_state . "O1=0"
 	pause 0.01
 	check_cgroup_states ".:P-1"
 	if [[ $? -ne 0 ]]
@@ -689,5 +874,3 @@ run_state_test TEST_MATRIX
 test_isolated
 test_inotify
 echo "All tests PASSED."
-cd ..
-rmdir test
diff --git a/tools/testing/selftests/lkdtm/config b/tools/testing/selftests/lkdtm/config
index 5d52f64dfb43..7afe05e8c4d7 100644
--- a/tools/testing/selftests/lkdtm/config
+++ b/tools/testing/selftests/lkdtm/config
@@ -9,7 +9,6 @@ CONFIG_INIT_ON_FREE_DEFAULT_ON=y
 CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y
 CONFIG_UBSAN=y
 CONFIG_UBSAN_BOUNDS=y
-CONFIG_UBSAN_TRAP=y
 CONFIG_STACKPROTECTOR_STRONG=y
 CONFIG_SLUB_DEBUG=y
 CONFIG_SLUB_DEBUG_ON=y
diff --git a/tools/testing/selftests/lkdtm/tests.txt b/tools/testing/selftests/lkdtm/tests.txt
index 607b8d7e3ea3..368973f05250 100644
--- a/tools/testing/selftests/lkdtm/tests.txt
+++ b/tools/testing/selftests/lkdtm/tests.txt
@@ -1,4 +1,5 @@
 #PANIC
+#PANIC_STOP_IRQOFF Crashes entire system
 BUG kernel BUG at
 WARNING WARNING:
 WARNING_MESSAGE message trigger
@@ -7,7 +8,7 @@ EXCEPTION
 #EXHAUST_STACK Corrupts memory on failure
 #CORRUPT_STACK Crashes entire system on success
 #CORRUPT_STACK_STRONG Crashes entire system on success
-ARRAY_BOUNDS
+ARRAY_BOUNDS call trace:|UBSAN: array-index-out-of-bounds
 CORRUPT_LIST_ADD list_add corruption
 CORRUPT_LIST_DEL list_del corruption
 STACK_GUARD_PAGE_LEADING
diff --git a/tools/testing/selftests/nolibc/.gitignore b/tools/testing/selftests/nolibc/.gitignore
index 52f613cdad54..5119f9f7afd2 100644
--- a/tools/testing/selftests/nolibc/.gitignore
+++ b/tools/testing/selftests/nolibc/.gitignore
@@ -1,4 +1,5 @@
 /initramfs/
+/initramfs.cpio
 /libc-test
 /nolibc-test
 /run.out
diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile
index dfe66776a331..a0fc07253baf 100644
--- a/tools/testing/selftests/nolibc/Makefile
+++ b/tools/testing/selftests/nolibc/Makefile
@@ -82,7 +82,7 @@ QEMU_ARCH_arm        = arm
 QEMU_ARCH_mips       = mipsel  # works with malta_defconfig
 QEMU_ARCH_ppc        = ppc
 QEMU_ARCH_ppc64      = ppc64
-QEMU_ARCH_ppc64le    = ppc64le
+QEMU_ARCH_ppc64le    = ppc64
 QEMU_ARCH_riscv      = riscv64
 QEMU_ARCH_s390       = s390x
 QEMU_ARCH_loongarch  = loongarch64
@@ -113,6 +113,7 @@ else
 Q=@
 endif
 
+CFLAGS_i386 = $(call cc-option,-m32)
 CFLAGS_ppc = -m32 -mbig-endian -mno-vsx $(call cc-option,-mmultiple)
 CFLAGS_ppc64 = -m64 -mbig-endian -mno-vsx $(call cc-option,-mmultiple)
 CFLAGS_ppc64le = -m64 -mlittle-endian -mno-vsx $(call cc-option,-mabi=elfv2)
@@ -131,18 +132,20 @@ REPORT  ?= awk '/\[OK\][\r]*$$/{p++} /\[FAIL\][\r]*$$/{if (!f) printf("\n"); f++
 
 help:
 	@echo "Supported targets under selftests/nolibc:"
-	@echo "  all          call the \"run\" target below"
-	@echo "  help         this help"
-	@echo "  sysroot      create the nolibc sysroot here (uses \$$ARCH)"
-	@echo "  nolibc-test  build the executable (uses \$$CC and \$$CROSS_COMPILE)"
-	@echo "  libc-test    build an executable using the compiler's default libc instead"
-	@echo "  run-user     runs the executable under QEMU (uses \$$XARCH, \$$TEST)"
-	@echo "  initramfs    prepare the initramfs with nolibc-test"
-	@echo "  defconfig    create a fresh new default config (uses \$$XARCH)"
-	@echo "  kernel       (re)build the kernel with the initramfs (uses \$$XARCH)"
-	@echo "  run          runs the kernel in QEMU after building it (uses \$$XARCH, \$$TEST)"
-	@echo "  rerun        runs a previously prebuilt kernel in QEMU (uses \$$XARCH, \$$TEST)"
-	@echo "  clean        clean the sysroot, initramfs, build and output files"
+	@echo "  all               call the \"run\" target below"
+	@echo "  help              this help"
+	@echo "  sysroot           create the nolibc sysroot here (uses \$$ARCH)"
+	@echo "  nolibc-test       build the executable (uses \$$CC and \$$CROSS_COMPILE)"
+	@echo "  libc-test         build an executable using the compiler's default libc instead"
+	@echo "  run-user          runs the executable under QEMU (uses \$$XARCH, \$$TEST)"
+	@echo "  initramfs.cpio    prepare the initramfs archive with nolibc-test"
+	@echo "  initramfs         prepare the initramfs tree with nolibc-test"
+	@echo "  defconfig         create a fresh new default config (uses \$$XARCH)"
+	@echo "  kernel            (re)build the kernel (uses \$$XARCH)"
+	@echo "  kernel-standalone (re)build the kernel with the initramfs (uses \$$XARCH)"
+	@echo "  run               runs the kernel in QEMU after building it (uses \$$XARCH, \$$TEST)"
+	@echo "  rerun             runs a previously prebuilt kernel in QEMU (uses \$$XARCH, \$$TEST)"
+	@echo "  clean             clean the sysroot, initramfs, build and output files"
 	@echo ""
 	@echo "The output file is \"run.out\". Test ranges may be passed using \$$TEST."
 	@echo ""
@@ -168,17 +171,17 @@ sysroot/$(ARCH)/include:
 	$(Q)mv sysroot/sysroot sysroot/$(ARCH)
 
 ifneq ($(NOLIBC_SYSROOT),0)
-nolibc-test: nolibc-test.c sysroot/$(ARCH)/include
+nolibc-test: nolibc-test.c nolibc-test-linkage.c sysroot/$(ARCH)/include
 	$(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \
-	  -nostdlib -static -Isysroot/$(ARCH)/include $< -lgcc
+	  -nostdlib -nostdinc -static -Isysroot/$(ARCH)/include nolibc-test.c nolibc-test-linkage.c -lgcc
 else
-nolibc-test: nolibc-test.c
+nolibc-test: nolibc-test.c nolibc-test-linkage.c
 	$(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \
-	  -nostdlib -static -include ../../../include/nolibc/nolibc.h $< -lgcc
+	  -nostdlib -static -include ../../../include/nolibc/nolibc.h nolibc-test.c nolibc-test-linkage.c -lgcc
 endif
 
-libc-test: nolibc-test.c
-	$(QUIET_CC)$(HOSTCC) -o $@ $<
+libc-test: nolibc-test.c nolibc-test-linkage.c
+	$(QUIET_CC)$(HOSTCC) -o $@ nolibc-test.c nolibc-test-linkage.c
 
 # local libc-test
 run-libc-test: libc-test
@@ -195,6 +198,9 @@ run-user: nolibc-test
 	$(Q)qemu-$(QEMU_ARCH) ./nolibc-test > "$(CURDIR)/run.out" || :
 	$(Q)$(REPORT) $(CURDIR)/run.out
 
+initramfs.cpio: kernel nolibc-test
+	$(QUIET_GEN)echo 'file /init nolibc-test 755 0 0' | $(srctree)/usr/gen_init_cpio - > initramfs.cpio
+
 initramfs: nolibc-test
 	$(QUIET_MKDIR)mkdir -p initramfs
 	$(call QUIET_INSTALL, initramfs/init)
@@ -203,17 +209,20 @@ initramfs: nolibc-test
 defconfig:
 	$(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) mrproper $(DEFCONFIG) prepare
 
-kernel: initramfs
+kernel:
+	$(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME)
+
+kernel-standalone: initramfs
 	$(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) CONFIG_INITRAMFS_SOURCE=$(CURDIR)/initramfs
 
 # run the tests after building the kernel
-run: kernel
-	$(Q)qemu-system-$(QEMU_ARCH) -display none -no-reboot -kernel "$(srctree)/$(IMAGE)" -serial stdio $(QEMU_ARGS) > "$(CURDIR)/run.out"
+run: kernel initramfs.cpio
+	$(Q)qemu-system-$(QEMU_ARCH) -display none -no-reboot -kernel "$(srctree)/$(IMAGE)" -initrd initramfs.cpio -serial stdio $(QEMU_ARGS) > "$(CURDIR)/run.out"
 	$(Q)$(REPORT) $(CURDIR)/run.out
 
 # re-run the tests from an existing kernel
 rerun:
-	$(Q)qemu-system-$(QEMU_ARCH) -display none -no-reboot -kernel "$(srctree)/$(IMAGE)" -serial stdio $(QEMU_ARGS) > "$(CURDIR)/run.out"
+	$(Q)qemu-system-$(QEMU_ARCH) -display none -no-reboot -kernel "$(srctree)/$(IMAGE)" -initrd initramfs.cpio -serial stdio $(QEMU_ARGS) > "$(CURDIR)/run.out"
 	$(Q)$(REPORT) $(CURDIR)/run.out
 
 # report with existing test log
@@ -227,6 +236,8 @@ clean:
 	$(Q)rm -f nolibc-test
 	$(call QUIET_CLEAN, libc-test)
 	$(Q)rm -f libc-test
+	$(call QUIET_CLEAN, initramfs.cpio)
+	$(Q)rm -rf initramfs.cpio
 	$(call QUIET_CLEAN, initramfs)
 	$(Q)rm -rf initramfs
 	$(call QUIET_CLEAN, run.out)
diff --git a/tools/testing/selftests/nolibc/nolibc-test-linkage.c b/tools/testing/selftests/nolibc/nolibc-test-linkage.c
new file mode 100644
index 000000000000..5ff4c8a1db2a
--- /dev/null
+++ b/tools/testing/selftests/nolibc/nolibc-test-linkage.c
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include "nolibc-test-linkage.h"
+
+#ifndef NOLIBC
+#include <errno.h>
+#endif
+
+void *linkage_test_errno_addr(void)
+{
+	return &errno;	/* address of errno as resolved in this translation unit */
+}
+
+int linkage_test_constructor_test_value;
+
+__attribute__((constructor))
+static void constructor1(void)
+{
+	linkage_test_constructor_test_value = 2;	/* must run before constructor2() */
+}
+
+__attribute__((constructor))
+static void constructor2(void)
+{
+	linkage_test_constructor_test_value *= 3;	/* 2 * 3 == 6 only if constructor1() ran first */
+}
diff --git a/tools/testing/selftests/nolibc/nolibc-test-linkage.h b/tools/testing/selftests/nolibc/nolibc-test-linkage.h
new file mode 100644
index 000000000000..c66473070d73
--- /dev/null
+++ b/tools/testing/selftests/nolibc/nolibc-test-linkage.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _NOLIBC_TEST_LINKAGE_H
+#define _NOLIBC_TEST_LINKAGE_H
+
+void *linkage_test_errno_addr(void);
+extern int linkage_test_constructor_test_value;
+
+#endif /* _NOLIBC_TEST_LINKAGE_H */
diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c
index fb3bf91462e2..2f10541e6f38 100644
--- a/tools/testing/selftests/nolibc/nolibc-test.c
+++ b/tools/testing/selftests/nolibc/nolibc-test.c
@@ -41,6 +41,8 @@
 #endif
 #endif
 
+#include "nolibc-test-linkage.h"
+
 /* for the type of int_fast16_t and int_fast32_t, musl differs from glibc and nolibc */
 #define SINT_MAX_OF_TYPE(type) (((type)1 << (sizeof(type) * 8 - 2)) - (type)1 + ((type)1 << (sizeof(type) * 8 - 2)))
 #define SINT_MIN_OF_TYPE(type) (-SINT_MAX_OF_TYPE(type) - 1)
@@ -57,6 +59,9 @@ static int test_argc;
 /* will be used by some test cases as readable file, please don't write it */
 static const char *argv0;
 
+/* will be used by constructor tests */
+static int constructor_test_value;
+
 /* definition of a series of tests */
 struct test {
 	const char *name;              /* test name */
@@ -594,6 +599,19 @@ int expect_strne(const char *expr, int llen, const char *cmp)
 #define CASE_TEST(name) \
 	case __LINE__: llen += printf("%d %s", test, #name);
 
+/* constructors run in definition order: "= 1" then "*= 2" gives the 2 checked in run_startup() */
+__attribute__((constructor))
+static void constructor1(void)
+{
+	constructor_test_value = 1;
+}
+
+__attribute__((constructor))
+static void constructor2(void)
+{
+	constructor_test_value *= 2;	/* 1 * 2 == 2 only if constructor1() ran first */
+}
+
 int run_startup(int min, int max)
 {
 	int test;
@@ -630,7 +648,9 @@ int run_startup(int min, int max)
 		CASE_TEST(environ_HOME);     EXPECT_PTRNZ(1, getenv("HOME")); break;
 		CASE_TEST(auxv_addr);        EXPECT_PTRGT(test_auxv != (void *)-1, test_auxv, brk); break;
 		CASE_TEST(auxv_AT_UID);      EXPECT_EQ(1, getauxval(AT_UID), getuid()); break;
-		CASE_TEST(auxv_AT_PAGESZ);   EXPECT_GE(1, getauxval(AT_PAGESZ), 4096); break;
+		CASE_TEST(constructor);      EXPECT_EQ(1, constructor_test_value, 2); break;
+		CASE_TEST(linkage_errno);    EXPECT_PTREQ(1, linkage_test_errno_addr(), &errno); break;
+		CASE_TEST(linkage_constr);   EXPECT_EQ(1, linkage_test_constructor_test_value, 6); break;
 		case __LINE__:
 			return ret; /* must be last */
 		/* note: do not set any defaults so as to permit holes above */
@@ -894,14 +914,14 @@ int run_syscall(int min, int max)
 		CASE_TEST(lseek_0);           EXPECT_SYSER(1, lseek(0, 0, SEEK_SET), -1, ESPIPE); break;
 		CASE_TEST(mkdir_root);        EXPECT_SYSER(1, mkdir("/", 0755), -1, EEXIST); break;
 		CASE_TEST(mmap_bad);          EXPECT_PTRER(1, mmap(NULL, 0, PROT_READ, MAP_PRIVATE, 0, 0), MAP_FAILED, EINVAL); break;
-		CASE_TEST(munmap_bad);        EXPECT_SYSER(1, munmap((void *)1, 0), -1, EINVAL); break;
+		CASE_TEST(munmap_bad);        EXPECT_SYSER(1, munmap(NULL, 0), -1, EINVAL); break;
 		CASE_TEST(mmap_munmap_good);  EXPECT_SYSZR(1, test_mmap_munmap()); break;
 		CASE_TEST(open_tty);          EXPECT_SYSNE(1, tmp = open("/dev/null", 0), -1); if (tmp != -1) close(tmp); break;
 		CASE_TEST(open_blah);         EXPECT_SYSER(1, tmp = open("/proc/self/blah", 0), -1, ENOENT); if (tmp != -1) close(tmp); break;
 		CASE_TEST(pipe);              EXPECT_SYSZR(1, test_pipe()); break;
 		CASE_TEST(poll_null);         EXPECT_SYSZR(1, poll(NULL, 0, 0)); break;
 		CASE_TEST(poll_stdout);       EXPECT_SYSNE(1, ({ struct pollfd fds = { 1, POLLOUT, 0}; poll(&fds, 1, 0); }), -1); break;
-		CASE_TEST(poll_fault);        EXPECT_SYSER(1, poll((void *)1, 1, 0), -1, EFAULT); break;
+		CASE_TEST(poll_fault);        EXPECT_SYSER(1, poll(NULL, 1, 0), -1, EFAULT); break;
 		CASE_TEST(prctl);             EXPECT_SYSER(1, prctl(PR_SET_NAME, (unsigned long)NULL, 0, 0, 0), -1, EFAULT); break;
 		CASE_TEST(read_badf);         EXPECT_SYSER(1, read(-1, &tmp, 1), -1, EBADF); break;
 		CASE_TEST(rmdir_blah);        EXPECT_SYSER(1, rmdir("/blah"), -1, ENOENT); break;
@@ -910,7 +930,7 @@ int run_syscall(int min, int max)
 		CASE_TEST(select_stdout);     EXPECT_SYSNE(1, ({ fd_set fds; FD_ZERO(&fds); FD_SET(1, &fds); select(2, NULL, &fds, NULL, NULL); }), -1); break;
 		CASE_TEST(select_fault);      EXPECT_SYSER(1, select(1, (void *)1, NULL, NULL, 0), -1, EFAULT); break;
 		CASE_TEST(stat_blah);         EXPECT_SYSER(1, stat("/proc/self/blah", &stat_buf), -1, ENOENT); break;
-		CASE_TEST(stat_fault);        EXPECT_SYSER(1, stat((void *)1, &stat_buf), -1, EFAULT); break;
+		CASE_TEST(stat_fault);        EXPECT_SYSER(1, stat(NULL, &stat_buf), -1, EFAULT); break;
 		CASE_TEST(stat_timestamps);   EXPECT_SYSZR(1, test_stat_timestamps()); break;
 		CASE_TEST(symlink_root);      EXPECT_SYSER(1, symlink("/", "/"), -1, EEXIST); break;
 		CASE_TEST(unlink_root);       EXPECT_SYSER(1, unlink("/"), -1, EISDIR); break;
diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh
index b8e2ea23cb3f..6e415ddb206f 100644..100755
--- a/tools/testing/selftests/rcutorture/bin/functions.sh
+++ b/tools/testing/selftests/rcutorture/bin/functions.sh
@@ -331,3 +331,32 @@ specify_qemu_net () {
 		echo $1 -net none
 	fi
 }
+
+# Extract the ftrace dumps from the console log file $1 and print them on stdout.
+# Each ftrace dump in the console log looks like:
+# Dumping ftrace buffer:
+# ---------------------------------
+# [...]
+# ---------------------------------
+extract_ftrace_from_console() {
+	awk < "$1" '
+
+	/Dumping ftrace buffer:/ {
+		buffer_count++
+		print "Ftrace dump " buffer_count ":"
+		capture = 1
+		next
+	}
+
+	/---------------------------------/ {
+		if(capture == 1) {
+			capture = 2
+			next
+		} else if(capture == 2) {
+			capture = 0
+			print ""
+		}
+	}
+
+	capture == 2'
+}
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
index 5be670dd4009..de65d77b47ff 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
@@ -13,7 +13,7 @@
 #
 # Authors: Paul E. McKenney <paulmck@linux.ibm.com>
 
-T=/tmp/kvm-recheck.sh.$$
+T="`mktemp ${TMPDIR-/tmp}/kvm-recheck.sh.XXXXXX`"
 trap 'rm -f $T' 0 2
 
 configerrors=0
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index b0f36a638a69..7af73ddc148d 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -49,6 +49,7 @@ TORTURE_SHUTDOWN_GRACE=180
 TORTURE_SUITE=rcu
 TORTURE_MOD=rcutorture
 TORTURE_TRUST_MAKE=""
+debuginfo="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y"
 resdir=""
 configs=""
 cpus=0
@@ -68,6 +69,7 @@ usage () {
 	echo "       --cpus N"
 	echo "       --datestamp string"
 	echo "       --defconfig string"
+	echo "       --debug-info"
 	echo "       --dryrun batches|scenarios|sched|script"
 	echo "       --duration minutes | <seconds>s | <hours>h | <days>d"
 	echo "       --gdb"
@@ -135,6 +137,15 @@ do
 		ds=$2
 		shift
 		;;
+	--debug-info|--debuginfo)
+		if test -z "$TORTURE_KCONFIG_KCSAN_ARG" && test -z "$TORTURE_BOOT_GDB_ARG"
+		then
+			TORTURE_KCONFIG_KCSAN_ARG="$debuginfo"; export TORTURE_KCONFIG_KCSAN_ARG
+			TORTURE_BOOT_GDB_ARG="nokaslr"; export TORTURE_BOOT_GDB_ARG
+		else
+			echo "Ignored redundant --debug-info (implied by --kcsan &c)"
+		fi
+		;;
 	--defconfig)
 		checkarg --defconfig "defconfigtype" "$#" "$2" '^[^/][^/]*$' '^--'
 		TORTURE_DEFCONFIG=$2
@@ -163,7 +174,7 @@ do
 		shift
 		;;
 	--gdb)
-		TORTURE_KCONFIG_GDB_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y"; export TORTURE_KCONFIG_GDB_ARG
+		TORTURE_KCONFIG_GDB_ARG="$debuginfo"; export TORTURE_KCONFIG_GDB_ARG
 		TORTURE_BOOT_GDB_ARG="nokaslr"; export TORTURE_BOOT_GDB_ARG
 		TORTURE_QEMU_GDB_ARG="-s -S"; export TORTURE_QEMU_GDB_ARG
 		;;
@@ -179,7 +190,7 @@ do
 		shift
 		;;
 	--kasan)
-		TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG
+		TORTURE_KCONFIG_KASAN_ARG="$debuginfo CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG
 		if test -n "$torture_qemu_mem_default"
 		then
 			TORTURE_QEMU_MEM=2G
@@ -191,7 +202,7 @@ do
 		shift
 		;;
 	--kcsan)
-		TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_KCSAN=y CONFIG_KCSAN_STRICT=y CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG
+		TORTURE_KCONFIG_KCSAN_ARG="$debuginfo CONFIG_KCSAN=y CONFIG_KCSAN_STRICT=y CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG
 		;;
 	--kmake-arg|--kmake-args)
 		checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$'
diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh
index 9ab0f6bc172c..b07c11cf6929 100755
--- a/tools/testing/selftests/rcutorture/bin/parse-console.sh
+++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh
@@ -11,7 +11,7 @@
 #
 # Authors: Paul E. McKenney <paulmck@linux.ibm.com>
 
-T=${TMPDIR-/tmp}/parse-console.sh.$$
+T="`mktemp -d ${TMPDIR-/tmp}/parse-console.sh.XXXXXX`"
 file="$1"
 title="$2"
 
@@ -182,3 +182,10 @@ if ! test -s $file.diags
 then
 	rm -f $file.diags
 fi
+
+# Extract any ftrace dumps from the console log into $file.ftrace,
+# then remove $file.ftrace again if no ftrace output was found.
+extract_ftrace_from_console $file > $file.ftrace
+if [ ! -s $file.ftrace ]; then
+	rm -f $file.ftrace
+fi
diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh
index 12b50a4a881a..d5a0d8a33c27 100755
--- a/tools/testing/selftests/rcutorture/bin/torture.sh
+++ b/tools/testing/selftests/rcutorture/bin/torture.sh
@@ -472,7 +472,7 @@ do
 	if test -n "$firsttime"
 	then
 		torture_bootargs="refscale.scale_type="$prim" refscale.nreaders=$HALF_ALLOTED_CPUS refscale.loops=10000 refscale.holdoff=20 torture.disable_onoff_at_boot"
-		torture_set "refscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --bootargs "verbose_batched=$VERBOSE_BATCH_CPUS torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=$VERBOSE_BATCH_CPUS" --trust-make
+		torture_set "refscale-$prim" tools/testing/selftests/rcutorture/bin/kvm.sh --torture refscale --allcpus --duration 5 --kconfig "CONFIG_TASKS_TRACE_RCU=y CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --bootargs "refscale.verbose_batched=$VERBOSE_BATCH_CPUS torture.verbose_sleep_frequency=8 torture.verbose_sleep_duration=$VERBOSE_BATCH_CPUS" --trust-make
 		mv $T/last-resdir-nodebug $T/first-resdir-nodebug || :
 		if test -f "$T/last-resdir-kasan"
 		then
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02
index 093ea6e8e65c..9003c56cd764 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02
@@ -11,3 +11,4 @@ CONFIG_FORCE_TASKS_TRACE_RCU=y
 #CHECK#CONFIG_TASKS_TRACE_RCU=y
 CONFIG_TASKS_TRACE_RCU_READ_MB=n
 CONFIG_RCU_EXPERT=y
+CONFIG_DEBUG_OBJECTS=y
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 7e8c937627dd..0b872c0a42d2 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -14,6 +14,7 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap
 			check_initial_reg_state sigreturn iopl ioperm \
 			test_vsyscall mov_ss_trap \
 			syscall_arg_fault fsgsbase_restore sigaltstack
+TARGETS_C_BOTHBITS += nx_stack
 TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
 			test_FCMOV test_FCOMI test_FISTTP \
 			vdso_restorer
@@ -109,3 +110,6 @@ $(OUTPUT)/test_syscall_vdso_32: thunks_32.S
 # state.
 $(OUTPUT)/check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start -static
 $(OUTPUT)/check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static
+
+$(OUTPUT)/nx_stack_32: CFLAGS += -Wl,-z,noexecstack
+$(OUTPUT)/nx_stack_64: CFLAGS += -Wl,-z,noexecstack
diff --git a/tools/testing/selftests/x86/lam.c b/tools/testing/selftests/x86/lam.c
index eb0e46905bf9..8f9b06d9ce03 100644
--- a/tools/testing/selftests/x86/lam.c
+++ b/tools/testing/selftests/x86/lam.c
@@ -573,7 +573,7 @@ int do_uring(unsigned long lam)
 	char path[PATH_MAX] = {0};
 
 	/* get current process path */
-	if (readlink("/proc/self/exe", path, PATH_MAX) <= 0)
+	if (readlink("/proc/self/exe", path, PATH_MAX - 1) <= 0)
 		return 1;
 
 	int file_fd = open(path, O_RDONLY);
@@ -680,14 +680,14 @@ static int handle_execve(struct testcases *test)
 		perror("Fork failed.");
 		ret = 1;
 	} else if (pid == 0) {
-		char path[PATH_MAX];
+		char path[PATH_MAX] = {0};
 
 		/* Set LAM mode in parent process */
 		if (set_lam(lam) != 0)
 			return 1;
 
 		/* Get current binary's path and the binary was run by execve */
-		if (readlink("/proc/self/exe", path, PATH_MAX) <= 0)
+		if (readlink("/proc/self/exe", path, PATH_MAX - 1) <= 0)
 			exit(-1);
 
 		/* run binary to get LAM mode and return to parent process */
diff --git a/tools/testing/selftests/x86/nx_stack.c b/tools/testing/selftests/x86/nx_stack.c
new file mode 100644
index 000000000000..ea4a4e246879
--- /dev/null
+++ b/tools/testing/selftests/x86/nx_stack.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2023 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/*
+ * Test that userspace stack is NX. Requires linking with -Wl,-z,noexecstack
+ * because I don't want to bother with PT_GNU_STACK detection.
+ *
+ * Fill the stack with INT3's and then try to execute some of them:
+ * SIGSEGV -- good, SIGTRAP -- bad.
+ *
+ * Regular stack is completely overwritten before testing.
+ * Test doesn't exit SIGSEGV handler after first fault at INT3.
+ */
+#undef _GNU_SOURCE
+#define _GNU_SOURCE
+#undef NDEBUG
+#include <assert.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <unistd.h>
+
+#define PAGE_SIZE 4096
+
+/*
+ * This is memset(rsp, 0xcc, -1); but going down in memory (DF is set with STD).
+ * It will SIGSEGV when bottom of the stack is reached.
+ * Byte-size access is important! (see rdi tweak in the signal handler).
+ */
+void make_stack1(void);
+asm(
+".pushsection .text\n"
+".globl make_stack1\n"
+".align 16\n"
+"make_stack1:\n"
+	"mov $0xcc, %al\n"
+#if defined __amd64__
+	"mov %rsp, %rdi\n"
+	"mov $-1, %rcx\n"
+#elif defined __i386__
+	"mov %esp, %edi\n"
+	"mov $-1, %ecx\n"
+#else
+#error
+#endif
+	"std\n"
+	"rep stosb\n"
+	/* unreachable: the store loop ends in SIGSEGV and the handler redirects RIP */
+	"hlt\n"
+".type make_stack1,@function\n"
+".size make_stack1,.-make_stack1\n"
+".popsection\n"
+);
+
+/*
+ * memset(p, 0xcc, -1); entered from the SIGSEGV handler with p already in rdi/edi.
+ * It will SIGSEGV when top of the stack is reached.
+ */
+void make_stack2(uint64_t p);
+asm(
+".pushsection .text\n"
+".globl make_stack2\n"
+".align 16\n"
+"make_stack2:\n"
+	"mov $0xcc, %al\n"
+#if defined __amd64__
+	"mov $-1, %rcx\n"
+#elif defined __i386__
+	"mov $-1, %ecx\n"
+#else
+#error
+#endif
+	"cld\n"
+	"rep stosb\n"
+	/* unreachable: the store loop ends in SIGSEGV and the handler redirects RIP */
+	"hlt\n"
+".type make_stack2,@function\n"
+".size make_stack2,.-make_stack2\n"
+".popsection\n"
+);
+
+static volatile int test_state = 0;
+static volatile unsigned long stack_min_addr;
+
+#if defined __amd64__
+#define RDI	REG_RDI
+#define RIP	REG_RIP
+#define RIP_STRING "rip"
+#elif defined __i386__
+#define RDI	REG_EDI
+#define RIP	REG_EIP
+#define RIP_STRING "eip"
+#else
+#error
+#endif
+
+static void sigsegv(int _, siginfo_t *__, void *uc_)
+{
+	/*
+	 * Some Linux versions didn't clear DF before entering signal
+	 * handler. make_stack1() doesn't have a chance to clear DF
+	 * either so we clear it by hand here.
+	 */
+	asm volatile ("cld" ::: "memory");
+
+	ucontext_t *uc = uc_;
+
+	if (test_state == 0) {
+		/* make_stack1() filled the stack from RSP downwards; RDI faulted just below it. */
+		stack_min_addr = ++uc->uc_mcontext.gregs[RDI];
+		if (1) {
+			printf("stack min %lx\n", stack_min_addr);
+		}
+		uc->uc_mcontext.gregs[RIP] = (uintptr_t)&make_stack2;
+		test_state = 1;
+	} else if (test_state == 1) {
+		/* make_stack2() filled the stack from the bottom up and faulted just past its top. */
+		unsigned long stack_max_addr = uc->uc_mcontext.gregs[RDI];
+		if (1) {
+			printf("stack max %lx\n", stack_max_addr);
+		}
+		/* Jump into the topmost stack page: NX gives SIGSEGV, an executed INT3 gives SIGTRAP. */
+		uc->uc_mcontext.gregs[RIP] = stack_max_addr - PAGE_SIZE;
+		test_state = 2;
+	} else if (test_state == 2) {
+		/* Stack page is NX -- good, test next (lower) page. */
+		uc->uc_mcontext.gregs[RIP] -= PAGE_SIZE;
+		if (uc->uc_mcontext.gregs[RIP] == stack_min_addr) {
+			/* One more SIGSEGV and test ends. */
+			test_state = 3;
+		}
+	} else {
+		printf("PASS\tAll stack pages are NX\n");
+		_exit(EXIT_SUCCESS);
+	}
+}
+
+/* SIGTRAP means an INT3 planted on the stack was executed: report where and fail. */
+static void sigtrap(int _, siginfo_t *__, void *uc_)
+{
+	unsigned long rip = ((const ucontext_t *)uc_)->uc_mcontext.gregs[RIP];
+	printf("FAIL\texecutable page on the stack: " RIP_STRING " %lx\n", rip);
+	_exit(EXIT_FAILURE);
+}
+
+int main(void)
+{
+	{	/* SIGSEGV drives the test state machine, see sigsegv(). */
+		struct sigaction act = {};
+		sigemptyset(&act.sa_mask);
+		act.sa_flags = SA_SIGINFO;
+		act.sa_sigaction = &sigsegv;
+		int rv = sigaction(SIGSEGV, &act, NULL);
+		assert(rv == 0);
+	}
+	{	/* SIGTRAP reports an executable stack page, see sigtrap(). */
+		struct sigaction act = {};
+		sigemptyset(&act.sa_mask);
+		act.sa_flags = SA_SIGINFO;
+		act.sa_sigaction = &sigtrap;
+		int rv = sigaction(SIGTRAP, &act, NULL);
+		assert(rv == 0);
+	}
+	{
+		struct rlimit rlim;
+		int rv = getrlimit(RLIMIT_STACK, &rlim);
+		assert(rv == 0);
+		/* Cap stack at time-honored 8 MiB; setrlimit() needs rlim_cur <= rlim_max. */
+		if (rlim.rlim_cur > 8 * 1024 * 1024) {
+			rlim.rlim_cur = 8 * 1024 * 1024;
+		}
+		rlim.rlim_max = rlim.rlim_cur;
+		rv = setrlimit(RLIMIT_STACK, &rlim);
+		assert(rv == 0);
+	}
+	{
+		/*
+		 * We don't know how much stack SIGSEGV handler uses: bump this by 1 page
+		 * every time someone complains, or rewrite it in assembly.
+		 * NOTE(review): handlers above are installed without SA_ONSTACK -- confirm intent.
+		 */
+		const size_t len = SIGSTKSZ;
+		void *p = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+		assert(p != MAP_FAILED);
+		stack_t ss = {};
+		ss.ss_sp = p;
+		ss.ss_size = len;
+		int rv = sigaltstack(&ss, NULL);
+		assert(rv == 0);
+	}
+	make_stack1();
+	/*
+	 * Unreachable, but if this trap is ever reached, it's a bug somewhere.
+	 * NOTE(review): __builtin_trap() may emit UD2 (SIGILL) rather than INT3 -- confirm.
+	 */
+	__builtin_trap();
+}