152 files changed, 5011 insertions, 6743 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 39b378ee56ce..e07e7de9ac49 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -94,6 +94,7 @@ config ARM64
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
 	select ARCH_WANT_LD_ORPHAN_WARN
+	select ARCH_WANTS_NO_INSTR
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
 	select ARM_AMBA
 	select ARM_ARCH_TIMER
@@ -1470,12 +1471,6 @@ menu "ARMv8.3 architectural features"
 config ARM64_PTR_AUTH
 	bool "Enable support for pointer authentication"
 	default y
-	depends on (CC_HAS_SIGN_RETURN_ADDRESS || CC_HAS_BRANCH_PROT_PAC_RET) && AS_HAS_PAC
-	# Modern compilers insert a .note.gnu.property section note for PAC
-	# which is only understood by binutils starting with version 2.33.1.
-	depends on LD_IS_LLD || LD_VERSION >= 23301 || (CC_IS_GCC && GCC_VERSION < 90100)
-	depends on !CC_IS_CLANG || AS_HAS_CFI_NEGATE_RA_STATE
-	depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_REGS)
 	help
 	  Pointer authentication (part of the ARMv8.3 Extensions) provides
 	  instructions for signing and authenticating pointers against secret
@@ -1487,13 +1482,6 @@ config ARM64_PTR_AUTH
 	  for each process at exec() time, with these keys being
 	  context-switched along with the process.
 
-	  If the compiler supports the -mbranch-protection or
-	  -msign-return-address flag (e.g. GCC 7 or later), then this option
-	  will also cause the kernel itself to be compiled with return address
-	  protection. In this case, and if the target hardware is known to
-	  support pointer authentication, then CONFIG_STACKPROTECTOR can be
-	  disabled with minimal loss of protection.
-
 	  The feature is detected at runtime. If the feature is not present in
 	  hardware it will not be advertised to userspace/KVM guest nor will it
 	  be enabled.
@@ -1504,6 +1492,24 @@ config ARM64_PTR_AUTH
 	  but with the feature disabled. On such a system, this option should
 	  not be selected.
 
+config ARM64_PTR_AUTH_KERNEL
+	bool "Use pointer authentication for kernel"
+	default y
+	depends on ARM64_PTR_AUTH
+	depends on (CC_HAS_SIGN_RETURN_ADDRESS || CC_HAS_BRANCH_PROT_PAC_RET) && AS_HAS_PAC
+	# Modern compilers insert a .note.gnu.property section note for PAC
+	# which is only understood by binutils starting with version 2.33.1.
+	depends on LD_IS_LLD || LD_VERSION >= 23301 || (CC_IS_GCC && GCC_VERSION < 90100)
+	depends on !CC_IS_CLANG || AS_HAS_CFI_NEGATE_RA_STATE
+	depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_REGS)
+	help
+	  If the compiler supports the -mbranch-protection or
+	  -msign-return-address flag (e.g. GCC 7 or later), then this option
+	  will cause the kernel itself to be compiled with return address
+	  protection. In this case, and if the target hardware is known to
+	  support pointer authentication, then CONFIG_STACKPROTECTOR can be
+	  disabled with minimal loss of protection.
+
 	  This feature works with FUNCTION_GRAPH_TRACER option only if
 	  DYNAMIC_FTRACE_WITH_REGS is enabled.
 
@@ -1595,7 +1601,7 @@ config ARM64_BTI_KERNEL
 	bool "Use Branch Target Identification for kernel"
 	default y
 	depends on ARM64_BTI
-	depends on ARM64_PTR_AUTH
+	depends on ARM64_PTR_AUTH_KERNEL
 	depends on CC_HAS_BRANCH_PROT_PAC_RET_BTI
 	# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94697
 	depends on !CC_IS_GCC || GCC_VERSION >= 100100
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index b52481f0605d..3b5b1c480449 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -70,7 +70,7 @@ endif
 # off, this will be overridden if we are using branch protection.
 branch-prot-flags-y += $(call cc-option,-mbranch-protection=none)
 
-ifeq ($(CONFIG_ARM64_PTR_AUTH),y)
+ifeq ($(CONFIG_ARM64_PTR_AUTH_KERNEL),y)
 branch-prot-flags-$(CONFIG_CC_HAS_SIGN_RETURN_ADDRESS) := -msign-return-address=all
 # We enable additional protection for leaf functions as there is some
 # narrow potential for ROP protection benefits and no substantial
diff --git a/arch/arm64/boot/dts/microchip/sparx5.dtsi b/arch/arm64/boot/dts/microchip/sparx5.dtsi
index d64621d1213b..ad07fff40544 100644
--- a/arch/arm64/boot/dts/microchip/sparx5.dtsi
+++ b/arch/arm64/boot/dts/microchip/sparx5.dtsi
@@ -135,9 +135,12 @@
 			};
 		};
 
-		reset@611010008 {
-			compatible = "microchip,sparx5-chip-reset";
+		reset: reset-controller@611010008 {
+			compatible = "microchip,sparx5-switch-reset";
 			reg = <0x6 0x11010008 0x4>;
+			reg-names = "gcb";
+			#reset-cells = <1>;
+			cpu-syscon = <&cpu_ctrl>;
 		};
 
 		uart0: serial@600100000 {
@@ -275,6 +278,21 @@
 					"GPIO_46", "GPIO_47";
 				function = "emmc";
 			};
+
+			miim1_pins: miim1-pins {
+				pins = "GPIO_56", "GPIO_57";
+				function = "miim";
+			};
+
+			miim2_pins: miim2-pins {
+				pins = "GPIO_58", "GPIO_59";
+				function = "miim";
+			};
+
+			miim3_pins: miim3-pins {
+				pins = "GPIO_52", "GPIO_53";
+				function = "miim";
+			};
 		};
 
 		sgpio0: gpio@61101036c {
@@ -285,6 +303,8 @@
 			clocks = <&sys_clk>;
 			pinctrl-0 = <&sgpio0_pins>;
 			pinctrl-names = "default";
+			resets = <&reset 0>;
+			reset-names = "switch";
 			reg = <0x6 0x1101036c 0x100>;
 			sgpio_in0: gpio@0 {
 				compatible = "microchip,sparx5-sgpio-bank";
@@ -292,6 +312,9 @@
 				gpio-controller;
 				#gpio-cells = <3>;
 				ngpios = <96>;
+				interrupts = <GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>;
+				interrupt-controller;
+				#interrupt-cells = <3>;
 			};
 			sgpio_out0: gpio@1 {
 				compatible = "microchip,sparx5-sgpio-bank";
@@ -310,6 +333,8 @@
 			clocks = <&sys_clk>;
 			pinctrl-0 = <&sgpio1_pins>;
 			pinctrl-names = "default";
+			resets = <&reset 0>;
+			reset-names = "switch";
 			reg = <0x6 0x11010484 0x100>;
 			sgpio_in1: gpio@0 {
 				compatible = "microchip,sparx5-sgpio-bank";
@@ -317,6 +342,9 @@
 				gpio-controller;
 				#gpio-cells = <3>;
 				ngpios = <96>;
+				interrupts = <GIC_SPI 18 IRQ_TYPE_LEVEL_HIGH>;
+				interrupt-controller;
+				#interrupt-cells = <3>;
 			};
 			sgpio_out1: gpio@1 {
 				compatible = "microchip,sparx5-sgpio-bank";
@@ -335,6 +363,8 @@
 			clocks = <&sys_clk>;
 			pinctrl-0 = <&sgpio2_pins>;
 			pinctrl-names = "default";
+			resets = <&reset 0>;
+			reset-names = "switch";
 			reg = <0x6 0x1101059c 0x100>;
 			sgpio_in2: gpio@0 {
 				reg = <0>;
@@ -342,6 +372,9 @@
 				gpio-controller;
 				#gpio-cells = <3>;
 				ngpios = <96>;
+				interrupts = <GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>;
+				interrupt-controller;
+				#interrupt-cells = <3>;
 			};
 			sgpio_out2: gpio@1 {
 				compatible = "microchip,sparx5-sgpio-bank";
@@ -386,5 +419,62 @@
 			#thermal-sensor-cells = <0>;
 			clocks = <&ahb_clk>;
 		};
+
+		mdio0: mdio@6110102b0 {
+			compatible = "mscc,ocelot-miim";
+			status = "disabled";
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <0x6 0x110102b0 0x24>;
+		};
+
+		mdio1: mdio@6110102d4 {
+			compatible = "mscc,ocelot-miim";
+			status = "disabled";
+			pinctrl-0 = <&miim1_pins>;
+			pinctrl-names = "default";
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <0x6 0x110102d4 0x24>;
+		};
+
+		mdio2: mdio@6110102f8 {
+			compatible = "mscc,ocelot-miim";
+			status = "disabled";
+			pinctrl-0 = <&miim2_pins>;
+			pinctrl-names = "default";
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <0x6 0x110102d4 0x24>;
+		};
+
+		mdio3: mdio@61101031c {
+			compatible = "mscc,ocelot-miim";
+			status = "disabled";
+			pinctrl-0 = <&miim3_pins>;
+			pinctrl-names = "default";
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <0x6 0x1101031c 0x24>;
+		};
+
+		serdes: serdes@10808000 {
+			compatible = "microchip,sparx5-serdes";
+			#phy-cells = <1>;
+			clocks = <&sys_clk>;
+			reg = <0x6 0x10808000 0x5d0000>;
+		};
+
+		switch: switch@0x600000000 {
+			compatible = "microchip,sparx5-switch";
+			reg =	<0x6 0 0x401000>,
+				<0x6 0x10004000 0x7fc000>,
+				<0x6 0x11010000 0xaf0000>;
+			reg-names = "cpu", "dev", "gcb";
+			interrupt-names = "xtr";
+			interrupts = <GIC_SPI 30 IRQ_TYPE_LEVEL_HIGH>;
+			resets = <&reset 0>;
+			reset-names = "switch";
+		};
 	};
 };
diff --git a/arch/arm64/boot/dts/microchip/sparx5_pcb134_board.dtsi b/arch/arm64/boot/dts/microchip/sparx5_pcb134_board.dtsi
index f0c915160990..33faf1f3264f 100644
--- a/arch/arm64/boot/dts/microchip/sparx5_pcb134_board.dtsi
+++ b/arch/arm64/boot/dts/microchip/sparx5_pcb134_board.dtsi
@@ -7,30 +7,6 @@
 #include "sparx5_pcb_common.dtsi"
 
 /{
-	aliases {
-	    i2c0   = &i2c0;
-	    i2c100 = &i2c100;
-	    i2c101 = &i2c101;
-	    i2c102 = &i2c102;
-	    i2c103 = &i2c103;
-	    i2c104 = &i2c104;
-	    i2c105 = &i2c105;
-	    i2c106 = &i2c106;
-	    i2c107 = &i2c107;
-	    i2c108 = &i2c108;
-	    i2c109 = &i2c109;
-	    i2c110 = &i2c110;
-	    i2c111 = &i2c111;
-	    i2c112 = &i2c112;
-	    i2c113 = &i2c113;
-	    i2c114 = &i2c114;
-	    i2c115 = &i2c115;
-	    i2c116 = &i2c116;
-	    i2c117 = &i2c117;
-	    i2c118 = &i2c118;
-	    i2c119 = &i2c119;
-	};
-
 	gpio-restart {
 		compatible = "gpio-restart";
 		gpios = <&gpio 37 GPIO_ACTIVE_LOW>;
@@ -298,17 +274,10 @@
 
 &spi0 {
 	status = "okay";
-	spi@0 {
-		compatible = "spi-mux";
-		mux-controls = <&mux>;
-		#address-cells = <1>;
-		#size-cells = <0>;
-		reg = <0>;	/* CS0 */
-		spi-flash@9 {
-			compatible = "jedec,spi-nor";
-			spi-max-frequency = <8000000>;
-			reg = <0x9>;	/* SPI */
-		};
+	spi-flash@0 {
+		compatible = "jedec,spi-nor";
+		spi-max-frequency = <8000000>;
+		reg = <0>;
 	};
 };
 
@@ -328,6 +297,33 @@
 	};
 };
 
+&sgpio0 {
+	status = "okay";
+	microchip,sgpio-port-ranges = <8 15>;
+	gpio@0 {
+		ngpios = <64>;
+	};
+	gpio@1 {
+		ngpios = <64>;
+	};
+};
+
+&sgpio1 {
+	status = "okay";
+	microchip,sgpio-port-ranges = <24 31>;
+	gpio@0 {
+		ngpios = <64>;
+	};
+	gpio@1 {
+		ngpios = <64>;
+	};
+};
+
+&sgpio2 {
+	status = "okay";
+	microchip,sgpio-port-ranges = <0 0>, <11 31>;
+};
+
 &gpio {
 	i2cmux_pins_i: i2cmux-pins-i {
 	       pins = "GPIO_16", "GPIO_17", "GPIO_18", "GPIO_19",
@@ -415,9 +411,9 @@
 
 &i2c0_imux {
 	pinctrl-names =
-		"i2c100", "i2c101", "i2c102", "i2c103",
-		"i2c104", "i2c105", "i2c106", "i2c107",
-		"i2c108", "i2c109", "i2c110", "i2c111", "idle";
+		"i2c_sfp1", "i2c_sfp2", "i2c_sfp3", "i2c_sfp4",
+		"i2c_sfp5", "i2c_sfp6", "i2c_sfp7", "i2c_sfp8",
+		"i2c_sfp9", "i2c_sfp10", "i2c_sfp11", "i2c_sfp12", "idle";
 	pinctrl-0 = <&i2cmux_0>;
 	pinctrl-1 = <&i2cmux_1>;
 	pinctrl-2 = <&i2cmux_2>;
@@ -431,62 +427,62 @@
 	pinctrl-10 = <&i2cmux_10>;
 	pinctrl-11 = <&i2cmux_11>;
 	pinctrl-12 = <&i2cmux_pins_i>;
-	i2c100: i2c_sfp1 {
+	i2c_sfp1: i2c_sfp1 {
 		reg = <0x0>;
 		#address-cells = <1>;
 		#size-cells = <0>;
 	};
-	i2c101: i2c_sfp2 {
+	i2c_sfp2: i2c_sfp2 {
 		reg = <0x1>;
 		#address-cells = <1>;
 		#size-cells = <0>;
 	};
-	i2c102: i2c_sfp3 {
+	i2c_sfp3: i2c_sfp3 {
 		reg = <0x2>;
 		#address-cells = <1>;
 		#size-cells = <0>;
 	};
-	i2c103: i2c_sfp4 {
+	i2c_sfp4: i2c_sfp4 {
 		reg = <0x3>;
 		#address-cells = <1>;
 		#size-cells = <0>;
 	};
-	i2c104: i2c_sfp5 {
+	i2c_sfp5: i2c_sfp5 {
 		reg = <0x4>;
 		#address-cells = <1>;
 		#size-cells = <0>;
 	};
-	i2c105: i2c_sfp6 {
+	i2c_sfp6: i2c_sfp6 {
 		reg = <0x5>;
 		#address-cells = <1>;
 		#size-cells = <0>;
 	};
-	i2c106: i2c_sfp7 {
+	i2c_sfp7: i2c_sfp7 {
 		reg = <0x6>;
 		#address-cells = <1>;
 		#size-cells = <0>;
 	};
-	i2c107: i2c_sfp8 {
+	i2c_sfp8: i2c_sfp8 {
 		reg = <0x7>;
 		#address-cells = <1>;
 		#size-cells = <0>;
 	};
-	i2c108: i2c_sfp9 {
+	i2c_sfp9: i2c_sfp9 {
 		reg = <0x8>;
 		#address-cells = <1>;
 		#size-cells = <0>;
 	};
-	i2c109: i2c_sfp10 {
+	i2c_sfp10: i2c_sfp10 {
 		reg = <0x9>;
 		#address-cells = <1>;
 		#size-cells = <0>;
 	};
-	i2c110: i2c_sfp11 {
+	i2c_sfp11: i2c_sfp11 {
 		reg = <0xa>;
 		#address-cells = <1>;
 		#size-cells = <0>;
 	};
-	i2c111: i2c_sfp12 {
+	i2c_sfp12: i2c_sfp12 {
 		reg = <0xb>;
 		#address-cells = <1>;
 		#size-cells = <0>;
@@ -499,44 +495,413 @@
 		     &gpio 61 GPIO_ACTIVE_HIGH
 		     &gpio 54 GPIO_ACTIVE_HIGH>;
 	idle-state = <0x8>;
-	i2c112: i2c_sfp13 {
+	i2c_sfp13: i2c_sfp13 {
 		reg = <0x0>;
 		#address-cells = <1>;
 		#size-cells = <0>;
 	};
-	i2c113: i2c_sfp14 {
+	i2c_sfp14: i2c_sfp14 {
 		reg = <0x1>;
 		#address-cells = <1>;
 		#size-cells = <0>;
 	};
-	i2c114: i2c_sfp15 {
+	i2c_sfp15: i2c_sfp15 {
 		reg = <0x2>;
 		#address-cells = <1>;
 		#size-cells = <0>;
 	};
-	i2c115: i2c_sfp16 {
+	i2c_sfp16: i2c_sfp16 {
 		reg = <0x3>;
 		#address-cells = <1>;
 		#size-cells = <0>;
 	};
-	i2c116: i2c_sfp17 {
+	i2c_sfp17: i2c_sfp17 {
 		reg = <0x4>;
 		#address-cells = <1>;
 		#size-cells = <0>;
 	};
-	i2c117: i2c_sfp18 {
+	i2c_sfp18: i2c_sfp18 {
 		reg = <0x5>;
 		#address-cells = <1>;
 		#size-cells = <0>;
 	};
-	i2c118: i2c_sfp19 {
+	i2c_sfp19: i2c_sfp19 {
 		reg = <0x6>;
 		#address-cells = <1>;
 		#size-cells = <0>;
 	};
-	i2c119: i2c_sfp20 {
+	i2c_sfp20: i2c_sfp20 {
 		reg = <0x7>;
 		#address-cells = <1>;
 		#size-cells = <0>;
 	};
 };
+
+&mdio3 {
+	status = "ok";
+	phy64: ethernet-phy@64 {
+		reg = <28>;
+	};
+};
+
+&axi {
+	sfp_eth12: sfp-eth12 {
+		compatible       = "sff,sfp";
+		i2c-bus          = <&i2c_sfp1>;
+		tx-disable-gpios = <&sgpio_out2 11 1 GPIO_ACTIVE_LOW>;
+		los-gpios        = <&sgpio_in2 11 1 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios   = <&sgpio_in2 11 2 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios   = <&sgpio_in2 12 0 GPIO_ACTIVE_HIGH>;
+	};
+	sfp_eth13: sfp-eth13 {
+		compatible       = "sff,sfp";
+		i2c-bus          = <&i2c_sfp2>;
+		tx-disable-gpios = <&sgpio_out2 12 1 GPIO_ACTIVE_LOW>;
+		los-gpios        = <&sgpio_in2 12 1 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios   = <&sgpio_in2 12 2 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios   = <&sgpio_in2 13 0 GPIO_ACTIVE_HIGH>;
+	};
+	sfp_eth14: sfp-eth14 {
+		compatible       = "sff,sfp";
+		i2c-bus          = <&i2c_sfp3>;
+		tx-disable-gpios = <&sgpio_out2 13 1 GPIO_ACTIVE_LOW>;
+		los-gpios        = <&sgpio_in2 13 1 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios   = <&sgpio_in2 13 2 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios   = <&sgpio_in2 14 0 GPIO_ACTIVE_HIGH>;
+	};
+	sfp_eth15: sfp-eth15 {
+		compatible       = "sff,sfp";
+		i2c-bus          = <&i2c_sfp4>;
+		tx-disable-gpios = <&sgpio_out2 14 1 GPIO_ACTIVE_LOW>;
+		los-gpios        = <&sgpio_in2 14 1 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios   = <&sgpio_in2 14 2 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios   = <&sgpio_in2 15 0 GPIO_ACTIVE_HIGH>;
+	};
+	sfp_eth48: sfp-eth48 {
+		compatible       = "sff,sfp";
+		i2c-bus          = <&i2c_sfp5>;
+		tx-disable-gpios = <&sgpio_out2 15 1 GPIO_ACTIVE_LOW>;
+		los-gpios        = <&sgpio_in2 15 1 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios   = <&sgpio_in2 15 2 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios   = <&sgpio_in2 16 0 GPIO_ACTIVE_HIGH>;
+	};
+	sfp_eth49: sfp-eth49 {
+		compatible       = "sff,sfp";
+		i2c-bus          = <&i2c_sfp6>;
+		tx-disable-gpios = <&sgpio_out2 16 1 GPIO_ACTIVE_LOW>;
+		los-gpios        = <&sgpio_in2 16 1 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios   = <&sgpio_in2 16 2 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios   = <&sgpio_in2 17 0 GPIO_ACTIVE_HIGH>;
+	};
+	sfp_eth50: sfp-eth50 {
+		compatible       = "sff,sfp";
+		i2c-bus          = <&i2c_sfp7>;
+		tx-disable-gpios = <&sgpio_out2 17 1 GPIO_ACTIVE_LOW>;
+		los-gpios        = <&sgpio_in2 17 1 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios   = <&sgpio_in2 17 2 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios   = <&sgpio_in2 18 0 GPIO_ACTIVE_HIGH>;
+	};
+	sfp_eth51: sfp-eth51 {
+		compatible       = "sff,sfp";
+		i2c-bus          = <&i2c_sfp8>;
+		tx-disable-gpios = <&sgpio_out2 18 1 GPIO_ACTIVE_LOW>;
+		los-gpios        = <&sgpio_in2 18 1 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios   = <&sgpio_in2 18 2 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios   = <&sgpio_in2 19 0 GPIO_ACTIVE_HIGH>;
+	};
+	sfp_eth52: sfp-eth52 {
+		compatible       = "sff,sfp";
+		i2c-bus          = <&i2c_sfp9>;
+		tx-disable-gpios = <&sgpio_out2 19 1 GPIO_ACTIVE_LOW>;
+		los-gpios        = <&sgpio_in2 19 1 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios   = <&sgpio_in2 19 2 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios   = <&sgpio_in2 20 0 GPIO_ACTIVE_HIGH>;
+	};
+	sfp_eth53: sfp-eth53 {
+		compatible       = "sff,sfp";
+		i2c-bus          = <&i2c_sfp10>;
+		tx-disable-gpios = <&sgpio_out2 20 1 GPIO_ACTIVE_LOW>;
+		los-gpios        = <&sgpio_in2 20 1 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios   = <&sgpio_in2 20 2 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios   = <&sgpio_in2 21 0 GPIO_ACTIVE_HIGH>;
+	};
+	sfp_eth54: sfp-eth54 {
+		compatible       = "sff,sfp";
+		i2c-bus          = <&i2c_sfp11>;
+		tx-disable-gpios = <&sgpio_out2 21 1 GPIO_ACTIVE_LOW>;
+		los-gpios        = <&sgpio_in2 21 1 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios   = <&sgpio_in2 21 2 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios   = <&sgpio_in2 22 0 GPIO_ACTIVE_HIGH>;
+	};
+	sfp_eth55: sfp-eth55 {
+		compatible       = "sff,sfp";
+		i2c-bus          = <&i2c_sfp12>;
+		tx-disable-gpios = <&sgpio_out2 22 1 GPIO_ACTIVE_LOW>;
+		los-gpios        = <&sgpio_in2 22 1 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios   = <&sgpio_in2 22 2 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios   = <&sgpio_in2 23 0 GPIO_ACTIVE_HIGH>;
+	};
+	sfp_eth56: sfp-eth56 {
+		compatible       = "sff,sfp";
+		i2c-bus          = <&i2c_sfp13>;
+		tx-disable-gpios = <&sgpio_out2 23 1 GPIO_ACTIVE_LOW>;
+		los-gpios        = <&sgpio_in2 23 1 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios   = <&sgpio_in2 23 2 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios   = <&sgpio_in2 24 0 GPIO_ACTIVE_HIGH>;
+	};
+	sfp_eth57: sfp-eth57 {
+		compatible       = "sff,sfp";
+		i2c-bus          = <&i2c_sfp14>;
+		tx-disable-gpios = <&sgpio_out2 24 1 GPIO_ACTIVE_LOW>;
+		los-gpios        = <&sgpio_in2 24 1 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios   = <&sgpio_in2 24 2 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios   = <&sgpio_in2 25 0 GPIO_ACTIVE_HIGH>;
+	};
+	sfp_eth58: sfp-eth58 {
+		compatible       = "sff,sfp";
+		i2c-bus          = <&i2c_sfp15>;
+		tx-disable-gpios = <&sgpio_out2 25 1 GPIO_ACTIVE_LOW>;
+		los-gpios        = <&sgpio_in2 25 1 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios   = <&sgpio_in2 25 2 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios   = <&sgpio_in2 26 0 GPIO_ACTIVE_HIGH>;
+	};
+	sfp_eth59: sfp-eth59 {
+		compatible       = "sff,sfp";
+		i2c-bus          = <&i2c_sfp16>;
+		tx-disable-gpios = <&sgpio_out2 26 1 GPIO_ACTIVE_LOW>;
+		los-gpios        = <&sgpio_in2 26 1 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios   = <&sgpio_in2 26 2 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios   = <&sgpio_in2 27 0 GPIO_ACTIVE_HIGH>;
+	};
+	sfp_eth60: sfp-eth60 {
+		compatible       = "sff,sfp";
+		i2c-bus          = <&i2c_sfp17>;
+		tx-disable-gpios = <&sgpio_out2 27 1 GPIO_ACTIVE_LOW>;
+		los-gpios        = <&sgpio_in2 27 1 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios   = <&sgpio_in2 27 2 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios   = <&sgpio_in2 28 0 GPIO_ACTIVE_HIGH>;
+	};
+	sfp_eth61: sfp-eth61 {
+		compatible       = "sff,sfp";
+		i2c-bus          = <&i2c_sfp18>;
+		tx-disable-gpios = <&sgpio_out2 28 1 GPIO_ACTIVE_LOW>;
+		los-gpios        = <&sgpio_in2 28 1 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios   = <&sgpio_in2 28 2 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios   = <&sgpio_in2 29 0 GPIO_ACTIVE_HIGH>;
+	};
+	sfp_eth62: sfp-eth62 {
+		compatible       = "sff,sfp";
+		i2c-bus          = <&i2c_sfp19>;
+		tx-disable-gpios = <&sgpio_out2 29 1 GPIO_ACTIVE_LOW>;
+		los-gpios        = <&sgpio_in2 29 1 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios   = <&sgpio_in2 29 2 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios   = <&sgpio_in2 30 0 GPIO_ACTIVE_HIGH>;
+	};
+	sfp_eth63: sfp-eth63 {
+		compatible       = "sff,sfp";
+		i2c-bus          = <&i2c_sfp20>;
+		tx-disable-gpios = <&sgpio_out2 30 1 GPIO_ACTIVE_LOW>;
+		los-gpios        = <&sgpio_in2 30 1 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios   = <&sgpio_in2 30 2 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios   = <&sgpio_in2 31 0 GPIO_ACTIVE_HIGH>;
+	};
+};
+
+&switch {
+	ethernet-ports {
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		/* 10G SFPs */
+		port12: port@12 {
+			reg = <12>;
+			microchip,bandwidth = <10000>;
+			phys = <&serdes 13>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth12>;
+			microchip,sd-sgpio = <301>;
+			managed = "in-band-status";
+		};
+		port13: port@13 {
+			reg = <13>;
+			/* Example: CU SFP, 1G speed */
+			microchip,bandwidth = <10000>;
+			phys = <&serdes 14>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth13>;
+			microchip,sd-sgpio = <305>;
+			managed = "in-band-status";
+		};
+		port14: port@14 {
+			reg = <14>;
+			microchip,bandwidth = <10000>;
+			phys = <&serdes 15>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth14>;
+			microchip,sd-sgpio = <309>;
+			managed = "in-band-status";
+		};
+		port15: port@15 {
+			reg = <15>;
+			microchip,bandwidth = <10000>;
+			phys = <&serdes 16>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth15>;
+			microchip,sd-sgpio = <313>;
+			managed = "in-band-status";
+		};
+		port48: port@48 {
+			reg = <48>;
+			microchip,bandwidth = <10000>;
+			phys = <&serdes 17>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth48>;
+			microchip,sd-sgpio = <317>;
+			managed = "in-band-status";
+		};
+		port49: port@49 {
+			reg = <49>;
+			microchip,bandwidth = <10000>;
+			phys = <&serdes 18>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth49>;
+			microchip,sd-sgpio = <321>;
+			managed = "in-band-status";
+		};
+		port50: port@50 {
+			reg = <50>;
+			microchip,bandwidth = <10000>;
+			phys = <&serdes 19>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth50>;
+			microchip,sd-sgpio = <325>;
+			managed = "in-band-status";
+		};
+		port51: port@51 {
+			reg = <51>;
+			microchip,bandwidth = <10000>;
+			phys = <&serdes 20>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth51>;
+			microchip,sd-sgpio = <329>;
+			managed = "in-band-status";
+		};
+		port52: port@52 {
+			reg = <52>;
+			microchip,bandwidth = <10000>;
+			phys = <&serdes 21>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth52>;
+			microchip,sd-sgpio = <333>;
+			managed = "in-band-status";
+		};
+		port53: port@53 {
+			reg = <53>;
+			microchip,bandwidth = <10000>;
+			phys = <&serdes 22>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth53>;
+			microchip,sd-sgpio = <337>;
+			managed = "in-band-status";
+		};
+		port54: port@54 {
+			reg = <54>;
+			microchip,bandwidth = <10000>;
+			phys = <&serdes 23>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth54>;
+			microchip,sd-sgpio = <341>;
+			managed = "in-band-status";
+		};
+		port55: port@55 {
+			reg = <55>;
+			microchip,bandwidth = <10000>;
+			phys = <&serdes 24>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth55>;
+			microchip,sd-sgpio = <345>;
+			managed = "in-band-status";
+		};
+		/* 25G SFPs */
+		port56: port@56 {
+			reg = <56>;
+			microchip,bandwidth = <10000>;
+			phys = <&serdes 25>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth56>;
+			microchip,sd-sgpio = <349>;
+			managed = "in-band-status";
+		};
+		port57: port@57 {
+			reg = <57>;
+			microchip,bandwidth = <10000>;
+			phys = <&serdes 26>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth57>;
+			microchip,sd-sgpio = <353>;
+			managed = "in-band-status";
+		};
+		port58: port@58 {
+			reg = <58>;
+			microchip,bandwidth = <10000>;
+			phys = <&serdes 27>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth58>;
+			microchip,sd-sgpio = <357>;
+			managed = "in-band-status";
+		};
+		port59: port@59 {
+			reg = <59>;
+			microchip,bandwidth = <10000>;
+			phys = <&serdes 28>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth59>;
+			microchip,sd-sgpio = <361>;
+			managed = "in-band-status";
+		};
+		port60: port@60 {
+			reg = <60>;
+			microchip,bandwidth = <10000>;
+			phys = <&serdes 29>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth60>;
+			microchip,sd-sgpio = <365>;
+			managed = "in-band-status";
+		};
+		port61: port@61 {
+			reg = <61>;
+			microchip,bandwidth = <10000>;
+			phys = <&serdes 30>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth61>;
+			microchip,sd-sgpio = <369>;
+			managed = "in-band-status";
+		};
+		port62: port@62 {
+			reg = <62>;
+			microchip,bandwidth = <10000>;
+			phys = <&serdes 31>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth62>;
+			microchip,sd-sgpio = <373>;
+			managed = "in-band-status";
+		};
+		port63: port@63 {
+			reg = <63>;
+			microchip,bandwidth = <10000>;
+			phys = <&serdes 32>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth63>;
+			microchip,sd-sgpio = <377>;
+			managed = "in-band-status";
+		};
+		/* Finally the Management interface */
+		port64: port@64 {
+			reg = <64>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 0>;
+			phy-handle = <&phy64>;
+			phy-mode = "sgmii";
+		};
+	};
+};
diff --git a/arch/arm64/boot/dts/microchip/sparx5_pcb135_board.dtsi b/arch/arm64/boot/dts/microchip/sparx5_pcb135_board.dtsi
index e28c6dd16377..ef96e6d8c6b3 100644
--- a/arch/arm64/boot/dts/microchip/sparx5_pcb135_board.dtsi
+++ b/arch/arm64/boot/dts/microchip/sparx5_pcb135_board.dtsi
@@ -7,14 +7,6 @@
 #include "sparx5_pcb_common.dtsi"
 
 /{
-	aliases {
-	    i2c0   = &i2c0;
-	    i2c152 = &i2c152;
-	    i2c153 = &i2c153;
-	    i2c154 = &i2c154;
-	    i2c155 = &i2c155;
-	};
-
 	gpio-restart {
 		compatible = "gpio-restart";
 		gpios = <&gpio 37 GPIO_ACTIVE_LOW>;
@@ -97,17 +89,10 @@
 
 &spi0 {
 	status = "okay";
-	spi@0 {
-		compatible = "spi-mux";
-		mux-controls = <&mux>;
-		#address-cells = <1>;
-		#size-cells = <0>;
-		reg = <0>; /* CS0 */
-		spi-flash@9 {
-			compatible = "jedec,spi-nor";
-			spi-max-frequency = <8000000>;
-			reg = <0x9>; /* SPI */
-		};
+	spi-flash@0 {
+		compatible = "jedec,spi-nor";
+		spi-max-frequency = <8000000>;
+		reg = <0>;
 	};
 };
 
@@ -138,6 +123,11 @@
 	};
 };
 
+&sgpio2 {
+	status = "okay";
+	microchip,sgpio-port-ranges = <0 0>, <16 18>, <28 31>;
+};
+
 &axi {
 	i2c0_imux: i2c0-imux@0 {
 		compatible = "i2c-mux-pinctrl";
@@ -149,31 +139,614 @@
 
 &i2c0_imux {
 	pinctrl-names =
-		"i2c152", "i2c153", "i2c154", "i2c155",
+		"i2c_sfp1", "i2c_sfp2", "i2c_sfp3", "i2c_sfp4",
 		"idle";
 	pinctrl-0 = <&i2cmux_s29>;
 	pinctrl-1 = <&i2cmux_s30>;
 	pinctrl-2 = <&i2cmux_s31>;
 	pinctrl-3 = <&i2cmux_s32>;
 	pinctrl-4 = <&i2cmux_pins_i>;
-	i2c152: i2c_sfp1 {
+	i2c_sfp1: i2c_sfp1 {
 		reg = <0x0>;
 		#address-cells = <1>;
 		#size-cells = <0>;
 	};
-	i2c153: i2c_sfp2 {
+	i2c_sfp2: i2c_sfp2 {
 		reg = <0x1>;
 		#address-cells = <1>;
 		#size-cells = <0>;
 	};
-	i2c154: i2c_sfp3 {
+	i2c_sfp3: i2c_sfp3 {
 		reg = <0x2>;
 		#address-cells = <1>;
 		#size-cells = <0>;
 	};
-	i2c155: i2c_sfp4 {
+	i2c_sfp4: i2c_sfp4 {
 		reg = <0x3>;
 		#address-cells = <1>;
 		#size-cells = <0>;
 	};
 };
+
+&axi {
+	sfp_eth60: sfp-eth60 {
+		compatible	   = "sff,sfp";
+		i2c-bus            = <&i2c_sfp1>;
+		tx-disable-gpios   = <&sgpio_out2 28 0 GPIO_ACTIVE_LOW>;
+		rate-select0-gpios = <&sgpio_out2 28 1 GPIO_ACTIVE_HIGH>;
+		los-gpios          = <&sgpio_in2 28 0 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios     = <&sgpio_in2 28 1 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios     = <&sgpio_in2 28 2 GPIO_ACTIVE_HIGH>;
+	};
+	sfp_eth61: sfp-eth61 {
+		compatible         = "sff,sfp";
+		i2c-bus            = <&i2c_sfp2>;
+		tx-disable-gpios   = <&sgpio_out2 29 0 GPIO_ACTIVE_LOW>;
+		rate-select0-gpios = <&sgpio_out2 29 1 GPIO_ACTIVE_HIGH>;
+		los-gpios          = <&sgpio_in2 29 0 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios     = <&sgpio_in2 29 1 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios     = <&sgpio_in2 29 2 GPIO_ACTIVE_HIGH>;
+	};
+	sfp_eth62: sfp-eth62 {
+		compatible         = "sff,sfp";
+		i2c-bus            = <&i2c_sfp3>;
+		tx-disable-gpios   = <&sgpio_out2 30 0 GPIO_ACTIVE_LOW>;
+		rate-select0-gpios = <&sgpio_out2 30 1 GPIO_ACTIVE_HIGH>;
+		los-gpios          = <&sgpio_in2 30 0 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios     = <&sgpio_in2 30 1 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios     = <&sgpio_in2 30 2 GPIO_ACTIVE_HIGH>;
+	};
+	sfp_eth63: sfp-eth63 {
+		compatible         = "sff,sfp";
+		i2c-bus            = <&i2c_sfp4>;
+		tx-disable-gpios   = <&sgpio_out2 31 0 GPIO_ACTIVE_LOW>;
+		rate-select0-gpios = <&sgpio_out2 31 1 GPIO_ACTIVE_HIGH>;
+		los-gpios          = <&sgpio_in2 31 0 GPIO_ACTIVE_HIGH>;
+		mod-def0-gpios     = <&sgpio_in2 31 1 GPIO_ACTIVE_LOW>;
+		tx-fault-gpios     = <&sgpio_in2 31 2 GPIO_ACTIVE_HIGH>;
+	};
+};
+
+&mdio0 {
+	status = "ok";
+	phy0: ethernet-phy@0 {
+		reg = <0>;
+	};
+	phy1: ethernet-phy@1 {
+		reg = <1>;
+	};
+	phy2: ethernet-phy@2 {
+		reg = <2>;
+	};
+	phy3: ethernet-phy@3 {
+		reg = <3>;
+	};
+	phy4: ethernet-phy@4 {
+		reg = <4>;
+	};
+	phy5: ethernet-phy@5 {
+		reg = <5>;
+	};
+	phy6: ethernet-phy@6 {
+		reg = <6>;
+	};
+	phy7: ethernet-phy@7 {
+		reg = <7>;
+	};
+	phy8: ethernet-phy@8 {
+		reg = <8>;
+	};
+	phy9: ethernet-phy@9 {
+		reg = <9>;
+	};
+	phy10: ethernet-phy@10 {
+		reg = <10>;
+	};
+	phy11: ethernet-phy@11 {
+		reg = <11>;
+	};
+	phy12: ethernet-phy@12 {
+		reg = <12>;
+	};
+	phy13: ethernet-phy@13 {
+		reg = <13>;
+	};
+	phy14: ethernet-phy@14 {
+		reg = <14>;
+	};
+	phy15: ethernet-phy@15 {
+		reg = <15>;
+	};
+	phy16: ethernet-phy@16 {
+		reg = <16>;
+	};
+	phy17: ethernet-phy@17 {
+		reg = <17>;
+	};
+	phy18: ethernet-phy@18 {
+		reg = <18>;
+	};
+	phy19: ethernet-phy@19 {
+		reg = <19>;
+	};
+	phy20: ethernet-phy@20 {
+		reg = <20>;
+	};
+	phy21: ethernet-phy@21 {
+		reg = <21>;
+	};
+	phy22: ethernet-phy@22 {
+		reg = <22>;
+	};
+	phy23: ethernet-phy@23 {
+		reg = <23>;
+	};
+};
+
+&mdio1 {
+	status = "ok";
+	phy24: ethernet-phy@24 {
+		reg = <0>;
+	};
+	phy25: ethernet-phy@25 {
+		reg = <1>;
+	};
+	phy26: ethernet-phy@26 {
+		reg = <2>;
+	};
+	phy27: ethernet-phy@27 {
+		reg = <3>;
+	};
+	phy28: ethernet-phy@28 {
+		reg = <4>;
+	};
+	phy29: ethernet-phy@29 {
+		reg = <5>;
+	};
+	phy30: ethernet-phy@30 {
+		reg = <6>;
+	};
+	phy31: ethernet-phy@31 {
+		reg = <7>;
+	};
+	phy32: ethernet-phy@32 {
+		reg = <8>;
+	};
+	phy33: ethernet-phy@33 {
+		reg = <9>;
+	};
+	phy34: ethernet-phy@34 {
+		reg = <10>;
+	};
+	phy35: ethernet-phy@35 {
+		reg = <11>;
+	};
+	phy36: ethernet-phy@36 {
+		reg = <12>;
+	};
+	phy37: ethernet-phy@37 {
+		reg = <13>;
+	};
+	phy38: ethernet-phy@38 {
+		reg = <14>;
+	};
+	phy39: ethernet-phy@39 {
+		reg = <15>;
+	};
+	phy40: ethernet-phy@40 {
+		reg = <16>;
+	};
+	phy41: ethernet-phy@41 {
+		reg = <17>;
+	};
+	phy42: ethernet-phy@42 {
+		reg = <18>;
+	};
+	phy43: ethernet-phy@43 {
+		reg = <19>;
+	};
+	phy44: ethernet-phy@44 {
+		reg = <20>;
+	};
+	phy45: ethernet-phy@45 {
+		reg = <21>;
+	};
+	phy46: ethernet-phy@46 {
+		reg = <22>;
+	};
+	phy47: ethernet-phy@47 {
+		reg = <23>;
+	};
+};
+
+&mdio3 {
+	status = "ok";
+	phy64: ethernet-phy@64 {
+		reg = <28>;
+	};
+};
+
+&switch {
+	ethernet-ports {
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		port0: port@0 {
+			reg = <0>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 13>;
+			phy-handle = <&phy0>;
+			phy-mode = "qsgmii";
+		};
+		port1: port@1 {
+			reg = <1>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 13>;
+			phy-handle = <&phy1>;
+			phy-mode = "qsgmii";
+		};
+		port2: port@2 {
+			reg = <2>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 13>;
+			phy-handle = <&phy2>;
+			phy-mode = "qsgmii";
+		};
+		port3: port@3 {
+			reg = <3>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 13>;
+			phy-handle = <&phy3>;
+			phy-mode = "qsgmii";
+		};
+		port4: port@4 {
+			reg = <4>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 14>;
+			phy-handle = <&phy4>;
+			phy-mode = "qsgmii";
+		};
+		port5: port@5 {
+			reg = <5>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 14>;
+			phy-handle = <&phy5>;
+			phy-mode = "qsgmii";
+		};
+		port6: port@6 {
+			reg = <6>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 14>;
+			phy-handle = <&phy6>;
+			phy-mode = "qsgmii";
+		};
+		port7: port@7 {
+			reg = <7>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 14>;
+			phy-handle = <&phy7>;
+			phy-mode = "qsgmii";
+		};
+		port8: port@8 {
+			reg = <8>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 15>;
+			phy-handle = <&phy8>;
+			phy-mode = "qsgmii";
+		};
+		port9: port@9 {
+			reg = <9>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 15>;
+			phy-handle = <&phy9>;
+			phy-mode = "qsgmii";
+		};
+		port10: port@10 {
+			reg = <10>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 15>;
+			phy-handle = <&phy10>;
+			phy-mode = "qsgmii";
+		};
+		port11: port@11 {
+			reg = <11>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 15>;
+			phy-handle = <&phy11>;
+			phy-mode = "qsgmii";
+		};
+		port12: port@12 {
+			reg = <12>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 16>;
+			phy-handle = <&phy12>;
+			phy-mode = "qsgmii";
+		};
+		port13: port@13 {
+			reg = <13>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 16>;
+			phy-handle = <&phy13>;
+			phy-mode = "qsgmii";
+		};
+		port14: port@14 {
+			reg = <14>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 16>;
+			phy-handle = <&phy14>;
+			phy-mode = "qsgmii";
+		};
+		port15: port@15 {
+			reg = <15>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 16>;
+			phy-handle = <&phy15>;
+			phy-mode = "qsgmii";
+		};
+		port16: port@16 {
+			reg = <16>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 17>;
+			phy-handle = <&phy16>;
+			phy-mode = "qsgmii";
+		};
+		port17: port@17 {
+			reg = <17>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 17>;
+			phy-handle = <&phy17>;
+			phy-mode = "qsgmii";
+		};
+		port18: port@18 {
+			reg = <18>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 17>;
+			phy-handle = <&phy18>;
+			phy-mode = "qsgmii";
+		};
+		port19: port@19 {
+			reg = <19>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 17>;
+			phy-handle = <&phy19>;
+			phy-mode = "qsgmii";
+		};
+		port20: port@20 {
+			reg = <20>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 18>;
+			phy-handle = <&phy20>;
+			phy-mode = "qsgmii";
+		};
+		port21: port@21 {
+			reg = <21>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 18>;
+			phy-handle = <&phy21>;
+			phy-mode = "qsgmii";
+		};
+		port22: port@22 {
+			reg = <22>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 18>;
+			phy-handle = <&phy22>;
+			phy-mode = "qsgmii";
+		};
+		port23: port@23 {
+			reg = <23>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 18>;
+			phy-handle = <&phy23>;
+			phy-mode = "qsgmii";
+		};
+		port24: port@24 {
+			reg = <24>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 19>;
+			phy-handle = <&phy24>;
+			phy-mode = "qsgmii";
+		};
+		port25: port@25 {
+			reg = <25>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 19>;
+			phy-handle = <&phy25>;
+			phy-mode = "qsgmii";
+		};
+		port26: port@26 {
+			reg = <26>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 19>;
+			phy-handle = <&phy26>;
+			phy-mode = "qsgmii";
+		};
+		port27: port@27 {
+			reg = <27>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 19>;
+			phy-handle = <&phy27>;
+			phy-mode = "qsgmii";
+		};
+		port28: port@28 {
+			reg = <28>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 20>;
+			phy-handle = <&phy28>;
+			phy-mode = "qsgmii";
+		};
+		port29: port@29 {
+			reg = <29>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 20>;
+			phy-handle = <&phy29>;
+			phy-mode = "qsgmii";
+		};
+		port30: port@30 {
+			reg = <30>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 20>;
+			phy-handle = <&phy30>;
+			phy-mode = "qsgmii";
+		};
+		port31: port@31 {
+			reg = <31>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 20>;
+			phy-handle = <&phy31>;
+			phy-mode = "qsgmii";
+		};
+		port32: port@32 {
+			reg = <32>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 21>;
+			phy-handle = <&phy32>;
+			phy-mode = "qsgmii";
+		};
+		port33: port@33 {
+			reg = <33>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 21>;
+			phy-handle = <&phy33>;
+			phy-mode = "qsgmii";
+		};
+		port34: port@34 {
+			reg = <34>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 21>;
+			phy-handle = <&phy34>;
+			phy-mode = "qsgmii";
+		};
+		port35: port@35 {
+			reg = <35>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 21>;
+			phy-handle = <&phy35>;
+			phy-mode = "qsgmii";
+		};
+		port36: port@36 {
+			reg = <36>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 22>;
+			phy-handle = <&phy36>;
+			phy-mode = "qsgmii";
+		};
+		port37: port@37 {
+			reg = <37>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 22>;
+			phy-handle = <&phy37>;
+			phy-mode = "qsgmii";
+		};
+		port38: port@38 {
+			reg = <38>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 22>;
+			phy-handle = <&phy38>;
+			phy-mode = "qsgmii";
+		};
+		port39: port@39 {
+			reg = <39>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 22>;
+			phy-handle = <&phy39>;
+			phy-mode = "qsgmii";
+		};
+		port40: port@40 {
+			reg = <40>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 23>;
+			phy-handle = <&phy40>;
+			phy-mode = "qsgmii";
+		};
+		port41: port@41 {
+			reg = <41>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 23>;
+			phy-handle = <&phy41>;
+			phy-mode = "qsgmii";
+		};
+		port42: port@42 {
+			reg = <42>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 23>;
+			phy-handle = <&phy42>;
+			phy-mode = "qsgmii";
+		};
+		port43: port@43 {
+			reg = <43>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 23>;
+			phy-handle = <&phy43>;
+			phy-mode = "qsgmii";
+		};
+		port44: port@44 {
+			reg = <44>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 24>;
+			phy-handle = <&phy44>;
+			phy-mode = "qsgmii";
+		};
+		port45: port@45 {
+			reg = <45>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 24>;
+			phy-handle = <&phy45>;
+			phy-mode = "qsgmii";
+		};
+		port46: port@46 {
+			reg = <46>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 24>;
+			phy-handle = <&phy46>;
+			phy-mode = "qsgmii";
+		};
+		port47: port@47 {
+			reg = <47>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 24>;
+			phy-handle = <&phy47>;
+			phy-mode = "qsgmii";
+		};
+		/* Then the 25G interfaces */
+		port60: port@60 {
+			reg = <60>;
+			microchip,bandwidth = <25000>;
+			phys = <&serdes 29>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth60>;
+			managed = "in-band-status";
+		};
+		port61: port@61 {
+			reg = <61>;
+			microchip,bandwidth = <25000>;
+			phys = <&serdes 30>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth61>;
+			managed = "in-band-status";
+		};
+		port62: port@62 {
+			reg = <62>;
+			microchip,bandwidth = <25000>;
+			phys = <&serdes 31>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth62>;
+			managed = "in-band-status";
+		};
+		port63: port@63 {
+			reg = <63>;
+			microchip,bandwidth = <25000>;
+			phys = <&serdes 32>;
+			phy-mode = "10gbase-r";
+			sfp = <&sfp_eth63>;
+			managed = "in-band-status";
+		};
+		/* Finally the Management interface */
+		port64: port@64 {
+			reg = <64>;
+			microchip,bandwidth = <1000>;
+			phys = <&serdes 0>;
+			phy-handle = <&phy64>;
+			phy-mode = "sgmii";
+		};
+	};
+};
diff --git a/arch/arm64/boot/dts/rockchip/rk3308.dtsi b/arch/arm64/boot/dts/rockchip/rk3308.dtsi
index 0c5fa9801e6f..b815ce73e5c6 100644
--- a/arch/arm64/boot/dts/rockchip/rk3308.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3308.dtsi
@@ -637,6 +637,28 @@
 		status = "disabled";
 	};
 
+	gmac: ethernet@ff4e0000 {
+		compatible = "rockchip,rk3308-gmac";
+		reg = <0x0 0xff4e0000 0x0 0x10000>;
+		interrupts = <GIC_SPI 64 IRQ_TYPE_LEVEL_HIGH>;
+		interrupt-names = "macirq";
+		clocks = <&cru SCLK_MAC>, <&cru SCLK_MAC_RX_TX>,
+			 <&cru SCLK_MAC_RX_TX>, <&cru SCLK_MAC_REF>,
+			 <&cru SCLK_MAC>, <&cru ACLK_MAC>,
+			 <&cru PCLK_MAC>, <&cru SCLK_MAC_RMII>;
+		clock-names = "stmmaceth", "mac_clk_rx",
+			      "mac_clk_tx", "clk_mac_ref",
+			      "clk_mac_refout", "aclk_mac",
+			      "pclk_mac", "clk_mac_speed";
+		phy-mode = "rmii";
+		pinctrl-names = "default";
+		pinctrl-0 = <&rmii_pins &mac_refclk_12ma>;
+		resets = <&cru SRST_MAC_A>;
+		reset-names = "stmmaceth";
+		rockchip,grf = <&grf>;
+		status = "disabled";
+	};
+
 	cru: clock-controller@ff500000 {
 		compatible = "rockchip,rk3308-cru";
 		reg = <0x0 0xff500000 0x0 0x1000>;
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index d0901e610df3..09a805cc32d7 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -68,19 +68,13 @@ CFLAGS_aes-glue-ce.o	:= -DUSE_V8_CRYPTO_EXTENSIONS
 $(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
 	$(call if_changed_rule,cc_o_c)
 
-ifdef REGENERATE_ARM64_CRYPTO
 quiet_cmd_perlasm = PERLASM $@
       cmd_perlasm = $(PERL) $(<) void $(@)
 
-$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv8.pl
+$(obj)/%-core.S: $(src)/%-armv8.pl
 	$(call cmd,perlasm)
 
-$(src)/sha256-core.S_shipped: $(src)/sha512-armv8.pl
+$(obj)/sha256-core.S: $(src)/sha512-armv8.pl
 	$(call cmd,perlasm)
 
-$(src)/sha512-core.S_shipped: $(src)/sha512-armv8.pl
-	$(call cmd,perlasm)
-
-endif
-
 clean-files += poly1305-core.S sha256-core.S sha512-core.S
diff --git a/arch/arm64/crypto/poly1305-core.S_shipped b/arch/arm64/crypto/poly1305-core.S_shipped
deleted file mode 100644
index fb2822abf63a..000000000000
--- a/arch/arm64/crypto/poly1305-core.S_shipped
+++ /dev/null
@@ -1,835 +0,0 @@
-#ifndef __KERNEL__
-# include "arm_arch.h"
-.extern	OPENSSL_armcap_P
-#endif
-
-.text
-
-// forward "declarations" are required for Apple
-.globl	poly1305_blocks
-.globl	poly1305_emit
-
-.globl	poly1305_init
-.type	poly1305_init,%function
-.align	5
-poly1305_init:
-	cmp	x1,xzr
-	stp	xzr,xzr,[x0]		// zero hash value
-	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]
-
-	csel	x0,xzr,x0,eq
-	b.eq	.Lno_key
-
-#ifndef	__KERNEL__
-	adrp	x17,OPENSSL_armcap_P
-	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
-#endif
-
-	ldp	x7,x8,[x1]		// load key
-	mov	x9,#0xfffffffc0fffffff
-	movk	x9,#0x0fff,lsl#48
-#ifdef	__AARCH64EB__
-	rev	x7,x7			// flip bytes
-	rev	x8,x8
-#endif
-	and	x7,x7,x9		// &=0ffffffc0fffffff
-	and	x9,x9,#-4
-	and	x8,x8,x9		// &=0ffffffc0ffffffc
-	mov	w9,#-1
-	stp	x7,x8,[x0,#32]	// save key value
-	str	w9,[x0,#48]	// impossible key power value
-
-#ifndef	__KERNEL__
-	tst	w17,#ARMV7_NEON
-
-	adr	x12,.Lpoly1305_blocks
-	adr	x7,.Lpoly1305_blocks_neon
-	adr	x13,.Lpoly1305_emit
-
-	csel	x12,x12,x7,eq
-
-# ifdef	__ILP32__
-	stp	w12,w13,[x2]
-# else
-	stp	x12,x13,[x2]
-# endif
-#endif
-	mov	x0,#1
-.Lno_key:
-	ret
-.size	poly1305_init,.-poly1305_init
-
-.type	poly1305_blocks,%function
-.align	5
-poly1305_blocks:
-.Lpoly1305_blocks:
-	ands	x2,x2,#-16
-	b.eq	.Lno_data
-
-	ldp	x4,x5,[x0]		// load hash value
-	ldp	x6,x17,[x0,#16]	// [along with is_base2_26]
-	ldp	x7,x8,[x0,#32]	// load key value
-
-#ifdef	__AARCH64EB__
-	lsr	x12,x4,#32
-	mov	w13,w4
-	lsr	x14,x5,#32
-	mov	w15,w5
-	lsr	x16,x6,#32
-#else
-	mov	w12,w4
-	lsr	x13,x4,#32
-	mov	w14,w5
-	lsr	x15,x5,#32
-	mov	w16,w6
-#endif
-
-	add	x12,x12,x13,lsl#26	// base 2^26 -> base 2^64
-	lsr	x13,x14,#12
-	adds	x12,x12,x14,lsl#52
-	add	x13,x13,x15,lsl#14
-	adc	x13,x13,xzr
-	lsr	x14,x16,#24
-	adds	x13,x13,x16,lsl#40
-	adc	x14,x14,xzr
-
-	cmp	x17,#0			// is_base2_26?
-	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
-	csel	x4,x4,x12,eq		// choose between radixes
-	csel	x5,x5,x13,eq
-	csel	x6,x6,x14,eq
-
-.Loop:
-	ldp	x10,x11,[x1],#16	// load input
-	sub	x2,x2,#16
-#ifdef	__AARCH64EB__
-	rev	x10,x10
-	rev	x11,x11
-#endif
-	adds	x4,x4,x10		// accumulate input
-	adcs	x5,x5,x11
-
-	mul	x12,x4,x7		// h0*r0
-	adc	x6,x6,x3
-	umulh	x13,x4,x7
-
-	mul	x10,x5,x9		// h1*5*r1
-	umulh	x11,x5,x9
-
-	adds	x12,x12,x10
-	mul	x10,x4,x8		// h0*r1
-	adc	x13,x13,x11
-	umulh	x14,x4,x8
-
-	adds	x13,x13,x10
-	mul	x10,x5,x7		// h1*r0
-	adc	x14,x14,xzr
-	umulh	x11,x5,x7
-
-	adds	x13,x13,x10
-	mul	x10,x6,x9		// h2*5*r1
-	adc	x14,x14,x11
-	mul	x11,x6,x7		// h2*r0
-
-	adds	x13,x13,x10
-	adc	x14,x14,x11
-
-	and	x10,x14,#-4		// final reduction
-	and	x6,x14,#3
-	add	x10,x10,x14,lsr#2
-	adds	x4,x12,x10
-	adcs	x5,x13,xzr
-	adc	x6,x6,xzr
-
-	cbnz	x2,.Loop
-
-	stp	x4,x5,[x0]		// store hash value
-	stp	x6,xzr,[x0,#16]	// [and clear is_base2_26]
-
-.Lno_data:
-	ret
-.size	poly1305_blocks,.-poly1305_blocks
-
-.type	poly1305_emit,%function
-.align	5
-poly1305_emit:
-.Lpoly1305_emit:
-	ldp	x4,x5,[x0]		// load hash base 2^64
-	ldp	x6,x7,[x0,#16]	// [along with is_base2_26]
-	ldp	x10,x11,[x2]	// load nonce
-
-#ifdef	__AARCH64EB__
-	lsr	x12,x4,#32
-	mov	w13,w4
-	lsr	x14,x5,#32
-	mov	w15,w5
-	lsr	x16,x6,#32
-#else
-	mov	w12,w4
-	lsr	x13,x4,#32
-	mov	w14,w5
-	lsr	x15,x5,#32
-	mov	w16,w6
-#endif
-
-	add	x12,x12,x13,lsl#26	// base 2^26 -> base 2^64
-	lsr	x13,x14,#12
-	adds	x12,x12,x14,lsl#52
-	add	x13,x13,x15,lsl#14
-	adc	x13,x13,xzr
-	lsr	x14,x16,#24
-	adds	x13,x13,x16,lsl#40
-	adc	x14,x14,xzr
-
-	cmp	x7,#0			// is_base2_26?
-	csel	x4,x4,x12,eq		// choose between radixes
-	csel	x5,x5,x13,eq
-	csel	x6,x6,x14,eq
-
-	adds	x12,x4,#5		// compare to modulus
-	adcs	x13,x5,xzr
-	adc	x14,x6,xzr
-
-	tst	x14,#-4			// see if it's carried/borrowed
-
-	csel	x4,x4,x12,eq
-	csel	x5,x5,x13,eq
-
-#ifdef	__AARCH64EB__
-	ror	x10,x10,#32		// flip nonce words
-	ror	x11,x11,#32
-#endif
-	adds	x4,x4,x10		// accumulate nonce
-	adc	x5,x5,x11
-#ifdef	__AARCH64EB__
-	rev	x4,x4			// flip output bytes
-	rev	x5,x5
-#endif
-	stp	x4,x5,[x1]		// write result
-
-	ret
-.size	poly1305_emit,.-poly1305_emit
-.type	poly1305_mult,%function
-.align	5
-poly1305_mult:
-	mul	x12,x4,x7		// h0*r0
-	umulh	x13,x4,x7
-
-	mul	x10,x5,x9		// h1*5*r1
-	umulh	x11,x5,x9
-
-	adds	x12,x12,x10
-	mul	x10,x4,x8		// h0*r1
-	adc	x13,x13,x11
-	umulh	x14,x4,x8
-
-	adds	x13,x13,x10
-	mul	x10,x5,x7		// h1*r0
-	adc	x14,x14,xzr
-	umulh	x11,x5,x7
-
-	adds	x13,x13,x10
-	mul	x10,x6,x9		// h2*5*r1
-	adc	x14,x14,x11
-	mul	x11,x6,x7		// h2*r0
-
-	adds	x13,x13,x10
-	adc	x14,x14,x11
-
-	and	x10,x14,#-4		// final reduction
-	and	x6,x14,#3
-	add	x10,x10,x14,lsr#2
-	adds	x4,x12,x10
-	adcs	x5,x13,xzr
-	adc	x6,x6,xzr
-
-	ret
-.size	poly1305_mult,.-poly1305_mult
-
-.type	poly1305_splat,%function
-.align	4
-poly1305_splat:
-	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
-	ubfx	x13,x4,#26,#26
-	extr	x14,x5,x4,#52
-	and	x14,x14,#0x03ffffff
-	ubfx	x15,x5,#14,#26
-	extr	x16,x6,x5,#40
-
-	str	w12,[x0,#16*0]	// r0
-	add	w12,w13,w13,lsl#2	// r1*5
-	str	w13,[x0,#16*1]	// r1
-	add	w13,w14,w14,lsl#2	// r2*5
-	str	w12,[x0,#16*2]	// s1
-	str	w14,[x0,#16*3]	// r2
-	add	w14,w15,w15,lsl#2	// r3*5
-	str	w13,[x0,#16*4]	// s2
-	str	w15,[x0,#16*5]	// r3
-	add	w15,w16,w16,lsl#2	// r4*5
-	str	w14,[x0,#16*6]	// s3
-	str	w16,[x0,#16*7]	// r4
-	str	w15,[x0,#16*8]	// s4
-
-	ret
-.size	poly1305_splat,.-poly1305_splat
-
-#ifdef	__KERNEL__
-.globl	poly1305_blocks_neon
-#endif
-.type	poly1305_blocks_neon,%function
-.align	5
-poly1305_blocks_neon:
-.Lpoly1305_blocks_neon:
-	ldr	x17,[x0,#24]
-	cmp	x2,#128
-	b.lo	.Lpoly1305_blocks
-
-	.inst	0xd503233f		// paciasp
-	stp	x29,x30,[sp,#-80]!
-	add	x29,sp,#0
-
-	stp	d8,d9,[sp,#16]		// meet ABI requirements
-	stp	d10,d11,[sp,#32]
-	stp	d12,d13,[sp,#48]
-	stp	d14,d15,[sp,#64]
-
-	cbz	x17,.Lbase2_64_neon
-
-	ldp	w10,w11,[x0]		// load hash value base 2^26
-	ldp	w12,w13,[x0,#8]
-	ldr	w14,[x0,#16]
-
-	tst	x2,#31
-	b.eq	.Leven_neon
-
-	ldp	x7,x8,[x0,#32]	// load key value
-
-	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
-	lsr	x5,x12,#12
-	adds	x4,x4,x12,lsl#52
-	add	x5,x5,x13,lsl#14
-	adc	x5,x5,xzr
-	lsr	x6,x14,#24
-	adds	x5,x5,x14,lsl#40
-	adc	x14,x6,xzr		// can be partially reduced...
-
-	ldp	x12,x13,[x1],#16	// load input
-	sub	x2,x2,#16
-	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
-
-#ifdef	__AARCH64EB__
-	rev	x12,x12
-	rev	x13,x13
-#endif
-	adds	x4,x4,x12		// accumulate input
-	adcs	x5,x5,x13
-	adc	x6,x6,x3
-
-	bl	poly1305_mult
-
-	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
-	ubfx	x11,x4,#26,#26
-	extr	x12,x5,x4,#52
-	and	x12,x12,#0x03ffffff
-	ubfx	x13,x5,#14,#26
-	extr	x14,x6,x5,#40
-
-	b	.Leven_neon
-
-.align	4
-.Lbase2_64_neon:
-	ldp	x7,x8,[x0,#32]	// load key value
-
-	ldp	x4,x5,[x0]		// load hash value base 2^64
-	ldr	x6,[x0,#16]
-
-	tst	x2,#31
-	b.eq	.Linit_neon
-
-	ldp	x12,x13,[x1],#16	// load input
-	sub	x2,x2,#16
-	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
-#ifdef	__AARCH64EB__
-	rev	x12,x12
-	rev	x13,x13
-#endif
-	adds	x4,x4,x12		// accumulate input
-	adcs	x5,x5,x13
-	adc	x6,x6,x3
-
-	bl	poly1305_mult
-
-.Linit_neon:
-	ldr	w17,[x0,#48]		// first table element
-	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
-	ubfx	x11,x4,#26,#26
-	extr	x12,x5,x4,#52
-	and	x12,x12,#0x03ffffff
-	ubfx	x13,x5,#14,#26
-	extr	x14,x6,x5,#40
-
-	cmp	w17,#-1			// is value impossible?
-	b.ne	.Leven_neon
-
-	fmov	d24,x10
-	fmov	d25,x11
-	fmov	d26,x12
-	fmov	d27,x13
-	fmov	d28,x14
-
-	////////////////////////////////// initialize r^n table
-	mov	x4,x7			// r^1
-	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
-	mov	x5,x8
-	mov	x6,xzr
-	add	x0,x0,#48+12
-	bl	poly1305_splat
-
-	bl	poly1305_mult		// r^2
-	sub	x0,x0,#4
-	bl	poly1305_splat
-
-	bl	poly1305_mult		// r^3
-	sub	x0,x0,#4
-	bl	poly1305_splat
-
-	bl	poly1305_mult		// r^4
-	sub	x0,x0,#4
-	bl	poly1305_splat
-	sub	x0,x0,#48		// restore original x0
-	b	.Ldo_neon
-
-.align	4
-.Leven_neon:
-	fmov	d24,x10
-	fmov	d25,x11
-	fmov	d26,x12
-	fmov	d27,x13
-	fmov	d28,x14
-
-.Ldo_neon:
-	ldp	x8,x12,[x1,#32]	// inp[2:3]
-	subs	x2,x2,#64
-	ldp	x9,x13,[x1,#48]
-	add	x16,x1,#96
-	adr	x17,.Lzeros
-
-	lsl	x3,x3,#24
-	add	x15,x0,#48
-
-#ifdef	__AARCH64EB__
-	rev	x8,x8
-	rev	x12,x12
-	rev	x9,x9
-	rev	x13,x13
-#endif
-	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
-	and	x5,x9,#0x03ffffff
-	ubfx	x6,x8,#26,#26
-	ubfx	x7,x9,#26,#26
-	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
-	extr	x8,x12,x8,#52
-	extr	x9,x13,x9,#52
-	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
-	fmov	d14,x4
-	and	x8,x8,#0x03ffffff
-	and	x9,x9,#0x03ffffff
-	ubfx	x10,x12,#14,#26
-	ubfx	x11,x13,#14,#26
-	add	x12,x3,x12,lsr#40
-	add	x13,x3,x13,lsr#40
-	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
-	fmov	d15,x6
-	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
-	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
-	fmov	d16,x8
-	fmov	d17,x10
-	fmov	d18,x12
-
-	ldp	x8,x12,[x1],#16	// inp[0:1]
-	ldp	x9,x13,[x1],#48
-
-	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
-	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
-	ld1	{v8.4s},[x15]
-
-#ifdef	__AARCH64EB__
-	rev	x8,x8
-	rev	x12,x12
-	rev	x9,x9
-	rev	x13,x13
-#endif
-	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
-	and	x5,x9,#0x03ffffff
-	ubfx	x6,x8,#26,#26
-	ubfx	x7,x9,#26,#26
-	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
-	extr	x8,x12,x8,#52
-	extr	x9,x13,x9,#52
-	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
-	fmov	d9,x4
-	and	x8,x8,#0x03ffffff
-	and	x9,x9,#0x03ffffff
-	ubfx	x10,x12,#14,#26
-	ubfx	x11,x13,#14,#26
-	add	x12,x3,x12,lsr#40
-	add	x13,x3,x13,lsr#40
-	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
-	fmov	d10,x6
-	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
-	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
-	movi	v31.2d,#-1
-	fmov	d11,x8
-	fmov	d12,x10
-	fmov	d13,x12
-	ushr	v31.2d,v31.2d,#38
-
-	b.ls	.Lskip_loop
-
-.align	4
-.Loop_neon:
-	////////////////////////////////////////////////////////////////
-	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
-	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
-	//   ___________________/
-	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
-	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
-	//   ___________________/ ____________________/
-	//
-	// Note that we start with inp[2:3]*r^2. This is because it
-	// doesn't depend on reduction in previous iteration.
-	////////////////////////////////////////////////////////////////
-	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
-	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
-	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
-	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
-	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
-
-	subs	x2,x2,#64
-	umull	v23.2d,v14.2s,v7.s[2]
-	csel	x16,x17,x16,lo
-	umull	v22.2d,v14.2s,v5.s[2]
-	umull	v21.2d,v14.2s,v3.s[2]
-	 ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
-	umull	v20.2d,v14.2s,v1.s[2]
-	 ldp	x9,x13,[x16],#48
-	umull	v19.2d,v14.2s,v0.s[2]
-#ifdef	__AARCH64EB__
-	 rev	x8,x8
-	 rev	x12,x12
-	 rev	x9,x9
-	 rev	x13,x13
-#endif
-
-	umlal	v23.2d,v15.2s,v5.s[2]
-	 and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
-	umlal	v22.2d,v15.2s,v3.s[2]
-	 and	x5,x9,#0x03ffffff
-	umlal	v21.2d,v15.2s,v1.s[2]
-	 ubfx	x6,x8,#26,#26
-	umlal	v20.2d,v15.2s,v0.s[2]
-	 ubfx	x7,x9,#26,#26
-	umlal	v19.2d,v15.2s,v8.s[2]
-	 add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
-
-	umlal	v23.2d,v16.2s,v3.s[2]
-	 extr	x8,x12,x8,#52
-	umlal	v22.2d,v16.2s,v1.s[2]
-	 extr	x9,x13,x9,#52
-	umlal	v21.2d,v16.2s,v0.s[2]
-	 add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
-	umlal	v20.2d,v16.2s,v8.s[2]
-	 fmov	d14,x4
-	umlal	v19.2d,v16.2s,v6.s[2]
-	 and	x8,x8,#0x03ffffff
-
-	umlal	v23.2d,v17.2s,v1.s[2]
-	 and	x9,x9,#0x03ffffff
-	umlal	v22.2d,v17.2s,v0.s[2]
-	 ubfx	x10,x12,#14,#26
-	umlal	v21.2d,v17.2s,v8.s[2]
-	 ubfx	x11,x13,#14,#26
-	umlal	v20.2d,v17.2s,v6.s[2]
-	 add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
-	umlal	v19.2d,v17.2s,v4.s[2]
-	 fmov	d15,x6
-
-	add	v11.2s,v11.2s,v26.2s
-	 add	x12,x3,x12,lsr#40
-	umlal	v23.2d,v18.2s,v0.s[2]
-	 add	x13,x3,x13,lsr#40
-	umlal	v22.2d,v18.2s,v8.s[2]
-	 add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
-	umlal	v21.2d,v18.2s,v6.s[2]
-	 add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
-	umlal	v20.2d,v18.2s,v4.s[2]
-	 fmov	d16,x8
-	umlal	v19.2d,v18.2s,v2.s[2]
-	 fmov	d17,x10
-
-	////////////////////////////////////////////////////////////////
-	// (hash+inp[0:1])*r^4 and accumulate
-
-	add	v9.2s,v9.2s,v24.2s
-	 fmov	d18,x12
-	umlal	v22.2d,v11.2s,v1.s[0]
-	 ldp	x8,x12,[x1],#16	// inp[0:1]
-	umlal	v19.2d,v11.2s,v6.s[0]
-	 ldp	x9,x13,[x1],#48
-	umlal	v23.2d,v11.2s,v3.s[0]
-	umlal	v20.2d,v11.2s,v8.s[0]
-	umlal	v21.2d,v11.2s,v0.s[0]
-#ifdef	__AARCH64EB__
-	 rev	x8,x8
-	 rev	x12,x12
-	 rev	x9,x9
-	 rev	x13,x13
-#endif
-
-	add	v10.2s,v10.2s,v25.2s
-	umlal	v22.2d,v9.2s,v5.s[0]
-	umlal	v23.2d,v9.2s,v7.s[0]
-	 and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
-	umlal	v21.2d,v9.2s,v3.s[0]
-	 and	x5,x9,#0x03ffffff
-	umlal	v19.2d,v9.2s,v0.s[0]
-	 ubfx	x6,x8,#26,#26
-	umlal	v20.2d,v9.2s,v1.s[0]
-	 ubfx	x7,x9,#26,#26
-
-	add	v12.2s,v12.2s,v27.2s
-	 add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
-	umlal	v22.2d,v10.2s,v3.s[0]
-	 extr	x8,x12,x8,#52
-	umlal	v23.2d,v10.2s,v5.s[0]
-	 extr	x9,x13,x9,#52
-	umlal	v19.2d,v10.2s,v8.s[0]
-	 add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
-	umlal	v21.2d,v10.2s,v1.s[0]
-	 fmov	d9,x4
-	umlal	v20.2d,v10.2s,v0.s[0]
-	 and	x8,x8,#0x03ffffff
-
-	add	v13.2s,v13.2s,v28.2s
-	 and	x9,x9,#0x03ffffff
-	umlal	v22.2d,v12.2s,v0.s[0]
-	 ubfx	x10,x12,#14,#26
-	umlal	v19.2d,v12.2s,v4.s[0]
-	 ubfx	x11,x13,#14,#26
-	umlal	v23.2d,v12.2s,v1.s[0]
-	 add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
-	umlal	v20.2d,v12.2s,v6.s[0]
-	 fmov	d10,x6
-	umlal	v21.2d,v12.2s,v8.s[0]
-	 add	x12,x3,x12,lsr#40
-
-	umlal	v22.2d,v13.2s,v8.s[0]
-	 add	x13,x3,x13,lsr#40
-	umlal	v19.2d,v13.2s,v2.s[0]
-	 add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
-	umlal	v23.2d,v13.2s,v0.s[0]
-	 add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
-	umlal	v20.2d,v13.2s,v4.s[0]
-	 fmov	d11,x8
-	umlal	v21.2d,v13.2s,v6.s[0]
-	 fmov	d12,x10
-	 fmov	d13,x12
-
-	/////////////////////////////////////////////////////////////////
-	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
-	// and P. Schwabe
-	//
-	// [see discussion in poly1305-armv4 module]
-
-	ushr	v29.2d,v22.2d,#26
-	xtn	v27.2s,v22.2d
-	 ushr	v30.2d,v19.2d,#26
-	 and	v19.16b,v19.16b,v31.16b
-	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
-	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
-	 add	v20.2d,v20.2d,v30.2d	// h0 -> h1
-
-	ushr	v29.2d,v23.2d,#26
-	xtn	v28.2s,v23.2d
-	 ushr	v30.2d,v20.2d,#26
-	 xtn	v25.2s,v20.2d
-	bic	v28.2s,#0xfc,lsl#24
-	 add	v21.2d,v21.2d,v30.2d	// h1 -> h2
-
-	add	v19.2d,v19.2d,v29.2d
-	shl	v29.2d,v29.2d,#2
-	 shrn	v30.2s,v21.2d,#26
-	 xtn	v26.2s,v21.2d
-	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
-	 bic	v25.2s,#0xfc,lsl#24
-	 add	v27.2s,v27.2s,v30.2s		// h2 -> h3
-	 bic	v26.2s,#0xfc,lsl#24
-
-	shrn	v29.2s,v19.2d,#26
-	xtn	v24.2s,v19.2d
-	 ushr	v30.2s,v27.2s,#26
-	 bic	v27.2s,#0xfc,lsl#24
-	 bic	v24.2s,#0xfc,lsl#24
-	add	v25.2s,v25.2s,v29.2s		// h0 -> h1
-	 add	v28.2s,v28.2s,v30.2s		// h3 -> h4
-
-	b.hi	.Loop_neon
-
-.Lskip_loop:
-	dup	v16.2d,v16.d[0]
-	add	v11.2s,v11.2s,v26.2s
-
-	////////////////////////////////////////////////////////////////
-	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
-	adds	x2,x2,#32
-	b.ne	.Long_tail
-
-	dup	v16.2d,v11.d[0]
-	add	v14.2s,v9.2s,v24.2s
-	add	v17.2s,v12.2s,v27.2s
-	add	v15.2s,v10.2s,v25.2s
-	add	v18.2s,v13.2s,v28.2s
-
-.Long_tail:
-	dup	v14.2d,v14.d[0]
-	umull2	v19.2d,v16.4s,v6.4s
-	umull2	v22.2d,v16.4s,v1.4s
-	umull2	v23.2d,v16.4s,v3.4s
-	umull2	v21.2d,v16.4s,v0.4s
-	umull2	v20.2d,v16.4s,v8.4s
-
-	dup	v15.2d,v15.d[0]
-	umlal2	v19.2d,v14.4s,v0.4s
-	umlal2	v21.2d,v14.4s,v3.4s
-	umlal2	v22.2d,v14.4s,v5.4s
-	umlal2	v23.2d,v14.4s,v7.4s
-	umlal2	v20.2d,v14.4s,v1.4s
-
-	dup	v17.2d,v17.d[0]
-	umlal2	v19.2d,v15.4s,v8.4s
-	umlal2	v22.2d,v15.4s,v3.4s
-	umlal2	v21.2d,v15.4s,v1.4s
-	umlal2	v23.2d,v15.4s,v5.4s
-	umlal2	v20.2d,v15.4s,v0.4s
-
-	dup	v18.2d,v18.d[0]
-	umlal2	v22.2d,v17.4s,v0.4s
-	umlal2	v23.2d,v17.4s,v1.4s
-	umlal2	v19.2d,v17.4s,v4.4s
-	umlal2	v20.2d,v17.4s,v6.4s
-	umlal2	v21.2d,v17.4s,v8.4s
-
-	umlal2	v22.2d,v18.4s,v8.4s
-	umlal2	v19.2d,v18.4s,v2.4s
-	umlal2	v23.2d,v18.4s,v0.4s
-	umlal2	v20.2d,v18.4s,v4.4s
-	umlal2	v21.2d,v18.4s,v6.4s
-
-	b.eq	.Lshort_tail
-
-	////////////////////////////////////////////////////////////////
-	// (hash+inp[0:1])*r^4:r^3 and accumulate
-
-	add	v9.2s,v9.2s,v24.2s
-	umlal	v22.2d,v11.2s,v1.2s
-	umlal	v19.2d,v11.2s,v6.2s
-	umlal	v23.2d,v11.2s,v3.2s
-	umlal	v20.2d,v11.2s,v8.2s
-	umlal	v21.2d,v11.2s,v0.2s
-
-	add	v10.2s,v10.2s,v25.2s
-	umlal	v22.2d,v9.2s,v5.2s
-	umlal	v19.2d,v9.2s,v0.2s
-	umlal	v23.2d,v9.2s,v7.2s
-	umlal	v20.2d,v9.2s,v1.2s
-	umlal	v21.2d,v9.2s,v3.2s
-
-	add	v12.2s,v12.2s,v27.2s
-	umlal	v22.2d,v10.2s,v3.2s
-	umlal	v19.2d,v10.2s,v8.2s
-	umlal	v23.2d,v10.2s,v5.2s
-	umlal	v20.2d,v10.2s,v0.2s
-	umlal	v21.2d,v10.2s,v1.2s
-
-	add	v13.2s,v13.2s,v28.2s
-	umlal	v22.2d,v12.2s,v0.2s
-	umlal	v19.2d,v12.2s,v4.2s
-	umlal	v23.2d,v12.2s,v1.2s
-	umlal	v20.2d,v12.2s,v6.2s
-	umlal	v21.2d,v12.2s,v8.2s
-
-	umlal	v22.2d,v13.2s,v8.2s
-	umlal	v19.2d,v13.2s,v2.2s
-	umlal	v23.2d,v13.2s,v0.2s
-	umlal	v20.2d,v13.2s,v4.2s
-	umlal	v21.2d,v13.2s,v6.2s
-
-.Lshort_tail:
-	////////////////////////////////////////////////////////////////
-	// horizontal add
-
-	addp	v22.2d,v22.2d,v22.2d
-	 ldp	d8,d9,[sp,#16]		// meet ABI requirements
-	addp	v19.2d,v19.2d,v19.2d
-	 ldp	d10,d11,[sp,#32]
-	addp	v23.2d,v23.2d,v23.2d
-	 ldp	d12,d13,[sp,#48]
-	addp	v20.2d,v20.2d,v20.2d
-	 ldp	d14,d15,[sp,#64]
-	addp	v21.2d,v21.2d,v21.2d
-	 ldr	x30,[sp,#8]
-
-	////////////////////////////////////////////////////////////////
-	// lazy reduction, but without narrowing
-
-	ushr	v29.2d,v22.2d,#26
-	and	v22.16b,v22.16b,v31.16b
-	 ushr	v30.2d,v19.2d,#26
-	 and	v19.16b,v19.16b,v31.16b
-
-	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
-	 add	v20.2d,v20.2d,v30.2d	// h0 -> h1
-
-	ushr	v29.2d,v23.2d,#26
-	and	v23.16b,v23.16b,v31.16b
-	 ushr	v30.2d,v20.2d,#26
-	 and	v20.16b,v20.16b,v31.16b
-	 add	v21.2d,v21.2d,v30.2d	// h1 -> h2
-
-	add	v19.2d,v19.2d,v29.2d
-	shl	v29.2d,v29.2d,#2
-	 ushr	v30.2d,v21.2d,#26
-	 and	v21.16b,v21.16b,v31.16b
-	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
-	 add	v22.2d,v22.2d,v30.2d	// h2 -> h3
-
-	ushr	v29.2d,v19.2d,#26
-	and	v19.16b,v19.16b,v31.16b
-	 ushr	v30.2d,v22.2d,#26
-	 and	v22.16b,v22.16b,v31.16b
-	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
-	 add	v23.2d,v23.2d,v30.2d	// h3 -> h4
-
-	////////////////////////////////////////////////////////////////
-	// write the result, can be partially reduced
-
-	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
-	mov	x4,#1
-	st1	{v23.s}[0],[x0]
-	str	x4,[x0,#8]		// set is_base2_26
-
-	ldr	x29,[sp],#80
-	 .inst	0xd50323bf		// autiasp
-	ret
-.size	poly1305_blocks_neon,.-poly1305_blocks_neon
-
-.align	5
-.Lzeros:
-.long	0,0,0,0,0,0,0,0
-.asciz	"Poly1305 for ARMv8, CRYPTOGAMS by @dot-asm"
-.align	2
-#if !defined(__KERNEL__) && !defined(_WIN64)
-.comm	OPENSSL_armcap_P,4,4
-.hidden	OPENSSL_armcap_P
-#endif
diff --git a/arch/arm64/crypto/sha256-core.S_shipped b/arch/arm64/crypto/sha256-core.S_shipped
deleted file mode 100644
index 7c7ce2e3bad6..000000000000
--- a/arch/arm64/crypto/sha256-core.S_shipped
+++ /dev/null
@@ -1,2069 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-// This code is taken from the OpenSSL project but the author (Andy Polyakov)
-// has relicensed it under the GPLv2. Therefore this program is free software;
-// you can redistribute it and/or modify it under the terms of the GNU General
-// Public License version 2 as published by the Free Software Foundation.
-//
-// The original headers, including the original license headers, are
-// included below for completeness.
-
-// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
-//
-// Licensed under the OpenSSL license (the "License").  You may not use
-// this file except in compliance with the License.  You can obtain a copy
-// in the file LICENSE in the source distribution or at
-// https://www.openssl.org/source/license.html
-
-// ====================================================================
-// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-// project. The module is, however, dual licensed under OpenSSL and
-// CRYPTOGAMS licenses depending on where you obtain it. For further
-// details see http://www.openssl.org/~appro/cryptogams/.
-// ====================================================================
-//
-// SHA256/512 for ARMv8.
-//
-// Performance in cycles per processed byte and improvement coefficient
-// over code generated with "default" compiler:
-//
-//		SHA256-hw	SHA256(*)	SHA512
-// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
-// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
-// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
-// Denver	2.01		10.5 (+26%)	6.70 (+8%)
-// X-Gene			20.0 (+100%)	12.8 (+300%(***))
-// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
-//
-// (*)	Software SHA256 results are of lesser relevance, presented
-//	mostly for informational purposes.
-// (**)	The result is a trade-off: it's possible to improve it by
-//	10% (or by 1 cycle per round), but at the cost of 20% loss
-//	on Cortex-A53 (or by 4 cycles per round).
-// (***)	Super-impressive coefficients over gcc-generated code are
-//	indication of some compiler "pathology", most notably code
-//	generated with -mgeneral-regs-only is significanty faster
-//	and the gap is only 40-90%.
-//
-// October 2016.
-//
-// Originally it was reckoned that it makes no sense to implement NEON
-// version of SHA256 for 64-bit processors. This is because performance
-// improvement on most wide-spread Cortex-A5x processors was observed
-// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
-// observed that 32-bit NEON SHA256 performs significantly better than
-// 64-bit scalar version on *some* of the more recent processors. As
-// result 64-bit NEON version of SHA256 was added to provide best
-// all-round performance. For example it executes ~30% faster on X-Gene
-// and Mongoose. [For reference, NEON version of SHA512 is bound to
-// deliver much less improvement, likely *negative* on Cortex-A5x.
-// Which is why NEON support is limited to SHA256.]
-
-#ifndef	__KERNEL__
-# include "arm_arch.h"
-#endif
-
-.text
-
-.extern	OPENSSL_armcap_P
-.globl	sha256_block_data_order
-.type	sha256_block_data_order,%function
-.align	6
-sha256_block_data_order:
-#ifndef	__KERNEL__
-# ifdef	__ILP32__
-	ldrsw	x16,.LOPENSSL_armcap_P
-# else
-	ldr	x16,.LOPENSSL_armcap_P
-# endif
-	adr	x17,.LOPENSSL_armcap_P
-	add	x16,x16,x17
-	ldr	w16,[x16]
-	tst	w16,#ARMV8_SHA256
-	b.ne	.Lv8_entry
-	tst	w16,#ARMV7_NEON
-	b.ne	.Lneon_entry
-#endif
-	stp	x29,x30,[sp,#-128]!
-	add	x29,sp,#0
-
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	stp	x27,x28,[sp,#80]
-	sub	sp,sp,#4*4
-
-	ldp	w20,w21,[x0]				// load context
-	ldp	w22,w23,[x0,#2*4]
-	ldp	w24,w25,[x0,#4*4]
-	add	x2,x1,x2,lsl#6	// end of input
-	ldp	w26,w27,[x0,#6*4]
-	adr	x30,.LK256
-	stp	x0,x2,[x29,#96]
-
-.Loop:
-	ldp	w3,w4,[x1],#2*4
-	ldr	w19,[x30],#4			// *K++
-	eor	w28,w21,w22				// magic seed
-	str	x1,[x29,#112]
-#ifndef	__AARCH64EB__
-	rev	w3,w3			// 0
-#endif
-	ror	w16,w24,#6
-	add	w27,w27,w19			// h+=K[i]
-	eor	w6,w24,w24,ror#14
-	and	w17,w25,w24
-	bic	w19,w26,w24
-	add	w27,w27,w3			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w20,w21			// a^b, b^c in next round
-	eor	w16,w16,w6,ror#11	// Sigma1(e)
-	ror	w6,w20,#2
-	add	w27,w27,w17			// h+=Ch(e,f,g)
-	eor	w17,w20,w20,ror#9
-	add	w27,w27,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w23,w23,w27			// d+=h
-	eor	w28,w28,w21			// Maj(a,b,c)
-	eor	w17,w6,w17,ror#13	// Sigma0(a)
-	add	w27,w27,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w27,w27,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w4,w4			// 1
-#endif
-	ldp	w5,w6,[x1],#2*4
-	add	w27,w27,w17			// h+=Sigma0(a)
-	ror	w16,w23,#6
-	add	w26,w26,w28			// h+=K[i]
-	eor	w7,w23,w23,ror#14
-	and	w17,w24,w23
-	bic	w28,w25,w23
-	add	w26,w26,w4			// h+=X[i]
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w27,w20			// a^b, b^c in next round
-	eor	w16,w16,w7,ror#11	// Sigma1(e)
-	ror	w7,w27,#2
-	add	w26,w26,w17			// h+=Ch(e,f,g)
-	eor	w17,w27,w27,ror#9
-	add	w26,w26,w16			// h+=Sigma1(e)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	add	w22,w22,w26			// d+=h
-	eor	w19,w19,w20			// Maj(a,b,c)
-	eor	w17,w7,w17,ror#13	// Sigma0(a)
-	add	w26,w26,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	//add	w26,w26,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w5,w5			// 2
-#endif
-	add	w26,w26,w17			// h+=Sigma0(a)
-	ror	w16,w22,#6
-	add	w25,w25,w19			// h+=K[i]
-	eor	w8,w22,w22,ror#14
-	and	w17,w23,w22
-	bic	w19,w24,w22
-	add	w25,w25,w5			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w26,w27			// a^b, b^c in next round
-	eor	w16,w16,w8,ror#11	// Sigma1(e)
-	ror	w8,w26,#2
-	add	w25,w25,w17			// h+=Ch(e,f,g)
-	eor	w17,w26,w26,ror#9
-	add	w25,w25,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w21,w21,w25			// d+=h
-	eor	w28,w28,w27			// Maj(a,b,c)
-	eor	w17,w8,w17,ror#13	// Sigma0(a)
-	add	w25,w25,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w25,w25,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w6,w6			// 3
-#endif
-	ldp	w7,w8,[x1],#2*4
-	add	w25,w25,w17			// h+=Sigma0(a)
-	ror	w16,w21,#6
-	add	w24,w24,w28			// h+=K[i]
-	eor	w9,w21,w21,ror#14
-	and	w17,w22,w21
-	bic	w28,w23,w21
-	add	w24,w24,w6			// h+=X[i]
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w25,w26			// a^b, b^c in next round
-	eor	w16,w16,w9,ror#11	// Sigma1(e)
-	ror	w9,w25,#2
-	add	w24,w24,w17			// h+=Ch(e,f,g)
-	eor	w17,w25,w25,ror#9
-	add	w24,w24,w16			// h+=Sigma1(e)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	add	w20,w20,w24			// d+=h
-	eor	w19,w19,w26			// Maj(a,b,c)
-	eor	w17,w9,w17,ror#13	// Sigma0(a)
-	add	w24,w24,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	//add	w24,w24,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w7,w7			// 4
-#endif
-	add	w24,w24,w17			// h+=Sigma0(a)
-	ror	w16,w20,#6
-	add	w23,w23,w19			// h+=K[i]
-	eor	w10,w20,w20,ror#14
-	and	w17,w21,w20
-	bic	w19,w22,w20
-	add	w23,w23,w7			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w24,w25			// a^b, b^c in next round
-	eor	w16,w16,w10,ror#11	// Sigma1(e)
-	ror	w10,w24,#2
-	add	w23,w23,w17			// h+=Ch(e,f,g)
-	eor	w17,w24,w24,ror#9
-	add	w23,w23,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w27,w27,w23			// d+=h
-	eor	w28,w28,w25			// Maj(a,b,c)
-	eor	w17,w10,w17,ror#13	// Sigma0(a)
-	add	w23,w23,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w23,w23,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w8,w8			// 5
-#endif
-	ldp	w9,w10,[x1],#2*4
-	add	w23,w23,w17			// h+=Sigma0(a)
-	ror	w16,w27,#6
-	add	w22,w22,w28			// h+=K[i]
-	eor	w11,w27,w27,ror#14
-	and	w17,w20,w27
-	bic	w28,w21,w27
-	add	w22,w22,w8			// h+=X[i]
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w23,w24			// a^b, b^c in next round
-	eor	w16,w16,w11,ror#11	// Sigma1(e)
-	ror	w11,w23,#2
-	add	w22,w22,w17			// h+=Ch(e,f,g)
-	eor	w17,w23,w23,ror#9
-	add	w22,w22,w16			// h+=Sigma1(e)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	add	w26,w26,w22			// d+=h
-	eor	w19,w19,w24			// Maj(a,b,c)
-	eor	w17,w11,w17,ror#13	// Sigma0(a)
-	add	w22,w22,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	//add	w22,w22,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w9,w9			// 6
-#endif
-	add	w22,w22,w17			// h+=Sigma0(a)
-	ror	w16,w26,#6
-	add	w21,w21,w19			// h+=K[i]
-	eor	w12,w26,w26,ror#14
-	and	w17,w27,w26
-	bic	w19,w20,w26
-	add	w21,w21,w9			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w22,w23			// a^b, b^c in next round
-	eor	w16,w16,w12,ror#11	// Sigma1(e)
-	ror	w12,w22,#2
-	add	w21,w21,w17			// h+=Ch(e,f,g)
-	eor	w17,w22,w22,ror#9
-	add	w21,w21,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w25,w25,w21			// d+=h
-	eor	w28,w28,w23			// Maj(a,b,c)
-	eor	w17,w12,w17,ror#13	// Sigma0(a)
-	add	w21,w21,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w21,w21,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w10,w10			// 7
-#endif
-	ldp	w11,w12,[x1],#2*4
-	add	w21,w21,w17			// h+=Sigma0(a)
-	ror	w16,w25,#6
-	add	w20,w20,w28			// h+=K[i]
-	eor	w13,w25,w25,ror#14
-	and	w17,w26,w25
-	bic	w28,w27,w25
-	add	w20,w20,w10			// h+=X[i]
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w21,w22			// a^b, b^c in next round
-	eor	w16,w16,w13,ror#11	// Sigma1(e)
-	ror	w13,w21,#2
-	add	w20,w20,w17			// h+=Ch(e,f,g)
-	eor	w17,w21,w21,ror#9
-	add	w20,w20,w16			// h+=Sigma1(e)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	add	w24,w24,w20			// d+=h
-	eor	w19,w19,w22			// Maj(a,b,c)
-	eor	w17,w13,w17,ror#13	// Sigma0(a)
-	add	w20,w20,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	//add	w20,w20,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w11,w11			// 8
-#endif
-	add	w20,w20,w17			// h+=Sigma0(a)
-	ror	w16,w24,#6
-	add	w27,w27,w19			// h+=K[i]
-	eor	w14,w24,w24,ror#14
-	and	w17,w25,w24
-	bic	w19,w26,w24
-	add	w27,w27,w11			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w20,w21			// a^b, b^c in next round
-	eor	w16,w16,w14,ror#11	// Sigma1(e)
-	ror	w14,w20,#2
-	add	w27,w27,w17			// h+=Ch(e,f,g)
-	eor	w17,w20,w20,ror#9
-	add	w27,w27,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w23,w23,w27			// d+=h
-	eor	w28,w28,w21			// Maj(a,b,c)
-	eor	w17,w14,w17,ror#13	// Sigma0(a)
-	add	w27,w27,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w27,w27,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w12,w12			// 9
-#endif
-	ldp	w13,w14,[x1],#2*4
-	add	w27,w27,w17			// h+=Sigma0(a)
-	ror	w16,w23,#6
-	add	w26,w26,w28			// h+=K[i]
-	eor	w15,w23,w23,ror#14
-	and	w17,w24,w23
-	bic	w28,w25,w23
-	add	w26,w26,w12			// h+=X[i]
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w27,w20			// a^b, b^c in next round
-	eor	w16,w16,w15,ror#11	// Sigma1(e)
-	ror	w15,w27,#2
-	add	w26,w26,w17			// h+=Ch(e,f,g)
-	eor	w17,w27,w27,ror#9
-	add	w26,w26,w16			// h+=Sigma1(e)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	add	w22,w22,w26			// d+=h
-	eor	w19,w19,w20			// Maj(a,b,c)
-	eor	w17,w15,w17,ror#13	// Sigma0(a)
-	add	w26,w26,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	//add	w26,w26,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w13,w13			// 10
-#endif
-	add	w26,w26,w17			// h+=Sigma0(a)
-	ror	w16,w22,#6
-	add	w25,w25,w19			// h+=K[i]
-	eor	w0,w22,w22,ror#14
-	and	w17,w23,w22
-	bic	w19,w24,w22
-	add	w25,w25,w13			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w26,w27			// a^b, b^c in next round
-	eor	w16,w16,w0,ror#11	// Sigma1(e)
-	ror	w0,w26,#2
-	add	w25,w25,w17			// h+=Ch(e,f,g)
-	eor	w17,w26,w26,ror#9
-	add	w25,w25,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w21,w21,w25			// d+=h
-	eor	w28,w28,w27			// Maj(a,b,c)
-	eor	w17,w0,w17,ror#13	// Sigma0(a)
-	add	w25,w25,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w25,w25,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w14,w14			// 11
-#endif
-	ldp	w15,w0,[x1],#2*4
-	add	w25,w25,w17			// h+=Sigma0(a)
-	str	w6,[sp,#12]
-	ror	w16,w21,#6
-	add	w24,w24,w28			// h+=K[i]
-	eor	w6,w21,w21,ror#14
-	and	w17,w22,w21
-	bic	w28,w23,w21
-	add	w24,w24,w14			// h+=X[i]
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w25,w26			// a^b, b^c in next round
-	eor	w16,w16,w6,ror#11	// Sigma1(e)
-	ror	w6,w25,#2
-	add	w24,w24,w17			// h+=Ch(e,f,g)
-	eor	w17,w25,w25,ror#9
-	add	w24,w24,w16			// h+=Sigma1(e)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	add	w20,w20,w24			// d+=h
-	eor	w19,w19,w26			// Maj(a,b,c)
-	eor	w17,w6,w17,ror#13	// Sigma0(a)
-	add	w24,w24,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	//add	w24,w24,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w15,w15			// 12
-#endif
-	add	w24,w24,w17			// h+=Sigma0(a)
-	str	w7,[sp,#0]
-	ror	w16,w20,#6
-	add	w23,w23,w19			// h+=K[i]
-	eor	w7,w20,w20,ror#14
-	and	w17,w21,w20
-	bic	w19,w22,w20
-	add	w23,w23,w15			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w24,w25			// a^b, b^c in next round
-	eor	w16,w16,w7,ror#11	// Sigma1(e)
-	ror	w7,w24,#2
-	add	w23,w23,w17			// h+=Ch(e,f,g)
-	eor	w17,w24,w24,ror#9
-	add	w23,w23,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w27,w27,w23			// d+=h
-	eor	w28,w28,w25			// Maj(a,b,c)
-	eor	w17,w7,w17,ror#13	// Sigma0(a)
-	add	w23,w23,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w23,w23,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w0,w0			// 13
-#endif
-	ldp	w1,w2,[x1]
-	add	w23,w23,w17			// h+=Sigma0(a)
-	str	w8,[sp,#4]
-	ror	w16,w27,#6
-	add	w22,w22,w28			// h+=K[i]
-	eor	w8,w27,w27,ror#14
-	and	w17,w20,w27
-	bic	w28,w21,w27
-	add	w22,w22,w0			// h+=X[i]
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w23,w24			// a^b, b^c in next round
-	eor	w16,w16,w8,ror#11	// Sigma1(e)
-	ror	w8,w23,#2
-	add	w22,w22,w17			// h+=Ch(e,f,g)
-	eor	w17,w23,w23,ror#9
-	add	w22,w22,w16			// h+=Sigma1(e)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	add	w26,w26,w22			// d+=h
-	eor	w19,w19,w24			// Maj(a,b,c)
-	eor	w17,w8,w17,ror#13	// Sigma0(a)
-	add	w22,w22,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	//add	w22,w22,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w1,w1			// 14
-#endif
-	ldr	w6,[sp,#12]
-	add	w22,w22,w17			// h+=Sigma0(a)
-	str	w9,[sp,#8]
-	ror	w16,w26,#6
-	add	w21,w21,w19			// h+=K[i]
-	eor	w9,w26,w26,ror#14
-	and	w17,w27,w26
-	bic	w19,w20,w26
-	add	w21,w21,w1			// h+=X[i]
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w22,w23			// a^b, b^c in next round
-	eor	w16,w16,w9,ror#11	// Sigma1(e)
-	ror	w9,w22,#2
-	add	w21,w21,w17			// h+=Ch(e,f,g)
-	eor	w17,w22,w22,ror#9
-	add	w21,w21,w16			// h+=Sigma1(e)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	add	w25,w25,w21			// d+=h
-	eor	w28,w28,w23			// Maj(a,b,c)
-	eor	w17,w9,w17,ror#13	// Sigma0(a)
-	add	w21,w21,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	//add	w21,w21,w17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	w2,w2			// 15
-#endif
-	ldr	w7,[sp,#0]
-	add	w21,w21,w17			// h+=Sigma0(a)
-	str	w10,[sp,#12]
-	ror	w16,w25,#6
-	add	w20,w20,w28			// h+=K[i]
-	ror	w9,w4,#7
-	and	w17,w26,w25
-	ror	w8,w1,#17
-	bic	w28,w27,w25
-	ror	w10,w21,#2
-	add	w20,w20,w2			// h+=X[i]
-	eor	w16,w16,w25,ror#11
-	eor	w9,w9,w4,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w21,w22			// a^b, b^c in next round
-	eor	w16,w16,w25,ror#25	// Sigma1(e)
-	eor	w10,w10,w21,ror#13
-	add	w20,w20,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w8,w8,w1,ror#19
-	eor	w9,w9,w4,lsr#3	// sigma0(X[i+1])
-	add	w20,w20,w16			// h+=Sigma1(e)
-	eor	w19,w19,w22			// Maj(a,b,c)
-	eor	w17,w10,w21,ror#22	// Sigma0(a)
-	eor	w8,w8,w1,lsr#10	// sigma1(X[i+14])
-	add	w3,w3,w12
-	add	w24,w24,w20			// d+=h
-	add	w20,w20,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w3,w3,w9
-	add	w20,w20,w17			// h+=Sigma0(a)
-	add	w3,w3,w8
-.Loop_16_xx:
-	ldr	w8,[sp,#4]
-	str	w11,[sp,#0]
-	ror	w16,w24,#6
-	add	w27,w27,w19			// h+=K[i]
-	ror	w10,w5,#7
-	and	w17,w25,w24
-	ror	w9,w2,#17
-	bic	w19,w26,w24
-	ror	w11,w20,#2
-	add	w27,w27,w3			// h+=X[i]
-	eor	w16,w16,w24,ror#11
-	eor	w10,w10,w5,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w20,w21			// a^b, b^c in next round
-	eor	w16,w16,w24,ror#25	// Sigma1(e)
-	eor	w11,w11,w20,ror#13
-	add	w27,w27,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w9,w9,w2,ror#19
-	eor	w10,w10,w5,lsr#3	// sigma0(X[i+1])
-	add	w27,w27,w16			// h+=Sigma1(e)
-	eor	w28,w28,w21			// Maj(a,b,c)
-	eor	w17,w11,w20,ror#22	// Sigma0(a)
-	eor	w9,w9,w2,lsr#10	// sigma1(X[i+14])
-	add	w4,w4,w13
-	add	w23,w23,w27			// d+=h
-	add	w27,w27,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w4,w4,w10
-	add	w27,w27,w17			// h+=Sigma0(a)
-	add	w4,w4,w9
-	ldr	w9,[sp,#8]
-	str	w12,[sp,#4]
-	ror	w16,w23,#6
-	add	w26,w26,w28			// h+=K[i]
-	ror	w11,w6,#7
-	and	w17,w24,w23
-	ror	w10,w3,#17
-	bic	w28,w25,w23
-	ror	w12,w27,#2
-	add	w26,w26,w4			// h+=X[i]
-	eor	w16,w16,w23,ror#11
-	eor	w11,w11,w6,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w27,w20			// a^b, b^c in next round
-	eor	w16,w16,w23,ror#25	// Sigma1(e)
-	eor	w12,w12,w27,ror#13
-	add	w26,w26,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w10,w10,w3,ror#19
-	eor	w11,w11,w6,lsr#3	// sigma0(X[i+1])
-	add	w26,w26,w16			// h+=Sigma1(e)
-	eor	w19,w19,w20			// Maj(a,b,c)
-	eor	w17,w12,w27,ror#22	// Sigma0(a)
-	eor	w10,w10,w3,lsr#10	// sigma1(X[i+14])
-	add	w5,w5,w14
-	add	w22,w22,w26			// d+=h
-	add	w26,w26,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w5,w5,w11
-	add	w26,w26,w17			// h+=Sigma0(a)
-	add	w5,w5,w10
-	ldr	w10,[sp,#12]
-	str	w13,[sp,#8]
-	ror	w16,w22,#6
-	add	w25,w25,w19			// h+=K[i]
-	ror	w12,w7,#7
-	and	w17,w23,w22
-	ror	w11,w4,#17
-	bic	w19,w24,w22
-	ror	w13,w26,#2
-	add	w25,w25,w5			// h+=X[i]
-	eor	w16,w16,w22,ror#11
-	eor	w12,w12,w7,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w26,w27			// a^b, b^c in next round
-	eor	w16,w16,w22,ror#25	// Sigma1(e)
-	eor	w13,w13,w26,ror#13
-	add	w25,w25,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w11,w11,w4,ror#19
-	eor	w12,w12,w7,lsr#3	// sigma0(X[i+1])
-	add	w25,w25,w16			// h+=Sigma1(e)
-	eor	w28,w28,w27			// Maj(a,b,c)
-	eor	w17,w13,w26,ror#22	// Sigma0(a)
-	eor	w11,w11,w4,lsr#10	// sigma1(X[i+14])
-	add	w6,w6,w15
-	add	w21,w21,w25			// d+=h
-	add	w25,w25,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w6,w6,w12
-	add	w25,w25,w17			// h+=Sigma0(a)
-	add	w6,w6,w11
-	ldr	w11,[sp,#0]
-	str	w14,[sp,#12]
-	ror	w16,w21,#6
-	add	w24,w24,w28			// h+=K[i]
-	ror	w13,w8,#7
-	and	w17,w22,w21
-	ror	w12,w5,#17
-	bic	w28,w23,w21
-	ror	w14,w25,#2
-	add	w24,w24,w6			// h+=X[i]
-	eor	w16,w16,w21,ror#11
-	eor	w13,w13,w8,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w25,w26			// a^b, b^c in next round
-	eor	w16,w16,w21,ror#25	// Sigma1(e)
-	eor	w14,w14,w25,ror#13
-	add	w24,w24,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w12,w12,w5,ror#19
-	eor	w13,w13,w8,lsr#3	// sigma0(X[i+1])
-	add	w24,w24,w16			// h+=Sigma1(e)
-	eor	w19,w19,w26			// Maj(a,b,c)
-	eor	w17,w14,w25,ror#22	// Sigma0(a)
-	eor	w12,w12,w5,lsr#10	// sigma1(X[i+14])
-	add	w7,w7,w0
-	add	w20,w20,w24			// d+=h
-	add	w24,w24,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w7,w7,w13
-	add	w24,w24,w17			// h+=Sigma0(a)
-	add	w7,w7,w12
-	ldr	w12,[sp,#4]
-	str	w15,[sp,#0]
-	ror	w16,w20,#6
-	add	w23,w23,w19			// h+=K[i]
-	ror	w14,w9,#7
-	and	w17,w21,w20
-	ror	w13,w6,#17
-	bic	w19,w22,w20
-	ror	w15,w24,#2
-	add	w23,w23,w7			// h+=X[i]
-	eor	w16,w16,w20,ror#11
-	eor	w14,w14,w9,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w24,w25			// a^b, b^c in next round
-	eor	w16,w16,w20,ror#25	// Sigma1(e)
-	eor	w15,w15,w24,ror#13
-	add	w23,w23,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w13,w13,w6,ror#19
-	eor	w14,w14,w9,lsr#3	// sigma0(X[i+1])
-	add	w23,w23,w16			// h+=Sigma1(e)
-	eor	w28,w28,w25			// Maj(a,b,c)
-	eor	w17,w15,w24,ror#22	// Sigma0(a)
-	eor	w13,w13,w6,lsr#10	// sigma1(X[i+14])
-	add	w8,w8,w1
-	add	w27,w27,w23			// d+=h
-	add	w23,w23,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w8,w8,w14
-	add	w23,w23,w17			// h+=Sigma0(a)
-	add	w8,w8,w13
-	ldr	w13,[sp,#8]
-	str	w0,[sp,#4]
-	ror	w16,w27,#6
-	add	w22,w22,w28			// h+=K[i]
-	ror	w15,w10,#7
-	and	w17,w20,w27
-	ror	w14,w7,#17
-	bic	w28,w21,w27
-	ror	w0,w23,#2
-	add	w22,w22,w8			// h+=X[i]
-	eor	w16,w16,w27,ror#11
-	eor	w15,w15,w10,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w23,w24			// a^b, b^c in next round
-	eor	w16,w16,w27,ror#25	// Sigma1(e)
-	eor	w0,w0,w23,ror#13
-	add	w22,w22,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w14,w14,w7,ror#19
-	eor	w15,w15,w10,lsr#3	// sigma0(X[i+1])
-	add	w22,w22,w16			// h+=Sigma1(e)
-	eor	w19,w19,w24			// Maj(a,b,c)
-	eor	w17,w0,w23,ror#22	// Sigma0(a)
-	eor	w14,w14,w7,lsr#10	// sigma1(X[i+14])
-	add	w9,w9,w2
-	add	w26,w26,w22			// d+=h
-	add	w22,w22,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w9,w9,w15
-	add	w22,w22,w17			// h+=Sigma0(a)
-	add	w9,w9,w14
-	ldr	w14,[sp,#12]
-	str	w1,[sp,#8]
-	ror	w16,w26,#6
-	add	w21,w21,w19			// h+=K[i]
-	ror	w0,w11,#7
-	and	w17,w27,w26
-	ror	w15,w8,#17
-	bic	w19,w20,w26
-	ror	w1,w22,#2
-	add	w21,w21,w9			// h+=X[i]
-	eor	w16,w16,w26,ror#11
-	eor	w0,w0,w11,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w22,w23			// a^b, b^c in next round
-	eor	w16,w16,w26,ror#25	// Sigma1(e)
-	eor	w1,w1,w22,ror#13
-	add	w21,w21,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w15,w15,w8,ror#19
-	eor	w0,w0,w11,lsr#3	// sigma0(X[i+1])
-	add	w21,w21,w16			// h+=Sigma1(e)
-	eor	w28,w28,w23			// Maj(a,b,c)
-	eor	w17,w1,w22,ror#22	// Sigma0(a)
-	eor	w15,w15,w8,lsr#10	// sigma1(X[i+14])
-	add	w10,w10,w3
-	add	w25,w25,w21			// d+=h
-	add	w21,w21,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w10,w10,w0
-	add	w21,w21,w17			// h+=Sigma0(a)
-	add	w10,w10,w15
-	ldr	w15,[sp,#0]
-	str	w2,[sp,#12]
-	ror	w16,w25,#6
-	add	w20,w20,w28			// h+=K[i]
-	ror	w1,w12,#7
-	and	w17,w26,w25
-	ror	w0,w9,#17
-	bic	w28,w27,w25
-	ror	w2,w21,#2
-	add	w20,w20,w10			// h+=X[i]
-	eor	w16,w16,w25,ror#11
-	eor	w1,w1,w12,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w21,w22			// a^b, b^c in next round
-	eor	w16,w16,w25,ror#25	// Sigma1(e)
-	eor	w2,w2,w21,ror#13
-	add	w20,w20,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w0,w0,w9,ror#19
-	eor	w1,w1,w12,lsr#3	// sigma0(X[i+1])
-	add	w20,w20,w16			// h+=Sigma1(e)
-	eor	w19,w19,w22			// Maj(a,b,c)
-	eor	w17,w2,w21,ror#22	// Sigma0(a)
-	eor	w0,w0,w9,lsr#10	// sigma1(X[i+14])
-	add	w11,w11,w4
-	add	w24,w24,w20			// d+=h
-	add	w20,w20,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w11,w11,w1
-	add	w20,w20,w17			// h+=Sigma0(a)
-	add	w11,w11,w0
-	ldr	w0,[sp,#4]
-	str	w3,[sp,#0]
-	ror	w16,w24,#6
-	add	w27,w27,w19			// h+=K[i]
-	ror	w2,w13,#7
-	and	w17,w25,w24
-	ror	w1,w10,#17
-	bic	w19,w26,w24
-	ror	w3,w20,#2
-	add	w27,w27,w11			// h+=X[i]
-	eor	w16,w16,w24,ror#11
-	eor	w2,w2,w13,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w20,w21			// a^b, b^c in next round
-	eor	w16,w16,w24,ror#25	// Sigma1(e)
-	eor	w3,w3,w20,ror#13
-	add	w27,w27,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w1,w1,w10,ror#19
-	eor	w2,w2,w13,lsr#3	// sigma0(X[i+1])
-	add	w27,w27,w16			// h+=Sigma1(e)
-	eor	w28,w28,w21			// Maj(a,b,c)
-	eor	w17,w3,w20,ror#22	// Sigma0(a)
-	eor	w1,w1,w10,lsr#10	// sigma1(X[i+14])
-	add	w12,w12,w5
-	add	w23,w23,w27			// d+=h
-	add	w27,w27,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w12,w12,w2
-	add	w27,w27,w17			// h+=Sigma0(a)
-	add	w12,w12,w1
-	ldr	w1,[sp,#8]
-	str	w4,[sp,#4]
-	ror	w16,w23,#6
-	add	w26,w26,w28			// h+=K[i]
-	ror	w3,w14,#7
-	and	w17,w24,w23
-	ror	w2,w11,#17
-	bic	w28,w25,w23
-	ror	w4,w27,#2
-	add	w26,w26,w12			// h+=X[i]
-	eor	w16,w16,w23,ror#11
-	eor	w3,w3,w14,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w27,w20			// a^b, b^c in next round
-	eor	w16,w16,w23,ror#25	// Sigma1(e)
-	eor	w4,w4,w27,ror#13
-	add	w26,w26,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w2,w2,w11,ror#19
-	eor	w3,w3,w14,lsr#3	// sigma0(X[i+1])
-	add	w26,w26,w16			// h+=Sigma1(e)
-	eor	w19,w19,w20			// Maj(a,b,c)
-	eor	w17,w4,w27,ror#22	// Sigma0(a)
-	eor	w2,w2,w11,lsr#10	// sigma1(X[i+14])
-	add	w13,w13,w6
-	add	w22,w22,w26			// d+=h
-	add	w26,w26,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w13,w13,w3
-	add	w26,w26,w17			// h+=Sigma0(a)
-	add	w13,w13,w2
-	ldr	w2,[sp,#12]
-	str	w5,[sp,#8]
-	ror	w16,w22,#6
-	add	w25,w25,w19			// h+=K[i]
-	ror	w4,w15,#7
-	and	w17,w23,w22
-	ror	w3,w12,#17
-	bic	w19,w24,w22
-	ror	w5,w26,#2
-	add	w25,w25,w13			// h+=X[i]
-	eor	w16,w16,w22,ror#11
-	eor	w4,w4,w15,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w26,w27			// a^b, b^c in next round
-	eor	w16,w16,w22,ror#25	// Sigma1(e)
-	eor	w5,w5,w26,ror#13
-	add	w25,w25,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w3,w3,w12,ror#19
-	eor	w4,w4,w15,lsr#3	// sigma0(X[i+1])
-	add	w25,w25,w16			// h+=Sigma1(e)
-	eor	w28,w28,w27			// Maj(a,b,c)
-	eor	w17,w5,w26,ror#22	// Sigma0(a)
-	eor	w3,w3,w12,lsr#10	// sigma1(X[i+14])
-	add	w14,w14,w7
-	add	w21,w21,w25			// d+=h
-	add	w25,w25,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w14,w14,w4
-	add	w25,w25,w17			// h+=Sigma0(a)
-	add	w14,w14,w3
-	ldr	w3,[sp,#0]
-	str	w6,[sp,#12]
-	ror	w16,w21,#6
-	add	w24,w24,w28			// h+=K[i]
-	ror	w5,w0,#7
-	and	w17,w22,w21
-	ror	w4,w13,#17
-	bic	w28,w23,w21
-	ror	w6,w25,#2
-	add	w24,w24,w14			// h+=X[i]
-	eor	w16,w16,w21,ror#11
-	eor	w5,w5,w0,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w25,w26			// a^b, b^c in next round
-	eor	w16,w16,w21,ror#25	// Sigma1(e)
-	eor	w6,w6,w25,ror#13
-	add	w24,w24,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w4,w4,w13,ror#19
-	eor	w5,w5,w0,lsr#3	// sigma0(X[i+1])
-	add	w24,w24,w16			// h+=Sigma1(e)
-	eor	w19,w19,w26			// Maj(a,b,c)
-	eor	w17,w6,w25,ror#22	// Sigma0(a)
-	eor	w4,w4,w13,lsr#10	// sigma1(X[i+14])
-	add	w15,w15,w8
-	add	w20,w20,w24			// d+=h
-	add	w24,w24,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w15,w15,w5
-	add	w24,w24,w17			// h+=Sigma0(a)
-	add	w15,w15,w4
-	ldr	w4,[sp,#4]
-	str	w7,[sp,#0]
-	ror	w16,w20,#6
-	add	w23,w23,w19			// h+=K[i]
-	ror	w6,w1,#7
-	and	w17,w21,w20
-	ror	w5,w14,#17
-	bic	w19,w22,w20
-	ror	w7,w24,#2
-	add	w23,w23,w15			// h+=X[i]
-	eor	w16,w16,w20,ror#11
-	eor	w6,w6,w1,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w24,w25			// a^b, b^c in next round
-	eor	w16,w16,w20,ror#25	// Sigma1(e)
-	eor	w7,w7,w24,ror#13
-	add	w23,w23,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w5,w5,w14,ror#19
-	eor	w6,w6,w1,lsr#3	// sigma0(X[i+1])
-	add	w23,w23,w16			// h+=Sigma1(e)
-	eor	w28,w28,w25			// Maj(a,b,c)
-	eor	w17,w7,w24,ror#22	// Sigma0(a)
-	eor	w5,w5,w14,lsr#10	// sigma1(X[i+14])
-	add	w0,w0,w9
-	add	w27,w27,w23			// d+=h
-	add	w23,w23,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w0,w0,w6
-	add	w23,w23,w17			// h+=Sigma0(a)
-	add	w0,w0,w5
-	ldr	w5,[sp,#8]
-	str	w8,[sp,#4]
-	ror	w16,w27,#6
-	add	w22,w22,w28			// h+=K[i]
-	ror	w7,w2,#7
-	and	w17,w20,w27
-	ror	w6,w15,#17
-	bic	w28,w21,w27
-	ror	w8,w23,#2
-	add	w22,w22,w0			// h+=X[i]
-	eor	w16,w16,w27,ror#11
-	eor	w7,w7,w2,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w23,w24			// a^b, b^c in next round
-	eor	w16,w16,w27,ror#25	// Sigma1(e)
-	eor	w8,w8,w23,ror#13
-	add	w22,w22,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w6,w6,w15,ror#19
-	eor	w7,w7,w2,lsr#3	// sigma0(X[i+1])
-	add	w22,w22,w16			// h+=Sigma1(e)
-	eor	w19,w19,w24			// Maj(a,b,c)
-	eor	w17,w8,w23,ror#22	// Sigma0(a)
-	eor	w6,w6,w15,lsr#10	// sigma1(X[i+14])
-	add	w1,w1,w10
-	add	w26,w26,w22			// d+=h
-	add	w22,w22,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w1,w1,w7
-	add	w22,w22,w17			// h+=Sigma0(a)
-	add	w1,w1,w6
-	ldr	w6,[sp,#12]
-	str	w9,[sp,#8]
-	ror	w16,w26,#6
-	add	w21,w21,w19			// h+=K[i]
-	ror	w8,w3,#7
-	and	w17,w27,w26
-	ror	w7,w0,#17
-	bic	w19,w20,w26
-	ror	w9,w22,#2
-	add	w21,w21,w1			// h+=X[i]
-	eor	w16,w16,w26,ror#11
-	eor	w8,w8,w3,ror#18
-	orr	w17,w17,w19			// Ch(e,f,g)
-	eor	w19,w22,w23			// a^b, b^c in next round
-	eor	w16,w16,w26,ror#25	// Sigma1(e)
-	eor	w9,w9,w22,ror#13
-	add	w21,w21,w17			// h+=Ch(e,f,g)
-	and	w28,w28,w19			// (b^c)&=(a^b)
-	eor	w7,w7,w0,ror#19
-	eor	w8,w8,w3,lsr#3	// sigma0(X[i+1])
-	add	w21,w21,w16			// h+=Sigma1(e)
-	eor	w28,w28,w23			// Maj(a,b,c)
-	eor	w17,w9,w22,ror#22	// Sigma0(a)
-	eor	w7,w7,w0,lsr#10	// sigma1(X[i+14])
-	add	w2,w2,w11
-	add	w25,w25,w21			// d+=h
-	add	w21,w21,w28			// h+=Maj(a,b,c)
-	ldr	w28,[x30],#4		// *K++, w19 in next round
-	add	w2,w2,w8
-	add	w21,w21,w17			// h+=Sigma0(a)
-	add	w2,w2,w7
-	ldr	w7,[sp,#0]
-	str	w10,[sp,#12]
-	ror	w16,w25,#6
-	add	w20,w20,w28			// h+=K[i]
-	ror	w9,w4,#7
-	and	w17,w26,w25
-	ror	w8,w1,#17
-	bic	w28,w27,w25
-	ror	w10,w21,#2
-	add	w20,w20,w2			// h+=X[i]
-	eor	w16,w16,w25,ror#11
-	eor	w9,w9,w4,ror#18
-	orr	w17,w17,w28			// Ch(e,f,g)
-	eor	w28,w21,w22			// a^b, b^c in next round
-	eor	w16,w16,w25,ror#25	// Sigma1(e)
-	eor	w10,w10,w21,ror#13
-	add	w20,w20,w17			// h+=Ch(e,f,g)
-	and	w19,w19,w28			// (b^c)&=(a^b)
-	eor	w8,w8,w1,ror#19
-	eor	w9,w9,w4,lsr#3	// sigma0(X[i+1])
-	add	w20,w20,w16			// h+=Sigma1(e)
-	eor	w19,w19,w22			// Maj(a,b,c)
-	eor	w17,w10,w21,ror#22	// Sigma0(a)
-	eor	w8,w8,w1,lsr#10	// sigma1(X[i+14])
-	add	w3,w3,w12
-	add	w24,w24,w20			// d+=h
-	add	w20,w20,w19			// h+=Maj(a,b,c)
-	ldr	w19,[x30],#4		// *K++, w28 in next round
-	add	w3,w3,w9
-	add	w20,w20,w17			// h+=Sigma0(a)
-	add	w3,w3,w8
-	cbnz	w19,.Loop_16_xx
-
-	ldp	x0,x2,[x29,#96]
-	ldr	x1,[x29,#112]
-	sub	x30,x30,#260		// rewind
-
-	ldp	w3,w4,[x0]
-	ldp	w5,w6,[x0,#2*4]
-	add	x1,x1,#14*4			// advance input pointer
-	ldp	w7,w8,[x0,#4*4]
-	add	w20,w20,w3
-	ldp	w9,w10,[x0,#6*4]
-	add	w21,w21,w4
-	add	w22,w22,w5
-	add	w23,w23,w6
-	stp	w20,w21,[x0]
-	add	w24,w24,w7
-	add	w25,w25,w8
-	stp	w22,w23,[x0,#2*4]
-	add	w26,w26,w9
-	add	w27,w27,w10
-	cmp	x1,x2
-	stp	w24,w25,[x0,#4*4]
-	stp	w26,w27,[x0,#6*4]
-	b.ne	.Loop
-
-	ldp	x19,x20,[x29,#16]
-	add	sp,sp,#4*4
-	ldp	x21,x22,[x29,#32]
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	ldp	x29,x30,[sp],#128
-	ret
-.size	sha256_block_data_order,.-sha256_block_data_order
-
-.align	6
-.type	.LK256,%object
-.LK256:
-	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-	.long	0	//terminator
-.size	.LK256,.-.LK256
-#ifndef	__KERNEL__
-.align	3
-.LOPENSSL_armcap_P:
-# ifdef	__ILP32__
-	.long	OPENSSL_armcap_P-.
-# else
-	.quad	OPENSSL_armcap_P-.
-# endif
-#endif
-.asciz	"SHA256 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
-.align	2
-#ifndef	__KERNEL__
-.type	sha256_block_armv8,%function
-.align	6
-sha256_block_armv8:
-.Lv8_entry:
-	stp		x29,x30,[sp,#-16]!
-	add		x29,sp,#0
-
-	ld1		{v0.4s,v1.4s},[x0]
-	adr		x3,.LK256
-
-.Loop_hw:
-	ld1		{v4.16b-v7.16b},[x1],#64
-	sub		x2,x2,#1
-	ld1		{v16.4s},[x3],#16
-	rev32		v4.16b,v4.16b
-	rev32		v5.16b,v5.16b
-	rev32		v6.16b,v6.16b
-	rev32		v7.16b,v7.16b
-	orr		v18.16b,v0.16b,v0.16b		// offload
-	orr		v19.16b,v1.16b,v1.16b
-	ld1		{v17.4s},[x3],#16
-	add		v16.4s,v16.4s,v4.4s
-	.inst	0x5e2828a4	//sha256su0 v4.16b,v5.16b
-	orr		v2.16b,v0.16b,v0.16b
-	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-	.inst	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
-	ld1		{v16.4s},[x3],#16
-	add		v17.4s,v17.4s,v5.4s
-	.inst	0x5e2828c5	//sha256su0 v5.16b,v6.16b
-	orr		v2.16b,v0.16b,v0.16b
-	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-	.inst	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
-	ld1		{v17.4s},[x3],#16
-	add		v16.4s,v16.4s,v6.4s
-	.inst	0x5e2828e6	//sha256su0 v6.16b,v7.16b
-	orr		v2.16b,v0.16b,v0.16b
-	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-	.inst	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
-	ld1		{v16.4s},[x3],#16
-	add		v17.4s,v17.4s,v7.4s
-	.inst	0x5e282887	//sha256su0 v7.16b,v4.16b
-	orr		v2.16b,v0.16b,v0.16b
-	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-	.inst	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
-	ld1		{v17.4s},[x3],#16
-	add		v16.4s,v16.4s,v4.4s
-	.inst	0x5e2828a4	//sha256su0 v4.16b,v5.16b
-	orr		v2.16b,v0.16b,v0.16b
-	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-	.inst	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
-	ld1		{v16.4s},[x3],#16
-	add		v17.4s,v17.4s,v5.4s
-	.inst	0x5e2828c5	//sha256su0 v5.16b,v6.16b
-	orr		v2.16b,v0.16b,v0.16b
-	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-	.inst	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
-	ld1		{v17.4s},[x3],#16
-	add		v16.4s,v16.4s,v6.4s
-	.inst	0x5e2828e6	//sha256su0 v6.16b,v7.16b
-	orr		v2.16b,v0.16b,v0.16b
-	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-	.inst	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
-	ld1		{v16.4s},[x3],#16
-	add		v17.4s,v17.4s,v7.4s
-	.inst	0x5e282887	//sha256su0 v7.16b,v4.16b
-	orr		v2.16b,v0.16b,v0.16b
-	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-	.inst	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
-	ld1		{v17.4s},[x3],#16
-	add		v16.4s,v16.4s,v4.4s
-	.inst	0x5e2828a4	//sha256su0 v4.16b,v5.16b
-	orr		v2.16b,v0.16b,v0.16b
-	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-	.inst	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
-	ld1		{v16.4s},[x3],#16
-	add		v17.4s,v17.4s,v5.4s
-	.inst	0x5e2828c5	//sha256su0 v5.16b,v6.16b
-	orr		v2.16b,v0.16b,v0.16b
-	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-	.inst	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
-	ld1		{v17.4s},[x3],#16
-	add		v16.4s,v16.4s,v6.4s
-	.inst	0x5e2828e6	//sha256su0 v6.16b,v7.16b
-	orr		v2.16b,v0.16b,v0.16b
-	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-	.inst	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
-	ld1		{v16.4s},[x3],#16
-	add		v17.4s,v17.4s,v7.4s
-	.inst	0x5e282887	//sha256su0 v7.16b,v4.16b
-	orr		v2.16b,v0.16b,v0.16b
-	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-	.inst	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
-	ld1		{v17.4s},[x3],#16
-	add		v16.4s,v16.4s,v4.4s
-	orr		v2.16b,v0.16b,v0.16b
-	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-
-	ld1		{v16.4s},[x3],#16
-	add		v17.4s,v17.4s,v5.4s
-	orr		v2.16b,v0.16b,v0.16b
-	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-
-	ld1		{v17.4s},[x3]
-	add		v16.4s,v16.4s,v6.4s
-	sub		x3,x3,#64*4-16	// rewind
-	orr		v2.16b,v0.16b,v0.16b
-	.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
-	.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
-
-	add		v17.4s,v17.4s,v7.4s
-	orr		v2.16b,v0.16b,v0.16b
-	.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
-	.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
-
-	add		v0.4s,v0.4s,v18.4s
-	add		v1.4s,v1.4s,v19.4s
-
-	cbnz		x2,.Loop_hw
-
-	st1		{v0.4s,v1.4s},[x0]
-
-	ldr		x29,[sp],#16
-	ret
-.size	sha256_block_armv8,.-sha256_block_armv8
-#endif
-#ifdef	__KERNEL__
-.globl	sha256_block_neon
-#endif
-.type	sha256_block_neon,%function
-.align	4
-sha256_block_neon:
-.Lneon_entry:
-	stp	x29, x30, [sp, #-16]!
-	mov	x29, sp
-	sub	sp,sp,#16*4
-
-	adr	x16,.LK256
-	add	x2,x1,x2,lsl#6	// len to point at the end of inp
-
-	ld1	{v0.16b},[x1], #16
-	ld1	{v1.16b},[x1], #16
-	ld1	{v2.16b},[x1], #16
-	ld1	{v3.16b},[x1], #16
-	ld1	{v4.4s},[x16], #16
-	ld1	{v5.4s},[x16], #16
-	ld1	{v6.4s},[x16], #16
-	ld1	{v7.4s},[x16], #16
-	rev32	v0.16b,v0.16b		// yes, even on
-	rev32	v1.16b,v1.16b		// big-endian
-	rev32	v2.16b,v2.16b
-	rev32	v3.16b,v3.16b
-	mov	x17,sp
-	add	v4.4s,v4.4s,v0.4s
-	add	v5.4s,v5.4s,v1.4s
-	add	v6.4s,v6.4s,v2.4s
-	st1	{v4.4s-v5.4s},[x17], #32
-	add	v7.4s,v7.4s,v3.4s
-	st1	{v6.4s-v7.4s},[x17]
-	sub	x17,x17,#32
-
-	ldp	w3,w4,[x0]
-	ldp	w5,w6,[x0,#8]
-	ldp	w7,w8,[x0,#16]
-	ldp	w9,w10,[x0,#24]
-	ldr	w12,[sp,#0]
-	mov	w13,wzr
-	eor	w14,w4,w5
-	mov	w15,wzr
-	b	.L_00_48
-
-.align	4
-.L_00_48:
-	ext	v4.16b,v0.16b,v1.16b,#4
-	add	w10,w10,w12
-	add	w3,w3,w15
-	and	w12,w8,w7
-	bic	w15,w9,w7
-	ext	v7.16b,v2.16b,v3.16b,#4
-	eor	w11,w7,w7,ror#5
-	add	w3,w3,w13
-	mov	d19,v3.d[1]
-	orr	w12,w12,w15
-	eor	w11,w11,w7,ror#19
-	ushr	v6.4s,v4.4s,#7
-	eor	w15,w3,w3,ror#11
-	ushr	v5.4s,v4.4s,#3
-	add	w10,w10,w12
-	add	v0.4s,v0.4s,v7.4s
-	ror	w11,w11,#6
-	sli	v6.4s,v4.4s,#25
-	eor	w13,w3,w4
-	eor	w15,w15,w3,ror#20
-	ushr	v7.4s,v4.4s,#18
-	add	w10,w10,w11
-	ldr	w12,[sp,#4]
-	and	w14,w14,w13
-	eor	v5.16b,v5.16b,v6.16b
-	ror	w15,w15,#2
-	add	w6,w6,w10
-	sli	v7.4s,v4.4s,#14
-	eor	w14,w14,w4
-	ushr	v16.4s,v19.4s,#17
-	add	w9,w9,w12
-	add	w10,w10,w15
-	and	w12,w7,w6
-	eor	v5.16b,v5.16b,v7.16b
-	bic	w15,w8,w6
-	eor	w11,w6,w6,ror#5
-	sli	v16.4s,v19.4s,#15
-	add	w10,w10,w14
-	orr	w12,w12,w15
-	ushr	v17.4s,v19.4s,#10
-	eor	w11,w11,w6,ror#19
-	eor	w15,w10,w10,ror#11
-	ushr	v7.4s,v19.4s,#19
-	add	w9,w9,w12
-	ror	w11,w11,#6
-	add	v0.4s,v0.4s,v5.4s
-	eor	w14,w10,w3
-	eor	w15,w15,w10,ror#20
-	sli	v7.4s,v19.4s,#13
-	add	w9,w9,w11
-	ldr	w12,[sp,#8]
-	and	w13,w13,w14
-	eor	v17.16b,v17.16b,v16.16b
-	ror	w15,w15,#2
-	add	w5,w5,w9
-	eor	w13,w13,w3
-	eor	v17.16b,v17.16b,v7.16b
-	add	w8,w8,w12
-	add	w9,w9,w15
-	and	w12,w6,w5
-	add	v0.4s,v0.4s,v17.4s
-	bic	w15,w7,w5
-	eor	w11,w5,w5,ror#5
-	add	w9,w9,w13
-	ushr	v18.4s,v0.4s,#17
-	orr	w12,w12,w15
-	ushr	v19.4s,v0.4s,#10
-	eor	w11,w11,w5,ror#19
-	eor	w15,w9,w9,ror#11
-	sli	v18.4s,v0.4s,#15
-	add	w8,w8,w12
-	ushr	v17.4s,v0.4s,#19
-	ror	w11,w11,#6
-	eor	w13,w9,w10
-	eor	v19.16b,v19.16b,v18.16b
-	eor	w15,w15,w9,ror#20
-	add	w8,w8,w11
-	sli	v17.4s,v0.4s,#13
-	ldr	w12,[sp,#12]
-	and	w14,w14,w13
-	ror	w15,w15,#2
-	ld1	{v4.4s},[x16], #16
-	add	w4,w4,w8
-	eor	v19.16b,v19.16b,v17.16b
-	eor	w14,w14,w10
-	eor	v17.16b,v17.16b,v17.16b
-	add	w7,w7,w12
-	add	w8,w8,w15
-	and	w12,w5,w4
-	mov	v17.d[1],v19.d[0]
-	bic	w15,w6,w4
-	eor	w11,w4,w4,ror#5
-	add	w8,w8,w14
-	add	v0.4s,v0.4s,v17.4s
-	orr	w12,w12,w15
-	eor	w11,w11,w4,ror#19
-	eor	w15,w8,w8,ror#11
-	add	v4.4s,v4.4s,v0.4s
-	add	w7,w7,w12
-	ror	w11,w11,#6
-	eor	w14,w8,w9
-	eor	w15,w15,w8,ror#20
-	add	w7,w7,w11
-	ldr	w12,[sp,#16]
-	and	w13,w13,w14
-	ror	w15,w15,#2
-	add	w3,w3,w7
-	eor	w13,w13,w9
-	st1	{v4.4s},[x17], #16
-	ext	v4.16b,v1.16b,v2.16b,#4
-	add	w6,w6,w12
-	add	w7,w7,w15
-	and	w12,w4,w3
-	bic	w15,w5,w3
-	ext	v7.16b,v3.16b,v0.16b,#4
-	eor	w11,w3,w3,ror#5
-	add	w7,w7,w13
-	mov	d19,v0.d[1]
-	orr	w12,w12,w15
-	eor	w11,w11,w3,ror#19
-	ushr	v6.4s,v4.4s,#7
-	eor	w15,w7,w7,ror#11
-	ushr	v5.4s,v4.4s,#3
-	add	w6,w6,w12
-	add	v1.4s,v1.4s,v7.4s
-	ror	w11,w11,#6
-	sli	v6.4s,v4.4s,#25
-	eor	w13,w7,w8
-	eor	w15,w15,w7,ror#20
-	ushr	v7.4s,v4.4s,#18
-	add	w6,w6,w11
-	ldr	w12,[sp,#20]
-	and	w14,w14,w13
-	eor	v5.16b,v5.16b,v6.16b
-	ror	w15,w15,#2
-	add	w10,w10,w6
-	sli	v7.4s,v4.4s,#14
-	eor	w14,w14,w8
-	ushr	v16.4s,v19.4s,#17
-	add	w5,w5,w12
-	add	w6,w6,w15
-	and	w12,w3,w10
-	eor	v5.16b,v5.16b,v7.16b
-	bic	w15,w4,w10
-	eor	w11,w10,w10,ror#5
-	sli	v16.4s,v19.4s,#15
-	add	w6,w6,w14
-	orr	w12,w12,w15
-	ushr	v17.4s,v19.4s,#10
-	eor	w11,w11,w10,ror#19
-	eor	w15,w6,w6,ror#11
-	ushr	v7.4s,v19.4s,#19
-	add	w5,w5,w12
-	ror	w11,w11,#6
-	add	v1.4s,v1.4s,v5.4s
-	eor	w14,w6,w7
-	eor	w15,w15,w6,ror#20
-	sli	v7.4s,v19.4s,#13
-	add	w5,w5,w11
-	ldr	w12,[sp,#24]
-	and	w13,w13,w14
-	eor	v17.16b,v17.16b,v16.16b
-	ror	w15,w15,#2
-	add	w9,w9,w5
-	eor	w13,w13,w7
-	eor	v17.16b,v17.16b,v7.16b
-	add	w4,w4,w12
-	add	w5,w5,w15
-	and	w12,w10,w9
-	add	v1.4s,v1.4s,v17.4s
-	bic	w15,w3,w9
-	eor	w11,w9,w9,ror#5
-	add	w5,w5,w13
-	ushr	v18.4s,v1.4s,#17
-	orr	w12,w12,w15
-	ushr	v19.4s,v1.4s,#10
-	eor	w11,w11,w9,ror#19
-	eor	w15,w5,w5,ror#11
-	sli	v18.4s,v1.4s,#15
-	add	w4,w4,w12
-	ushr	v17.4s,v1.4s,#19
-	ror	w11,w11,#6
-	eor	w13,w5,w6
-	eor	v19.16b,v19.16b,v18.16b
-	eor	w15,w15,w5,ror#20
-	add	w4,w4,w11
-	sli	v17.4s,v1.4s,#13
-	ldr	w12,[sp,#28]
-	and	w14,w14,w13
-	ror	w15,w15,#2
-	ld1	{v4.4s},[x16], #16
-	add	w8,w8,w4
-	eor	v19.16b,v19.16b,v17.16b
-	eor	w14,w14,w6
-	eor	v17.16b,v17.16b,v17.16b
-	add	w3,w3,w12
-	add	w4,w4,w15
-	and	w12,w9,w8
-	mov	v17.d[1],v19.d[0]
-	bic	w15,w10,w8
-	eor	w11,w8,w8,ror#5
-	add	w4,w4,w14
-	add	v1.4s,v1.4s,v17.4s
-	orr	w12,w12,w15
-	eor	w11,w11,w8,ror#19
-	eor	w15,w4,w4,ror#11
-	add	v4.4s,v4.4s,v1.4s
-	add	w3,w3,w12
-	ror	w11,w11,#6
-	eor	w14,w4,w5
-	eor	w15,w15,w4,ror#20
-	add	w3,w3,w11
-	ldr	w12,[sp,#32]
-	and	w13,w13,w14
-	ror	w15,w15,#2
-	add	w7,w7,w3
-	eor	w13,w13,w5
-	st1	{v4.4s},[x17], #16
-	ext	v4.16b,v2.16b,v3.16b,#4
-	add	w10,w10,w12
-	add	w3,w3,w15
-	and	w12,w8,w7
-	bic	w15,w9,w7
-	ext	v7.16b,v0.16b,v1.16b,#4
-	eor	w11,w7,w7,ror#5
-	add	w3,w3,w13
-	mov	d19,v1.d[1]
-	orr	w12,w12,w15
-	eor	w11,w11,w7,ror#19
-	ushr	v6.4s,v4.4s,#7
-	eor	w15,w3,w3,ror#11
-	ushr	v5.4s,v4.4s,#3
-	add	w10,w10,w12
-	add	v2.4s,v2.4s,v7.4s
-	ror	w11,w11,#6
-	sli	v6.4s,v4.4s,#25
-	eor	w13,w3,w4
-	eor	w15,w15,w3,ror#20
-	ushr	v7.4s,v4.4s,#18
-	add	w10,w10,w11
-	ldr	w12,[sp,#36]
-	and	w14,w14,w13
-	eor	v5.16b,v5.16b,v6.16b
-	ror	w15,w15,#2
-	add	w6,w6,w10
-	sli	v7.4s,v4.4s,#14
-	eor	w14,w14,w4
-	ushr	v16.4s,v19.4s,#17
-	add	w9,w9,w12
-	add	w10,w10,w15
-	and	w12,w7,w6
-	eor	v5.16b,v5.16b,v7.16b
-	bic	w15,w8,w6
-	eor	w11,w6,w6,ror#5
-	sli	v16.4s,v19.4s,#15
-	add	w10,w10,w14
-	orr	w12,w12,w15
-	ushr	v17.4s,v19.4s,#10
-	eor	w11,w11,w6,ror#19
-	eor	w15,w10,w10,ror#11
-	ushr	v7.4s,v19.4s,#19
-	add	w9,w9,w12
-	ror	w11,w11,#6
-	add	v2.4s,v2.4s,v5.4s
-	eor	w14,w10,w3
-	eor	w15,w15,w10,ror#20
-	sli	v7.4s,v19.4s,#13
-	add	w9,w9,w11
-	ldr	w12,[sp,#40]
-	and	w13,w13,w14
-	eor	v17.16b,v17.16b,v16.16b
-	ror	w15,w15,#2
-	add	w5,w5,w9
-	eor	w13,w13,w3
-	eor	v17.16b,v17.16b,v7.16b
-	add	w8,w8,w12
-	add	w9,w9,w15
-	and	w12,w6,w5
-	add	v2.4s,v2.4s,v17.4s
-	bic	w15,w7,w5
-	eor	w11,w5,w5,ror#5
-	add	w9,w9,w13
-	ushr	v18.4s,v2.4s,#17
-	orr	w12,w12,w15
-	ushr	v19.4s,v2.4s,#10
-	eor	w11,w11,w5,ror#19
-	eor	w15,w9,w9,ror#11
-	sli	v18.4s,v2.4s,#15
-	add	w8,w8,w12
-	ushr	v17.4s,v2.4s,#19
-	ror	w11,w11,#6
-	eor	w13,w9,w10
-	eor	v19.16b,v19.16b,v18.16b
-	eor	w15,w15,w9,ror#20
-	add	w8,w8,w11
-	sli	v17.4s,v2.4s,#13
-	ldr	w12,[sp,#44]
-	and	w14,w14,w13
-	ror	w15,w15,#2
-	ld1	{v4.4s},[x16], #16
-	add	w4,w4,w8
-	eor	v19.16b,v19.16b,v17.16b
-	eor	w14,w14,w10
-	eor	v17.16b,v17.16b,v17.16b
-	add	w7,w7,w12
-	add	w8,w8,w15
-	and	w12,w5,w4
-	mov	v17.d[1],v19.d[0]
-	bic	w15,w6,w4
-	eor	w11,w4,w4,ror#5
-	add	w8,w8,w14
-	add	v2.4s,v2.4s,v17.4s
-	orr	w12,w12,w15
-	eor	w11,w11,w4,ror#19
-	eor	w15,w8,w8,ror#11
-	add	v4.4s,v4.4s,v2.4s
-	add	w7,w7,w12
-	ror	w11,w11,#6
-	eor	w14,w8,w9
-	eor	w15,w15,w8,ror#20
-	add	w7,w7,w11
-	ldr	w12,[sp,#48]
-	and	w13,w13,w14
-	ror	w15,w15,#2
-	add	w3,w3,w7
-	eor	w13,w13,w9
-	st1	{v4.4s},[x17], #16
-	ext	v4.16b,v3.16b,v0.16b,#4
-	add	w6,w6,w12
-	add	w7,w7,w15
-	and	w12,w4,w3
-	bic	w15,w5,w3
-	ext	v7.16b,v1.16b,v2.16b,#4
-	eor	w11,w3,w3,ror#5
-	add	w7,w7,w13
-	mov	d19,v2.d[1]
-	orr	w12,w12,w15
-	eor	w11,w11,w3,ror#19
-	ushr	v6.4s,v4.4s,#7
-	eor	w15,w7,w7,ror#11
-	ushr	v5.4s,v4.4s,#3
-	add	w6,w6,w12
-	add	v3.4s,v3.4s,v7.4s
-	ror	w11,w11,#6
-	sli	v6.4s,v4.4s,#25
-	eor	w13,w7,w8
-	eor	w15,w15,w7,ror#20
-	ushr	v7.4s,v4.4s,#18
-	add	w6,w6,w11
-	ldr	w12,[sp,#52]
-	and	w14,w14,w13
-	eor	v5.16b,v5.16b,v6.16b
-	ror	w15,w15,#2
-	add	w10,w10,w6
-	sli	v7.4s,v4.4s,#14
-	eor	w14,w14,w8
-	ushr	v16.4s,v19.4s,#17
-	add	w5,w5,w12
-	add	w6,w6,w15
-	and	w12,w3,w10
-	eor	v5.16b,v5.16b,v7.16b
-	bic	w15,w4,w10
-	eor	w11,w10,w10,ror#5
-	sli	v16.4s,v19.4s,#15
-	add	w6,w6,w14
-	orr	w12,w12,w15
-	ushr	v17.4s,v19.4s,#10
-	eor	w11,w11,w10,ror#19
-	eor	w15,w6,w6,ror#11
-	ushr	v7.4s,v19.4s,#19
-	add	w5,w5,w12
-	ror	w11,w11,#6
-	add	v3.4s,v3.4s,v5.4s
-	eor	w14,w6,w7
-	eor	w15,w15,w6,ror#20
-	sli	v7.4s,v19.4s,#13
-	add	w5,w5,w11
-	ldr	w12,[sp,#56]
-	and	w13,w13,w14
-	eor	v17.16b,v17.16b,v16.16b
-	ror	w15,w15,#2
-	add	w9,w9,w5
-	eor	w13,w13,w7
-	eor	v17.16b,v17.16b,v7.16b
-	add	w4,w4,w12
-	add	w5,w5,w15
-	and	w12,w10,w9
-	add	v3.4s,v3.4s,v17.4s
-	bic	w15,w3,w9
-	eor	w11,w9,w9,ror#5
-	add	w5,w5,w13
-	ushr	v18.4s,v3.4s,#17
-	orr	w12,w12,w15
-	ushr	v19.4s,v3.4s,#10
-	eor	w11,w11,w9,ror#19
-	eor	w15,w5,w5,ror#11
-	sli	v18.4s,v3.4s,#15
-	add	w4,w4,w12
-	ushr	v17.4s,v3.4s,#19
-	ror	w11,w11,#6
-	eor	w13,w5,w6
-	eor	v19.16b,v19.16b,v18.16b
-	eor	w15,w15,w5,ror#20
-	add	w4,w4,w11
-	sli	v17.4s,v3.4s,#13
-	ldr	w12,[sp,#60]
-	and	w14,w14,w13
-	ror	w15,w15,#2
-	ld1	{v4.4s},[x16], #16
-	add	w8,w8,w4
-	eor	v19.16b,v19.16b,v17.16b
-	eor	w14,w14,w6
-	eor	v17.16b,v17.16b,v17.16b
-	add	w3,w3,w12
-	add	w4,w4,w15
-	and	w12,w9,w8
-	mov	v17.d[1],v19.d[0]
-	bic	w15,w10,w8
-	eor	w11,w8,w8,ror#5
-	add	w4,w4,w14
-	add	v3.4s,v3.4s,v17.4s
-	orr	w12,w12,w15
-	eor	w11,w11,w8,ror#19
-	eor	w15,w4,w4,ror#11
-	add	v4.4s,v4.4s,v3.4s
-	add	w3,w3,w12
-	ror	w11,w11,#6
-	eor	w14,w4,w5
-	eor	w15,w15,w4,ror#20
-	add	w3,w3,w11
-	ldr	w12,[x16]
-	and	w13,w13,w14
-	ror	w15,w15,#2
-	add	w7,w7,w3
-	eor	w13,w13,w5
-	st1	{v4.4s},[x17], #16
-	cmp	w12,#0				// check for K256 terminator
-	ldr	w12,[sp,#0]
-	sub	x17,x17,#64
-	bne	.L_00_48
-
-	sub	x16,x16,#256		// rewind x16
-	cmp	x1,x2
-	mov	x17, #64
-	csel	x17, x17, xzr, eq
-	sub	x1,x1,x17			// avoid SEGV
-	mov	x17,sp
-	add	w10,w10,w12
-	add	w3,w3,w15
-	and	w12,w8,w7
-	ld1	{v0.16b},[x1],#16
-	bic	w15,w9,w7
-	eor	w11,w7,w7,ror#5
-	ld1	{v4.4s},[x16],#16
-	add	w3,w3,w13
-	orr	w12,w12,w15
-	eor	w11,w11,w7,ror#19
-	eor	w15,w3,w3,ror#11
-	rev32	v0.16b,v0.16b
-	add	w10,w10,w12
-	ror	w11,w11,#6
-	eor	w13,w3,w4
-	eor	w15,w15,w3,ror#20
-	add	v4.4s,v4.4s,v0.4s
-	add	w10,w10,w11
-	ldr	w12,[sp,#4]
-	and	w14,w14,w13
-	ror	w15,w15,#2
-	add	w6,w6,w10
-	eor	w14,w14,w4
-	add	w9,w9,w12
-	add	w10,w10,w15
-	and	w12,w7,w6
-	bic	w15,w8,w6
-	eor	w11,w6,w6,ror#5
-	add	w10,w10,w14
-	orr	w12,w12,w15
-	eor	w11,w11,w6,ror#19
-	eor	w15,w10,w10,ror#11
-	add	w9,w9,w12
-	ror	w11,w11,#6
-	eor	w14,w10,w3
-	eor	w15,w15,w10,ror#20
-	add	w9,w9,w11
-	ldr	w12,[sp,#8]
-	and	w13,w13,w14
-	ror	w15,w15,#2
-	add	w5,w5,w9
-	eor	w13,w13,w3
-	add	w8,w8,w12
-	add	w9,w9,w15
-	and	w12,w6,w5
-	bic	w15,w7,w5
-	eor	w11,w5,w5,ror#5
-	add	w9,w9,w13
-	orr	w12,w12,w15
-	eor	w11,w11,w5,ror#19
-	eor	w15,w9,w9,ror#11
-	add	w8,w8,w12
-	ror	w11,w11,#6
-	eor	w13,w9,w10
-	eor	w15,w15,w9,ror#20
-	add	w8,w8,w11
-	ldr	w12,[sp,#12]
-	and	w14,w14,w13
-	ror	w15,w15,#2
-	add	w4,w4,w8
-	eor	w14,w14,w10
-	add	w7,w7,w12
-	add	w8,w8,w15
-	and	w12,w5,w4
-	bic	w15,w6,w4
-	eor	w11,w4,w4,ror#5
-	add	w8,w8,w14
-	orr	w12,w12,w15
-	eor	w11,w11,w4,ror#19
-	eor	w15,w8,w8,ror#11
-	add	w7,w7,w12
-	ror	w11,w11,#6
-	eor	w14,w8,w9
-	eor	w15,w15,w8,ror#20
-	add	w7,w7,w11
-	ldr	w12,[sp,#16]
-	and	w13,w13,w14
-	ror	w15,w15,#2
-	add	w3,w3,w7
-	eor	w13,w13,w9
-	st1	{v4.4s},[x17], #16
-	add	w6,w6,w12
-	add	w7,w7,w15
-	and	w12,w4,w3
-	ld1	{v1.16b},[x1],#16
-	bic	w15,w5,w3
-	eor	w11,w3,w3,ror#5
-	ld1	{v4.4s},[x16],#16
-	add	w7,w7,w13
-	orr	w12,w12,w15
-	eor	w11,w11,w3,ror#19
-	eor	w15,w7,w7,ror#11
-	rev32	v1.16b,v1.16b
-	add	w6,w6,w12
-	ror	w11,w11,#6
-	eor	w13,w7,w8
-	eor	w15,w15,w7,ror#20
-	add	v4.4s,v4.4s,v1.4s
-	add	w6,w6,w11
-	ldr	w12,[sp,#20]
-	and	w14,w14,w13
-	ror	w15,w15,#2
-	add	w10,w10,w6
-	eor	w14,w14,w8
-	add	w5,w5,w12
-	add	w6,w6,w15
-	and	w12,w3,w10
-	bic	w15,w4,w10
-	eor	w11,w10,w10,ror#5
-	add	w6,w6,w14
-	orr	w12,w12,w15
-	eor	w11,w11,w10,ror#19
-	eor	w15,w6,w6,ror#11
-	add	w5,w5,w12
-	ror	w11,w11,#6
-	eor	w14,w6,w7
-	eor	w15,w15,w6,ror#20
-	add	w5,w5,w11
-	ldr	w12,[sp,#24]
-	and	w13,w13,w14
-	ror	w15,w15,#2
-	add	w9,w9,w5
-	eor	w13,w13,w7
-	add	w4,w4,w12
-	add	w5,w5,w15
-	and	w12,w10,w9
-	bic	w15,w3,w9
-	eor	w11,w9,w9,ror#5
-	add	w5,w5,w13
-	orr	w12,w12,w15
-	eor	w11,w11,w9,ror#19
-	eor	w15,w5,w5,ror#11
-	add	w4,w4,w12
-	ror	w11,w11,#6
-	eor	w13,w5,w6
-	eor	w15,w15,w5,ror#20
-	add	w4,w4,w11
-	ldr	w12,[sp,#28]
-	and	w14,w14,w13
-	ror	w15,w15,#2
-	add	w8,w8,w4
-	eor	w14,w14,w6
-	add	w3,w3,w12
-	add	w4,w4,w15
-	and	w12,w9,w8
-	bic	w15,w10,w8
-	eor	w11,w8,w8,ror#5
-	add	w4,w4,w14
-	orr	w12,w12,w15
-	eor	w11,w11,w8,ror#19
-	eor	w15,w4,w4,ror#11
-	add	w3,w3,w12
-	ror	w11,w11,#6
-	eor	w14,w4,w5
-	eor	w15,w15,w4,ror#20
-	add	w3,w3,w11
-	ldr	w12,[sp,#32]
-	and	w13,w13,w14
-	ror	w15,w15,#2
-	add	w7,w7,w3
-	eor	w13,w13,w5
-	st1	{v4.4s},[x17], #16
-	add	w10,w10,w12
-	add	w3,w3,w15
-	and	w12,w8,w7
-	ld1	{v2.16b},[x1],#16
-	bic	w15,w9,w7
-	eor	w11,w7,w7,ror#5
-	ld1	{v4.4s},[x16],#16
-	add	w3,w3,w13
-	orr	w12,w12,w15
-	eor	w11,w11,w7,ror#19
-	eor	w15,w3,w3,ror#11
-	rev32	v2.16b,v2.16b
-	add	w10,w10,w12
-	ror	w11,w11,#6
-	eor	w13,w3,w4
-	eor	w15,w15,w3,ror#20
-	add	v4.4s,v4.4s,v2.4s
-	add	w10,w10,w11
-	ldr	w12,[sp,#36]
-	and	w14,w14,w13
-	ror	w15,w15,#2
-	add	w6,w6,w10
-	eor	w14,w14,w4
-	add	w9,w9,w12
-	add	w10,w10,w15
-	and	w12,w7,w6
-	bic	w15,w8,w6
-	eor	w11,w6,w6,ror#5
-	add	w10,w10,w14
-	orr	w12,w12,w15
-	eor	w11,w11,w6,ror#19
-	eor	w15,w10,w10,ror#11
-	add	w9,w9,w12
-	ror	w11,w11,#6
-	eor	w14,w10,w3
-	eor	w15,w15,w10,ror#20
-	add	w9,w9,w11
-	ldr	w12,[sp,#40]
-	and	w13,w13,w14
-	ror	w15,w15,#2
-	add	w5,w5,w9
-	eor	w13,w13,w3
-	add	w8,w8,w12
-	add	w9,w9,w15
-	and	w12,w6,w5
-	bic	w15,w7,w5
-	eor	w11,w5,w5,ror#5
-	add	w9,w9,w13
-	orr	w12,w12,w15
-	eor	w11,w11,w5,ror#19
-	eor	w15,w9,w9,ror#11
-	add	w8,w8,w12
-	ror	w11,w11,#6
-	eor	w13,w9,w10
-	eor	w15,w15,w9,ror#20
-	add	w8,w8,w11
-	ldr	w12,[sp,#44]
-	and	w14,w14,w13
-	ror	w15,w15,#2
-	add	w4,w4,w8
-	eor	w14,w14,w10
-	add	w7,w7,w12
-	add	w8,w8,w15
-	and	w12,w5,w4
-	bic	w15,w6,w4
-	eor	w11,w4,w4,ror#5
-	add	w8,w8,w14
-	orr	w12,w12,w15
-	eor	w11,w11,w4,ror#19
-	eor	w15,w8,w8,ror#11
-	add	w7,w7,w12
-	ror	w11,w11,#6
-	eor	w14,w8,w9
-	eor	w15,w15,w8,ror#20
-	add	w7,w7,w11
-	ldr	w12,[sp,#48]
-	and	w13,w13,w14
-	ror	w15,w15,#2
-	add	w3,w3,w7
-	eor	w13,w13,w9
-	st1	{v4.4s},[x17], #16
-	add	w6,w6,w12
-	add	w7,w7,w15
-	and	w12,w4,w3
-	ld1	{v3.16b},[x1],#16
-	bic	w15,w5,w3
-	eor	w11,w3,w3,ror#5
-	ld1	{v4.4s},[x16],#16
-	add	w7,w7,w13
-	orr	w12,w12,w15
-	eor	w11,w11,w3,ror#19
-	eor	w15,w7,w7,ror#11
-	rev32	v3.16b,v3.16b
-	add	w6,w6,w12
-	ror	w11,w11,#6
-	eor	w13,w7,w8
-	eor	w15,w15,w7,ror#20
-	add	v4.4s,v4.4s,v3.4s
-	add	w6,w6,w11
-	ldr	w12,[sp,#52]
-	and	w14,w14,w13
-	ror	w15,w15,#2
-	add	w10,w10,w6
-	eor	w14,w14,w8
-	add	w5,w5,w12
-	add	w6,w6,w15
-	and	w12,w3,w10
-	bic	w15,w4,w10
-	eor	w11,w10,w10,ror#5
-	add	w6,w6,w14
-	orr	w12,w12,w15
-	eor	w11,w11,w10,ror#19
-	eor	w15,w6,w6,ror#11
-	add	w5,w5,w12
-	ror	w11,w11,#6
-	eor	w14,w6,w7
-	eor	w15,w15,w6,ror#20
-	add	w5,w5,w11
-	ldr	w12,[sp,#56]
-	and	w13,w13,w14
-	ror	w15,w15,#2
-	add	w9,w9,w5
-	eor	w13,w13,w7
-	add	w4,w4,w12
-	add	w5,w5,w15
-	and	w12,w10,w9
-	bic	w15,w3,w9
-	eor	w11,w9,w9,ror#5
-	add	w5,w5,w13
-	orr	w12,w12,w15
-	eor	w11,w11,w9,ror#19
-	eor	w15,w5,w5,ror#11
-	add	w4,w4,w12
-	ror	w11,w11,#6
-	eor	w13,w5,w6
-	eor	w15,w15,w5,ror#20
-	add	w4,w4,w11
-	ldr	w12,[sp,#60]
-	and	w14,w14,w13
-	ror	w15,w15,#2
-	add	w8,w8,w4
-	eor	w14,w14,w6
-	add	w3,w3,w12
-	add	w4,w4,w15
-	and	w12,w9,w8
-	bic	w15,w10,w8
-	eor	w11,w8,w8,ror#5
-	add	w4,w4,w14
-	orr	w12,w12,w15
-	eor	w11,w11,w8,ror#19
-	eor	w15,w4,w4,ror#11
-	add	w3,w3,w12
-	ror	w11,w11,#6
-	eor	w14,w4,w5
-	eor	w15,w15,w4,ror#20
-	add	w3,w3,w11
-	and	w13,w13,w14
-	ror	w15,w15,#2
-	add	w7,w7,w3
-	eor	w13,w13,w5
-	st1	{v4.4s},[x17], #16
-	add	w3,w3,w15			// h+=Sigma0(a) from the past
-	ldp	w11,w12,[x0,#0]
-	add	w3,w3,w13			// h+=Maj(a,b,c) from the past
-	ldp	w13,w14,[x0,#8]
-	add	w3,w3,w11			// accumulate
-	add	w4,w4,w12
-	ldp	w11,w12,[x0,#16]
-	add	w5,w5,w13
-	add	w6,w6,w14
-	ldp	w13,w14,[x0,#24]
-	add	w7,w7,w11
-	add	w8,w8,w12
-	 ldr	w12,[sp,#0]
-	stp	w3,w4,[x0,#0]
-	add	w9,w9,w13
-	 mov	w13,wzr
-	stp	w5,w6,[x0,#8]
-	add	w10,w10,w14
-	stp	w7,w8,[x0,#16]
-	 eor	w14,w4,w5
-	stp	w9,w10,[x0,#24]
-	 mov	w15,wzr
-	 mov	x17,sp
-	b.ne	.L_00_48
-
-	ldr	x29,[x29]
-	add	sp,sp,#16*4+16
-	ret
-.size	sha256_block_neon,.-sha256_block_neon
-#ifndef	__KERNEL__
-.comm	OPENSSL_armcap_P,4,4
-#endif
diff --git a/arch/arm64/crypto/sha512-core.S_shipped b/arch/arm64/crypto/sha512-core.S_shipped
deleted file mode 100644
index e063a6106720..000000000000
--- a/arch/arm64/crypto/sha512-core.S_shipped
+++ /dev/null
@@ -1,1093 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-// This code is taken from the OpenSSL project but the author (Andy Polyakov)
-// has relicensed it under the GPLv2. Therefore this program is free software;
-// you can redistribute it and/or modify it under the terms of the GNU General
-// Public License version 2 as published by the Free Software Foundation.
-//
-// The original headers, including the original license headers, are
-// included below for completeness.
-
-// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
-//
-// Licensed under the OpenSSL license (the "License").  You may not use
-// this file except in compliance with the License.  You can obtain a copy
-// in the file LICENSE in the source distribution or at
-// https://www.openssl.org/source/license.html
-
-// ====================================================================
-// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-// project. The module is, however, dual licensed under OpenSSL and
-// CRYPTOGAMS licenses depending on where you obtain it. For further
-// details see http://www.openssl.org/~appro/cryptogams/.
-// ====================================================================
-//
-// SHA256/512 for ARMv8.
-//
-// Performance in cycles per processed byte and improvement coefficient
-// over code generated with "default" compiler:
-//
-//		SHA256-hw	SHA256(*)	SHA512
-// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
-// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
-// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
-// Denver	2.01		10.5 (+26%)	6.70 (+8%)
-// X-Gene			20.0 (+100%)	12.8 (+300%(***))
-// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
-//
-// (*)	Software SHA256 results are of lesser relevance, presented
-//	mostly for informational purposes.
-// (**)	The result is a trade-off: it's possible to improve it by
-//	10% (or by 1 cycle per round), but at the cost of 20% loss
-//	on Cortex-A53 (or by 4 cycles per round).
-// (***)	Super-impressive coefficients over gcc-generated code are
-//	indication of some compiler "pathology", most notably code
-//	generated with -mgeneral-regs-only is significanty faster
-//	and the gap is only 40-90%.
-//
-// October 2016.
-//
-// Originally it was reckoned that it makes no sense to implement NEON
-// version of SHA256 for 64-bit processors. This is because performance
-// improvement on most wide-spread Cortex-A5x processors was observed
-// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
-// observed that 32-bit NEON SHA256 performs significantly better than
-// 64-bit scalar version on *some* of the more recent processors. As
-// result 64-bit NEON version of SHA256 was added to provide best
-// all-round performance. For example it executes ~30% faster on X-Gene
-// and Mongoose. [For reference, NEON version of SHA512 is bound to
-// deliver much less improvement, likely *negative* on Cortex-A5x.
-// Which is why NEON support is limited to SHA256.]
-
-#ifndef	__KERNEL__
-# include "arm_arch.h"
-#endif
-
-.text
-
-.extern	OPENSSL_armcap_P
-.globl	sha512_block_data_order
-.type	sha512_block_data_order,%function
-.align	6
-sha512_block_data_order:
-	stp	x29,x30,[sp,#-128]!
-	add	x29,sp,#0
-
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	stp	x27,x28,[sp,#80]
-	sub	sp,sp,#4*8
-
-	ldp	x20,x21,[x0]				// load context
-	ldp	x22,x23,[x0,#2*8]
-	ldp	x24,x25,[x0,#4*8]
-	add	x2,x1,x2,lsl#7	// end of input
-	ldp	x26,x27,[x0,#6*8]
-	adr	x30,.LK512
-	stp	x0,x2,[x29,#96]
-
-.Loop:
-	ldp	x3,x4,[x1],#2*8
-	ldr	x19,[x30],#8			// *K++
-	eor	x28,x21,x22				// magic seed
-	str	x1,[x29,#112]
-#ifndef	__AARCH64EB__
-	rev	x3,x3			// 0
-#endif
-	ror	x16,x24,#14
-	add	x27,x27,x19			// h+=K[i]
-	eor	x6,x24,x24,ror#23
-	and	x17,x25,x24
-	bic	x19,x26,x24
-	add	x27,x27,x3			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x20,x21			// a^b, b^c in next round
-	eor	x16,x16,x6,ror#18	// Sigma1(e)
-	ror	x6,x20,#28
-	add	x27,x27,x17			// h+=Ch(e,f,g)
-	eor	x17,x20,x20,ror#5
-	add	x27,x27,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x23,x23,x27			// d+=h
-	eor	x28,x28,x21			// Maj(a,b,c)
-	eor	x17,x6,x17,ror#34	// Sigma0(a)
-	add	x27,x27,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x27,x27,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x4,x4			// 1
-#endif
-	ldp	x5,x6,[x1],#2*8
-	add	x27,x27,x17			// h+=Sigma0(a)
-	ror	x16,x23,#14
-	add	x26,x26,x28			// h+=K[i]
-	eor	x7,x23,x23,ror#23
-	and	x17,x24,x23
-	bic	x28,x25,x23
-	add	x26,x26,x4			// h+=X[i]
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x27,x20			// a^b, b^c in next round
-	eor	x16,x16,x7,ror#18	// Sigma1(e)
-	ror	x7,x27,#28
-	add	x26,x26,x17			// h+=Ch(e,f,g)
-	eor	x17,x27,x27,ror#5
-	add	x26,x26,x16			// h+=Sigma1(e)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	add	x22,x22,x26			// d+=h
-	eor	x19,x19,x20			// Maj(a,b,c)
-	eor	x17,x7,x17,ror#34	// Sigma0(a)
-	add	x26,x26,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	//add	x26,x26,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x5,x5			// 2
-#endif
-	add	x26,x26,x17			// h+=Sigma0(a)
-	ror	x16,x22,#14
-	add	x25,x25,x19			// h+=K[i]
-	eor	x8,x22,x22,ror#23
-	and	x17,x23,x22
-	bic	x19,x24,x22
-	add	x25,x25,x5			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x26,x27			// a^b, b^c in next round
-	eor	x16,x16,x8,ror#18	// Sigma1(e)
-	ror	x8,x26,#28
-	add	x25,x25,x17			// h+=Ch(e,f,g)
-	eor	x17,x26,x26,ror#5
-	add	x25,x25,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x21,x21,x25			// d+=h
-	eor	x28,x28,x27			// Maj(a,b,c)
-	eor	x17,x8,x17,ror#34	// Sigma0(a)
-	add	x25,x25,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x25,x25,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x6,x6			// 3
-#endif
-	ldp	x7,x8,[x1],#2*8
-	add	x25,x25,x17			// h+=Sigma0(a)
-	ror	x16,x21,#14
-	add	x24,x24,x28			// h+=K[i]
-	eor	x9,x21,x21,ror#23
-	and	x17,x22,x21
-	bic	x28,x23,x21
-	add	x24,x24,x6			// h+=X[i]
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x25,x26			// a^b, b^c in next round
-	eor	x16,x16,x9,ror#18	// Sigma1(e)
-	ror	x9,x25,#28
-	add	x24,x24,x17			// h+=Ch(e,f,g)
-	eor	x17,x25,x25,ror#5
-	add	x24,x24,x16			// h+=Sigma1(e)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	add	x20,x20,x24			// d+=h
-	eor	x19,x19,x26			// Maj(a,b,c)
-	eor	x17,x9,x17,ror#34	// Sigma0(a)
-	add	x24,x24,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	//add	x24,x24,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x7,x7			// 4
-#endif
-	add	x24,x24,x17			// h+=Sigma0(a)
-	ror	x16,x20,#14
-	add	x23,x23,x19			// h+=K[i]
-	eor	x10,x20,x20,ror#23
-	and	x17,x21,x20
-	bic	x19,x22,x20
-	add	x23,x23,x7			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x24,x25			// a^b, b^c in next round
-	eor	x16,x16,x10,ror#18	// Sigma1(e)
-	ror	x10,x24,#28
-	add	x23,x23,x17			// h+=Ch(e,f,g)
-	eor	x17,x24,x24,ror#5
-	add	x23,x23,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x27,x27,x23			// d+=h
-	eor	x28,x28,x25			// Maj(a,b,c)
-	eor	x17,x10,x17,ror#34	// Sigma0(a)
-	add	x23,x23,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x23,x23,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x8,x8			// 5
-#endif
-	ldp	x9,x10,[x1],#2*8
-	add	x23,x23,x17			// h+=Sigma0(a)
-	ror	x16,x27,#14
-	add	x22,x22,x28			// h+=K[i]
-	eor	x11,x27,x27,ror#23
-	and	x17,x20,x27
-	bic	x28,x21,x27
-	add	x22,x22,x8			// h+=X[i]
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x23,x24			// a^b, b^c in next round
-	eor	x16,x16,x11,ror#18	// Sigma1(e)
-	ror	x11,x23,#28
-	add	x22,x22,x17			// h+=Ch(e,f,g)
-	eor	x17,x23,x23,ror#5
-	add	x22,x22,x16			// h+=Sigma1(e)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	add	x26,x26,x22			// d+=h
-	eor	x19,x19,x24			// Maj(a,b,c)
-	eor	x17,x11,x17,ror#34	// Sigma0(a)
-	add	x22,x22,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	//add	x22,x22,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x9,x9			// 6
-#endif
-	add	x22,x22,x17			// h+=Sigma0(a)
-	ror	x16,x26,#14
-	add	x21,x21,x19			// h+=K[i]
-	eor	x12,x26,x26,ror#23
-	and	x17,x27,x26
-	bic	x19,x20,x26
-	add	x21,x21,x9			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x22,x23			// a^b, b^c in next round
-	eor	x16,x16,x12,ror#18	// Sigma1(e)
-	ror	x12,x22,#28
-	add	x21,x21,x17			// h+=Ch(e,f,g)
-	eor	x17,x22,x22,ror#5
-	add	x21,x21,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x25,x25,x21			// d+=h
-	eor	x28,x28,x23			// Maj(a,b,c)
-	eor	x17,x12,x17,ror#34	// Sigma0(a)
-	add	x21,x21,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x21,x21,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x10,x10			// 7
-#endif
-	ldp	x11,x12,[x1],#2*8
-	add	x21,x21,x17			// h+=Sigma0(a)
-	ror	x16,x25,#14
-	add	x20,x20,x28			// h+=K[i]
-	eor	x13,x25,x25,ror#23
-	and	x17,x26,x25
-	bic	x28,x27,x25
-	add	x20,x20,x10			// h+=X[i]
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x21,x22			// a^b, b^c in next round
-	eor	x16,x16,x13,ror#18	// Sigma1(e)
-	ror	x13,x21,#28
-	add	x20,x20,x17			// h+=Ch(e,f,g)
-	eor	x17,x21,x21,ror#5
-	add	x20,x20,x16			// h+=Sigma1(e)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	add	x24,x24,x20			// d+=h
-	eor	x19,x19,x22			// Maj(a,b,c)
-	eor	x17,x13,x17,ror#34	// Sigma0(a)
-	add	x20,x20,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	//add	x20,x20,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x11,x11			// 8
-#endif
-	add	x20,x20,x17			// h+=Sigma0(a)
-	ror	x16,x24,#14
-	add	x27,x27,x19			// h+=K[i]
-	eor	x14,x24,x24,ror#23
-	and	x17,x25,x24
-	bic	x19,x26,x24
-	add	x27,x27,x11			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x20,x21			// a^b, b^c in next round
-	eor	x16,x16,x14,ror#18	// Sigma1(e)
-	ror	x14,x20,#28
-	add	x27,x27,x17			// h+=Ch(e,f,g)
-	eor	x17,x20,x20,ror#5
-	add	x27,x27,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x23,x23,x27			// d+=h
-	eor	x28,x28,x21			// Maj(a,b,c)
-	eor	x17,x14,x17,ror#34	// Sigma0(a)
-	add	x27,x27,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x27,x27,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x12,x12			// 9
-#endif
-	ldp	x13,x14,[x1],#2*8
-	add	x27,x27,x17			// h+=Sigma0(a)
-	ror	x16,x23,#14
-	add	x26,x26,x28			// h+=K[i]
-	eor	x15,x23,x23,ror#23
-	and	x17,x24,x23
-	bic	x28,x25,x23
-	add	x26,x26,x12			// h+=X[i]
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x27,x20			// a^b, b^c in next round
-	eor	x16,x16,x15,ror#18	// Sigma1(e)
-	ror	x15,x27,#28
-	add	x26,x26,x17			// h+=Ch(e,f,g)
-	eor	x17,x27,x27,ror#5
-	add	x26,x26,x16			// h+=Sigma1(e)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	add	x22,x22,x26			// d+=h
-	eor	x19,x19,x20			// Maj(a,b,c)
-	eor	x17,x15,x17,ror#34	// Sigma0(a)
-	add	x26,x26,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	//add	x26,x26,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x13,x13			// 10
-#endif
-	add	x26,x26,x17			// h+=Sigma0(a)
-	ror	x16,x22,#14
-	add	x25,x25,x19			// h+=K[i]
-	eor	x0,x22,x22,ror#23
-	and	x17,x23,x22
-	bic	x19,x24,x22
-	add	x25,x25,x13			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x26,x27			// a^b, b^c in next round
-	eor	x16,x16,x0,ror#18	// Sigma1(e)
-	ror	x0,x26,#28
-	add	x25,x25,x17			// h+=Ch(e,f,g)
-	eor	x17,x26,x26,ror#5
-	add	x25,x25,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x21,x21,x25			// d+=h
-	eor	x28,x28,x27			// Maj(a,b,c)
-	eor	x17,x0,x17,ror#34	// Sigma0(a)
-	add	x25,x25,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x25,x25,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x14,x14			// 11
-#endif
-	ldp	x15,x0,[x1],#2*8
-	add	x25,x25,x17			// h+=Sigma0(a)
-	str	x6,[sp,#24]
-	ror	x16,x21,#14
-	add	x24,x24,x28			// h+=K[i]
-	eor	x6,x21,x21,ror#23
-	and	x17,x22,x21
-	bic	x28,x23,x21
-	add	x24,x24,x14			// h+=X[i]
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x25,x26			// a^b, b^c in next round
-	eor	x16,x16,x6,ror#18	// Sigma1(e)
-	ror	x6,x25,#28
-	add	x24,x24,x17			// h+=Ch(e,f,g)
-	eor	x17,x25,x25,ror#5
-	add	x24,x24,x16			// h+=Sigma1(e)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	add	x20,x20,x24			// d+=h
-	eor	x19,x19,x26			// Maj(a,b,c)
-	eor	x17,x6,x17,ror#34	// Sigma0(a)
-	add	x24,x24,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	//add	x24,x24,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x15,x15			// 12
-#endif
-	add	x24,x24,x17			// h+=Sigma0(a)
-	str	x7,[sp,#0]
-	ror	x16,x20,#14
-	add	x23,x23,x19			// h+=K[i]
-	eor	x7,x20,x20,ror#23
-	and	x17,x21,x20
-	bic	x19,x22,x20
-	add	x23,x23,x15			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x24,x25			// a^b, b^c in next round
-	eor	x16,x16,x7,ror#18	// Sigma1(e)
-	ror	x7,x24,#28
-	add	x23,x23,x17			// h+=Ch(e,f,g)
-	eor	x17,x24,x24,ror#5
-	add	x23,x23,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x27,x27,x23			// d+=h
-	eor	x28,x28,x25			// Maj(a,b,c)
-	eor	x17,x7,x17,ror#34	// Sigma0(a)
-	add	x23,x23,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x23,x23,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x0,x0			// 13
-#endif
-	ldp	x1,x2,[x1]
-	add	x23,x23,x17			// h+=Sigma0(a)
-	str	x8,[sp,#8]
-	ror	x16,x27,#14
-	add	x22,x22,x28			// h+=K[i]
-	eor	x8,x27,x27,ror#23
-	and	x17,x20,x27
-	bic	x28,x21,x27
-	add	x22,x22,x0			// h+=X[i]
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x23,x24			// a^b, b^c in next round
-	eor	x16,x16,x8,ror#18	// Sigma1(e)
-	ror	x8,x23,#28
-	add	x22,x22,x17			// h+=Ch(e,f,g)
-	eor	x17,x23,x23,ror#5
-	add	x22,x22,x16			// h+=Sigma1(e)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	add	x26,x26,x22			// d+=h
-	eor	x19,x19,x24			// Maj(a,b,c)
-	eor	x17,x8,x17,ror#34	// Sigma0(a)
-	add	x22,x22,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	//add	x22,x22,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x1,x1			// 14
-#endif
-	ldr	x6,[sp,#24]
-	add	x22,x22,x17			// h+=Sigma0(a)
-	str	x9,[sp,#16]
-	ror	x16,x26,#14
-	add	x21,x21,x19			// h+=K[i]
-	eor	x9,x26,x26,ror#23
-	and	x17,x27,x26
-	bic	x19,x20,x26
-	add	x21,x21,x1			// h+=X[i]
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x22,x23			// a^b, b^c in next round
-	eor	x16,x16,x9,ror#18	// Sigma1(e)
-	ror	x9,x22,#28
-	add	x21,x21,x17			// h+=Ch(e,f,g)
-	eor	x17,x22,x22,ror#5
-	add	x21,x21,x16			// h+=Sigma1(e)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	add	x25,x25,x21			// d+=h
-	eor	x28,x28,x23			// Maj(a,b,c)
-	eor	x17,x9,x17,ror#34	// Sigma0(a)
-	add	x21,x21,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	//add	x21,x21,x17			// h+=Sigma0(a)
-#ifndef	__AARCH64EB__
-	rev	x2,x2			// 15
-#endif
-	ldr	x7,[sp,#0]
-	add	x21,x21,x17			// h+=Sigma0(a)
-	str	x10,[sp,#24]
-	ror	x16,x25,#14
-	add	x20,x20,x28			// h+=K[i]
-	ror	x9,x4,#1
-	and	x17,x26,x25
-	ror	x8,x1,#19
-	bic	x28,x27,x25
-	ror	x10,x21,#28
-	add	x20,x20,x2			// h+=X[i]
-	eor	x16,x16,x25,ror#18
-	eor	x9,x9,x4,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x21,x22			// a^b, b^c in next round
-	eor	x16,x16,x25,ror#41	// Sigma1(e)
-	eor	x10,x10,x21,ror#34
-	add	x20,x20,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x8,x8,x1,ror#61
-	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
-	add	x20,x20,x16			// h+=Sigma1(e)
-	eor	x19,x19,x22			// Maj(a,b,c)
-	eor	x17,x10,x21,ror#39	// Sigma0(a)
-	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
-	add	x3,x3,x12
-	add	x24,x24,x20			// d+=h
-	add	x20,x20,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x3,x3,x9
-	add	x20,x20,x17			// h+=Sigma0(a)
-	add	x3,x3,x8
-.Loop_16_xx:
-	ldr	x8,[sp,#8]
-	str	x11,[sp,#0]
-	ror	x16,x24,#14
-	add	x27,x27,x19			// h+=K[i]
-	ror	x10,x5,#1
-	and	x17,x25,x24
-	ror	x9,x2,#19
-	bic	x19,x26,x24
-	ror	x11,x20,#28
-	add	x27,x27,x3			// h+=X[i]
-	eor	x16,x16,x24,ror#18
-	eor	x10,x10,x5,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x20,x21			// a^b, b^c in next round
-	eor	x16,x16,x24,ror#41	// Sigma1(e)
-	eor	x11,x11,x20,ror#34
-	add	x27,x27,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x9,x9,x2,ror#61
-	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
-	add	x27,x27,x16			// h+=Sigma1(e)
-	eor	x28,x28,x21			// Maj(a,b,c)
-	eor	x17,x11,x20,ror#39	// Sigma0(a)
-	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
-	add	x4,x4,x13
-	add	x23,x23,x27			// d+=h
-	add	x27,x27,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x4,x4,x10
-	add	x27,x27,x17			// h+=Sigma0(a)
-	add	x4,x4,x9
-	ldr	x9,[sp,#16]
-	str	x12,[sp,#8]
-	ror	x16,x23,#14
-	add	x26,x26,x28			// h+=K[i]
-	ror	x11,x6,#1
-	and	x17,x24,x23
-	ror	x10,x3,#19
-	bic	x28,x25,x23
-	ror	x12,x27,#28
-	add	x26,x26,x4			// h+=X[i]
-	eor	x16,x16,x23,ror#18
-	eor	x11,x11,x6,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x27,x20			// a^b, b^c in next round
-	eor	x16,x16,x23,ror#41	// Sigma1(e)
-	eor	x12,x12,x27,ror#34
-	add	x26,x26,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x10,x10,x3,ror#61
-	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
-	add	x26,x26,x16			// h+=Sigma1(e)
-	eor	x19,x19,x20			// Maj(a,b,c)
-	eor	x17,x12,x27,ror#39	// Sigma0(a)
-	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
-	add	x5,x5,x14
-	add	x22,x22,x26			// d+=h
-	add	x26,x26,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x5,x5,x11
-	add	x26,x26,x17			// h+=Sigma0(a)
-	add	x5,x5,x10
-	ldr	x10,[sp,#24]
-	str	x13,[sp,#16]
-	ror	x16,x22,#14
-	add	x25,x25,x19			// h+=K[i]
-	ror	x12,x7,#1
-	and	x17,x23,x22
-	ror	x11,x4,#19
-	bic	x19,x24,x22
-	ror	x13,x26,#28
-	add	x25,x25,x5			// h+=X[i]
-	eor	x16,x16,x22,ror#18
-	eor	x12,x12,x7,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x26,x27			// a^b, b^c in next round
-	eor	x16,x16,x22,ror#41	// Sigma1(e)
-	eor	x13,x13,x26,ror#34
-	add	x25,x25,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x11,x11,x4,ror#61
-	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
-	add	x25,x25,x16			// h+=Sigma1(e)
-	eor	x28,x28,x27			// Maj(a,b,c)
-	eor	x17,x13,x26,ror#39	// Sigma0(a)
-	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
-	add	x6,x6,x15
-	add	x21,x21,x25			// d+=h
-	add	x25,x25,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x6,x6,x12
-	add	x25,x25,x17			// h+=Sigma0(a)
-	add	x6,x6,x11
-	ldr	x11,[sp,#0]
-	str	x14,[sp,#24]
-	ror	x16,x21,#14
-	add	x24,x24,x28			// h+=K[i]
-	ror	x13,x8,#1
-	and	x17,x22,x21
-	ror	x12,x5,#19
-	bic	x28,x23,x21
-	ror	x14,x25,#28
-	add	x24,x24,x6			// h+=X[i]
-	eor	x16,x16,x21,ror#18
-	eor	x13,x13,x8,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x25,x26			// a^b, b^c in next round
-	eor	x16,x16,x21,ror#41	// Sigma1(e)
-	eor	x14,x14,x25,ror#34
-	add	x24,x24,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x12,x12,x5,ror#61
-	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
-	add	x24,x24,x16			// h+=Sigma1(e)
-	eor	x19,x19,x26			// Maj(a,b,c)
-	eor	x17,x14,x25,ror#39	// Sigma0(a)
-	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
-	add	x7,x7,x0
-	add	x20,x20,x24			// d+=h
-	add	x24,x24,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x7,x7,x13
-	add	x24,x24,x17			// h+=Sigma0(a)
-	add	x7,x7,x12
-	ldr	x12,[sp,#8]
-	str	x15,[sp,#0]
-	ror	x16,x20,#14
-	add	x23,x23,x19			// h+=K[i]
-	ror	x14,x9,#1
-	and	x17,x21,x20
-	ror	x13,x6,#19
-	bic	x19,x22,x20
-	ror	x15,x24,#28
-	add	x23,x23,x7			// h+=X[i]
-	eor	x16,x16,x20,ror#18
-	eor	x14,x14,x9,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x24,x25			// a^b, b^c in next round
-	eor	x16,x16,x20,ror#41	// Sigma1(e)
-	eor	x15,x15,x24,ror#34
-	add	x23,x23,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x13,x13,x6,ror#61
-	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
-	add	x23,x23,x16			// h+=Sigma1(e)
-	eor	x28,x28,x25			// Maj(a,b,c)
-	eor	x17,x15,x24,ror#39	// Sigma0(a)
-	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
-	add	x8,x8,x1
-	add	x27,x27,x23			// d+=h
-	add	x23,x23,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x8,x8,x14
-	add	x23,x23,x17			// h+=Sigma0(a)
-	add	x8,x8,x13
-	ldr	x13,[sp,#16]
-	str	x0,[sp,#8]
-	ror	x16,x27,#14
-	add	x22,x22,x28			// h+=K[i]
-	ror	x15,x10,#1
-	and	x17,x20,x27
-	ror	x14,x7,#19
-	bic	x28,x21,x27
-	ror	x0,x23,#28
-	add	x22,x22,x8			// h+=X[i]
-	eor	x16,x16,x27,ror#18
-	eor	x15,x15,x10,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x23,x24			// a^b, b^c in next round
-	eor	x16,x16,x27,ror#41	// Sigma1(e)
-	eor	x0,x0,x23,ror#34
-	add	x22,x22,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x14,x14,x7,ror#61
-	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
-	add	x22,x22,x16			// h+=Sigma1(e)
-	eor	x19,x19,x24			// Maj(a,b,c)
-	eor	x17,x0,x23,ror#39	// Sigma0(a)
-	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
-	add	x9,x9,x2
-	add	x26,x26,x22			// d+=h
-	add	x22,x22,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x9,x9,x15
-	add	x22,x22,x17			// h+=Sigma0(a)
-	add	x9,x9,x14
-	ldr	x14,[sp,#24]
-	str	x1,[sp,#16]
-	ror	x16,x26,#14
-	add	x21,x21,x19			// h+=K[i]
-	ror	x0,x11,#1
-	and	x17,x27,x26
-	ror	x15,x8,#19
-	bic	x19,x20,x26
-	ror	x1,x22,#28
-	add	x21,x21,x9			// h+=X[i]
-	eor	x16,x16,x26,ror#18
-	eor	x0,x0,x11,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x22,x23			// a^b, b^c in next round
-	eor	x16,x16,x26,ror#41	// Sigma1(e)
-	eor	x1,x1,x22,ror#34
-	add	x21,x21,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x15,x15,x8,ror#61
-	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
-	add	x21,x21,x16			// h+=Sigma1(e)
-	eor	x28,x28,x23			// Maj(a,b,c)
-	eor	x17,x1,x22,ror#39	// Sigma0(a)
-	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
-	add	x10,x10,x3
-	add	x25,x25,x21			// d+=h
-	add	x21,x21,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x10,x10,x0
-	add	x21,x21,x17			// h+=Sigma0(a)
-	add	x10,x10,x15
-	ldr	x15,[sp,#0]
-	str	x2,[sp,#24]
-	ror	x16,x25,#14
-	add	x20,x20,x28			// h+=K[i]
-	ror	x1,x12,#1
-	and	x17,x26,x25
-	ror	x0,x9,#19
-	bic	x28,x27,x25
-	ror	x2,x21,#28
-	add	x20,x20,x10			// h+=X[i]
-	eor	x16,x16,x25,ror#18
-	eor	x1,x1,x12,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x21,x22			// a^b, b^c in next round
-	eor	x16,x16,x25,ror#41	// Sigma1(e)
-	eor	x2,x2,x21,ror#34
-	add	x20,x20,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x0,x0,x9,ror#61
-	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
-	add	x20,x20,x16			// h+=Sigma1(e)
-	eor	x19,x19,x22			// Maj(a,b,c)
-	eor	x17,x2,x21,ror#39	// Sigma0(a)
-	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
-	add	x11,x11,x4
-	add	x24,x24,x20			// d+=h
-	add	x20,x20,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x11,x11,x1
-	add	x20,x20,x17			// h+=Sigma0(a)
-	add	x11,x11,x0
-	ldr	x0,[sp,#8]
-	str	x3,[sp,#0]
-	ror	x16,x24,#14
-	add	x27,x27,x19			// h+=K[i]
-	ror	x2,x13,#1
-	and	x17,x25,x24
-	ror	x1,x10,#19
-	bic	x19,x26,x24
-	ror	x3,x20,#28
-	add	x27,x27,x11			// h+=X[i]
-	eor	x16,x16,x24,ror#18
-	eor	x2,x2,x13,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x20,x21			// a^b, b^c in next round
-	eor	x16,x16,x24,ror#41	// Sigma1(e)
-	eor	x3,x3,x20,ror#34
-	add	x27,x27,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x1,x1,x10,ror#61
-	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
-	add	x27,x27,x16			// h+=Sigma1(e)
-	eor	x28,x28,x21			// Maj(a,b,c)
-	eor	x17,x3,x20,ror#39	// Sigma0(a)
-	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
-	add	x12,x12,x5
-	add	x23,x23,x27			// d+=h
-	add	x27,x27,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x12,x12,x2
-	add	x27,x27,x17			// h+=Sigma0(a)
-	add	x12,x12,x1
-	ldr	x1,[sp,#16]
-	str	x4,[sp,#8]
-	ror	x16,x23,#14
-	add	x26,x26,x28			// h+=K[i]
-	ror	x3,x14,#1
-	and	x17,x24,x23
-	ror	x2,x11,#19
-	bic	x28,x25,x23
-	ror	x4,x27,#28
-	add	x26,x26,x12			// h+=X[i]
-	eor	x16,x16,x23,ror#18
-	eor	x3,x3,x14,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x27,x20			// a^b, b^c in next round
-	eor	x16,x16,x23,ror#41	// Sigma1(e)
-	eor	x4,x4,x27,ror#34
-	add	x26,x26,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x2,x2,x11,ror#61
-	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
-	add	x26,x26,x16			// h+=Sigma1(e)
-	eor	x19,x19,x20			// Maj(a,b,c)
-	eor	x17,x4,x27,ror#39	// Sigma0(a)
-	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
-	add	x13,x13,x6
-	add	x22,x22,x26			// d+=h
-	add	x26,x26,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x13,x13,x3
-	add	x26,x26,x17			// h+=Sigma0(a)
-	add	x13,x13,x2
-	ldr	x2,[sp,#24]
-	str	x5,[sp,#16]
-	ror	x16,x22,#14
-	add	x25,x25,x19			// h+=K[i]
-	ror	x4,x15,#1
-	and	x17,x23,x22
-	ror	x3,x12,#19
-	bic	x19,x24,x22
-	ror	x5,x26,#28
-	add	x25,x25,x13			// h+=X[i]
-	eor	x16,x16,x22,ror#18
-	eor	x4,x4,x15,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x26,x27			// a^b, b^c in next round
-	eor	x16,x16,x22,ror#41	// Sigma1(e)
-	eor	x5,x5,x26,ror#34
-	add	x25,x25,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x3,x3,x12,ror#61
-	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
-	add	x25,x25,x16			// h+=Sigma1(e)
-	eor	x28,x28,x27			// Maj(a,b,c)
-	eor	x17,x5,x26,ror#39	// Sigma0(a)
-	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
-	add	x14,x14,x7
-	add	x21,x21,x25			// d+=h
-	add	x25,x25,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x14,x14,x4
-	add	x25,x25,x17			// h+=Sigma0(a)
-	add	x14,x14,x3
-	ldr	x3,[sp,#0]
-	str	x6,[sp,#24]
-	ror	x16,x21,#14
-	add	x24,x24,x28			// h+=K[i]
-	ror	x5,x0,#1
-	and	x17,x22,x21
-	ror	x4,x13,#19
-	bic	x28,x23,x21
-	ror	x6,x25,#28
-	add	x24,x24,x14			// h+=X[i]
-	eor	x16,x16,x21,ror#18
-	eor	x5,x5,x0,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x25,x26			// a^b, b^c in next round
-	eor	x16,x16,x21,ror#41	// Sigma1(e)
-	eor	x6,x6,x25,ror#34
-	add	x24,x24,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x4,x4,x13,ror#61
-	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
-	add	x24,x24,x16			// h+=Sigma1(e)
-	eor	x19,x19,x26			// Maj(a,b,c)
-	eor	x17,x6,x25,ror#39	// Sigma0(a)
-	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
-	add	x15,x15,x8
-	add	x20,x20,x24			// d+=h
-	add	x24,x24,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x15,x15,x5
-	add	x24,x24,x17			// h+=Sigma0(a)
-	add	x15,x15,x4
-	ldr	x4,[sp,#8]
-	str	x7,[sp,#0]
-	ror	x16,x20,#14
-	add	x23,x23,x19			// h+=K[i]
-	ror	x6,x1,#1
-	and	x17,x21,x20
-	ror	x5,x14,#19
-	bic	x19,x22,x20
-	ror	x7,x24,#28
-	add	x23,x23,x15			// h+=X[i]
-	eor	x16,x16,x20,ror#18
-	eor	x6,x6,x1,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x24,x25			// a^b, b^c in next round
-	eor	x16,x16,x20,ror#41	// Sigma1(e)
-	eor	x7,x7,x24,ror#34
-	add	x23,x23,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x5,x5,x14,ror#61
-	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
-	add	x23,x23,x16			// h+=Sigma1(e)
-	eor	x28,x28,x25			// Maj(a,b,c)
-	eor	x17,x7,x24,ror#39	// Sigma0(a)
-	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
-	add	x0,x0,x9
-	add	x27,x27,x23			// d+=h
-	add	x23,x23,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x0,x0,x6
-	add	x23,x23,x17			// h+=Sigma0(a)
-	add	x0,x0,x5
-	ldr	x5,[sp,#16]
-	str	x8,[sp,#8]
-	ror	x16,x27,#14
-	add	x22,x22,x28			// h+=K[i]
-	ror	x7,x2,#1
-	and	x17,x20,x27
-	ror	x6,x15,#19
-	bic	x28,x21,x27
-	ror	x8,x23,#28
-	add	x22,x22,x0			// h+=X[i]
-	eor	x16,x16,x27,ror#18
-	eor	x7,x7,x2,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x23,x24			// a^b, b^c in next round
-	eor	x16,x16,x27,ror#41	// Sigma1(e)
-	eor	x8,x8,x23,ror#34
-	add	x22,x22,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x6,x6,x15,ror#61
-	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
-	add	x22,x22,x16			// h+=Sigma1(e)
-	eor	x19,x19,x24			// Maj(a,b,c)
-	eor	x17,x8,x23,ror#39	// Sigma0(a)
-	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
-	add	x1,x1,x10
-	add	x26,x26,x22			// d+=h
-	add	x22,x22,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x1,x1,x7
-	add	x22,x22,x17			// h+=Sigma0(a)
-	add	x1,x1,x6
-	ldr	x6,[sp,#24]
-	str	x9,[sp,#16]
-	ror	x16,x26,#14
-	add	x21,x21,x19			// h+=K[i]
-	ror	x8,x3,#1
-	and	x17,x27,x26
-	ror	x7,x0,#19
-	bic	x19,x20,x26
-	ror	x9,x22,#28
-	add	x21,x21,x1			// h+=X[i]
-	eor	x16,x16,x26,ror#18
-	eor	x8,x8,x3,ror#8
-	orr	x17,x17,x19			// Ch(e,f,g)
-	eor	x19,x22,x23			// a^b, b^c in next round
-	eor	x16,x16,x26,ror#41	// Sigma1(e)
-	eor	x9,x9,x22,ror#34
-	add	x21,x21,x17			// h+=Ch(e,f,g)
-	and	x28,x28,x19			// (b^c)&=(a^b)
-	eor	x7,x7,x0,ror#61
-	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
-	add	x21,x21,x16			// h+=Sigma1(e)
-	eor	x28,x28,x23			// Maj(a,b,c)
-	eor	x17,x9,x22,ror#39	// Sigma0(a)
-	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
-	add	x2,x2,x11
-	add	x25,x25,x21			// d+=h
-	add	x21,x21,x28			// h+=Maj(a,b,c)
-	ldr	x28,[x30],#8		// *K++, x19 in next round
-	add	x2,x2,x8
-	add	x21,x21,x17			// h+=Sigma0(a)
-	add	x2,x2,x7
-	ldr	x7,[sp,#0]
-	str	x10,[sp,#24]
-	ror	x16,x25,#14
-	add	x20,x20,x28			// h+=K[i]
-	ror	x9,x4,#1
-	and	x17,x26,x25
-	ror	x8,x1,#19
-	bic	x28,x27,x25
-	ror	x10,x21,#28
-	add	x20,x20,x2			// h+=X[i]
-	eor	x16,x16,x25,ror#18
-	eor	x9,x9,x4,ror#8
-	orr	x17,x17,x28			// Ch(e,f,g)
-	eor	x28,x21,x22			// a^b, b^c in next round
-	eor	x16,x16,x25,ror#41	// Sigma1(e)
-	eor	x10,x10,x21,ror#34
-	add	x20,x20,x17			// h+=Ch(e,f,g)
-	and	x19,x19,x28			// (b^c)&=(a^b)
-	eor	x8,x8,x1,ror#61
-	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
-	add	x20,x20,x16			// h+=Sigma1(e)
-	eor	x19,x19,x22			// Maj(a,b,c)
-	eor	x17,x10,x21,ror#39	// Sigma0(a)
-	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
-	add	x3,x3,x12
-	add	x24,x24,x20			// d+=h
-	add	x20,x20,x19			// h+=Maj(a,b,c)
-	ldr	x19,[x30],#8		// *K++, x28 in next round
-	add	x3,x3,x9
-	add	x20,x20,x17			// h+=Sigma0(a)
-	add	x3,x3,x8
-	cbnz	x19,.Loop_16_xx
-
-	ldp	x0,x2,[x29,#96]
-	ldr	x1,[x29,#112]
-	sub	x30,x30,#648		// rewind
-
-	ldp	x3,x4,[x0]
-	ldp	x5,x6,[x0,#2*8]
-	add	x1,x1,#14*8			// advance input pointer
-	ldp	x7,x8,[x0,#4*8]
-	add	x20,x20,x3
-	ldp	x9,x10,[x0,#6*8]
-	add	x21,x21,x4
-	add	x22,x22,x5
-	add	x23,x23,x6
-	stp	x20,x21,[x0]
-	add	x24,x24,x7
-	add	x25,x25,x8
-	stp	x22,x23,[x0,#2*8]
-	add	x26,x26,x9
-	add	x27,x27,x10
-	cmp	x1,x2
-	stp	x24,x25,[x0,#4*8]
-	stp	x26,x27,[x0,#6*8]
-	b.ne	.Loop
-
-	ldp	x19,x20,[x29,#16]
-	add	sp,sp,#4*8
-	ldp	x21,x22,[x29,#32]
-	ldp	x23,x24,[x29,#48]
-	ldp	x25,x26,[x29,#64]
-	ldp	x27,x28,[x29,#80]
-	ldp	x29,x30,[sp],#128
-	ret
-.size	sha512_block_data_order,.-sha512_block_data_order
-
-.align	6
-.type	.LK512,%object
-.LK512:
-	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
-	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
-	.quad	0x3956c25bf348b538,0x59f111f1b605d019
-	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
-	.quad	0xd807aa98a3030242,0x12835b0145706fbe
-	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
-	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
-	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
-	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
-	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
-	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
-	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
-	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
-	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
-	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
-	.quad	0x06ca6351e003826f,0x142929670a0e6e70
-	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
-	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
-	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
-	.quad	0x81c2c92e47edaee6,0x92722c851482353b
-	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
-	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
-	.quad	0xd192e819d6ef5218,0xd69906245565a910
-	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
-	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
-	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
-	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
-	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
-	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
-	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
-	.quad	0x90befffa23631e28,0xa4506cebde82bde9
-	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
-	.quad	0xca273eceea26619c,0xd186b8c721c0c207
-	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
-	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
-	.quad	0x113f9804bef90dae,0x1b710b35131c471b
-	.quad	0x28db77f523047d84,0x32caab7b40c72493
-	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
-	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
-	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
-	.quad	0	// terminator
-.size	.LK512,.-.LK512
-#ifndef	__KERNEL__
-.align	3
-.LOPENSSL_armcap_P:
-# ifdef	__ILP32__
-	.long	OPENSSL_armcap_P-.
-# else
-	.quad	OPENSSL_armcap_P-.
-# endif
-#endif
-.asciz	"SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
-.align	2
-#ifndef	__KERNEL__
-.comm	OPENSSL_armcap_P,4,4
-#endif
diff --git a/arch/arm64/include/asm/alternative-macros.h b/arch/arm64/include/asm/alternative-macros.h
index 8a078fc662ac..7e157ab6cd50 100644
--- a/arch/arm64/include/asm/alternative-macros.h
+++ b/arch/arm64/include/asm/alternative-macros.h
@@ -3,12 +3,10 @@
 #define __ASM_ALTERNATIVE_MACROS_H
 
 #include <asm/cpucaps.h>
+#include <asm/insn-def.h>
 
 #define ARM64_CB_PATCH ARM64_NCAPS
 
-/* A64 instructions are always 32 bits. */
-#define	AARCH64_INSN_SIZE		4
-
 #ifndef __ASSEMBLY__
 
 #include <linux/stringify.h>
@@ -197,11 +195,6 @@ alternative_endif
 #define _ALTERNATIVE_CFG(insn1, insn2, cap, cfg, ...)	\
 	alternative_insn insn1, insn2, cap, IS_ENABLED(cfg)
 
-.macro user_alt, label, oldinstr, newinstr, cond
-9999:	alternative_insn "\oldinstr", "\newinstr", \cond
-	_asm_extable 9999b, \label
-.endm
-
 #endif  /*  __ASSEMBLY__  */
 
 /*
diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h
index 934b9be582d2..4ad22c3135db 100644
--- a/arch/arm64/include/asm/arch_gicv3.h
+++ b/arch/arm64/include/asm/arch_gicv3.h
@@ -124,7 +124,8 @@ static inline u32 gic_read_rpr(void)
 #define gic_read_lpir(c)		readq_relaxed(c)
 #define gic_write_lpir(v, c)		writeq_relaxed(v, c)
 
-#define gic_flush_dcache_to_poc(a,l)	__flush_dcache_area((a), (l))
+#define gic_flush_dcache_to_poc(a,l)	\
+	dcache_clean_inval_poc((unsigned long)(a), (unsigned long)(a)+(l))
 
 #define gits_read_baser(c)		readq_relaxed(c)
 #define gits_write_baser(v, c)		writeq_relaxed(v, c)
diff --git a/arch/arm64/include/asm/asm-prototypes.h b/arch/arm64/include/asm/asm-prototypes.h
index 1c9a3a0c5fa5..ec1d9655f885 100644
--- a/arch/arm64/include/asm/asm-prototypes.h
+++ b/arch/arm64/include/asm/asm-prototypes.h
@@ -23,4 +23,10 @@ long long __ashlti3(long long a, int b);
 long long __ashrti3(long long a, int b);
 long long __lshrti3(long long a, int b);
 
+/*
+ * This function uses a custom calling convention and cannot be called from C so
+ * this prototype is not entirely accurate.
+ */
+void __hwasan_tag_mismatch(unsigned long addr, unsigned long access_info);
+
 #endif /* __ASM_PROTOTYPES_H */
diff --git a/arch/arm64/include/asm/asm_pointer_auth.h b/arch/arm64/include/asm/asm_pointer_auth.h
index 8ca2dc0661ee..f1bba5fc61c4 100644
--- a/arch/arm64/include/asm/asm_pointer_auth.h
+++ b/arch/arm64/include/asm/asm_pointer_auth.h
@@ -7,19 +7,7 @@
 #include <asm/cpufeature.h>
 #include <asm/sysreg.h>
 
-#ifdef CONFIG_ARM64_PTR_AUTH
-/*
- * thread.keys_user.ap* as offset exceeds the #imm offset range
- * so use the base value of ldp as thread.keys_user and offset as
- * thread.keys_user.ap*.
- */
-	.macro __ptrauth_keys_install_user tsk, tmp1, tmp2, tmp3
-	mov	\tmp1, #THREAD_KEYS_USER
-	add	\tmp1, \tsk, \tmp1
-	ldp	\tmp2, \tmp3, [\tmp1, #PTRAUTH_USER_KEY_APIA]
-	msr_s	SYS_APIAKEYLO_EL1, \tmp2
-	msr_s	SYS_APIAKEYHI_EL1, \tmp3
-	.endm
+#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL
 
 	.macro __ptrauth_keys_install_kernel_nosync tsk, tmp1, tmp2, tmp3
 	mov	\tmp1, #THREAD_KEYS_KERNEL
@@ -42,6 +30,33 @@ alternative_if ARM64_HAS_ADDRESS_AUTH
 alternative_else_nop_endif
 	.endm
 
+#else /* CONFIG_ARM64_PTR_AUTH_KERNEL */
+
+	.macro __ptrauth_keys_install_kernel_nosync tsk, tmp1, tmp2, tmp3
+	.endm
+
+	.macro ptrauth_keys_install_kernel_nosync tsk, tmp1, tmp2, tmp3
+	.endm
+
+	.macro ptrauth_keys_install_kernel tsk, tmp1, tmp2, tmp3
+	.endm
+
+#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */
+
+#ifdef CONFIG_ARM64_PTR_AUTH
+/*
+ * thread.keys_user.ap* as offset exceeds the #imm offset range
+ * so use the base value of ldp as thread.keys_user and offset as
+ * thread.keys_user.ap*.
+ */
+	.macro __ptrauth_keys_install_user tsk, tmp1, tmp2, tmp3
+	mov	\tmp1, #THREAD_KEYS_USER
+	add	\tmp1, \tsk, \tmp1
+	ldp	\tmp2, \tmp3, [\tmp1, #PTRAUTH_USER_KEY_APIA]
+	msr_s	SYS_APIAKEYLO_EL1, \tmp2
+	msr_s	SYS_APIAKEYHI_EL1, \tmp3
+	.endm
+
 	.macro __ptrauth_keys_init_cpu tsk, tmp1, tmp2, tmp3
 	mrs	\tmp1, id_aa64isar1_el1
 	ubfx	\tmp1, \tmp1, #ID_AA64ISAR1_APA_SHIFT, #8
@@ -64,17 +79,11 @@ alternative_else_nop_endif
 .Lno_addr_auth\@:
 	.endm
 
-#else /* CONFIG_ARM64_PTR_AUTH */
+#else /* !CONFIG_ARM64_PTR_AUTH */
 
 	.macro ptrauth_keys_install_user tsk, tmp1, tmp2, tmp3
 	.endm
 
-	.macro ptrauth_keys_install_kernel_nosync tsk, tmp1, tmp2, tmp3
-	.endm
-
-	.macro ptrauth_keys_install_kernel tsk, tmp1, tmp2, tmp3
-	.endm
-
 #endif /* CONFIG_ARM64_PTR_AUTH */
 
 #endif /* __ASM_ASM_POINTER_AUTH_H */
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 8418c1bd8f04..89faca0e740d 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -130,15 +130,27 @@ alternative_endif
 	.endm
 
 /*
- * Emit an entry into the exception table
+ * Create an exception table entry for `insn`, which will branch to `fixup`
+ * when an unhandled fault is taken.
  */
-	.macro		_asm_extable, from, to
+	.macro		_asm_extable, insn, fixup
 	.pushsection	__ex_table, "a"
 	.align		3
-	.long		(\from - .), (\to - .)
+	.long		(\insn - .), (\fixup - .)
 	.popsection
 	.endm
 
+/*
+ * Create an exception table entry for `insn` if `fixup` is provided. Otherwise
+ * do nothing.
+ */
+	.macro		_cond_extable, insn, fixup
+	.ifnc		\fixup,
+	_asm_extable	\insn, \fixup
+	.endif
+	.endm
+
+
 #define USER(l, x...)				\
 9999:	x;					\
 	_asm_extable	9999b, l
@@ -232,17 +244,25 @@ lr	.req	x30		// link register
 	 * @dst: destination register
 	 */
 #if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__)
-	.macro	this_cpu_offset, dst
+	.macro	get_this_cpu_offset, dst
 	mrs	\dst, tpidr_el2
 	.endm
 #else
-	.macro	this_cpu_offset, dst
+	.macro	get_this_cpu_offset, dst
 alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
 	mrs	\dst, tpidr_el1
 alternative_else
 	mrs	\dst, tpidr_el2
 alternative_endif
 	.endm
+
+	.macro	set_this_cpu_offset, src
+alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
+	msr	tpidr_el1, \src
+alternative_else
+	msr	tpidr_el2, \src
+alternative_endif
+	.endm
 #endif
 
 	/*
@@ -253,7 +273,7 @@ alternative_endif
 	.macro adr_this_cpu, dst, sym, tmp
 	adrp	\tmp, \sym
 	add	\dst, \tmp, #:lo12:\sym
-	this_cpu_offset \tmp
+	get_this_cpu_offset \tmp
 	add	\dst, \dst, \tmp
 	.endm
 
@@ -264,7 +284,7 @@ alternative_endif
 	 */
 	.macro ldr_this_cpu dst, sym, tmp
 	adr_l	\dst, \sym
-	this_cpu_offset \tmp
+	get_this_cpu_offset \tmp
 	ldr	\dst, [\dst, \tmp]
 	.endm
 
@@ -375,51 +395,53 @@ alternative_cb_end
 	bfi	\tcr, \tmp0, \pos, #3
 	.endm
 
+	.macro __dcache_op_workaround_clean_cache, op, addr
+alternative_if_not ARM64_WORKAROUND_CLEAN_CACHE
+	dc	\op, \addr
+alternative_else
+	dc	civac, \addr
+alternative_endif
+	.endm
+
 /*
  * Macro to perform a data cache maintenance for the interval
- * [kaddr, kaddr + size)
+ * [start, end)
  *
  * 	op:		operation passed to dc instruction
  * 	domain:		domain used in dsb instruciton
- * 	kaddr:		starting virtual address of the region
- * 	size:		size of the region
- * 	Corrupts:	kaddr, size, tmp1, tmp2
+ * 	start:          starting virtual address of the region
+ * 	end:            end virtual address of the region
+ * 	fixup:		optional label to branch to on user fault
+ * 	Corrupts:       start, end, tmp1, tmp2
  */
-	.macro __dcache_op_workaround_clean_cache, op, kaddr
-alternative_if_not ARM64_WORKAROUND_CLEAN_CACHE
-	dc	\op, \kaddr
-alternative_else
-	dc	civac, \kaddr
-alternative_endif
-	.endm
-
-	.macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2
+	.macro dcache_by_line_op op, domain, start, end, tmp1, tmp2, fixup
 	dcache_line_size \tmp1, \tmp2
-	add	\size, \kaddr, \size
 	sub	\tmp2, \tmp1, #1
-	bic	\kaddr, \kaddr, \tmp2
-9998:
+	bic	\start, \start, \tmp2
+.Ldcache_op\@:
 	.ifc	\op, cvau
-	__dcache_op_workaround_clean_cache \op, \kaddr
+	__dcache_op_workaround_clean_cache \op, \start
 	.else
 	.ifc	\op, cvac
-	__dcache_op_workaround_clean_cache \op, \kaddr
+	__dcache_op_workaround_clean_cache \op, \start
 	.else
 	.ifc	\op, cvap
-	sys	3, c7, c12, 1, \kaddr	// dc cvap
+	sys	3, c7, c12, 1, \start	// dc cvap
 	.else
 	.ifc	\op, cvadp
-	sys	3, c7, c13, 1, \kaddr	// dc cvadp
+	sys	3, c7, c13, 1, \start	// dc cvadp
 	.else
-	dc	\op, \kaddr
+	dc	\op, \start
 	.endif
 	.endif
 	.endif
 	.endif
-	add	\kaddr, \kaddr, \tmp1
-	cmp	\kaddr, \size
-	b.lo	9998b
+	add	\start, \start, \tmp1
+	cmp	\start, \end
+	b.lo	.Ldcache_op\@
 	dsb	\domain
+
+	_cond_extable .Ldcache_op\@, \fixup
 	.endm
 
 /*
@@ -427,20 +449,22 @@ alternative_endif
  * [start, end)
  *
  * 	start, end:	virtual addresses describing the region
- *	label:		A label to branch to on user fault.
+ *	fixup:		optional label to branch to on user fault
  * 	Corrupts:	tmp1, tmp2
  */
-	.macro invalidate_icache_by_line start, end, tmp1, tmp2, label
+	.macro invalidate_icache_by_line start, end, tmp1, tmp2, fixup
 	icache_line_size \tmp1, \tmp2
 	sub	\tmp2, \tmp1, #1
 	bic	\tmp2, \start, \tmp2
-9997:
-USER(\label, ic	ivau, \tmp2)			// invalidate I line PoU
+.Licache_op\@:
+	ic	ivau, \tmp2			// invalidate I line PoU
 	add	\tmp2, \tmp2, \tmp1
 	cmp	\tmp2, \end
-	b.lo	9997b
+	b.lo	.Licache_op\@
 	dsb	ish
 	isb
+
+	_cond_extable .Licache_op\@, \fixup
 	.endm
 
 /*
@@ -745,7 +769,7 @@ USER(\label, ic	ivau, \tmp2)			// invalidate I line PoU
 	cbz		\tmp, \lbl
 #endif
 	adr_l		\tmp, irq_stat + IRQ_CPUSTAT_SOFTIRQ_PENDING
-	this_cpu_offset	\tmp2
+	get_this_cpu_offset	\tmp2
 	ldr		w\tmp, [\tmp, \tmp2]
 	cbnz		w\tmp, \lbl	// yield on pending softirq in task context
 .Lnoyield_\@:
diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h
index b56a4b2bc248..c9979273d389 100644
--- a/arch/arm64/include/asm/atomic.h
+++ b/arch/arm64/include/asm/atomic.h
@@ -223,6 +223,4 @@ static __always_inline long arch_atomic64_dec_if_positive(atomic64_t *v)
 
 #define arch_atomic64_dec_if_positive		arch_atomic64_dec_if_positive
 
-#define ARCH_ATOMIC
-
 #endif /* __ASM_ATOMIC_H */
diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index a074459f8f2f..a9c0716e7440 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -47,7 +47,7 @@
  * cache before the transfer is done, causing old data to be seen by
  * the CPU.
  */
-#define ARCH_DMA_MINALIGN	(128)
+#define ARCH_DMA_MINALIGN	L1_CACHE_BYTES
 
 #ifdef CONFIG_KASAN_SW_TAGS
 #define ARCH_SLAB_MINALIGN	(1ULL << KASAN_SHADOW_SCALE_SHIFT)
diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h
index 52e5c1623224..543c997eb3b7 100644
--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -30,45 +30,58 @@
  *	the implementation assumes non-aliasing VIPT D-cache and (aliasing)
  *	VIPT I-cache.
  *
- *	flush_icache_range(start, end)
+ *	All functions below apply to the interval [start, end)
+ *		- start  - virtual start address (inclusive)
+ *		- end    - virtual end address (exclusive)
  *
- *		Ensure coherency between the I-cache and the D-cache in the
- *		region described by start, end.
- *		- start  - virtual start address
- *		- end    - virtual end address
+ *	caches_clean_inval_pou(start, end)
  *
- *	invalidate_icache_range(start, end)
+ *		Ensure coherency between the I-cache and the D-cache region to
+ *		the Point of Unification.
  *
- *		Invalidate the I-cache in the region described by start, end.
- *		- start  - virtual start address
- *		- end    - virtual end address
+ *	caches_clean_inval_user_pou(start, end)
  *
- *	__flush_cache_user_range(start, end)
+ *		Ensure coherency between the I-cache and the D-cache region to
+ *		the Point of Unification.
+ *		Use only if the region might access user memory.
  *
- *		Ensure coherency between the I-cache and the D-cache in the
- *		region described by start, end.
- *		- start  - virtual start address
- *		- end    - virtual end address
+ *	icache_inval_pou(start, end)
  *
- *	__flush_dcache_area(kaddr, size)
+ *		Invalidate I-cache region to the Point of Unification.
  *
- *		Ensure that the data held in page is written back.
- *		- kaddr  - page address
- *		- size   - region size
+ *	dcache_clean_inval_poc(start, end)
+ *
+ *		Clean and invalidate D-cache region to the Point of Coherency.
+ *
+ *	dcache_inval_poc(start, end)
+ *
+ *		Invalidate D-cache region to the Point of Coherency.
+ *
+ *	dcache_clean_poc(start, end)
+ *
+ *		Clean D-cache region to the Point of Coherency.
+ *
+ *	dcache_clean_pop(start, end)
+ *
+ *		Clean D-cache region to the Point of Persistence.
+ *
+ *	dcache_clean_pou(start, end)
+ *
+ *		Clean D-cache region to the Point of Unification.
  */
-extern void __flush_icache_range(unsigned long start, unsigned long end);
-extern int  invalidate_icache_range(unsigned long start, unsigned long end);
-extern void __flush_dcache_area(void *addr, size_t len);
-extern void __inval_dcache_area(void *addr, size_t len);
-extern void __clean_dcache_area_poc(void *addr, size_t len);
-extern void __clean_dcache_area_pop(void *addr, size_t len);
-extern void __clean_dcache_area_pou(void *addr, size_t len);
-extern long __flush_cache_user_range(unsigned long start, unsigned long end);
-extern void sync_icache_aliases(void *kaddr, unsigned long len);
+extern void caches_clean_inval_pou(unsigned long start, unsigned long end);
+extern void icache_inval_pou(unsigned long start, unsigned long end);
+extern void dcache_clean_inval_poc(unsigned long start, unsigned long end);
+extern void dcache_inval_poc(unsigned long start, unsigned long end);
+extern void dcache_clean_poc(unsigned long start, unsigned long end);
+extern void dcache_clean_pop(unsigned long start, unsigned long end);
+extern void dcache_clean_pou(unsigned long start, unsigned long end);
+extern long caches_clean_inval_user_pou(unsigned long start, unsigned long end);
+extern void sync_icache_aliases(unsigned long start, unsigned long end);
 
 static inline void flush_icache_range(unsigned long start, unsigned long end)
 {
-	__flush_icache_range(start, end);
+	caches_clean_inval_pou(start, end);
 
 	/*
 	 * IPI all online CPUs so that they undergo a context synchronization
@@ -122,7 +135,7 @@ extern void copy_to_user_page(struct vm_area_struct *, struct page *,
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 extern void flush_dcache_page(struct page *);
 
-static __always_inline void __flush_icache_all(void)
+static __always_inline void icache_inval_all_pou(void)
 {
 	if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
 		return;
diff --git a/arch/arm64/include/asm/compiler.h b/arch/arm64/include/asm/compiler.h
index 6fb2e6bcc392..dc3ea4080e2e 100644
--- a/arch/arm64/include/asm/compiler.h
+++ b/arch/arm64/include/asm/compiler.h
@@ -23,4 +23,20 @@
 #define __builtin_return_address(val)					\
 	(void *)(ptrauth_clear_pac((unsigned long)__builtin_return_address(val)))
 
+#ifdef CONFIG_CFI_CLANG
+/*
+ * With CONFIG_CFI_CLANG, the compiler replaces function address
+ * references with the address of the function's CFI jump table
+ * entry. The function_nocfi macro always returns the address of the
+ * actual function instead.
+ */
+#define function_nocfi(x) ({						\
+	void *addr;							\
+	asm("adrp %0, " __stringify(x) "\n\t"				\
+	    "add  %0, %0, :lo12:" __stringify(x)			\
+	    : "=r" (addr));						\
+	addr;								\
+})
+#endif
+
 #endif /* __ASM_COMPILER_H */
diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h
index 7faae6ff3ab4..0f6d16faa540 100644
--- a/arch/arm64/include/asm/cpu.h
+++ b/arch/arm64/include/asm/cpu.h
@@ -12,26 +12,7 @@
 /*
  * Records attributes of an individual CPU.
  */
-struct cpuinfo_arm64 {
-	struct cpu	cpu;
-	struct kobject	kobj;
-	u32		reg_ctr;
-	u32		reg_cntfrq;
-	u32		reg_dczid;
-	u32		reg_midr;
-	u32		reg_revidr;
-
-	u64		reg_id_aa64dfr0;
-	u64		reg_id_aa64dfr1;
-	u64		reg_id_aa64isar0;
-	u64		reg_id_aa64isar1;
-	u64		reg_id_aa64mmfr0;
-	u64		reg_id_aa64mmfr1;
-	u64		reg_id_aa64mmfr2;
-	u64		reg_id_aa64pfr0;
-	u64		reg_id_aa64pfr1;
-	u64		reg_id_aa64zfr0;
-
+struct cpuinfo_32bit {
 	u32		reg_id_dfr0;
 	u32		reg_id_dfr1;
 	u32		reg_id_isar0;
@@ -54,6 +35,30 @@ struct cpuinfo_arm64 {
 	u32		reg_mvfr0;
 	u32		reg_mvfr1;
 	u32		reg_mvfr2;
+};
+
+struct cpuinfo_arm64 {
+	struct cpu	cpu;
+	struct kobject	kobj;
+	u64		reg_ctr;
+	u64		reg_cntfrq;
+	u64		reg_dczid;
+	u64		reg_midr;
+	u64		reg_revidr;
+	u64		reg_gmid;
+
+	u64		reg_id_aa64dfr0;
+	u64		reg_id_aa64dfr1;
+	u64		reg_id_aa64isar0;
+	u64		reg_id_aa64isar1;
+	u64		reg_id_aa64mmfr0;
+	u64		reg_id_aa64mmfr1;
+	u64		reg_id_aa64mmfr2;
+	u64		reg_id_aa64pfr0;
+	u64		reg_id_aa64pfr1;
+	u64		reg_id_aa64zfr0;
+
+	struct cpuinfo_32bit	aarch32;
 
 	/* pseudo-ZCR for recording maximum ZCR_EL1 LEN value: */
 	u64		reg_zcr;
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 338840c00e8e..9bb9d11750d7 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -619,6 +619,13 @@ static inline bool id_aa64pfr0_sve(u64 pfr0)
 	return val > 0;
 }
 
+static inline bool id_aa64pfr1_mte(u64 pfr1)
+{
+	u32 val = cpuid_feature_extract_unsigned_field(pfr1, ID_AA64PFR1_MTE_SHIFT);
+
+	return val >= ID_AA64PFR1_MTE;
+}
+
 void __init setup_cpu_features(void);
 void check_local_cpu_capabilities(void);
 
@@ -630,9 +637,15 @@ static inline bool cpu_supports_mixed_endian_el0(void)
 	return id_aa64mmfr0_mixed_endian_el0(read_cpuid(ID_AA64MMFR0_EL1));
 }
 
+const struct cpumask *system_32bit_el0_cpumask(void);
+DECLARE_STATIC_KEY_FALSE(arm64_mismatched_32bit_el0);
+
 static inline bool system_supports_32bit_el0(void)
 {
-	return cpus_have_const_cap(ARM64_HAS_32BIT_EL0);
+	u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
+
+	return static_branch_unlikely(&arm64_mismatched_32bit_el0) ||
+	       id_aa64pfr0_32bit_el0(pfr0);
 }
 
 static inline bool system_supports_4kb_granule(void)
diff --git a/arch/arm64/include/asm/cpuidle.h b/arch/arm64/include/asm/cpuidle.h
index 3c5ddb429ea2..14a19d1141bd 100644
--- a/arch/arm64/include/asm/cpuidle.h
+++ b/arch/arm64/include/asm/cpuidle.h
@@ -18,4 +18,39 @@ static inline int arm_cpuidle_suspend(int index)
 	return -EOPNOTSUPP;
 }
 #endif
+
+#ifdef CONFIG_ARM64_PSEUDO_NMI
+#include <asm/arch_gicv3.h>
+
+struct arm_cpuidle_irq_context {
+	unsigned long pmr;
+	unsigned long daif_bits;
+};
+
+#define arm_cpuidle_save_irq_context(__c)				\
+	do {								\
+		struct arm_cpuidle_irq_context *c = __c;		\
+		if (system_uses_irq_prio_masking()) {			\
+			c->daif_bits = read_sysreg(daif);		\
+			write_sysreg(c->daif_bits | PSR_I_BIT | PSR_F_BIT, \
+				     daif);				\
+			c->pmr = gic_read_pmr();			\
+			gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); \
+		}							\
+	} while (0)
+
+#define arm_cpuidle_restore_irq_context(__c)				\
+	do {								\
+		struct arm_cpuidle_irq_context *c = __c;		\
+		if (system_uses_irq_prio_masking()) {			\
+			gic_write_pmr(c->pmr);				\
+			write_sysreg(c->daif_bits, daif);		\
+		}							\
+	} while (0)
+#else
+struct arm_cpuidle_irq_context { };
+
+#define arm_cpuidle_save_irq_context(c)		(void)c
+#define arm_cpuidle_restore_irq_context(c)	(void)c
+#endif
 #endif
diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
index 3578aba9c608..1bed37eb013a 100644
--- a/arch/arm64/include/asm/efi.h
+++ b/arch/arm64/include/asm/efi.h
@@ -137,7 +137,7 @@ void efi_virtmap_unload(void);
 
 static inline void efi_capsule_flush_cache_range(void *addr, int size)
 {
-	__flush_dcache_area(addr, size);
+	dcache_clean_inval_poc((unsigned long)addr, (unsigned long)addr + size);
 }
 
 #endif /* _ASM_EFI_H */
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index 6546158d2f2d..4afbc45b8bb0 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -31,20 +31,35 @@ static inline u32 disr_to_esr(u64 disr)
 	return esr;
 }
 
-asmlinkage void el1_sync_handler(struct pt_regs *regs);
-asmlinkage void el0_sync_handler(struct pt_regs *regs);
-asmlinkage void el0_sync_compat_handler(struct pt_regs *regs);
+asmlinkage void handle_bad_stack(struct pt_regs *regs);
 
-asmlinkage void noinstr enter_el1_irq_or_nmi(struct pt_regs *regs);
-asmlinkage void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs);
+asmlinkage void el1t_64_sync_handler(struct pt_regs *regs);
+asmlinkage void el1t_64_irq_handler(struct pt_regs *regs);
+asmlinkage void el1t_64_fiq_handler(struct pt_regs *regs);
+asmlinkage void el1t_64_error_handler(struct pt_regs *regs);
+
+asmlinkage void el1h_64_sync_handler(struct pt_regs *regs);
+asmlinkage void el1h_64_irq_handler(struct pt_regs *regs);
+asmlinkage void el1h_64_fiq_handler(struct pt_regs *regs);
+asmlinkage void el1h_64_error_handler(struct pt_regs *regs);
+
+asmlinkage void el0t_64_sync_handler(struct pt_regs *regs);
+asmlinkage void el0t_64_irq_handler(struct pt_regs *regs);
+asmlinkage void el0t_64_fiq_handler(struct pt_regs *regs);
+asmlinkage void el0t_64_error_handler(struct pt_regs *regs);
+
+asmlinkage void el0t_32_sync_handler(struct pt_regs *regs);
+asmlinkage void el0t_32_irq_handler(struct pt_regs *regs);
+asmlinkage void el0t_32_fiq_handler(struct pt_regs *regs);
+asmlinkage void el0t_32_error_handler(struct pt_regs *regs);
+
+asmlinkage void call_on_irq_stack(struct pt_regs *regs,
+				  void (*func)(struct pt_regs *));
 asmlinkage void enter_from_user_mode(void);
 asmlinkage void exit_to_user_mode(void);
-void arm64_enter_nmi(struct pt_regs *regs);
-void arm64_exit_nmi(struct pt_regs *regs);
 void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs);
 void do_undefinstr(struct pt_regs *regs);
 void do_bti(struct pt_regs *regs);
-asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr);
 void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
 			struct pt_regs *regs);
 void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs);
@@ -57,4 +72,7 @@ void do_cp15instr(unsigned int esr, struct pt_regs *regs);
 void do_el0_svc(struct pt_regs *regs);
 void do_el0_svc_compat(struct pt_regs *regs);
 void do_ptrauth_fault(struct pt_regs *regs, unsigned int esr);
+void do_serror(struct pt_regs *regs, unsigned int esr);
+
+void panic_bad_stack(struct pt_regs *regs, unsigned int esr, unsigned long far);
 #endif	/* __ASM_EXCEPTION_H */
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index 2599504674b5..c072161d5c65 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -69,7 +69,7 @@ static inline void *sve_pffr(struct thread_struct *thread)
 extern void sve_save_state(void *state, u32 *pfpsr);
 extern void sve_load_state(void const *state, u32 const *pfpsr,
 			   unsigned long vq_minus_1);
-extern void sve_flush_live(void);
+extern void sve_flush_live(unsigned long vq_minus_1);
 extern void sve_load_from_fpsimd_state(struct user_fpsimd_state const *state,
 				       unsigned long vq_minus_1);
 extern unsigned int sve_get_vl(void);
diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h
index a2563992d2dc..059204477ce6 100644
--- a/arch/arm64/include/asm/fpsimdmacros.h
+++ b/arch/arm64/include/asm/fpsimdmacros.h
@@ -213,8 +213,10 @@
 	mov	v\nz\().16b, v\nz\().16b
 .endm
 
-.macro sve_flush
+.macro sve_flush_z
  _for n, 0, 31, _sve_flush_z	\n
+.endm
+.macro sve_flush_p_ffr
  _for n, 0, 15, _sve_pfalse	\n
 		_sve_wrffr	0
 .endm
diff --git a/arch/arm64/include/asm/insn-def.h b/arch/arm64/include/asm/insn-def.h
new file mode 100644
index 000000000000..2c075f615c6a
--- /dev/null
+++ b/arch/arm64/include/asm/insn-def.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __ASM_INSN_DEF_H
+#define __ASM_INSN_DEF_H
+
+/* A64 instructions are always 32 bits. */
+#define	AARCH64_INSN_SIZE		4
+
+#endif /* __ASM_INSN_DEF_H */
diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h
index 4ebb9c054ccc..6b776c8667b2 100644
--- a/arch/arm64/include/asm/insn.h
+++ b/arch/arm64/include/asm/insn.h
@@ -10,7 +10,7 @@
 #include <linux/build_bug.h>
 #include <linux/types.h>
 
-#include <asm/alternative.h>
+#include <asm/insn-def.h>
 
 #ifndef __ASSEMBLY__
 /*
@@ -30,6 +30,7 @@
  */
 enum aarch64_insn_encoding_class {
 	AARCH64_INSN_CLS_UNKNOWN,	/* UNALLOCATED */
+	AARCH64_INSN_CLS_SVE,		/* SVE instructions */
 	AARCH64_INSN_CLS_DP_IMM,	/* Data processing - immediate */
 	AARCH64_INSN_CLS_DP_REG,	/* Data processing - register */
 	AARCH64_INSN_CLS_DP_FPSIMD,	/* Data processing - SIMD and FP */
@@ -294,6 +295,12 @@ __AARCH64_INSN_FUNCS(adr,	0x9F000000, 0x10000000)
 __AARCH64_INSN_FUNCS(adrp,	0x9F000000, 0x90000000)
 __AARCH64_INSN_FUNCS(prfm,	0x3FC00000, 0x39800000)
 __AARCH64_INSN_FUNCS(prfm_lit,	0xFF000000, 0xD8000000)
+__AARCH64_INSN_FUNCS(store_imm,	0x3FC00000, 0x39000000)
+__AARCH64_INSN_FUNCS(load_imm,	0x3FC00000, 0x39400000)
+__AARCH64_INSN_FUNCS(store_pre,	0x3FE00C00, 0x38000C00)
+__AARCH64_INSN_FUNCS(load_pre,	0x3FE00C00, 0x38400C00)
+__AARCH64_INSN_FUNCS(store_post,	0x3FE00C00, 0x38000400)
+__AARCH64_INSN_FUNCS(load_post,	0x3FE00C00, 0x38400400)
 __AARCH64_INSN_FUNCS(str_reg,	0x3FE0EC00, 0x38206800)
 __AARCH64_INSN_FUNCS(ldadd,	0x3F20FC00, 0x38200000)
 __AARCH64_INSN_FUNCS(ldr_reg,	0x3FE0EC00, 0x38606800)
@@ -302,6 +309,8 @@ __AARCH64_INSN_FUNCS(ldrsw_lit,	0xFF000000, 0x98000000)
 __AARCH64_INSN_FUNCS(exclusive,	0x3F800000, 0x08000000)
 __AARCH64_INSN_FUNCS(load_ex,	0x3F400000, 0x08400000)
 __AARCH64_INSN_FUNCS(store_ex,	0x3F400000, 0x08000000)
+__AARCH64_INSN_FUNCS(stp,	0x7FC00000, 0x29000000)
+__AARCH64_INSN_FUNCS(ldp,	0x7FC00000, 0x29400000)
 __AARCH64_INSN_FUNCS(stp_post,	0x7FC00000, 0x28800000)
 __AARCH64_INSN_FUNCS(ldp_post,	0x7FC00000, 0x28C00000)
 __AARCH64_INSN_FUNCS(stp_pre,	0x7FC00000, 0x29800000)
@@ -334,6 +343,7 @@ __AARCH64_INSN_FUNCS(rev64,	0x7FFFFC00, 0x5AC00C00)
 __AARCH64_INSN_FUNCS(and,	0x7F200000, 0x0A000000)
 __AARCH64_INSN_FUNCS(bic,	0x7F200000, 0x0A200000)
 __AARCH64_INSN_FUNCS(orr,	0x7F200000, 0x2A000000)
+__AARCH64_INSN_FUNCS(mov_reg,	0x7FE0FFE0, 0x2A0003E0)
 __AARCH64_INSN_FUNCS(orn,	0x7F200000, 0x2A200000)
 __AARCH64_INSN_FUNCS(eor,	0x7F200000, 0x4A000000)
 __AARCH64_INSN_FUNCS(eon,	0x7F200000, 0x4A200000)
@@ -368,6 +378,14 @@ __AARCH64_INSN_FUNCS(eret_auth,	0xFFFFFBFF, 0xD69F0BFF)
 __AARCH64_INSN_FUNCS(mrs,	0xFFF00000, 0xD5300000)
 __AARCH64_INSN_FUNCS(msr_imm,	0xFFF8F01F, 0xD500401F)
 __AARCH64_INSN_FUNCS(msr_reg,	0xFFF00000, 0xD5100000)
+__AARCH64_INSN_FUNCS(dmb,	0xFFFFF0FF, 0xD50330BF)
+__AARCH64_INSN_FUNCS(dsb_base,	0xFFFFF0FF, 0xD503309F)
+__AARCH64_INSN_FUNCS(dsb_nxs,	0xFFFFF3FF, 0xD503323F)
+__AARCH64_INSN_FUNCS(isb,	0xFFFFF0FF, 0xD50330DF)
+__AARCH64_INSN_FUNCS(sb,	0xFFFFFFFF, 0xD50330FF)
+__AARCH64_INSN_FUNCS(clrex,	0xFFFFF0FF, 0xD503305F)
+__AARCH64_INSN_FUNCS(ssbb,	0xFFFFFFFF, 0xD503309F)
+__AARCH64_INSN_FUNCS(pssbb,	0xFFFFFFFF, 0xD503349F)
 
 #undef	__AARCH64_INSN_FUNCS
 
@@ -379,8 +397,47 @@ static inline bool aarch64_insn_is_adr_adrp(u32 insn)
 	return aarch64_insn_is_adr(insn) || aarch64_insn_is_adrp(insn);
 }
 
-int aarch64_insn_read(void *addr, u32 *insnp);
-int aarch64_insn_write(void *addr, u32 insn);
+static inline bool aarch64_insn_is_dsb(u32 insn)
+{
+	return aarch64_insn_is_dsb_base(insn) || aarch64_insn_is_dsb_nxs(insn);
+}
+
+static inline bool aarch64_insn_is_barrier(u32 insn)
+{
+	return aarch64_insn_is_dmb(insn) || aarch64_insn_is_dsb(insn) ||
+	       aarch64_insn_is_isb(insn) || aarch64_insn_is_sb(insn) ||
+	       aarch64_insn_is_clrex(insn) || aarch64_insn_is_ssbb(insn) ||
+	       aarch64_insn_is_pssbb(insn);
+}
+
+static inline bool aarch64_insn_is_store_single(u32 insn)
+{
+	return aarch64_insn_is_store_imm(insn) ||
+	       aarch64_insn_is_store_pre(insn) ||
+	       aarch64_insn_is_store_post(insn);
+}
+
+static inline bool aarch64_insn_is_store_pair(u32 insn)
+{
+	return aarch64_insn_is_stp(insn) ||
+	       aarch64_insn_is_stp_pre(insn) ||
+	       aarch64_insn_is_stp_post(insn);
+}
+
+static inline bool aarch64_insn_is_load_single(u32 insn)
+{
+	return aarch64_insn_is_load_imm(insn) ||
+	       aarch64_insn_is_load_pre(insn) ||
+	       aarch64_insn_is_load_post(insn);
+}
+
+static inline bool aarch64_insn_is_load_pair(u32 insn)
+{
+	return aarch64_insn_is_ldp(insn) ||
+	       aarch64_insn_is_ldp_pre(insn) ||
+	       aarch64_insn_is_ldp_post(insn);
+}
+
 enum aarch64_insn_encoding_class aarch64_get_insn_class(u32 insn);
 bool aarch64_insn_uses_literal(u32 insn);
 bool aarch64_insn_is_branch(u32 insn);
@@ -487,9 +544,6 @@ u32 aarch64_insn_gen_prefetch(enum aarch64_insn_register base,
 s32 aarch64_get_branch_offset(u32 insn);
 u32 aarch64_set_branch_offset(u32 insn, s32 offset);
 
-int aarch64_insn_patch_text_nosync(void *addr, u32 insn);
-int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt);
-
 s32 aarch64_insn_adrp_get_offset(u32 insn);
 u32 aarch64_insn_adrp_set_offset(u32 insn, s32 offset);
 
@@ -506,6 +560,7 @@ u32 aarch32_insn_mcr_extract_crm(u32 insn);
 
 typedef bool (pstate_check_t)(unsigned long);
 extern pstate_check_t * const aarch32_opcode_cond_checks[16];
+
 #endif /* __ASSEMBLY__ */
 
 #endif	/* __ASM_INSN_H */
diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h
index d44df9d62fc9..3512184cfec1 100644
--- a/arch/arm64/include/asm/kernel-pgtable.h
+++ b/arch/arm64/include/asm/kernel-pgtable.h
@@ -18,9 +18,9 @@
  * 64K (section size = 512M).
  */
 #ifdef CONFIG_ARM64_4K_PAGES
-#define ARM64_SWAPPER_USES_SECTION_MAPS 1
+#define ARM64_KERNEL_USES_PMD_MAPS 1
 #else
-#define ARM64_SWAPPER_USES_SECTION_MAPS 0
+#define ARM64_KERNEL_USES_PMD_MAPS 0
 #endif
 
 /*
@@ -33,7 +33,7 @@
  * VA range, so pages required to map highest possible PA are reserved in all
  * cases.
  */
-#if ARM64_SWAPPER_USES_SECTION_MAPS
+#if ARM64_KERNEL_USES_PMD_MAPS
 #define SWAPPER_PGTABLE_LEVELS	(CONFIG_PGTABLE_LEVELS - 1)
 #define IDMAP_PGTABLE_LEVELS	(ARM64_HW_PGTABLE_LEVELS(PHYS_MASK_SHIFT) - 1)
 #else
@@ -90,9 +90,9 @@
 #define IDMAP_DIR_SIZE		(IDMAP_PGTABLE_LEVELS * PAGE_SIZE)
 
 /* Initial memory map size */
-#if ARM64_SWAPPER_USES_SECTION_MAPS
-#define SWAPPER_BLOCK_SHIFT	SECTION_SHIFT
-#define SWAPPER_BLOCK_SIZE	SECTION_SIZE
+#if ARM64_KERNEL_USES_PMD_MAPS
+#define SWAPPER_BLOCK_SHIFT	PMD_SHIFT
+#define SWAPPER_BLOCK_SIZE	PMD_SIZE
 #define SWAPPER_TABLE_SHIFT	PUD_SHIFT
 #else
 #define SWAPPER_BLOCK_SHIFT	PAGE_SHIFT
@@ -100,16 +100,13 @@
 #define SWAPPER_TABLE_SHIFT	PMD_SHIFT
 #endif
 
-/* The size of the initial kernel direct mapping */
-#define SWAPPER_INIT_MAP_SIZE	(_AC(1, UL) << SWAPPER_TABLE_SHIFT)
-
 /*
  * Initial memory map attributes.
  */
 #define SWAPPER_PTE_FLAGS	(PTE_TYPE_PAGE | PTE_AF | PTE_SHARED)
 #define SWAPPER_PMD_FLAGS	(PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S)
 
-#if ARM64_SWAPPER_USES_SECTION_MAPS
+#if ARM64_KERNEL_USES_PMD_MAPS
 #define SWAPPER_MM_MMUFLAGS	(PMD_ATTRINDX(MT_NORMAL) | SWAPPER_PMD_FLAGS)
 #else
 #define SWAPPER_MM_MMUFLAGS	(PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS)
@@ -125,7 +122,7 @@
 #if defined(CONFIG_ARM64_4K_PAGES)
 #define ARM64_MEMSTART_SHIFT		PUD_SHIFT
 #elif defined(CONFIG_ARM64_16K_PAGES)
-#define ARM64_MEMSTART_SHIFT		(PMD_SHIFT + 5)
+#define ARM64_MEMSTART_SHIFT		CONT_PMD_SHIFT
 #else
 #define ARM64_MEMSTART_SHIFT		PMD_SHIFT
 #endif
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 692c9049befa..d436831dd706 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -12,7 +12,8 @@
 #include <asm/types.h>
 
 /* Hyp Configuration Register (HCR) bits */
-#define HCR_ATA		(UL(1) << 56)
+#define HCR_ATA_SHIFT	56
+#define HCR_ATA		(UL(1) << HCR_ATA_SHIFT)
 #define HCR_FWB		(UL(1) << 46)
 #define HCR_API		(UL(1) << 41)
 #define HCR_APK		(UL(1) << 40)
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 5e9b33cbac51..9f0bf2109be7 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -8,6 +8,7 @@
 #define __ARM_KVM_ASM_H__
 
 #include <asm/hyp_image.h>
+#include <asm/insn.h>
 #include <asm/virt.h>
 
 #define ARM_EXIT_WITH_SERROR_BIT  31
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 01b9857757f2..fd418955e31e 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -84,6 +84,9 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
 	if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) ||
 	    vcpu_el1_is_32bit(vcpu))
 		vcpu->arch.hcr_el2 |= HCR_TID2;
+
+	if (kvm_has_mte(vcpu->kvm))
+		vcpu->arch.hcr_el2 |= HCR_ATA;
 }
 
 static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 7cd7d5c8c4bc..41911585ae0c 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -46,6 +46,7 @@
 #define KVM_REQ_VCPU_RESET	KVM_ARCH_REQ(2)
 #define KVM_REQ_RECORD_STEAL	KVM_ARCH_REQ(3)
 #define KVM_REQ_RELOAD_GICv4	KVM_ARCH_REQ(4)
+#define KVM_REQ_RELOAD_PMU	KVM_ARCH_REQ(5)
 
 #define KVM_DIRTY_LOG_MANUAL_CAPS   (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
 				     KVM_DIRTY_LOG_INITIALLY_SET)
@@ -132,6 +133,9 @@ struct kvm_arch {
 
 	u8 pfr0_csv2;
 	u8 pfr0_csv3;
+
+	/* Memory Tagging Extension enabled for the guest */
+	bool mte_enabled;
 };
 
 struct kvm_vcpu_fault_info {
@@ -206,6 +210,12 @@ enum vcpu_sysreg {
 	CNTP_CVAL_EL0,
 	CNTP_CTL_EL0,
 
+	/* Memory Tagging Extension registers */
+	RGSR_EL1,	/* Random Allocation Tag Seed Register */
+	GCR_EL1,	/* Tag Control Register */
+	TFSR_EL1,	/* Tag Fault Status Register (EL1) */
+	TFSRE0_EL1,	/* Tag Fault Status Register (EL0) */
+
 	/* 32bit specific registers. Keep them at the end of the range */
 	DACR32_EL2,	/* Domain Access Control Register */
 	IFSR32_EL2,	/* Instruction Fault Status Register */
@@ -556,16 +566,11 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg)
 }
 
 struct kvm_vm_stat {
-	ulong remote_tlb_flush;
+	struct kvm_vm_stat_generic generic;
 };
 
 struct kvm_vcpu_stat {
-	u64 halt_successful_poll;
-	u64 halt_attempted_poll;
-	u64 halt_poll_success_ns;
-	u64 halt_poll_fail_ns;
-	u64 halt_poll_invalid;
-	u64 halt_wakeup;
+	struct kvm_vcpu_stat_generic generic;
 	u64 hvc_exit_stat;
 	u64 wfe_exit_stat;
 	u64 wfi_exit_stat;
@@ -721,6 +726,9 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
 int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
 			       struct kvm_device_attr *attr);
 
+long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
+				struct kvm_arm_copy_mte_tags *copy_tags);
+
 /* Guest/host FPSIMD coordination helpers */
 int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu);
@@ -769,6 +777,7 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
 #define kvm_arm_vcpu_sve_finalized(vcpu) \
 	((vcpu)->arch.flags & KVM_ARM64_VCPU_SVE_FINALIZED)
 
+#define kvm_has_mte(kvm) (system_supports_mte() && (kvm)->arch.mte_enabled)
 #define kvm_vcpu_has_pmu(vcpu)					\
 	(test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))
 
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 25ed956f9af1..b52c5c4b9a3d 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -180,17 +180,16 @@ static inline void *__kvm_vector_slot2addr(void *base,
 
 struct kvm;
 
-#define kvm_flush_dcache_to_poc(a,l)	__flush_dcache_area((a), (l))
+#define kvm_flush_dcache_to_poc(a,l)	\
+	dcache_clean_inval_poc((unsigned long)(a), (unsigned long)(a)+(l))
 
 static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
 {
 	return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
 }
 
-static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
+static inline void __clean_dcache_guest_page(void *va, size_t size)
 {
-	void *va = page_address(pfn_to_page(pfn));
-
 	/*
 	 * With FWB, we ensure that the guest always accesses memory using
 	 * cacheable attributes, and we don't have to clean to PoC when
@@ -203,18 +202,14 @@ static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
 	kvm_flush_dcache_to_poc(va, size);
 }
 
-static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
-						  unsigned long size)
+static inline void __invalidate_icache_guest_page(void *va, size_t size)
 {
 	if (icache_is_aliasing()) {
 		/* any kind of VIPT cache */
-		__flush_icache_all();
+		icache_inval_all_pou();
 	} else if (is_kernel_in_hyp_mode() || !icache_is_vpipt()) {
 		/* PIPT or VPIPT at EL2 (see comment in __kvm_tlb_flush_vmid_ipa) */
-		void *va = page_address(pfn_to_page(pfn));
-
-		invalidate_icache_range((unsigned long)va,
-					(unsigned long)va + size);
+		icache_inval_pou((unsigned long)va, (unsigned long)va + size);
 	}
 }
 
diff --git a/arch/arm64/include/asm/kvm_mte.h b/arch/arm64/include/asm/kvm_mte.h
new file mode 100644
index 000000000000..de002636eb1f
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_mte.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020-2021 ARM Ltd.
+ */
+#ifndef __ASM_KVM_MTE_H
+#define __ASM_KVM_MTE_H
+
+#ifdef __ASSEMBLY__
+
+#include <asm/sysreg.h>
+
+#ifdef CONFIG_ARM64_MTE
+
+.macro mte_switch_to_guest g_ctxt, h_ctxt, reg1
+alternative_if_not ARM64_MTE
+	b	.L__skip_switch\@
+alternative_else_nop_endif
+	mrs	\reg1, hcr_el2
+	tbz	\reg1, #(HCR_ATA_SHIFT), .L__skip_switch\@
+
+	mrs_s	\reg1, SYS_RGSR_EL1
+	str	\reg1, [\h_ctxt, #CPU_RGSR_EL1]
+	mrs_s	\reg1, SYS_GCR_EL1
+	str	\reg1, [\h_ctxt, #CPU_GCR_EL1]
+
+	ldr	\reg1, [\g_ctxt, #CPU_RGSR_EL1]
+	msr_s	SYS_RGSR_EL1, \reg1
+	ldr	\reg1, [\g_ctxt, #CPU_GCR_EL1]
+	msr_s	SYS_GCR_EL1, \reg1
+
+.L__skip_switch\@:
+.endm
+
+.macro mte_switch_to_hyp g_ctxt, h_ctxt, reg1
+alternative_if_not ARM64_MTE
+	b	.L__skip_switch\@
+alternative_else_nop_endif
+	mrs	\reg1, hcr_el2
+	tbz	\reg1, #(HCR_ATA_SHIFT), .L__skip_switch\@
+
+	mrs_s	\reg1, SYS_RGSR_EL1
+	str	\reg1, [\g_ctxt, #CPU_RGSR_EL1]
+	mrs_s	\reg1, SYS_GCR_EL1
+	str	\reg1, [\g_ctxt, #CPU_GCR_EL1]
+
+	ldr	\reg1, [\h_ctxt, #CPU_RGSR_EL1]
+	msr_s	SYS_RGSR_EL1, \reg1
+	ldr	\reg1, [\h_ctxt, #CPU_GCR_EL1]
+	msr_s	SYS_GCR_EL1, \reg1
+
+	isb
+
+.L__skip_switch\@:
+.endm
+
+#else /* !CONFIG_ARM64_MTE */
+
+.macro mte_switch_to_guest g_ctxt, h_ctxt, reg1
+.endm
+
+.macro mte_switch_to_hyp g_ctxt, h_ctxt, reg1
+.endm
+
+#endif /* CONFIG_ARM64_MTE */
+#endif /* __ASSEMBLY__ */
+#endif /* __ASM_KVM_MTE_H */
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index c3674c47d48c..f004c0115d89 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -27,23 +27,29 @@ typedef u64 kvm_pte_t;
 
 /**
  * struct kvm_pgtable_mm_ops - Memory management callbacks.
- * @zalloc_page:	Allocate a single zeroed memory page. The @arg parameter
- *			can be used by the walker to pass a memcache. The
- *			initial refcount of the page is 1.
- * @zalloc_pages_exact:	Allocate an exact number of zeroed memory pages. The
- *			@size parameter is in bytes, and is rounded-up to the
- *			next page boundary. The resulting allocation is
- *			physically contiguous.
- * @free_pages_exact:	Free an exact number of memory pages previously
- *			allocated by zalloc_pages_exact.
- * @get_page:		Increment the refcount on a page.
- * @put_page:		Decrement the refcount on a page. When the refcount
- *			reaches 0 the page is automatically freed.
- * @page_count:		Return the refcount of a page.
- * @phys_to_virt:	Convert a physical address into a virtual address mapped
- *			in the current context.
- * @virt_to_phys:	Convert a virtual address mapped in the current context
- *			into a physical address.
+ * @zalloc_page:		Allocate a single zeroed memory page.
+ *				The @arg parameter can be used by the walker
+ *				to pass a memcache. The initial refcount of
+ *				the page is 1.
+ * @zalloc_pages_exact:		Allocate an exact number of zeroed memory pages.
+ *				The @size parameter is in bytes, and is rounded
+ *				up to the next page boundary. The resulting
+ *				allocation is physically contiguous.
+ * @free_pages_exact:		Free an exact number of memory pages previously
+ *				allocated by zalloc_pages_exact.
+ * @get_page:			Increment the refcount on a page.
+ * @put_page:			Decrement the refcount on a page. When the
+ *				refcount reaches 0 the page is automatically
+ *				freed.
+ * @page_count:			Return the refcount of a page.
+ * @phys_to_virt:		Convert a physical address into a virtual
+ *				address	mapped in the current context.
+ * @virt_to_phys:		Convert a virtual address mapped in the current
+ *				context into a physical address.
+ * @dcache_clean_inval_poc:	Clean and invalidate the data cache to the PoC
+ *				for the	specified memory address range.
+ * @icache_inval_pou:		Invalidate the instruction cache to the PoU
+ *				for the specified memory address range.
  */
 struct kvm_pgtable_mm_ops {
 	void*		(*zalloc_page)(void *arg);
@@ -54,6 +60,8 @@ struct kvm_pgtable_mm_ops {
 	int		(*page_count)(void *addr);
 	void*		(*phys_to_virt)(phys_addr_t phys);
 	phys_addr_t	(*virt_to_phys)(void *addr);
+	void		(*dcache_clean_inval_poc)(void *addr, size_t size);
+	void		(*icache_inval_pou)(void *addr, size_t size);
 };
 
 /**
diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h
index ba89a9af820a..9906541a6861 100644
--- a/arch/arm64/include/asm/linkage.h
+++ b/arch/arm64/include/asm/linkage.h
@@ -56,8 +56,16 @@
 		SYM_FUNC_START_ALIAS(__pi_##x);	\
 		SYM_FUNC_START_WEAK(x)
 
+#define SYM_FUNC_START_WEAK_ALIAS_PI(x)		\
+		SYM_FUNC_START_ALIAS(__pi_##x);	\
+		SYM_START(x, SYM_L_WEAK, SYM_A_ALIGN)
+
 #define SYM_FUNC_END_PI(x)			\
 		SYM_FUNC_END(x);		\
 		SYM_FUNC_END_ALIAS(__pi_##x)
 
+#define SYM_FUNC_END_ALIAS_PI(x)		\
+		SYM_FUNC_END_ALIAS(x);		\
+		SYM_FUNC_END_ALIAS(__pi_##x)
+
 #endif
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 9027b7e16c4c..824a3655dd93 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -135,10 +135,8 @@
 #define MT_NORMAL		0
 #define MT_NORMAL_TAGGED	1
 #define MT_NORMAL_NC		2
-#define MT_NORMAL_WT		3
-#define MT_DEVICE_nGnRnE	4
-#define MT_DEVICE_nGnRE		5
-#define MT_DEVICE_GRE		6
+#define MT_DEVICE_nGnRnE	3
+#define MT_DEVICE_nGnRE		4
 
 /*
  * Memory types for Stage-2 translation
@@ -323,22 +321,6 @@ static inline void *phys_to_virt(phys_addr_t x)
 #define virt_to_pfn(x)		__phys_to_pfn(__virt_to_phys((unsigned long)(x)))
 #define sym_to_pfn(x)		__phys_to_pfn(__pa_symbol(x))
 
-#ifdef CONFIG_CFI_CLANG
-/*
- * With CONFIG_CFI_CLANG, the compiler replaces function address
- * references with the address of the function's CFI jump table
- * entry. The function_nocfi macro always returns the address of the
- * actual function instead.
- */
-#define function_nocfi(x) ({						\
-	void *addr;							\
-	asm("adrp %0, " __stringify(x) "\n\t"				\
-	    "add  %0, %0, :lo12:" __stringify(x)			\
-	    : "=r" (addr));						\
-	addr;								\
-})
-#endif
-
 /*
  *  virt_to_page(x)	convert a _valid_ virtual address to struct page *
  *  virt_addr_valid(x)	indicates whether a virtual address is valid
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index d3cef9133539..eeb210997149 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -177,9 +177,9 @@ static inline void update_saved_ttbr0(struct task_struct *tsk,
 		return;
 
 	if (mm == &init_mm)
-		ttbr = __pa_symbol(reserved_pg_dir);
+		ttbr = phys_to_ttbr(__pa_symbol(reserved_pg_dir));
 	else
-		ttbr = virt_to_phys(mm->pgd) | ASID(mm) << 48;
+		ttbr = phys_to_ttbr(virt_to_phys(mm->pgd)) | ASID(mm) << 48;
 
 	WRITE_ONCE(task_thread_info(tsk)->ttbr0, ttbr);
 }
diff --git a/arch/arm64/include/asm/module.lds.h b/arch/arm64/include/asm/module.lds.h
index 810045628c66..a11ccadd47d2 100644
--- a/arch/arm64/include/asm/module.lds.h
+++ b/arch/arm64/include/asm/module.lds.h
@@ -1,7 +1,20 @@
-#ifdef CONFIG_ARM64_MODULE_PLTS
 SECTIONS {
+#ifdef CONFIG_ARM64_MODULE_PLTS
 	.plt 0 (NOLOAD) : { BYTE(0) }
 	.init.plt 0 (NOLOAD) : { BYTE(0) }
 	.text.ftrace_trampoline 0 (NOLOAD) : { BYTE(0) }
-}
 #endif
+
+#ifdef CONFIG_KASAN_SW_TAGS
+	/*
+	 * Outlined checks go into comdat-deduplicated sections named .text.hot.
+	 * Because they are in comdats they are not combined by the linker and
+	 * we otherwise end up with multiple sections with the same .text.hot
+	 * name in the .ko file. The kernel module loader warns if it sees
+	 * multiple sections with the same name so we use this sections
+	 * directive to force them into a single section and silence the
+	 * warning.
+	 */
+	.text.hot : { *(.text.hot) }
+#endif
+}
diff --git a/arch/arm64/include/asm/mte-def.h b/arch/arm64/include/asm/mte-def.h
index cf241b0f0a42..626d359b396e 100644
--- a/arch/arm64/include/asm/mte-def.h
+++ b/arch/arm64/include/asm/mte-def.h
@@ -7,6 +7,7 @@
 
 #define MTE_GRANULE_SIZE	UL(16)
 #define MTE_GRANULE_MASK	(~(MTE_GRANULE_SIZE - 1))
+#define MTE_GRANULES_PER_PAGE	(PAGE_SIZE / MTE_GRANULE_SIZE)
 #define MTE_TAG_SHIFT		56
 #define MTE_TAG_SIZE		4
 #define MTE_TAG_MASK		GENMASK((MTE_TAG_SHIFT + (MTE_TAG_SIZE - 1)), MTE_TAG_SHIFT)
diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h
index ddd4d17cf9a0..d952352bd008 100644
--- a/arch/arm64/include/asm/mte-kasan.h
+++ b/arch/arm64/include/asm/mte-kasan.h
@@ -48,43 +48,84 @@ static inline u8 mte_get_random_tag(void)
 	return mte_get_ptr_tag(addr);
 }
 
+static inline u64 __stg_post(u64 p)
+{
+	asm volatile(__MTE_PREAMBLE "stg %0, [%0], #16"
+		     : "+r"(p)
+		     :
+		     : "memory");
+	return p;
+}
+
+static inline u64 __stzg_post(u64 p)
+{
+	asm volatile(__MTE_PREAMBLE "stzg %0, [%0], #16"
+		     : "+r"(p)
+		     :
+		     : "memory");
+	return p;
+}
+
+static inline void __dc_gva(u64 p)
+{
+	asm volatile(__MTE_PREAMBLE "dc gva, %0" : : "r"(p) : "memory");
+}
+
+static inline void __dc_gzva(u64 p)
+{
+	asm volatile(__MTE_PREAMBLE "dc gzva, %0" : : "r"(p) : "memory");
+}
+
 /*
  * Assign allocation tags for a region of memory based on the pointer tag.
  * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and
- * size must be non-zero and MTE_GRANULE_SIZE aligned.
+ * size must be MTE_GRANULE_SIZE aligned.
  */
-static inline void mte_set_mem_tag_range(void *addr, size_t size,
-						u8 tag, bool init)
+static inline void mte_set_mem_tag_range(void *addr, size_t size, u8 tag,
+					 bool init)
 {
-	u64 curr, end;
+	u64 curr, mask, dczid_bs, end1, end2, end3;
 
-	if (!size)
-		return;
+	/* Read DC G(Z)VA block size from the system register. */
+	dczid_bs = 4ul << (read_cpuid(DCZID_EL0) & 0xf);
 
 	curr = (u64)__tag_set(addr, tag);
-	end = curr + size;
+	mask = dczid_bs - 1;
+	/* STG/STZG up to the end of the first block. */
+	end1 = curr | mask;
+	end3 = curr + size;
+	/* DC GVA / GZVA in [end1, end2) */
+	end2 = end3 & ~mask;
 
 	/*
-	 * 'asm volatile' is required to prevent the compiler to move
-	 * the statement outside of the loop.
+	 * The following code uses STG on the first DC GVA block even if the
+	 * start address is aligned - it appears to be faster than an alignment
+	 * check + conditional branch. Also, if the range size is at least 2 DC
+	 * GVA blocks, the first two loops can use post-condition to save one
+	 * branch each.
 	 */
-	if (init) {
-		do {
-			asm volatile(__MTE_PREAMBLE "stzg %0, [%0]"
-				     :
-				     : "r" (curr)
-				     : "memory");
-			curr += MTE_GRANULE_SIZE;
-		} while (curr != end);
-	} else {
-		do {
-			asm volatile(__MTE_PREAMBLE "stg %0, [%0]"
-				     :
-				     : "r" (curr)
-				     : "memory");
-			curr += MTE_GRANULE_SIZE;
-		} while (curr != end);
-	}
+#define SET_MEMTAG_RANGE(stg_post, dc_gva)		\
+	do {						\
+		if (size >= 2 * dczid_bs) {		\
+			do {				\
+				curr = stg_post(curr);	\
+			} while (curr < end1);		\
+							\
+			do {				\
+				dc_gva(curr);		\
+				curr += dczid_bs;	\
+			} while (curr < end2);		\
+		}					\
+							\
+		while (curr < end3)			\
+			curr = stg_post(curr);		\
+	} while (0)
+
+	if (init)
+		SET_MEMTAG_RANGE(__stzg_post, __dc_gzva);
+	else
+		SET_MEMTAG_RANGE(__stg_post, __dc_gva);
+#undef SET_MEMTAG_RANGE
 }
 
 void mte_enable_kernel_sync(void);
diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
index bc88a1ced0d7..58c7f80f5596 100644
--- a/arch/arm64/include/asm/mte.h
+++ b/arch/arm64/include/asm/mte.h
@@ -37,7 +37,8 @@ void mte_free_tag_storage(char *storage);
 /* track which pages have valid allocation tags */
 #define PG_mte_tagged	PG_arch_2
 
-void mte_sync_tags(pte_t *ptep, pte_t pte);
+void mte_zero_clear_page_tags(void *addr);
+void mte_sync_tags(pte_t old_pte, pte_t pte);
 void mte_copy_page_tags(void *kto, const void *kfrom);
 void mte_thread_init_user(void);
 void mte_thread_switch(struct task_struct *next);
@@ -53,7 +54,10 @@ int mte_ptrace_copy_tags(struct task_struct *child, long request,
 /* unused if !CONFIG_ARM64_MTE, silence the compiler */
 #define PG_mte_tagged	0
 
-static inline void mte_sync_tags(pte_t *ptep, pte_t pte)
+static inline void mte_zero_clear_page_tags(void *addr)
+{
+}
+static inline void mte_sync_tags(pte_t old_pte, pte_t pte)
 {
 }
 static inline void mte_copy_page_tags(void *kto, const void *kfrom)
diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
index fcbef3eec4b2..993a27ea6f54 100644
--- a/arch/arm64/include/asm/page.h
+++ b/arch/arm64/include/asm/page.h
@@ -13,6 +13,7 @@
 #ifndef __ASSEMBLY__
 
 #include <linux/personality.h> /* for READ_IMPLIES_EXEC */
+#include <linux/types.h> /* for gfp_t */
 #include <asm/pgtable-types.h>
 
 struct page;
@@ -28,9 +29,12 @@ void copy_user_highpage(struct page *to, struct page *from,
 void copy_highpage(struct page *to, struct page *from);
 #define __HAVE_ARCH_COPY_HIGHPAGE
 
-#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
-	alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
-#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
+						unsigned long vaddr);
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE
+
+void tag_clear_highpage(struct page *to);
+#define __HAVE_ARCH_TAG_CLEAR_HIGHPAGE
 
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
diff --git a/arch/arm64/include/asm/patching.h b/arch/arm64/include/asm/patching.h
new file mode 100644
index 000000000000..6bf5adc56295
--- /dev/null
+++ b/arch/arm64/include/asm/patching.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef	__ASM_PATCHING_H
+#define	__ASM_PATCHING_H
+
+#include <linux/types.h>
+
+int aarch64_insn_read(void *addr, u32 *insnp);
+int aarch64_insn_write(void *addr, u32 insn);
+
+int aarch64_insn_patch_text_nosync(void *addr, u32 insn);
+int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt);
+
+#endif	/* __ASM_PATCHING_H */
diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h
index 60731f602d3e..4ef6f19331f9 100644
--- a/arch/arm64/include/asm/perf_event.h
+++ b/arch/arm64/include/asm/perf_event.h
@@ -239,6 +239,11 @@
 /* PMMIR_EL1.SLOTS mask */
 #define ARMV8_PMU_SLOTS_MASK	0xff
 
+#define ARMV8_PMU_BUS_SLOTS_SHIFT 8
+#define ARMV8_PMU_BUS_SLOTS_MASK 0xff
+#define ARMV8_PMU_BUS_WIDTH_SHIFT 16
+#define ARMV8_PMU_BUS_WIDTH_MASK 0xf
+
 #ifdef CONFIG_PERF_EVENTS
 struct pt_regs;
 extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index b82575a33f8b..40085e53f573 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -72,13 +72,6 @@
 #define PTRS_PER_PGD		(1 << (VA_BITS - PGDIR_SHIFT))
 
 /*
- * Section address mask and size definitions.
- */
-#define SECTION_SHIFT		PMD_SHIFT
-#define SECTION_SIZE		(_AC(1, UL) << SECTION_SHIFT)
-#define SECTION_MASK		(~(SECTION_SIZE-1))
-
-/*
  * Contiguous page definitions.
  */
 #define CONT_PTE_SHIFT		(CONFIG_ARM64_CONT_PTE_SHIFT + PAGE_SHIFT)
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index 938092df76cf..7032f04c8ac6 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -55,7 +55,6 @@ extern bool arm64_use_ng_mappings;
 #define PROT_DEVICE_nGnRnE	(PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRnE))
 #define PROT_DEVICE_nGnRE	(PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRE))
 #define PROT_NORMAL_NC		(PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_NC))
-#define PROT_NORMAL_WT		(PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_WT))
 #define PROT_NORMAL		(PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL))
 #define PROT_NORMAL_TAGGED	(PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_TAGGED))
 
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 25f5c04b43ce..508c7ffad515 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -312,9 +312,25 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
 	if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte))
 		__sync_icache_dcache(pte);
 
-	if (system_supports_mte() &&
-	    pte_present(pte) && pte_tagged(pte) && !pte_special(pte))
-		mte_sync_tags(ptep, pte);
+	/*
+	 * If the PTE would provide user space access to the tags associated
+	 * with it then ensure that the MTE tags are synchronised.  Although
+	 * pte_access_permitted() returns false for exec only mappings, they
+	 * don't expose tags (instruction fetches don't check tags).
+	 */
+	if (system_supports_mte() && pte_access_permitted(pte, false) &&
+	    !pte_special(pte)) {
+		pte_t old_pte = READ_ONCE(*ptep);
+		/*
+		 * We only need to synchronise if the new PTE has tags enabled
+		 * or if swapping in (in which case another mapping may have
+		 * set tags in the past even if this PTE isn't tagged).
+		 * (!pte_none() && !pte_present()) is an open coded version of
+		 * is_swap_pte()
+		 */
+		if (pte_tagged(pte) || (!pte_none(old_pte) && !pte_present(old_pte)))
+			mte_sync_tags(old_pte, pte);
+	}
 
 	__check_racy_pte_update(mm, ptep, pte);
 
@@ -509,13 +525,12 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 
 #define pmd_none(pmd)		(!pmd_val(pmd))
 
-#define pmd_bad(pmd)		(!(pmd_val(pmd) & PMD_TABLE_BIT))
-
 #define pmd_table(pmd)		((pmd_val(pmd) & PMD_TYPE_MASK) == \
 				 PMD_TYPE_TABLE)
 #define pmd_sect(pmd)		((pmd_val(pmd) & PMD_TYPE_MASK) == \
 				 PMD_TYPE_SECT)
 #define pmd_leaf(pmd)		pmd_sect(pmd)
+#define pmd_bad(pmd)		(!pmd_table(pmd))
 
 #define pmd_leaf_size(pmd)	(pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE)
 #define pte_leaf_size(pte)	(pte_cont(pte) ? CONT_PTE_SIZE : PAGE_SIZE)
@@ -602,7 +617,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
 	pr_err("%s:%d: bad pmd %016llx.\n", __FILE__, __LINE__, pmd_val(e))
 
 #define pud_none(pud)		(!pud_val(pud))
-#define pud_bad(pud)		(!(pud_val(pud) & PUD_TABLE_BIT))
+#define pud_bad(pud)		(!pud_table(pud))
 #define pud_present(pud)	pte_present(pud_pte(pud))
 #define pud_leaf(pud)		pud_sect(pud)
 #define pud_valid(pud)		pte_valid(pud_pte(pud))
diff --git a/arch/arm64/include/asm/pointer_auth.h b/arch/arm64/include/asm/pointer_auth.h
index d50416be99be..28a78b67d9b4 100644
--- a/arch/arm64/include/asm/pointer_auth.h
+++ b/arch/arm64/include/asm/pointer_auth.h
@@ -31,10 +31,6 @@ struct ptrauth_keys_user {
 	struct ptrauth_key apga;
 };
 
-struct ptrauth_keys_kernel {
-	struct ptrauth_key apia;
-};
-
 #define __ptrauth_key_install_nosync(k, v)			\
 do {								\
 	struct ptrauth_key __pki_v = (v);			\
@@ -42,6 +38,29 @@ do {								\
 	write_sysreg_s(__pki_v.hi, SYS_ ## k ## KEYHI_EL1);	\
 } while (0)
 
+#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL
+
+struct ptrauth_keys_kernel {
+	struct ptrauth_key apia;
+};
+
+static __always_inline void ptrauth_keys_init_kernel(struct ptrauth_keys_kernel *keys)
+{
+	if (system_supports_address_auth())
+		get_random_bytes(&keys->apia, sizeof(keys->apia));
+}
+
+static __always_inline void ptrauth_keys_switch_kernel(struct ptrauth_keys_kernel *keys)
+{
+	if (!system_supports_address_auth())
+		return;
+
+	__ptrauth_key_install_nosync(APIA, keys->apia);
+	isb();
+}
+
+#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */
+
 static inline void ptrauth_keys_install_user(struct ptrauth_keys_user *keys)
 {
 	if (system_supports_address_auth()) {
@@ -69,21 +88,6 @@ static inline void ptrauth_keys_init_user(struct ptrauth_keys_user *keys)
 	ptrauth_keys_install_user(keys);
 }
 
-static __always_inline void ptrauth_keys_init_kernel(struct ptrauth_keys_kernel *keys)
-{
-	if (system_supports_address_auth())
-		get_random_bytes(&keys->apia, sizeof(keys->apia));
-}
-
-static __always_inline void ptrauth_keys_switch_kernel(struct ptrauth_keys_kernel *keys)
-{
-	if (!system_supports_address_auth())
-		return;
-
-	__ptrauth_key_install_nosync(APIA, keys->apia);
-	isb();
-}
-
 extern int ptrauth_prctl_reset_keys(struct task_struct *tsk, unsigned long arg);
 
 extern int ptrauth_set_enabled_keys(struct task_struct *tsk, unsigned long keys,
@@ -121,11 +125,6 @@ static __always_inline void ptrauth_enable(void)
 #define ptrauth_thread_switch_user(tsk)                                        \
 	ptrauth_keys_install_user(&(tsk)->thread.keys_user)
 
-#define ptrauth_thread_init_kernel(tsk)					\
-	ptrauth_keys_init_kernel(&(tsk)->thread.keys_kernel)
-#define ptrauth_thread_switch_kernel(tsk)				\
-	ptrauth_keys_switch_kernel(&(tsk)->thread.keys_kernel)
-
 #else /* CONFIG_ARM64_PTR_AUTH */
 #define ptrauth_enable()
 #define ptrauth_prctl_reset_keys(tsk, arg)	(-EINVAL)
@@ -134,11 +133,19 @@ static __always_inline void ptrauth_enable(void)
 #define ptrauth_strip_insn_pac(lr)	(lr)
 #define ptrauth_suspend_exit()
 #define ptrauth_thread_init_user()
-#define ptrauth_thread_init_kernel(tsk)
 #define ptrauth_thread_switch_user(tsk)
-#define ptrauth_thread_switch_kernel(tsk)
 #endif /* CONFIG_ARM64_PTR_AUTH */
 
+#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL
+#define ptrauth_thread_init_kernel(tsk)					\
+	ptrauth_keys_init_kernel(&(tsk)->thread.keys_kernel)
+#define ptrauth_thread_switch_kernel(tsk)				\
+	ptrauth_keys_switch_kernel(&(tsk)->thread.keys_kernel)
+#else
+#define ptrauth_thread_init_kernel(tsk)
+#define ptrauth_thread_switch_kernel(tsk)
+#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */
+
 #define PR_PAC_ENABLED_KEYS_MASK                                               \
 	(PR_PAC_APIAKEY | PR_PAC_APIBKEY | PR_PAC_APDAKEY | PR_PAC_APDBKEY)
 
diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h
index 80e946b2abee..e83f0982b99c 100644
--- a/arch/arm64/include/asm/preempt.h
+++ b/arch/arm64/include/asm/preempt.h
@@ -23,7 +23,7 @@ static inline void preempt_count_set(u64 pc)
 } while (0)
 
 #define init_idle_preempt_count(p, cpu) do { \
-	task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \
+	task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \
 } while (0)
 
 static inline void set_preempt_need_resched(void)
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 9df3feeee890..b6517fd03d7b 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -148,8 +148,10 @@ struct thread_struct {
 	struct debug_info	debug;		/* debugging */
 #ifdef CONFIG_ARM64_PTR_AUTH
 	struct ptrauth_keys_user	keys_user;
+#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL
 	struct ptrauth_keys_kernel	keys_kernel;
 #endif
+#endif
 #ifdef CONFIG_ARM64_MTE
 	u64			gcr_user_excl;
 #endif
@@ -257,8 +259,6 @@ void set_task_sctlr_el1(u64 sctlr);
 extern struct task_struct *cpu_switch_to(struct task_struct *prev,
 					 struct task_struct *next);
 
-asmlinkage void arm64_preempt_schedule_irq(void);
-
 #define task_pt_regs(p) \
 	((struct pt_regs *)(THREAD_SIZE + task_stack_page(p)) - 1)
 
@@ -329,13 +329,13 @@ long get_tagged_addr_ctrl(struct task_struct *task);
  * of header definitions for the use of task_stack_page.
  */
 
-#define current_top_of_stack()							\
-({										\
-	struct stack_info _info;						\
-	BUG_ON(!on_accessible_stack(current, current_stack_pointer, &_info));	\
-	_info.high;								\
+#define current_top_of_stack()								\
+({											\
+	struct stack_info _info;							\
+	BUG_ON(!on_accessible_stack(current, current_stack_pointer, 1, &_info));	\
+	_info.high;									\
 })
-#define on_thread_stack()	(on_task_stack(current, current_stack_pointer, NULL))
+#define on_thread_stack()	(on_task_stack(current, current_stack_pointer, 1, NULL))
 
 #endif /* __ASSEMBLY__ */
 #endif /* __ASM_PROCESSOR_H */
diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h
index eaa2cd92e4c1..8297bccf0784 100644
--- a/arch/arm64/include/asm/scs.h
+++ b/arch/arm64/include/asm/scs.h
@@ -9,18 +9,18 @@
 #ifdef CONFIG_SHADOW_CALL_STACK
 	scs_sp	.req	x18
 
-	.macro scs_load tsk, tmp
+	.macro scs_load tsk
 	ldr	scs_sp, [\tsk, #TSK_TI_SCS_SP]
 	.endm
 
-	.macro scs_save tsk, tmp
+	.macro scs_save tsk
 	str	scs_sp, [\tsk, #TSK_TI_SCS_SP]
 	.endm
 #else
-	.macro scs_load tsk, tmp
+	.macro scs_load tsk
 	.endm
 
-	.macro scs_save tsk, tmp
+	.macro scs_save tsk
 	.endm
 #endif /* CONFIG_SHADOW_CALL_STACK */
 
diff --git a/arch/arm64/include/asm/sdei.h b/arch/arm64/include/asm/sdei.h
index 63e0b92a5fbb..7bea1d705dd6 100644
--- a/arch/arm64/include/asm/sdei.h
+++ b/arch/arm64/include/asm/sdei.h
@@ -37,13 +37,17 @@ struct sdei_registered_event;
 asmlinkage unsigned long __sdei_handler(struct pt_regs *regs,
 					struct sdei_registered_event *arg);
 
+unsigned long do_sdei_event(struct pt_regs *regs,
+			    struct sdei_registered_event *arg);
+
 unsigned long sdei_arch_get_entry_point(int conduit);
 #define sdei_arch_get_entry_point(x)	sdei_arch_get_entry_point(x)
 
 struct stack_info;
 
-bool _on_sdei_stack(unsigned long sp, struct stack_info *info);
-static inline bool on_sdei_stack(unsigned long sp,
+bool _on_sdei_stack(unsigned long sp, unsigned long size,
+		    struct stack_info *info);
+static inline bool on_sdei_stack(unsigned long sp, unsigned long size,
 				struct stack_info *info)
 {
 	if (!IS_ENABLED(CONFIG_VMAP_STACK))
@@ -51,7 +55,7 @@ static inline bool on_sdei_stack(unsigned long sp,
 	if (!IS_ENABLED(CONFIG_ARM_SDE_INTERFACE))
 		return false;
 	if (in_nmi())
-		return _on_sdei_stack(sp, info);
+		return _on_sdei_stack(sp, size, info);
 
 	return false;
 }
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index 0e357757c0cc..fc55f5a57a06 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -73,12 +73,10 @@ asmlinkage void secondary_start_kernel(void);
 
 /*
  * Initial data for bringing up a secondary CPU.
- * @stack  - sp for the secondary CPU
  * @status - Result passed back from the secondary CPU to
  *           indicate failure.
  */
 struct secondary_data {
-	void *stack;
 	struct task_struct *task;
 	long status;
 };
diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h
index 4b33ca620679..1801399204d7 100644
--- a/arch/arm64/include/asm/stacktrace.h
+++ b/arch/arm64/include/asm/stacktrace.h
@@ -69,14 +69,14 @@ extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk,
 
 DECLARE_PER_CPU(unsigned long *, irq_stack_ptr);
 
-static inline bool on_stack(unsigned long sp, unsigned long low,
-				unsigned long high, enum stack_type type,
-				struct stack_info *info)
+static inline bool on_stack(unsigned long sp, unsigned long size,
+			    unsigned long low, unsigned long high,
+			    enum stack_type type, struct stack_info *info)
 {
 	if (!low)
 		return false;
 
-	if (sp < low || sp >= high)
+	if (sp < low || sp + size < sp || sp + size > high)
 		return false;
 
 	if (info) {
@@ -87,38 +87,38 @@ static inline bool on_stack(unsigned long sp, unsigned long low,
 	return true;
 }
 
-static inline bool on_irq_stack(unsigned long sp,
+static inline bool on_irq_stack(unsigned long sp, unsigned long size,
 				struct stack_info *info)
 {
 	unsigned long low = (unsigned long)raw_cpu_read(irq_stack_ptr);
 	unsigned long high = low + IRQ_STACK_SIZE;
 
-	return on_stack(sp, low, high, STACK_TYPE_IRQ, info);
+	return on_stack(sp, size, low, high, STACK_TYPE_IRQ, info);
 }
 
 static inline bool on_task_stack(const struct task_struct *tsk,
-				 unsigned long sp,
+				 unsigned long sp, unsigned long size,
 				 struct stack_info *info)
 {
 	unsigned long low = (unsigned long)task_stack_page(tsk);
 	unsigned long high = low + THREAD_SIZE;
 
-	return on_stack(sp, low, high, STACK_TYPE_TASK, info);
+	return on_stack(sp, size, low, high, STACK_TYPE_TASK, info);
 }
 
 #ifdef CONFIG_VMAP_STACK
 DECLARE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack);
 
-static inline bool on_overflow_stack(unsigned long sp,
+static inline bool on_overflow_stack(unsigned long sp, unsigned long size,
 				struct stack_info *info)
 {
 	unsigned long low = (unsigned long)raw_cpu_ptr(overflow_stack);
 	unsigned long high = low + OVERFLOW_STACK_SIZE;
 
-	return on_stack(sp, low, high, STACK_TYPE_OVERFLOW, info);
+	return on_stack(sp, size, low, high, STACK_TYPE_OVERFLOW, info);
 }
 #else
-static inline bool on_overflow_stack(unsigned long sp,
+static inline bool on_overflow_stack(unsigned long sp, unsigned long size,
 			struct stack_info *info) { return false; }
 #endif
 
@@ -128,21 +128,21 @@ static inline bool on_overflow_stack(unsigned long sp,
  * context.
  */
 static inline bool on_accessible_stack(const struct task_struct *tsk,
-				       unsigned long sp,
+				       unsigned long sp, unsigned long size,
 				       struct stack_info *info)
 {
 	if (info)
 		info->type = STACK_TYPE_UNKNOWN;
 
-	if (on_task_stack(tsk, sp, info))
+	if (on_task_stack(tsk, sp, size, info))
 		return true;
 	if (tsk != current || preemptible())
 		return false;
-	if (on_irq_stack(sp, info))
+	if (on_irq_stack(sp, size, info))
 		return true;
-	if (on_overflow_stack(sp, info))
+	if (on_overflow_stack(sp, size, info))
 		return true;
-	if (on_sdei_stack(sp, info))
+	if (on_sdei_stack(sp, size, info))
 		return true;
 
 	return false;
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 65d15700a168..7b9c3acba684 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -651,7 +651,8 @@
 
 #define INIT_SCTLR_EL2_MMU_ON						\
 	(SCTLR_ELx_M  | SCTLR_ELx_C | SCTLR_ELx_SA | SCTLR_ELx_I |	\
-	 SCTLR_ELx_IESB | SCTLR_ELx_WXN | ENDIAN_SET_EL2 | SCTLR_EL2_RES1)
+	 SCTLR_ELx_IESB | SCTLR_ELx_WXN | ENDIAN_SET_EL2 |		\
+	 SCTLR_ELx_ITFSB | SCTLR_EL2_RES1)
 
 #define INIT_SCTLR_EL2_MMU_OFF \
 	(SCTLR_EL2_RES1 | ENDIAN_SET_EL2)
@@ -703,9 +704,7 @@
 /* MAIR_ELx memory attributes (used by Linux) */
 #define MAIR_ATTR_DEVICE_nGnRnE		UL(0x00)
 #define MAIR_ATTR_DEVICE_nGnRE		UL(0x04)
-#define MAIR_ATTR_DEVICE_GRE		UL(0x0c)
 #define MAIR_ATTR_NORMAL_NC		UL(0x44)
-#define MAIR_ATTR_NORMAL_WT		UL(0xbb)
 #define MAIR_ATTR_NORMAL_TAGGED		UL(0xf0)
 #define MAIR_ATTR_NORMAL		UL(0xff)
 #define MAIR_ATTR_MASK			UL(0xff)
diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index 61c97d3b58c7..c995d1f4594f 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -28,6 +28,10 @@ static void tlb_flush(struct mmu_gather *tlb);
  */
 static inline int tlb_get_level(struct mmu_gather *tlb)
 {
+	/* The TTL field is only valid for the leaf entry. */
+	if (tlb->freed_tables)
+		return 0;
+
 	if (tlb->cleared_ptes && !(tlb->cleared_pmds ||
 				   tlb->cleared_puds ||
 				   tlb->cleared_p4ds))
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 5dab69d2c22b..99ffcafc736c 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -893,7 +893,8 @@ __SYSCALL(__NR_process_madvise, sys_process_madvise)
 __SYSCALL(__NR_epoll_pwait2, compat_sys_epoll_pwait2)
 #define __NR_mount_setattr 442
 __SYSCALL(__NR_mount_setattr, sys_mount_setattr)
-/* 443 is reserved for quotactl_path */
+#define __NR_quotactl_fd 443
+__SYSCALL(__NR_quotactl_fd, sys_quotactl_fd)
 #define __NR_landlock_create_ruleset 444
 __SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset)
 #define __NR_landlock_add_rule 445
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 24223adae150..b3edde68bc3e 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -184,6 +184,17 @@ struct kvm_vcpu_events {
 	__u32 reserved[12];
 };
 
+struct kvm_arm_copy_mte_tags {
+	__u64 guest_ipa;
+	__u64 length;
+	void __user *addr;
+	__u64 flags;
+	__u64 reserved[2];
+};
+
+#define KVM_ARM_TAGS_TO_GUEST		0
+#define KVM_ARM_TAGS_FROM_GUEST		1
+
 /* If you need to interpret the index values, here is the key: */
 #define KVM_REG_ARM_COPROC_MASK		0x000000000FFF0000
 #define KVM_REG_ARM_COPROC_SHIFT	16
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 6cc97730790e..cce308586fcc 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -14,15 +14,22 @@ CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_syscall.o	 = -fstack-protector -fstack-protector-strong
 CFLAGS_syscall.o	+= -fno-stack-protector
 
+# It's not safe to invoke KCOV when portions of the kernel environment aren't
+# available or are out-of-sync with HW state. Since `noinstr` doesn't always
+# inhibit KCOV instrumentation, disable it for the entire compilation unit.
+KCOV_INSTRUMENT_entry.o := n
+KCOV_INSTRUMENT_idle.o := n
+
 # Object file lists.
 obj-y			:= debug-monitors.o entry.o irq.o fpsimd.o		\
 			   entry-common.o entry-fpsimd.o process.o ptrace.o	\
 			   setup.o signal.o sys.o stacktrace.o time.o traps.o	\
-			   io.o vdso.o hyp-stub.o psci.o cpu_ops.o insn.o	\
+			   io.o vdso.o hyp-stub.o psci.o cpu_ops.o		\
 			   return_address.o cpuinfo.o cpu_errata.o		\
 			   cpufeature.o alternative.o cacheinfo.o		\
 			   smp.o smp_spin_table.o topology.o smccc-call.o	\
-			   syscall.o proton-pack.o idreg-override.o
+			   syscall.o proton-pack.o idreg-override.o idle.o	\
+			   patching.o
 
 targets			+= efi-entry.o
 
diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index cada0b816c8a..f3851724fe35 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -239,6 +239,18 @@ done:
 	}
 }
 
+static pgprot_t __acpi_get_writethrough_mem_attribute(void)
+{
+	/*
+	 * Although UEFI specifies the use of Normal Write-through for
+	 * EFI_MEMORY_WT, it is seldom used in practice and not implemented
+	 * by most (all?) CPUs. Rather than allocate a MAIR just for this
+	 * purpose, emit a warning and use Normal Non-cacheable instead.
+	 */
+	pr_warn_once("No MAIR allocation for EFI_MEMORY_WT; treating as Normal Non-cacheable\n");
+	return __pgprot(PROT_NORMAL_NC);
+}
+
 pgprot_t __acpi_get_mem_attribute(phys_addr_t addr)
 {
 	/*
@@ -246,7 +258,7 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr)
 	 * types" of UEFI 2.5 section 2.3.6.1, each EFI memory type is
 	 * mapped to a corresponding MAIR attribute encoding.
 	 * The EFI memory attribute advises all possible capabilities
-	 * of a memory region. We use the most efficient capability.
+	 * of a memory region.
 	 */
 
 	u64 attr;
@@ -254,10 +266,10 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr)
 	attr = efi_mem_attributes(addr);
 	if (attr & EFI_MEMORY_WB)
 		return PAGE_KERNEL;
-	if (attr & EFI_MEMORY_WT)
-		return __pgprot(PROT_NORMAL_WT);
 	if (attr & EFI_MEMORY_WC)
 		return __pgprot(PROT_NORMAL_NC);
+	if (attr & EFI_MEMORY_WT)
+		return __acpi_get_writethrough_mem_attribute();
 	return __pgprot(PROT_DEVICE_nGnRnE);
 }
 
@@ -340,10 +352,10 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size)
 		default:
 			if (region->attribute & EFI_MEMORY_WB)
 				prot = PAGE_KERNEL;
-			else if (region->attribute & EFI_MEMORY_WT)
-				prot = __pgprot(PROT_NORMAL_WT);
 			else if (region->attribute & EFI_MEMORY_WC)
 				prot = __pgprot(PROT_NORMAL_NC);
+			else if (region->attribute & EFI_MEMORY_WT)
+				prot = __acpi_get_writethrough_mem_attribute();
 		}
 	}
 	return __ioremap(phys, size, prot);
diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c
index c906d20c7b52..3fb79b76e9d9 100644
--- a/arch/arm64/kernel/alternative.c
+++ b/arch/arm64/kernel/alternative.c
@@ -181,7 +181,7 @@ static void __nocfi __apply_alternatives(struct alt_region *region, bool is_modu
 	 */
 	if (!is_module) {
 		dsb(ish);
-		__flush_icache_all();
+		icache_inval_all_pou();
 		isb();
 
 		/* Ignore ARM64_CB bit from feature mask */
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 0cb34ccb6e73..c85670692afa 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -27,6 +27,7 @@
 int main(void)
 {
   DEFINE(TSK_ACTIVE_MM,		offsetof(struct task_struct, active_mm));
+  DEFINE(TSK_CPU,		offsetof(struct task_struct, cpu));
   BLANK();
   DEFINE(TSK_TI_FLAGS,		offsetof(struct task_struct, thread_info.flags));
   DEFINE(TSK_TI_PREEMPT,	offsetof(struct task_struct, thread_info.preempt_count));
@@ -46,6 +47,8 @@ int main(void)
   DEFINE(THREAD_SCTLR_USER,	offsetof(struct task_struct, thread.sctlr_user));
 #ifdef CONFIG_ARM64_PTR_AUTH
   DEFINE(THREAD_KEYS_USER,	offsetof(struct task_struct, thread.keys_user));
+#endif
+#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL
   DEFINE(THREAD_KEYS_KERNEL,	offsetof(struct task_struct, thread.keys_kernel));
 #endif
 #ifdef CONFIG_ARM64_MTE
@@ -99,7 +102,6 @@ int main(void)
   DEFINE(SOFTIRQ_SHIFT, SOFTIRQ_SHIFT);
   DEFINE(IRQ_CPUSTAT_SOFTIRQ_PENDING, offsetof(irq_cpustat_t, __softirq_pending));
   BLANK();
-  DEFINE(CPU_BOOT_STACK,	offsetof(struct secondary_data, stack));
   DEFINE(CPU_BOOT_TASK,		offsetof(struct secondary_data, task));
   BLANK();
   DEFINE(FTR_OVR_VAL_OFFSET,	offsetof(struct arm64_ftr_override, val));
@@ -111,6 +113,8 @@ int main(void)
   DEFINE(VCPU_WORKAROUND_FLAGS,	offsetof(struct kvm_vcpu, arch.workaround_flags));
   DEFINE(VCPU_HCR_EL2,		offsetof(struct kvm_vcpu, arch.hcr_el2));
   DEFINE(CPU_USER_PT_REGS,	offsetof(struct kvm_cpu_context, regs));
+  DEFINE(CPU_RGSR_EL1,		offsetof(struct kvm_cpu_context, sys_regs[RGSR_EL1]));
+  DEFINE(CPU_GCR_EL1,		offsetof(struct kvm_cpu_context, sys_regs[GCR_EL1]));
   DEFINE(CPU_APIAKEYLO_EL1,	offsetof(struct kvm_cpu_context, sys_regs[APIAKEYLO_EL1]));
   DEFINE(CPU_APIBKEYLO_EL1,	offsetof(struct kvm_cpu_context, sys_regs[APIBKEYLO_EL1]));
   DEFINE(CPU_APDAKEYLO_EL1,	offsetof(struct kvm_cpu_context, sys_regs[APDAKEYLO_EL1]));
@@ -138,6 +142,15 @@ int main(void)
   DEFINE(ARM_SMCCC_RES_X2_OFFS,		offsetof(struct arm_smccc_res, a2));
   DEFINE(ARM_SMCCC_QUIRK_ID_OFFS,	offsetof(struct arm_smccc_quirk, id));
   DEFINE(ARM_SMCCC_QUIRK_STATE_OFFS,	offsetof(struct arm_smccc_quirk, state));
+  DEFINE(ARM_SMCCC_1_2_REGS_X0_OFFS,	offsetof(struct arm_smccc_1_2_regs, a0));
+  DEFINE(ARM_SMCCC_1_2_REGS_X2_OFFS,	offsetof(struct arm_smccc_1_2_regs, a2));
+  DEFINE(ARM_SMCCC_1_2_REGS_X4_OFFS,	offsetof(struct arm_smccc_1_2_regs, a4));
+  DEFINE(ARM_SMCCC_1_2_REGS_X6_OFFS,	offsetof(struct arm_smccc_1_2_regs, a6));
+  DEFINE(ARM_SMCCC_1_2_REGS_X8_OFFS,	offsetof(struct arm_smccc_1_2_regs, a8));
+  DEFINE(ARM_SMCCC_1_2_REGS_X10_OFFS,	offsetof(struct arm_smccc_1_2_regs, a10));
+  DEFINE(ARM_SMCCC_1_2_REGS_X12_OFFS,	offsetof(struct arm_smccc_1_2_regs, a12));
+  DEFINE(ARM_SMCCC_1_2_REGS_X14_OFFS,	offsetof(struct arm_smccc_1_2_regs, a14));
+  DEFINE(ARM_SMCCC_1_2_REGS_X16_OFFS,	offsetof(struct arm_smccc_1_2_regs, a16));
   BLANK();
   DEFINE(HIBERN_PBE_ORIG,	offsetof(struct pbe, orig_address));
   DEFINE(HIBERN_PBE_ADDR,	offsetof(struct pbe, address));
@@ -153,7 +166,9 @@ int main(void)
 #endif
 #ifdef CONFIG_ARM64_PTR_AUTH
   DEFINE(PTRAUTH_USER_KEY_APIA,		offsetof(struct ptrauth_keys_user, apia));
+#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL
   DEFINE(PTRAUTH_KERNEL_KEY_APIA,	offsetof(struct ptrauth_keys_kernel, apia));
+#endif
   BLANK();
 #endif
   return 0;
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index efed2830d141..125d5c9471ac 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -76,6 +76,7 @@
 #include <asm/cpufeature.h>
 #include <asm/cpu_ops.h>
 #include <asm/fpsimd.h>
+#include <asm/insn.h>
 #include <asm/kvm_host.h>
 #include <asm/mmu_context.h>
 #include <asm/mte.h>
@@ -108,6 +109,24 @@ bool arm64_use_ng_mappings = false;
 EXPORT_SYMBOL(arm64_use_ng_mappings);
 
 /*
+ * Permit PER_LINUX32 and execve() of 32-bit binaries even if not all CPUs
+ * support it?
+ */
+static bool __read_mostly allow_mismatched_32bit_el0;
+
+/*
+ * Static branch enabled only if allow_mismatched_32bit_el0 is set and we have
+ * seen at least one CPU capable of 32-bit EL0.
+ */
+DEFINE_STATIC_KEY_FALSE(arm64_mismatched_32bit_el0);
+
+/*
+ * Mask of CPUs supporting 32-bit EL0.
+ * Only valid if arm64_mismatched_32bit_el0 is enabled.
+ */
+static cpumask_var_t cpu_32bit_el0_mask __cpumask_var_read_mostly;
+
+/*
  * Flag to indicate if we have computed the system wide
  * capabilities based on the boot time active CPUs. This
  * will be used to determine if a new booting CPU should
@@ -400,6 +419,11 @@ static const struct arm64_ftr_bits ftr_dczid[] = {
 	ARM64_FTR_END,
 };
 
+static const struct arm64_ftr_bits ftr_gmid[] = {
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, SYS_GMID_EL1_BS_SHIFT, 4, 0),
+	ARM64_FTR_END,
+};
+
 static const struct arm64_ftr_bits ftr_id_isar0[] = {
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_DIVIDE_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_DEBUG_SHIFT, 4, 0),
@@ -617,6 +641,9 @@ static const struct __ftr_reg_entry {
 	/* Op1 = 0, CRn = 1, CRm = 2 */
 	ARM64_FTR_REG(SYS_ZCR_EL1, ftr_zcr),
 
+	/* Op1 = 1, CRn = 0, CRm = 0 */
+	ARM64_FTR_REG(SYS_GMID_EL1, ftr_gmid),
+
 	/* Op1 = 3, CRn = 0, CRm = 0 */
 	{ SYS_CTR_EL0, &arm64_ftr_reg_ctrel0 },
 	ARM64_FTR_REG(SYS_DCZID_EL0, ftr_dczid),
@@ -767,7 +794,7 @@ static void __init sort_ftr_regs(void)
  * Any bits that are not covered by an arm64_ftr_bits entry are considered
  * RES0 for the system-wide value, and must strictly match.
  */
-static void __init init_cpu_ftr_reg(u32 sys_reg, u64 new)
+static void init_cpu_ftr_reg(u32 sys_reg, u64 new)
 {
 	u64 val = 0;
 	u64 strict_mask = ~0x0ULL;
@@ -863,6 +890,31 @@ static void __init init_cpu_hwcaps_indirect_list(void)
 
 static void __init setup_boot_cpu_capabilities(void);
 
+static void init_32bit_cpu_features(struct cpuinfo_32bit *info)
+{
+	init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0);
+	init_cpu_ftr_reg(SYS_ID_DFR1_EL1, info->reg_id_dfr1);
+	init_cpu_ftr_reg(SYS_ID_ISAR0_EL1, info->reg_id_isar0);
+	init_cpu_ftr_reg(SYS_ID_ISAR1_EL1, info->reg_id_isar1);
+	init_cpu_ftr_reg(SYS_ID_ISAR2_EL1, info->reg_id_isar2);
+	init_cpu_ftr_reg(SYS_ID_ISAR3_EL1, info->reg_id_isar3);
+	init_cpu_ftr_reg(SYS_ID_ISAR4_EL1, info->reg_id_isar4);
+	init_cpu_ftr_reg(SYS_ID_ISAR5_EL1, info->reg_id_isar5);
+	init_cpu_ftr_reg(SYS_ID_ISAR6_EL1, info->reg_id_isar6);
+	init_cpu_ftr_reg(SYS_ID_MMFR0_EL1, info->reg_id_mmfr0);
+	init_cpu_ftr_reg(SYS_ID_MMFR1_EL1, info->reg_id_mmfr1);
+	init_cpu_ftr_reg(SYS_ID_MMFR2_EL1, info->reg_id_mmfr2);
+	init_cpu_ftr_reg(SYS_ID_MMFR3_EL1, info->reg_id_mmfr3);
+	init_cpu_ftr_reg(SYS_ID_MMFR4_EL1, info->reg_id_mmfr4);
+	init_cpu_ftr_reg(SYS_ID_MMFR5_EL1, info->reg_id_mmfr5);
+	init_cpu_ftr_reg(SYS_ID_PFR0_EL1, info->reg_id_pfr0);
+	init_cpu_ftr_reg(SYS_ID_PFR1_EL1, info->reg_id_pfr1);
+	init_cpu_ftr_reg(SYS_ID_PFR2_EL1, info->reg_id_pfr2);
+	init_cpu_ftr_reg(SYS_MVFR0_EL1, info->reg_mvfr0);
+	init_cpu_ftr_reg(SYS_MVFR1_EL1, info->reg_mvfr1);
+	init_cpu_ftr_reg(SYS_MVFR2_EL1, info->reg_mvfr2);
+}
+
 void __init init_cpu_features(struct cpuinfo_arm64 *info)
 {
 	/* Before we start using the tables, make sure it is sorted */
@@ -882,35 +934,17 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info)
 	init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1);
 	init_cpu_ftr_reg(SYS_ID_AA64ZFR0_EL1, info->reg_id_aa64zfr0);
 
-	if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) {
-		init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0);
-		init_cpu_ftr_reg(SYS_ID_DFR1_EL1, info->reg_id_dfr1);
-		init_cpu_ftr_reg(SYS_ID_ISAR0_EL1, info->reg_id_isar0);
-		init_cpu_ftr_reg(SYS_ID_ISAR1_EL1, info->reg_id_isar1);
-		init_cpu_ftr_reg(SYS_ID_ISAR2_EL1, info->reg_id_isar2);
-		init_cpu_ftr_reg(SYS_ID_ISAR3_EL1, info->reg_id_isar3);
-		init_cpu_ftr_reg(SYS_ID_ISAR4_EL1, info->reg_id_isar4);
-		init_cpu_ftr_reg(SYS_ID_ISAR5_EL1, info->reg_id_isar5);
-		init_cpu_ftr_reg(SYS_ID_ISAR6_EL1, info->reg_id_isar6);
-		init_cpu_ftr_reg(SYS_ID_MMFR0_EL1, info->reg_id_mmfr0);
-		init_cpu_ftr_reg(SYS_ID_MMFR1_EL1, info->reg_id_mmfr1);
-		init_cpu_ftr_reg(SYS_ID_MMFR2_EL1, info->reg_id_mmfr2);
-		init_cpu_ftr_reg(SYS_ID_MMFR3_EL1, info->reg_id_mmfr3);
-		init_cpu_ftr_reg(SYS_ID_MMFR4_EL1, info->reg_id_mmfr4);
-		init_cpu_ftr_reg(SYS_ID_MMFR5_EL1, info->reg_id_mmfr5);
-		init_cpu_ftr_reg(SYS_ID_PFR0_EL1, info->reg_id_pfr0);
-		init_cpu_ftr_reg(SYS_ID_PFR1_EL1, info->reg_id_pfr1);
-		init_cpu_ftr_reg(SYS_ID_PFR2_EL1, info->reg_id_pfr2);
-		init_cpu_ftr_reg(SYS_MVFR0_EL1, info->reg_mvfr0);
-		init_cpu_ftr_reg(SYS_MVFR1_EL1, info->reg_mvfr1);
-		init_cpu_ftr_reg(SYS_MVFR2_EL1, info->reg_mvfr2);
-	}
+	if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0))
+		init_32bit_cpu_features(&info->aarch32);
 
 	if (id_aa64pfr0_sve(info->reg_id_aa64pfr0)) {
 		init_cpu_ftr_reg(SYS_ZCR_EL1, info->reg_zcr);
 		sve_init_vq_map();
 	}
 
+	if (id_aa64pfr1_mte(info->reg_id_aa64pfr1))
+		init_cpu_ftr_reg(SYS_GMID_EL1, info->reg_gmid);
+
 	/*
 	 * Initialize the indirect array of CPU hwcaps capabilities pointers
 	 * before we handle the boot CPU below.
@@ -975,21 +1009,29 @@ static void relax_cpu_ftr_reg(u32 sys_id, int field)
 	WARN_ON(!ftrp->width);
 }
 
-static int update_32bit_cpu_features(int cpu, struct cpuinfo_arm64 *info,
-				     struct cpuinfo_arm64 *boot)
+static void lazy_init_32bit_cpu_features(struct cpuinfo_arm64 *info,
+					 struct cpuinfo_arm64 *boot)
+{
+	static bool boot_cpu_32bit_regs_overridden = false;
+
+	if (!allow_mismatched_32bit_el0 || boot_cpu_32bit_regs_overridden)
+		return;
+
+	if (id_aa64pfr0_32bit_el0(boot->reg_id_aa64pfr0))
+		return;
+
+	boot->aarch32 = info->aarch32;
+	init_32bit_cpu_features(&boot->aarch32);
+	boot_cpu_32bit_regs_overridden = true;
+}
+
+static int update_32bit_cpu_features(int cpu, struct cpuinfo_32bit *info,
+				     struct cpuinfo_32bit *boot)
 {
 	int taint = 0;
 	u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
 
 	/*
-	 * If we don't have AArch32 at all then skip the checks entirely
-	 * as the register values may be UNKNOWN and we're not going to be
-	 * using them for anything.
-	 */
-	if (!id_aa64pfr0_32bit_el0(pfr0))
-		return taint;
-
-	/*
 	 * If we don't have AArch32 at EL1, then relax the strictness of
 	 * EL1-dependent register fields to avoid spurious sanity check fails.
 	 */
@@ -1135,10 +1177,29 @@ void update_cpu_features(int cpu,
 	}
 
 	/*
+	 * The kernel uses the LDGM/STGM instructions and the number of tags
+	 * they read/write depends on the GMID_EL1.BS field. Check that the
+	 * value is the same on all CPUs.
+	 */
+	if (IS_ENABLED(CONFIG_ARM64_MTE) &&
+	    id_aa64pfr1_mte(info->reg_id_aa64pfr1)) {
+		taint |= check_update_ftr_reg(SYS_GMID_EL1, cpu,
+					      info->reg_gmid, boot->reg_gmid);
+	}
+
+	/*
+	 * If we don't have AArch32 at all then skip the checks entirely
+	 * as the register values may be UNKNOWN and we're not going to be
+	 * using them for anything.
+	 *
 	 * This relies on a sanitised view of the AArch64 ID registers
 	 * (e.g. SYS_ID_AA64PFR0_EL1), so we call it last.
 	 */
-	taint |= update_32bit_cpu_features(cpu, info, boot);
+	if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) {
+		lazy_init_32bit_cpu_features(info, boot);
+		taint |= update_32bit_cpu_features(cpu, &info->aarch32,
+						   &boot->aarch32);
+	}
 
 	/*
 	 * Mismatched CPU features are a recipe for disaster. Don't even
@@ -1248,6 +1309,28 @@ has_cpuid_feature(const struct arm64_cpu_capabilities *entry, int scope)
 	return feature_matches(val, entry);
 }
 
+const struct cpumask *system_32bit_el0_cpumask(void)
+{
+	if (!system_supports_32bit_el0())
+		return cpu_none_mask;
+
+	if (static_branch_unlikely(&arm64_mismatched_32bit_el0))
+		return cpu_32bit_el0_mask;
+
+	return cpu_possible_mask;
+}
+
+static bool has_32bit_el0(const struct arm64_cpu_capabilities *entry, int scope)
+{
+	if (!has_cpuid_feature(entry, scope))
+		return allow_mismatched_32bit_el0;
+
+	if (scope == SCOPE_SYSTEM)
+		pr_info("detected: 32-bit EL0 Support\n");
+
+	return true;
+}
+
 static bool has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry, int scope)
 {
 	bool has_sre;
@@ -1866,10 +1949,9 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.cpu_enable = cpu_copy_el2regs,
 	},
 	{
-		.desc = "32-bit EL0 Support",
-		.capability = ARM64_HAS_32BIT_EL0,
+		.capability = ARM64_HAS_32BIT_EL0_DO_NOT_USE,
 		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
-		.matches = has_cpuid_feature,
+		.matches = has_32bit_el0,
 		.sys_reg = SYS_ID_AA64PFR0_EL1,
 		.sign = FTR_UNSIGNED,
 		.field_pos = ID_AA64PFR0_EL0_SHIFT,
@@ -2378,7 +2460,7 @@ static const struct arm64_cpu_capabilities compat_elf_hwcaps[] = {
 	{},
 };
 
-static void __init cap_set_elf_hwcap(const struct arm64_cpu_capabilities *cap)
+static void cap_set_elf_hwcap(const struct arm64_cpu_capabilities *cap)
 {
 	switch (cap->hwcap_type) {
 	case CAP_HWCAP:
@@ -2423,7 +2505,7 @@ static bool cpus_have_elf_hwcap(const struct arm64_cpu_capabilities *cap)
 	return rc;
 }
 
-static void __init setup_elf_hwcaps(const struct arm64_cpu_capabilities *hwcaps)
+static void setup_elf_hwcaps(const struct arm64_cpu_capabilities *hwcaps)
 {
 	/* We support emulation of accesses to CPU ID feature registers */
 	cpu_set_named_feature(CPUID);
@@ -2598,7 +2680,7 @@ static void check_early_cpu_features(void)
 }
 
 static void
-verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps)
+__verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps)
 {
 
 	for (; caps->matches; caps++)
@@ -2609,6 +2691,14 @@ verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps)
 		}
 }
 
+static void verify_local_elf_hwcaps(void)
+{
+	__verify_local_elf_hwcaps(arm64_elf_hwcaps);
+
+	if (id_aa64pfr0_32bit_el0(read_cpuid(ID_AA64PFR0_EL1)))
+		__verify_local_elf_hwcaps(compat_elf_hwcaps);
+}
+
 static void verify_sve_features(void)
 {
 	u64 safe_zcr = read_sanitised_ftr_reg(SYS_ZCR_EL1);
@@ -2673,11 +2763,7 @@ static void verify_local_cpu_capabilities(void)
 	 * on all secondary CPUs.
 	 */
 	verify_local_cpu_caps(SCOPE_ALL & ~SCOPE_BOOT_CPU);
-
-	verify_local_elf_hwcaps(arm64_elf_hwcaps);
-
-	if (system_supports_32bit_el0())
-		verify_local_elf_hwcaps(compat_elf_hwcaps);
+	verify_local_elf_hwcaps();
 
 	if (system_supports_sve())
 		verify_sve_features();
@@ -2812,6 +2898,34 @@ void __init setup_cpu_features(void)
 			ARCH_DMA_MINALIGN);
 }
 
+static int enable_mismatched_32bit_el0(unsigned int cpu)
+{
+	struct cpuinfo_arm64 *info = &per_cpu(cpu_data, cpu);
+	bool cpu_32bit = id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0);
+
+	if (cpu_32bit) {
+		cpumask_set_cpu(cpu, cpu_32bit_el0_mask);
+		static_branch_enable_cpuslocked(&arm64_mismatched_32bit_el0);
+		setup_elf_hwcaps(compat_elf_hwcaps);
+	}
+
+	return 0;
+}
+
+static int __init init_32bit_el0_mask(void)
+{
+	if (!allow_mismatched_32bit_el0)
+		return 0;
+
+	if (!zalloc_cpumask_var(&cpu_32bit_el0_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	return cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+				 "arm64/mismatched_32bit_el0:online",
+				 enable_mismatched_32bit_el0, NULL);
+}
+subsys_initcall_sync(init_32bit_el0_mask);
+
 static void __maybe_unused cpu_enable_cnp(struct arm64_cpu_capabilities const *cap)
 {
 	cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
@@ -2905,8 +3019,8 @@ static int emulate_mrs(struct pt_regs *regs, u32 insn)
 }
 
 static struct undef_hook mrs_hook = {
-	.instr_mask = 0xfff00000,
-	.instr_val  = 0xd5300000,
+	.instr_mask = 0xffff0000,
+	.instr_val  = 0xd5380000,
 	.pstate_mask = PSR_AA32_MODE_MASK,
 	.pstate_val = PSR_MODE_EL0t,
 	.fn = emulate_mrs,
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 51fcf99d5351..87731fea5e41 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -246,7 +246,7 @@ static struct kobj_type cpuregs_kobj_type = {
 		struct cpuinfo_arm64 *info = kobj_to_cpuinfo(kobj);		\
 										\
 		if (info->reg_midr)						\
-			return sprintf(buf, "0x%016x\n", info->reg_##_field);	\
+			return sprintf(buf, "0x%016llx\n", info->reg_##_field);	\
 		else								\
 			return 0;						\
 	}									\
@@ -344,6 +344,32 @@ static void cpuinfo_detect_icache_policy(struct cpuinfo_arm64 *info)
 	pr_info("Detected %s I-cache on CPU%d\n", icache_policy_str[l1ip], cpu);
 }
 
+static void __cpuinfo_store_cpu_32bit(struct cpuinfo_32bit *info)
+{
+	info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1);
+	info->reg_id_dfr1 = read_cpuid(ID_DFR1_EL1);
+	info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1);
+	info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1);
+	info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1);
+	info->reg_id_isar3 = read_cpuid(ID_ISAR3_EL1);
+	info->reg_id_isar4 = read_cpuid(ID_ISAR4_EL1);
+	info->reg_id_isar5 = read_cpuid(ID_ISAR5_EL1);
+	info->reg_id_isar6 = read_cpuid(ID_ISAR6_EL1);
+	info->reg_id_mmfr0 = read_cpuid(ID_MMFR0_EL1);
+	info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1);
+	info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1);
+	info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1);
+	info->reg_id_mmfr4 = read_cpuid(ID_MMFR4_EL1);
+	info->reg_id_mmfr5 = read_cpuid(ID_MMFR5_EL1);
+	info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1);
+	info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1);
+	info->reg_id_pfr2 = read_cpuid(ID_PFR2_EL1);
+
+	info->reg_mvfr0 = read_cpuid(MVFR0_EL1);
+	info->reg_mvfr1 = read_cpuid(MVFR1_EL1);
+	info->reg_mvfr2 = read_cpuid(MVFR2_EL1);
+}
+
 static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info)
 {
 	info->reg_cntfrq = arch_timer_get_cntfrq();
@@ -371,31 +397,11 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info)
 	info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1);
 	info->reg_id_aa64zfr0 = read_cpuid(ID_AA64ZFR0_EL1);
 
-	/* Update the 32bit ID registers only if AArch32 is implemented */
-	if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) {
-		info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1);
-		info->reg_id_dfr1 = read_cpuid(ID_DFR1_EL1);
-		info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1);
-		info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1);
-		info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1);
-		info->reg_id_isar3 = read_cpuid(ID_ISAR3_EL1);
-		info->reg_id_isar4 = read_cpuid(ID_ISAR4_EL1);
-		info->reg_id_isar5 = read_cpuid(ID_ISAR5_EL1);
-		info->reg_id_isar6 = read_cpuid(ID_ISAR6_EL1);
-		info->reg_id_mmfr0 = read_cpuid(ID_MMFR0_EL1);
-		info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1);
-		info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1);
-		info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1);
-		info->reg_id_mmfr4 = read_cpuid(ID_MMFR4_EL1);
-		info->reg_id_mmfr5 = read_cpuid(ID_MMFR5_EL1);
-		info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1);
-		info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1);
-		info->reg_id_pfr2 = read_cpuid(ID_PFR2_EL1);
-
-		info->reg_mvfr0 = read_cpuid(MVFR0_EL1);
-		info->reg_mvfr1 = read_cpuid(MVFR1_EL1);
-		info->reg_mvfr2 = read_cpuid(MVFR2_EL1);
-	}
+	if (id_aa64pfr1_mte(info->reg_id_aa64pfr1))
+		info->reg_gmid = read_cpuid(GMID_EL1);
+
+	if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0))
+		__cpuinfo_store_cpu_32bit(&info->aarch32);
 
 	if (IS_ENABLED(CONFIG_ARM64_SVE) &&
 	    id_aa64pfr0_sve(info->reg_id_aa64pfr0))
diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S
index 0073b24b5d25..61a87fa1c305 100644
--- a/arch/arm64/kernel/efi-entry.S
+++ b/arch/arm64/kernel/efi-entry.S
@@ -28,7 +28,8 @@ SYM_CODE_START(efi_enter_kernel)
 	 * stale icache entries from before relocation.
 	 */
 	ldr	w1, =kernel_size
-	bl	__clean_dcache_area_poc
+	add	x1, x0, x1
+	bl	dcache_clean_poc
 	ic	ialluis
 
 	/*
@@ -36,8 +37,8 @@ SYM_CODE_START(efi_enter_kernel)
 	 * so that we can safely disable the MMU and caches.
 	 */
 	adr	x0, 0f
-	ldr	w1, 3f
-	bl	__clean_dcache_area_poc
+	adr	x1, 3f
+	bl	dcache_clean_poc
 0:
 	/* Turn off Dcache and MMU */
 	mrs	x0, CurrentEL
@@ -64,5 +65,5 @@ SYM_CODE_START(efi_enter_kernel)
 	mov	x2, xzr
 	mov	x3, xzr
 	br	x19
+3:
 SYM_CODE_END(efi_enter_kernel)
-3:	.long	. - 0b
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index 340d04e13617..12ce14a98b7c 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -6,7 +6,11 @@
  */
 
 #include <linux/context_tracking.h>
+#include <linux/linkage.h>
+#include <linux/lockdep.h>
 #include <linux/ptrace.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/thread_info.h>
 
 #include <asm/cpufeature.h>
@@ -15,7 +19,11 @@
 #include <asm/exception.h>
 #include <asm/kprobes.h>
 #include <asm/mmu.h>
+#include <asm/processor.h>
+#include <asm/sdei.h>
+#include <asm/stacktrace.h>
 #include <asm/sysreg.h>
+#include <asm/system_misc.h>
 
 /*
  * This is intended to match the logic in irqentry_enter(), handling the kernel
@@ -67,7 +75,7 @@ static void noinstr exit_to_kernel_mode(struct pt_regs *regs)
 	}
 }
 
-void noinstr arm64_enter_nmi(struct pt_regs *regs)
+static void noinstr arm64_enter_nmi(struct pt_regs *regs)
 {
 	regs->lockdep_hardirqs = lockdep_hardirqs_enabled();
 
@@ -80,7 +88,7 @@ void noinstr arm64_enter_nmi(struct pt_regs *regs)
 	ftrace_nmi_enter();
 }
 
-void noinstr arm64_exit_nmi(struct pt_regs *regs)
+static void noinstr arm64_exit_nmi(struct pt_regs *regs)
 {
 	bool restore = regs->lockdep_hardirqs;
 
@@ -97,7 +105,7 @@ void noinstr arm64_exit_nmi(struct pt_regs *regs)
 	__nmi_exit();
 }
 
-asmlinkage void noinstr enter_el1_irq_or_nmi(struct pt_regs *regs)
+static void noinstr enter_el1_irq_or_nmi(struct pt_regs *regs)
 {
 	if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs))
 		arm64_enter_nmi(regs);
@@ -105,7 +113,7 @@ asmlinkage void noinstr enter_el1_irq_or_nmi(struct pt_regs *regs)
 		enter_from_kernel_mode(regs);
 }
 
-asmlinkage void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs)
+static void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs)
 {
 	if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs))
 		arm64_exit_nmi(regs);
@@ -113,6 +121,65 @@ asmlinkage void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs)
 		exit_to_kernel_mode(regs);
 }
 
+static void __sched arm64_preempt_schedule_irq(void)
+{
+	lockdep_assert_irqs_disabled();
+
+	/*
+	 * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC
+	 * priority masking is used the GIC irqchip driver will clear DAIF.IF
+	 * using gic_arch_enable_irqs() for normal IRQs. If anything is set in
+	 * DAIF we must have handled an NMI, so skip preemption.
+	 */
+	if (system_uses_irq_prio_masking() && read_sysreg(daif))
+		return;
+
+	/*
+	 * Preempting a task from an IRQ means we leave copies of PSTATE
+	 * on the stack. cpufeature's enable calls may modify PSTATE, but
+	 * resuming one of these preempted tasks would undo those changes.
+	 *
+	 * Only allow a task to be preempted once cpufeatures have been
+	 * enabled.
+	 */
+	if (system_capabilities_finalized())
+		preempt_schedule_irq();
+}
+
+static void do_interrupt_handler(struct pt_regs *regs,
+				 void (*handler)(struct pt_regs *))
+{
+	if (on_thread_stack())
+		call_on_irq_stack(regs, handler);
+	else
+		handler(regs);
+}
+
+extern void (*handle_arch_irq)(struct pt_regs *);
+extern void (*handle_arch_fiq)(struct pt_regs *);
+
+static void noinstr __panic_unhandled(struct pt_regs *regs, const char *vector,
+				      unsigned int esr)
+{
+	arm64_enter_nmi(regs);
+
+	console_verbose();
+
+	pr_crit("Unhandled %s exception on CPU%d, ESR 0x%08x -- %s\n",
+		vector, smp_processor_id(), esr,
+		esr_get_class_string(esr));
+
+	__show_regs(regs);
+	panic("Unhandled exception");
+}
+
+#define UNHANDLED(el, regsize, vector)							\
+asmlinkage void noinstr el##_##regsize##_##vector##_handler(struct pt_regs *regs)	\
+{											\
+	const char *desc = #regsize "-bit " #el " " #vector;				\
+	__panic_unhandled(regs, desc, read_sysreg(esr_el1));				\
+}
+
 #ifdef CONFIG_ARM64_ERRATUM_1463225
 static DEFINE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa);
 
@@ -162,6 +229,11 @@ static bool cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
 }
 #endif /* CONFIG_ARM64_ERRATUM_1463225 */
 
+UNHANDLED(el1t, 64, sync)
+UNHANDLED(el1t, 64, irq)
+UNHANDLED(el1t, 64, fiq)
+UNHANDLED(el1t, 64, error)
+
 static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr)
 {
 	unsigned long far = read_sysreg(far_el1);
@@ -193,15 +265,6 @@ static void noinstr el1_undef(struct pt_regs *regs)
 	exit_to_kernel_mode(regs);
 }
 
-static void noinstr el1_inv(struct pt_regs *regs, unsigned long esr)
-{
-	enter_from_kernel_mode(regs);
-	local_daif_inherit(regs);
-	bad_mode(regs, 0, esr);
-	local_daif_mask();
-	exit_to_kernel_mode(regs);
-}
-
 static void noinstr arm64_enter_el1_dbg(struct pt_regs *regs)
 {
 	regs->lockdep_hardirqs = lockdep_hardirqs_enabled();
@@ -245,7 +308,7 @@ static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr)
 	exit_to_kernel_mode(regs);
 }
 
-asmlinkage void noinstr el1_sync_handler(struct pt_regs *regs)
+asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs)
 {
 	unsigned long esr = read_sysreg(esr_el1);
 
@@ -275,10 +338,50 @@ asmlinkage void noinstr el1_sync_handler(struct pt_regs *regs)
 		el1_fpac(regs, esr);
 		break;
 	default:
-		el1_inv(regs, esr);
+		__panic_unhandled(regs, "64-bit el1h sync", esr);
 	}
 }
 
+static void noinstr el1_interrupt(struct pt_regs *regs,
+				  void (*handler)(struct pt_regs *))
+{
+	write_sysreg(DAIF_PROCCTX_NOIRQ, daif);
+
+	enter_el1_irq_or_nmi(regs);
+	do_interrupt_handler(regs, handler);
+
+	/*
+	 * Note: thread_info::preempt_count includes both thread_info::count
+	 * and thread_info::need_resched, and is not equivalent to
+	 * preempt_count().
+	 */
+	if (IS_ENABLED(CONFIG_PREEMPTION) &&
+	    READ_ONCE(current_thread_info()->preempt_count) == 0)
+		arm64_preempt_schedule_irq();
+
+	exit_el1_irq_or_nmi(regs);
+}
+
+asmlinkage void noinstr el1h_64_irq_handler(struct pt_regs *regs)
+{
+	el1_interrupt(regs, handle_arch_irq);
+}
+
+asmlinkage void noinstr el1h_64_fiq_handler(struct pt_regs *regs)
+{
+	el1_interrupt(regs, handle_arch_fiq);
+}
+
+asmlinkage void noinstr el1h_64_error_handler(struct pt_regs *regs)
+{
+	unsigned long esr = read_sysreg(esr_el1);
+
+	local_daif_restore(DAIF_ERRCTX);
+	arm64_enter_nmi(regs);
+	do_serror(regs, esr);
+	arm64_exit_nmi(regs);
+}
+
 asmlinkage void noinstr enter_from_user_mode(void)
 {
 	lockdep_hardirqs_off(CALLER_ADDR0);
@@ -398,7 +501,7 @@ static void noinstr el0_dbg(struct pt_regs *regs, unsigned long esr)
 
 	enter_from_user_mode();
 	do_debug_exception(far, esr, regs);
-	local_daif_restore(DAIF_PROCCTX_NOIRQ);
+	local_daif_restore(DAIF_PROCCTX);
 }
 
 static void noinstr el0_svc(struct pt_regs *regs)
@@ -415,7 +518,7 @@ static void noinstr el0_fpac(struct pt_regs *regs, unsigned long esr)
 	do_ptrauth_fault(regs, esr);
 }
 
-asmlinkage void noinstr el0_sync_handler(struct pt_regs *regs)
+asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs)
 {
 	unsigned long esr = read_sysreg(esr_el1);
 
@@ -468,6 +571,56 @@ asmlinkage void noinstr el0_sync_handler(struct pt_regs *regs)
 	}
 }
 
+static void noinstr el0_interrupt(struct pt_regs *regs,
+				  void (*handler)(struct pt_regs *))
+{
+	enter_from_user_mode();
+
+	write_sysreg(DAIF_PROCCTX_NOIRQ, daif);
+
+	if (regs->pc & BIT(55))
+		arm64_apply_bp_hardening();
+
+	do_interrupt_handler(regs, handler);
+}
+
+static void noinstr __el0_irq_handler_common(struct pt_regs *regs)
+{
+	el0_interrupt(regs, handle_arch_irq);
+}
+
+asmlinkage void noinstr el0t_64_irq_handler(struct pt_regs *regs)
+{
+	__el0_irq_handler_common(regs);
+}
+
+static void noinstr __el0_fiq_handler_common(struct pt_regs *regs)
+{
+	el0_interrupt(regs, handle_arch_fiq);
+}
+
+asmlinkage void noinstr el0t_64_fiq_handler(struct pt_regs *regs)
+{
+	__el0_fiq_handler_common(regs);
+}
+
+static void __el0_error_handler_common(struct pt_regs *regs)
+{
+	unsigned long esr = read_sysreg(esr_el1);
+
+	enter_from_user_mode();
+	local_daif_restore(DAIF_ERRCTX);
+	arm64_enter_nmi(regs);
+	do_serror(regs, esr);
+	arm64_exit_nmi(regs);
+	local_daif_restore(DAIF_PROCCTX);
+}
+
+asmlinkage void noinstr el0t_64_error_handler(struct pt_regs *regs)
+{
+	__el0_error_handler_common(regs);
+}
+
 #ifdef CONFIG_COMPAT
 static void noinstr el0_cp15(struct pt_regs *regs, unsigned long esr)
 {
@@ -483,7 +636,7 @@ static void noinstr el0_svc_compat(struct pt_regs *regs)
 	do_el0_svc_compat(regs);
 }
 
-asmlinkage void noinstr el0_sync_compat_handler(struct pt_regs *regs)
+asmlinkage void noinstr el0t_32_sync_handler(struct pt_regs *regs)
 {
 	unsigned long esr = read_sysreg(esr_el1);
 
@@ -526,4 +679,71 @@ asmlinkage void noinstr el0_sync_compat_handler(struct pt_regs *regs)
 		el0_inv(regs, esr);
 	}
 }
+
+asmlinkage void noinstr el0t_32_irq_handler(struct pt_regs *regs)
+{
+	__el0_irq_handler_common(regs);
+}
+
+asmlinkage void noinstr el0t_32_fiq_handler(struct pt_regs *regs)
+{
+	__el0_fiq_handler_common(regs);
+}
+
+asmlinkage void noinstr el0t_32_error_handler(struct pt_regs *regs)
+{
+	__el0_error_handler_common(regs);
+}
+#else /* CONFIG_COMPAT */
+UNHANDLED(el0t, 32, sync)
+UNHANDLED(el0t, 32, irq)
+UNHANDLED(el0t, 32, fiq)
+UNHANDLED(el0t, 32, error)
 #endif /* CONFIG_COMPAT */
+
+#ifdef CONFIG_VMAP_STACK
+asmlinkage void noinstr handle_bad_stack(struct pt_regs *regs)
+{
+	unsigned int esr = read_sysreg(esr_el1);
+	unsigned long far = read_sysreg(far_el1);
+
+	arm64_enter_nmi(regs);
+	panic_bad_stack(regs, esr, far);
+}
+#endif /* CONFIG_VMAP_STACK */
+
+#ifdef CONFIG_ARM_SDE_INTERFACE
+asmlinkage noinstr unsigned long
+__sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg)
+{
+	unsigned long ret;
+
+	/*
+	 * We didn't take an exception to get here, so the HW hasn't
+	 * set/cleared bits in PSTATE that we may rely on.
+	 *
+	 * The original SDEI spec (ARM DEN 0054A) can be read ambiguously as to
+	 * whether PSTATE bits are inherited unchanged or generated from
+	 * scratch, and the TF-A implementation always clears PAN and always
+	 * clears UAO. There are no other known implementations.
+	 *
+	 * Subsequent revisions (ARM DEN 0054B) follow the usual rules for how
+	 * PSTATE is modified upon architectural exceptions, and so PAN is
+	 * either inherited or set per SCTLR_ELx.SPAN, and UAO is always
+	 * cleared.
+	 *
+	 * We must explicitly reset PAN to the expected state, including
+	 * clearing it when the host isn't using it, in case a VM had it set.
+	 */
+	if (system_uses_hw_pan())
+		set_pstate_pan(1);
+	else if (cpu_has_pan())
+		set_pstate_pan(0);
+
+	arm64_enter_nmi(regs);
+	ret = do_sdei_event(regs, arg);
+	arm64_exit_nmi(regs);
+
+	return ret;
+}
+#endif /* CONFIG_ARM_SDE_INTERFACE */
diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S
index 3ecec60d3295..0a7a64753878 100644
--- a/arch/arm64/kernel/entry-fpsimd.S
+++ b/arch/arm64/kernel/entry-fpsimd.S
@@ -63,16 +63,24 @@ SYM_FUNC_END(sve_set_vq)
  * and the rest zeroed. All the other SVE registers will be zeroed.
  */
 SYM_FUNC_START(sve_load_from_fpsimd_state)
-		sve_load_vq	x1, x2, x3
-		fpsimd_restore	x0, 8
- _for n, 0, 15, _sve_pfalse	\n
-		_sve_wrffr	0
-		ret
+	sve_load_vq	x1, x2, x3
+	fpsimd_restore	x0, 8
+	sve_flush_p_ffr
+	ret
 SYM_FUNC_END(sve_load_from_fpsimd_state)
 
-/* Zero all SVE registers but the first 128-bits of each vector */
+/*
+ * Zero all SVE registers but the first 128-bits of each vector
+ *
+ * VQ must already be configured by caller, any further updates of VQ
+ * will need to ensure that the register state remains valid.
+ *
+ * x0 = VQ - 1
+ */
 SYM_FUNC_START(sve_flush_live)
-	sve_flush
+	cbz		x0, 1f	// A VQ-1 of 0 is 128 bits so no extra Z state
+	sve_flush_z
+1:	sve_flush_p_ffr
 	ret
 SYM_FUNC_END(sve_flush_live)
 
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 3513984a88bd..863d44f73028 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -33,12 +33,6 @@
  * Context tracking and irqflag tracing need to instrument transitions between
  * user and kernel mode.
  */
-	.macro user_exit_irqoff
-#if defined(CONFIG_CONTEXT_TRACKING) || defined(CONFIG_TRACE_IRQFLAGS)
-	bl	enter_from_user_mode
-#endif
-	.endm
-
 	.macro user_enter_irqoff
 #if defined(CONFIG_CONTEXT_TRACKING) || defined(CONFIG_TRACE_IRQFLAGS)
 	bl	exit_to_user_mode
@@ -51,16 +45,7 @@
 	.endr
 	.endm
 
-/*
- * Bad Abort numbers
- *-----------------
- */
-#define BAD_SYNC	0
-#define BAD_IRQ		1
-#define BAD_FIQ		2
-#define BAD_ERROR	3
-
-	.macro kernel_ventry, el, label, regsize = 64
+	.macro kernel_ventry, el:req, ht:req, regsize:req, label:req
 	.align 7
 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
 	.if	\el == 0
@@ -87,7 +72,7 @@ alternative_else_nop_endif
 	tbnz	x0, #THREAD_SHIFT, 0f
 	sub	x0, sp, x0			// x0'' = sp' - x0' = (sp + x0) - sp = x0
 	sub	sp, sp, x0			// sp'' = sp' - x0 = (sp + x0) - x0 = sp
-	b	el\()\el\()_\label
+	b	el\el\ht\()_\regsize\()_\label
 
 0:
 	/*
@@ -119,7 +104,7 @@ alternative_else_nop_endif
 	sub	sp, sp, x0
 	mrs	x0, tpidrro_el0
 #endif
-	b	el\()\el\()_\label
+	b	el\el\ht\()_\regsize\()_\label
 	.endm
 
 	.macro tramp_alias, dst, sym
@@ -275,7 +260,7 @@ alternative_else_nop_endif
 
 	mte_set_kernel_gcr x22, x23
 
-	scs_load tsk, x20
+	scs_load tsk
 	.else
 	add	x21, sp, #PT_REGS_SIZE
 	get_current_task tsk
@@ -285,7 +270,7 @@ alternative_else_nop_endif
 	stp	lr, x21, [sp, #S_LR]
 
 	/*
-	 * For exceptions from EL0, create a terminal frame record.
+	 * For exceptions from EL0, create a final frame record.
 	 * For exceptions from EL1, create a synthetic frame record so the
 	 * interrupted code shows up in the backtrace.
 	 */
@@ -375,7 +360,7 @@ alternative_if ARM64_WORKAROUND_845719
 alternative_else_nop_endif
 #endif
 3:
-	scs_save tsk, x0
+	scs_save tsk
 
 #ifdef CONFIG_ARM64_PTR_AUTH
 alternative_if ARM64_HAS_ADDRESS_AUTH
@@ -486,63 +471,12 @@ SYM_CODE_START_LOCAL(__swpan_exit_el0)
 SYM_CODE_END(__swpan_exit_el0)
 #endif
 
-	.macro	irq_stack_entry
-	mov	x19, sp			// preserve the original sp
-#ifdef CONFIG_SHADOW_CALL_STACK
-	mov	x24, scs_sp		// preserve the original shadow stack
-#endif
-
-	/*
-	 * Compare sp with the base of the task stack.
-	 * If the top ~(THREAD_SIZE - 1) bits match, we are on a task stack,
-	 * and should switch to the irq stack.
-	 */
-	ldr	x25, [tsk, TSK_STACK]
-	eor	x25, x25, x19
-	and	x25, x25, #~(THREAD_SIZE - 1)
-	cbnz	x25, 9998f
-
-	ldr_this_cpu x25, irq_stack_ptr, x26
-	mov	x26, #IRQ_STACK_SIZE
-	add	x26, x25, x26
-
-	/* switch to the irq stack */
-	mov	sp, x26
-
-#ifdef CONFIG_SHADOW_CALL_STACK
-	/* also switch to the irq shadow stack */
-	ldr_this_cpu scs_sp, irq_shadow_call_stack_ptr, x26
-#endif
-
-9998:
-	.endm
-
-	/*
-	 * The callee-saved regs (x19-x29) should be preserved between
-	 * irq_stack_entry and irq_stack_exit, but note that kernel_entry
-	 * uses x20-x23 to store data for later use.
-	 */
-	.macro	irq_stack_exit
-	mov	sp, x19
-#ifdef CONFIG_SHADOW_CALL_STACK
-	mov	scs_sp, x24
-#endif
-	.endm
-
 /* GPRs used by entry code */
 tsk	.req	x28		// current thread_info
 
 /*
  * Interrupt handling.
  */
-	.macro	irq_handler, handler:req
-	ldr_l	x1, \handler
-	mov	x0, sp
-	irq_stack_entry
-	blr	x1
-	irq_stack_exit
-	.endm
-
 	.macro	gic_prio_kentry_setup, tmp:req
 #ifdef CONFIG_ARM64_PSEUDO_NMI
 	alternative_if ARM64_HAS_IRQ_PRIO_MASKING
@@ -552,45 +486,6 @@ tsk	.req	x28		// current thread_info
 #endif
 	.endm
 
-	.macro el1_interrupt_handler, handler:req
-	enable_da
-
-	mov	x0, sp
-	bl	enter_el1_irq_or_nmi
-
-	irq_handler	\handler
-
-#ifdef CONFIG_PREEMPTION
-	ldr	x24, [tsk, #TSK_TI_PREEMPT]	// get preempt count
-alternative_if ARM64_HAS_IRQ_PRIO_MASKING
-	/*
-	 * DA were cleared at start of handling, and IF are cleared by
-	 * the GIC irqchip driver using gic_arch_enable_irqs() for
-	 * normal IRQs. If anything is set, it means we come back from
-	 * an NMI instead of a normal IRQ, so skip preemption
-	 */
-	mrs	x0, daif
-	orr	x24, x24, x0
-alternative_else_nop_endif
-	cbnz	x24, 1f				// preempt count != 0 || NMI return path
-	bl	arm64_preempt_schedule_irq	// irq en/disable is done inside
-1:
-#endif
-
-	mov	x0, sp
-	bl	exit_el1_irq_or_nmi
-	.endm
-
-	.macro el0_interrupt_handler, handler:req
-	user_exit_irqoff
-	enable_da
-
-	tbz	x22, #55, 1f
-	bl	do_el0_irq_bp_hardening
-1:
-	irq_handler	\handler
-	.endm
-
 	.text
 
 /*
@@ -600,32 +495,25 @@ alternative_else_nop_endif
 
 	.align	11
 SYM_CODE_START(vectors)
-	kernel_ventry	1, sync_invalid			// Synchronous EL1t
-	kernel_ventry	1, irq_invalid			// IRQ EL1t
-	kernel_ventry	1, fiq_invalid			// FIQ EL1t
-	kernel_ventry	1, error_invalid		// Error EL1t
-
-	kernel_ventry	1, sync				// Synchronous EL1h
-	kernel_ventry	1, irq				// IRQ EL1h
-	kernel_ventry	1, fiq				// FIQ EL1h
-	kernel_ventry	1, error			// Error EL1h
-
-	kernel_ventry	0, sync				// Synchronous 64-bit EL0
-	kernel_ventry	0, irq				// IRQ 64-bit EL0
-	kernel_ventry	0, fiq				// FIQ 64-bit EL0
-	kernel_ventry	0, error			// Error 64-bit EL0
-
-#ifdef CONFIG_COMPAT
-	kernel_ventry	0, sync_compat, 32		// Synchronous 32-bit EL0
-	kernel_ventry	0, irq_compat, 32		// IRQ 32-bit EL0
-	kernel_ventry	0, fiq_compat, 32		// FIQ 32-bit EL0
-	kernel_ventry	0, error_compat, 32		// Error 32-bit EL0
-#else
-	kernel_ventry	0, sync_invalid, 32		// Synchronous 32-bit EL0
-	kernel_ventry	0, irq_invalid, 32		// IRQ 32-bit EL0
-	kernel_ventry	0, fiq_invalid, 32		// FIQ 32-bit EL0
-	kernel_ventry	0, error_invalid, 32		// Error 32-bit EL0
-#endif
+	kernel_ventry	1, t, 64, sync		// Synchronous EL1t
+	kernel_ventry	1, t, 64, irq		// IRQ EL1t
+	kernel_ventry	1, t, 64, fiq		// FIQ EL1h
+	kernel_ventry	1, t, 64, error		// Error EL1t
+
+	kernel_ventry	1, h, 64, sync		// Synchronous EL1h
+	kernel_ventry	1, h, 64, irq		// IRQ EL1h
+	kernel_ventry	1, h, 64, fiq		// FIQ EL1h
+	kernel_ventry	1, h, 64, error		// Error EL1h
+
+	kernel_ventry	0, t, 64, sync		// Synchronous 64-bit EL0
+	kernel_ventry	0, t, 64, irq		// IRQ 64-bit EL0
+	kernel_ventry	0, t, 64, fiq		// FIQ 64-bit EL0
+	kernel_ventry	0, t, 64, error		// Error 64-bit EL0
+
+	kernel_ventry	0, t, 32, sync		// Synchronous 32-bit EL0
+	kernel_ventry	0, t, 32, irq		// IRQ 32-bit EL0
+	kernel_ventry	0, t, 32, fiq		// FIQ 32-bit EL0
+	kernel_ventry	0, t, 32, error		// Error 32-bit EL0
 SYM_CODE_END(vectors)
 
 #ifdef CONFIG_VMAP_STACK
@@ -656,147 +544,46 @@ __bad_stack:
 	ASM_BUG()
 #endif /* CONFIG_VMAP_STACK */
 
-/*
- * Invalid mode handlers
- */
-	.macro	inv_entry, el, reason, regsize = 64
+
+	.macro entry_handler el:req, ht:req, regsize:req, label:req
+SYM_CODE_START_LOCAL(el\el\ht\()_\regsize\()_\label)
 	kernel_entry \el, \regsize
 	mov	x0, sp
-	mov	x1, #\reason
-	mrs	x2, esr_el1
-	bl	bad_mode
-	ASM_BUG()
+	bl	el\el\ht\()_\regsize\()_\label\()_handler
+	.if \el == 0
+	b	ret_to_user
+	.else
+	b	ret_to_kernel
+	.endif
+SYM_CODE_END(el\el\ht\()_\regsize\()_\label)
 	.endm
 
-SYM_CODE_START_LOCAL(el0_sync_invalid)
-	inv_entry 0, BAD_SYNC
-SYM_CODE_END(el0_sync_invalid)
-
-SYM_CODE_START_LOCAL(el0_irq_invalid)
-	inv_entry 0, BAD_IRQ
-SYM_CODE_END(el0_irq_invalid)
-
-SYM_CODE_START_LOCAL(el0_fiq_invalid)
-	inv_entry 0, BAD_FIQ
-SYM_CODE_END(el0_fiq_invalid)
-
-SYM_CODE_START_LOCAL(el0_error_invalid)
-	inv_entry 0, BAD_ERROR
-SYM_CODE_END(el0_error_invalid)
-
-SYM_CODE_START_LOCAL(el1_sync_invalid)
-	inv_entry 1, BAD_SYNC
-SYM_CODE_END(el1_sync_invalid)
-
-SYM_CODE_START_LOCAL(el1_irq_invalid)
-	inv_entry 1, BAD_IRQ
-SYM_CODE_END(el1_irq_invalid)
-
-SYM_CODE_START_LOCAL(el1_fiq_invalid)
-	inv_entry 1, BAD_FIQ
-SYM_CODE_END(el1_fiq_invalid)
-
-SYM_CODE_START_LOCAL(el1_error_invalid)
-	inv_entry 1, BAD_ERROR
-SYM_CODE_END(el1_error_invalid)
-
 /*
- * EL1 mode handlers.
+ * Early exception handlers
  */
-	.align	6
-SYM_CODE_START_LOCAL_NOALIGN(el1_sync)
-	kernel_entry 1
-	mov	x0, sp
-	bl	el1_sync_handler
-	kernel_exit 1
-SYM_CODE_END(el1_sync)
-
-	.align	6
-SYM_CODE_START_LOCAL_NOALIGN(el1_irq)
-	kernel_entry 1
-	el1_interrupt_handler handle_arch_irq
-	kernel_exit 1
-SYM_CODE_END(el1_irq)
-
-SYM_CODE_START_LOCAL_NOALIGN(el1_fiq)
-	kernel_entry 1
-	el1_interrupt_handler handle_arch_fiq
-	kernel_exit 1
-SYM_CODE_END(el1_fiq)
-
-/*
- * EL0 mode handlers.
- */
-	.align	6
-SYM_CODE_START_LOCAL_NOALIGN(el0_sync)
-	kernel_entry 0
-	mov	x0, sp
-	bl	el0_sync_handler
-	b	ret_to_user
-SYM_CODE_END(el0_sync)
-
-#ifdef CONFIG_COMPAT
-	.align	6
-SYM_CODE_START_LOCAL_NOALIGN(el0_sync_compat)
-	kernel_entry 0, 32
-	mov	x0, sp
-	bl	el0_sync_compat_handler
-	b	ret_to_user
-SYM_CODE_END(el0_sync_compat)
-
-	.align	6
-SYM_CODE_START_LOCAL_NOALIGN(el0_irq_compat)
-	kernel_entry 0, 32
-	b	el0_irq_naked
-SYM_CODE_END(el0_irq_compat)
-
-SYM_CODE_START_LOCAL_NOALIGN(el0_fiq_compat)
-	kernel_entry 0, 32
-	b	el0_fiq_naked
-SYM_CODE_END(el0_fiq_compat)
-
-SYM_CODE_START_LOCAL_NOALIGN(el0_error_compat)
-	kernel_entry 0, 32
-	b	el0_error_naked
-SYM_CODE_END(el0_error_compat)
-#endif
-
-	.align	6
-SYM_CODE_START_LOCAL_NOALIGN(el0_irq)
-	kernel_entry 0
-el0_irq_naked:
-	el0_interrupt_handler handle_arch_irq
-	b	ret_to_user
-SYM_CODE_END(el0_irq)
-
-SYM_CODE_START_LOCAL_NOALIGN(el0_fiq)
-	kernel_entry 0
-el0_fiq_naked:
-	el0_interrupt_handler handle_arch_fiq
-	b	ret_to_user
-SYM_CODE_END(el0_fiq)
-
-SYM_CODE_START_LOCAL(el1_error)
-	kernel_entry 1
-	mrs	x1, esr_el1
-	enable_dbg
-	mov	x0, sp
-	bl	do_serror
+	entry_handler	1, t, 64, sync
+	entry_handler	1, t, 64, irq
+	entry_handler	1, t, 64, fiq
+	entry_handler	1, t, 64, error
+
+	entry_handler	1, h, 64, sync
+	entry_handler	1, h, 64, irq
+	entry_handler	1, h, 64, fiq
+	entry_handler	1, h, 64, error
+
+	entry_handler	0, t, 64, sync
+	entry_handler	0, t, 64, irq
+	entry_handler	0, t, 64, fiq
+	entry_handler	0, t, 64, error
+
+	entry_handler	0, t, 32, sync
+	entry_handler	0, t, 32, irq
+	entry_handler	0, t, 32, fiq
+	entry_handler	0, t, 32, error
+
+SYM_CODE_START_LOCAL(ret_to_kernel)
 	kernel_exit 1
-SYM_CODE_END(el1_error)
-
-SYM_CODE_START_LOCAL(el0_error)
-	kernel_entry 0
-el0_error_naked:
-	mrs	x25, esr_el1
-	user_exit_irqoff
-	enable_dbg
-	mov	x0, sp
-	mov	x1, x25
-	bl	do_serror
-	enable_da
-	b	ret_to_user
-SYM_CODE_END(el0_error)
+SYM_CODE_END(ret_to_kernel)
 
 /*
  * "slow" syscall return path.
@@ -979,8 +766,8 @@ SYM_FUNC_START(cpu_switch_to)
 	mov	sp, x9
 	msr	sp_el0, x1
 	ptrauth_keys_install_kernel x1, x8, x9, x10
-	scs_save x0, x8
-	scs_load x1, x8
+	scs_save x0
+	scs_load x1
 	ret
 SYM_FUNC_END(cpu_switch_to)
 NOKPROBE(cpu_switch_to)
@@ -998,6 +785,42 @@ SYM_CODE_START(ret_from_fork)
 SYM_CODE_END(ret_from_fork)
 NOKPROBE(ret_from_fork)
 
+/*
+ * void call_on_irq_stack(struct pt_regs *regs,
+ * 		          void (*func)(struct pt_regs *));
+ *
+ * Calls func(regs) using this CPU's irq stack and shadow irq stack.
+ */
+SYM_FUNC_START(call_on_irq_stack)
+#ifdef CONFIG_SHADOW_CALL_STACK
+	stp	scs_sp, xzr, [sp, #-16]!
+	ldr_this_cpu scs_sp, irq_shadow_call_stack_ptr, x17
+#endif
+	/* Create a frame record to save our LR and SP (implicit in FP) */
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, sp
+
+	ldr_this_cpu x16, irq_stack_ptr, x17
+	mov	x15, #IRQ_STACK_SIZE
+	add	x16, x16, x15
+
+	/* Move to the new stack and call the function there */
+	mov	sp, x16
+	blr	x1
+
+	/*
+	 * Restore the SP from the FP, and restore the FP and LR from the frame
+	 * record.
+	 */
+	mov	sp, x29
+	ldp	x29, x30, [sp], #16
+#ifdef CONFIG_SHADOW_CALL_STACK
+	ldp	scs_sp, xzr, [sp], #16
+#endif
+	ret
+SYM_FUNC_END(call_on_irq_stack)
+NOKPROBE(call_on_irq_stack)
+
 #ifdef CONFIG_ARM_SDE_INTERFACE
 
 #include <asm/sdei.h>
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index ad3dd34a83cf..e57b23f95284 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -957,8 +957,10 @@ void do_sve_acc(unsigned int esr, struct pt_regs *regs)
 	 * disabling the trap, otherwise update our in-memory copy.
 	 */
 	if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) {
-		sve_set_vq(sve_vq_from_vl(current->thread.sve_vl) - 1);
-		sve_flush_live();
+		unsigned long vq_minus_one =
+			sve_vq_from_vl(current->thread.sve_vl) - 1;
+		sve_set_vq(vq_minus_one);
+		sve_flush_live(vq_minus_one);
 		fpsimd_bind_task_to_cpu();
 	} else {
 		fpsimd_to_sve(current);
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
index b5d3ddaf69d9..7f467bd9db7a 100644
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -15,6 +15,7 @@
 #include <asm/debug-monitors.h>
 #include <asm/ftrace.h>
 #include <asm/insn.h>
+#include <asm/patching.h>
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 /*
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 96873dfa67fd..c5c994a73a64 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -16,6 +16,7 @@
 #include <asm/asm_pointer_auth.h>
 #include <asm/assembler.h>
 #include <asm/boot.h>
+#include <asm/bug.h>
 #include <asm/ptrace.h>
 #include <asm/asm-offsets.h>
 #include <asm/cache.h>
@@ -117,8 +118,8 @@ SYM_CODE_START_LOCAL(preserve_boot_args)
 	dmb	sy				// needed before dc ivac with
 						// MMU off
 
-	mov	x1, #0x20			// 4 x 8 bytes
-	b	__inval_dcache_area		// tail call
+	add	x1, x0, #0x20			// 4 x 8 bytes
+	b	dcache_inval_poc		// tail call
 SYM_CODE_END(preserve_boot_args)
 
 /*
@@ -195,7 +196,7 @@ SYM_CODE_END(preserve_boot_args)
 	and	\iend, \iend, \istart	// iend = (vend >> shift) & (ptrs - 1)
 	mov	\istart, \ptrs
 	mul	\istart, \istart, \count
-	add	\iend, \iend, \istart	// iend += (count - 1) * ptrs
+	add	\iend, \iend, \istart	// iend += count * ptrs
 					// our entries span multiple tables
 
 	lsr	\istart, \vstart, \shift
@@ -268,8 +269,7 @@ SYM_FUNC_START_LOCAL(__create_page_tables)
 	 */
 	adrp	x0, init_pg_dir
 	adrp	x1, init_pg_end
-	sub	x1, x1, x0
-	bl	__inval_dcache_area
+	bl	dcache_inval_poc
 
 	/*
 	 * Clear the init page tables.
@@ -354,7 +354,6 @@ SYM_FUNC_START_LOCAL(__create_page_tables)
 #endif
 1:
 	ldr_l	x4, idmap_ptrs_per_pgd
-	mov	x5, x3				// __pa(__idmap_text_start)
 	adr_l	x6, __idmap_text_end		// __pa(__idmap_text_end)
 
 	map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14
@@ -382,39 +381,57 @@ SYM_FUNC_START_LOCAL(__create_page_tables)
 
 	adrp	x0, idmap_pg_dir
 	adrp	x1, idmap_pg_end
-	sub	x1, x1, x0
-	bl	__inval_dcache_area
+	bl	dcache_inval_poc
 
 	adrp	x0, init_pg_dir
 	adrp	x1, init_pg_end
-	sub	x1, x1, x0
-	bl	__inval_dcache_area
+	bl	dcache_inval_poc
 
 	ret	x28
 SYM_FUNC_END(__create_page_tables)
 
+	/*
+	 * Initialize CPU registers with task-specific and cpu-specific context.
+	 *
+	 * Create a final frame record at task_pt_regs(current)->stackframe, so
+	 * that the unwinder can identify the final frame record of any task by
+	 * its location in the task stack. We reserve the entire pt_regs space
+	 * for consistency with user tasks and kthreads.
+	 */
+	.macro	init_cpu_task tsk, tmp1, tmp2
+	msr	sp_el0, \tsk
+
+	ldr	\tmp1, [\tsk, #TSK_STACK]
+	add	sp, \tmp1, #THREAD_SIZE
+	sub	sp, sp, #PT_REGS_SIZE
+
+	stp	xzr, xzr, [sp, #S_STACKFRAME]
+	add	x29, sp, #S_STACKFRAME
+
+	scs_load \tsk
+
+	adr_l	\tmp1, __per_cpu_offset
+	ldr	w\tmp2, [\tsk, #TSK_CPU]
+	ldr	\tmp1, [\tmp1, \tmp2, lsl #3]
+	set_this_cpu_offset \tmp1
+	.endm
+
 /*
  * The following fragment of code is executed with the MMU enabled.
  *
  *   x0 = __PHYS_OFFSET
  */
 SYM_FUNC_START_LOCAL(__primary_switched)
-	adrp	x4, init_thread_union
-	add	sp, x4, #THREAD_SIZE
-	adr_l	x5, init_task
-	msr	sp_el0, x5			// Save thread_info
+	adr_l	x4, init_task
+	init_cpu_task x4, x5, x6
 
 	adr_l	x8, vectors			// load VBAR_EL1 with virtual
 	msr	vbar_el1, x8			// vector table address
 	isb
 
-	stp	xzr, x30, [sp, #-16]!
+	stp	x29, x30, [sp, #-16]!
 	mov	x29, sp
 
-#ifdef CONFIG_SHADOW_CALL_STACK
-	adr_l	scs_sp, init_shadow_call_stack	// Set shadow call stack
-#endif
-
 	str_l	x21, __fdt_pointer, x5		// Save FDT pointer
 
 	ldr_l	x4, kimage_vaddr		// Save the offset between
@@ -446,10 +463,9 @@ SYM_FUNC_START_LOCAL(__primary_switched)
 0:
 #endif
 	bl	switch_to_vhe			// Prefer VHE if possible
-	add	sp, sp, #16
-	mov	x29, #0
-	mov	x30, #0
-	b	start_kernel
+	ldp	x29, x30, [sp], #16
+	bl	start_kernel
+	ASM_BUG()
 SYM_FUNC_END(__primary_switched)
 
 	.pushsection ".rodata", "a"
@@ -551,7 +567,7 @@ SYM_FUNC_START_LOCAL(set_cpu_boot_mode_flag)
 	cmp	w0, #BOOT_CPU_MODE_EL2
 	b.ne	1f
 	add	x1, x1, #4
-1:	str	w0, [x1]			// This CPU has booted in EL1
+1:	str	w0, [x1]			// Save CPU boot mode
 	dmb	sy
 	dc	ivac, x1			// Invalidate potentially stale cache line
 	ret
@@ -632,21 +648,17 @@ SYM_FUNC_START_LOCAL(__secondary_switched)
 	isb
 
 	adr_l	x0, secondary_data
-	ldr	x1, [x0, #CPU_BOOT_STACK]	// get secondary_data.stack
-	cbz	x1, __secondary_too_slow
-	mov	sp, x1
 	ldr	x2, [x0, #CPU_BOOT_TASK]
 	cbz	x2, __secondary_too_slow
-	msr	sp_el0, x2
-	scs_load x2, x3
-	mov	x29, #0
-	mov	x30, #0
+
+	init_cpu_task x2, x1, x3
 
 #ifdef CONFIG_ARM64_PTR_AUTH
 	ptrauth_keys_init_cpu x2, x3, x4, x5
 #endif
 
-	b	secondary_start_kernel
+	bl	secondary_start_kernel
+	ASM_BUG()
 SYM_FUNC_END(__secondary_switched)
 
 SYM_FUNC_START_LOCAL(__secondary_too_slow)
diff --git a/arch/arm64/kernel/hibernate-asm.S b/arch/arm64/kernel/hibernate-asm.S
index 8ccca660034e..81c0186a5e32 100644
--- a/arch/arm64/kernel/hibernate-asm.S
+++ b/arch/arm64/kernel/hibernate-asm.S
@@ -45,7 +45,7 @@
  * Because this code has to be copied to a 'safe' page, it can't call out to
  * other functions by PC-relative address. Also remember that it may be
  * mid-way through over-writing other functions. For this reason it contains
- * code from flush_icache_range() and uses the copy_page() macro.
+ * code from caches_clean_inval_pou() and uses the copy_page() macro.
  *
  * This 'safe' page is mapped via ttbr0, and executed from there. This function
  * switches to a copy of the linear map in ttbr1, performs the restore, then
@@ -87,11 +87,12 @@ SYM_CODE_START(swsusp_arch_suspend_exit)
 	copy_page	x0, x1, x2, x3, x4, x5, x6, x7, x8, x9
 
 	add	x1, x10, #PAGE_SIZE
-	/* Clean the copied page to PoU - based on flush_icache_range() */
+	/* Clean the copied page to PoU - based on caches_clean_inval_pou() */
 	raw_dcache_line_size x2, x3
 	sub	x3, x2, #1
 	bic	x4, x10, x3
-2:	dc	cvau, x4	/* clean D line / unified line */
+2:	/* clean D line / unified line */
+alternative_insn "dc cvau, x4",  "dc civac, x4",  ARM64_WORKAROUND_CLEAN_CACHE
 	add	x4, x4, x2
 	cmp	x4, x1
 	b.lo	2b
diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
index b1cef371df2b..46a0b4d6e251 100644
--- a/arch/arm64/kernel/hibernate.c
+++ b/arch/arm64/kernel/hibernate.c
@@ -210,7 +210,7 @@ static int create_safe_exec_page(void *src_start, size_t length,
 		return -ENOMEM;
 
 	memcpy(page, src_start, length);
-	__flush_icache_range((unsigned long)page, (unsigned long)page + length);
+	caches_clean_inval_pou((unsigned long)page, (unsigned long)page + length);
 	rc = trans_pgd_idmap_page(&trans_info, &trans_ttbr0, &t0sz, page);
 	if (rc)
 		return rc;
@@ -240,8 +240,6 @@ static int create_safe_exec_page(void *src_start, size_t length,
 	return 0;
 }
 
-#define dcache_clean_range(start, end)	__flush_dcache_area(start, (end - start))
-
 #ifdef CONFIG_ARM64_MTE
 
 static DEFINE_XARRAY(mte_pages);
@@ -383,13 +381,18 @@ int swsusp_arch_suspend(void)
 		ret = swsusp_save();
 	} else {
 		/* Clean kernel core startup/idle code to PoC*/
-		dcache_clean_range(__mmuoff_data_start, __mmuoff_data_end);
-		dcache_clean_range(__idmap_text_start, __idmap_text_end);
+		dcache_clean_inval_poc((unsigned long)__mmuoff_data_start,
+				    (unsigned long)__mmuoff_data_end);
+		dcache_clean_inval_poc((unsigned long)__idmap_text_start,
+				    (unsigned long)__idmap_text_end);
 
 		/* Clean kvm setup code to PoC? */
 		if (el2_reset_needed()) {
-			dcache_clean_range(__hyp_idmap_text_start, __hyp_idmap_text_end);
-			dcache_clean_range(__hyp_text_start, __hyp_text_end);
+			dcache_clean_inval_poc(
+				(unsigned long)__hyp_idmap_text_start,
+				(unsigned long)__hyp_idmap_text_end);
+			dcache_clean_inval_poc((unsigned long)__hyp_text_start,
+					    (unsigned long)__hyp_text_end);
 		}
 
 		swsusp_mte_restore_tags();
@@ -474,7 +477,8 @@ int swsusp_arch_resume(void)
 	 * The hibernate exit text contains a set of el2 vectors, that will
 	 * be executed at el2 with the mmu off in order to reload hyp-stub.
 	 */
-	__flush_dcache_area(hibernate_exit, exit_size);
+	dcache_clean_inval_poc((unsigned long)hibernate_exit,
+			    (unsigned long)hibernate_exit + exit_size);
 
 	/*
 	 * KASLR will cause the el2 vectors to be in a different location in
diff --git a/arch/arm64/kernel/idle.c b/arch/arm64/kernel/idle.c
new file mode 100644
index 000000000000..a2cfbacec2bb
--- /dev/null
+++ b/arch/arm64/kernel/idle.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Low-level idle sequences
+ */
+
+#include <linux/cpu.h>
+#include <linux/irqflags.h>
+
+#include <asm/barrier.h>
+#include <asm/cpuidle.h>
+#include <asm/cpufeature.h>
+#include <asm/sysreg.h>
+
+/*
+ *	cpu_do_idle()
+ *
+ *	Idle the processor (wait for interrupt).
+ *
+ *	If the CPU supports priority masking we must do additional work to
+ *	ensure that interrupts are not masked at the PMR (because the core will
+ *	not wake up if we block the wake up signal in the interrupt controller).
+ */
+void noinstr cpu_do_idle(void)
+{
+	struct arm_cpuidle_irq_context context;
+
+	arm_cpuidle_save_irq_context(&context);
+
+	dsb(sy);
+	wfi();
+
+	arm_cpuidle_restore_irq_context(&context);
+}
+
+/*
+ * This is our default idle handler.
+ */
+void noinstr arch_cpu_idle(void)
+{
+	/*
+	 * This should do all the clock switching and wait for interrupt
+	 * tricks
+	 */
+	cpu_do_idle();
+	raw_local_irq_enable();
+}
diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c
index e628c8ce1ffe..53a381a7f65d 100644
--- a/arch/arm64/kernel/idreg-override.c
+++ b/arch/arm64/kernel/idreg-override.c
@@ -237,7 +237,8 @@ asmlinkage void __init init_feature_override(void)
 
 	for (i = 0; i < ARRAY_SIZE(regs); i++) {
 		if (regs[i]->override)
-			__flush_dcache_area(regs[i]->override,
+			dcache_clean_inval_poc((unsigned long)regs[i]->override,
+					    (unsigned long)regs[i]->override +
 					    sizeof(*regs[i]->override));
 	}
 }
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index bcf3c2755370..c96a9a0043bf 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -35,7 +35,7 @@ __efistub_strnlen		= __pi_strnlen;
 __efistub_strcmp		= __pi_strcmp;
 __efistub_strncmp		= __pi_strncmp;
 __efistub_strrchr		= __pi_strrchr;
-__efistub___clean_dcache_area_poc = __pi___clean_dcache_area_poc;
+__efistub_dcache_clean_poc = __pi_dcache_clean_poc;
 
 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
 __efistub___memcpy		= __pi_memcpy;
diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c
index 9a8a0ae1e75f..fc98037e1220 100644
--- a/arch/arm64/kernel/jump_label.c
+++ b/arch/arm64/kernel/jump_label.c
@@ -8,6 +8,7 @@
 #include <linux/kernel.h>
 #include <linux/jump_label.h>
 #include <asm/insn.h>
+#include <asm/patching.h>
 
 void arch_jump_label_transform(struct jump_entry *entry,
 			       enum jump_label_type type)
diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c
index 341342b207f6..cfa2cfde3019 100644
--- a/arch/arm64/kernel/kaslr.c
+++ b/arch/arm64/kernel/kaslr.c
@@ -72,7 +72,9 @@ u64 __init kaslr_early_init(void)
 	 * we end up running with module randomization disabled.
 	 */
 	module_alloc_base = (u64)_etext - MODULES_VSIZE;
-	__flush_dcache_area(&module_alloc_base, sizeof(module_alloc_base));
+	dcache_clean_inval_poc((unsigned long)&module_alloc_base,
+			    (unsigned long)&module_alloc_base +
+				    sizeof(module_alloc_base));
 
 	/*
 	 * Try to map the FDT early. If this fails, we simply bail,
@@ -170,8 +172,12 @@ u64 __init kaslr_early_init(void)
 	module_alloc_base += (module_range * (seed & ((1 << 21) - 1))) >> 21;
 	module_alloc_base &= PAGE_MASK;
 
-	__flush_dcache_area(&module_alloc_base, sizeof(module_alloc_base));
-	__flush_dcache_area(&memstart_offset_seed, sizeof(memstart_offset_seed));
+	dcache_clean_inval_poc((unsigned long)&module_alloc_base,
+			    (unsigned long)&module_alloc_base +
+				    sizeof(module_alloc_base));
+	dcache_clean_inval_poc((unsigned long)&memstart_offset_seed,
+			    (unsigned long)&memstart_offset_seed +
+				    sizeof(memstart_offset_seed));
 
 	return offset;
 }
diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c
index 1a157ca33262..2aede780fb80 100644
--- a/arch/arm64/kernel/kgdb.c
+++ b/arch/arm64/kernel/kgdb.c
@@ -17,6 +17,7 @@
 
 #include <asm/debug-monitors.h>
 #include <asm/insn.h>
+#include <asm/patching.h>
 #include <asm/traps.h>
 
 struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index 90a335c74442..03ceabe4d912 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -68,10 +68,16 @@ int machine_kexec_post_load(struct kimage *kimage)
 	kimage->arch.kern_reloc = __pa(reloc_code);
 	kexec_image_info(kimage);
 
-	/* Flush the reloc_code in preparation for its execution. */
-	__flush_dcache_area(reloc_code, arm64_relocate_new_kernel_size);
-	flush_icache_range((uintptr_t)reloc_code, (uintptr_t)reloc_code +
-			   arm64_relocate_new_kernel_size);
+	/*
+	 * For execution with the MMU off, reloc_code needs to be cleaned to the
+	 * PoC and invalidated from the I-cache.
+	 */
+	dcache_clean_inval_poc((unsigned long)reloc_code,
+			    (unsigned long)reloc_code +
+				    arm64_relocate_new_kernel_size);
+	icache_inval_pou((uintptr_t)reloc_code,
+				(uintptr_t)reloc_code +
+					arm64_relocate_new_kernel_size);
 
 	return 0;
 }
@@ -102,16 +108,18 @@ static void kexec_list_flush(struct kimage *kimage)
 
 	for (entry = &kimage->head; ; entry++) {
 		unsigned int flag;
-		void *addr;
+		unsigned long addr;
 
 		/* flush the list entries. */
-		__flush_dcache_area(entry, sizeof(kimage_entry_t));
+		dcache_clean_inval_poc((unsigned long)entry,
+				    (unsigned long)entry +
+					    sizeof(kimage_entry_t));
 
 		flag = *entry & IND_FLAGS;
 		if (flag == IND_DONE)
 			break;
 
-		addr = phys_to_virt(*entry & PAGE_MASK);
+		addr = (unsigned long)phys_to_virt(*entry & PAGE_MASK);
 
 		switch (flag) {
 		case IND_INDIRECTION:
@@ -120,7 +128,7 @@ static void kexec_list_flush(struct kimage *kimage)
 			break;
 		case IND_SOURCE:
 			/* flush the source pages. */
-			__flush_dcache_area(addr, PAGE_SIZE);
+			dcache_clean_inval_poc(addr, addr + PAGE_SIZE);
 			break;
 		case IND_DESTINATION:
 			break;
@@ -147,8 +155,10 @@ static void kexec_segment_flush(const struct kimage *kimage)
 			kimage->segment[i].memsz,
 			kimage->segment[i].memsz /  PAGE_SIZE);
 
-		__flush_dcache_area(phys_to_virt(kimage->segment[i].mem),
-			kimage->segment[i].memsz);
+		dcache_clean_inval_poc(
+			(unsigned long)phys_to_virt(kimage->segment[i].mem),
+			(unsigned long)phys_to_virt(kimage->segment[i].mem) +
+				kimage->segment[i].memsz);
 	}
 }
 
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index 125a10e413e9..69b3fde8759e 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -32,10 +32,9 @@ DEFINE_STATIC_KEY_FALSE(mte_async_mode);
 EXPORT_SYMBOL_GPL(mte_async_mode);
 #endif
 
-static void mte_sync_page_tags(struct page *page, pte_t *ptep, bool check_swap)
+static void mte_sync_page_tags(struct page *page, pte_t old_pte,
+			       bool check_swap, bool pte_is_tagged)
 {
-	pte_t old_pte = READ_ONCE(*ptep);
-
 	if (check_swap && is_swap_pte(old_pte)) {
 		swp_entry_t entry = pte_to_swp_entry(old_pte);
 
@@ -43,6 +42,9 @@ static void mte_sync_page_tags(struct page *page, pte_t *ptep, bool check_swap)
 			return;
 	}
 
+	if (!pte_is_tagged)
+		return;
+
 	page_kasan_tag_reset(page);
 	/*
 	 * We need smp_wmb() in between setting the flags and clearing the
@@ -55,16 +57,22 @@ static void mte_sync_page_tags(struct page *page, pte_t *ptep, bool check_swap)
 	mte_clear_page_tags(page_address(page));
 }
 
-void mte_sync_tags(pte_t *ptep, pte_t pte)
+void mte_sync_tags(pte_t old_pte, pte_t pte)
 {
 	struct page *page = pte_page(pte);
 	long i, nr_pages = compound_nr(page);
 	bool check_swap = nr_pages == 1;
+	bool pte_is_tagged = pte_tagged(pte);
+
+	/* Early out if there's nothing to do */
+	if (!check_swap && !pte_is_tagged)
+		return;
 
 	/* if PG_mte_tagged is set, tags have already been initialised */
 	for (i = 0; i < nr_pages; i++, page++) {
 		if (!test_and_set_bit(PG_mte_tagged, &page->flags))
-			mte_sync_page_tags(page, ptep, check_swap);
+			mte_sync_page_tags(page, old_pte, check_swap,
+					   pte_is_tagged);
 	}
 }
 
diff --git a/arch/arm64/kernel/patching.c b/arch/arm64/kernel/patching.c
new file mode 100644
index 000000000000..771f543464e0
--- /dev/null
+++ b/arch/arm64/kernel/patching.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/stop_machine.h>
+#include <linux/uaccess.h>
+
+#include <asm/cacheflush.h>
+#include <asm/fixmap.h>
+#include <asm/insn.h>
+#include <asm/kprobes.h>
+#include <asm/patching.h>
+#include <asm/sections.h>
+
+static DEFINE_RAW_SPINLOCK(patch_lock);
+
+static bool is_exit_text(unsigned long addr)
+{
+	/* discarded with init text/data */
+	return system_state < SYSTEM_RUNNING &&
+		addr >= (unsigned long)__exittext_begin &&
+		addr < (unsigned long)__exittext_end;
+}
+
+static bool is_image_text(unsigned long addr)
+{
+	return core_kernel_text(addr) || is_exit_text(addr);
+}
+
+static void __kprobes *patch_map(void *addr, int fixmap)
+{
+	unsigned long uintaddr = (uintptr_t) addr;
+	bool image = is_image_text(uintaddr);
+	struct page *page;
+
+	if (image)
+		page = phys_to_page(__pa_symbol(addr));
+	else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+		page = vmalloc_to_page(addr);
+	else
+		return addr;
+
+	BUG_ON(!page);
+	return (void *)set_fixmap_offset(fixmap, page_to_phys(page) +
+			(uintaddr & ~PAGE_MASK));
+}
+
+static void __kprobes patch_unmap(int fixmap)
+{
+	clear_fixmap(fixmap);
+}
+/*
+ * In ARMv8-A, A64 instructions have a fixed length of 32 bits and are always
+ * little-endian.
+ */
+int __kprobes aarch64_insn_read(void *addr, u32 *insnp)
+{
+	int ret;
+	__le32 val;
+
+	ret = copy_from_kernel_nofault(&val, addr, AARCH64_INSN_SIZE);
+	if (!ret)
+		*insnp = le32_to_cpu(val);
+
+	return ret;
+}
+
+static int __kprobes __aarch64_insn_write(void *addr, __le32 insn)
+{
+	void *waddr = addr;
+	unsigned long flags = 0;
+	int ret;
+
+	raw_spin_lock_irqsave(&patch_lock, flags);
+	waddr = patch_map(addr, FIX_TEXT_POKE0);
+
+	ret = copy_to_kernel_nofault(waddr, &insn, AARCH64_INSN_SIZE);
+
+	patch_unmap(FIX_TEXT_POKE0);
+	raw_spin_unlock_irqrestore(&patch_lock, flags);
+
+	return ret;
+}
+
+int __kprobes aarch64_insn_write(void *addr, u32 insn)
+{
+	return __aarch64_insn_write(addr, cpu_to_le32(insn));
+}
+
+int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn)
+{
+	u32 *tp = addr;
+	int ret;
+
+	/* A64 instructions must be word aligned */
+	if ((uintptr_t)tp & 0x3)
+		return -EINVAL;
+
+	ret = aarch64_insn_write(tp, insn);
+	if (ret == 0)
+		caches_clean_inval_pou((uintptr_t)tp,
+				     (uintptr_t)tp + AARCH64_INSN_SIZE);
+
+	return ret;
+}
+
+struct aarch64_insn_patch {
+	void		**text_addrs;
+	u32		*new_insns;
+	int		insn_cnt;
+	atomic_t	cpu_count;
+};
+
+static int __kprobes aarch64_insn_patch_text_cb(void *arg)
+{
+	int i, ret = 0;
+	struct aarch64_insn_patch *pp = arg;
+
+	/* The first CPU becomes master */
+	if (atomic_inc_return(&pp->cpu_count) == 1) {
+		for (i = 0; ret == 0 && i < pp->insn_cnt; i++)
+			ret = aarch64_insn_patch_text_nosync(pp->text_addrs[i],
+							     pp->new_insns[i]);
+		/* Notify other processors with an additional increment. */
+		atomic_inc(&pp->cpu_count);
+	} else {
+		while (atomic_read(&pp->cpu_count) <= num_online_cpus())
+			cpu_relax();
+		isb();
+	}
+
+	return ret;
+}
+
+int __kprobes aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt)
+{
+	struct aarch64_insn_patch patch = {
+		.text_addrs = addrs,
+		.new_insns = insns,
+		.insn_cnt = cnt,
+		.cpu_count = ATOMIC_INIT(0),
+	};
+
+	if (cnt <= 0)
+		return -EINVAL;
+
+	return stop_machine_cpuslocked(aarch64_insn_patch_text_cb, &patch,
+				       cpu_online_mask);
+}
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index 88ff471b0bce..4a72c2727309 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -116,7 +116,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 		tail = (struct frame_tail __user *)regs->regs[29];
 
 		while (entry->nr < entry->max_stack &&
-		       tail && !((unsigned long)tail & 0xf))
+		       tail && !((unsigned long)tail & 0x7))
 			tail = user_backtrace(tail, entry);
 	} else {
 #ifdef CONFIG_COMPAT
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index f594957e29bd..d07788dad388 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -165,10 +165,7 @@ armv8pmu_events_sysfs_show(struct device *dev,
 }
 
 #define ARMV8_EVENT_ATTR(name, config)						\
-	(&((struct perf_pmu_events_attr) {					\
-		.attr = __ATTR(name, 0444, armv8pmu_events_sysfs_show, NULL),	\
-		.id = config,							\
-	}).attr.attr)
+	PMU_EVENT_ATTR_ID(name, armv8pmu_events_sysfs_show, config)
 
 static struct attribute *armv8_pmuv3_event_attrs[] = {
 	ARMV8_EVENT_ATTR(sw_incr, ARMV8_PMUV3_PERFCTR_SW_INCR),
@@ -312,13 +309,46 @@ static ssize_t slots_show(struct device *dev, struct device_attribute *attr,
 	struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu);
 	u32 slots = cpu_pmu->reg_pmmir & ARMV8_PMU_SLOTS_MASK;
 
-	return snprintf(page, PAGE_SIZE, "0x%08x\n", slots);
+	return sysfs_emit(page, "0x%08x\n", slots);
 }
 
 static DEVICE_ATTR_RO(slots);
 
+static ssize_t bus_slots_show(struct device *dev, struct device_attribute *attr,
+			      char *page)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+	struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu);
+	u32 bus_slots = (cpu_pmu->reg_pmmir >> ARMV8_PMU_BUS_SLOTS_SHIFT)
+			& ARMV8_PMU_BUS_SLOTS_MASK;
+
+	return sysfs_emit(page, "0x%08x\n", bus_slots);
+}
+
+static DEVICE_ATTR_RO(bus_slots);
+
+static ssize_t bus_width_show(struct device *dev, struct device_attribute *attr,
+			      char *page)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+	struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu);
+	u32 bus_width = (cpu_pmu->reg_pmmir >> ARMV8_PMU_BUS_WIDTH_SHIFT)
+			& ARMV8_PMU_BUS_WIDTH_MASK;
+	u32 val = 0;
+
+	/* Encoded as Log2(number of bytes), plus one */
+	if (bus_width > 2 && bus_width < 13)
+		val = 1 << (bus_width - 1);
+
+	return sysfs_emit(page, "0x%08x\n", val);
+}
+
+static DEVICE_ATTR_RO(bus_width);
+
 static struct attribute *armv8_pmuv3_caps_attrs[] = {
 	&dev_attr_slots.attr,
+	&dev_attr_bus_slots.attr,
+	&dev_attr_bus_width.attr,
 	NULL,
 };
 
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
index d607c9912025..6dbcc89f6662 100644
--- a/arch/arm64/kernel/probes/kprobes.c
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -7,26 +7,28 @@
  * Copyright (C) 2013 Linaro Limited.
  * Author: Sandeepa Prabhu <sandeepa.prabhu@linaro.org>
  */
+#include <linux/extable.h>
 #include <linux/kasan.h>
 #include <linux/kernel.h>
 #include <linux/kprobes.h>
-#include <linux/extable.h>
-#include <linux/slab.h>
-#include <linux/stop_machine.h>
 #include <linux/sched/debug.h>
 #include <linux/set_memory.h>
+#include <linux/slab.h>
+#include <linux/stop_machine.h>
 #include <linux/stringify.h>
+#include <linux/uaccess.h>
 #include <linux/vmalloc.h>
-#include <asm/traps.h>
-#include <asm/ptrace.h>
+
 #include <asm/cacheflush.h>
-#include <asm/debug-monitors.h>
 #include <asm/daifflags.h>
-#include <asm/system_misc.h>
+#include <asm/debug-monitors.h>
 #include <asm/insn.h>
-#include <linux/uaccess.h>
 #include <asm/irq.h>
+#include <asm/patching.h>
+#include <asm/ptrace.h>
 #include <asm/sections.h>
+#include <asm/system_misc.h>
+#include <asm/traps.h>
 
 #include "decode-insn.h"
 
@@ -277,23 +279,6 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr)
 	case KPROBE_HIT_ACTIVE:
 	case KPROBE_HIT_SSDONE:
 		/*
-		 * We increment the nmissed count for accounting,
-		 * we can also use npre/npostfault count for accounting
-		 * these specific fault cases.
-		 */
-		kprobes_inc_nmissed_count(cur);
-
-		/*
-		 * We come here because instructions in the pre/post
-		 * handler caused the page_fault, this could happen
-		 * if handler tries to access user space by
-		 * copy_from_user(), get_user() etc. Let the
-		 * user-specified handler try to fix it first.
-		 */
-		if (cur->fault_handler && cur->fault_handler(cur, regs, fsr))
-			return 1;
-
-		/*
 		 * In case the user-specified fault handler returned
 		 * zero, try to fix up.
 		 */
diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c
index 25f67ec59635..22d0b3252476 100644
--- a/arch/arm64/kernel/probes/simulate-insn.c
+++ b/arch/arm64/kernel/probes/simulate-insn.c
@@ -10,6 +10,7 @@
 #include <linux/kprobes.h>
 
 #include <asm/ptrace.h>
+#include <asm/traps.h>
 
 #include "simulate-insn.h"
 
diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c
index 2c247634552b..9be668f3f034 100644
--- a/arch/arm64/kernel/probes/uprobes.c
+++ b/arch/arm64/kernel/probes/uprobes.c
@@ -21,7 +21,7 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
 	memcpy(dst, src, len);
 
 	/* flush caches (dcache/icache) */
-	sync_icache_aliases(dst, len);
+	sync_icache_aliases((unsigned long)dst, (unsigned long)dst + len);
 
 	kunmap_atomic(xol_page_kaddr);
 }
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index b4bb67f17a2c..5ba0ed036dee 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -18,7 +18,6 @@
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
-#include <linux/lockdep.h>
 #include <linux/mman.h>
 #include <linux/mm.h>
 #include <linux/nospec.h>
@@ -46,7 +45,6 @@
 #include <linux/prctl.h>
 
 #include <asm/alternative.h>
-#include <asm/arch_gicv3.h>
 #include <asm/compat.h>
 #include <asm/cpufeature.h>
 #include <asm/cacheflush.h>
@@ -74,63 +72,6 @@ EXPORT_SYMBOL_GPL(pm_power_off);
 
 void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd);
 
-static void noinstr __cpu_do_idle(void)
-{
-	dsb(sy);
-	wfi();
-}
-
-static void noinstr __cpu_do_idle_irqprio(void)
-{
-	unsigned long pmr;
-	unsigned long daif_bits;
-
-	daif_bits = read_sysreg(daif);
-	write_sysreg(daif_bits | PSR_I_BIT | PSR_F_BIT, daif);
-
-	/*
-	 * Unmask PMR before going idle to make sure interrupts can
-	 * be raised.
-	 */
-	pmr = gic_read_pmr();
-	gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET);
-
-	__cpu_do_idle();
-
-	gic_write_pmr(pmr);
-	write_sysreg(daif_bits, daif);
-}
-
-/*
- *	cpu_do_idle()
- *
- *	Idle the processor (wait for interrupt).
- *
- *	If the CPU supports priority masking we must do additional work to
- *	ensure that interrupts are not masked at the PMR (because the core will
- *	not wake up if we block the wake up signal in the interrupt controller).
- */
-void noinstr cpu_do_idle(void)
-{
-	if (system_uses_irq_prio_masking())
-		__cpu_do_idle_irqprio();
-	else
-		__cpu_do_idle();
-}
-
-/*
- * This is our default idle handler.
- */
-void noinstr arch_cpu_idle(void)
-{
-	/*
-	 * This should do all the clock switching and wait for interrupt
-	 * tricks
-	 */
-	cpu_do_idle();
-	raw_local_irq_enable();
-}
-
 #ifdef CONFIG_HOTPLUG_CPU
 void arch_cpu_idle_dead(void)
 {
@@ -435,6 +376,11 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
 	}
 	p->thread.cpu_context.pc = (unsigned long)ret_from_fork;
 	p->thread.cpu_context.sp = (unsigned long)childregs;
+	/*
+	 * For the benefit of the unwinder, set up childregs->stackframe
+	 * as the final frame for the new task.
+	 */
+	p->thread.cpu_context.fp = (unsigned long)childregs->stackframe;
 
 	ptrace_hw_copy_thread(p);
 
@@ -527,6 +473,15 @@ static void erratum_1418040_thread_switch(struct task_struct *prev,
 	write_sysreg(val, cntkctl_el1);
 }
 
+static void compat_thread_switch(struct task_struct *next)
+{
+	if (!is_compat_thread(task_thread_info(next)))
+		return;
+
+	if (static_branch_unlikely(&arm64_mismatched_32bit_el0))
+		set_tsk_thread_flag(next, TIF_NOTIFY_RESUME);
+}
+
 static void update_sctlr_el1(u64 sctlr)
 {
 	/*
@@ -568,6 +523,7 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev,
 	ssbs_thread_switch(next);
 	erratum_1418040_thread_switch(prev, next);
 	ptrauth_thread_switch_user(next);
+	compat_thread_switch(next);
 
 	/*
 	 * Complete any pending TLB or cache maintenance on this CPU in case
@@ -598,7 +554,7 @@ unsigned long get_wchan(struct task_struct *p)
 	struct stackframe frame;
 	unsigned long stack_page, ret = 0;
 	int count = 0;
-	if (!p || p == current || p->state == TASK_RUNNING)
+	if (!p || p == current || task_is_running(p))
 		return 0;
 
 	stack_page = (unsigned long)try_get_task_stack(p);
@@ -633,8 +589,15 @@ unsigned long arch_align_stack(unsigned long sp)
  */
 void arch_setup_new_exec(void)
 {
-	current->mm->context.flags = is_compat_task() ? MMCF_AARCH32 : 0;
+	unsigned long mmflags = 0;
+
+	if (is_compat_task()) {
+		mmflags = MMCF_AARCH32;
+		if (static_branch_unlikely(&arm64_mismatched_32bit_el0))
+			set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+	}
 
+	current->mm->context.flags = mmflags;
 	ptrauth_thread_init_user();
 	mte_thread_init_user();
 
@@ -724,22 +687,6 @@ static int __init tagged_addr_init(void)
 core_initcall(tagged_addr_init);
 #endif	/* CONFIG_ARM64_TAGGED_ADDR_ABI */
 
-asmlinkage void __sched arm64_preempt_schedule_irq(void)
-{
-	lockdep_assert_irqs_disabled();
-
-	/*
-	 * Preempting a task from an IRQ means we leave copies of PSTATE
-	 * on the stack. cpufeature's enable calls may modify PSTATE, but
-	 * resuming one of these preempted tasks would undo those changes.
-	 *
-	 * Only allow a task to be preempted once cpufeatures have been
-	 * enabled.
-	 */
-	if (system_capabilities_finalized())
-		preempt_schedule_irq();
-}
-
 #ifdef CONFIG_BINFMT_ELF
 int arch_elf_adjust_prot(int prot, const struct arch_elf_state *state,
 			 bool has_interp, bool is_interp)
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index eb2f73939b7b..499b6b2f9757 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -122,7 +122,7 @@ static bool regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr)
 {
 	return ((addr & ~(THREAD_SIZE - 1))  ==
 		(kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1))) ||
-		on_irq_stack(addr, NULL);
+		on_irq_stack(addr, sizeof(unsigned long), NULL);
 }
 
 /**
diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c
index 2c7ca449dd51..47f77d1234cb 100644
--- a/arch/arm64/kernel/sdei.c
+++ b/arch/arm64/kernel/sdei.c
@@ -162,31 +162,33 @@ static int init_sdei_scs(void)
 	return err;
 }
 
-static bool on_sdei_normal_stack(unsigned long sp, struct stack_info *info)
+static bool on_sdei_normal_stack(unsigned long sp, unsigned long size,
+				 struct stack_info *info)
 {
 	unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_normal_ptr);
 	unsigned long high = low + SDEI_STACK_SIZE;
 
-	return on_stack(sp, low, high, STACK_TYPE_SDEI_NORMAL, info);
+	return on_stack(sp, size, low, high, STACK_TYPE_SDEI_NORMAL, info);
 }
 
-static bool on_sdei_critical_stack(unsigned long sp, struct stack_info *info)
+static bool on_sdei_critical_stack(unsigned long sp, unsigned long size,
+				   struct stack_info *info)
 {
 	unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_critical_ptr);
 	unsigned long high = low + SDEI_STACK_SIZE;
 
-	return on_stack(sp, low, high, STACK_TYPE_SDEI_CRITICAL, info);
+	return on_stack(sp, size, low, high, STACK_TYPE_SDEI_CRITICAL, info);
 }
 
-bool _on_sdei_stack(unsigned long sp, struct stack_info *info)
+bool _on_sdei_stack(unsigned long sp, unsigned long size, struct stack_info *info)
 {
 	if (!IS_ENABLED(CONFIG_VMAP_STACK))
 		return false;
 
-	if (on_sdei_critical_stack(sp, info))
+	if (on_sdei_critical_stack(sp, size, info))
 		return true;
 
-	if (on_sdei_normal_stack(sp, info))
+	if (on_sdei_normal_stack(sp, size, info))
 		return true;
 
 	return false;
@@ -231,13 +233,13 @@ out_err:
 }
 
 /*
- * __sdei_handler() returns one of:
+ * do_sdei_event() returns one of:
  *  SDEI_EV_HANDLED -  success, return to the interrupted context.
  *  SDEI_EV_FAILED  -  failure, return this error code to firmare.
  *  virtual-address -  success, return to this address.
  */
-static __kprobes unsigned long _sdei_handler(struct pt_regs *regs,
-					     struct sdei_registered_event *arg)
+unsigned long __kprobes do_sdei_event(struct pt_regs *regs,
+				      struct sdei_registered_event *arg)
 {
 	u32 mode;
 	int i, err = 0;
@@ -292,45 +294,3 @@ static __kprobes unsigned long _sdei_handler(struct pt_regs *regs,
 
 	return vbar + 0x480;
 }
-
-static void __kprobes notrace __sdei_pstate_entry(void)
-{
-	/*
-	 * The original SDEI spec (ARM DEN 0054A) can be read ambiguously as to
-	 * whether PSTATE bits are inherited unchanged or generated from
-	 * scratch, and the TF-A implementation always clears PAN and always
-	 * clears UAO. There are no other known implementations.
-	 *
-	 * Subsequent revisions (ARM DEN 0054B) follow the usual rules for how
-	 * PSTATE is modified upon architectural exceptions, and so PAN is
-	 * either inherited or set per SCTLR_ELx.SPAN, and UAO is always
-	 * cleared.
-	 *
-	 * We must explicitly reset PAN to the expected state, including
-	 * clearing it when the host isn't using it, in case a VM had it set.
-	 */
-	if (system_uses_hw_pan())
-		set_pstate_pan(1);
-	else if (cpu_has_pan())
-		set_pstate_pan(0);
-}
-
-asmlinkage noinstr unsigned long
-__sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg)
-{
-	unsigned long ret;
-
-	/*
-	 * We didn't take an exception to get here, so the HW hasn't
-	 * set/cleared bits in PSTATE that we may rely on. Initialize PAN.
-	 */
-	__sdei_pstate_entry();
-
-	arm64_enter_nmi(regs);
-
-	ret = _sdei_handler(regs, arg);
-
-	arm64_exit_nmi(regs);
-
-	return ret;
-}
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 787bc0f601b3..880f40bae60e 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -88,12 +88,6 @@ void __init smp_setup_processor_id(void)
 	u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK;
 	set_cpu_logical_map(0, mpidr);
 
-	/*
-	 * clear __my_cpu_offset on boot CPU to avoid hang caused by
-	 * using percpu variable early, for example, lockdep will
-	 * access percpu variable inside lock_release
-	 */
-	set_my_cpu_offset(0);
 	pr_info("Booting Linux on physical CPU 0x%010lx [0x%08x]\n",
 		(unsigned long)mpidr, read_cpuid_id());
 }
@@ -382,7 +376,7 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p)
 	 * faults in case uaccess_enable() is inadvertently called by the init
 	 * thread.
 	 */
-	init_task.thread_info.ttbr0 = __pa_symbol(reserved_pg_dir);
+	init_task.thread_info.ttbr0 = phys_to_ttbr(__pa_symbol(reserved_pg_dir));
 #endif
 
 	if (boot_args[1] || boot_args[2] || boot_args[3]) {
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 6237486ff6bb..f8192f4ae0b8 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -911,6 +911,19 @@ static void do_signal(struct pt_regs *regs)
 	restore_saved_sigmask();
 }
 
+static bool cpu_affinity_invalid(struct pt_regs *regs)
+{
+	if (!compat_user_mode(regs))
+		return false;
+
+	/*
+	 * We're preemptible, but a reschedule will cause us to check the
+	 * affinity again.
+	 */
+	return !cpumask_test_cpu(raw_smp_processor_id(),
+				 system_32bit_el0_cpumask());
+}
+
 asmlinkage void do_notify_resume(struct pt_regs *regs,
 				 unsigned long thread_flags)
 {
@@ -938,6 +951,19 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
 			if (thread_flags & _TIF_NOTIFY_RESUME) {
 				tracehook_notify_resume(regs);
 				rseq_handle_notify_resume(NULL, regs);
+
+				/*
+				 * If we reschedule after checking the affinity
+				 * then we must ensure that TIF_NOTIFY_RESUME
+				 * is set so that we check the affinity again.
+				 * Since tracehook_notify_resume() clears the
+				 * flag, ensure that the compiler doesn't move
+				 * it after the affinity check.
+				 */
+				barrier();
+
+				if (cpu_affinity_invalid(regs))
+					force_sig(SIGKILL);
 			}
 
 			if (thread_flags & _TIF_FOREIGN_FPSTATE)
diff --git a/arch/arm64/kernel/smccc-call.S b/arch/arm64/kernel/smccc-call.S
index d62447964ed9..d3d37f932b97 100644
--- a/arch/arm64/kernel/smccc-call.S
+++ b/arch/arm64/kernel/smccc-call.S
@@ -7,8 +7,34 @@
 
 #include <asm/asm-offsets.h>
 #include <asm/assembler.h>
+#include <asm/thread_info.h>
+
+/*
+ * If we have SMCCC v1.3 and (as is likely) no SVE state in
+ * the registers then set the SMCCC hint bit to say there's no
+ * need to preserve it.  Do this by directly adjusting the SMCCC
+ * function value which is already stored in x0 ready to be called.
+ */
+SYM_FUNC_START(__arm_smccc_sve_check)
+
+	ldr_l	x16, smccc_has_sve_hint
+	cbz	x16, 2f
+
+	get_current_task x16
+	ldr	x16, [x16, #TSK_TI_FLAGS]
+	tbnz	x16, #TIF_FOREIGN_FPSTATE, 1f	// Any live FP state?
+	tbnz	x16, #TIF_SVE, 2f		// Does that state include SVE?
+
+1:	orr	x0, x0, ARM_SMCCC_1_3_SVE_HINT
+
+2:	ret
+SYM_FUNC_END(__arm_smccc_sve_check)
+EXPORT_SYMBOL(__arm_smccc_sve_check)
 
 	.macro SMCCC instr
+alternative_if ARM64_SVE
+	bl	__arm_smccc_sve_check
+alternative_else_nop_endif
 	\instr	#0
 	ldr	x4, [sp]
 	stp	x0, x1, [x4, #ARM_SMCCC_RES_X0_OFFS]
@@ -43,3 +69,60 @@ SYM_FUNC_START(__arm_smccc_hvc)
 	SMCCC	hvc
 SYM_FUNC_END(__arm_smccc_hvc)
 EXPORT_SYMBOL(__arm_smccc_hvc)
+
+	.macro SMCCC_1_2 instr
+	/* Save `res` and free a GPR that won't be clobbered */
+	stp     x1, x19, [sp, #-16]!
+
+	/* Ensure `args` won't be clobbered while loading regs in next step */
+	mov	x19, x0
+
+	/* Load the registers x0 - x17 from the struct arm_smccc_1_2_regs */
+	ldp	x0, x1, [x19, #ARM_SMCCC_1_2_REGS_X0_OFFS]
+	ldp	x2, x3, [x19, #ARM_SMCCC_1_2_REGS_X2_OFFS]
+	ldp	x4, x5, [x19, #ARM_SMCCC_1_2_REGS_X4_OFFS]
+	ldp	x6, x7, [x19, #ARM_SMCCC_1_2_REGS_X6_OFFS]
+	ldp	x8, x9, [x19, #ARM_SMCCC_1_2_REGS_X8_OFFS]
+	ldp	x10, x11, [x19, #ARM_SMCCC_1_2_REGS_X10_OFFS]
+	ldp	x12, x13, [x19, #ARM_SMCCC_1_2_REGS_X12_OFFS]
+	ldp	x14, x15, [x19, #ARM_SMCCC_1_2_REGS_X14_OFFS]
+	ldp	x16, x17, [x19, #ARM_SMCCC_1_2_REGS_X16_OFFS]
+
+	\instr #0
+
+	/* Load the `res` from the stack */
+	ldr	x19, [sp]
+
+	/* Store the registers x0 - x17 into the result structure */
+	stp	x0, x1, [x19, #ARM_SMCCC_1_2_REGS_X0_OFFS]
+	stp	x2, x3, [x19, #ARM_SMCCC_1_2_REGS_X2_OFFS]
+	stp	x4, x5, [x19, #ARM_SMCCC_1_2_REGS_X4_OFFS]
+	stp	x6, x7, [x19, #ARM_SMCCC_1_2_REGS_X6_OFFS]
+	stp	x8, x9, [x19, #ARM_SMCCC_1_2_REGS_X8_OFFS]
+	stp	x10, x11, [x19, #ARM_SMCCC_1_2_REGS_X10_OFFS]
+	stp	x12, x13, [x19, #ARM_SMCCC_1_2_REGS_X12_OFFS]
+	stp	x14, x15, [x19, #ARM_SMCCC_1_2_REGS_X14_OFFS]
+	stp	x16, x17, [x19, #ARM_SMCCC_1_2_REGS_X16_OFFS]
+
+	/* Restore original x19 */
+	ldp     xzr, x19, [sp], #16
+	ret
+.endm
+
+/*
+ * void arm_smccc_1_2_hvc(const struct arm_smccc_1_2_regs *args,
+ *			  struct arm_smccc_1_2_regs *res);
+ */
+SYM_FUNC_START(arm_smccc_1_2_hvc)
+	SMCCC_1_2 hvc
+SYM_FUNC_END(arm_smccc_1_2_hvc)
+EXPORT_SYMBOL(arm_smccc_1_2_hvc)
+
+/*
+ * void arm_smccc_1_2_smc(const struct arm_smccc_1_2_regs *args,
+ *			  struct arm_smccc_1_2_regs *res);
+ */
+SYM_FUNC_START(arm_smccc_1_2_smc)
+	SMCCC_1_2 smc
+SYM_FUNC_END(arm_smccc_1_2_smc)
+EXPORT_SYMBOL(arm_smccc_1_2_smc)
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index dcd7041b2b07..6f6ff072acbd 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -120,9 +120,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
 	 * page tables.
 	 */
 	secondary_data.task = idle;
-	secondary_data.stack = task_stack_page(idle) + THREAD_SIZE;
 	update_cpu_boot_status(CPU_MMU_OFF);
-	__flush_dcache_area(&secondary_data, sizeof(secondary_data));
 
 	/* Now bring the CPU into our world */
 	ret = boot_secondary(cpu, idle);
@@ -142,8 +140,6 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
 
 	pr_crit("CPU%u: failed to come online\n", cpu);
 	secondary_data.task = NULL;
-	secondary_data.stack = NULL;
-	__flush_dcache_area(&secondary_data, sizeof(secondary_data));
 	status = READ_ONCE(secondary_data.status);
 	if (status == CPU_MMU_OFF)
 		status = READ_ONCE(__early_cpu_boot_status);
@@ -202,10 +198,7 @@ asmlinkage notrace void secondary_start_kernel(void)
 	u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK;
 	struct mm_struct *mm = &init_mm;
 	const struct cpu_operations *ops;
-	unsigned int cpu;
-
-	cpu = task_cpu(current);
-	set_my_cpu_offset(per_cpu_offset(cpu));
+	unsigned int cpu = smp_processor_id();
 
 	/*
 	 * All kernel threads share the same mm context; grab a
@@ -224,7 +217,6 @@ asmlinkage notrace void secondary_start_kernel(void)
 		init_gic_priority_masking();
 
 	rcu_cpu_starting(cpu);
-	preempt_disable();
 	trace_hardirqs_off();
 
 	/*
@@ -352,7 +344,7 @@ void __cpu_die(unsigned int cpu)
 		pr_crit("CPU%u: cpu didn't die\n", cpu);
 		return;
 	}
-	pr_notice("CPU%u: shutdown\n", cpu);
+	pr_debug("CPU%u: shutdown\n", cpu);
 
 	/*
 	 * Now that the dying CPU is beyond the point of no return w.r.t.
@@ -452,6 +444,11 @@ void __init smp_cpus_done(unsigned int max_cpus)
 
 void __init smp_prepare_boot_cpu(void)
 {
+	/*
+	 * The runtime per-cpu areas have been allocated by
+	 * setup_per_cpu_areas(), and CPU0's boot time per-cpu area will be
+	 * freed shortly, so we must move over to the runtime per-cpu area.
+	 */
 	set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
 	cpuinfo_store_boot_cpu();
 
diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c
index c45a83512805..7e1624ecab3c 100644
--- a/arch/arm64/kernel/smp_spin_table.c
+++ b/arch/arm64/kernel/smp_spin_table.c
@@ -36,7 +36,7 @@ static void write_pen_release(u64 val)
 	unsigned long size = sizeof(secondary_holding_pen_release);
 
 	secondary_holding_pen_release = val;
-	__flush_dcache_area(start, size);
+	dcache_clean_inval_poc((unsigned long)start, (unsigned long)start + size);
 }
 
 
@@ -90,8 +90,9 @@ static int smp_spin_table_cpu_prepare(unsigned int cpu)
 	 * the boot protocol.
 	 */
 	writeq_relaxed(pa_holding_pen, release_addr);
-	__flush_dcache_area((__force void *)release_addr,
-			    sizeof(*release_addr));
+	dcache_clean_inval_poc((__force unsigned long)release_addr,
+			    (__force unsigned long)release_addr +
+				    sizeof(*release_addr));
 
 	/*
 	 * Send an event to wake up the secondary CPU.
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index de07147a7926..b189de5ca6cb 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -68,13 +68,17 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame)
 	unsigned long fp = frame->fp;
 	struct stack_info info;
 
-	if (fp & 0xf)
-		return -EINVAL;
-
 	if (!tsk)
 		tsk = current;
 
-	if (!on_accessible_stack(tsk, fp, &info))
+	/* Final frame; nothing to unwind */
+	if (fp == (unsigned long)task_pt_regs(tsk)->stackframe)
+		return -ENOENT;
+
+	if (fp & 0x7)
+		return -EINVAL;
+
+	if (!on_accessible_stack(tsk, fp, 16, &info))
 		return -EINVAL;
 
 	if (test_bit(info.type, frame->stacks_done))
@@ -128,12 +132,6 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame)
 
 	frame->pc = ptrauth_strip_insn_pac(frame->pc);
 
-	/*
-	 * This is a terminal record, so we have finished unwinding.
-	 */
-	if (!frame->fp && !frame->pc)
-		return -ENOENT;
-
 	return 0;
 }
 NOKPROBE_SYMBOL(unwind_frame);
diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c
index e3f72df9509d..938ce6fbee8a 100644
--- a/arch/arm64/kernel/suspend.c
+++ b/arch/arm64/kernel/suspend.c
@@ -7,6 +7,7 @@
 #include <asm/alternative.h>
 #include <asm/cacheflush.h>
 #include <asm/cpufeature.h>
+#include <asm/cpuidle.h>
 #include <asm/daifflags.h>
 #include <asm/debug-monitors.h>
 #include <asm/exec.h>
@@ -91,6 +92,7 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
 	int ret = 0;
 	unsigned long flags;
 	struct sleep_stack_data state;
+	struct arm_cpuidle_irq_context context;
 
 	/* Report any MTE async fault before going to suspend */
 	mte_suspend_enter();
@@ -103,12 +105,18 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
 	flags = local_daif_save();
 
 	/*
-	 * Function graph tracer state gets incosistent when the kernel
+	 * Function graph tracer state gets inconsistent when the kernel
 	 * calls functions that never return (aka suspend finishers) hence
 	 * disable graph tracing during their execution.
 	 */
 	pause_graph_tracing();
 
+	/*
+	 * Switch to using DAIF.IF instead of PMR in order to reliably
+	 * resume if we're using pseudo-NMIs.
+	 */
+	arm_cpuidle_save_irq_context(&context);
+
 	if (__cpu_suspend_enter(&state)) {
 		/* Call the suspend finisher */
 		ret = fn(arg);
@@ -126,6 +134,8 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
 		RCU_NONIDLE(__cpu_suspend_exit());
 	}
 
+	arm_cpuidle_restore_irq_context(&context);
+
 	unpause_graph_tracing();
 
 	/*
diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c
index 265fe3eb1069..db5159a3055f 100644
--- a/arch/arm64/kernel/sys_compat.c
+++ b/arch/arm64/kernel/sys_compat.c
@@ -41,7 +41,7 @@ __do_compat_cache_op(unsigned long start, unsigned long end)
 			dsb(ish);
 		}
 
-		ret = __flush_cache_user_range(start, start + chunk);
+		ret = caches_clean_inval_user_pou(start, start + chunk);
 		if (ret)
 			return ret;
 
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index a05d34f0e82a..b03e383d944a 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -38,6 +38,7 @@
 #include <asm/extable.h>
 #include <asm/insn.h>
 #include <asm/kprobes.h>
+#include <asm/patching.h>
 #include <asm/traps.h>
 #include <asm/smp.h>
 #include <asm/stack_pointer.h>
@@ -45,11 +46,102 @@
 #include <asm/system_misc.h>
 #include <asm/sysreg.h>
 
-static const char *handler[] = {
-	"Synchronous Abort",
-	"IRQ",
-	"FIQ",
-	"Error"
+static bool __kprobes __check_eq(unsigned long pstate)
+{
+	return (pstate & PSR_Z_BIT) != 0;
+}
+
+static bool __kprobes __check_ne(unsigned long pstate)
+{
+	return (pstate & PSR_Z_BIT) == 0;
+}
+
+static bool __kprobes __check_cs(unsigned long pstate)
+{
+	return (pstate & PSR_C_BIT) != 0;
+}
+
+static bool __kprobes __check_cc(unsigned long pstate)
+{
+	return (pstate & PSR_C_BIT) == 0;
+}
+
+static bool __kprobes __check_mi(unsigned long pstate)
+{
+	return (pstate & PSR_N_BIT) != 0;
+}
+
+static bool __kprobes __check_pl(unsigned long pstate)
+{
+	return (pstate & PSR_N_BIT) == 0;
+}
+
+static bool __kprobes __check_vs(unsigned long pstate)
+{
+	return (pstate & PSR_V_BIT) != 0;
+}
+
+static bool __kprobes __check_vc(unsigned long pstate)
+{
+	return (pstate & PSR_V_BIT) == 0;
+}
+
+static bool __kprobes __check_hi(unsigned long pstate)
+{
+	pstate &= ~(pstate >> 1);	/* PSR_C_BIT &= ~PSR_Z_BIT */
+	return (pstate & PSR_C_BIT) != 0;
+}
+
+static bool __kprobes __check_ls(unsigned long pstate)
+{
+	pstate &= ~(pstate >> 1);	/* PSR_C_BIT &= ~PSR_Z_BIT */
+	return (pstate & PSR_C_BIT) == 0;
+}
+
+static bool __kprobes __check_ge(unsigned long pstate)
+{
+	pstate ^= (pstate << 3);	/* PSR_N_BIT ^= PSR_V_BIT */
+	return (pstate & PSR_N_BIT) == 0;
+}
+
+static bool __kprobes __check_lt(unsigned long pstate)
+{
+	pstate ^= (pstate << 3);	/* PSR_N_BIT ^= PSR_V_BIT */
+	return (pstate & PSR_N_BIT) != 0;
+}
+
+static bool __kprobes __check_gt(unsigned long pstate)
+{
+	/*PSR_N_BIT ^= PSR_V_BIT */
+	unsigned long temp = pstate ^ (pstate << 3);
+
+	temp |= (pstate << 1);	/*PSR_N_BIT |= PSR_Z_BIT */
+	return (temp & PSR_N_BIT) == 0;
+}
+
+static bool __kprobes __check_le(unsigned long pstate)
+{
+	/*PSR_N_BIT ^= PSR_V_BIT */
+	unsigned long temp = pstate ^ (pstate << 3);
+
+	temp |= (pstate << 1);	/*PSR_N_BIT |= PSR_Z_BIT */
+	return (temp & PSR_N_BIT) != 0;
+}
+
+static bool __kprobes __check_al(unsigned long pstate)
+{
+	return true;
+}
+
+/*
+ * Note that the ARMv8 ARM calls condition code 0b1111 "nv", but states that
+ * it behaves identically to 0b1110 ("al").
+ */
+pstate_check_t * const aarch32_opcode_cond_checks[16] = {
+	__check_eq, __check_ne, __check_cs, __check_cc,
+	__check_mi, __check_pl, __check_vs, __check_vc,
+	__check_hi, __check_ls, __check_ge, __check_lt,
+	__check_gt, __check_le, __check_al, __check_al
 };
 
 int show_unhandled_signals = 0;
@@ -751,27 +843,8 @@ const char *esr_get_class_string(u32 esr)
 }
 
 /*
- * bad_mode handles the impossible case in the exception vector. This is always
- * fatal.
- */
-asmlinkage void notrace bad_mode(struct pt_regs *regs, int reason, unsigned int esr)
-{
-	arm64_enter_nmi(regs);
-
-	console_verbose();
-
-	pr_crit("Bad mode in %s handler detected on CPU%d, code 0x%08x -- %s\n",
-		handler[reason], smp_processor_id(), esr,
-		esr_get_class_string(esr));
-
-	__show_regs(regs);
-	local_daif_mask();
-	panic("bad mode");
-}
-
-/*
  * bad_el0_sync handles unexpected, but potentially recoverable synchronous
- * exceptions taken from EL0. Unlike bad_mode, this returns.
+ * exceptions taken from EL0.
  */
 void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr)
 {
@@ -789,15 +862,11 @@ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr)
 DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack)
 	__aligned(16);
 
-asmlinkage void noinstr handle_bad_stack(struct pt_regs *regs)
+void panic_bad_stack(struct pt_regs *regs, unsigned int esr, unsigned long far)
 {
 	unsigned long tsk_stk = (unsigned long)current->stack;
 	unsigned long irq_stk = (unsigned long)this_cpu_read(irq_stack_ptr);
 	unsigned long ovf_stk = (unsigned long)this_cpu_ptr(overflow_stack);
-	unsigned int esr = read_sysreg(esr_el1);
-	unsigned long far = read_sysreg(far_el1);
-
-	arm64_enter_nmi(regs);
 
 	console_verbose();
 	pr_emerg("Insufficient stack space to handle exception!");
@@ -870,15 +939,11 @@ bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned int esr)
 	}
 }
 
-asmlinkage void noinstr do_serror(struct pt_regs *regs, unsigned int esr)
+void do_serror(struct pt_regs *regs, unsigned int esr)
 {
-	arm64_enter_nmi(regs);
-
 	/* non-RAS errors are not containable */
 	if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(regs, esr))
 		arm64_serror_panic(regs, esr);
-
-	arm64_exit_nmi(regs);
 }
 
 /* GENERIC_BUG traps */
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 3964acf5451e..a4eba0908bfa 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -20,8 +20,6 @@ if VIRTUALIZATION
 menuconfig KVM
 	bool "Kernel-based Virtual Machine (KVM) support"
 	depends on OF
-	# for TASKSTATS/TASK_DELAY_ACCT:
-	depends on NET && MULTIUSER
 	select MMU_NOTIFIER
 	select PREEMPT_NOTIFIERS
 	select HAVE_KVM_CPU_RELAX_INTERCEPT
@@ -38,8 +36,7 @@ menuconfig KVM
 	select IRQ_BYPASS_MANAGER
 	select HAVE_KVM_IRQ_BYPASS
 	select HAVE_KVM_VCPU_RUN_PID_CHANGE
-	select TASKSTATS
-	select TASK_DELAY_ACCT
+	select SCHED_INFO
 	help
 	  Support hosting virtualized guest machines.
 
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 589921392cb1..989bb5dad2c8 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -11,7 +11,7 @@ obj-$(CONFIG_KVM) += kvm.o
 obj-$(CONFIG_KVM) += hyp/
 
 kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
-	 $(KVM)/vfio.o $(KVM)/irqchip.o \
+	 $(KVM)/vfio.o $(KVM)/irqchip.o $(KVM)/binary_stats.o \
 	 arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
 	 inject_fault.o va_layout.o handle_exit.o \
 	 guest.o debug.o reset.o sys_regs.o \
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 74e0699661e9..3df67c127489 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -9,6 +9,7 @@
 #include <linux/kvm_host.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
+#include <linux/irqdomain.h>
 #include <linux/uaccess.h>
 
 #include <clocksource/arm_arch_timer.h>
@@ -973,36 +974,154 @@ static int kvm_timer_dying_cpu(unsigned int cpu)
 	return 0;
 }
 
-int kvm_timer_hyp_init(bool has_gic)
+static int timer_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu)
 {
-	struct arch_timer_kvm_info *info;
-	int err;
+	if (vcpu)
+		irqd_set_forwarded_to_vcpu(d);
+	else
+		irqd_clr_forwarded_to_vcpu(d);
 
-	info = arch_timer_get_kvm_info();
-	timecounter = &info->timecounter;
+	return 0;
+}
 
-	if (!timecounter->cc) {
-		kvm_err("kvm_arch_timer: uninitialized timecounter\n");
-		return -ENODEV;
+static int timer_irq_set_irqchip_state(struct irq_data *d,
+				       enum irqchip_irq_state which, bool val)
+{
+	if (which != IRQCHIP_STATE_ACTIVE || !irqd_is_forwarded_to_vcpu(d))
+		return irq_chip_set_parent_state(d, which, val);
+
+	if (val)
+		irq_chip_mask_parent(d);
+	else
+		irq_chip_unmask_parent(d);
+
+	return 0;
+}
+
+static void timer_irq_eoi(struct irq_data *d)
+{
+	if (!irqd_is_forwarded_to_vcpu(d))
+		irq_chip_eoi_parent(d);
+}
+
+static void timer_irq_ack(struct irq_data *d)
+{
+	d = d->parent_data;
+	if (d->chip->irq_ack)
+		d->chip->irq_ack(d);
+}
+
+static struct irq_chip timer_chip = {
+	.name			= "KVM",
+	.irq_ack		= timer_irq_ack,
+	.irq_mask		= irq_chip_mask_parent,
+	.irq_unmask		= irq_chip_unmask_parent,
+	.irq_eoi		= timer_irq_eoi,
+	.irq_set_type		= irq_chip_set_type_parent,
+	.irq_set_vcpu_affinity	= timer_irq_set_vcpu_affinity,
+	.irq_set_irqchip_state	= timer_irq_set_irqchip_state,
+};
+
+static int timer_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+				  unsigned int nr_irqs, void *arg)
+{
+	irq_hw_number_t hwirq = (uintptr_t)arg;
+
+	return irq_domain_set_hwirq_and_chip(domain, virq, hwirq,
+					     &timer_chip, NULL);
+}
+
+static void timer_irq_domain_free(struct irq_domain *domain, unsigned int virq,
+				  unsigned int nr_irqs)
+{
+}
+
+static const struct irq_domain_ops timer_domain_ops = {
+	.alloc	= timer_irq_domain_alloc,
+	.free	= timer_irq_domain_free,
+};
+
+static struct irq_ops arch_timer_irq_ops = {
+	.get_input_level = kvm_arch_timer_get_input_level,
+};
+
+static void kvm_irq_fixup_flags(unsigned int virq, u32 *flags)
+{
+	*flags = irq_get_trigger_type(virq);
+	if (*flags != IRQF_TRIGGER_HIGH && *flags != IRQF_TRIGGER_LOW) {
+		kvm_err("Invalid trigger for timer IRQ%d, assuming level low\n",
+			virq);
+		*flags = IRQF_TRIGGER_LOW;
 	}
+}
 
-	/* First, do the virtual EL1 timer irq */
+static int kvm_irq_init(struct arch_timer_kvm_info *info)
+{
+	struct irq_domain *domain = NULL;
 
 	if (info->virtual_irq <= 0) {
 		kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n",
 			info->virtual_irq);
 		return -ENODEV;
 	}
+
 	host_vtimer_irq = info->virtual_irq;
+	kvm_irq_fixup_flags(host_vtimer_irq, &host_vtimer_irq_flags);
+
+	if (kvm_vgic_global_state.no_hw_deactivation) {
+		struct fwnode_handle *fwnode;
+		struct irq_data *data;
+
+		fwnode = irq_domain_alloc_named_fwnode("kvm-timer");
+		if (!fwnode)
+			return -ENOMEM;
+
+		/* Assume both vtimer and ptimer in the same parent */
+		data = irq_get_irq_data(host_vtimer_irq);
+		domain = irq_domain_create_hierarchy(data->domain, 0,
+						     NR_KVM_TIMERS, fwnode,
+						     &timer_domain_ops, NULL);
+		if (!domain) {
+			irq_domain_free_fwnode(fwnode);
+			return -ENOMEM;
+		}
+
+		arch_timer_irq_ops.flags |= VGIC_IRQ_SW_RESAMPLE;
+		WARN_ON(irq_domain_push_irq(domain, host_vtimer_irq,
+					    (void *)TIMER_VTIMER));
+	}
 
-	host_vtimer_irq_flags = irq_get_trigger_type(host_vtimer_irq);
-	if (host_vtimer_irq_flags != IRQF_TRIGGER_HIGH &&
-	    host_vtimer_irq_flags != IRQF_TRIGGER_LOW) {
-		kvm_err("Invalid trigger for vtimer IRQ%d, assuming level low\n",
-			host_vtimer_irq);
-		host_vtimer_irq_flags = IRQF_TRIGGER_LOW;
+	if (info->physical_irq > 0) {
+		host_ptimer_irq = info->physical_irq;
+		kvm_irq_fixup_flags(host_ptimer_irq, &host_ptimer_irq_flags);
+
+		if (domain)
+			WARN_ON(irq_domain_push_irq(domain, host_ptimer_irq,
+						    (void *)TIMER_PTIMER));
 	}
 
+	return 0;
+}
+
+int kvm_timer_hyp_init(bool has_gic)
+{
+	struct arch_timer_kvm_info *info;
+	int err;
+
+	info = arch_timer_get_kvm_info();
+	timecounter = &info->timecounter;
+
+	if (!timecounter->cc) {
+		kvm_err("kvm_arch_timer: uninitialized timecounter\n");
+		return -ENODEV;
+	}
+
+	err = kvm_irq_init(info);
+	if (err)
+		return err;
+
+	/* First, do the virtual EL1 timer irq */
+
 	err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler,
 				 "kvm guest vtimer", kvm_get_running_vcpus());
 	if (err) {
@@ -1027,15 +1146,6 @@ int kvm_timer_hyp_init(bool has_gic)
 	/* Now let's do the physical EL1 timer irq */
 
 	if (info->physical_irq > 0) {
-		host_ptimer_irq = info->physical_irq;
-		host_ptimer_irq_flags = irq_get_trigger_type(host_ptimer_irq);
-		if (host_ptimer_irq_flags != IRQF_TRIGGER_HIGH &&
-		    host_ptimer_irq_flags != IRQF_TRIGGER_LOW) {
-			kvm_err("Invalid trigger for ptimer IRQ%d, assuming level low\n",
-				host_ptimer_irq);
-			host_ptimer_irq_flags = IRQF_TRIGGER_LOW;
-		}
-
 		err = request_percpu_irq(host_ptimer_irq, kvm_arch_timer_handler,
 					 "kvm guest ptimer", kvm_get_running_vcpus());
 		if (err) {
@@ -1143,7 +1253,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
 	ret = kvm_vgic_map_phys_irq(vcpu,
 				    map.direct_vtimer->host_timer_irq,
 				    map.direct_vtimer->irq.irq,
-				    kvm_arch_timer_get_input_level);
+				    &arch_timer_irq_ops);
 	if (ret)
 		return ret;
 
@@ -1151,7 +1261,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
 		ret = kvm_vgic_map_phys_irq(vcpu,
 					    map.direct_ptimer->host_timer_irq,
 					    map.direct_ptimer->irq.irq,
-					    kvm_arch_timer_get_input_level);
+					    &arch_timer_irq_ops);
 	}
 
 	if (ret)
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index e720148232a0..e9a2b8f27792 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -93,6 +93,12 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 		r = 0;
 		kvm->arch.return_nisv_io_abort_to_user = true;
 		break;
+	case KVM_CAP_ARM_MTE:
+		if (!system_supports_mte() || kvm->created_vcpus)
+			return -EINVAL;
+		r = 0;
+		kvm->arch.mte_enabled = true;
+		break;
 	default:
 		r = -EINVAL;
 		break;
@@ -237,6 +243,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		 */
 		r = 1;
 		break;
+	case KVM_CAP_ARM_MTE:
+		r = system_supports_mte();
+		break;
 	case KVM_CAP_STEAL_TIME:
 		r = kvm_arm_pvtime_supported();
 		break;
@@ -689,9 +698,22 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu)
 			vgic_v4_load(vcpu);
 			preempt_enable();
 		}
+
+		if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu))
+			kvm_pmu_handle_pmcr(vcpu,
+					    __vcpu_sys_reg(vcpu, PMCR_EL0));
 	}
 }
 
+static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu)
+{
+	if (likely(!vcpu_mode_is_32bit(vcpu)))
+		return false;
+
+	return !system_supports_32bit_el0() ||
+		static_branch_unlikely(&arm64_mismatched_32bit_el0);
+}
+
 /**
  * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
  * @vcpu:	The VCPU pointer
@@ -877,7 +899,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 		 * with the asymmetric AArch32 case), return to userspace with
 		 * a fatal error.
 		 */
-		if (!system_supports_32bit_el0() && vcpu_mode_is_32bit(vcpu)) {
+		if (vcpu_mode_is_bad_32bit(vcpu)) {
 			/*
 			 * As we have caught the guest red-handed, decide that
 			 * it isn't fit for purpose anymore by making the vcpu
@@ -1078,7 +1100,7 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
 		if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
 			stage2_unmap_vm(vcpu->kvm);
 		else
-			__flush_icache_all();
+			icache_inval_all_pou();
 	}
 
 	vcpu_reset_hcr(vcpu);
@@ -1350,6 +1372,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
 		return 0;
 	}
+	case KVM_ARM_MTE_COPY_TAGS: {
+		struct kvm_arm_copy_mte_tags copy_tags;
+
+		if (copy_from_user(&copy_tags, argp, sizeof(copy_tags)))
+			return -EFAULT;
+		return kvm_vm_ioctl_mte_copy_tags(kvm, &copy_tags);
+	}
 	default:
 		return -EINVAL;
 	}
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 5cb4a1cd5603..1dfb83578277 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -28,20 +28,40 @@
 
 #include "trace.h"
 
-struct kvm_stats_debugfs_item debugfs_entries[] = {
-	VCPU_STAT("halt_successful_poll", halt_successful_poll),
-	VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
-	VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
-	VCPU_STAT("halt_wakeup", halt_wakeup),
-	VCPU_STAT("hvc_exit_stat", hvc_exit_stat),
-	VCPU_STAT("wfe_exit_stat", wfe_exit_stat),
-	VCPU_STAT("wfi_exit_stat", wfi_exit_stat),
-	VCPU_STAT("mmio_exit_user", mmio_exit_user),
-	VCPU_STAT("mmio_exit_kernel", mmio_exit_kernel),
-	VCPU_STAT("exits", exits),
-	VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
-	VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
-	{ NULL }
+const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
+	KVM_GENERIC_VM_STATS()
+};
+static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
+		sizeof(struct kvm_vm_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vm_stats_header = {
+	.name_size = KVM_STATS_NAME_SIZE,
+	.num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
+	.id_offset =  sizeof(struct kvm_stats_header),
+	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+		       sizeof(kvm_vm_stats_desc),
+};
+
+const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+	KVM_GENERIC_VCPU_STATS(),
+	STATS_DESC_COUNTER(VCPU, hvc_exit_stat),
+	STATS_DESC_COUNTER(VCPU, wfe_exit_stat),
+	STATS_DESC_COUNTER(VCPU, wfi_exit_stat),
+	STATS_DESC_COUNTER(VCPU, mmio_exit_user),
+	STATS_DESC_COUNTER(VCPU, mmio_exit_kernel),
+	STATS_DESC_COUNTER(VCPU, exits)
+};
+static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
+		sizeof(struct kvm_vcpu_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vcpu_stats_header = {
+	.name_size = KVM_STATS_NAME_SIZE,
+	.num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
+	.id_offset = sizeof(struct kvm_stats_header),
+	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+		       sizeof(kvm_vcpu_stats_desc),
 };
 
 static bool core_reg_offset_is_vreg(u64 off)
@@ -995,3 +1015,89 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
 
 	return ret;
 }
+
+long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
+				struct kvm_arm_copy_mte_tags *copy_tags)
+{
+	gpa_t guest_ipa = copy_tags->guest_ipa;
+	size_t length = copy_tags->length;
+	void __user *tags = copy_tags->addr;
+	gpa_t gfn;
+	bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST);
+	int ret = 0;
+
+	if (!kvm_has_mte(kvm))
+		return -EINVAL;
+
+	if (copy_tags->reserved[0] || copy_tags->reserved[1])
+		return -EINVAL;
+
+	if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST)
+		return -EINVAL;
+
+	if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK)
+		return -EINVAL;
+
+	gfn = gpa_to_gfn(guest_ipa);
+
+	mutex_lock(&kvm->slots_lock);
+
+	while (length > 0) {
+		kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL);
+		void *maddr;
+		unsigned long num_tags;
+		struct page *page;
+
+		if (is_error_noslot_pfn(pfn)) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		page = pfn_to_online_page(pfn);
+		if (!page) {
+			/* Reject ZONE_DEVICE memory */
+			ret = -EFAULT;
+			goto out;
+		}
+		maddr = page_address(page);
+
+		if (!write) {
+			if (test_bit(PG_mte_tagged, &page->flags))
+				num_tags = mte_copy_tags_to_user(tags, maddr,
+							MTE_GRANULES_PER_PAGE);
+			else
+				/* No tags in memory, so write zeros */
+				num_tags = MTE_GRANULES_PER_PAGE -
+					clear_user(tags, MTE_GRANULES_PER_PAGE);
+			kvm_release_pfn_clean(pfn);
+		} else {
+			num_tags = mte_copy_tags_from_user(maddr, tags,
+							MTE_GRANULES_PER_PAGE);
+
+			/*
+			 * Set the flag after checking the write
+			 * completed fully
+			 */
+			if (num_tags == MTE_GRANULES_PER_PAGE)
+				set_bit(PG_mte_tagged, &page->flags);
+
+			kvm_release_pfn_dirty(pfn);
+		}
+
+		if (num_tags != MTE_GRANULES_PER_PAGE) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		gfn++;
+		tags += num_tags;
+		length -= PAGE_SIZE;
+	}
+
+out:
+	mutex_unlock(&kvm->slots_lock);
+	/* If some data has been copied report the number of bytes copied */
+	if (length != copy_tags->length)
+		return copy_tags->length - length;
+	return ret;
+}
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index e831d3dfd50d..435346ea1504 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -13,6 +13,7 @@
 #include <asm/kvm_arm.h>
 #include <asm/kvm_asm.h>
 #include <asm/kvm_mmu.h>
+#include <asm/kvm_mte.h>
 #include <asm/kvm_ptrauth.h>
 
 	.text
@@ -51,6 +52,9 @@ alternative_else_nop_endif
 
 	add	x29, x0, #VCPU_CONTEXT
 
+	// mte_switch_to_guest(g_ctxt, h_ctxt, tmp1)
+	mte_switch_to_guest x29, x1, x2
+
 	// Macro ptrauth_switch_to_guest format:
 	// 	ptrauth_switch_to_guest(guest cxt, tmp1, tmp2, tmp3)
 	// The below macro to restore guest keys is not implemented in C code
@@ -142,6 +146,9 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
 	// when this feature is enabled for kernel code.
 	ptrauth_switch_to_hyp x1, x2, x3, x4, x5
 
+	// mte_switch_to_hyp(g_ctxt, h_ctxt, reg1)
+	mte_switch_to_hyp x1, x2, x3
+
 	// Restore hyp's sp_el0
 	restore_sp_el0 x2, x3
 
diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c
index 11541b94b328..0418399e0a20 100644
--- a/arch/arm64/kvm/hyp/exception.c
+++ b/arch/arm64/kvm/hyp/exception.c
@@ -112,7 +112,8 @@ static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode,
 	new |= (old & PSR_C_BIT);
 	new |= (old & PSR_V_BIT);
 
-	// TODO: TCO (if/when ARMv8.5-MemTag is exposed to guests)
+	if (kvm_has_mte(vcpu->kvm))
+		new |= PSR_TCO_BIT;
 
 	new |= (old & PSR_DIT_BIT);
 
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 5f49df4ffdd8..9aa9b73475c9 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -76,6 +76,7 @@ el1_trap:
 	b	__guest_exit
 
 el1_irq:
+el1_fiq:
 	get_vcpu_ptr	x1, x0
 	mov	x0, #ARM_EXCEPTION_IRQ
 	b	__guest_exit
@@ -131,7 +132,6 @@ SYM_CODE_END(\label)
 	invalid_vector	el2t_error_invalid
 	invalid_vector	el2h_irq_invalid
 	invalid_vector	el2h_fiq_invalid
-	invalid_vector	el1_fiq_invalid
 
 	.ltorg
 
@@ -179,12 +179,12 @@ SYM_CODE_START(__kvm_hyp_vector)
 
 	valid_vect	el1_sync		// Synchronous 64-bit EL1
 	valid_vect	el1_irq			// IRQ 64-bit EL1
-	invalid_vect	el1_fiq_invalid		// FIQ 64-bit EL1
+	valid_vect	el1_fiq			// FIQ 64-bit EL1
 	valid_vect	el1_error		// Error 64-bit EL1
 
 	valid_vect	el1_sync		// Synchronous 32-bit EL1
 	valid_vect	el1_irq			// IRQ 32-bit EL1
-	invalid_vect	el1_fiq_invalid		// FIQ 32-bit EL1
+	valid_vect	el1_fiq			// FIQ 32-bit EL1
 	valid_vect	el1_error		// Error 32-bit EL1
 SYM_CODE_END(__kvm_hyp_vector)
 
diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
index cce43bfe158f..de7e14c862e6 100644
--- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
@@ -14,6 +14,7 @@
 #include <asm/kvm_asm.h>
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
 
 static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt)
 {
@@ -26,6 +27,16 @@ static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt)
 	ctxt_sys_reg(ctxt, TPIDRRO_EL0)	= read_sysreg(tpidrro_el0);
 }
 
+static inline bool ctxt_has_mte(struct kvm_cpu_context *ctxt)
+{
+	struct kvm_vcpu *vcpu = ctxt->__hyp_running_vcpu;
+
+	if (!vcpu)
+		vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt);
+
+	return kvm_has_mte(kern_hyp_va(vcpu->kvm));
+}
+
 static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
 {
 	ctxt_sys_reg(ctxt, CSSELR_EL1)	= read_sysreg(csselr_el1);
@@ -46,6 +57,11 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
 	ctxt_sys_reg(ctxt, PAR_EL1)	= read_sysreg_par();
 	ctxt_sys_reg(ctxt, TPIDR_EL1)	= read_sysreg(tpidr_el1);
 
+	if (ctxt_has_mte(ctxt)) {
+		ctxt_sys_reg(ctxt, TFSR_EL1) = read_sysreg_el1(SYS_TFSR);
+		ctxt_sys_reg(ctxt, TFSRE0_EL1) = read_sysreg_s(SYS_TFSRE0_EL1);
+	}
+
 	ctxt_sys_reg(ctxt, SP_EL1)	= read_sysreg(sp_el1);
 	ctxt_sys_reg(ctxt, ELR_EL1)	= read_sysreg_el1(SYS_ELR);
 	ctxt_sys_reg(ctxt, SPSR_EL1)	= read_sysreg_el1(SYS_SPSR);
@@ -107,6 +123,11 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
 	write_sysreg(ctxt_sys_reg(ctxt, PAR_EL1),	par_el1);
 	write_sysreg(ctxt_sys_reg(ctxt, TPIDR_EL1),	tpidr_el1);
 
+	if (ctxt_has_mte(ctxt)) {
+		write_sysreg_el1(ctxt_sys_reg(ctxt, TFSR_EL1), SYS_TFSR);
+		write_sysreg_s(ctxt_sys_reg(ctxt, TFSRE0_EL1), SYS_TFSRE0_EL1);
+	}
+
 	if (!has_vhe() &&
 	    cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT) &&
 	    ctxt->__hyp_running_vcpu) {
diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
index 18a4494337bd..fb0f523d1492 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
@@ -7,7 +7,7 @@
 #include <nvhe/memory.h>
 #include <nvhe/spinlock.h>
 
-#define HYP_NO_ORDER	UINT_MAX
+#define HYP_NO_ORDER	USHRT_MAX
 
 struct hyp_pool {
 	/*
@@ -19,48 +19,13 @@ struct hyp_pool {
 	struct list_head free_area[MAX_ORDER];
 	phys_addr_t range_start;
 	phys_addr_t range_end;
-	unsigned int max_order;
+	unsigned short max_order;
 };
 
-static inline void hyp_page_ref_inc(struct hyp_page *p)
-{
-	struct hyp_pool *pool = hyp_page_to_pool(p);
-
-	hyp_spin_lock(&pool->lock);
-	p->refcount++;
-	hyp_spin_unlock(&pool->lock);
-}
-
-static inline int hyp_page_ref_dec_and_test(struct hyp_page *p)
-{
-	struct hyp_pool *pool = hyp_page_to_pool(p);
-	int ret;
-
-	hyp_spin_lock(&pool->lock);
-	p->refcount--;
-	ret = (p->refcount == 0);
-	hyp_spin_unlock(&pool->lock);
-
-	return ret;
-}
-
-static inline void hyp_set_page_refcounted(struct hyp_page *p)
-{
-	struct hyp_pool *pool = hyp_page_to_pool(p);
-
-	hyp_spin_lock(&pool->lock);
-	if (p->refcount) {
-		hyp_spin_unlock(&pool->lock);
-		BUG();
-	}
-	p->refcount = 1;
-	hyp_spin_unlock(&pool->lock);
-}
-
 /* Allocation */
-void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order);
-void hyp_get_page(void *addr);
-void hyp_put_page(void *addr);
+void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order);
+void hyp_get_page(struct hyp_pool *pool, void *addr);
+void hyp_put_page(struct hyp_pool *pool, void *addr);
 
 /* Used pages cannot be freed */
 int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
index 42d81ec739fa..9c227d87c36d 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@ -23,7 +23,7 @@ extern struct host_kvm host_kvm;
 int __pkvm_prot_finalize(void);
 int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end);
 
-int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool);
+int kvm_host_prepare_stage2(void *pgt_pool_base);
 void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt);
 
 static __always_inline void __load_host_stage2(void)
diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h
index fd78bde939ee..592b7edb3edb 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/memory.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h
@@ -7,12 +7,9 @@
 
 #include <linux/types.h>
 
-struct hyp_pool;
 struct hyp_page {
-	unsigned int refcount;
-	unsigned int order;
-	struct hyp_pool *pool;
-	struct list_head node;
+	unsigned short refcount;
+	unsigned short order;
 };
 
 extern u64 __hyp_vmemmap;
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h
index 0095f6289742..8ec3a5a7744b 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mm.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h
@@ -78,19 +78,20 @@ static inline unsigned long hyp_s1_pgtable_pages(void)
 	return res;
 }
 
-static inline unsigned long host_s2_mem_pgtable_pages(void)
+static inline unsigned long host_s2_pgtable_pages(void)
 {
+	unsigned long res;
+
 	/*
 	 * Include an extra 16 pages to safely upper-bound the worst case of
 	 * concatenated pgds.
 	 */
-	return __hyp_pgtable_total_pages() + 16;
-}
+	res = __hyp_pgtable_total_pages() + 16;
 
-static inline unsigned long host_s2_dev_pgtable_pages(void)
-{
 	/* Allow 1 GiB for MMIO mappings */
-	return __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT);
+	res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT);
+
+	return res;
 }
 
 #endif /* __KVM_HYP_MM_H */
diff --git a/arch/arm64/kvm/hyp/nvhe/cache.S b/arch/arm64/kvm/hyp/nvhe/cache.S
index 36cef6915428..958734f4d6b0 100644
--- a/arch/arm64/kvm/hyp/nvhe/cache.S
+++ b/arch/arm64/kvm/hyp/nvhe/cache.S
@@ -7,7 +7,7 @@
 #include <asm/assembler.h>
 #include <asm/alternative.h>
 
-SYM_FUNC_START_PI(__flush_dcache_area)
+SYM_FUNC_START_PI(dcache_clean_inval_poc)
 	dcache_by_line_op civac, sy, x0, x1, x2, x3
 	ret
-SYM_FUNC_END_PI(__flush_dcache_area)
+SYM_FUNC_END_PI(dcache_clean_inval_poc)
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 4b60c0056c04..d938ce95d3bd 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -23,8 +23,7 @@
 extern unsigned long hyp_nr_cpus;
 struct host_kvm host_kvm;
 
-static struct hyp_pool host_s2_mem;
-static struct hyp_pool host_s2_dev;
+static struct hyp_pool host_s2_pool;
 
 /*
  * Copies of the host's CPU features registers holding sanitized values.
@@ -36,7 +35,7 @@ static const u8 pkvm_hyp_id = 1;
 
 static void *host_s2_zalloc_pages_exact(size_t size)
 {
-	return hyp_alloc_pages(&host_s2_mem, get_order(size));
+	return hyp_alloc_pages(&host_s2_pool, get_order(size));
 }
 
 static void *host_s2_zalloc_page(void *pool)
@@ -44,20 +43,24 @@ static void *host_s2_zalloc_page(void *pool)
 	return hyp_alloc_pages(pool, 0);
 }
 
-static int prepare_s2_pools(void *mem_pgt_pool, void *dev_pgt_pool)
+static void host_s2_get_page(void *addr)
+{
+	hyp_get_page(&host_s2_pool, addr);
+}
+
+static void host_s2_put_page(void *addr)
+{
+	hyp_put_page(&host_s2_pool, addr);
+}
+
+static int prepare_s2_pool(void *pgt_pool_base)
 {
 	unsigned long nr_pages, pfn;
 	int ret;
 
-	pfn = hyp_virt_to_pfn(mem_pgt_pool);
-	nr_pages = host_s2_mem_pgtable_pages();
-	ret = hyp_pool_init(&host_s2_mem, pfn, nr_pages, 0);
-	if (ret)
-		return ret;
-
-	pfn = hyp_virt_to_pfn(dev_pgt_pool);
-	nr_pages = host_s2_dev_pgtable_pages();
-	ret = hyp_pool_init(&host_s2_dev, pfn, nr_pages, 0);
+	pfn = hyp_virt_to_pfn(pgt_pool_base);
+	nr_pages = host_s2_pgtable_pages();
+	ret = hyp_pool_init(&host_s2_pool, pfn, nr_pages, 0);
 	if (ret)
 		return ret;
 
@@ -67,8 +70,8 @@ static int prepare_s2_pools(void *mem_pgt_pool, void *dev_pgt_pool)
 		.phys_to_virt = hyp_phys_to_virt,
 		.virt_to_phys = hyp_virt_to_phys,
 		.page_count = hyp_page_count,
-		.get_page = hyp_get_page,
-		.put_page = hyp_put_page,
+		.get_page = host_s2_get_page,
+		.put_page = host_s2_put_page,
 	};
 
 	return 0;
@@ -86,7 +89,7 @@ static void prepare_host_vtcr(void)
 					  id_aa64mmfr1_el1_sys_val, phys_shift);
 }
 
-int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool)
+int kvm_host_prepare_stage2(void *pgt_pool_base)
 {
 	struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
 	int ret;
@@ -94,7 +97,7 @@ int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool)
 	prepare_host_vtcr();
 	hyp_spin_lock_init(&host_kvm.lock);
 
-	ret = prepare_s2_pools(mem_pgt_pool, dev_pgt_pool);
+	ret = prepare_s2_pool(pgt_pool_base);
 	if (ret)
 		return ret;
 
@@ -199,11 +202,10 @@ static bool range_is_memory(u64 start, u64 end)
 }
 
 static inline int __host_stage2_idmap(u64 start, u64 end,
-				      enum kvm_pgtable_prot prot,
-				      struct hyp_pool *pool)
+				      enum kvm_pgtable_prot prot)
 {
 	return kvm_pgtable_stage2_map(&host_kvm.pgt, start, end - start, start,
-				      prot, pool);
+				      prot, &host_s2_pool);
 }
 
 static int host_stage2_idmap(u64 addr)
@@ -211,7 +213,6 @@ static int host_stage2_idmap(u64 addr)
 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W;
 	struct kvm_mem_range range;
 	bool is_memory = find_mem_range(addr, &range);
-	struct hyp_pool *pool = is_memory ? &host_s2_mem : &host_s2_dev;
 	int ret;
 
 	if (is_memory)
@@ -222,22 +223,21 @@ static int host_stage2_idmap(u64 addr)
 	if (ret)
 		goto unlock;
 
-	ret = __host_stage2_idmap(range.start, range.end, prot, pool);
-	if (is_memory || ret != -ENOMEM)
+	ret = __host_stage2_idmap(range.start, range.end, prot);
+	if (ret != -ENOMEM)
 		goto unlock;
 
 	/*
-	 * host_s2_mem has been provided with enough pages to cover all of
-	 * memory with page granularity, so we should never hit the ENOMEM case.
-	 * However, it is difficult to know how much of the MMIO range we will
-	 * need to cover upfront, so we may need to 'recycle' the pages if we
-	 * run out.
+	 * The pool has been provided with enough pages to cover all of memory
+	 * with page granularity, but it is difficult to know how much of the
+	 * MMIO range we will need to cover upfront, so we may need to 'recycle'
+	 * the pages if we run out.
 	 */
 	ret = host_stage2_unmap_dev_all();
 	if (ret)
 		goto unlock;
 
-	ret = __host_stage2_idmap(range.start, range.end, prot, pool);
+	ret = __host_stage2_idmap(range.start, range.end, prot);
 
 unlock:
 	hyp_spin_unlock(&host_kvm.lock);
@@ -258,7 +258,7 @@ int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end)
 
 	hyp_spin_lock(&host_kvm.lock);
 	ret = kvm_pgtable_stage2_set_owner(&host_kvm.pgt, start, end - start,
-					   &host_s2_mem, pkvm_hyp_id);
+					   &host_s2_pool, pkvm_hyp_id);
 	hyp_spin_unlock(&host_kvm.lock);
 
 	return ret != -EAGAIN ? ret : 0;
diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
index 237e03bf0cb1..41fc25bdfb34 100644
--- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c
+++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
@@ -32,7 +32,7 @@ u64 __hyp_vmemmap;
  */
 static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool,
 					     struct hyp_page *p,
-					     unsigned int order)
+					     unsigned short order)
 {
 	phys_addr_t addr = hyp_page_to_phys(p);
 
@@ -51,21 +51,49 @@ static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool,
 /* Find a buddy page currently available for allocation */
 static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool,
 					   struct hyp_page *p,
-					   unsigned int order)
+					   unsigned short order)
 {
 	struct hyp_page *buddy = __find_buddy_nocheck(pool, p, order);
 
-	if (!buddy || buddy->order != order || list_empty(&buddy->node))
+	if (!buddy || buddy->order != order || buddy->refcount)
 		return NULL;
 
 	return buddy;
 
 }
 
+/*
+ * Pages that are available for allocation are tracked in free-lists, so we use
+ * the pages themselves to store the list nodes to avoid wasting space. As the
+ * allocator always returns zeroed pages (which are zeroed on the hyp_put_page()
+ * path to optimize allocation speed), we also need to clean-up the list node in
+ * each page when we take it out of the list.
+ */
+static inline void page_remove_from_list(struct hyp_page *p)
+{
+	struct list_head *node = hyp_page_to_virt(p);
+
+	__list_del_entry(node);
+	memset(node, 0, sizeof(*node));
+}
+
+static inline void page_add_to_list(struct hyp_page *p, struct list_head *head)
+{
+	struct list_head *node = hyp_page_to_virt(p);
+
+	INIT_LIST_HEAD(node);
+	list_add_tail(node, head);
+}
+
+static inline struct hyp_page *node_to_page(struct list_head *node)
+{
+	return hyp_virt_to_page(node);
+}
+
 static void __hyp_attach_page(struct hyp_pool *pool,
 			      struct hyp_page *p)
 {
-	unsigned int order = p->order;
+	unsigned short order = p->order;
 	struct hyp_page *buddy;
 
 	memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order);
@@ -83,32 +111,23 @@ static void __hyp_attach_page(struct hyp_pool *pool,
 			break;
 
 		/* Take the buddy out of its list, and coallesce with @p */
-		list_del_init(&buddy->node);
+		page_remove_from_list(buddy);
 		buddy->order = HYP_NO_ORDER;
 		p = min(p, buddy);
 	}
 
 	/* Mark the new head, and insert it */
 	p->order = order;
-	list_add_tail(&p->node, &pool->free_area[order]);
-}
-
-static void hyp_attach_page(struct hyp_page *p)
-{
-	struct hyp_pool *pool = hyp_page_to_pool(p);
-
-	hyp_spin_lock(&pool->lock);
-	__hyp_attach_page(pool, p);
-	hyp_spin_unlock(&pool->lock);
+	page_add_to_list(p, &pool->free_area[order]);
 }
 
 static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool,
 					   struct hyp_page *p,
-					   unsigned int order)
+					   unsigned short order)
 {
 	struct hyp_page *buddy;
 
-	list_del_init(&p->node);
+	page_remove_from_list(p);
 	while (p->order > order) {
 		/*
 		 * The buddy of order n - 1 currently has HYP_NO_ORDER as it
@@ -119,30 +138,64 @@ static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool,
 		p->order--;
 		buddy = __find_buddy_nocheck(pool, p, p->order);
 		buddy->order = p->order;
-		list_add_tail(&buddy->node, &pool->free_area[buddy->order]);
+		page_add_to_list(buddy, &pool->free_area[buddy->order]);
 	}
 
 	return p;
 }
 
-void hyp_put_page(void *addr)
+static inline void hyp_page_ref_inc(struct hyp_page *p)
 {
-	struct hyp_page *p = hyp_virt_to_page(addr);
+	BUG_ON(p->refcount == USHRT_MAX);
+	p->refcount++;
+}
 
+static inline int hyp_page_ref_dec_and_test(struct hyp_page *p)
+{
+	p->refcount--;
+	return (p->refcount == 0);
+}
+
+static inline void hyp_set_page_refcounted(struct hyp_page *p)
+{
+	BUG_ON(p->refcount);
+	p->refcount = 1;
+}
+
+static void __hyp_put_page(struct hyp_pool *pool, struct hyp_page *p)
+{
 	if (hyp_page_ref_dec_and_test(p))
-		hyp_attach_page(p);
+		__hyp_attach_page(pool, p);
+}
+
+/*
+ * Changes to the buddy tree and page refcounts must be done with the hyp_pool
+ * lock held. If a refcount change requires an update to the buddy tree (e.g.
+ * hyp_put_page()), both operations must be done within the same critical
+ * section to guarantee transient states (e.g. a page with null refcount but
+ * not yet attached to a free list) can't be observed by well-behaved readers.
+ */
+void hyp_put_page(struct hyp_pool *pool, void *addr)
+{
+	struct hyp_page *p = hyp_virt_to_page(addr);
+
+	hyp_spin_lock(&pool->lock);
+	__hyp_put_page(pool, p);
+	hyp_spin_unlock(&pool->lock);
 }
 
-void hyp_get_page(void *addr)
+void hyp_get_page(struct hyp_pool *pool, void *addr)
 {
 	struct hyp_page *p = hyp_virt_to_page(addr);
 
+	hyp_spin_lock(&pool->lock);
 	hyp_page_ref_inc(p);
+	hyp_spin_unlock(&pool->lock);
 }
 
-void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order)
+void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order)
 {
-	unsigned int i = order;
+	unsigned short i = order;
 	struct hyp_page *p;
 
 	hyp_spin_lock(&pool->lock);
@@ -156,11 +209,11 @@ void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order)
 	}
 
 	/* Extract it from the tree at the right order */
-	p = list_first_entry(&pool->free_area[i], struct hyp_page, node);
+	p = node_to_page(pool->free_area[i].next);
 	p = __hyp_extract_page(pool, p, order);
 
-	hyp_spin_unlock(&pool->lock);
 	hyp_set_page_refcounted(p);
+	hyp_spin_unlock(&pool->lock);
 
 	return hyp_page_to_virt(p);
 }
@@ -181,15 +234,14 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
 
 	/* Init the vmemmap portion */
 	p = hyp_phys_to_page(phys);
-	memset(p, 0, sizeof(*p) * nr_pages);
 	for (i = 0; i < nr_pages; i++) {
-		p[i].pool = pool;
-		INIT_LIST_HEAD(&p[i].node);
+		p[i].order = 0;
+		hyp_set_page_refcounted(&p[i]);
 	}
 
 	/* Attach the unused pages to the buddy tree */
 	for (i = reserved_pages; i < nr_pages; i++)
-		__hyp_attach_page(pool, &p[i]);
+		__hyp_put_page(pool, &p[i]);
 
 	return 0;
 }
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
index a3d3a275344e..0b574d106519 100644
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -24,8 +24,7 @@ unsigned long hyp_nr_cpus;
 
 static void *vmemmap_base;
 static void *hyp_pgt_base;
-static void *host_s2_mem_pgt_base;
-static void *host_s2_dev_pgt_base;
+static void *host_s2_pgt_base;
 static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops;
 
 static int divide_memory_pool(void *virt, unsigned long size)
@@ -45,14 +44,9 @@ static int divide_memory_pool(void *virt, unsigned long size)
 	if (!hyp_pgt_base)
 		return -ENOMEM;
 
-	nr_pages = host_s2_mem_pgtable_pages();
-	host_s2_mem_pgt_base = hyp_early_alloc_contig(nr_pages);
-	if (!host_s2_mem_pgt_base)
-		return -ENOMEM;
-
-	nr_pages = host_s2_dev_pgtable_pages();
-	host_s2_dev_pgt_base = hyp_early_alloc_contig(nr_pages);
-	if (!host_s2_dev_pgt_base)
+	nr_pages = host_s2_pgtable_pages();
+	host_s2_pgt_base = hyp_early_alloc_contig(nr_pages);
+	if (!host_s2_pgt_base)
 		return -ENOMEM;
 
 	return 0;
@@ -134,7 +128,8 @@ static void update_nvhe_init_params(void)
 	for (i = 0; i < hyp_nr_cpus; i++) {
 		params = per_cpu_ptr(&kvm_init_params, i);
 		params->pgd_pa = __hyp_pa(pkvm_pgtable.pgd);
-		__flush_dcache_area(params, sizeof(*params));
+		dcache_clean_inval_poc((unsigned long)params,
+				    (unsigned long)params + sizeof(*params));
 	}
 }
 
@@ -143,6 +138,16 @@ static void *hyp_zalloc_hyp_page(void *arg)
 	return hyp_alloc_pages(&hpool, 0);
 }
 
+static void hpool_get_page(void *addr)
+{
+	hyp_get_page(&hpool, addr);
+}
+
+static void hpool_put_page(void *addr)
+{
+	hyp_put_page(&hpool, addr);
+}
+
 void __noreturn __pkvm_init_finalise(void)
 {
 	struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data);
@@ -158,7 +163,7 @@ void __noreturn __pkvm_init_finalise(void)
 	if (ret)
 		goto out;
 
-	ret = kvm_host_prepare_stage2(host_s2_mem_pgt_base, host_s2_dev_pgt_base);
+	ret = kvm_host_prepare_stage2(host_s2_pgt_base);
 	if (ret)
 		goto out;
 
@@ -166,8 +171,8 @@ void __noreturn __pkvm_init_finalise(void)
 		.zalloc_page = hyp_zalloc_hyp_page,
 		.phys_to_virt = hyp_phys_to_virt,
 		.virt_to_phys = hyp_virt_to_phys,
-		.get_page = hyp_get_page,
-		.put_page = hyp_put_page,
+		.get_page = hpool_get_page,
+		.put_page = hpool_put_page,
 	};
 	pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops;
 
diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
index 83dc3b271bc5..38ed0f6f2703 100644
--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -104,7 +104,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
 	 * you should be running with VHE enabled.
 	 */
 	if (icache_is_vpipt())
-		__flush_icache_all();
+		icache_inval_all_pou();
 
 	__tlb_switch_to_host(&cxt);
 }
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index c37c1dc4feaf..05321f4165e3 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -577,12 +577,24 @@ static void stage2_put_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr,
 	mm_ops->put_page(ptep);
 }
 
+static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
+{
+	u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
+	return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
+}
+
+static bool stage2_pte_executable(kvm_pte_t pte)
+{
+	return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
+}
+
 static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
 				      kvm_pte_t *ptep,
 				      struct stage2_map_data *data)
 {
 	kvm_pte_t new, old = *ptep;
 	u64 granule = kvm_granule_size(level), phys = data->phys;
+	struct kvm_pgtable *pgt = data->mmu->pgt;
 	struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
 
 	if (!kvm_block_mapping_supported(addr, end, phys, level))
@@ -606,6 +618,14 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
 		stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
 	}
 
+	/* Perform CMOs before installation of the guest stage-2 PTE */
+	if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new))
+		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
+						granule);
+
+	if (mm_ops->icache_inval_pou && stage2_pte_executable(new))
+		mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
+
 	smp_store_release(ptep, new);
 	if (stage2_pte_is_counted(new))
 		mm_ops->get_page(ptep);
@@ -798,12 +818,6 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
 	return ret;
 }
 
-static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
-{
-	u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
-	return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
-}
-
 static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 			       enum kvm_pgtable_walk_flags flag,
 			       void * const arg)
@@ -839,8 +853,11 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 	stage2_put_pte(ptep, mmu, addr, level, mm_ops);
 
 	if (need_flush) {
-		__flush_dcache_area(kvm_pte_follow(pte, mm_ops),
-				    kvm_granule_size(level));
+		kvm_pte_t *pte_follow = kvm_pte_follow(pte, mm_ops);
+
+		dcache_clean_inval_poc((unsigned long)pte_follow,
+				    (unsigned long)pte_follow +
+					    kvm_granule_size(level));
 	}
 
 	if (childp)
@@ -861,10 +878,11 @@ int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
 }
 
 struct stage2_attr_data {
-	kvm_pte_t	attr_set;
-	kvm_pte_t	attr_clr;
-	kvm_pte_t	pte;
-	u32		level;
+	kvm_pte_t			attr_set;
+	kvm_pte_t			attr_clr;
+	kvm_pte_t			pte;
+	u32				level;
+	struct kvm_pgtable_mm_ops	*mm_ops;
 };
 
 static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
@@ -873,6 +891,7 @@ static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 {
 	kvm_pte_t pte = *ptep;
 	struct stage2_attr_data *data = arg;
+	struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
 
 	if (!kvm_pte_valid(pte))
 		return 0;
@@ -887,8 +906,17 @@ static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 	 * but worst-case the access flag update gets lost and will be
 	 * set on the next access instead.
 	 */
-	if (data->pte != pte)
+	if (data->pte != pte) {
+		/*
+		 * Invalidate instruction cache before updating the guest
+		 * stage-2 PTE if we are going to add executable permission.
+		 */
+		if (mm_ops->icache_inval_pou &&
+		    stage2_pte_executable(pte) && !stage2_pte_executable(*ptep))
+			mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops),
+						  kvm_granule_size(level));
 		WRITE_ONCE(*ptep, pte);
+	}
 
 	return 0;
 }
@@ -903,6 +931,7 @@ static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
 	struct stage2_attr_data data = {
 		.attr_set	= attr_set & attr_mask,
 		.attr_clr	= attr_clr & attr_mask,
+		.mm_ops		= pgt->mm_ops,
 	};
 	struct kvm_pgtable_walker walker = {
 		.cb		= stage2_attr_walker,
@@ -988,11 +1017,15 @@ static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 	struct kvm_pgtable *pgt = arg;
 	struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
 	kvm_pte_t pte = *ptep;
+	kvm_pte_t *pte_follow;
 
 	if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pgt, pte))
 		return 0;
 
-	__flush_dcache_area(kvm_pte_follow(pte, mm_ops), kvm_granule_size(level));
+	pte_follow = kvm_pte_follow(pte, mm_ops);
+	dcache_clean_inval_poc((unsigned long)pte_follow,
+			    (unsigned long)pte_follow +
+				    kvm_granule_size(level));
 	return 0;
 }
 
diff --git a/arch/arm64/kvm/hyp/reserved_mem.c b/arch/arm64/kvm/hyp/reserved_mem.c
index 83ca23ac259b..d654921dd09b 100644
--- a/arch/arm64/kvm/hyp/reserved_mem.c
+++ b/arch/arm64/kvm/hyp/reserved_mem.c
@@ -71,8 +71,7 @@ void __init kvm_hyp_reserve(void)
 	}
 
 	hyp_mem_pages += hyp_s1_pgtable_pages();
-	hyp_mem_pages += host_s2_mem_pgtable_pages();
-	hyp_mem_pages += host_s2_dev_pgtable_pages();
+	hyp_mem_pages += host_s2_pgtable_pages();
 
 	/*
 	 * The hyp_vmemmap needs to be backed by pages, but these pages
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 5e0d40f9fb86..3155c9e778f0 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -126,6 +126,16 @@ static void *kvm_host_va(phys_addr_t phys)
 	return __va(phys);
 }
 
+static void clean_dcache_guest_page(void *va, size_t size)
+{
+	__clean_dcache_guest_page(va, size);
+}
+
+static void invalidate_icache_guest_page(void *va, size_t size)
+{
+	__invalidate_icache_guest_page(va, size);
+}
+
 /*
  * Unmapping vs dcache management:
  *
@@ -432,6 +442,8 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
 	.page_count		= kvm_host_page_count,
 	.phys_to_virt		= kvm_host_va,
 	.virt_to_phys		= kvm_host_pa,
+	.dcache_clean_inval_poc	= clean_dcache_guest_page,
+	.icache_inval_pou	= invalidate_icache_guest_page,
 };
 
 /**
@@ -693,16 +705,6 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
 }
 
-static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
-	__clean_dcache_guest_page(pfn, size);
-}
-
-static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
-	__invalidate_icache_guest_page(pfn, size);
-}
-
 static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
 {
 	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
@@ -822,6 +824,74 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
 	return PAGE_SIZE;
 }
 
+static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
+{
+	unsigned long pa;
+
+	if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
+		return huge_page_shift(hstate_vma(vma));
+
+	if (!(vma->vm_flags & VM_PFNMAP))
+		return PAGE_SHIFT;
+
+	VM_BUG_ON(is_vm_hugetlb_page(vma));
+
+	pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);
+
+#ifndef __PAGETABLE_PMD_FOLDED
+	if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
+	    ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
+	    ALIGN(hva, PUD_SIZE) <= vma->vm_end)
+		return PUD_SHIFT;
+#endif
+
+	if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
+	    ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
+	    ALIGN(hva, PMD_SIZE) <= vma->vm_end)
+		return PMD_SHIFT;
+
+	return PAGE_SHIFT;
+}
+
+/*
+ * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be
+ * able to see the page's tags and therefore they must be initialised first. If
+ * PG_mte_tagged is set, tags have already been initialised.
+ *
+ * The race in the test/set of the PG_mte_tagged flag is handled by:
+ * - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs
+ *   racing to santise the same page
+ * - mmap_lock protects between a VM faulting a page in and the VMM performing
+ *   an mprotect() to add VM_MTE
+ */
+static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
+			     unsigned long size)
+{
+	unsigned long i, nr_pages = size >> PAGE_SHIFT;
+	struct page *page;
+
+	if (!kvm_has_mte(kvm))
+		return 0;
+
+	/*
+	 * pfn_to_online_page() is used to reject ZONE_DEVICE pages
+	 * that may not support tags.
+	 */
+	page = pfn_to_online_page(pfn);
+
+	if (!page)
+		return -EFAULT;
+
+	for (i = 0; i < nr_pages; i++, page++) {
+		if (!test_bit(PG_mte_tagged, &page->flags)) {
+			mte_clear_page_tags(page_address(page));
+			set_bit(PG_mte_tagged, &page->flags);
+		}
+	}
+
+	return 0;
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_memory_slot *memslot, unsigned long hva,
 			  unsigned long fault_status)
@@ -830,6 +900,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	bool write_fault, writable, force_pte = false;
 	bool exec_fault;
 	bool device = false;
+	bool shared;
 	unsigned long mmu_seq;
 	struct kvm *kvm = vcpu->kvm;
 	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
@@ -853,7 +924,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 	}
 
-	/* Let's check if we will get back a huge page backed by hugetlbfs */
+	/*
+	 * Let's check if we will get back a huge page backed by hugetlbfs, or
+	 * get block mapping for device MMIO region.
+	 */
 	mmap_read_lock(current->mm);
 	vma = vma_lookup(current->mm, hva);
 	if (unlikely(!vma)) {
@@ -862,17 +936,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 	}
 
-	if (is_vm_hugetlb_page(vma))
-		vma_shift = huge_page_shift(hstate_vma(vma));
-	else
-		vma_shift = PAGE_SHIFT;
-
-	if (logging_active ||
-	    (vma->vm_flags & VM_PFNMAP)) {
+	/*
+	 * logging_active is guaranteed to never be true for VM_PFNMAP
+	 * memslots.
+	 */
+	if (logging_active) {
 		force_pte = true;
 		vma_shift = PAGE_SHIFT;
+	} else {
+		vma_shift = get_vma_page_shift(vma, hva);
 	}
 
+	shared = (vma->vm_flags & VM_PFNMAP);
+
 	switch (vma_shift) {
 #ifndef __PAGETABLE_PMD_FOLDED
 	case PUD_SHIFT:
@@ -943,8 +1019,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 
 	if (kvm_is_device_pfn(pfn)) {
+		/*
+		 * If the page was identified as device early by looking at
+		 * the VMA flags, vma_pagesize is already representing the
+		 * largest quantity we can map.  If instead it was mapped
+		 * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE
+		 * and must not be upgraded.
+		 *
+		 * In both cases, we don't let transparent_hugepage_adjust()
+		 * change things at the last minute.
+		 */
 		device = true;
-		force_pte = true;
 	} else if (logging_active && !write_fault) {
 		/*
 		 * Only actually map the page as writable if this was a write
@@ -965,19 +1050,25 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	 * If we are not forced to use page mapping, check if we are
 	 * backed by a THP and thus use block mapping if possible.
 	 */
-	if (vma_pagesize == PAGE_SIZE && !force_pte)
+	if (vma_pagesize == PAGE_SIZE && !(force_pte || device))
 		vma_pagesize = transparent_hugepage_adjust(memslot, hva,
 							   &pfn, &fault_ipa);
+
+	if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {
+		/* Check the VMM hasn't introduced a new VM_SHARED VMA */
+		if (!shared)
+			ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
+		else
+			ret = -EFAULT;
+		if (ret)
+			goto out_unlock;
+	}
+
 	if (writable)
 		prot |= KVM_PGTABLE_PROT_W;
 
-	if (fault_status != FSC_PERM && !device)
-		clean_dcache_guest_page(pfn, vma_pagesize);
-
-	if (exec_fault) {
+	if (exec_fault)
 		prot |= KVM_PGTABLE_PROT_X;
-		invalidate_icache_guest_page(pfn, vma_pagesize);
-	}
 
 	if (device)
 		prot |= KVM_PGTABLE_PROT_DEVICE;
@@ -1168,19 +1259,22 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	kvm_pfn_t pfn = pte_pfn(range->pte);
+	int ret;
 
 	if (!kvm->arch.mmu.pgt)
 		return false;
 
 	WARN_ON(range->end - range->start != 1);
 
-	/*
-	 * We've moved a page around, probably through CoW, so let's treat it
-	 * just like a translation fault and clean the cache to the PoC.
-	 */
-	clean_dcache_guest_page(pfn, PAGE_SIZE);
+	ret = sanitise_mte_tags(kvm, pfn, PAGE_SIZE);
+	if (ret)
+		return false;
 
 	/*
+	 * We've moved a page around, probably through CoW, so let's treat
+	 * it just like a translation fault and the map handler will clean
+	 * the cache to the PoC.
+	 *
 	 * The MMU notifiers will have unmapped a huge PMD before calling
 	 * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
 	 * therefore we never need to clear out a huge PMD through this
@@ -1346,7 +1440,6 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 {
 	hva_t hva = mem->userspace_addr;
 	hva_t reg_end = hva + mem->memory_size;
-	bool writable = !(mem->flags & KVM_MEM_READONLY);
 	int ret = 0;
 
 	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
@@ -1363,8 +1456,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	mmap_read_lock(current->mm);
 	/*
 	 * A memory region could potentially cover multiple VMAs, and any holes
-	 * between them, so iterate over all of them to find out if we can map
-	 * any of them right now.
+	 * between them, so iterate over all of them.
 	 *
 	 *     +--------------------------------------------+
 	 * +---------------+----------------+   +----------------+
@@ -1375,51 +1467,29 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	 */
 	do {
 		struct vm_area_struct *vma;
-		hva_t vm_start, vm_end;
 
 		vma = find_vma_intersection(current->mm, hva, reg_end);
 		if (!vma)
 			break;
 
 		/*
-		 * Take the intersection of this VMA with the memory region
+		 * VM_SHARED mappings are not allowed with MTE to avoid races
+		 * when updating the PG_mte_tagged page flag, see
+		 * sanitise_mte_tags for more details.
 		 */
-		vm_start = max(hva, vma->vm_start);
-		vm_end = min(reg_end, vma->vm_end);
+		if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED)
+			return -EINVAL;
 
 		if (vma->vm_flags & VM_PFNMAP) {
-			gpa_t gpa = mem->guest_phys_addr +
-				    (vm_start - mem->userspace_addr);
-			phys_addr_t pa;
-
-			pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
-			pa += vm_start - vma->vm_start;
-
 			/* IO region dirty page logging not allowed */
 			if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
 				ret = -EINVAL;
-				goto out;
-			}
-
-			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
-						    vm_end - vm_start,
-						    writable);
-			if (ret)
 				break;
+			}
 		}
-		hva = vm_end;
+		hva = min(reg_end, vma->vm_end);
 	} while (hva < reg_end);
 
-	if (change == KVM_MR_FLAGS_ONLY)
-		goto out;
-
-	spin_lock(&kvm->mmu_lock);
-	if (ret)
-		unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size);
-	else if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
-		stage2_flush_memslot(kvm, memslot);
-	spin_unlock(&kvm->mmu_lock);
-out:
 	mmap_read_unlock(current->mm);
 	return ret;
 }
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index fd167d4f4215..f33825c995cb 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -578,6 +578,7 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
 		kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0);
 
 	if (val & ARMV8_PMU_PMCR_P) {
+		mask &= ~BIT(ARMV8_PMU_CYCLE_IDX);
 		for_each_set_bit(i, &mask, 32)
 			kvm_pmu_set_counter_value(vcpu, i, 0);
 	}
@@ -850,6 +851,9 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
 		   return -EINVAL;
 	}
 
+	/* One-off reload of the PMU on first run */
+	kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
+
 	return 0;
 }
 
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index d37ebee085cf..cba7872d69a8 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -176,6 +176,10 @@ static bool vcpu_allowed_register_width(struct kvm_vcpu *vcpu)
 	if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1) && is32bit)
 		return false;
 
+	/* MTE is incompatible with AArch32 */
+	if (kvm_has_mte(vcpu->kvm) && is32bit)
+		return false;
+
 	/* Check that the vcpus are either all 32bit or all 64bit */
 	kvm_for_each_vcpu(i, tmp, vcpu->kvm) {
 		if (vcpu_has_feature(tmp, KVM_ARM_VCPU_EL1_32BIT) != is32bit)
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 1a7968ad078c..f6f126eb6ac1 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1047,6 +1047,13 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
 		break;
 	case SYS_ID_AA64PFR1_EL1:
 		val &= ~FEATURE(ID_AA64PFR1_MTE);
+		if (kvm_has_mte(vcpu->kvm)) {
+			u64 pfr, mte;
+
+			pfr = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
+			mte = cpuid_feature_extract_unsigned_field(pfr, ID_AA64PFR1_MTE_SHIFT);
+			val |= FIELD_PREP(FEATURE(ID_AA64PFR1_MTE), mte);
+		}
 		break;
 	case SYS_ID_AA64ISAR1_EL1:
 		if (!vcpu_has_ptrauth(vcpu))
@@ -1302,6 +1309,23 @@ static bool access_ccsidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
 	return true;
 }
 
+static unsigned int mte_visibility(const struct kvm_vcpu *vcpu,
+				   const struct sys_reg_desc *rd)
+{
+	if (kvm_has_mte(vcpu->kvm))
+		return 0;
+
+	return REG_HIDDEN;
+}
+
+#define MTE_REG(name) {				\
+	SYS_DESC(SYS_##name),			\
+	.access = undef_access,			\
+	.reset = reset_unknown,			\
+	.reg = name,				\
+	.visibility = mte_visibility,		\
+}
+
 /* sys_reg_desc initialiser for known cpufeature ID registers */
 #define ID_SANITISED(name) {			\
 	SYS_DESC(SYS_##name),			\
@@ -1470,8 +1494,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 	{ SYS_DESC(SYS_ACTLR_EL1), access_actlr, reset_actlr, ACTLR_EL1 },
 	{ SYS_DESC(SYS_CPACR_EL1), NULL, reset_val, CPACR_EL1, 0 },
 
-	{ SYS_DESC(SYS_RGSR_EL1), undef_access },
-	{ SYS_DESC(SYS_GCR_EL1), undef_access },
+	MTE_REG(RGSR_EL1),
+	MTE_REG(GCR_EL1),
 
 	{ SYS_DESC(SYS_ZCR_EL1), NULL, reset_val, ZCR_EL1, 0, .visibility = sve_visibility },
 	{ SYS_DESC(SYS_TRFCR_EL1), undef_access },
@@ -1498,8 +1522,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 	{ SYS_DESC(SYS_ERXMISC0_EL1), trap_raz_wi },
 	{ SYS_DESC(SYS_ERXMISC1_EL1), trap_raz_wi },
 
-	{ SYS_DESC(SYS_TFSR_EL1), undef_access },
-	{ SYS_DESC(SYS_TFSRE0_EL1), undef_access },
+	MTE_REG(TFSR_EL1),
+	MTE_REG(TFSRE0_EL1),
 
 	{ SYS_DESC(SYS_FAR_EL1), access_vm_reg, reset_unknown, FAR_EL1 },
 	{ SYS_DESC(SYS_PAR_EL1), NULL, reset_unknown, PAR_EL1 },
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 58cbda00e56d..340c51d87677 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -482,6 +482,16 @@ static irqreturn_t vgic_maintenance_handler(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
+static struct gic_kvm_info *gic_kvm_info;
+
+void __init vgic_set_kvm_info(const struct gic_kvm_info *info)
+{
+	BUG_ON(gic_kvm_info != NULL);
+	gic_kvm_info = kmalloc(sizeof(*info), GFP_KERNEL);
+	if (gic_kvm_info)
+		*gic_kvm_info = *info;
+}
+
 /**
  * kvm_vgic_init_cpu_hardware - initialize the GIC VE hardware
  *
@@ -509,18 +519,29 @@ void kvm_vgic_init_cpu_hardware(void)
  */
 int kvm_vgic_hyp_init(void)
 {
-	const struct gic_kvm_info *gic_kvm_info;
+	bool has_mask;
 	int ret;
 
-	gic_kvm_info = gic_get_kvm_info();
 	if (!gic_kvm_info)
 		return -ENODEV;
 
-	if (!gic_kvm_info->maint_irq) {
+	has_mask = !gic_kvm_info->no_maint_irq_mask;
+
+	if (has_mask && !gic_kvm_info->maint_irq) {
 		kvm_err("No vgic maintenance irq\n");
 		return -ENXIO;
 	}
 
+	/*
+	 * If we get one of these oddball non-GICs, taint the kernel,
+	 * as we have no idea of how they *really* behave.
+	 */
+	if (gic_kvm_info->no_hw_deactivation) {
+		kvm_info("Non-architectural vgic, tainting kernel\n");
+		add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+		kvm_vgic_global_state.no_hw_deactivation = true;
+	}
+
 	switch (gic_kvm_info->type) {
 	case GIC_V2:
 		ret = vgic_v2_probe(gic_kvm_info);
@@ -536,10 +557,17 @@ int kvm_vgic_hyp_init(void)
 		ret = -ENODEV;
 	}
 
+	kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq;
+
+	kfree(gic_kvm_info);
+	gic_kvm_info = NULL;
+
 	if (ret)
 		return ret;
 
-	kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq;
+	if (!has_mask)
+		return 0;
+
 	ret = request_percpu_irq(kvm_vgic_global_state.maint_irq,
 				 vgic_maintenance_handler,
 				 "vgic", kvm_get_running_vcpus());
diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
index 11934c2af2f4..2c580204f1dc 100644
--- a/arch/arm64/kvm/vgic/vgic-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-v2.c
@@ -108,11 +108,22 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
 		 * If this causes us to lower the level, we have to also clear
 		 * the physical active state, since we will otherwise never be
 		 * told when the interrupt becomes asserted again.
+		 *
+		 * Another case is when the interrupt requires a helping hand
+		 * on deactivation (no HW deactivation, for example).
 		 */
-		if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT)) {
-			irq->line_level = vgic_get_phys_line_level(irq);
+		if (vgic_irq_is_mapped_level(irq)) {
+			bool resample = false;
+
+			if (val & GICH_LR_PENDING_BIT) {
+				irq->line_level = vgic_get_phys_line_level(irq);
+				resample = !irq->line_level;
+			} else if (vgic_irq_needs_resampling(irq) &&
+				   !(irq->active || irq->pending_latch)) {
+				resample = true;
+			}
 
-			if (!irq->line_level)
+			if (resample)
 				vgic_irq_set_phys_active(irq, false);
 		}
 
@@ -152,7 +163,7 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
 	if (irq->group)
 		val |= GICH_LR_GROUP1;
 
-	if (irq->hw) {
+	if (irq->hw && !vgic_irq_needs_resampling(irq)) {
 		val |= GICH_LR_HW;
 		val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT;
 		/*
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 41ecf219c333..66004f61cd83 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -101,11 +101,22 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
 		 * If this causes us to lower the level, we have to also clear
 		 * the physical active state, since we will otherwise never be
 		 * told when the interrupt becomes asserted again.
+		 *
+		 * Another case is when the interrupt requires a helping hand
+		 * on deactivation (no HW deactivation, for example).
 		 */
-		if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) {
-			irq->line_level = vgic_get_phys_line_level(irq);
+		if (vgic_irq_is_mapped_level(irq)) {
+			bool resample = false;
+
+			if (val & ICH_LR_PENDING_BIT) {
+				irq->line_level = vgic_get_phys_line_level(irq);
+				resample = !irq->line_level;
+			} else if (vgic_irq_needs_resampling(irq) &&
+				   !(irq->active || irq->pending_latch)) {
+				resample = true;
+			}
 
-			if (!irq->line_level)
+			if (resample)
 				vgic_irq_set_phys_active(irq, false);
 		}
 
@@ -136,7 +147,7 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
 		}
 	}
 
-	if (irq->hw) {
+	if (irq->hw && !vgic_irq_needs_resampling(irq)) {
 		val |= ICH_LR_HW;
 		val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT;
 		/*
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 15b666200f0b..111bff47e471 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -182,8 +182,8 @@ bool vgic_get_phys_line_level(struct vgic_irq *irq)
 
 	BUG_ON(!irq->hw);
 
-	if (irq->get_input_level)
-		return irq->get_input_level(irq->intid);
+	if (irq->ops && irq->ops->get_input_level)
+		return irq->ops->get_input_level(irq->intid);
 
 	WARN_ON(irq_get_irqchip_state(irq->host_irq,
 				      IRQCHIP_STATE_PENDING,
@@ -480,7 +480,7 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
 /* @irq->irq_lock must be held */
 static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
 			    unsigned int host_irq,
-			    bool (*get_input_level)(int vindid))
+			    struct irq_ops *ops)
 {
 	struct irq_desc *desc;
 	struct irq_data *data;
@@ -500,7 +500,7 @@ static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
 	irq->hw = true;
 	irq->host_irq = host_irq;
 	irq->hwintid = data->hwirq;
-	irq->get_input_level = get_input_level;
+	irq->ops = ops;
 	return 0;
 }
 
@@ -509,11 +509,11 @@ static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq)
 {
 	irq->hw = false;
 	irq->hwintid = 0;
-	irq->get_input_level = NULL;
+	irq->ops = NULL;
 }
 
 int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
-			  u32 vintid, bool (*get_input_level)(int vindid))
+			  u32 vintid, struct irq_ops *ops)
 {
 	struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
 	unsigned long flags;
@@ -522,7 +522,7 @@ int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
 	BUG_ON(!irq);
 
 	raw_spin_lock_irqsave(&irq->irq_lock, flags);
-	ret = kvm_vgic_map_irq(vcpu, irq, host_irq, get_input_level);
+	ret = kvm_vgic_map_irq(vcpu, irq, host_irq, ops);
 	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
 	vgic_put_irq(vcpu->kvm, irq);
 
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index d31e1169d9b8..6dd56a49790a 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 lib-y		:= clear_user.o delay.o copy_from_user.o		\
 		   copy_to_user.o copy_in_user.o copy_page.o		\
-		   clear_page.o csum.o memchr.o memcpy.o memmove.o	\
+		   clear_page.o csum.o insn.o memchr.o memcpy.o		\
 		   memset.o memcmp.o strcmp.o strncmp.o strlen.o	\
 		   strnlen.o strchr.o strrchr.o tishift.o
 
@@ -18,3 +18,5 @@ obj-$(CONFIG_CRC32) += crc32.o
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 
 obj-$(CONFIG_ARM64_MTE) += mte.o
+
+obj-$(CONFIG_KASAN_SW_TAGS) += kasan_sw_tags.o
diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
index af9afcbec92c..a7efb2ad2a1c 100644
--- a/arch/arm64/lib/clear_user.S
+++ b/arch/arm64/lib/clear_user.S
@@ -1,12 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Based on arch/arm/lib/clear_user.S
- *
- * Copyright (C) 2012 ARM Ltd.
+ * Copyright (C) 2021 Arm Ltd.
  */
-#include <linux/linkage.h>
 
-#include <asm/asm-uaccess.h>
+#include <linux/linkage.h>
 #include <asm/assembler.h>
 
 	.text
@@ -19,25 +16,33 @@
  *
  * Alignment fixed up by hardware.
  */
+
+	.p2align 4
+	// Alignment is for the loop, but since the prologue (including BTI)
+	// is also 16 bytes we can keep any padding outside the function
 SYM_FUNC_START(__arch_clear_user)
-	mov	x2, x1			// save the size for fixup return
+	add	x2, x0, x1
 	subs	x1, x1, #8
 	b.mi	2f
 1:
-user_ldst 9f, sttr, xzr, x0, 8
+USER(9f, sttr	xzr, [x0])
+	add	x0, x0, #8
 	subs	x1, x1, #8
-	b.pl	1b
-2:	adds	x1, x1, #4
-	b.mi	3f
-user_ldst 9f, sttr, wzr, x0, 4
-	sub	x1, x1, #4
-3:	adds	x1, x1, #2
-	b.mi	4f
-user_ldst 9f, sttrh, wzr, x0, 2
-	sub	x1, x1, #2
-4:	adds	x1, x1, #1
-	b.mi	5f
-user_ldst 9f, sttrb, wzr, x0, 0
+	b.hi	1b
+USER(9f, sttr	xzr, [x2, #-8])
+	mov	x0, #0
+	ret
+
+2:	tbz	x1, #2, 3f
+USER(9f, sttr	wzr, [x0])
+USER(8f, sttr	wzr, [x2, #-4])
+	mov	x0, #0
+	ret
+
+3:	tbz	x1, #1, 4f
+USER(9f, sttrh	wzr, [x0])
+4:	tbz	x1, #0, 5f
+USER(7f, sttrb	wzr, [x2, #-1])
 5:	mov	x0, #0
 	ret
 SYM_FUNC_END(__arch_clear_user)
@@ -45,6 +50,8 @@ EXPORT_SYMBOL(__arch_clear_user)
 
 	.section .fixup,"ax"
 	.align	2
-9:	mov	x0, x2			// return the original size
+7:	sub	x0, x2, #5	// Adjust for faulting on the final byte...
+8:	add	x0, x0, #4	// ...or the second word of the 4-7 byte case
+9:	sub	x0, x2, x0
 	ret
 	.previous
diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/lib/insn.c
index 6c0de2f60ea9..b506a4b1e38c 100644
--- a/arch/arm64/kernel/insn.c
+++ b/arch/arm64/lib/insn.c
@@ -7,21 +7,14 @@
  */
 #include <linux/bitops.h>
 #include <linux/bug.h>
-#include <linux/compiler.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/spinlock.h>
-#include <linux/stop_machine.h>
+#include <linux/printk.h>
+#include <linux/sizes.h>
 #include <linux/types.h>
-#include <linux/uaccess.h>
 
-#include <asm/cacheflush.h>
 #include <asm/debug-monitors.h>
-#include <asm/fixmap.h>
+#include <asm/errno.h>
 #include <asm/insn.h>
 #include <asm/kprobes.h>
-#include <asm/sections.h>
 
 #define AARCH64_INSN_SF_BIT	BIT(31)
 #define AARCH64_INSN_N_BIT	BIT(22)
@@ -30,7 +23,7 @@
 static const int aarch64_insn_encoding_class[] = {
 	AARCH64_INSN_CLS_UNKNOWN,
 	AARCH64_INSN_CLS_UNKNOWN,
-	AARCH64_INSN_CLS_UNKNOWN,
+	AARCH64_INSN_CLS_SVE,
 	AARCH64_INSN_CLS_UNKNOWN,
 	AARCH64_INSN_CLS_LDST,
 	AARCH64_INSN_CLS_DP_REG,
@@ -83,81 +76,6 @@ bool aarch64_insn_is_branch_imm(u32 insn)
 		aarch64_insn_is_bcond(insn));
 }
 
-static DEFINE_RAW_SPINLOCK(patch_lock);
-
-static bool is_exit_text(unsigned long addr)
-{
-	/* discarded with init text/data */
-	return system_state < SYSTEM_RUNNING &&
-		addr >= (unsigned long)__exittext_begin &&
-		addr < (unsigned long)__exittext_end;
-}
-
-static bool is_image_text(unsigned long addr)
-{
-	return core_kernel_text(addr) || is_exit_text(addr);
-}
-
-static void __kprobes *patch_map(void *addr, int fixmap)
-{
-	unsigned long uintaddr = (uintptr_t) addr;
-	bool image = is_image_text(uintaddr);
-	struct page *page;
-
-	if (image)
-		page = phys_to_page(__pa_symbol(addr));
-	else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
-		page = vmalloc_to_page(addr);
-	else
-		return addr;
-
-	BUG_ON(!page);
-	return (void *)set_fixmap_offset(fixmap, page_to_phys(page) +
-			(uintaddr & ~PAGE_MASK));
-}
-
-static void __kprobes patch_unmap(int fixmap)
-{
-	clear_fixmap(fixmap);
-}
-/*
- * In ARMv8-A, A64 instructions have a fixed length of 32 bits and are always
- * little-endian.
- */
-int __kprobes aarch64_insn_read(void *addr, u32 *insnp)
-{
-	int ret;
-	__le32 val;
-
-	ret = copy_from_kernel_nofault(&val, addr, AARCH64_INSN_SIZE);
-	if (!ret)
-		*insnp = le32_to_cpu(val);
-
-	return ret;
-}
-
-static int __kprobes __aarch64_insn_write(void *addr, __le32 insn)
-{
-	void *waddr = addr;
-	unsigned long flags = 0;
-	int ret;
-
-	raw_spin_lock_irqsave(&patch_lock, flags);
-	waddr = patch_map(addr, FIX_TEXT_POKE0);
-
-	ret = copy_to_kernel_nofault(waddr, &insn, AARCH64_INSN_SIZE);
-
-	patch_unmap(FIX_TEXT_POKE0);
-	raw_spin_unlock_irqrestore(&patch_lock, flags);
-
-	return ret;
-}
-
-int __kprobes aarch64_insn_write(void *addr, u32 insn)
-{
-	return __aarch64_insn_write(addr, cpu_to_le32(insn));
-}
-
 bool __kprobes aarch64_insn_uses_literal(u32 insn)
 {
 	/* ldr/ldrsw (literal), prfm */
@@ -187,67 +105,6 @@ bool __kprobes aarch64_insn_is_branch(u32 insn)
 		aarch64_insn_is_bcond(insn);
 }
 
-int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn)
-{
-	u32 *tp = addr;
-	int ret;
-
-	/* A64 instructions must be word aligned */
-	if ((uintptr_t)tp & 0x3)
-		return -EINVAL;
-
-	ret = aarch64_insn_write(tp, insn);
-	if (ret == 0)
-		__flush_icache_range((uintptr_t)tp,
-				     (uintptr_t)tp + AARCH64_INSN_SIZE);
-
-	return ret;
-}
-
-struct aarch64_insn_patch {
-	void		**text_addrs;
-	u32		*new_insns;
-	int		insn_cnt;
-	atomic_t	cpu_count;
-};
-
-static int __kprobes aarch64_insn_patch_text_cb(void *arg)
-{
-	int i, ret = 0;
-	struct aarch64_insn_patch *pp = arg;
-
-	/* The first CPU becomes master */
-	if (atomic_inc_return(&pp->cpu_count) == 1) {
-		for (i = 0; ret == 0 && i < pp->insn_cnt; i++)
-			ret = aarch64_insn_patch_text_nosync(pp->text_addrs[i],
-							     pp->new_insns[i]);
-		/* Notify other processors with an additional increment. */
-		atomic_inc(&pp->cpu_count);
-	} else {
-		while (atomic_read(&pp->cpu_count) <= num_online_cpus())
-			cpu_relax();
-		isb();
-	}
-
-	return ret;
-}
-
-int __kprobes aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt)
-{
-	struct aarch64_insn_patch patch = {
-		.text_addrs = addrs,
-		.new_insns = insns,
-		.insn_cnt = cnt,
-		.cpu_count = ATOMIC_INIT(0),
-	};
-
-	if (cnt <= 0)
-		return -EINVAL;
-
-	return stop_machine_cpuslocked(aarch64_insn_patch_text_cb, &patch,
-				       cpu_online_mask);
-}
-
 static int __kprobes aarch64_get_imm_shift_mask(enum aarch64_insn_imm_type type,
 						u32 *maskp, int *shiftp)
 {
@@ -1432,104 +1289,6 @@ u32 aarch32_insn_mcr_extract_crm(u32 insn)
 	return insn & CRM_MASK;
 }
 
-static bool __kprobes __check_eq(unsigned long pstate)
-{
-	return (pstate & PSR_Z_BIT) != 0;
-}
-
-static bool __kprobes __check_ne(unsigned long pstate)
-{
-	return (pstate & PSR_Z_BIT) == 0;
-}
-
-static bool __kprobes __check_cs(unsigned long pstate)
-{
-	return (pstate & PSR_C_BIT) != 0;
-}
-
-static bool __kprobes __check_cc(unsigned long pstate)
-{
-	return (pstate & PSR_C_BIT) == 0;
-}
-
-static bool __kprobes __check_mi(unsigned long pstate)
-{
-	return (pstate & PSR_N_BIT) != 0;
-}
-
-static bool __kprobes __check_pl(unsigned long pstate)
-{
-	return (pstate & PSR_N_BIT) == 0;
-}
-
-static bool __kprobes __check_vs(unsigned long pstate)
-{
-	return (pstate & PSR_V_BIT) != 0;
-}
-
-static bool __kprobes __check_vc(unsigned long pstate)
-{
-	return (pstate & PSR_V_BIT) == 0;
-}
-
-static bool __kprobes __check_hi(unsigned long pstate)
-{
-	pstate &= ~(pstate >> 1);	/* PSR_C_BIT &= ~PSR_Z_BIT */
-	return (pstate & PSR_C_BIT) != 0;
-}
-
-static bool __kprobes __check_ls(unsigned long pstate)
-{
-	pstate &= ~(pstate >> 1);	/* PSR_C_BIT &= ~PSR_Z_BIT */
-	return (pstate & PSR_C_BIT) == 0;
-}
-
-static bool __kprobes __check_ge(unsigned long pstate)
-{
-	pstate ^= (pstate << 3);	/* PSR_N_BIT ^= PSR_V_BIT */
-	return (pstate & PSR_N_BIT) == 0;
-}
-
-static bool __kprobes __check_lt(unsigned long pstate)
-{
-	pstate ^= (pstate << 3);	/* PSR_N_BIT ^= PSR_V_BIT */
-	return (pstate & PSR_N_BIT) != 0;
-}
-
-static bool __kprobes __check_gt(unsigned long pstate)
-{
-	/*PSR_N_BIT ^= PSR_V_BIT */
-	unsigned long temp = pstate ^ (pstate << 3);
-
-	temp |= (pstate << 1);	/*PSR_N_BIT |= PSR_Z_BIT */
-	return (temp & PSR_N_BIT) == 0;
-}
-
-static bool __kprobes __check_le(unsigned long pstate)
-{
-	/*PSR_N_BIT ^= PSR_V_BIT */
-	unsigned long temp = pstate ^ (pstate << 3);
-
-	temp |= (pstate << 1);	/*PSR_N_BIT |= PSR_Z_BIT */
-	return (temp & PSR_N_BIT) != 0;
-}
-
-static bool __kprobes __check_al(unsigned long pstate)
-{
-	return true;
-}
-
-/*
- * Note that the ARMv8 ARM calls condition code 0b1111 "nv", but states that
- * it behaves identically to 0b1110 ("al").
- */
-pstate_check_t * const aarch32_opcode_cond_checks[16] = {
-	__check_eq, __check_ne, __check_cs, __check_cc,
-	__check_mi, __check_pl, __check_vs, __check_vc,
-	__check_hi, __check_ls, __check_ge, __check_lt,
-	__check_gt, __check_le, __check_al, __check_al
-};
-
 static bool range_of_ones(u64 val)
 {
 	/* Doesn't handle full ones or full zeroes */
diff --git a/arch/arm64/lib/kasan_sw_tags.S b/arch/arm64/lib/kasan_sw_tags.S
new file mode 100644
index 000000000000..5b04464c045e
--- /dev/null
+++ b/arch/arm64/lib/kasan_sw_tags.S
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 Google LLC
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * Report a tag mismatch detected by tag-based KASAN.
+ *
+ * A compiler-generated thunk calls this with a non-AAPCS calling
+ * convention. Upon entry to this function, registers are as follows:
+ *
+ * x0:         fault address (see below for restore)
+ * x1:         fault description (see below for restore)
+ * x2 to x15:  callee-saved
+ * x16 to x17: safe to clobber
+ * x18 to x30: callee-saved
+ * sp:         pre-decremented by 256 bytes (see below for restore)
+ *
+ * The caller has decremented the SP by 256 bytes, and created a
+ * structure on the stack as follows:
+ *
+ * sp + 0..15:    x0 and x1 to be restored
+ * sp + 16..231:  free for use
+ * sp + 232..247: x29 and x30 (same as in GPRs)
+ * sp + 248..255: free for use
+ *
+ * Note that this is not a struct pt_regs.
+ *
+ * To call a regular AAPCS function we must save x2 to x15 (which we can
+ * store in the gaps), and create a frame record (for which we can use
+ * x29 and x30 spilled by the caller as those match the GPRs).
+ *
+ * The caller expects x0 and x1 to be restored from the structure, and
+ * for the structure to be removed from the stack (i.e. the SP must be
+ * incremented by 256 prior to return).
+ */
+SYM_CODE_START(__hwasan_tag_mismatch)
+#ifdef BTI_C
+	BTI_C
+#endif
+	add	x29, sp, #232
+	stp	x2, x3, [sp, #8 * 2]
+	stp	x4, x5, [sp, #8 * 4]
+	stp	x6, x7, [sp, #8 * 6]
+	stp	x8, x9, [sp, #8 * 8]
+	stp	x10, x11, [sp, #8 * 10]
+	stp	x12, x13, [sp, #8 * 12]
+	stp	x14, x15, [sp, #8 * 14]
+#ifndef CONFIG_SHADOW_CALL_STACK
+	str	x18, [sp, #8 * 18]
+#endif
+
+	mov	x2, x30
+	bl	kasan_tag_mismatch
+
+	ldp	x0, x1, [sp]
+	ldp	x2, x3, [sp, #8 * 2]
+	ldp	x4, x5, [sp, #8 * 4]
+	ldp	x6, x7, [sp, #8 * 6]
+	ldp	x8, x9, [sp, #8 * 8]
+	ldp	x10, x11, [sp, #8 * 10]
+	ldp	x12, x13, [sp, #8 * 12]
+	ldp	x14, x15, [sp, #8 * 14]
+#ifndef CONFIG_SHADOW_CALL_STACK
+	ldr	x18, [sp, #8 * 18]
+#endif
+	ldp	x29, x30, [sp, #8 * 29]
+
+	/* remove the structure from the stack */
+	add	sp, sp, #256
+	ret
+SYM_CODE_END(__hwasan_tag_mismatch)
+EXPORT_SYMBOL(__hwasan_tag_mismatch)
diff --git a/arch/arm64/lib/memchr.S b/arch/arm64/lib/memchr.S
index edf6b970a277..7c2276fdab54 100644
--- a/arch/arm64/lib/memchr.S
+++ b/arch/arm64/lib/memchr.S
@@ -1,9 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Based on arch/arm/lib/memchr.S
- *
- * Copyright (C) 1995-2000 Russell King
- * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2021 Arm Ltd.
  */
 
 #include <linux/linkage.h>
@@ -19,16 +16,60 @@
  * Returns:
  *	x0 - address of first occurrence of 'c' or 0
  */
+
+#define L(label) .L ## label
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
+#define srcin		x0
+#define chrin		w1
+#define cntin		x2
+
+#define result		x0
+
+#define wordcnt		x3
+#define rep01		x4
+#define repchr		x5
+#define cur_word	x6
+#define cur_byte	w6
+#define tmp		x7
+#define tmp2		x8
+
+	.p2align 4
+	nop
 SYM_FUNC_START_WEAK_PI(memchr)
-	and	w1, w1, #0xff
-1:	subs	x2, x2, #1
-	b.mi	2f
-	ldrb	w3, [x0], #1
-	cmp	w3, w1
-	b.ne	1b
-	sub	x0, x0, #1
+	and	chrin, chrin, #0xff
+	lsr	wordcnt, cntin, #3
+	cbz	wordcnt, L(byte_loop)
+	mov	rep01, #REP8_01
+	mul	repchr, x1, rep01
+	and	cntin, cntin, #7
+L(word_loop):
+	ldr	cur_word, [srcin], #8
+	sub	wordcnt, wordcnt, #1
+	eor	cur_word, cur_word, repchr
+	sub	tmp, cur_word, rep01
+	orr	tmp2, cur_word, #REP8_7f
+	bics	tmp, tmp, tmp2
+	b.ne	L(found_word)
+	cbnz	wordcnt, L(word_loop)
+L(byte_loop):
+	cbz	cntin, L(not_found)
+	ldrb	cur_byte, [srcin], #1
+	sub	cntin, cntin, #1
+	cmp	cur_byte, chrin
+	b.ne	L(byte_loop)
+	sub	srcin, srcin, #1
+	ret
+L(found_word):
+CPU_LE(	rev	tmp, tmp)
+	clz	tmp, tmp
+	sub	tmp, tmp, #64
+	add	result, srcin, tmp, asr #3
 	ret
-2:	mov	x0, #0
+L(not_found):
+	mov	result, #0
 	ret
 SYM_FUNC_END_PI(memchr)
 EXPORT_SYMBOL_NOKASAN(memchr)
diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S
index c0671e793ea9..7d956384222f 100644
--- a/arch/arm64/lib/memcmp.S
+++ b/arch/arm64/lib/memcmp.S
@@ -1,247 +1,139 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2013-2021, Arm Limited.
  *
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * Adapted from the original at:
+ * https://github.com/ARM-software/optimized-routines/blob/e823e3abf5f89ecb/string/aarch64/memcmp.S
  */
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
-/*
-* compare memory areas(when two memory areas' offset are different,
-* alignment handled by the hardware)
-*
-* Parameters:
-*  x0 - const memory area 1 pointer
-*  x1 - const memory area 2 pointer
-*  x2 - the maximal compare byte length
-* Returns:
-*  x0 - a compare result, maybe less than, equal to, or greater than ZERO
-*/
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ */
+
+#define L(label) .L ## label
 
 /* Parameters and result.  */
-src1		.req	x0
-src2		.req	x1
-limit		.req	x2
-result		.req	x0
+#define src1		x0
+#define src2		x1
+#define limit		x2
+#define result		w0
 
 /* Internal variables.  */
-data1		.req	x3
-data1w		.req	w3
-data2		.req	x4
-data2w		.req	w4
-has_nul		.req	x5
-diff		.req	x6
-endloop		.req	x7
-tmp1		.req	x8
-tmp2		.req	x9
-tmp3		.req	x10
-pos		.req	x11
-limit_wd	.req	x12
-mask		.req	x13
+#define data1		x3
+#define data1w		w3
+#define data1h		x4
+#define data2		x5
+#define data2w		w5
+#define data2h		x6
+#define tmp1		x7
+#define tmp2		x8
 
 SYM_FUNC_START_WEAK_PI(memcmp)
-	cbz	limit, .Lret0
-	eor	tmp1, src1, src2
-	tst	tmp1, #7
-	b.ne	.Lmisaligned8
-	ands	tmp1, src1, #7
-	b.ne	.Lmutual_align
-	sub	limit_wd, limit, #1 /* limit != 0, so no underflow.  */
-	lsr	limit_wd, limit_wd, #3 /* Convert to Dwords.  */
-	/*
-	* The input source addresses are at alignment boundary.
-	* Directly compare eight bytes each time.
-	*/
-.Lloop_aligned:
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-.Lstart_realigned:
-	subs	limit_wd, limit_wd, #1
-	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	csinv	endloop, diff, xzr, cs	/* Last Dword or differences.  */
-	cbz	endloop, .Lloop_aligned
-
-	/* Not reached the limit, must have found a diff.  */
-	tbz	limit_wd, #63, .Lnot_limit
-
-	/* Limit % 8 == 0 => the diff is in the last 8 bytes. */
-	ands	limit, limit, #7
-	b.eq	.Lnot_limit
-	/*
-	* The remained bytes less than 8. It is needed to extract valid data
-	* from last eight bytes of the intended memory range.
-	*/
-	lsl	limit, limit, #3	/* bytes-> bits.  */
-	mov	mask, #~0
-CPU_BE( lsr	mask, mask, limit )
-CPU_LE( lsl	mask, mask, limit )
-	bic	data1, data1, mask
-	bic	data2, data2, mask
-
-	orr	diff, diff, mask
-	b	.Lnot_limit
-
-.Lmutual_align:
-	/*
-	* Sources are mutually aligned, but are not currently at an
-	* alignment boundary. Round down the addresses and then mask off
-	* the bytes that precede the start point.
-	*/
-	bic	src1, src1, #7
-	bic	src2, src2, #7
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-	/*
-	* We can not add limit with alignment offset(tmp1) here. Since the
-	* addition probably make the limit overflown.
-	*/
-	sub	limit_wd, limit, #1/*limit != 0, so no underflow.*/
-	and	tmp3, limit_wd, #7
-	lsr	limit_wd, limit_wd, #3
-	add	tmp3, tmp3, tmp1
-	add	limit_wd, limit_wd, tmp3, lsr #3
-	add	limit, limit, tmp1/* Adjust the limit for the extra.  */
-
-	lsl	tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/
-	neg	tmp1, tmp1/* Bits to alignment -64.  */
-	mov	tmp2, #~0
-	/*mask off the non-intended bytes before the start address.*/
-CPU_BE( lsl	tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/
-	/* Little-endian.  Early bytes are at LSB.  */
-CPU_LE( lsr	tmp2, tmp2, tmp1 )
-
-	orr	data1, data1, tmp2
-	orr	data2, data2, tmp2
-	b	.Lstart_realigned
-
-	/*src1 and src2 have different alignment offset.*/
-.Lmisaligned8:
-	cmp	limit, #8
-	b.lo	.Ltiny8proc /*limit < 8: compare byte by byte*/
-
-	and	tmp1, src1, #7
-	neg	tmp1, tmp1
-	add	tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/
-	and	tmp2, src2, #7
-	neg	tmp2, tmp2
-	add	tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/
-	subs	tmp3, tmp1, tmp2
-	csel	pos, tmp1, tmp2, hi /*Choose the maximum.*/
-
-	sub	limit, limit, pos
-	/*compare the proceeding bytes in the first 8 byte segment.*/
-.Ltinycmp:
-	ldrb	data1w, [src1], #1
-	ldrb	data2w, [src2], #1
-	subs	pos, pos, #1
-	ccmp	data1w, data2w, #0, ne  /* NZCV = 0b0000.  */
-	b.eq	.Ltinycmp
-	cbnz	pos, 1f /*diff occurred before the last byte.*/
-	cmp	data1w, data2w
-	b.eq	.Lstart_align
-1:
-	sub	result, data1, data2
+	subs	limit, limit, 8
+	b.lo	L(less8)
+
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	cmp	data1, data2
+	b.ne	L(return)
+
+	subs	limit, limit, 8
+	b.gt	L(more16)
+
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+	b	L(return)
+
+L(more16):
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	cmp	data1, data2
+	bne	L(return)
+
+	/* Jump directly to comparing the last 16 bytes for 32 byte (or less)
+	   strings.  */
+	subs	limit, limit, 16
+	b.ls	L(last_bytes)
+
+	/* We overlap loads between 0-32 bytes at either side of SRC1 when we
+	   try to align, so limit it only to strings larger than 128 bytes.  */
+	cmp	limit, 96
+	b.ls	L(loop16)
+
+	/* Align src1 and adjust src2 with bytes not yet done.  */
+	and	tmp1, src1, 15
+	add	limit, limit, tmp1
+	sub	src1, src1, tmp1
+	sub	src2, src2, tmp1
+
+	/* Loop performing 16 bytes per iteration using aligned src1.
+	   Limit is pre-decremented by 16 and must be larger than zero.
+	   Exit if <= 16 bytes left to do or if the data is not equal.  */
+	.p2align 4
+L(loop16):
+	ldp	data1, data1h, [src1], 16
+	ldp	data2, data2h, [src2], 16
+	subs	limit, limit, 16
+	ccmp	data1, data2, 0, hi
+	ccmp	data1h, data2h, 0, eq
+	b.eq	L(loop16)
+
+	cmp	data1, data2
+	bne	L(return)
+	mov	data1, data1h
+	mov	data2, data2h
+	cmp	data1, data2
+	bne	L(return)
+
+	/* Compare last 1-16 bytes using unaligned access.  */
+L(last_bytes):
+	add	src1, src1, limit
+	add	src2, src2, limit
+	ldp	data1, data1h, [src1]
+	ldp	data2, data2h, [src2]
+	cmp	data1, data2
+	bne	L(return)
+	mov	data1, data1h
+	mov	data2, data2h
+	cmp	data1, data2
+
+	/* Compare data bytes and set return value to 0, -1 or 1.  */
+L(return):
+#ifndef __AARCH64EB__
+	rev	data1, data1
+	rev	data2, data2
+#endif
+	cmp	data1, data2
+L(ret_eq):
+	cset	result, ne
+	cneg	result, result, lo
 	ret
 
-.Lstart_align:
-	lsr	limit_wd, limit, #3
-	cbz	limit_wd, .Lremain8
-
-	ands	xzr, src1, #7
-	b.eq	.Lrecal_offset
-	/*process more leading bytes to make src1 aligned...*/
-	add	src1, src1, tmp3 /*backwards src1 to alignment boundary*/
-	add	src2, src2, tmp3
-	sub	limit, limit, tmp3
-	lsr	limit_wd, limit, #3
-	cbz	limit_wd, .Lremain8
-	/*load 8 bytes from aligned SRC1..*/
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-
-	subs	limit_wd, limit_wd, #1
-	eor	diff, data1, data2  /*Non-zero if differences found.*/
-	csinv	endloop, diff, xzr, ne
-	cbnz	endloop, .Lunequal_proc
-	/*How far is the current SRC2 from the alignment boundary...*/
-	and	tmp3, tmp3, #7
-
-.Lrecal_offset:/*src1 is aligned now..*/
-	neg	pos, tmp3
-.Lloopcmp_proc:
-	/*
-	* Divide the eight bytes into two parts. First,backwards the src2
-	* to an alignment boundary,load eight bytes and compare from
-	* the SRC2 alignment boundary. If all 8 bytes are equal,then start
-	* the second part's comparison. Otherwise finish the comparison.
-	* This special handle can garantee all the accesses are in the
-	* thread/task space in avoid to overrange access.
-	*/
-	ldr	data1, [src1,pos]
-	ldr	data2, [src2,pos]
-	eor	diff, data1, data2  /* Non-zero if differences found.  */
-	cbnz	diff, .Lnot_limit
-
-	/*The second part process*/
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-	eor	diff, data1, data2  /* Non-zero if differences found.  */
-	subs	limit_wd, limit_wd, #1
-	csinv	endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/
-	cbz	endloop, .Lloopcmp_proc
-.Lunequal_proc:
-	cbz	diff, .Lremain8
-
-/* There is difference occurred in the latest comparison. */
-.Lnot_limit:
-/*
-* For little endian,reverse the low significant equal bits into MSB,then
-* following CLZ can find how many equal bits exist.
-*/
-CPU_LE( rev	diff, diff )
-CPU_LE( rev	data1, data1 )
-CPU_LE( rev	data2, data2 )
-
-	/*
-	* The MS-non-zero bit of DIFF marks either the first bit
-	* that is different, or the end of the significant data.
-	* Shifting left now will bring the critical information into the
-	* top bits.
-	*/
-	clz	pos, diff
-	lsl	data1, data1, pos
-	lsl	data2, data2, pos
-	/*
-	* We need to zero-extend (char is unsigned) the value and then
-	* perform a signed subtraction.
-	*/
-	lsr	data1, data1, #56
-	sub	result, data1, data2, lsr #56
+	.p2align 4
+	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
+L(less8):
+	adds	limit, limit, 4
+	b.lo	L(less4)
+	ldr	data1w, [src1], 4
+	ldr	data2w, [src2], 4
+	cmp	data1w, data2w
+	b.ne	L(return)
+	sub	limit, limit, 4
+L(less4):
+	adds	limit, limit, 4
+	beq	L(ret_eq)
+L(byte_loop):
+	ldrb	data1w, [src1], 1
+	ldrb	data2w, [src2], 1
+	subs	limit, limit, 1
+	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
+	b.eq	L(byte_loop)
+	sub	result, data1w, data2w
 	ret
 
-.Lremain8:
-	/* Limit % 8 == 0 =>. all data are equal.*/
-	ands	limit, limit, #7
-	b.eq	.Lret0
-
-.Ltiny8proc:
-	ldrb	data1w, [src1], #1
-	ldrb	data2w, [src2], #1
-	subs	limit, limit, #1
-
-	ccmp	data1w, data2w, #0, ne  /* NZCV = 0b0000. */
-	b.eq	.Ltiny8proc
-	sub	result, data1, data2
-	ret
-.Lret0:
-	mov	result, #0
-	ret
 SYM_FUNC_END_PI(memcmp)
 EXPORT_SYMBOL_NOKASAN(memcmp)
diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index dc8d2a216a6e..b82fd64ee1e1 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -1,66 +1,252 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2012-2021, Arm Limited.
  *
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * Adapted from the original at:
+ * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
  */
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/cache.h>
 
-/*
- * Copy a buffer from src to dest (alignment handled by the hardware)
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
  *
- * Parameters:
- *	x0 - dest
- *	x1 - src
- *	x2 - n
- * Returns:
- *	x0 - dest
  */
-	.macro ldrb1 reg, ptr, val
-	ldrb  \reg, [\ptr], \val
-	.endm
-
-	.macro strb1 reg, ptr, val
-	strb \reg, [\ptr], \val
-	.endm
 
-	.macro ldrh1 reg, ptr, val
-	ldrh  \reg, [\ptr], \val
-	.endm
+#define L(label) .L ## label
 
-	.macro strh1 reg, ptr, val
-	strh \reg, [\ptr], \val
-	.endm
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define A_l	x6
+#define A_lw	w6
+#define A_h	x7
+#define B_l	x8
+#define B_lw	w8
+#define B_h	x9
+#define C_l	x10
+#define C_lw	w10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	x14
+#define E_h	x15
+#define F_l	x16
+#define F_h	x17
+#define G_l	count
+#define G_h	dst
+#define H_l	src
+#define H_h	srcend
+#define tmp1	x14
 
-	.macro ldr1 reg, ptr, val
-	ldr \reg, [\ptr], \val
-	.endm
+/* This implementation handles overlaps and supports both memcpy and memmove
+   from a single entry point.  It uses unaligned accesses and branchless
+   sequences to keep the code small, simple and improve performance.
 
-	.macro str1 reg, ptr, val
-	str \reg, [\ptr], \val
-	.endm
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check is negligible since it is only required for large copies.
 
-	.macro ldp1 reg1, reg2, ptr, val
-	ldp \reg1, \reg2, [\ptr], \val
-	.endm
-
-	.macro stp1 reg1, reg2, ptr, val
-	stp \reg1, \reg2, [\ptr], \val
-	.endm
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The destination pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
+*/
 
+SYM_FUNC_START_ALIAS(__memmove)
+SYM_FUNC_START_WEAK_ALIAS_PI(memmove)
 SYM_FUNC_START_ALIAS(__memcpy)
 SYM_FUNC_START_WEAK_PI(memcpy)
-#include "copy_template.S"
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 128
+	b.hi	L(copy_long)
+	cmp	count, 32
+	b.hi	L(copy32_128)
+
+	/* Small copies: 0..32 bytes.  */
+	cmp	count, 16
+	b.lo	L(copy16)
+	ldp	A_l, A_h, [src]
+	ldp	D_l, D_h, [srcend, -16]
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	/* Copy 8-15 bytes.  */
+L(copy16):
+	tbz	count, 3, L(copy8)
+	ldr	A_l, [src]
+	ldr	A_h, [srcend, -8]
+	str	A_l, [dstin]
+	str	A_h, [dstend, -8]
+	ret
+
+	.p2align 3
+	/* Copy 4-7 bytes.  */
+L(copy8):
+	tbz	count, 2, L(copy4)
+	ldr	A_lw, [src]
+	ldr	B_lw, [srcend, -4]
+	str	A_lw, [dstin]
+	str	B_lw, [dstend, -4]
+	ret
+
+	/* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+	cbz	count, L(copy0)
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	C_lw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	C_lw, [dstend, -1]
+L(copy0):
+	ret
+
+	.p2align 4
+	/* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+	ldp	A_l, A_h, [src]
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	ldp	D_l, D_h, [srcend, -16]
+	cmp	count, 64
+	b.hi	L(copy128)
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+	stp	D_l, D_h, [dstend, -16]
 	ret
+
+	.p2align 4
+	/* Copy 65..128 bytes.  */
+L(copy128):
+	ldp	E_l, E_h, [src, 32]
+	ldp	F_l, F_h, [src, 48]
+	cmp	count, 96
+	b.ls	L(copy96)
+	ldp	G_l, G_h, [srcend, -64]
+	ldp	H_l, H_h, [srcend, -48]
+	stp	G_l, G_h, [dstend, -64]
+	stp	H_l, H_h, [dstend, -48]
+L(copy96):
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	E_l, E_h, [dstin, 32]
+	stp	F_l, F_h, [dstin, 48]
+	stp	C_l, C_h, [dstend, -32]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Copy more than 128 bytes.  */
+L(copy_long):
+	/* Use backwards copy if there is an overlap.  */
+	sub	tmp1, dstin, src
+	cbz	tmp1, L(copy0)
+	cmp	tmp1, count
+	b.lo	L(copy_long_backwards)
+
+	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
+
+	ldp	D_l, D_h, [src]
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_l, A_h, [src, 16]
+	stp	D_l, D_h, [dstin]
+	ldp	B_l, B_h, [src, 32]
+	ldp	C_l, C_h, [src, 48]
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	L(copy64_from_end)
+
+L(loop64):
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]!
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 64
+	b.hi	L(loop64)
+
+	/* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
+	ldp	E_l, E_h, [srcend, -64]
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [srcend, -48]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [srcend, -16]
+	stp	D_l, D_h, [dst, 64]
+	stp	E_l, E_h, [dstend, -64]
+	stp	A_l, A_h, [dstend, -48]
+	stp	B_l, B_h, [dstend, -32]
+	stp	C_l, C_h, [dstend, -16]
+	ret
+
+	.p2align 4
+
+	/* Large backwards copy for overlapping copies.
+	   Copy 16 bytes and then align dst to 16-byte alignment.  */
+L(copy_long_backwards):
+	ldp	D_l, D_h, [srcend, -16]
+	and	tmp1, dstend, 15
+	sub	srcend, srcend, tmp1
+	sub	count, count, tmp1
+	ldp	A_l, A_h, [srcend, -16]
+	stp	D_l, D_h, [dstend, -16]
+	ldp	B_l, B_h, [srcend, -32]
+	ldp	C_l, C_h, [srcend, -48]
+	ldp	D_l, D_h, [srcend, -64]!
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	L(copy64_from_start)
+
+L(loop64_backwards):
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [srcend, -16]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [srcend, -48]
+	stp	D_l, D_h, [dstend, -64]!
+	ldp	D_l, D_h, [srcend, -64]!
+	subs	count, count, 64
+	b.hi	L(loop64_backwards)
+
+	/* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
+	ldp	G_l, G_h, [src, 48]
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [src, 32]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [src, 16]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [src]
+	stp	D_l, D_h, [dstend, -64]
+	stp	G_l, G_h, [dstin, 48]
+	stp	A_l, A_h, [dstin, 32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin]
+	ret
+
 SYM_FUNC_END_PI(memcpy)
 EXPORT_SYMBOL(memcpy)
 SYM_FUNC_END_ALIAS(__memcpy)
 EXPORT_SYMBOL(__memcpy)
+SYM_FUNC_END_ALIAS_PI(memmove)
+EXPORT_SYMBOL(memmove)
+SYM_FUNC_END_ALIAS(__memmove)
+EXPORT_SYMBOL(__memmove)
diff --git a/arch/arm64/lib/memmove.S b/arch/arm64/lib/memmove.S
deleted file mode 100644
index 1035dce4bdaf..000000000000
--- a/arch/arm64/lib/memmove.S
+++ /dev/null
@@ -1,189 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
- *
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <asm/cache.h>
-
-/*
- * Move a buffer from src to test (alignment handled by the hardware).
- * If dest <= src, call memcpy, otherwise copy in reverse order.
- *
- * Parameters:
- *	x0 - dest
- *	x1 - src
- *	x2 - n
- * Returns:
- *	x0 - dest
- */
-dstin	.req	x0
-src	.req	x1
-count	.req	x2
-tmp1	.req	x3
-tmp1w	.req	w3
-tmp2	.req	x4
-tmp2w	.req	w4
-tmp3	.req	x5
-tmp3w	.req	w5
-dst	.req	x6
-
-A_l	.req	x7
-A_h	.req	x8
-B_l	.req	x9
-B_h	.req	x10
-C_l	.req	x11
-C_h	.req	x12
-D_l	.req	x13
-D_h	.req	x14
-
-SYM_FUNC_START_ALIAS(__memmove)
-SYM_FUNC_START_WEAK_PI(memmove)
-	cmp	dstin, src
-	b.lo	__memcpy
-	add	tmp1, src, count
-	cmp	dstin, tmp1
-	b.hs	__memcpy		/* No overlap.  */
-
-	add	dst, dstin, count
-	add	src, src, count
-	cmp	count, #16
-	b.lo	.Ltail15  /*probably non-alignment accesses.*/
-
-	ands	tmp2, src, #15     /* Bytes to reach alignment.  */
-	b.eq	.LSrcAligned
-	sub	count, count, tmp2
-	/*
-	* process the aligned offset length to make the src aligned firstly.
-	* those extra instructions' cost is acceptable. It also make the
-	* coming accesses are based on aligned address.
-	*/
-	tbz	tmp2, #0, 1f
-	ldrb	tmp1w, [src, #-1]!
-	strb	tmp1w, [dst, #-1]!
-1:
-	tbz	tmp2, #1, 2f
-	ldrh	tmp1w, [src, #-2]!
-	strh	tmp1w, [dst, #-2]!
-2:
-	tbz	tmp2, #2, 3f
-	ldr	tmp1w, [src, #-4]!
-	str	tmp1w, [dst, #-4]!
-3:
-	tbz	tmp2, #3, .LSrcAligned
-	ldr	tmp1, [src, #-8]!
-	str	tmp1, [dst, #-8]!
-
-.LSrcAligned:
-	cmp	count, #64
-	b.ge	.Lcpy_over64
-
-	/*
-	* Deal with small copies quickly by dropping straight into the
-	* exit block.
-	*/
-.Ltail63:
-	/*
-	* Copy up to 48 bytes of data. At this point we only need the
-	* bottom 6 bits of count to be accurate.
-	*/
-	ands	tmp1, count, #0x30
-	b.eq	.Ltail15
-	cmp	tmp1w, #0x20
-	b.eq	1f
-	b.lt	2f
-	ldp	A_l, A_h, [src, #-16]!
-	stp	A_l, A_h, [dst, #-16]!
-1:
-	ldp	A_l, A_h, [src, #-16]!
-	stp	A_l, A_h, [dst, #-16]!
-2:
-	ldp	A_l, A_h, [src, #-16]!
-	stp	A_l, A_h, [dst, #-16]!
-
-.Ltail15:
-	tbz	count, #3, 1f
-	ldr	tmp1, [src, #-8]!
-	str	tmp1, [dst, #-8]!
-1:
-	tbz	count, #2, 2f
-	ldr	tmp1w, [src, #-4]!
-	str	tmp1w, [dst, #-4]!
-2:
-	tbz	count, #1, 3f
-	ldrh	tmp1w, [src, #-2]!
-	strh	tmp1w, [dst, #-2]!
-3:
-	tbz	count, #0, .Lexitfunc
-	ldrb	tmp1w, [src, #-1]
-	strb	tmp1w, [dst, #-1]
-
-.Lexitfunc:
-	ret
-
-.Lcpy_over64:
-	subs	count, count, #128
-	b.ge	.Lcpy_body_large
-	/*
-	* Less than 128 bytes to copy, so handle 64 bytes here and then jump
-	* to the tail.
-	*/
-	ldp	A_l, A_h, [src, #-16]
-	stp	A_l, A_h, [dst, #-16]
-	ldp	B_l, B_h, [src, #-32]
-	ldp	C_l, C_h, [src, #-48]
-	stp	B_l, B_h, [dst, #-32]
-	stp	C_l, C_h, [dst, #-48]
-	ldp	D_l, D_h, [src, #-64]!
-	stp	D_l, D_h, [dst, #-64]!
-
-	tst	count, #0x3f
-	b.ne	.Ltail63
-	ret
-
-	/*
-	* Critical loop. Start at a new cache line boundary. Assuming
-	* 64 bytes per line this ensures the entire loop is in one line.
-	*/
-	.p2align	L1_CACHE_SHIFT
-.Lcpy_body_large:
-	/* pre-load 64 bytes data. */
-	ldp	A_l, A_h, [src, #-16]
-	ldp	B_l, B_h, [src, #-32]
-	ldp	C_l, C_h, [src, #-48]
-	ldp	D_l, D_h, [src, #-64]!
-1:
-	/*
-	* interlace the load of next 64 bytes data block with store of the last
-	* loaded 64 bytes data.
-	*/
-	stp	A_l, A_h, [dst, #-16]
-	ldp	A_l, A_h, [src, #-16]
-	stp	B_l, B_h, [dst, #-32]
-	ldp	B_l, B_h, [src, #-32]
-	stp	C_l, C_h, [dst, #-48]
-	ldp	C_l, C_h, [src, #-48]
-	stp	D_l, D_h, [dst, #-64]!
-	ldp	D_l, D_h, [src, #-64]!
-	subs	count, count, #64
-	b.ge	1b
-	stp	A_l, A_h, [dst, #-16]
-	stp	B_l, B_h, [dst, #-32]
-	stp	C_l, C_h, [dst, #-48]
-	stp	D_l, D_h, [dst, #-64]!
-
-	tst	count, #0x3f
-	b.ne	.Ltail63
-	ret
-SYM_FUNC_END_PI(memmove)
-EXPORT_SYMBOL(memmove)
-SYM_FUNC_END_ALIAS(__memmove)
-EXPORT_SYMBOL(__memmove)
diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S
index 351537c12f36..e83643b3995f 100644
--- a/arch/arm64/lib/mte.S
+++ b/arch/arm64/lib/mte.S
@@ -37,6 +37,26 @@ SYM_FUNC_START(mte_clear_page_tags)
 SYM_FUNC_END(mte_clear_page_tags)
 
 /*
+ * Zero the page and tags at the same time
+ *
+ * Parameters:
+ *	x0 - address to the beginning of the page
+ */
+SYM_FUNC_START(mte_zero_clear_page_tags)
+	mrs	x1, dczid_el0
+	and	w1, w1, #0xf
+	mov	x2, #4
+	lsl	x1, x2, x1
+	and	x0, x0, #(1 << MTE_TAG_SHIFT) - 1	// clear the tag
+
+1:	dc	gzva, x0
+	add	x0, x0, x1
+	tst	x0, #(PAGE_SIZE - 1)
+	b.ne	1b
+	ret
+SYM_FUNC_END(mte_zero_clear_page_tags)
+
+/*
  * Copy the tags from the source page to the destination one
  *   x0 - address of the destination page
  *   x1 - address of the source page
diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S
index 4e79566726c8..d7bee210a798 100644
--- a/arch/arm64/lib/strcmp.S
+++ b/arch/arm64/lib/strcmp.S
@@ -1,84 +1,123 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2012-2021, Arm Limited.
  *
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * Adapted from the original at:
+ * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/strcmp.S
  */
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
-/*
- * compare two strings
+/* Assumptions:
  *
- * Parameters:
- *	x0 - const string 1 pointer
- *    x1 - const string 2 pointer
- * Returns:
- * x0 - an integer less than, equal to, or greater than zero
- * if  s1  is  found, respectively, to be less than, to match,
- * or be greater than s2.
+ * ARMv8-a, AArch64
  */
 
+#define L(label) .L ## label
+
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
 #define REP8_80 0x8080808080808080
 
 /* Parameters and result.  */
-src1		.req	x0
-src2		.req	x1
-result		.req	x0
+#define src1		x0
+#define src2		x1
+#define result		x0
 
 /* Internal variables.  */
-data1		.req	x2
-data1w		.req	w2
-data2		.req	x3
-data2w		.req	w3
-has_nul		.req	x4
-diff		.req	x5
-syndrome	.req	x6
-tmp1		.req	x7
-tmp2		.req	x8
-tmp3		.req	x9
-zeroones	.req	x10
-pos		.req	x11
-
+#define data1		x2
+#define data1w		w2
+#define data2		x3
+#define data2w		w3
+#define has_nul		x4
+#define diff		x5
+#define syndrome	x6
+#define tmp1		x7
+#define tmp2		x8
+#define tmp3		x9
+#define zeroones	x10
+#define pos		x11
+
+	/* Start of performance-critical section  -- one 64B cache line.  */
+	.align 6
 SYM_FUNC_START_WEAK_PI(strcmp)
 	eor	tmp1, src1, src2
 	mov	zeroones, #REP8_01
 	tst	tmp1, #7
-	b.ne	.Lmisaligned8
+	b.ne	L(misaligned8)
 	ands	tmp1, src1, #7
-	b.ne	.Lmutual_align
-
-	/*
-	* NUL detection works on the principle that (X - 1) & (~X) & 0x80
-	* (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-	* can be done in parallel across the entire word.
-	*/
-.Lloop_aligned:
+	b.ne	L(mutual_align)
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word.  */
+L(loop_aligned):
 	ldr	data1, [src1], #8
 	ldr	data2, [src2], #8
-.Lstart_realigned:
+L(start_realigned):
 	sub	tmp1, data1, zeroones
 	orr	tmp2, data1, #REP8_7f
 	eor	diff, data1, data2	/* Non-zero if differences found.  */
 	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
 	orr	syndrome, diff, has_nul
-	cbz	syndrome, .Lloop_aligned
-	b	.Lcal_cmpresult
+	cbz	syndrome, L(loop_aligned)
+	/* End of performance-critical section  -- one 64B cache line.  */
+
+L(end):
+#ifndef	__AARCH64EB__
+	rev	syndrome, syndrome
+	rev	data1, data1
+	/* The MS-non-zero bit of the syndrome marks either the first bit
+	   that is different, or the top bit of the first zero byte.
+	   Shifting left now will bring the critical information into the
+	   top bits.  */
+	clz	pos, syndrome
+	rev	data2, data2
+	lsl	data1, data1, pos
+	lsl	data2, data2, pos
+	/* But we need to zero-extend (char is unsigned) the value and then
+	   perform a signed 32-bit subtraction.  */
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	ret
+#else
+	/* For big-endian we cannot use the trick with the syndrome value
+	   as carry-propagation can corrupt the upper bits if the trailing
+	   bytes in the string contain 0x01.  */
+	/* However, if there is no NUL byte in the dword, we can generate
+	   the result directly.  We can't just subtract the bytes as the
+	   MSB might be significant.  */
+	cbnz	has_nul, 1f
+	cmp	data1, data2
+	cset	result, ne
+	cneg	result, result, lo
+	ret
+1:
+	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
+	rev	tmp3, data1
+	sub	tmp1, tmp3, zeroones
+	orr	tmp2, tmp3, #REP8_7f
+	bic	has_nul, tmp1, tmp2
+	rev	has_nul, has_nul
+	orr	syndrome, diff, has_nul
+	clz	pos, syndrome
+	/* The MS-non-zero bit of the syndrome marks either the first bit
+	   that is different, or the top bit of the first zero byte.
+	   Shifting left now will bring the critical information into the
+	   top bits.  */
+	lsl	data1, data1, pos
+	lsl	data2, data2, pos
+	/* But we need to zero-extend (char is unsigned) the value and then
+	   perform a signed 32-bit subtraction.  */
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	ret
+#endif
 
-.Lmutual_align:
-	/*
-	* Sources are mutually aligned, but are not currently at an
-	* alignment boundary.  Round down the addresses and then mask off
-	* the bytes that preceed the start point.
-	*/
+L(mutual_align):
+	/* Sources are mutually aligned, but are not currently at an
+	   alignment boundary.  Round down the addresses and then mask off
+	   the bytes that preceed the start point.  */
 	bic	src1, src1, #7
 	bic	src2, src2, #7
 	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
@@ -86,138 +125,52 @@ SYM_FUNC_START_WEAK_PI(strcmp)
 	neg	tmp1, tmp1		/* Bits to alignment -64.  */
 	ldr	data2, [src2], #8
 	mov	tmp2, #~0
+#ifdef __AARCH64EB__
 	/* Big-endian.  Early bytes are at MSB.  */
-CPU_BE( lsl	tmp2, tmp2, tmp1 )	/* Shift (tmp1 & 63).  */
+	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+#else
 	/* Little-endian.  Early bytes are at LSB.  */
-CPU_LE( lsr	tmp2, tmp2, tmp1 )	/* Shift (tmp1 & 63).  */
-
+	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+#endif
 	orr	data1, data1, tmp2
 	orr	data2, data2, tmp2
-	b	.Lstart_realigned
-
-.Lmisaligned8:
-	/*
-	* Get the align offset length to compare per byte first.
-	* After this process, one string's address will be aligned.
-	*/
-	and	tmp1, src1, #7
-	neg	tmp1, tmp1
-	add	tmp1, tmp1, #8
-	and	tmp2, src2, #7
-	neg	tmp2, tmp2
-	add	tmp2, tmp2, #8
-	subs	tmp3, tmp1, tmp2
-	csel	pos, tmp1, tmp2, hi /*Choose the maximum. */
-.Ltinycmp:
+	b	L(start_realigned)
+
+L(misaligned8):
+	/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
+	   checking to make sure that we don't access beyond page boundary in
+	   SRC2.  */
+	tst	src1, #7
+	b.eq	L(loop_misaligned)
+L(do_misaligned):
 	ldrb	data1w, [src1], #1
 	ldrb	data2w, [src2], #1
-	subs	pos, pos, #1
-	ccmp	data1w, #1, #0, ne  /* NZCV = 0b0000.  */
-	ccmp	data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
-	b.eq	.Ltinycmp
-	cbnz	pos, 1f /*find the null or unequal...*/
 	cmp	data1w, #1
-	ccmp	data1w, data2w, #0, cs
-	b.eq	.Lstart_align /*the last bytes are equal....*/
-1:
-	sub	result, data1, data2
-	ret
-
-.Lstart_align:
-	ands	xzr, src1, #7
-	b.eq	.Lrecal_offset
-	/*process more leading bytes to make str1 aligned...*/
-	add	src1, src1, tmp3
-	add	src2, src2, tmp3
-	/*load 8 bytes from aligned str1 and non-aligned str2..*/
+	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
+	b.ne	L(done)
+	tst	src1, #7
+	b.ne	L(do_misaligned)
+
+L(loop_misaligned):
+	/* Test if we are within the last dword of the end of a 4K page.  If
+	   yes then jump back to the misaligned loop to copy a byte at a time.  */
+	and	tmp1, src2, #0xff8
+	eor	tmp1, tmp1, #0xff8
+	cbz	tmp1, L(do_misaligned)
 	ldr	data1, [src1], #8
 	ldr	data2, [src2], #8
 
 	sub	tmp1, data1, zeroones
 	orr	tmp2, data1, #REP8_7f
-	bic	has_nul, tmp1, tmp2
-	eor	diff, data1, data2 /* Non-zero if differences found.  */
-	orr	syndrome, diff, has_nul
-	cbnz	syndrome, .Lcal_cmpresult
-	/*How far is the current str2 from the alignment boundary...*/
-	and	tmp3, tmp3, #7
-.Lrecal_offset:
-	neg	pos, tmp3
-.Lloopcmp_proc:
-	/*
-	* Divide the eight bytes into two parts. First,backwards the src2
-	* to an alignment boundary,load eight bytes from the SRC2 alignment
-	* boundary,then compare with the relative bytes from SRC1.
-	* If all 8 bytes are equal,then start the second part's comparison.
-	* Otherwise finish the comparison.
-	* This special handle can garantee all the accesses are in the
-	* thread/task space in avoid to overrange access.
-	*/
-	ldr	data1, [src1,pos]
-	ldr	data2, [src2,pos]
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	bic	has_nul, tmp1, tmp2
-	eor	diff, data1, data2  /* Non-zero if differences found.  */
-	orr	syndrome, diff, has_nul
-	cbnz	syndrome, .Lcal_cmpresult
-
-	/*The second part process*/
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	bic	has_nul, tmp1, tmp2
-	eor	diff, data1, data2  /* Non-zero if differences found.  */
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
 	orr	syndrome, diff, has_nul
-	cbz	syndrome, .Lloopcmp_proc
+	cbz	syndrome, L(loop_misaligned)
+	b	L(end)
 
-.Lcal_cmpresult:
-	/*
-	* reversed the byte-order as big-endian,then CLZ can find the most
-	* significant zero bits.
-	*/
-CPU_LE( rev	syndrome, syndrome )
-CPU_LE( rev	data1, data1 )
-CPU_LE( rev	data2, data2 )
-
-	/*
-	* For big-endian we cannot use the trick with the syndrome value
-	* as carry-propagation can corrupt the upper bits if the trailing
-	* bytes in the string contain 0x01.
-	* However, if there is no NUL byte in the dword, we can generate
-	* the result directly.  We cannot just subtract the bytes as the
-	* MSB might be significant.
-	*/
-CPU_BE( cbnz	has_nul, 1f )
-CPU_BE( cmp	data1, data2 )
-CPU_BE( cset	result, ne )
-CPU_BE( cneg	result, result, lo )
-CPU_BE( ret )
-CPU_BE( 1: )
-	/*Re-compute the NUL-byte detection, using a byte-reversed value. */
-CPU_BE(	rev	tmp3, data1 )
-CPU_BE(	sub	tmp1, tmp3, zeroones )
-CPU_BE(	orr	tmp2, tmp3, #REP8_7f )
-CPU_BE(	bic	has_nul, tmp1, tmp2 )
-CPU_BE(	rev	has_nul, has_nul )
-CPU_BE(	orr	syndrome, diff, has_nul )
-
-	clz	pos, syndrome
-	/*
-	* The MS-non-zero bit of the syndrome marks either the first bit
-	* that is different, or the top bit of the first zero byte.
-	* Shifting left now will bring the critical information into the
-	* top bits.
-	*/
-	lsl	data1, data1, pos
-	lsl	data2, data2, pos
-	/*
-	* But we need to zero-extend (char is unsigned) the value and then
-	* perform a signed 32-bit subtraction.
-	*/
-	lsr	data1, data1, #56
-	sub	result, data1, data2, lsr #56
+L(done):
+	sub	result, data1, data2
 	ret
+
 SYM_FUNC_END_PI(strcmp)
 EXPORT_SYMBOL_NOKASAN(strcmp)
diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S
index ee3ed882dd79..35fbdb7d6e1a 100644
--- a/arch/arm64/lib/strlen.S
+++ b/arch/arm64/lib/strlen.S
@@ -1,115 +1,203 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2013-2021, Arm Limited.
  *
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * Adapted from the original at:
+ * https://github.com/ARM-software/optimized-routines/blob/98e4d6a5c13c8e54/string/aarch64/strlen.S
  */
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
-/*
- * calculate the length of a string
+/* Assumptions:
  *
- * Parameters:
- *	x0 - const string pointer
- * Returns:
- *	x0 - the return length of specific string
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
  */
 
+#define L(label) .L ## label
+
 /* Arguments and results.  */
-srcin		.req	x0
-len		.req	x0
+#define srcin		x0
+#define len		x0
 
 /* Locals and temporaries.  */
-src		.req	x1
-data1		.req	x2
-data2		.req	x3
-data2a		.req	x4
-has_nul1	.req	x5
-has_nul2	.req	x6
-tmp1		.req	x7
-tmp2		.req	x8
-tmp3		.req	x9
-tmp4		.req	x10
-zeroones	.req	x11
-pos		.req	x12
+#define src		x1
+#define data1		x2
+#define data2		x3
+#define has_nul1	x4
+#define has_nul2	x5
+#define tmp1		x4
+#define tmp2		x5
+#define tmp3		x6
+#define tmp4		x7
+#define zeroones	x8
+
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word. A faster check
+	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
+	   false hits for characters 129..255.	*/
 
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
 #define REP8_80 0x8080808080808080
 
+#define MIN_PAGE_SIZE 4096
+
+	/* Since strings are short on average, we check the first 16 bytes
+	   of the string for a NUL character.  In order to do an unaligned ldp
+	   safely we have to do a page cross check first.  If there is a NUL
+	   byte we calculate the length from the 2 8-byte words using
+	   conditional select to reduce branch mispredictions (it is unlikely
+	   strlen will be repeatedly called on strings with the same length).
+
+	   If the string is longer than 16 bytes, we align src so don't need
+	   further page cross checks, and process 32 bytes per iteration
+	   using the fast NUL check.  If we encounter non-ASCII characters,
+	   fallback to a second loop using the full NUL check.
+
+	   If the page cross check fails, we read 16 bytes from an aligned
+	   address, remove any characters before the string, and continue
+	   in the main loop using aligned loads.  Since strings crossing a
+	   page in the first 16 bytes are rare (probability of
+	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
+
+	   AArch64 systems have a minimum page size of 4k.  We don't bother
+	   checking for larger page sizes - the cost of setting up the correct
+	   page size is just not worth the extra gain from a small reduction in
+	   the cases taking the slow path.  Note that we only care about
+	   whether the first fetch, which may be misaligned, crosses a page
+	   boundary.  */
+
 SYM_FUNC_START_WEAK_PI(strlen)
-	mov	zeroones, #REP8_01
-	bic	src, srcin, #15
-	ands	tmp1, srcin, #15
-	b.ne	.Lmisaligned
-	/*
-	* NUL detection works on the principle that (X - 1) & (~X) & 0x80
-	* (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-	* can be done in parallel across the entire word.
-	*/
-	/*
-	* The inner loop deals with two Dwords at a time. This has a
-	* slightly higher start-up cost, but we should win quite quickly,
-	* especially on cores with a high number of issue slots per
-	* cycle, as we get much better parallelism out of the operations.
-	*/
-.Lloop:
-	ldp	data1, data2, [src], #16
-.Lrealigned:
+	and	tmp1, srcin, MIN_PAGE_SIZE - 1
+	mov	zeroones, REP8_01
+	cmp	tmp1, MIN_PAGE_SIZE - 16
+	b.gt	L(page_cross)
+	ldp	data1, data2, [srcin]
+#ifdef __AARCH64EB__
+	/* For big-endian, carry propagation (if the final byte in the
+	   string is 0x01) means we cannot use has_nul1/2 directly.
+	   Since we expect strings to be small and early-exit,
+	   byte-swap the data now so has_null1/2 will be correct.  */
+	rev	data1, data1
+	rev	data2, data2
+#endif
 	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
+	orr	tmp2, data1, REP8_7f
 	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, #REP8_7f
-	bic	has_nul1, tmp1, tmp2
-	bics	has_nul2, tmp3, tmp4
-	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
-	b.eq	.Lloop
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(main_loop_entry)
+
+	/* Enter with C = has_nul1 == 0.  */
+	csel	has_nul1, has_nul1, has_nul2, cc
+	mov	len, 8
+	rev	has_nul1, has_nul1
+	clz	tmp1, has_nul1
+	csel	len, xzr, len, cc
+	add	len, len, tmp1, lsr 3
+	ret
 
+	/* The inner loop processes 32 bytes per iteration and uses the fast
+	   NUL check.  If we encounter non-ASCII characters, use a second
+	   loop with the accurate NUL check.  */
+	.p2align 4
+L(main_loop_entry):
+	bic	src, srcin, 15
+	sub	src, src, 16
+L(main_loop):
+	ldp	data1, data2, [src, 32]!
+L(page_cross_entry):
+	sub	tmp1, data1, zeroones
+	sub	tmp3, data2, zeroones
+	orr	tmp2, tmp1, tmp3
+	tst	tmp2, zeroones, lsl 7
+	bne	1f
+	ldp	data1, data2, [src, 16]
+	sub	tmp1, data1, zeroones
+	sub	tmp3, data2, zeroones
+	orr	tmp2, tmp1, tmp3
+	tst	tmp2, zeroones, lsl 7
+	beq	L(main_loop)
+	add	src, src, 16
+1:
+	/* The fast check failed, so do the slower, accurate NUL check.	 */
+	orr	tmp2, data1, REP8_7f
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(nonascii_loop)
+
+	/* Enter with C = has_nul1 == 0.  */
+L(tail):
+#ifdef __AARCH64EB__
+	/* For big-endian, carry propagation (if the final byte in the
+	   string is 0x01) means we cannot use has_nul1/2 directly.  The
+	   easiest way to get the correct byte is to byte-swap the data
+	   and calculate the syndrome a second time.  */
+	csel	data1, data1, data2, cc
+	rev	data1, data1
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	bic	has_nul1, tmp1, tmp2
+#else
+	csel	has_nul1, has_nul1, has_nul2, cc
+#endif
 	sub	len, src, srcin
-	cbz	has_nul1, .Lnul_in_data2
-CPU_BE(	mov	data2, data1 )	/*prepare data to re-calculate the syndrome*/
-	sub	len, len, #8
-	mov	has_nul2, has_nul1
-.Lnul_in_data2:
-	/*
-	* For big-endian, carry propagation (if the final byte in the
-	* string is 0x01) means we cannot use has_nul directly.  The
-	* easiest way to get the correct byte is to byte-swap the data
-	* and calculate the syndrome a second time.
-	*/
-CPU_BE( rev	data2, data2 )
-CPU_BE( sub	tmp1, data2, zeroones )
-CPU_BE( orr	tmp2, data2, #REP8_7f )
-CPU_BE( bic	has_nul2, tmp1, tmp2 )
-
-	sub	len, len, #8
-	rev	has_nul2, has_nul2
-	clz	pos, has_nul2
-	add	len, len, pos, lsr #3		/* Bits to bytes.  */
+	rev	has_nul1, has_nul1
+	add	tmp2, len, 8
+	clz	tmp1, has_nul1
+	csel	len, len, tmp2, cc
+	add	len, len, tmp1, lsr 3
 	ret
 
-.Lmisaligned:
-	cmp	tmp1, #8
-	neg	tmp1, tmp1
-	ldp	data1, data2, [src], #16
-	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
-	mov	tmp2, #~0
-	/* Big-endian.  Early bytes are at MSB.  */
-CPU_BE( lsl	tmp2, tmp2, tmp1 )	/* Shift (tmp1 & 63).  */
+L(nonascii_loop):
+	ldp	data1, data2, [src, 16]!
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	bne	L(tail)
+	ldp	data1, data2, [src, 16]!
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(nonascii_loop)
+	b	L(tail)
+
+	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
+	   srcin to 0x7f, so we ignore any NUL bytes before the string.
+	   Then continue in the aligned loop.  */
+L(page_cross):
+	bic	src, srcin, 15
+	ldp	data1, data2, [src]
+	lsl	tmp1, srcin, 3
+	mov	tmp4, -1
+#ifdef __AARCH64EB__
+	/* Big-endian.	Early bytes are at MSB.	 */
+	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
+#else
 	/* Little-endian.  Early bytes are at LSB.  */
-CPU_LE( lsr	tmp2, tmp2, tmp1 )	/* Shift (tmp1 & 63).  */
+	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
+#endif
+	orr	tmp1, tmp1, REP8_80
+	orn	data1, data1, tmp1
+	orn	tmp2, data2, tmp1
+	tst	srcin, 8
+	csel	data1, data1, tmp4, eq
+	csel	data2, data2, tmp2, eq
+	b	L(page_cross_entry)
 
-	orr	data1, data1, tmp2
-	orr	data2a, data2, tmp2
-	csinv	data1, data1, xzr, le
-	csel	data2, data2, data2a, le
-	b	.Lrealigned
 SYM_FUNC_END_PI(strlen)
 EXPORT_SYMBOL_NOKASAN(strlen)
diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S
index 2a7ee949ed47..48d44f7fddb1 100644
--- a/arch/arm64/lib/strncmp.S
+++ b/arch/arm64/lib/strncmp.S
@@ -1,299 +1,261 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2013-2021, Arm Limited.
  *
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * Adapted from the original at:
+ * https://github.com/ARM-software/optimized-routines/blob/e823e3abf5f89ecb/string/aarch64/strncmp.S
  */
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
-/*
- * compare two strings
+/* Assumptions:
  *
- * Parameters:
- *  x0 - const string 1 pointer
- *  x1 - const string 2 pointer
- *  x2 - the maximal length to be compared
- * Returns:
- *  x0 - an integer less than, equal to, or greater than zero if s1 is found,
- *     respectively, to be less than, to match, or be greater than s2.
+ * ARMv8-a, AArch64
  */
 
+#define L(label) .L ## label
+
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
 #define REP8_80 0x8080808080808080
 
 /* Parameters and result.  */
-src1		.req	x0
-src2		.req	x1
-limit		.req	x2
-result		.req	x0
+#define src1		x0
+#define src2		x1
+#define limit		x2
+#define result		x0
 
 /* Internal variables.  */
-data1		.req	x3
-data1w		.req	w3
-data2		.req	x4
-data2w		.req	w4
-has_nul		.req	x5
-diff		.req	x6
-syndrome	.req	x7
-tmp1		.req	x8
-tmp2		.req	x9
-tmp3		.req	x10
-zeroones	.req	x11
-pos		.req	x12
-limit_wd	.req	x13
-mask		.req	x14
-endloop		.req	x15
+#define data1		x3
+#define data1w		w3
+#define data2		x4
+#define data2w		w4
+#define has_nul		x5
+#define diff		x6
+#define syndrome	x7
+#define tmp1		x8
+#define tmp2		x9
+#define tmp3		x10
+#define zeroones	x11
+#define pos		x12
+#define limit_wd	x13
+#define mask		x14
+#define endloop		x15
+#define count		mask
 
 SYM_FUNC_START_WEAK_PI(strncmp)
-	cbz	limit, .Lret0
+	cbz	limit, L(ret0)
 	eor	tmp1, src1, src2
 	mov	zeroones, #REP8_01
 	tst	tmp1, #7
-	b.ne	.Lmisaligned8
-	ands	tmp1, src1, #7
-	b.ne	.Lmutual_align
+	and	count, src1, #7
+	b.ne	L(misaligned8)
+	cbnz	count, L(mutual_align)
 	/* Calculate the number of full and partial words -1.  */
-	/*
-	* when limit is mulitply of 8, if not sub 1,
-	* the judgement of last dword will wrong.
-	*/
-	sub	limit_wd, limit, #1 /* limit != 0, so no underflow.  */
-	lsr	limit_wd, limit_wd, #3  /* Convert to Dwords.  */
+	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
+	lsr	limit_wd, limit_wd, #3	/* Convert to Dwords.  */
 
-	/*
-	* NUL detection works on the principle that (X - 1) & (~X) & 0x80
-	* (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-	* can be done in parallel across the entire word.
-	*/
-.Lloop_aligned:
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word.  */
+	.p2align 4
+L(loop_aligned):
 	ldr	data1, [src1], #8
 	ldr	data2, [src2], #8
-.Lstart_realigned:
+L(start_realigned):
 	subs	limit_wd, limit_wd, #1
 	sub	tmp1, data1, zeroones
 	orr	tmp2, data1, #REP8_7f
-	eor	diff, data1, data2  /* Non-zero if differences found.  */
-	csinv	endloop, diff, xzr, pl  /* Last Dword or differences.*/
-	bics	has_nul, tmp1, tmp2 /* Non-zero if NUL terminator.  */
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	csinv	endloop, diff, xzr, pl	/* Last Dword or differences.  */
+	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
 	ccmp	endloop, #0, #0, eq
-	b.eq	.Lloop_aligned
+	b.eq	L(loop_aligned)
+	/* End of main loop */
 
-	/*Not reached the limit, must have found the end or a diff.  */
-	tbz	limit_wd, #63, .Lnot_limit
+	/* Not reached the limit, must have found the end or a diff.  */
+	tbz	limit_wd, #63, L(not_limit)
 
 	/* Limit % 8 == 0 => all bytes significant.  */
 	ands	limit, limit, #7
-	b.eq	.Lnot_limit
+	b.eq	L(not_limit)
 
-	lsl	limit, limit, #3    /* Bits -> bytes.  */
+	lsl	limit, limit, #3	/* Bits -> bytes.  */
 	mov	mask, #~0
-CPU_BE( lsr	mask, mask, limit )
-CPU_LE( lsl	mask, mask, limit )
+#ifdef __AARCH64EB__
+	lsr	mask, mask, limit
+#else
+	lsl	mask, mask, limit
+#endif
 	bic	data1, data1, mask
 	bic	data2, data2, mask
 
 	/* Make sure that the NUL byte is marked in the syndrome.  */
 	orr	has_nul, has_nul, mask
 
-.Lnot_limit:
+L(not_limit):
 	orr	syndrome, diff, has_nul
-	b	.Lcal_cmpresult
 
-.Lmutual_align:
-	/*
-	* Sources are mutually aligned, but are not currently at an
-	* alignment boundary.  Round down the addresses and then mask off
-	* the bytes that precede the start point.
-	* We also need to adjust the limit calculations, but without
-	* overflowing if the limit is near ULONG_MAX.
-	*/
+#ifndef	__AARCH64EB__
+	rev	syndrome, syndrome
+	rev	data1, data1
+	/* The MS-non-zero bit of the syndrome marks either the first bit
+	   that is different, or the top bit of the first zero byte.
+	   Shifting left now will bring the critical information into the
+	   top bits.  */
+	clz	pos, syndrome
+	rev	data2, data2
+	lsl	data1, data1, pos
+	lsl	data2, data2, pos
+	/* But we need to zero-extend (char is unsigned) the value and then
+	   perform a signed 32-bit subtraction.  */
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	ret
+#else
+	/* For big-endian we cannot use the trick with the syndrome value
+	   as carry-propagation can corrupt the upper bits if the trailing
+	   bytes in the string contain 0x01.  */
+	/* However, if there is no NUL byte in the dword, we can generate
+	   the result directly.  We can't just subtract the bytes as the
+	   MSB might be significant.  */
+	cbnz	has_nul, 1f
+	cmp	data1, data2
+	cset	result, ne
+	cneg	result, result, lo
+	ret
+1:
+	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
+	rev	tmp3, data1
+	sub	tmp1, tmp3, zeroones
+	orr	tmp2, tmp3, #REP8_7f
+	bic	has_nul, tmp1, tmp2
+	rev	has_nul, has_nul
+	orr	syndrome, diff, has_nul
+	clz	pos, syndrome
+	/* The MS-non-zero bit of the syndrome marks either the first bit
+	   that is different, or the top bit of the first zero byte.
+	   Shifting left now will bring the critical information into the
+	   top bits.  */
+	lsl	data1, data1, pos
+	lsl	data2, data2, pos
+	/* But we need to zero-extend (char is unsigned) the value and then
+	   perform a signed 32-bit subtraction.  */
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	ret
+#endif
+
+L(mutual_align):
+	/* Sources are mutually aligned, but are not currently at an
+	   alignment boundary.  Round down the addresses and then mask off
+	   the bytes that precede the start point.
+	   We also need to adjust the limit calculations, but without
+	   overflowing if the limit is near ULONG_MAX.  */
 	bic	src1, src1, #7
 	bic	src2, src2, #7
 	ldr	data1, [src1], #8
-	neg	tmp3, tmp1, lsl #3  /* 64 - bits(bytes beyond align). */
+	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
 	ldr	data2, [src2], #8
 	mov	tmp2, #~0
-	sub	limit_wd, limit, #1 /* limit != 0, so no underflow.  */
+	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
+#ifdef __AARCH64EB__
 	/* Big-endian.  Early bytes are at MSB.  */
-CPU_BE( lsl	tmp2, tmp2, tmp3 )	/* Shift (tmp1 & 63).  */
+	lsl	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
+#else
 	/* Little-endian.  Early bytes are at LSB.  */
-CPU_LE( lsr	tmp2, tmp2, tmp3 )	/* Shift (tmp1 & 63).  */
-
+	lsr	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
+#endif
 	and	tmp3, limit_wd, #7
 	lsr	limit_wd, limit_wd, #3
-	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.*/
-	add	limit, limit, tmp1
-	add	tmp3, tmp3, tmp1
+	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
+	add	limit, limit, count
+	add	tmp3, tmp3, count
 	orr	data1, data1, tmp2
 	orr	data2, data2, tmp2
 	add	limit_wd, limit_wd, tmp3, lsr #3
-	b	.Lstart_realigned
+	b	L(start_realigned)
+
+	.p2align 4
+	/* Don't bother with dwords for up to 16 bytes.  */
+L(misaligned8):
+	cmp	limit, #16
+	b.hs	L(try_misaligned_words)
 
-/*when src1 offset is not equal to src2 offset...*/
-.Lmisaligned8:
-	cmp	limit, #8
-	b.lo	.Ltiny8proc /*limit < 8... */
-	/*
-	* Get the align offset length to compare per byte first.
-	* After this process, one string's address will be aligned.*/
-	and	tmp1, src1, #7
-	neg	tmp1, tmp1
-	add	tmp1, tmp1, #8
-	and	tmp2, src2, #7
-	neg	tmp2, tmp2
-	add	tmp2, tmp2, #8
-	subs	tmp3, tmp1, tmp2
-	csel	pos, tmp1, tmp2, hi /*Choose the maximum. */
-	/*
-	* Here, limit is not less than 8, so directly run .Ltinycmp
-	* without checking the limit.*/
-	sub	limit, limit, pos
-.Ltinycmp:
+L(byte_loop):
+	/* Perhaps we can do better than this.  */
 	ldrb	data1w, [src1], #1
 	ldrb	data2w, [src2], #1
-	subs	pos, pos, #1
-	ccmp	data1w, #1, #0, ne  /* NZCV = 0b0000.  */
-	ccmp	data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
-	b.eq	.Ltinycmp
-	cbnz	pos, 1f /*find the null or unequal...*/
-	cmp	data1w, #1
-	ccmp	data1w, data2w, #0, cs
-	b.eq	.Lstart_align /*the last bytes are equal....*/
-1:
+	subs	limit, limit, #1
+	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
+	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
+	b.eq	L(byte_loop)
+L(done):
 	sub	result, data1, data2
 	ret
-
-.Lstart_align:
+	/* Align the SRC1 to a dword by doing a bytewise compare and then do
+	   the dword loop.  */
+L(try_misaligned_words):
 	lsr	limit_wd, limit, #3
-	cbz	limit_wd, .Lremain8
-	/*process more leading bytes to make str1 aligned...*/
-	ands	xzr, src1, #7
-	b.eq	.Lrecal_offset
-	add	src1, src1, tmp3	/*tmp3 is positive in this branch.*/
-	add	src2, src2, tmp3
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
+	cbz	count, L(do_misaligned)
 
-	sub	limit, limit, tmp3
+	neg	count, count
+	and	count, count, #7
+	sub	limit, limit, count
 	lsr	limit_wd, limit, #3
-	subs	limit_wd, limit_wd, #1
 
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	eor	diff, data1, data2  /* Non-zero if differences found.  */
-	csinv	endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/
-	bics	has_nul, tmp1, tmp2
-	ccmp	endloop, #0, #0, eq /*has_null is ZERO: no null byte*/
-	b.ne	.Lunequal_proc
-	/*How far is the current str2 from the alignment boundary...*/
-	and	tmp3, tmp3, #7
-.Lrecal_offset:
-	neg	pos, tmp3
-.Lloopcmp_proc:
-	/*
-	* Divide the eight bytes into two parts. First,backwards the src2
-	* to an alignment boundary,load eight bytes from the SRC2 alignment
-	* boundary,then compare with the relative bytes from SRC1.
-	* If all 8 bytes are equal,then start the second part's comparison.
-	* Otherwise finish the comparison.
-	* This special handle can garantee all the accesses are in the
-	* thread/task space in avoid to overrange access.
-	*/
-	ldr	data1, [src1,pos]
-	ldr	data2, [src2,pos]
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	bics	has_nul, tmp1, tmp2 /* Non-zero if NUL terminator.  */
-	eor	diff, data1, data2  /* Non-zero if differences found.  */
-	csinv	endloop, diff, xzr, eq
-	cbnz	endloop, .Lunequal_proc
+L(page_end_loop):
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	cmp	data1w, #1
+	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
+	b.ne	L(done)
+	subs	count, count, #1
+	b.hi	L(page_end_loop)
+
+L(do_misaligned):
+	/* Prepare ourselves for the next page crossing.  Unlike the aligned
+	   loop, we fetch 1 less dword because we risk crossing bounds on
+	   SRC2.  */
+	mov	count, #8
+	subs	limit_wd, limit_wd, #1
+	b.lo	L(done_loop)
+L(loop_misaligned):
+	and	tmp2, src2, #0xff8
+	eor	tmp2, tmp2, #0xff8
+	cbz	tmp2, L(page_end_loop)
 
-	/*The second part process*/
 	ldr	data1, [src1], #8
 	ldr	data2, [src2], #8
-	subs	limit_wd, limit_wd, #1
 	sub	tmp1, data1, zeroones
 	orr	tmp2, data1, #REP8_7f
-	eor	diff, data1, data2  /* Non-zero if differences found.  */
-	csinv	endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/
-	bics	has_nul, tmp1, tmp2
-	ccmp	endloop, #0, #0, eq /*has_null is ZERO: no null byte*/
-	b.eq	.Lloopcmp_proc
-
-.Lunequal_proc:
-	orr	syndrome, diff, has_nul
-	cbz	syndrome, .Lremain8
-.Lcal_cmpresult:
-	/*
-	* reversed the byte-order as big-endian,then CLZ can find the most
-	* significant zero bits.
-	*/
-CPU_LE( rev	syndrome, syndrome )
-CPU_LE( rev	data1, data1 )
-CPU_LE( rev	data2, data2 )
-	/*
-	* For big-endian we cannot use the trick with the syndrome value
-	* as carry-propagation can corrupt the upper bits if the trailing
-	* bytes in the string contain 0x01.
-	* However, if there is no NUL byte in the dword, we can generate
-	* the result directly.  We can't just subtract the bytes as the
-	* MSB might be significant.
-	*/
-CPU_BE( cbnz	has_nul, 1f )
-CPU_BE( cmp	data1, data2 )
-CPU_BE( cset	result, ne )
-CPU_BE( cneg	result, result, lo )
-CPU_BE( ret )
-CPU_BE( 1: )
-	/* Re-compute the NUL-byte detection, using a byte-reversed value.*/
-CPU_BE( rev	tmp3, data1 )
-CPU_BE( sub	tmp1, tmp3, zeroones )
-CPU_BE( orr	tmp2, tmp3, #REP8_7f )
-CPU_BE( bic	has_nul, tmp1, tmp2 )
-CPU_BE( rev	has_nul, has_nul )
-CPU_BE( orr	syndrome, diff, has_nul )
-	/*
-	* The MS-non-zero bit of the syndrome marks either the first bit
-	* that is different, or the top bit of the first zero byte.
-	* Shifting left now will bring the critical information into the
-	* top bits.
-	*/
-	clz	pos, syndrome
-	lsl	data1, data1, pos
-	lsl	data2, data2, pos
-	/*
-	* But we need to zero-extend (char is unsigned) the value and then
-	* perform a signed 32-bit subtraction.
-	*/
-	lsr	data1, data1, #56
-	sub	result, data1, data2, lsr #56
-	ret
-
-.Lremain8:
-	/* Limit % 8 == 0 => all bytes significant.  */
-	ands	limit, limit, #7
-	b.eq	.Lret0
-.Ltiny8proc:
-	ldrb	data1w, [src1], #1
-	ldrb	data2w, [src2], #1
-	subs	limit, limit, #1
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	ccmp	diff, #0, #0, eq
+	b.ne	L(not_limit)
+	subs	limit_wd, limit_wd, #1
+	b.pl	L(loop_misaligned)
 
-	ccmp	data1w, #1, #0, ne  /* NZCV = 0b0000.  */
-	ccmp	data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
-	b.eq	.Ltiny8proc
-	sub	result, data1, data2
-	ret
+L(done_loop):
+	/* We found a difference or a NULL before the limit was reached.  */
+	and	limit, limit, #7
+	cbz	limit, L(not_limit)
+	/* Read the last word.  */
+	sub	src1, src1, 8
+	sub	src2, src2, 8
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	ccmp	diff, #0, #0, eq
+	b.ne	L(not_limit)
 
-.Lret0:
+L(ret0):
 	mov	result, #0
 	ret
+
 SYM_FUNC_END_PI(strncmp)
 EXPORT_SYMBOL_NOKASAN(strncmp)
diff --git a/arch/arm64/lib/uaccess_flushcache.c b/arch/arm64/lib/uaccess_flushcache.c
index c83bb5a4aad2..baee22961bdb 100644
--- a/arch/arm64/lib/uaccess_flushcache.c
+++ b/arch/arm64/lib/uaccess_flushcache.c
@@ -15,7 +15,7 @@ void memcpy_flushcache(void *dst, const void *src, size_t cnt)
 	 * barrier to order the cache maintenance against the memcpy.
 	 */
 	memcpy(dst, src, cnt);
-	__clean_dcache_area_pop(dst, cnt);
+	dcache_clean_pop((unsigned long)dst, (unsigned long)dst + cnt);
 }
 EXPORT_SYMBOL_GPL(memcpy_flushcache);
 
@@ -33,6 +33,6 @@ unsigned long __copy_user_flushcache(void *to, const void __user *from,
 	rc = raw_copy_from_user(to, from, n);
 
 	/* See above */
-	__clean_dcache_area_pop(to, n - rc);
+	dcache_clean_pop((unsigned long)to, (unsigned long)to + n - rc);
 	return rc;
 }
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 2d881f34dd9d..5051b3c1a4f1 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -15,7 +15,7 @@
 #include <asm/asm-uaccess.h>
 
 /*
- *	flush_icache_range(start,end)
+ *	caches_clean_inval_pou_macro(start,end) [fixup]
  *
  *	Ensure that the I and D caches are coherent within specified region.
  *	This is typically used when code has been written to a memory region,
@@ -23,12 +23,27 @@
  *
  *	- start   - virtual start address of region
  *	- end     - virtual end address of region
+ *	- fixup   - optional label to branch to on user fault
  */
-SYM_FUNC_START(__flush_icache_range)
-	/* FALLTHROUGH */
+.macro	caches_clean_inval_pou_macro, fixup
+alternative_if ARM64_HAS_CACHE_IDC
+	dsb     ishst
+	b       .Ldc_skip_\@
+alternative_else_nop_endif
+	mov     x2, x0
+	mov     x3, x1
+	dcache_by_line_op cvau, ish, x2, x3, x4, x5, \fixup
+.Ldc_skip_\@:
+alternative_if ARM64_HAS_CACHE_DIC
+	isb
+	b	.Lic_skip_\@
+alternative_else_nop_endif
+	invalidate_icache_by_line x0, x1, x2, x3, \fixup
+.Lic_skip_\@:
+.endm
 
 /*
- *	__flush_cache_user_range(start,end)
+ *	caches_clean_inval_pou(start,end)
  *
  *	Ensure that the I and D caches are coherent within specified region.
  *	This is typically used when code has been written to a memory region,
@@ -37,117 +52,103 @@ SYM_FUNC_START(__flush_icache_range)
  *	- start   - virtual start address of region
  *	- end     - virtual end address of region
  */
-SYM_FUNC_START(__flush_cache_user_range)
+SYM_FUNC_START(caches_clean_inval_pou)
+	caches_clean_inval_pou_macro
+	ret
+SYM_FUNC_END(caches_clean_inval_pou)
+
+/*
+ *	caches_clean_inval_user_pou(start,end)
+ *
+ *	Ensure that the I and D caches are coherent within specified region.
+ *	This is typically used when code has been written to a memory region,
+ *	and will be executed.
+ *
+ *	- start   - virtual start address of region
+ *	- end     - virtual end address of region
+ */
+SYM_FUNC_START(caches_clean_inval_user_pou)
 	uaccess_ttbr0_enable x2, x3, x4
-alternative_if ARM64_HAS_CACHE_IDC
-	dsb	ishst
-	b	7f
-alternative_else_nop_endif
-	dcache_line_size x2, x3
-	sub	x3, x2, #1
-	bic	x4, x0, x3
-1:
-user_alt 9f, "dc cvau, x4",  "dc civac, x4",  ARM64_WORKAROUND_CLEAN_CACHE
-	add	x4, x4, x2
-	cmp	x4, x1
-	b.lo	1b
-	dsb	ish
 
-7:
-alternative_if ARM64_HAS_CACHE_DIC
-	isb
-	b	8f
-alternative_else_nop_endif
-	invalidate_icache_by_line x0, x1, x2, x3, 9f
-8:	mov	x0, #0
+	caches_clean_inval_pou_macro 2f
+	mov	x0, xzr
 1:
 	uaccess_ttbr0_disable x1, x2
 	ret
-9:
+2:
 	mov	x0, #-EFAULT
 	b	1b
-SYM_FUNC_END(__flush_icache_range)
-SYM_FUNC_END(__flush_cache_user_range)
+SYM_FUNC_END(caches_clean_inval_user_pou)
 
 /*
- *	invalidate_icache_range(start,end)
+ *	icache_inval_pou(start,end)
  *
  *	Ensure that the I cache is invalid within specified region.
  *
  *	- start   - virtual start address of region
  *	- end     - virtual end address of region
  */
-SYM_FUNC_START(invalidate_icache_range)
+SYM_FUNC_START(icache_inval_pou)
 alternative_if ARM64_HAS_CACHE_DIC
-	mov	x0, xzr
 	isb
 	ret
 alternative_else_nop_endif
 
-	uaccess_ttbr0_enable x2, x3, x4
-
-	invalidate_icache_by_line x0, x1, x2, x3, 2f
-	mov	x0, xzr
-1:
-	uaccess_ttbr0_disable x1, x2
+	invalidate_icache_by_line x0, x1, x2, x3
 	ret
-2:
-	mov	x0, #-EFAULT
-	b	1b
-SYM_FUNC_END(invalidate_icache_range)
+SYM_FUNC_END(icache_inval_pou)
 
 /*
- *	__flush_dcache_area(kaddr, size)
+ *	dcache_clean_inval_poc(start, end)
  *
- *	Ensure that any D-cache lines for the interval [kaddr, kaddr+size)
+ *	Ensure that any D-cache lines for the interval [start, end)
  *	are cleaned and invalidated to the PoC.
  *
- *	- kaddr   - kernel address
- *	- size    - size in question
+ *	- start   - virtual start address of region
+ *	- end     - virtual end address of region
  */
-SYM_FUNC_START_PI(__flush_dcache_area)
+SYM_FUNC_START_PI(dcache_clean_inval_poc)
 	dcache_by_line_op civac, sy, x0, x1, x2, x3
 	ret
-SYM_FUNC_END_PI(__flush_dcache_area)
+SYM_FUNC_END_PI(dcache_clean_inval_poc)
 
 /*
- *	__clean_dcache_area_pou(kaddr, size)
+ *	dcache_clean_pou(start, end)
  *
- * 	Ensure that any D-cache lines for the interval [kaddr, kaddr+size)
+ * 	Ensure that any D-cache lines for the interval [start, end)
  * 	are cleaned to the PoU.
  *
- *	- kaddr   - kernel address
- *	- size    - size in question
+ *	- start   - virtual start address of region
+ *	- end     - virtual end address of region
  */
-SYM_FUNC_START(__clean_dcache_area_pou)
+SYM_FUNC_START(dcache_clean_pou)
 alternative_if ARM64_HAS_CACHE_IDC
 	dsb	ishst
 	ret
 alternative_else_nop_endif
 	dcache_by_line_op cvau, ish, x0, x1, x2, x3
 	ret
-SYM_FUNC_END(__clean_dcache_area_pou)
+SYM_FUNC_END(dcache_clean_pou)
 
 /*
- *	__inval_dcache_area(kaddr, size)
+ *	dcache_inval_poc(start, end)
  *
- * 	Ensure that any D-cache lines for the interval [kaddr, kaddr+size)
+ * 	Ensure that any D-cache lines for the interval [start, end)
  * 	are invalidated. Any partial lines at the ends of the interval are
  *	also cleaned to PoC to prevent data loss.
  *
- *	- kaddr   - kernel address
- *	- size    - size in question
+ *	- start   - kernel start address of region
+ *	- end     - kernel end address of region
  */
 SYM_FUNC_START_LOCAL(__dma_inv_area)
-SYM_FUNC_START_PI(__inval_dcache_area)
+SYM_FUNC_START_PI(dcache_inval_poc)
 	/* FALLTHROUGH */
 
 /*
- *	__dma_inv_area(start, size)
+ *	__dma_inv_area(start, end)
  *	- start   - virtual start address of region
- *	- size    - size in question
+ *	- end     - virtual end address of region
  */
-	add	x1, x1, x0
 	dcache_line_size x2, x3
 	sub	x3, x2, #1
 	tst	x1, x3				// end cache line aligned?
@@ -165,48 +166,48 @@ SYM_FUNC_START_PI(__inval_dcache_area)
 	b.lo	2b
 	dsb	sy
 	ret
-SYM_FUNC_END_PI(__inval_dcache_area)
+SYM_FUNC_END_PI(dcache_inval_poc)
 SYM_FUNC_END(__dma_inv_area)
 
 /*
- *	__clean_dcache_area_poc(kaddr, size)
+ *	dcache_clean_poc(start, end)
  *
- * 	Ensure that any D-cache lines for the interval [kaddr, kaddr+size)
+ * 	Ensure that any D-cache lines for the interval [start, end)
  * 	are cleaned to the PoC.
  *
- *	- kaddr   - kernel address
- *	- size    - size in question
+ *	- start   - virtual start address of region
+ *	- end     - virtual end address of region
  */
 SYM_FUNC_START_LOCAL(__dma_clean_area)
-SYM_FUNC_START_PI(__clean_dcache_area_poc)
+SYM_FUNC_START_PI(dcache_clean_poc)
 	/* FALLTHROUGH */
 
 /*
- *	__dma_clean_area(start, size)
+ *	__dma_clean_area(start, end)
  *	- start   - virtual start address of region
- *	- size    - size in question
+ *	- end     - virtual end address of region
  */
 	dcache_by_line_op cvac, sy, x0, x1, x2, x3
 	ret
-SYM_FUNC_END_PI(__clean_dcache_area_poc)
+SYM_FUNC_END_PI(dcache_clean_poc)
 SYM_FUNC_END(__dma_clean_area)
 
 /*
- *	__clean_dcache_area_pop(kaddr, size)
+ *	dcache_clean_pop(start, end)
  *
- * 	Ensure that any D-cache lines for the interval [kaddr, kaddr+size)
+ * 	Ensure that any D-cache lines for the interval [start, end)
  * 	are cleaned to the PoP.
  *
- *	- kaddr   - kernel address
- *	- size    - size in question
+ *	- start   - virtual start address of region
+ *	- end     - virtual end address of region
  */
-SYM_FUNC_START_PI(__clean_dcache_area_pop)
+SYM_FUNC_START_PI(dcache_clean_pop)
 	alternative_if_not ARM64_HAS_DCPOP
-	b	__clean_dcache_area_poc
+	b	dcache_clean_poc
 	alternative_else_nop_endif
 	dcache_by_line_op cvap, sy, x0, x1, x2, x3
 	ret
-SYM_FUNC_END_PI(__clean_dcache_area_pop)
+SYM_FUNC_END_PI(dcache_clean_pop)
 
 /*
  *	__dma_flush_area(start, size)
@@ -217,6 +218,7 @@ SYM_FUNC_END_PI(__clean_dcache_area_pop)
  *	- size    - size in question
  */
 SYM_FUNC_START_PI(__dma_flush_area)
+	add	x1, x0, x1
 	dcache_by_line_op civac, sy, x0, x1, x2, x3
 	ret
 SYM_FUNC_END_PI(__dma_flush_area)
@@ -228,6 +230,7 @@ SYM_FUNC_END_PI(__dma_flush_area)
  *	- dir	- DMA direction
  */
 SYM_FUNC_START_PI(__dma_map_area)
+	add	x1, x0, x1
 	cmp	w2, #DMA_FROM_DEVICE
 	b.eq	__dma_inv_area
 	b	__dma_clean_area
@@ -240,6 +243,7 @@ SYM_FUNC_END_PI(__dma_map_area)
  *	- dir	- DMA direction
  */
 SYM_FUNC_START_PI(__dma_unmap_area)
+	add	x1, x0, x1
 	cmp	w2, #DMA_TO_DEVICE
 	b.ne	__dma_inv_area
 	ret
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 001737a8f309..cd72576ae2b7 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -402,14 +402,12 @@ static int asids_init(void)
 {
 	asid_bits = get_cpu_asid_bits();
 	atomic64_set(&asid_generation, ASID_FIRST_VERSION);
-	asid_map = kcalloc(BITS_TO_LONGS(NUM_USER_ASIDS), sizeof(*asid_map),
-			   GFP_KERNEL);
+	asid_map = bitmap_zalloc(NUM_USER_ASIDS, GFP_KERNEL);
 	if (!asid_map)
 		panic("Failed to allocate bitmap for %lu ASIDs\n",
 		      NUM_USER_ASIDS);
 
-	pinned_asid_map = kcalloc(BITS_TO_LONGS(NUM_USER_ASIDS),
-				  sizeof(*pinned_asid_map), GFP_KERNEL);
+	pinned_asid_map = bitmap_zalloc(NUM_USER_ASIDS, GFP_KERNEL);
 	nr_pinned_asids = 0;
 
 	/*
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 871c82ab0a30..349c488765ca 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -99,6 +99,8 @@ static void mem_abort_decode(unsigned int esr)
 	pr_alert("  EA = %lu, S1PTW = %lu\n",
 		 (esr & ESR_ELx_EA) >> ESR_ELx_EA_SHIFT,
 		 (esr & ESR_ELx_S1PTW) >> ESR_ELx_S1PTW_SHIFT);
+	pr_alert("  FSC = 0x%02x: %s\n", (esr & ESR_ELx_FSC),
+		 esr_to_fault_info(esr)->name);
 
 	if (esr_is_data_abort(esr))
 		data_abort_decode(esr);
@@ -232,13 +234,17 @@ static bool is_el1_instruction_abort(unsigned int esr)
 	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR;
 }
 
+static bool is_el1_data_abort(unsigned int esr)
+{
+	return ESR_ELx_EC(esr) == ESR_ELx_EC_DABT_CUR;
+}
+
 static inline bool is_el1_permission_fault(unsigned long addr, unsigned int esr,
 					   struct pt_regs *regs)
 {
-	unsigned int ec       = ESR_ELx_EC(esr);
 	unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE;
 
-	if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR)
+	if (!is_el1_data_abort(esr) && !is_el1_instruction_abort(esr))
 		return false;
 
 	if (fsc_type == ESR_ELx_FSC_PERM)
@@ -258,7 +264,7 @@ static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
 	unsigned long flags;
 	u64 par, dfsc;
 
-	if (ESR_ELx_EC(esr) != ESR_ELx_EC_DABT_CUR ||
+	if (!is_el1_data_abort(esr) ||
 	    (esr & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT)
 		return false;
 
@@ -346,10 +352,9 @@ static void do_tag_recovery(unsigned long addr, unsigned int esr,
 
 static bool is_el1_mte_sync_tag_check_fault(unsigned int esr)
 {
-	unsigned int ec = ESR_ELx_EC(esr);
 	unsigned int fsc = esr & ESR_ELx_FSC;
 
-	if (ec != ESR_ELx_EC_DABT_CUR)
+	if (!is_el1_data_abort(esr))
 		return false;
 
 	if (fsc == ESR_ELx_FSC_MTE)
@@ -504,7 +509,7 @@ static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr,
 	 */
 	if (!(vma->vm_flags & vm_flags))
 		return VM_FAULT_BADACCESS;
-	return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags, regs);
+	return handle_mm_fault(vma, addr, mm_flags, regs);
 }
 
 static bool is_el0_instruction_abort(unsigned int esr)
@@ -836,13 +841,6 @@ void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs)
 }
 NOKPROBE_SYMBOL(do_mem_abort);
 
-void do_el0_irq_bp_hardening(void)
-{
-	/* PC has already been checked in entry.S */
-	arm64_apply_bp_hardening();
-}
-NOKPROBE_SYMBOL(do_el0_irq_bp_hardening);
-
 void do_sp_pc_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 {
 	arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN,
@@ -921,3 +919,29 @@ void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
 	debug_exception_exit(regs);
 }
 NOKPROBE_SYMBOL(do_debug_exception);
+
+/*
+ * Used during anonymous page fault handling.
+ */
+struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
+						unsigned long vaddr)
+{
+	gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO;
+
+	/*
+	 * If the page is mapped with PROT_MTE, initialise the tags at the
+	 * point of allocation and page zeroing as this is usually faster than
+	 * separate DC ZVA and STGM.
+	 */
+	if (vma->vm_flags & VM_MTE)
+		flags |= __GFP_ZEROTAGS;
+
+	return alloc_page_vma(flags, vma, vaddr);
+}
+
+void tag_clear_highpage(struct page *page)
+{
+	mte_zero_clear_page_tags(page_address(page));
+	page_kasan_tag_reset(page);
+	set_bit(PG_mte_tagged, &page->flags);
+}
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index 6d44c028d1c9..2aaf950b906c 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -14,28 +14,25 @@
 #include <asm/cache.h>
 #include <asm/tlbflush.h>
 
-void sync_icache_aliases(void *kaddr, unsigned long len)
+void sync_icache_aliases(unsigned long start, unsigned long end)
 {
-	unsigned long addr = (unsigned long)kaddr;
-
 	if (icache_is_aliasing()) {
-		__clean_dcache_area_pou(kaddr, len);
-		__flush_icache_all();
+		dcache_clean_pou(start, end);
+		icache_inval_all_pou();
 	} else {
 		/*
 		 * Don't issue kick_all_cpus_sync() after I-cache invalidation
 		 * for user mappings.
 		 */
-		__flush_icache_range(addr, addr + len);
+		caches_clean_inval_pou(start, end);
 	}
 }
 
-static void flush_ptrace_access(struct vm_area_struct *vma, struct page *page,
-				unsigned long uaddr, void *kaddr,
-				unsigned long len)
+static void flush_ptrace_access(struct vm_area_struct *vma, unsigned long start,
+				unsigned long end)
 {
 	if (vma->vm_flags & VM_EXEC)
-		sync_icache_aliases(kaddr, len);
+		sync_icache_aliases(start, end);
 }
 
 /*
@@ -48,7 +45,7 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
 		       unsigned long len)
 {
 	memcpy(dst, src, len);
-	flush_ptrace_access(vma, page, uaddr, dst, len);
+	flush_ptrace_access(vma, (unsigned long)dst, (unsigned long)dst + len);
 }
 
 void __sync_icache_dcache(pte_t pte)
@@ -56,7 +53,9 @@ void __sync_icache_dcache(pte_t pte)
 	struct page *page = pte_page(pte);
 
 	if (!test_bit(PG_dcache_clean, &page->flags)) {
-		sync_icache_aliases(page_address(page), page_size(page));
+		sync_icache_aliases((unsigned long)page_address(page),
+				    (unsigned long)page_address(page) +
+					    page_size(page));
 		set_bit(PG_dcache_clean, &page->flags);
 	}
 }
@@ -77,20 +76,20 @@ EXPORT_SYMBOL(flush_dcache_page);
 /*
  * Additional functions defined in assembly.
  */
-EXPORT_SYMBOL(__flush_icache_range);
+EXPORT_SYMBOL(caches_clean_inval_pou);
 
 #ifdef CONFIG_ARCH_HAS_PMEM_API
 void arch_wb_cache_pmem(void *addr, size_t size)
 {
 	/* Ensure order against any prior non-cacheable writes */
 	dmb(osh);
-	__clean_dcache_area_pop(addr, size);
+	dcache_clean_pop((unsigned long)addr, (unsigned long)addr + size);
 }
 EXPORT_SYMBOL_GPL(arch_wb_cache_pmem);
 
 void arch_invalidate_pmem(void *addr, size_t size)
 {
-	__inval_dcache_area(addr, size);
+	dcache_inval_poc((unsigned long)addr, (unsigned long)addr + size);
 }
 EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
 #endif
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 49019ea0c8a8..8490ed2917ff 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -474,6 +474,13 @@ void __init mem_init(void)
 	BUILD_BUG_ON(TASK_SIZE_32 > DEFAULT_MAP_WINDOW_64);
 #endif
 
+	/*
+	 * Selected page table levels should match when derived from
+	 * scratch using the virtual address range and page size.
+	 */
+	BUILD_BUG_ON(ARM64_HW_PGTABLE_LEVELS(CONFIG_ARM64_VA_BITS) !=
+		     CONFIG_PGTABLE_LEVELS);
+
 	if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) {
 		extern int sysctl_overcommit_memory;
 		/*
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 7553a7eab3db..595fde9a47dd 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -228,7 +228,7 @@ static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end,
 		next = pmd_addr_end(addr, end);
 
 		/* try section mapping first */
-		if (((addr | next | phys) & ~SECTION_MASK) == 0 &&
+		if (((addr | next | phys) & ~PMD_MASK) == 0 &&
 		    (flags & NO_BLOCK_MAPPINGS) == 0) {
 			pmd_set_huge(pmdp, phys, prot);
 
@@ -1114,14 +1114,14 @@ static void free_empty_tables(unsigned long addr, unsigned long end,
 }
 #endif
 
-#if !ARM64_SWAPPER_USES_SECTION_MAPS
+#if !ARM64_KERNEL_USES_PMD_MAPS
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 		struct vmem_altmap *altmap)
 {
 	WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
 	return vmemmap_populate_basepages(start, end, node, altmap);
 }
-#else	/* !ARM64_SWAPPER_USES_SECTION_MAPS */
+#else	/* !ARM64_KERNEL_USES_PMD_MAPS */
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 		struct vmem_altmap *altmap)
 {
@@ -1166,17 +1166,18 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 
 	return 0;
 }
-#endif	/* !ARM64_SWAPPER_USES_SECTION_MAPS */
+#endif	/* !ARM64_KERNEL_USES_PMD_MAPS */
+
+#ifdef CONFIG_MEMORY_HOTPLUG
 void vmemmap_free(unsigned long start, unsigned long end,
 		struct vmem_altmap *altmap)
 {
-#ifdef CONFIG_MEMORY_HOTPLUG
 	WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
 
 	unmap_hotplug_range(start, end, true, altmap);
 	free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END);
-#endif
 }
+#endif /* CONFIG_MEMORY_HOTPLUG */
 
 static inline pud_t *fixmap_pud(unsigned long addr)
 {
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 97d7bcd8d4f2..35936c5ae1ce 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -46,9 +46,13 @@
 #endif
 
 #ifdef CONFIG_KASAN_HW_TAGS
-#define TCR_KASAN_HW_FLAGS SYS_TCR_EL1_TCMA1 | TCR_TBI1 | TCR_TBID1
+#define TCR_MTE_FLAGS SYS_TCR_EL1_TCMA1 | TCR_TBI1 | TCR_TBID1
 #else
-#define TCR_KASAN_HW_FLAGS 0
+/*
+ * The mte_zero_clear_page_tags() implementation uses DC GZVA, which relies on
+ * TBI being enabled at EL1.
+ */
+#define TCR_MTE_FLAGS TCR_TBI1 | TCR_TBID1
 #endif
 
 /*
@@ -58,10 +62,8 @@
 #define MAIR_EL1_SET							\
 	(MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRnE, MT_DEVICE_nGnRnE) |	\
 	 MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRE, MT_DEVICE_nGnRE) |	\
-	 MAIR_ATTRIDX(MAIR_ATTR_DEVICE_GRE, MT_DEVICE_GRE) |		\
 	 MAIR_ATTRIDX(MAIR_ATTR_NORMAL_NC, MT_NORMAL_NC) |		\
 	 MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) |			\
-	 MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT) |		\
 	 MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL_TAGGED))
 
 #ifdef CONFIG_CPU_PM
@@ -83,11 +85,7 @@ SYM_FUNC_START(cpu_do_suspend)
 	mrs	x9, mdscr_el1
 	mrs	x10, oslsr_el1
 	mrs	x11, sctlr_el1
-alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
-	mrs	x12, tpidr_el1
-alternative_else
-	mrs	x12, tpidr_el2
-alternative_endif
+	get_this_cpu_offset x12
 	mrs	x13, sp_el0
 	stp	x2, x3, [x0]
 	stp	x4, x5, [x0, #16]
@@ -145,11 +143,7 @@ SYM_FUNC_START(cpu_do_resume)
 	msr	mdscr_el1, x10
 
 	msr	sctlr_el1, x12
-alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
-	msr	tpidr_el1, x13
-alternative_else
-	msr	tpidr_el2, x13
-alternative_endif
+	set_this_cpu_offset x13
 	msr	sp_el0, x14
 	/*
 	 * Restore oslsr_el1 by writing oslar_el1
@@ -464,7 +458,7 @@ SYM_FUNC_START(__cpu_setup)
 	msr_s	SYS_TFSRE0_EL1, xzr
 
 	/* set the TCR_EL1 bits */
-	mov_q	x10, TCR_KASAN_HW_FLAGS
+	mov_q	x10, TCR_MTE_FLAGS
 	orr	tcr, tcr, x10
 1:
 #endif
diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c
index a1937dfff31c..1c403536c9bb 100644
--- a/arch/arm64/mm/ptdump.c
+++ b/arch/arm64/mm/ptdump.c
@@ -159,10 +159,6 @@ static const struct prot_bits pte_bits[] = {
 		.set	= "DEVICE/nGnRE",
 	}, {
 		.mask	= PTE_ATTRINDX_MASK,
-		.val	= PTE_ATTRINDX(MT_DEVICE_GRE),
-		.set	= "DEVICE/GRE",
-	}, {
-		.mask	= PTE_ATTRINDX_MASK,
 		.val	= PTE_ATTRINDX(MT_NORMAL_NC),
 		.set	= "MEM/NORMAL-NC",
 	}, {
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index f7b194878a99..dccf98a37283 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -16,6 +16,7 @@
 #include <asm/byteorder.h>
 #include <asm/cacheflush.h>
 #include <asm/debug-monitors.h>
+#include <asm/insn.h>
 #include <asm/set_memory.h>
 
 #include "bpf_jit.h"
@@ -178,9 +179,6 @@ static bool is_addsub_imm(u32 imm)
 	return !(imm & ~0xfff) || !(imm & ~0xfff000);
 }
 
-/* Stack must be multiples of 16B */
-#define STACK_ALIGN(sz) (((sz) + 15) & ~15)
-
 /* Tail call offset to jump into */
 #if IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)
 #define PROLOGUE_OFFSET 8
@@ -255,7 +253,8 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf)
 			emit(A64_BTI_J, ctx);
 	}
 
-	ctx->stack_size = STACK_ALIGN(prog->aux->stack_depth);
+	/* Stack must be multiples of 16B */
+	ctx->stack_size = round_up(prog->aux->stack_depth, 16);
 
 	/* Set up function call stack */
 	emit(A64_SUB_I(1, A64_SP, A64_SP, ctx->stack_size), ctx);
@@ -487,17 +486,12 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
 		break;
 	case BPF_ALU | BPF_DIV | BPF_X:
 	case BPF_ALU64 | BPF_DIV | BPF_X:
+		emit(A64_UDIV(is64, dst, dst, src), ctx);
+		break;
 	case BPF_ALU | BPF_MOD | BPF_X:
 	case BPF_ALU64 | BPF_MOD | BPF_X:
-		switch (BPF_OP(code)) {
-		case BPF_DIV:
-			emit(A64_UDIV(is64, dst, dst, src), ctx);
-			break;
-		case BPF_MOD:
-			emit(A64_UDIV(is64, tmp, dst, src), ctx);
-			emit(A64_MSUB(is64, dst, dst, tmp, src), ctx);
-			break;
-		}
+		emit(A64_UDIV(is64, tmp, dst, src), ctx);
+		emit(A64_MSUB(is64, dst, dst, tmp, src), ctx);
 		break;
 	case BPF_ALU | BPF_LSH | BPF_X:
 	case BPF_ALU64 | BPF_LSH | BPF_X:
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index 21fbdda7086e..49305c2e6dfd 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -3,7 +3,8 @@
 # Internal CPU capabilities constants, keep this list sorted
 
 BTI
-HAS_32BIT_EL0
+# Unreliable: use system_supports_32bit_el0() instead.
+HAS_32BIT_EL0_DO_NOT_USE
 HAS_32BIT_EL1
 HAS_ADDRESS_AUTH
 HAS_ADDRESS_AUTH_ARCH