Diffstat (limited to 'arch/arm64')
-rw-r--r--  arch/arm64/Kconfig | 5
-rw-r--r--  arch/arm64/boot/dts/allwinner/sun55i-a523.dtsi | 6
-rw-r--r--  arch/arm64/boot/dts/allwinner/sun55i-a527-cubie-a5e.dts | 4
-rw-r--r--  arch/arm64/boot/dts/allwinner/sun55i-t527-avaota-a1.dts | 4
-rw-r--r--  arch/arm64/boot/dts/apple/spi1-nvram.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/apple/t8103-j293.dts | 2
-rw-r--r--  arch/arm64/boot/dts/apple/t8103-jxxx.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/apple/t8103.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/apple/t8112-j493.dts | 2
-rw-r--r--  arch/arm64/boot/dts/apple/t8112.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi | 3
-rw-r--r--  arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi | 1
-rw-r--r--  arch/arm64/boot/dts/freescale/imx8mp-venice-gw71xx.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts | 2
-rw-r--r--  arch/arm64/boot/dts/freescale/imx95-15x15-evk.dts | 20
-rw-r--r--  arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts | 12
-rw-r--r--  arch/arm64/boot/dts/freescale/imx95.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts | 2
-rw-r--r--  arch/arm64/boot/dts/qcom/x1e80100-pmics.dtsi | 1
-rw-r--r--  arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi | 23
-rw-r--r--  arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts | 1
-rw-r--r--  arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts | 3
-rw-r--r--  arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts | 28
-rw-r--r--  arch/arm64/boot/dts/rockchip/rk3576.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/rockchip/rk3588-base-pinctrl.dtsi | 20
-rw-r--r--  arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi | 1
-rw-r--r--  arch/arm64/boot/dts/rockchip/rk3588-extra-pinctrl.dtsi | 5
-rw-r--r--  arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts | 1
-rw-r--r--  arch/arm64/boot/dts/rockchip/rockchip-pinconf.dtsi | 35
-rw-r--r--  arch/arm64/configs/defconfig | 5
-rw-r--r--  arch/arm64/crypto/Kconfig | 30
-rw-r--r--  arch/arm64/crypto/Makefile | 17
-rw-r--r--  arch/arm64/crypto/sha1-ce-core.S | 150
-rw-r--r--  arch/arm64/crypto/sha1-ce-glue.c | 118
-rw-r--r--  arch/arm64/crypto/sha512-ce-core.S | 206
-rw-r--r--  arch/arm64/crypto/sha512-ce-glue.c | 96
-rw-r--r--  arch/arm64/crypto/sha512-glue.c | 83
-rw-r--r--  arch/arm64/include/asm/acpi.h | 2
-rw-r--r--  arch/arm64/include/asm/assembler.h | 5
-rw-r--r--  arch/arm64/include/asm/el2_setup.h | 19
-rw-r--r--  arch/arm64/include/asm/kvm_emulate.h | 62
-rw-r--r--  arch/arm64/include/asm/kvm_host.h | 7
-rw-r--r--  arch/arm64/kernel/Makefile | 3
-rw-r--r--  arch/arm64/kernel/cpufeature.c | 57
-rw-r--r--  arch/arm64/kernel/efi.c | 11
-rw-r--r--  arch/arm64/kernel/entry.S | 8
-rw-r--r--  arch/arm64/kernel/pi/Makefile | 2
-rw-r--r--  arch/arm64/kernel/process.c | 9
-rw-r--r--  arch/arm64/kernel/ptrace.c | 54
-rw-r--r--  arch/arm64/kernel/smp.c | 2
-rw-r--r--  arch/arm64/kernel/vdso/Makefile | 3
-rw-r--r--  arch/arm64/kvm/arm.c | 19
-rw-r--r--  arch/arm64/kvm/fpsimd.c | 26
-rw-r--r--  arch/arm64/kvm/hyp/include/hyp/switch.h | 147
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/Makefile | 2
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/hyp-main.c | 5
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/mem_protect.c | 20
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/switch.c | 59
-rw-r--r--  arch/arm64/kvm/hyp/vhe/switch.c | 107
-rw-r--r--  arch/arm64/kvm/nested.c | 26
-rw-r--r--  arch/arm64/kvm/sys_regs.c | 2
-rw-r--r--  arch/arm64/kvm/vgic/vgic-v3-nested.c | 85
-rw-r--r--  arch/arm64/lib/.gitignore | 4
-rw-r--r--  arch/arm64/lib/Makefile | 9
-rw-r--r--  arch/arm64/lib/crc-t10dif-core.S | 469
-rw-r--r--  arch/arm64/lib/crc-t10dif.c | 73
-rw-r--r--  arch/arm64/lib/crc32-core.S | 362
-rw-r--r--  arch/arm64/lib/crc32.c | 99
-rw-r--r--  arch/arm64/lib/crypto/.gitignore | 3
-rw-r--r--  arch/arm64/lib/crypto/Kconfig | 20
-rw-r--r--  arch/arm64/lib/crypto/Makefile | 24
-rw-r--r--  arch/arm64/lib/crypto/chacha-neon-core.S | 805
-rw-r--r--  arch/arm64/lib/crypto/chacha-neon-glue.c | 119
-rw-r--r--  arch/arm64/lib/crypto/poly1305-armv8.pl | 917
-rw-r--r--  arch/arm64/lib/crypto/poly1305-glue.c | 73
-rw-r--r--  arch/arm64/lib/crypto/sha2-armv8.pl | 786
-rw-r--r--  arch/arm64/lib/crypto/sha256-ce.S | 136
-rw-r--r--  arch/arm64/lib/crypto/sha256.c | 75
-rw-r--r--  arch/arm64/mm/fault.c | 30
-rw-r--r--  arch/arm64/mm/mmu.c | 3
-rw-r--r--  arch/arm64/mm/proc.S | 1
-rw-r--r--  arch/arm64/tools/syscall_32.tbl | 2
84 files changed, 525 insertions, 5131 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 55fc331af337..7c456aa838b9 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -21,8 +21,6 @@ config ARM64
select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
select ARCH_HAS_CACHE_LINE_SIZE
select ARCH_HAS_CC_PLATFORM
- select ARCH_HAS_CRC32
- select ARCH_HAS_CRC_T10DIF if KERNEL_MODE_NEON
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE
@@ -187,12 +185,12 @@ config ARM64
select HAVE_ARCH_KCSAN if EXPERT
select HAVE_ARCH_KFENCE
select HAVE_ARCH_KGDB
+ select HAVE_ARCH_KSTACK_ERASE
select HAVE_ARCH_MMAP_RND_BITS
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
select HAVE_ARCH_PREL32_RELOCATIONS
select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
select HAVE_ARCH_SECCOMP_FILTER
- select HAVE_ARCH_STACKLEAK
select HAVE_ARCH_THREAD_STRUCT_WHITELIST
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
@@ -256,6 +254,7 @@ config ARM64
select HOTPLUG_SMT if HOTPLUG_CPU
select IRQ_DOMAIN
select IRQ_FORCED_THREADING
+ select JUMP_LABEL
select KASAN_VMALLOC if KASAN
select LOCK_MM_AND_FIND_VMA
select MODULES_USE_ELF_RELA
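
JUMP_LABEL is now selected unconditionally here, and the arch/arm64/kernel/Makefile hunk further down moves jump_label.o into obj-y to match, so static keys always use the runtime-patched branch implementation rather than the conditional-load fallback. A minimal sketch of the generic static-key API this guarantees is available — demo_key and do_rare_work() are hypothetical names for illustration, not code from this patch:

    #include <linux/jump_label.h>

    static DEFINE_STATIC_KEY_FALSE(demo_key);       /* hypothetical example key */

    static void do_rare_work(void) { }              /* hypothetical slow path */

    void demo_fast_path(void)
    {
            /* Compiles to a NOP that is patched into a branch once the key
             * is enabled, e.g. via static_branch_enable(&demo_key). */
            if (static_branch_unlikely(&demo_key))
                    do_rare_work();
    }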
diff --git a/arch/arm64/boot/dts/allwinner/sun55i-a523.dtsi b/arch/arm64/boot/dts/allwinner/sun55i-a523.dtsi
index 8b7cbc2e78f5..51cd148f4227 100644
--- a/arch/arm64/boot/dts/allwinner/sun55i-a523.dtsi
+++ b/arch/arm64/boot/dts/allwinner/sun55i-a523.dtsi
@@ -131,7 +131,7 @@
"PH5", "PH6", "PH7", "PH9", "PH10",
"PH14", "PH15", "PH16", "PH17", "PH18";
allwinner,pinmux = <5>;
- function = "emac0";
+ function = "gmac0";
drive-strength = <40>;
bias-disable;
};
@@ -540,8 +540,8 @@
status = "disabled";
};
- emac0: ethernet@4500000 {
- compatible = "allwinner,sun55i-a523-emac0",
+ gmac0: ethernet@4500000 {
+ compatible = "allwinner,sun55i-a523-gmac0",
"allwinner,sun50i-a64-emac";
reg = <0x04500000 0x10000>;
clocks = <&ccu CLK_BUS_EMAC0>;
diff --git a/arch/arm64/boot/dts/allwinner/sun55i-a527-cubie-a5e.dts b/arch/arm64/boot/dts/allwinner/sun55i-a527-cubie-a5e.dts
index 0f58d92a6adc..8bc0f2c72a24 100644
--- a/arch/arm64/boot/dts/allwinner/sun55i-a527-cubie-a5e.dts
+++ b/arch/arm64/boot/dts/allwinner/sun55i-a527-cubie-a5e.dts
@@ -12,7 +12,7 @@
compatible = "radxa,cubie-a5e", "allwinner,sun55i-a527";
aliases {
- ethernet0 = &emac0;
+ ethernet0 = &gmac0;
serial0 = &uart0;
};
@@ -55,7 +55,7 @@
status = "okay";
};
-&emac0 {
+&gmac0 {
phy-mode = "rgmii-id";
phy-handle = <&ext_rgmii_phy>;
phy-supply = <&reg_cldo3>;
diff --git a/arch/arm64/boot/dts/allwinner/sun55i-t527-avaota-a1.dts b/arch/arm64/boot/dts/allwinner/sun55i-t527-avaota-a1.dts
index 08127f0cdd35..142177c1f737 100644
--- a/arch/arm64/boot/dts/allwinner/sun55i-t527-avaota-a1.dts
+++ b/arch/arm64/boot/dts/allwinner/sun55i-t527-avaota-a1.dts
@@ -12,7 +12,7 @@
compatible = "yuzukihd,avaota-a1", "allwinner,sun55i-t527";
aliases {
- ethernet0 = &emac0;
+ ethernet0 = &gmac0;
serial0 = &uart0;
};
@@ -65,7 +65,7 @@
status = "okay";
};
-&emac0 {
+&gmac0 {
phy-mode = "rgmii-id";
phy-handle = <&ext_rgmii_phy>;
phy-supply = <&reg_dcdc4>;
diff --git a/arch/arm64/boot/dts/apple/spi1-nvram.dtsi b/arch/arm64/boot/dts/apple/spi1-nvram.dtsi
index 3df2fd3993b5..9740fbf200f0 100644
--- a/arch/arm64/boot/dts/apple/spi1-nvram.dtsi
+++ b/arch/arm64/boot/dts/apple/spi1-nvram.dtsi
@@ -20,8 +20,6 @@
compatible = "jedec,spi-nor";
reg = <0x0>;
spi-max-frequency = <25000000>;
- #address-cells = <1>;
- #size-cells = <1>;
partitions {
compatible = "fixed-partitions";
diff --git a/arch/arm64/boot/dts/apple/t8103-j293.dts b/arch/arm64/boot/dts/apple/t8103-j293.dts
index e2d9439397f7..5b3c42e9f0e6 100644
--- a/arch/arm64/boot/dts/apple/t8103-j293.dts
+++ b/arch/arm64/boot/dts/apple/t8103-j293.dts
@@ -100,6 +100,8 @@
&displaydfr_mipi {
status = "okay";
+ #address-cells = <1>;
+ #size-cells = <0>;
dfr_panel: panel@0 {
compatible = "apple,j293-summit", "apple,summit";
diff --git a/arch/arm64/boot/dts/apple/t8103-jxxx.dtsi b/arch/arm64/boot/dts/apple/t8103-jxxx.dtsi
index 8e82231acab5..0c8206156bfe 100644
--- a/arch/arm64/boot/dts/apple/t8103-jxxx.dtsi
+++ b/arch/arm64/boot/dts/apple/t8103-jxxx.dtsi
@@ -71,7 +71,7 @@
*/
&port00 {
bus-range = <1 1>;
- wifi0: network@0,0 {
+ wifi0: wifi@0,0 {
compatible = "pci14e4,4425";
reg = <0x10000 0x0 0x0 0x0 0x0>;
/* To be filled by the loader */
diff --git a/arch/arm64/boot/dts/apple/t8103.dtsi b/arch/arm64/boot/dts/apple/t8103.dtsi
index 20faf0c0d809..3a204845b85b 100644
--- a/arch/arm64/boot/dts/apple/t8103.dtsi
+++ b/arch/arm64/boot/dts/apple/t8103.dtsi
@@ -405,8 +405,6 @@
compatible = "apple,t8103-display-pipe-mipi", "apple,h7-display-pipe-mipi";
reg = <0x2 0x28600000 0x0 0x100000>;
power-domains = <&ps_mipi_dsi>;
- #address-cells = <1>;
- #size-cells = <0>;
status = "disabled";
ports {
diff --git a/arch/arm64/boot/dts/apple/t8112-j493.dts b/arch/arm64/boot/dts/apple/t8112-j493.dts
index be86d34c6696..fb8ad7d4c65a 100644
--- a/arch/arm64/boot/dts/apple/t8112-j493.dts
+++ b/arch/arm64/boot/dts/apple/t8112-j493.dts
@@ -63,6 +63,8 @@
&displaydfr_mipi {
status = "okay";
+ #address-cells = <1>;
+ #size-cells = <0>;
dfr_panel: panel@0 {
compatible = "apple,j493-summit", "apple,summit";
diff --git a/arch/arm64/boot/dts/apple/t8112.dtsi b/arch/arm64/boot/dts/apple/t8112.dtsi
index e95711d8337f..f68354194355 100644
--- a/arch/arm64/boot/dts/apple/t8112.dtsi
+++ b/arch/arm64/boot/dts/apple/t8112.dtsi
@@ -420,8 +420,6 @@
compatible = "apple,t8112-display-pipe-mipi", "apple,h7-display-pipe-mipi";
reg = <0x2 0x28600000 0x0 0x100000>;
power-domains = <&ps_mipi_dsi>;
- #address-cells = <1>;
- #size-cells = <0>;
status = "disabled";
ports {
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
index 0baf256b4400..983b2f0e8797 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
@@ -687,11 +687,12 @@
};
wdog0: watchdog@2ad0000 {
- compatible = "fsl,imx21-wdt";
+ compatible = "fsl,ls1046a-wdt", "fsl,imx21-wdt";
reg = <0x0 0x2ad0000 0x0 0x10000>;
interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clockgen QORIQ_CLK_PLATFORM_PLL
QORIQ_CLK_PLL_DIV(2)>;
+ big-endian;
};
edma0: dma-controller@2c00000 {
diff --git a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi
index d29710772569..1594ce9182a5 100644
--- a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi
@@ -464,6 +464,7 @@
};
reg_nvcc_sd: LDO5 {
+ regulator-always-on;
regulator-max-microvolt = <3300000>;
regulator-min-microvolt = <1800000>;
regulator-name = "On-module +V3.3_1.8_SD (LDO5)";
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw71xx.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw71xx.dtsi
index 2f740d74707b..4bf818873fe3 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw71xx.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw71xx.dtsi
@@ -70,7 +70,7 @@
tpm@1 {
compatible = "atmel,attpm20p", "tcg,tpm_tis-spi";
reg = <0x1>;
- spi-max-frequency = <36000000>;
+ spi-max-frequency = <25000000>;
};
};
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi
index 5ab3ffe9931d..cf747ec6fa16 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi
@@ -110,7 +110,7 @@
tpm@1 {
compatible = "atmel,attpm20p", "tcg,tpm_tis-spi";
reg = <0x1>;
- spi-max-frequency = <36000000>;
+ spi-max-frequency = <25000000>;
};
};
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi
index e2b5e7ac3e46..5eb114d2360a 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi
@@ -122,7 +122,7 @@
tpm@1 {
compatible = "atmel,attpm20p", "tcg,tpm_tis-spi";
reg = <0x1>;
- spi-max-frequency = <36000000>;
+ spi-max-frequency = <25000000>;
};
};
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts
index 6daa2313f879..568d24265ddf 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts
+++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts
@@ -201,7 +201,7 @@
tpm@0 {
compatible = "atmel,attpm20p", "tcg,tpm_tis-spi";
reg = <0x0>;
- spi-max-frequency = <36000000>;
+ spi-max-frequency = <25000000>;
};
};
diff --git a/arch/arm64/boot/dts/freescale/imx95-15x15-evk.dts b/arch/arm64/boot/dts/freescale/imx95-15x15-evk.dts
index 6c47f4b47356..9f4d0899a94d 100644
--- a/arch/arm64/boot/dts/freescale/imx95-15x15-evk.dts
+++ b/arch/arm64/boot/dts/freescale/imx95-15x15-evk.dts
@@ -574,17 +574,17 @@
&scmi_iomuxc {
pinctrl_emdio: emdiogrp {
fsl,pins = <
- IMX95_PAD_ENET2_MDC__NETCMIX_TOP_NETC_MDC 0x57e
- IMX95_PAD_ENET2_MDIO__NETCMIX_TOP_NETC_MDIO 0x97e
+ IMX95_PAD_ENET2_MDC__NETCMIX_TOP_NETC_MDC 0x50e
+ IMX95_PAD_ENET2_MDIO__NETCMIX_TOP_NETC_MDIO 0x90e
>;
};
pinctrl_enetc0: enetc0grp {
fsl,pins = <
- IMX95_PAD_ENET1_TD3__NETCMIX_TOP_ETH0_RGMII_TD3 0x57e
- IMX95_PAD_ENET1_TD2__NETCMIX_TOP_ETH0_RGMII_TD2 0x57e
- IMX95_PAD_ENET1_TD1__NETCMIX_TOP_ETH0_RGMII_TD1 0x57e
- IMX95_PAD_ENET1_TD0__NETCMIX_TOP_ETH0_RGMII_TD0 0x57e
+ IMX95_PAD_ENET1_TD3__NETCMIX_TOP_ETH0_RGMII_TD3 0x50e
+ IMX95_PAD_ENET1_TD2__NETCMIX_TOP_ETH0_RGMII_TD2 0x50e
+ IMX95_PAD_ENET1_TD1__NETCMIX_TOP_ETH0_RGMII_TD1 0x50e
+ IMX95_PAD_ENET1_TD0__NETCMIX_TOP_ETH0_RGMII_TD0 0x50e
IMX95_PAD_ENET1_TX_CTL__NETCMIX_TOP_ETH0_RGMII_TX_CTL 0x57e
IMX95_PAD_ENET1_TXC__NETCMIX_TOP_ETH0_RGMII_TX_CLK 0x58e
IMX95_PAD_ENET1_RX_CTL__NETCMIX_TOP_ETH0_RGMII_RX_CTL 0x57e
@@ -598,10 +598,10 @@
pinctrl_enetc1: enetc1grp {
fsl,pins = <
- IMX95_PAD_ENET2_TD3__NETCMIX_TOP_ETH1_RGMII_TD3 0x57e
- IMX95_PAD_ENET2_TD2__NETCMIX_TOP_ETH1_RGMII_TD2 0x57e
- IMX95_PAD_ENET2_TD1__NETCMIX_TOP_ETH1_RGMII_TD1 0x57e
- IMX95_PAD_ENET2_TD0__NETCMIX_TOP_ETH1_RGMII_TD0 0x57e
+ IMX95_PAD_ENET2_TD3__NETCMIX_TOP_ETH1_RGMII_TD3 0x50e
+ IMX95_PAD_ENET2_TD2__NETCMIX_TOP_ETH1_RGMII_TD2 0x50e
+ IMX95_PAD_ENET2_TD1__NETCMIX_TOP_ETH1_RGMII_TD1 0x50e
+ IMX95_PAD_ENET2_TD0__NETCMIX_TOP_ETH1_RGMII_TD0 0x50e
IMX95_PAD_ENET2_TX_CTL__NETCMIX_TOP_ETH1_RGMII_TX_CTL 0x57e
IMX95_PAD_ENET2_TXC__NETCMIX_TOP_ETH1_RGMII_TX_CLK 0x58e
IMX95_PAD_ENET2_RX_CTL__NETCMIX_TOP_ETH1_RGMII_RX_CTL 0x57e
diff --git a/arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts b/arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts
index 6886ea766655..d7d845231312 100644
--- a/arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts
+++ b/arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts
@@ -566,17 +566,17 @@
&scmi_iomuxc {
pinctrl_emdio: emdiogrp{
fsl,pins = <
- IMX95_PAD_ENET1_MDC__NETCMIX_TOP_NETC_MDC 0x57e
- IMX95_PAD_ENET1_MDIO__NETCMIX_TOP_NETC_MDIO 0x97e
+ IMX95_PAD_ENET1_MDC__NETCMIX_TOP_NETC_MDC 0x50e
+ IMX95_PAD_ENET1_MDIO__NETCMIX_TOP_NETC_MDIO 0x90e
>;
};
pinctrl_enetc0: enetc0grp {
fsl,pins = <
- IMX95_PAD_ENET1_TD3__NETCMIX_TOP_ETH0_RGMII_TD3 0x57e
- IMX95_PAD_ENET1_TD2__NETCMIX_TOP_ETH0_RGMII_TD2 0x57e
- IMX95_PAD_ENET1_TD1__NETCMIX_TOP_ETH0_RGMII_TD1 0x57e
- IMX95_PAD_ENET1_TD0__NETCMIX_TOP_ETH0_RGMII_TD0 0x57e
+ IMX95_PAD_ENET1_TD3__NETCMIX_TOP_ETH0_RGMII_TD3 0x50e
+ IMX95_PAD_ENET1_TD2__NETCMIX_TOP_ETH0_RGMII_TD2 0x50e
+ IMX95_PAD_ENET1_TD1__NETCMIX_TOP_ETH0_RGMII_TD1 0x50e
+ IMX95_PAD_ENET1_TD0__NETCMIX_TOP_ETH0_RGMII_TD0 0x50e
IMX95_PAD_ENET1_TX_CTL__NETCMIX_TOP_ETH0_RGMII_TX_CTL 0x57e
IMX95_PAD_ENET1_TXC__NETCMIX_TOP_ETH0_RGMII_TX_CLK 0x58e
IMX95_PAD_ENET1_RX_CTL__NETCMIX_TOP_ETH0_RGMII_RX_CTL 0x57e
diff --git a/arch/arm64/boot/dts/freescale/imx95.dtsi b/arch/arm64/boot/dts/freescale/imx95.dtsi
index 632631a29112..5aecdd9b62ff 100644
--- a/arch/arm64/boot/dts/freescale/imx95.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx95.dtsi
@@ -1708,7 +1708,7 @@
<0x9 0 1 0>;
reg-names = "dbi","atu", "dbi2", "app", "dma", "addr_space";
num-lanes = <1>;
- interrupts = <GIC_SPI 317 IRQ_TYPE_LEVEL_HIGH>;
+ interrupts = <GIC_SPI 311 IRQ_TYPE_LEVEL_HIGH>;
interrupt-names = "dma";
clocks = <&scmi_clk IMX95_CLK_HSIO>,
<&scmi_clk IMX95_CLK_HSIOPLL>,
diff --git a/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts b/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts
index ae7a275fd223..cefecb7a23cf 100644
--- a/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts
+++ b/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts
@@ -1090,6 +1090,8 @@
};
&pmk8280_rtc {
+ qcom,uefi-rtc-info;
+
status = "okay";
};
diff --git a/arch/arm64/boot/dts/qcom/x1e80100-pmics.dtsi b/arch/arm64/boot/dts/qcom/x1e80100-pmics.dtsi
index c02fd4d15c96..e3888bc143a0 100644
--- a/arch/arm64/boot/dts/qcom/x1e80100-pmics.dtsi
+++ b/arch/arm64/boot/dts/qcom/x1e80100-pmics.dtsi
@@ -224,6 +224,7 @@
reg-names = "rtc", "alarm";
interrupts = <0x0 0x62 0x1 IRQ_TYPE_EDGE_RISING>;
qcom,no-alarm; /* alarm owned by ADSP */
+ qcom,uefi-rtc-info;
};
pmk8550_sdam_2: nvram@7100 {
diff --git a/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi b/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi
index ab232e5c7ad6..4203b335a263 100644
--- a/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi
+++ b/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi
@@ -379,6 +379,18 @@
<0 RK_PA7 RK_FUNC_GPIO &pcfg_pull_up>;
};
};
+
+ spi1 {
+ spi1_csn0_gpio_pin: spi1-csn0-gpio-pin {
+ rockchip,pins =
+ <3 RK_PB1 RK_FUNC_GPIO &pcfg_pull_up_4ma>;
+ };
+
+ spi1_csn1_gpio_pin: spi1-csn1-gpio-pin {
+ rockchip,pins =
+ <3 RK_PB2 RK_FUNC_GPIO &pcfg_pull_up_4ma>;
+ };
+ };
};
&pmu_io_domains {
@@ -396,6 +408,17 @@
vqmmc-supply = <&vccio_sd>;
};
+&spi1 {
+ /*
+ * Hardware CS has a very slow rise time of about 6us,
+ * causing transmission errors.
+ * With cs-gpios we have a rise time of about 20ns.
+ */
+ cs-gpios = <&gpio3 RK_PB1 GPIO_ACTIVE_LOW>, <&gpio3 RK_PB2 GPIO_ACTIVE_LOW>;
+ pinctrl-names = "default";
+ pinctrl-0 = <&spi1_clk &spi1_csn0_gpio_pin &spi1_csn1_gpio_pin &spi1_miso &spi1_mosi>;
+};
+
&tsadc {
status = "okay";
};
diff --git a/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts b/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts
index 3c127c5c2607..a9021c524afb 100644
--- a/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts
@@ -30,6 +30,7 @@
fan: gpio_fan {
compatible = "gpio-fan";
+ fan-supply = <&vcc12v_dcin>;
gpios = <&gpio0 RK_PD5 GPIO_ACTIVE_HIGH>;
gpio-fan,speed-map =
< 0 0>,
diff --git a/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts b/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts
index 3b31f0dd8f3b..539edc3c535f 100644
--- a/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts
@@ -29,7 +29,6 @@
function-enumerator = <1>;
gpios = <&gpio3 RK_PD6 GPIO_ACTIVE_HIGH>;
label = "LAN-1";
- linux,default-trigger = "netdev";
};
led-lan2 {
@@ -39,7 +38,6 @@
function-enumerator = <2>;
gpios = <&gpio3 RK_PD7 GPIO_ACTIVE_HIGH>;
label = "LAN-2";
- linux,default-trigger = "netdev";
};
power_led: led-sys {
@@ -56,7 +54,6 @@
function = LED_FUNCTION_WAN;
gpios = <&gpio2 RK_PC1 GPIO_ACTIVE_HIGH>;
label = "WAN";
- linux,default-trigger = "netdev";
};
};
};
diff --git a/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts b/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts
index b09e789c75c4..801b40fea4e8 100644
--- a/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts
@@ -211,10 +211,38 @@
status = "okay";
};
+&cpu_b0 {
+ cpu-supply = <&vdd_cpu_big_s0>;
+};
+
+&cpu_b1 {
+ cpu-supply = <&vdd_cpu_big_s0>;
+};
+
+&cpu_b2 {
+ cpu-supply = <&vdd_cpu_big_s0>;
+};
+
+&cpu_b3 {
+ cpu-supply = <&vdd_cpu_big_s0>;
+};
+
&cpu_l0 {
cpu-supply = <&vdd_cpu_lit_s0>;
};
+&cpu_l1 {
+ cpu-supply = <&vdd_cpu_lit_s0>;
+};
+
+&cpu_l2 {
+ cpu-supply = <&vdd_cpu_lit_s0>;
+};
+
+&cpu_l3 {
+ cpu-supply = <&vdd_cpu_lit_s0>;
+};
+
&gmac0 {
phy-mode = "rgmii-id";
clock_in_out = "output";
diff --git a/arch/arm64/boot/dts/rockchip/rk3576.dtsi b/arch/arm64/boot/dts/rockchip/rk3576.dtsi
index 1086482f0479..64812e3bcb61 100644
--- a/arch/arm64/boot/dts/rockchip/rk3576.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3576.dtsi
@@ -615,7 +615,7 @@
<0 0 0 2 &pcie1_intc 1>,
<0 0 0 3 &pcie1_intc 2>,
<0 0 0 4 &pcie1_intc 3>;
- linux,pci-domain = <0>;
+ linux,pci-domain = <1>;
max-link-speed = <2>;
num-ib-windows = <8>;
num-viewport = <8>;
diff --git a/arch/arm64/boot/dts/rockchip/rk3588-base-pinctrl.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-base-pinctrl.dtsi
index 7f874c77410c..6584d73660f6 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588-base-pinctrl.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3588-base-pinctrl.dtsi
@@ -578,14 +578,14 @@
hdmim0_tx0_scl: hdmim0-tx0-scl {
rockchip,pins =
/* hdmim0_tx0_scl */
- <4 RK_PB7 5 &pcfg_pull_none>;
+ <4 RK_PB7 5 &pcfg_pull_none_drv_level_5_smt>;
};
/omit-if-no-ref/
hdmim0_tx0_sda: hdmim0-tx0-sda {
rockchip,pins =
/* hdmim0_tx0_sda */
- <4 RK_PC0 5 &pcfg_pull_none>;
+ <4 RK_PC0 5 &pcfg_pull_none_drv_level_1_smt>;
};
/omit-if-no-ref/
@@ -640,14 +640,14 @@
hdmim1_tx0_scl: hdmim1-tx0-scl {
rockchip,pins =
/* hdmim1_tx0_scl */
- <0 RK_PD5 11 &pcfg_pull_none>;
+ <0 RK_PD5 11 &pcfg_pull_none_drv_level_5_smt>;
};
/omit-if-no-ref/
hdmim1_tx0_sda: hdmim1-tx0-sda {
rockchip,pins =
/* hdmim1_tx0_sda */
- <0 RK_PD4 11 &pcfg_pull_none>;
+ <0 RK_PD4 11 &pcfg_pull_none_drv_level_1_smt>;
};
/omit-if-no-ref/
@@ -668,14 +668,14 @@
hdmim1_tx1_scl: hdmim1-tx1-scl {
rockchip,pins =
/* hdmim1_tx1_scl */
- <3 RK_PC6 5 &pcfg_pull_none>;
+ <3 RK_PC6 5 &pcfg_pull_none_drv_level_5_smt>;
};
/omit-if-no-ref/
hdmim1_tx1_sda: hdmim1-tx1-sda {
rockchip,pins =
/* hdmim1_tx1_sda */
- <3 RK_PC5 5 &pcfg_pull_none>;
+ <3 RK_PC5 5 &pcfg_pull_none_drv_level_1_smt>;
};
/omit-if-no-ref/
hdmim2_rx_cec: hdmim2-rx-cec {
@@ -709,14 +709,14 @@
hdmim2_tx0_scl: hdmim2-tx0-scl {
rockchip,pins =
/* hdmim2_tx0_scl */
- <3 RK_PC7 5 &pcfg_pull_none>;
+ <3 RK_PC7 5 &pcfg_pull_none_drv_level_5_smt>;
};
/omit-if-no-ref/
hdmim2_tx0_sda: hdmim2-tx0-sda {
rockchip,pins =
/* hdmim2_tx0_sda */
- <3 RK_PD0 5 &pcfg_pull_none>;
+ <3 RK_PD0 5 &pcfg_pull_none_drv_level_1_smt>;
};
/omit-if-no-ref/
@@ -730,14 +730,14 @@
hdmim2_tx1_scl: hdmim2-tx1-scl {
rockchip,pins =
/* hdmim2_tx1_scl */
- <1 RK_PA4 5 &pcfg_pull_none>;
+ <1 RK_PA4 5 &pcfg_pull_none_drv_level_5_smt>;
};
/omit-if-no-ref/
hdmim2_tx1_sda: hdmim2-tx1-sda {
rockchip,pins =
/* hdmim2_tx1_sda */
- <1 RK_PA3 5 &pcfg_pull_none>;
+ <1 RK_PA3 5 &pcfg_pull_none_drv_level_1_smt>;
};
/omit-if-no-ref/
diff --git a/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi
index cc37f082adea..b07543315f87 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi
@@ -321,6 +321,7 @@
bus-width = <4>;
cap-mmc-highspeed;
cap-sd-highspeed;
+ cd-gpios = <&gpio0 RK_PA4 GPIO_ACTIVE_LOW>;
disable-wp;
max-frequency = <150000000>;
no-sdio;
diff --git a/arch/arm64/boot/dts/rockchip/rk3588-extra-pinctrl.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-extra-pinctrl.dtsi
index 244c66faa161..fb48ddc04bcb 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588-extra-pinctrl.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3588-extra-pinctrl.dtsi
@@ -160,14 +160,15 @@
hdmim0_tx1_scl: hdmim0-tx1-scl {
rockchip,pins =
/* hdmim0_tx1_scl */
- <2 RK_PB5 4 &pcfg_pull_none>;
+ <2 RK_PB5 4 &pcfg_pull_none_drv_level_3_smt>;
};
/omit-if-no-ref/
hdmim0_tx1_sda: hdmim0-tx1-sda {
rockchip,pins =
/* hdmim0_tx1_sda */
- <2 RK_PB4 4 &pcfg_pull_none>;
+ <2 RK_PB4 4 &pcfg_pull_none_drv_level_1_smt>;
+
};
};
diff --git a/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts b/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts
index 8b717c4017a4..b2947b36fada 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts
@@ -474,6 +474,7 @@
bus-width = <4>;
cap-mmc-highspeed;
cap-sd-highspeed;
+ cd-gpios = <&gpio0 RK_PA4 GPIO_ACTIVE_LOW>;
disable-wp;
max-frequency = <150000000>;
no-sdio;
diff --git a/arch/arm64/boot/dts/rockchip/rockchip-pinconf.dtsi b/arch/arm64/boot/dts/rockchip/rockchip-pinconf.dtsi
index 5c645437b507..b0475b7c655a 100644
--- a/arch/arm64/boot/dts/rockchip/rockchip-pinconf.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rockchip-pinconf.dtsi
@@ -333,6 +333,41 @@
};
/omit-if-no-ref/
+ pcfg_pull_none_drv_level_1_smt: pcfg-pull-none-drv-level-1-smt {
+ bias-disable;
+ drive-strength = <1>;
+ input-schmitt-enable;
+ };
+
+ /omit-if-no-ref/
+ pcfg_pull_none_drv_level_2_smt: pcfg-pull-none-drv-level-2-smt {
+ bias-disable;
+ drive-strength = <2>;
+ input-schmitt-enable;
+ };
+
+ /omit-if-no-ref/
+ pcfg_pull_none_drv_level_3_smt: pcfg-pull-none-drv-level-3-smt {
+ bias-disable;
+ drive-strength = <3>;
+ input-schmitt-enable;
+ };
+
+ /omit-if-no-ref/
+ pcfg_pull_none_drv_level_4_smt: pcfg-pull-none-drv-level-4-smt {
+ bias-disable;
+ drive-strength = <4>;
+ input-schmitt-enable;
+ };
+
+ /omit-if-no-ref/
+ pcfg_pull_none_drv_level_5_smt: pcfg-pull-none-drv-level-5-smt {
+ bias-disable;
+ drive-strength = <5>;
+ input-schmitt-enable;
+ };
+
+ /omit-if-no-ref/
pcfg_output_high: pcfg-output-high {
output-high;
};
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 897fc686e6a9..03117405b6ce 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -1444,6 +1444,7 @@ CONFIG_PLATFORM_MHU=y
CONFIG_BCM2835_MBOX=y
CONFIG_QCOM_APCS_IPC=y
CONFIG_MTK_ADSP_MBOX=m
+CONFIG_QCOM_CPUCP_MBOX=m
CONFIG_QCOM_IPCC=y
CONFIG_ROCKCHIP_IOMMU=y
CONFIG_TEGRA_IOMMU_SMMU=y
@@ -1573,6 +1574,7 @@ CONFIG_RESET_QCOM_AOSS=y
CONFIG_RESET_QCOM_PDC=m
CONFIG_RESET_RZG2L_USBPHY_CTRL=y
CONFIG_RESET_TI_SCI=y
+CONFIG_PHY_SNPS_EUSB2=m
CONFIG_PHY_XGENE=y
CONFIG_PHY_CAN_TRANSCEIVER=m
CONFIG_PHY_NXP_PTN3222=m
@@ -1597,7 +1599,6 @@ CONFIG_PHY_QCOM_EDP=m
CONFIG_PHY_QCOM_PCIE2=m
CONFIG_PHY_QCOM_QMP=m
CONFIG_PHY_QCOM_QUSB2=m
-CONFIG_PHY_QCOM_SNPS_EUSB2=m
CONFIG_PHY_QCOM_EUSB2_REPEATER=m
CONFIG_PHY_QCOM_M31_USB=m
CONFIG_PHY_QCOM_USB_HS=m
@@ -1743,8 +1744,6 @@ CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_ANSI_CPRNG=y
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_GHASH_ARM64_CE=y
-CONFIG_CRYPTO_SHA1_ARM64_CE=y
-CONFIG_CRYPTO_SHA512_ARM64_CE=m
CONFIG_CRYPTO_SHA3_ARM64=m
CONFIG_CRYPTO_SM3_ARM64_CE=m
CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index c44b0f202a1f..3bb5b513d5ae 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -25,36 +25,6 @@ config CRYPTO_NHPOLY1305_NEON
Architecture: arm64 using:
- NEON (Advanced SIMD) extensions
-config CRYPTO_SHA1_ARM64_CE
- tristate "Hash functions: SHA-1 (ARMv8 Crypto Extensions)"
- depends on KERNEL_MODE_NEON
- select CRYPTO_HASH
- select CRYPTO_SHA1
- help
- SHA-1 secure hash algorithm (FIPS 180)
-
- Architecture: arm64 using:
- - ARMv8 Crypto Extensions
-
-config CRYPTO_SHA512_ARM64
- tristate "Hash functions: SHA-384 and SHA-512"
- select CRYPTO_HASH
- help
- SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
-
- Architecture: arm64
-
-config CRYPTO_SHA512_ARM64_CE
- tristate "Hash functions: SHA-384 and SHA-512 (ARMv8 Crypto Extensions)"
- depends on KERNEL_MODE_NEON
- select CRYPTO_HASH
- select CRYPTO_SHA512_ARM64
- help
- SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
-
- Architecture: arm64 using:
- - ARMv8 Crypto Extensions
-
config CRYPTO_SHA3_ARM64
tristate "Hash functions: SHA-3 (ARMv8.2 Crypto Extensions)"
depends on KERNEL_MODE_NEON
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index c231c980c514..a8b2cdbe202c 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -5,12 +5,6 @@
# Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
#
-obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o
-sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o
-
-obj-$(CONFIG_CRYPTO_SHA512_ARM64_CE) += sha512-ce.o
-sha512-ce-y := sha512-ce-glue.o sha512-ce-core.o
-
obj-$(CONFIG_CRYPTO_SHA3_ARM64) += sha3-ce.o
sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o
@@ -53,9 +47,6 @@ aes-ce-blk-y := aes-glue-ce.o aes-ce.o
obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o
aes-neon-blk-y := aes-glue-neon.o aes-neon.o
-obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
-sha512-arm64-y := sha512-glue.o sha512-core.o
-
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
@@ -64,11 +55,3 @@ aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
obj-$(CONFIG_CRYPTO_AES_ARM64_BS) += aes-neon-bs.o
aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
-
-quiet_cmd_perlasm = PERLASM $@
- cmd_perlasm = $(PERL) $(<) void $(@)
-
-$(obj)/sha512-core.S: $(src)/../lib/crypto/sha2-armv8.pl
- $(call cmd,perlasm)
-
-clean-files += sha512-core.S
diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S
deleted file mode 100644
index 9b1f2d82a6fe..000000000000
--- a/arch/arm64/crypto/sha1-ce-core.S
+++ /dev/null
@@ -1,150 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .text
- .arch armv8-a+crypto
-
- k0 .req v0
- k1 .req v1
- k2 .req v2
- k3 .req v3
-
- t0 .req v4
- t1 .req v5
-
- dga .req q6
- dgav .req v6
- dgb .req s7
- dgbv .req v7
-
- dg0q .req q12
- dg0s .req s12
- dg0v .req v12
- dg1s .req s13
- dg1v .req v13
- dg2s .req s14
-
- .macro add_only, op, ev, rc, s0, dg1
- .ifc \ev, ev
- add t1.4s, v\s0\().4s, \rc\().4s
- sha1h dg2s, dg0s
- .ifnb \dg1
- sha1\op dg0q, \dg1, t0.4s
- .else
- sha1\op dg0q, dg1s, t0.4s
- .endif
- .else
- .ifnb \s0
- add t0.4s, v\s0\().4s, \rc\().4s
- .endif
- sha1h dg1s, dg0s
- sha1\op dg0q, dg2s, t1.4s
- .endif
- .endm
-
- .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1
- sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s
- add_only \op, \ev, \rc, \s1, \dg1
- sha1su1 v\s0\().4s, v\s3\().4s
- .endm
-
- .macro loadrc, k, val, tmp
- movz \tmp, :abs_g0_nc:\val
- movk \tmp, :abs_g1:\val
- dup \k, \tmp
- .endm
-
- /*
- * int __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
- * int blocks)
- */
-SYM_FUNC_START(__sha1_ce_transform)
- /* load round constants */
- loadrc k0.4s, 0x5a827999, w6
- loadrc k1.4s, 0x6ed9eba1, w6
- loadrc k2.4s, 0x8f1bbcdc, w6
- loadrc k3.4s, 0xca62c1d6, w6
-
- /* load state */
- ld1 {dgav.4s}, [x0]
- ldr dgb, [x0, #16]
-
- /* load sha1_ce_state::finalize */
- ldr_l w4, sha1_ce_offsetof_finalize, x4
- ldr w4, [x0, x4]
-
- /* load input */
-0: ld1 {v8.4s-v11.4s}, [x1], #64
- sub w2, w2, #1
-
-CPU_LE( rev32 v8.16b, v8.16b )
-CPU_LE( rev32 v9.16b, v9.16b )
-CPU_LE( rev32 v10.16b, v10.16b )
-CPU_LE( rev32 v11.16b, v11.16b )
-
-1: add t0.4s, v8.4s, k0.4s
- mov dg0v.16b, dgav.16b
-
- add_update c, ev, k0, 8, 9, 10, 11, dgb
- add_update c, od, k0, 9, 10, 11, 8
- add_update c, ev, k0, 10, 11, 8, 9
- add_update c, od, k0, 11, 8, 9, 10
- add_update c, ev, k1, 8, 9, 10, 11
-
- add_update p, od, k1, 9, 10, 11, 8
- add_update p, ev, k1, 10, 11, 8, 9
- add_update p, od, k1, 11, 8, 9, 10
- add_update p, ev, k1, 8, 9, 10, 11
- add_update p, od, k2, 9, 10, 11, 8
-
- add_update m, ev, k2, 10, 11, 8, 9
- add_update m, od, k2, 11, 8, 9, 10
- add_update m, ev, k2, 8, 9, 10, 11
- add_update m, od, k2, 9, 10, 11, 8
- add_update m, ev, k3, 10, 11, 8, 9
-
- add_update p, od, k3, 11, 8, 9, 10
- add_only p, ev, k3, 9
- add_only p, od, k3, 10
- add_only p, ev, k3, 11
- add_only p, od
-
- /* update state */
- add dgbv.2s, dgbv.2s, dg1v.2s
- add dgav.4s, dgav.4s, dg0v.4s
-
- cbz w2, 2f
- cond_yield 3f, x5, x6
- b 0b
-
- /*
- * Final block: add padding and total bit count.
- * Skip if the input size was not a round multiple of the block size,
- * the padding is handled by the C code in that case.
- */
-2: cbz x4, 3f
- ldr_l w4, sha1_ce_offsetof_count, x4
- ldr x4, [x0, x4]
- movi v9.2d, #0
- mov x8, #0x80000000
- movi v10.2d, #0
- ror x7, x4, #29 // ror(lsl(x4, 3), 32)
- fmov d8, x8
- mov x4, #0
- mov v11.d[0], xzr
- mov v11.d[1], x7
- b 1b
-
- /* store new state */
-3: st1 {dgav.4s}, [x0]
- str dgb, [x0, #16]
- mov w0, w2
- ret
-SYM_FUNC_END(__sha1_ce_transform)
diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c
deleted file mode 100644
index 65b6980817e5..000000000000
--- a/arch/arm64/crypto/sha1-ce-glue.c
+++ /dev/null
@@ -1,118 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2014 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/neon.h>
-#include <asm/simd.h>
-#include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/cpufeature.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("sha1");
-
-struct sha1_ce_state {
- struct sha1_state sst;
- u32 finalize;
-};
-
-extern const u32 sha1_ce_offsetof_count;
-extern const u32 sha1_ce_offsetof_finalize;
-
-asmlinkage int __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
- int blocks);
-
-static void sha1_ce_transform(struct sha1_state *sst, u8 const *src,
- int blocks)
-{
- while (blocks) {
- int rem;
-
- kernel_neon_begin();
- rem = __sha1_ce_transform(container_of(sst,
- struct sha1_ce_state,
- sst), src, blocks);
- kernel_neon_end();
- src += (blocks - rem) * SHA1_BLOCK_SIZE;
- blocks = rem;
- }
-}
-
-const u32 sha1_ce_offsetof_count = offsetof(struct sha1_ce_state, sst.count);
-const u32 sha1_ce_offsetof_finalize = offsetof(struct sha1_ce_state, finalize);
-
-static int sha1_ce_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- struct sha1_ce_state *sctx = shash_desc_ctx(desc);
-
- sctx->finalize = 0;
- return sha1_base_do_update_blocks(desc, data, len, sha1_ce_transform);
-}
-
-static int sha1_ce_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- struct sha1_ce_state *sctx = shash_desc_ctx(desc);
- bool finalized = false;
-
- /*
- * Allow the asm code to perform the finalization if there is no
- * partial data and the input is a round multiple of the block size.
- */
- if (len >= SHA1_BLOCK_SIZE) {
- unsigned int remain = len - round_down(len, SHA1_BLOCK_SIZE);
-
- finalized = !remain;
- sctx->finalize = finalized;
- sha1_base_do_update_blocks(desc, data, len, sha1_ce_transform);
- data += len - remain;
- len = remain;
- }
- if (!finalized) {
- sctx->finalize = 0;
- sha1_base_do_finup(desc, data, len, sha1_ce_transform);
- }
- return sha1_base_finish(desc, out);
-}
-
-static struct shash_alg alg = {
- .init = sha1_base_init,
- .update = sha1_ce_update,
- .finup = sha1_ce_finup,
- .descsize = sizeof(struct sha1_ce_state),
- .statesize = SHA1_STATE_SIZE,
- .digestsize = SHA1_DIGEST_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name = "sha1-ce",
- .cra_priority = 200,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int __init sha1_ce_mod_init(void)
-{
- return crypto_register_shash(&alg);
-}
-
-static void __exit sha1_ce_mod_fini(void)
-{
- crypto_unregister_shash(&alg);
-}
-
-module_cpu_feature_match(SHA1, sha1_ce_mod_init);
-module_exit(sha1_ce_mod_fini);
diff --git a/arch/arm64/crypto/sha512-ce-core.S b/arch/arm64/crypto/sha512-ce-core.S
deleted file mode 100644
index 91ef68b15fcc..000000000000
--- a/arch/arm64/crypto/sha512-ce-core.S
+++ /dev/null
@@ -1,206 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * sha512-ce-core.S - core SHA-384/SHA-512 transform using v8 Crypto Extensions
- *
- * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
- .set .Lq\b, \b
- .set .Lv\b\().2d, \b
- .endr
-
- .macro sha512h, rd, rn, rm
- .inst 0xce608000 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
- .endm
-
- .macro sha512h2, rd, rn, rm
- .inst 0xce608400 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
- .endm
-
- .macro sha512su0, rd, rn
- .inst 0xcec08000 | .L\rd | (.L\rn << 5)
- .endm
-
- .macro sha512su1, rd, rn, rm
- .inst 0xce608800 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
- .endm
-
- /*
- * The SHA-512 round constants
- */
- .section ".rodata", "a"
- .align 4
-.Lsha512_rcon:
- .quad 0x428a2f98d728ae22, 0x7137449123ef65cd
- .quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
- .quad 0x3956c25bf348b538, 0x59f111f1b605d019
- .quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
- .quad 0xd807aa98a3030242, 0x12835b0145706fbe
- .quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
- .quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
- .quad 0x9bdc06a725c71235, 0xc19bf174cf692694
- .quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
- .quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
- .quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
- .quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
- .quad 0x983e5152ee66dfab, 0xa831c66d2db43210
- .quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4
- .quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725
- .quad 0x06ca6351e003826f, 0x142929670a0e6e70
- .quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926
- .quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
- .quad 0x650a73548baf63de, 0x766a0abb3c77b2a8
- .quad 0x81c2c92e47edaee6, 0x92722c851482353b
- .quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001
- .quad 0xc24b8b70d0f89791, 0xc76c51a30654be30
- .quad 0xd192e819d6ef5218, 0xd69906245565a910
- .quad 0xf40e35855771202a, 0x106aa07032bbd1b8
- .quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
- .quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
- .quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
- .quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
- .quad 0x748f82ee5defb2fc, 0x78a5636f43172f60
- .quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec
- .quad 0x90befffa23631e28, 0xa4506cebde82bde9
- .quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b
- .quad 0xca273eceea26619c, 0xd186b8c721c0c207
- .quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
- .quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6
- .quad 0x113f9804bef90dae, 0x1b710b35131c471b
- .quad 0x28db77f523047d84, 0x32caab7b40c72493
- .quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
- .quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
- .quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
-
- .macro dround, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4
- .ifnb \rc1
- ld1 {v\rc1\().2d}, [x4], #16
- .endif
- add v5.2d, v\rc0\().2d, v\in0\().2d
- ext v6.16b, v\i2\().16b, v\i3\().16b, #8
- ext v5.16b, v5.16b, v5.16b, #8
- ext v7.16b, v\i1\().16b, v\i2\().16b, #8
- add v\i3\().2d, v\i3\().2d, v5.2d
- .ifnb \in1
- ext v5.16b, v\in3\().16b, v\in4\().16b, #8
- sha512su0 v\in0\().2d, v\in1\().2d
- .endif
- sha512h q\i3, q6, v7.2d
- .ifnb \in1
- sha512su1 v\in0\().2d, v\in2\().2d, v5.2d
- .endif
- add v\i4\().2d, v\i1\().2d, v\i3\().2d
- sha512h2 q\i3, q\i1, v\i0\().2d
- .endm
-
- /*
- * int __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
- * int blocks)
- */
- .text
-SYM_FUNC_START(__sha512_ce_transform)
- /* load state */
- ld1 {v8.2d-v11.2d}, [x0]
-
- /* load first 4 round constants */
- adr_l x3, .Lsha512_rcon
- ld1 {v20.2d-v23.2d}, [x3], #64
-
- /* load input */
-0: ld1 {v12.2d-v15.2d}, [x1], #64
- ld1 {v16.2d-v19.2d}, [x1], #64
- sub w2, w2, #1
-
-CPU_LE( rev64 v12.16b, v12.16b )
-CPU_LE( rev64 v13.16b, v13.16b )
-CPU_LE( rev64 v14.16b, v14.16b )
-CPU_LE( rev64 v15.16b, v15.16b )
-CPU_LE( rev64 v16.16b, v16.16b )
-CPU_LE( rev64 v17.16b, v17.16b )
-CPU_LE( rev64 v18.16b, v18.16b )
-CPU_LE( rev64 v19.16b, v19.16b )
-
- mov x4, x3 // rc pointer
-
- mov v0.16b, v8.16b
- mov v1.16b, v9.16b
- mov v2.16b, v10.16b
- mov v3.16b, v11.16b
-
- // v0 ab cd -- ef gh ab
- // v1 cd -- ef gh ab cd
- // v2 ef gh ab cd -- ef
- // v3 gh ab cd -- ef gh
- // v4 -- ef gh ab cd --
-
- dround 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17
- dround 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18
- dround 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19
- dround 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12
- dround 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13
-
- dround 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14
- dround 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15
- dround 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16
- dround 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17
- dround 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18
-
- dround 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19
- dround 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12
- dround 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13
- dround 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14
- dround 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15
-
- dround 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16
- dround 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17
- dround 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18
- dround 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19
- dround 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12
-
- dround 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13
- dround 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14
- dround 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15
- dround 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16
- dround 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17
-
- dround 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18
- dround 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19
- dround 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12
- dround 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13
- dround 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14
-
- dround 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15
- dround 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16
- dround 2, 3, 1, 4, 0, 28, 24, 12
- dround 4, 2, 0, 1, 3, 29, 25, 13
- dround 1, 4, 3, 0, 2, 30, 26, 14
-
- dround 0, 1, 2, 3, 4, 31, 27, 15
- dround 3, 0, 4, 2, 1, 24, , 16
- dround 2, 3, 1, 4, 0, 25, , 17
- dround 4, 2, 0, 1, 3, 26, , 18
- dround 1, 4, 3, 0, 2, 27, , 19
-
- /* update state */
- add v8.2d, v8.2d, v0.2d
- add v9.2d, v9.2d, v1.2d
- add v10.2d, v10.2d, v2.2d
- add v11.2d, v11.2d, v3.2d
-
- cond_yield 3f, x4, x5
- /* handled all input blocks? */
- cbnz w2, 0b
-
- /* store new state */
-3: st1 {v8.2d-v11.2d}, [x0]
- mov w0, w2
- ret
-SYM_FUNC_END(__sha512_ce_transform)
diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c
deleted file mode 100644
index 6fb3001fa2c9..000000000000
--- a/arch/arm64/crypto/sha512-ce-glue.c
+++ /dev/null
@@ -1,96 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * sha512-ce-glue.c - SHA-384/SHA-512 using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha512_base.h>
-#include <linux/cpufeature.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash using ARMv8 Crypto Extensions");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("sha384");
-MODULE_ALIAS_CRYPTO("sha512");
-
-asmlinkage int __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
- int blocks);
-
-static void sha512_ce_transform(struct sha512_state *sst, u8 const *src,
- int blocks)
-{
- do {
- int rem;
-
- kernel_neon_begin();
- rem = __sha512_ce_transform(sst, src, blocks);
- kernel_neon_end();
- src += (blocks - rem) * SHA512_BLOCK_SIZE;
- blocks = rem;
- } while (blocks);
-}
-
-static int sha512_ce_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha512_base_do_update_blocks(desc, data, len,
- sha512_ce_transform);
-}
-
-static int sha512_ce_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- sha512_base_do_finup(desc, data, len, sha512_ce_transform);
- return sha512_base_finish(desc, out);
-}
-
-static struct shash_alg algs[] = { {
- .init = sha384_base_init,
- .update = sha512_ce_update,
- .finup = sha512_ce_finup,
- .descsize = SHA512_STATE_SIZE,
- .digestsize = SHA384_DIGEST_SIZE,
- .base.cra_name = "sha384",
- .base.cra_driver_name = "sha384-ce",
- .base.cra_priority = 200,
- .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .base.cra_blocksize = SHA512_BLOCK_SIZE,
- .base.cra_module = THIS_MODULE,
-}, {
- .init = sha512_base_init,
- .update = sha512_ce_update,
- .finup = sha512_ce_finup,
- .descsize = SHA512_STATE_SIZE,
- .digestsize = SHA512_DIGEST_SIZE,
- .base.cra_name = "sha512",
- .base.cra_driver_name = "sha512-ce",
- .base.cra_priority = 200,
- .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .base.cra_blocksize = SHA512_BLOCK_SIZE,
- .base.cra_module = THIS_MODULE,
-} };
-
-static int __init sha512_ce_mod_init(void)
-{
- return crypto_register_shashes(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit sha512_ce_mod_fini(void)
-{
- crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
-}
-
-module_cpu_feature_match(SHA512, sha512_ce_mod_init);
-module_exit(sha512_ce_mod_fini);
diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c
deleted file mode 100644
index 15aa9d8b7b2c..000000000000
--- a/arch/arm64/crypto/sha512-glue.c
+++ /dev/null
@@ -1,83 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Linux/arm64 port of the OpenSSL SHA512 implementation for AArch64
- *
- * Copyright (c) 2016 Linaro Ltd. <ard.biesheuvel@linaro.org>
- */
-
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha512_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash for arm64");
-MODULE_AUTHOR("Andy Polyakov <appro@openssl.org>");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("sha384");
-MODULE_ALIAS_CRYPTO("sha512");
-
-asmlinkage void sha512_blocks_arch(u64 *digest, const void *data,
- unsigned int num_blks);
-
-static void sha512_arm64_transform(struct sha512_state *sst, u8 const *src,
- int blocks)
-{
- sha512_blocks_arch(sst->state, src, blocks);
-}
-
-static int sha512_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha512_base_do_update_blocks(desc, data, len,
- sha512_arm64_transform);
-}
-
-static int sha512_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- sha512_base_do_finup(desc, data, len, sha512_arm64_transform);
- return sha512_base_finish(desc, out);
-}
-
-static struct shash_alg algs[] = { {
- .digestsize = SHA512_DIGEST_SIZE,
- .init = sha512_base_init,
- .update = sha512_update,
- .finup = sha512_finup,
- .descsize = SHA512_STATE_SIZE,
- .base.cra_name = "sha512",
- .base.cra_driver_name = "sha512-arm64",
- .base.cra_priority = 150,
- .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .base.cra_blocksize = SHA512_BLOCK_SIZE,
- .base.cra_module = THIS_MODULE,
-}, {
- .digestsize = SHA384_DIGEST_SIZE,
- .init = sha384_base_init,
- .update = sha512_update,
- .finup = sha512_finup,
- .descsize = SHA512_STATE_SIZE,
- .base.cra_name = "sha384",
- .base.cra_driver_name = "sha384-arm64",
- .base.cra_priority = 150,
- .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .base.cra_blocksize = SHA384_BLOCK_SIZE,
- .base.cra_module = THIS_MODULE,
-} };
-
-static int __init sha512_mod_init(void)
-{
- return crypto_register_shashes(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit sha512_mod_fini(void)
-{
- crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
-}
-
-module_init(sha512_mod_init);
-module_exit(sha512_mod_fini);
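
The glue just removed registered the sha384/sha512 shashes on top of sha512_blocks_arch(), the arch hook of the lib/crypto SHA-512 code (the crypto/Makefile hunk above likewise drops the perlasm rule that generated sha512-core.S from lib/crypto/sha2-armv8.pl). In-kernel users can reach the same arch-optimized code through the library interface instead; a sketch, assuming the sha512() one-shot helper from the lib/crypto conversion series:

    #include <crypto/sha2.h>

    static void digest_example(const u8 *data, size_t len)
    {
            u8 out[SHA512_DIGEST_SIZE];

            /* One-shot library call; dispatches to the arch SHA-512 blocks. */
            sha512(data, len, out);
    }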
diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h
index a407f9cd549e..c07a58b96329 100644
--- a/arch/arm64/include/asm/acpi.h
+++ b/arch/arm64/include/asm/acpi.h
@@ -150,7 +150,7 @@ acpi_set_mailbox_entry(int cpu, struct acpi_madt_generic_interrupt *processor)
{}
#endif
-static inline const char *acpi_get_enable_method(int cpu)
+static __always_inline const char *acpi_get_enable_method(int cpu)
{
if (acpi_psci_present())
return "psci";
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index ad63457a05c5..c56c21bb1eec 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -41,6 +41,11 @@
/*
* Save/restore interrupts.
*/
+ .macro save_and_disable_daif, flags
+ mrs \flags, daif
+ msr daifset, #0xf
+ .endm
+
.macro save_and_disable_irq, flags
mrs \flags, daif
msr daifset, #3
diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
index ba5df0df02a4..9f38340d24c2 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -287,17 +287,6 @@
.Lskip_fgt2_\@:
.endm
-.macro __init_el2_gcs
- mrs_s x1, SYS_ID_AA64PFR1_EL1
- ubfx x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4
- cbz x1, .Lskip_gcs_\@
-
- /* Ensure GCS is not enabled when we start trying to do BLs */
- msr_s SYS_GCSCR_EL1, xzr
- msr_s SYS_GCSCRE0_EL1, xzr
-.Lskip_gcs_\@:
-.endm
-
/**
* Initialize EL2 registers to sane values. This should be called early on all
* cores that were booted in EL2. Note that everything gets initialised as
@@ -319,7 +308,6 @@
__init_el2_cptr
__init_el2_fgt
__init_el2_fgt2
- __init_el2_gcs
.endm
#ifndef __KVM_NVHE_HYPERVISOR__
@@ -371,6 +359,13 @@
msr_s SYS_MPAMHCR_EL2, xzr // clear TRAP_MPAMIDR_EL1 -> EL2
.Lskip_mpam_\@:
+ check_override id_aa64pfr1, ID_AA64PFR1_EL1_GCS_SHIFT, .Linit_gcs_\@, .Lskip_gcs_\@, x1, x2
+
+.Linit_gcs_\@:
+ msr_s SYS_GCSCR_EL1, xzr
+ msr_s SYS_GCSCRE0_EL1, xzr
+
+.Lskip_gcs_\@:
check_override id_aa64pfr0, ID_AA64PFR0_EL1_SVE_SHIFT, .Linit_sve_\@, .Lskip_sve_\@, x1, x2
.Linit_sve_\@: /* SVE register access */
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index bd020fc28aa9..0720898f563e 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -561,68 +561,6 @@ static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu)
vcpu_set_flag((v), e); \
} while (0)
-#define __build_check_all_or_none(r, bits) \
- BUILD_BUG_ON(((r) & (bits)) && ((r) & (bits)) != (bits))
-
-#define __cpacr_to_cptr_clr(clr, set) \
- ({ \
- u64 cptr = 0; \
- \
- if ((set) & CPACR_EL1_FPEN) \
- cptr |= CPTR_EL2_TFP; \
- if ((set) & CPACR_EL1_ZEN) \
- cptr |= CPTR_EL2_TZ; \
- if ((set) & CPACR_EL1_SMEN) \
- cptr |= CPTR_EL2_TSM; \
- if ((clr) & CPACR_EL1_TTA) \
- cptr |= CPTR_EL2_TTA; \
- if ((clr) & CPTR_EL2_TAM) \
- cptr |= CPTR_EL2_TAM; \
- if ((clr) & CPTR_EL2_TCPAC) \
- cptr |= CPTR_EL2_TCPAC; \
- \
- cptr; \
- })
-
-#define __cpacr_to_cptr_set(clr, set) \
- ({ \
- u64 cptr = 0; \
- \
- if ((clr) & CPACR_EL1_FPEN) \
- cptr |= CPTR_EL2_TFP; \
- if ((clr) & CPACR_EL1_ZEN) \
- cptr |= CPTR_EL2_TZ; \
- if ((clr) & CPACR_EL1_SMEN) \
- cptr |= CPTR_EL2_TSM; \
- if ((set) & CPACR_EL1_TTA) \
- cptr |= CPTR_EL2_TTA; \
- if ((set) & CPTR_EL2_TAM) \
- cptr |= CPTR_EL2_TAM; \
- if ((set) & CPTR_EL2_TCPAC) \
- cptr |= CPTR_EL2_TCPAC; \
- \
- cptr; \
- })
-
-#define cpacr_clear_set(clr, set) \
- do { \
- BUILD_BUG_ON((set) & CPTR_VHE_EL2_RES0); \
- BUILD_BUG_ON((clr) & CPACR_EL1_E0POE); \
- __build_check_all_or_none((clr), CPACR_EL1_FPEN); \
- __build_check_all_or_none((set), CPACR_EL1_FPEN); \
- __build_check_all_or_none((clr), CPACR_EL1_ZEN); \
- __build_check_all_or_none((set), CPACR_EL1_ZEN); \
- __build_check_all_or_none((clr), CPACR_EL1_SMEN); \
- __build_check_all_or_none((set), CPACR_EL1_SMEN); \
- \
- if (has_vhe() || has_hvhe()) \
- sysreg_clear_set(cpacr_el1, clr, set); \
- else \
- sysreg_clear_set(cptr_el2, \
- __cpacr_to_cptr_clr(clr, set), \
- __cpacr_to_cptr_set(clr, set));\
- } while (0)
-
/*
* Returns a 'sanitised' view of CPTR_EL2, translating from nVHE to the VHE
* format if E2H isn't set.
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 5ccca509dff1..3e41a880b062 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -1289,9 +1289,8 @@ void kvm_arm_resume_guest(struct kvm *kvm);
})
/*
- * The couple of isb() below are there to guarantee the same behaviour
- * on VHE as on !VHE, where the eret to EL1 acts as a context
- * synchronization event.
+ * The isb() below is there to guarantee the same behaviour on VHE as on !VHE,
+ * where the eret to EL1 acts as a context synchronization event.
*/
#define kvm_call_hyp(f, ...) \
do { \
@@ -1309,7 +1308,6 @@ void kvm_arm_resume_guest(struct kvm *kvm);
\
if (has_vhe()) { \
ret = f(__VA_ARGS__); \
- isb(); \
} else { \
ret = kvm_call_hyp_nvhe(f, ##__VA_ARGS__); \
} \
@@ -1482,7 +1480,6 @@ int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm,
struct reg_mask_range *range);
/* Guest/host FPSIMD coordination helpers */
-int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 2920b0a51403..a2faf0049dab 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -34,7 +34,7 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \
cpufeature.o alternative.o cacheinfo.o \
smp.o smp_spin_table.o topology.o smccc-call.o \
syscall.o proton-pack.o idle.o patching.o pi/ \
- rsi.o
+ rsi.o jump_label.o
obj-$(CONFIG_COMPAT) += sys32.o signal32.o \
sys_compat.o
@@ -47,7 +47,6 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o
obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_CPU_PM) += sleep.o suspend.o
-obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_EFI) += efi.o efi-rt-wrapper.o
obj-$(CONFIG_PCI) += pci.o
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index b34044e20128..e151585c6cca 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -3135,6 +3135,13 @@ static bool has_sve_feature(const struct arm64_cpu_capabilities *cap, int scope)
}
#endif
+#ifdef CONFIG_ARM64_SME
+static bool has_sme_feature(const struct arm64_cpu_capabilities *cap, int scope)
+{
+ return system_supports_sme() && has_user_cpuid_feature(cap, scope);
+}
+#endif
+
static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
HWCAP_CAP(ID_AA64ISAR0_EL1, AES, PMULL, CAP_HWCAP, KERNEL_HWCAP_PMULL),
HWCAP_CAP(ID_AA64ISAR0_EL1, AES, AES, CAP_HWCAP, KERNEL_HWCAP_AES),
@@ -3223,31 +3230,31 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
HWCAP_CAP(ID_AA64ISAR2_EL1, BC, IMP, CAP_HWCAP, KERNEL_HWCAP_HBC),
#ifdef CONFIG_ARM64_SME
HWCAP_CAP(ID_AA64PFR1_EL1, SME, IMP, CAP_HWCAP, KERNEL_HWCAP_SME),
- HWCAP_CAP(ID_AA64SMFR0_EL1, FA64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_FA64),
- HWCAP_CAP(ID_AA64SMFR0_EL1, LUTv2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_LUTV2),
- HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2p2, CAP_HWCAP, KERNEL_HWCAP_SME2P2),
- HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2p1, CAP_HWCAP, KERNEL_HWCAP_SME2P1),
- HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2, CAP_HWCAP, KERNEL_HWCAP_SME2),
- HWCAP_CAP(ID_AA64SMFR0_EL1, I16I64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64),
- HWCAP_CAP(ID_AA64SMFR0_EL1, F64F64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F64F64),
- HWCAP_CAP(ID_AA64SMFR0_EL1, I16I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I32),
- HWCAP_CAP(ID_AA64SMFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16B16),
- HWCAP_CAP(ID_AA64SMFR0_EL1, F16F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F16),
- HWCAP_CAP(ID_AA64SMFR0_EL1, F8F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F16),
- HWCAP_CAP(ID_AA64SMFR0_EL1, F8F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F32),
- HWCAP_CAP(ID_AA64SMFR0_EL1, I8I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32),
- HWCAP_CAP(ID_AA64SMFR0_EL1, F16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32),
- HWCAP_CAP(ID_AA64SMFR0_EL1, B16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32),
- HWCAP_CAP(ID_AA64SMFR0_EL1, BI32I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_BI32I32),
- HWCAP_CAP(ID_AA64SMFR0_EL1, F32F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32),
- HWCAP_CAP(ID_AA64SMFR0_EL1, SF8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8FMA),
- HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP4),
- HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP2),
- HWCAP_CAP(ID_AA64SMFR0_EL1, SBitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SBITPERM),
- HWCAP_CAP(ID_AA64SMFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_AES),
- HWCAP_CAP(ID_AA64SMFR0_EL1, SFEXPA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SFEXPA),
- HWCAP_CAP(ID_AA64SMFR0_EL1, STMOP, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_STMOP),
- HWCAP_CAP(ID_AA64SMFR0_EL1, SMOP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SMOP4),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, FA64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_FA64),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, LUTv2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_LUTV2),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMEver, SME2p2, CAP_HWCAP, KERNEL_HWCAP_SME2P2),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMEver, SME2p1, CAP_HWCAP, KERNEL_HWCAP_SME2P1),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMEver, SME2, CAP_HWCAP, KERNEL_HWCAP_SME2),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, I16I64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F64F64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F64F64),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, I16I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I32),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16B16),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F16F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F16),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F8F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F16),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F8F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F32),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, I8I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, B16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, BI32I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_BI32I32),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F32F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SF8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8FMA),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SF8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP4),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SF8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP2),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SBitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SBITPERM),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_AES),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SFEXPA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SFEXPA),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, STMOP, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_STMOP),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMOP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SMOP4),
#endif /* CONFIG_ARM64_SME */
HWCAP_CAP(ID_AA64FPFR0_EL1, F8CVT, IMP, CAP_HWCAP, KERNEL_HWCAP_F8CVT),
HWCAP_CAP(ID_AA64FPFR0_EL1, F8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_F8FMA),
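The hunk above switches each ID_AA64SMFR0_EL1-derived hwcap from a plain field match to HWCAP_CAP_MATCH_ID with has_sme_feature(), so a sub-feature is only reported when SME itself is usable. A minimal standalone model of such a combined matcher, with all names illustrative rather than the kernel's:

/*
 * Minimal model of gating a field-derived capability on a parent
 * feature; all names here are illustrative, not the kernel's.
 */
#include <stdbool.h>

static bool parent_present;	/* stands in for system_supports_sme() */

static bool field_at_least(unsigned long reg, int shift, unsigned min)
{
	return ((reg >> shift) & 0xf) >= min;	/* unsigned 4-bit ID field */
}

/* Combined matcher: the sub-feature counts only if the parent does. */
static bool has_sub_feature(unsigned long reg, int shift, unsigned min)
{
	return parent_present && field_at_least(reg, shift, min);
}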
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index 3857fd7ee8d4..62230d6dd919 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -15,6 +15,7 @@
#include <asm/efi.h>
#include <asm/stacktrace.h>
+#include <asm/vmap_stack.h>
static bool region_is_misaligned(const efi_memory_desc_t *md)
{
@@ -214,9 +215,13 @@ static int __init arm64_efi_rt_init(void)
if (!efi_enabled(EFI_RUNTIME_SERVICES))
return 0;
- p = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, GFP_KERNEL,
- NUMA_NO_NODE, &&l);
-l: if (!p) {
+ if (!IS_ENABLED(CONFIG_VMAP_STACK)) {
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
+ return -ENOMEM;
+ }
+
+ p = arch_alloc_vmap_stack(THREAD_SIZE, NUMA_NO_NODE);
+ if (!p) {
pr_warn("Failed to allocate EFI runtime stack\n");
clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
return -ENOMEM;
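The replacement allocator only makes sense when the kernel uses vmapped stacks, hence the new IS_ENABLED(CONFIG_VMAP_STACK) guard that disables EFI runtime services outright rather than fall back to an unsuitable allocation. A sketch of that guard-then-allocate shape, with stand-in names rather than the kernel API:

#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>

static bool vmap_stacks;			/* ~IS_ENABLED(CONFIG_VMAP_STACK) */

static void *alloc_vmap_stack(size_t size)	/* ~arch_alloc_vmap_stack() */
{
	return malloc(size);			/* placeholder allocation */
}

static int init_rt_stack(void **stack, size_t size)
{
	if (!vmap_stacks)
		return -1;	/* refuse the feature rather than mis-allocate */
	*stack = alloc_vmap_stack(size);
	return *stack ? 0 : -1;
}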
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 5ae2a34b50bd..fe62218b8e0e 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -614,7 +614,7 @@ SYM_CODE_END(ret_to_kernel)
SYM_CODE_START_LOCAL(ret_to_user)
ldr x19, [tsk, #TSK_TI_FLAGS] // re-check for single-step
enable_step_tsk x19, x2
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
bl stackleak_erase_on_task_stack
#endif
kernel_exit 0
@@ -825,6 +825,7 @@ SYM_CODE_END(__bp_harden_el1_vectors)
*
*/
SYM_FUNC_START(cpu_switch_to)
+ save_and_disable_daif x11
mov x10, #THREAD_CPU_CONTEXT
add x8, x0, x10
mov x9, sp
@@ -848,6 +849,7 @@ SYM_FUNC_START(cpu_switch_to)
ptrauth_keys_install_kernel x1, x8, x9, x10
scs_save x0
scs_load_current
+ restore_irq x11
ret
SYM_FUNC_END(cpu_switch_to)
NOKPROBE(cpu_switch_to)
@@ -874,6 +876,7 @@ NOKPROBE(ret_from_fork)
* Calls func(regs) using this CPU's irq stack and shadow irq stack.
*/
SYM_FUNC_START(call_on_irq_stack)
+ save_and_disable_daif x9
#ifdef CONFIG_SHADOW_CALL_STACK
get_current_task x16
scs_save x16
@@ -888,8 +891,10 @@ SYM_FUNC_START(call_on_irq_stack)
/* Move to the new stack and call the function there */
add sp, x16, #IRQ_STACK_SIZE
+ restore_irq x9
blr x1
+ save_and_disable_daif x9
/*
* Restore the SP from the FP, and restore the FP and LR from the frame
* record.
@@ -897,6 +902,7 @@ SYM_FUNC_START(call_on_irq_stack)
mov sp, x29
ldp x29, x30, [sp], #16
scs_load_current
+ restore_irq x9
ret
SYM_FUNC_END(call_on_irq_stack)
NOKPROBE(call_on_irq_stack)
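Both cpu_switch_to() and call_on_irq_stack() now bracket the window where SP and the shadow call stack are inconsistent with save_and_disable_daif/restore_irq, and call_on_irq_stack re-enables exceptions around the callee itself. A userspace analogue of that save-mask/restore bracket, using signal masking in place of DAIF:

#include <signal.h>

static void critical_stack_switch(void)
{
	sigset_t all, saved;

	sigfillset(&all);
	sigprocmask(SIG_BLOCK, &all, &saved);	/* ~save_and_disable_daif */

	/* ...swap stack pointers while nothing asynchronous can run... */

	sigprocmask(SIG_SETMASK, &saved, NULL);	/* ~restore_irq */
}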
diff --git a/arch/arm64/kernel/pi/Makefile b/arch/arm64/kernel/pi/Makefile
index 4d11a8c29181..f440bf57b1a5 100644
--- a/arch/arm64/kernel/pi/Makefile
+++ b/arch/arm64/kernel/pi/Makefile
@@ -2,7 +2,7 @@
# Copyright 2022 Google LLC
KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \
- -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_STACKLEAK_PLUGIN) \
+ -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_KSTACK_ERASE) \
$(DISABLE_LATENT_ENTROPY_PLUGIN) \
$(call cc-option,-mbranch-protection=none) \
-I$(srctree)/scripts/dtc/libfdt -fno-stack-protector \
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index a5ca15daeb8a..08b7042a2e2d 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -288,7 +288,9 @@ static void flush_gcs(void)
if (!system_supports_gcs())
return;
- gcs_free(current);
+ current->thread.gcspr_el0 = 0;
+ current->thread.gcs_base = 0;
+ current->thread.gcs_size = 0;
current->thread.gcs_el0_mode = 0;
write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1);
write_sysreg_s(0, SYS_GCSPR_EL0);
@@ -671,6 +673,11 @@ static void permission_overlay_switch(struct task_struct *next)
current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
if (current->thread.por_el0 != next->thread.por_el0) {
write_sysreg_s(next->thread.por_el0, SYS_POR_EL0);
+ /*
+ * No ISB required as we can tolerate spurious Overlay faults -
+ * the fault handler will check again based on the new value
+ * of POR_EL0.
+ */
}
}
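The new comment documents a deliberate trade: skipping the ISB after the POR_EL0 write is safe only because a stale value can at worst produce a spurious overlay fault, and the fault handler re-evaluates permissions against the freshly written register. A toy model of that tolerate-and-recheck pattern, purely illustrative:

#include <stdbool.h>

static unsigned long live_perms;	/* hardware may briefly lag this value */

static void set_perms(unsigned long v)
{
	live_perms = v;	/* no barrier: a racing access may fault spuriously */
}

/* Fault handler: re-check against the latest value; retry if now allowed. */
static bool spurious_fault(unsigned long need)
{
	return (live_perms & need) == need;
}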
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index a360e52db02f..4b001121c72d 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -141,7 +141,7 @@ unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n)
addr += n;
if (regs_within_kernel_stack(regs, (unsigned long)addr))
- return *addr;
+ return READ_ONCE_NOCHECK(*addr);
else
return 0;
}
@@ -1586,7 +1586,7 @@ enum aarch64_regset {
static const struct user_regset aarch64_regsets[] = {
[REGSET_GPR] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = sizeof(struct user_pt_regs) / sizeof(u64),
.size = sizeof(u64),
.align = sizeof(u64),
@@ -1594,7 +1594,7 @@ static const struct user_regset aarch64_regsets[] = {
.set = gpr_set
},
[REGSET_FPR] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(struct user_fpsimd_state) / sizeof(u32),
/*
* We pretend we have 32-bit registers because the fpsr and
@@ -1607,7 +1607,7 @@ static const struct user_regset aarch64_regsets[] = {
.set = fpr_set
},
[REGSET_TLS] = {
- .core_note_type = NT_ARM_TLS,
+ USER_REGSET_NOTE_TYPE(ARM_TLS),
.n = 2,
.size = sizeof(void *),
.align = sizeof(void *),
@@ -1616,7 +1616,7 @@ static const struct user_regset aarch64_regsets[] = {
},
#ifdef CONFIG_HAVE_HW_BREAKPOINT
[REGSET_HW_BREAK] = {
- .core_note_type = NT_ARM_HW_BREAK,
+ USER_REGSET_NOTE_TYPE(ARM_HW_BREAK),
.n = sizeof(struct user_hwdebug_state) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -1624,7 +1624,7 @@ static const struct user_regset aarch64_regsets[] = {
.set = hw_break_set,
},
[REGSET_HW_WATCH] = {
- .core_note_type = NT_ARM_HW_WATCH,
+ USER_REGSET_NOTE_TYPE(ARM_HW_WATCH),
.n = sizeof(struct user_hwdebug_state) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -1633,7 +1633,7 @@ static const struct user_regset aarch64_regsets[] = {
},
#endif
[REGSET_SYSTEM_CALL] = {
- .core_note_type = NT_ARM_SYSTEM_CALL,
+ USER_REGSET_NOTE_TYPE(ARM_SYSTEM_CALL),
.n = 1,
.size = sizeof(int),
.align = sizeof(int),
@@ -1641,7 +1641,7 @@ static const struct user_regset aarch64_regsets[] = {
.set = system_call_set,
},
[REGSET_FPMR] = {
- .core_note_type = NT_ARM_FPMR,
+ USER_REGSET_NOTE_TYPE(ARM_FPMR),
.n = 1,
.size = sizeof(u64),
.align = sizeof(u64),
@@ -1650,7 +1650,7 @@ static const struct user_regset aarch64_regsets[] = {
},
#ifdef CONFIG_ARM64_SVE
[REGSET_SVE] = { /* Scalable Vector Extension */
- .core_note_type = NT_ARM_SVE,
+ USER_REGSET_NOTE_TYPE(ARM_SVE),
.n = DIV_ROUND_UP(SVE_PT_SIZE(ARCH_SVE_VQ_MAX,
SVE_PT_REGS_SVE),
SVE_VQ_BYTES),
@@ -1662,7 +1662,7 @@ static const struct user_regset aarch64_regsets[] = {
#endif
#ifdef CONFIG_ARM64_SME
[REGSET_SSVE] = { /* Streaming mode SVE */
- .core_note_type = NT_ARM_SSVE,
+ USER_REGSET_NOTE_TYPE(ARM_SSVE),
.n = DIV_ROUND_UP(SVE_PT_SIZE(SME_VQ_MAX, SVE_PT_REGS_SVE),
SVE_VQ_BYTES),
.size = SVE_VQ_BYTES,
@@ -1671,7 +1671,7 @@ static const struct user_regset aarch64_regsets[] = {
.set = ssve_set,
},
[REGSET_ZA] = { /* SME ZA */
- .core_note_type = NT_ARM_ZA,
+ USER_REGSET_NOTE_TYPE(ARM_ZA),
/*
* ZA is a single register but it's variably sized and
* the ptrace core requires that the size of any data
@@ -1687,7 +1687,7 @@ static const struct user_regset aarch64_regsets[] = {
.set = za_set,
},
[REGSET_ZT] = { /* SME ZT */
- .core_note_type = NT_ARM_ZT,
+ USER_REGSET_NOTE_TYPE(ARM_ZT),
.n = 1,
.size = ZT_SIG_REG_BYTES,
.align = sizeof(u64),
@@ -1697,7 +1697,7 @@ static const struct user_regset aarch64_regsets[] = {
#endif
#ifdef CONFIG_ARM64_PTR_AUTH
[REGSET_PAC_MASK] = {
- .core_note_type = NT_ARM_PAC_MASK,
+ USER_REGSET_NOTE_TYPE(ARM_PAC_MASK),
.n = sizeof(struct user_pac_mask) / sizeof(u64),
.size = sizeof(u64),
.align = sizeof(u64),
@@ -1705,7 +1705,7 @@ static const struct user_regset aarch64_regsets[] = {
/* this cannot be set dynamically */
},
[REGSET_PAC_ENABLED_KEYS] = {
- .core_note_type = NT_ARM_PAC_ENABLED_KEYS,
+ USER_REGSET_NOTE_TYPE(ARM_PAC_ENABLED_KEYS),
.n = 1,
.size = sizeof(long),
.align = sizeof(long),
@@ -1714,7 +1714,7 @@ static const struct user_regset aarch64_regsets[] = {
},
#ifdef CONFIG_CHECKPOINT_RESTORE
[REGSET_PACA_KEYS] = {
- .core_note_type = NT_ARM_PACA_KEYS,
+ USER_REGSET_NOTE_TYPE(ARM_PACA_KEYS),
.n = sizeof(struct user_pac_address_keys) / sizeof(__uint128_t),
.size = sizeof(__uint128_t),
.align = sizeof(__uint128_t),
@@ -1722,7 +1722,7 @@ static const struct user_regset aarch64_regsets[] = {
.set = pac_address_keys_set,
},
[REGSET_PACG_KEYS] = {
- .core_note_type = NT_ARM_PACG_KEYS,
+ USER_REGSET_NOTE_TYPE(ARM_PACG_KEYS),
.n = sizeof(struct user_pac_generic_keys) / sizeof(__uint128_t),
.size = sizeof(__uint128_t),
.align = sizeof(__uint128_t),
@@ -1733,7 +1733,7 @@ static const struct user_regset aarch64_regsets[] = {
#endif
#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI
[REGSET_TAGGED_ADDR_CTRL] = {
- .core_note_type = NT_ARM_TAGGED_ADDR_CTRL,
+ USER_REGSET_NOTE_TYPE(ARM_TAGGED_ADDR_CTRL),
.n = 1,
.size = sizeof(long),
.align = sizeof(long),
@@ -1743,7 +1743,7 @@ static const struct user_regset aarch64_regsets[] = {
#endif
#ifdef CONFIG_ARM64_POE
[REGSET_POE] = {
- .core_note_type = NT_ARM_POE,
+ USER_REGSET_NOTE_TYPE(ARM_POE),
.n = 1,
.size = sizeof(long),
.align = sizeof(long),
@@ -1753,7 +1753,7 @@ static const struct user_regset aarch64_regsets[] = {
#endif
#ifdef CONFIG_ARM64_GCS
[REGSET_GCS] = {
- .core_note_type = NT_ARM_GCS,
+ USER_REGSET_NOTE_TYPE(ARM_GCS),
.n = sizeof(struct user_gcs) / sizeof(u64),
.size = sizeof(u64),
.align = sizeof(u64),
@@ -1943,7 +1943,7 @@ static int compat_tls_set(struct task_struct *target,
static const struct user_regset aarch32_regsets[] = {
[REGSET_COMPAT_GPR] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = COMPAT_ELF_NGREG,
.size = sizeof(compat_elf_greg_t),
.align = sizeof(compat_elf_greg_t),
@@ -1951,7 +1951,7 @@ static const struct user_regset aarch32_regsets[] = {
.set = compat_gpr_set
},
[REGSET_COMPAT_VFP] = {
- .core_note_type = NT_ARM_VFP,
+ USER_REGSET_NOTE_TYPE(ARM_VFP),
.n = VFP_STATE_SIZE / sizeof(compat_ulong_t),
.size = sizeof(compat_ulong_t),
.align = sizeof(compat_ulong_t),
@@ -1968,7 +1968,7 @@ static const struct user_regset_view user_aarch32_view = {
static const struct user_regset aarch32_ptrace_regsets[] = {
[REGSET_GPR] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = COMPAT_ELF_NGREG,
.size = sizeof(compat_elf_greg_t),
.align = sizeof(compat_elf_greg_t),
@@ -1976,7 +1976,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = {
.set = compat_gpr_set
},
[REGSET_FPR] = {
- .core_note_type = NT_ARM_VFP,
+ USER_REGSET_NOTE_TYPE(ARM_VFP),
.n = VFP_STATE_SIZE / sizeof(compat_ulong_t),
.size = sizeof(compat_ulong_t),
.align = sizeof(compat_ulong_t),
@@ -1984,7 +1984,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = {
.set = compat_vfp_set
},
[REGSET_TLS] = {
- .core_note_type = NT_ARM_TLS,
+ USER_REGSET_NOTE_TYPE(ARM_TLS),
.n = 1,
.size = sizeof(compat_ulong_t),
.align = sizeof(compat_ulong_t),
@@ -1993,7 +1993,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = {
},
#ifdef CONFIG_HAVE_HW_BREAKPOINT
[REGSET_HW_BREAK] = {
- .core_note_type = NT_ARM_HW_BREAK,
+ USER_REGSET_NOTE_TYPE(ARM_HW_BREAK),
.n = sizeof(struct user_hwdebug_state) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -2001,7 +2001,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = {
.set = hw_break_set,
},
[REGSET_HW_WATCH] = {
- .core_note_type = NT_ARM_HW_WATCH,
+ USER_REGSET_NOTE_TYPE(ARM_HW_WATCH),
.n = sizeof(struct user_hwdebug_state) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -2010,7 +2010,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = {
},
#endif
[REGSET_SYSTEM_CALL] = {
- .core_note_type = NT_ARM_SYSTEM_CALL,
+ USER_REGSET_NOTE_TYPE(ARM_SYSTEM_CALL),
.n = 1,
.size = sizeof(int),
.align = sizeof(int),
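Every .core_note_type initializer above becomes USER_REGSET_NOTE_TYPE(); on the assumption that the macro fills in both the NT_* type and its matching core-dump note name, the expansion would look roughly like:

/* Plausible expansion, shown for illustration only: */
#define USER_REGSET_NOTE_TYPE(type)		\
	.core_note_type = NT_ ## type,		\
	.core_note_name = NN_ ## type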
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 3b3f6b56e733..21a795303568 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -1143,7 +1143,7 @@ static inline unsigned int num_other_online_cpus(void)
void smp_send_stop(void)
{
static unsigned long stop_in_progress;
- cpumask_t mask;
+ static cpumask_t mask;
unsigned long timeout;
/*
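Making the mask static avoids a large on-stack object on a panic path: with NR_CPUS = 4096, sizeof(cpumask_t) is 4096 / 8 = 512 bytes. A single static copy is safe here because, as the stop_in_progress bit in the context above suggests, smp_send_stop() is already serialized against concurrent callers.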
diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
index 5e27e46aa496..7dec05dd33b7 100644
--- a/arch/arm64/kernel/vdso/Makefile
+++ b/arch/arm64/kernel/vdso/Makefile
@@ -36,7 +36,8 @@ ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
# -Wmissing-prototypes and -Wmissing-declarations are removed from
# the CFLAGS to make possible to build the kernel with CONFIG_WERROR enabled.
CC_FLAGS_REMOVE_VDSO := $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) \
- $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) \
+ $(RANDSTRUCT_CFLAGS) $(KSTACK_ERASE_CFLAGS) \
+ $(GCC_PLUGINS_CFLAGS) \
$(CC_FLAGS_LTO) $(CC_FLAGS_CFI) \
-Wmissing-prototypes -Wmissing-declarations
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index de2b4e9c9f9f..23dd3f3fc3eb 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -825,10 +825,6 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
if (!kvm_arm_vcpu_is_finalized(vcpu))
return -EPERM;
- ret = kvm_arch_vcpu_run_map_fp(vcpu);
- if (ret)
- return ret;
-
if (likely(vcpu_has_run_once(vcpu)))
return 0;
@@ -2129,7 +2125,7 @@ static void cpu_hyp_init(void *discard)
static void cpu_hyp_uninit(void *discard)
{
- if (__this_cpu_read(kvm_hyp_initialized)) {
+ if (!is_protected_kvm_enabled() && __this_cpu_read(kvm_hyp_initialized)) {
cpu_hyp_reset();
__this_cpu_write(kvm_hyp_initialized, 0);
}
@@ -2345,8 +2341,13 @@ static void __init teardown_hyp_mode(void)
free_hyp_pgds();
for_each_possible_cpu(cpu) {
+ if (per_cpu(kvm_hyp_initialized, cpu))
+ continue;
+
free_pages(per_cpu(kvm_arm_hyp_stack_base, cpu), NVHE_STACK_SHIFT - PAGE_SHIFT);
- free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order());
+
+ if (!kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu])
+ continue;
if (free_sve) {
struct cpu_sve_state *sve_state;
@@ -2354,6 +2355,9 @@ static void __init teardown_hyp_mode(void)
sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state;
free_pages((unsigned long) sve_state, pkvm_host_sve_state_order());
}
+
+ free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order());
+
}
}
@@ -2764,7 +2768,8 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
struct kvm_kernel_irq_routing_entry *new)
{
- if (new->type != KVM_IRQ_ROUTING_MSI)
+ if (old->type != KVM_IRQ_ROUTING_MSI ||
+ new->type != KVM_IRQ_ROUTING_MSI)
return true;
return memcmp(&old->msi, &new->msi, sizeof(new->msi));
diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c
index 8f6c8f57c6b9..15e17aca1dec 100644
--- a/arch/arm64/kvm/fpsimd.c
+++ b/arch/arm64/kvm/fpsimd.c
@@ -15,32 +15,6 @@
#include <asm/sysreg.h>
/*
- * Called on entry to KVM_RUN unless this vcpu previously ran at least
- * once and the most recent prior KVM_RUN for this vcpu was called from
- * the same task as current (highly likely).
- *
- * This is guaranteed to execute before kvm_arch_vcpu_load_fp(vcpu),
- * such that on entering hyp the relevant parts of current are already
- * mapped.
- */
-int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu)
-{
- struct user_fpsimd_state *fpsimd = &current->thread.uw.fpsimd_state;
- int ret;
-
- /* pKVM has its own tracking of the host fpsimd state. */
- if (is_protected_kvm_enabled())
- return 0;
-
- /* Make sure the host task fpsimd state is visible to hyp: */
- ret = kvm_share_hyp(fpsimd, fpsimd + 1);
- if (ret)
- return ret;
-
- return 0;
-}
-
-/*
* Prepare vcpu for saving the host's FPSIMD state and loading the guest's.
* The actual loading is done by the FPSIMD access trap taken to hyp.
*
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 76dfda116e56..2ad57b117385 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -65,6 +65,136 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu)
}
}
+static inline void __activate_cptr_traps_nvhe(struct kvm_vcpu *vcpu)
+{
+ u64 val = CPTR_NVHE_EL2_RES1 | CPTR_EL2_TAM | CPTR_EL2_TTA;
+
+ /*
+ * Always trap SME since it's not supported in KVM.
+ * TSM is RES1 if SME isn't implemented.
+ */
+ val |= CPTR_EL2_TSM;
+
+ if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs())
+ val |= CPTR_EL2_TZ;
+
+ if (!guest_owns_fp_regs())
+ val |= CPTR_EL2_TFP;
+
+ write_sysreg(val, cptr_el2);
+}
+
+static inline void __activate_cptr_traps_vhe(struct kvm_vcpu *vcpu)
+{
+ /*
+ * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to
+ * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2,
+ * except for some missing controls, such as TAM.
+ * In this case, CPTR_EL2.TAM has the same position with or without
+ * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM
+ * shift value for trapping the AMU accesses.
+ */
+ u64 val = CPTR_EL2_TAM | CPACR_EL1_TTA;
+ u64 cptr;
+
+ if (guest_owns_fp_regs()) {
+ val |= CPACR_EL1_FPEN;
+ if (vcpu_has_sve(vcpu))
+ val |= CPACR_EL1_ZEN;
+ }
+
+ if (!vcpu_has_nv(vcpu))
+ goto write;
+
+ /*
+ * The architecture is a bit crap (what a surprise): an EL2 guest
+ * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA,
+ * as they are RES0 in the guest's view. To work around it, trap the
+ * sucker using the very same bit it can't set...
+ */
+ if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu))
+ val |= CPTR_EL2_TCPAC;
+
+ /*
+ * Layer the guest hypervisor's trap configuration on top of our own if
+ * we're in a nested context.
+ */
+ if (is_hyp_ctxt(vcpu))
+ goto write;
+
+ cptr = vcpu_sanitised_cptr_el2(vcpu);
+
+ /*
+ * Pay attention, there's some interesting detail here.
+ *
+ * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two
+ * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest):
+ *
+ * - CPTR_EL2.xEN = x0, traps are enabled
+ * - CPTR_EL2.xEN = x1, traps are disabled
+ *
+ * In other words, bit[0] determines if guest accesses trap or not. In
+ * the interest of simplicity, clear the entire field if the guest
+ * hypervisor has traps enabled to dispel any illusion of something more
+ * complicated taking place.
+ */
+ if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0)))
+ val &= ~CPACR_EL1_FPEN;
+ if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0)))
+ val &= ~CPACR_EL1_ZEN;
+
+ if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP))
+ val |= cptr & CPACR_EL1_E0POE;
+
+ val |= cptr & CPTR_EL2_TCPAC;
+
+write:
+ write_sysreg(val, cpacr_el1);
+}
+
+static inline void __activate_cptr_traps(struct kvm_vcpu *vcpu)
+{
+ if (!guest_owns_fp_regs())
+ __activate_traps_fpsimd32(vcpu);
+
+ if (has_vhe() || has_hvhe())
+ __activate_cptr_traps_vhe(vcpu);
+ else
+ __activate_cptr_traps_nvhe(vcpu);
+}
+
+static inline void __deactivate_cptr_traps_nvhe(struct kvm_vcpu *vcpu)
+{
+ u64 val = CPTR_NVHE_EL2_RES1;
+
+ if (!cpus_have_final_cap(ARM64_SVE))
+ val |= CPTR_EL2_TZ;
+ if (!cpus_have_final_cap(ARM64_SME))
+ val |= CPTR_EL2_TSM;
+
+ write_sysreg(val, cptr_el2);
+}
+
+static inline void __deactivate_cptr_traps_vhe(struct kvm_vcpu *vcpu)
+{
+ u64 val = CPACR_EL1_FPEN;
+
+ if (cpus_have_final_cap(ARM64_SVE))
+ val |= CPACR_EL1_ZEN;
+ if (cpus_have_final_cap(ARM64_SME))
+ val |= CPACR_EL1_SMEN;
+
+ write_sysreg(val, cpacr_el1);
+}
+
+static inline void __deactivate_cptr_traps(struct kvm_vcpu *vcpu)
+{
+ if (has_vhe() || has_hvhe())
+ __deactivate_cptr_traps_vhe(vcpu);
+ else
+ __deactivate_cptr_traps_nvhe(vcpu);
+}
+
#define reg_to_fgt_masks(reg) \
({ \
struct fgt_masks *m; \
@@ -486,11 +616,6 @@ static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu)
*/
if (system_supports_sve()) {
__hyp_sve_save_host();
-
- /* Re-enable SVE traps if not supported for the guest vcpu. */
- if (!vcpu_has_sve(vcpu))
- cpacr_clear_set(CPACR_EL1_ZEN, 0);
-
} else {
__fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs));
}
@@ -541,10 +666,7 @@ static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
/* Valid trap. Switch the context: */
/* First disable enough traps to allow us to update the registers */
- if (sve_guest || (is_protected_kvm_enabled() && system_supports_sve()))
- cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN);
- else
- cpacr_clear_set(0, CPACR_EL1_FPEN);
+ __deactivate_cptr_traps(vcpu);
isb();
/* Write out the host state if it's in the registers */
@@ -566,6 +688,13 @@ static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
*host_data_ptr(fp_owner) = FP_STATE_GUEST_OWNED;
+ /*
+ * Re-enable traps necessary for the current state of the guest, e.g.
+ * those enabled by a guest hypervisor. The ERET to the guest will
+ * provide the necessary context synchronization.
+ */
+ __activate_cptr_traps(vcpu);
+
return true;
}
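The xEN comment above boils down to a single-bit test; as a standalone predicate:

#include <stdbool.h>
#include <stdint.h>

/* CPTR_EL2.xEN decoding per the comment above (HCR_EL2.TGE = 0). */
static bool xen_traps_guest(uint64_t xen)	/* 2-bit field value */
{
	return !(xen & 1);	/* x0 -> traps enabled, x1 -> disabled */
}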
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index a76522d63c3e..0b0a68b663d4 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -12,7 +12,7 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS
ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS -D__DISABLE_TRACE_MMIO__
ccflags-y += -fno-stack-protector \
-DDISABLE_BRANCH_PROFILING \
- $(DISABLE_STACKLEAK_PLUGIN)
+ $(DISABLE_KSTACK_ERASE)
hostprogs := gen-hyprel
HOST_EXTRACFLAGS += -I$(objtree)/include
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index e9198e56e784..3206b2c07f82 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -69,7 +69,10 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu)
if (!guest_owns_fp_regs())
return;
- cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN);
+ /*
+ * Traps have been disabled by __deactivate_cptr_traps(), but there
+ * hasn't necessarily been a context synchronization event yet.
+ */
isb();
if (vcpu_has_sve(vcpu))
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 95d7534c9679..8957734d6183 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -479,6 +479,7 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
{
struct kvm_mem_range cur;
kvm_pte_t pte;
+ u64 granule;
s8 level;
int ret;
@@ -496,18 +497,21 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
return -EPERM;
}
- do {
- u64 granule = kvm_granule_size(level);
+ for (; level <= KVM_PGTABLE_LAST_LEVEL; level++) {
+ if (!kvm_level_supports_block_mapping(level))
+ continue;
+ granule = kvm_granule_size(level);
cur.start = ALIGN_DOWN(addr, granule);
cur.end = cur.start + granule;
- level++;
- } while ((level <= KVM_PGTABLE_LAST_LEVEL) &&
- !(kvm_level_supports_block_mapping(level) &&
- range_included(&cur, range)));
+ if (!range_included(&cur, range))
+ continue;
+ *range = cur;
+ return 0;
+ }
- *range = cur;
+ WARN_ON(1);

- return 0;
+ return -EINVAL;
}
int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
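The rewritten walk is easier to see stripped of page-table plumbing: visit each level from the faulting one downwards, skip levels that cannot hold a block mapping, and take the first granule-aligned block fully contained in the requested range. A standalone model assuming a 4K granule; the helper behaviour is an assumption, not the kernel's exact logic:

#include <stdbool.h>
#include <stdint.h>

#define LAST_LEVEL 3

static bool level_supports_block(int level) { return level >= 1; }	/* assumed */
static uint64_t granule_size(int level) { return 1ULL << (39 - 9 * level); }

static int adjust_range(uint64_t addr, int level, uint64_t *start, uint64_t *end)
{
	for (; level <= LAST_LEVEL; level++) {
		uint64_t g, s;

		if (!level_supports_block(level))
			continue;
		g = granule_size(level);
		s = addr & ~(g - 1);
		if (s >= *start && s + g <= *end) {
			*start = s;
			*end = s + g;
			return 0;
		}
	}
	return -1;	/* mirrors the new WARN_ON(1): should be unreachable */
}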
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 73affe1333a4..0e752b515d0f 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -47,65 +47,6 @@ struct fgt_masks hdfgwtr2_masks;
extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc);
-static void __activate_cptr_traps(struct kvm_vcpu *vcpu)
-{
- u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */
-
- if (!guest_owns_fp_regs())
- __activate_traps_fpsimd32(vcpu);
-
- if (has_hvhe()) {
- val |= CPACR_EL1_TTA;
-
- if (guest_owns_fp_regs()) {
- val |= CPACR_EL1_FPEN;
- if (vcpu_has_sve(vcpu))
- val |= CPACR_EL1_ZEN;
- }
-
- write_sysreg(val, cpacr_el1);
- } else {
- val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1;
-
- /*
- * Always trap SME since it's not supported in KVM.
- * TSM is RES1 if SME isn't implemented.
- */
- val |= CPTR_EL2_TSM;
-
- if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs())
- val |= CPTR_EL2_TZ;
-
- if (!guest_owns_fp_regs())
- val |= CPTR_EL2_TFP;
-
- write_sysreg(val, cptr_el2);
- }
-}
-
-static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu)
-{
- if (has_hvhe()) {
- u64 val = CPACR_EL1_FPEN;
-
- if (cpus_have_final_cap(ARM64_SVE))
- val |= CPACR_EL1_ZEN;
- if (cpus_have_final_cap(ARM64_SME))
- val |= CPACR_EL1_SMEN;
-
- write_sysreg(val, cpacr_el1);
- } else {
- u64 val = CPTR_NVHE_EL2_RES1;
-
- if (!cpus_have_final_cap(ARM64_SVE))
- val |= CPTR_EL2_TZ;
- if (!cpus_have_final_cap(ARM64_SME))
- val |= CPTR_EL2_TSM;
-
- write_sysreg(val, cptr_el2);
- }
-}
-
static void __activate_traps(struct kvm_vcpu *vcpu)
{
___activate_traps(vcpu, vcpu->arch.hcr_el2);
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index 09df2b42bc1b..477f1580ffea 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -90,87 +90,6 @@ static u64 __compute_hcr(struct kvm_vcpu *vcpu)
return hcr | (guest_hcr & ~NV_HCR_GUEST_EXCLUDE);
}
-static void __activate_cptr_traps(struct kvm_vcpu *vcpu)
-{
- u64 cptr;
-
- /*
- * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to
- * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2,
- * except for some missing controls, such as TAM.
- * In this case, CPTR_EL2.TAM has the same position with or without
- * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM
- * shift value for trapping the AMU accesses.
- */
- u64 val = CPACR_EL1_TTA | CPTR_EL2_TAM;
-
- if (guest_owns_fp_regs()) {
- val |= CPACR_EL1_FPEN;
- if (vcpu_has_sve(vcpu))
- val |= CPACR_EL1_ZEN;
- } else {
- __activate_traps_fpsimd32(vcpu);
- }
-
- if (!vcpu_has_nv(vcpu))
- goto write;
-
- /*
- * The architecture is a bit crap (what a surprise): an EL2 guest
- * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA,
- * as they are RES0 in the guest's view. To work around it, trap the
- * sucker using the very same bit it can't set...
- */
- if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu))
- val |= CPTR_EL2_TCPAC;
-
- /*
- * Layer the guest hypervisor's trap configuration on top of our own if
- * we're in a nested context.
- */
- if (is_hyp_ctxt(vcpu))
- goto write;
-
- cptr = vcpu_sanitised_cptr_el2(vcpu);
-
- /*
- * Pay attention, there's some interesting detail here.
- *
- * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two
- * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest):
- *
- * - CPTR_EL2.xEN = x0, traps are enabled
- * - CPTR_EL2.xEN = x1, traps are disabled
- *
- * In other words, bit[0] determines if guest accesses trap or not. In
- * the interest of simplicity, clear the entire field if the guest
- * hypervisor has traps enabled to dispel any illusion of something more
- * complicated taking place.
- */
- if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0)))
- val &= ~CPACR_EL1_FPEN;
- if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0)))
- val &= ~CPACR_EL1_ZEN;
-
- if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP))
- val |= cptr & CPACR_EL1_E0POE;
-
- val |= cptr & CPTR_EL2_TCPAC;
-
-write:
- write_sysreg(val, cpacr_el1);
-}
-
-static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu)
-{
- u64 val = CPACR_EL1_FPEN | CPACR_EL1_ZEN_EL1EN;
-
- if (cpus_have_final_cap(ARM64_SME))
- val |= CPACR_EL1_SMEN_EL1EN;
-
- write_sysreg(val, cpacr_el1);
-}
-
static void __activate_traps(struct kvm_vcpu *vcpu)
{
u64 val;
@@ -639,10 +558,10 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
host_ctxt = host_data_ptr(host_ctxt);
guest_ctxt = &vcpu->arch.ctxt;
- sysreg_save_host_state_vhe(host_ctxt);
-
fpsimd_lazy_switch_to_guest(vcpu);
+ sysreg_save_host_state_vhe(host_ctxt);
+
/*
* Note that ARM erratum 1165522 requires us to configure both stage 1
* and stage 2 translation for the guest context before we clear
@@ -667,15 +586,23 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
__deactivate_traps(vcpu);
- fpsimd_lazy_switch_to_host(vcpu);
-
sysreg_restore_host_state_vhe(host_ctxt);
+ __debug_switch_to_host(vcpu);
+
+ /*
+ * Ensure that all system register writes above have taken effect
+ * before returning to the host. In VHE mode, CPTR traps for
+ * FPSIMD/SVE/SME also apply to EL2, so FPSIMD/SVE/SME state must be
+ * manipulated after the ISB.
+ */
+ isb();
+
+ fpsimd_lazy_switch_to_host(vcpu);
+
if (guest_owns_fp_regs())
__fpsimd_save_fpexc32(vcpu);
- __debug_switch_to_host(vcpu);
-
return exit_code;
}
NOKPROBE_SYMBOL(__kvm_vcpu_run_vhe);
@@ -705,12 +632,6 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
*/
local_daif_restore(DAIF_PROCCTX_NOIRQ);
- /*
- * When we exit from the guest we change a number of CPU configuration
- * parameters, such as traps. We rely on the isb() in kvm_call_hyp*()
- * to make sure these changes take effect before running the host or
- * additional guests.
- */
return ret;
}
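The reordering above hinges on the new isb(): trap configuration written via sysreg instructions only takes effect after a context synchronization event, and under VHE the FPSIMD/SVE/SME traps being torn down also apply to EL2 itself, so the lazy FP switch must run after that point. As a loose analogue in C11, with a fence standing in for the ISB (which is a context-synchronization event, not a memory barrier):

#include <stdatomic.h>

static _Atomic unsigned long trap_cfg;
static unsigned long fp_state;

static void return_to_host(unsigned long host_cfg)
{
	atomic_store_explicit(&trap_cfg, host_cfg, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* plays the ISB's role */
	fp_state = 0;	/* only safe once the new configuration is in effect */
}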
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 5b191f4dc566..dc1d26559bfa 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -1402,6 +1402,21 @@ static void kvm_map_l1_vncr(struct kvm_vcpu *vcpu)
}
}
+#define has_tgran_2(__r, __sz) \
+ ({ \
+ u64 _s1, _s2, _mmfr0 = __r; \
+ \
+ _s2 = SYS_FIELD_GET(ID_AA64MMFR0_EL1, \
+ TGRAN##__sz##_2, _mmfr0); \
+ \
+ _s1 = SYS_FIELD_GET(ID_AA64MMFR0_EL1, \
+ TGRAN##__sz, _mmfr0); \
+ \
+ ((_s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_NI && \
+ _s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz) || \
+ (_s2 == ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz && \
+ _s1 != ID_AA64MMFR0_EL1_TGRAN##__sz##_NI)); \
+ })
/*
* Our emulated CPU doesn't support all the possible features. For the
* sake of simplicity (and probably mental sanity), wipe out a number
@@ -1411,6 +1426,8 @@ static void kvm_map_l1_vncr(struct kvm_vcpu *vcpu)
*/
u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
{
+ u64 orig_val = val;
+
switch (reg) {
case SYS_ID_AA64ISAR0_EL1:
/* Support everything but TME */
@@ -1480,13 +1497,16 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
*/
switch (PAGE_SIZE) {
case SZ_4K:
- val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, IMP);
+ if (has_tgran_2(orig_val, 4))
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, IMP);
fallthrough;
case SZ_16K:
- val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, IMP);
+ if (has_tgran_2(orig_val, 16))
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, IMP);
fallthrough;
case SZ_64K:
- val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN64_2, IMP);
+ if (has_tgran_2(orig_val, 64))
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN64_2, IMP);
break;
}
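The has_tgran_2() macro encodes the architectural rule for the ID_AA64MMFR0_EL1.TGRANx_2 fields: a value of 0 defers to the corresponding stage-1 TGRANx field, 1 means not implemented at stage 2, and anything higher means implemented. A standalone model; the per-granule stage-1 "not implemented" value is passed in, since it differs between the 4K/64K and 16K fields:

#include <stdbool.h>

enum { TGRAN2_AS_STAGE1 = 0, TGRAN2_NI = 1 };

static bool has_stage2_granule(unsigned s2, unsigned s1, unsigned s1_ni)
{
	if (s2 != TGRAN2_NI && s2 != TGRAN2_AS_STAGE1)
		return true;				/* implemented explicitly */
	return s2 == TGRAN2_AS_STAGE1 && s1 != s1_ni;	/* defer to stage 1 */
}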
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 76c2f0da821f..c20bd6f21e60 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -2624,7 +2624,7 @@ static bool access_mdcr(struct kvm_vcpu *vcpu,
*/
if (hpmn > vcpu->kvm->arch.nr_pmu_counters) {
hpmn = vcpu->kvm->arch.nr_pmu_counters;
- u64_replace_bits(val, hpmn, MDCR_EL2_HPMN);
+ u64p_replace_bits(&val, hpmn, MDCR_EL2_HPMN);
}
__vcpu_assign_sys_reg(vcpu, MDCR_EL2, val);
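The one-character fix matters because, in linux/bitfield.h, u64_replace_bits() returns the updated value while u64p_replace_bits() writes through a pointer; used as a bare statement, the former silently discards the result. Simplified stand-ins make the difference obvious:

#include <stdint.h>

#define FIELD_MASK 0x1fULL	/* stand-in for MDCR_EL2_HPMN */

static uint64_t u64_rep(uint64_t v, uint64_t f)
{
	return (v & ~FIELD_MASK) | (f & FIELD_MASK);
}

static void u64p_rep(uint64_t *v, uint64_t f)
{
	*v = u64_rep(*v, f);
}

static void demo(uint64_t *val, uint64_t hpmn)
{
	u64_rep(*val, hpmn);	/* bug: return value dropped, *val unchanged */
	u64p_rep(val, hpmn);	/* fix: *val updated in place */
}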
diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c
index d22a8ad7bcc5..679aafe77de2 100644
--- a/arch/arm64/kvm/vgic/vgic-v3-nested.c
+++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c
@@ -36,6 +36,11 @@ struct shadow_if {
static DEFINE_PER_CPU(struct shadow_if, shadow_if);
+static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx)
+{
+ return hweight16(shadow_if->lr_map & (BIT(idx) - 1));
+}
+
/*
* Nesting GICv3 support
*
@@ -209,6 +214,29 @@ u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu)
return reg;
}
+static u64 translate_lr_pintid(struct kvm_vcpu *vcpu, u64 lr)
+{
+ struct vgic_irq *irq;
+
+ if (!(lr & ICH_LR_HW))
+ return lr;
+
+ /* We have the HW bit set, check for validity of pINTID */
+ irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
+ /* If there was no real mapping, nuke the HW bit */
+ if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI)
+ lr &= ~ICH_LR_HW;
+
+ /* Translate the virtual mapping to the real one, even if invalid */
+ if (irq) {
+ lr &= ~ICH_LR_PHYS_ID_MASK;
+ lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid);
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ return lr;
+}
+
/*
* For LRs which have HW bit set such as timer interrupts, we modify them to
* have the host hardware interrupt number instead of the virtual one programmed
@@ -217,58 +245,37 @@ u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu)
static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu,
struct vgic_v3_cpu_if *s_cpu_if)
{
- unsigned long lr_map = 0;
- int index = 0;
+ struct shadow_if *shadow_if;
+
+ shadow_if = container_of(s_cpu_if, struct shadow_if, cpuif);
+ shadow_if->lr_map = 0;
for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
- struct vgic_irq *irq;
if (!(lr & ICH_LR_STATE))
- lr = 0;
-
- if (!(lr & ICH_LR_HW))
- goto next;
-
- /* We have the HW bit set, check for validity of pINTID */
- irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
- if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI ) {
- /* There was no real mapping, so nuke the HW bit */
- lr &= ~ICH_LR_HW;
- if (irq)
- vgic_put_irq(vcpu->kvm, irq);
- goto next;
- }
-
- /* Translate the virtual mapping to the real one */
- lr &= ~ICH_LR_PHYS_ID_MASK;
- lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid);
+ continue;
- vgic_put_irq(vcpu->kvm, irq);
+ lr = translate_lr_pintid(vcpu, lr);
-next:
- s_cpu_if->vgic_lr[index] = lr;
- if (lr) {
- lr_map |= BIT(i);
- index++;
- }
+ s_cpu_if->vgic_lr[hweight16(shadow_if->lr_map)] = lr;
+ shadow_if->lr_map |= BIT(i);
}
- container_of(s_cpu_if, struct shadow_if, cpuif)->lr_map = lr_map;
- s_cpu_if->used_lrs = index;
+ s_cpu_if->used_lrs = hweight16(shadow_if->lr_map);
}
void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
{
struct shadow_if *shadow_if = get_shadow_if();
- int i, index = 0;
+ int i;
for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
struct vgic_irq *irq;
if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE))
- goto next;
+ continue;
/*
* If we had a HW lr programmed by the guest hypervisor, we
@@ -277,15 +284,13 @@ void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
*/
irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */
- goto next;
+ continue;
- lr = __gic_v3_get_lr(index);
+ lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i));
if (!(lr & ICH_LR_STATE))
irq->active = false;
vgic_put_irq(vcpu->kvm, irq);
- next:
- index++;
}
}
@@ -368,13 +373,11 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu)
val = __vcpu_sys_reg(vcpu, ICH_LRN(i));
val &= ~ICH_LR_STATE;
- val |= s_cpu_if->vgic_lr[i] & ICH_LR_STATE;
+ val |= s_cpu_if->vgic_lr[lr_map_idx_to_shadow_idx(shadow_if, i)] & ICH_LR_STATE;
__vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val);
- s_cpu_if->vgic_lr[i] = 0;
}
- shadow_if->lr_map = 0;
vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0;
}
@@ -398,9 +401,7 @@ void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu)
{
bool level;
- level = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_En;
- if (level)
- level &= vgic_v3_get_misr(vcpu);
+ level = (__vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_En) && vgic_v3_get_misr(vcpu);
kvm_vgic_inject_irq(vcpu->kvm, vcpu,
vcpu->kvm->arch.vgic.mi_intid, level, vcpu);
}
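lr_map_idx_to_shadow_idx() works because the shadow LR array is simply the dense packing of the set bits in lr_map, so the shadow slot for guest LR i is the count of set bits below bit i:

#include <stdint.h>

static int shadow_index(uint16_t lr_map, int i)
{
	/* Number of in-use LRs below bit i == this LR's packed position. */
	return __builtin_popcount(lr_map & ((1u << i) - 1));
}

/* e.g. lr_map = 0xa4 (bits 2, 5, 7): guest LRs 2, 5, 7 -> shadow 0, 1, 2. */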
diff --git a/arch/arm64/lib/.gitignore b/arch/arm64/lib/.gitignore
new file mode 100644
index 000000000000..647d7a922e68
--- /dev/null
+++ b/arch/arm64/lib/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# This now-removed directory used to contain generated files.
+/crypto/
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 027bfa9689c6..633e5223d944 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,7 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
-
-obj-y += crypto/
-
lib-y := clear_user.o delay.o copy_from_user.o \
copy_to_user.o copy_page.o \
clear_page.o csum.o insn.o memchr.o memcpy.o \
@@ -16,12 +13,6 @@ endif
lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o
-obj-$(CONFIG_CRC32_ARCH) += crc32-arm64.o
-crc32-arm64-y := crc32.o crc32-core.o
-
-obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-arm64.o
-crc-t10dif-arm64-y := crc-t10dif.o crc-t10dif-core.o
-
obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
obj-$(CONFIG_ARM64_MTE) += mte.o
diff --git a/arch/arm64/lib/crc-t10dif-core.S b/arch/arm64/lib/crc-t10dif-core.S
deleted file mode 100644
index 87dd6d46224d..000000000000
--- a/arch/arm64/lib/crc-t10dif-core.S
+++ /dev/null
@@ -1,469 +0,0 @@
-//
-// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
-//
-// Copyright (C) 2016 Linaro Ltd
-// Copyright (C) 2019-2024 Google LLC
-//
-// Authors: Ard Biesheuvel <ardb@google.com>
-// Eric Biggers <ebiggers@google.com>
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License version 2 as
-// published by the Free Software Foundation.
-//
-
-// Derived from the x86 version:
-//
-// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
-//
-// Copyright (c) 2013, Intel Corporation
-//
-// Authors:
-// Erdinc Ozturk <erdinc.ozturk@intel.com>
-// Vinodh Gopal <vinodh.gopal@intel.com>
-// James Guilford <james.guilford@intel.com>
-// Tim Chen <tim.c.chen@linux.intel.com>
-//
-// This software is available to you under a choice of one of two
-// licenses. You may choose to be licensed under the terms of the GNU
-// General Public License (GPL) Version 2, available from the file
-// COPYING in the main directory of this source tree, or the
-// OpenIB.org BSD license below:
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the
-// distribution.
-//
-// * Neither the name of the Intel Corporation nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-//
-// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Reference paper titled "Fast CRC Computation for Generic
-// Polynomials Using PCLMULQDQ Instruction"
-// URL: http://www.intel.com/content/dam/www/public/us/en/documents
-// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
-//
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .text
- .arch armv8-a+crypto
-
- init_crc .req w0
- buf .req x1
- len .req x2
- fold_consts_ptr .req x5
-
- fold_consts .req v10
-
- t3 .req v17
- t4 .req v18
- t5 .req v19
- t6 .req v20
- t7 .req v21
- t8 .req v22
-
- perm .req v27
-
- .macro pmull16x64_p64, a16, b64, c64
- pmull2 \c64\().1q, \a16\().2d, \b64\().2d
- pmull \b64\().1q, \a16\().1d, \b64\().1d
- .endm
-
- /*
- * Pairwise long polynomial multiplication of two 16-bit values
- *
- * { w0, w1 }, { y0, y1 }
- *
- * by two 64-bit values
- *
- * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
- *
- * where each vector element is a byte, ordered from least to most
- * significant.
- *
- * This can be implemented using 8x8 long polynomial multiplication, by
- * reorganizing the input so that each pairwise 8x8 multiplication
- * produces one of the terms from the decomposition below, and
- * combining the results of each rank and shifting them into place.
- *
- * Rank
- * 0 w0*x0 ^ | y0*z0 ^
- * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^
- * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^
- * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^
- * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^
- * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^
- * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^
- * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^
- * 8 w1*x7 << 64 | y1*z7 << 64
- *
- * The inputs can be reorganized into
- *
- * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
- * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
- *
- * and after performing 8x8->16 bit long polynomial multiplication of
- * each of the halves of the first vector with those of the second one,
- * we obtain the following four vectors of 16-bit elements:
- *
- * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
- * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
- * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
- * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
- *
- * Results b and c can be XORed together, as the vector elements have
- * matching ranks. Then, the final XOR (*) can be pulled forward, and
- * applied between the halves of each of the remaining three vectors,
- * which are then shifted into place, and combined to produce two
- * 80-bit results.
- *
- * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent
- * to the 64x64 bit one above, but XOR'ing the outputs together will
- * produce the expected result, and this is sufficient in the context of
- * this algorithm.
- */
- .macro pmull16x64_p8, a16, b64, c64
- ext t7.16b, \b64\().16b, \b64\().16b, #1
- tbl t5.16b, {\a16\().16b}, perm.16b
- uzp1 t7.16b, \b64\().16b, t7.16b
- bl __pmull_p8_16x64
- ext \b64\().16b, t4.16b, t4.16b, #15
- eor \c64\().16b, t8.16b, t5.16b
- .endm
-
-SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
- ext t6.16b, t5.16b, t5.16b, #8
-
- pmull t3.8h, t7.8b, t5.8b
- pmull t4.8h, t7.8b, t6.8b
- pmull2 t5.8h, t7.16b, t5.16b
- pmull2 t6.8h, t7.16b, t6.16b
-
- ext t8.16b, t3.16b, t3.16b, #8
- eor t4.16b, t4.16b, t6.16b
- ext t7.16b, t5.16b, t5.16b, #8
- ext t6.16b, t4.16b, t4.16b, #8
- eor t8.8b, t8.8b, t3.8b
- eor t5.8b, t5.8b, t7.8b
- eor t4.8b, t4.8b, t6.8b
- ext t5.16b, t5.16b, t5.16b, #14
- ret
-SYM_FUNC_END(__pmull_p8_16x64)
-
-
- // Fold reg1, reg2 into the next 32 data bytes, storing the result back
- // into reg1, reg2.
- .macro fold_32_bytes, p, reg1, reg2
- ldp q11, q12, [buf], #0x20
-
- pmull16x64_\p fold_consts, \reg1, v8
-
-CPU_LE( rev64 v11.16b, v11.16b )
-CPU_LE( rev64 v12.16b, v12.16b )
-
- pmull16x64_\p fold_consts, \reg2, v9
-
-CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
-CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
-
- eor \reg1\().16b, \reg1\().16b, v8.16b
- eor \reg2\().16b, \reg2\().16b, v9.16b
- eor \reg1\().16b, \reg1\().16b, v11.16b
- eor \reg2\().16b, \reg2\().16b, v12.16b
- .endm
-
- // Fold src_reg into dst_reg, optionally loading the next fold constants
- .macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts
- pmull16x64_\p fold_consts, \src_reg, v8
- .ifnb \load_next_consts
- ld1 {fold_consts.2d}, [fold_consts_ptr], #16
- .endif
- eor \dst_reg\().16b, \dst_reg\().16b, v8.16b
- eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
- .endm
-
- .macro crc_t10dif_pmull, p
-
- // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
- cmp len, #256
- b.lt .Lless_than_256_bytes_\@
-
- adr_l fold_consts_ptr, .Lfold_across_128_bytes_consts
-
- // Load the first 128 data bytes. Byte swapping is necessary to make
- // the bit order match the polynomial coefficient order.
- ldp q0, q1, [buf]
- ldp q2, q3, [buf, #0x20]
- ldp q4, q5, [buf, #0x40]
- ldp q6, q7, [buf, #0x60]
- add buf, buf, #0x80
-CPU_LE( rev64 v0.16b, v0.16b )
-CPU_LE( rev64 v1.16b, v1.16b )
-CPU_LE( rev64 v2.16b, v2.16b )
-CPU_LE( rev64 v3.16b, v3.16b )
-CPU_LE( rev64 v4.16b, v4.16b )
-CPU_LE( rev64 v5.16b, v5.16b )
-CPU_LE( rev64 v6.16b, v6.16b )
-CPU_LE( rev64 v7.16b, v7.16b )
-CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
-CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
-CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 )
-CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 )
-CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 )
-CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 )
-CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 )
-CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
-
- // XOR the first 16 data *bits* with the initial CRC value.
- movi v8.16b, #0
- mov v8.h[7], init_crc
- eor v0.16b, v0.16b, v8.16b
-
- // Load the constants for folding across 128 bytes.
- ld1 {fold_consts.2d}, [fold_consts_ptr]
-
- // Subtract 128 for the 128 data bytes just consumed. Subtract another
- // 128 to simplify the termination condition of the following loop.
- sub len, len, #256
-
- // While >= 128 data bytes remain (not counting v0-v7), fold the 128
- // bytes v0-v7 into them, storing the result back into v0-v7.
-.Lfold_128_bytes_loop_\@:
- fold_32_bytes \p, v0, v1
- fold_32_bytes \p, v2, v3
- fold_32_bytes \p, v4, v5
- fold_32_bytes \p, v6, v7
-
- subs len, len, #128
- b.ge .Lfold_128_bytes_loop_\@
-
- // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
-
- // Fold across 64 bytes.
- add fold_consts_ptr, fold_consts_ptr, #16
- ld1 {fold_consts.2d}, [fold_consts_ptr], #16
- fold_16_bytes \p, v0, v4
- fold_16_bytes \p, v1, v5
- fold_16_bytes \p, v2, v6
- fold_16_bytes \p, v3, v7, 1
- // Fold across 32 bytes.
- fold_16_bytes \p, v4, v6
- fold_16_bytes \p, v5, v7, 1
- // Fold across 16 bytes.
- fold_16_bytes \p, v6, v7
-
- // Add 128 to get the correct number of data bytes remaining in 0...127
- // (not counting v7), following the previous extra subtraction by 128.
- // Then subtract 16 to simplify the termination condition of the
- // following loop.
- adds len, len, #(128-16)
-
- // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
- // into them, storing the result back into v7.
- b.lt .Lfold_16_bytes_loop_done_\@
-.Lfold_16_bytes_loop_\@:
- pmull16x64_\p fold_consts, v7, v8
- eor v7.16b, v7.16b, v8.16b
- ldr q0, [buf], #16
-CPU_LE( rev64 v0.16b, v0.16b )
-CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
- eor v7.16b, v7.16b, v0.16b
- subs len, len, #16
- b.ge .Lfold_16_bytes_loop_\@
-
-.Lfold_16_bytes_loop_done_\@:
- // Add 16 to get the correct number of data bytes remaining in 0...15
- // (not counting v7), following the previous extra subtraction by 16.
- adds len, len, #16
- b.eq .Lreduce_final_16_bytes_\@
-
-.Lhandle_partial_segment_\@:
- // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
- // 16 bytes are in v7 and the rest are the remaining data in 'buf'. To
- // do this without needing a fold constant for each possible 'len',
- // redivide the bytes into a first chunk of 'len' bytes and a second
- // chunk of 16 bytes, then fold the first chunk into the second.
-
- // v0 = last 16 original data bytes
- add buf, buf, len
- ldr q0, [buf, #-16]
-CPU_LE( rev64 v0.16b, v0.16b )
-CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
-
- // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
- adr_l x4, .Lbyteshift_table + 16
- sub x4, x4, len
- ld1 {v2.16b}, [x4]
- tbl v1.16b, {v7.16b}, v2.16b
-
- // v3 = first chunk: v7 right-shifted by '16-len' bytes.
- movi v3.16b, #0x80
- eor v2.16b, v2.16b, v3.16b
- tbl v3.16b, {v7.16b}, v2.16b
-
- // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
- sshr v2.16b, v2.16b, #7
-
- // v2 = second chunk: 'len' bytes from v0 (low-order bytes),
- // then '16-len' bytes from v1 (high-order bytes).
- bsl v2.16b, v1.16b, v0.16b
-
- // Fold the first chunk into the second chunk, storing the result in v7.
- pmull16x64_\p fold_consts, v3, v0
- eor v7.16b, v3.16b, v0.16b
- eor v7.16b, v7.16b, v2.16b
- b .Lreduce_final_16_bytes_\@
-
-.Lless_than_256_bytes_\@:
- // Checksumming a buffer of length 16...255 bytes
-
- adr_l fold_consts_ptr, .Lfold_across_16_bytes_consts
-
- // Load the first 16 data bytes.
- ldr q7, [buf], #0x10
-CPU_LE( rev64 v7.16b, v7.16b )
-CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
-
- // XOR the first 16 data *bits* with the initial CRC value.
- movi v0.16b, #0
- mov v0.h[7], init_crc
- eor v7.16b, v7.16b, v0.16b
-
- // Load the fold-across-16-bytes constants.
- ld1 {fold_consts.2d}, [fold_consts_ptr], #16
-
- cmp len, #16
- b.eq .Lreduce_final_16_bytes_\@ // len == 16
- subs len, len, #32
- b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255
- add len, len, #16
- b .Lhandle_partial_segment_\@ // 17 <= len <= 31
-
-.Lreduce_final_16_bytes_\@:
- .endm
-
-//
-// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
-//
-// Assumes len >= 16.
-//
-SYM_FUNC_START(crc_t10dif_pmull_p8)
- frame_push 1
-
- // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
- movi perm.4h, #8, lsl #8
- orr perm.2s, #1, lsl #16
- orr perm.2s, #1, lsl #24
- zip1 perm.16b, perm.16b, perm.16b
- zip1 perm.16b, perm.16b, perm.16b
-
- crc_t10dif_pmull p8
-
-CPU_LE( rev64 v7.16b, v7.16b )
-CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
- str q7, [x3]
-
- frame_pop
- ret
-SYM_FUNC_END(crc_t10dif_pmull_p8)
-
- .align 5
-//
-// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
-//
-// Assumes len >= 16.
-//
-SYM_FUNC_START(crc_t10dif_pmull_p64)
- crc_t10dif_pmull p64
-
- // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
-
- movi v2.16b, #0 // init zero register
-
- // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
- ld1 {fold_consts.2d}, [fold_consts_ptr], #16
-
- // Fold the high 64 bits into the low 64 bits, while also multiplying by
- // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
- // whose low 48 bits are 0.
- ext v0.16b, v2.16b, v7.16b, #8
- pmull2 v7.1q, v7.2d, fold_consts.2d // high bits * x^48 * (x^80 mod G(x))
- eor v0.16b, v0.16b, v7.16b // + low bits * x^64
-
- // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
- // value congruent to x^64 * M(x) and whose low 48 bits are 0.
- ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits
- mov v0.s[3], v2.s[0] // zero high 32 bits
- pmull v1.1q, v1.1d, fold_consts.1d // high 32 bits * x^48 * (x^48 mod G(x))
- eor v0.16b, v0.16b, v1.16b // + low bits
-
- // Load G(x) and floor(x^48 / G(x)).
- ld1 {fold_consts.2d}, [fold_consts_ptr]
-
- // Use Barrett reduction to compute the final CRC value.
- pmull2 v1.1q, v0.2d, fold_consts.2d // high 32 bits * floor(x^48 / G(x))
- ushr v1.2d, v1.2d, #32 // /= x^32
- pmull v1.1q, v1.1d, fold_consts.1d // *= G(x)
- ushr v0.2d, v0.2d, #48
- eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits
- // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
-
- umov w0, v0.h[0]
- ret
-SYM_FUNC_END(crc_t10dif_pmull_p64)
-
- .section ".rodata", "a"
- .align 4
-
-// Fold constants precomputed from the polynomial 0x18bb7
-// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
-.Lfold_across_128_bytes_consts:
- .quad 0x0000000000006123 // x^(8*128) mod G(x)
- .quad 0x0000000000002295 // x^(8*128+64) mod G(x)
-// .Lfold_across_64_bytes_consts:
- .quad 0x0000000000001069 // x^(4*128) mod G(x)
- .quad 0x000000000000dd31 // x^(4*128+64) mod G(x)
-// .Lfold_across_32_bytes_consts:
- .quad 0x000000000000857d // x^(2*128) mod G(x)
- .quad 0x0000000000007acc // x^(2*128+64) mod G(x)
-.Lfold_across_16_bytes_consts:
- .quad 0x000000000000a010 // x^(1*128) mod G(x)
- .quad 0x0000000000001faa // x^(1*128+64) mod G(x)
-// .Lfinal_fold_consts:
- .quad 0x1368000000000000 // x^48 * (x^48 mod G(x))
- .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x))
-// .Lbarrett_reduction_consts:
- .quad 0x0000000000018bb7 // G(x)
- .quad 0x00000001f65a57f8 // floor(x^48 / G(x))
-
-// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
-// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
-// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
-.Lbyteshift_table:
- .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
- .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
- .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
- .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
diff --git a/arch/arm64/lib/crc-t10dif.c b/arch/arm64/lib/crc-t10dif.c
deleted file mode 100644
index c2ffe4fdb59d..000000000000
--- a/arch/arm64/lib/crc-t10dif.c
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
- *
- * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/cpufeature.h>
-#include <linux/crc-t10dif.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <crypto/internal/simd.h>
-
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_asimd);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
-
-#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
-
-asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len,
- u8 out[16]);
-asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
-
-u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
-{
- if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) {
- if (static_branch_likely(&have_pmull)) {
- if (crypto_simd_usable()) {
- kernel_neon_begin();
- crc = crc_t10dif_pmull_p64(crc, data, length);
- kernel_neon_end();
- return crc;
- }
- } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE &&
- static_branch_likely(&have_asimd) &&
- crypto_simd_usable()) {
- u8 buf[16];
-
- kernel_neon_begin();
- crc_t10dif_pmull_p8(crc, data, length, buf);
- kernel_neon_end();
-
- return crc_t10dif_generic(0, buf, sizeof(buf));
- }
- }
- return crc_t10dif_generic(crc, data, length);
-}
-EXPORT_SYMBOL(crc_t10dif_arch);
-
-static int __init crc_t10dif_arm64_init(void)
-{
- if (cpu_have_named_feature(ASIMD)) {
- static_branch_enable(&have_asimd);
- if (cpu_have_named_feature(PMULL))
- static_branch_enable(&have_pmull);
- }
- return 0;
-}
-subsys_initcall(crc_t10dif_arm64_init);
-
-static void __exit crc_t10dif_arm64_exit(void)
-{
-}
-module_exit(crc_t10dif_arm64_exit);
-
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_DESCRIPTION("CRC-T10DIF using arm64 NEON and Crypto Extensions");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/arm64/lib/crc32-core.S b/arch/arm64/lib/crc32-core.S
deleted file mode 100644
index 68825317460f..000000000000
--- a/arch/arm64/lib/crc32-core.S
+++ /dev/null
@@ -1,362 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Accelerated CRC32(C) using AArch64 CRC and PMULL instructions
- *
- * Copyright (C) 2016 - 2018 Linaro Ltd.
- * Copyright (C) 2024 Google LLC
- *
- * Author: Ard Biesheuvel <ardb@kernel.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .cpu generic+crc+crypto
-
- .macro bitle, reg
- .endm
-
- .macro bitbe, reg
- rbit \reg, \reg
- .endm
-
- .macro bytele, reg
- .endm
-
- .macro bytebe, reg
- rbit \reg, \reg
- lsr \reg, \reg, #24
- .endm
-
- .macro hwordle, reg
-CPU_BE( rev16 \reg, \reg )
- .endm
-
- .macro hwordbe, reg
-CPU_LE( rev \reg, \reg )
- rbit \reg, \reg
-CPU_BE( lsr \reg, \reg, #16 )
- .endm
-
- .macro le, regs:vararg
- .irp r, \regs
-CPU_BE( rev \r, \r )
- .endr
- .endm
-
- .macro be, regs:vararg
- .irp r, \regs
-CPU_LE( rev \r, \r )
- .endr
- .irp r, \regs
- rbit \r, \r
- .endr
- .endm
-
- .macro __crc32, c, order=le
- bit\order w0
- cmp x2, #16
- b.lt 8f // less than 16 bytes
-
- and x7, x2, #0x1f
- and x2, x2, #~0x1f
- cbz x7, 32f // multiple of 32 bytes
-
- and x8, x7, #0xf
- ldp x3, x4, [x1]
- add x8, x8, x1
- add x1, x1, x7
- ldp x5, x6, [x8]
- \order x3, x4, x5, x6
-
- tst x7, #8
- crc32\c\()x w8, w0, x3
- csel x3, x3, x4, eq
- csel w0, w0, w8, eq
- tst x7, #4
- lsr x4, x3, #32
- crc32\c\()w w8, w0, w3
- csel x3, x3, x4, eq
- csel w0, w0, w8, eq
- tst x7, #2
- lsr w4, w3, #16
- crc32\c\()h w8, w0, w3
- csel w3, w3, w4, eq
- csel w0, w0, w8, eq
- tst x7, #1
- crc32\c\()b w8, w0, w3
- csel w0, w0, w8, eq
- tst x7, #16
- crc32\c\()x w8, w0, x5
- crc32\c\()x w8, w8, x6
- csel w0, w0, w8, eq
- cbz x2, 0f
-
-32: ldp x3, x4, [x1], #32
- sub x2, x2, #32
- ldp x5, x6, [x1, #-16]
- \order x3, x4, x5, x6
- crc32\c\()x w0, w0, x3
- crc32\c\()x w0, w0, x4
- crc32\c\()x w0, w0, x5
- crc32\c\()x w0, w0, x6
- cbnz x2, 32b
-0: bit\order w0
- ret
-
-8: tbz x2, #3, 4f
- ldr x3, [x1], #8
- \order x3
- crc32\c\()x w0, w0, x3
-4: tbz x2, #2, 2f
- ldr w3, [x1], #4
- \order w3
- crc32\c\()w w0, w0, w3
-2: tbz x2, #1, 1f
- ldrh w3, [x1], #2
- hword\order w3
- crc32\c\()h w0, w0, w3
-1: tbz x2, #0, 0f
- ldrb w3, [x1]
- byte\order w3
- crc32\c\()b w0, w0, w3
-0: bit\order w0
- ret
- .endm
-
- .align 5
-SYM_FUNC_START(crc32_le_arm64)
- __crc32
-SYM_FUNC_END(crc32_le_arm64)
-
- .align 5
-SYM_FUNC_START(crc32c_le_arm64)
- __crc32 c
-SYM_FUNC_END(crc32c_le_arm64)
-
- .align 5
-SYM_FUNC_START(crc32_be_arm64)
- __crc32 order=be
-SYM_FUNC_END(crc32_be_arm64)
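
The 8/4/2/1 tail handling in the macro can be written more plainly with the ACLE CRC32 intrinsics. A little-endian sketch, assuming a CRC-capable target (-march=armv8-a+crc) and using memcpy for the possibly unaligned loads:

#include <arm_acle.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Consume a 0..15 byte tail, widest chunks first. */
static uint32_t crc32_tail(uint32_t crc, const uint8_t *p, size_t len)
{
	uint64_t d;
	uint32_t w;
	uint16_t h;

	if (len & 8) { memcpy(&d, p, 8); crc = __crc32d(crc, d); p += 8; }
	if (len & 4) { memcpy(&w, p, 4); crc = __crc32w(crc, w); p += 4; }
	if (len & 2) { memcpy(&h, p, 2); crc = __crc32h(crc, h); p += 2; }
	if (len & 1) crc = __crc32b(crc, *p);
	return crc;
}

The 8f/4f/2f/1f labels above do the same with tbz; the 16-byte-plus prologue instead folds the conditional steps in branch-free with csel over overlapping loads.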
-
- in .req x1
- len .req x2
-
- /*
- * w0: input CRC at entry, output CRC at exit
- * x1: pointer to input buffer
- * x2: length of input in bytes
- */
- .macro crc4way, insn, table, order=le
- bit\order w0
- lsr len, len, #6 // len := # of 64-byte blocks
-
- /* Process up to 64 blocks of 64 bytes at a time */
-.La\@: mov x3, #64
- cmp len, #64
- csel x3, x3, len, hi // x3 := min(len, 64)
- sub len, len, x3
-
- /* Divide the input into 4 contiguous blocks */
- add x4, x3, x3, lsl #1 // x4 := 3 * x3
- add x7, in, x3, lsl #4 // x7 := in + 16 * x3
- add x8, in, x3, lsl #5 // x8 := in + 32 * x3
- add x9, in, x4, lsl #4 // x9 := in + 16 * x4
-
- /* Load the folding coefficients from the lookup table */
- adr_l x5, \table - 12 // entry 0 omitted
- add x5, x5, x4, lsl #2 // x5 += 12 * x3
- ldp s0, s1, [x5]
- ldr s2, [x5, #8]
-
- /* Zero init partial CRCs for this iteration */
- mov w4, wzr
- mov w5, wzr
- mov w6, wzr
- mov x17, xzr
-
-.Lb\@: sub x3, x3, #1
- \insn w6, w6, x17
- ldp x10, x11, [in], #16
- ldp x12, x13, [x7], #16
- ldp x14, x15, [x8], #16
- ldp x16, x17, [x9], #16
-
- \order x10, x11, x12, x13, x14, x15, x16, x17
-
- /* Apply the CRC transform to 4 16-byte blocks in parallel */
- \insn w0, w0, x10
- \insn w4, w4, x12
- \insn w5, w5, x14
- \insn w6, w6, x16
- \insn w0, w0, x11
- \insn w4, w4, x13
- \insn w5, w5, x15
- cbnz x3, .Lb\@
-
- /* Combine the 4 partial results into w0 */
- mov v3.d[0], x0
- mov v4.d[0], x4
- mov v5.d[0], x5
- pmull v0.1q, v0.1d, v3.1d
- pmull v1.1q, v1.1d, v4.1d
- pmull v2.1q, v2.1d, v5.1d
- eor v0.8b, v0.8b, v1.8b
- eor v0.8b, v0.8b, v2.8b
- mov x5, v0.d[0]
- eor x5, x5, x17
- \insn w0, w6, x5
-
- mov in, x9
- cbnz len, .La\@
-
- bit\order w0
- ret
- .endm
-
- .align 5
-SYM_FUNC_START(crc32c_le_arm64_4way)
- crc4way crc32cx, .L0
-SYM_FUNC_END(crc32c_le_arm64_4way)
-
- .align 5
-SYM_FUNC_START(crc32_le_arm64_4way)
- crc4way crc32x, .L1
-SYM_FUNC_END(crc32_le_arm64_4way)
-
- .align 5
-SYM_FUNC_START(crc32_be_arm64_4way)
- crc4way crc32x, .L1, be
-SYM_FUNC_END(crc32_be_arm64_4way)
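
The per-distance constants in .L0/.L1 below are closed forms of one linear identity: feeding n zero bytes through a raw (no pre/post inversion) CRC multiplies its state by x^(8n) mod G(x). The naive O(n) version of that shift, which each single pmull in the combine replaces (reflected CRC-32 polynomial 0xedb88320; a sketch only):

#include <stddef.h>
#include <stdint.h>

/* Advance a raw, reflected CRC-32 over n zero bytes. */
static uint32_t crc32_shift_zeros(uint32_t crc, size_t n)
{
	while (n--)
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0xedb88320u & -(crc & 1));
	return crc;
}

/* For a raw CRC over two halves A and B:
 *   crc(A || B) == crc32_shift_zeros(crc(A), len(B)) ^ crc(B)
 * crc4way keeps four partial CRCs in w0/w4/w5/w6 and applies the shifts
 * with one carryless multiply each, using the table entries below. */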
-
- .section .rodata, "a", %progbits
- .align 6
-.L0: .long 0xddc0152b, 0xba4fc28e, 0x493c7d27
- .long 0x0715ce53, 0x9e4addf8, 0xba4fc28e
- .long 0xc96cfdc0, 0x0715ce53, 0xddc0152b
- .long 0xab7aff2a, 0x0d3b6092, 0x9e4addf8
- .long 0x299847d5, 0x878a92a7, 0x39d3b296
- .long 0xb6dd949b, 0xab7aff2a, 0x0715ce53
- .long 0xa60ce07b, 0x83348832, 0x47db8317
- .long 0xd270f1a2, 0xb9e02b86, 0x0d3b6092
- .long 0x65863b64, 0xb6dd949b, 0xc96cfdc0
- .long 0xb3e32c28, 0xbac2fd7b, 0x878a92a7
- .long 0xf285651c, 0xce7f39f4, 0xdaece73e
- .long 0x271d9844, 0xd270f1a2, 0xab7aff2a
- .long 0x6cb08e5c, 0x2b3cac5d, 0x2162d385
- .long 0xcec3662e, 0x1b03397f, 0x83348832
- .long 0x8227bb8a, 0xb3e32c28, 0x299847d5
- .long 0xd7a4825c, 0xdd7e3b0c, 0xb9e02b86
- .long 0xf6076544, 0x10746f3c, 0x18b33a4e
- .long 0x98d8d9cb, 0x271d9844, 0xb6dd949b
- .long 0x57a3d037, 0x93a5f730, 0x78d9ccb7
- .long 0x3771e98f, 0x6b749fb2, 0xbac2fd7b
- .long 0xe0ac139e, 0xcec3662e, 0xa60ce07b
- .long 0x6f345e45, 0xe6fc4e6a, 0xce7f39f4
- .long 0xa2b73df1, 0xb0cd4768, 0x61d82e56
- .long 0x86d8e4d2, 0xd7a4825c, 0xd270f1a2
- .long 0xa90fd27a, 0x0167d312, 0xc619809d
- .long 0xca6ef3ac, 0x26f6a60a, 0x2b3cac5d
- .long 0x4597456a, 0x98d8d9cb, 0x65863b64
- .long 0xc9c8b782, 0x68bce87a, 0x1b03397f
- .long 0x62ec6c6d, 0x6956fc3b, 0xebb883bd
- .long 0x2342001e, 0x3771e98f, 0xb3e32c28
- .long 0xe8b6368b, 0x2178513a, 0x064f7f26
- .long 0x9ef68d35, 0x170076fa, 0xdd7e3b0c
- .long 0x0b0bf8ca, 0x6f345e45, 0xf285651c
- .long 0x02ee03b2, 0xff0dba97, 0x10746f3c
- .long 0x135c83fd, 0xf872e54c, 0xc7a68855
- .long 0x00bcf5f6, 0x86d8e4d2, 0x271d9844
- .long 0x58ca5f00, 0x5bb8f1bc, 0x8e766a0c
- .long 0xded288f8, 0xb3af077a, 0x93a5f730
- .long 0x37170390, 0xca6ef3ac, 0x6cb08e5c
- .long 0xf48642e9, 0xdd66cbbb, 0x6b749fb2
- .long 0xb25b29f2, 0xe9e28eb4, 0x1393e203
- .long 0x45cddf4e, 0xc9c8b782, 0xcec3662e
- .long 0xdfd94fb2, 0x93e106a4, 0x96c515bb
- .long 0x021ac5ef, 0xd813b325, 0xe6fc4e6a
- .long 0x8e1450f7, 0x2342001e, 0x8227bb8a
- .long 0xe0cdcf86, 0x6d9a4957, 0xb0cd4768
- .long 0x613eee91, 0xd2c3ed1a, 0x39c7ff35
- .long 0xbedc6ba1, 0x9ef68d35, 0xd7a4825c
- .long 0x0cd1526a, 0xf2271e60, 0x0ab3844b
- .long 0xd6c3a807, 0x2664fd8b, 0x0167d312
- .long 0x1d31175f, 0x02ee03b2, 0xf6076544
- .long 0x4be7fd90, 0x363bd6b3, 0x26f6a60a
- .long 0x6eeed1c9, 0x5fabe670, 0xa741c1bf
- .long 0xb3a6da94, 0x00bcf5f6, 0x98d8d9cb
- .long 0x2e7d11a7, 0x17f27698, 0x49c3cc9c
- .long 0x889774e1, 0xaa7c7ad5, 0x68bce87a
- .long 0x8a074012, 0xded288f8, 0x57a3d037
- .long 0xbd0bb25f, 0x6d390dec, 0x6956fc3b
- .long 0x3be3c09b, 0x6353c1cc, 0x42d98888
- .long 0x465a4eee, 0xf48642e9, 0x3771e98f
- .long 0x2e5f3c8c, 0xdd35bc8d, 0xb42ae3d9
- .long 0xa52f58ec, 0x9a5ede41, 0x2178513a
- .long 0x47972100, 0x45cddf4e, 0xe0ac139e
- .long 0x359674f7, 0xa51b6135, 0x170076fa
-
-.L1: .long 0xaf449247, 0x81256527, 0xccaa009e
- .long 0x57c54819, 0x1d9513d7, 0x81256527
- .long 0x3f41287a, 0x57c54819, 0xaf449247
- .long 0xf5e48c85, 0x910eeec1, 0x1d9513d7
- .long 0x1f0c2cdd, 0x9026d5b1, 0xae0b5394
- .long 0x71d54a59, 0xf5e48c85, 0x57c54819
- .long 0x1c63267b, 0xfe807bbd, 0x0cbec0ed
- .long 0xd31343ea, 0xe95c1271, 0x910eeec1
- .long 0xf9d9c7ee, 0x71d54a59, 0x3f41287a
- .long 0x9ee62949, 0xcec97417, 0x9026d5b1
- .long 0xa55d1514, 0xf183c71b, 0xd1df2327
- .long 0x21aa2b26, 0xd31343ea, 0xf5e48c85
- .long 0x9d842b80, 0xeea395c4, 0x3c656ced
- .long 0xd8110ff1, 0xcd669a40, 0xfe807bbd
- .long 0x3f9e9356, 0x9ee62949, 0x1f0c2cdd
- .long 0x1d6708a0, 0x0c30f51d, 0xe95c1271
- .long 0xef82aa68, 0xdb3935ea, 0xb918a347
- .long 0xd14bcc9b, 0x21aa2b26, 0x71d54a59
- .long 0x99cce860, 0x356d209f, 0xff6f2fc2
- .long 0xd8af8e46, 0xc352f6de, 0xcec97417
- .long 0xf1996890, 0xd8110ff1, 0x1c63267b
- .long 0x631bc508, 0xe95c7216, 0xf183c71b
- .long 0x8511c306, 0x8e031a19, 0x9b9bdbd0
- .long 0xdb3839f3, 0x1d6708a0, 0xd31343ea
- .long 0x7a92fffb, 0xf7003835, 0x4470ac44
- .long 0x6ce68f2a, 0x00eba0c8, 0xeea395c4
- .long 0x4caaa263, 0xd14bcc9b, 0xf9d9c7ee
- .long 0xb46f7cff, 0x9a1b53c8, 0xcd669a40
- .long 0x60290934, 0x81b6f443, 0x6d40f445
- .long 0x8e976a7d, 0xd8af8e46, 0x9ee62949
- .long 0xdcf5088a, 0x9dbdc100, 0x145575d5
- .long 0x1753ab84, 0xbbf2f6d6, 0x0c30f51d
- .long 0x255b139e, 0x631bc508, 0xa55d1514
- .long 0xd784eaa8, 0xce26786c, 0xdb3935ea
- .long 0x6d2c864a, 0x8068c345, 0x2586d334
- .long 0x02072e24, 0xdb3839f3, 0x21aa2b26
- .long 0x06689b0a, 0x5efd72f5, 0xe0575528
- .long 0x1e52f5ea, 0x4117915b, 0x356d209f
- .long 0x1d3d1db6, 0x6ce68f2a, 0x9d842b80
- .long 0x3796455c, 0xb8e0e4a8, 0xc352f6de
- .long 0xdf3a4eb3, 0xc55a2330, 0xb84ffa9c
- .long 0x28ae0976, 0xb46f7cff, 0xd8110ff1
- .long 0x9764bc8d, 0xd7e7a22c, 0x712510f0
- .long 0x13a13e18, 0x3e9a43cd, 0xe95c7216
- .long 0xb8ee242e, 0x8e976a7d, 0x3f9e9356
- .long 0x0c540e7b, 0x753c81ff, 0x8e031a19
- .long 0x9924c781, 0xb9220208, 0x3edcde65
- .long 0x3954de39, 0x1753ab84, 0x1d6708a0
- .long 0xf32238b5, 0xbec81497, 0x9e70b943
- .long 0xbbd2cd2c, 0x0925d861, 0xf7003835
- .long 0xcc401304, 0xd784eaa8, 0xef82aa68
- .long 0x4987e684, 0x6044fbb0, 0x00eba0c8
- .long 0x3aa11427, 0x18fe3b4a, 0x87441142
- .long 0x297aad60, 0x02072e24, 0xd14bcc9b
- .long 0xf60c5e51, 0x6ef6f487, 0x5b7fdd0a
- .long 0x632d78c5, 0x3fc33de4, 0x9a1b53c8
- .long 0x25b8822a, 0x1e52f5ea, 0x99cce860
- .long 0xd4fc84bc, 0x1af62fb8, 0x81b6f443
- .long 0x5690aa32, 0xa91fdefb, 0x688a110e
- .long 0x1357a093, 0x3796455c, 0xd8af8e46
- .long 0x798fdd33, 0xaaa18a37, 0x357b9517
- .long 0xc2815395, 0x54d42691, 0x9dbdc100
- .long 0x21cfc0f7, 0x28ae0976, 0xf1996890
- .long 0xa0decef3, 0x7b4aa8b7, 0xbbf2f6d6
diff --git a/arch/arm64/lib/crc32.c b/arch/arm64/lib/crc32.c
deleted file mode 100644
index ed3acd71178f..000000000000
--- a/arch/arm64/lib/crc32.c
+++ /dev/null
@@ -1,99 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-
-#include <linux/crc32.h>
-#include <linux/linkage.h>
-#include <linux/module.h>
-
-#include <asm/alternative.h>
-#include <asm/cpufeature.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-#include <crypto/internal/simd.h>
-
-// The minimum input length to consider the 4-way interleaved code path
-static const size_t min_len = 1024;
-
-asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len);
-asmlinkage u32 crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len);
-asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len);
-
-asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
-asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
-asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len);
-
-u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
-{
- if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
- return crc32_le_base(crc, p, len);
-
- if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
- kernel_neon_begin();
- crc = crc32_le_arm64_4way(crc, p, len);
- kernel_neon_end();
-
- p += round_down(len, 64);
- len %= 64;
-
- if (!len)
- return crc;
- }
-
- return crc32_le_arm64(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_le_arch);
-
-u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
-{
- if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
- return crc32c_base(crc, p, len);
-
- if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
- kernel_neon_begin();
- crc = crc32c_le_arm64_4way(crc, p, len);
- kernel_neon_end();
-
- p += round_down(len, 64);
- len %= 64;
-
- if (!len)
- return crc;
- }
-
- return crc32c_le_arm64(crc, p, len);
-}
-EXPORT_SYMBOL(crc32c_arch);
-
-u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
-{
- if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
- return crc32_be_base(crc, p, len);
-
- if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
- kernel_neon_begin();
- crc = crc32_be_arm64_4way(crc, p, len);
- kernel_neon_end();
-
- p += round_down(len, 64);
- len %= 64;
-
- if (!len)
- return crc;
- }
-
- return crc32_be_arm64(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_be_arch);
-
-u32 crc32_optimizations(void)
-{
- if (alternative_has_cap_likely(ARM64_HAS_CRC32))
- return CRC32_LE_OPTIMIZATION |
- CRC32_BE_OPTIMIZATION |
- CRC32C_OPTIMIZATION;
- return 0;
-}
-EXPORT_SYMBOL(crc32_optimizations);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("arm64-optimized CRC32 functions");
diff --git a/arch/arm64/lib/crypto/.gitignore b/arch/arm64/lib/crypto/.gitignore
deleted file mode 100644
index 12d74d8b03d0..000000000000
--- a/arch/arm64/lib/crypto/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-poly1305-core.S
-sha256-core.S
diff --git a/arch/arm64/lib/crypto/Kconfig b/arch/arm64/lib/crypto/Kconfig
deleted file mode 100644
index 129a7685cb4c..000000000000
--- a/arch/arm64/lib/crypto/Kconfig
+++ /dev/null
@@ -1,20 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_CHACHA20_NEON
- tristate
- depends on KERNEL_MODE_NEON
- default CRYPTO_LIB_CHACHA
- select CRYPTO_LIB_CHACHA_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
-
-config CRYPTO_POLY1305_NEON
- tristate
- depends on KERNEL_MODE_NEON
- default CRYPTO_LIB_POLY1305
- select CRYPTO_ARCH_HAVE_LIB_POLY1305
-
-config CRYPTO_SHA256_ARM64
- tristate
- default CRYPTO_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
diff --git a/arch/arm64/lib/crypto/Makefile b/arch/arm64/lib/crypto/Makefile
deleted file mode 100644
index 946c09903711..000000000000
--- a/arch/arm64/lib/crypto/Makefile
+++ /dev/null
@@ -1,24 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
-chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
-
-obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o
-poly1305-neon-y := poly1305-core.o poly1305-glue.o
-AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_block_init_arch
-AFLAGS_poly1305-core.o += -Dpoly1305_emit=poly1305_emit_arch
-
-obj-$(CONFIG_CRYPTO_SHA256_ARM64) += sha256-arm64.o
-sha256-arm64-y := sha256.o sha256-core.o
-sha256-arm64-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o
-
-quiet_cmd_perlasm = PERLASM $@
- cmd_perlasm = $(PERL) $(<) void $(@)
-
-$(obj)/%-core.S: $(src)/%-armv8.pl
- $(call cmd,perlasm)
-
-$(obj)/sha256-core.S: $(src)/sha2-armv8.pl
- $(call cmd,perlasm)
-
-clean-files += poly1305-core.S sha256-core.S
diff --git a/arch/arm64/lib/crypto/chacha-neon-core.S b/arch/arm64/lib/crypto/chacha-neon-core.S
deleted file mode 100644
index 80079586ecc7..000000000000
--- a/arch/arm64/lib/crypto/chacha-neon-core.S
+++ /dev/null
@@ -1,805 +0,0 @@
-/*
- * ChaCha/HChaCha NEON helper functions
- *
- * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Originally based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <asm/cache.h>
-
- .text
- .align 6
-
-/*
- * chacha_permute - permute one block
- *
- * Permute one 64-byte block where the state matrix is stored in the four NEON
- * registers v0-v3. It performs matrix operations on four words in parallel,
- * but requires shuffling to rearrange the words after each round.
- *
- * The round count is given in w3.
- *
- * Clobbers: w3, x10, v4, v12
- */
-SYM_FUNC_START_LOCAL(chacha_permute)
-
- adr_l x10, ROT8
- ld1 {v12.4s}, [x10]
-
-.Ldoubleround:
- // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- add v0.4s, v0.4s, v1.4s
- eor v3.16b, v3.16b, v0.16b
- rev32 v3.8h, v3.8h
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- add v2.4s, v2.4s, v3.4s
- eor v4.16b, v1.16b, v2.16b
- shl v1.4s, v4.4s, #12
- sri v1.4s, v4.4s, #20
-
- // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- add v0.4s, v0.4s, v1.4s
- eor v3.16b, v3.16b, v0.16b
- tbl v3.16b, {v3.16b}, v12.16b
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- add v2.4s, v2.4s, v3.4s
- eor v4.16b, v1.16b, v2.16b
- shl v1.4s, v4.4s, #7
- sri v1.4s, v4.4s, #25
-
- // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- ext v1.16b, v1.16b, v1.16b, #4
- // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- ext v2.16b, v2.16b, v2.16b, #8
- // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- ext v3.16b, v3.16b, v3.16b, #12
-
- // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- add v0.4s, v0.4s, v1.4s
- eor v3.16b, v3.16b, v0.16b
- rev32 v3.8h, v3.8h
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- add v2.4s, v2.4s, v3.4s
- eor v4.16b, v1.16b, v2.16b
- shl v1.4s, v4.4s, #12
- sri v1.4s, v4.4s, #20
-
- // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- add v0.4s, v0.4s, v1.4s
- eor v3.16b, v3.16b, v0.16b
- tbl v3.16b, {v3.16b}, v12.16b
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- add v2.4s, v2.4s, v3.4s
- eor v4.16b, v1.16b, v2.16b
- shl v1.4s, v4.4s, #7
- sri v1.4s, v4.4s, #25
-
- // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- ext v1.16b, v1.16b, v1.16b, #12
- // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- ext v2.16b, v2.16b, v2.16b, #8
- // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- ext v3.16b, v3.16b, v3.16b, #4
-
- subs w3, w3, #2
- b.ne .Ldoubleround
-
- ret
-SYM_FUNC_END(chacha_permute)
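
Each NEON instruction in chacha_permute acts on four 32-bit lanes at once; per lane it is the standard ChaCha quarter-round. A scalar sketch for comparison (portable C, not part of the kernel sources):

#include <stdint.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* One ChaCha quarter-round, matching the add/eor/rotate pattern above. */
static void chacha_qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
	*a += *b; *d = ROTL32(*d ^ *a, 16);
	*c += *d; *b = ROTL32(*b ^ *c, 12);
	*a += *b; *d = ROTL32(*d ^ *a, 8);
	*c += *d; *b = ROTL32(*b ^ *c, 7);
}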
-
-SYM_FUNC_START(chacha_block_xor_neon)
- // x0: Input state matrix, s
- // x1: 1 data block output, o
- // x2: 1 data block input, i
- // w3: nrounds
-
- stp x29, x30, [sp, #-16]!
- mov x29, sp
-
- // x0..3 = s0..3
- ld1 {v0.4s-v3.4s}, [x0]
- ld1 {v8.4s-v11.4s}, [x0]
-
- bl chacha_permute
-
- ld1 {v4.16b-v7.16b}, [x2]
-
- // o0 = i0 ^ (x0 + s0)
- add v0.4s, v0.4s, v8.4s
- eor v0.16b, v0.16b, v4.16b
-
- // o1 = i1 ^ (x1 + s1)
- add v1.4s, v1.4s, v9.4s
- eor v1.16b, v1.16b, v5.16b
-
- // o2 = i2 ^ (x2 + s2)
- add v2.4s, v2.4s, v10.4s
- eor v2.16b, v2.16b, v6.16b
-
- // o3 = i3 ^ (x3 + s3)
- add v3.4s, v3.4s, v11.4s
- eor v3.16b, v3.16b, v7.16b
-
- st1 {v0.16b-v3.16b}, [x1]
-
- ldp x29, x30, [sp], #16
- ret
-SYM_FUNC_END(chacha_block_xor_neon)
-
-SYM_FUNC_START(hchacha_block_neon)
- // x0: Input state matrix, s
- // x1: output (8 32-bit words)
- // w2: nrounds
-
- stp x29, x30, [sp, #-16]!
- mov x29, sp
-
- ld1 {v0.4s-v3.4s}, [x0]
-
- mov w3, w2
- bl chacha_permute
-
- st1 {v0.4s}, [x1], #16
- st1 {v3.4s}, [x1]
-
- ldp x29, x30, [sp], #16
- ret
-SYM_FUNC_END(hchacha_block_neon)
-
- a0 .req w12
- a1 .req w13
- a2 .req w14
- a3 .req w15
- a4 .req w16
- a5 .req w17
- a6 .req w19
- a7 .req w20
- a8 .req w21
- a9 .req w22
- a10 .req w23
- a11 .req w24
- a12 .req w25
- a13 .req w26
- a14 .req w27
- a15 .req w28
-
- .align 6
-SYM_FUNC_START(chacha_4block_xor_neon)
- frame_push 10
-
- // x0: Input state matrix, s
- // x1: 4 data blocks output, o
- // x2: 4 data blocks input, i
- // w3: nrounds
- // x4: byte count
-
- adr_l x10, .Lpermute
- and x5, x4, #63
- add x10, x10, x5
-
- //
- // This function encrypts four consecutive ChaCha blocks by loading
- // the state matrix into NEON registers four times. The algorithm
- // performs each operation on the corresponding word of each state
- // matrix, hence requires no word shuffling. For the final XOR step we
- // transpose the matrix by interleaving 32- and then 64-bit words,
- // which allows us to do XOR in NEON registers.
- //
- // At the same time, a fifth block is encrypted in parallel using
- // scalar registers
- //
- adr_l x9, CTRINC // ... and ROT8
- ld1 {v30.4s-v31.4s}, [x9]
-
- // x0..15[0-3] = s0..3[0..3]
- add x8, x0, #16
- ld4r { v0.4s- v3.4s}, [x0]
- ld4r { v4.4s- v7.4s}, [x8], #16
- ld4r { v8.4s-v11.4s}, [x8], #16
- ld4r {v12.4s-v15.4s}, [x8]
-
- mov a0, v0.s[0]
- mov a1, v1.s[0]
- mov a2, v2.s[0]
- mov a3, v3.s[0]
- mov a4, v4.s[0]
- mov a5, v5.s[0]
- mov a6, v6.s[0]
- mov a7, v7.s[0]
- mov a8, v8.s[0]
- mov a9, v9.s[0]
- mov a10, v10.s[0]
- mov a11, v11.s[0]
- mov a12, v12.s[0]
- mov a13, v13.s[0]
- mov a14, v14.s[0]
- mov a15, v15.s[0]
-
- // x12 += counter values 1-4
- add v12.4s, v12.4s, v30.4s
-
-.Ldoubleround4:
- // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
- // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
- // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
- // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
- add v0.4s, v0.4s, v4.4s
- add a0, a0, a4
- add v1.4s, v1.4s, v5.4s
- add a1, a1, a5
- add v2.4s, v2.4s, v6.4s
- add a2, a2, a6
- add v3.4s, v3.4s, v7.4s
- add a3, a3, a7
-
- eor v12.16b, v12.16b, v0.16b
- eor a12, a12, a0
- eor v13.16b, v13.16b, v1.16b
- eor a13, a13, a1
- eor v14.16b, v14.16b, v2.16b
- eor a14, a14, a2
- eor v15.16b, v15.16b, v3.16b
- eor a15, a15, a3
-
- rev32 v12.8h, v12.8h
- ror a12, a12, #16
- rev32 v13.8h, v13.8h
- ror a13, a13, #16
- rev32 v14.8h, v14.8h
- ror a14, a14, #16
- rev32 v15.8h, v15.8h
- ror a15, a15, #16
-
- // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
- // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
- // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
- // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
- add v8.4s, v8.4s, v12.4s
- add a8, a8, a12
- add v9.4s, v9.4s, v13.4s
- add a9, a9, a13
- add v10.4s, v10.4s, v14.4s
- add a10, a10, a14
- add v11.4s, v11.4s, v15.4s
- add a11, a11, a15
-
- eor v16.16b, v4.16b, v8.16b
- eor a4, a4, a8
- eor v17.16b, v5.16b, v9.16b
- eor a5, a5, a9
- eor v18.16b, v6.16b, v10.16b
- eor a6, a6, a10
- eor v19.16b, v7.16b, v11.16b
- eor a7, a7, a11
-
- shl v4.4s, v16.4s, #12
- shl v5.4s, v17.4s, #12
- shl v6.4s, v18.4s, #12
- shl v7.4s, v19.4s, #12
-
- sri v4.4s, v16.4s, #20
- ror a4, a4, #20
- sri v5.4s, v17.4s, #20
- ror a5, a5, #20
- sri v6.4s, v18.4s, #20
- ror a6, a6, #20
- sri v7.4s, v19.4s, #20
- ror a7, a7, #20
-
- // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
- // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
- // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
- // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
- add v0.4s, v0.4s, v4.4s
- add a0, a0, a4
- add v1.4s, v1.4s, v5.4s
- add a1, a1, a5
- add v2.4s, v2.4s, v6.4s
- add a2, a2, a6
- add v3.4s, v3.4s, v7.4s
- add a3, a3, a7
-
- eor v12.16b, v12.16b, v0.16b
- eor a12, a12, a0
- eor v13.16b, v13.16b, v1.16b
- eor a13, a13, a1
- eor v14.16b, v14.16b, v2.16b
- eor a14, a14, a2
- eor v15.16b, v15.16b, v3.16b
- eor a15, a15, a3
-
- tbl v12.16b, {v12.16b}, v31.16b
- ror a12, a12, #24
- tbl v13.16b, {v13.16b}, v31.16b
- ror a13, a13, #24
- tbl v14.16b, {v14.16b}, v31.16b
- ror a14, a14, #24
- tbl v15.16b, {v15.16b}, v31.16b
- ror a15, a15, #24
-
- // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
- // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
- // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
- // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
- add v8.4s, v8.4s, v12.4s
- add a8, a8, a12
- add v9.4s, v9.4s, v13.4s
- add a9, a9, a13
- add v10.4s, v10.4s, v14.4s
- add a10, a10, a14
- add v11.4s, v11.4s, v15.4s
- add a11, a11, a15
-
- eor v16.16b, v4.16b, v8.16b
- eor a4, a4, a8
- eor v17.16b, v5.16b, v9.16b
- eor a5, a5, a9
- eor v18.16b, v6.16b, v10.16b
- eor a6, a6, a10
- eor v19.16b, v7.16b, v11.16b
- eor a7, a7, a11
-
- shl v4.4s, v16.4s, #7
- shl v5.4s, v17.4s, #7
- shl v6.4s, v18.4s, #7
- shl v7.4s, v19.4s, #7
-
- sri v4.4s, v16.4s, #25
- ror a4, a4, #25
- sri v5.4s, v17.4s, #25
- ror a5, a5, #25
- sri v6.4s, v18.4s, #25
- ror a6, a6, #25
- sri v7.4s, v19.4s, #25
- ror a7, a7, #25
-
- // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
- // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
- // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
- // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
- add v0.4s, v0.4s, v5.4s
- add a0, a0, a5
- add v1.4s, v1.4s, v6.4s
- add a1, a1, a6
- add v2.4s, v2.4s, v7.4s
- add a2, a2, a7
- add v3.4s, v3.4s, v4.4s
- add a3, a3, a4
-
- eor v15.16b, v15.16b, v0.16b
- eor a15, a15, a0
- eor v12.16b, v12.16b, v1.16b
- eor a12, a12, a1
- eor v13.16b, v13.16b, v2.16b
- eor a13, a13, a2
- eor v14.16b, v14.16b, v3.16b
- eor a14, a14, a3
-
- rev32 v15.8h, v15.8h
- ror a15, a15, #16
- rev32 v12.8h, v12.8h
- ror a12, a12, #16
- rev32 v13.8h, v13.8h
- ror a13, a13, #16
- rev32 v14.8h, v14.8h
- ror a14, a14, #16
-
- // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
- // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
- // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
- // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
- add v10.4s, v10.4s, v15.4s
- add a10, a10, a15
- add v11.4s, v11.4s, v12.4s
- add a11, a11, a12
- add v8.4s, v8.4s, v13.4s
- add a8, a8, a13
- add v9.4s, v9.4s, v14.4s
- add a9, a9, a14
-
- eor v16.16b, v5.16b, v10.16b
- eor a5, a5, a10
- eor v17.16b, v6.16b, v11.16b
- eor a6, a6, a11
- eor v18.16b, v7.16b, v8.16b
- eor a7, a7, a8
- eor v19.16b, v4.16b, v9.16b
- eor a4, a4, a9
-
- shl v5.4s, v16.4s, #12
- shl v6.4s, v17.4s, #12
- shl v7.4s, v18.4s, #12
- shl v4.4s, v19.4s, #12
-
- sri v5.4s, v16.4s, #20
- ror a5, a5, #20
- sri v6.4s, v17.4s, #20
- ror a6, a6, #20
- sri v7.4s, v18.4s, #20
- ror a7, a7, #20
- sri v4.4s, v19.4s, #20
- ror a4, a4, #20
-
- // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
- // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
- // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
- // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
- add v0.4s, v0.4s, v5.4s
- add a0, a0, a5
- add v1.4s, v1.4s, v6.4s
- add a1, a1, a6
- add v2.4s, v2.4s, v7.4s
- add a2, a2, a7
- add v3.4s, v3.4s, v4.4s
- add a3, a3, a4
-
- eor v15.16b, v15.16b, v0.16b
- eor a15, a15, a0
- eor v12.16b, v12.16b, v1.16b
- eor a12, a12, a1
- eor v13.16b, v13.16b, v2.16b
- eor a13, a13, a2
- eor v14.16b, v14.16b, v3.16b
- eor a14, a14, a3
-
- tbl v15.16b, {v15.16b}, v31.16b
- ror a15, a15, #24
- tbl v12.16b, {v12.16b}, v31.16b
- ror a12, a12, #24
- tbl v13.16b, {v13.16b}, v31.16b
- ror a13, a13, #24
- tbl v14.16b, {v14.16b}, v31.16b
- ror a14, a14, #24
-
- // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
- // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
- // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
- // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
- add v10.4s, v10.4s, v15.4s
- add a10, a10, a15
- add v11.4s, v11.4s, v12.4s
- add a11, a11, a12
- add v8.4s, v8.4s, v13.4s
- add a8, a8, a13
- add v9.4s, v9.4s, v14.4s
- add a9, a9, a14
-
- eor v16.16b, v5.16b, v10.16b
- eor a5, a5, a10
- eor v17.16b, v6.16b, v11.16b
- eor a6, a6, a11
- eor v18.16b, v7.16b, v8.16b
- eor a7, a7, a8
- eor v19.16b, v4.16b, v9.16b
- eor a4, a4, a9
-
- shl v5.4s, v16.4s, #7
- shl v6.4s, v17.4s, #7
- shl v7.4s, v18.4s, #7
- shl v4.4s, v19.4s, #7
-
- sri v5.4s, v16.4s, #25
- ror a5, a5, #25
- sri v6.4s, v17.4s, #25
- ror a6, a6, #25
- sri v7.4s, v18.4s, #25
- ror a7, a7, #25
- sri v4.4s, v19.4s, #25
- ror a4, a4, #25
-
- subs w3, w3, #2
- b.ne .Ldoubleround4
-
- ld4r {v16.4s-v19.4s}, [x0], #16
- ld4r {v20.4s-v23.4s}, [x0], #16
-
- // x12 += counter values 0-3
- add v12.4s, v12.4s, v30.4s
-
- // x0[0-3] += s0[0]
- // x1[0-3] += s0[1]
- // x2[0-3] += s0[2]
- // x3[0-3] += s0[3]
- add v0.4s, v0.4s, v16.4s
- mov w6, v16.s[0]
- mov w7, v17.s[0]
- add v1.4s, v1.4s, v17.4s
- mov w8, v18.s[0]
- mov w9, v19.s[0]
- add v2.4s, v2.4s, v18.4s
- add a0, a0, w6
- add a1, a1, w7
- add v3.4s, v3.4s, v19.4s
- add a2, a2, w8
- add a3, a3, w9
-CPU_BE( rev a0, a0 )
-CPU_BE( rev a1, a1 )
-CPU_BE( rev a2, a2 )
-CPU_BE( rev a3, a3 )
-
- ld4r {v24.4s-v27.4s}, [x0], #16
- ld4r {v28.4s-v31.4s}, [x0]
-
- // x4[0-3] += s1[0]
- // x5[0-3] += s1[1]
- // x6[0-3] += s1[2]
- // x7[0-3] += s1[3]
- add v4.4s, v4.4s, v20.4s
- mov w6, v20.s[0]
- mov w7, v21.s[0]
- add v5.4s, v5.4s, v21.4s
- mov w8, v22.s[0]
- mov w9, v23.s[0]
- add v6.4s, v6.4s, v22.4s
- add a4, a4, w6
- add a5, a5, w7
- add v7.4s, v7.4s, v23.4s
- add a6, a6, w8
- add a7, a7, w9
-CPU_BE( rev a4, a4 )
-CPU_BE( rev a5, a5 )
-CPU_BE( rev a6, a6 )
-CPU_BE( rev a7, a7 )
-
- // x8[0-3] += s2[0]
- // x9[0-3] += s2[1]
- // x10[0-3] += s2[2]
- // x11[0-3] += s2[3]
- add v8.4s, v8.4s, v24.4s
- mov w6, v24.s[0]
- mov w7, v25.s[0]
- add v9.4s, v9.4s, v25.4s
- mov w8, v26.s[0]
- mov w9, v27.s[0]
- add v10.4s, v10.4s, v26.4s
- add a8, a8, w6
- add a9, a9, w7
- add v11.4s, v11.4s, v27.4s
- add a10, a10, w8
- add a11, a11, w9
-CPU_BE( rev a8, a8 )
-CPU_BE( rev a9, a9 )
-CPU_BE( rev a10, a10 )
-CPU_BE( rev a11, a11 )
-
- // x12[0-3] += s3[0]
- // x13[0-3] += s3[1]
- // x14[0-3] += s3[2]
- // x15[0-3] += s3[3]
- add v12.4s, v12.4s, v28.4s
- mov w6, v28.s[0]
- mov w7, v29.s[0]
- add v13.4s, v13.4s, v29.4s
- mov w8, v30.s[0]
- mov w9, v31.s[0]
- add v14.4s, v14.4s, v30.4s
- add a12, a12, w6
- add a13, a13, w7
- add v15.4s, v15.4s, v31.4s
- add a14, a14, w8
- add a15, a15, w9
-CPU_BE( rev a12, a12 )
-CPU_BE( rev a13, a13 )
-CPU_BE( rev a14, a14 )
-CPU_BE( rev a15, a15 )
-
- // interleave 32-bit words in state n, n+1
- ldp w6, w7, [x2], #64
- zip1 v16.4s, v0.4s, v1.4s
- ldp w8, w9, [x2, #-56]
- eor a0, a0, w6
- zip2 v17.4s, v0.4s, v1.4s
- eor a1, a1, w7
- zip1 v18.4s, v2.4s, v3.4s
- eor a2, a2, w8
- zip2 v19.4s, v2.4s, v3.4s
- eor a3, a3, w9
- ldp w6, w7, [x2, #-48]
- zip1 v20.4s, v4.4s, v5.4s
- ldp w8, w9, [x2, #-40]
- eor a4, a4, w6
- zip2 v21.4s, v4.4s, v5.4s
- eor a5, a5, w7
- zip1 v22.4s, v6.4s, v7.4s
- eor a6, a6, w8
- zip2 v23.4s, v6.4s, v7.4s
- eor a7, a7, w9
- ldp w6, w7, [x2, #-32]
- zip1 v24.4s, v8.4s, v9.4s
- ldp w8, w9, [x2, #-24]
- eor a8, a8, w6
- zip2 v25.4s, v8.4s, v9.4s
- eor a9, a9, w7
- zip1 v26.4s, v10.4s, v11.4s
- eor a10, a10, w8
- zip2 v27.4s, v10.4s, v11.4s
- eor a11, a11, w9
- ldp w6, w7, [x2, #-16]
- zip1 v28.4s, v12.4s, v13.4s
- ldp w8, w9, [x2, #-8]
- eor a12, a12, w6
- zip2 v29.4s, v12.4s, v13.4s
- eor a13, a13, w7
- zip1 v30.4s, v14.4s, v15.4s
- eor a14, a14, w8
- zip2 v31.4s, v14.4s, v15.4s
- eor a15, a15, w9
-
- add x3, x2, x4
- sub x3, x3, #128 // start of last block
-
- subs x5, x4, #128
- csel x2, x2, x3, ge
-
- // interleave 64-bit words in state n, n+2
- zip1 v0.2d, v16.2d, v18.2d
- zip2 v4.2d, v16.2d, v18.2d
- stp a0, a1, [x1], #64
- zip1 v8.2d, v17.2d, v19.2d
- zip2 v12.2d, v17.2d, v19.2d
- stp a2, a3, [x1, #-56]
-
- subs x6, x4, #192
- ld1 {v16.16b-v19.16b}, [x2], #64
- csel x2, x2, x3, ge
-
- zip1 v1.2d, v20.2d, v22.2d
- zip2 v5.2d, v20.2d, v22.2d
- stp a4, a5, [x1, #-48]
- zip1 v9.2d, v21.2d, v23.2d
- zip2 v13.2d, v21.2d, v23.2d
- stp a6, a7, [x1, #-40]
-
- subs x7, x4, #256
- ld1 {v20.16b-v23.16b}, [x2], #64
- csel x2, x2, x3, ge
-
- zip1 v2.2d, v24.2d, v26.2d
- zip2 v6.2d, v24.2d, v26.2d
- stp a8, a9, [x1, #-32]
- zip1 v10.2d, v25.2d, v27.2d
- zip2 v14.2d, v25.2d, v27.2d
- stp a10, a11, [x1, #-24]
-
- subs x8, x4, #320
- ld1 {v24.16b-v27.16b}, [x2], #64
- csel x2, x2, x3, ge
-
- zip1 v3.2d, v28.2d, v30.2d
- zip2 v7.2d, v28.2d, v30.2d
- stp a12, a13, [x1, #-16]
- zip1 v11.2d, v29.2d, v31.2d
- zip2 v15.2d, v29.2d, v31.2d
- stp a14, a15, [x1, #-8]
-
- tbnz x5, #63, .Lt128
- ld1 {v28.16b-v31.16b}, [x2]
-
- // xor with corresponding input, write to output
- eor v16.16b, v16.16b, v0.16b
- eor v17.16b, v17.16b, v1.16b
- eor v18.16b, v18.16b, v2.16b
- eor v19.16b, v19.16b, v3.16b
-
- tbnz x6, #63, .Lt192
-
- eor v20.16b, v20.16b, v4.16b
- eor v21.16b, v21.16b, v5.16b
- eor v22.16b, v22.16b, v6.16b
- eor v23.16b, v23.16b, v7.16b
-
- st1 {v16.16b-v19.16b}, [x1], #64
- tbnz x7, #63, .Lt256
-
- eor v24.16b, v24.16b, v8.16b
- eor v25.16b, v25.16b, v9.16b
- eor v26.16b, v26.16b, v10.16b
- eor v27.16b, v27.16b, v11.16b
-
- st1 {v20.16b-v23.16b}, [x1], #64
- tbnz x8, #63, .Lt320
-
- eor v28.16b, v28.16b, v12.16b
- eor v29.16b, v29.16b, v13.16b
- eor v30.16b, v30.16b, v14.16b
- eor v31.16b, v31.16b, v15.16b
-
- st1 {v24.16b-v27.16b}, [x1], #64
- st1 {v28.16b-v31.16b}, [x1]
-
-.Lout: frame_pop
- ret
-
- // fewer than 192 bytes of in/output
-.Lt192: cbz x5, 1f // exactly 128 bytes?
- ld1 {v28.16b-v31.16b}, [x10]
- add x5, x5, x1
- tbl v28.16b, {v4.16b-v7.16b}, v28.16b
- tbl v29.16b, {v4.16b-v7.16b}, v29.16b
- tbl v30.16b, {v4.16b-v7.16b}, v30.16b
- tbl v31.16b, {v4.16b-v7.16b}, v31.16b
-
-0: eor v20.16b, v20.16b, v28.16b
- eor v21.16b, v21.16b, v29.16b
- eor v22.16b, v22.16b, v30.16b
- eor v23.16b, v23.16b, v31.16b
- st1 {v20.16b-v23.16b}, [x5] // overlapping stores
-1: st1 {v16.16b-v19.16b}, [x1]
- b .Lout
-
- // fewer than 128 bytes of in/output
-.Lt128: ld1 {v28.16b-v31.16b}, [x10]
- add x5, x5, x1
- sub x1, x1, #64
- tbl v28.16b, {v0.16b-v3.16b}, v28.16b
- tbl v29.16b, {v0.16b-v3.16b}, v29.16b
- tbl v30.16b, {v0.16b-v3.16b}, v30.16b
- tbl v31.16b, {v0.16b-v3.16b}, v31.16b
- ld1 {v16.16b-v19.16b}, [x1] // reload first output block
- b 0b
-
- // fewer than 256 bytes of in/output
-.Lt256: cbz x6, 2f // exactly 192 bytes?
- ld1 {v4.16b-v7.16b}, [x10]
- add x6, x6, x1
- tbl v0.16b, {v8.16b-v11.16b}, v4.16b
- tbl v1.16b, {v8.16b-v11.16b}, v5.16b
- tbl v2.16b, {v8.16b-v11.16b}, v6.16b
- tbl v3.16b, {v8.16b-v11.16b}, v7.16b
-
- eor v28.16b, v28.16b, v0.16b
- eor v29.16b, v29.16b, v1.16b
- eor v30.16b, v30.16b, v2.16b
- eor v31.16b, v31.16b, v3.16b
- st1 {v28.16b-v31.16b}, [x6] // overlapping stores
-2: st1 {v20.16b-v23.16b}, [x1]
- b .Lout
-
- // fewer than 320 bytes of in/output
-.Lt320: cbz x7, 3f // exactly 256 bytes?
- ld1 {v4.16b-v7.16b}, [x10]
- add x7, x7, x1
- tbl v0.16b, {v12.16b-v15.16b}, v4.16b
- tbl v1.16b, {v12.16b-v15.16b}, v5.16b
- tbl v2.16b, {v12.16b-v15.16b}, v6.16b
- tbl v3.16b, {v12.16b-v15.16b}, v7.16b
-
- eor v28.16b, v28.16b, v0.16b
- eor v29.16b, v29.16b, v1.16b
- eor v30.16b, v30.16b, v2.16b
- eor v31.16b, v31.16b, v3.16b
- st1 {v28.16b-v31.16b}, [x7] // overlapping stores
-3: st1 {v24.16b-v27.16b}, [x1]
- b .Lout
-SYM_FUNC_END(chacha_4block_xor_neon)
-
- .section ".rodata", "a", %progbits
- .align L1_CACHE_SHIFT
-.Lpermute:
- .set .Li, 0
- .rept 128
- .byte (.Li - 64)
- .set .Li, .Li + 1
- .endr
-
-CTRINC: .word 1, 2, 3, 4
-ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
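
ROT8 gets a dedicated constant because rotating a 32-bit word left by 8 is a pure byte permutation, so a single tbl replaces the shl/sri pairs used for the 12- and 7-bit rotations. A scalar model of one little-endian lane:

#include <stdint.h>
#include <string.h>

/* rotl32(x, 8) as a byte shuffle, matching the ROT8 index word
 * 0x02010003 (index bytes 03 00 01 02 in memory order). */
static uint32_t rotl8_bytes(uint32_t x)
{
	uint8_t b[4], r[4];

	memcpy(b, &x, 4);	/* little-endian byte order assumed */
	r[0] = b[3]; r[1] = b[0]; r[2] = b[1]; r[3] = b[2];
	memcpy(&x, r, 4);
	return x;
}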
diff --git a/arch/arm64/lib/crypto/chacha-neon-glue.c b/arch/arm64/lib/crypto/chacha-neon-glue.c
deleted file mode 100644
index d0188f974ca5..000000000000
--- a/arch/arm64/lib/crypto/chacha-neon-glue.c
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * ChaCha and HChaCha functions (ARM64 optimized)
- *
- * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <crypto/chacha.h>
-#include <crypto/internal/simd.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
- u8 *dst, const u8 *src, int nrounds);
-asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- int nrounds, int bytes);
-asmlinkage void hchacha_block_neon(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-
-static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
- int bytes, int nrounds)
-{
- while (bytes > 0) {
- int l = min(bytes, CHACHA_BLOCK_SIZE * 5);
-
- if (l <= CHACHA_BLOCK_SIZE) {
- u8 buf[CHACHA_BLOCK_SIZE];
-
- memcpy(buf, src, l);
- chacha_block_xor_neon(state, buf, buf, nrounds);
- memcpy(dst, buf, l);
- state->x[12] += 1;
- break;
- }
- chacha_4block_xor_neon(state, dst, src, nrounds, l);
- bytes -= l;
- src += l;
- dst += l;
- state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
- }
-}
-
-void hchacha_block_arch(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds)
-{
- if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) {
- hchacha_block_generic(state, out, nrounds);
- } else {
- kernel_neon_begin();
- hchacha_block_neon(state, out, nrounds);
- kernel_neon_end();
- }
-}
-EXPORT_SYMBOL(hchacha_block_arch);
-
-void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE ||
- !crypto_simd_usable())
- return chacha_crypt_generic(state, dst, src, bytes, nrounds);
-
- do {
- unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
-
- kernel_neon_begin();
- chacha_doneon(state, dst, src, todo, nrounds);
- kernel_neon_end();
-
- bytes -= todo;
- src += todo;
- dst += todo;
- } while (bytes);
-}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-bool chacha_is_arch_optimized(void)
-{
- return static_key_enabled(&have_neon);
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-
-static int __init chacha_simd_mod_init(void)
-{
- if (cpu_have_named_feature(ASIMD))
- static_branch_enable(&have_neon);
- return 0;
-}
-subsys_initcall(chacha_simd_mod_init);
-
-static void __exit chacha_simd_mod_exit(void)
-{
-}
-module_exit(chacha_simd_mod_exit);
-
-MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM64 optimized)");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/arm64/lib/crypto/poly1305-armv8.pl b/arch/arm64/lib/crypto/poly1305-armv8.pl
deleted file mode 100644
index 22c9069c0650..000000000000
--- a/arch/arm64/lib/crypto/poly1305-armv8.pl
+++ /dev/null
@@ -1,917 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
-#
-# ====================================================================
-# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
-# project.
-# ====================================================================
-#
-# This module implements Poly1305 hash for ARMv8.
-#
-# June 2015
-#
-# Numbers are cycles per processed byte with poly1305_blocks alone.
-#
-# IALU/gcc-4.9 NEON
-#
-# Apple A7 1.86/+5% 0.72
-# Cortex-A53 2.69/+58% 1.47
-# Cortex-A57 2.70/+7% 1.14
-# Denver 1.64/+50% 1.18(*)
-# X-Gene 2.13/+68% 2.27
-# Mongoose 1.77/+75% 1.12
-# Kryo 2.70/+55% 1.13
-# ThunderX2 1.17/+95% 1.36
-#
-# (*) the estimate based on resource availability is less than 1.0,
-#     i.e. the measured result is worse than expected; presumably the
-#     binary translator is not almighty;
-
-$flavour=shift;
-$output=shift;
-
-if ($flavour && $flavour ne "void") {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
- die "can't locate arm-xlate.pl";
-
- open STDOUT,"| \"$^X\" $xlate $flavour $output";
-} else {
- open STDOUT,">$output";
-}
-
-my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
-my ($mac,$nonce)=($inp,$len);
-
-my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
-
-$code.=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-.extern OPENSSL_armcap_P
-#endif
-
-.text
-
-// forward "declarations" are required for Apple
-.globl poly1305_blocks
-.globl poly1305_emit
-
-.globl poly1305_init
-.type poly1305_init,%function
-.align 5
-poly1305_init:
- cmp $inp,xzr
- stp xzr,xzr,[$ctx] // zero hash value
- stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
-
- csel x0,xzr,x0,eq
- b.eq .Lno_key
-
-#ifndef __KERNEL__
- adrp x17,OPENSSL_armcap_P
- ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
-#endif
-
- ldp $r0,$r1,[$inp] // load key
- mov $s1,#0xfffffffc0fffffff
- movk $s1,#0x0fff,lsl#48
-#ifdef __AARCH64EB__
- rev $r0,$r0 // flip bytes
- rev $r1,$r1
-#endif
- and $r0,$r0,$s1 // &=0ffffffc0fffffff
- and $s1,$s1,#-4
- and $r1,$r1,$s1 // &=0ffffffc0ffffffc
- mov w#$s1,#-1
- stp $r0,$r1,[$ctx,#32] // save key value
- str w#$s1,[$ctx,#48] // impossible key power value
-
-#ifndef __KERNEL__
- tst w17,#ARMV7_NEON
-
- adr $d0,.Lpoly1305_blocks
- adr $r0,.Lpoly1305_blocks_neon
- adr $d1,.Lpoly1305_emit
-
- csel $d0,$d0,$r0,eq
-
-# ifdef __ILP32__
- stp w#$d0,w#$d1,[$len]
-# else
- stp $d0,$d1,[$len]
-# endif
-#endif
- mov x0,#1
-.Lno_key:
- ret
-.size poly1305_init,.-poly1305_init
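
The two and instructions above implement the standard Poly1305 key clamp. The same step in C, assuming a little-endian load of the first 16 key bytes (a sketch, not the kernel's code):

#include <stdint.h>
#include <string.h>

/* Clamp r: clear the top 4 bits of each 32-bit key word and the low
 * 2 bits of the upper three, i.e. r0 &= 0x0ffffffc0fffffff and
 * r1 &= 0x0ffffffc0ffffffc, as the comments above spell out. */
static void poly1305_clamp(const uint8_t key[16], uint64_t *r0, uint64_t *r1)
{
	memcpy(r0, key, 8);
	memcpy(r1, key + 8, 8);
	*r0 &= 0x0ffffffc0fffffffULL;
	*r1 &= 0x0ffffffc0ffffffcULL;
}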
-
-.type poly1305_blocks,%function
-.align 5
-poly1305_blocks:
-.Lpoly1305_blocks:
- ands $len,$len,#-16
- b.eq .Lno_data
-
- ldp $h0,$h1,[$ctx] // load hash value
- ldp $h2,x17,[$ctx,#16] // [along with is_base2_26]
- ldp $r0,$r1,[$ctx,#32] // load key value
-
-#ifdef __AARCH64EB__
- lsr $d0,$h0,#32
- mov w#$d1,w#$h0
- lsr $d2,$h1,#32
- mov w15,w#$h1
- lsr x16,$h2,#32
-#else
- mov w#$d0,w#$h0
- lsr $d1,$h0,#32
- mov w#$d2,w#$h1
- lsr x15,$h1,#32
- mov w16,w#$h2
-#endif
-
- add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
- lsr $d1,$d2,#12
- adds $d0,$d0,$d2,lsl#52
- add $d1,$d1,x15,lsl#14
- adc $d1,$d1,xzr
- lsr $d2,x16,#24
- adds $d1,$d1,x16,lsl#40
- adc $d2,$d2,xzr
-
- cmp x17,#0 // is_base2_26?
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
- csel $h0,$h0,$d0,eq // choose between radixes
- csel $h1,$h1,$d1,eq
- csel $h2,$h2,$d2,eq
-
-.Loop:
- ldp $t0,$t1,[$inp],#16 // load input
- sub $len,$len,#16
-#ifdef __AARCH64EB__
- rev $t0,$t0
- rev $t1,$t1
-#endif
- adds $h0,$h0,$t0 // accumulate input
- adcs $h1,$h1,$t1
-
- mul $d0,$h0,$r0 // h0*r0
- adc $h2,$h2,$padbit
- umulh $d1,$h0,$r0
-
- mul $t0,$h1,$s1 // h1*5*r1
- umulh $t1,$h1,$s1
-
- adds $d0,$d0,$t0
- mul $t0,$h0,$r1 // h0*r1
- adc $d1,$d1,$t1
- umulh $d2,$h0,$r1
-
- adds $d1,$d1,$t0
- mul $t0,$h1,$r0 // h1*r0
- adc $d2,$d2,xzr
- umulh $t1,$h1,$r0
-
- adds $d1,$d1,$t0
- mul $t0,$h2,$s1 // h2*5*r1
- adc $d2,$d2,$t1
- mul $t1,$h2,$r0 // h2*r0
-
- adds $d1,$d1,$t0
- adc $d2,$d2,$t1
-
- and $t0,$d2,#-4 // final reduction
- and $h2,$d2,#3
- add $t0,$t0,$d2,lsr#2
- adds $h0,$d0,$t0
- adcs $h1,$d1,xzr
- adc $h2,$h2,xzr
-
- cbnz $len,.Loop
-
- stp $h0,$h1,[$ctx] // store hash value
- stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26]
-
-.Lno_data:
- ret
-.size poly1305_blocks,.-poly1305_blocks
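
The "final reduction" block (here and in poly1305_mult below) is a partial reduction mod p = 2^130 - 5: anything at bit 130 and above is worth 5 at bit 0, since 2^130 is congruent to 5 (mod p). A C transcription, assuming a compiler with unsigned __int128:

#include <stdint.h>

/* h[0..2] hold the 130+ bit accumulator in 64-bit limbs. */
static void poly1305_partial_reduce(uint64_t h[3])
{
	unsigned __int128 t;
	uint64_t c5 = (h[2] & ~3ULL) + (h[2] >> 2);	/* 5 * (h[2] >> 2) */

	h[2] &= 3;
	t = (unsigned __int128)h[0] + c5;
	h[0] = (uint64_t)t;
	t = (unsigned __int128)h[1] + (uint64_t)(t >> 64);
	h[1] = (uint64_t)t;
	h[2] += (uint64_t)(t >> 64);
}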
-
-.type poly1305_emit,%function
-.align 5
-poly1305_emit:
-.Lpoly1305_emit:
- ldp $h0,$h1,[$ctx] // load hash base 2^64
- ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26]
- ldp $t0,$t1,[$nonce] // load nonce
-
-#ifdef __AARCH64EB__
- lsr $d0,$h0,#32
- mov w#$d1,w#$h0
- lsr $d2,$h1,#32
- mov w15,w#$h1
- lsr x16,$h2,#32
-#else
- mov w#$d0,w#$h0
- lsr $d1,$h0,#32
- mov w#$d2,w#$h1
- lsr x15,$h1,#32
- mov w16,w#$h2
-#endif
-
- add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
- lsr $d1,$d2,#12
- adds $d0,$d0,$d2,lsl#52
- add $d1,$d1,x15,lsl#14
- adc $d1,$d1,xzr
- lsr $d2,x16,#24
- adds $d1,$d1,x16,lsl#40
- adc $d2,$d2,xzr
-
- cmp $r0,#0 // is_base2_26?
- csel $h0,$h0,$d0,eq // choose between radixes
- csel $h1,$h1,$d1,eq
- csel $h2,$h2,$d2,eq
-
- adds $d0,$h0,#5 // compare to modulus
- adcs $d1,$h1,xzr
- adc $d2,$h2,xzr
-
- tst $d2,#-4 // see if it's carried/borrowed
-
- csel $h0,$h0,$d0,eq
- csel $h1,$h1,$d1,eq
-
-#ifdef __AARCH64EB__
- ror $t0,$t0,#32 // flip nonce words
- ror $t1,$t1,#32
-#endif
- adds $h0,$h0,$t0 // accumulate nonce
- adc $h1,$h1,$t1
-#ifdef __AARCH64EB__
- rev $h0,$h0 // flip output bytes
- rev $h1,$h1
-#endif
- stp $h0,$h1,[$mac] // write result
-
- ret
-.size poly1305_emit,.-poly1305_emit
-___
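
Both poly1305_blocks and poly1305_emit open with the same radix conversion, rebuilding 64-bit limbs from the five 26-bit limbs the NEON path stores. The pure-shift version below assumes fully carried limbs (each below 2^26); the assembly additionally propagates, via its adds/adc chain, the carries a partially reduced value can produce:

#include <stdint.h>

/* Repack five 26-bit limbs (radix 2^26) into three 64-bit limbs. */
static void base26_to_base64(const uint32_t h26[5], uint64_t h64[3])
{
	h64[0] = (uint64_t)h26[0] | ((uint64_t)h26[1] << 26) |
		 ((uint64_t)h26[2] << 52);
	h64[1] = ((uint64_t)h26[2] >> 12) | ((uint64_t)h26[3] << 14) |
		 ((uint64_t)h26[4] << 40);
	h64[2] = (uint64_t)h26[4] >> 24;
}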
-my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
-my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
-my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
-my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
-my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
-my ($T0,$T1,$MASK) = map("v$_",(29..31));
-
-my ($in2,$zeros)=("x16","x17");
-my $is_base2_26 = $zeros; # borrow
-
-$code.=<<___;
-.type poly1305_mult,%function
-.align 5
-poly1305_mult:
- mul $d0,$h0,$r0 // h0*r0
- umulh $d1,$h0,$r0
-
- mul $t0,$h1,$s1 // h1*5*r1
- umulh $t1,$h1,$s1
-
- adds $d0,$d0,$t0
- mul $t0,$h0,$r1 // h0*r1
- adc $d1,$d1,$t1
- umulh $d2,$h0,$r1
-
- adds $d1,$d1,$t0
- mul $t0,$h1,$r0 // h1*r0
- adc $d2,$d2,xzr
- umulh $t1,$h1,$r0
-
- adds $d1,$d1,$t0
- mul $t0,$h2,$s1 // h2*5*r1
- adc $d2,$d2,$t1
- mul $t1,$h2,$r0 // h2*r0
-
- adds $d1,$d1,$t0
- adc $d2,$d2,$t1
-
- and $t0,$d2,#-4 // final reduction
- and $h2,$d2,#3
- add $t0,$t0,$d2,lsr#2
- adds $h0,$d0,$t0
- adcs $h1,$d1,xzr
- adc $h2,$h2,xzr
-
- ret
-.size poly1305_mult,.-poly1305_mult
-
-.type poly1305_splat,%function
-.align 4
-poly1305_splat:
- and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x13,$h0,#26,#26
- extr x14,$h1,$h0,#52
- and x14,x14,#0x03ffffff
- ubfx x15,$h1,#14,#26
- extr x16,$h2,$h1,#40
-
- str w12,[$ctx,#16*0] // r0
- add w12,w13,w13,lsl#2 // r1*5
- str w13,[$ctx,#16*1] // r1
- add w13,w14,w14,lsl#2 // r2*5
- str w12,[$ctx,#16*2] // s1
- str w14,[$ctx,#16*3] // r2
- add w14,w15,w15,lsl#2 // r3*5
- str w13,[$ctx,#16*4] // s2
- str w15,[$ctx,#16*5] // r3
- add w15,w16,w16,lsl#2 // r4*5
- str w14,[$ctx,#16*6] // s3
- str w16,[$ctx,#16*7] // r4
- str w15,[$ctx,#16*8] // s4
-
- ret
-.size poly1305_splat,.-poly1305_splat
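
poly1305_splat performs the opposite conversion, and additionally stores 5*r_i alongside each limb for the lazy reduction in the NEON loop. The split itself, for a value whose top limb h2 is at most 3:

#include <stdint.h>

/* Split 64-bit limbs (h0, h1, low bits of h2) into five 26-bit limbs,
 * mirroring the and/ubfx/extr sequence above. */
static void base64_to_base26(uint64_t h0, uint64_t h1, uint64_t h2,
			     uint32_t out[5])
{
	out[0] = (uint32_t)(h0 & 0x3ffffff);
	out[1] = (uint32_t)((h0 >> 26) & 0x3ffffff);
	out[2] = (uint32_t)(((h1 << 12) | (h0 >> 52)) & 0x3ffffff);
	out[3] = (uint32_t)((h1 >> 14) & 0x3ffffff);
	out[4] = (uint32_t)((h2 << 24) | (h1 >> 40));
}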
-
-#ifdef __KERNEL__
-.globl poly1305_blocks_neon
-#endif
-.type poly1305_blocks_neon,%function
-.align 5
-poly1305_blocks_neon:
-.Lpoly1305_blocks_neon:
- ldr $is_base2_26,[$ctx,#24]
- cmp $len,#128
- b.lo .Lpoly1305_blocks
-
- .inst 0xd503233f // paciasp
- stp x29,x30,[sp,#-80]!
- add x29,sp,#0
-
- stp d8,d9,[sp,#16] // meet ABI requirements
- stp d10,d11,[sp,#32]
- stp d12,d13,[sp,#48]
- stp d14,d15,[sp,#64]
-
- cbz $is_base2_26,.Lbase2_64_neon
-
- ldp w10,w11,[$ctx] // load hash value base 2^26
- ldp w12,w13,[$ctx,#8]
- ldr w14,[$ctx,#16]
-
- tst $len,#31
- b.eq .Leven_neon
-
- ldp $r0,$r1,[$ctx,#32] // load key value
-
- add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
- lsr $h1,x12,#12
- adds $h0,$h0,x12,lsl#52
- add $h1,$h1,x13,lsl#14
- adc $h1,$h1,xzr
- lsr $h2,x14,#24
- adds $h1,$h1,x14,lsl#40
- adc $d2,$h2,xzr // can be partially reduced...
-
- ldp $d0,$d1,[$inp],#16 // load input
- sub $len,$len,#16
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
-
-#ifdef __AARCH64EB__
- rev $d0,$d0
- rev $d1,$d1
-#endif
- adds $h0,$h0,$d0 // accumulate input
- adcs $h1,$h1,$d1
- adc $h2,$h2,$padbit
-
- bl poly1305_mult
-
- and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x11,$h0,#26,#26
- extr x12,$h1,$h0,#52
- and x12,x12,#0x03ffffff
- ubfx x13,$h1,#14,#26
- extr x14,$h2,$h1,#40
-
- b .Leven_neon
-
-.align 4
-.Lbase2_64_neon:
- ldp $r0,$r1,[$ctx,#32] // load key value
-
- ldp $h0,$h1,[$ctx] // load hash value base 2^64
- ldr $h2,[$ctx,#16]
-
- tst $len,#31
- b.eq .Linit_neon
-
- ldp $d0,$d1,[$inp],#16 // load input
- sub $len,$len,#16
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
-#ifdef __AARCH64EB__
- rev $d0,$d0
- rev $d1,$d1
-#endif
- adds $h0,$h0,$d0 // accumulate input
- adcs $h1,$h1,$d1
- adc $h2,$h2,$padbit
-
- bl poly1305_mult
-
-.Linit_neon:
- ldr w17,[$ctx,#48] // first table element
- and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x11,$h0,#26,#26
- extr x12,$h1,$h0,#52
- and x12,x12,#0x03ffffff
- ubfx x13,$h1,#14,#26
- extr x14,$h2,$h1,#40
-
- cmp w17,#-1 // is value impossible?
- b.ne .Leven_neon
-
- fmov ${H0},x10
- fmov ${H1},x11
- fmov ${H2},x12
- fmov ${H3},x13
- fmov ${H4},x14
-
- ////////////////////////////////// initialize r^n table
- mov $h0,$r0 // r^1
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
- mov $h1,$r1
- mov $h2,xzr
- add $ctx,$ctx,#48+12
- bl poly1305_splat
-
- bl poly1305_mult // r^2
- sub $ctx,$ctx,#4
- bl poly1305_splat
-
- bl poly1305_mult // r^3
- sub $ctx,$ctx,#4
- bl poly1305_splat
-
- bl poly1305_mult // r^4
- sub $ctx,$ctx,#4
- bl poly1305_splat
- sub $ctx,$ctx,#48 // restore original $ctx
- b .Ldo_neon
-
-.align 4
-.Leven_neon:
- fmov ${H0},x10
- fmov ${H1},x11
- fmov ${H2},x12
- fmov ${H3},x13
- fmov ${H4},x14
-
-.Ldo_neon:
- ldp x8,x12,[$inp,#32] // inp[2:3]
- subs $len,$len,#64
- ldp x9,x13,[$inp,#48]
- add $in2,$inp,#96
- adrp $zeros,.Lzeros
- add $zeros,$zeros,#:lo12:.Lzeros
-
- lsl $padbit,$padbit,#24
- add x15,$ctx,#48
-
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- and x5,x9,#0x03ffffff
- ubfx x6,x8,#26,#26
- ubfx x7,x9,#26,#26
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- extr x8,x12,x8,#52
- extr x9,x13,x9,#52
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- fmov $IN23_0,x4
- and x8,x8,#0x03ffffff
- and x9,x9,#0x03ffffff
- ubfx x10,x12,#14,#26
- ubfx x11,x13,#14,#26
- add x12,$padbit,x12,lsr#40
- add x13,$padbit,x13,lsr#40
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- fmov $IN23_1,x6
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- fmov $IN23_2,x8
- fmov $IN23_3,x10
- fmov $IN23_4,x12
-
- ldp x8,x12,[$inp],#16 // inp[0:1]
- ldp x9,x13,[$inp],#48
-
- ld1 {$R0,$R1,$S1,$R2},[x15],#64
- ld1 {$S2,$R3,$S3,$R4},[x15],#64
- ld1 {$S4},[x15]
-
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- and x5,x9,#0x03ffffff
- ubfx x6,x8,#26,#26
- ubfx x7,x9,#26,#26
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- extr x8,x12,x8,#52
- extr x9,x13,x9,#52
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- fmov $IN01_0,x4
- and x8,x8,#0x03ffffff
- and x9,x9,#0x03ffffff
- ubfx x10,x12,#14,#26
- ubfx x11,x13,#14,#26
- add x12,$padbit,x12,lsr#40
- add x13,$padbit,x13,lsr#40
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- fmov $IN01_1,x6
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- movi $MASK.2d,#-1
- fmov $IN01_2,x8
- fmov $IN01_3,x10
- fmov $IN01_4,x12
- ushr $MASK.2d,$MASK.2d,#38
-
- b.ls .Lskip_loop
-
-.align 4
-.Loop_neon:
- ////////////////////////////////////////////////////////////////
- // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
- // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
- // \___________________/
- // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
- // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
- // \___________________/ \____________________/
- //
- // Note that we start with inp[2:3]*r^2. This is because it
- // doesn't depend on the reduction from the previous iteration.
- ////////////////////////////////////////////////////////////////
- // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
- // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
- // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
- // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
- // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
-
- subs $len,$len,#64
- umull $ACC4,$IN23_0,${R4}[2]
- csel $in2,$zeros,$in2,lo
- umull $ACC3,$IN23_0,${R3}[2]
- umull $ACC2,$IN23_0,${R2}[2]
- ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
- umull $ACC1,$IN23_0,${R1}[2]
- ldp x9,x13,[$in2],#48
- umull $ACC0,$IN23_0,${R0}[2]
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
-
- umlal $ACC4,$IN23_1,${R3}[2]
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- umlal $ACC3,$IN23_1,${R2}[2]
- and x5,x9,#0x03ffffff
- umlal $ACC2,$IN23_1,${R1}[2]
- ubfx x6,x8,#26,#26
- umlal $ACC1,$IN23_1,${R0}[2]
- ubfx x7,x9,#26,#26
- umlal $ACC0,$IN23_1,${S4}[2]
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
-
- umlal $ACC4,$IN23_2,${R2}[2]
- extr x8,x12,x8,#52
- umlal $ACC3,$IN23_2,${R1}[2]
- extr x9,x13,x9,#52
- umlal $ACC2,$IN23_2,${R0}[2]
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- umlal $ACC1,$IN23_2,${S4}[2]
- fmov $IN23_0,x4
- umlal $ACC0,$IN23_2,${S3}[2]
- and x8,x8,#0x03ffffff
-
- umlal $ACC4,$IN23_3,${R1}[2]
- and x9,x9,#0x03ffffff
- umlal $ACC3,$IN23_3,${R0}[2]
- ubfx x10,x12,#14,#26
- umlal $ACC2,$IN23_3,${S4}[2]
- ubfx x11,x13,#14,#26
- umlal $ACC1,$IN23_3,${S3}[2]
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- umlal $ACC0,$IN23_3,${S2}[2]
- fmov $IN23_1,x6
-
- add $IN01_2,$IN01_2,$H2
- add x12,$padbit,x12,lsr#40
- umlal $ACC4,$IN23_4,${R0}[2]
- add x13,$padbit,x13,lsr#40
- umlal $ACC3,$IN23_4,${S4}[2]
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- umlal $ACC2,$IN23_4,${S3}[2]
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- umlal $ACC1,$IN23_4,${S2}[2]
- fmov $IN23_2,x8
- umlal $ACC0,$IN23_4,${S1}[2]
- fmov $IN23_3,x10
-
- ////////////////////////////////////////////////////////////////
- // (hash+inp[0:1])*r^4 and accumulate
-
- add $IN01_0,$IN01_0,$H0
- fmov $IN23_4,x12
- umlal $ACC3,$IN01_2,${R1}[0]
- ldp x8,x12,[$inp],#16 // inp[0:1]
- umlal $ACC0,$IN01_2,${S3}[0]
- ldp x9,x13,[$inp],#48
- umlal $ACC4,$IN01_2,${R2}[0]
- umlal $ACC1,$IN01_2,${S4}[0]
- umlal $ACC2,$IN01_2,${R0}[0]
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
-
- add $IN01_1,$IN01_1,$H1
- umlal $ACC3,$IN01_0,${R3}[0]
- umlal $ACC4,$IN01_0,${R4}[0]
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- umlal $ACC2,$IN01_0,${R2}[0]
- and x5,x9,#0x03ffffff
- umlal $ACC0,$IN01_0,${R0}[0]
- ubfx x6,x8,#26,#26
- umlal $ACC1,$IN01_0,${R1}[0]
- ubfx x7,x9,#26,#26
-
- add $IN01_3,$IN01_3,$H3
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- umlal $ACC3,$IN01_1,${R2}[0]
- extr x8,x12,x8,#52
- umlal $ACC4,$IN01_1,${R3}[0]
- extr x9,x13,x9,#52
- umlal $ACC0,$IN01_1,${S4}[0]
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- umlal $ACC2,$IN01_1,${R1}[0]
- fmov $IN01_0,x4
- umlal $ACC1,$IN01_1,${R0}[0]
- and x8,x8,#0x03ffffff
-
- add $IN01_4,$IN01_4,$H4
- and x9,x9,#0x03ffffff
- umlal $ACC3,$IN01_3,${R0}[0]
- ubfx x10,x12,#14,#26
- umlal $ACC0,$IN01_3,${S2}[0]
- ubfx x11,x13,#14,#26
- umlal $ACC4,$IN01_3,${R1}[0]
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- umlal $ACC1,$IN01_3,${S3}[0]
- fmov $IN01_1,x6
- umlal $ACC2,$IN01_3,${S4}[0]
- add x12,$padbit,x12,lsr#40
-
- umlal $ACC3,$IN01_4,${S4}[0]
- add x13,$padbit,x13,lsr#40
- umlal $ACC0,$IN01_4,${S1}[0]
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- umlal $ACC4,$IN01_4,${R0}[0]
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- umlal $ACC1,$IN01_4,${S2}[0]
- fmov $IN01_2,x8
- umlal $ACC2,$IN01_4,${S3}[0]
- fmov $IN01_3,x10
- fmov $IN01_4,x12
-
- /////////////////////////////////////////////////////////////////
- // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- // and P. Schwabe
- //
- // [see discussion in poly1305-armv4 module]
-
- ushr $T0.2d,$ACC3,#26
- xtn $H3,$ACC3
- ushr $T1.2d,$ACC0,#26
- and $ACC0,$ACC0,$MASK.2d
- add $ACC4,$ACC4,$T0.2d // h3 -> h4
- bic $H3,#0xfc,lsl#24 // &=0x03ffffff
- add $ACC1,$ACC1,$T1.2d // h0 -> h1
-
- ushr $T0.2d,$ACC4,#26
- xtn $H4,$ACC4
- ushr $T1.2d,$ACC1,#26
- xtn $H1,$ACC1
- bic $H4,#0xfc,lsl#24
- add $ACC2,$ACC2,$T1.2d // h1 -> h2
-
- add $ACC0,$ACC0,$T0.2d
- shl $T0.2d,$T0.2d,#2
- shrn $T1.2s,$ACC2,#26
- xtn $H2,$ACC2
- add $ACC0,$ACC0,$T0.2d // h4 -> h0
- bic $H1,#0xfc,lsl#24
- add $H3,$H3,$T1.2s // h2 -> h3
- bic $H2,#0xfc,lsl#24
-
- shrn $T0.2s,$ACC0,#26
- xtn $H0,$ACC0
- ushr $T1.2s,$H3,#26
- bic $H3,#0xfc,lsl#24
- bic $H0,#0xfc,lsl#24
- add $H1,$H1,$T0.2s // h0 -> h1
- add $H4,$H4,$T1.2s // h3 -> h4
-
- b.hi .Loop_neon
-
-.Lskip_loop:
- dup $IN23_2,${IN23_2}[0]
- add $IN01_2,$IN01_2,$H2
-
- ////////////////////////////////////////////////////////////////
- // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
- adds $len,$len,#32
- b.ne .Long_tail
-
- dup $IN23_2,${IN01_2}[0]
- add $IN23_0,$IN01_0,$H0
- add $IN23_3,$IN01_3,$H3
- add $IN23_1,$IN01_1,$H1
- add $IN23_4,$IN01_4,$H4
-
-.Long_tail:
- dup $IN23_0,${IN23_0}[0]
- umull2 $ACC0,$IN23_2,${S3}
- umull2 $ACC3,$IN23_2,${R1}
- umull2 $ACC4,$IN23_2,${R2}
- umull2 $ACC2,$IN23_2,${R0}
- umull2 $ACC1,$IN23_2,${S4}
-
- dup $IN23_1,${IN23_1}[0]
- umlal2 $ACC0,$IN23_0,${R0}
- umlal2 $ACC2,$IN23_0,${R2}
- umlal2 $ACC3,$IN23_0,${R3}
- umlal2 $ACC4,$IN23_0,${R4}
- umlal2 $ACC1,$IN23_0,${R1}
-
- dup $IN23_3,${IN23_3}[0]
- umlal2 $ACC0,$IN23_1,${S4}
- umlal2 $ACC3,$IN23_1,${R2}
- umlal2 $ACC2,$IN23_1,${R1}
- umlal2 $ACC4,$IN23_1,${R3}
- umlal2 $ACC1,$IN23_1,${R0}
-
- dup $IN23_4,${IN23_4}[0]
- umlal2 $ACC3,$IN23_3,${R0}
- umlal2 $ACC4,$IN23_3,${R1}
- umlal2 $ACC0,$IN23_3,${S2}
- umlal2 $ACC1,$IN23_3,${S3}
- umlal2 $ACC2,$IN23_3,${S4}
-
- umlal2 $ACC3,$IN23_4,${S4}
- umlal2 $ACC0,$IN23_4,${S1}
- umlal2 $ACC4,$IN23_4,${R0}
- umlal2 $ACC1,$IN23_4,${S2}
- umlal2 $ACC2,$IN23_4,${S3}
-
- b.eq .Lshort_tail
-
- ////////////////////////////////////////////////////////////////
- // (hash+inp[0:1])*r^4:r^3 and accumulate
-
- add $IN01_0,$IN01_0,$H0
- umlal $ACC3,$IN01_2,${R1}
- umlal $ACC0,$IN01_2,${S3}
- umlal $ACC4,$IN01_2,${R2}
- umlal $ACC1,$IN01_2,${S4}
- umlal $ACC2,$IN01_2,${R0}
-
- add $IN01_1,$IN01_1,$H1
- umlal $ACC3,$IN01_0,${R3}
- umlal $ACC0,$IN01_0,${R0}
- umlal $ACC4,$IN01_0,${R4}
- umlal $ACC1,$IN01_0,${R1}
- umlal $ACC2,$IN01_0,${R2}
-
- add $IN01_3,$IN01_3,$H3
- umlal $ACC3,$IN01_1,${R2}
- umlal $ACC0,$IN01_1,${S4}
- umlal $ACC4,$IN01_1,${R3}
- umlal $ACC1,$IN01_1,${R0}
- umlal $ACC2,$IN01_1,${R1}
-
- add $IN01_4,$IN01_4,$H4
- umlal $ACC3,$IN01_3,${R0}
- umlal $ACC0,$IN01_3,${S2}
- umlal $ACC4,$IN01_3,${R1}
- umlal $ACC1,$IN01_3,${S3}
- umlal $ACC2,$IN01_3,${S4}
-
- umlal $ACC3,$IN01_4,${S4}
- umlal $ACC0,$IN01_4,${S1}
- umlal $ACC4,$IN01_4,${R0}
- umlal $ACC1,$IN01_4,${S2}
- umlal $ACC2,$IN01_4,${S3}
-
-.Lshort_tail:
- ////////////////////////////////////////////////////////////////
- // horizontal add
-
- addp $ACC3,$ACC3,$ACC3
- ldp d8,d9,[sp,#16] // meet ABI requirements
- addp $ACC0,$ACC0,$ACC0
- ldp d10,d11,[sp,#32]
- addp $ACC4,$ACC4,$ACC4
- ldp d12,d13,[sp,#48]
- addp $ACC1,$ACC1,$ACC1
- ldp d14,d15,[sp,#64]
- addp $ACC2,$ACC2,$ACC2
- ldr x30,[sp,#8]
-
- ////////////////////////////////////////////////////////////////
- // lazy reduction, but without narrowing
-
- ushr $T0.2d,$ACC3,#26
- and $ACC3,$ACC3,$MASK.2d
- ushr $T1.2d,$ACC0,#26
- and $ACC0,$ACC0,$MASK.2d
-
- add $ACC4,$ACC4,$T0.2d // h3 -> h4
- add $ACC1,$ACC1,$T1.2d // h0 -> h1
-
- ushr $T0.2d,$ACC4,#26
- and $ACC4,$ACC4,$MASK.2d
- ushr $T1.2d,$ACC1,#26
- and $ACC1,$ACC1,$MASK.2d
- add $ACC2,$ACC2,$T1.2d // h1 -> h2
-
- add $ACC0,$ACC0,$T0.2d
- shl $T0.2d,$T0.2d,#2
- ushr $T1.2d,$ACC2,#26
- and $ACC2,$ACC2,$MASK.2d
- add $ACC0,$ACC0,$T0.2d // h4 -> h0
- add $ACC3,$ACC3,$T1.2d // h2 -> h3
-
- ushr $T0.2d,$ACC0,#26
- and $ACC0,$ACC0,$MASK.2d
- ushr $T1.2d,$ACC3,#26
- and $ACC3,$ACC3,$MASK.2d
- add $ACC1,$ACC1,$T0.2d // h0 -> h1
- add $ACC4,$ACC4,$T1.2d // h3 -> h4
-
- ////////////////////////////////////////////////////////////////
- // write the result, can be partially reduced
-
- st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
- mov x4,#1
- st1 {$ACC4}[0],[$ctx]
- str x4,[$ctx,#8] // set is_base2_26
-
- ldr x29,[sp],#80
- .inst 0xd50323bf // autiasp
- ret
-.size poly1305_blocks_neon,.-poly1305_blocks_neon
-
-.pushsection .rodata
-.align 5
-.Lzeros:
-.long 0,0,0,0,0,0,0,0
-.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
-.popsection
-
-.align 2
-#if !defined(__KERNEL__) && !defined(_WIN64)
-.comm OPENSSL_armcap_P,4,4
-.hidden OPENSSL_armcap_P
-#endif
-___
-
-foreach (split("\n",$code)) {
- s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
- s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
- (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
- (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
- (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
- (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
- (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
-
- s/\.[124]([sd])\[/.$1\[/;
- s/w#x([0-9]+)/w$1/g;
-
- print $_,"\n";
-}
-close STDOUT;
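
For orientation before the glue code below: the NEON loop above keeps the
130-bit accumulator in five 26-bit limbs (two blocks interleaved across
vector lanes) and lets limbs grow past 26 bits between iterations, sweeping
carries only lazily, per the Bernstein/Schwabe reference in the comments.
Here is a stand-alone C sketch of the base 2^64 -> 2^26 split and one carry
sweep; the function names are invented for illustration, and the carry out
of h4 feeds back into h0 multiplied by 5 because 2^130 is congruent to 5
modulo 2^130 - 5:

/*
 * Sketch only: scalar model of the limb handling in the NEON code above.
 * limbs_from_u64x2() and lazy_reduce() are invented names, not kernel or
 * OpenSSL APIs.
 */
#include <stdint.h>
#include <stdio.h>

/* base 2^64 -> base 2^26, mirroring the and/ubfx/extr sequence above */
static void limbs_from_u64x2(uint64_t h[5], uint64_t lo, uint64_t hi)
{
	h[0] = lo & 0x03ffffff;
	h[1] = (lo >> 26) & 0x03ffffff;
	h[2] = ((lo >> 52) | (hi << 12)) & 0x03ffffff;
	h[3] = (hi >> 14) & 0x03ffffff;
	h[4] = hi >> 40;		/* the pad bit is added on top */
}

/* one lazy-reduction sweep, same carry order as the vector code */
static void lazy_reduce(uint64_t h[5])
{
	uint64_t c;

	c = h[3] >> 26; h[3] &= 0x03ffffff; h[4] += c;		/* h3 -> h4 */
	c = h[0] >> 26; h[0] &= 0x03ffffff; h[1] += c;		/* h0 -> h1 */
	c = h[4] >> 26; h[4] &= 0x03ffffff; h[0] += c * 5;	/* h4 -> h0 */
	c = h[1] >> 26; h[1] &= 0x03ffffff; h[2] += c;		/* h1 -> h2 */
	c = h[2] >> 26; h[2] &= 0x03ffffff; h[3] += c;		/* h2 -> h3 */
	c = h[0] >> 26; h[0] &= 0x03ffffff; h[1] += c;		/* h0 -> h1 */
	c = h[3] >> 26; h[3] &= 0x03ffffff; h[4] += c;		/* h3 -> h4 */
}

int main(void)
{
	uint64_t h[5];
	int i;

	limbs_from_u64x2(h, 0x0123456789abcdefULL, 0x0fedcba987654321ULL);
	lazy_reduce(h);
	for (i = 0; i < 5; i++)
		printf("h%d = 0x%llx\n", i, (unsigned long long)h[i]);
	return 0;
}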
diff --git a/arch/arm64/lib/crypto/poly1305-glue.c b/arch/arm64/lib/crypto/poly1305-glue.c
deleted file mode 100644
index 6a661cf04821..000000000000
--- a/arch/arm64/lib/crypto/poly1305-glue.c
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
- *
- * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <crypto/internal/poly1305.h>
-#include <linux/cpufeature.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/unaligned.h>
-
-asmlinkage void poly1305_block_init_arch(
- struct poly1305_block_state *state,
- const u8 raw_key[POLY1305_BLOCK_SIZE]);
-EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
-asmlinkage void poly1305_blocks(struct poly1305_block_state *state,
- const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state,
- const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_emit_arch(const struct poly1305_state *state,
- u8 digest[POLY1305_DIGEST_SIZE],
- const u32 nonce[4]);
-EXPORT_SYMBOL_GPL(poly1305_emit_arch);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-
-void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
- unsigned int len, u32 padbit)
-{
- len = round_down(len, POLY1305_BLOCK_SIZE);
- if (static_branch_likely(&have_neon)) {
- do {
- unsigned int todo = min_t(unsigned int, len, SZ_4K);
-
- kernel_neon_begin();
- poly1305_blocks_neon(state, src, todo, 1);
- kernel_neon_end();
-
- len -= todo;
- src += todo;
- } while (len);
- } else
- poly1305_blocks(state, src, len, 1);
-}
-EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
-
-bool poly1305_is_arch_optimized(void)
-{
-	/* We can always use at least the ARM64 scalar implementation. */
- return true;
-}
-EXPORT_SYMBOL(poly1305_is_arch_optimized);
-
-static int __init neon_poly1305_mod_init(void)
-{
- if (cpu_have_named_feature(ASIMD))
- static_branch_enable(&have_neon);
- return 0;
-}
-subsys_initcall(neon_poly1305_mod_init);
-
-static void __exit neon_poly1305_mod_exit(void)
-{
-}
-module_exit(neon_poly1305_mod_exit);
-
-MODULE_DESCRIPTION("Poly1305 authenticator (ARM64 optimized)");
-MODULE_LICENSE("GPL v2");
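
A note on the chunking in the glue code just removed:
kernel_neon_begin()/kernel_neon_end() sections run with preemption disabled,
so the NEON path caps each section at SZ_4K of input to bound scheduling
latency, while the scalar fallback takes the whole (block-rounded) length in
one call. A stand-alone sketch of that pattern; begin(), end() and
process_chunk() are placeholders standing in for the kernel primitives and
the NEON routine:

/* Sketch only: bound the length of each non-preemptible section. */
#include <stddef.h>

#define NEON_CHUNK 4096			/* mirrors SZ_4K in the glue */

static void begin(void) { /* stands in for kernel_neon_begin() */ }
static void end(void) { /* stands in for kernel_neon_end() */ }
static void process_chunk(const unsigned char *src, size_t len)
{
	(void)src; (void)len;		/* stands in for the NEON routine */
}

static void process_all(const unsigned char *src, size_t len)
{
	while (len) {
		size_t todo = len < NEON_CHUNK ? len : NEON_CHUNK;

		begin();
		process_chunk(src, todo);
		end();

		src += todo;
		len -= todo;
	}
}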
diff --git a/arch/arm64/lib/crypto/sha2-armv8.pl b/arch/arm64/lib/crypto/sha2-armv8.pl
deleted file mode 100644
index 4aebd20c498b..000000000000
--- a/arch/arm64/lib/crypto/sha2-armv8.pl
+++ /dev/null
@@ -1,786 +0,0 @@
-#! /usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0
-
-# This code is taken from the OpenSSL project but the author (Andy Polyakov)
-# has relicensed it under the GPLv2. Therefore this program is free software;
-# you can redistribute it and/or modify it under the terms of the GNU General
-# Public License version 2 as published by the Free Software Foundation.
-#
-# The original headers, including the original license headers, are
-# included below for completeness.
-
-# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
-#
-# Licensed under the OpenSSL license (the "License"). You may not use
-# this file except in compliance with the License. You can obtain a copy
-# in the file LICENSE in the source distribution or at
-# https://www.openssl.org/source/license.html
-
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# SHA256/512 for ARMv8.
-#
-# Performance in cycles per processed byte and improvement coefficient
-# over code generated with "default" compiler:
-#
-#		SHA256-hw	SHA256(*)	SHA512
-# Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
-# Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
-# Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
-# Denver	2.01		10.5 (+26%)	6.70 (+8%)
-# X-Gene			20.0 (+100%)	12.8 (+300%(***))
-# Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
-#
-# (*) Software SHA256 results are of lesser relevance, presented
-# mostly for informational purposes.
-# (**) The result is a trade-off: it's possible to improve it by
-# 10% (or by 1 cycle per round), but at the cost of 20% loss
-# on Cortex-A53 (or by 4 cycles per round).
-# (***)	Super-impressive coefficients over gcc-generated code are an
-#	indication of some compiler "pathology"; most notably, code
-#	generated with -mgeneral-regs-only is significantly faster,
-#	and the gap is then only 40-90%.
-#
-# October 2016.
-#
-# Originally it was reckoned that it made no sense to implement a NEON
-# version of SHA256 for 64-bit processors, because the performance
-# improvement observed on the most widespread Cortex-A5x processors was
-# marginal on Cortex-A53 and only ~10% on Cortex-A57. But it was then
-# observed that 32-bit NEON SHA256 performs significantly better than
-# the 64-bit scalar version on *some* of the more recent processors. As
-# a result, a 64-bit NEON version of SHA256 was added to provide the
-# best all-round performance; for example, it executes ~30% faster on
-# X-Gene and Mongoose. [For reference, a NEON version of SHA512 is bound
-# to deliver much less improvement, likely *negative* on Cortex-A5x,
-# which is why NEON support is limited to SHA256.]
-
-$output=pop;
-$flavour=pop;
-
-if ($flavour && $flavour ne "void") {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
- die "can't locate arm-xlate.pl";
-
- open OUT,"| \"$^X\" $xlate $flavour $output";
- *STDOUT=*OUT;
-} else {
- open STDOUT,">$output";
-}
-
-if ($output =~ /512/) {
- $BITS=512;
- $SZ=8;
- @Sigma0=(28,34,39);
- @Sigma1=(14,18,41);
- @sigma0=(1, 8, 7);
- @sigma1=(19,61, 6);
- $rounds=80;
- $reg_t="x";
-} else {
- $BITS=256;
- $SZ=4;
- @Sigma0=( 2,13,22);
- @Sigma1=( 6,11,25);
- @sigma0=( 7,18, 3);
- @sigma1=(17,19,10);
- $rounds=64;
- $reg_t="w";
-}
-
-$func="sha${BITS}_blocks_arch";
-
-($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
-
-@X=map("$reg_t$_",(3..15,0..2));
-@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
-($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
-
-sub BODY_00_xx {
-my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
-my $j=($i+1)&15;
-my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
- $T0=@X[$i+3] if ($i<11);
-
-$code.=<<___ if ($i<16);
-#ifndef __AARCH64EB__
- rev @X[$i],@X[$i] // $i
-#endif
-___
-$code.=<<___ if ($i<13 && ($i&1));
- ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ
-___
-$code.=<<___ if ($i==13);
- ldp @X[14],@X[15],[$inp]
-___
-$code.=<<___ if ($i>=14);
- ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
-___
-$code.=<<___ if ($i>0 && $i<16);
- add $a,$a,$t1 // h+=Sigma0(a)
-___
-$code.=<<___ if ($i>=11);
- str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
-___
-# While ARMv8 specifies a merged rotate-and-logical operation such as
-# 'eor x,y,z,ror#n', it was found to negatively affect performance
-# on Apple A7. The reason seems to be that it requires even 'y' to
-# be available earlier. This means that such a merged instruction is
-# not necessarily the best choice on the critical path... On the other
-# hand, Cortex-A5x handles merged instructions much better than a
-# disjoint rotate and logical... See the (**) footnote above.
-$code.=<<___ if ($i<15);
- ror $t0,$e,#$Sigma1[0]
- add $h,$h,$t2 // h+=K[i]
- eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
- and $t1,$f,$e
- bic $t2,$g,$e
- add $h,$h,@X[$i&15] // h+=X[i]
- orr $t1,$t1,$t2 // Ch(e,f,g)
- eor $t2,$a,$b // a^b, b^c in next round
- eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e)
- ror $T0,$a,#$Sigma0[0]
- add $h,$h,$t1 // h+=Ch(e,f,g)
- eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
- add $h,$h,$t0 // h+=Sigma1(e)
- and $t3,$t3,$t2 // (b^c)&=(a^b)
- add $d,$d,$h // d+=h
- eor $t3,$t3,$b // Maj(a,b,c)
- eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a)
- add $h,$h,$t3 // h+=Maj(a,b,c)
- ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
- //add $h,$h,$t1 // h+=Sigma0(a)
-___
-$code.=<<___ if ($i>=15);
- ror $t0,$e,#$Sigma1[0]
- add $h,$h,$t2 // h+=K[i]
- ror $T1,@X[($j+1)&15],#$sigma0[0]
- and $t1,$f,$e
- ror $T2,@X[($j+14)&15],#$sigma1[0]
- bic $t2,$g,$e
- ror $T0,$a,#$Sigma0[0]
- add $h,$h,@X[$i&15] // h+=X[i]
- eor $t0,$t0,$e,ror#$Sigma1[1]
- eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
- orr $t1,$t1,$t2 // Ch(e,f,g)
- eor $t2,$a,$b // a^b, b^c in next round
- eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e)
- eor $T0,$T0,$a,ror#$Sigma0[1]
- add $h,$h,$t1 // h+=Ch(e,f,g)
- and $t3,$t3,$t2 // (b^c)&=(a^b)
- eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
- eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1])
- add $h,$h,$t0 // h+=Sigma1(e)
- eor $t3,$t3,$b // Maj(a,b,c)
- eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a)
- eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14])
- add @X[$j],@X[$j],@X[($j+9)&15]
- add $d,$d,$h // d+=h
- add $h,$h,$t3 // h+=Maj(a,b,c)
- ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
- add @X[$j],@X[$j],$T1
- add $h,$h,$t1 // h+=Sigma0(a)
- add @X[$j],@X[$j],$T2
-___
- ($t2,$t3)=($t3,$t2);
-}
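
For reference, the round that BODY_00_xx interleaves with the loads and
stores above can be written as the following stand-alone C model (sketch
only; Sigma0 = (2,13,22) and Sigma1 = (6,11,25) as selected above for the
SHA-256 case). The asm builds Ch from and/bic/orr and evaluates Maj
incrementally across two rounds via the (b^c)&(a^b) trick, but the result
is the same:

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n)
{
	return (x >> n) | (x << (32 - n));
}

static void sha256_round(uint32_t v[8], uint32_t Xi, uint32_t Ki)
{
	uint32_t a = v[0], b = v[1], c = v[2], d = v[3];
	uint32_t e = v[4], f = v[5], g = v[6], h = v[7];
	uint32_t S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
	uint32_t Ch  = (e & f) | (~e & g);
	uint32_t S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
	uint32_t Maj = (a & b) ^ (a & c) ^ (b & c);
	uint32_t T1  = h + S1 + Ch + Ki + Xi;
	uint32_t T2  = S0 + Maj;

	v[7] = g; v[6] = f; v[5] = e; v[4] = d + T1;
	v[3] = c; v[2] = b; v[1] = a; v[0] = T1 + T2;
}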
-
-$code.=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#endif
-
-.text
-
-.extern OPENSSL_armcap_P
-.globl $func
-.type $func,%function
-.align 6
-$func:
-___
-$code.=<<___ if ($SZ==4);
-#ifndef __KERNEL__
-# ifdef __ILP32__
- ldrsw x16,.LOPENSSL_armcap_P
-# else
- ldr x16,.LOPENSSL_armcap_P
-# endif
- adr x17,.LOPENSSL_armcap_P
- add x16,x16,x17
- ldr w16,[x16]
- tst w16,#ARMV8_SHA256
- b.ne .Lv8_entry
- tst w16,#ARMV7_NEON
- b.ne .Lneon_entry
-#endif
-___
-$code.=<<___;
- stp x29,x30,[sp,#-128]!
- add x29,sp,#0
-
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub sp,sp,#4*$SZ
-
- ldp $A,$B,[$ctx] // load context
- ldp $C,$D,[$ctx,#2*$SZ]
- ldp $E,$F,[$ctx,#4*$SZ]
- add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
- ldp $G,$H,[$ctx,#6*$SZ]
- adr $Ktbl,.LK$BITS
- stp $ctx,$num,[x29,#96]
-
-.Loop:
- ldp @X[0],@X[1],[$inp],#2*$SZ
- ldr $t2,[$Ktbl],#$SZ // *K++
- eor $t3,$B,$C // magic seed
- str $inp,[x29,#112]
-___
-for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
-$code.=".Loop_16_xx:\n";
-for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
-$code.=<<___;
- cbnz $t2,.Loop_16_xx
-
- ldp $ctx,$num,[x29,#96]
- ldr $inp,[x29,#112]
- sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind
-
- ldp @X[0],@X[1],[$ctx]
- ldp @X[2],@X[3],[$ctx,#2*$SZ]
- add $inp,$inp,#14*$SZ // advance input pointer
- ldp @X[4],@X[5],[$ctx,#4*$SZ]
- add $A,$A,@X[0]
- ldp @X[6],@X[7],[$ctx,#6*$SZ]
- add $B,$B,@X[1]
- add $C,$C,@X[2]
- add $D,$D,@X[3]
- stp $A,$B,[$ctx]
- add $E,$E,@X[4]
- add $F,$F,@X[5]
- stp $C,$D,[$ctx,#2*$SZ]
- add $G,$G,@X[6]
- add $H,$H,@X[7]
- cmp $inp,$num
- stp $E,$F,[$ctx,#4*$SZ]
- stp $G,$H,[$ctx,#6*$SZ]
- b.ne .Loop
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#4*$SZ
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#128
- ret
-.size $func,.-$func
-
-.align 6
-.type .LK$BITS,%object
-.LK$BITS:
-___
-$code.=<<___ if ($SZ==8);
- .quad 0x428a2f98d728ae22,0x7137449123ef65cd
- .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
- .quad 0x3956c25bf348b538,0x59f111f1b605d019
- .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
- .quad 0xd807aa98a3030242,0x12835b0145706fbe
- .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
- .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
- .quad 0x9bdc06a725c71235,0xc19bf174cf692694
- .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
- .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
- .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
- .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
- .quad 0x983e5152ee66dfab,0xa831c66d2db43210
- .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
- .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
- .quad 0x06ca6351e003826f,0x142929670a0e6e70
- .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
- .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
- .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
- .quad 0x81c2c92e47edaee6,0x92722c851482353b
- .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
- .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
- .quad 0xd192e819d6ef5218,0xd69906245565a910
- .quad 0xf40e35855771202a,0x106aa07032bbd1b8
- .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
- .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
- .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
- .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
- .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
- .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
- .quad 0x90befffa23631e28,0xa4506cebde82bde9
- .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
- .quad 0xca273eceea26619c,0xd186b8c721c0c207
- .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
- .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
- .quad 0x113f9804bef90dae,0x1b710b35131c471b
- .quad 0x28db77f523047d84,0x32caab7b40c72493
- .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
- .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
- .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
- .quad 0 // terminator
-___
-$code.=<<___ if ($SZ==4);
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
- .long 0 //terminator
-___
-$code.=<<___;
-.size .LK$BITS,.-.LK$BITS
-#ifndef __KERNEL__
-.align 3
-.LOPENSSL_armcap_P:
-# ifdef __ILP32__
- .long OPENSSL_armcap_P-.
-# else
- .quad OPENSSL_armcap_P-.
-# endif
-#endif
-.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
-.align 2
-___
-
-if ($SZ==4) {
-my $Ktbl="x3";
-
-my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
-my @MSG=map("v$_.16b",(4..7));
-my ($W0,$W1)=("v16.4s","v17.4s");
-my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
-
-$code.=<<___;
-#ifndef __KERNEL__
-.type sha256_block_armv8,%function
-.align 6
-sha256_block_armv8:
-.Lv8_entry:
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
-
- ld1.32 {$ABCD,$EFGH},[$ctx]
- adr $Ktbl,.LK256
-
-.Loop_hw:
- ld1 {@MSG[0]-@MSG[3]},[$inp],#64
- sub $num,$num,#1
- ld1.32 {$W0},[$Ktbl],#16
- rev32 @MSG[0],@MSG[0]
- rev32 @MSG[1],@MSG[1]
- rev32 @MSG[2],@MSG[2]
- rev32 @MSG[3],@MSG[3]
- orr $ABCD_SAVE,$ABCD,$ABCD // offload
- orr $EFGH_SAVE,$EFGH,$EFGH
-___
-for($i=0;$i<12;$i++) {
-$code.=<<___;
- ld1.32 {$W1},[$Ktbl],#16
- add.i32 $W0,$W0,@MSG[0]
- sha256su0 @MSG[0],@MSG[1]
- orr $abcd,$ABCD,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
- sha256su1 @MSG[0],@MSG[2],@MSG[3]
-___
- ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
-}
-$code.=<<___;
- ld1.32 {$W1},[$Ktbl],#16
- add.i32 $W0,$W0,@MSG[0]
- orr $abcd,$ABCD,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
-
- ld1.32 {$W0},[$Ktbl],#16
- add.i32 $W1,$W1,@MSG[1]
- orr $abcd,$ABCD,$ABCD
- sha256h $ABCD,$EFGH,$W1
- sha256h2 $EFGH,$abcd,$W1
-
- ld1.32 {$W1},[$Ktbl]
- add.i32 $W0,$W0,@MSG[2]
- sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
- orr $abcd,$ABCD,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
-
- add.i32 $W1,$W1,@MSG[3]
- orr $abcd,$ABCD,$ABCD
- sha256h $ABCD,$EFGH,$W1
- sha256h2 $EFGH,$abcd,$W1
-
- add.i32 $ABCD,$ABCD,$ABCD_SAVE
- add.i32 $EFGH,$EFGH,$EFGH_SAVE
-
- cbnz $num,.Loop_hw
-
- st1.32 {$ABCD,$EFGH},[$ctx]
-
- ldr x29,[sp],#16
- ret
-.size sha256_block_armv8,.-sha256_block_armv8
-#endif
-___
-}
-
-if ($SZ==4) { ######################################### NEON stuff #
-# You'll surely note a lot of similarities with the sha256-armv4 module,
-# and of course that's not a coincidence: sha256-armv4 was used as the
-# initial template, but was adapted for the ARMv8 instruction set and
-# extensively re-tuned for all-round performance.
-
-my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10));
-my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15));
-my $Ktbl="x16";
-my $Xfer="x17";
-my @X = map("q$_",(0..3));
-my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19));
-my $j=0;
-
-sub AUTOLOAD() # thunk [simplified] x86-style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
- my $arg = pop;
- $arg = "#$arg" if ($arg*1 eq $arg);
- $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
-}
-
-sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }
-sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }
-sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }
-
-sub Xupdate()
-{ use integer;
- my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body);
- my ($a,$b,$c,$d,$e,$f,$g,$h);
-
- &ext_8 ($T0,@X[0],@X[1],4); # X[1..4]
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &ext_8 ($T3,@X[2],@X[3],4); # X[9..12]
- eval(shift(@insns));
- eval(shift(@insns));
- &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15]
- eval(shift(@insns));
- eval(shift(@insns));
- &ushr_32 ($T2,$T0,$sigma0[0]);
- eval(shift(@insns));
- &ushr_32 ($T1,$T0,$sigma0[2]);
- eval(shift(@insns));
- &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12]
- eval(shift(@insns));
- &sli_32 ($T2,$T0,32-$sigma0[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &ushr_32 ($T3,$T0,$sigma0[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &eor_8 ($T1,$T1,$T2);
- eval(shift(@insns));
- eval(shift(@insns));
- &sli_32 ($T3,$T0,32-$sigma0[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &ushr_32 ($T4,$T7,$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4])
- eval(shift(@insns));
- eval(shift(@insns));
- &sli_32 ($T4,$T7,32-$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &ushr_32 ($T5,$T7,$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &ushr_32 ($T3,$T7,$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
- eval(shift(@insns));
- eval(shift(@insns));
- &sli_u32 ($T3,$T7,32-$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &eor_8 ($T5,$T5,$T4);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &ushr_32 ($T6,@X[0],$sigma1[0]);
- eval(shift(@insns));
- &ushr_32 ($T7,@X[0],$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &sli_32 ($T6,@X[0],32-$sigma1[0]);
- eval(shift(@insns));
- &ushr_32 ($T5,@X[0],$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &eor_8 ($T7,$T7,$T6);
- eval(shift(@insns));
- eval(shift(@insns));
- &sli_32 ($T5,@X[0],32-$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &ld1_32 ("{$T0}","[$Ktbl], #16");
- eval(shift(@insns));
- &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17])
- eval(shift(@insns));
- eval(shift(@insns));
- &eor_8 ($T5,$T5,$T5);
- eval(shift(@insns));
- eval(shift(@insns));
- &mov (&Dhi($T5), &Dlo($T7));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &add_32 ($T0,$T0,@X[0]);
- while($#insns>=1) { eval(shift(@insns)); }
- &st1_32 ("{$T0}","[$Xfer], #16");
- eval(shift(@insns));
-
- push(@X,shift(@X)); # "rotate" X[]
-}
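
Xupdate above computes the SHA-256 message schedule four words at a time,
synthesizing the vector rotates from ushr/sli pairs. Written scalar, the
recurrence it implements is the standard one (stand-alone sketch; sigma0 =
(7,18,3) and sigma1 = (17,19,10) per the parameter block at the top of the
file, with the 16-word window kept in a circular buffer):

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n)
{
	return (x >> n) | (x << (32 - n));
}

static uint32_t sigma0(uint32_t x) { return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3); }
static uint32_t sigma1(uint32_t x) { return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); }

/* W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16], for t >= 16 */
static void schedule_step(uint32_t W[16], unsigned int t)
{
	W[t & 15] += sigma1(W[(t - 2) & 15]) + W[(t - 7) & 15] +
		     sigma0(W[(t - 15) & 15]);
}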
-
-sub Xpreload()
-{ use integer;
- my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body);
- my ($a,$b,$c,$d,$e,$f,$g,$h);
-
- eval(shift(@insns));
- eval(shift(@insns));
- &ld1_8 ("{@X[0]}","[$inp],#16");
- eval(shift(@insns));
- eval(shift(@insns));
- &ld1_32 ("{$T0}","[$Ktbl],#16");
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &rev32 (@X[0],@X[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &add_32 ($T0,$T0,@X[0]);
- foreach (@insns) { eval; } # remaining instructions
- &st1_32 ("{$T0}","[$Xfer], #16");
-
- push(@X,shift(@X)); # "rotate" X[]
-}
-
-sub body_00_15 () {
- (
- '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
- '&add ($h,$h,$t1)', # h+=X[i]+K[i]
- '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past
- '&and ($t1,$f,$e)',
- '&bic ($t4,$g,$e)',
- '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
- '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
- '&orr ($t1,$t1,$t4)', # Ch(e,f,g)
- '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
- '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
- '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
- '&ror ($t0,$t0,"#$Sigma1[0]")',
- '&eor ($t2,$a,$b)', # a^b, b^c in next round
- '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
- '&add ($h,$h,$t0)', # h+=Sigma1(e)
- '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
- '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
- '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
- '&ror ($t4,$t4,"#$Sigma0[0]")',
- '&add ($d,$d,$h)', # d+=h
- '&eor ($t3,$t3,$b)', # Maj(a,b,c)
- '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
- )
-}
-
-$code.=<<___;
-#ifdef __KERNEL__
-.globl sha256_block_neon
-#endif
-.type sha256_block_neon,%function
-.align 4
-sha256_block_neon:
-.Lneon_entry:
- stp x29, x30, [sp, #-16]!
- mov x29, sp
- sub sp,sp,#16*4
-
- adr $Ktbl,.LK256
- add $num,$inp,$num,lsl#6 // len to point at the end of inp
-
- ld1.8 {@X[0]},[$inp], #16
- ld1.8 {@X[1]},[$inp], #16
- ld1.8 {@X[2]},[$inp], #16
- ld1.8 {@X[3]},[$inp], #16
- ld1.32 {$T0},[$Ktbl], #16
- ld1.32 {$T1},[$Ktbl], #16
- ld1.32 {$T2},[$Ktbl], #16
- ld1.32 {$T3},[$Ktbl], #16
- rev32 @X[0],@X[0] // yes, even on
- rev32 @X[1],@X[1] // big-endian
- rev32 @X[2],@X[2]
- rev32 @X[3],@X[3]
- mov $Xfer,sp
- add.32 $T0,$T0,@X[0]
- add.32 $T1,$T1,@X[1]
- add.32 $T2,$T2,@X[2]
- st1.32 {$T0-$T1},[$Xfer], #32
- add.32 $T3,$T3,@X[3]
- st1.32 {$T2-$T3},[$Xfer]
- sub $Xfer,$Xfer,#32
-
- ldp $A,$B,[$ctx]
- ldp $C,$D,[$ctx,#8]
- ldp $E,$F,[$ctx,#16]
- ldp $G,$H,[$ctx,#24]
- ldr $t1,[sp,#0]
- mov $t2,wzr
- eor $t3,$B,$C
- mov $t4,wzr
- b .L_00_48
-
-.align 4
-.L_00_48:
-___
- &Xupdate(\&body_00_15);
- &Xupdate(\&body_00_15);
- &Xupdate(\&body_00_15);
- &Xupdate(\&body_00_15);
-$code.=<<___;
- cmp $t1,#0 // check for K256 terminator
- ldr $t1,[sp,#0]
- sub $Xfer,$Xfer,#64
- bne .L_00_48
-
- sub $Ktbl,$Ktbl,#256 // rewind $Ktbl
- cmp $inp,$num
- mov $Xfer, #64
- csel $Xfer, $Xfer, xzr, eq
- sub $inp,$inp,$Xfer // avoid SEGV
- mov $Xfer,sp
-___
- &Xpreload(\&body_00_15);
- &Xpreload(\&body_00_15);
- &Xpreload(\&body_00_15);
- &Xpreload(\&body_00_15);
-$code.=<<___;
- add $A,$A,$t4 // h+=Sigma0(a) from the past
- ldp $t0,$t1,[$ctx,#0]
- add $A,$A,$t2 // h+=Maj(a,b,c) from the past
- ldp $t2,$t3,[$ctx,#8]
- add $A,$A,$t0 // accumulate
- add $B,$B,$t1
- ldp $t0,$t1,[$ctx,#16]
- add $C,$C,$t2
- add $D,$D,$t3
- ldp $t2,$t3,[$ctx,#24]
- add $E,$E,$t0
- add $F,$F,$t1
- ldr $t1,[sp,#0]
- stp $A,$B,[$ctx,#0]
- add $G,$G,$t2
- mov $t2,wzr
- stp $C,$D,[$ctx,#8]
- add $H,$H,$t3
- stp $E,$F,[$ctx,#16]
- eor $t3,$B,$C
- stp $G,$H,[$ctx,#24]
- mov $t4,wzr
- mov $Xfer,sp
- b.ne .L_00_48
-
- ldr x29,[x29]
- add sp,sp,#16*4+16
- ret
-.size sha256_block_neon,.-sha256_block_neon
-___
-}
-
-$code.=<<___;
-#ifndef __KERNEL__
-.comm OPENSSL_armcap_P,4,4
-#endif
-___
-
-{ my %opcode = (
- "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000,
- "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 );
-
- sub unsha256 {
- my ($mnemonic,$arg)=@_;
-
- $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
- &&
- sprintf ".inst\t0x%08x\t//%s %s",
- $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
- $mnemonic,$arg;
- }
-}
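
unsha256() above hand-assembles the SHA-2 crypto instructions as raw .inst
words for assemblers that predate them, packing the three register indices
into the base opcodes from the %opcode table. A stand-alone C model of that
packing (enc3() is an invented name for illustration):

#include <stdint.h>
#include <stdio.h>

static uint32_t enc3(uint32_t base, unsigned int rd, unsigned int rn,
		     unsigned int rm)
{
	return base | rd | (rn << 5) | (rm << 16);
}

int main(void)
{
	/* per the table above: sha256h v0,v1,v2 -> .inst 0x5e024020 */
	printf(".inst\t0x%08x\t// sha256h v0,v1,v2\n",
	       enc3(0x5e004000, 0, 1, 2));
	return 0;
}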
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/\/\// and !/^$/);
- print;
-}
-close SELF;
-
-foreach(split("\n",$code)) {
-
- s/\`([^\`]*)\`/eval($1)/ge;
-
- s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;
-
- s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers
-
- s/\.[ui]?8(\s)/$1/;
- s/\.\w?32\b// and s/\.16b/\.4s/g;
- m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g;
-
- print $_,"\n";
-}
-
-close STDOUT;
diff --git a/arch/arm64/lib/crypto/sha256-ce.S b/arch/arm64/lib/crypto/sha256-ce.S
deleted file mode 100644
index f3e21c6d87d2..000000000000
--- a/arch/arm64/lib/crypto/sha256-ce.S
+++ /dev/null
@@ -1,136 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * sha256-ce.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions
- *
- * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .text
- .arch armv8-a+crypto
-
- dga .req q20
- dgav .req v20
- dgb .req q21
- dgbv .req v21
-
- t0 .req v22
- t1 .req v23
-
- dg0q .req q24
- dg0v .req v24
- dg1q .req q25
- dg1v .req v25
- dg2q .req q26
- dg2v .req v26
-
- .macro add_only, ev, rc, s0
- mov dg2v.16b, dg0v.16b
- .ifeq \ev
- add t1.4s, v\s0\().4s, \rc\().4s
- sha256h dg0q, dg1q, t0.4s
- sha256h2 dg1q, dg2q, t0.4s
- .else
- .ifnb \s0
- add t0.4s, v\s0\().4s, \rc\().4s
- .endif
- sha256h dg0q, dg1q, t1.4s
- sha256h2 dg1q, dg2q, t1.4s
- .endif
- .endm
-
- .macro add_update, ev, rc, s0, s1, s2, s3
- sha256su0 v\s0\().4s, v\s1\().4s
- add_only \ev, \rc, \s1
- sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s
- .endm
-
- /*
- * The SHA-256 round constants
- */
- .section ".rodata", "a"
- .align 4
-.Lsha2_rcon:
- .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
- .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
- .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
- .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
- .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
- .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
- .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
- .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
- .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
- .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
- .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
- .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
- .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
- .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
- .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
- .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-
- /*
- * size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
- * const u8 *data, size_t nblocks);
- */
- .text
-SYM_FUNC_START(__sha256_ce_transform)
- /* load round constants */
- adr_l x8, .Lsha2_rcon
- ld1 { v0.4s- v3.4s}, [x8], #64
- ld1 { v4.4s- v7.4s}, [x8], #64
- ld1 { v8.4s-v11.4s}, [x8], #64
- ld1 {v12.4s-v15.4s}, [x8]
-
- /* load state */
- ld1 {dgav.4s, dgbv.4s}, [x0]
-
- /* load input */
-0: ld1 {v16.4s-v19.4s}, [x1], #64
- sub x2, x2, #1
-
-CPU_LE( rev32 v16.16b, v16.16b )
-CPU_LE( rev32 v17.16b, v17.16b )
-CPU_LE( rev32 v18.16b, v18.16b )
-CPU_LE( rev32 v19.16b, v19.16b )
-
- add t0.4s, v16.4s, v0.4s
- mov dg0v.16b, dgav.16b
- mov dg1v.16b, dgbv.16b
-
- add_update 0, v1, 16, 17, 18, 19
- add_update 1, v2, 17, 18, 19, 16
- add_update 0, v3, 18, 19, 16, 17
- add_update 1, v4, 19, 16, 17, 18
-
- add_update 0, v5, 16, 17, 18, 19
- add_update 1, v6, 17, 18, 19, 16
- add_update 0, v7, 18, 19, 16, 17
- add_update 1, v8, 19, 16, 17, 18
-
- add_update 0, v9, 16, 17, 18, 19
- add_update 1, v10, 17, 18, 19, 16
- add_update 0, v11, 18, 19, 16, 17
- add_update 1, v12, 19, 16, 17, 18
-
- add_only 0, v13, 17
- add_only 1, v14, 18
- add_only 0, v15, 19
- add_only 1
-
- /* update state */
- add dgav.4s, dgav.4s, dg0v.4s
- add dgbv.4s, dgbv.4s, dg1v.4s
-
- /* return early if voluntary preemption is needed */
- cond_yield 1f, x5, x6
-
- /* handled all input blocks? */
- cbnz x2, 0b
-
- /* store new state */
-1: st1 {dgav.4s, dgbv.4s}, [x0]
- mov x0, x2
- ret
-SYM_FUNC_END(__sha256_ce_transform)
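
Note the contract encoded by cond_yield and the final "mov x0, x2" above:
the transform may stop early when voluntary preemption is needed and
returns the number of unprocessed blocks, which the C glue (next file)
feeds back in a loop. A minimal stand-alone model of that caller contract,
with invented names:

#include <stddef.h>
#include <stdint.h>

#define BLOCK_SIZE 64			/* SHA256_BLOCK_SIZE */

/* step() stands in for __sha256_ce_transform(); it returns blocks left */
static void run_to_completion(uint32_t state[8], const uint8_t *data,
			      size_t nblocks,
			      size_t (*step)(uint32_t *, const uint8_t *,
					     size_t))
{
	while (nblocks) {
		size_t rem = step(state, data, nblocks);

		data += (nblocks - rem) * BLOCK_SIZE;
		nblocks = rem;
	}
}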
diff --git a/arch/arm64/lib/crypto/sha256.c b/arch/arm64/lib/crypto/sha256.c
deleted file mode 100644
index bcf7a3adc0c4..000000000000
--- a/arch/arm64/lib/crypto/sha256.c
+++ /dev/null
@@ -1,75 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 optimized for ARM64
- *
- * Copyright 2025 Google LLC
- */
-#include <asm/neon.h>
-#include <crypto/internal/sha2.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-asmlinkage void sha256_block_neon(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-asmlinkage size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
-
-void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks)
-{
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
- static_branch_likely(&have_neon)) {
- if (static_branch_likely(&have_ce)) {
- do {
- size_t rem;
-
- kernel_neon_begin();
- rem = __sha256_ce_transform(state,
- data, nblocks);
- kernel_neon_end();
- data += (nblocks - rem) * SHA256_BLOCK_SIZE;
- nblocks = rem;
- } while (nblocks);
- } else {
- kernel_neon_begin();
- sha256_block_neon(state, data, nblocks);
- kernel_neon_end();
- }
- } else {
- sha256_blocks_arch(state, data, nblocks);
- }
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_simd);
-
-bool sha256_is_arch_optimized(void)
-{
-	/* We can always use at least the ARM64 scalar implementation. */
- return true;
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
-static int __init sha256_arm64_mod_init(void)
-{
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
- cpu_have_named_feature(ASIMD)) {
- static_branch_enable(&have_neon);
- if (cpu_have_named_feature(SHA2))
- static_branch_enable(&have_ce);
- }
- return 0;
-}
-subsys_initcall(sha256_arm64_mod_init);
-
-static void __exit sha256_arm64_mod_exit(void)
-{
-}
-module_exit(sha256_arm64_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-256 optimized for ARM64");
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index ec0a337891dd..11eb8d1adc84 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -487,17 +487,29 @@ static void do_bad_area(unsigned long far, unsigned long esr,
}
}
-static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma,
- unsigned int mm_flags)
+static bool fault_from_pkey(struct vm_area_struct *vma, unsigned int mm_flags)
{
- unsigned long iss2 = ESR_ELx_ISS2(esr);
-
if (!system_supports_poe())
return false;
- if (esr_fsc_is_permission_fault(esr) && (iss2 & ESR_ELx_Overlay))
- return true;
-
+ /*
+ * We do not check whether an Overlay fault has occurred because we
+ * cannot make a decision based solely on its value:
+ *
+ * - If Overlay is set, a fault did occur due to POE, but it may be
+ * spurious in those cases where we update POR_EL0 without ISB (e.g.
+ * on context-switch). We would then need to manually check POR_EL0
+ * against vma_pkey(vma), which is exactly what
+ * arch_vma_access_permitted() does.
+ *
+ * - If Overlay is not set, we may still need to report a pkey fault.
+ * This is the case if an access was made within a mapping but with no
+ * page mapped, and POR_EL0 forbids the access (according to
+	 * vma_pkey()). Such an access will result in a SIGSEGV regardless,
+	 * because core code checks arch_vma_access_permitted(), but in order
+ * to report the correct error code - SEGV_PKUERR - we must handle
+ * that case here.
+ */
return !arch_vma_access_permitted(vma,
mm_flags & FAULT_FLAG_WRITE,
mm_flags & FAULT_FLAG_INSTRUCTION,
@@ -635,7 +647,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
goto bad_area;
}
- if (fault_from_pkey(esr, vma, mm_flags)) {
+ if (fault_from_pkey(vma, mm_flags)) {
pkey = vma_pkey(vma);
vma_end_read(vma);
fault = 0;
@@ -679,7 +691,7 @@ retry:
goto bad_area;
}
- if (fault_from_pkey(esr, vma, mm_flags)) {
+ if (fault_from_pkey(vma, mm_flags)) {
pkey = vma_pkey(vma);
mmap_read_unlock(mm);
fault = 0;
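
The new comment above explains why the Overlay syndrome bit alone cannot
decide a pkey fault, so the code defers to arch_vma_access_permitted(),
which ultimately decodes the POR_EL0 field selected by vma_pkey(vma). A
simplified stand-alone model of such a check (POR_EL0 really does hold one
4-bit field per protection key, but the bit meanings below are a toy
encoding for illustration, not the architected one):

#include <stdbool.h>
#include <stdint.h>

#define POR_BITS_PER_PKEY	4

static bool pkey_access_permitted(uint64_t por_el0, unsigned int pkey,
				  bool write)
{
	uint64_t perm = (por_el0 >> (pkey * POR_BITS_PER_PKEY)) & 0xf;

	/* toy encoding: bit 0 = read permitted, bit 1 = write permitted */
	return write ? !!(perm & 0x2) : !!(perm & 0x1);
}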
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 8fcf59ba39db..00ab1d648db6 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1305,7 +1305,8 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
next = addr;
end = addr + PUD_SIZE;
do {
- pmd_free_pte_page(pmdp, next);
+ if (pmd_present(pmdp_get(pmdp)))
+ pmd_free_pte_page(pmdp, next);
} while (pmdp++, next += PMD_SIZE, next != end);
pud_clear(pudp);
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 80d470aa469d..54dccfd6aa11 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -518,7 +518,6 @@ alternative_else_nop_endif
msr REG_PIR_EL1, x0
orr tcr2, tcr2, TCR2_EL1_PIE
- msr REG_TCR2_EL1, x0
.Lskip_indirection:
diff --git a/arch/arm64/tools/syscall_32.tbl b/arch/arm64/tools/syscall_32.tbl
index 0765b3a8d6d6..8d9088bc577d 100644
--- a/arch/arm64/tools/syscall_32.tbl
+++ b/arch/arm64/tools/syscall_32.tbl
@@ -479,3 +479,5 @@
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr