69 files changed, 1030 insertions, 727 deletions
diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms
index 251ecf34cb02..a231815f549b 100644
--- a/arch/arm64/Kconfig.platforms
+++ b/arch/arm64/Kconfig.platforms
@@ -112,12 +112,13 @@ config ARCH_MESON
 	bool "Amlogic Platforms"
 	select PINCTRL
 	select PINCTRL_MESON
-	select COMMON_CLK_AMLOGIC
 	select COMMON_CLK_GXBB
 	select COMMON_CLK_AXG
+	select COMMON_CLK_G12A
 	select MESON_IRQ_GPIO
 	help
-	  This enables support for the Amlogic S905 SoCs.
+	  This enables support for the arm64 based Amlogic SoCs
+	  such as the s905, S905X/D, S912, A113X/D or S905X/D2
 
 config ARCH_MVEBU
 	bool "Marvell EBU SoC Family"
@@ -146,6 +147,10 @@ config ARCH_MXC
 	bool "ARMv8 based NXP i.MX SoC family"
 	select ARM64_ERRATUM_843419
 	select ARM64_ERRATUM_845719
+	select IMX_GPCV2
+	select IMX_GPCV2_PM_DOMAINS
+	select PM
+	select PM_GENERIC_DOMAINS
 	help
 	  This enables support for the ARMv8 based SoCs in the
 	  NXP i.MX family.
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-orangepi-win.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-orangepi-win.dts
index 281bf9fe5ca2..510f661229dc 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-orangepi-win.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-orangepi-win.dts
@@ -188,6 +188,7 @@
 		reg = <0x3a3>;
 		interrupt-parent = <&r_intc>;
 		interrupts = <0 IRQ_TYPE_LEVEL_LOW>;
+		x-powers,drive-vbus-en; /* set N_VBUSEN as output pin */
 	};
 };
 
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
index 631b451c5aa5..e628d063931b 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
@@ -400,7 +400,7 @@
 		};
 
 		video-codec@1c0e000 {
-			compatible = "allwinner,sun50i-h5-video-engine";
+			compatible = "allwinner,sun50i-a64-video-engine";
 			reg = <0x01c0e000 0x1000>;
 			clocks = <&ccu CLK_BUS_VE>, <&ccu CLK_VE>,
 				 <&ccu CLK_DRAM_VE>;
diff --git a/arch/arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi b/arch/arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi
index e14e0ce7e89f..016641a41694 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi
@@ -187,8 +187,7 @@
 	max-frequency = <100000000>;
 	disable-wp;
 
-	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_HIGH>;
-	cd-inverted;
+	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_LOW>;
 
 	vmmc-supply = <&vddao_3v3>;
 	vqmmc-supply = <&vddio_boot>;
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts b/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts
index 8cd50b75171d..ade2ee09ae96 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts
+++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts
@@ -305,8 +305,7 @@
 	max-frequency = <200000000>;
 	disable-wp;
 
-	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_HIGH>;
-	cd-inverted;
+	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_LOW>;
 
 	vmmc-supply = <&vddio_ao3v3>;
 	vqmmc-supply = <&vddio_tf>;
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts b/arch/arm64/boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts
index 4cf7f6e80c6a..25105ac96d55 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts
+++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts
@@ -238,8 +238,7 @@
 	max-frequency = <100000000>;
 	disable-wp;
 
-	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_HIGH>;
-	cd-inverted;
+	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_LOW>;
 
 	vmmc-supply = <&vddao_3v3>;
 	vqmmc-supply = <&vddio_card>;
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts b/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts
index 2e1cd5e3a246..1cc9dc68ef00 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts
+++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts
@@ -258,8 +258,7 @@
 	max-frequency = <100000000>;
 	disable-wp;
 
-	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_HIGH>;
-	cd-inverted;
+	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_LOW>;
 
 	vmmc-supply = <&tflash_vdd>;
 	vqmmc-supply = <&tf_io>;
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi
index ce862266b9aa..0be0f2a5d2fe 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi
@@ -196,8 +196,7 @@
 	max-frequency = <100000000>;
 	disable-wp;
 
-	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_HIGH>;
-	cd-inverted;
+	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_LOW>;
 
 	vmmc-supply = <&vddao_3v3>;
 	vqmmc-supply = <&vddio_card>;
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-vega-s95.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb-vega-s95.dtsi
index 93a4acf2c46c..ad4d50bd9d77 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxbb-vega-s95.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-vega-s95.dtsi
@@ -154,8 +154,7 @@
 	max-frequency = <100000000>;
 	disable-wp;
 
-	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_HIGH>;
-	cd-inverted;
+	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_LOW>;
 
 	vmmc-supply = <&vcc_3v3>;
 };
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-wetek.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb-wetek.dtsi
index ec09bb5792b7..2d2db783c44c 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxbb-wetek.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-wetek.dtsi
@@ -211,8 +211,7 @@
 	max-frequency = <100000000>;
 	disable-wp;
 
-	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_HIGH>;
-	cd-inverted;
+	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_LOW>;
 
 	vmmc-supply = <&vddao_3v3>;
 	vqmmc-supply = <&vcc_3v3>;
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts
index f1c410e2da2b..796baea7a0bf 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts
+++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts
@@ -131,8 +131,7 @@
 	max-frequency = <100000000>;
 	disable-wp;
 
-	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_HIGH>;
-	cd-inverted;
+	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_LOW>;
 
 	vmmc-supply = <&vddao_3v3>;
 	vqmmc-supply = <&vddio_card>;
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts
index db293440e4ca..255cede7b447 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts
+++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts
@@ -238,8 +238,7 @@
 	max-frequency = <100000000>;
 	disable-wp;
 
-	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_HIGH>;
-	cd-inverted;
+	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_LOW>;
 
 	vmmc-supply = <&vcc_3v3>;
 	vqmmc-supply = <&vcc_card>;
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts
index 6739697be1de..9cbdb85fb591 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts
+++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts
@@ -183,8 +183,7 @@
 	max-frequency = <100000000>;
 	disable-wp;
 
-	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_HIGH>;
-	cd-inverted;
+	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_LOW>;
 
 	vmmc-supply = <&vddao_3v3>;
 	vqmmc-supply = <&vddio_card>;
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-p212.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-p212.dtsi
index a1b31013ab6e..bc811a2faf42 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-p212.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-p212.dtsi
@@ -137,8 +137,7 @@
 	max-frequency = <100000000>;
 	disable-wp;
 
-	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_HIGH>;
-	cd-inverted;
+	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_LOW>;
 
 	vmmc-supply = <&vddao_3v3>;
 	vqmmc-supply = <&vddio_boot>;
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxm-khadas-vim2.dts b/arch/arm64/boot/dts/amlogic/meson-gxm-khadas-vim2.dts
index 3c3a667a8df8..3f086ed7de05 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxm-khadas-vim2.dts
+++ b/arch/arm64/boot/dts/amlogic/meson-gxm-khadas-vim2.dts
@@ -356,8 +356,7 @@
 	max-frequency = <100000000>;
 	disable-wp;
 
-	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_HIGH>;
-	cd-inverted;
+	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_LOW>;
 
 	vmmc-supply = <&vddao_3v3>;
 	vqmmc-supply = <&vddio_boot>;
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxm-nexbox-a1.dts b/arch/arm64/boot/dts/amlogic/meson-gxm-nexbox-a1.dts
index f7a1cffab4a8..8acfd40090d2 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxm-nexbox-a1.dts
+++ b/arch/arm64/boot/dts/amlogic/meson-gxm-nexbox-a1.dts
@@ -147,8 +147,7 @@
 	max-frequency = <100000000>;
 	disable-wp;
 
-	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_HIGH>;
-	cd-inverted;
+	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_LOW>;
 
 	vmmc-supply = <&vddao_3v3>;
 	vqmmc-supply = <&vddio_boot>;
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxm-rbox-pro.dts b/arch/arm64/boot/dts/amlogic/meson-gxm-rbox-pro.dts
index 7212dc4531e4..7fa20a8ede17 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxm-rbox-pro.dts
+++ b/arch/arm64/boot/dts/amlogic/meson-gxm-rbox-pro.dts
@@ -170,8 +170,7 @@
 	max-frequency = <100000000>;
 	disable-wp;
 
-	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_HIGH>;
-	cd-inverted;
+	cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_LOW>;
 
 	vmmc-supply = <&vddao_3v3>;
 	vqmmc-supply = <&vddio_boot>;
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a-rdb.dts b/arch/arm64/boot/dts/freescale/fsl-ls1028a-rdb.dts
index fdeb4176fc33..f86b054a74ae 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1028a-rdb.dts
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a-rdb.dts
@@ -71,3 +71,20 @@
 &duart1 {
 	status = "okay";
 };
+
+&enetc_port0 {
+	phy-handle = <&sgmii_phy0>;
+	phy-connection-type = "sgmii";
+
+	mdio {
+		#address-cells = <1>;
+		#size-cells = <0>;
+		sgmii_phy0: ethernet-phy@2 {
+			reg = <0x2>;
+		};
+	};
+};
+
+&enetc_port1 {
+	status = "disabled";
+};
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi
index a8cf92af05fb..2896bbcfa3bb 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi
@@ -335,5 +335,40 @@
 				     <GIC_SPI 206 IRQ_TYPE_LEVEL_HIGH>, <GIC_SPI 207 IRQ_TYPE_LEVEL_HIGH>,
 				     <GIC_SPI 208 IRQ_TYPE_LEVEL_HIGH>, <GIC_SPI 209 IRQ_TYPE_LEVEL_HIGH>;
 		};
+
+		pcie@1f0000000 { /* Integrated Endpoint Root Complex */
+			compatible = "pci-host-ecam-generic";
+			reg = <0x01 0xf0000000 0x0 0x100000>;
+			#address-cells = <3>;
+			#size-cells = <2>;
+			#interrupt-cells = <1>;
+			msi-parent = <&its>;
+			device_type = "pci";
+			bus-range = <0x0 0x0>;
+			dma-coherent;
+			msi-map = <0 &its 0x17 0xe>;
+			iommu-map = <0 &smmu 0x17 0xe>;
+				  /* PF0-6 BAR0 - non-prefetchable memory */
+			ranges = <0x82000000 0x0 0x00000000  0x1 0xf8000000  0x0 0x160000
+				  /* PF0-6 BAR2 - prefetchable memory */
+				  0xc2000000 0x0 0x00000000  0x1 0xf8160000  0x0 0x070000
+				  /* PF0: VF0-1 BAR0 - non-prefetchable memory */
+				  0x82000000 0x0 0x00000000  0x1 0xf81d0000  0x0 0x020000
+				  /* PF0: VF0-1 BAR2 - prefetchable memory */
+				  0xc2000000 0x0 0x00000000  0x1 0xf81f0000  0x0 0x020000
+				  /* PF1: VF0-1 BAR0 - non-prefetchable memory */
+				  0x82000000 0x0 0x00000000  0x1 0xf8210000  0x0 0x020000
+				  /* PF1: VF0-1 BAR2 - prefetchable memory */
+				  0xc2000000 0x0 0x00000000  0x1 0xf8230000  0x0 0x020000>;
+
+			enetc_port0: ethernet@0,0 {
+				compatible = "fsl,enetc";
+				reg = <0x000000 0 0 0 0>;
+			};
+			enetc_port1: ethernet@0,1 {
+				compatible = "fsl,enetc";
+				reg = <0x000100 0 0 0 0>;
+			};
+		};
 	};
 };
diff --git a/arch/arm64/boot/dts/freescale/imx8mq-evk.dts b/arch/arm64/boot/dts/freescale/imx8mq-evk.dts
index d3fcb8872948..54737bf1772f 100644
--- a/arch/arm64/boot/dts/freescale/imx8mq-evk.dts
+++ b/arch/arm64/boot/dts/freescale/imx8mq-evk.dts
@@ -274,34 +274,34 @@
 
 	pinctrl_usdhc1_100mhz: usdhc1-100grp {
 		fsl,pins = <
-			MX8MQ_IOMUXC_SD1_CLK_USDHC1_CLK			0x85
-			MX8MQ_IOMUXC_SD1_CMD_USDHC1_CMD			0xc5
-			MX8MQ_IOMUXC_SD1_DATA0_USDHC1_DATA0		0xc5
-			MX8MQ_IOMUXC_SD1_DATA1_USDHC1_DATA1		0xc5
-			MX8MQ_IOMUXC_SD1_DATA2_USDHC1_DATA2		0xc5
-			MX8MQ_IOMUXC_SD1_DATA3_USDHC1_DATA3		0xc5
-			MX8MQ_IOMUXC_SD1_DATA4_USDHC1_DATA4		0xc5
-			MX8MQ_IOMUXC_SD1_DATA5_USDHC1_DATA5		0xc5
-			MX8MQ_IOMUXC_SD1_DATA6_USDHC1_DATA6		0xc5
-			MX8MQ_IOMUXC_SD1_DATA7_USDHC1_DATA7		0xc5
-			MX8MQ_IOMUXC_SD1_STROBE_USDHC1_STROBE		0x85
+			MX8MQ_IOMUXC_SD1_CLK_USDHC1_CLK			0x8d
+			MX8MQ_IOMUXC_SD1_CMD_USDHC1_CMD			0xcd
+			MX8MQ_IOMUXC_SD1_DATA0_USDHC1_DATA0		0xcd
+			MX8MQ_IOMUXC_SD1_DATA1_USDHC1_DATA1		0xcd
+			MX8MQ_IOMUXC_SD1_DATA2_USDHC1_DATA2		0xcd
+			MX8MQ_IOMUXC_SD1_DATA3_USDHC1_DATA3		0xcd
+			MX8MQ_IOMUXC_SD1_DATA4_USDHC1_DATA4		0xcd
+			MX8MQ_IOMUXC_SD1_DATA5_USDHC1_DATA5		0xcd
+			MX8MQ_IOMUXC_SD1_DATA6_USDHC1_DATA6		0xcd
+			MX8MQ_IOMUXC_SD1_DATA7_USDHC1_DATA7		0xcd
+			MX8MQ_IOMUXC_SD1_STROBE_USDHC1_STROBE		0x8d
 			MX8MQ_IOMUXC_SD1_RESET_B_USDHC1_RESET_B		0xc1
 		>;
 	};
 
 	pinctrl_usdhc1_200mhz: usdhc1-200grp {
 		fsl,pins = <
-			MX8MQ_IOMUXC_SD1_CLK_USDHC1_CLK			0x87
-			MX8MQ_IOMUXC_SD1_CMD_USDHC1_CMD			0xc7
-			MX8MQ_IOMUXC_SD1_DATA0_USDHC1_DATA0		0xc7
-			MX8MQ_IOMUXC_SD1_DATA1_USDHC1_DATA1		0xc7
-			MX8MQ_IOMUXC_SD1_DATA2_USDHC1_DATA2		0xc7
-			MX8MQ_IOMUXC_SD1_DATA3_USDHC1_DATA3		0xc7
-			MX8MQ_IOMUXC_SD1_DATA4_USDHC1_DATA4		0xc7
-			MX8MQ_IOMUXC_SD1_DATA5_USDHC1_DATA5		0xc7
-			MX8MQ_IOMUXC_SD1_DATA6_USDHC1_DATA6		0xc7
-			MX8MQ_IOMUXC_SD1_DATA7_USDHC1_DATA7		0xc7
-			MX8MQ_IOMUXC_SD1_STROBE_USDHC1_STROBE		0x87
+			MX8MQ_IOMUXC_SD1_CLK_USDHC1_CLK			0x9f
+			MX8MQ_IOMUXC_SD1_CMD_USDHC1_CMD			0xdf
+			MX8MQ_IOMUXC_SD1_DATA0_USDHC1_DATA0		0xdf
+			MX8MQ_IOMUXC_SD1_DATA1_USDHC1_DATA1		0xdf
+			MX8MQ_IOMUXC_SD1_DATA2_USDHC1_DATA2		0xdf
+			MX8MQ_IOMUXC_SD1_DATA3_USDHC1_DATA3		0xdf
+			MX8MQ_IOMUXC_SD1_DATA4_USDHC1_DATA4		0xdf
+			MX8MQ_IOMUXC_SD1_DATA5_USDHC1_DATA5		0xdf
+			MX8MQ_IOMUXC_SD1_DATA6_USDHC1_DATA6		0xdf
+			MX8MQ_IOMUXC_SD1_DATA7_USDHC1_DATA7		0xdf
+			MX8MQ_IOMUXC_SD1_STROBE_USDHC1_STROBE		0x9f
 			MX8MQ_IOMUXC_SD1_RESET_B_USDHC1_RESET_B		0xc1
 		>;
 	};
diff --git a/arch/arm64/boot/dts/freescale/imx8mq.dtsi b/arch/arm64/boot/dts/freescale/imx8mq.dtsi
index c626ccbbc500..9155bd4784eb 100644
--- a/arch/arm64/boot/dts/freescale/imx8mq.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mq.dtsi
@@ -538,6 +538,8 @@
 				         <&clk IMX8MQ_CLK_NAND_USDHC_BUS>,
 				         <&clk IMX8MQ_CLK_USDHC1_ROOT>;
 				clock-names = "ipg", "ahb", "per";
+				assigned-clocks = <&clk IMX8MQ_CLK_USDHC1>;
+				assigned-clock-rates = <400000000>;
 				fsl,tuning-start-tap = <20>;
 				fsl,tuning-step = <2>;
 				bus-width = <4>;
diff --git a/arch/arm64/boot/dts/marvell/armada-8040-clearfog-gt-8k.dts b/arch/arm64/boot/dts/marvell/armada-8040-clearfog-gt-8k.dts
index 5b4a9609e31f..2468762283a5 100644
--- a/arch/arm64/boot/dts/marvell/armada-8040-clearfog-gt-8k.dts
+++ b/arch/arm64/boot/dts/marvell/armada-8040-clearfog-gt-8k.dts
@@ -351,7 +351,7 @@
 		reg = <0>;
 		pinctrl-names = "default";
 		pinctrl-0 = <&cp0_copper_eth_phy_reset>;
-		reset-gpios = <&cp1_gpio1 11 GPIO_ACTIVE_LOW>;
+		reset-gpios = <&cp0_gpio2 11 GPIO_ACTIVE_LOW>;
 		reset-assert-us = <10000>;
 	};
 
diff --git a/arch/arm64/boot/dts/qcom/msm8998.dtsi b/arch/arm64/boot/dts/qcom/msm8998.dtsi
index 5091ac07a2af..3fd0769fe648 100644
--- a/arch/arm64/boot/dts/qcom/msm8998.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8998.dtsi
@@ -38,7 +38,7 @@
 		};
 
 		memory@86200000 {
-			reg = <0x0 0x86200000 0x0 0x2600000>;
+			reg = <0x0 0x86200000 0x0 0x2d00000>;
 			no-map;
 		};
 
diff --git a/arch/arm64/boot/dts/renesas/r8a774a1.dtsi b/arch/arm64/boot/dts/renesas/r8a774a1.dtsi
index cac787a630a4..ef3cff2dd1b6 100644
--- a/arch/arm64/boot/dts/renesas/r8a774a1.dtsi
+++ b/arch/arm64/boot/dts/renesas/r8a774a1.dtsi
@@ -1011,6 +1011,9 @@
 				 <&cpg CPG_CORE R8A774A1_CLK_S3D1>,
 				 <&scif_clk>;
 			clock-names = "fck", "brg_int", "scif_clk";
+			dmas = <&dmac1 0x13>, <&dmac1 0x12>,
+			       <&dmac2 0x13>, <&dmac2 0x12>;
+			dma-names = "tx", "rx", "tx", "rx";
 			power-domains = <&sysc R8A774A1_PD_ALWAYS_ON>;
 			resets = <&cpg 310>;
 			status = "disabled";
diff --git a/arch/arm64/boot/dts/renesas/r8a7796.dtsi b/arch/arm64/boot/dts/renesas/r8a7796.dtsi
index 23de63f3d6c3..cdf784899cf8 100644
--- a/arch/arm64/boot/dts/renesas/r8a7796.dtsi
+++ b/arch/arm64/boot/dts/renesas/r8a7796.dtsi
@@ -1262,6 +1262,9 @@
 				 <&cpg CPG_CORE R8A7796_CLK_S3D1>,
 				 <&scif_clk>;
 			clock-names = "fck", "brg_int", "scif_clk";
+			dmas = <&dmac1 0x13>, <&dmac1 0x12>,
+			       <&dmac2 0x13>, <&dmac2 0x12>;
+			dma-names = "tx", "rx", "tx", "rx";
 			power-domains = <&sysc R8A7796_PD_ALWAYS_ON>;
 			resets = <&cpg 310>;
 			status = "disabled";
diff --git a/arch/arm64/boot/dts/renesas/r8a77965.dtsi b/arch/arm64/boot/dts/renesas/r8a77965.dtsi
index 979f14d1fcc4..9763d108e183 100644
--- a/arch/arm64/boot/dts/renesas/r8a77965.dtsi
+++ b/arch/arm64/boot/dts/renesas/r8a77965.dtsi
@@ -1068,6 +1068,9 @@
 				 <&cpg CPG_CORE R8A77965_CLK_S3D1>,
 				 <&scif_clk>;
 			clock-names = "fck", "brg_int", "scif_clk";
+			dmas = <&dmac1 0x13>, <&dmac1 0x12>,
+			       <&dmac2 0x13>, <&dmac2 0x12>;
+			dma-names = "tx", "rx", "tx", "rx";
 			power-domains = <&sysc R8A77965_PD_ALWAYS_ON>;
 			resets = <&cpg 310>;
 			status = "disabled";
diff --git a/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts b/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts
index 77db0dec2fdb..2157a528276b 100644
--- a/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts
@@ -40,6 +40,7 @@
 		pinctrl-0 = <&usb30_host_drv>;
 		regulator-name = "vcc_host_5v";
 		regulator-always-on;
+		regulator-boot-on;
 		vin-supply = <&vcc_sys>;
 	};
 
@@ -51,6 +52,7 @@
 		pinctrl-0 = <&usb20_host_drv>;
 		regulator-name = "vcc_host1_5v";
 		regulator-always-on;
+		regulator-boot-on;
 		vin-supply = <&vcc_sys>;
 	};
 
diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru-bob.dts b/arch/arm64/boot/dts/rockchip/rk3399-gru-bob.dts
index 1ee0dc0d9f10..d1cf404b8708 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399-gru-bob.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3399-gru-bob.dts
@@ -22,7 +22,7 @@
 		backlight = <&backlight>;
 		power-supply = <&pp3300_disp>;
 
-		ports {
+		port {
 			panel_in_edp: endpoint {
 				remote-endpoint = <&edp_out_panel>;
 			};
diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi
index c400be64170e..931640e9aed4 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi
@@ -200,6 +200,19 @@
 		pinctrl-0 = <&bl_en>;
 		pwm-delay-us = <10000>;
 	};
+
+	gpio_keys: gpio-keys {
+		compatible = "gpio-keys";
+		pinctrl-names = "default";
+		pinctrl-0 = <&bt_host_wake_l>;
+
+		wake_on_bt: wake-on-bt {
+			label = "Wake-on-Bluetooth";
+			gpios = <&gpio0 3 GPIO_ACTIVE_LOW>;
+			linux,code = <KEY_WAKEUP>;
+			wakeup-source;
+		};
+	};
 };
 
 &ppvar_bigcpu {
diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru-kevin.dts b/arch/arm64/boot/dts/rockchip/rk3399-gru-kevin.dts
index 81e73103fa78..15e254a77391 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399-gru-kevin.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3399-gru-kevin.dts
@@ -43,7 +43,7 @@
 		backlight = <&backlight>;
 		power-supply = <&pp3300_disp>;
 
-		ports {
+		port {
 			panel_in_edp: endpoint {
 				remote-endpoint = <&edp_out_panel>;
 			};
diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi
index fc50b3ef758c..62ea7d6a7d4a 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi
@@ -175,6 +175,21 @@
 		pinctrl-0 = <&dmic_en>;
 		wakeup-delay-ms = <250>;
 	};
+
+	gpio_keys: gpio-keys {
+		compatible = "gpio-keys";
+		pinctrl-names = "default";
+		pinctrl-0 = <&pen_eject_odl>;
+
+		pen-insert {
+			label = "Pen Insert";
+			/* Insert = low, eject = high */
+			gpios = <&gpio1 1 GPIO_ACTIVE_LOW>;
+			linux,code = <SW_PEN_INSERTED>;
+			linux,input-type = <EV_SW>;
+			wakeup-source;
+		};
+	};
 };
 
 /* pp900_s0 aliases */
@@ -328,20 +343,6 @@ camera: &i2c7 {
 		<400000000>;
 };
 
-&gpio_keys {
-	pinctrl-names = "default";
-	pinctrl-0 = <&bt_host_wake_l>, <&pen_eject_odl>;
-
-	pen-insert {
-		label = "Pen Insert";
-		/* Insert = low, eject = high */
-		gpios = <&gpio1 1 GPIO_ACTIVE_LOW>;
-		linux,code = <SW_PEN_INSERTED>;
-		linux,input-type = <EV_SW>;
-		wakeup-source;
-	};
-};
-
 &i2c_tunnel {
 	google,remote-bus = <0>;
 };
@@ -437,8 +438,19 @@ camera: &i2c7 {
 	status = "okay";
 };
 
-&wake_on_bt {
-	gpios = <&gpio1 2 GPIO_ACTIVE_LOW>;
+&usb_host0_ohci {
+	#address-cells = <1>;
+	#size-cells = <0>;
+
+	qca_bt: bluetooth@1 {
+		compatible = "usbcf3,e300", "usb4ca,301a";
+		reg = <1>;
+		pinctrl-names = "default";
+		pinctrl-0 = <&bt_host_wake_l>;
+		interrupt-parent = <&gpio1>;
+		interrupts = <2 IRQ_TYPE_LEVEL_HIGH>;
+		interrupt-names = "wakeup";
+	};
 };
 
 /* PINCTRL OVERRIDES */
@@ -455,7 +467,7 @@ camera: &i2c7 {
 };
 
 &bt_host_wake_l {
-	rockchip,pins = <1 2 RK_FUNC_GPIO &pcfg_pull_up>;
+	rockchip,pins = <1 2 RK_FUNC_GPIO &pcfg_pull_none>;
 };
 
 &ec_ap_int_l {
diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi
index ea607a601a86..da03fa9c5662 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi
@@ -269,19 +269,6 @@
 		#clock-cells = <0>;
 	};
 
-	gpio_keys: gpio-keys {
-		compatible = "gpio-keys";
-		pinctrl-names = "default";
-		pinctrl-0 = <&bt_host_wake_l>;
-
-		wake_on_bt: wake-on-bt {
-			label = "Wake-on-Bluetooth";
-			gpios = <&gpio0 3 GPIO_ACTIVE_LOW>;
-			linux,code = <KEY_WAKEUP>;
-			wakeup-source;
-		};
-	};
-
 	max98357a: max98357a {
 		compatible = "maxim,max98357a";
 		pinctrl-names = "default";
diff --git a/arch/arm64/boot/dts/rockchip/rk3399-sapphire-excavator.dts b/arch/arm64/boot/dts/rockchip/rk3399-sapphire-excavator.dts
index d5abf7d90f91..808ea77f951d 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399-sapphire-excavator.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3399-sapphire-excavator.dts
@@ -91,7 +91,7 @@
 		pinctrl-0 = <&lcd_panel_reset>;
 		power-supply = <&vcc3v3_s0>;
 
-		ports {
+		port {
 			panel_in_edp: endpoint {
 				remote-endpoint = <&edp_out_panel>;
 			};
diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S
index e3a375c4cb83..1b151442dac1 100644
--- a/arch/arm64/crypto/aes-ce-ccm-core.S
+++ b/arch/arm64/crypto/aes-ce-ccm-core.S
@@ -74,12 +74,13 @@ ENTRY(ce_aes_ccm_auth_data)
 	beq	10f
 	ext	v0.16b, v0.16b, v0.16b, #1	/* rotate out the mac bytes */
 	b	7b
-8:	mov	w7, w8
+8:	cbz	w8, 91f
+	mov	w7, w8
 	add	w8, w8, #16
 9:	ext	v1.16b, v1.16b, v1.16b, #1
 	adds	w7, w7, #1
 	bne	9b
-	eor	v0.16b, v0.16b, v1.16b
+91:	eor	v0.16b, v0.16b, v1.16b
 	st1	{v0.16b}, [x0]
 10:	str	w8, [x3]
 	ret
diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c
index 68b11aa690e4..5fc6f51908fd 100644
--- a/arch/arm64/crypto/aes-ce-ccm-glue.c
+++ b/arch/arm64/crypto/aes-ce-ccm-glue.c
@@ -125,7 +125,7 @@ static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[],
 			abytes -= added;
 		}
 
-		while (abytes > AES_BLOCK_SIZE) {
+		while (abytes >= AES_BLOCK_SIZE) {
 			__aes_arm64_encrypt(key->key_enc, mac, mac,
 					    num_rounds(key));
 			crypto_xor(mac, in, AES_BLOCK_SIZE);
@@ -139,8 +139,6 @@ static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[],
 					    num_rounds(key));
 			crypto_xor(mac, in, abytes);
 			*macp = abytes;
-		} else {
-			*macp = 0;
 		}
 	}
 }
@@ -255,7 +253,7 @@ static int ccm_encrypt(struct aead_request *req)
 	/* preserve the original iv for the final round */
 	memcpy(buf, req->iv, AES_BLOCK_SIZE);
 
-	err = skcipher_walk_aead_encrypt(&walk, req, true);
+	err = skcipher_walk_aead_encrypt(&walk, req, false);
 
 	if (may_use_simd()) {
 		while (walk.nbytes) {
@@ -313,7 +311,7 @@ static int ccm_decrypt(struct aead_request *req)
 	/* preserve the original iv for the final round */
 	memcpy(buf, req->iv, AES_BLOCK_SIZE);
 
-	err = skcipher_walk_aead_decrypt(&walk, req, true);
+	err = skcipher_walk_aead_decrypt(&walk, req, false);
 
 	if (may_use_simd()) {
 		while (walk.nbytes) {
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index 67700045a0e0..4c7ce231963c 100644
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
@@ -320,8 +320,7 @@ AES_ENTRY(aes_ctr_encrypt)
 
 .Lctrtailblock:
 	st1		{v0.16b}, [x0]
-	ldp		x29, x30, [sp], #16
-	ret
+	b		.Lctrout
 
 .Lctrcarry:
 	umov		x7, v4.d[0]		/* load upper word of ctr  */
diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S
index e613a87f8b53..8432c8d0dea6 100644
--- a/arch/arm64/crypto/aes-neonbs-core.S
+++ b/arch/arm64/crypto/aes-neonbs-core.S
@@ -971,18 +971,22 @@ CPU_LE(	rev		x8, x8		)
 
 8:	next_ctr	v0
 	st1		{v0.16b}, [x24]
-	cbz		x23, 0f
+	cbz		x23, .Lctr_done
 
 	cond_yield_neon	98b
 	b		99b
 
-0:	frame_pop
+.Lctr_done:
+	frame_pop
 	ret
 
 	/*
 	 * If we are handling the tail of the input (x6 != NULL), return the
 	 * final keystream block back to the caller.
 	 */
+0:	cbz		x25, 8b
+	st1		{v0.16b}, [x25]
+	b		8b
 1:	cbz		x25, 8b
 	st1		{v1.16b}, [x25]
 	b		8b
diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S
index 021bb9e9784b..706c4e10e9e2 100644
--- a/arch/arm64/crypto/chacha-neon-core.S
+++ b/arch/arm64/crypto/chacha-neon-core.S
@@ -158,8 +158,8 @@ ENTRY(hchacha_block_neon)
 	mov		w3, w2
 	bl		chacha_permute
 
-	st1		{v0.16b}, [x1], #16
-	st1		{v3.16b}, [x1]
+	st1		{v0.4s}, [x1], #16
+	st1		{v3.4s}, [x1]
 
 	ldp		x29, x30, [sp], #16
 	ret
@@ -532,6 +532,10 @@ ENTRY(chacha_4block_xor_neon)
 	add		v3.4s, v3.4s, v19.4s
 	  add		a2, a2, w8
 	  add		a3, a3, w9
+CPU_BE(	  rev		a0, a0		)
+CPU_BE(	  rev		a1, a1		)
+CPU_BE(	  rev		a2, a2		)
+CPU_BE(	  rev		a3, a3		)
 
 	ld4r		{v24.4s-v27.4s}, [x0], #16
 	ld4r		{v28.4s-v31.4s}, [x0]
@@ -552,6 +556,10 @@ ENTRY(chacha_4block_xor_neon)
 	add		v7.4s, v7.4s, v23.4s
 	  add		a6, a6, w8
 	  add		a7, a7, w9
+CPU_BE(	  rev		a4, a4		)
+CPU_BE(	  rev		a5, a5		)
+CPU_BE(	  rev		a6, a6		)
+CPU_BE(	  rev		a7, a7		)
 
 	// x8[0-3] += s2[0]
 	// x9[0-3] += s2[1]
@@ -569,6 +577,10 @@ ENTRY(chacha_4block_xor_neon)
 	add		v11.4s, v11.4s, v27.4s
 	  add		a10, a10, w8
 	  add		a11, a11, w9
+CPU_BE(	  rev		a8, a8		)
+CPU_BE(	  rev		a9, a9		)
+CPU_BE(	  rev		a10, a10	)
+CPU_BE(	  rev		a11, a11	)
 
 	// x12[0-3] += s3[0]
 	// x13[0-3] += s3[1]
@@ -586,6 +598,10 @@ ENTRY(chacha_4block_xor_neon)
 	add		v15.4s, v15.4s, v31.4s
 	  add		a14, a14, w8
 	  add		a15, a15, w9
+CPU_BE(	  rev		a12, a12	)
+CPU_BE(	  rev		a13, a13	)
+CPU_BE(	  rev		a14, a14	)
+CPU_BE(	  rev		a15, a15	)
 
 	// interleave 32-bit words in state n, n+1
 	  ldp		w6, w7, [x2], #64
diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S
index 9e82e8e8ed05..e545b42e6a46 100644
--- a/arch/arm64/crypto/crct10dif-ce-core.S
+++ b/arch/arm64/crypto/crct10dif-ce-core.S
@@ -2,12 +2,14 @@
 // Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
 //
 // Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+// Copyright (C) 2019 Google LLC <ebiggers@google.com>
 //
 // This program is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License version 2 as
 // published by the Free Software Foundation.
 //
 
+// Derived from the x86 version:
 //
 // Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
 //
@@ -54,19 +56,11 @@
 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
-//       Function API:
-//       UINT16 crc_t10dif_pcl(
-//               UINT16 init_crc, //initial CRC value, 16 bits
-//               const unsigned char *buf, //buffer pointer to calculate CRC on
-//               UINT64 len //buffer length in bytes (64-bit data)
-//       );
-//
 //       Reference paper titled "Fast CRC Computation for Generic
 //	Polynomials Using PCLMULQDQ Instruction"
 //       URL: http://www.intel.com/content/dam/www/public/us/en/documents
 //  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
 //
-//
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
@@ -74,14 +68,14 @@
 	.text
 	.cpu		generic+crypto
 
-	arg1_low32	.req	w19
-	arg2		.req	x20
-	arg3		.req	x21
+	init_crc	.req	w19
+	buf		.req	x20
+	len		.req	x21
+	fold_consts_ptr	.req	x22
 
-	vzr		.req	v13
+	fold_consts	.req	v10
 
 	ad		.req	v14
-	bd		.req	v10
 
 	k00_16		.req	v15
 	k32_48		.req	v16
@@ -143,11 +137,11 @@ __pmull_p8_core:
 	ext		t5.8b, ad.8b, ad.8b, #2			// A2
 	ext		t6.8b, ad.8b, ad.8b, #3			// A3
 
-	pmull		t4.8h, t4.8b, bd.8b			// F = A1*B
+	pmull		t4.8h, t4.8b, fold_consts.8b		// F = A1*B
 	pmull		t8.8h, ad.8b, bd1.8b			// E = A*B1
-	pmull		t5.8h, t5.8b, bd.8b			// H = A2*B
+	pmull		t5.8h, t5.8b, fold_consts.8b		// H = A2*B
 	pmull		t7.8h, ad.8b, bd2.8b			// G = A*B2
-	pmull		t6.8h, t6.8b, bd.8b			// J = A3*B
+	pmull		t6.8h, t6.8b, fold_consts.8b		// J = A3*B
 	pmull		t9.8h, ad.8b, bd3.8b			// I = A*B3
 	pmull		t3.8h, ad.8b, bd4.8b			// K = A*B4
 	b		0f
@@ -157,11 +151,11 @@ __pmull_p8_core:
 	tbl		t5.16b, {ad.16b}, perm2.16b		// A2
 	tbl		t6.16b, {ad.16b}, perm3.16b		// A3
 
-	pmull2		t4.8h, t4.16b, bd.16b			// F = A1*B
+	pmull2		t4.8h, t4.16b, fold_consts.16b		// F = A1*B
 	pmull2		t8.8h, ad.16b, bd1.16b			// E = A*B1
-	pmull2		t5.8h, t5.16b, bd.16b			// H = A2*B
+	pmull2		t5.8h, t5.16b, fold_consts.16b		// H = A2*B
 	pmull2		t7.8h, ad.16b, bd2.16b			// G = A*B2
-	pmull2		t6.8h, t6.16b, bd.16b			// J = A3*B
+	pmull2		t6.8h, t6.16b, fold_consts.16b		// J = A3*B
 	pmull2		t9.8h, ad.16b, bd3.16b			// I = A*B3
 	pmull2		t3.8h, ad.16b, bd4.16b			// K = A*B4
 
@@ -203,14 +197,14 @@ __pmull_p8_core:
 ENDPROC(__pmull_p8_core)
 
 	.macro		__pmull_p8, rq, ad, bd, i
-	.ifnc		\bd, v10
+	.ifnc		\bd, fold_consts
 	.err
 	.endif
 	mov		ad.16b, \ad\().16b
 	.ifb		\i
-	pmull		\rq\().8h, \ad\().8b, bd.8b		// D = A*B
+	pmull		\rq\().8h, \ad\().8b, \bd\().8b		// D = A*B
 	.else
-	pmull2		\rq\().8h, \ad\().16b, bd.16b		// D = A*B
+	pmull2		\rq\().8h, \ad\().16b, \bd\().16b	// D = A*B
 	.endif
 
 	bl		.L__pmull_p8_core\i
@@ -219,17 +213,19 @@ ENDPROC(__pmull_p8_core)
 	eor		\rq\().16b, \rq\().16b, t6.16b
 	.endm
 
-	.macro		fold64, p, reg1, reg2
-	ldp		q11, q12, [arg2], #0x20
+	// Fold reg1, reg2 into the next 32 data bytes, storing the result back
+	// into reg1, reg2.
+	.macro		fold_32_bytes, p, reg1, reg2
+	ldp		q11, q12, [buf], #0x20
 
-	__pmull_\p	v8, \reg1, v10, 2
-	__pmull_\p	\reg1, \reg1, v10
+	__pmull_\p	v8, \reg1, fold_consts, 2
+	__pmull_\p	\reg1, \reg1, fold_consts
 
 CPU_LE(	rev64		v11.16b, v11.16b		)
 CPU_LE(	rev64		v12.16b, v12.16b		)
 
-	__pmull_\p	v9, \reg2, v10, 2
-	__pmull_\p	\reg2, \reg2, v10
+	__pmull_\p	v9, \reg2, fold_consts, 2
+	__pmull_\p	\reg2, \reg2, fold_consts
 
 CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
 CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
@@ -240,15 +236,16 @@ CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
 	eor		\reg2\().16b, \reg2\().16b, v12.16b
 	.endm
 
-	.macro		fold16, p, reg, rk
-	__pmull_\p	v8, \reg, v10
-	__pmull_\p	\reg, \reg, v10, 2
-	.ifnb		\rk
-	ldr_l		q10, \rk, x8
-	__pmull_pre_\p	v10
+	// Fold src_reg into dst_reg, optionally loading the next fold constants
+	.macro		fold_16_bytes, p, src_reg, dst_reg, load_next_consts
+	__pmull_\p	v8, \src_reg, fold_consts
+	__pmull_\p	\src_reg, \src_reg, fold_consts, 2
+	.ifnb		\load_next_consts
+	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
+	__pmull_pre_\p	fold_consts
 	.endif
-	eor		v7.16b, v7.16b, v8.16b
-	eor		v7.16b, v7.16b, \reg\().16b
+	eor		\dst_reg\().16b, \dst_reg\().16b, v8.16b
+	eor		\dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
 	.endm
 
 	.macro		__pmull_p64, rd, rn, rm, n
@@ -260,40 +257,27 @@ CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
 	.endm
 
 	.macro		crc_t10dif_pmull, p
-	frame_push	3, 128
+	frame_push	4, 128
 
-	mov		arg1_low32, w0
-	mov		arg2, x1
-	mov		arg3, x2
-
-	movi		vzr.16b, #0		// init zero register
+	mov		init_crc, w0
+	mov		buf, x1
+	mov		len, x2
 
 	__pmull_init_\p
 
-	// adjust the 16-bit initial_crc value, scale it to 32 bits
-	lsl		arg1_low32, arg1_low32, #16
-
-	// check if smaller than 256
-	cmp		arg3, #256
-
-	// for sizes less than 128, we can't fold 64B at a time...
-	b.lt		.L_less_than_128_\@
+	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
+	cmp		len, #256
+	b.lt		.Lless_than_256_bytes_\@
 
-	// load the initial crc value
-	// crc value does not need to be byte-reflected, but it needs
-	// to be moved to the high part of the register.
-	// because data will be byte-reflected and will align with
-	// initial crc at correct place.
-	movi		v10.16b, #0
-	mov		v10.s[3], arg1_low32		// initial crc
-
-	// receive the initial 64B data, xor the initial crc value
-	ldp		q0, q1, [arg2]
-	ldp		q2, q3, [arg2, #0x20]
-	ldp		q4, q5, [arg2, #0x40]
-	ldp		q6, q7, [arg2, #0x60]
-	add		arg2, arg2, #0x80
+	adr_l		fold_consts_ptr, .Lfold_across_128_bytes_consts
 
+	// Load the first 128 data bytes.  Byte swapping is necessary to make
+	// the bit order match the polynomial coefficient order.
+	ldp		q0, q1, [buf]
+	ldp		q2, q3, [buf, #0x20]
+	ldp		q4, q5, [buf, #0x40]
+	ldp		q6, q7, [buf, #0x60]
+	add		buf, buf, #0x80
 CPU_LE(	rev64		v0.16b, v0.16b			)
 CPU_LE(	rev64		v1.16b, v1.16b			)
 CPU_LE(	rev64		v2.16b, v2.16b			)
@@ -302,7 +286,6 @@ CPU_LE(	rev64		v4.16b, v4.16b			)
 CPU_LE(	rev64		v5.16b, v5.16b			)
 CPU_LE(	rev64		v6.16b, v6.16b			)
 CPU_LE(	rev64		v7.16b, v7.16b			)
-
 CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
 CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
 CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
@@ -312,36 +295,29 @@ CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
 CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
 CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
 
-	// XOR the initial_crc value
-	eor		v0.16b, v0.16b, v10.16b
-
-	ldr_l		q10, rk3, x8	// xmm10 has rk3 and rk4
-					// type of pmull instruction
-					// will determine which constant to use
-	__pmull_pre_\p	v10
+	// XOR the first 16 data *bits* with the initial CRC value.
+	movi		v8.16b, #0
+	mov		v8.h[7], init_crc
+	eor		v0.16b, v0.16b, v8.16b
 
-	//
-	// we subtract 256 instead of 128 to save one instruction from the loop
-	//
-	sub		arg3, arg3, #256
+	// Load the constants for folding across 128 bytes.
+	ld1		{fold_consts.2d}, [fold_consts_ptr]
+	__pmull_pre_\p	fold_consts
 
-	// at this section of the code, there is 64*x+y (0<=y<64) bytes of
-	// buffer. The _fold_64_B_loop will fold 64B at a time
-	// until we have 64+y Bytes of buffer
+	// Subtract 128 for the 128 data bytes just consumed.  Subtract another
+	// 128 to simplify the termination condition of the following loop.
+	sub		len, len, #256
 
-	// fold 64B at a time. This section of the code folds 4 vector
-	// registers in parallel
-.L_fold_64_B_loop_\@:
+	// While >= 128 data bytes remain (not counting v0-v7), fold the 128
+	// bytes v0-v7 into them, storing the result back into v0-v7.
+.Lfold_128_bytes_loop_\@:
+	fold_32_bytes	\p, v0, v1
+	fold_32_bytes	\p, v2, v3
+	fold_32_bytes	\p, v4, v5
+	fold_32_bytes	\p, v6, v7
 
-	fold64		\p, v0, v1
-	fold64		\p, v2, v3
-	fold64		\p, v4, v5
-	fold64		\p, v6, v7
-
-	subs		arg3, arg3, #128
-
-	// check if there is another 64B in the buffer to be able to fold
-	b.lt		.L_fold_64_B_end_\@
+	subs		len, len, #128
+	b.lt		.Lfold_128_bytes_loop_done_\@
 
 	if_will_cond_yield_neon
 	stp		q0, q1, [sp, #.Lframe_local_offset]
@@ -353,228 +329,207 @@ CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
 	ldp		q2, q3, [sp, #.Lframe_local_offset + 32]
 	ldp		q4, q5, [sp, #.Lframe_local_offset + 64]
 	ldp		q6, q7, [sp, #.Lframe_local_offset + 96]
-	ldr_l		q10, rk3, x8
-	movi		vzr.16b, #0		// init zero register
+	ld1		{fold_consts.2d}, [fold_consts_ptr]
 	__pmull_init_\p
-	__pmull_pre_\p	v10
+	__pmull_pre_\p	fold_consts
 	endif_yield_neon
 
-	b		.L_fold_64_B_loop_\@
-
-.L_fold_64_B_end_\@:
-	// at this point, the buffer pointer is pointing at the last y Bytes
-	// of the buffer the 64B of folded data is in 4 of the vector
-	// registers: v0, v1, v2, v3
-
-	// fold the 8 vector registers to 1 vector register with different
-	// constants
-
-	ldr_l		q10, rk9, x8
-	__pmull_pre_\p	v10
-
-	fold16		\p, v0, rk11
-	fold16		\p, v1, rk13
-	fold16		\p, v2, rk15
-	fold16		\p, v3, rk17
-	fold16		\p, v4, rk19
-	fold16		\p, v5, rk1
-	fold16		\p, v6
-
-	// instead of 64, we add 48 to the loop counter to save 1 instruction
-	// from the loop instead of a cmp instruction, we use the negative
-	// flag with the jl instruction
-	adds		arg3, arg3, #(128-16)
-	b.lt		.L_final_reduction_for_128_\@
-
-	// now we have 16+y bytes left to reduce. 16 Bytes is in register v7
-	// and the rest is in memory. We can fold 16 bytes at a time if y>=16
-	// continue folding 16B at a time
-
-.L_16B_reduction_loop_\@:
-	__pmull_\p	v8, v7, v10
-	__pmull_\p	v7, v7, v10, 2
+	b		.Lfold_128_bytes_loop_\@
+
+.Lfold_128_bytes_loop_done_\@:
+
+	// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
+
+	// Fold across 64 bytes.
+	add		fold_consts_ptr, fold_consts_ptr, #16
+	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
+	__pmull_pre_\p	fold_consts
+	fold_16_bytes	\p, v0, v4
+	fold_16_bytes	\p, v1, v5
+	fold_16_bytes	\p, v2, v6
+	fold_16_bytes	\p, v3, v7, 1
+	// Fold across 32 bytes.
+	fold_16_bytes	\p, v4, v6
+	fold_16_bytes	\p, v5, v7, 1
+	// Fold across 16 bytes.
+	fold_16_bytes	\p, v6, v7
+
+	// Add 128 to get the correct number of data bytes remaining in 0...127
+	// (not counting v7), following the previous extra subtraction by 128.
+	// Then subtract 16 to simplify the termination condition of the
+	// following loop.
+	adds		len, len, #(128-16)
+
+	// While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
+	// into them, storing the result back into v7.
+	b.lt		.Lfold_16_bytes_loop_done_\@
+.Lfold_16_bytes_loop_\@:
+	__pmull_\p	v8, v7, fold_consts
+	__pmull_\p	v7, v7, fold_consts, 2
 	eor		v7.16b, v7.16b, v8.16b
-
-	ldr		q0, [arg2], #16
+	ldr		q0, [buf], #16
 CPU_LE(	rev64		v0.16b, v0.16b			)
 CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
 	eor		v7.16b, v7.16b, v0.16b
-	subs		arg3, arg3, #16
-
-	// instead of a cmp instruction, we utilize the flags with the
-	// jge instruction equivalent of: cmp arg3, 16-16
-	// check if there is any more 16B in the buffer to be able to fold
-	b.ge		.L_16B_reduction_loop_\@
-
-	// now we have 16+z bytes left to reduce, where 0<= z < 16.
-	// first, we reduce the data in the xmm7 register
-
-.L_final_reduction_for_128_\@:
-	// check if any more data to fold. If not, compute the CRC of
-	// the final 128 bits
-	adds		arg3, arg3, #16
-	b.eq		.L_128_done_\@
-
-	// here we are getting data that is less than 16 bytes.
-	// since we know that there was data before the pointer, we can
-	// offset the input pointer before the actual point, to receive
-	// exactly 16 bytes. after that the registers need to be adjusted.
-.L_get_last_two_regs_\@:
-	add		arg2, arg2, arg3
-	ldr		q1, [arg2, #-16]
-CPU_LE(	rev64		v1.16b, v1.16b			)
-CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
-
-	// get rid of the extra data that was loaded before
-	// load the shift constant
-	adr_l		x4, tbl_shf_table + 16
-	sub		x4, x4, arg3
-	ld1		{v0.16b}, [x4]
-
-	// shift v2 to the left by arg3 bytes
-	tbl		v2.16b, {v7.16b}, v0.16b
-
-	// shift v7 to the right by 16-arg3 bytes
-	movi		v9.16b, #0x80
-	eor		v0.16b, v0.16b, v9.16b
-	tbl		v7.16b, {v7.16b}, v0.16b
-
-	// blend
-	sshr		v0.16b, v0.16b, #7	// convert to 8-bit mask
-	bsl		v0.16b, v2.16b, v1.16b
-
-	// fold 16 Bytes
-	__pmull_\p	v8, v7, v10
-	__pmull_\p	v7, v7, v10, 2
-	eor		v7.16b, v7.16b, v8.16b
-	eor		v7.16b, v7.16b, v0.16b
+	subs		len, len, #16
+	b.ge		.Lfold_16_bytes_loop_\@
+
+.Lfold_16_bytes_loop_done_\@:
+	// Add 16 to get the correct number of data bytes remaining in 0...15
+	// (not counting v7), following the previous extra subtraction by 16.
+	adds		len, len, #16
+	b.eq		.Lreduce_final_16_bytes_\@
+
+.Lhandle_partial_segment_\@:
+	// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
+	// 16 bytes are in v7 and the rest are the remaining data in 'buf'.  To
+	// do this without needing a fold constant for each possible 'len',
+	// redivide the bytes into a first chunk of 'len' bytes and a second
+	// chunk of 16 bytes, then fold the first chunk into the second.
+
+	// v0 = last 16 original data bytes
+	add		buf, buf, len
+	ldr		q0, [buf, #-16]
+CPU_LE(	rev64		v0.16b, v0.16b			)
+CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
 
-.L_128_done_\@:
-	// compute crc of a 128-bit value
-	ldr_l		q10, rk5, x8		// rk5 and rk6 in xmm10
-	__pmull_pre_\p	v10
+	// v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
+	adr_l		x4, .Lbyteshift_table + 16
+	sub		x4, x4, len
+	ld1		{v2.16b}, [x4]
+	tbl		v1.16b, {v7.16b}, v2.16b
 
-	// 64b fold
-	ext		v0.16b, vzr.16b, v7.16b, #8
-	mov		v7.d[0], v7.d[1]
-	__pmull_\p	v7, v7, v10
-	eor		v7.16b, v7.16b, v0.16b
+	// v3 = first chunk: v7 right-shifted by '16-len' bytes.
+	movi		v3.16b, #0x80
+	eor		v2.16b, v2.16b, v3.16b
+	tbl		v3.16b, {v7.16b}, v2.16b
 
-	// 32b fold
-	ext		v0.16b, v7.16b, vzr.16b, #4
-	mov		v7.s[3], vzr.s[0]
-	__pmull_\p	v0, v0, v10, 2
-	eor		v7.16b, v7.16b, v0.16b
+	// Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
+	sshr		v2.16b, v2.16b, #7
 
-	// barrett reduction
-	ldr_l		q10, rk7, x8
-	__pmull_pre_\p	v10
-	mov		v0.d[0], v7.d[1]
+	// v2 = second chunk: 'len' bytes from v0 (low-order bytes),
+	// then '16-len' bytes from v1 (high-order bytes).
+	bsl		v2.16b, v1.16b, v0.16b
 
-	__pmull_\p	v0, v0, v10
-	ext		v0.16b, vzr.16b, v0.16b, #12
-	__pmull_\p	v0, v0, v10, 2
-	ext		v0.16b, vzr.16b, v0.16b, #12
+	// Fold the first chunk into the second chunk, storing the result in v7.
+	__pmull_\p	v0, v3, fold_consts
+	__pmull_\p	v7, v3, fold_consts, 2
 	eor		v7.16b, v7.16b, v0.16b
-	mov		w0, v7.s[1]
-
-.L_cleanup_\@:
-	// scale the result back to 16 bits
-	lsr		x0, x0, #16
+	eor		v7.16b, v7.16b, v2.16b
+
+.Lreduce_final_16_bytes_\@:
+	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
+
+	movi		v2.16b, #0		// init zero register
+
+	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
+	__pmull_pre_\p	fold_consts
+
+	// Fold the high 64 bits into the low 64 bits, while also multiplying by
+	// x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
+	// whose low 48 bits are 0.
+	ext		v0.16b, v2.16b, v7.16b, #8
+	__pmull_\p	v7, v7, fold_consts, 2	// high bits * x^48 * (x^80 mod G(x))
+	eor		v0.16b, v0.16b, v7.16b	// + low bits * x^64
+
+	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
+	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
+	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
+	mov		v0.s[3], v2.s[0]	// zero high 32 bits
+	__pmull_\p	v1, v1, fold_consts	// high 32 bits * x^48 * (x^48 mod G(x))
+	eor		v0.16b, v0.16b, v1.16b	// + low bits
+
+	// Load G(x) and floor(x^48 / G(x)).
+	ld1		{fold_consts.2d}, [fold_consts_ptr]
+	__pmull_pre_\p	fold_consts
+
+	// Use Barrett reduction to compute the final CRC value.
+	__pmull_\p	v1, v0, fold_consts, 2	// high 32 bits * floor(x^48 / G(x))
+	ushr		v1.2d, v1.2d, #32	// /= x^32
+	__pmull_\p	v1, v1, fold_consts	// *= G(x)
+	ushr		v0.2d, v0.2d, #48
+	eor		v0.16b, v0.16b, v1.16b	// + low 16 nonzero bits
+	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
+
+	umov		w0, v0.h[0]
 	frame_pop
 	ret
 
-.L_less_than_128_\@:
-	cbz		arg3, .L_cleanup_\@
+.Lless_than_256_bytes_\@:
+	// Checksumming a buffer of length 16...255 bytes
 
-	movi		v0.16b, #0
-	mov		v0.s[3], arg1_low32	// get the initial crc value
+	adr_l		fold_consts_ptr, .Lfold_across_16_bytes_consts
 
-	ldr		q7, [arg2], #0x10
+	// Load the first 16 data bytes.
+	ldr		q7, [buf], #0x10
 CPU_LE(	rev64		v7.16b, v7.16b			)
 CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
-	eor		v7.16b, v7.16b, v0.16b	// xor the initial crc value
-
-	cmp		arg3, #16
-	b.eq		.L_128_done_\@		// exactly 16 left
-	b.lt		.L_less_than_16_left_\@
-
-	ldr_l		q10, rk1, x8		// rk1 and rk2 in xmm10
-	__pmull_pre_\p	v10
-
-	// update the counter. subtract 32 instead of 16 to save one
-	// instruction from the loop
-	subs		arg3, arg3, #32
-	b.ge		.L_16B_reduction_loop_\@
-
-	add		arg3, arg3, #16
-	b		.L_get_last_two_regs_\@
-
-.L_less_than_16_left_\@:
-	// shl r9, 4
-	adr_l		x0, tbl_shf_table + 16
-	sub		x0, x0, arg3
-	ld1		{v0.16b}, [x0]
-	movi		v9.16b, #0x80
-	eor		v0.16b, v0.16b, v9.16b
-	tbl		v7.16b, {v7.16b}, v0.16b
-	b		.L_128_done_\@
+
+	// XOR the first 16 data *bits* with the initial CRC value.
+	movi		v0.16b, #0
+	mov		v0.h[7], init_crc
+	eor		v7.16b, v7.16b, v0.16b
+
+	// Load the fold-across-16-bytes constants.
+	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
+	__pmull_pre_\p	fold_consts
+
+	cmp		len, #16
+	b.eq		.Lreduce_final_16_bytes_\@	// len == 16
+	subs		len, len, #32
+	b.ge		.Lfold_16_bytes_loop_\@		// 32 <= len <= 255
+	add		len, len, #16
+	b		.Lhandle_partial_segment_\@	// 17 <= len <= 31
 	.endm
 
+//
+// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
 ENTRY(crc_t10dif_pmull_p8)
 	crc_t10dif_pmull	p8
 ENDPROC(crc_t10dif_pmull_p8)
 
 	.align		5
+//
+// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
 ENTRY(crc_t10dif_pmull_p64)
 	crc_t10dif_pmull	p64
 ENDPROC(crc_t10dif_pmull_p64)
 
-// precomputed constants
-// these constants are precomputed from the poly:
-// 0x8bb70000 (0x8bb7 scaled to 32 bits)
 	.section	".rodata", "a"
 	.align		4
-// Q = 0x18BB70000
-// rk1 = 2^(32*3) mod Q << 32
-// rk2 = 2^(32*5) mod Q << 32
-// rk3 = 2^(32*15) mod Q << 32
-// rk4 = 2^(32*17) mod Q << 32
-// rk5 = 2^(32*3) mod Q << 32
-// rk6 = 2^(32*2) mod Q << 32
-// rk7 = floor(2^64/Q)
-// rk8 = Q
-
-rk1:	.octa		0x06df0000000000002d56000000000000
-rk3:	.octa		0x7cf50000000000009d9d000000000000
-rk5:	.octa		0x13680000000000002d56000000000000
-rk7:	.octa		0x000000018bb7000000000001f65a57f8
-rk9:	.octa		0xbfd6000000000000ceae000000000000
-rk11:	.octa		0x713c0000000000001e16000000000000
-rk13:	.octa		0x80a6000000000000f7f9000000000000
-rk15:	.octa		0xe658000000000000044c000000000000
-rk17:	.octa		0xa497000000000000ad18000000000000
-rk19:	.octa		0xe7b50000000000006ee3000000000000
-
-tbl_shf_table:
-// use these values for shift constants for the tbl/tbx instruction
-// different alignments result in values as shown:
-//	DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
-//	DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
-//	DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
-//	DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
-//	DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
-//	DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
-//	DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
-//	DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
-//	DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
-//	DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
-//	DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
-//	DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
-//	DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
-//	DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
-//	DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15
 
+// Fold constants precomputed from the polynomial 0x18bb7
+// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+.Lfold_across_128_bytes_consts:
+	.quad		0x0000000000006123	// x^(8*128)	mod G(x)
+	.quad		0x0000000000002295	// x^(8*128+64)	mod G(x)
+// .Lfold_across_64_bytes_consts:
+	.quad		0x0000000000001069	// x^(4*128)	mod G(x)
+	.quad		0x000000000000dd31	// x^(4*128+64)	mod G(x)
+// .Lfold_across_32_bytes_consts:
+	.quad		0x000000000000857d	// x^(2*128)	mod G(x)
+	.quad		0x0000000000007acc	// x^(2*128+64)	mod G(x)
+.Lfold_across_16_bytes_consts:
+	.quad		0x000000000000a010	// x^(1*128)	mod G(x)
+	.quad		0x0000000000001faa	// x^(1*128+64)	mod G(x)
+// .Lfinal_fold_consts:
+	.quad		0x1368000000000000	// x^48 * (x^48 mod G(x))
+	.quad		0x2d56000000000000	// x^48 * (x^80 mod G(x))
+// .Lbarrett_reduction_consts:
+	.quad		0x0000000000018bb7	// G(x)
+	.quad		0x00000001f65a57f8	// floor(x^48 / G(x))
+
+// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
+// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
+// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
+.Lbyteshift_table:
 	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
 	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
 	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c
index b461d62023f2..dd325829ee44 100644
--- a/arch/arm64/crypto/crct10dif-ce-glue.c
+++ b/arch/arm64/crypto/crct10dif-ce-glue.c
@@ -22,10 +22,8 @@
 
 #define CRC_T10DIF_PMULL_CHUNK_SIZE	16U
 
-asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 buf[], u64 len);
-asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 buf[], u64 len);
-
-static u16 (*crc_t10dif_pmull)(u16 init_crc, const u8 buf[], u64 len);
+asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
+asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
 
 static int crct10dif_init(struct shash_desc *desc)
 {
@@ -35,30 +33,33 @@ static int crct10dif_init(struct shash_desc *desc)
 	return 0;
 }
 
-static int crct10dif_update(struct shash_desc *desc, const u8 *data,
+static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data,
 			    unsigned int length)
 {
 	u16 *crc = shash_desc_ctx(desc);
-	unsigned int l;
 
-	if (unlikely((u64)data % CRC_T10DIF_PMULL_CHUNK_SIZE)) {
-		l = min_t(u32, length, CRC_T10DIF_PMULL_CHUNK_SIZE -
-			  ((u64)data % CRC_T10DIF_PMULL_CHUNK_SIZE));
+	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) {
+		kernel_neon_begin();
+		*crc = crc_t10dif_pmull_p8(*crc, data, length);
+		kernel_neon_end();
+	} else {
+		*crc = crc_t10dif_generic(*crc, data, length);
+	}
 
-		*crc = crc_t10dif_generic(*crc, data, l);
+	return 0;
+}
 
-		length -= l;
-		data += l;
-	}
+static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data,
+			    unsigned int length)
+{
+	u16 *crc = shash_desc_ctx(desc);
 
-	if (length > 0) {
-		if (may_use_simd()) {
-			kernel_neon_begin();
-			*crc = crc_t10dif_pmull(*crc, data, length);
-			kernel_neon_end();
-		} else {
-			*crc = crc_t10dif_generic(*crc, data, length);
-		}
+	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) {
+		kernel_neon_begin();
+		*crc = crc_t10dif_pmull_p64(*crc, data, length);
+		kernel_neon_end();
+	} else {
+		*crc = crc_t10dif_generic(*crc, data, length);
 	}
 
 	return 0;
@@ -72,10 +73,22 @@ static int crct10dif_final(struct shash_desc *desc, u8 *out)
 	return 0;
 }
 
-static struct shash_alg crc_t10dif_alg = {
+static struct shash_alg crc_t10dif_alg[] = {{
 	.digestsize		= CRC_T10DIF_DIGEST_SIZE,
 	.init			= crct10dif_init,
-	.update			= crct10dif_update,
+	.update			= crct10dif_update_pmull_p8,
+	.final			= crct10dif_final,
+	.descsize		= CRC_T10DIF_DIGEST_SIZE,
+
+	.base.cra_name		= "crct10dif",
+	.base.cra_driver_name	= "crct10dif-arm64-neon",
+	.base.cra_priority	= 100,
+	.base.cra_blocksize	= CRC_T10DIF_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+}, {
+	.digestsize		= CRC_T10DIF_DIGEST_SIZE,
+	.init			= crct10dif_init,
+	.update			= crct10dif_update_pmull_p64,
 	.final			= crct10dif_final,
 	.descsize		= CRC_T10DIF_DIGEST_SIZE,
 
@@ -84,21 +97,25 @@ static struct shash_alg crc_t10dif_alg = {
 	.base.cra_priority	= 200,
 	.base.cra_blocksize	= CRC_T10DIF_BLOCK_SIZE,
 	.base.cra_module	= THIS_MODULE,
-};
+}};
 
 static int __init crc_t10dif_mod_init(void)
 {
 	if (elf_hwcap & HWCAP_PMULL)
-		crc_t10dif_pmull = crc_t10dif_pmull_p64;
+		return crypto_register_shashes(crc_t10dif_alg,
+					       ARRAY_SIZE(crc_t10dif_alg));
 	else
-		crc_t10dif_pmull = crc_t10dif_pmull_p8;
-
-	return crypto_register_shash(&crc_t10dif_alg);
+		/* only register the first array element */
+		return crypto_register_shash(crc_t10dif_alg);
 }
 
 static void __exit crc_t10dif_mod_exit(void)
 {
-	crypto_unregister_shash(&crc_t10dif_alg);
+	if (elf_hwcap & HWCAP_PMULL)
+		crypto_unregister_shashes(crc_t10dif_alg,
+					  ARRAY_SIZE(crc_t10dif_alg));
+	else
+		crypto_unregister_shash(crc_t10dif_alg);
 }
 
 module_cpu_feature_match(ASIMD, crc_t10dif_mod_init);
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index 067d8937d5af..791ad422c427 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -60,10 +60,6 @@ asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
 				      struct ghash_key const *k,
 				      const char *head);
 
-static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src,
-				  struct ghash_key const *k,
-				  const char *head);
-
 asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[],
 				  const u8 src[], struct ghash_key const *k,
 				  u8 ctr[], u32 const rk[], int rounds,
@@ -87,11 +83,15 @@ static int ghash_init(struct shash_desc *desc)
 }
 
 static void ghash_do_update(int blocks, u64 dg[], const char *src,
-			    struct ghash_key *key, const char *head)
+			    struct ghash_key *key, const char *head,
+			    void (*simd_update)(int blocks, u64 dg[],
+						const char *src,
+						struct ghash_key const *k,
+						const char *head))
 {
 	if (likely(may_use_simd())) {
 		kernel_neon_begin();
-		pmull_ghash_update(blocks, dg, src, key, head);
+		simd_update(blocks, dg, src, key, head);
 		kernel_neon_end();
 	} else {
 		be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) };
@@ -119,8 +119,12 @@ static void ghash_do_update(int blocks, u64 dg[], const char *src,
 /* avoid hogging the CPU for too long */
 #define MAX_BLOCKS	(SZ_64K / GHASH_BLOCK_SIZE)
 
-static int ghash_update(struct shash_desc *desc, const u8 *src,
-			unsigned int len)
+static int __ghash_update(struct shash_desc *desc, const u8 *src,
+			  unsigned int len,
+			  void (*simd_update)(int blocks, u64 dg[],
+					      const char *src,
+					      struct ghash_key const *k,
+					      const char *head))
 {
 	struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
 	unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
@@ -146,7 +150,8 @@ static int ghash_update(struct shash_desc *desc, const u8 *src,
 			int chunk = min(blocks, MAX_BLOCKS);
 
 			ghash_do_update(chunk, ctx->digest, src, key,
-					partial ? ctx->buf : NULL);
+					partial ? ctx->buf : NULL,
+					simd_update);
 
 			blocks -= chunk;
 			src += chunk * GHASH_BLOCK_SIZE;
@@ -158,7 +163,19 @@ static int ghash_update(struct shash_desc *desc, const u8 *src,
 	return 0;
 }
 
-static int ghash_final(struct shash_desc *desc, u8 *dst)
+static int ghash_update_p8(struct shash_desc *desc, const u8 *src,
+			   unsigned int len)
+{
+	return __ghash_update(desc, src, len, pmull_ghash_update_p8);
+}
+
+static int ghash_update_p64(struct shash_desc *desc, const u8 *src,
+			    unsigned int len)
+{
+	return __ghash_update(desc, src, len, pmull_ghash_update_p64);
+}
+
+static int ghash_final_p8(struct shash_desc *desc, u8 *dst)
 {
 	struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
 	unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
@@ -168,7 +185,28 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)
 
 		memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
 
-		ghash_do_update(1, ctx->digest, ctx->buf, key, NULL);
+		ghash_do_update(1, ctx->digest, ctx->buf, key, NULL,
+				pmull_ghash_update_p8);
+	}
+	put_unaligned_be64(ctx->digest[1], dst);
+	put_unaligned_be64(ctx->digest[0], dst + 8);
+
+	*ctx = (struct ghash_desc_ctx){};
+	return 0;
+}
+
+static int ghash_final_p64(struct shash_desc *desc, u8 *dst)
+{
+	struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
+	unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
+
+	if (partial) {
+		struct ghash_key *key = crypto_shash_ctx(desc->tfm);
+
+		memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
+
+		ghash_do_update(1, ctx->digest, ctx->buf, key, NULL,
+				pmull_ghash_update_p64);
 	}
 	put_unaligned_be64(ctx->digest[1], dst);
 	put_unaligned_be64(ctx->digest[0], dst + 8);
@@ -224,7 +262,21 @@ static int ghash_setkey(struct crypto_shash *tfm,
 	return __ghash_setkey(key, inkey, keylen);
 }
 
-static struct shash_alg ghash_alg = {
+static struct shash_alg ghash_alg[] = {{
+	.base.cra_name		= "ghash",
+	.base.cra_driver_name	= "ghash-neon",
+	.base.cra_priority	= 100,
+	.base.cra_blocksize	= GHASH_BLOCK_SIZE,
+	.base.cra_ctxsize	= sizeof(struct ghash_key),
+	.base.cra_module	= THIS_MODULE,
+
+	.digestsize		= GHASH_DIGEST_SIZE,
+	.init			= ghash_init,
+	.update			= ghash_update_p8,
+	.final			= ghash_final_p8,
+	.setkey			= ghash_setkey,
+	.descsize		= sizeof(struct ghash_desc_ctx),
+}, {
 	.base.cra_name		= "ghash",
 	.base.cra_driver_name	= "ghash-ce",
 	.base.cra_priority	= 200,
@@ -234,11 +286,11 @@ static struct shash_alg ghash_alg = {
 
 	.digestsize		= GHASH_DIGEST_SIZE,
 	.init			= ghash_init,
-	.update			= ghash_update,
-	.final			= ghash_final,
+	.update			= ghash_update_p64,
+	.final			= ghash_final_p64,
 	.setkey			= ghash_setkey,
 	.descsize		= sizeof(struct ghash_desc_ctx),
-};
+}};
 
 static int num_rounds(struct crypto_aes_ctx *ctx)
 {
@@ -301,7 +353,8 @@ static void gcm_update_mac(u64 dg[], const u8 *src, int count, u8 buf[],
 		int blocks = count / GHASH_BLOCK_SIZE;
 
 		ghash_do_update(blocks, dg, src, &ctx->ghash_key,
-				*buf_count ? buf : NULL);
+				*buf_count ? buf : NULL,
+				pmull_ghash_update_p64);
 
 		src += blocks * GHASH_BLOCK_SIZE;
 		count %= GHASH_BLOCK_SIZE;
@@ -345,7 +398,8 @@ static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[])
 
 	if (buf_count) {
 		memset(&buf[buf_count], 0, GHASH_BLOCK_SIZE - buf_count);
-		ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL);
+		ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL,
+				pmull_ghash_update_p64);
 	}
 }
 
@@ -358,7 +412,8 @@ static void gcm_final(struct aead_request *req, struct gcm_aes_ctx *ctx,
 	lengths.a = cpu_to_be64(req->assoclen * 8);
 	lengths.b = cpu_to_be64(cryptlen * 8);
 
-	ghash_do_update(1, dg, (void *)&lengths, &ctx->ghash_key, NULL);
+	ghash_do_update(1, dg, (void *)&lengths, &ctx->ghash_key, NULL,
+			pmull_ghash_update_p64);
 
 	put_unaligned_be64(dg[1], mac);
 	put_unaligned_be64(dg[0], mac + 8);
@@ -434,7 +489,7 @@ static int gcm_encrypt(struct aead_request *req)
 
 			ghash_do_update(walk.nbytes / AES_BLOCK_SIZE, dg,
 					walk.dst.virt.addr, &ctx->ghash_key,
-					NULL);
+					NULL, pmull_ghash_update_p64);
 
 			err = skcipher_walk_done(&walk,
 						 walk.nbytes % (2 * AES_BLOCK_SIZE));
@@ -469,7 +524,8 @@ static int gcm_encrypt(struct aead_request *req)
 
 		memcpy(buf, dst, nbytes);
 		memset(buf + nbytes, 0, GHASH_BLOCK_SIZE - nbytes);
-		ghash_do_update(!!nbytes, dg, buf, &ctx->ghash_key, head);
+		ghash_do_update(!!nbytes, dg, buf, &ctx->ghash_key, head,
+				pmull_ghash_update_p64);
 
 		err = skcipher_walk_done(&walk, 0);
 	}
@@ -558,7 +614,8 @@ static int gcm_decrypt(struct aead_request *req)
 			u8 *src = walk.src.virt.addr;
 
 			ghash_do_update(blocks, dg, walk.src.virt.addr,
-					&ctx->ghash_key, NULL);
+					&ctx->ghash_key, NULL,
+					pmull_ghash_update_p64);
 
 			do {
 				__aes_arm64_encrypt(ctx->aes_key.key_enc,
@@ -602,7 +659,8 @@ static int gcm_decrypt(struct aead_request *req)
 
 		memcpy(buf, src, nbytes);
 		memset(buf + nbytes, 0, GHASH_BLOCK_SIZE - nbytes);
-		ghash_do_update(!!nbytes, dg, buf, &ctx->ghash_key, head);
+		ghash_do_update(!!nbytes, dg, buf, &ctx->ghash_key, head,
+				pmull_ghash_update_p64);
 
 		crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, iv,
 			       walk.nbytes);
@@ -650,26 +708,30 @@ static int __init ghash_ce_mod_init(void)
 		return -ENODEV;
 
 	if (elf_hwcap & HWCAP_PMULL)
-		pmull_ghash_update = pmull_ghash_update_p64;
-
+		ret = crypto_register_shashes(ghash_alg,
+					      ARRAY_SIZE(ghash_alg));
 	else
-		pmull_ghash_update = pmull_ghash_update_p8;
+		/* only register the first array element */
+		ret = crypto_register_shash(ghash_alg);
 
-	ret = crypto_register_shash(&ghash_alg);
 	if (ret)
 		return ret;
 
 	if (elf_hwcap & HWCAP_PMULL) {
 		ret = crypto_register_aead(&gcm_aes_alg);
 		if (ret)
-			crypto_unregister_shash(&ghash_alg);
+			crypto_unregister_shashes(ghash_alg,
+						  ARRAY_SIZE(ghash_alg));
 	}
 	return ret;
 }
 
 static void __exit ghash_ce_mod_exit(void)
 {
-	crypto_unregister_shash(&ghash_alg);
+	if (elf_hwcap & HWCAP_PMULL)
+		crypto_unregister_shashes(ghash_alg, ARRAY_SIZE(ghash_alg));
+	else
+		crypto_unregister_shash(ghash_alg);
 	crypto_unregister_aead(&gcm_aes_alg);
 }
 
diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h
index 9bca54dda75c..1f4e9ee641c9 100644
--- a/arch/arm64/include/asm/atomic.h
+++ b/arch/arm64/include/asm/atomic.h
@@ -42,124 +42,131 @@
 
 #define ATOMIC_INIT(i)	{ (i) }
 
-#define atomic_read(v)			READ_ONCE((v)->counter)
-#define atomic_set(v, i)		WRITE_ONCE(((v)->counter), (i))
-
-#define atomic_add_return_relaxed	atomic_add_return_relaxed
-#define atomic_add_return_acquire	atomic_add_return_acquire
-#define atomic_add_return_release	atomic_add_return_release
-#define atomic_add_return		atomic_add_return
-
-#define atomic_sub_return_relaxed	atomic_sub_return_relaxed
-#define atomic_sub_return_acquire	atomic_sub_return_acquire
-#define atomic_sub_return_release	atomic_sub_return_release
-#define atomic_sub_return		atomic_sub_return
-
-#define atomic_fetch_add_relaxed	atomic_fetch_add_relaxed
-#define atomic_fetch_add_acquire	atomic_fetch_add_acquire
-#define atomic_fetch_add_release	atomic_fetch_add_release
-#define atomic_fetch_add		atomic_fetch_add
-
-#define atomic_fetch_sub_relaxed	atomic_fetch_sub_relaxed
-#define atomic_fetch_sub_acquire	atomic_fetch_sub_acquire
-#define atomic_fetch_sub_release	atomic_fetch_sub_release
-#define atomic_fetch_sub		atomic_fetch_sub
-
-#define atomic_fetch_and_relaxed	atomic_fetch_and_relaxed
-#define atomic_fetch_and_acquire	atomic_fetch_and_acquire
-#define atomic_fetch_and_release	atomic_fetch_and_release
-#define atomic_fetch_and		atomic_fetch_and
-
-#define atomic_fetch_andnot_relaxed	atomic_fetch_andnot_relaxed
-#define atomic_fetch_andnot_acquire	atomic_fetch_andnot_acquire
-#define atomic_fetch_andnot_release	atomic_fetch_andnot_release
-#define atomic_fetch_andnot		atomic_fetch_andnot
-
-#define atomic_fetch_or_relaxed		atomic_fetch_or_relaxed
-#define atomic_fetch_or_acquire		atomic_fetch_or_acquire
-#define atomic_fetch_or_release		atomic_fetch_or_release
-#define atomic_fetch_or			atomic_fetch_or
-
-#define atomic_fetch_xor_relaxed	atomic_fetch_xor_relaxed
-#define atomic_fetch_xor_acquire	atomic_fetch_xor_acquire
-#define atomic_fetch_xor_release	atomic_fetch_xor_release
-#define atomic_fetch_xor		atomic_fetch_xor
-
-#define atomic_xchg_relaxed(v, new)	xchg_relaxed(&((v)->counter), (new))
-#define atomic_xchg_acquire(v, new)	xchg_acquire(&((v)->counter), (new))
-#define atomic_xchg_release(v, new)	xchg_release(&((v)->counter), (new))
-#define atomic_xchg(v, new)		xchg(&((v)->counter), (new))
-
-#define atomic_cmpxchg_relaxed(v, old, new)				\
-	cmpxchg_relaxed(&((v)->counter), (old), (new))
-#define atomic_cmpxchg_acquire(v, old, new)				\
-	cmpxchg_acquire(&((v)->counter), (old), (new))
-#define atomic_cmpxchg_release(v, old, new)				\
-	cmpxchg_release(&((v)->counter), (old), (new))
-#define atomic_cmpxchg(v, old, new)	cmpxchg(&((v)->counter), (old), (new))
-
-#define atomic_andnot			atomic_andnot
+#define arch_atomic_read(v)			READ_ONCE((v)->counter)
+#define arch_atomic_set(v, i)			WRITE_ONCE(((v)->counter), (i))
+
+#define arch_atomic_add_return_relaxed		arch_atomic_add_return_relaxed
+#define arch_atomic_add_return_acquire		arch_atomic_add_return_acquire
+#define arch_atomic_add_return_release		arch_atomic_add_return_release
+#define arch_atomic_add_return			arch_atomic_add_return
+
+#define arch_atomic_sub_return_relaxed		arch_atomic_sub_return_relaxed
+#define arch_atomic_sub_return_acquire		arch_atomic_sub_return_acquire
+#define arch_atomic_sub_return_release		arch_atomic_sub_return_release
+#define arch_atomic_sub_return			arch_atomic_sub_return
+
+#define arch_atomic_fetch_add_relaxed		arch_atomic_fetch_add_relaxed
+#define arch_atomic_fetch_add_acquire		arch_atomic_fetch_add_acquire
+#define arch_atomic_fetch_add_release		arch_atomic_fetch_add_release
+#define arch_atomic_fetch_add			arch_atomic_fetch_add
+
+#define arch_atomic_fetch_sub_relaxed		arch_atomic_fetch_sub_relaxed
+#define arch_atomic_fetch_sub_acquire		arch_atomic_fetch_sub_acquire
+#define arch_atomic_fetch_sub_release		arch_atomic_fetch_sub_release
+#define arch_atomic_fetch_sub			arch_atomic_fetch_sub
+
+#define arch_atomic_fetch_and_relaxed		arch_atomic_fetch_and_relaxed
+#define arch_atomic_fetch_and_acquire		arch_atomic_fetch_and_acquire
+#define arch_atomic_fetch_and_release		arch_atomic_fetch_and_release
+#define arch_atomic_fetch_and			arch_atomic_fetch_and
+
+#define arch_atomic_fetch_andnot_relaxed	arch_atomic_fetch_andnot_relaxed
+#define arch_atomic_fetch_andnot_acquire	arch_atomic_fetch_andnot_acquire
+#define arch_atomic_fetch_andnot_release	arch_atomic_fetch_andnot_release
+#define arch_atomic_fetch_andnot		arch_atomic_fetch_andnot
+
+#define arch_atomic_fetch_or_relaxed		arch_atomic_fetch_or_relaxed
+#define arch_atomic_fetch_or_acquire		arch_atomic_fetch_or_acquire
+#define arch_atomic_fetch_or_release		arch_atomic_fetch_or_release
+#define arch_atomic_fetch_or			arch_atomic_fetch_or
+
+#define arch_atomic_fetch_xor_relaxed		arch_atomic_fetch_xor_relaxed
+#define arch_atomic_fetch_xor_acquire		arch_atomic_fetch_xor_acquire
+#define arch_atomic_fetch_xor_release		arch_atomic_fetch_xor_release
+#define arch_atomic_fetch_xor			arch_atomic_fetch_xor
+
+#define arch_atomic_xchg_relaxed(v, new) \
+	arch_xchg_relaxed(&((v)->counter), (new))
+#define arch_atomic_xchg_acquire(v, new) \
+	arch_xchg_acquire(&((v)->counter), (new))
+#define arch_atomic_xchg_release(v, new) \
+	arch_xchg_release(&((v)->counter), (new))
+#define arch_atomic_xchg(v, new) \
+	arch_xchg(&((v)->counter), (new))
+
+#define arch_atomic_cmpxchg_relaxed(v, old, new) \
+	arch_cmpxchg_relaxed(&((v)->counter), (old), (new))
+#define arch_atomic_cmpxchg_acquire(v, old, new) \
+	arch_cmpxchg_acquire(&((v)->counter), (old), (new))
+#define arch_atomic_cmpxchg_release(v, old, new) \
+	arch_cmpxchg_release(&((v)->counter), (old), (new))
+#define arch_atomic_cmpxchg(v, old, new) \
+	arch_cmpxchg(&((v)->counter), (old), (new))
+
+#define arch_atomic_andnot			arch_atomic_andnot
 
 /*
- * 64-bit atomic operations.
+ * 64-bit arch_atomic operations.
  */
-#define ATOMIC64_INIT			ATOMIC_INIT
-#define atomic64_read			atomic_read
-#define atomic64_set			atomic_set
-
-#define atomic64_add_return_relaxed	atomic64_add_return_relaxed
-#define atomic64_add_return_acquire	atomic64_add_return_acquire
-#define atomic64_add_return_release	atomic64_add_return_release
-#define atomic64_add_return		atomic64_add_return
-
-#define atomic64_sub_return_relaxed	atomic64_sub_return_relaxed
-#define atomic64_sub_return_acquire	atomic64_sub_return_acquire
-#define atomic64_sub_return_release	atomic64_sub_return_release
-#define atomic64_sub_return		atomic64_sub_return
-
-#define atomic64_fetch_add_relaxed	atomic64_fetch_add_relaxed
-#define atomic64_fetch_add_acquire	atomic64_fetch_add_acquire
-#define atomic64_fetch_add_release	atomic64_fetch_add_release
-#define atomic64_fetch_add		atomic64_fetch_add
-
-#define atomic64_fetch_sub_relaxed	atomic64_fetch_sub_relaxed
-#define atomic64_fetch_sub_acquire	atomic64_fetch_sub_acquire
-#define atomic64_fetch_sub_release	atomic64_fetch_sub_release
-#define atomic64_fetch_sub		atomic64_fetch_sub
-
-#define atomic64_fetch_and_relaxed	atomic64_fetch_and_relaxed
-#define atomic64_fetch_and_acquire	atomic64_fetch_and_acquire
-#define atomic64_fetch_and_release	atomic64_fetch_and_release
-#define atomic64_fetch_and		atomic64_fetch_and
-
-#define atomic64_fetch_andnot_relaxed	atomic64_fetch_andnot_relaxed
-#define atomic64_fetch_andnot_acquire	atomic64_fetch_andnot_acquire
-#define atomic64_fetch_andnot_release	atomic64_fetch_andnot_release
-#define atomic64_fetch_andnot		atomic64_fetch_andnot
-
-#define atomic64_fetch_or_relaxed	atomic64_fetch_or_relaxed
-#define atomic64_fetch_or_acquire	atomic64_fetch_or_acquire
-#define atomic64_fetch_or_release	atomic64_fetch_or_release
-#define atomic64_fetch_or		atomic64_fetch_or
-
-#define atomic64_fetch_xor_relaxed	atomic64_fetch_xor_relaxed
-#define atomic64_fetch_xor_acquire	atomic64_fetch_xor_acquire
-#define atomic64_fetch_xor_release	atomic64_fetch_xor_release
-#define atomic64_fetch_xor		atomic64_fetch_xor
-
-#define atomic64_xchg_relaxed		atomic_xchg_relaxed
-#define atomic64_xchg_acquire		atomic_xchg_acquire
-#define atomic64_xchg_release		atomic_xchg_release
-#define atomic64_xchg			atomic_xchg
-
-#define atomic64_cmpxchg_relaxed	atomic_cmpxchg_relaxed
-#define atomic64_cmpxchg_acquire	atomic_cmpxchg_acquire
-#define atomic64_cmpxchg_release	atomic_cmpxchg_release
-#define atomic64_cmpxchg		atomic_cmpxchg
-
-#define atomic64_andnot			atomic64_andnot
-
-#define atomic64_dec_if_positive	atomic64_dec_if_positive
+#define ATOMIC64_INIT				ATOMIC_INIT
+#define arch_atomic64_read			arch_atomic_read
+#define arch_atomic64_set			arch_atomic_set
+
+#define arch_atomic64_add_return_relaxed	arch_atomic64_add_return_relaxed
+#define arch_atomic64_add_return_acquire	arch_atomic64_add_return_acquire
+#define arch_atomic64_add_return_release	arch_atomic64_add_return_release
+#define arch_atomic64_add_return		arch_atomic64_add_return
+
+#define arch_atomic64_sub_return_relaxed	arch_atomic64_sub_return_relaxed
+#define arch_atomic64_sub_return_acquire	arch_atomic64_sub_return_acquire
+#define arch_atomic64_sub_return_release	arch_atomic64_sub_return_release
+#define arch_atomic64_sub_return		arch_atomic64_sub_return
+
+#define arch_atomic64_fetch_add_relaxed		arch_atomic64_fetch_add_relaxed
+#define arch_atomic64_fetch_add_acquire		arch_atomic64_fetch_add_acquire
+#define arch_atomic64_fetch_add_release		arch_atomic64_fetch_add_release
+#define arch_atomic64_fetch_add			arch_atomic64_fetch_add
+
+#define arch_atomic64_fetch_sub_relaxed		arch_atomic64_fetch_sub_relaxed
+#define arch_atomic64_fetch_sub_acquire		arch_atomic64_fetch_sub_acquire
+#define arch_atomic64_fetch_sub_release		arch_atomic64_fetch_sub_release
+#define arch_atomic64_fetch_sub			arch_atomic64_fetch_sub
+
+#define arch_atomic64_fetch_and_relaxed		arch_atomic64_fetch_and_relaxed
+#define arch_atomic64_fetch_and_acquire		arch_atomic64_fetch_and_acquire
+#define arch_atomic64_fetch_and_release		arch_atomic64_fetch_and_release
+#define arch_atomic64_fetch_and			arch_atomic64_fetch_and
+
+#define arch_atomic64_fetch_andnot_relaxed	arch_atomic64_fetch_andnot_relaxed
+#define arch_atomic64_fetch_andnot_acquire	arch_atomic64_fetch_andnot_acquire
+#define arch_atomic64_fetch_andnot_release	arch_atomic64_fetch_andnot_release
+#define arch_atomic64_fetch_andnot		arch_atomic64_fetch_andnot
+
+#define arch_atomic64_fetch_or_relaxed		arch_atomic64_fetch_or_relaxed
+#define arch_atomic64_fetch_or_acquire		arch_atomic64_fetch_or_acquire
+#define arch_atomic64_fetch_or_release		arch_atomic64_fetch_or_release
+#define arch_atomic64_fetch_or			arch_atomic64_fetch_or
+
+#define arch_atomic64_fetch_xor_relaxed		arch_atomic64_fetch_xor_relaxed
+#define arch_atomic64_fetch_xor_acquire		arch_atomic64_fetch_xor_acquire
+#define arch_atomic64_fetch_xor_release		arch_atomic64_fetch_xor_release
+#define arch_atomic64_fetch_xor			arch_atomic64_fetch_xor
+
+#define arch_atomic64_xchg_relaxed		arch_atomic_xchg_relaxed
+#define arch_atomic64_xchg_acquire		arch_atomic_xchg_acquire
+#define arch_atomic64_xchg_release		arch_atomic_xchg_release
+#define arch_atomic64_xchg			arch_atomic_xchg
+
+#define arch_atomic64_cmpxchg_relaxed		arch_atomic_cmpxchg_relaxed
+#define arch_atomic64_cmpxchg_acquire		arch_atomic_cmpxchg_acquire
+#define arch_atomic64_cmpxchg_release		arch_atomic_cmpxchg_release
+#define arch_atomic64_cmpxchg			arch_atomic_cmpxchg
+
+#define arch_atomic64_andnot			arch_atomic64_andnot
+
+#define arch_atomic64_dec_if_positive		arch_atomic64_dec_if_positive
+
+#include <asm-generic/atomic-instrumented.h>
 
 #endif
 #endif
diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h
index af7b99005453..e321293e0c89 100644
--- a/arch/arm64/include/asm/atomic_ll_sc.h
+++ b/arch/arm64/include/asm/atomic_ll_sc.h
@@ -39,7 +39,7 @@
 
 #define ATOMIC_OP(op, asm_op)						\
 __LL_SC_INLINE void							\
-__LL_SC_PREFIX(atomic_##op(int i, atomic_t *v))				\
+__LL_SC_PREFIX(arch_atomic_##op(int i, atomic_t *v))			\
 {									\
 	unsigned long tmp;						\
 	int result;							\
@@ -53,11 +53,11 @@ __LL_SC_PREFIX(atomic_##op(int i, atomic_t *v))				\
 	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
 	: "Ir" (i));							\
 }									\
-__LL_SC_EXPORT(atomic_##op);
+__LL_SC_EXPORT(arch_atomic_##op);
 
 #define ATOMIC_OP_RETURN(name, mb, acq, rel, cl, op, asm_op)		\
 __LL_SC_INLINE int							\
-__LL_SC_PREFIX(atomic_##op##_return##name(int i, atomic_t *v))		\
+__LL_SC_PREFIX(arch_atomic_##op##_return##name(int i, atomic_t *v))	\
 {									\
 	unsigned long tmp;						\
 	int result;							\
@@ -75,11 +75,11 @@ __LL_SC_PREFIX(atomic_##op##_return##name(int i, atomic_t *v))		\
 									\
 	return result;							\
 }									\
-__LL_SC_EXPORT(atomic_##op##_return##name);
+__LL_SC_EXPORT(arch_atomic_##op##_return##name);
 
 #define ATOMIC_FETCH_OP(name, mb, acq, rel, cl, op, asm_op)		\
 __LL_SC_INLINE int							\
-__LL_SC_PREFIX(atomic_fetch_##op##name(int i, atomic_t *v))		\
+__LL_SC_PREFIX(arch_atomic_fetch_##op##name(int i, atomic_t *v))	\
 {									\
 	unsigned long tmp;						\
 	int val, result;						\
@@ -97,7 +97,7 @@ __LL_SC_PREFIX(atomic_fetch_##op##name(int i, atomic_t *v))		\
 									\
 	return result;							\
 }									\
-__LL_SC_EXPORT(atomic_fetch_##op##name);
+__LL_SC_EXPORT(arch_atomic_fetch_##op##name);
 
 #define ATOMIC_OPS(...)							\
 	ATOMIC_OP(__VA_ARGS__)						\
@@ -133,7 +133,7 @@ ATOMIC_OPS(xor, eor)
 
 #define ATOMIC64_OP(op, asm_op)						\
 __LL_SC_INLINE void							\
-__LL_SC_PREFIX(atomic64_##op(long i, atomic64_t *v))			\
+__LL_SC_PREFIX(arch_atomic64_##op(long i, atomic64_t *v))		\
 {									\
 	long result;							\
 	unsigned long tmp;						\
@@ -147,11 +147,11 @@ __LL_SC_PREFIX(atomic64_##op(long i, atomic64_t *v))			\
 	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
 	: "Ir" (i));							\
 }									\
-__LL_SC_EXPORT(atomic64_##op);
+__LL_SC_EXPORT(arch_atomic64_##op);
 
 #define ATOMIC64_OP_RETURN(name, mb, acq, rel, cl, op, asm_op)		\
 __LL_SC_INLINE long							\
-__LL_SC_PREFIX(atomic64_##op##_return##name(long i, atomic64_t *v))	\
+__LL_SC_PREFIX(arch_atomic64_##op##_return##name(long i, atomic64_t *v))\
 {									\
 	long result;							\
 	unsigned long tmp;						\
@@ -169,11 +169,11 @@ __LL_SC_PREFIX(atomic64_##op##_return##name(long i, atomic64_t *v))	\
 									\
 	return result;							\
 }									\
-__LL_SC_EXPORT(atomic64_##op##_return##name);
+__LL_SC_EXPORT(arch_atomic64_##op##_return##name);
 
 #define ATOMIC64_FETCH_OP(name, mb, acq, rel, cl, op, asm_op)		\
 __LL_SC_INLINE long							\
-__LL_SC_PREFIX(atomic64_fetch_##op##name(long i, atomic64_t *v))	\
+__LL_SC_PREFIX(arch_atomic64_fetch_##op##name(long i, atomic64_t *v))	\
 {									\
 	long result, val;						\
 	unsigned long tmp;						\
@@ -191,7 +191,7 @@ __LL_SC_PREFIX(atomic64_fetch_##op##name(long i, atomic64_t *v))	\
 									\
 	return result;							\
 }									\
-__LL_SC_EXPORT(atomic64_fetch_##op##name);
+__LL_SC_EXPORT(arch_atomic64_fetch_##op##name);
 
 #define ATOMIC64_OPS(...)						\
 	ATOMIC64_OP(__VA_ARGS__)					\
@@ -226,7 +226,7 @@ ATOMIC64_OPS(xor, eor)
 #undef ATOMIC64_OP
 
 __LL_SC_INLINE long
-__LL_SC_PREFIX(atomic64_dec_if_positive(atomic64_t *v))
+__LL_SC_PREFIX(arch_atomic64_dec_if_positive(atomic64_t *v))
 {
 	long result;
 	unsigned long tmp;
@@ -246,7 +246,7 @@ __LL_SC_PREFIX(atomic64_dec_if_positive(atomic64_t *v))
 
 	return result;
 }
-__LL_SC_EXPORT(atomic64_dec_if_positive);
+__LL_SC_EXPORT(arch_atomic64_dec_if_positive);
 
 #define __CMPXCHG_CASE(w, sfx, name, sz, mb, acq, rel, cl)		\
 __LL_SC_INLINE u##sz							\
diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index a424355240c5..9256a3921e4b 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -25,9 +25,9 @@
 #error "please don't include this file directly"
 #endif
 
-#define __LL_SC_ATOMIC(op)	__LL_SC_CALL(atomic_##op)
+#define __LL_SC_ATOMIC(op)	__LL_SC_CALL(arch_atomic_##op)
 #define ATOMIC_OP(op, asm_op)						\
-static inline void atomic_##op(int i, atomic_t *v)			\
+static inline void arch_atomic_##op(int i, atomic_t *v)			\
 {									\
 	register int w0 asm ("w0") = i;					\
 	register atomic_t *x1 asm ("x1") = v;				\
@@ -47,7 +47,7 @@ ATOMIC_OP(add, stadd)
 #undef ATOMIC_OP
 
 #define ATOMIC_FETCH_OP(name, mb, op, asm_op, cl...)			\
-static inline int atomic_fetch_##op##name(int i, atomic_t *v)		\
+static inline int arch_atomic_fetch_##op##name(int i, atomic_t *v)	\
 {									\
 	register int w0 asm ("w0") = i;					\
 	register atomic_t *x1 asm ("x1") = v;				\
@@ -79,7 +79,7 @@ ATOMIC_FETCH_OPS(add, ldadd)
 #undef ATOMIC_FETCH_OPS
 
 #define ATOMIC_OP_ADD_RETURN(name, mb, cl...)				\
-static inline int atomic_add_return##name(int i, atomic_t *v)		\
+static inline int arch_atomic_add_return##name(int i, atomic_t *v)	\
 {									\
 	register int w0 asm ("w0") = i;					\
 	register atomic_t *x1 asm ("x1") = v;				\
@@ -105,7 +105,7 @@ ATOMIC_OP_ADD_RETURN(        , al, "memory")
 
 #undef ATOMIC_OP_ADD_RETURN
 
-static inline void atomic_and(int i, atomic_t *v)
+static inline void arch_atomic_and(int i, atomic_t *v)
 {
 	register int w0 asm ("w0") = i;
 	register atomic_t *x1 asm ("x1") = v;
@@ -123,7 +123,7 @@ static inline void atomic_and(int i, atomic_t *v)
 }
 
 #define ATOMIC_FETCH_OP_AND(name, mb, cl...)				\
-static inline int atomic_fetch_and##name(int i, atomic_t *v)		\
+static inline int arch_atomic_fetch_and##name(int i, atomic_t *v)	\
 {									\
 	register int w0 asm ("w0") = i;					\
 	register atomic_t *x1 asm ("x1") = v;				\
@@ -149,7 +149,7 @@ ATOMIC_FETCH_OP_AND(        , al, "memory")
 
 #undef ATOMIC_FETCH_OP_AND
 
-static inline void atomic_sub(int i, atomic_t *v)
+static inline void arch_atomic_sub(int i, atomic_t *v)
 {
 	register int w0 asm ("w0") = i;
 	register atomic_t *x1 asm ("x1") = v;
@@ -167,7 +167,7 @@ static inline void atomic_sub(int i, atomic_t *v)
 }
 
 #define ATOMIC_OP_SUB_RETURN(name, mb, cl...)				\
-static inline int atomic_sub_return##name(int i, atomic_t *v)		\
+static inline int arch_atomic_sub_return##name(int i, atomic_t *v)	\
 {									\
 	register int w0 asm ("w0") = i;					\
 	register atomic_t *x1 asm ("x1") = v;				\
@@ -195,7 +195,7 @@ ATOMIC_OP_SUB_RETURN(        , al, "memory")
 #undef ATOMIC_OP_SUB_RETURN
 
 #define ATOMIC_FETCH_OP_SUB(name, mb, cl...)				\
-static inline int atomic_fetch_sub##name(int i, atomic_t *v)		\
+static inline int arch_atomic_fetch_sub##name(int i, atomic_t *v)	\
 {									\
 	register int w0 asm ("w0") = i;					\
 	register atomic_t *x1 asm ("x1") = v;				\
@@ -222,9 +222,9 @@ ATOMIC_FETCH_OP_SUB(        , al, "memory")
 #undef ATOMIC_FETCH_OP_SUB
 #undef __LL_SC_ATOMIC
 
-#define __LL_SC_ATOMIC64(op)	__LL_SC_CALL(atomic64_##op)
+#define __LL_SC_ATOMIC64(op)	__LL_SC_CALL(arch_atomic64_##op)
 #define ATOMIC64_OP(op, asm_op)						\
-static inline void atomic64_##op(long i, atomic64_t *v)			\
+static inline void arch_atomic64_##op(long i, atomic64_t *v)		\
 {									\
 	register long x0 asm ("x0") = i;				\
 	register atomic64_t *x1 asm ("x1") = v;				\
@@ -244,7 +244,7 @@ ATOMIC64_OP(add, stadd)
 #undef ATOMIC64_OP
 
 #define ATOMIC64_FETCH_OP(name, mb, op, asm_op, cl...)			\
-static inline long atomic64_fetch_##op##name(long i, atomic64_t *v)	\
+static inline long arch_atomic64_fetch_##op##name(long i, atomic64_t *v)\
 {									\
 	register long x0 asm ("x0") = i;				\
 	register atomic64_t *x1 asm ("x1") = v;				\
@@ -276,7 +276,7 @@ ATOMIC64_FETCH_OPS(add, ldadd)
 #undef ATOMIC64_FETCH_OPS
 
 #define ATOMIC64_OP_ADD_RETURN(name, mb, cl...)				\
-static inline long atomic64_add_return##name(long i, atomic64_t *v)	\
+static inline long arch_atomic64_add_return##name(long i, atomic64_t *v)\
 {									\
 	register long x0 asm ("x0") = i;				\
 	register atomic64_t *x1 asm ("x1") = v;				\
@@ -302,7 +302,7 @@ ATOMIC64_OP_ADD_RETURN(        , al, "memory")
 
 #undef ATOMIC64_OP_ADD_RETURN
 
-static inline void atomic64_and(long i, atomic64_t *v)
+static inline void arch_atomic64_and(long i, atomic64_t *v)
 {
 	register long x0 asm ("x0") = i;
 	register atomic64_t *x1 asm ("x1") = v;
@@ -320,7 +320,7 @@ static inline void atomic64_and(long i, atomic64_t *v)
 }
 
 #define ATOMIC64_FETCH_OP_AND(name, mb, cl...)				\
-static inline long atomic64_fetch_and##name(long i, atomic64_t *v)	\
+static inline long arch_atomic64_fetch_and##name(long i, atomic64_t *v)	\
 {									\
 	register long x0 asm ("x0") = i;				\
 	register atomic64_t *x1 asm ("x1") = v;				\
@@ -346,7 +346,7 @@ ATOMIC64_FETCH_OP_AND(        , al, "memory")
 
 #undef ATOMIC64_FETCH_OP_AND
 
-static inline void atomic64_sub(long i, atomic64_t *v)
+static inline void arch_atomic64_sub(long i, atomic64_t *v)
 {
 	register long x0 asm ("x0") = i;
 	register atomic64_t *x1 asm ("x1") = v;
@@ -364,7 +364,7 @@ static inline void atomic64_sub(long i, atomic64_t *v)
 }
 
 #define ATOMIC64_OP_SUB_RETURN(name, mb, cl...)				\
-static inline long atomic64_sub_return##name(long i, atomic64_t *v)	\
+static inline long arch_atomic64_sub_return##name(long i, atomic64_t *v)\
 {									\
 	register long x0 asm ("x0") = i;				\
 	register atomic64_t *x1 asm ("x1") = v;				\
@@ -392,7 +392,7 @@ ATOMIC64_OP_SUB_RETURN(        , al, "memory")
 #undef ATOMIC64_OP_SUB_RETURN
 
 #define ATOMIC64_FETCH_OP_SUB(name, mb, cl...)				\
-static inline long atomic64_fetch_sub##name(long i, atomic64_t *v)	\
+static inline long arch_atomic64_fetch_sub##name(long i, atomic64_t *v)	\
 {									\
 	register long x0 asm ("x0") = i;				\
 	register atomic64_t *x1 asm ("x1") = v;				\
@@ -418,7 +418,7 @@ ATOMIC64_FETCH_OP_SUB(        , al, "memory")
 
 #undef ATOMIC64_FETCH_OP_SUB
 
-static inline long atomic64_dec_if_positive(atomic64_t *v)
+static inline long arch_atomic64_dec_if_positive(atomic64_t *v)
 {
 	register long x0 asm ("x0") = (long)v;
 
diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
index 3f9376f1c409..e6ea0f42e097 100644
--- a/arch/arm64/include/asm/cmpxchg.h
+++ b/arch/arm64/include/asm/cmpxchg.h
@@ -110,10 +110,10 @@ __XCHG_GEN(_mb)
 })
 
 /* xchg */
-#define xchg_relaxed(...)	__xchg_wrapper(    , __VA_ARGS__)
-#define xchg_acquire(...)	__xchg_wrapper(_acq, __VA_ARGS__)
-#define xchg_release(...)	__xchg_wrapper(_rel, __VA_ARGS__)
-#define xchg(...)		__xchg_wrapper( _mb, __VA_ARGS__)
+#define arch_xchg_relaxed(...)	__xchg_wrapper(    , __VA_ARGS__)
+#define arch_xchg_acquire(...)	__xchg_wrapper(_acq, __VA_ARGS__)
+#define arch_xchg_release(...)	__xchg_wrapper(_rel, __VA_ARGS__)
+#define arch_xchg(...)		__xchg_wrapper( _mb, __VA_ARGS__)
 
 #define __CMPXCHG_GEN(sfx)						\
 static inline unsigned long __cmpxchg##sfx(volatile void *ptr,		\
@@ -154,18 +154,18 @@ __CMPXCHG_GEN(_mb)
 })
 
 /* cmpxchg */
-#define cmpxchg_relaxed(...)	__cmpxchg_wrapper(    , __VA_ARGS__)
-#define cmpxchg_acquire(...)	__cmpxchg_wrapper(_acq, __VA_ARGS__)
-#define cmpxchg_release(...)	__cmpxchg_wrapper(_rel, __VA_ARGS__)
-#define cmpxchg(...)		__cmpxchg_wrapper( _mb, __VA_ARGS__)
-#define cmpxchg_local		cmpxchg_relaxed
+#define arch_cmpxchg_relaxed(...)	__cmpxchg_wrapper(    , __VA_ARGS__)
+#define arch_cmpxchg_acquire(...)	__cmpxchg_wrapper(_acq, __VA_ARGS__)
+#define arch_cmpxchg_release(...)	__cmpxchg_wrapper(_rel, __VA_ARGS__)
+#define arch_cmpxchg(...)		__cmpxchg_wrapper( _mb, __VA_ARGS__)
+#define arch_cmpxchg_local		arch_cmpxchg_relaxed
 
 /* cmpxchg64 */
-#define cmpxchg64_relaxed	cmpxchg_relaxed
-#define cmpxchg64_acquire	cmpxchg_acquire
-#define cmpxchg64_release	cmpxchg_release
-#define cmpxchg64		cmpxchg
-#define cmpxchg64_local		cmpxchg_local
+#define arch_cmpxchg64_relaxed		arch_cmpxchg_relaxed
+#define arch_cmpxchg64_acquire		arch_cmpxchg_acquire
+#define arch_cmpxchg64_release		arch_cmpxchg_release
+#define arch_cmpxchg64			arch_cmpxchg
+#define arch_cmpxchg64_local		arch_cmpxchg_local
 
 /* cmpxchg_double */
 #define system_has_cmpxchg_double()     1
@@ -177,24 +177,24 @@ __CMPXCHG_GEN(_mb)
 	VM_BUG_ON((unsigned long *)(ptr2) - (unsigned long *)(ptr1) != 1);	\
 })
 
-#define cmpxchg_double(ptr1, ptr2, o1, o2, n1, n2) \
-({\
-	int __ret;\
-	__cmpxchg_double_check(ptr1, ptr2); \
-	__ret = !__cmpxchg_double_mb((unsigned long)(o1), (unsigned long)(o2), \
-				     (unsigned long)(n1), (unsigned long)(n2), \
-				     ptr1); \
-	__ret; \
+#define arch_cmpxchg_double(ptr1, ptr2, o1, o2, n1, n2)				\
+({										\
+	int __ret;								\
+	__cmpxchg_double_check(ptr1, ptr2);					\
+	__ret = !__cmpxchg_double_mb((unsigned long)(o1), (unsigned long)(o2),	\
+				     (unsigned long)(n1), (unsigned long)(n2),	\
+				     ptr1);					\
+	__ret;									\
 })
 
-#define cmpxchg_double_local(ptr1, ptr2, o1, o2, n1, n2) \
-({\
-	int __ret;\
-	__cmpxchg_double_check(ptr1, ptr2); \
-	__ret = !__cmpxchg_double((unsigned long)(o1), (unsigned long)(o2), \
-				  (unsigned long)(n1), (unsigned long)(n2), \
-				  ptr1); \
-	__ret; \
+#define arch_cmpxchg_double_local(ptr1, ptr2, o1, o2, n1, n2)			\
+({										\
+	int __ret;								\
+	__cmpxchg_double_check(ptr1, ptr2);					\
+	__ret = !__cmpxchg_double((unsigned long)(o1), (unsigned long)(o2),	\
+				  (unsigned long)(n1), (unsigned long)(n2),	\
+				  ptr1);					\
+	__ret;									\
 })
 
 #define __CMPWAIT_CASE(w, sfx, sz)					\
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 7732d0ba4e60..da3fc7324d68 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -48,6 +48,7 @@
 #define KVM_REQ_SLEEP \
 	KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_IRQ_PENDING	KVM_ARCH_REQ(1)
+#define KVM_REQ_VCPU_RESET	KVM_ARCH_REQ(2)
 
 DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
 
@@ -208,6 +209,13 @@ struct kvm_cpu_context {
 
 typedef struct kvm_cpu_context kvm_cpu_context_t;
 
+struct vcpu_reset_state {
+	unsigned long	pc;
+	unsigned long	r0;
+	bool		be;
+	bool		reset;
+};
+
 struct kvm_vcpu_arch {
 	struct kvm_cpu_context ctxt;
 
@@ -297,6 +305,9 @@ struct kvm_vcpu_arch {
 	/* Virtual SError ESR to restore when HCR_EL2.VSE is set */
 	u64 vsesr_el2;
 
+	/* Additional reset state */
+	struct vcpu_reset_state	reset_state;
+
 	/* True when deferrable sysregs are loaded on the physical CPU,
 	 * see kvm_vcpu_load_sysregs and kvm_vcpu_put_sysregs. */
 	bool sysregs_loaded_on_cpu;
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index e1ec947e7c0c..0c656850eeea 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -332,6 +332,17 @@ static inline void *phys_to_virt(phys_addr_t x)
 #define virt_addr_valid(kaddr)		\
 	(_virt_addr_is_linear(kaddr) && _virt_addr_valid(kaddr))
 
+/*
+ * Given that the GIC architecture permits ITS implementations that can only be
+ * configured with a LPI table address once, GICv3 systems with many CPUs may
+ * end up reserving a lot of different regions after a kexec for their LPI
+ * tables (one per CPU), as we are forced to reuse the same memory after kexec
+ * (and thus reserve it persistently with EFI beforehand)
+ */
+#if defined(CONFIG_EFI) && defined(CONFIG_ARM_GIC_V3_ITS)
+# define INIT_MEMBLOCK_RESERVED_REGIONS	(INIT_MEMBLOCK_REGIONS + NR_CPUS + 1)
+#endif
+
 #include <asm-generic/memory_model.h>
 
 #endif
diff --git a/arch/arm64/include/asm/neon-intrinsics.h b/arch/arm64/include/asm/neon-intrinsics.h
index 2ba6c6b9541f..71abfc7612b2 100644
--- a/arch/arm64/include/asm/neon-intrinsics.h
+++ b/arch/arm64/include/asm/neon-intrinsics.h
@@ -36,4 +36,8 @@
 #include <arm_neon.h>
 #endif
 
+#ifdef CONFIG_CC_IS_CLANG
+#pragma clang diagnostic ignored "-Wincompatible-pointer-types"
+#endif
+
 #endif /* __ASM_NEON_INTRINSICS_H */
diff --git a/arch/arm64/include/asm/sync_bitops.h b/arch/arm64/include/asm/sync_bitops.h
index eee31a9f72a5..e9c1a02c2154 100644
--- a/arch/arm64/include/asm/sync_bitops.h
+++ b/arch/arm64/include/asm/sync_bitops.h
@@ -15,13 +15,13 @@
  * ops which are SMP safe even on a UP kernel.
  */
 
-#define sync_set_bit(nr, p)            set_bit(nr, p)
-#define sync_clear_bit(nr, p)          clear_bit(nr, p)
-#define sync_change_bit(nr, p)         change_bit(nr, p)
-#define sync_test_and_set_bit(nr, p)   test_and_set_bit(nr, p)
-#define sync_test_and_clear_bit(nr, p) test_and_clear_bit(nr, p)
-#define sync_test_and_change_bit(nr, p)        test_and_change_bit(nr, p)
-#define sync_test_bit(nr, addr)                test_bit(nr, addr)
-#define sync_cmpxchg                   cmpxchg
+#define sync_set_bit(nr, p)			set_bit(nr, p)
+#define sync_clear_bit(nr, p)			clear_bit(nr, p)
+#define sync_change_bit(nr, p)			change_bit(nr, p)
+#define sync_test_and_set_bit(nr, p)		test_and_set_bit(nr, p)
+#define sync_test_and_clear_bit(nr, p)		test_and_clear_bit(nr, p)
+#define sync_test_and_change_bit(nr, p)		test_and_change_bit(nr, p)
+#define sync_test_bit(nr, addr)			test_bit(nr, addr)
+#define arch_sync_cmpxchg			arch_cmpxchg
 
 #endif
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 547d7a0c9d05..f1e5c9165809 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -34,7 +34,6 @@
 #include <asm/memory.h>
 #include <asm/extable.h>
 
-#define get_ds()	(KERNEL_DS)
 #define get_fs()	(current_thread_info()->addr_limit)
 
 static inline void set_fs(mm_segment_t fs)
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index a7b1fc58ffdf..d1dd93436e1e 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -44,7 +44,7 @@
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls		400
+#define __NR_compat_syscalls		424
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 04ee190b90fe..5590f2623690 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -270,7 +270,7 @@ __SYSCALL(__NR_uname, sys_newuname)
 			/* 123 was sys_modify_ldt */
 __SYSCALL(123, sys_ni_syscall)
 #define __NR_adjtimex 124
-__SYSCALL(__NR_adjtimex, compat_sys_adjtimex)
+__SYSCALL(__NR_adjtimex, sys_adjtimex_time32)
 #define __NR_mprotect 125
 __SYSCALL(__NR_mprotect, sys_mprotect)
 #define __NR_sigprocmask 126
@@ -344,9 +344,9 @@ __SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max)
 #define __NR_sched_get_priority_min 160
 __SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min)
 #define __NR_sched_rr_get_interval 161
-__SYSCALL(__NR_sched_rr_get_interval, compat_sys_sched_rr_get_interval)
+__SYSCALL(__NR_sched_rr_get_interval, sys_sched_rr_get_interval_time32)
 #define __NR_nanosleep 162
-__SYSCALL(__NR_nanosleep, compat_sys_nanosleep)
+__SYSCALL(__NR_nanosleep, sys_nanosleep_time32)
 #define __NR_mremap 163
 __SYSCALL(__NR_mremap, sys_mremap)
 #define __NR_setresuid 164
@@ -376,7 +376,7 @@ __SYSCALL(__NR_rt_sigprocmask, compat_sys_rt_sigprocmask)
 #define __NR_rt_sigpending 176
 __SYSCALL(__NR_rt_sigpending, compat_sys_rt_sigpending)
 #define __NR_rt_sigtimedwait 177
-__SYSCALL(__NR_rt_sigtimedwait, compat_sys_rt_sigtimedwait)
+__SYSCALL(__NR_rt_sigtimedwait, compat_sys_rt_sigtimedwait_time32)
 #define __NR_rt_sigqueueinfo 178
 __SYSCALL(__NR_rt_sigqueueinfo, compat_sys_rt_sigqueueinfo)
 #define __NR_rt_sigsuspend 179
@@ -502,7 +502,7 @@ __SYSCALL(__NR_tkill, sys_tkill)
 #define __NR_sendfile64 239
 __SYSCALL(__NR_sendfile64, sys_sendfile64)
 #define __NR_futex 240
-__SYSCALL(__NR_futex, compat_sys_futex)
+__SYSCALL(__NR_futex, sys_futex_time32)
 #define __NR_sched_setaffinity 241
 __SYSCALL(__NR_sched_setaffinity, compat_sys_sched_setaffinity)
 #define __NR_sched_getaffinity 242
@@ -512,7 +512,7 @@ __SYSCALL(__NR_io_setup, compat_sys_io_setup)
 #define __NR_io_destroy 244
 __SYSCALL(__NR_io_destroy, sys_io_destroy)
 #define __NR_io_getevents 245
-__SYSCALL(__NR_io_getevents, compat_sys_io_getevents)
+__SYSCALL(__NR_io_getevents, sys_io_getevents_time32)
 #define __NR_io_submit 246
 __SYSCALL(__NR_io_submit, compat_sys_io_submit)
 #define __NR_io_cancel 247
@@ -538,21 +538,21 @@ __SYSCALL(__NR_set_tid_address, sys_set_tid_address)
 #define __NR_timer_create 257
 __SYSCALL(__NR_timer_create, compat_sys_timer_create)
 #define __NR_timer_settime 258
-__SYSCALL(__NR_timer_settime, compat_sys_timer_settime)
+__SYSCALL(__NR_timer_settime, sys_timer_settime32)
 #define __NR_timer_gettime 259
-__SYSCALL(__NR_timer_gettime, compat_sys_timer_gettime)
+__SYSCALL(__NR_timer_gettime, sys_timer_gettime32)
 #define __NR_timer_getoverrun 260
 __SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun)
 #define __NR_timer_delete 261
 __SYSCALL(__NR_timer_delete, sys_timer_delete)
 #define __NR_clock_settime 262
-__SYSCALL(__NR_clock_settime, compat_sys_clock_settime)
+__SYSCALL(__NR_clock_settime, sys_clock_settime32)
 #define __NR_clock_gettime 263
-__SYSCALL(__NR_clock_gettime, compat_sys_clock_gettime)
+__SYSCALL(__NR_clock_gettime, sys_clock_gettime32)
 #define __NR_clock_getres 264
-__SYSCALL(__NR_clock_getres, compat_sys_clock_getres)
+__SYSCALL(__NR_clock_getres, sys_clock_getres_time32)
 #define __NR_clock_nanosleep 265
-__SYSCALL(__NR_clock_nanosleep, compat_sys_clock_nanosleep)
+__SYSCALL(__NR_clock_nanosleep, sys_clock_nanosleep_time32)
 #define __NR_statfs64 266
 __SYSCALL(__NR_statfs64, compat_sys_aarch32_statfs64)
 #define __NR_fstatfs64 267
@@ -560,7 +560,7 @@ __SYSCALL(__NR_fstatfs64, compat_sys_aarch32_fstatfs64)
 #define __NR_tgkill 268
 __SYSCALL(__NR_tgkill, sys_tgkill)
 #define __NR_utimes 269
-__SYSCALL(__NR_utimes, compat_sys_utimes)
+__SYSCALL(__NR_utimes, sys_utimes_time32)
 #define __NR_arm_fadvise64_64 270
 __SYSCALL(__NR_arm_fadvise64_64, compat_sys_aarch32_fadvise64_64)
 #define __NR_pciconfig_iobase 271
@@ -574,9 +574,9 @@ __SYSCALL(__NR_mq_open, compat_sys_mq_open)
 #define __NR_mq_unlink 275
 __SYSCALL(__NR_mq_unlink, sys_mq_unlink)
 #define __NR_mq_timedsend 276
-__SYSCALL(__NR_mq_timedsend, compat_sys_mq_timedsend)
+__SYSCALL(__NR_mq_timedsend, sys_mq_timedsend_time32)
 #define __NR_mq_timedreceive 277
-__SYSCALL(__NR_mq_timedreceive, compat_sys_mq_timedreceive)
+__SYSCALL(__NR_mq_timedreceive, sys_mq_timedreceive_time32)
 #define __NR_mq_notify 278
 __SYSCALL(__NR_mq_notify, compat_sys_mq_notify)
 #define __NR_mq_getsetattr 279
@@ -622,7 +622,7 @@ __SYSCALL(__NR_semop, sys_semop)
 #define __NR_semget 299
 __SYSCALL(__NR_semget, sys_semget)
 #define __NR_semctl 300
-__SYSCALL(__NR_semctl, compat_sys_semctl)
+__SYSCALL(__NR_semctl, compat_sys_old_semctl)
 #define __NR_msgsnd 301
 __SYSCALL(__NR_msgsnd, compat_sys_msgsnd)
 #define __NR_msgrcv 302
@@ -630,7 +630,7 @@ __SYSCALL(__NR_msgrcv, compat_sys_msgrcv)
 #define __NR_msgget 303
 __SYSCALL(__NR_msgget, sys_msgget)
 #define __NR_msgctl 304
-__SYSCALL(__NR_msgctl, compat_sys_msgctl)
+__SYSCALL(__NR_msgctl, compat_sys_old_msgctl)
 #define __NR_shmat 305
 __SYSCALL(__NR_shmat, compat_sys_shmat)
 #define __NR_shmdt 306
@@ -638,7 +638,7 @@ __SYSCALL(__NR_shmdt, sys_shmdt)
 #define __NR_shmget 307
 __SYSCALL(__NR_shmget, sys_shmget)
 #define __NR_shmctl 308
-__SYSCALL(__NR_shmctl, compat_sys_shmctl)
+__SYSCALL(__NR_shmctl, compat_sys_old_shmctl)
 #define __NR_add_key 309
 __SYSCALL(__NR_add_key, sys_add_key)
 #define __NR_request_key 310
@@ -646,7 +646,7 @@ __SYSCALL(__NR_request_key, sys_request_key)
 #define __NR_keyctl 311
 __SYSCALL(__NR_keyctl, compat_sys_keyctl)
 #define __NR_semtimedop 312
-__SYSCALL(__NR_semtimedop, compat_sys_semtimedop)
+__SYSCALL(__NR_semtimedop, sys_semtimedop_time32)
 #define __NR_vserver 313
 __SYSCALL(__NR_vserver, sys_ni_syscall)
 #define __NR_ioprio_set 314
@@ -674,7 +674,7 @@ __SYSCALL(__NR_mknodat, sys_mknodat)
 #define __NR_fchownat 325
 __SYSCALL(__NR_fchownat, sys_fchownat)
 #define __NR_futimesat 326
-__SYSCALL(__NR_futimesat, compat_sys_futimesat)
+__SYSCALL(__NR_futimesat, sys_futimesat_time32)
 #define __NR_fstatat64 327
 __SYSCALL(__NR_fstatat64, sys_fstatat64)
 #define __NR_unlinkat 328
@@ -692,9 +692,9 @@ __SYSCALL(__NR_fchmodat, sys_fchmodat)
 #define __NR_faccessat 334
 __SYSCALL(__NR_faccessat, sys_faccessat)
 #define __NR_pselect6 335
-__SYSCALL(__NR_pselect6, compat_sys_pselect6)
+__SYSCALL(__NR_pselect6, compat_sys_pselect6_time32)
 #define __NR_ppoll 336
-__SYSCALL(__NR_ppoll, compat_sys_ppoll)
+__SYSCALL(__NR_ppoll, compat_sys_ppoll_time32)
 #define __NR_unshare 337
 __SYSCALL(__NR_unshare, sys_unshare)
 #define __NR_set_robust_list 338
@@ -718,7 +718,7 @@ __SYSCALL(__NR_epoll_pwait, compat_sys_epoll_pwait)
 #define __NR_kexec_load 347
 __SYSCALL(__NR_kexec_load, compat_sys_kexec_load)
 #define __NR_utimensat 348
-__SYSCALL(__NR_utimensat, compat_sys_utimensat)
+__SYSCALL(__NR_utimensat, sys_utimensat_time32)
 #define __NR_signalfd 349
 __SYSCALL(__NR_signalfd, compat_sys_signalfd)
 #define __NR_timerfd_create 350
@@ -728,9 +728,9 @@ __SYSCALL(__NR_eventfd, sys_eventfd)
 #define __NR_fallocate 352
 __SYSCALL(__NR_fallocate, compat_sys_aarch32_fallocate)
 #define __NR_timerfd_settime 353
-__SYSCALL(__NR_timerfd_settime, compat_sys_timerfd_settime)
+__SYSCALL(__NR_timerfd_settime, sys_timerfd_settime32)
 #define __NR_timerfd_gettime 354
-__SYSCALL(__NR_timerfd_gettime, compat_sys_timerfd_gettime)
+__SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime32)
 #define __NR_signalfd4 355
 __SYSCALL(__NR_signalfd4, compat_sys_signalfd4)
 #define __NR_eventfd2 356
@@ -752,7 +752,7 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, compat_sys_rt_tgsigqueueinfo)
 #define __NR_perf_event_open 364
 __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
 #define __NR_recvmmsg 365
-__SYSCALL(__NR_recvmmsg, compat_sys_recvmmsg)
+__SYSCALL(__NR_recvmmsg, compat_sys_recvmmsg_time32)
 #define __NR_accept4 366
 __SYSCALL(__NR_accept4, sys_accept4)
 #define __NR_fanotify_init 367
@@ -766,7 +766,7 @@ __SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
 #define __NR_open_by_handle_at 371
 __SYSCALL(__NR_open_by_handle_at, compat_sys_open_by_handle_at)
 #define __NR_clock_adjtime 372
-__SYSCALL(__NR_clock_adjtime, compat_sys_clock_adjtime)
+__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime32)
 #define __NR_syncfs 373
 __SYSCALL(__NR_syncfs, sys_syncfs)
 #define __NR_sendmmsg 374
@@ -821,6 +821,51 @@ __SYSCALL(__NR_statx, sys_statx)
 __SYSCALL(__NR_rseq, sys_rseq)
 #define __NR_io_pgetevents 399
 __SYSCALL(__NR_io_pgetevents, compat_sys_io_pgetevents)
+#define __NR_migrate_pages 400
+__SYSCALL(__NR_migrate_pages, compat_sys_migrate_pages)
+#define __NR_kexec_file_load 401
+__SYSCALL(__NR_kexec_file_load, sys_kexec_file_load)
+/* 402 is unused */
+#define __NR_clock_gettime64 403
+__SYSCALL(__NR_clock_gettime64, sys_clock_gettime)
+#define __NR_clock_settime64 404
+__SYSCALL(__NR_clock_settime64, sys_clock_settime)
+#define __NR_clock_adjtime64 405
+__SYSCALL(__NR_clock_adjtime64, sys_clock_adjtime)
+#define __NR_clock_getres_time64 406
+__SYSCALL(__NR_clock_getres_time64, sys_clock_getres)
+#define __NR_clock_nanosleep_time64 407
+__SYSCALL(__NR_clock_nanosleep_time64, sys_clock_nanosleep)
+#define __NR_timer_gettime64 408
+__SYSCALL(__NR_timer_gettime64, sys_timer_gettime)
+#define __NR_timer_settime64 409
+__SYSCALL(__NR_timer_settime64, sys_timer_settime)
+#define __NR_timerfd_gettime64 410
+__SYSCALL(__NR_timerfd_gettime64, sys_timerfd_gettime)
+#define __NR_timerfd_settime64 411
+__SYSCALL(__NR_timerfd_settime64, sys_timerfd_settime)
+#define __NR_utimensat_time64 412
+__SYSCALL(__NR_utimensat_time64, sys_utimensat)
+#define __NR_pselect6_time64 413
+__SYSCALL(__NR_pselect6_time64, compat_sys_pselect6_time64)
+#define __NR_ppoll_time64 414
+__SYSCALL(__NR_ppoll_time64, compat_sys_ppoll_time64)
+#define __NR_io_pgetevents_time64 416
+__SYSCALL(__NR_io_pgetevents_time64, sys_io_pgetevents)
+#define __NR_recvmmsg_time64 417
+__SYSCALL(__NR_recvmmsg_time64, compat_sys_recvmmsg_time64)
+#define __NR_mq_timedsend_time64 418
+__SYSCALL(__NR_mq_timedsend_time64, sys_mq_timedsend)
+#define __NR_mq_timedreceive_time64 419
+__SYSCALL(__NR_mq_timedreceive_time64, sys_mq_timedreceive)
+#define __NR_semtimedop_time64 420
+__SYSCALL(__NR_semtimedop_time64, sys_semtimedop)
+#define __NR_rt_sigtimedwait_time64 421
+__SYSCALL(__NR_rt_sigtimedwait_time64, compat_sys_rt_sigtimedwait_time64)
+#define __NR_futex_time64 422
+__SYSCALL(__NR_futex_time64, sys_futex)
+#define __NR_sched_rr_get_interval_time64 423
+__SYSCALL(__NR_sched_rr_get_interval_time64, sys_sched_rr_get_interval)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/arm64/include/uapi/asm/unistd.h b/arch/arm64/include/uapi/asm/unistd.h
index dae1584cf017..4703d218663a 100644
--- a/arch/arm64/include/uapi/asm/unistd.h
+++ b/arch/arm64/include/uapi/asm/unistd.h
@@ -17,5 +17,7 @@
 
 #define __ARCH_WANT_RENAMEAT
 #define __ARCH_WANT_NEW_STAT
+#define __ARCH_WANT_SET_GET_RLIMIT
+#define __ARCH_WANT_TIME32_SYSCALLS
 
 #include <asm-generic/unistd.h>
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 15d79a8e5e5e..eecf7927dab0 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -539,8 +539,7 @@ set_hcr:
 	/* GICv3 system register access */
 	mrs	x0, id_aa64pfr0_el1
 	ubfx	x0, x0, #24, #4
-	cmp	x0, #1
-	b.ne	3f
+	cbz	x0, 3f
 
 	mrs_s	x0, SYS_ICC_SRE_EL2
 	orr	x0, x0, #ICC_SRE_EL2_SRE	// Set ICC_SRE_EL2.SRE==1
diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
index 29cdc99688f3..9859e1178e6b 100644
--- a/arch/arm64/kernel/hibernate.c
+++ b/arch/arm64/kernel/hibernate.c
@@ -299,8 +299,10 @@ int swsusp_arch_suspend(void)
 		dcache_clean_range(__idmap_text_start, __idmap_text_end);
 
 		/* Clean kvm setup code to PoC? */
-		if (el2_reset_needed())
+		if (el2_reset_needed()) {
 			dcache_clean_range(__hyp_idmap_text_start, __hyp_idmap_text_end);
+			dcache_clean_range(__hyp_text_start, __hyp_text_end);
+		}
 
 		/* make the crash dump kernel image protected again */
 		crash_post_resume();
diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S
index e1261fbaa374..17f325ba831e 100644
--- a/arch/arm64/kernel/hyp-stub.S
+++ b/arch/arm64/kernel/hyp-stub.S
@@ -28,6 +28,8 @@
 #include <asm/virt.h>
 
 	.text
+	.pushsection	.hyp.text, "ax"
+
 	.align 11
 
 ENTRY(__hyp_stub_vectors)
diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c
index ba6b41790fcd..b09b6f75f759 100644
--- a/arch/arm64/kernel/kaslr.c
+++ b/arch/arm64/kernel/kaslr.c
@@ -88,6 +88,7 @@ u64 __init kaslr_early_init(u64 dt_phys)
 	 * we end up running with module randomization disabled.
 	 */
 	module_alloc_base = (u64)_etext - MODULES_VSIZE;
+	__flush_dcache_area(&module_alloc_base, sizeof(module_alloc_base));
 
 	/*
 	 * Try to map the FDT early. If this fails, we simply bail,
diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
index f2c211a6229b..58871333737a 100644
--- a/arch/arm64/kernel/machine_kexec_file.c
+++ b/arch/arm64/kernel/machine_kexec_file.c
@@ -120,10 +120,12 @@ static int create_dtb(struct kimage *image,
 {
 	void *buf;
 	size_t buf_size;
+	size_t cmdline_len;
 	int ret;
 
+	cmdline_len = cmdline ? strlen(cmdline) : 0;
 	buf_size = fdt_totalsize(initial_boot_params)
-			+ strlen(cmdline) + DTB_EXTRA_SPACE;
+			+ cmdline_len + DTB_EXTRA_SPACE;
 
 	for (;;) {
 		buf = vmalloc(buf_size);
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
index 2a5b338b2542..f17afb99890c 100644
--- a/arch/arm64/kernel/probes/kprobes.c
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -478,13 +478,13 @@ bool arch_within_kprobe_blacklist(unsigned long addr)
 	    addr < (unsigned long)__entry_text_end) ||
 	    (addr >= (unsigned long)__idmap_text_start &&
 	    addr < (unsigned long)__idmap_text_end) ||
+	    (addr >= (unsigned long)__hyp_text_start &&
+	    addr < (unsigned long)__hyp_text_end) ||
 	    !!search_exception_tables(addr))
 		return true;
 
 	if (!is_kernel_in_hyp_mode()) {
-		if ((addr >= (unsigned long)__hyp_text_start &&
-		    addr < (unsigned long)__hyp_text_end) ||
-		    (addr >= (unsigned long)__hyp_idmap_text_start &&
+		if ((addr >= (unsigned long)__hyp_idmap_text_start &&
 		    addr < (unsigned long)__hyp_idmap_text_end))
 			return true;
 	}
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 9dce33b0e260..ddaea0fd2fa4 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -1702,19 +1702,20 @@ void syscall_trace_exit(struct pt_regs *regs)
 }
 
 /*
- * SPSR_ELx bits which are always architecturally RES0 per ARM DDI 0487C.a
- * We also take into account DIT (bit 24), which is not yet documented, and
- * treat PAN and UAO as RES0 bits, as they are meaningless at EL0, and may be
- * allocated an EL0 meaning in future.
+ * SPSR_ELx bits which are always architecturally RES0 per ARM DDI 0487D.a.
+ * We permit userspace to set SSBS (AArch64 bit 12, AArch32 bit 23) which is
+ * not described in ARM DDI 0487D.a.
+ * We treat PAN and UAO as RES0 bits, as they are meaningless at EL0, and may
+ * be allocated an EL0 meaning in future.
  * Userspace cannot use these until they have an architectural meaning.
  * Note that this follows the SPSR_ELx format, not the AArch32 PSR format.
  * We also reserve IL for the kernel; SS is handled dynamically.
  */
 #define SPSR_EL1_AARCH64_RES0_BITS \
-	(GENMASK_ULL(63,32) | GENMASK_ULL(27, 25) | GENMASK_ULL(23, 22) | \
-	 GENMASK_ULL(20, 10) | GENMASK_ULL(5, 5))
+	(GENMASK_ULL(63, 32) | GENMASK_ULL(27, 25) | GENMASK_ULL(23, 22) | \
+	 GENMASK_ULL(20, 13) | GENMASK_ULL(11, 10) | GENMASK_ULL(5, 5))
 #define SPSR_EL1_AARCH32_RES0_BITS \
-	(GENMASK_ULL(63,32) | GENMASK_ULL(23, 22) | GENMASK_ULL(20,20))
+	(GENMASK_ULL(63, 32) | GENMASK_ULL(22, 22) | GENMASK_ULL(20, 20))
 
 static int valid_compat_regs(struct user_pt_regs *regs)
 {
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 4b0e1231625c..009849328289 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -313,7 +313,6 @@ void __init setup_arch(char **cmdline_p)
 	arm64_memblock_init();
 
 	paging_init();
-	efi_apply_persistent_mem_reservations();
 
 	acpi_table_upgrade();
 
@@ -340,6 +339,9 @@ void __init setup_arch(char **cmdline_p)
 	smp_init_cpus();
 	smp_build_mpidr_hash();
 
+	/* Init percpu seeds for random tags after cpus are set up. */
+	kasan_init_tags();
+
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
 	/*
 	 * Make sure init_thread_info.ttbr0 always generates translation
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index b0b1478094b4..421ebf6f7086 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -23,6 +23,7 @@
 #include <kvm/arm_psci.h>
 
 #include <asm/cpufeature.h>
+#include <asm/kprobes.h>
 #include <asm/kvm_asm.h>
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_host.h>
@@ -107,6 +108,7 @@ static void activate_traps_vhe(struct kvm_vcpu *vcpu)
 
 	write_sysreg(kvm_get_hyp_vector(), vbar_el1);
 }
+NOKPROBE_SYMBOL(activate_traps_vhe);
 
 static void __hyp_text __activate_traps_nvhe(struct kvm_vcpu *vcpu)
 {
@@ -154,6 +156,7 @@ static void deactivate_traps_vhe(void)
 	write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1);
 	write_sysreg(vectors, vbar_el1);
 }
+NOKPROBE_SYMBOL(deactivate_traps_vhe);
 
 static void __hyp_text __deactivate_traps_nvhe(void)
 {
@@ -513,6 +516,7 @@ int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
 
 	return exit_code;
 }
+NOKPROBE_SYMBOL(kvm_vcpu_run_vhe);
 
 /* Switch to the guest for legacy non-VHE systems */
 int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu)
@@ -620,6 +624,7 @@ static void __hyp_call_panic_vhe(u64 spsr, u64 elr, u64 par,
 	      read_sysreg_el2(esr),   read_sysreg_el2(far),
 	      read_sysreg(hpfar_el2), par, vcpu);
 }
+NOKPROBE_SYMBOL(__hyp_call_panic_vhe);
 
 void __hyp_text __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt)
 {
diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
index 68d6f7c3b237..b426e2cf973c 100644
--- a/arch/arm64/kvm/hyp/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/sysreg-sr.c
@@ -18,6 +18,7 @@
 #include <linux/compiler.h>
 #include <linux/kvm_host.h>
 
+#include <asm/kprobes.h>
 #include <asm/kvm_asm.h>
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_hyp.h>
@@ -98,12 +99,14 @@ void sysreg_save_host_state_vhe(struct kvm_cpu_context *ctxt)
 {
 	__sysreg_save_common_state(ctxt);
 }
+NOKPROBE_SYMBOL(sysreg_save_host_state_vhe);
 
 void sysreg_save_guest_state_vhe(struct kvm_cpu_context *ctxt)
 {
 	__sysreg_save_common_state(ctxt);
 	__sysreg_save_el2_return_state(ctxt);
 }
+NOKPROBE_SYMBOL(sysreg_save_guest_state_vhe);
 
 static void __hyp_text __sysreg_restore_common_state(struct kvm_cpu_context *ctxt)
 {
@@ -188,12 +191,14 @@ void sysreg_restore_host_state_vhe(struct kvm_cpu_context *ctxt)
 {
 	__sysreg_restore_common_state(ctxt);
 }
+NOKPROBE_SYMBOL(sysreg_restore_host_state_vhe);
 
 void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt)
 {
 	__sysreg_restore_common_state(ctxt);
 	__sysreg_restore_el2_return_state(ctxt);
 }
+NOKPROBE_SYMBOL(sysreg_restore_guest_state_vhe);
 
 void __hyp_text __sysreg32_save_state(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index b72a3dd56204..f16a5f8ff2b4 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -32,6 +32,7 @@
 #include <asm/kvm_arm.h>
 #include <asm/kvm_asm.h>
 #include <asm/kvm_coproc.h>
+#include <asm/kvm_emulate.h>
 #include <asm/kvm_mmu.h>
 
 /* Maximum phys_shift supported for any VM on this host */
@@ -105,16 +106,33 @@ int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext)
  * This function finds the right table above and sets the registers on
  * the virtual CPU struct to their architecturally defined reset
  * values.
+ *
+ * Note: This function can be called from two paths: The KVM_ARM_VCPU_INIT
+ * ioctl or as part of handling a request issued by another VCPU in the PSCI
+ * handling code.  In the first case, the VCPU will not be loaded, and in the
+ * second case the VCPU will be loaded.  Because this function operates purely
+ * on the memory-backed valus of system registers, we want to do a full put if
+ * we were loaded (handling a request) and load the values back at the end of
+ * the function.  Otherwise we leave the state alone.  In both cases, we
+ * disable preemption around the vcpu reset as we would otherwise race with
+ * preempt notifiers which also call put/load.
  */
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 {
 	const struct kvm_regs *cpu_reset;
+	int ret = -EINVAL;
+	bool loaded;
+
+	preempt_disable();
+	loaded = (vcpu->cpu != -1);
+	if (loaded)
+		kvm_arch_vcpu_put(vcpu);
 
 	switch (vcpu->arch.target) {
 	default:
 		if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) {
 			if (!cpu_has_32bit_el1())
-				return -EINVAL;
+				goto out;
 			cpu_reset = &default_regs_reset32;
 		} else {
 			cpu_reset = &default_regs_reset;
@@ -129,6 +147,29 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 	/* Reset system registers */
 	kvm_reset_sys_regs(vcpu);
 
+	/*
+	 * Additional reset state handling that PSCI may have imposed on us.
+	 * Must be done after all the sys_reg reset.
+	 */
+	if (vcpu->arch.reset_state.reset) {
+		unsigned long target_pc = vcpu->arch.reset_state.pc;
+
+		/* Gracefully handle Thumb2 entry point */
+		if (vcpu_mode_is_32bit(vcpu) && (target_pc & 1)) {
+			target_pc &= ~1UL;
+			vcpu_set_thumb(vcpu);
+		}
+
+		/* Propagate caller endianness */
+		if (vcpu->arch.reset_state.be)
+			kvm_vcpu_set_be(vcpu);
+
+		*vcpu_pc(vcpu) = target_pc;
+		vcpu_set_reg(vcpu, 0, vcpu->arch.reset_state.r0);
+
+		vcpu->arch.reset_state.reset = false;
+	}
+
 	/* Reset PMU */
 	kvm_pmu_vcpu_reset(vcpu);
 
@@ -137,7 +178,12 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 		vcpu->arch.workaround_flags |= VCPU_WORKAROUND_2_FLAG;
 
 	/* Reset timer */
-	return kvm_timer_vcpu_reset(vcpu);
+	ret = kvm_timer_vcpu_reset(vcpu);
+out:
+	if (loaded)
+		kvm_arch_vcpu_load(vcpu, smp_processor_id());
+	preempt_enable();
+	return ret;
 }
 
 void kvm_set_ipa_limit(void)
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index e3e37228ae4e..c936aa40c3f4 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -314,12 +314,29 @@ static bool trap_raz_wi(struct kvm_vcpu *vcpu,
 		return read_zero(vcpu, p);
 }
 
-static bool trap_undef(struct kvm_vcpu *vcpu,
-		       struct sys_reg_params *p,
-		       const struct sys_reg_desc *r)
+/*
+ * ARMv8.1 mandates at least a trivial LORegion implementation, where all the
+ * RW registers are RES0 (which we can implement as RAZ/WI). On an ARMv8.0
+ * system, these registers should UNDEF. LORID_EL1 being a RO register, we
+ * treat it separately.
+ */
+static bool trap_loregion(struct kvm_vcpu *vcpu,
+			  struct sys_reg_params *p,
+			  const struct sys_reg_desc *r)
 {
-	kvm_inject_undefined(vcpu);
-	return false;
+	u64 val = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
+	u32 sr = sys_reg((u32)r->Op0, (u32)r->Op1,
+			 (u32)r->CRn, (u32)r->CRm, (u32)r->Op2);
+
+	if (!(val & (0xfUL << ID_AA64MMFR1_LOR_SHIFT))) {
+		kvm_inject_undefined(vcpu);
+		return false;
+	}
+
+	if (p->is_write && sr == SYS_LORID_EL1)
+		return write_to_read_only(vcpu, p, r);
+
+	return trap_raz_wi(vcpu, p, r);
 }
 
 static bool trap_oslsr_el1(struct kvm_vcpu *vcpu,
@@ -1048,11 +1065,6 @@ static u64 read_id_reg(struct sys_reg_desc const *r, bool raz)
 		if (val & ptrauth_mask)
 			kvm_debug("ptrauth unsupported for guests, suppressing\n");
 		val &= ~ptrauth_mask;
-	} else if (id == SYS_ID_AA64MMFR1_EL1) {
-		if (val & (0xfUL << ID_AA64MMFR1_LOR_SHIFT))
-			kvm_debug("LORegions unsupported for guests, suppressing\n");
-
-		val &= ~(0xfUL << ID_AA64MMFR1_LOR_SHIFT);
 	}
 
 	return val;
@@ -1338,11 +1350,11 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 	{ SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 },
 	{ SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 },
 
-	{ SYS_DESC(SYS_LORSA_EL1), trap_undef },
-	{ SYS_DESC(SYS_LOREA_EL1), trap_undef },
-	{ SYS_DESC(SYS_LORN_EL1), trap_undef },
-	{ SYS_DESC(SYS_LORC_EL1), trap_undef },
-	{ SYS_DESC(SYS_LORID_EL1), trap_undef },
+	{ SYS_DESC(SYS_LORSA_EL1), trap_loregion },
+	{ SYS_DESC(SYS_LOREA_EL1), trap_loregion },
+	{ SYS_DESC(SYS_LORN_EL1), trap_loregion },
+	{ SYS_DESC(SYS_LORC_EL1), trap_loregion },
+	{ SYS_DESC(SYS_LORID_EL1), trap_loregion },
 
 	{ SYS_DESC(SYS_VBAR_EL1), NULL, reset_val, VBAR_EL1, 0 },
 	{ SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 },
@@ -2596,7 +2608,9 @@ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu)
 	table = get_target_table(vcpu->arch.target, true, &num);
 	reset_sys_reg_descs(vcpu, table, num);
 
-	for (num = 1; num < NR_SYS_REGS; num++)
-		if (__vcpu_sys_reg(vcpu, num) == 0x4242424242424242)
-			panic("Didn't reset __vcpu_sys_reg(%zi)", num);
+	for (num = 1; num < NR_SYS_REGS; num++) {
+		if (WARN(__vcpu_sys_reg(vcpu, num) == 0x4242424242424242,
+			 "Didn't reset __vcpu_sys_reg(%zi)\n", num))
+			break;
+	}
 }
diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c
index fcb1f2a6d7c6..99bb8facb5cb 100644
--- a/arch/arm64/mm/dump.c
+++ b/arch/arm64/mm/dump.c
@@ -286,74 +286,73 @@ static void note_page(struct pg_state *st, unsigned long addr, unsigned level,
 
 }
 
-static void walk_pte(struct pg_state *st, pmd_t *pmdp, unsigned long start)
+static void walk_pte(struct pg_state *st, pmd_t *pmdp, unsigned long start,
+		     unsigned long end)
 {
-	pte_t *ptep = pte_offset_kernel(pmdp, 0UL);
-	unsigned long addr;
-	unsigned i;
+	unsigned long addr = start;
+	pte_t *ptep = pte_offset_kernel(pmdp, start);
 
-	for (i = 0; i < PTRS_PER_PTE; i++, ptep++) {
-		addr = start + i * PAGE_SIZE;
+	do {
 		note_page(st, addr, 4, READ_ONCE(pte_val(*ptep)));
-	}
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
 }
 
-static void walk_pmd(struct pg_state *st, pud_t *pudp, unsigned long start)
+static void walk_pmd(struct pg_state *st, pud_t *pudp, unsigned long start,
+		     unsigned long end)
 {
-	pmd_t *pmdp = pmd_offset(pudp, 0UL);
-	unsigned long addr;
-	unsigned i;
+	unsigned long next, addr = start;
+	pmd_t *pmdp = pmd_offset(pudp, start);
 
-	for (i = 0; i < PTRS_PER_PMD; i++, pmdp++) {
+	do {
 		pmd_t pmd = READ_ONCE(*pmdp);
+		next = pmd_addr_end(addr, end);
 
-		addr = start + i * PMD_SIZE;
 		if (pmd_none(pmd) || pmd_sect(pmd)) {
 			note_page(st, addr, 3, pmd_val(pmd));
 		} else {
 			BUG_ON(pmd_bad(pmd));
-			walk_pte(st, pmdp, addr);
+			walk_pte(st, pmdp, addr, next);
 		}
-	}
+	} while (pmdp++, addr = next, addr != end);
 }
 
-static void walk_pud(struct pg_state *st, pgd_t *pgdp, unsigned long start)
+static void walk_pud(struct pg_state *st, pgd_t *pgdp, unsigned long start,
+		     unsigned long end)
 {
-	pud_t *pudp = pud_offset(pgdp, 0UL);
-	unsigned long addr;
-	unsigned i;
+	unsigned long next, addr = start;
+	pud_t *pudp = pud_offset(pgdp, start);
 
-	for (i = 0; i < PTRS_PER_PUD; i++, pudp++) {
+	do {
 		pud_t pud = READ_ONCE(*pudp);
+		next = pud_addr_end(addr, end);
 
-		addr = start + i * PUD_SIZE;
 		if (pud_none(pud) || pud_sect(pud)) {
 			note_page(st, addr, 2, pud_val(pud));
 		} else {
 			BUG_ON(pud_bad(pud));
-			walk_pmd(st, pudp, addr);
+			walk_pmd(st, pudp, addr, next);
 		}
-	}
+	} while (pudp++, addr = next, addr != end);
 }
 
 static void walk_pgd(struct pg_state *st, struct mm_struct *mm,
 		     unsigned long start)
 {
-	pgd_t *pgdp = pgd_offset(mm, 0UL);
-	unsigned i;
-	unsigned long addr;
+	unsigned long end = (start < TASK_SIZE_64) ? TASK_SIZE_64 : 0;
+	unsigned long next, addr = start;
+	pgd_t *pgdp = pgd_offset(mm, start);
 
-	for (i = 0; i < PTRS_PER_PGD; i++, pgdp++) {
+	do {
 		pgd_t pgd = READ_ONCE(*pgdp);
+		next = pgd_addr_end(addr, end);
 
-		addr = start + i * PGDIR_SIZE;
 		if (pgd_none(pgd)) {
 			note_page(st, addr, 1, pgd_val(pgd));
 		} else {
 			BUG_ON(pgd_bad(pgd));
-			walk_pud(st, pgdp, addr);
+			walk_pud(st, pgdp, addr, next);
 		}
-	}
+	} while (pgdp++, addr = next, addr != end);
 }
 
 void ptdump_walk_pgd(struct seq_file *m, struct ptdump_info *info)
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index 30695a868107..5c9073bace83 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -33,7 +33,11 @@ void sync_icache_aliases(void *kaddr, unsigned long len)
 		__clean_dcache_area_pou(kaddr, len);
 		__flush_icache_all();
 	} else {
-		flush_icache_range(addr, addr + len);
+		/*
+		 * Don't issue kick_all_cpus_sync() after I-cache invalidation
+		 * for user mappings.
+		 */
+		__flush_icache_range(addr, addr + len);
 	}
 }
 
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index 4b55b15707a3..f37a86d2a69d 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -252,8 +252,6 @@ void __init kasan_init(void)
 	memset(kasan_early_shadow_page, KASAN_SHADOW_INIT, PAGE_SIZE);
 	cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
 
-	kasan_init_tags();
-
 	/* At this point kasan is fully initialized. Enable error messages */
 	init_task.kasan_depth = 0;
 	pr_info("KernelAddressSanitizer initialized\n");
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 1542df00b23c..aaddc0217e73 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -362,7 +362,8 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
 	const s16 off = insn->off;
 	const s32 imm = insn->imm;
 	const int i = insn - ctx->prog->insnsi;
-	const bool is64 = BPF_CLASS(code) == BPF_ALU64;
+	const bool is64 = BPF_CLASS(code) == BPF_ALU64 ||
+			  BPF_CLASS(code) == BPF_JMP;
 	const bool isdw = BPF_SIZE(code) == BPF_DW;
 	u8 jmp_cond;
 	s32 jmp_offset;
@@ -559,7 +560,17 @@ emit_bswap_uxt:
 	case BPF_JMP | BPF_JSLT | BPF_X:
 	case BPF_JMP | BPF_JSGE | BPF_X:
 	case BPF_JMP | BPF_JSLE | BPF_X:
-		emit(A64_CMP(1, dst, src), ctx);
+	case BPF_JMP32 | BPF_JEQ | BPF_X:
+	case BPF_JMP32 | BPF_JGT | BPF_X:
+	case BPF_JMP32 | BPF_JLT | BPF_X:
+	case BPF_JMP32 | BPF_JGE | BPF_X:
+	case BPF_JMP32 | BPF_JLE | BPF_X:
+	case BPF_JMP32 | BPF_JNE | BPF_X:
+	case BPF_JMP32 | BPF_JSGT | BPF_X:
+	case BPF_JMP32 | BPF_JSLT | BPF_X:
+	case BPF_JMP32 | BPF_JSGE | BPF_X:
+	case BPF_JMP32 | BPF_JSLE | BPF_X:
+		emit(A64_CMP(is64, dst, src), ctx);
 emit_cond_jmp:
 		jmp_offset = bpf2a64_offset(i + off, i, ctx);
 		check_imm19(jmp_offset);
@@ -601,7 +612,8 @@ emit_cond_jmp:
 		emit(A64_B_(jmp_cond, jmp_offset), ctx);
 		break;
 	case BPF_JMP | BPF_JSET | BPF_X:
-		emit(A64_TST(1, dst, src), ctx);
+	case BPF_JMP32 | BPF_JSET | BPF_X:
+		emit(A64_TST(is64, dst, src), ctx);
 		goto emit_cond_jmp;
 	/* IF (dst COND imm) JUMP off */
 	case BPF_JMP | BPF_JEQ | BPF_K:
@@ -614,12 +626,23 @@ emit_cond_jmp:
 	case BPF_JMP | BPF_JSLT | BPF_K:
 	case BPF_JMP | BPF_JSGE | BPF_K:
 	case BPF_JMP | BPF_JSLE | BPF_K:
-		emit_a64_mov_i(1, tmp, imm, ctx);
-		emit(A64_CMP(1, dst, tmp), ctx);
+	case BPF_JMP32 | BPF_JEQ | BPF_K:
+	case BPF_JMP32 | BPF_JGT | BPF_K:
+	case BPF_JMP32 | BPF_JLT | BPF_K:
+	case BPF_JMP32 | BPF_JGE | BPF_K:
+	case BPF_JMP32 | BPF_JLE | BPF_K:
+	case BPF_JMP32 | BPF_JNE | BPF_K:
+	case BPF_JMP32 | BPF_JSGT | BPF_K:
+	case BPF_JMP32 | BPF_JSLT | BPF_K:
+	case BPF_JMP32 | BPF_JSGE | BPF_K:
+	case BPF_JMP32 | BPF_JSLE | BPF_K:
+		emit_a64_mov_i(is64, tmp, imm, ctx);
+		emit(A64_CMP(is64, dst, tmp), ctx);
 		goto emit_cond_jmp;
 	case BPF_JMP | BPF_JSET | BPF_K:
-		emit_a64_mov_i(1, tmp, imm, ctx);
-		emit(A64_TST(1, dst, tmp), ctx);
+	case BPF_JMP32 | BPF_JSET | BPF_K:
+		emit_a64_mov_i(is64, tmp, imm, ctx);
+		emit(A64_TST(is64, dst, tmp), ctx);
 		goto emit_cond_jmp;
 	/* function call */
 	case BPF_JMP | BPF_CALL: