Diffstat (limited to 'arch/arm64')
-rw-r--r--  arch/arm64/Kconfig | 5
-rw-r--r--  arch/arm64/boot/dts/allwinner/sun55i-a523.dtsi | 6
-rw-r--r--  arch/arm64/boot/dts/allwinner/sun55i-a527-cubie-a5e.dts | 4
-rw-r--r--  arch/arm64/boot/dts/allwinner/sun55i-t527-avaota-a1.dts | 4
-rw-r--r--  arch/arm64/boot/dts/apple/spi1-nvram.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/apple/t8103-j293.dts | 2
-rw-r--r--  arch/arm64/boot/dts/apple/t8103-jxxx.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/apple/t8103.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/apple/t8112-j493.dts | 2
-rw-r--r--  arch/arm64/boot/dts/apple/t8112.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi | 3
-rw-r--r--  arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi | 1
-rw-r--r--  arch/arm64/boot/dts/freescale/imx8mp-venice-gw71xx.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts | 2
-rw-r--r--  arch/arm64/boot/dts/freescale/imx95-15x15-evk.dts | 20
-rw-r--r--  arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts | 12
-rw-r--r--  arch/arm64/boot/dts/freescale/imx95.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts | 2
-rw-r--r--  arch/arm64/boot/dts/qcom/x1e80100-pmics.dtsi | 1
-rw-r--r--  arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi | 23
-rw-r--r--  arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts | 1
-rw-r--r--  arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts | 3
-rw-r--r--  arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts | 28
-rw-r--r--  arch/arm64/boot/dts/rockchip/rk3576.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/rockchip/rk3588-base-pinctrl.dtsi | 20
-rw-r--r--  arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi | 1
-rw-r--r--  arch/arm64/boot/dts/rockchip/rk3588-extra-pinctrl.dtsi | 5
-rw-r--r--  arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts | 1
-rw-r--r--  arch/arm64/boot/dts/rockchip/rockchip-pinconf.dtsi | 35
-rw-r--r--  arch/arm64/configs/defconfig | 5
-rw-r--r--  arch/arm64/crypto/Kconfig | 30
-rw-r--r--  arch/arm64/crypto/Makefile | 17
-rw-r--r--  arch/arm64/crypto/sha1-ce-core.S | 150
-rw-r--r--  arch/arm64/crypto/sha1-ce-glue.c | 118
-rw-r--r--  arch/arm64/crypto/sha512-ce-core.S | 206
-rw-r--r--  arch/arm64/crypto/sha512-ce-glue.c | 96
-rw-r--r--  arch/arm64/crypto/sha512-glue.c | 83
-rw-r--r--  arch/arm64/include/asm/acpi.h | 2
-rw-r--r--  arch/arm64/include/asm/assembler.h | 5
-rw-r--r--  arch/arm64/include/asm/el2_setup.h | 19
-rw-r--r--  arch/arm64/include/asm/kvm_emulate.h | 62
-rw-r--r--  arch/arm64/include/asm/kvm_host.h | 7
-rw-r--r--  arch/arm64/kernel/Makefile | 3
-rw-r--r--  arch/arm64/kernel/cpufeature.c | 57
-rw-r--r--  arch/arm64/kernel/efi.c | 11
-rw-r--r--  arch/arm64/kernel/entry.S | 8
-rw-r--r--  arch/arm64/kernel/pi/Makefile | 2
-rw-r--r--  arch/arm64/kernel/process.c | 9
-rw-r--r--  arch/arm64/kernel/ptrace.c | 54
-rw-r--r--  arch/arm64/kernel/smp.c | 2
-rw-r--r--  arch/arm64/kernel/vdso/Makefile | 3
-rw-r--r--  arch/arm64/kvm/arm.c | 19
-rw-r--r--  arch/arm64/kvm/fpsimd.c | 26
-rw-r--r--  arch/arm64/kvm/hyp/include/hyp/switch.h | 147
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/Makefile | 2
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/hyp-main.c | 5
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/mem_protect.c | 20
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/switch.c | 59
-rw-r--r--  arch/arm64/kvm/hyp/vhe/switch.c | 107
-rw-r--r--  arch/arm64/kvm/nested.c | 26
-rw-r--r--  arch/arm64/kvm/sys_regs.c | 2
-rw-r--r--  arch/arm64/kvm/vgic/vgic-v3-nested.c | 85
-rw-r--r--  arch/arm64/lib/.gitignore | 4
-rw-r--r--  arch/arm64/lib/Makefile | 9
-rw-r--r--  arch/arm64/lib/crc-t10dif-core.S | 469
-rw-r--r--  arch/arm64/lib/crc-t10dif.c | 73
-rw-r--r--  arch/arm64/lib/crc32-core.S | 362
-rw-r--r--  arch/arm64/lib/crc32.c | 99
-rw-r--r--  arch/arm64/lib/crypto/.gitignore | 3
-rw-r--r--  arch/arm64/lib/crypto/Kconfig | 20
-rw-r--r--  arch/arm64/lib/crypto/Makefile | 24
-rw-r--r--  arch/arm64/lib/crypto/chacha-neon-core.S | 805
-rw-r--r--  arch/arm64/lib/crypto/chacha-neon-glue.c | 119
-rw-r--r--  arch/arm64/lib/crypto/poly1305-armv8.pl | 917
-rw-r--r--  arch/arm64/lib/crypto/poly1305-glue.c | 73
-rw-r--r--  arch/arm64/lib/crypto/sha2-armv8.pl | 786
-rw-r--r--  arch/arm64/lib/crypto/sha256-ce.S | 136
-rw-r--r--  arch/arm64/lib/crypto/sha256.c | 75
-rw-r--r--  arch/arm64/mm/fault.c | 30
-rw-r--r--  arch/arm64/mm/mmu.c | 3
-rw-r--r--  arch/arm64/mm/proc.S | 1
-rw-r--r--  arch/arm64/tools/syscall_32.tbl | 2
84 files changed, 525 insertions, 5131 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 55fc331af337..7c456aa838b9 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -21,8 +21,6 @@ config ARM64
select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
select ARCH_HAS_CACHE_LINE_SIZE
select ARCH_HAS_CC_PLATFORM
- select ARCH_HAS_CRC32
- select ARCH_HAS_CRC_T10DIF if KERNEL_MODE_NEON
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE
@@ -187,12 +185,12 @@ config ARM64
select HAVE_ARCH_KCSAN if EXPERT
select HAVE_ARCH_KFENCE
select HAVE_ARCH_KGDB
+ select HAVE_ARCH_KSTACK_ERASE
select HAVE_ARCH_MMAP_RND_BITS
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
select HAVE_ARCH_PREL32_RELOCATIONS
select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
select HAVE_ARCH_SECCOMP_FILTER
- select HAVE_ARCH_STACKLEAK
select HAVE_ARCH_THREAD_STRUCT_WHITELIST
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
@@ -256,6 +254,7 @@ config ARM64
select HOTPLUG_SMT if HOTPLUG_CPU
select IRQ_DOMAIN
select IRQ_FORCED_THREADING
+ select JUMP_LABEL
select KASAN_VMALLOC if KASAN
select LOCK_MM_AND_FIND_VMA
select MODULES_USE_ELF_RELA
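
JUMP_LABEL is now selected unconditionally here, and the arch/arm64/kernel/Makefile hunk further down moves jump_label.o into obj-y to match, so static keys always use the runtime-patched branch implementation rather than the conditional-load fallback. A minimal sketch of the generic static-key API this guarantees is available — demo_key and do_rare_work() are hypothetical names for illustration, not code from this patch:

    #include <linux/jump_label.h>

    static DEFINE_STATIC_KEY_FALSE(demo_key);       /* hypothetical example key */

    static void do_rare_work(void) { }              /* hypothetical slow path */

    void demo_fast_path(void)
    {
            /* Compiles to a NOP that is patched into a branch once the key
             * is enabled, e.g. via static_branch_enable(&demo_key). */
            if (static_branch_unlikely(&demo_key))
                    do_rare_work();
    }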
diff --git a/arch/arm64/boot/dts/allwinner/sun55i-a523.dtsi b/arch/arm64/boot/dts/allwinner/sun55i-a523.dtsi
index 8b7cbc2e78f5..51cd148f4227 100644
--- a/arch/arm64/boot/dts/allwinner/sun55i-a523.dtsi
+++ b/arch/arm64/boot/dts/allwinner/sun55i-a523.dtsi
@@ -131,7 +131,7 @@
"PH5", "PH6", "PH7", "PH9", "PH10",
"PH14", "PH15", "PH16", "PH17", "PH18";
allwinner,pinmux = <5>;
- function = "emac0";
+ function = "gmac0";
drive-strength = <40>;
bias-disable;
};
@@ -540,8 +540,8 @@
status = "disabled";
};
- emac0: ethernet@4500000 {
- compatible = "allwinner,sun55i-a523-emac0",
+ gmac0: ethernet@4500000 {
+ compatible = "allwinner,sun55i-a523-gmac0",
"allwinner,sun50i-a64-emac";
reg = <0x04500000 0x10000>;
clocks = <&ccu CLK_BUS_EMAC0>;
diff --git a/arch/arm64/boot/dts/allwinner/sun55i-a527-cubie-a5e.dts b/arch/arm64/boot/dts/allwinner/sun55i-a527-cubie-a5e.dts
index 0f58d92a6adc..8bc0f2c72a24 100644
--- a/arch/arm64/boot/dts/allwinner/sun55i-a527-cubie-a5e.dts
+++ b/arch/arm64/boot/dts/allwinner/sun55i-a527-cubie-a5e.dts
@@ -12,7 +12,7 @@
compatible = "radxa,cubie-a5e", "allwinner,sun55i-a527";
aliases {
- ethernet0 = &emac0;
+ ethernet0 = &gmac0;
serial0 = &uart0;
};
@@ -55,7 +55,7 @@
status = "okay";
};
-&emac0 {
+&gmac0 {
phy-mode = "rgmii-id";
phy-handle = <&ext_rgmii_phy>;
phy-supply = <&reg_cldo3>;
diff --git a/arch/arm64/boot/dts/allwinner/sun55i-t527-avaota-a1.dts b/arch/arm64/boot/dts/allwinner/sun55i-t527-avaota-a1.dts
index 08127f0cdd35..142177c1f737 100644
--- a/arch/arm64/boot/dts/allwinner/sun55i-t527-avaota-a1.dts
+++ b/arch/arm64/boot/dts/allwinner/sun55i-t527-avaota-a1.dts
@@ -12,7 +12,7 @@
compatible = "yuzukihd,avaota-a1", "allwinner,sun55i-t527";
aliases {
- ethernet0 = &emac0;
+ ethernet0 = &gmac0;
serial0 = &uart0;
};
@@ -65,7 +65,7 @@
status = "okay";
};
-&emac0 {
+&gmac0 {
phy-mode = "rgmii-id";
phy-handle = <&ext_rgmii_phy>;
phy-supply = <&reg_dcdc4>;
diff --git a/arch/arm64/boot/dts/apple/spi1-nvram.dtsi b/arch/arm64/boot/dts/apple/spi1-nvram.dtsi
index 3df2fd3993b5..9740fbf200f0 100644
--- a/arch/arm64/boot/dts/apple/spi1-nvram.dtsi
+++ b/arch/arm64/boot/dts/apple/spi1-nvram.dtsi
@@ -20,8 +20,6 @@
compatible = "jedec,spi-nor";
reg = <0x0>;
spi-max-frequency = <25000000>;
- #address-cells = <1>;
- #size-cells = <1>;
partitions {
compatible = "fixed-partitions";
diff --git a/arch/arm64/boot/dts/apple/t8103-j293.dts b/arch/arm64/boot/dts/apple/t8103-j293.dts
index e2d9439397f7..5b3c42e9f0e6 100644
--- a/arch/arm64/boot/dts/apple/t8103-j293.dts
+++ b/arch/arm64/boot/dts/apple/t8103-j293.dts
@@ -100,6 +100,8 @@
&displaydfr_mipi {
status = "okay";
+ #address-cells = <1>;
+ #size-cells = <0>;
dfr_panel: panel@0 {
compatible = "apple,j293-summit", "apple,summit";
diff --git a/arch/arm64/boot/dts/apple/t8103-jxxx.dtsi b/arch/arm64/boot/dts/apple/t8103-jxxx.dtsi
index 8e82231acab5..0c8206156bfe 100644
--- a/arch/arm64/boot/dts/apple/t8103-jxxx.dtsi
+++ b/arch/arm64/boot/dts/apple/t8103-jxxx.dtsi
@@ -71,7 +71,7 @@
*/
&port00 {
bus-range = <1 1>;
- wifi0: network@0,0 {
+ wifi0: wifi@0,0 {
compatible = "pci14e4,4425";
reg = <0x10000 0x0 0x0 0x0 0x0>;
/* To be filled by the loader */
diff --git a/arch/arm64/boot/dts/apple/t8103.dtsi b/arch/arm64/boot/dts/apple/t8103.dtsi
index 20faf0c0d809..3a204845b85b 100644
--- a/arch/arm64/boot/dts/apple/t8103.dtsi
+++ b/arch/arm64/boot/dts/apple/t8103.dtsi
@@ -405,8 +405,6 @@
compatible = "apple,t8103-display-pipe-mipi", "apple,h7-display-pipe-mipi";
reg = <0x2 0x28600000 0x0 0x100000>;
power-domains = <&ps_mipi_dsi>;
- #address-cells = <1>;
- #size-cells = <0>;
status = "disabled";
ports {
diff --git a/arch/arm64/boot/dts/apple/t8112-j493.dts b/arch/arm64/boot/dts/apple/t8112-j493.dts
index be86d34c6696..fb8ad7d4c65a 100644
--- a/arch/arm64/boot/dts/apple/t8112-j493.dts
+++ b/arch/arm64/boot/dts/apple/t8112-j493.dts
@@ -63,6 +63,8 @@
&displaydfr_mipi {
status = "okay";
+ #address-cells = <1>;
+ #size-cells = <0>;
dfr_panel: panel@0 {
compatible = "apple,j493-summit", "apple,summit";
diff --git a/arch/arm64/boot/dts/apple/t8112.dtsi b/arch/arm64/boot/dts/apple/t8112.dtsi
index e95711d8337f..f68354194355 100644
--- a/arch/arm64/boot/dts/apple/t8112.dtsi
+++ b/arch/arm64/boot/dts/apple/t8112.dtsi
@@ -420,8 +420,6 @@
compatible = "apple,t8112-display-pipe-mipi", "apple,h7-display-pipe-mipi";
reg = <0x2 0x28600000 0x0 0x100000>;
power-domains = <&ps_mipi_dsi>;
- #address-cells = <1>;
- #size-cells = <0>;
status = "disabled";
ports {
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
index 0baf256b4400..983b2f0e8797 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
@@ -687,11 +687,12 @@
};
wdog0: watchdog@2ad0000 {
- compatible = "fsl,imx21-wdt";
+ compatible = "fsl,ls1046a-wdt", "fsl,imx21-wdt";
reg = <0x0 0x2ad0000 0x0 0x10000>;
interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clockgen QORIQ_CLK_PLATFORM_PLL
QORIQ_CLK_PLL_DIV(2)>;
+ big-endian;
};
edma0: dma-controller@2c00000 {
diff --git a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi
index d29710772569..1594ce9182a5 100644
--- a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi
@@ -464,6 +464,7 @@
};
reg_nvcc_sd: LDO5 {
+ regulator-always-on;
regulator-max-microvolt = <3300000>;
regulator-min-microvolt = <1800000>;
regulator-name = "On-module +V3.3_1.8_SD (LDO5)";
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw71xx.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw71xx.dtsi
index 2f740d74707b..4bf818873fe3 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw71xx.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw71xx.dtsi
@@ -70,7 +70,7 @@
tpm@1 {
compatible = "atmel,attpm20p", "tcg,tpm_tis-spi";
reg = <0x1>;
- spi-max-frequency = <36000000>;
+ spi-max-frequency = <25000000>;
};
};
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi
index 5ab3ffe9931d..cf747ec6fa16 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi
@@ -110,7 +110,7 @@
tpm@1 {
compatible = "atmel,attpm20p", "tcg,tpm_tis-spi";
reg = <0x1>;
- spi-max-frequency = <36000000>;
+ spi-max-frequency = <25000000>;
};
};
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi
index e2b5e7ac3e46..5eb114d2360a 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi
@@ -122,7 +122,7 @@
tpm@1 {
compatible = "atmel,attpm20p", "tcg,tpm_tis-spi";
reg = <0x1>;
- spi-max-frequency = <36000000>;
+ spi-max-frequency = <25000000>;
};
};
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts
index 6daa2313f879..568d24265ddf 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts
+++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts
@@ -201,7 +201,7 @@
tpm@0 {
compatible = "atmel,attpm20p", "tcg,tpm_tis-spi";
reg = <0x0>;
- spi-max-frequency = <36000000>;
+ spi-max-frequency = <25000000>;
};
};
diff --git a/arch/arm64/boot/dts/freescale/imx95-15x15-evk.dts b/arch/arm64/boot/dts/freescale/imx95-15x15-evk.dts
index 6c47f4b47356..9f4d0899a94d 100644
--- a/arch/arm64/boot/dts/freescale/imx95-15x15-evk.dts
+++ b/arch/arm64/boot/dts/freescale/imx95-15x15-evk.dts
@@ -574,17 +574,17 @@
&scmi_iomuxc {
pinctrl_emdio: emdiogrp {
fsl,pins = <
- IMX95_PAD_ENET2_MDC__NETCMIX_TOP_NETC_MDC 0x57e
- IMX95_PAD_ENET2_MDIO__NETCMIX_TOP_NETC_MDIO 0x97e
+ IMX95_PAD_ENET2_MDC__NETCMIX_TOP_NETC_MDC 0x50e
+ IMX95_PAD_ENET2_MDIO__NETCMIX_TOP_NETC_MDIO 0x90e
>;
};
pinctrl_enetc0: enetc0grp {
fsl,pins = <
- IMX95_PAD_ENET1_TD3__NETCMIX_TOP_ETH0_RGMII_TD3 0x57e
- IMX95_PAD_ENET1_TD2__NETCMIX_TOP_ETH0_RGMII_TD2 0x57e
- IMX95_PAD_ENET1_TD1__NETCMIX_TOP_ETH0_RGMII_TD1 0x57e
- IMX95_PAD_ENET1_TD0__NETCMIX_TOP_ETH0_RGMII_TD0 0x57e
+ IMX95_PAD_ENET1_TD3__NETCMIX_TOP_ETH0_RGMII_TD3 0x50e
+ IMX95_PAD_ENET1_TD2__NETCMIX_TOP_ETH0_RGMII_TD2 0x50e
+ IMX95_PAD_ENET1_TD1__NETCMIX_TOP_ETH0_RGMII_TD1 0x50e
+ IMX95_PAD_ENET1_TD0__NETCMIX_TOP_ETH0_RGMII_TD0 0x50e
IMX95_PAD_ENET1_TX_CTL__NETCMIX_TOP_ETH0_RGMII_TX_CTL 0x57e
IMX95_PAD_ENET1_TXC__NETCMIX_TOP_ETH0_RGMII_TX_CLK 0x58e
IMX95_PAD_ENET1_RX_CTL__NETCMIX_TOP_ETH0_RGMII_RX_CTL 0x57e
@@ -598,10 +598,10 @@
pinctrl_enetc1: enetc1grp {
fsl,pins = <
- IMX95_PAD_ENET2_TD3__NETCMIX_TOP_ETH1_RGMII_TD3 0x57e
- IMX95_PAD_ENET2_TD2__NETCMIX_TOP_ETH1_RGMII_TD2 0x57e
- IMX95_PAD_ENET2_TD1__NETCMIX_TOP_ETH1_RGMII_TD1 0x57e
- IMX95_PAD_ENET2_TD0__NETCMIX_TOP_ETH1_RGMII_TD0 0x57e
+ IMX95_PAD_ENET2_TD3__NETCMIX_TOP_ETH1_RGMII_TD3 0x50e
+ IMX95_PAD_ENET2_TD2__NETCMIX_TOP_ETH1_RGMII_TD2 0x50e
+ IMX95_PAD_ENET2_TD1__NETCMIX_TOP_ETH1_RGMII_TD1 0x50e
+ IMX95_PAD_ENET2_TD0__NETCMIX_TOP_ETH1_RGMII_TD0 0x50e
IMX95_PAD_ENET2_TX_CTL__NETCMIX_TOP_ETH1_RGMII_TX_CTL 0x57e
IMX95_PAD_ENET2_TXC__NETCMIX_TOP_ETH1_RGMII_TX_CLK 0x58e
IMX95_PAD_ENET2_RX_CTL__NETCMIX_TOP_ETH1_RGMII_RX_CTL 0x57e
diff --git a/arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts b/arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts
index 6886ea766655..d7d845231312 100644
--- a/arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts
+++ b/arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts
@@ -566,17 +566,17 @@
&scmi_iomuxc {
pinctrl_emdio: emdiogrp{
fsl,pins = <
- IMX95_PAD_ENET1_MDC__NETCMIX_TOP_NETC_MDC 0x57e
- IMX95_PAD_ENET1_MDIO__NETCMIX_TOP_NETC_MDIO 0x97e
+ IMX95_PAD_ENET1_MDC__NETCMIX_TOP_NETC_MDC 0x50e
+ IMX95_PAD_ENET1_MDIO__NETCMIX_TOP_NETC_MDIO 0x90e
>;
};
pinctrl_enetc0: enetc0grp {
fsl,pins = <
- IMX95_PAD_ENET1_TD3__NETCMIX_TOP_ETH0_RGMII_TD3 0x57e
- IMX95_PAD_ENET1_TD2__NETCMIX_TOP_ETH0_RGMII_TD2 0x57e
- IMX95_PAD_ENET1_TD1__NETCMIX_TOP_ETH0_RGMII_TD1 0x57e
- IMX95_PAD_ENET1_TD0__NETCMIX_TOP_ETH0_RGMII_TD0 0x57e
+ IMX95_PAD_ENET1_TD3__NETCMIX_TOP_ETH0_RGMII_TD3 0x50e
+ IMX95_PAD_ENET1_TD2__NETCMIX_TOP_ETH0_RGMII_TD2 0x50e
+ IMX95_PAD_ENET1_TD1__NETCMIX_TOP_ETH0_RGMII_TD1 0x50e
+ IMX95_PAD_ENET1_TD0__NETCMIX_TOP_ETH0_RGMII_TD0 0x50e
IMX95_PAD_ENET1_TX_CTL__NETCMIX_TOP_ETH0_RGMII_TX_CTL 0x57e
IMX95_PAD_ENET1_TXC__NETCMIX_TOP_ETH0_RGMII_TX_CLK 0x58e
IMX95_PAD_ENET1_RX_CTL__NETCMIX_TOP_ETH0_RGMII_RX_CTL 0x57e
diff --git a/arch/arm64/boot/dts/freescale/imx95.dtsi b/arch/arm64/boot/dts/freescale/imx95.dtsi
index 632631a29112..5aecdd9b62ff 100644
--- a/arch/arm64/boot/dts/freescale/imx95.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx95.dtsi
@@ -1708,7 +1708,7 @@
<0x9 0 1 0>;
reg-names = "dbi","atu", "dbi2", "app", "dma", "addr_space";
num-lanes = <1>;
- interrupts = <GIC_SPI 317 IRQ_TYPE_LEVEL_HIGH>;
+ interrupts = <GIC_SPI 311 IRQ_TYPE_LEVEL_HIGH>;
interrupt-names = "dma";
clocks = <&scmi_clk IMX95_CLK_HSIO>,
<&scmi_clk IMX95_CLK_HSIOPLL>,
diff --git a/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts b/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts
index ae7a275fd223..cefecb7a23cf 100644
--- a/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts
+++ b/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts
@@ -1090,6 +1090,8 @@
};
&pmk8280_rtc {
+ qcom,uefi-rtc-info;
+
status = "okay";
};
diff --git a/arch/arm64/boot/dts/qcom/x1e80100-pmics.dtsi b/arch/arm64/boot/dts/qcom/x1e80100-pmics.dtsi
index c02fd4d15c96..e3888bc143a0 100644
--- a/arch/arm64/boot/dts/qcom/x1e80100-pmics.dtsi
+++ b/arch/arm64/boot/dts/qcom/x1e80100-pmics.dtsi
@@ -224,6 +224,7 @@
reg-names = "rtc", "alarm";
interrupts = <0x0 0x62 0x1 IRQ_TYPE_EDGE_RISING>;
qcom,no-alarm; /* alarm owned by ADSP */
+ qcom,uefi-rtc-info;
};
pmk8550_sdam_2: nvram@7100 {
diff --git a/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi b/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi
index ab232e5c7ad6..4203b335a263 100644
--- a/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi
+++ b/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi
@@ -379,6 +379,18 @@
<0 RK_PA7 RK_FUNC_GPIO &pcfg_pull_up>;
};
};
+
+ spi1 {
+ spi1_csn0_gpio_pin: spi1-csn0-gpio-pin {
+ rockchip,pins =
+ <3 RK_PB1 RK_FUNC_GPIO &pcfg_pull_up_4ma>;
+ };
+
+ spi1_csn1_gpio_pin: spi1-csn1-gpio-pin {
+ rockchip,pins =
+ <3 RK_PB2 RK_FUNC_GPIO &pcfg_pull_up_4ma>;
+ };
+ };
};
&pmu_io_domains {
@@ -396,6 +408,17 @@
vqmmc-supply = <&vccio_sd>;
};
+&spi1 {
+ /*
+ * Hardware CS has a very slow rise time of about 6us,
+ * causing transmission errors.
+ * With cs-gpios we have a rise time of about 20ns.
+ */
+ cs-gpios = <&gpio3 RK_PB1 GPIO_ACTIVE_LOW>, <&gpio3 RK_PB2 GPIO_ACTIVE_LOW>;
+ pinctrl-names = "default";
+ pinctrl-0 = <&spi1_clk &spi1_csn0_gpio_pin &spi1_csn1_gpio_pin &spi1_miso &spi1_mosi>;
+};
+
&tsadc {
status = "okay";
};
diff --git a/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts b/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts
index 3c127c5c2607..a9021c524afb 100644
--- a/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts
@@ -30,6 +30,7 @@
fan: gpio_fan {
compatible = "gpio-fan";
+ fan-supply = <&vcc12v_dcin>;
gpios = <&gpio0 RK_PD5 GPIO_ACTIVE_HIGH>;
gpio-fan,speed-map =
< 0 0>,
diff --git a/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts b/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts
index 3b31f0dd8f3b..539edc3c535f 100644
--- a/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts
@@ -29,7 +29,6 @@
function-enumerator = <1>;
gpios = <&gpio3 RK_PD6 GPIO_ACTIVE_HIGH>;
label = "LAN-1";
- linux,default-trigger = "netdev";
};
led-lan2 {
@@ -39,7 +38,6 @@
function-enumerator = <2>;
gpios = <&gpio3 RK_PD7 GPIO_ACTIVE_HIGH>;
label = "LAN-2";
- linux,default-trigger = "netdev";
};
power_led: led-sys {
@@ -56,7 +54,6 @@
function = LED_FUNCTION_WAN;
gpios = <&gpio2 RK_PC1 GPIO_ACTIVE_HIGH>;
label = "WAN";
- linux,default-trigger = "netdev";
};
};
};
diff --git a/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts b/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts
index b09e789c75c4..801b40fea4e8 100644
--- a/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts
@@ -211,10 +211,38 @@
status = "okay";
};
+&cpu_b0 {
+ cpu-supply = <&vdd_cpu_big_s0>;
+};
+
+&cpu_b1 {
+ cpu-supply = <&vdd_cpu_big_s0>;
+};
+
+&cpu_b2 {
+ cpu-supply = <&vdd_cpu_big_s0>;
+};
+
+&cpu_b3 {
+ cpu-supply = <&vdd_cpu_big_s0>;
+};
+
&cpu_l0 {
cpu-supply = <&vdd_cpu_lit_s0>;
};
+&cpu_l1 {
+ cpu-supply = <&vdd_cpu_lit_s0>;
+};
+
+&cpu_l2 {
+ cpu-supply = <&vdd_cpu_lit_s0>;
+};
+
+&cpu_l3 {
+ cpu-supply = <&vdd_cpu_lit_s0>;
+};
+
&gmac0 {
phy-mode = "rgmii-id";
clock_in_out = "output";
diff --git a/arch/arm64/boot/dts/rockchip/rk3576.dtsi b/arch/arm64/boot/dts/rockchip/rk3576.dtsi
index 1086482f0479..64812e3bcb61 100644
--- a/arch/arm64/boot/dts/rockchip/rk3576.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3576.dtsi
@@ -615,7 +615,7 @@
<0 0 0 2 &pcie1_intc 1>,
<0 0 0 3 &pcie1_intc 2>,
<0 0 0 4 &pcie1_intc 3>;
- linux,pci-domain = <0>;
+ linux,pci-domain = <1>;
max-link-speed = <2>;
num-ib-windows = <8>;
num-viewport = <8>;
diff --git a/arch/arm64/boot/dts/rockchip/rk3588-base-pinctrl.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-base-pinctrl.dtsi
index 7f874c77410c..6584d73660f6 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588-base-pinctrl.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3588-base-pinctrl.dtsi
@@ -578,14 +578,14 @@
hdmim0_tx0_scl: hdmim0-tx0-scl {
rockchip,pins =
/* hdmim0_tx0_scl */
- <4 RK_PB7 5 &pcfg_pull_none>;
+ <4 RK_PB7 5 &pcfg_pull_none_drv_level_5_smt>;
};
/omit-if-no-ref/
hdmim0_tx0_sda: hdmim0-tx0-sda {
rockchip,pins =
/* hdmim0_tx0_sda */
- <4 RK_PC0 5 &pcfg_pull_none>;
+ <4 RK_PC0 5 &pcfg_pull_none_drv_level_1_smt>;
};
/omit-if-no-ref/
@@ -640,14 +640,14 @@
hdmim1_tx0_scl: hdmim1-tx0-scl {
rockchip,pins =
/* hdmim1_tx0_scl */
- <0 RK_PD5 11 &pcfg_pull_none>;
+ <0 RK_PD5 11 &pcfg_pull_none_drv_level_5_smt>;
};
/omit-if-no-ref/
hdmim1_tx0_sda: hdmim1-tx0-sda {
rockchip,pins =
/* hdmim1_tx0_sda */
- <0 RK_PD4 11 &pcfg_pull_none>;
+ <0 RK_PD4 11 &pcfg_pull_none_drv_level_1_smt>;
};
/omit-if-no-ref/
@@ -668,14 +668,14 @@
hdmim1_tx1_scl: hdmim1-tx1-scl {
rockchip,pins =
/* hdmim1_tx1_scl */
- <3 RK_PC6 5 &pcfg_pull_none>;
+ <3 RK_PC6 5 &pcfg_pull_none_drv_level_5_smt>;
};
/omit-if-no-ref/
hdmim1_tx1_sda: hdmim1-tx1-sda {
rockchip,pins =
/* hdmim1_tx1_sda */
- <3 RK_PC5 5 &pcfg_pull_none>;
+ <3 RK_PC5 5 &pcfg_pull_none_drv_level_1_smt>;
};
/omit-if-no-ref/
hdmim2_rx_cec: hdmim2-rx-cec {
@@ -709,14 +709,14 @@
hdmim2_tx0_scl: hdmim2-tx0-scl {
rockchip,pins =
/* hdmim2_tx0_scl */
- <3 RK_PC7 5 &pcfg_pull_none>;
+ <3 RK_PC7 5 &pcfg_pull_none_drv_level_5_smt>;
};
/omit-if-no-ref/
hdmim2_tx0_sda: hdmim2-tx0-sda {
rockchip,pins =
/* hdmim2_tx0_sda */
- <3 RK_PD0 5 &pcfg_pull_none>;
+ <3 RK_PD0 5 &pcfg_pull_none_drv_level_1_smt>;
};
/omit-if-no-ref/
@@ -730,14 +730,14 @@
hdmim2_tx1_scl: hdmim2-tx1-scl {
rockchip,pins =
/* hdmim2_tx1_scl */
- <1 RK_PA4 5 &pcfg_pull_none>;
+ <1 RK_PA4 5 &pcfg_pull_none_drv_level_5_smt>;
};
/omit-if-no-ref/
hdmim2_tx1_sda: hdmim2-tx1-sda {
rockchip,pins =
/* hdmim2_tx1_sda */
- <1 RK_PA3 5 &pcfg_pull_none>;
+ <1 RK_PA3 5 &pcfg_pull_none_drv_level_1_smt>;
};
/omit-if-no-ref/
diff --git a/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi
index cc37f082adea..b07543315f87 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi
@@ -321,6 +321,7 @@
bus-width = <4>;
cap-mmc-highspeed;
cap-sd-highspeed;
+ cd-gpios = <&gpio0 RK_PA4 GPIO_ACTIVE_LOW>;
disable-wp;
max-frequency = <150000000>;
no-sdio;
diff --git a/arch/arm64/boot/dts/rockchip/rk3588-extra-pinctrl.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-extra-pinctrl.dtsi
index 244c66faa161..fb48ddc04bcb 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588-extra-pinctrl.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3588-extra-pinctrl.dtsi
@@ -160,14 +160,15 @@
hdmim0_tx1_scl: hdmim0-tx1-scl {
rockchip,pins =
/* hdmim0_tx1_scl */
- <2 RK_PB5 4 &pcfg_pull_none>;
+ <2 RK_PB5 4 &pcfg_pull_none_drv_level_3_smt>;
};
/omit-if-no-ref/
hdmim0_tx1_sda: hdmim0-tx1-sda {
rockchip,pins =
/* hdmim0_tx1_sda */
- <2 RK_PB4 4 &pcfg_pull_none>;
+ <2 RK_PB4 4 &pcfg_pull_none_drv_level_1_smt>;
+
};
};
diff --git a/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts b/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts
index 8b717c4017a4..b2947b36fada 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts
@@ -474,6 +474,7 @@
bus-width = <4>;
cap-mmc-highspeed;
cap-sd-highspeed;
+ cd-gpios = <&gpio0 RK_PA4 GPIO_ACTIVE_LOW>;
disable-wp;
max-frequency = <150000000>;
no-sdio;
diff --git a/arch/arm64/boot/dts/rockchip/rockchip-pinconf.dtsi b/arch/arm64/boot/dts/rockchip/rockchip-pinconf.dtsi
index 5c645437b507..b0475b7c655a 100644
--- a/arch/arm64/boot/dts/rockchip/rockchip-pinconf.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rockchip-pinconf.dtsi
@@ -333,6 +333,41 @@
};
/omit-if-no-ref/
+ pcfg_pull_none_drv_level_1_smt: pcfg-pull-none-drv-level-1-smt {
+ bias-disable;
+ drive-strength = <1>;
+ input-schmitt-enable;
+ };
+
+ /omit-if-no-ref/
+ pcfg_pull_none_drv_level_2_smt: pcfg-pull-none-drv-level-2-smt {
+ bias-disable;
+ drive-strength = <2>;
+ input-schmitt-enable;
+ };
+
+ /omit-if-no-ref/
+ pcfg_pull_none_drv_level_3_smt: pcfg-pull-none-drv-level-3-smt {
+ bias-disable;
+ drive-strength = <3>;
+ input-schmitt-enable;
+ };
+
+ /omit-if-no-ref/
+ pcfg_pull_none_drv_level_4_smt: pcfg-pull-none-drv-level-4-smt {
+ bias-disable;
+ drive-strength = <4>;
+ input-schmitt-enable;
+ };
+
+ /omit-if-no-ref/
+ pcfg_pull_none_drv_level_5_smt: pcfg-pull-none-drv-level-5-smt {
+ bias-disable;
+ drive-strength = <5>;
+ input-schmitt-enable;
+ };
+
+ /omit-if-no-ref/
pcfg_output_high: pcfg-output-high {
output-high;
};
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 897fc686e6a9..03117405b6ce 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -1444,6 +1444,7 @@ CONFIG_PLATFORM_MHU=y
CONFIG_BCM2835_MBOX=y
CONFIG_QCOM_APCS_IPC=y
CONFIG_MTK_ADSP_MBOX=m
+CONFIG_QCOM_CPUCP_MBOX=m
CONFIG_QCOM_IPCC=y
CONFIG_ROCKCHIP_IOMMU=y
CONFIG_TEGRA_IOMMU_SMMU=y
@@ -1573,6 +1574,7 @@ CONFIG_RESET_QCOM_AOSS=y
CONFIG_RESET_QCOM_PDC=m
CONFIG_RESET_RZG2L_USBPHY_CTRL=y
CONFIG_RESET_TI_SCI=y
+CONFIG_PHY_SNPS_EUSB2=m
CONFIG_PHY_XGENE=y
CONFIG_PHY_CAN_TRANSCEIVER=m
CONFIG_PHY_NXP_PTN3222=m
@@ -1597,7 +1599,6 @@ CONFIG_PHY_QCOM_EDP=m
CONFIG_PHY_QCOM_PCIE2=m
CONFIG_PHY_QCOM_QMP=m
CONFIG_PHY_QCOM_QUSB2=m
-CONFIG_PHY_QCOM_SNPS_EUSB2=m
CONFIG_PHY_QCOM_EUSB2_REPEATER=m
CONFIG_PHY_QCOM_M31_USB=m
CONFIG_PHY_QCOM_USB_HS=m
@@ -1743,8 +1744,6 @@ CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_ANSI_CPRNG=y
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_GHASH_ARM64_CE=y
-CONFIG_CRYPTO_SHA1_ARM64_CE=y
-CONFIG_CRYPTO_SHA512_ARM64_CE=m
CONFIG_CRYPTO_SHA3_ARM64=m
CONFIG_CRYPTO_SM3_ARM64_CE=m
CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index c44b0f202a1f..3bb5b513d5ae 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -25,36 +25,6 @@ config CRYPTO_NHPOLY1305_NEON
Architecture: arm64 using:
- NEON (Advanced SIMD) extensions
-config CRYPTO_SHA1_ARM64_CE
- tristate "Hash functions: SHA-1 (ARMv8 Crypto Extensions)"
- depends on KERNEL_MODE_NEON
- select CRYPTO_HASH
- select CRYPTO_SHA1
- help
- SHA-1 secure hash algorithm (FIPS 180)
-
- Architecture: arm64 using:
- - ARMv8 Crypto Extensions
-
-config CRYPTO_SHA512_ARM64
- tristate "Hash functions: SHA-384 and SHA-512"
- select CRYPTO_HASH
- help
- SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
-
- Architecture: arm64
-
-config CRYPTO_SHA512_ARM64_CE
- tristate "Hash functions: SHA-384 and SHA-512 (ARMv8 Crypto Extensions)"
- depends on KERNEL_MODE_NEON
- select CRYPTO_HASH
- select CRYPTO_SHA512_ARM64
- help
- SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
-
- Architecture: arm64 using:
- - ARMv8 Crypto Extensions
-
config CRYPTO_SHA3_ARM64
tristate "Hash functions: SHA-3 (ARMv8.2 Crypto Extensions)"
depends on KERNEL_MODE_NEON
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index c231c980c514..a8b2cdbe202c 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -5,12 +5,6 @@
# Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
#
-obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o
-sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o
-
-obj-$(CONFIG_CRYPTO_SHA512_ARM64_CE) += sha512-ce.o
-sha512-ce-y := sha512-ce-glue.o sha512-ce-core.o
-
obj-$(CONFIG_CRYPTO_SHA3_ARM64) += sha3-ce.o
sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o
@@ -53,9 +47,6 @@ aes-ce-blk-y := aes-glue-ce.o aes-ce.o
obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o
aes-neon-blk-y := aes-glue-neon.o aes-neon.o
-obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
-sha512-arm64-y := sha512-glue.o sha512-core.o
-
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
@@ -64,11 +55,3 @@ aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
obj-$(CONFIG_CRYPTO_AES_ARM64_BS) += aes-neon-bs.o
aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
-
-quiet_cmd_perlasm = PERLASM $@
- cmd_perlasm = $(PERL) $(<) void $(@)
-
-$(obj)/sha512-core.S: $(src)/../lib/crypto/sha2-armv8.pl
- $(call cmd,perlasm)
-
-clean-files += sha512-core.S
diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S
deleted file mode 100644
index 9b1f2d82a6fe..000000000000
--- a/arch/arm64/crypto/sha1-ce-core.S
+++ /dev/null
@@ -1,150 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .text
- .arch armv8-a+crypto
-
- k0 .req v0
- k1 .req v1
- k2 .req v2
- k3 .req v3
-
- t0 .req v4
- t1 .req v5
-
- dga .req q6
- dgav .req v6
- dgb .req s7
- dgbv .req v7
-
- dg0q .req q12
- dg0s .req s12
- dg0v .req v12
- dg1s .req s13
- dg1v .req v13
- dg2s .req s14
-
- .macro add_only, op, ev, rc, s0, dg1
- .ifc \ev, ev
- add t1.4s, v\s0\().4s, \rc\().4s
- sha1h dg2s, dg0s
- .ifnb \dg1
- sha1\op dg0q, \dg1, t0.4s
- .else
- sha1\op dg0q, dg1s, t0.4s
- .endif
- .else
- .ifnb \s0
- add t0.4s, v\s0\().4s, \rc\().4s
- .endif
- sha1h dg1s, dg0s
- sha1\op dg0q, dg2s, t1.4s
- .endif
- .endm
-
- .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1
- sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s
- add_only \op, \ev, \rc, \s1, \dg1
- sha1su1 v\s0\().4s, v\s3\().4s
- .endm
-
- .macro loadrc, k, val, tmp
- movz \tmp, :abs_g0_nc:\val
- movk \tmp, :abs_g1:\val
- dup \k, \tmp
- .endm
-
- /*
- * int __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
- * int blocks)
- */
-SYM_FUNC_START(__sha1_ce_transform)
- /* load round constants */
- loadrc k0.4s, 0x5a827999, w6
- loadrc k1.4s, 0x6ed9eba1, w6
- loadrc k2.4s, 0x8f1bbcdc, w6
- loadrc k3.4s, 0xca62c1d6, w6
-
- /* load state */
- ld1 {dgav.4s}, [x0]
- ldr dgb, [x0, #16]
-
- /* load sha1_ce_state::finalize */
- ldr_l w4, sha1_ce_offsetof_finalize, x4
- ldr w4, [x0, x4]
-
- /* load input */
-0: ld1 {v8.4s-v11.4s}, [x1], #64
- sub w2, w2, #1
-
-CPU_LE( rev32 v8.16b, v8.16b )
-CPU_LE( rev32 v9.16b, v9.16b )
-CPU_LE( rev32 v10.16b, v10.16b )
-CPU_LE( rev32 v11.16b, v11.16b )
-
-1: add t0.4s, v8.4s, k0.4s
- mov dg0v.16b, dgav.16b
-
- add_update c, ev, k0, 8, 9, 10, 11, dgb
- add_update c, od, k0, 9, 10, 11, 8
- add_update c, ev, k0, 10, 11, 8, 9
- add_update c, od, k0, 11, 8, 9, 10
- add_update c, ev, k1, 8, 9, 10, 11
-
- add_update p, od, k1, 9, 10, 11, 8
- add_update p, ev, k1, 10, 11, 8, 9
- add_update p, od, k1, 11, 8, 9, 10
- add_update p, ev, k1, 8, 9, 10, 11
- add_update p, od, k2, 9, 10, 11, 8
-
- add_update m, ev, k2, 10, 11, 8, 9
- add_update m, od, k2, 11, 8, 9, 10
- add_update m, ev, k2, 8, 9, 10, 11
- add_update m, od, k2, 9, 10, 11, 8
- add_update m, ev, k3, 10, 11, 8, 9
-
- add_update p, od, k3, 11, 8, 9, 10
- add_only p, ev, k3, 9
- add_only p, od, k3, 10
- add_only p, ev, k3, 11
- add_only p, od
-
- /* update state */
- add dgbv.2s, dgbv.2s, dg1v.2s
- add dgav.4s, dgav.4s, dg0v.4s
-
- cbz w2, 2f
- cond_yield 3f, x5, x6
- b 0b
-
- /*
- * Final block: add padding and total bit count.
- * Skip if the input size was not a round multiple of the block size,
- * the padding is handled by the C code in that case.
- */
-2: cbz x4, 3f
- ldr_l w4, sha1_ce_offsetof_count, x4
- ldr x4, [x0, x4]
- movi v9.2d, #0
- mov x8, #0x80000000
- movi v10.2d, #0
- ror x7, x4, #29 // ror(lsl(x4, 3), 32)
- fmov d8, x8
- mov x4, #0
- mov v11.d[0], xzr
- mov v11.d[1], x7
- b 1b
-
- /* store new state */
-3: st1 {dgav.4s}, [x0]
- str dgb, [x0, #16]
- mov w0, w2
- ret
-SYM_FUNC_END(__sha1_ce_transform)
diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c
deleted file mode 100644
index 65b6980817e5..000000000000
--- a/arch/arm64/crypto/sha1-ce-glue.c
+++ /dev/null
@@ -1,118 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2014 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/neon.h>
-#include <asm/simd.h>
-#include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/cpufeature.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("sha1");
-
-struct sha1_ce_state {
- struct sha1_state sst;
- u32 finalize;
-};
-
-extern const u32 sha1_ce_offsetof_count;
-extern const u32 sha1_ce_offsetof_finalize;
-
-asmlinkage int __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
- int blocks);
-
-static void sha1_ce_transform(struct sha1_state *sst, u8 const *src,
- int blocks)
-{
- while (blocks) {
- int rem;
-
- kernel_neon_begin();
- rem = __sha1_ce_transform(container_of(sst,
- struct sha1_ce_state,
- sst), src, blocks);
- kernel_neon_end();
- src += (blocks - rem) * SHA1_BLOCK_SIZE;
- blocks = rem;
- }
-}
-
-const u32 sha1_ce_offsetof_count = offsetof(struct sha1_ce_state, sst.count);
-const u32 sha1_ce_offsetof_finalize = offsetof(struct sha1_ce_state, finalize);
-
-static int sha1_ce_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- struct sha1_ce_state *sctx = shash_desc_ctx(desc);
-
- sctx->finalize = 0;
- return sha1_base_do_update_blocks(desc, data, len, sha1_ce_transform);
-}
-
-static int sha1_ce_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- struct sha1_ce_state *sctx = shash_desc_ctx(desc);
- bool finalized = false;
-
- /*
- * Allow the asm code to perform the finalization if there is no
- * partial data and the input is a round multiple of the block size.
- */
- if (len >= SHA1_BLOCK_SIZE) {
- unsigned int remain = len - round_down(len, SHA1_BLOCK_SIZE);
-
- finalized = !remain;
- sctx->finalize = finalized;
- sha1_base_do_update_blocks(desc, data, len, sha1_ce_transform);
- data += len - remain;
- len = remain;
- }
- if (!finalized) {
- sctx->finalize = 0;
- sha1_base_do_finup(desc, data, len, sha1_ce_transform);
- }
- return sha1_base_finish(desc, out);
-}
-
-static struct shash_alg alg = {
- .init = sha1_base_init,
- .update = sha1_ce_update,
- .finup = sha1_ce_finup,
- .descsize = sizeof(struct sha1_ce_state),
- .statesize = SHA1_STATE_SIZE,
- .digestsize = SHA1_DIGEST_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name = "sha1-ce",
- .cra_priority = 200,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int __init sha1_ce_mod_init(void)
-{
- return crypto_register_shash(&alg);
-}
-
-static void __exit sha1_ce_mod_fini(void)
-{
- crypto_unregister_shash(&alg);
-}
-
-module_cpu_feature_match(SHA1, sha1_ce_mod_init);
-module_exit(sha1_ce_mod_fini);
diff --git a/arch/arm64/crypto/sha512-ce-core.S b/arch/arm64/crypto/sha512-ce-core.S
deleted file mode 100644
index 91ef68b15fcc..000000000000
--- a/arch/arm64/crypto/sha512-ce-core.S
+++ /dev/null
@@ -1,206 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * sha512-ce-core.S - core SHA-384/SHA-512 transform using v8 Crypto Extensions
- *
- * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
- .set .Lq\b, \b
- .set .Lv\b\().2d, \b
- .endr
-
- .macro sha512h, rd, rn, rm
- .inst 0xce608000 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
- .endm
-
- .macro sha512h2, rd, rn, rm
- .inst 0xce608400 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
- .endm
-
- .macro sha512su0, rd, rn
- .inst 0xcec08000 | .L\rd | (.L\rn << 5)
- .endm
-
- .macro sha512su1, rd, rn, rm
- .inst 0xce608800 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
- .endm
-
- /*
- * The SHA-512 round constants
- */
- .section ".rodata", "a"
- .align 4
-.Lsha512_rcon:
- .quad 0x428a2f98d728ae22, 0x7137449123ef65cd
- .quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
- .quad 0x3956c25bf348b538, 0x59f111f1b605d019
- .quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
- .quad 0xd807aa98a3030242, 0x12835b0145706fbe
- .quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
- .quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
- .quad 0x9bdc06a725c71235, 0xc19bf174cf692694
- .quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
- .quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
- .quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
- .quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
- .quad 0x983e5152ee66dfab, 0xa831c66d2db43210
- .quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4
- .quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725
- .quad 0x06ca6351e003826f, 0x142929670a0e6e70
- .quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926
- .quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
- .quad 0x650a73548baf63de, 0x766a0abb3c77b2a8
- .quad 0x81c2c92e47edaee6, 0x92722c851482353b
- .quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001
- .quad 0xc24b8b70d0f89791, 0xc76c51a30654be30
- .quad 0xd192e819d6ef5218, 0xd69906245565a910
- .quad 0xf40e35855771202a, 0x106aa07032bbd1b8
- .quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
- .quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
- .quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
- .quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
- .quad 0x748f82ee5defb2fc, 0x78a5636f43172f60
- .quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec
- .quad 0x90befffa23631e28, 0xa4506cebde82bde9
- .quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b
- .quad 0xca273eceea26619c, 0xd186b8c721c0c207
- .quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
- .quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6
- .quad 0x113f9804bef90dae, 0x1b710b35131c471b
- .quad 0x28db77f523047d84, 0x32caab7b40c72493
- .quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
- .quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
- .quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
-
- .macro dround, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4
- .ifnb \rc1
- ld1 {v\rc1\().2d}, [x4], #16
- .endif
- add v5.2d, v\rc0\().2d, v\in0\().2d
- ext v6.16b, v\i2\().16b, v\i3\().16b, #8
- ext v5.16b, v5.16b, v5.16b, #8
- ext v7.16b, v\i1\().16b, v\i2\().16b, #8
- add v\i3\().2d, v\i3\().2d, v5.2d
- .ifnb \in1
- ext v5.16b, v\in3\().16b, v\in4\().16b, #8
- sha512su0 v\in0\().2d, v\in1\().2d
- .endif
- sha512h q\i3, q6, v7.2d
- .ifnb \in1
- sha512su1 v\in0\().2d, v\in2\().2d, v5.2d
- .endif
- add v\i4\().2d, v\i1\().2d, v\i3\().2d
- sha512h2 q\i3, q\i1, v\i0\().2d
- .endm
-
- /*
- * int __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
- * int blocks)
- */
- .text
-SYM_FUNC_START(__sha512_ce_transform)
- /* load state */
- ld1 {v8.2d-v11.2d}, [x0]
-
- /* load first 4 round constants */
- adr_l x3, .Lsha512_rcon
- ld1 {v20.2d-v23.2d}, [x3], #64
-
- /* load input */
-0: ld1 {v12.2d-v15.2d}, [x1], #64
- ld1 {v16.2d-v19.2d}, [x1], #64
- sub w2, w2, #1
-
-CPU_LE( rev64 v12.16b, v12.16b )
-CPU_LE( rev64 v13.16b, v13.16b )
-CPU_LE( rev64 v14.16b, v14.16b )
-CPU_LE( rev64 v15.16b, v15.16b )
-CPU_LE( rev64 v16.16b, v16.16b )
-CPU_LE( rev64 v17.16b, v17.16b )
-CPU_LE( rev64 v18.16b, v18.16b )
-CPU_LE( rev64 v19.16b, v19.16b )
-
- mov x4, x3 // rc pointer
-
- mov v0.16b, v8.16b
- mov v1.16b, v9.16b
- mov v2.16b, v10.16b
- mov v3.16b, v11.16b
-
- // v0 ab cd -- ef gh ab
- // v1 cd -- ef gh ab cd
- // v2 ef gh ab cd -- ef
- // v3 gh ab cd -- ef gh
- // v4 -- ef gh ab cd --
-
- dround 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17
- dround 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18
- dround 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19
- dround 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12
- dround 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13
-
- dround 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14
- dround 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15
- dround 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16
- dround 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17
- dround 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18
-
- dround 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19
- dround 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12
- dround 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13
- dround 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14
- dround 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15
-
- dround 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16
- dround 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17
- dround 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18
- dround 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19
- dround 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12
-
- dround 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13
- dround 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14
- dround 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15
- dround 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16
- dround 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17
-
- dround 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18
- dround 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19
- dround 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12
- dround 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13
- dround 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14
-
- dround 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15
- dround 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16
- dround 2, 3, 1, 4, 0, 28, 24, 12
- dround 4, 2, 0, 1, 3, 29, 25, 13
- dround 1, 4, 3, 0, 2, 30, 26, 14
-
- dround 0, 1, 2, 3, 4, 31, 27, 15
- dround 3, 0, 4, 2, 1, 24, , 16
- dround 2, 3, 1, 4, 0, 25, , 17
- dround 4, 2, 0, 1, 3, 26, , 18
- dround 1, 4, 3, 0, 2, 27, , 19
-
- /* update state */
- add v8.2d, v8.2d, v0.2d
- add v9.2d, v9.2d, v1.2d
- add v10.2d, v10.2d, v2.2d
- add v11.2d, v11.2d, v3.2d
-
- cond_yield 3f, x4, x5
- /* handled all input blocks? */
- cbnz w2, 0b
-
- /* store new state */
-3: st1 {v8.2d-v11.2d}, [x0]
- mov w0, w2
- ret
-SYM_FUNC_END(__sha512_ce_transform)
diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c
deleted file mode 100644
index 6fb3001fa2c9..000000000000
--- a/arch/arm64/crypto/sha512-ce-glue.c
+++ /dev/null
@@ -1,96 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * sha512-ce-glue.c - SHA-384/SHA-512 using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha512_base.h>
-#include <linux/cpufeature.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash using ARMv8 Crypto Extensions");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("sha384");
-MODULE_ALIAS_CRYPTO("sha512");
-
-asmlinkage int __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
- int blocks);
-
-static void sha512_ce_transform(struct sha512_state *sst, u8 const *src,
- int blocks)
-{
- do {
- int rem;
-
- kernel_neon_begin();
- rem = __sha512_ce_transform(sst, src, blocks);
- kernel_neon_end();
- src += (blocks - rem) * SHA512_BLOCK_SIZE;
- blocks = rem;
- } while (blocks);
-}
-
-static int sha512_ce_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha512_base_do_update_blocks(desc, data, len,
- sha512_ce_transform);
-}
-
-static int sha512_ce_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- sha512_base_do_finup(desc, data, len, sha512_ce_transform);
- return sha512_base_finish(desc, out);
-}
-
-static struct shash_alg algs[] = { {
- .init = sha384_base_init,
- .update = sha512_ce_update,
- .finup = sha512_ce_finup,
- .descsize = SHA512_STATE_SIZE,
- .digestsize = SHA384_DIGEST_SIZE,
- .base.cra_name = "sha384",
- .base.cra_driver_name = "sha384-ce",
- .base.cra_priority = 200,
- .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .base.cra_blocksize = SHA512_BLOCK_SIZE,
- .base.cra_module = THIS_MODULE,
-}, {
- .init = sha512_base_init,
- .update = sha512_ce_update,
- .finup = sha512_ce_finup,
- .descsize = SHA512_STATE_SIZE,
- .digestsize = SHA512_DIGEST_SIZE,
- .base.cra_name = "sha512",
- .base.cra_driver_name = "sha512-ce",
- .base.cra_priority = 200,
- .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .base.cra_blocksize = SHA512_BLOCK_SIZE,
- .base.cra_module = THIS_MODULE,
-} };
-
-static int __init sha512_ce_mod_init(void)
-{
- return crypto_register_shashes(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit sha512_ce_mod_fini(void)
-{
- crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
-}
-
-module_cpu_feature_match(SHA512, sha512_ce_mod_init);
-module_exit(sha512_ce_mod_fini);
diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c
deleted file mode 100644
index 15aa9d8b7b2c..000000000000
--- a/arch/arm64/crypto/sha512-glue.c
+++ /dev/null
@@ -1,83 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Linux/arm64 port of the OpenSSL SHA512 implementation for AArch64
- *
- * Copyright (c) 2016 Linaro Ltd. <ard.biesheuvel@linaro.org>
- */
-
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha512_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash for arm64");
-MODULE_AUTHOR("Andy Polyakov <appro@openssl.org>");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("sha384");
-MODULE_ALIAS_CRYPTO("sha512");
-
-asmlinkage void sha512_blocks_arch(u64 *digest, const void *data,
- unsigned int num_blks);
-
-static void sha512_arm64_transform(struct sha512_state *sst, u8 const *src,
- int blocks)
-{
- sha512_blocks_arch(sst->state, src, blocks);
-}
-
-static int sha512_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha512_base_do_update_blocks(desc, data, len,
- sha512_arm64_transform);
-}
-
-static int sha512_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- sha512_base_do_finup(desc, data, len, sha512_arm64_transform);
- return sha512_base_finish(desc, out);
-}
-
-static struct shash_alg algs[] = { {
- .digestsize = SHA512_DIGEST_SIZE,
- .init = sha512_base_init,
- .update = sha512_update,
- .finup = sha512_finup,
- .descsize = SHA512_STATE_SIZE,
- .base.cra_name = "sha512",
- .base.cra_driver_name = "sha512-arm64",
- .base.cra_priority = 150,
- .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .base.cra_blocksize = SHA512_BLOCK_SIZE,
- .base.cra_module = THIS_MODULE,
-}, {
- .digestsize = SHA384_DIGEST_SIZE,
- .init = sha384_base_init,
- .update = sha512_update,
- .finup = sha512_finup,
- .descsize = SHA512_STATE_SIZE,
- .base.cra_name = "sha384",
- .base.cra_driver_name = "sha384-arm64",
- .base.cra_priority = 150,
- .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .base.cra_blocksize = SHA384_BLOCK_SIZE,
- .base.cra_module = THIS_MODULE,
-} };
-
-static int __init sha512_mod_init(void)
-{
- return crypto_register_shashes(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit sha512_mod_fini(void)
-{
- crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
-}
-
-module_init(sha512_mod_init);
-module_exit(sha512_mod_fini);
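
The glue just removed registered the sha384/sha512 shashes on top of sha512_blocks_arch(), the arch hook of the lib/crypto SHA-512 code (the crypto/Makefile hunk above likewise drops the perlasm rule that generated sha512-core.S from lib/crypto/sha2-armv8.pl). In-kernel users can reach the same arch-optimized code through the library interface instead; a sketch, assuming the sha512() one-shot helper from the lib/crypto conversion series:

    #include <crypto/sha2.h>

    static void digest_example(const u8 *data, size_t len)
    {
            u8 out[SHA512_DIGEST_SIZE];

            /* One-shot library call; dispatches to the arch SHA-512 blocks. */
            sha512(data, len, out);
    }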
diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h
index a407f9cd549e..c07a58b96329 100644
--- a/arch/arm64/include/asm/acpi.h
+++ b/arch/arm64/include/asm/acpi.h
@@ -150,7 +150,7 @@ acpi_set_mailbox_entry(int cpu, struct acpi_madt_generic_interrupt *processor)
{}
#endif
-static inline const char *acpi_get_enable_method(int cpu)
+static __always_inline const char *acpi_get_enable_method(int cpu)
{
if (acpi_psci_present())
return "psci";
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index ad63457a05c5..c56c21bb1eec 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -41,6 +41,11 @@
/*
* Save/restore interrupts.
*/
+ .macro save_and_disable_daif, flags
+ mrs \flags, daif
+ msr daifset, #0xf
+ .endm
+
.macro save_and_disable_irq, flags
mrs \flags, daif
msr daifset, #3
diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
index ba5df0df02a4..9f38340d24c2 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -287,17 +287,6 @@
.Lskip_fgt2_\@:
.endm
-.macro __init_el2_gcs
- mrs_s x1, SYS_ID_AA64PFR1_EL1
- ubfx x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4
- cbz x1, .Lskip_gcs_\@
-
- /* Ensure GCS is not enabled when we start trying to do BLs */
- msr_s SYS_GCSCR_EL1, xzr
- msr_s SYS_GCSCRE0_EL1, xzr
-.Lskip_gcs_\@:
-.endm
-
/**
* Initialize EL2 registers to sane values. This should be called early on all
* cores that were booted in EL2. Note that everything gets initialised as
@@ -319,7 +308,6 @@
__init_el2_cptr
__init_el2_fgt
__init_el2_fgt2
- __init_el2_gcs
.endm
#ifndef __KVM_NVHE_HYPERVISOR__
@@ -371,6 +359,13 @@
msr_s SYS_MPAMHCR_EL2, xzr // clear TRAP_MPAMIDR_EL1 -> EL2
.Lskip_mpam_\@:
+ check_override id_aa64pfr1, ID_AA64PFR1_EL1_GCS_SHIFT, .Linit_gcs_\@, .Lskip_gcs_\@, x1, x2
+
+.Linit_gcs_\@:
+ msr_s SYS_GCSCR_EL1, xzr
+ msr_s SYS_GCSCRE0_EL1, xzr
+
+.Lskip_gcs_\@:
check_override id_aa64pfr0, ID_AA64PFR0_EL1_SVE_SHIFT, .Linit_sve_\@, .Lskip_sve_\@, x1, x2
.Linit_sve_\@: /* SVE register access */
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index bd020fc28aa9..0720898f563e 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -561,68 +561,6 @@ static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu)
vcpu_set_flag((v), e); \
} while (0)
-#define __build_check_all_or_none(r, bits) \
- BUILD_BUG_ON(((r) & (bits)) && ((r) & (bits)) != (bits))
-
-#define __cpacr_to_cptr_clr(clr, set) \
- ({ \
- u64 cptr = 0; \
- \
- if ((set) & CPACR_EL1_FPEN) \
- cptr |= CPTR_EL2_TFP; \
- if ((set) & CPACR_EL1_ZEN) \
- cptr |= CPTR_EL2_TZ; \
- if ((set) & CPACR_EL1_SMEN) \
- cptr |= CPTR_EL2_TSM; \
- if ((clr) & CPACR_EL1_TTA) \
- cptr |= CPTR_EL2_TTA; \
- if ((clr) & CPTR_EL2_TAM) \
- cptr |= CPTR_EL2_TAM; \
- if ((clr) & CPTR_EL2_TCPAC) \
- cptr |= CPTR_EL2_TCPAC; \
- \
- cptr; \
- })
-
-#define __cpacr_to_cptr_set(clr, set) \
- ({ \
- u64 cptr = 0; \
- \
- if ((clr) & CPACR_EL1_FPEN) \
- cptr |= CPTR_EL2_TFP; \
- if ((clr) & CPACR_EL1_ZEN) \
- cptr |= CPTR_EL2_TZ; \
- if ((clr) & CPACR_EL1_SMEN) \
- cptr |= CPTR_EL2_TSM; \
- if ((set) & CPACR_EL1_TTA) \
- cptr |= CPTR_EL2_TTA; \
- if ((set) & CPTR_EL2_TAM) \
- cptr |= CPTR_EL2_TAM; \
- if ((set) & CPTR_EL2_TCPAC) \
- cptr |= CPTR_EL2_TCPAC; \
- \
- cptr; \
- })
-
-#define cpacr_clear_set(clr, set) \
- do { \
- BUILD_BUG_ON((set) & CPTR_VHE_EL2_RES0); \
- BUILD_BUG_ON((clr) & CPACR_EL1_E0POE); \
- __build_check_all_or_none((clr), CPACR_EL1_FPEN); \
- __build_check_all_or_none((set), CPACR_EL1_FPEN); \
- __build_check_all_or_none((clr), CPACR_EL1_ZEN); \
- __build_check_all_or_none((set), CPACR_EL1_ZEN); \
- __build_check_all_or_none((clr), CPACR_EL1_SMEN); \
- __build_check_all_or_none((set), CPACR_EL1_SMEN); \
- \
- if (has_vhe() || has_hvhe()) \
- sysreg_clear_set(cpacr_el1, clr, set); \
- else \
- sysreg_clear_set(cptr_el2, \
- __cpacr_to_cptr_clr(clr, set), \
- __cpacr_to_cptr_set(clr, set));\
- } while (0)
-
/*
* Returns a 'sanitised' view of CPTR_EL2, translating from nVHE to the VHE
* format if E2H isn't set.
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 5ccca509dff1..3e41a880b062 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -1289,9 +1289,8 @@ void kvm_arm_resume_guest(struct kvm *kvm);
})
/*
- * The couple of isb() below are there to guarantee the same behaviour
- * on VHE as on !VHE, where the eret to EL1 acts as a context
- * synchronization event.
+ * The isb() below is there to guarantee the same behaviour on VHE as on !VHE,
+ * where the eret to EL1 acts as a context synchronization event.
*/
#define kvm_call_hyp(f, ...) \
do { \
@@ -1309,7 +1308,6 @@ void kvm_arm_resume_guest(struct kvm *kvm);
\
if (has_vhe()) { \
ret = f(__VA_ARGS__); \
- isb(); \
} else { \
ret = kvm_call_hyp_nvhe(f, ##__VA_ARGS__); \
} \
@@ -1482,7 +1480,6 @@ int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm,
struct reg_mask_range *range);
/* Guest/host FPSIMD coordination helpers */
-int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 2920b0a51403..a2faf0049dab 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -34,7 +34,7 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \
cpufeature.o alternative.o cacheinfo.o \
smp.o smp_spin_table.o topology.o smccc-call.o \
syscall.o proton-pack.o idle.o patching.o pi/ \
- rsi.o
+ rsi.o jump_label.o
obj-$(CONFIG_COMPAT) += sys32.o signal32.o \
sys_compat.o
@@ -47,7 +47,6 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o
obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_CPU_PM) += sleep.o suspend.o
-obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_EFI) += efi.o efi-rt-wrapper.o
obj-$(CONFIG_PCI) += pci.o
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index b34044e20128..e151585c6cca 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -3135,6 +3135,13 @@ static bool has_sve_feature(const struct arm64_cpu_capabilities *cap, int scope)
}
#endif
+#ifdef CONFIG_ARM64_SME
+static bool has_sme_feature(const struct arm64_cpu_capabilities *cap, int scope)
+{
+ return system_supports_sme() && has_user_cpuid_feature(cap, scope);
+}
+#endif
+
static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
HWCAP_CAP(ID_AA64ISAR0_EL1, AES, PMULL, CAP_HWCAP, KERNEL_HWCAP_PMULL),
HWCAP_CAP(ID_AA64ISAR0_EL1, AES, AES, CAP_HWCAP, KERNEL_HWCAP_AES),
@@ -3223,31 +3230,31 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
HWCAP_CAP(ID_AA64ISAR2_EL1, BC, IMP, CAP_HWCAP, KERNEL_HWCAP_HBC),
#ifdef CONFIG_ARM64_SME
HWCAP_CAP(ID_AA64PFR1_EL1, SME, IMP, CAP_HWCAP, KERNEL_HWCAP_SME),
- HWCAP_CAP(ID_AA64SMFR0_EL1, FA64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_FA64),
- HWCAP_CAP(ID_AA64SMFR0_EL1, LUTv2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_LUTV2),
- HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2p2, CAP_HWCAP, KERNEL_HWCAP_SME2P2),
- HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2p1, CAP_HWCAP, KERNEL_HWCAP_SME2P1),
- HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2, CAP_HWCAP, KERNEL_HWCAP_SME2),
- HWCAP_CAP(ID_AA64SMFR0_EL1, I16I64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64),
- HWCAP_CAP(ID_AA64SMFR0_EL1, F64F64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F64F64),
- HWCAP_CAP(ID_AA64SMFR0_EL1, I16I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I32),
- HWCAP_CAP(ID_AA64SMFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16B16),
- HWCAP_CAP(ID_AA64SMFR0_EL1, F16F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F16),
- HWCAP_CAP(ID_AA64SMFR0_EL1, F8F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F16),
- HWCAP_CAP(ID_AA64SMFR0_EL1, F8F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F32),
- HWCAP_CAP(ID_AA64SMFR0_EL1, I8I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32),
- HWCAP_CAP(ID_AA64SMFR0_EL1, F16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32),
- HWCAP_CAP(ID_AA64SMFR0_EL1, B16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32),
- HWCAP_CAP(ID_AA64SMFR0_EL1, BI32I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_BI32I32),
- HWCAP_CAP(ID_AA64SMFR0_EL1, F32F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32),
- HWCAP_CAP(ID_AA64SMFR0_EL1, SF8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8FMA),
- HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP4),
- HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP2),
- HWCAP_CAP(ID_AA64SMFR0_EL1, SBitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SBITPERM),
- HWCAP_CAP(ID_AA64SMFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_AES),
- HWCAP_CAP(ID_AA64SMFR0_EL1, SFEXPA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SFEXPA),
- HWCAP_CAP(ID_AA64SMFR0_EL1, STMOP, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_STMOP),
- HWCAP_CAP(ID_AA64SMFR0_EL1, SMOP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SMOP4),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, FA64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_FA64),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, LUTv2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_LUTV2),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMEver, SME2p2, CAP_HWCAP, KERNEL_HWCAP_SME2P2),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMEver, SME2p1, CAP_HWCAP, KERNEL_HWCAP_SME2P1),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMEver, SME2, CAP_HWCAP, KERNEL_HWCAP_SME2),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, I16I64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F64F64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F64F64),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, I16I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I32),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16B16),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F16F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F16),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F8F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F16),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F8F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F32),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, I8I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, B16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, BI32I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_BI32I32),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F32F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SF8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8FMA),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SF8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP4),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SF8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP2),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SBitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SBITPERM),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_AES),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SFEXPA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SFEXPA),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, STMOP, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_STMOP),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMOP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SMOP4),
#endif /* CONFIG_ARM64_SME */
HWCAP_CAP(ID_AA64FPFR0_EL1, F8CVT, IMP, CAP_HWCAP, KERNEL_HWCAP_F8CVT),
HWCAP_CAP(ID_AA64FPFR0_EL1, F8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_F8FMA),
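The hunk above switches each ID_AA64SMFR0_EL1-derived hwcap from a plain field match to HWCAP_CAP_MATCH_ID with has_sme_feature(), so a sub-feature is only reported when SME itself is usable. A minimal standalone model of such a combined matcher, with all names illustrative rather than the kernel's:

/*
 * Minimal model of gating a field-derived capability on a parent
 * feature; all names here are illustrative, not the kernel's.
 */
#include <stdbool.h>

static bool parent_present;	/* stands in for system_supports_sme() */

static bool field_at_least(unsigned long reg, int shift, unsigned min)
{
	return ((reg >> shift) & 0xf) >= min;	/* unsigned 4-bit ID field */
}

/* Combined matcher: the sub-feature counts only if the parent does. */
static bool has_sub_feature(unsigned long reg, int shift, unsigned min)
{
	return parent_present && field_at_least(reg, shift, min);
}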
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index 3857fd7ee8d4..62230d6dd919 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -15,6 +15,7 @@
#include <asm/efi.h>
#include <asm/stacktrace.h>
+#include <asm/vmap_stack.h>
static bool region_is_misaligned(const efi_memory_desc_t *md)
{
@@ -214,9 +215,13 @@ static int __init arm64_efi_rt_init(void)
if (!efi_enabled(EFI_RUNTIME_SERVICES))
return 0;
- p = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, GFP_KERNEL,
- NUMA_NO_NODE, &&l);
-l: if (!p) {
+ if (!IS_ENABLED(CONFIG_VMAP_STACK)) {
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
+ return -ENOMEM;
+ }
+
+ p = arch_alloc_vmap_stack(THREAD_SIZE, NUMA_NO_NODE);
+ if (!p) {
pr_warn("Failed to allocate EFI runtime stack\n");
clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
return -ENOMEM;
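The replacement allocator only makes sense when the kernel uses vmapped stacks, hence the new IS_ENABLED(CONFIG_VMAP_STACK) guard that disables EFI runtime services outright rather than fall back to an unsuitable allocation. A sketch of that guard-then-allocate shape, with stand-in names rather than the kernel API:

#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>

static bool vmap_stacks;			/* ~IS_ENABLED(CONFIG_VMAP_STACK) */

static void *alloc_vmap_stack(size_t size)	/* ~arch_alloc_vmap_stack() */
{
	return malloc(size);			/* placeholder allocation */
}

static int init_rt_stack(void **stack, size_t size)
{
	if (!vmap_stacks)
		return -1;	/* refuse the feature rather than mis-allocate */
	*stack = alloc_vmap_stack(size);
	return *stack ? 0 : -1;
}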
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 5ae2a34b50bd..fe62218b8e0e 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -614,7 +614,7 @@ SYM_CODE_END(ret_to_kernel)
SYM_CODE_START_LOCAL(ret_to_user)
ldr x19, [tsk, #TSK_TI_FLAGS] // re-check for single-step
enable_step_tsk x19, x2
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
bl stackleak_erase_on_task_stack
#endif
kernel_exit 0
@@ -825,6 +825,7 @@ SYM_CODE_END(__bp_harden_el1_vectors)
*
*/
SYM_FUNC_START(cpu_switch_to)
+ save_and_disable_daif x11
mov x10, #THREAD_CPU_CONTEXT
add x8, x0, x10
mov x9, sp
@@ -848,6 +849,7 @@ SYM_FUNC_START(cpu_switch_to)
ptrauth_keys_install_kernel x1, x8, x9, x10
scs_save x0
scs_load_current
+ restore_irq x11
ret
SYM_FUNC_END(cpu_switch_to)
NOKPROBE(cpu_switch_to)
@@ -874,6 +876,7 @@ NOKPROBE(ret_from_fork)
* Calls func(regs) using this CPU's irq stack and shadow irq stack.
*/
SYM_FUNC_START(call_on_irq_stack)
+ save_and_disable_daif x9
#ifdef CONFIG_SHADOW_CALL_STACK
get_current_task x16
scs_save x16
@@ -888,8 +891,10 @@ SYM_FUNC_START(call_on_irq_stack)
/* Move to the new stack and call the function there */
add sp, x16, #IRQ_STACK_SIZE
+ restore_irq x9
blr x1
+ save_and_disable_daif x9
/*
* Restore the SP from the FP, and restore the FP and LR from the frame
* record.
@@ -897,6 +902,7 @@ SYM_FUNC_START(call_on_irq_stack)
mov sp, x29
ldp x29, x30, [sp], #16
scs_load_current
+ restore_irq x9
ret
SYM_FUNC_END(call_on_irq_stack)
NOKPROBE(call_on_irq_stack)
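Both cpu_switch_to() and call_on_irq_stack() now bracket the window where SP and the shadow call stack are inconsistent with save_and_disable_daif/restore_irq, and call_on_irq_stack re-enables exceptions around the callee itself. A userspace analogue of that save-mask/restore bracket, using signal masking in place of DAIF:

#include <signal.h>

static void critical_stack_switch(void)
{
	sigset_t all, saved;

	sigfillset(&all);
	sigprocmask(SIG_BLOCK, &all, &saved);	/* ~save_and_disable_daif */

	/* ...swap stack pointers while nothing asynchronous can run... */

	sigprocmask(SIG_SETMASK, &saved, NULL);	/* ~restore_irq */
}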
diff --git a/arch/arm64/kernel/pi/Makefile b/arch/arm64/kernel/pi/Makefile
index 4d11a8c29181..f440bf57b1a5 100644
--- a/arch/arm64/kernel/pi/Makefile
+++ b/arch/arm64/kernel/pi/Makefile
@@ -2,7 +2,7 @@
# Copyright 2022 Google LLC
KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \
- -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_STACKLEAK_PLUGIN) \
+ -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_KSTACK_ERASE) \
$(DISABLE_LATENT_ENTROPY_PLUGIN) \
$(call cc-option,-mbranch-protection=none) \
-I$(srctree)/scripts/dtc/libfdt -fno-stack-protector \
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index a5ca15daeb8a..08b7042a2e2d 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -288,7 +288,9 @@ static void flush_gcs(void)
if (!system_supports_gcs())
return;
- gcs_free(current);
+ current->thread.gcspr_el0 = 0;
+ current->thread.gcs_base = 0;
+ current->thread.gcs_size = 0;
current->thread.gcs_el0_mode = 0;
write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1);
write_sysreg_s(0, SYS_GCSPR_EL0);
@@ -671,6 +673,11 @@ static void permission_overlay_switch(struct task_struct *next)
current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0);
if (current->thread.por_el0 != next->thread.por_el0) {
write_sysreg_s(next->thread.por_el0, SYS_POR_EL0);
+ /*
+ * No ISB required as we can tolerate spurious Overlay faults -
+ * the fault handler will check again based on the new value
+ * of POR_EL0.
+ */
}
}
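The new comment documents a deliberate trade: skipping the ISB after the POR_EL0 write is safe only because a stale value can at worst produce a spurious overlay fault, and the fault handler re-evaluates permissions against the freshly written register. A toy model of that tolerate-and-recheck pattern, purely illustrative:

#include <stdbool.h>

static unsigned long live_perms;	/* hardware may briefly lag this value */

static void set_perms(unsigned long v)
{
	live_perms = v;	/* no barrier: a racing access may fault spuriously */
}

/* Fault handler: re-check against the latest value; retry if now allowed. */
static bool spurious_fault(unsigned long need)
{
	return (live_perms & need) == need;
}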
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index a360e52db02f..4b001121c72d 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -141,7 +141,7 @@ unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n)
addr += n;
if (regs_within_kernel_stack(regs, (unsigned long)addr))
- return *addr;
+ return READ_ONCE_NOCHECK(*addr);
else
return 0;
}
@@ -1586,7 +1586,7 @@ enum aarch64_regset {
static const struct user_regset aarch64_regsets[] = {
[REGSET_GPR] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = sizeof(struct user_pt_regs) / sizeof(u64),
.size = sizeof(u64),
.align = sizeof(u64),
@@ -1594,7 +1594,7 @@ static const struct user_regset aarch64_regsets[] = {
.set = gpr_set
},
[REGSET_FPR] = {
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(struct user_fpsimd_state) / sizeof(u32),
/*
* We pretend we have 32-bit registers because the fpsr and
@@ -1607,7 +1607,7 @@ static const struct user_regset aarch64_regsets[] = {
.set = fpr_set
},
[REGSET_TLS] = {
- .core_note_type = NT_ARM_TLS,
+ USER_REGSET_NOTE_TYPE(ARM_TLS),
.n = 2,
.size = sizeof(void *),
.align = sizeof(void *),
@@ -1616,7 +1616,7 @@ static const struct user_regset aarch64_regsets[] = {
},
#ifdef CONFIG_HAVE_HW_BREAKPOINT
[REGSET_HW_BREAK] = {
- .core_note_type = NT_ARM_HW_BREAK,
+ USER_REGSET_NOTE_TYPE(ARM_HW_BREAK),
.n = sizeof(struct user_hwdebug_state) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -1624,7 +1624,7 @@ static const struct user_regset aarch64_regsets[] = {
.set = hw_break_set,
},
[REGSET_HW_WATCH] = {
- .core_note_type = NT_ARM_HW_WATCH,
+ USER_REGSET_NOTE_TYPE(ARM_HW_WATCH),
.n = sizeof(struct user_hwdebug_state) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -1633,7 +1633,7 @@ static const struct user_regset aarch64_regsets[] = {
},
#endif
[REGSET_SYSTEM_CALL] = {
- .core_note_type = NT_ARM_SYSTEM_CALL,
+ USER_REGSET_NOTE_TYPE(ARM_SYSTEM_CALL),
.n = 1,
.size = sizeof(int),
.align = sizeof(int),
@@ -1641,7 +1641,7 @@ static const struct user_regset aarch64_regsets[] = {
.set = system_call_set,
},
[REGSET_FPMR] = {
- .core_note_type = NT_ARM_FPMR,
+ USER_REGSET_NOTE_TYPE(ARM_FPMR),
.n = 1,
.size = sizeof(u64),
.align = sizeof(u64),
@@ -1650,7 +1650,7 @@ static const struct user_regset aarch64_regsets[] = {
},
#ifdef CONFIG_ARM64_SVE
[REGSET_SVE] = { /* Scalable Vector Extension */
- .core_note_type = NT_ARM_SVE,
+ USER_REGSET_NOTE_TYPE(ARM_SVE),
.n = DIV_ROUND_UP(SVE_PT_SIZE(ARCH_SVE_VQ_MAX,
SVE_PT_REGS_SVE),
SVE_VQ_BYTES),
@@ -1662,7 +1662,7 @@ static const struct user_regset aarch64_regsets[] = {
#endif
#ifdef CONFIG_ARM64_SME
[REGSET_SSVE] = { /* Streaming mode SVE */
- .core_note_type = NT_ARM_SSVE,
+ USER_REGSET_NOTE_TYPE(ARM_SSVE),
.n = DIV_ROUND_UP(SVE_PT_SIZE(SME_VQ_MAX, SVE_PT_REGS_SVE),
SVE_VQ_BYTES),
.size = SVE_VQ_BYTES,
@@ -1671,7 +1671,7 @@ static const struct user_regset aarch64_regsets[] = {
.set = ssve_set,
},
[REGSET_ZA] = { /* SME ZA */
- .core_note_type = NT_ARM_ZA,
+ USER_REGSET_NOTE_TYPE(ARM_ZA),
/*
* ZA is a single register but it's variably sized and
* the ptrace core requires that the size of any data
@@ -1687,7 +1687,7 @@ static const struct user_regset aarch64_regsets[] = {
.set = za_set,
},
[REGSET_ZT] = { /* SME ZT */
- .core_note_type = NT_ARM_ZT,
+ USER_REGSET_NOTE_TYPE(ARM_ZT),
.n = 1,
.size = ZT_SIG_REG_BYTES,
.align = sizeof(u64),
@@ -1697,7 +1697,7 @@ static const struct user_regset aarch64_regsets[] = {
#endif
#ifdef CONFIG_ARM64_PTR_AUTH
[REGSET_PAC_MASK] = {
- .core_note_type = NT_ARM_PAC_MASK,
+ USER_REGSET_NOTE_TYPE(ARM_PAC_MASK),
.n = sizeof(struct user_pac_mask) / sizeof(u64),
.size = sizeof(u64),
.align = sizeof(u64),
@@ -1705,7 +1705,7 @@ static const struct user_regset aarch64_regsets[] = {
/* this cannot be set dynamically */
},
[REGSET_PAC_ENABLED_KEYS] = {
- .core_note_type = NT_ARM_PAC_ENABLED_KEYS,
+ USER_REGSET_NOTE_TYPE(ARM_PAC_ENABLED_KEYS),
.n = 1,
.size = sizeof(long),
.align = sizeof(long),
@@ -1714,7 +1714,7 @@ static const struct user_regset aarch64_regsets[] = {
},
#ifdef CONFIG_CHECKPOINT_RESTORE
[REGSET_PACA_KEYS] = {
- .core_note_type = NT_ARM_PACA_KEYS,
+ USER_REGSET_NOTE_TYPE(ARM_PACA_KEYS),
.n = sizeof(struct user_pac_address_keys) / sizeof(__uint128_t),
.size = sizeof(__uint128_t),
.align = sizeof(__uint128_t),
@@ -1722,7 +1722,7 @@ static const struct user_regset aarch64_regsets[] = {
.set = pac_address_keys_set,
},
[REGSET_PACG_KEYS] = {
- .core_note_type = NT_ARM_PACG_KEYS,
+ USER_REGSET_NOTE_TYPE(ARM_PACG_KEYS),
.n = sizeof(struct user_pac_generic_keys) / sizeof(__uint128_t),
.size = sizeof(__uint128_t),
.align = sizeof(__uint128_t),
@@ -1733,7 +1733,7 @@ static const struct user_regset aarch64_regsets[] = {
#endif
#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI
[REGSET_TAGGED_ADDR_CTRL] = {
- .core_note_type = NT_ARM_TAGGED_ADDR_CTRL,
+ USER_REGSET_NOTE_TYPE(ARM_TAGGED_ADDR_CTRL),
.n = 1,
.size = sizeof(long),
.align = sizeof(long),
@@ -1743,7 +1743,7 @@ static const struct user_regset aarch64_regsets[] = {
#endif
#ifdef CONFIG_ARM64_POE
[REGSET_POE] = {
- .core_note_type = NT_ARM_POE,
+ USER_REGSET_NOTE_TYPE(ARM_POE),
.n = 1,
.size = sizeof(long),
.align = sizeof(long),
@@ -1753,7 +1753,7 @@ static const struct user_regset aarch64_regsets[] = {
#endif
#ifdef CONFIG_ARM64_GCS
[REGSET_GCS] = {
- .core_note_type = NT_ARM_GCS,
+ USER_REGSET_NOTE_TYPE(ARM_GCS),
.n = sizeof(struct user_gcs) / sizeof(u64),
.size = sizeof(u64),
.align = sizeof(u64),
@@ -1943,7 +1943,7 @@ static int compat_tls_set(struct task_struct *target,
static const struct user_regset aarch32_regsets[] = {
[REGSET_COMPAT_GPR] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = COMPAT_ELF_NGREG,
.size = sizeof(compat_elf_greg_t),
.align = sizeof(compat_elf_greg_t),
@@ -1951,7 +1951,7 @@ static const struct user_regset aarch32_regsets[] = {
.set = compat_gpr_set
},
[REGSET_COMPAT_VFP] = {
- .core_note_type = NT_ARM_VFP,
+ USER_REGSET_NOTE_TYPE(ARM_VFP),
.n = VFP_STATE_SIZE / sizeof(compat_ulong_t),
.size = sizeof(compat_ulong_t),
.align = sizeof(compat_ulong_t),
@@ -1968,7 +1968,7 @@ static const struct user_regset_view user_aarch32_view = {
static const struct user_regset aarch32_ptrace_regsets[] = {
[REGSET_GPR] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = COMPAT_ELF_NGREG,
.size = sizeof(compat_elf_greg_t),
.align = sizeof(compat_elf_greg_t),
@@ -1976,7 +1976,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = {
.set = compat_gpr_set
},
[REGSET_FPR] = {
- .core_note_type = NT_ARM_VFP,
+ USER_REGSET_NOTE_TYPE(ARM_VFP),
.n = VFP_STATE_SIZE / sizeof(compat_ulong_t),
.size = sizeof(compat_ulong_t),
.align = sizeof(compat_ulong_t),
@@ -1984,7 +1984,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = {
.set = compat_vfp_set
},
[REGSET_TLS] = {
- .core_note_type = NT_ARM_TLS,
+ USER_REGSET_NOTE_TYPE(ARM_TLS),
.n = 1,
.size = sizeof(compat_ulong_t),
.align = sizeof(compat_ulong_t),
@@ -1993,7 +1993,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = {
},
#ifdef CONFIG_HAVE_HW_BREAKPOINT
[REGSET_HW_BREAK] = {
- .core_note_type = NT_ARM_HW_BREAK,
+ USER_REGSET_NOTE_TYPE(ARM_HW_BREAK),
.n = sizeof(struct user_hwdebug_state) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -2001,7 +2001,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = {
.set = hw_break_set,
},
[REGSET_HW_WATCH] = {
- .core_note_type = NT_ARM_HW_WATCH,
+ USER_REGSET_NOTE_TYPE(ARM_HW_WATCH),
.n = sizeof(struct user_hwdebug_state) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -2010,7 +2010,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = {
},
#endif
[REGSET_SYSTEM_CALL] = {
- .core_note_type = NT_ARM_SYSTEM_CALL,
+ USER_REGSET_NOTE_TYPE(ARM_SYSTEM_CALL),
.n = 1,
.size = sizeof(int),
.align = sizeof(int),
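Every .core_note_type initializer above becomes USER_REGSET_NOTE_TYPE(); on the assumption that the macro fills in both the NT_* type and its matching core-dump note name, the expansion would look roughly like:

/* Plausible expansion, shown for illustration only: */
#define USER_REGSET_NOTE_TYPE(type)		\
	.core_note_type = NT_ ## type,		\
	.core_note_name = NN_ ## type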
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 3b3f6b56e733..21a795303568 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -1143,7 +1143,7 @@ static inline unsigned int num_other_online_cpus(void)
void smp_send_stop(void)
{
static unsigned long stop_in_progress;
- cpumask_t mask;
+ static cpumask_t mask;
unsigned long timeout;
/*
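Making the mask static avoids a large on-stack object on a panic path: with NR_CPUS = 4096, sizeof(cpumask_t) is 4096 / 8 = 512 bytes. A single static copy is safe here because, as the stop_in_progress bit in the context above suggests, smp_send_stop() is already serialized against concurrent callers.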
diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
index 5e27e46aa496..7dec05dd33b7 100644
--- a/arch/arm64/kernel/vdso/Makefile
+++ b/arch/arm64/kernel/vdso/Makefile
@@ -36,7 +36,8 @@ ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
# -Wmissing-prototypes and -Wmissing-declarations are removed from
# the CFLAGS to make possible to build the kernel with CONFIG_WERROR enabled.
CC_FLAGS_REMOVE_VDSO := $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) \
- $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) \
+ $(RANDSTRUCT_CFLAGS) $(KSTACK_ERASE_CFLAGS) \
+ $(GCC_PLUGINS_CFLAGS) \
$(CC_FLAGS_LTO) $(CC_FLAGS_CFI) \
-Wmissing-prototypes -Wmissing-declarations
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index de2b4e9c9f9f..23dd3f3fc3eb 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -825,10 +825,6 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
if (!kvm_arm_vcpu_is_finalized(vcpu))
return -EPERM;
- ret = kvm_arch_vcpu_run_map_fp(vcpu);
- if (ret)
- return ret;
-
if (likely(vcpu_has_run_once(vcpu)))
return 0;
@@ -2129,7 +2125,7 @@ static void cpu_hyp_init(void *discard)
static void cpu_hyp_uninit(void *discard)
{
- if (__this_cpu_read(kvm_hyp_initialized)) {
+ if (!is_protected_kvm_enabled() && __this_cpu_read(kvm_hyp_initialized)) {
cpu_hyp_reset();
__this_cpu_write(kvm_hyp_initialized, 0);
}
@@ -2345,8 +2341,13 @@ static void __init teardown_hyp_mode(void)
free_hyp_pgds();
for_each_possible_cpu(cpu) {
+ if (per_cpu(kvm_hyp_initialized, cpu))
+ continue;
+
free_pages(per_cpu(kvm_arm_hyp_stack_base, cpu), NVHE_STACK_SHIFT - PAGE_SHIFT);
- free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order());
+
+ if (!kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu])
+ continue;
if (free_sve) {
struct cpu_sve_state *sve_state;
@@ -2354,6 +2355,9 @@ static void __init teardown_hyp_mode(void)
sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state;
free_pages((unsigned long) sve_state, pkvm_host_sve_state_order());
}
+
+ free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order());
+
}
}
@@ -2764,7 +2768,8 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
struct kvm_kernel_irq_routing_entry *new)
{
- if (new->type != KVM_IRQ_ROUTING_MSI)
+ if (old->type != KVM_IRQ_ROUTING_MSI ||
+ new->type != KVM_IRQ_ROUTING_MSI)
return true;
return memcmp(&old->msi, &new->msi, sizeof(new->msi));
diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c
index 8f6c8f57c6b9..15e17aca1dec 100644
--- a/arch/arm64/kvm/fpsimd.c
+++ b/arch/arm64/kvm/fpsimd.c
@@ -15,32 +15,6 @@
#include <asm/sysreg.h>
/*
- * Called on entry to KVM_RUN unless this vcpu previously ran at least
- * once and the most recent prior KVM_RUN for this vcpu was called from
- * the same task as current (highly likely).
- *
- * This is guaranteed to execute before kvm_arch_vcpu_load_fp(vcpu),
- * such that on entering hyp the relevant parts of current are already
- * mapped.
- */
-int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu)
-{
- struct user_fpsimd_state *fpsimd = &current->thread.uw.fpsimd_state;
- int ret;
-
- /* pKVM has its own tracking of the host fpsimd state. */
- if (is_protected_kvm_enabled())
- return 0;
-
- /* Make sure the host task fpsimd state is visible to hyp: */
- ret = kvm_share_hyp(fpsimd, fpsimd + 1);
- if (ret)
- return ret;
-
- return 0;
-}
-
-/*
* Prepare vcpu for saving the host's FPSIMD state and loading the guest's.
* The actual loading is done by the FPSIMD access trap taken to hyp.
*
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 76dfda116e56..2ad57b117385 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -65,6 +65,136 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu)
}
}
+static inline void __activate_cptr_traps_nvhe(struct kvm_vcpu *vcpu)
+{
+ u64 val = CPTR_NVHE_EL2_RES1 | CPTR_EL2_TAM | CPTR_EL2_TTA;
+
+ /*
+ * Always trap SME since it's not supported in KVM.
+ * TSM is RES1 if SME isn't implemented.
+ */
+ val |= CPTR_EL2_TSM;
+
+ if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs())
+ val |= CPTR_EL2_TZ;
+
+ if (!guest_owns_fp_regs())
+ val |= CPTR_EL2_TFP;
+
+ write_sysreg(val, cptr_el2);
+}
+
+static inline void __activate_cptr_traps_vhe(struct kvm_vcpu *vcpu)
+{
+ /*
+ * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to
+ * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2,
+ * except for some missing controls, such as TAM.
+ * In this case, CPTR_EL2.TAM has the same position with or without
+ * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM
+ * shift value for trapping the AMU accesses.
+ */
+ u64 val = CPTR_EL2_TAM | CPACR_EL1_TTA;
+ u64 cptr;
+
+ if (guest_owns_fp_regs()) {
+ val |= CPACR_EL1_FPEN;
+ if (vcpu_has_sve(vcpu))
+ val |= CPACR_EL1_ZEN;
+ }
+
+ if (!vcpu_has_nv(vcpu))
+ goto write;
+
+ /*
+ * The architecture is a bit crap (what a surprise): an EL2 guest
+ * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA,
+ * as they are RES0 in the guest's view. To work around it, trap the
+ * sucker using the very same bit it can't set...
+ */
+ if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu))
+ val |= CPTR_EL2_TCPAC;
+
+ /*
+ * Layer the guest hypervisor's trap configuration on top of our own if
+ * we're in a nested context.
+ */
+ if (is_hyp_ctxt(vcpu))
+ goto write;
+
+ cptr = vcpu_sanitised_cptr_el2(vcpu);
+
+ /*
+ * Pay attention, there's some interesting detail here.
+ *
+ * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two
+ * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest):
+ *
+ * - CPTR_EL2.xEN = x0, traps are enabled
+ * - CPTR_EL2.xEN = x1, traps are disabled
+ *
+ * In other words, bit[0] determines if guest accesses trap or not. In
+ * the interest of simplicity, clear the entire field if the guest
+ * hypervisor has traps enabled to dispel any illusion of something more
+ * complicated taking place.
+ */
+ if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0)))
+ val &= ~CPACR_EL1_FPEN;
+ if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0)))
+ val &= ~CPACR_EL1_ZEN;
+
+ if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP))
+ val |= cptr & CPACR_EL1_E0POE;
+
+ val |= cptr & CPTR_EL2_TCPAC;
+
+write:
+ write_sysreg(val, cpacr_el1);
+}
+
+static inline void __activate_cptr_traps(struct kvm_vcpu *vcpu)
+{
+ if (!guest_owns_fp_regs())
+ __activate_traps_fpsimd32(vcpu);
+
+ if (has_vhe() || has_hvhe())
+ __activate_cptr_traps_vhe(vcpu);
+ else
+ __activate_cptr_traps_nvhe(vcpu);
+}
+
+static inline void __deactivate_cptr_traps_nvhe(struct kvm_vcpu *vcpu)
+{
+ u64 val = CPTR_NVHE_EL2_RES1;
+
+ if (!cpus_have_final_cap(ARM64_SVE))
+ val |= CPTR_EL2_TZ;
+ if (!cpus_have_final_cap(ARM64_SME))
+ val |= CPTR_EL2_TSM;
+
+ write_sysreg(val, cptr_el2);
+}
+
+static inline void __deactivate_cptr_traps_vhe(struct kvm_vcpu *vcpu)
+{
+ u64 val = CPACR_EL1_FPEN;
+
+ if (cpus_have_final_cap(ARM64_SVE))
+ val |= CPACR_EL1_ZEN;
+ if (cpus_have_final_cap(ARM64_SME))
+ val |= CPACR_EL1_SMEN;
+
+ write_sysreg(val, cpacr_el1);
+}
+
+static inline void __deactivate_cptr_traps(struct kvm_vcpu *vcpu)
+{
+ if (has_vhe() || has_hvhe())
+ __deactivate_cptr_traps_vhe(vcpu);
+ else
+ __deactivate_cptr_traps_nvhe(vcpu);
+}
+
#define reg_to_fgt_masks(reg) \
({ \
struct fgt_masks *m; \
@@ -486,11 +616,6 @@ static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu)
*/
if (system_supports_sve()) {
__hyp_sve_save_host();
-
- /* Re-enable SVE traps if not supported for the guest vcpu. */
- if (!vcpu_has_sve(vcpu))
- cpacr_clear_set(CPACR_EL1_ZEN, 0);
-
} else {
__fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs));
}
@@ -541,10 +666,7 @@ static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
/* Valid trap. Switch the context: */
/* First disable enough traps to allow us to update the registers */
- if (sve_guest || (is_protected_kvm_enabled() && system_supports_sve()))
- cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN);
- else
- cpacr_clear_set(0, CPACR_EL1_FPEN);
+ __deactivate_cptr_traps(vcpu);
isb();
/* Write out the host state if it's in the registers */
@@ -566,6 +688,13 @@ static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
*host_data_ptr(fp_owner) = FP_STATE_GUEST_OWNED;
+ /*
+ * Re-enable traps necessary for the current state of the guest, e.g.
+ * those enabled by a guest hypervisor. The ERET to the guest will
+ * provide the necessary context synchronization.
+ */
+ __activate_cptr_traps(vcpu);
+
return true;
}
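The xEN comment above boils down to a single-bit test; as a standalone predicate:

#include <stdbool.h>
#include <stdint.h>

/* CPTR_EL2.xEN decoding per the comment above (HCR_EL2.TGE = 0). */
static bool xen_traps_guest(uint64_t xen)	/* 2-bit field value */
{
	return !(xen & 1);	/* x0 -> traps enabled, x1 -> disabled */
}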
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index a76522d63c3e..0b0a68b663d4 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -12,7 +12,7 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS
ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS -D__DISABLE_TRACE_MMIO__
ccflags-y += -fno-stack-protector \
-DDISABLE_BRANCH_PROFILING \
- $(DISABLE_STACKLEAK_PLUGIN)
+ $(DISABLE_KSTACK_ERASE)
hostprogs := gen-hyprel
HOST_EXTRACFLAGS += -I$(objtree)/include
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index e9198e56e784..3206b2c07f82 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -69,7 +69,10 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu)
if (!guest_owns_fp_regs())
return;
- cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN);
+ /*
+ * Traps have been disabled by __deactivate_cptr_traps(), but there
+ * hasn't necessarily been a context synchronization event yet.
+ */
isb();
if (vcpu_has_sve(vcpu))
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 95d7534c9679..8957734d6183 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -479,6 +479,7 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
{
struct kvm_mem_range cur;
kvm_pte_t pte;
+ u64 granule;
s8 level;
int ret;
@@ -496,18 +497,21 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
return -EPERM;
}
- do {
- u64 granule = kvm_granule_size(level);
+ for (; level <= KVM_PGTABLE_LAST_LEVEL; level++) {
+ if (!kvm_level_supports_block_mapping(level))
+ continue;
+ granule = kvm_granule_size(level);
cur.start = ALIGN_DOWN(addr, granule);
cur.end = cur.start + granule;
- level++;
- } while ((level <= KVM_PGTABLE_LAST_LEVEL) &&
- !(kvm_level_supports_block_mapping(level) &&
- range_included(&cur, range)));
+ if (!range_included(&cur, range))
+ continue;
+ *range = cur;
+ return 0;
+ }
- *range = cur;
+ WARN_ON(1);

- return 0;
+ return -EINVAL;
}
int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
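The rewritten walk is easier to see stripped of page-table plumbing: visit each level from the faulting one downwards, skip levels that cannot hold a block mapping, and take the first granule-aligned block fully contained in the requested range. A standalone model assuming a 4K granule; the helper behaviour is an assumption, not the kernel's exact logic:

#include <stdbool.h>
#include <stdint.h>

#define LAST_LEVEL 3

static bool level_supports_block(int level) { return level >= 1; }	/* assumed */
static uint64_t granule_size(int level) { return 1ULL << (39 - 9 * level); }

static int adjust_range(uint64_t addr, int level, uint64_t *start, uint64_t *end)
{
	for (; level <= LAST_LEVEL; level++) {
		uint64_t g, s;

		if (!level_supports_block(level))
			continue;
		g = granule_size(level);
		s = addr & ~(g - 1);
		if (s >= *start && s + g <= *end) {
			*start = s;
			*end = s + g;
			return 0;
		}
	}
	return -1;	/* mirrors the new WARN_ON(1): should be unreachable */
}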
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 73affe1333a4..0e752b515d0f 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -47,65 +47,6 @@ struct fgt_masks hdfgwtr2_masks;
extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc);
-static void __activate_cptr_traps(struct kvm_vcpu *vcpu)
-{
- u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */
-
- if (!guest_owns_fp_regs())
- __activate_traps_fpsimd32(vcpu);
-
- if (has_hvhe()) {
- val |= CPACR_EL1_TTA;
-
- if (guest_owns_fp_regs()) {
- val |= CPACR_EL1_FPEN;
- if (vcpu_has_sve(vcpu))
- val |= CPACR_EL1_ZEN;
- }
-
- write_sysreg(val, cpacr_el1);
- } else {
- val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1;
-
- /*
- * Always trap SME since it's not supported in KVM.
- * TSM is RES1 if SME isn't implemented.
- */
- val |= CPTR_EL2_TSM;
-
- if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs())
- val |= CPTR_EL2_TZ;
-
- if (!guest_owns_fp_regs())
- val |= CPTR_EL2_TFP;
-
- write_sysreg(val, cptr_el2);
- }
-}
-
-static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu)
-{
- if (has_hvhe()) {
- u64 val = CPACR_EL1_FPEN;
-
- if (cpus_have_final_cap(ARM64_SVE))
- val |= CPACR_EL1_ZEN;
- if (cpus_have_final_cap(ARM64_SME))
- val |= CPACR_EL1_SMEN;
-
- write_sysreg(val, cpacr_el1);
- } else {
- u64 val = CPTR_NVHE_EL2_RES1;
-
- if (!cpus_have_final_cap(ARM64_SVE))
- val |= CPTR_EL2_TZ;
- if (!cpus_have_final_cap(ARM64_SME))
- val |= CPTR_EL2_TSM;
-
- write_sysreg(val, cptr_el2);
- }
-}
-
static void __activate_traps(struct kvm_vcpu *vcpu)
{
___activate_traps(vcpu, vcpu->arch.hcr_el2);
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index 09df2b42bc1b..477f1580ffea 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -90,87 +90,6 @@ static u64 __compute_hcr(struct kvm_vcpu *vcpu)
return hcr | (guest_hcr & ~NV_HCR_GUEST_EXCLUDE);
}
-static void __activate_cptr_traps(struct kvm_vcpu *vcpu)
-{
- u64 cptr;
-
- /*
- * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to
- * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2,
- * except for some missing controls, such as TAM.
- * In this case, CPTR_EL2.TAM has the same position with or without
- * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM
- * shift value for trapping the AMU accesses.
- */
- u64 val = CPACR_EL1_TTA | CPTR_EL2_TAM;
-
- if (guest_owns_fp_regs()) {
- val |= CPACR_EL1_FPEN;
- if (vcpu_has_sve(vcpu))
- val |= CPACR_EL1_ZEN;
- } else {
- __activate_traps_fpsimd32(vcpu);
- }
-
- if (!vcpu_has_nv(vcpu))
- goto write;
-
- /*
- * The architecture is a bit crap (what a surprise): an EL2 guest
- * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA,
- * as they are RES0 in the guest's view. To work around it, trap the
- * sucker using the very same bit it can't set...
- */
- if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu))
- val |= CPTR_EL2_TCPAC;
-
- /*
- * Layer the guest hypervisor's trap configuration on top of our own if
- * we're in a nested context.
- */
- if (is_hyp_ctxt(vcpu))
- goto write;
-
- cptr = vcpu_sanitised_cptr_el2(vcpu);
-
- /*
- * Pay attention, there's some interesting detail here.
- *
- * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two
- * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest):
- *
- * - CPTR_EL2.xEN = x0, traps are enabled
- * - CPTR_EL2.xEN = x1, traps are disabled
- *
- * In other words, bit[0] determines if guest accesses trap or not. In
- * the interest of simplicity, clear the entire field if the guest
- * hypervisor has traps enabled to dispel any illusion of something more
- * complicated taking place.
- */
- if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0)))
- val &= ~CPACR_EL1_FPEN;
- if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0)))
- val &= ~CPACR_EL1_ZEN;
-
- if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP))
- val |= cptr & CPACR_EL1_E0POE;
-
- val |= cptr & CPTR_EL2_TCPAC;
-
-write:
- write_sysreg(val, cpacr_el1);
-}
-
-static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu)
-{
- u64 val = CPACR_EL1_FPEN | CPACR_EL1_ZEN_EL1EN;
-
- if (cpus_have_final_cap(ARM64_SME))
- val |= CPACR_EL1_SMEN_EL1EN;
-
- write_sysreg(val, cpacr_el1);
-}
-
static void __activate_traps(struct kvm_vcpu *vcpu)
{
u64 val;
@@ -639,10 +558,10 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
host_ctxt = host_data_ptr(host_ctxt);
guest_ctxt = &vcpu->arch.ctxt;
- sysreg_save_host_state_vhe(host_ctxt);
-
fpsimd_lazy_switch_to_guest(vcpu);
+ sysreg_save_host_state_vhe(host_ctxt);
+
/*
* Note that ARM erratum 1165522 requires us to configure both stage 1
* and stage 2 translation for the guest context before we clear
@@ -667,15 +586,23 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
__deactivate_traps(vcpu);
- fpsimd_lazy_switch_to_host(vcpu);
-
sysreg_restore_host_state_vhe(host_ctxt);
+ __debug_switch_to_host(vcpu);
+
+ /*
+ * Ensure that all system register writes above have taken effect
+ * before returning to the host. In VHE mode, CPTR traps for
+ * FPSIMD/SVE/SME also apply to EL2, so FPSIMD/SVE/SME state must be
+ * manipulated after the ISB.
+ */
+ isb();
+
+ fpsimd_lazy_switch_to_host(vcpu);
+
if (guest_owns_fp_regs())
__fpsimd_save_fpexc32(vcpu);
- __debug_switch_to_host(vcpu);
-
return exit_code;
}
NOKPROBE_SYMBOL(__kvm_vcpu_run_vhe);
@@ -705,12 +632,6 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
*/
local_daif_restore(DAIF_PROCCTX_NOIRQ);
- /*
- * When we exit from the guest we change a number of CPU configuration
- * parameters, such as traps. We rely on the isb() in kvm_call_hyp*()
- * to make sure these changes take effect before running the host or
- * additional guests.
- */
return ret;
}
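The reordering above hinges on the new isb(): trap configuration written via sysreg instructions only takes effect after a context synchronization event, and under VHE the FPSIMD/SVE/SME traps being torn down also apply to EL2 itself, so the lazy FP switch must run after that point. As a loose analogue in C11, with a fence standing in for the ISB (which is a context-synchronization event, not a memory barrier):

#include <stdatomic.h>

static _Atomic unsigned long trap_cfg;
static unsigned long fp_state;

static void return_to_host(unsigned long host_cfg)
{
	atomic_store_explicit(&trap_cfg, host_cfg, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* plays the ISB's role */
	fp_state = 0;	/* only safe once the new configuration is in effect */
}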
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 5b191f4dc566..dc1d26559bfa 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -1402,6 +1402,21 @@ static void kvm_map_l1_vncr(struct kvm_vcpu *vcpu)
}
}
+#define has_tgran_2(__r, __sz) \
+ ({ \
+ u64 _s1, _s2, _mmfr0 = __r; \
+ \
+ _s2 = SYS_FIELD_GET(ID_AA64MMFR0_EL1, \
+ TGRAN##__sz##_2, _mmfr0); \
+ \
+ _s1 = SYS_FIELD_GET(ID_AA64MMFR0_EL1, \
+ TGRAN##__sz, _mmfr0); \
+ \
+ ((_s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_NI && \
+ _s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz) || \
+ (_s2 == ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz && \
+ _s1 != ID_AA64MMFR0_EL1_TGRAN##__sz##_NI)); \
+ })
/*
* Our emulated CPU doesn't support all the possible features. For the
* sake of simplicity (and probably mental sanity), wipe out a number
@@ -1411,6 +1426,8 @@ static void kvm_map_l1_vncr(struct kvm_vcpu *vcpu)
*/
u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
{
+ u64 orig_val = val;
+
switch (reg) {
case SYS_ID_AA64ISAR0_EL1:
/* Support everything but TME */
@@ -1480,13 +1497,16 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
*/
switch (PAGE_SIZE) {
case SZ_4K:
- val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, IMP);
+ if (has_tgran_2(orig_val, 4))
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, IMP);
fallthrough;
case SZ_16K:
- val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, IMP);
+ if (has_tgran_2(orig_val, 16))
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, IMP);
fallthrough;
case SZ_64K:
- val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN64_2, IMP);
+ if (has_tgran_2(orig_val, 64))
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN64_2, IMP);
break;
}
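The has_tgran_2() macro encodes the architectural rule for the ID_AA64MMFR0_EL1.TGRANx_2 fields: a value of 0 defers to the corresponding stage-1 TGRANx field, 1 means not implemented at stage 2, and anything higher means implemented. A standalone model; the per-granule stage-1 "not implemented" value is passed in, since it differs between the 4K/64K and 16K fields:

#include <stdbool.h>

enum { TGRAN2_AS_STAGE1 = 0, TGRAN2_NI = 1 };

static bool has_stage2_granule(unsigned s2, unsigned s1, unsigned s1_ni)
{
	if (s2 != TGRAN2_NI && s2 != TGRAN2_AS_STAGE1)
		return true;				/* implemented explicitly */
	return s2 == TGRAN2_AS_STAGE1 && s1 != s1_ni;	/* defer to stage 1 */
}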
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 76c2f0da821f..c20bd6f21e60 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -2624,7 +2624,7 @@ static bool access_mdcr(struct kvm_vcpu *vcpu,
*/
if (hpmn > vcpu->kvm->arch.nr_pmu_counters) {
hpmn = vcpu->kvm->arch.nr_pmu_counters;
- u64_replace_bits(val, hpmn, MDCR_EL2_HPMN);
+ u64p_replace_bits(&val, hpmn, MDCR_EL2_HPMN);
}
__vcpu_assign_sys_reg(vcpu, MDCR_EL2, val);
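The one-character fix matters because, in linux/bitfield.h, u64_replace_bits() returns the updated value while u64p_replace_bits() writes through a pointer; used as a bare statement, the former silently discards the result. Simplified stand-ins make the difference obvious:

#include <stdint.h>

#define FIELD_MASK 0x1fULL	/* stand-in for MDCR_EL2_HPMN */

static uint64_t u64_rep(uint64_t v, uint64_t f)
{
	return (v & ~FIELD_MASK) | (f & FIELD_MASK);
}

static void u64p_rep(uint64_t *v, uint64_t f)
{
	*v = u64_rep(*v, f);
}

static void demo(uint64_t *val, uint64_t hpmn)
{
	u64_rep(*val, hpmn);	/* bug: return value dropped, *val unchanged */
	u64p_rep(val, hpmn);	/* fix: *val updated in place */
}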
diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c
index d22a8ad7bcc5..679aafe77de2 100644
--- a/arch/arm64/kvm/vgic/vgic-v3-nested.c
+++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c
@@ -36,6 +36,11 @@ struct shadow_if {
static DEFINE_PER_CPU(struct shadow_if, shadow_if);
+static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx)
+{
+ return hweight16(shadow_if->lr_map & (BIT(idx) - 1));
+}
+
/*
* Nesting GICv3 support
*
@@ -209,6 +214,29 @@ u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu)
return reg;
}
+static u64 translate_lr_pintid(struct kvm_vcpu *vcpu, u64 lr)
+{
+ struct vgic_irq *irq;
+
+ if (!(lr & ICH_LR_HW))
+ return lr;
+
+ /* We have the HW bit set, check for validity of pINTID */
+ irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
+ /* If there was no real mapping, nuke the HW bit */
+ if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI)
+ lr &= ~ICH_LR_HW;
+
+ /* Translate the virtual mapping to the real one, even if invalid */
+ if (irq) {
+ lr &= ~ICH_LR_PHYS_ID_MASK;
+ lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid);
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ return lr;
+}
+
/*
* For LRs which have HW bit set such as timer interrupts, we modify them to
* have the host hardware interrupt number instead of the virtual one programmed
@@ -217,58 +245,37 @@ u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu)
static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu,
struct vgic_v3_cpu_if *s_cpu_if)
{
- unsigned long lr_map = 0;
- int index = 0;
+ struct shadow_if *shadow_if;
+
+ shadow_if = container_of(s_cpu_if, struct shadow_if, cpuif);
+ shadow_if->lr_map = 0;
for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
- struct vgic_irq *irq;
if (!(lr & ICH_LR_STATE))
- lr = 0;
-
- if (!(lr & ICH_LR_HW))
- goto next;
-
- /* We have the HW bit set, check for validity of pINTID */
- irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
- if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI ) {
- /* There was no real mapping, so nuke the HW bit */
- lr &= ~ICH_LR_HW;
- if (irq)
- vgic_put_irq(vcpu->kvm, irq);
- goto next;
- }
-
- /* Translate the virtual mapping to the real one */
- lr &= ~ICH_LR_PHYS_ID_MASK;
- lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid);
+ continue;
- vgic_put_irq(vcpu->kvm, irq);
+ lr = translate_lr_pintid(vcpu, lr);
-next:
- s_cpu_if->vgic_lr[index] = lr;
- if (lr) {
- lr_map |= BIT(i);
- index++;
- }
+ s_cpu_if->vgic_lr[hweight16(shadow_if->lr_map)] = lr;
+ shadow_if->lr_map |= BIT(i);
}
- container_of(s_cpu_if, struct shadow_if, cpuif)->lr_map = lr_map;
- s_cpu_if->used_lrs = index;
+ s_cpu_if->used_lrs = hweight16(shadow_if->lr_map);
}
void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
{
struct shadow_if *shadow_if = get_shadow_if();
- int i, index = 0;
+ int i;
for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
struct vgic_irq *irq;
if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE))
- goto next;
+ continue;
/*
* If we had a HW lr programmed by the guest hypervisor, we
@@ -277,15 +284,13 @@ void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
*/
irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */
- goto next;
+ continue;
- lr = __gic_v3_get_lr(index);
+ lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i));
if (!(lr & ICH_LR_STATE))
irq->active = false;
vgic_put_irq(vcpu->kvm, irq);
- next:
- index++;
}
}
@@ -368,13 +373,11 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu)
val = __vcpu_sys_reg(vcpu, ICH_LRN(i));
val &= ~ICH_LR_STATE;
- val |= s_cpu_if->vgic_lr[i] & ICH_LR_STATE;
+ val |= s_cpu_if->vgic_lr[lr_map_idx_to_shadow_idx(shadow_if, i)] & ICH_LR_STATE;
__vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val);
- s_cpu_if->vgic_lr[i] = 0;
}
- shadow_if->lr_map = 0;
vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0;
}
@@ -398,9 +401,7 @@ void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu)
{
bool level;
- level = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_En;
- if (level)
- level &= vgic_v3_get_misr(vcpu);
+ level = (__vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_En) && vgic_v3_get_misr(vcpu);
kvm_vgic_inject_irq(vcpu->kvm, vcpu,
vcpu->kvm->arch.vgic.mi_intid, level, vcpu);
}
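lr_map_idx_to_shadow_idx() works because the shadow LR array is simply the dense packing of the set bits in lr_map, so the shadow slot for guest LR i is the count of set bits below bit i:

#include <stdint.h>

static int shadow_index(uint16_t lr_map, int i)
{
	/* Number of in-use LRs below bit i == this LR's packed position. */
	return __builtin_popcount(lr_map & ((1u << i) - 1));
}

/* e.g. lr_map = 0xa4 (bits 2, 5, 7): guest LRs 2, 5, 7 -> shadow 0, 1, 2. */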
diff --git a/arch/arm64/lib/.gitignore b/arch/arm64/lib/.gitignore
new file mode 100644
index 000000000000..647d7a922e68
--- /dev/null
+++ b/arch/arm64/lib/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# This now-removed directory used to contain generated files.
+/crypto/
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 027bfa9689c6..633e5223d944 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,7 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
-
-obj-y += crypto/
-
lib-y := clear_user.o delay.o copy_from_user.o \
copy_to_user.o copy_page.o \
clear_page.o csum.o insn.o memchr.o memcpy.o \
@@ -16,12 +13,6 @@ endif
lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o
-obj-$(CONFIG_CRC32_ARCH) += crc32-arm64.o
-crc32-arm64-y := crc32.o crc32-core.o
-
-obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-arm64.o
-crc-t10dif-arm64-y := crc-t10dif.o crc-t10dif-core.o
-
obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
obj-$(CONFIG_ARM64_MTE) += mte.o
diff --git a/arch/arm64/lib/crc-t10dif-core.S b/arch/arm64/lib/crc-t10dif-core.S
deleted file mode 100644
index 87dd6d46224d..000000000000
--- a/arch/arm64/lib/crc-t10dif-core.S
+++ /dev/null
@@ -1,469 +0,0 @@
-//
-// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
-//
-// Copyright (C) 2016 Linaro Ltd
-// Copyright (C) 2019-2024 Google LLC
-//
-// Authors: Ard Biesheuvel <ardb@google.com>
-// Eric Biggers <ebiggers@google.com>
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License version 2 as
-// published by the Free Software Foundation.
-//
-
-// Derived from the x86 version:
-//
-// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
-//
-// Copyright (c) 2013, Intel Corporation
-//
-// Authors:
-// Erdinc Ozturk <erdinc.ozturk@intel.com>
-// Vinodh Gopal <vinodh.gopal@intel.com>
-// James Guilford <james.guilford@intel.com>
-// Tim Chen <tim.c.chen@linux.intel.com>
-//
-// This software is available to you under a choice of one of two
-// licenses. You may choose to be licensed under the terms of the GNU
-// General Public License (GPL) Version 2, available from the file
-// COPYING in the main directory of this source tree, or the
-// OpenIB.org BSD license below:
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the
-// distribution.
-//
-// * Neither the name of the Intel Corporation nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-//
-// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Reference paper titled "Fast CRC Computation for Generic
-// Polynomials Using PCLMULQDQ Instruction"
-// URL: http://www.intel.com/content/dam/www/public/us/en/documents
-// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
-//
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .text
- .arch armv8-a+crypto
-
- init_crc .req w0
- buf .req x1
- len .req x2
- fold_consts_ptr .req x5
-
- fold_consts .req v10
-
- t3 .req v17
- t4 .req v18
- t5 .req v19
- t6 .req v20
- t7 .req v21
- t8 .req v22
-
- perm .req v27
-
- .macro pmull16x64_p64, a16, b64, c64
- pmull2 \c64\().1q, \a16\().2d, \b64\().2d
- pmull \b64\().1q, \a16\().1d, \b64\().1d
- .endm
-
- /*
- * Pairwise long polynomial multiplication of two 16-bit values
- *
- * { w0, w1 }, { y0, y1 }
- *
- * by two 64-bit values
- *
- * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
- *
- * where each vector element is a byte, ordered from least to most
- * significant.
- *
- * This can be implemented using 8x8 long polynomial multiplication, by
- * reorganizing the input so that each pairwise 8x8 multiplication
- * produces one of the terms from the decomposition below, and
- * combining the results of each rank and shifting them into place.
- *
- * Rank
- * 0 w0*x0 ^ | y0*z0 ^
- * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^
- * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^
- * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^
- * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^
- * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^
- * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^
- * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^
- * 8 w1*x7 << 64 | y1*z7 << 64
- *
- * The inputs can be reorganized into
- *
- * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
- * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
- *
- * and after performing 8x8->16 bit long polynomial multiplication of
- * each of the halves of the first vector with those of the second one,
- * we obtain the following four vectors of 16-bit elements:
- *
- * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
- * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
- * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
- * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
- *
- * Results b and c can be XORed together, as the vector elements have
- * matching ranks. Then, the final XOR (*) can be pulled forward, and
- * applied between the halves of each of the remaining three vectors,
- * which are then shifted into place, and combined to produce two
- * 80-bit results.
- *
- * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent
- * to the 64x64 bit one above, but XOR'ing the outputs together will
- * produce the expected result, and this is sufficient in the context of
- * this algorithm.
- */
- .macro pmull16x64_p8, a16, b64, c64
- ext t7.16b, \b64\().16b, \b64\().16b, #1
- tbl t5.16b, {\a16\().16b}, perm.16b
- uzp1 t7.16b, \b64\().16b, t7.16b
- bl __pmull_p8_16x64
- ext \b64\().16b, t4.16b, t4.16b, #15
- eor \c64\().16b, t8.16b, t5.16b
- .endm
-
-SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
- ext t6.16b, t5.16b, t5.16b, #8
-
- pmull t3.8h, t7.8b, t5.8b
- pmull t4.8h, t7.8b, t6.8b
- pmull2 t5.8h, t7.16b, t5.16b
- pmull2 t6.8h, t7.16b, t6.16b
-
- ext t8.16b, t3.16b, t3.16b, #8
- eor t4.16b, t4.16b, t6.16b
- ext t7.16b, t5.16b, t5.16b, #8
- ext t6.16b, t4.16b, t4.16b, #8
- eor t8.8b, t8.8b, t3.8b
- eor t5.8b, t5.8b, t7.8b
- eor t4.8b, t4.8b, t6.8b
- ext t5.16b, t5.16b, t5.16b, #14
- ret
-SYM_FUNC_END(__pmull_p8_16x64)
-
-
- // Fold reg1, reg2 into the next 32 data bytes, storing the result back
- // into reg1, reg2.
- .macro fold_32_bytes, p, reg1, reg2
- ldp q11, q12, [buf], #0x20
-
- pmull16x64_\p fold_consts, \reg1, v8
-
-CPU_LE( rev64 v11.16b, v11.16b )
-CPU_LE( rev64 v12.16b, v12.16b )
-
- pmull16x64_\p fold_consts, \reg2, v9
-
-CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
-CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
-
- eor \reg1\().16b, \reg1\().16b, v8.16b
- eor \reg2\().16b, \reg2\().16b, v9.16b
- eor \reg1\().16b, \reg1\().16b, v11.16b
- eor \reg2\().16b, \reg2\().16b, v12.16b
- .endm
-
- // Fold src_reg into dst_reg, optionally loading the next fold constants
- .macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts
- pmull16x64_\p fold_consts, \src_reg, v8
- .ifnb \load_next_consts
- ld1 {fold_consts.2d}, [fold_consts_ptr], #16
- .endif
- eor \dst_reg\().16b, \dst_reg\().16b, v8.16b
- eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
- .endm
-
- .macro crc_t10dif_pmull, p
-
- // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
- cmp len, #256
- b.lt .Lless_than_256_bytes_\@
-
- adr_l fold_consts_ptr, .Lfold_across_128_bytes_consts
-
- // Load the first 128 data bytes. Byte swapping is necessary to make
- // the bit order match the polynomial coefficient order.
- ldp q0, q1, [buf]
- ldp q2, q3, [buf, #0x20]
- ldp q4, q5, [buf, #0x40]
- ldp q6, q7, [buf, #0x60]
- add buf, buf, #0x80
-CPU_LE( rev64 v0.16b, v0.16b )
-CPU_LE( rev64 v1.16b, v1.16b )
-CPU_LE( rev64 v2.16b, v2.16b )
-CPU_LE( rev64 v3.16b, v3.16b )
-CPU_LE( rev64 v4.16b, v4.16b )
-CPU_LE( rev64 v5.16b, v5.16b )
-CPU_LE( rev64 v6.16b, v6.16b )
-CPU_LE( rev64 v7.16b, v7.16b )
-CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
-CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
-CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 )
-CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 )
-CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 )
-CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 )
-CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 )
-CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
-
- // XOR the first 16 data *bits* with the initial CRC value.
- movi v8.16b, #0
- mov v8.h[7], init_crc
- eor v0.16b, v0.16b, v8.16b
-
- // Load the constants for folding across 128 bytes.
- ld1 {fold_consts.2d}, [fold_consts_ptr]
-
- // Subtract 128 for the 128 data bytes just consumed. Subtract another
- // 128 to simplify the termination condition of the following loop.
- sub len, len, #256
-
- // While >= 128 data bytes remain (not counting v0-v7), fold the 128
- // bytes v0-v7 into them, storing the result back into v0-v7.
-.Lfold_128_bytes_loop_\@:
- fold_32_bytes \p, v0, v1
- fold_32_bytes \p, v2, v3
- fold_32_bytes \p, v4, v5
- fold_32_bytes \p, v6, v7
-
- subs len, len, #128
- b.ge .Lfold_128_bytes_loop_\@
-
- // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
-
- // Fold across 64 bytes.
- add fold_consts_ptr, fold_consts_ptr, #16
- ld1 {fold_consts.2d}, [fold_consts_ptr], #16
- fold_16_bytes \p, v0, v4
- fold_16_bytes \p, v1, v5
- fold_16_bytes \p, v2, v6
- fold_16_bytes \p, v3, v7, 1
- // Fold across 32 bytes.
- fold_16_bytes \p, v4, v6
- fold_16_bytes \p, v5, v7, 1
- // Fold across 16 bytes.
- fold_16_bytes \p, v6, v7
-
- // Add 128 to get the correct number of data bytes remaining in 0...127
- // (not counting v7), following the previous extra subtraction by 128.
- // Then subtract 16 to simplify the termination condition of the
- // following loop.
- adds len, len, #(128-16)
-
- // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
- // into them, storing the result back into v7.
- b.lt .Lfold_16_bytes_loop_done_\@
-.Lfold_16_bytes_loop_\@:
- pmull16x64_\p fold_consts, v7, v8
- eor v7.16b, v7.16b, v8.16b
- ldr q0, [buf], #16
-CPU_LE( rev64 v0.16b, v0.16b )
-CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
- eor v7.16b, v7.16b, v0.16b
- subs len, len, #16
- b.ge .Lfold_16_bytes_loop_\@
-
-.Lfold_16_bytes_loop_done_\@:
- // Add 16 to get the correct number of data bytes remaining in 0...15
- // (not counting v7), following the previous extra subtraction by 16.
- adds len, len, #16
- b.eq .Lreduce_final_16_bytes_\@
-
-.Lhandle_partial_segment_\@:
- // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
- // 16 bytes are in v7 and the rest are the remaining data in 'buf'. To
- // do this without needing a fold constant for each possible 'len',
- // redivide the bytes into a first chunk of 'len' bytes and a second
- // chunk of 16 bytes, then fold the first chunk into the second.
-
- // v0 = last 16 original data bytes
- add buf, buf, len
- ldr q0, [buf, #-16]
-CPU_LE( rev64 v0.16b, v0.16b )
-CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
-
- // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
- adr_l x4, .Lbyteshift_table + 16
- sub x4, x4, len
- ld1 {v2.16b}, [x4]
- tbl v1.16b, {v7.16b}, v2.16b
-
- // v3 = first chunk: v7 right-shifted by '16-len' bytes.
- movi v3.16b, #0x80
- eor v2.16b, v2.16b, v3.16b
- tbl v3.16b, {v7.16b}, v2.16b
-
- // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
- sshr v2.16b, v2.16b, #7
-
- // v2 = second chunk: 'len' bytes from v0 (low-order bytes),
- // then '16-len' bytes from v1 (high-order bytes).
- bsl v2.16b, v1.16b, v0.16b
-
- // Fold the first chunk into the second chunk, storing the result in v7.
- pmull16x64_\p fold_consts, v3, v0
- eor v7.16b, v3.16b, v0.16b
- eor v7.16b, v7.16b, v2.16b
- b .Lreduce_final_16_bytes_\@
-
-.Lless_than_256_bytes_\@:
- // Checksumming a buffer of length 16...255 bytes
-
- adr_l fold_consts_ptr, .Lfold_across_16_bytes_consts
-
- // Load the first 16 data bytes.
- ldr q7, [buf], #0x10
-CPU_LE( rev64 v7.16b, v7.16b )
-CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
-
- // XOR the first 16 data *bits* with the initial CRC value.
- movi v0.16b, #0
- mov v0.h[7], init_crc
- eor v7.16b, v7.16b, v0.16b
-
- // Load the fold-across-16-bytes constants.
- ld1 {fold_consts.2d}, [fold_consts_ptr], #16
-
- cmp len, #16
- b.eq .Lreduce_final_16_bytes_\@ // len == 16
- subs len, len, #32
- b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255
- add len, len, #16
- b .Lhandle_partial_segment_\@ // 17 <= len <= 31
-
-.Lreduce_final_16_bytes_\@:
- .endm
-
-//
-// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
-//
-// Assumes len >= 16.
-//
-SYM_FUNC_START(crc_t10dif_pmull_p8)
- frame_push 1
-
- // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
- movi perm.4h, #8, lsl #8
- orr perm.2s, #1, lsl #16
- orr perm.2s, #1, lsl #24
- zip1 perm.16b, perm.16b, perm.16b
- zip1 perm.16b, perm.16b, perm.16b
-
- crc_t10dif_pmull p8
-
-CPU_LE( rev64 v7.16b, v7.16b )
-CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
- str q7, [x3]
-
- frame_pop
- ret
-SYM_FUNC_END(crc_t10dif_pmull_p8)
-
- .align 5
-//
-// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
-//
-// Assumes len >= 16.
-//
-SYM_FUNC_START(crc_t10dif_pmull_p64)
- crc_t10dif_pmull p64
-
- // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
-
- movi v2.16b, #0 // init zero register
-
- // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
- ld1 {fold_consts.2d}, [fold_consts_ptr], #16
-
- // Fold the high 64 bits into the low 64 bits, while also multiplying by
- // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
- // whose low 48 bits are 0.
- ext v0.16b, v2.16b, v7.16b, #8
- pmull2 v7.1q, v7.2d, fold_consts.2d // high bits * x^48 * (x^80 mod G(x))
- eor v0.16b, v0.16b, v7.16b // + low bits * x^64
-
- // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
- // value congruent to x^64 * M(x) and whose low 48 bits are 0.
- ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits
- mov v0.s[3], v2.s[0] // zero high 32 bits
- pmull v1.1q, v1.1d, fold_consts.1d // high 32 bits * x^48 * (x^48 mod G(x))
- eor v0.16b, v0.16b, v1.16b // + low bits
-
- // Load G(x) and floor(x^48 / G(x)).
- ld1 {fold_consts.2d}, [fold_consts_ptr]
-
- // Use Barrett reduction to compute the final CRC value.
- pmull2 v1.1q, v0.2d, fold_consts.2d // high 32 bits * floor(x^48 / G(x))
- ushr v1.2d, v1.2d, #32 // /= x^32
- pmull v1.1q, v1.1d, fold_consts.1d // *= G(x)
- ushr v0.2d, v0.2d, #48
- eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits
- // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
-
- umov w0, v0.h[0]
- ret
-SYM_FUNC_END(crc_t10dif_pmull_p64)
-
- .section ".rodata", "a"
- .align 4
-
-// Fold constants precomputed from the polynomial 0x18bb7
-// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
-.Lfold_across_128_bytes_consts:
- .quad 0x0000000000006123 // x^(8*128) mod G(x)
- .quad 0x0000000000002295 // x^(8*128+64) mod G(x)
-// .Lfold_across_64_bytes_consts:
- .quad 0x0000000000001069 // x^(4*128) mod G(x)
- .quad 0x000000000000dd31 // x^(4*128+64) mod G(x)
-// .Lfold_across_32_bytes_consts:
- .quad 0x000000000000857d // x^(2*128) mod G(x)
- .quad 0x0000000000007acc // x^(2*128+64) mod G(x)
-.Lfold_across_16_bytes_consts:
- .quad 0x000000000000a010 // x^(1*128) mod G(x)
- .quad 0x0000000000001faa // x^(1*128+64) mod G(x)
-// .Lfinal_fold_consts:
- .quad 0x1368000000000000 // x^48 * (x^48 mod G(x))
- .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x))
-// .Lbarrett_reduction_consts:
- .quad 0x0000000000018bb7 // G(x)
- .quad 0x00000001f65a57f8 // floor(x^48 / G(x))
-
-// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
-// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
-// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
-.Lbyteshift_table:
- .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
- .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
- .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
- .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
diff --git a/arch/arm64/lib/crc-t10dif.c b/arch/arm64/lib/crc-t10dif.c
deleted file mode 100644
index c2ffe4fdb59d..000000000000
--- a/arch/arm64/lib/crc-t10dif.c
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
- *
- * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/cpufeature.h>
-#include <linux/crc-t10dif.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <crypto/internal/simd.h>
-
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_asimd);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
-
-#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
-
-asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len,
- u8 out[16]);
-asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
-
-u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
-{
- if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) {
- if (static_branch_likely(&have_pmull)) {
- if (crypto_simd_usable()) {
- kernel_neon_begin();
- crc = crc_t10dif_pmull_p64(crc, data, length);
- kernel_neon_end();
- return crc;
- }
- } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE &&
- static_branch_likely(&have_asimd) &&
- crypto_simd_usable()) {
- u8 buf[16];
-
- kernel_neon_begin();
- crc_t10dif_pmull_p8(crc, data, length, buf);
- kernel_neon_end();
-
- return crc_t10dif_generic(0, buf, sizeof(buf));
- }
- }
- return crc_t10dif_generic(crc, data, length);
-}
-EXPORT_SYMBOL(crc_t10dif_arch);
-
-static int __init crc_t10dif_arm64_init(void)
-{
- if (cpu_have_named_feature(ASIMD)) {
- static_branch_enable(&have_asimd);
- if (cpu_have_named_feature(PMULL))
- static_branch_enable(&have_pmull);
- }
- return 0;
-}
-subsys_initcall(crc_t10dif_arm64_init);
-
-static void __exit crc_t10dif_arm64_exit(void)
-{
-}
-module_exit(crc_t10dif_arm64_exit);
-
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_DESCRIPTION("CRC-T10DIF using arm64 NEON and Crypto Extensions");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/arm64/lib/crc32-core.S b/arch/arm64/lib/crc32-core.S
deleted file mode 100644
index 68825317460f..000000000000
--- a/arch/arm64/lib/crc32-core.S
+++ /dev/null
@@ -1,362 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Accelerated CRC32(C) using AArch64 CRC and PMULL instructions
- *
- * Copyright (C) 2016 - 2018 Linaro Ltd.
- * Copyright (C) 2024 Google LLC
- *
- * Author: Ard Biesheuvel <ardb@kernel.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .cpu generic+crc+crypto
-
- .macro bitle, reg
- .endm
-
- .macro bitbe, reg
- rbit \reg, \reg
- .endm
-
- .macro bytele, reg
- .endm
-
- .macro bytebe, reg
- rbit \reg, \reg
- lsr \reg, \reg, #24
- .endm
-
- .macro hwordle, reg
-CPU_BE( rev16 \reg, \reg )
- .endm
-
- .macro hwordbe, reg
-CPU_LE( rev \reg, \reg )
- rbit \reg, \reg
-CPU_BE( lsr \reg, \reg, #16 )
- .endm
-
- .macro le, regs:vararg
- .irp r, \regs
-CPU_BE( rev \r, \r )
- .endr
- .endm
-
- .macro be, regs:vararg
- .irp r, \regs
-CPU_LE( rev \r, \r )
- .endr
- .irp r, \regs
- rbit \r, \r
- .endr
- .endm
-
- .macro __crc32, c, order=le
- bit\order w0
- cmp x2, #16
- b.lt 8f // less than 16 bytes
-
- and x7, x2, #0x1f
- and x2, x2, #~0x1f
- cbz x7, 32f // multiple of 32 bytes
-
- and x8, x7, #0xf
- ldp x3, x4, [x1]
- add x8, x8, x1
- add x1, x1, x7
- ldp x5, x6, [x8]
- \order x3, x4, x5, x6
-
- tst x7, #8
- crc32\c\()x w8, w0, x3
- csel x3, x3, x4, eq
- csel w0, w0, w8, eq
- tst x7, #4
- lsr x4, x3, #32
- crc32\c\()w w8, w0, w3
- csel x3, x3, x4, eq
- csel w0, w0, w8, eq
- tst x7, #2
- lsr w4, w3, #16
- crc32\c\()h w8, w0, w3
- csel w3, w3, w4, eq
- csel w0, w0, w8, eq
- tst x7, #1
- crc32\c\()b w8, w0, w3
- csel w0, w0, w8, eq
- tst x7, #16
- crc32\c\()x w8, w0, x5
- crc32\c\()x w8, w8, x6
- csel w0, w0, w8, eq
- cbz x2, 0f
-
-32: ldp x3, x4, [x1], #32
- sub x2, x2, #32
- ldp x5, x6, [x1, #-16]
- \order x3, x4, x5, x6
- crc32\c\()x w0, w0, x3
- crc32\c\()x w0, w0, x4
- crc32\c\()x w0, w0, x5
- crc32\c\()x w0, w0, x6
- cbnz x2, 32b
-0: bit\order w0
- ret
-
-8: tbz x2, #3, 4f
- ldr x3, [x1], #8
- \order x3
- crc32\c\()x w0, w0, x3
-4: tbz x2, #2, 2f
- ldr w3, [x1], #4
- \order w3
- crc32\c\()w w0, w0, w3
-2: tbz x2, #1, 1f
- ldrh w3, [x1], #2
- hword\order w3
- crc32\c\()h w0, w0, w3
-1: tbz x2, #0, 0f
- ldrb w3, [x1]
- byte\order w3
- crc32\c\()b w0, w0, w3
-0: bit\order w0
- ret
- .endm
-
- .align 5
-SYM_FUNC_START(crc32_le_arm64)
- __crc32
-SYM_FUNC_END(crc32_le_arm64)
-
- .align 5
-SYM_FUNC_START(crc32c_le_arm64)
- __crc32 c
-SYM_FUNC_END(crc32c_le_arm64)
-
- .align 5
-SYM_FUNC_START(crc32_be_arm64)
- __crc32 order=be
-SYM_FUNC_END(crc32_be_arm64)
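
The 8/4/2/1 tail handling in the macro can be written more plainly with the ACLE CRC32 intrinsics. A little-endian sketch, assuming a CRC-capable target (-march=armv8-a+crc) and using memcpy for the possibly unaligned loads:

#include <arm_acle.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Consume a 0..15 byte tail, widest chunks first. */
static uint32_t crc32_tail(uint32_t crc, const uint8_t *p, size_t len)
{
	uint64_t d;
	uint32_t w;
	uint16_t h;

	if (len & 8) { memcpy(&d, p, 8); crc = __crc32d(crc, d); p += 8; }
	if (len & 4) { memcpy(&w, p, 4); crc = __crc32w(crc, w); p += 4; }
	if (len & 2) { memcpy(&h, p, 2); crc = __crc32h(crc, h); p += 2; }
	if (len & 1) crc = __crc32b(crc, *p);
	return crc;
}

The 8f/4f/2f/1f labels above do the same with tbz; the 16-byte-plus prologue instead folds the conditional steps in branch-free with csel over overlapping loads.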
-
- in .req x1
- len .req x2
-
- /*
- * w0: input CRC at entry, output CRC at exit
- * x1: pointer to input buffer
- * x2: length of input in bytes
- */
- .macro crc4way, insn, table, order=le
- bit\order w0
- lsr len, len, #6 // len := # of 64-byte blocks
-
- /* Process up to 64 blocks of 64 bytes at a time */
-.La\@: mov x3, #64
- cmp len, #64
- csel x3, x3, len, hi // x3 := min(len, 64)
- sub len, len, x3
-
- /* Divide the input into 4 contiguous blocks */
- add x4, x3, x3, lsl #1 // x4 := 3 * x3
- add x7, in, x3, lsl #4 // x7 := in + 16 * x3
- add x8, in, x3, lsl #5 // x8 := in + 32 * x3
- add x9, in, x4, lsl #4 // x9 := in + 16 * x4
-
- /* Load the folding coefficients from the lookup table */
- adr_l x5, \table - 12 // entry 0 omitted
- add x5, x5, x4, lsl #2 // x5 += 12 * x3
- ldp s0, s1, [x5]
- ldr s2, [x5, #8]
-
- /* Zero init partial CRCs for this iteration */
- mov w4, wzr
- mov w5, wzr
- mov w6, wzr
- mov x17, xzr
-
-.Lb\@: sub x3, x3, #1
- \insn w6, w6, x17
- ldp x10, x11, [in], #16
- ldp x12, x13, [x7], #16
- ldp x14, x15, [x8], #16
- ldp x16, x17, [x9], #16
-
- \order x10, x11, x12, x13, x14, x15, x16, x17
-
- /* Apply the CRC transform to 4 16-byte blocks in parallel */
- \insn w0, w0, x10
- \insn w4, w4, x12
- \insn w5, w5, x14
- \insn w6, w6, x16
- \insn w0, w0, x11
- \insn w4, w4, x13
- \insn w5, w5, x15
- cbnz x3, .Lb\@
-
- /* Combine the 4 partial results into w0 */
- mov v3.d[0], x0
- mov v4.d[0], x4
- mov v5.d[0], x5
- pmull v0.1q, v0.1d, v3.1d
- pmull v1.1q, v1.1d, v4.1d
- pmull v2.1q, v2.1d, v5.1d
- eor v0.8b, v0.8b, v1.8b
- eor v0.8b, v0.8b, v2.8b
- mov x5, v0.d[0]
- eor x5, x5, x17
- \insn w0, w6, x5
-
- mov in, x9
- cbnz len, .La\@
-
- bit\order w0
- ret
- .endm
-
- .align 5
-SYM_FUNC_START(crc32c_le_arm64_4way)
- crc4way crc32cx, .L0
-SYM_FUNC_END(crc32c_le_arm64_4way)
-
- .align 5
-SYM_FUNC_START(crc32_le_arm64_4way)
- crc4way crc32x, .L1
-SYM_FUNC_END(crc32_le_arm64_4way)
-
- .align 5
-SYM_FUNC_START(crc32_be_arm64_4way)
- crc4way crc32x, .L1, be
-SYM_FUNC_END(crc32_be_arm64_4way)
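
The per-distance constants in .L0/.L1 below are closed forms of one linear identity: feeding n zero bytes through a raw (no pre/post inversion) CRC multiplies its state by x^(8n) mod G(x). The naive O(n) version of that shift, which each single pmull in the combine replaces (reflected CRC-32 polynomial 0xedb88320; a sketch only):

#include <stddef.h>
#include <stdint.h>

/* Advance a raw, reflected CRC-32 over n zero bytes. */
static uint32_t crc32_shift_zeros(uint32_t crc, size_t n)
{
	while (n--)
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0xedb88320u & -(crc & 1));
	return crc;
}

/* For a raw CRC over two halves A and B:
 *   crc(A || B) == crc32_shift_zeros(crc(A), len(B)) ^ crc(B)
 * crc4way keeps four partial CRCs in w0/w4/w5/w6 and applies the shifts
 * with one carryless multiply each, using the table entries below. */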
-
- .section .rodata, "a", %progbits
- .align 6
-.L0: .long 0xddc0152b, 0xba4fc28e, 0x493c7d27
- .long 0x0715ce53, 0x9e4addf8, 0xba4fc28e
- .long 0xc96cfdc0, 0x0715ce53, 0xddc0152b
- .long 0xab7aff2a, 0x0d3b6092, 0x9e4addf8
- .long 0x299847d5, 0x878a92a7, 0x39d3b296
- .long 0xb6dd949b, 0xab7aff2a, 0x0715ce53
- .long 0xa60ce07b, 0x83348832, 0x47db8317
- .long 0xd270f1a2, 0xb9e02b86, 0x0d3b6092
- .long 0x65863b64, 0xb6dd949b, 0xc96cfdc0
- .long 0xb3e32c28, 0xbac2fd7b, 0x878a92a7
- .long 0xf285651c, 0xce7f39f4, 0xdaece73e
- .long 0x271d9844, 0xd270f1a2, 0xab7aff2a
- .long 0x6cb08e5c, 0x2b3cac5d, 0x2162d385
- .long 0xcec3662e, 0x1b03397f, 0x83348832
- .long 0x8227bb8a, 0xb3e32c28, 0x299847d5
- .long 0xd7a4825c, 0xdd7e3b0c, 0xb9e02b86
- .long 0xf6076544, 0x10746f3c, 0x18b33a4e
- .long 0x98d8d9cb, 0x271d9844, 0xb6dd949b
- .long 0x57a3d037, 0x93a5f730, 0x78d9ccb7
- .long 0x3771e98f, 0x6b749fb2, 0xbac2fd7b
- .long 0xe0ac139e, 0xcec3662e, 0xa60ce07b
- .long 0x6f345e45, 0xe6fc4e6a, 0xce7f39f4
- .long 0xa2b73df1, 0xb0cd4768, 0x61d82e56
- .long 0x86d8e4d2, 0xd7a4825c, 0xd270f1a2
- .long 0xa90fd27a, 0x0167d312, 0xc619809d
- .long 0xca6ef3ac, 0x26f6a60a, 0x2b3cac5d
- .long 0x4597456a, 0x98d8d9cb, 0x65863b64
- .long 0xc9c8b782, 0x68bce87a, 0x1b03397f
- .long 0x62ec6c6d, 0x6956fc3b, 0xebb883bd
- .long 0x2342001e, 0x3771e98f, 0xb3e32c28
- .long 0xe8b6368b, 0x2178513a, 0x064f7f26
- .long 0x9ef68d35, 0x170076fa, 0xdd7e3b0c
- .long 0x0b0bf8ca, 0x6f345e45, 0xf285651c
- .long 0x02ee03b2, 0xff0dba97, 0x10746f3c
- .long 0x135c83fd, 0xf872e54c, 0xc7a68855
- .long 0x00bcf5f6, 0x86d8e4d2, 0x271d9844
- .long 0x58ca5f00, 0x5bb8f1bc, 0x8e766a0c
- .long 0xded288f8, 0xb3af077a, 0x93a5f730
- .long 0x37170390, 0xca6ef3ac, 0x6cb08e5c
- .long 0xf48642e9, 0xdd66cbbb, 0x6b749fb2
- .long 0xb25b29f2, 0xe9e28eb4, 0x1393e203
- .long 0x45cddf4e, 0xc9c8b782, 0xcec3662e
- .long 0xdfd94fb2, 0x93e106a4, 0x96c515bb
- .long 0x021ac5ef, 0xd813b325, 0xe6fc4e6a
- .long 0x8e1450f7, 0x2342001e, 0x8227bb8a
- .long 0xe0cdcf86, 0x6d9a4957, 0xb0cd4768
- .long 0x613eee91, 0xd2c3ed1a, 0x39c7ff35
- .long 0xbedc6ba1, 0x9ef68d35, 0xd7a4825c
- .long 0x0cd1526a, 0xf2271e60, 0x0ab3844b
- .long 0xd6c3a807, 0x2664fd8b, 0x0167d312
- .long 0x1d31175f, 0x02ee03b2, 0xf6076544
- .long 0x4be7fd90, 0x363bd6b3, 0x26f6a60a
- .long 0x6eeed1c9, 0x5fabe670, 0xa741c1bf
- .long 0xb3a6da94, 0x00bcf5f6, 0x98d8d9cb
- .long 0x2e7d11a7, 0x17f27698, 0x49c3cc9c
- .long 0x889774e1, 0xaa7c7ad5, 0x68bce87a
- .long 0x8a074012, 0xded288f8, 0x57a3d037
- .long 0xbd0bb25f, 0x6d390dec, 0x6956fc3b
- .long 0x3be3c09b, 0x6353c1cc, 0x42d98888
- .long 0x465a4eee, 0xf48642e9, 0x3771e98f
- .long 0x2e5f3c8c, 0xdd35bc8d, 0xb42ae3d9
- .long 0xa52f58ec, 0x9a5ede41, 0x2178513a
- .long 0x47972100, 0x45cddf4e, 0xe0ac139e
- .long 0x359674f7, 0xa51b6135, 0x170076fa
-
-.L1: .long 0xaf449247, 0x81256527, 0xccaa009e
- .long 0x57c54819, 0x1d9513d7, 0x81256527
- .long 0x3f41287a, 0x57c54819, 0xaf449247
- .long 0xf5e48c85, 0x910eeec1, 0x1d9513d7
- .long 0x1f0c2cdd, 0x9026d5b1, 0xae0b5394
- .long 0x71d54a59, 0xf5e48c85, 0x57c54819
- .long 0x1c63267b, 0xfe807bbd, 0x0cbec0ed
- .long 0xd31343ea, 0xe95c1271, 0x910eeec1
- .long 0xf9d9c7ee, 0x71d54a59, 0x3f41287a
- .long 0x9ee62949, 0xcec97417, 0x9026d5b1
- .long 0xa55d1514, 0xf183c71b, 0xd1df2327
- .long 0x21aa2b26, 0xd31343ea, 0xf5e48c85
- .long 0x9d842b80, 0xeea395c4, 0x3c656ced
- .long 0xd8110ff1, 0xcd669a40, 0xfe807bbd
- .long 0x3f9e9356, 0x9ee62949, 0x1f0c2cdd
- .long 0x1d6708a0, 0x0c30f51d, 0xe95c1271
- .long 0xef82aa68, 0xdb3935ea, 0xb918a347
- .long 0xd14bcc9b, 0x21aa2b26, 0x71d54a59
- .long 0x99cce860, 0x356d209f, 0xff6f2fc2
- .long 0xd8af8e46, 0xc352f6de, 0xcec97417
- .long 0xf1996890, 0xd8110ff1, 0x1c63267b
- .long 0x631bc508, 0xe95c7216, 0xf183c71b
- .long 0x8511c306, 0x8e031a19, 0x9b9bdbd0
- .long 0xdb3839f3, 0x1d6708a0, 0xd31343ea
- .long 0x7a92fffb, 0xf7003835, 0x4470ac44
- .long 0x6ce68f2a, 0x00eba0c8, 0xeea395c4
- .long 0x4caaa263, 0xd14bcc9b, 0xf9d9c7ee
- .long 0xb46f7cff, 0x9a1b53c8, 0xcd669a40
- .long 0x60290934, 0x81b6f443, 0x6d40f445
- .long 0x8e976a7d, 0xd8af8e46, 0x9ee62949
- .long 0xdcf5088a, 0x9dbdc100, 0x145575d5
- .long 0x1753ab84, 0xbbf2f6d6, 0x0c30f51d
- .long 0x255b139e, 0x631bc508, 0xa55d1514
- .long 0xd784eaa8, 0xce26786c, 0xdb3935ea
- .long 0x6d2c864a, 0x8068c345, 0x2586d334
- .long 0x02072e24, 0xdb3839f3, 0x21aa2b26
- .long 0x06689b0a, 0x5efd72f5, 0xe0575528
- .long 0x1e52f5ea, 0x4117915b, 0x356d209f
- .long 0x1d3d1db6, 0x6ce68f2a, 0x9d842b80
- .long 0x3796455c, 0xb8e0e4a8, 0xc352f6de
- .long 0xdf3a4eb3, 0xc55a2330, 0xb84ffa9c
- .long 0x28ae0976, 0xb46f7cff, 0xd8110ff1
- .long 0x9764bc8d, 0xd7e7a22c, 0x712510f0
- .long 0x13a13e18, 0x3e9a43cd, 0xe95c7216
- .long 0xb8ee242e, 0x8e976a7d, 0x3f9e9356
- .long 0x0c540e7b, 0x753c81ff, 0x8e031a19
- .long 0x9924c781, 0xb9220208, 0x3edcde65
- .long 0x3954de39, 0x1753ab84, 0x1d6708a0
- .long 0xf32238b5, 0xbec81497, 0x9e70b943
- .long 0xbbd2cd2c, 0x0925d861, 0xf7003835
- .long 0xcc401304, 0xd784eaa8, 0xef82aa68
- .long 0x4987e684, 0x6044fbb0, 0x00eba0c8
- .long 0x3aa11427, 0x18fe3b4a, 0x87441142
- .long 0x297aad60, 0x02072e24, 0xd14bcc9b
- .long 0xf60c5e51, 0x6ef6f487, 0x5b7fdd0a
- .long 0x632d78c5, 0x3fc33de4, 0x9a1b53c8
- .long 0x25b8822a, 0x1e52f5ea, 0x99cce860
- .long 0xd4fc84bc, 0x1af62fb8, 0x81b6f443
- .long 0x5690aa32, 0xa91fdefb, 0x688a110e
- .long 0x1357a093, 0x3796455c, 0xd8af8e46
- .long 0x798fdd33, 0xaaa18a37, 0x357b9517
- .long 0xc2815395, 0x54d42691, 0x9dbdc100
- .long 0x21cfc0f7, 0x28ae0976, 0xf1996890
- .long 0xa0decef3, 0x7b4aa8b7, 0xbbf2f6d6
diff --git a/arch/arm64/lib/crc32.c b/arch/arm64/lib/crc32.c
deleted file mode 100644
index ed3acd71178f..000000000000
--- a/arch/arm64/lib/crc32.c
+++ /dev/null
@@ -1,99 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-
-#include <linux/crc32.h>
-#include <linux/linkage.h>
-#include <linux/module.h>
-
-#include <asm/alternative.h>
-#include <asm/cpufeature.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-#include <crypto/internal/simd.h>
-
-// The minimum input length to consider the 4-way interleaved code path
-static const size_t min_len = 1024;
-
-asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len);
-asmlinkage u32 crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len);
-asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len);
-
-asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
-asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
-asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len);
-
-u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
-{
- if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
- return crc32_le_base(crc, p, len);
-
- if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
- kernel_neon_begin();
- crc = crc32_le_arm64_4way(crc, p, len);
- kernel_neon_end();
-
- p += round_down(len, 64);
- len %= 64;
-
- if (!len)
- return crc;
- }
-
- return crc32_le_arm64(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_le_arch);
-
-u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
-{
- if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
- return crc32c_base(crc, p, len);
-
- if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
- kernel_neon_begin();
- crc = crc32c_le_arm64_4way(crc, p, len);
- kernel_neon_end();
-
- p += round_down(len, 64);
- len %= 64;
-
- if (!len)
- return crc;
- }
-
- return crc32c_le_arm64(crc, p, len);
-}
-EXPORT_SYMBOL(crc32c_arch);
-
-u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
-{
- if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
- return crc32_be_base(crc, p, len);
-
- if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
- kernel_neon_begin();
- crc = crc32_be_arm64_4way(crc, p, len);
- kernel_neon_end();
-
- p += round_down(len, 64);
- len %= 64;
-
- if (!len)
- return crc;
- }
-
- return crc32_be_arm64(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_be_arch);
-
-u32 crc32_optimizations(void)
-{
- if (alternative_has_cap_likely(ARM64_HAS_CRC32))
- return CRC32_LE_OPTIMIZATION |
- CRC32_BE_OPTIMIZATION |
- CRC32C_OPTIMIZATION;
- return 0;
-}
-EXPORT_SYMBOL(crc32_optimizations);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("arm64-optimized CRC32 functions");
diff --git a/arch/arm64/lib/crypto/.gitignore b/arch/arm64/lib/crypto/.gitignore
deleted file mode 100644
index 12d74d8b03d0..000000000000
--- a/arch/arm64/lib/crypto/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-poly1305-core.S
-sha256-core.S
diff --git a/arch/arm64/lib/crypto/Kconfig b/arch/arm64/lib/crypto/Kconfig
deleted file mode 100644
index 129a7685cb4c..000000000000
--- a/arch/arm64/lib/crypto/Kconfig
+++ /dev/null
@@ -1,20 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_CHACHA20_NEON
- tristate
- depends on KERNEL_MODE_NEON
- default CRYPTO_LIB_CHACHA
- select CRYPTO_LIB_CHACHA_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
-
-config CRYPTO_POLY1305_NEON
- tristate
- depends on KERNEL_MODE_NEON
- default CRYPTO_LIB_POLY1305
- select CRYPTO_ARCH_HAVE_LIB_POLY1305
-
-config CRYPTO_SHA256_ARM64
- tristate
- default CRYPTO_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
diff --git a/arch/arm64/lib/crypto/Makefile b/arch/arm64/lib/crypto/Makefile
deleted file mode 100644
index 946c09903711..000000000000
--- a/arch/arm64/lib/crypto/Makefile
+++ /dev/null
@@ -1,24 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
-chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
-
-obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o
-poly1305-neon-y := poly1305-core.o poly1305-glue.o
-AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_block_init_arch
-AFLAGS_poly1305-core.o += -Dpoly1305_emit=poly1305_emit_arch
-
-obj-$(CONFIG_CRYPTO_SHA256_ARM64) += sha256-arm64.o
-sha256-arm64-y := sha256.o sha256-core.o
-sha256-arm64-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o
-
-quiet_cmd_perlasm = PERLASM $@
- cmd_perlasm = $(PERL) $(<) void $(@)
-
-$(obj)/%-core.S: $(src)/%-armv8.pl
- $(call cmd,perlasm)
-
-$(obj)/sha256-core.S: $(src)/sha2-armv8.pl
- $(call cmd,perlasm)
-
-clean-files += poly1305-core.S sha256-core.S
diff --git a/arch/arm64/lib/crypto/chacha-neon-core.S b/arch/arm64/lib/crypto/chacha-neon-core.S
deleted file mode 100644
index 80079586ecc7..000000000000
--- a/arch/arm64/lib/crypto/chacha-neon-core.S
+++ /dev/null
@@ -1,805 +0,0 @@
-/*
- * ChaCha/HChaCha NEON helper functions
- *
- * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Originally based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <asm/cache.h>
-
- .text
- .align 6
-
-/*
- * chacha_permute - permute one block
- *
- * Permute one 64-byte block where the state matrix is stored in the four NEON
- * registers v0-v3. It performs matrix operations on four words in parallel,
- * but requires shuffling to rearrange the words after each round.
- *
- * The round count is given in w3.
- *
- * Clobbers: w3, x10, v4, v12
- */
-SYM_FUNC_START_LOCAL(chacha_permute)
-
- adr_l x10, ROT8
- ld1 {v12.4s}, [x10]
-
-.Ldoubleround:
- // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- add v0.4s, v0.4s, v1.4s
- eor v3.16b, v3.16b, v0.16b
- rev32 v3.8h, v3.8h
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- add v2.4s, v2.4s, v3.4s
- eor v4.16b, v1.16b, v2.16b
- shl v1.4s, v4.4s, #12
- sri v1.4s, v4.4s, #20
-
- // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- add v0.4s, v0.4s, v1.4s
- eor v3.16b, v3.16b, v0.16b
- tbl v3.16b, {v3.16b}, v12.16b
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- add v2.4s, v2.4s, v3.4s
- eor v4.16b, v1.16b, v2.16b
- shl v1.4s, v4.4s, #7
- sri v1.4s, v4.4s, #25
-
- // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- ext v1.16b, v1.16b, v1.16b, #4
- // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- ext v2.16b, v2.16b, v2.16b, #8
- // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- ext v3.16b, v3.16b, v3.16b, #12
-
- // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- add v0.4s, v0.4s, v1.4s
- eor v3.16b, v3.16b, v0.16b
- rev32 v3.8h, v3.8h
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- add v2.4s, v2.4s, v3.4s
- eor v4.16b, v1.16b, v2.16b
- shl v1.4s, v4.4s, #12
- sri v1.4s, v4.4s, #20
-
- // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- add v0.4s, v0.4s, v1.4s
- eor v3.16b, v3.16b, v0.16b
- tbl v3.16b, {v3.16b}, v12.16b
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- add v2.4s, v2.4s, v3.4s
- eor v4.16b, v1.16b, v2.16b
- shl v1.4s, v4.4s, #7
- sri v1.4s, v4.4s, #25
-
- // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- ext v1.16b, v1.16b, v1.16b, #12
- // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- ext v2.16b, v2.16b, v2.16b, #8
- // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- ext v3.16b, v3.16b, v3.16b, #4
-
- subs w3, w3, #2
- b.ne .Ldoubleround
-
- ret
-SYM_FUNC_END(chacha_permute)
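
Each NEON instruction in chacha_permute acts on four 32-bit lanes at once; per lane it is the standard ChaCha quarter-round. A scalar sketch for comparison (portable C, not part of the kernel sources):

#include <stdint.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* One ChaCha quarter-round, matching the add/eor/rotate pattern above. */
static void chacha_qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
	*a += *b; *d = ROTL32(*d ^ *a, 16);
	*c += *d; *b = ROTL32(*b ^ *c, 12);
	*a += *b; *d = ROTL32(*d ^ *a, 8);
	*c += *d; *b = ROTL32(*b ^ *c, 7);
}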
-
-SYM_FUNC_START(chacha_block_xor_neon)
- // x0: Input state matrix, s
- // x1: 1 data block output, o
- // x2: 1 data block input, i
- // w3: nrounds
-
- stp x29, x30, [sp, #-16]!
- mov x29, sp
-
- // x0..3 = s0..3
- ld1 {v0.4s-v3.4s}, [x0]
- ld1 {v8.4s-v11.4s}, [x0]
-
- bl chacha_permute
-
- ld1 {v4.16b-v7.16b}, [x2]
-
- // o0 = i0 ^ (x0 + s0)
- add v0.4s, v0.4s, v8.4s
- eor v0.16b, v0.16b, v4.16b
-
- // o1 = i1 ^ (x1 + s1)
- add v1.4s, v1.4s, v9.4s
- eor v1.16b, v1.16b, v5.16b
-
- // o2 = i2 ^ (x2 + s2)
- add v2.4s, v2.4s, v10.4s
- eor v2.16b, v2.16b, v6.16b
-
- // o3 = i3 ^ (x3 + s3)
- add v3.4s, v3.4s, v11.4s
- eor v3.16b, v3.16b, v7.16b
-
- st1 {v0.16b-v3.16b}, [x1]
-
- ldp x29, x30, [sp], #16
- ret
-SYM_FUNC_END(chacha_block_xor_neon)
-
-SYM_FUNC_START(hchacha_block_neon)
- // x0: Input state matrix, s
- // x1: output (8 32-bit words)
- // w2: nrounds
-
- stp x29, x30, [sp, #-16]!
- mov x29, sp
-
- ld1 {v0.4s-v3.4s}, [x0]
-
- mov w3, w2
- bl chacha_permute
-
- st1 {v0.4s}, [x1], #16
- st1 {v3.4s}, [x1]
-
- ldp x29, x30, [sp], #16
- ret
-SYM_FUNC_END(hchacha_block_neon)
-
- a0 .req w12
- a1 .req w13
- a2 .req w14
- a3 .req w15
- a4 .req w16
- a5 .req w17
- a6 .req w19
- a7 .req w20
- a8 .req w21
- a9 .req w22
- a10 .req w23
- a11 .req w24
- a12 .req w25
- a13 .req w26
- a14 .req w27
- a15 .req w28
-
- .align 6
-SYM_FUNC_START(chacha_4block_xor_neon)
- frame_push 10
-
- // x0: Input state matrix, s
- // x1: 4 data blocks output, o
- // x2: 4 data blocks input, i
- // w3: nrounds
- // x4: byte count
-
- adr_l x10, .Lpermute
- and x5, x4, #63
- add x10, x10, x5
-
- //
- // This function encrypts four consecutive ChaCha blocks by loading
- // the state matrix into NEON registers four times. The algorithm
- // performs each operation on the corresponding word of each state
- // matrix, hence requires no word shuffling. For the final XOR step we
- // transpose the matrix by interleaving 32- and then 64-bit words,
- // which allows us to do XOR in NEON registers.
- //
- // At the same time, a fifth block is encrypted in parallel using
- // scalar registers
- //
- adr_l x9, CTRINC // ... and ROT8
- ld1 {v30.4s-v31.4s}, [x9]
-
- // x0..15[0-3] = s0..3[0..3]
- add x8, x0, #16
- ld4r { v0.4s- v3.4s}, [x0]
- ld4r { v4.4s- v7.4s}, [x8], #16
- ld4r { v8.4s-v11.4s}, [x8], #16
- ld4r {v12.4s-v15.4s}, [x8]
-
- mov a0, v0.s[0]
- mov a1, v1.s[0]
- mov a2, v2.s[0]
- mov a3, v3.s[0]
- mov a4, v4.s[0]
- mov a5, v5.s[0]
- mov a6, v6.s[0]
- mov a7, v7.s[0]
- mov a8, v8.s[0]
- mov a9, v9.s[0]
- mov a10, v10.s[0]
- mov a11, v11.s[0]
- mov a12, v12.s[0]
- mov a13, v13.s[0]
- mov a14, v14.s[0]
- mov a15, v15.s[0]
-
- // x12 += counter values 1-4
- add v12.4s, v12.4s, v30.4s
-
-.Ldoubleround4:
- // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
- // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
- // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
- // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
- add v0.4s, v0.4s, v4.4s
- add a0, a0, a4
- add v1.4s, v1.4s, v5.4s
- add a1, a1, a5
- add v2.4s, v2.4s, v6.4s
- add a2, a2, a6
- add v3.4s, v3.4s, v7.4s
- add a3, a3, a7
-
- eor v12.16b, v12.16b, v0.16b
- eor a12, a12, a0
- eor v13.16b, v13.16b, v1.16b
- eor a13, a13, a1
- eor v14.16b, v14.16b, v2.16b
- eor a14, a14, a2
- eor v15.16b, v15.16b, v3.16b
- eor a15, a15, a3
-
- rev32 v12.8h, v12.8h
- ror a12, a12, #16
- rev32 v13.8h, v13.8h
- ror a13, a13, #16
- rev32 v14.8h, v14.8h
- ror a14, a14, #16
- rev32 v15.8h, v15.8h
- ror a15, a15, #16
-
- // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
- // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
- // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
- // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
- add v8.4s, v8.4s, v12.4s
- add a8, a8, a12
- add v9.4s, v9.4s, v13.4s
- add a9, a9, a13
- add v10.4s, v10.4s, v14.4s
- add a10, a10, a14
- add v11.4s, v11.4s, v15.4s
- add a11, a11, a15
-
- eor v16.16b, v4.16b, v8.16b
- eor a4, a4, a8
- eor v17.16b, v5.16b, v9.16b
- eor a5, a5, a9
- eor v18.16b, v6.16b, v10.16b
- eor a6, a6, a10
- eor v19.16b, v7.16b, v11.16b
- eor a7, a7, a11
-
- shl v4.4s, v16.4s, #12
- shl v5.4s, v17.4s, #12
- shl v6.4s, v18.4s, #12
- shl v7.4s, v19.4s, #12
-
- sri v4.4s, v16.4s, #20
- ror a4, a4, #20
- sri v5.4s, v17.4s, #20
- ror a5, a5, #20
- sri v6.4s, v18.4s, #20
- ror a6, a6, #20
- sri v7.4s, v19.4s, #20
- ror a7, a7, #20
-
- // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
- // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
- // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
- // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
- add v0.4s, v0.4s, v4.4s
- add a0, a0, a4
- add v1.4s, v1.4s, v5.4s
- add a1, a1, a5
- add v2.4s, v2.4s, v6.4s
- add a2, a2, a6
- add v3.4s, v3.4s, v7.4s
- add a3, a3, a7
-
- eor v12.16b, v12.16b, v0.16b
- eor a12, a12, a0
- eor v13.16b, v13.16b, v1.16b
- eor a13, a13, a1
- eor v14.16b, v14.16b, v2.16b
- eor a14, a14, a2
- eor v15.16b, v15.16b, v3.16b
- eor a15, a15, a3
-
- tbl v12.16b, {v12.16b}, v31.16b
- ror a12, a12, #24
- tbl v13.16b, {v13.16b}, v31.16b
- ror a13, a13, #24
- tbl v14.16b, {v14.16b}, v31.16b
- ror a14, a14, #24
- tbl v15.16b, {v15.16b}, v31.16b
- ror a15, a15, #24
-
- // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
- // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
- // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
- // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
- add v8.4s, v8.4s, v12.4s
- add a8, a8, a12
- add v9.4s, v9.4s, v13.4s
- add a9, a9, a13
- add v10.4s, v10.4s, v14.4s
- add a10, a10, a14
- add v11.4s, v11.4s, v15.4s
- add a11, a11, a15
-
- eor v16.16b, v4.16b, v8.16b
- eor a4, a4, a8
- eor v17.16b, v5.16b, v9.16b
- eor a5, a5, a9
- eor v18.16b, v6.16b, v10.16b
- eor a6, a6, a10
- eor v19.16b, v7.16b, v11.16b
- eor a7, a7, a11
-
- shl v4.4s, v16.4s, #7
- shl v5.4s, v17.4s, #7
- shl v6.4s, v18.4s, #7
- shl v7.4s, v19.4s, #7
-
- sri v4.4s, v16.4s, #25
- ror a4, a4, #25
- sri v5.4s, v17.4s, #25
- ror a5, a5, #25
- sri v6.4s, v18.4s, #25
- ror a6, a6, #25
- sri v7.4s, v19.4s, #25
- ror a7, a7, #25
-
- // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
- // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
- // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
- // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
- add v0.4s, v0.4s, v5.4s
- add a0, a0, a5
- add v1.4s, v1.4s, v6.4s
- add a1, a1, a6
- add v2.4s, v2.4s, v7.4s
- add a2, a2, a7
- add v3.4s, v3.4s, v4.4s
- add a3, a3, a4
-
- eor v15.16b, v15.16b, v0.16b
- eor a15, a15, a0
- eor v12.16b, v12.16b, v1.16b
- eor a12, a12, a1
- eor v13.16b, v13.16b, v2.16b
- eor a13, a13, a2
- eor v14.16b, v14.16b, v3.16b
- eor a14, a14, a3
-
- rev32 v15.8h, v15.8h
- ror a15, a15, #16
- rev32 v12.8h, v12.8h
- ror a12, a12, #16
- rev32 v13.8h, v13.8h
- ror a13, a13, #16
- rev32 v14.8h, v14.8h
- ror a14, a14, #16
-
- // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
- // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
- // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
- // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
- add v10.4s, v10.4s, v15.4s
- add a10, a10, a15
- add v11.4s, v11.4s, v12.4s
- add a11, a11, a12
- add v8.4s, v8.4s, v13.4s
- add a8, a8, a13
- add v9.4s, v9.4s, v14.4s
- add a9, a9, a14
-
- eor v16.16b, v5.16b, v10.16b
- eor a5, a5, a10
- eor v17.16b, v6.16b, v11.16b
- eor a6, a6, a11
- eor v18.16b, v7.16b, v8.16b
- eor a7, a7, a8
- eor v19.16b, v4.16b, v9.16b
- eor a4, a4, a9
-
- shl v5.4s, v16.4s, #12
- shl v6.4s, v17.4s, #12
- shl v7.4s, v18.4s, #12
- shl v4.4s, v19.4s, #12
-
- sri v5.4s, v16.4s, #20
- ror a5, a5, #20
- sri v6.4s, v17.4s, #20
- ror a6, a6, #20
- sri v7.4s, v18.4s, #20
- ror a7, a7, #20
- sri v4.4s, v19.4s, #20
- ror a4, a4, #20
-
- // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
- // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
- // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
- // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
- add v0.4s, v0.4s, v5.4s
- add a0, a0, a5
- add v1.4s, v1.4s, v6.4s
- add a1, a1, a6
- add v2.4s, v2.4s, v7.4s
- add a2, a2, a7
- add v3.4s, v3.4s, v4.4s
- add a3, a3, a4
-
- eor v15.16b, v15.16b, v0.16b
- eor a15, a15, a0
- eor v12.16b, v12.16b, v1.16b
- eor a12, a12, a1
- eor v13.16b, v13.16b, v2.16b
- eor a13, a13, a2
- eor v14.16b, v14.16b, v3.16b
- eor a14, a14, a3
-
- tbl v15.16b, {v15.16b}, v31.16b
- ror a15, a15, #24
- tbl v12.16b, {v12.16b}, v31.16b
- ror a12, a12, #24
- tbl v13.16b, {v13.16b}, v31.16b
- ror a13, a13, #24
- tbl v14.16b, {v14.16b}, v31.16b
- ror a14, a14, #24
-
- // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
- // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
- // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
- // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
- add v10.4s, v10.4s, v15.4s
- add a10, a10, a15
- add v11.4s, v11.4s, v12.4s
- add a11, a11, a12
- add v8.4s, v8.4s, v13.4s
- add a8, a8, a13
- add v9.4s, v9.4s, v14.4s
- add a9, a9, a14
-
- eor v16.16b, v5.16b, v10.16b
- eor a5, a5, a10
- eor v17.16b, v6.16b, v11.16b
- eor a6, a6, a11
- eor v18.16b, v7.16b, v8.16b
- eor a7, a7, a8
- eor v19.16b, v4.16b, v9.16b
- eor a4, a4, a9
-
- shl v5.4s, v16.4s, #7
- shl v6.4s, v17.4s, #7
- shl v7.4s, v18.4s, #7
- shl v4.4s, v19.4s, #7
-
- sri v5.4s, v16.4s, #25
- ror a5, a5, #25
- sri v6.4s, v17.4s, #25
- ror a6, a6, #25
- sri v7.4s, v18.4s, #25
- ror a7, a7, #25
- sri v4.4s, v19.4s, #25
- ror a4, a4, #25
-
- subs w3, w3, #2
- b.ne .Ldoubleround4
-
- ld4r {v16.4s-v19.4s}, [x0], #16
- ld4r {v20.4s-v23.4s}, [x0], #16
-
- // x12 += counter values 0-3
- add v12.4s, v12.4s, v30.4s
-
- // x0[0-3] += s0[0]
- // x1[0-3] += s0[1]
- // x2[0-3] += s0[2]
- // x3[0-3] += s0[3]
- add v0.4s, v0.4s, v16.4s
- mov w6, v16.s[0]
- mov w7, v17.s[0]
- add v1.4s, v1.4s, v17.4s
- mov w8, v18.s[0]
- mov w9, v19.s[0]
- add v2.4s, v2.4s, v18.4s
- add a0, a0, w6
- add a1, a1, w7
- add v3.4s, v3.4s, v19.4s
- add a2, a2, w8
- add a3, a3, w9
-CPU_BE( rev a0, a0 )
-CPU_BE( rev a1, a1 )
-CPU_BE( rev a2, a2 )
-CPU_BE( rev a3, a3 )
-
- ld4r {v24.4s-v27.4s}, [x0], #16
- ld4r {v28.4s-v31.4s}, [x0]
-
- // x4[0-3] += s1[0]
- // x5[0-3] += s1[1]
- // x6[0-3] += s1[2]
- // x7[0-3] += s1[3]
- add v4.4s, v4.4s, v20.4s
- mov w6, v20.s[0]
- mov w7, v21.s[0]
- add v5.4s, v5.4s, v21.4s
- mov w8, v22.s[0]
- mov w9, v23.s[0]
- add v6.4s, v6.4s, v22.4s
- add a4, a4, w6
- add a5, a5, w7
- add v7.4s, v7.4s, v23.4s
- add a6, a6, w8
- add a7, a7, w9
-CPU_BE( rev a4, a4 )
-CPU_BE( rev a5, a5 )
-CPU_BE( rev a6, a6 )
-CPU_BE( rev a7, a7 )
-
- // x8[0-3] += s2[0]
- // x9[0-3] += s2[1]
- // x10[0-3] += s2[2]
- // x11[0-3] += s2[3]
- add v8.4s, v8.4s, v24.4s
- mov w6, v24.s[0]
- mov w7, v25.s[0]
- add v9.4s, v9.4s, v25.4s
- mov w8, v26.s[0]
- mov w9, v27.s[0]
- add v10.4s, v10.4s, v26.4s
- add a8, a8, w6
- add a9, a9, w7
- add v11.4s, v11.4s, v27.4s
- add a10, a10, w8
- add a11, a11, w9
-CPU_BE( rev a8, a8 )
-CPU_BE( rev a9, a9 )
-CPU_BE( rev a10, a10 )
-CPU_BE( rev a11, a11 )
-
- // x12[0-3] += s3[0]
- // x13[0-3] += s3[1]
- // x14[0-3] += s3[2]
- // x15[0-3] += s3[3]
- add v12.4s, v12.4s, v28.4s
- mov w6, v28.s[0]
- mov w7, v29.s[0]
- add v13.4s, v13.4s, v29.4s
- mov w8, v30.s[0]
- mov w9, v31.s[0]
- add v14.4s, v14.4s, v30.4s
- add a12, a12, w6
- add a13, a13, w7
- add v15.4s, v15.4s, v31.4s
- add a14, a14, w8
- add a15, a15, w9
-CPU_BE( rev a12, a12 )
-CPU_BE( rev a13, a13 )
-CPU_BE( rev a14, a14 )
-CPU_BE( rev a15, a15 )
-
- // interleave 32-bit words in state n, n+1
- ldp w6, w7, [x2], #64
- zip1 v16.4s, v0.4s, v1.4s
- ldp w8, w9, [x2, #-56]
- eor a0, a0, w6
- zip2 v17.4s, v0.4s, v1.4s
- eor a1, a1, w7
- zip1 v18.4s, v2.4s, v3.4s
- eor a2, a2, w8
- zip2 v19.4s, v2.4s, v3.4s
- eor a3, a3, w9
- ldp w6, w7, [x2, #-48]
- zip1 v20.4s, v4.4s, v5.4s
- ldp w8, w9, [x2, #-40]
- eor a4, a4, w6
- zip2 v21.4s, v4.4s, v5.4s
- eor a5, a5, w7
- zip1 v22.4s, v6.4s, v7.4s
- eor a6, a6, w8
- zip2 v23.4s, v6.4s, v7.4s
- eor a7, a7, w9
- ldp w6, w7, [x2, #-32]
- zip1 v24.4s, v8.4s, v9.4s
- ldp w8, w9, [x2, #-24]
- eor a8, a8, w6
- zip2 v25.4s, v8.4s, v9.4s
- eor a9, a9, w7
- zip1 v26.4s, v10.4s, v11.4s
- eor a10, a10, w8
- zip2 v27.4s, v10.4s, v11.4s
- eor a11, a11, w9
- ldp w6, w7, [x2, #-16]
- zip1 v28.4s, v12.4s, v13.4s
- ldp w8, w9, [x2, #-8]
- eor a12, a12, w6
- zip2 v29.4s, v12.4s, v13.4s
- eor a13, a13, w7
- zip1 v30.4s, v14.4s, v15.4s
- eor a14, a14, w8
- zip2 v31.4s, v14.4s, v15.4s
- eor a15, a15, w9
-
- add x3, x2, x4
- sub x3, x3, #128 // start of last block
-
- subs x5, x4, #128
- csel x2, x2, x3, ge
-
- // interleave 64-bit words in state n, n+2
- zip1 v0.2d, v16.2d, v18.2d
- zip2 v4.2d, v16.2d, v18.2d
- stp a0, a1, [x1], #64
- zip1 v8.2d, v17.2d, v19.2d
- zip2 v12.2d, v17.2d, v19.2d
- stp a2, a3, [x1, #-56]
-
- subs x6, x4, #192
- ld1 {v16.16b-v19.16b}, [x2], #64
- csel x2, x2, x3, ge
-
- zip1 v1.2d, v20.2d, v22.2d
- zip2 v5.2d, v20.2d, v22.2d
- stp a4, a5, [x1, #-48]
- zip1 v9.2d, v21.2d, v23.2d
- zip2 v13.2d, v21.2d, v23.2d
- stp a6, a7, [x1, #-40]
-
- subs x7, x4, #256
- ld1 {v20.16b-v23.16b}, [x2], #64
- csel x2, x2, x3, ge
-
- zip1 v2.2d, v24.2d, v26.2d
- zip2 v6.2d, v24.2d, v26.2d
- stp a8, a9, [x1, #-32]
- zip1 v10.2d, v25.2d, v27.2d
- zip2 v14.2d, v25.2d, v27.2d
- stp a10, a11, [x1, #-24]
-
- subs x8, x4, #320
- ld1 {v24.16b-v27.16b}, [x2], #64
- csel x2, x2, x3, ge
-
- zip1 v3.2d, v28.2d, v30.2d
- zip2 v7.2d, v28.2d, v30.2d
- stp a12, a13, [x1, #-16]
- zip1 v11.2d, v29.2d, v31.2d
- zip2 v15.2d, v29.2d, v31.2d
- stp a14, a15, [x1, #-8]
-
- tbnz x5, #63, .Lt128
- ld1 {v28.16b-v31.16b}, [x2]
-
- // xor with corresponding input, write to output
- eor v16.16b, v16.16b, v0.16b
- eor v17.16b, v17.16b, v1.16b
- eor v18.16b, v18.16b, v2.16b
- eor v19.16b, v19.16b, v3.16b
-
- tbnz x6, #63, .Lt192
-
- eor v20.16b, v20.16b, v4.16b
- eor v21.16b, v21.16b, v5.16b
- eor v22.16b, v22.16b, v6.16b
- eor v23.16b, v23.16b, v7.16b
-
- st1 {v16.16b-v19.16b}, [x1], #64
- tbnz x7, #63, .Lt256
-
- eor v24.16b, v24.16b, v8.16b
- eor v25.16b, v25.16b, v9.16b
- eor v26.16b, v26.16b, v10.16b
- eor v27.16b, v27.16b, v11.16b
-
- st1 {v20.16b-v23.16b}, [x1], #64
- tbnz x8, #63, .Lt320
-
- eor v28.16b, v28.16b, v12.16b
- eor v29.16b, v29.16b, v13.16b
- eor v30.16b, v30.16b, v14.16b
- eor v31.16b, v31.16b, v15.16b
-
- st1 {v24.16b-v27.16b}, [x1], #64
- st1 {v28.16b-v31.16b}, [x1]
-
-.Lout: frame_pop
- ret
-
- // fewer than 192 bytes of in/output
-.Lt192: cbz x5, 1f // exactly 128 bytes?
- ld1 {v28.16b-v31.16b}, [x10]
- add x5, x5, x1
- tbl v28.16b, {v4.16b-v7.16b}, v28.16b
- tbl v29.16b, {v4.16b-v7.16b}, v29.16b
- tbl v30.16b, {v4.16b-v7.16b}, v30.16b
- tbl v31.16b, {v4.16b-v7.16b}, v31.16b
-
-0: eor v20.16b, v20.16b, v28.16b
- eor v21.16b, v21.16b, v29.16b
- eor v22.16b, v22.16b, v30.16b
- eor v23.16b, v23.16b, v31.16b
- st1 {v20.16b-v23.16b}, [x5] // overlapping stores
-1: st1 {v16.16b-v19.16b}, [x1]
- b .Lout
-
- // fewer than 128 bytes of in/output
-.Lt128: ld1 {v28.16b-v31.16b}, [x10]
- add x5, x5, x1
- sub x1, x1, #64
- tbl v28.16b, {v0.16b-v3.16b}, v28.16b
- tbl v29.16b, {v0.16b-v3.16b}, v29.16b
- tbl v30.16b, {v0.16b-v3.16b}, v30.16b
- tbl v31.16b, {v0.16b-v3.16b}, v31.16b
- ld1 {v16.16b-v19.16b}, [x1] // reload first output block
- b 0b
-
- // fewer than 256 bytes of in/output
-.Lt256: cbz x6, 2f // exactly 192 bytes?
- ld1 {v4.16b-v7.16b}, [x10]
- add x6, x6, x1
- tbl v0.16b, {v8.16b-v11.16b}, v4.16b
- tbl v1.16b, {v8.16b-v11.16b}, v5.16b
- tbl v2.16b, {v8.16b-v11.16b}, v6.16b
- tbl v3.16b, {v8.16b-v11.16b}, v7.16b
-
- eor v28.16b, v28.16b, v0.16b
- eor v29.16b, v29.16b, v1.16b
- eor v30.16b, v30.16b, v2.16b
- eor v31.16b, v31.16b, v3.16b
- st1 {v28.16b-v31.16b}, [x6] // overlapping stores
-2: st1 {v20.16b-v23.16b}, [x1]
- b .Lout
-
- // fewer than 320 bytes of in/output
-.Lt320: cbz x7, 3f // exactly 256 bytes?
- ld1 {v4.16b-v7.16b}, [x10]
- add x7, x7, x1
- tbl v0.16b, {v12.16b-v15.16b}, v4.16b
- tbl v1.16b, {v12.16b-v15.16b}, v5.16b
- tbl v2.16b, {v12.16b-v15.16b}, v6.16b
- tbl v3.16b, {v12.16b-v15.16b}, v7.16b
-
- eor v28.16b, v28.16b, v0.16b
- eor v29.16b, v29.16b, v1.16b
- eor v30.16b, v30.16b, v2.16b
- eor v31.16b, v31.16b, v3.16b
- st1 {v28.16b-v31.16b}, [x7] // overlapping stores
-3: st1 {v24.16b-v27.16b}, [x1]
- b .Lout
-SYM_FUNC_END(chacha_4block_xor_neon)
-
- .section ".rodata", "a", %progbits
- .align L1_CACHE_SHIFT
-.Lpermute:
- .set .Li, 0
- .rept 128
- .byte (.Li - 64)
- .set .Li, .Li + 1
- .endr
-
-CTRINC: .word 1, 2, 3, 4
-ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
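
ROT8 gets a dedicated constant because rotating a 32-bit word left by 8 is a pure byte permutation, so a single tbl replaces the shl/sri pairs used for the 12- and 7-bit rotations. A scalar model of one little-endian lane:

#include <stdint.h>
#include <string.h>

/* rotl32(x, 8) as a byte shuffle, matching the ROT8 index word
 * 0x02010003 (index bytes 03 00 01 02 in memory order). */
static uint32_t rotl8_bytes(uint32_t x)
{
	uint8_t b[4], r[4];

	memcpy(b, &x, 4);	/* little-endian byte order assumed */
	r[0] = b[3]; r[1] = b[0]; r[2] = b[1]; r[3] = b[2];
	memcpy(&x, r, 4);
	return x;
}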
diff --git a/arch/arm64/lib/crypto/chacha-neon-glue.c b/arch/arm64/lib/crypto/chacha-neon-glue.c
deleted file mode 100644
index d0188f974ca5..000000000000
--- a/arch/arm64/lib/crypto/chacha-neon-glue.c
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * ChaCha and HChaCha functions (ARM64 optimized)
- *
- * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <crypto/chacha.h>
-#include <crypto/internal/simd.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
- u8 *dst, const u8 *src, int nrounds);
-asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- int nrounds, int bytes);
-asmlinkage void hchacha_block_neon(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-
-static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
- int bytes, int nrounds)
-{
- while (bytes > 0) {
- int l = min(bytes, CHACHA_BLOCK_SIZE * 5);
-
- if (l <= CHACHA_BLOCK_SIZE) {
- u8 buf[CHACHA_BLOCK_SIZE];
-
- memcpy(buf, src, l);
- chacha_block_xor_neon(state, buf, buf, nrounds);
- memcpy(dst, buf, l);
- state->x[12] += 1;
- break;
- }
- chacha_4block_xor_neon(state, dst, src, nrounds, l);
- bytes -= l;
- src += l;
- dst += l;
- state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
- }
-}
-
-void hchacha_block_arch(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds)
-{
- if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) {
- hchacha_block_generic(state, out, nrounds);
- } else {
- kernel_neon_begin();
- hchacha_block_neon(state, out, nrounds);
- kernel_neon_end();
- }
-}
-EXPORT_SYMBOL(hchacha_block_arch);
-
-void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE ||
- !crypto_simd_usable())
- return chacha_crypt_generic(state, dst, src, bytes, nrounds);
-
- do {
- unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
-
- kernel_neon_begin();
- chacha_doneon(state, dst, src, todo, nrounds);
- kernel_neon_end();
-
- bytes -= todo;
- src += todo;
- dst += todo;
- } while (bytes);
-}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-bool chacha_is_arch_optimized(void)
-{
- return static_key_enabled(&have_neon);
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-
-static int __init chacha_simd_mod_init(void)
-{
- if (cpu_have_named_feature(ASIMD))
- static_branch_enable(&have_neon);
- return 0;
-}
-subsys_initcall(chacha_simd_mod_init);
-
-static void __exit chacha_simd_mod_exit(void)
-{
-}
-module_exit(chacha_simd_mod_exit);
-
-MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM64 optimized)");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/arm64/lib/crypto/poly1305-armv8.pl b/arch/arm64/lib/crypto/poly1305-armv8.pl
deleted file mode 100644
index 22c9069c0650..000000000000
--- a/arch/arm64/lib/crypto/poly1305-armv8.pl
+++ /dev/null
@@ -1,917 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
-#
-# ====================================================================
-# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
-# project.
-# ====================================================================
-#
-# This module implements Poly1305 hash for ARMv8.
-#
-# June 2015
-#
-# Numbers are cycles per processed byte with poly1305_blocks alone.
-#
-# IALU/gcc-4.9 NEON
-#
-# Apple A7 1.86/+5% 0.72
-# Cortex-A53 2.69/+58% 1.47
-# Cortex-A57 2.70/+7% 1.14
-# Denver 1.64/+50% 1.18(*)
-# X-Gene 2.13/+68% 2.27
-# Mongoose 1.77/+75% 1.12
-# Kryo 2.70/+55% 1.13
-# ThunderX2 1.17/+95% 1.36
-#
-# (*) the estimate based on resource availability is less than 1.0,
-#     i.e. the measured result is worse than expected; presumably the
-#     binary translator is not almighty;
-
-$flavour=shift;
-$output=shift;
-
-if ($flavour && $flavour ne "void") {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
- die "can't locate arm-xlate.pl";
-
- open STDOUT,"| \"$^X\" $xlate $flavour $output";
-} else {
- open STDOUT,">$output";
-}
-
-my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
-my ($mac,$nonce)=($inp,$len);
-
-my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
-
-$code.=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-.extern OPENSSL_armcap_P
-#endif
-
-.text
-
-// forward "declarations" are required for Apple
-.globl poly1305_blocks
-.globl poly1305_emit
-
-.globl poly1305_init
-.type poly1305_init,%function
-.align 5
-poly1305_init:
- cmp $inp,xzr
- stp xzr,xzr,[$ctx] // zero hash value
- stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
-
- csel x0,xzr,x0,eq
- b.eq .Lno_key
-
-#ifndef __KERNEL__
- adrp x17,OPENSSL_armcap_P
- ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
-#endif
-
- ldp $r0,$r1,[$inp] // load key
- mov $s1,#0xfffffffc0fffffff
- movk $s1,#0x0fff,lsl#48
-#ifdef __AARCH64EB__
- rev $r0,$r0 // flip bytes
- rev $r1,$r1
-#endif
- and $r0,$r0,$s1 // &=0ffffffc0fffffff
- and $s1,$s1,#-4
- and $r1,$r1,$s1 // &=0ffffffc0ffffffc
- mov w#$s1,#-1
- stp $r0,$r1,[$ctx,#32] // save key value
- str w#$s1,[$ctx,#48] // impossible key power value
-
-#ifndef __KERNEL__
- tst w17,#ARMV7_NEON
-
- adr $d0,.Lpoly1305_blocks
- adr $r0,.Lpoly1305_blocks_neon
- adr $d1,.Lpoly1305_emit
-
- csel $d0,$d0,$r0,eq
-
-# ifdef __ILP32__
- stp w#$d0,w#$d1,[$len]
-# else
- stp $d0,$d1,[$len]
-# endif
-#endif
- mov x0,#1
-.Lno_key:
- ret
-.size poly1305_init,.-poly1305_init
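
The two and instructions above implement the standard Poly1305 key clamp. The same step in C, assuming a little-endian load of the first 16 key bytes (a sketch, not the kernel's code):

#include <stdint.h>
#include <string.h>

/* Clamp r: clear the top 4 bits of each 32-bit key word and the low
 * 2 bits of the upper three, i.e. r0 &= 0x0ffffffc0fffffff and
 * r1 &= 0x0ffffffc0ffffffc, as the comments above spell out. */
static void poly1305_clamp(const uint8_t key[16], uint64_t *r0, uint64_t *r1)
{
	memcpy(r0, key, 8);
	memcpy(r1, key + 8, 8);
	*r0 &= 0x0ffffffc0fffffffULL;
	*r1 &= 0x0ffffffc0ffffffcULL;
}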
-
-.type poly1305_blocks,%function
-.align 5
-poly1305_blocks:
-.Lpoly1305_blocks:
- ands $len,$len,#-16
- b.eq .Lno_data
-
- ldp $h0,$h1,[$ctx] // load hash value
- ldp $h2,x17,[$ctx,#16] // [along with is_base2_26]
- ldp $r0,$r1,[$ctx,#32] // load key value
-
-#ifdef __AARCH64EB__
- lsr $d0,$h0,#32
- mov w#$d1,w#$h0
- lsr $d2,$h1,#32
- mov w15,w#$h1
- lsr x16,$h2,#32
-#else
- mov w#$d0,w#$h0
- lsr $d1,$h0,#32
- mov w#$d2,w#$h1
- lsr x15,$h1,#32
- mov w16,w#$h2
-#endif
-
- add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
- lsr $d1,$d2,#12
- adds $d0,$d0,$d2,lsl#52
- add $d1,$d1,x15,lsl#14
- adc $d1,$d1,xzr
- lsr $d2,x16,#24
- adds $d1,$d1,x16,lsl#40
- adc $d2,$d2,xzr
-
- cmp x17,#0 // is_base2_26?
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
- csel $h0,$h0,$d0,eq // choose between radixes
- csel $h1,$h1,$d1,eq
- csel $h2,$h2,$d2,eq
-
-.Loop:
- ldp $t0,$t1,[$inp],#16 // load input
- sub $len,$len,#16
-#ifdef __AARCH64EB__
- rev $t0,$t0
- rev $t1,$t1
-#endif
- adds $h0,$h0,$t0 // accumulate input
- adcs $h1,$h1,$t1
-
- mul $d0,$h0,$r0 // h0*r0
- adc $h2,$h2,$padbit
- umulh $d1,$h0,$r0
-
- mul $t0,$h1,$s1 // h1*5*r1
- umulh $t1,$h1,$s1
-
- adds $d0,$d0,$t0
- mul $t0,$h0,$r1 // h0*r1
- adc $d1,$d1,$t1
- umulh $d2,$h0,$r1
-
- adds $d1,$d1,$t0
- mul $t0,$h1,$r0 // h1*r0
- adc $d2,$d2,xzr
- umulh $t1,$h1,$r0
-
- adds $d1,$d1,$t0
- mul $t0,$h2,$s1 // h2*5*r1
- adc $d2,$d2,$t1
- mul $t1,$h2,$r0 // h2*r0
-
- adds $d1,$d1,$t0
- adc $d2,$d2,$t1
-
- and $t0,$d2,#-4 // final reduction
- and $h2,$d2,#3
- add $t0,$t0,$d2,lsr#2
- adds $h0,$d0,$t0
- adcs $h1,$d1,xzr
- adc $h2,$h2,xzr
-
- cbnz $len,.Loop
-
- stp $h0,$h1,[$ctx] // store hash value
- stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26]
-
-.Lno_data:
- ret
-.size poly1305_blocks,.-poly1305_blocks
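
The "final reduction" block (here and in poly1305_mult below) is a partial reduction mod p = 2^130 - 5: anything at bit 130 and above is worth 5 at bit 0, since 2^130 is congruent to 5 (mod p). A C transcription, assuming a compiler with unsigned __int128:

#include <stdint.h>

/* h[0..2] hold the 130+ bit accumulator in 64-bit limbs. */
static void poly1305_partial_reduce(uint64_t h[3])
{
	unsigned __int128 t;
	uint64_t c5 = (h[2] & ~3ULL) + (h[2] >> 2);	/* 5 * (h[2] >> 2) */

	h[2] &= 3;
	t = (unsigned __int128)h[0] + c5;
	h[0] = (uint64_t)t;
	t = (unsigned __int128)h[1] + (uint64_t)(t >> 64);
	h[1] = (uint64_t)t;
	h[2] += (uint64_t)(t >> 64);
}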
-
-.type poly1305_emit,%function
-.align 5
-poly1305_emit:
-.Lpoly1305_emit:
- ldp $h0,$h1,[$ctx] // load hash base 2^64
- ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26]
- ldp $t0,$t1,[$nonce] // load nonce
-
-#ifdef __AARCH64EB__
- lsr $d0,$h0,#32
- mov w#$d1,w#$h0
- lsr $d2,$h1,#32
- mov w15,w#$h1
- lsr x16,$h2,#32
-#else
- mov w#$d0,w#$h0
- lsr $d1,$h0,#32
- mov w#$d2,w#$h1
- lsr x15,$h1,#32
- mov w16,w#$h2
-#endif
-
- add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
- lsr $d1,$d2,#12
- adds $d0,$d0,$d2,lsl#52
- add $d1,$d1,x15,lsl#14
- adc $d1,$d1,xzr
- lsr $d2,x16,#24
- adds $d1,$d1,x16,lsl#40
- adc $d2,$d2,xzr
-
- cmp $r0,#0 // is_base2_26?
- csel $h0,$h0,$d0,eq // choose between radixes
- csel $h1,$h1,$d1,eq
- csel $h2,$h2,$d2,eq
-
- adds $d0,$h0,#5 // compare to modulus
- adcs $d1,$h1,xzr
- adc $d2,$h2,xzr
-
- tst $d2,#-4 // see if it's carried/borrowed
-
- csel $h0,$h0,$d0,eq
- csel $h1,$h1,$d1,eq
-
-#ifdef __AARCH64EB__
- ror $t0,$t0,#32 // flip nonce words
- ror $t1,$t1,#32
-#endif
- adds $h0,$h0,$t0 // accumulate nonce
- adc $h1,$h1,$t1
-#ifdef __AARCH64EB__
- rev $h0,$h0 // flip output bytes
- rev $h1,$h1
-#endif
- stp $h0,$h1,[$mac] // write result
-
- ret
-.size poly1305_emit,.-poly1305_emit
-___
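
Both poly1305_blocks and poly1305_emit open with the same radix conversion, rebuilding 64-bit limbs from the five 26-bit limbs the NEON path stores. The pure-shift version below assumes fully carried limbs (each below 2^26); the assembly additionally propagates, via its adds/adc chain, the carries a partially reduced value can produce:

#include <stdint.h>

/* Repack five 26-bit limbs (radix 2^26) into three 64-bit limbs. */
static void base26_to_base64(const uint32_t h26[5], uint64_t h64[3])
{
	h64[0] = (uint64_t)h26[0] | ((uint64_t)h26[1] << 26) |
		 ((uint64_t)h26[2] << 52);
	h64[1] = ((uint64_t)h26[2] >> 12) | ((uint64_t)h26[3] << 14) |
		 ((uint64_t)h26[4] << 40);
	h64[2] = (uint64_t)h26[4] >> 24;
}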
-my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
-my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
-my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
-my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
-my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
-my ($T0,$T1,$MASK) = map("v$_",(29..31));
-
-my ($in2,$zeros)=("x16","x17");
-my $is_base2_26 = $zeros; # borrow
-
-$code.=<<___;
-.type poly1305_mult,%function
-.align 5
-poly1305_mult:
- mul $d0,$h0,$r0 // h0*r0
- umulh $d1,$h0,$r0
-
- mul $t0,$h1,$s1 // h1*5*r1
- umulh $t1,$h1,$s1
-
- adds $d0,$d0,$t0
- mul $t0,$h0,$r1 // h0*r1
- adc $d1,$d1,$t1
- umulh $d2,$h0,$r1
-
- adds $d1,$d1,$t0
- mul $t0,$h1,$r0 // h1*r0
- adc $d2,$d2,xzr
- umulh $t1,$h1,$r0
-
- adds $d1,$d1,$t0
- mul $t0,$h2,$s1 // h2*5*r1
- adc $d2,$d2,$t1
- mul $t1,$h2,$r0 // h2*r0
-
- adds $d1,$d1,$t0
- adc $d2,$d2,$t1
-
- and $t0,$d2,#-4 // final reduction
- and $h2,$d2,#3
- add $t0,$t0,$d2,lsr#2
- adds $h0,$d0,$t0
- adcs $h1,$d1,xzr
- adc $h2,$h2,xzr
-
- ret
-.size poly1305_mult,.-poly1305_mult
-
-.type poly1305_splat,%function
-.align 4
-poly1305_splat:
- and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x13,$h0,#26,#26
- extr x14,$h1,$h0,#52
- and x14,x14,#0x03ffffff
- ubfx x15,$h1,#14,#26
- extr x16,$h2,$h1,#40
-
- str w12,[$ctx,#16*0] // r0
- add w12,w13,w13,lsl#2 // r1*5
- str w13,[$ctx,#16*1] // r1
- add w13,w14,w14,lsl#2 // r2*5
- str w12,[$ctx,#16*2] // s1
- str w14,[$ctx,#16*3] // r2
- add w14,w15,w15,lsl#2 // r3*5
- str w13,[$ctx,#16*4] // s2
- str w15,[$ctx,#16*5] // r3
- add w15,w16,w16,lsl#2 // r4*5
- str w14,[$ctx,#16*6] // s3
- str w16,[$ctx,#16*7] // r4
- str w15,[$ctx,#16*8] // s4
-
- ret
-.size poly1305_splat,.-poly1305_splat
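
poly1305_splat performs the opposite conversion, and additionally stores 5*r_i alongside each limb for the lazy reduction in the NEON loop. The split itself, for a value whose top limb h2 is at most 3:

#include <stdint.h>

/* Split 64-bit limbs (h0, h1, low bits of h2) into five 26-bit limbs,
 * mirroring the and/ubfx/extr sequence above. */
static void base64_to_base26(uint64_t h0, uint64_t h1, uint64_t h2,
			     uint32_t out[5])
{
	out[0] = (uint32_t)(h0 & 0x3ffffff);
	out[1] = (uint32_t)((h0 >> 26) & 0x3ffffff);
	out[2] = (uint32_t)(((h1 << 12) | (h0 >> 52)) & 0x3ffffff);
	out[3] = (uint32_t)((h1 >> 14) & 0x3ffffff);
	out[4] = (uint32_t)((h2 << 24) | (h1 >> 40));
}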
-
-#ifdef __KERNEL__
-.globl poly1305_blocks_neon
-#endif
-.type poly1305_blocks_neon,%function
-.align 5
-poly1305_blocks_neon:
-.Lpoly1305_blocks_neon:
- ldr $is_base2_26,[$ctx,#24]
- cmp $len,#128
- b.lo .Lpoly1305_blocks
-
- .inst 0xd503233f // paciasp
- stp x29,x30,[sp,#-80]!
- add x29,sp,#0
-
- stp d8,d9,[sp,#16] // meet ABI requirements
- stp d10,d11,[sp,#32]
- stp d12,d13,[sp,#48]
- stp d14,d15,[sp,#64]
-
- cbz $is_base2_26,.Lbase2_64_neon
-
- ldp w10,w11,[$ctx] // load hash value base 2^26
- ldp w12,w13,[$ctx,#8]
- ldr w14,[$ctx,#16]
-
- tst $len,#31
- b.eq .Leven_neon
-
- ldp $r0,$r1,[$ctx,#32] // load key value
-
- add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
- lsr $h1,x12,#12
- adds $h0,$h0,x12,lsl#52
- add $h1,$h1,x13,lsl#14
- adc $h1,$h1,xzr
- lsr $h2,x14,#24
- adds $h1,$h1,x14,lsl#40
- adc $d2,$h2,xzr // can be partially reduced...
-
- ldp $d0,$d1,[$inp],#16 // load input
- sub $len,$len,#16
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
-
-#ifdef __AARCH64EB__
- rev $d0,$d0
- rev $d1,$d1
-#endif
- adds $h0,$h0,$d0 // accumulate input
- adcs $h1,$h1,$d1
- adc $h2,$h2,$padbit
-
- bl poly1305_mult
-
- and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x11,$h0,#26,#26
- extr x12,$h1,$h0,#52
- and x12,x12,#0x03ffffff
- ubfx x13,$h1,#14,#26
- extr x14,$h2,$h1,#40
-
- b .Leven_neon
-
-.align 4
-.Lbase2_64_neon:
- ldp $r0,$r1,[$ctx,#32] // load key value
-
- ldp $h0,$h1,[$ctx] // load hash value base 2^64
- ldr $h2,[$ctx,#16]
-
- tst $len,#31
- b.eq .Linit_neon
-
- ldp $d0,$d1,[$inp],#16 // load input
- sub $len,$len,#16
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
-#ifdef __AARCH64EB__
- rev $d0,$d0
- rev $d1,$d1
-#endif
- adds $h0,$h0,$d0 // accumulate input
- adcs $h1,$h1,$d1
- adc $h2,$h2,$padbit
-
- bl poly1305_mult
-
-.Linit_neon:
- ldr w17,[$ctx,#48] // first table element
- and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x11,$h0,#26,#26
- extr x12,$h1,$h0,#52
- and x12,x12,#0x03ffffff
- ubfx x13,$h1,#14,#26
- extr x14,$h2,$h1,#40
-
- cmp w17,#-1 // is value impossible?
- b.ne .Leven_neon
-
- fmov ${H0},x10
- fmov ${H1},x11
- fmov ${H2},x12
- fmov ${H3},x13
- fmov ${H4},x14
-
- ////////////////////////////////// initialize r^n table
- mov $h0,$r0 // r^1
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
- mov $h1,$r1
- mov $h2,xzr
- add $ctx,$ctx,#48+12
- bl poly1305_splat
-
- bl poly1305_mult // r^2
- sub $ctx,$ctx,#4
- bl poly1305_splat
-
- bl poly1305_mult // r^3
- sub $ctx,$ctx,#4
- bl poly1305_splat
-
- bl poly1305_mult // r^4
- sub $ctx,$ctx,#4
- bl poly1305_splat
- sub $ctx,$ctx,#48 // restore original $ctx
- b .Ldo_neon
-
-.align 4
-.Leven_neon:
- fmov ${H0},x10
- fmov ${H1},x11
- fmov ${H2},x12
- fmov ${H3},x13
- fmov ${H4},x14
-
-.Ldo_neon:
- ldp x8,x12,[$inp,#32] // inp[2:3]
- subs $len,$len,#64
- ldp x9,x13,[$inp,#48]
- add $in2,$inp,#96
- adrp $zeros,.Lzeros
- add $zeros,$zeros,#:lo12:.Lzeros
-
- lsl $padbit,$padbit,#24
- add x15,$ctx,#48
-
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- and x5,x9,#0x03ffffff
- ubfx x6,x8,#26,#26
- ubfx x7,x9,#26,#26
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- extr x8,x12,x8,#52
- extr x9,x13,x9,#52
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- fmov $IN23_0,x4
- and x8,x8,#0x03ffffff
- and x9,x9,#0x03ffffff
- ubfx x10,x12,#14,#26
- ubfx x11,x13,#14,#26
- add x12,$padbit,x12,lsr#40
- add x13,$padbit,x13,lsr#40
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- fmov $IN23_1,x6
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- fmov $IN23_2,x8
- fmov $IN23_3,x10
- fmov $IN23_4,x12
-
- ldp x8,x12,[$inp],#16 // inp[0:1]
- ldp x9,x13,[$inp],#48
-
- ld1 {$R0,$R1,$S1,$R2},[x15],#64
- ld1 {$S2,$R3,$S3,$R4},[x15],#64
- ld1 {$S4},[x15]
-
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- and x5,x9,#0x03ffffff
- ubfx x6,x8,#26,#26
- ubfx x7,x9,#26,#26
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- extr x8,x12,x8,#52
- extr x9,x13,x9,#52
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- fmov $IN01_0,x4
- and x8,x8,#0x03ffffff
- and x9,x9,#0x03ffffff
- ubfx x10,x12,#14,#26
- ubfx x11,x13,#14,#26
- add x12,$padbit,x12,lsr#40
- add x13,$padbit,x13,lsr#40
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- fmov $IN01_1,x6
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- movi $MASK.2d,#-1
- fmov $IN01_2,x8
- fmov $IN01_3,x10
- fmov $IN01_4,x12
- ushr $MASK.2d,$MASK.2d,#38
-
- b.ls .Lskip_loop
-
-.align 4
-.Loop_neon:
- ////////////////////////////////////////////////////////////////
- // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
- // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
- // \___________________/
- // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
- // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
- // \___________________/ \____________________/
- //
- // Note that we start with inp[2:3]*r^2. This is because it
- // doesn't depend on the reduction from the previous iteration.
- ////////////////////////////////////////////////////////////////
- // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
- // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
- // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
- // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
- // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
-
- subs $len,$len,#64
- umull $ACC4,$IN23_0,${R4}[2]
- csel $in2,$zeros,$in2,lo
- umull $ACC3,$IN23_0,${R3}[2]
- umull $ACC2,$IN23_0,${R2}[2]
- ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
- umull $ACC1,$IN23_0,${R1}[2]
- ldp x9,x13,[$in2],#48
- umull $ACC0,$IN23_0,${R0}[2]
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
-
- umlal $ACC4,$IN23_1,${R3}[2]
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- umlal $ACC3,$IN23_1,${R2}[2]
- and x5,x9,#0x03ffffff
- umlal $ACC2,$IN23_1,${R1}[2]
- ubfx x6,x8,#26,#26
- umlal $ACC1,$IN23_1,${R0}[2]
- ubfx x7,x9,#26,#26
- umlal $ACC0,$IN23_1,${S4}[2]
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
-
- umlal $ACC4,$IN23_2,${R2}[2]
- extr x8,x12,x8,#52
- umlal $ACC3,$IN23_2,${R1}[2]
- extr x9,x13,x9,#52
- umlal $ACC2,$IN23_2,${R0}[2]
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- umlal $ACC1,$IN23_2,${S4}[2]
- fmov $IN23_0,x4
- umlal $ACC0,$IN23_2,${S3}[2]
- and x8,x8,#0x03ffffff
-
- umlal $ACC4,$IN23_3,${R1}[2]
- and x9,x9,#0x03ffffff
- umlal $ACC3,$IN23_3,${R0}[2]
- ubfx x10,x12,#14,#26
- umlal $ACC2,$IN23_3,${S4}[2]
- ubfx x11,x13,#14,#26
- umlal $ACC1,$IN23_3,${S3}[2]
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- umlal $ACC0,$IN23_3,${S2}[2]
- fmov $IN23_1,x6
-
- add $IN01_2,$IN01_2,$H2
- add x12,$padbit,x12,lsr#40
- umlal $ACC4,$IN23_4,${R0}[2]
- add x13,$padbit,x13,lsr#40
- umlal $ACC3,$IN23_4,${S4}[2]
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- umlal $ACC2,$IN23_4,${S3}[2]
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- umlal $ACC1,$IN23_4,${S2}[2]
- fmov $IN23_2,x8
- umlal $ACC0,$IN23_4,${S1}[2]
- fmov $IN23_3,x10
-
- ////////////////////////////////////////////////////////////////
- // (hash+inp[0:1])*r^4 and accumulate
-
- add $IN01_0,$IN01_0,$H0
- fmov $IN23_4,x12
- umlal $ACC3,$IN01_2,${R1}[0]
- ldp x8,x12,[$inp],#16 // inp[0:1]
- umlal $ACC0,$IN01_2,${S3}[0]
- ldp x9,x13,[$inp],#48
- umlal $ACC4,$IN01_2,${R2}[0]
- umlal $ACC1,$IN01_2,${S4}[0]
- umlal $ACC2,$IN01_2,${R0}[0]
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
-
- add $IN01_1,$IN01_1,$H1
- umlal $ACC3,$IN01_0,${R3}[0]
- umlal $ACC4,$IN01_0,${R4}[0]
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- umlal $ACC2,$IN01_0,${R2}[0]
- and x5,x9,#0x03ffffff
- umlal $ACC0,$IN01_0,${R0}[0]
- ubfx x6,x8,#26,#26
- umlal $ACC1,$IN01_0,${R1}[0]
- ubfx x7,x9,#26,#26
-
- add $IN01_3,$IN01_3,$H3
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- umlal $ACC3,$IN01_1,${R2}[0]
- extr x8,x12,x8,#52
- umlal $ACC4,$IN01_1,${R3}[0]
- extr x9,x13,x9,#52
- umlal $ACC0,$IN01_1,${S4}[0]
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- umlal $ACC2,$IN01_1,${R1}[0]
- fmov $IN01_0,x4
- umlal $ACC1,$IN01_1,${R0}[0]
- and x8,x8,#0x03ffffff
-
- add $IN01_4,$IN01_4,$H4
- and x9,x9,#0x03ffffff
- umlal $ACC3,$IN01_3,${R0}[0]
- ubfx x10,x12,#14,#26
- umlal $ACC0,$IN01_3,${S2}[0]
- ubfx x11,x13,#14,#26
- umlal $ACC4,$IN01_3,${R1}[0]
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- umlal $ACC1,$IN01_3,${S3}[0]
- fmov $IN01_1,x6
- umlal $ACC2,$IN01_3,${S4}[0]
- add x12,$padbit,x12,lsr#40
-
- umlal $ACC3,$IN01_4,${S4}[0]
- add x13,$padbit,x13,lsr#40
- umlal $ACC0,$IN01_4,${S1}[0]
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- umlal $ACC4,$IN01_4,${R0}[0]
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- umlal $ACC1,$IN01_4,${S2}[0]
- fmov $IN01_2,x8
- umlal $ACC2,$IN01_4,${S3}[0]
- fmov $IN01_3,x10
- fmov $IN01_4,x12
-
- /////////////////////////////////////////////////////////////////
- // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- // and P. Schwabe
- //
- // [see discussion in poly1305-armv4 module]
-
- ushr $T0.2d,$ACC3,#26
- xtn $H3,$ACC3
- ushr $T1.2d,$ACC0,#26
- and $ACC0,$ACC0,$MASK.2d
- add $ACC4,$ACC4,$T0.2d // h3 -> h4
- bic $H3,#0xfc,lsl#24 // &=0x03ffffff
- add $ACC1,$ACC1,$T1.2d // h0 -> h1
-
- ushr $T0.2d,$ACC4,#26
- xtn $H4,$ACC4
- ushr $T1.2d,$ACC1,#26
- xtn $H1,$ACC1
- bic $H4,#0xfc,lsl#24
- add $ACC2,$ACC2,$T1.2d // h1 -> h2
-
- add $ACC0,$ACC0,$T0.2d
- shl $T0.2d,$T0.2d,#2
- shrn $T1.2s,$ACC2,#26
- xtn $H2,$ACC2
- add $ACC0,$ACC0,$T0.2d // h4 -> h0
- bic $H1,#0xfc,lsl#24
- add $H3,$H3,$T1.2s // h2 -> h3
- bic $H2,#0xfc,lsl#24
-
- shrn $T0.2s,$ACC0,#26
- xtn $H0,$ACC0
- ushr $T1.2s,$H3,#26
- bic $H3,#0xfc,lsl#24
- bic $H0,#0xfc,lsl#24
- add $H1,$H1,$T0.2s // h0 -> h1
- add $H4,$H4,$T1.2s // h3 -> h4
-
- b.hi .Loop_neon
-
-.Lskip_loop:
- dup $IN23_2,${IN23_2}[0]
- add $IN01_2,$IN01_2,$H2
-
- ////////////////////////////////////////////////////////////////
- // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
- adds $len,$len,#32
- b.ne .Long_tail
-
- dup $IN23_2,${IN01_2}[0]
- add $IN23_0,$IN01_0,$H0
- add $IN23_3,$IN01_3,$H3
- add $IN23_1,$IN01_1,$H1
- add $IN23_4,$IN01_4,$H4
-
-.Long_tail:
- dup $IN23_0,${IN23_0}[0]
- umull2 $ACC0,$IN23_2,${S3}
- umull2 $ACC3,$IN23_2,${R1}
- umull2 $ACC4,$IN23_2,${R2}
- umull2 $ACC2,$IN23_2,${R0}
- umull2 $ACC1,$IN23_2,${S4}
-
- dup $IN23_1,${IN23_1}[0]
- umlal2 $ACC0,$IN23_0,${R0}
- umlal2 $ACC2,$IN23_0,${R2}
- umlal2 $ACC3,$IN23_0,${R3}
- umlal2 $ACC4,$IN23_0,${R4}
- umlal2 $ACC1,$IN23_0,${R1}
-
- dup $IN23_3,${IN23_3}[0]
- umlal2 $ACC0,$IN23_1,${S4}
- umlal2 $ACC3,$IN23_1,${R2}
- umlal2 $ACC2,$IN23_1,${R1}
- umlal2 $ACC4,$IN23_1,${R3}
- umlal2 $ACC1,$IN23_1,${R0}
-
- dup $IN23_4,${IN23_4}[0]
- umlal2 $ACC3,$IN23_3,${R0}
- umlal2 $ACC4,$IN23_3,${R1}
- umlal2 $ACC0,$IN23_3,${S2}
- umlal2 $ACC1,$IN23_3,${S3}
- umlal2 $ACC2,$IN23_3,${S4}
-
- umlal2 $ACC3,$IN23_4,${S4}
- umlal2 $ACC0,$IN23_4,${S1}
- umlal2 $ACC4,$IN23_4,${R0}
- umlal2 $ACC1,$IN23_4,${S2}
- umlal2 $ACC2,$IN23_4,${S3}
-
- b.eq .Lshort_tail
-
- ////////////////////////////////////////////////////////////////
- // (hash+inp[0:1])*r^4:r^3 and accumulate
-
- add $IN01_0,$IN01_0,$H0
- umlal $ACC3,$IN01_2,${R1}
- umlal $ACC0,$IN01_2,${S3}
- umlal $ACC4,$IN01_2,${R2}
- umlal $ACC1,$IN01_2,${S4}
- umlal $ACC2,$IN01_2,${R0}
-
- add $IN01_1,$IN01_1,$H1
- umlal $ACC3,$IN01_0,${R3}
- umlal $ACC0,$IN01_0,${R0}
- umlal $ACC4,$IN01_0,${R4}
- umlal $ACC1,$IN01_0,${R1}
- umlal $ACC2,$IN01_0,${R2}
-
- add $IN01_3,$IN01_3,$H3
- umlal $ACC3,$IN01_1,${R2}
- umlal $ACC0,$IN01_1,${S4}
- umlal $ACC4,$IN01_1,${R3}
- umlal $ACC1,$IN01_1,${R0}
- umlal $ACC2,$IN01_1,${R1}
-
- add $IN01_4,$IN01_4,$H4
- umlal $ACC3,$IN01_3,${R0}
- umlal $ACC0,$IN01_3,${S2}
- umlal $ACC4,$IN01_3,${R1}
- umlal $ACC1,$IN01_3,${S3}
- umlal $ACC2,$IN01_3,${S4}
-
- umlal $ACC3,$IN01_4,${S4}
- umlal $ACC0,$IN01_4,${S1}
- umlal $ACC4,$IN01_4,${R0}
- umlal $ACC1,$IN01_4,${S2}
- umlal $ACC2,$IN01_4,${S3}
-
-.Lshort_tail:
- ////////////////////////////////////////////////////////////////
- // horizontal add
-
- addp $ACC3,$ACC3,$ACC3
- ldp d8,d9,[sp,#16] // meet ABI requirements
- addp $ACC0,$ACC0,$ACC0
- ldp d10,d11,[sp,#32]
- addp $ACC4,$ACC4,$ACC4
- ldp d12,d13,[sp,#48]
- addp $ACC1,$ACC1,$ACC1
- ldp d14,d15,[sp,#64]
- addp $ACC2,$ACC2,$ACC2
- ldr x30,[sp,#8]
-
- ////////////////////////////////////////////////////////////////
- // lazy reduction, but without narrowing
-
- ushr $T0.2d,$ACC3,#26
- and $ACC3,$ACC3,$MASK.2d
- ushr $T1.2d,$ACC0,#26
- and $ACC0,$ACC0,$MASK.2d
-
- add $ACC4,$ACC4,$T0.2d // h3 -> h4
- add $ACC1,$ACC1,$T1.2d // h0 -> h1
-
- ushr $T0.2d,$ACC4,#26
- and $ACC4,$ACC4,$MASK.2d
- ushr $T1.2d,$ACC1,#26
- and $ACC1,$ACC1,$MASK.2d
- add $ACC2,$ACC2,$T1.2d // h1 -> h2
-
- add $ACC0,$ACC0,$T0.2d
- shl $T0.2d,$T0.2d,#2
- ushr $T1.2d,$ACC2,#26
- and $ACC2,$ACC2,$MASK.2d
- add $ACC0,$ACC0,$T0.2d // h4 -> h0
- add $ACC3,$ACC3,$T1.2d // h2 -> h3
-
- ushr $T0.2d,$ACC0,#26
- and $ACC0,$ACC0,$MASK.2d
- ushr $T1.2d,$ACC3,#26
- and $ACC3,$ACC3,$MASK.2d
- add $ACC1,$ACC1,$T0.2d // h0 -> h1
- add $ACC4,$ACC4,$T1.2d // h3 -> h4
-
- ////////////////////////////////////////////////////////////////
- // write the result, can be partially reduced
-
- st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
- mov x4,#1
- st1 {$ACC4}[0],[$ctx]
- str x4,[$ctx,#8] // set is_base2_26
-
- ldr x29,[sp],#80
- .inst 0xd50323bf // autiasp
- ret
-.size poly1305_blocks_neon,.-poly1305_blocks_neon
-
-.pushsection .rodata
-.align 5
-.Lzeros:
-.long 0,0,0,0,0,0,0,0
-.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
-.popsection
-
-.align 2
-#if !defined(__KERNEL__) && !defined(_WIN64)
-.comm OPENSSL_armcap_P,4,4
-.hidden OPENSSL_armcap_P
-#endif
-___
-
-foreach (split("\n",$code)) {
- s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
- s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
- (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
- (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
- (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
- (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
- (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
-
- s/\.[124]([sd])\[/.$1\[/;
- s/w#x([0-9]+)/w$1/g;
-
- print $_,"\n";
-}
-close STDOUT;
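
For orientation before the glue code below: the NEON loop above keeps the
130-bit accumulator in five 26-bit limbs (two blocks interleaved across
vector lanes) and lets limbs grow past 26 bits between iterations, sweeping
carries only lazily, per the Bernstein/Schwabe reference in the comments.
Here is a stand-alone C sketch of the base 2^64 -> 2^26 split and one carry
sweep; the function names are invented for illustration, and the carry out
of h4 feeds back into h0 multiplied by 5 because 2^130 is congruent to 5
modulo 2^130 - 5:

/*
 * Sketch only: scalar model of the limb handling in the NEON code above.
 * limbs_from_u64x2() and lazy_reduce() are invented names, not kernel or
 * OpenSSL APIs.
 */
#include <stdint.h>
#include <stdio.h>

/* base 2^64 -> base 2^26, mirroring the and/ubfx/extr sequence above */
static void limbs_from_u64x2(uint64_t h[5], uint64_t lo, uint64_t hi)
{
	h[0] = lo & 0x03ffffff;
	h[1] = (lo >> 26) & 0x03ffffff;
	h[2] = ((lo >> 52) | (hi << 12)) & 0x03ffffff;
	h[3] = (hi >> 14) & 0x03ffffff;
	h[4] = hi >> 40;		/* the pad bit is added on top */
}

/* one lazy-reduction sweep, same carry order as the vector code */
static void lazy_reduce(uint64_t h[5])
{
	uint64_t c;

	c = h[3] >> 26; h[3] &= 0x03ffffff; h[4] += c;		/* h3 -> h4 */
	c = h[0] >> 26; h[0] &= 0x03ffffff; h[1] += c;		/* h0 -> h1 */
	c = h[4] >> 26; h[4] &= 0x03ffffff; h[0] += c * 5;	/* h4 -> h0 */
	c = h[1] >> 26; h[1] &= 0x03ffffff; h[2] += c;		/* h1 -> h2 */
	c = h[2] >> 26; h[2] &= 0x03ffffff; h[3] += c;		/* h2 -> h3 */
	c = h[0] >> 26; h[0] &= 0x03ffffff; h[1] += c;		/* h0 -> h1 */
	c = h[3] >> 26; h[3] &= 0x03ffffff; h[4] += c;		/* h3 -> h4 */
}

int main(void)
{
	uint64_t h[5];
	int i;

	limbs_from_u64x2(h, 0x0123456789abcdefULL, 0x0fedcba987654321ULL);
	lazy_reduce(h);
	for (i = 0; i < 5; i++)
		printf("h%d = 0x%llx\n", i, (unsigned long long)h[i]);
	return 0;
}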
diff --git a/arch/arm64/lib/crypto/poly1305-glue.c b/arch/arm64/lib/crypto/poly1305-glue.c
deleted file mode 100644
index 6a661cf04821..000000000000
--- a/arch/arm64/lib/crypto/poly1305-glue.c
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
- *
- * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <crypto/internal/poly1305.h>
-#include <linux/cpufeature.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/unaligned.h>
-
-asmlinkage void poly1305_block_init_arch(
- struct poly1305_block_state *state,
- const u8 raw_key[POLY1305_BLOCK_SIZE]);
-EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
-asmlinkage void poly1305_blocks(struct poly1305_block_state *state,
- const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state,
- const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_emit_arch(const struct poly1305_state *state,
- u8 digest[POLY1305_DIGEST_SIZE],
- const u32 nonce[4]);
-EXPORT_SYMBOL_GPL(poly1305_emit_arch);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-
-void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
- unsigned int len, u32 padbit)
-{
- len = round_down(len, POLY1305_BLOCK_SIZE);
- if (static_branch_likely(&have_neon)) {
- do {
- unsigned int todo = min_t(unsigned int, len, SZ_4K);
-
- kernel_neon_begin();
- poly1305_blocks_neon(state, src, todo, 1);
- kernel_neon_end();
-
- len -= todo;
- src += todo;
- } while (len);
- } else
- poly1305_blocks(state, src, len, 1);
-}
-EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
-
-bool poly1305_is_arch_optimized(void)
-{
-	/* We can always use at least the ARM64 scalar implementation. */
- return true;
-}
-EXPORT_SYMBOL(poly1305_is_arch_optimized);
-
-static int __init neon_poly1305_mod_init(void)
-{
- if (cpu_have_named_feature(ASIMD))
- static_branch_enable(&have_neon);
- return 0;
-}
-subsys_initcall(neon_poly1305_mod_init);
-
-static void __exit neon_poly1305_mod_exit(void)
-{
-}
-module_exit(neon_poly1305_mod_exit);
-
-MODULE_DESCRIPTION("Poly1305 authenticator (ARM64 optimized)");
-MODULE_LICENSE("GPL v2");
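
A note on the chunking in the glue code just removed:
kernel_neon_begin()/kernel_neon_end() sections run with preemption disabled,
so the NEON path caps each section at SZ_4K of input to bound scheduling
latency, while the scalar fallback takes the whole (block-rounded) length in
one call. A stand-alone sketch of that pattern; begin(), end() and
process_chunk() are placeholders standing in for the kernel primitives and
the NEON routine:

/* Sketch only: bound the length of each non-preemptible section. */
#include <stddef.h>

#define NEON_CHUNK 4096			/* mirrors SZ_4K in the glue */

static void begin(void) { /* stands in for kernel_neon_begin() */ }
static void end(void) { /* stands in for kernel_neon_end() */ }
static void process_chunk(const unsigned char *src, size_t len)
{
	(void)src; (void)len;		/* stands in for the NEON routine */
}

static void process_all(const unsigned char *src, size_t len)
{
	while (len) {
		size_t todo = len < NEON_CHUNK ? len : NEON_CHUNK;

		begin();
		process_chunk(src, todo);
		end();

		src += todo;
		len -= todo;
	}
}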
diff --git a/arch/arm64/lib/crypto/sha2-armv8.pl b/arch/arm64/lib/crypto/sha2-armv8.pl
deleted file mode 100644
index 4aebd20c498b..000000000000
--- a/arch/arm64/lib/crypto/sha2-armv8.pl
+++ /dev/null
@@ -1,786 +0,0 @@
-#! /usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0
-
-# This code is taken from the OpenSSL project but the author (Andy Polyakov)
-# has relicensed it under the GPLv2. Therefore this program is free software;
-# you can redistribute it and/or modify it under the terms of the GNU General
-# Public License version 2 as published by the Free Software Foundation.
-#
-# The original headers, including the original license headers, are
-# included below for completeness.
-
-# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
-#
-# Licensed under the OpenSSL license (the "License"). You may not use
-# this file except in compliance with the License. You can obtain a copy
-# in the file LICENSE in the source distribution or at
-# https://www.openssl.org/source/license.html
-
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# SHA256/512 for ARMv8.
-#
-# Performance in cycles per processed byte and improvement coefficient
-# over code generated with "default" compiler:
-#
-#		SHA256-hw	SHA256(*)	SHA512
-# Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
-# Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
-# Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
-# Denver	2.01		10.5 (+26%)	6.70 (+8%)
-# X-Gene			20.0 (+100%)	12.8 (+300%(***))
-# Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
-#
-# (*) Software SHA256 results are of lesser relevance, presented
-# mostly for informational purposes.
-# (**) The result is a trade-off: it's possible to improve it by
-# 10% (or by 1 cycle per round), but at the cost of 20% loss
-# on Cortex-A53 (or by 4 cycles per round).
-# (***)	Super-impressive coefficients over gcc-generated code are an
-#	indication of some compiler "pathology"; most notably, code
-#	generated with -mgeneral-regs-only is significantly faster,
-#	and the gap is then only 40-90%.
-#
-# October 2016.
-#
-# Originally it was reckoned that it made no sense to implement a NEON
-# version of SHA256 for 64-bit processors, because the performance
-# improvement observed on the most widespread Cortex-A5x processors was
-# marginal on Cortex-A53 and only ~10% on Cortex-A57. But it was then
-# observed that 32-bit NEON SHA256 performs significantly better than
-# the 64-bit scalar version on *some* of the more recent processors. As
-# a result, a 64-bit NEON version of SHA256 was added to provide the
-# best all-round performance; for example, it executes ~30% faster on
-# X-Gene and Mongoose. [For reference, a NEON version of SHA512 is bound
-# to deliver much less improvement, likely *negative* on Cortex-A5x,
-# which is why NEON support is limited to SHA256.]
-
-$output=pop;
-$flavour=pop;
-
-if ($flavour && $flavour ne "void") {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
- die "can't locate arm-xlate.pl";
-
- open OUT,"| \"$^X\" $xlate $flavour $output";
- *STDOUT=*OUT;
-} else {
- open STDOUT,">$output";
-}
-
-if ($output =~ /512/) {
- $BITS=512;
- $SZ=8;
- @Sigma0=(28,34,39);
- @Sigma1=(14,18,41);
- @sigma0=(1, 8, 7);
- @sigma1=(19,61, 6);
- $rounds=80;
- $reg_t="x";
-} else {
- $BITS=256;
- $SZ=4;
- @Sigma0=( 2,13,22);
- @Sigma1=( 6,11,25);
- @sigma0=( 7,18, 3);
- @sigma1=(17,19,10);
- $rounds=64;
- $reg_t="w";
-}
-
-$func="sha${BITS}_blocks_arch";
-
-($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
-
-@X=map("$reg_t$_",(3..15,0..2));
-@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
-($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
-
-sub BODY_00_xx {
-my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
-my $j=($i+1)&15;
-my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
- $T0=@X[$i+3] if ($i<11);
-
-$code.=<<___ if ($i<16);
-#ifndef __AARCH64EB__
- rev @X[$i],@X[$i] // $i
-#endif
-___
-$code.=<<___ if ($i<13 && ($i&1));
- ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ
-___
-$code.=<<___ if ($i==13);
- ldp @X[14],@X[15],[$inp]
-___
-$code.=<<___ if ($i>=14);
- ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
-___
-$code.=<<___ if ($i>0 && $i<16);
- add $a,$a,$t1 // h+=Sigma0(a)
-___
-$code.=<<___ if ($i>=11);
- str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
-___
-# While ARMv8 specifies a merged rotate-and-logical operation such as
-# 'eor x,y,z,ror#n', it was found to negatively affect performance
-# on Apple A7. The reason seems to be that it requires even 'y' to
-# be available earlier. This means that such a merged instruction is
-# not necessarily the best choice on the critical path... On the other
-# hand, Cortex-A5x handles merged instructions much better than a
-# disjoint rotate and logical... See the (**) footnote above.
-$code.=<<___ if ($i<15);
- ror $t0,$e,#$Sigma1[0]
- add $h,$h,$t2 // h+=K[i]
- eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
- and $t1,$f,$e
- bic $t2,$g,$e
- add $h,$h,@X[$i&15] // h+=X[i]
- orr $t1,$t1,$t2 // Ch(e,f,g)
- eor $t2,$a,$b // a^b, b^c in next round
- eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e)
- ror $T0,$a,#$Sigma0[0]
- add $h,$h,$t1 // h+=Ch(e,f,g)
- eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
- add $h,$h,$t0 // h+=Sigma1(e)
- and $t3,$t3,$t2 // (b^c)&=(a^b)
- add $d,$d,$h // d+=h
- eor $t3,$t3,$b // Maj(a,b,c)
- eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a)
- add $h,$h,$t3 // h+=Maj(a,b,c)
- ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
- //add $h,$h,$t1 // h+=Sigma0(a)
-___
-$code.=<<___ if ($i>=15);
- ror $t0,$e,#$Sigma1[0]
- add $h,$h,$t2 // h+=K[i]
- ror $T1,@X[($j+1)&15],#$sigma0[0]
- and $t1,$f,$e
- ror $T2,@X[($j+14)&15],#$sigma1[0]
- bic $t2,$g,$e
- ror $T0,$a,#$Sigma0[0]
- add $h,$h,@X[$i&15] // h+=X[i]
- eor $t0,$t0,$e,ror#$Sigma1[1]
- eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
- orr $t1,$t1,$t2 // Ch(e,f,g)
- eor $t2,$a,$b // a^b, b^c in next round
- eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e)
- eor $T0,$T0,$a,ror#$Sigma0[1]
- add $h,$h,$t1 // h+=Ch(e,f,g)
- and $t3,$t3,$t2 // (b^c)&=(a^b)
- eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
- eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1])
- add $h,$h,$t0 // h+=Sigma1(e)
- eor $t3,$t3,$b // Maj(a,b,c)
- eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a)
- eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14])
- add @X[$j],@X[$j],@X[($j+9)&15]
- add $d,$d,$h // d+=h
- add $h,$h,$t3 // h+=Maj(a,b,c)
- ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
- add @X[$j],@X[$j],$T1
- add $h,$h,$t1 // h+=Sigma0(a)
- add @X[$j],@X[$j],$T2
-___
- ($t2,$t3)=($t3,$t2);
-}
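
For reference, the round that BODY_00_xx interleaves with the loads and
stores above can be written as the following stand-alone C model (sketch
only; Sigma0 = (2,13,22) and Sigma1 = (6,11,25) as selected above for the
SHA-256 case). The asm builds Ch from and/bic/orr and evaluates Maj
incrementally across two rounds via the (b^c)&(a^b) trick, but the result
is the same:

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n)
{
	return (x >> n) | (x << (32 - n));
}

static void sha256_round(uint32_t v[8], uint32_t Xi, uint32_t Ki)
{
	uint32_t a = v[0], b = v[1], c = v[2], d = v[3];
	uint32_t e = v[4], f = v[5], g = v[6], h = v[7];
	uint32_t S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
	uint32_t Ch  = (e & f) | (~e & g);
	uint32_t S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
	uint32_t Maj = (a & b) ^ (a & c) ^ (b & c);
	uint32_t T1  = h + S1 + Ch + Ki + Xi;
	uint32_t T2  = S0 + Maj;

	v[7] = g; v[6] = f; v[5] = e; v[4] = d + T1;
	v[3] = c; v[2] = b; v[1] = a; v[0] = T1 + T2;
}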
-
-$code.=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#endif
-
-.text
-
-.extern OPENSSL_armcap_P
-.globl $func
-.type $func,%function
-.align 6
-$func:
-___
-$code.=<<___ if ($SZ==4);
-#ifndef __KERNEL__
-# ifdef __ILP32__
- ldrsw x16,.LOPENSSL_armcap_P
-# else
- ldr x16,.LOPENSSL_armcap_P
-# endif
- adr x17,.LOPENSSL_armcap_P
- add x16,x16,x17
- ldr w16,[x16]
- tst w16,#ARMV8_SHA256
- b.ne .Lv8_entry
- tst w16,#ARMV7_NEON
- b.ne .Lneon_entry
-#endif
-___
-$code.=<<___;
- stp x29,x30,[sp,#-128]!
- add x29,sp,#0
-
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub sp,sp,#4*$SZ
-
- ldp $A,$B,[$ctx] // load context
- ldp $C,$D,[$ctx,#2*$SZ]
- ldp $E,$F,[$ctx,#4*$SZ]
- add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
- ldp $G,$H,[$ctx,#6*$SZ]
- adr $Ktbl,.LK$BITS
- stp $ctx,$num,[x29,#96]
-
-.Loop:
- ldp @X[0],@X[1],[$inp],#2*$SZ
- ldr $t2,[$Ktbl],#$SZ // *K++
- eor $t3,$B,$C // magic seed
- str $inp,[x29,#112]
-___
-for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
-$code.=".Loop_16_xx:\n";
-for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
-$code.=<<___;
- cbnz $t2,.Loop_16_xx
-
- ldp $ctx,$num,[x29,#96]
- ldr $inp,[x29,#112]
- sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind
-
- ldp @X[0],@X[1],[$ctx]
- ldp @X[2],@X[3],[$ctx,#2*$SZ]
- add $inp,$inp,#14*$SZ // advance input pointer
- ldp @X[4],@X[5],[$ctx,#4*$SZ]
- add $A,$A,@X[0]
- ldp @X[6],@X[7],[$ctx,#6*$SZ]
- add $B,$B,@X[1]
- add $C,$C,@X[2]
- add $D,$D,@X[3]
- stp $A,$B,[$ctx]
- add $E,$E,@X[4]
- add $F,$F,@X[5]
- stp $C,$D,[$ctx,#2*$SZ]
- add $G,$G,@X[6]
- add $H,$H,@X[7]
- cmp $inp,$num
- stp $E,$F,[$ctx,#4*$SZ]
- stp $G,$H,[$ctx,#6*$SZ]
- b.ne .Loop
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#4*$SZ
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#128
- ret
-.size $func,.-$func
-
-.align 6
-.type .LK$BITS,%object
-.LK$BITS:
-___
-$code.=<<___ if ($SZ==8);
- .quad 0x428a2f98d728ae22,0x7137449123ef65cd
- .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
- .quad 0x3956c25bf348b538,0x59f111f1b605d019
- .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
- .quad 0xd807aa98a3030242,0x12835b0145706fbe
- .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
- .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
- .quad 0x9bdc06a725c71235,0xc19bf174cf692694
- .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
- .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
- .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
- .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
- .quad 0x983e5152ee66dfab,0xa831c66d2db43210
- .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
- .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
- .quad 0x06ca6351e003826f,0x142929670a0e6e70
- .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
- .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
- .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
- .quad 0x81c2c92e47edaee6,0x92722c851482353b
- .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
- .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
- .quad 0xd192e819d6ef5218,0xd69906245565a910
- .quad 0xf40e35855771202a,0x106aa07032bbd1b8
- .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
- .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
- .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
- .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
- .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
- .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
- .quad 0x90befffa23631e28,0xa4506cebde82bde9
- .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
- .quad 0xca273eceea26619c,0xd186b8c721c0c207
- .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
- .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
- .quad 0x113f9804bef90dae,0x1b710b35131c471b
- .quad 0x28db77f523047d84,0x32caab7b40c72493
- .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
- .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
- .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
- .quad 0 // terminator
-___
-$code.=<<___ if ($SZ==4);
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
- .long 0 //terminator
-___
-$code.=<<___;
-.size .LK$BITS,.-.LK$BITS
-#ifndef __KERNEL__
-.align 3
-.LOPENSSL_armcap_P:
-# ifdef __ILP32__
- .long OPENSSL_armcap_P-.
-# else
- .quad OPENSSL_armcap_P-.
-# endif
-#endif
-.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
-.align 2
-___
-
-if ($SZ==4) {
-my $Ktbl="x3";
-
-my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
-my @MSG=map("v$_.16b",(4..7));
-my ($W0,$W1)=("v16.4s","v17.4s");
-my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
-
-$code.=<<___;
-#ifndef __KERNEL__
-.type sha256_block_armv8,%function
-.align 6
-sha256_block_armv8:
-.Lv8_entry:
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
-
- ld1.32 {$ABCD,$EFGH},[$ctx]
- adr $Ktbl,.LK256
-
-.Loop_hw:
- ld1 {@MSG[0]-@MSG[3]},[$inp],#64
- sub $num,$num,#1
- ld1.32 {$W0},[$Ktbl],#16
- rev32 @MSG[0],@MSG[0]
- rev32 @MSG[1],@MSG[1]
- rev32 @MSG[2],@MSG[2]
- rev32 @MSG[3],@MSG[3]
- orr $ABCD_SAVE,$ABCD,$ABCD // offload
- orr $EFGH_SAVE,$EFGH,$EFGH
-___
-for($i=0;$i<12;$i++) {
-$code.=<<___;
- ld1.32 {$W1},[$Ktbl],#16
- add.i32 $W0,$W0,@MSG[0]
- sha256su0 @MSG[0],@MSG[1]
- orr $abcd,$ABCD,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
- sha256su1 @MSG[0],@MSG[2],@MSG[3]
-___
- ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
-}
-$code.=<<___;
- ld1.32 {$W1},[$Ktbl],#16
- add.i32 $W0,$W0,@MSG[0]
- orr $abcd,$ABCD,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
-
- ld1.32 {$W0},[$Ktbl],#16
- add.i32 $W1,$W1,@MSG[1]
- orr $abcd,$ABCD,$ABCD
- sha256h $ABCD,$EFGH,$W1
- sha256h2 $EFGH,$abcd,$W1
-
- ld1.32 {$W1},[$Ktbl]
- add.i32 $W0,$W0,@MSG[2]
- sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
- orr $abcd,$ABCD,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
-
- add.i32 $W1,$W1,@MSG[3]
- orr $abcd,$ABCD,$ABCD
- sha256h $ABCD,$EFGH,$W1
- sha256h2 $EFGH,$abcd,$W1
-
- add.i32 $ABCD,$ABCD,$ABCD_SAVE
- add.i32 $EFGH,$EFGH,$EFGH_SAVE
-
- cbnz $num,.Loop_hw
-
- st1.32 {$ABCD,$EFGH},[$ctx]
-
- ldr x29,[sp],#16
- ret
-.size sha256_block_armv8,.-sha256_block_armv8
-#endif
-___
-}
-
-if ($SZ==4) { ######################################### NEON stuff #
-# You'll surely note a lot of similarities with the sha256-armv4 module,
-# and of course that's not a coincidence: sha256-armv4 was used as the
-# initial template, but was adapted for the ARMv8 instruction set and
-# extensively re-tuned for all-round performance.
-
-my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10));
-my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15));
-my $Ktbl="x16";
-my $Xfer="x17";
-my @X = map("q$_",(0..3));
-my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19));
-my $j=0;
-
-sub AUTOLOAD() # thunk [simplified] x86-style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
- my $arg = pop;
- $arg = "#$arg" if ($arg*1 eq $arg);
- $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
-}
-
-sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }
-sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }
-sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }
-
-sub Xupdate()
-{ use integer;
- my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body);
- my ($a,$b,$c,$d,$e,$f,$g,$h);
-
- &ext_8 ($T0,@X[0],@X[1],4); # X[1..4]
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &ext_8 ($T3,@X[2],@X[3],4); # X[9..12]
- eval(shift(@insns));
- eval(shift(@insns));
- &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15]
- eval(shift(@insns));
- eval(shift(@insns));
- &ushr_32 ($T2,$T0,$sigma0[0]);
- eval(shift(@insns));
- &ushr_32 ($T1,$T0,$sigma0[2]);
- eval(shift(@insns));
- &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12]
- eval(shift(@insns));
- &sli_32 ($T2,$T0,32-$sigma0[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &ushr_32 ($T3,$T0,$sigma0[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &eor_8 ($T1,$T1,$T2);
- eval(shift(@insns));
- eval(shift(@insns));
- &sli_32 ($T3,$T0,32-$sigma0[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &ushr_32 ($T4,$T7,$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4])
- eval(shift(@insns));
- eval(shift(@insns));
- &sli_32 ($T4,$T7,32-$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &ushr_32 ($T5,$T7,$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &ushr_32 ($T3,$T7,$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
- eval(shift(@insns));
- eval(shift(@insns));
- &sli_u32 ($T3,$T7,32-$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &eor_8 ($T5,$T5,$T4);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &ushr_32 ($T6,@X[0],$sigma1[0]);
- eval(shift(@insns));
- &ushr_32 ($T7,@X[0],$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &sli_32 ($T6,@X[0],32-$sigma1[0]);
- eval(shift(@insns));
- &ushr_32 ($T5,@X[0],$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &eor_8 ($T7,$T7,$T6);
- eval(shift(@insns));
- eval(shift(@insns));
- &sli_32 ($T5,@X[0],32-$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &ld1_32 ("{$T0}","[$Ktbl], #16");
- eval(shift(@insns));
- &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17])
- eval(shift(@insns));
- eval(shift(@insns));
- &eor_8 ($T5,$T5,$T5);
- eval(shift(@insns));
- eval(shift(@insns));
- &mov (&Dhi($T5), &Dlo($T7));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &add_32 ($T0,$T0,@X[0]);
- while($#insns>=1) { eval(shift(@insns)); }
- &st1_32 ("{$T0}","[$Xfer], #16");
- eval(shift(@insns));
-
- push(@X,shift(@X)); # "rotate" X[]
-}
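
Xupdate above computes the SHA-256 message schedule four words at a time,
synthesizing the vector rotates from ushr/sli pairs. Written scalar, the
recurrence it implements is the standard one (stand-alone sketch; sigma0 =
(7,18,3) and sigma1 = (17,19,10) per the parameter block at the top of the
file, with the 16-word window kept in a circular buffer):

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n)
{
	return (x >> n) | (x << (32 - n));
}

static uint32_t sigma0(uint32_t x) { return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3); }
static uint32_t sigma1(uint32_t x) { return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); }

/* W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16], for t >= 16 */
static void schedule_step(uint32_t W[16], unsigned int t)
{
	W[t & 15] += sigma1(W[(t - 2) & 15]) + W[(t - 7) & 15] +
		     sigma0(W[(t - 15) & 15]);
}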
-
-sub Xpreload()
-{ use integer;
- my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body);
- my ($a,$b,$c,$d,$e,$f,$g,$h);
-
- eval(shift(@insns));
- eval(shift(@insns));
- &ld1_8 ("{@X[0]}","[$inp],#16");
- eval(shift(@insns));
- eval(shift(@insns));
- &ld1_32 ("{$T0}","[$Ktbl],#16");
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &rev32 (@X[0],@X[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &add_32 ($T0,$T0,@X[0]);
- foreach (@insns) { eval; } # remaining instructions
- &st1_32 ("{$T0}","[$Xfer], #16");
-
- push(@X,shift(@X)); # "rotate" X[]
-}
-
-sub body_00_15 () {
- (
- '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
- '&add ($h,$h,$t1)', # h+=X[i]+K[i]
- '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past
- '&and ($t1,$f,$e)',
- '&bic ($t4,$g,$e)',
- '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
- '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
- '&orr ($t1,$t1,$t4)', # Ch(e,f,g)
- '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
- '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
- '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
- '&ror ($t0,$t0,"#$Sigma1[0]")',
- '&eor ($t2,$a,$b)', # a^b, b^c in next round
- '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
- '&add ($h,$h,$t0)', # h+=Sigma1(e)
- '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
- '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
- '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
- '&ror ($t4,$t4,"#$Sigma0[0]")',
- '&add ($d,$d,$h)', # d+=h
- '&eor ($t3,$t3,$b)', # Maj(a,b,c)
- '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
- )
-}
-
-$code.=<<___;
-#ifdef __KERNEL__
-.globl sha256_block_neon
-#endif
-.type sha256_block_neon,%function
-.align 4
-sha256_block_neon:
-.Lneon_entry:
- stp x29, x30, [sp, #-16]!
- mov x29, sp
- sub sp,sp,#16*4
-
- adr $Ktbl,.LK256
- add $num,$inp,$num,lsl#6 // len to point at the end of inp
-
- ld1.8 {@X[0]},[$inp], #16
- ld1.8 {@X[1]},[$inp], #16
- ld1.8 {@X[2]},[$inp], #16
- ld1.8 {@X[3]},[$inp], #16
- ld1.32 {$T0},[$Ktbl], #16
- ld1.32 {$T1},[$Ktbl], #16
- ld1.32 {$T2},[$Ktbl], #16
- ld1.32 {$T3},[$Ktbl], #16
- rev32 @X[0],@X[0] // yes, even on
- rev32 @X[1],@X[1] // big-endian
- rev32 @X[2],@X[2]
- rev32 @X[3],@X[3]
- mov $Xfer,sp
- add.32 $T0,$T0,@X[0]
- add.32 $T1,$T1,@X[1]
- add.32 $T2,$T2,@X[2]
- st1.32 {$T0-$T1},[$Xfer], #32
- add.32 $T3,$T3,@X[3]
- st1.32 {$T2-$T3},[$Xfer]
- sub $Xfer,$Xfer,#32
-
- ldp $A,$B,[$ctx]
- ldp $C,$D,[$ctx,#8]
- ldp $E,$F,[$ctx,#16]
- ldp $G,$H,[$ctx,#24]
- ldr $t1,[sp,#0]
- mov $t2,wzr
- eor $t3,$B,$C
- mov $t4,wzr
- b .L_00_48
-
-.align 4
-.L_00_48:
-___
- &Xupdate(\&body_00_15);
- &Xupdate(\&body_00_15);
- &Xupdate(\&body_00_15);
- &Xupdate(\&body_00_15);
-$code.=<<___;
- cmp $t1,#0 // check for K256 terminator
- ldr $t1,[sp,#0]
- sub $Xfer,$Xfer,#64
- bne .L_00_48
-
- sub $Ktbl,$Ktbl,#256 // rewind $Ktbl
- cmp $inp,$num
- mov $Xfer, #64
- csel $Xfer, $Xfer, xzr, eq
- sub $inp,$inp,$Xfer // avoid SEGV
- mov $Xfer,sp
-___
- &Xpreload(\&body_00_15);
- &Xpreload(\&body_00_15);
- &Xpreload(\&body_00_15);
- &Xpreload(\&body_00_15);
-$code.=<<___;
- add $A,$A,$t4 // h+=Sigma0(a) from the past
- ldp $t0,$t1,[$ctx,#0]
- add $A,$A,$t2 // h+=Maj(a,b,c) from the past
- ldp $t2,$t3,[$ctx,#8]
- add $A,$A,$t0 // accumulate
- add $B,$B,$t1
- ldp $t0,$t1,[$ctx,#16]
- add $C,$C,$t2
- add $D,$D,$t3
- ldp $t2,$t3,[$ctx,#24]
- add $E,$E,$t0
- add $F,$F,$t1
- ldr $t1,[sp,#0]
- stp $A,$B,[$ctx,#0]
- add $G,$G,$t2
- mov $t2,wzr
- stp $C,$D,[$ctx,#8]
- add $H,$H,$t3
- stp $E,$F,[$ctx,#16]
- eor $t3,$B,$C
- stp $G,$H,[$ctx,#24]
- mov $t4,wzr
- mov $Xfer,sp
- b.ne .L_00_48
-
- ldr x29,[x29]
- add sp,sp,#16*4+16
- ret
-.size sha256_block_neon,.-sha256_block_neon
-___
-}
-
-$code.=<<___;
-#ifndef __KERNEL__
-.comm OPENSSL_armcap_P,4,4
-#endif
-___
-
-{ my %opcode = (
- "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000,
- "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 );
-
- sub unsha256 {
- my ($mnemonic,$arg)=@_;
-
- $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
- &&
- sprintf ".inst\t0x%08x\t//%s %s",
- $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
- $mnemonic,$arg;
- }
-}
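
unsha256() above hand-assembles the SHA-2 crypto instructions as raw .inst
words for assemblers that predate them, packing the three register indices
into the base opcodes from the %opcode table. A stand-alone C model of that
packing (enc3() is an invented name for illustration):

#include <stdint.h>
#include <stdio.h>

static uint32_t enc3(uint32_t base, unsigned int rd, unsigned int rn,
		     unsigned int rm)
{
	return base | rd | (rn << 5) | (rm << 16);
}

int main(void)
{
	/* per the table above: sha256h v0,v1,v2 -> .inst 0x5e024020 */
	printf(".inst\t0x%08x\t// sha256h v0,v1,v2\n",
	       enc3(0x5e004000, 0, 1, 2));
	return 0;
}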
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/\/\// and !/^$/);
- print;
-}
-close SELF;
-
-foreach(split("\n",$code)) {
-
- s/\`([^\`]*)\`/eval($1)/ge;
-
- s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;
-
- s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers
-
- s/\.[ui]?8(\s)/$1/;
- s/\.\w?32\b// and s/\.16b/\.4s/g;
- m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g;
-
- print $_,"\n";
-}
-
-close STDOUT;
diff --git a/arch/arm64/lib/crypto/sha256-ce.S b/arch/arm64/lib/crypto/sha256-ce.S
deleted file mode 100644
index f3e21c6d87d2..000000000000
--- a/arch/arm64/lib/crypto/sha256-ce.S
+++ /dev/null
@@ -1,136 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * sha256-ce.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions
- *
- * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .text
- .arch armv8-a+crypto
-
- dga .req q20
- dgav .req v20
- dgb .req q21
- dgbv .req v21
-
- t0 .req v22
- t1 .req v23
-
- dg0q .req q24
- dg0v .req v24
- dg1q .req q25
- dg1v .req v25
- dg2q .req q26
- dg2v .req v26
-
- .macro add_only, ev, rc, s0
- mov dg2v.16b, dg0v.16b
- .ifeq \ev
- add t1.4s, v\s0\().4s, \rc\().4s
- sha256h dg0q, dg1q, t0.4s
- sha256h2 dg1q, dg2q, t0.4s
- .else
- .ifnb \s0
- add t0.4s, v\s0\().4s, \rc\().4s
- .endif
- sha256h dg0q, dg1q, t1.4s
- sha256h2 dg1q, dg2q, t1.4s
- .endif
- .endm
-
- .macro add_update, ev, rc, s0, s1, s2, s3
- sha256su0 v\s0\().4s, v\s1\().4s
- add_only \ev, \rc, \s1
- sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s
- .endm
-
- /*
- * The SHA-256 round constants
- */
- .section ".rodata", "a"
- .align 4
-.Lsha2_rcon:
- .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
- .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
- .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
- .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
- .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
- .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
- .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
- .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
- .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
- .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
- .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
- .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
- .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
- .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
- .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
- .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-
- /*
- * size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
- * const u8 *data, size_t nblocks);
- */
- .text
-SYM_FUNC_START(__sha256_ce_transform)
- /* load round constants */
- adr_l x8, .Lsha2_rcon
- ld1 { v0.4s- v3.4s}, [x8], #64
- ld1 { v4.4s- v7.4s}, [x8], #64
- ld1 { v8.4s-v11.4s}, [x8], #64
- ld1 {v12.4s-v15.4s}, [x8]
-
- /* load state */
- ld1 {dgav.4s, dgbv.4s}, [x0]
-
- /* load input */
-0: ld1 {v16.4s-v19.4s}, [x1], #64
- sub x2, x2, #1
-
-CPU_LE( rev32 v16.16b, v16.16b )
-CPU_LE( rev32 v17.16b, v17.16b )
-CPU_LE( rev32 v18.16b, v18.16b )
-CPU_LE( rev32 v19.16b, v19.16b )
-
- add t0.4s, v16.4s, v0.4s
- mov dg0v.16b, dgav.16b
- mov dg1v.16b, dgbv.16b
-
- add_update 0, v1, 16, 17, 18, 19
- add_update 1, v2, 17, 18, 19, 16
- add_update 0, v3, 18, 19, 16, 17
- add_update 1, v4, 19, 16, 17, 18
-
- add_update 0, v5, 16, 17, 18, 19
- add_update 1, v6, 17, 18, 19, 16
- add_update 0, v7, 18, 19, 16, 17
- add_update 1, v8, 19, 16, 17, 18
-
- add_update 0, v9, 16, 17, 18, 19
- add_update 1, v10, 17, 18, 19, 16
- add_update 0, v11, 18, 19, 16, 17
- add_update 1, v12, 19, 16, 17, 18
-
- add_only 0, v13, 17
- add_only 1, v14, 18
- add_only 0, v15, 19
- add_only 1
-
- /* update state */
- add dgav.4s, dgav.4s, dg0v.4s
- add dgbv.4s, dgbv.4s, dg1v.4s
-
- /* return early if voluntary preemption is needed */
- cond_yield 1f, x5, x6
-
- /* handled all input blocks? */
- cbnz x2, 0b
-
- /* store new state */
-1: st1 {dgav.4s, dgbv.4s}, [x0]
- mov x0, x2
- ret
-SYM_FUNC_END(__sha256_ce_transform)
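
Note the contract encoded by cond_yield and the final "mov x0, x2" above:
the transform may stop early when voluntary preemption is needed and
returns the number of unprocessed blocks, which the C glue (next file)
feeds back in a loop. A minimal stand-alone model of that caller contract,
with invented names:

#include <stddef.h>
#include <stdint.h>

#define BLOCK_SIZE 64			/* SHA256_BLOCK_SIZE */

/* step() stands in for __sha256_ce_transform(); it returns blocks left */
static void run_to_completion(uint32_t state[8], const uint8_t *data,
			      size_t nblocks,
			      size_t (*step)(uint32_t *, const uint8_t *,
					     size_t))
{
	while (nblocks) {
		size_t rem = step(state, data, nblocks);

		data += (nblocks - rem) * BLOCK_SIZE;
		nblocks = rem;
	}
}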
diff --git a/arch/arm64/lib/crypto/sha256.c b/arch/arm64/lib/crypto/sha256.c
deleted file mode 100644
index bcf7a3adc0c4..000000000000
--- a/arch/arm64/lib/crypto/sha256.c
+++ /dev/null
@@ -1,75 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 optimized for ARM64
- *
- * Copyright 2025 Google LLC
- */
-#include <asm/neon.h>
-#include <crypto/internal/sha2.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-asmlinkage void sha256_block_neon(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-asmlinkage size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
-
-void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks)
-{
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
- static_branch_likely(&have_neon)) {
- if (static_branch_likely(&have_ce)) {
- do {
- size_t rem;
-
- kernel_neon_begin();
- rem = __sha256_ce_transform(state,
- data, nblocks);
- kernel_neon_end();
- data += (nblocks - rem) * SHA256_BLOCK_SIZE;
- nblocks = rem;
- } while (nblocks);
- } else {
- kernel_neon_begin();
- sha256_block_neon(state, data, nblocks);
- kernel_neon_end();
- }
- } else {
- sha256_blocks_arch(state, data, nblocks);
- }
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_simd);
-
-bool sha256_is_arch_optimized(void)
-{
-	/* We can always use at least the ARM64 scalar implementation. */
- return true;
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
-static int __init sha256_arm64_mod_init(void)
-{
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
- cpu_have_named_feature(ASIMD)) {
- static_branch_enable(&have_neon);
- if (cpu_have_named_feature(SHA2))
- static_branch_enable(&have_ce);
- }
- return 0;
-}
-subsys_initcall(sha256_arm64_mod_init);
-
-static void __exit sha256_arm64_mod_exit(void)
-{
-}
-module_exit(sha256_arm64_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-256 optimized for ARM64");
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index ec0a337891dd..11eb8d1adc84 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -487,17 +487,29 @@ static void do_bad_area(unsigned long far, unsigned long esr,
}
}
-static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma,
- unsigned int mm_flags)
+static bool fault_from_pkey(struct vm_area_struct *vma, unsigned int mm_flags)
{
- unsigned long iss2 = ESR_ELx_ISS2(esr);
-
if (!system_supports_poe())
return false;
- if (esr_fsc_is_permission_fault(esr) && (iss2 & ESR_ELx_Overlay))
- return true;
-
+ /*
+ * We do not check whether an Overlay fault has occurred because we
+ * cannot make a decision based solely on its value:
+ *
+ * - If Overlay is set, a fault did occur due to POE, but it may be
+ * spurious in those cases where we update POR_EL0 without ISB (e.g.
+ * on context-switch). We would then need to manually check POR_EL0
+ * against vma_pkey(vma), which is exactly what
+ * arch_vma_access_permitted() does.
+ *
+ * - If Overlay is not set, we may still need to report a pkey fault.
+ * This is the case if an access was made within a mapping but with no
+ * page mapped, and POR_EL0 forbids the access (according to
+	 * vma_pkey()). Such an access will result in a SIGSEGV regardless,
+	 * because core code checks arch_vma_access_permitted(), but in order
+ * to report the correct error code - SEGV_PKUERR - we must handle
+ * that case here.
+ */
return !arch_vma_access_permitted(vma,
mm_flags & FAULT_FLAG_WRITE,
mm_flags & FAULT_FLAG_INSTRUCTION,
@@ -635,7 +647,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
goto bad_area;
}
- if (fault_from_pkey(esr, vma, mm_flags)) {
+ if (fault_from_pkey(vma, mm_flags)) {
pkey = vma_pkey(vma);
vma_end_read(vma);
fault = 0;
@@ -679,7 +691,7 @@ retry:
goto bad_area;
}
- if (fault_from_pkey(esr, vma, mm_flags)) {
+ if (fault_from_pkey(vma, mm_flags)) {
pkey = vma_pkey(vma);
mmap_read_unlock(mm);
fault = 0;
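
The new comment above explains why the Overlay syndrome bit alone cannot
decide a pkey fault, so the code defers to arch_vma_access_permitted(),
which ultimately decodes the POR_EL0 field selected by vma_pkey(vma). A
simplified stand-alone model of such a check (POR_EL0 really does hold one
4-bit field per protection key, but the bit meanings below are a toy
encoding for illustration, not the architected one):

#include <stdbool.h>
#include <stdint.h>

#define POR_BITS_PER_PKEY	4

static bool pkey_access_permitted(uint64_t por_el0, unsigned int pkey,
				  bool write)
{
	uint64_t perm = (por_el0 >> (pkey * POR_BITS_PER_PKEY)) & 0xf;

	/* toy encoding: bit 0 = read permitted, bit 1 = write permitted */
	return write ? !!(perm & 0x2) : !!(perm & 0x1);
}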
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 8fcf59ba39db..00ab1d648db6 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1305,7 +1305,8 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
next = addr;
end = addr + PUD_SIZE;
do {
- pmd_free_pte_page(pmdp, next);
+ if (pmd_present(pmdp_get(pmdp)))
+ pmd_free_pte_page(pmdp, next);
} while (pmdp++, next += PMD_SIZE, next != end);
pud_clear(pudp);
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 80d470aa469d..54dccfd6aa11 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -518,7 +518,6 @@ alternative_else_nop_endif
msr REG_PIR_EL1, x0
orr tcr2, tcr2, TCR2_EL1_PIE
- msr REG_TCR2_EL1, x0
.Lskip_indirection:
diff --git a/arch/arm64/tools/syscall_32.tbl b/arch/arm64/tools/syscall_32.tbl
index 0765b3a8d6d6..8d9088bc577d 100644
--- a/arch/arm64/tools/syscall_32.tbl
+++ b/arch/arm64/tools/syscall_32.tbl
@@ -479,3 +479,5 @@
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr