diff options
Diffstat (limited to 'arch/arm64')
83 files changed, 519 insertions, 5128 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 55fc331af337..7c456aa838b9 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -21,8 +21,6 @@ config ARM64 select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_CC_PLATFORM - select ARCH_HAS_CRC32 - select ARCH_HAS_CRC_T10DIF if KERNEL_MODE_NEON select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE @@ -187,12 +185,12 @@ config ARM64 select HAVE_ARCH_KCSAN if EXPERT select HAVE_ARCH_KFENCE select HAVE_ARCH_KGDB + select HAVE_ARCH_KSTACK_ERASE select HAVE_ARCH_MMAP_RND_BITS select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT select HAVE_ARCH_PREL32_RELOCATIONS select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET select HAVE_ARCH_SECCOMP_FILTER - select HAVE_ARCH_STACKLEAK select HAVE_ARCH_THREAD_STRUCT_WHITELIST select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE @@ -256,6 +254,7 @@ config ARM64 select HOTPLUG_SMT if HOTPLUG_CPU select IRQ_DOMAIN select IRQ_FORCED_THREADING + select JUMP_LABEL select KASAN_VMALLOC if KASAN select LOCK_MM_AND_FIND_VMA select MODULES_USE_ELF_RELA diff --git a/arch/arm64/boot/dts/allwinner/sun55i-a523.dtsi b/arch/arm64/boot/dts/allwinner/sun55i-a523.dtsi index 8b7cbc2e78f5..51cd148f4227 100644 --- a/arch/arm64/boot/dts/allwinner/sun55i-a523.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun55i-a523.dtsi @@ -131,7 +131,7 @@ "PH5", "PH6", "PH7", "PH9", "PH10", "PH14", "PH15", "PH16", "PH17", "PH18"; allwinner,pinmux = <5>; - function = "emac0"; + function = "gmac0"; drive-strength = <40>; bias-disable; }; @@ -540,8 +540,8 @@ status = "disabled"; }; - emac0: ethernet@4500000 { - compatible = "allwinner,sun55i-a523-emac0", + gmac0: ethernet@4500000 { + compatible = "allwinner,sun55i-a523-gmac0", "allwinner,sun50i-a64-emac"; reg = <0x04500000 0x10000>; clocks = <&ccu CLK_BUS_EMAC0>; diff --git a/arch/arm64/boot/dts/allwinner/sun55i-a527-cubie-a5e.dts b/arch/arm64/boot/dts/allwinner/sun55i-a527-cubie-a5e.dts index 0f58d92a6adc..8bc0f2c72a24 100644 --- a/arch/arm64/boot/dts/allwinner/sun55i-a527-cubie-a5e.dts +++ b/arch/arm64/boot/dts/allwinner/sun55i-a527-cubie-a5e.dts @@ -12,7 +12,7 @@ compatible = "radxa,cubie-a5e", "allwinner,sun55i-a527"; aliases { - ethernet0 = &emac0; + ethernet0 = &gmac0; serial0 = &uart0; }; @@ -55,7 +55,7 @@ status = "okay"; }; -&emac0 { +&gmac0 { phy-mode = "rgmii-id"; phy-handle = <&ext_rgmii_phy>; phy-supply = <®_cldo3>; diff --git a/arch/arm64/boot/dts/allwinner/sun55i-t527-avaota-a1.dts b/arch/arm64/boot/dts/allwinner/sun55i-t527-avaota-a1.dts index 08127f0cdd35..142177c1f737 100644 --- a/arch/arm64/boot/dts/allwinner/sun55i-t527-avaota-a1.dts +++ b/arch/arm64/boot/dts/allwinner/sun55i-t527-avaota-a1.dts @@ -12,7 +12,7 @@ compatible = "yuzukihd,avaota-a1", "allwinner,sun55i-t527"; aliases { - ethernet0 = &emac0; + ethernet0 = &gmac0; serial0 = &uart0; }; @@ -65,7 +65,7 @@ status = "okay"; }; -&emac0 { +&gmac0 { phy-mode = "rgmii-id"; phy-handle = <&ext_rgmii_phy>; phy-supply = <®_dcdc4>; diff --git a/arch/arm64/boot/dts/apple/spi1-nvram.dtsi b/arch/arm64/boot/dts/apple/spi1-nvram.dtsi index 3df2fd3993b5..9740fbf200f0 100644 --- a/arch/arm64/boot/dts/apple/spi1-nvram.dtsi +++ b/arch/arm64/boot/dts/apple/spi1-nvram.dtsi @@ -20,8 +20,6 @@ compatible = "jedec,spi-nor"; reg = <0x0>; spi-max-frequency = <25000000>; - #address-cells = <1>; - #size-cells = <1>; partitions { compatible = "fixed-partitions"; diff --git a/arch/arm64/boot/dts/apple/t8103-j293.dts b/arch/arm64/boot/dts/apple/t8103-j293.dts index e2d9439397f7..5b3c42e9f0e6 100644 --- a/arch/arm64/boot/dts/apple/t8103-j293.dts +++ b/arch/arm64/boot/dts/apple/t8103-j293.dts @@ -100,6 +100,8 @@ &displaydfr_mipi { status = "okay"; + #address-cells = <1>; + #size-cells = <0>; dfr_panel: panel@0 { compatible = "apple,j293-summit", "apple,summit"; diff --git a/arch/arm64/boot/dts/apple/t8103-jxxx.dtsi b/arch/arm64/boot/dts/apple/t8103-jxxx.dtsi index 8e82231acab5..0c8206156bfe 100644 --- a/arch/arm64/boot/dts/apple/t8103-jxxx.dtsi +++ b/arch/arm64/boot/dts/apple/t8103-jxxx.dtsi @@ -71,7 +71,7 @@ */ &port00 { bus-range = <1 1>; - wifi0: network@0,0 { + wifi0: wifi@0,0 { compatible = "pci14e4,4425"; reg = <0x10000 0x0 0x0 0x0 0x0>; /* To be filled by the loader */ diff --git a/arch/arm64/boot/dts/apple/t8103.dtsi b/arch/arm64/boot/dts/apple/t8103.dtsi index 20faf0c0d809..3a204845b85b 100644 --- a/arch/arm64/boot/dts/apple/t8103.dtsi +++ b/arch/arm64/boot/dts/apple/t8103.dtsi @@ -405,8 +405,6 @@ compatible = "apple,t8103-display-pipe-mipi", "apple,h7-display-pipe-mipi"; reg = <0x2 0x28600000 0x0 0x100000>; power-domains = <&ps_mipi_dsi>; - #address-cells = <1>; - #size-cells = <0>; status = "disabled"; ports { diff --git a/arch/arm64/boot/dts/apple/t8112-j493.dts b/arch/arm64/boot/dts/apple/t8112-j493.dts index be86d34c6696..fb8ad7d4c65a 100644 --- a/arch/arm64/boot/dts/apple/t8112-j493.dts +++ b/arch/arm64/boot/dts/apple/t8112-j493.dts @@ -63,6 +63,8 @@ &displaydfr_mipi { status = "okay"; + #address-cells = <1>; + #size-cells = <0>; dfr_panel: panel@0 { compatible = "apple,j493-summit", "apple,summit"; diff --git a/arch/arm64/boot/dts/apple/t8112.dtsi b/arch/arm64/boot/dts/apple/t8112.dtsi index e95711d8337f..f68354194355 100644 --- a/arch/arm64/boot/dts/apple/t8112.dtsi +++ b/arch/arm64/boot/dts/apple/t8112.dtsi @@ -420,8 +420,6 @@ compatible = "apple,t8112-display-pipe-mipi", "apple,h7-display-pipe-mipi"; reg = <0x2 0x28600000 0x0 0x100000>; power-domains = <&ps_mipi_dsi>; - #address-cells = <1>; - #size-cells = <0>; status = "disabled"; ports { diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi index 0baf256b4400..983b2f0e8797 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi @@ -687,11 +687,12 @@ }; wdog0: watchdog@2ad0000 { - compatible = "fsl,imx21-wdt"; + compatible = "fsl,ls1046a-wdt", "fsl,imx21-wdt"; reg = <0x0 0x2ad0000 0x0 0x10000>; interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>; clocks = <&clockgen QORIQ_CLK_PLATFORM_PLL QORIQ_CLK_PLL_DIV(2)>; + big-endian; }; edma0: dma-controller@2c00000 { diff --git a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi index d29710772569..1594ce9182a5 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi @@ -464,6 +464,7 @@ }; reg_nvcc_sd: LDO5 { + regulator-always-on; regulator-max-microvolt = <3300000>; regulator-min-microvolt = <1800000>; regulator-name = "On-module +V3.3_1.8_SD (LDO5)"; diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw71xx.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw71xx.dtsi index 2f740d74707b..4bf818873fe3 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw71xx.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw71xx.dtsi @@ -70,7 +70,7 @@ tpm@1 { compatible = "atmel,attpm20p", "tcg,tpm_tis-spi"; reg = <0x1>; - spi-max-frequency = <36000000>; + spi-max-frequency = <25000000>; }; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi index 5ab3ffe9931d..cf747ec6fa16 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw72xx.dtsi @@ -110,7 +110,7 @@ tpm@1 { compatible = "atmel,attpm20p", "tcg,tpm_tis-spi"; reg = <0x1>; - spi-max-frequency = <36000000>; + spi-max-frequency = <25000000>; }; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi index e2b5e7ac3e46..5eb114d2360a 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw73xx.dtsi @@ -122,7 +122,7 @@ tpm@1 { compatible = "atmel,attpm20p", "tcg,tpm_tis-spi"; reg = <0x1>; - spi-max-frequency = <36000000>; + spi-max-frequency = <25000000>; }; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts index 6daa2313f879..568d24265ddf 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts +++ b/arch/arm64/boot/dts/freescale/imx8mp-venice-gw74xx.dts @@ -201,7 +201,7 @@ tpm@0 { compatible = "atmel,attpm20p", "tcg,tpm_tis-spi"; reg = <0x0>; - spi-max-frequency = <36000000>; + spi-max-frequency = <25000000>; }; }; diff --git a/arch/arm64/boot/dts/freescale/imx95-15x15-evk.dts b/arch/arm64/boot/dts/freescale/imx95-15x15-evk.dts index 6c47f4b47356..9f4d0899a94d 100644 --- a/arch/arm64/boot/dts/freescale/imx95-15x15-evk.dts +++ b/arch/arm64/boot/dts/freescale/imx95-15x15-evk.dts @@ -574,17 +574,17 @@ &scmi_iomuxc { pinctrl_emdio: emdiogrp { fsl,pins = < - IMX95_PAD_ENET2_MDC__NETCMIX_TOP_NETC_MDC 0x57e - IMX95_PAD_ENET2_MDIO__NETCMIX_TOP_NETC_MDIO 0x97e + IMX95_PAD_ENET2_MDC__NETCMIX_TOP_NETC_MDC 0x50e + IMX95_PAD_ENET2_MDIO__NETCMIX_TOP_NETC_MDIO 0x90e >; }; pinctrl_enetc0: enetc0grp { fsl,pins = < - IMX95_PAD_ENET1_TD3__NETCMIX_TOP_ETH0_RGMII_TD3 0x57e - IMX95_PAD_ENET1_TD2__NETCMIX_TOP_ETH0_RGMII_TD2 0x57e - IMX95_PAD_ENET1_TD1__NETCMIX_TOP_ETH0_RGMII_TD1 0x57e - IMX95_PAD_ENET1_TD0__NETCMIX_TOP_ETH0_RGMII_TD0 0x57e + IMX95_PAD_ENET1_TD3__NETCMIX_TOP_ETH0_RGMII_TD3 0x50e + IMX95_PAD_ENET1_TD2__NETCMIX_TOP_ETH0_RGMII_TD2 0x50e + IMX95_PAD_ENET1_TD1__NETCMIX_TOP_ETH0_RGMII_TD1 0x50e + IMX95_PAD_ENET1_TD0__NETCMIX_TOP_ETH0_RGMII_TD0 0x50e IMX95_PAD_ENET1_TX_CTL__NETCMIX_TOP_ETH0_RGMII_TX_CTL 0x57e IMX95_PAD_ENET1_TXC__NETCMIX_TOP_ETH0_RGMII_TX_CLK 0x58e IMX95_PAD_ENET1_RX_CTL__NETCMIX_TOP_ETH0_RGMII_RX_CTL 0x57e @@ -598,10 +598,10 @@ pinctrl_enetc1: enetc1grp { fsl,pins = < - IMX95_PAD_ENET2_TD3__NETCMIX_TOP_ETH1_RGMII_TD3 0x57e - IMX95_PAD_ENET2_TD2__NETCMIX_TOP_ETH1_RGMII_TD2 0x57e - IMX95_PAD_ENET2_TD1__NETCMIX_TOP_ETH1_RGMII_TD1 0x57e - IMX95_PAD_ENET2_TD0__NETCMIX_TOP_ETH1_RGMII_TD0 0x57e + IMX95_PAD_ENET2_TD3__NETCMIX_TOP_ETH1_RGMII_TD3 0x50e + IMX95_PAD_ENET2_TD2__NETCMIX_TOP_ETH1_RGMII_TD2 0x50e + IMX95_PAD_ENET2_TD1__NETCMIX_TOP_ETH1_RGMII_TD1 0x50e + IMX95_PAD_ENET2_TD0__NETCMIX_TOP_ETH1_RGMII_TD0 0x50e IMX95_PAD_ENET2_TX_CTL__NETCMIX_TOP_ETH1_RGMII_TX_CTL 0x57e IMX95_PAD_ENET2_TXC__NETCMIX_TOP_ETH1_RGMII_TX_CLK 0x58e IMX95_PAD_ENET2_RX_CTL__NETCMIX_TOP_ETH1_RGMII_RX_CTL 0x57e diff --git a/arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts b/arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts index 6886ea766655..d7d845231312 100644 --- a/arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts +++ b/arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts @@ -566,17 +566,17 @@ &scmi_iomuxc { pinctrl_emdio: emdiogrp{ fsl,pins = < - IMX95_PAD_ENET1_MDC__NETCMIX_TOP_NETC_MDC 0x57e - IMX95_PAD_ENET1_MDIO__NETCMIX_TOP_NETC_MDIO 0x97e + IMX95_PAD_ENET1_MDC__NETCMIX_TOP_NETC_MDC 0x50e + IMX95_PAD_ENET1_MDIO__NETCMIX_TOP_NETC_MDIO 0x90e >; }; pinctrl_enetc0: enetc0grp { fsl,pins = < - IMX95_PAD_ENET1_TD3__NETCMIX_TOP_ETH0_RGMII_TD3 0x57e - IMX95_PAD_ENET1_TD2__NETCMIX_TOP_ETH0_RGMII_TD2 0x57e - IMX95_PAD_ENET1_TD1__NETCMIX_TOP_ETH0_RGMII_TD1 0x57e - IMX95_PAD_ENET1_TD0__NETCMIX_TOP_ETH0_RGMII_TD0 0x57e + IMX95_PAD_ENET1_TD3__NETCMIX_TOP_ETH0_RGMII_TD3 0x50e + IMX95_PAD_ENET1_TD2__NETCMIX_TOP_ETH0_RGMII_TD2 0x50e + IMX95_PAD_ENET1_TD1__NETCMIX_TOP_ETH0_RGMII_TD1 0x50e + IMX95_PAD_ENET1_TD0__NETCMIX_TOP_ETH0_RGMII_TD0 0x50e IMX95_PAD_ENET1_TX_CTL__NETCMIX_TOP_ETH0_RGMII_TX_CTL 0x57e IMX95_PAD_ENET1_TXC__NETCMIX_TOP_ETH0_RGMII_TX_CLK 0x58e IMX95_PAD_ENET1_RX_CTL__NETCMIX_TOP_ETH0_RGMII_RX_CTL 0x57e diff --git a/arch/arm64/boot/dts/freescale/imx95.dtsi b/arch/arm64/boot/dts/freescale/imx95.dtsi index 632631a29112..5aecdd9b62ff 100644 --- a/arch/arm64/boot/dts/freescale/imx95.dtsi +++ b/arch/arm64/boot/dts/freescale/imx95.dtsi @@ -1708,7 +1708,7 @@ <0x9 0 1 0>; reg-names = "dbi","atu", "dbi2", "app", "dma", "addr_space"; num-lanes = <1>; - interrupts = <GIC_SPI 317 IRQ_TYPE_LEVEL_HIGH>; + interrupts = <GIC_SPI 311 IRQ_TYPE_LEVEL_HIGH>; interrupt-names = "dma"; clocks = <&scmi_clk IMX95_CLK_HSIO>, <&scmi_clk IMX95_CLK_HSIOPLL>, diff --git a/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts b/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts index ae7a275fd223..cefecb7a23cf 100644 --- a/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts +++ b/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts @@ -1090,6 +1090,8 @@ }; &pmk8280_rtc { + qcom,uefi-rtc-info; + status = "okay"; }; diff --git a/arch/arm64/boot/dts/qcom/x1e80100-pmics.dtsi b/arch/arm64/boot/dts/qcom/x1e80100-pmics.dtsi index c02fd4d15c96..e3888bc143a0 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-pmics.dtsi +++ b/arch/arm64/boot/dts/qcom/x1e80100-pmics.dtsi @@ -224,6 +224,7 @@ reg-names = "rtc", "alarm"; interrupts = <0x0 0x62 0x1 IRQ_TYPE_EDGE_RISING>; qcom,no-alarm; /* alarm owned by ADSP */ + qcom,uefi-rtc-info; }; pmk8550_sdam_2: nvram@7100 { diff --git a/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi b/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi index ab232e5c7ad6..4203b335a263 100644 --- a/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi +++ b/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi @@ -379,6 +379,18 @@ <0 RK_PA7 RK_FUNC_GPIO &pcfg_pull_up>; }; }; + + spi1 { + spi1_csn0_gpio_pin: spi1-csn0-gpio-pin { + rockchip,pins = + <3 RK_PB1 RK_FUNC_GPIO &pcfg_pull_up_4ma>; + }; + + spi1_csn1_gpio_pin: spi1-csn1-gpio-pin { + rockchip,pins = + <3 RK_PB2 RK_FUNC_GPIO &pcfg_pull_up_4ma>; + }; + }; }; &pmu_io_domains { @@ -396,6 +408,17 @@ vqmmc-supply = <&vccio_sd>; }; +&spi1 { + /* + * Hardware CS has a very slow rise time of about 6us, + * causing transmission errors. + * With cs-gpios we have a rise time of about 20ns. + */ + cs-gpios = <&gpio3 RK_PB1 GPIO_ACTIVE_LOW>, <&gpio3 RK_PB2 GPIO_ACTIVE_LOW>; + pinctrl-names = "default"; + pinctrl-0 = <&spi1_clk &spi1_csn0_gpio_pin &spi1_csn1_gpio_pin &spi1_miso &spi1_mosi>; +}; + &tsadc { status = "okay"; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts b/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts index 3c127c5c2607..a9021c524afb 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts +++ b/arch/arm64/boot/dts/rockchip/rk3566-quartz64-a.dts @@ -30,6 +30,7 @@ fan: gpio_fan { compatible = "gpio-fan"; + fan-supply = <&vcc12v_dcin>; gpios = <&gpio0 RK_PD5 GPIO_ACTIVE_HIGH>; gpio-fan,speed-map = < 0 0>, diff --git a/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts b/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts index 3b31f0dd8f3b..539edc3c535f 100644 --- a/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts +++ b/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts @@ -29,7 +29,6 @@ function-enumerator = <1>; gpios = <&gpio3 RK_PD6 GPIO_ACTIVE_HIGH>; label = "LAN-1"; - linux,default-trigger = "netdev"; }; led-lan2 { @@ -39,7 +38,6 @@ function-enumerator = <2>; gpios = <&gpio3 RK_PD7 GPIO_ACTIVE_HIGH>; label = "LAN-2"; - linux,default-trigger = "netdev"; }; power_led: led-sys { @@ -56,7 +54,6 @@ function = LED_FUNCTION_WAN; gpios = <&gpio2 RK_PC1 GPIO_ACTIVE_HIGH>; label = "WAN"; - linux,default-trigger = "netdev"; }; }; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts b/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts index b09e789c75c4..801b40fea4e8 100644 --- a/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts +++ b/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts @@ -211,10 +211,38 @@ status = "okay"; }; +&cpu_b0 { + cpu-supply = <&vdd_cpu_big_s0>; +}; + +&cpu_b1 { + cpu-supply = <&vdd_cpu_big_s0>; +}; + +&cpu_b2 { + cpu-supply = <&vdd_cpu_big_s0>; +}; + +&cpu_b3 { + cpu-supply = <&vdd_cpu_big_s0>; +}; + &cpu_l0 { cpu-supply = <&vdd_cpu_lit_s0>; }; +&cpu_l1 { + cpu-supply = <&vdd_cpu_lit_s0>; +}; + +&cpu_l2 { + cpu-supply = <&vdd_cpu_lit_s0>; +}; + +&cpu_l3 { + cpu-supply = <&vdd_cpu_lit_s0>; +}; + &gmac0 { phy-mode = "rgmii-id"; clock_in_out = "output"; diff --git a/arch/arm64/boot/dts/rockchip/rk3576.dtsi b/arch/arm64/boot/dts/rockchip/rk3576.dtsi index 1086482f0479..64812e3bcb61 100644 --- a/arch/arm64/boot/dts/rockchip/rk3576.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3576.dtsi @@ -615,7 +615,7 @@ <0 0 0 2 &pcie1_intc 1>, <0 0 0 3 &pcie1_intc 2>, <0 0 0 4 &pcie1_intc 3>; - linux,pci-domain = <0>; + linux,pci-domain = <1>; max-link-speed = <2>; num-ib-windows = <8>; num-viewport = <8>; diff --git a/arch/arm64/boot/dts/rockchip/rk3588-base-pinctrl.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-base-pinctrl.dtsi index 7f874c77410c..6584d73660f6 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-base-pinctrl.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-base-pinctrl.dtsi @@ -578,14 +578,14 @@ hdmim0_tx0_scl: hdmim0-tx0-scl { rockchip,pins = /* hdmim0_tx0_scl */ - <4 RK_PB7 5 &pcfg_pull_none>; + <4 RK_PB7 5 &pcfg_pull_none_drv_level_5_smt>; }; /omit-if-no-ref/ hdmim0_tx0_sda: hdmim0-tx0-sda { rockchip,pins = /* hdmim0_tx0_sda */ - <4 RK_PC0 5 &pcfg_pull_none>; + <4 RK_PC0 5 &pcfg_pull_none_drv_level_1_smt>; }; /omit-if-no-ref/ @@ -640,14 +640,14 @@ hdmim1_tx0_scl: hdmim1-tx0-scl { rockchip,pins = /* hdmim1_tx0_scl */ - <0 RK_PD5 11 &pcfg_pull_none>; + <0 RK_PD5 11 &pcfg_pull_none_drv_level_5_smt>; }; /omit-if-no-ref/ hdmim1_tx0_sda: hdmim1-tx0-sda { rockchip,pins = /* hdmim1_tx0_sda */ - <0 RK_PD4 11 &pcfg_pull_none>; + <0 RK_PD4 11 &pcfg_pull_none_drv_level_1_smt>; }; /omit-if-no-ref/ @@ -668,14 +668,14 @@ hdmim1_tx1_scl: hdmim1-tx1-scl { rockchip,pins = /* hdmim1_tx1_scl */ - <3 RK_PC6 5 &pcfg_pull_none>; + <3 RK_PC6 5 &pcfg_pull_none_drv_level_5_smt>; }; /omit-if-no-ref/ hdmim1_tx1_sda: hdmim1-tx1-sda { rockchip,pins = /* hdmim1_tx1_sda */ - <3 RK_PC5 5 &pcfg_pull_none>; + <3 RK_PC5 5 &pcfg_pull_none_drv_level_1_smt>; }; /omit-if-no-ref/ hdmim2_rx_cec: hdmim2-rx-cec { @@ -709,14 +709,14 @@ hdmim2_tx0_scl: hdmim2-tx0-scl { rockchip,pins = /* hdmim2_tx0_scl */ - <3 RK_PC7 5 &pcfg_pull_none>; + <3 RK_PC7 5 &pcfg_pull_none_drv_level_5_smt>; }; /omit-if-no-ref/ hdmim2_tx0_sda: hdmim2-tx0-sda { rockchip,pins = /* hdmim2_tx0_sda */ - <3 RK_PD0 5 &pcfg_pull_none>; + <3 RK_PD0 5 &pcfg_pull_none_drv_level_1_smt>; }; /omit-if-no-ref/ @@ -730,14 +730,14 @@ hdmim2_tx1_scl: hdmim2-tx1-scl { rockchip,pins = /* hdmim2_tx1_scl */ - <1 RK_PA4 5 &pcfg_pull_none>; + <1 RK_PA4 5 &pcfg_pull_none_drv_level_5_smt>; }; /omit-if-no-ref/ hdmim2_tx1_sda: hdmim2-tx1-sda { rockchip,pins = /* hdmim2_tx1_sda */ - <1 RK_PA3 5 &pcfg_pull_none>; + <1 RK_PA3 5 &pcfg_pull_none_drv_level_1_smt>; }; /omit-if-no-ref/ diff --git a/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi index cc37f082adea..b07543315f87 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5.dtsi @@ -321,6 +321,7 @@ bus-width = <4>; cap-mmc-highspeed; cap-sd-highspeed; + cd-gpios = <&gpio0 RK_PA4 GPIO_ACTIVE_LOW>; disable-wp; max-frequency = <150000000>; no-sdio; diff --git a/arch/arm64/boot/dts/rockchip/rk3588-extra-pinctrl.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-extra-pinctrl.dtsi index 244c66faa161..fb48ddc04bcb 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-extra-pinctrl.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-extra-pinctrl.dtsi @@ -160,14 +160,15 @@ hdmim0_tx1_scl: hdmim0-tx1-scl { rockchip,pins = /* hdmim0_tx1_scl */ - <2 RK_PB5 4 &pcfg_pull_none>; + <2 RK_PB5 4 &pcfg_pull_none_drv_level_3_smt>; }; /omit-if-no-ref/ hdmim0_tx1_sda: hdmim0-tx1-sda { rockchip,pins = /* hdmim0_tx1_sda */ - <2 RK_PB4 4 &pcfg_pull_none>; + <2 RK_PB4 4 &pcfg_pull_none_drv_level_1_smt>; + }; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts b/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts index 8b717c4017a4..b2947b36fada 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts +++ b/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts @@ -474,6 +474,7 @@ bus-width = <4>; cap-mmc-highspeed; cap-sd-highspeed; + cd-gpios = <&gpio0 RK_PA4 GPIO_ACTIVE_LOW>; disable-wp; max-frequency = <150000000>; no-sdio; diff --git a/arch/arm64/boot/dts/rockchip/rockchip-pinconf.dtsi b/arch/arm64/boot/dts/rockchip/rockchip-pinconf.dtsi index 5c645437b507..b0475b7c655a 100644 --- a/arch/arm64/boot/dts/rockchip/rockchip-pinconf.dtsi +++ b/arch/arm64/boot/dts/rockchip/rockchip-pinconf.dtsi @@ -333,6 +333,41 @@ }; /omit-if-no-ref/ + pcfg_pull_none_drv_level_1_smt: pcfg-pull-none-drv-level-1-smt { + bias-disable; + drive-strength = <1>; + input-schmitt-enable; + }; + + /omit-if-no-ref/ + pcfg_pull_none_drv_level_2_smt: pcfg-pull-none-drv-level-2-smt { + bias-disable; + drive-strength = <2>; + input-schmitt-enable; + }; + + /omit-if-no-ref/ + pcfg_pull_none_drv_level_3_smt: pcfg-pull-none-drv-level-3-smt { + bias-disable; + drive-strength = <3>; + input-schmitt-enable; + }; + + /omit-if-no-ref/ + pcfg_pull_none_drv_level_4_smt: pcfg-pull-none-drv-level-4-smt { + bias-disable; + drive-strength = <4>; + input-schmitt-enable; + }; + + /omit-if-no-ref/ + pcfg_pull_none_drv_level_5_smt: pcfg-pull-none-drv-level-5-smt { + bias-disable; + drive-strength = <5>; + input-schmitt-enable; + }; + + /omit-if-no-ref/ pcfg_output_high: pcfg-output-high { output-high; }; diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 897fc686e6a9..03117405b6ce 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -1444,6 +1444,7 @@ CONFIG_PLATFORM_MHU=y CONFIG_BCM2835_MBOX=y CONFIG_QCOM_APCS_IPC=y CONFIG_MTK_ADSP_MBOX=m +CONFIG_QCOM_CPUCP_MBOX=m CONFIG_QCOM_IPCC=y CONFIG_ROCKCHIP_IOMMU=y CONFIG_TEGRA_IOMMU_SMMU=y @@ -1573,6 +1574,7 @@ CONFIG_RESET_QCOM_AOSS=y CONFIG_RESET_QCOM_PDC=m CONFIG_RESET_RZG2L_USBPHY_CTRL=y CONFIG_RESET_TI_SCI=y +CONFIG_PHY_SNPS_EUSB2=m CONFIG_PHY_XGENE=y CONFIG_PHY_CAN_TRANSCEIVER=m CONFIG_PHY_NXP_PTN3222=m @@ -1597,7 +1599,6 @@ CONFIG_PHY_QCOM_EDP=m CONFIG_PHY_QCOM_PCIE2=m CONFIG_PHY_QCOM_QMP=m CONFIG_PHY_QCOM_QUSB2=m -CONFIG_PHY_QCOM_SNPS_EUSB2=m CONFIG_PHY_QCOM_EUSB2_REPEATER=m CONFIG_PHY_QCOM_M31_USB=m CONFIG_PHY_QCOM_USB_HS=m @@ -1743,8 +1744,6 @@ CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_ANSI_CPRNG=y CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_GHASH_ARM64_CE=y -CONFIG_CRYPTO_SHA1_ARM64_CE=y -CONFIG_CRYPTO_SHA512_ARM64_CE=m CONFIG_CRYPTO_SHA3_ARM64=m CONFIG_CRYPTO_SM3_ARM64_CE=m CONFIG_CRYPTO_AES_ARM64_CE_BLK=y diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index c44b0f202a1f..3bb5b513d5ae 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -25,36 +25,6 @@ config CRYPTO_NHPOLY1305_NEON Architecture: arm64 using: - NEON (Advanced SIMD) extensions -config CRYPTO_SHA1_ARM64_CE - tristate "Hash functions: SHA-1 (ARMv8 Crypto Extensions)" - depends on KERNEL_MODE_NEON - select CRYPTO_HASH - select CRYPTO_SHA1 - help - SHA-1 secure hash algorithm (FIPS 180) - - Architecture: arm64 using: - - ARMv8 Crypto Extensions - -config CRYPTO_SHA512_ARM64 - tristate "Hash functions: SHA-384 and SHA-512" - select CRYPTO_HASH - help - SHA-384 and SHA-512 secure hash algorithms (FIPS 180) - - Architecture: arm64 - -config CRYPTO_SHA512_ARM64_CE - tristate "Hash functions: SHA-384 and SHA-512 (ARMv8 Crypto Extensions)" - depends on KERNEL_MODE_NEON - select CRYPTO_HASH - select CRYPTO_SHA512_ARM64 - help - SHA-384 and SHA-512 secure hash algorithms (FIPS 180) - - Architecture: arm64 using: - - ARMv8 Crypto Extensions - config CRYPTO_SHA3_ARM64 tristate "Hash functions: SHA-3 (ARMv8.2 Crypto Extensions)" depends on KERNEL_MODE_NEON diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index c231c980c514..a8b2cdbe202c 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -5,12 +5,6 @@ # Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> # -obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o -sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o - -obj-$(CONFIG_CRYPTO_SHA512_ARM64_CE) += sha512-ce.o -sha512-ce-y := sha512-ce-glue.o sha512-ce-core.o - obj-$(CONFIG_CRYPTO_SHA3_ARM64) += sha3-ce.o sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o @@ -53,9 +47,6 @@ aes-ce-blk-y := aes-glue-ce.o aes-ce.o obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o aes-neon-blk-y := aes-glue-neon.o aes-neon.o -obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o -sha512-arm64-y := sha512-glue.o sha512-core.o - obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o @@ -64,11 +55,3 @@ aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o obj-$(CONFIG_CRYPTO_AES_ARM64_BS) += aes-neon-bs.o aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o - -quiet_cmd_perlasm = PERLASM $@ - cmd_perlasm = $(PERL) $(<) void $(@) - -$(obj)/sha512-core.S: $(src)/../lib/crypto/sha2-armv8.pl - $(call cmd,perlasm) - -clean-files += sha512-core.S diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S deleted file mode 100644 index 9b1f2d82a6fe..000000000000 --- a/arch/arm64/crypto/sha1-ce-core.S +++ /dev/null @@ -1,150 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions - * - * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> - */ - -#include <linux/linkage.h> -#include <asm/assembler.h> - - .text - .arch armv8-a+crypto - - k0 .req v0 - k1 .req v1 - k2 .req v2 - k3 .req v3 - - t0 .req v4 - t1 .req v5 - - dga .req q6 - dgav .req v6 - dgb .req s7 - dgbv .req v7 - - dg0q .req q12 - dg0s .req s12 - dg0v .req v12 - dg1s .req s13 - dg1v .req v13 - dg2s .req s14 - - .macro add_only, op, ev, rc, s0, dg1 - .ifc \ev, ev - add t1.4s, v\s0\().4s, \rc\().4s - sha1h dg2s, dg0s - .ifnb \dg1 - sha1\op dg0q, \dg1, t0.4s - .else - sha1\op dg0q, dg1s, t0.4s - .endif - .else - .ifnb \s0 - add t0.4s, v\s0\().4s, \rc\().4s - .endif - sha1h dg1s, dg0s - sha1\op dg0q, dg2s, t1.4s - .endif - .endm - - .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1 - sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s - add_only \op, \ev, \rc, \s1, \dg1 - sha1su1 v\s0\().4s, v\s3\().4s - .endm - - .macro loadrc, k, val, tmp - movz \tmp, :abs_g0_nc:\val - movk \tmp, :abs_g1:\val - dup \k, \tmp - .endm - - /* - * int __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, - * int blocks) - */ -SYM_FUNC_START(__sha1_ce_transform) - /* load round constants */ - loadrc k0.4s, 0x5a827999, w6 - loadrc k1.4s, 0x6ed9eba1, w6 - loadrc k2.4s, 0x8f1bbcdc, w6 - loadrc k3.4s, 0xca62c1d6, w6 - - /* load state */ - ld1 {dgav.4s}, [x0] - ldr dgb, [x0, #16] - - /* load sha1_ce_state::finalize */ - ldr_l w4, sha1_ce_offsetof_finalize, x4 - ldr w4, [x0, x4] - - /* load input */ -0: ld1 {v8.4s-v11.4s}, [x1], #64 - sub w2, w2, #1 - -CPU_LE( rev32 v8.16b, v8.16b ) -CPU_LE( rev32 v9.16b, v9.16b ) -CPU_LE( rev32 v10.16b, v10.16b ) -CPU_LE( rev32 v11.16b, v11.16b ) - -1: add t0.4s, v8.4s, k0.4s - mov dg0v.16b, dgav.16b - - add_update c, ev, k0, 8, 9, 10, 11, dgb - add_update c, od, k0, 9, 10, 11, 8 - add_update c, ev, k0, 10, 11, 8, 9 - add_update c, od, k0, 11, 8, 9, 10 - add_update c, ev, k1, 8, 9, 10, 11 - - add_update p, od, k1, 9, 10, 11, 8 - add_update p, ev, k1, 10, 11, 8, 9 - add_update p, od, k1, 11, 8, 9, 10 - add_update p, ev, k1, 8, 9, 10, 11 - add_update p, od, k2, 9, 10, 11, 8 - - add_update m, ev, k2, 10, 11, 8, 9 - add_update m, od, k2, 11, 8, 9, 10 - add_update m, ev, k2, 8, 9, 10, 11 - add_update m, od, k2, 9, 10, 11, 8 - add_update m, ev, k3, 10, 11, 8, 9 - - add_update p, od, k3, 11, 8, 9, 10 - add_only p, ev, k3, 9 - add_only p, od, k3, 10 - add_only p, ev, k3, 11 - add_only p, od - - /* update state */ - add dgbv.2s, dgbv.2s, dg1v.2s - add dgav.4s, dgav.4s, dg0v.4s - - cbz w2, 2f - cond_yield 3f, x5, x6 - b 0b - - /* - * Final block: add padding and total bit count. - * Skip if the input size was not a round multiple of the block size, - * the padding is handled by the C code in that case. - */ -2: cbz x4, 3f - ldr_l w4, sha1_ce_offsetof_count, x4 - ldr x4, [x0, x4] - movi v9.2d, #0 - mov x8, #0x80000000 - movi v10.2d, #0 - ror x7, x4, #29 // ror(lsl(x4, 3), 32) - fmov d8, x8 - mov x4, #0 - mov v11.d[0], xzr - mov v11.d[1], x7 - b 1b - - /* store new state */ -3: st1 {dgav.4s}, [x0] - str dgb, [x0, #16] - mov w0, w2 - ret -SYM_FUNC_END(__sha1_ce_transform) diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c deleted file mode 100644 index 65b6980817e5..000000000000 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ /dev/null @@ -1,118 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions - * - * Copyright (C) 2014 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> - */ - -#include <asm/neon.h> -#include <asm/simd.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> -#include <crypto/sha1.h> -#include <crypto/sha1_base.h> -#include <linux/cpufeature.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/string.h> - -MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions"); -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("sha1"); - -struct sha1_ce_state { - struct sha1_state sst; - u32 finalize; -}; - -extern const u32 sha1_ce_offsetof_count; -extern const u32 sha1_ce_offsetof_finalize; - -asmlinkage int __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, - int blocks); - -static void sha1_ce_transform(struct sha1_state *sst, u8 const *src, - int blocks) -{ - while (blocks) { - int rem; - - kernel_neon_begin(); - rem = __sha1_ce_transform(container_of(sst, - struct sha1_ce_state, - sst), src, blocks); - kernel_neon_end(); - src += (blocks - rem) * SHA1_BLOCK_SIZE; - blocks = rem; - } -} - -const u32 sha1_ce_offsetof_count = offsetof(struct sha1_ce_state, sst.count); -const u32 sha1_ce_offsetof_finalize = offsetof(struct sha1_ce_state, finalize); - -static int sha1_ce_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct sha1_ce_state *sctx = shash_desc_ctx(desc); - - sctx->finalize = 0; - return sha1_base_do_update_blocks(desc, data, len, sha1_ce_transform); -} - -static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - struct sha1_ce_state *sctx = shash_desc_ctx(desc); - bool finalized = false; - - /* - * Allow the asm code to perform the finalization if there is no - * partial data and the input is a round multiple of the block size. - */ - if (len >= SHA1_BLOCK_SIZE) { - unsigned int remain = len - round_down(len, SHA1_BLOCK_SIZE); - - finalized = !remain; - sctx->finalize = finalized; - sha1_base_do_update_blocks(desc, data, len, sha1_ce_transform); - data += len - remain; - len = remain; - } - if (!finalized) { - sctx->finalize = 0; - sha1_base_do_finup(desc, data, len, sha1_ce_transform); - } - return sha1_base_finish(desc, out); -} - -static struct shash_alg alg = { - .init = sha1_base_init, - .update = sha1_ce_update, - .finup = sha1_ce_finup, - .descsize = sizeof(struct sha1_ce_state), - .statesize = SHA1_STATE_SIZE, - .digestsize = SHA1_DIGEST_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name = "sha1-ce", - .cra_priority = 200, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init sha1_ce_mod_init(void) -{ - return crypto_register_shash(&alg); -} - -static void __exit sha1_ce_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_cpu_feature_match(SHA1, sha1_ce_mod_init); -module_exit(sha1_ce_mod_fini); diff --git a/arch/arm64/crypto/sha512-ce-core.S b/arch/arm64/crypto/sha512-ce-core.S deleted file mode 100644 index 91ef68b15fcc..000000000000 --- a/arch/arm64/crypto/sha512-ce-core.S +++ /dev/null @@ -1,206 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * sha512-ce-core.S - core SHA-384/SHA-512 transform using v8 Crypto Extensions - * - * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/linkage.h> -#include <asm/assembler.h> - - .irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19 - .set .Lq\b, \b - .set .Lv\b\().2d, \b - .endr - - .macro sha512h, rd, rn, rm - .inst 0xce608000 | .L\rd | (.L\rn << 5) | (.L\rm << 16) - .endm - - .macro sha512h2, rd, rn, rm - .inst 0xce608400 | .L\rd | (.L\rn << 5) | (.L\rm << 16) - .endm - - .macro sha512su0, rd, rn - .inst 0xcec08000 | .L\rd | (.L\rn << 5) - .endm - - .macro sha512su1, rd, rn, rm - .inst 0xce608800 | .L\rd | (.L\rn << 5) | (.L\rm << 16) - .endm - - /* - * The SHA-512 round constants - */ - .section ".rodata", "a" - .align 4 -.Lsha512_rcon: - .quad 0x428a2f98d728ae22, 0x7137449123ef65cd - .quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc - .quad 0x3956c25bf348b538, 0x59f111f1b605d019 - .quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 - .quad 0xd807aa98a3030242, 0x12835b0145706fbe - .quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 - .quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1 - .quad 0x9bdc06a725c71235, 0xc19bf174cf692694 - .quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3 - .quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 - .quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483 - .quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 - .quad 0x983e5152ee66dfab, 0xa831c66d2db43210 - .quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4 - .quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725 - .quad 0x06ca6351e003826f, 0x142929670a0e6e70 - .quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926 - .quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df - .quad 0x650a73548baf63de, 0x766a0abb3c77b2a8 - .quad 0x81c2c92e47edaee6, 0x92722c851482353b - .quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001 - .quad 0xc24b8b70d0f89791, 0xc76c51a30654be30 - .quad 0xd192e819d6ef5218, 0xd69906245565a910 - .quad 0xf40e35855771202a, 0x106aa07032bbd1b8 - .quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53 - .quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 - .quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb - .quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 - .quad 0x748f82ee5defb2fc, 0x78a5636f43172f60 - .quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec - .quad 0x90befffa23631e28, 0xa4506cebde82bde9 - .quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b - .quad 0xca273eceea26619c, 0xd186b8c721c0c207 - .quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 - .quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6 - .quad 0x113f9804bef90dae, 0x1b710b35131c471b - .quad 0x28db77f523047d84, 0x32caab7b40c72493 - .quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c - .quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a - .quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 - - .macro dround, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4 - .ifnb \rc1 - ld1 {v\rc1\().2d}, [x4], #16 - .endif - add v5.2d, v\rc0\().2d, v\in0\().2d - ext v6.16b, v\i2\().16b, v\i3\().16b, #8 - ext v5.16b, v5.16b, v5.16b, #8 - ext v7.16b, v\i1\().16b, v\i2\().16b, #8 - add v\i3\().2d, v\i3\().2d, v5.2d - .ifnb \in1 - ext v5.16b, v\in3\().16b, v\in4\().16b, #8 - sha512su0 v\in0\().2d, v\in1\().2d - .endif - sha512h q\i3, q6, v7.2d - .ifnb \in1 - sha512su1 v\in0\().2d, v\in2\().2d, v5.2d - .endif - add v\i4\().2d, v\i1\().2d, v\i3\().2d - sha512h2 q\i3, q\i1, v\i0\().2d - .endm - - /* - * int __sha512_ce_transform(struct sha512_state *sst, u8 const *src, - * int blocks) - */ - .text -SYM_FUNC_START(__sha512_ce_transform) - /* load state */ - ld1 {v8.2d-v11.2d}, [x0] - - /* load first 4 round constants */ - adr_l x3, .Lsha512_rcon - ld1 {v20.2d-v23.2d}, [x3], #64 - - /* load input */ -0: ld1 {v12.2d-v15.2d}, [x1], #64 - ld1 {v16.2d-v19.2d}, [x1], #64 - sub w2, w2, #1 - -CPU_LE( rev64 v12.16b, v12.16b ) -CPU_LE( rev64 v13.16b, v13.16b ) -CPU_LE( rev64 v14.16b, v14.16b ) -CPU_LE( rev64 v15.16b, v15.16b ) -CPU_LE( rev64 v16.16b, v16.16b ) -CPU_LE( rev64 v17.16b, v17.16b ) -CPU_LE( rev64 v18.16b, v18.16b ) -CPU_LE( rev64 v19.16b, v19.16b ) - - mov x4, x3 // rc pointer - - mov v0.16b, v8.16b - mov v1.16b, v9.16b - mov v2.16b, v10.16b - mov v3.16b, v11.16b - - // v0 ab cd -- ef gh ab - // v1 cd -- ef gh ab cd - // v2 ef gh ab cd -- ef - // v3 gh ab cd -- ef gh - // v4 -- ef gh ab cd -- - - dround 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17 - dround 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18 - dround 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19 - dround 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12 - dround 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13 - - dround 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14 - dround 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15 - dround 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16 - dround 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17 - dround 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18 - - dround 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19 - dround 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12 - dround 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13 - dround 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14 - dround 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15 - - dround 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16 - dround 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17 - dround 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18 - dround 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19 - dround 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12 - - dround 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13 - dround 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14 - dround 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15 - dround 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16 - dround 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17 - - dround 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18 - dround 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19 - dround 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12 - dround 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13 - dround 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14 - - dround 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15 - dround 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16 - dround 2, 3, 1, 4, 0, 28, 24, 12 - dround 4, 2, 0, 1, 3, 29, 25, 13 - dround 1, 4, 3, 0, 2, 30, 26, 14 - - dround 0, 1, 2, 3, 4, 31, 27, 15 - dround 3, 0, 4, 2, 1, 24, , 16 - dround 2, 3, 1, 4, 0, 25, , 17 - dround 4, 2, 0, 1, 3, 26, , 18 - dround 1, 4, 3, 0, 2, 27, , 19 - - /* update state */ - add v8.2d, v8.2d, v0.2d - add v9.2d, v9.2d, v1.2d - add v10.2d, v10.2d, v2.2d - add v11.2d, v11.2d, v3.2d - - cond_yield 3f, x4, x5 - /* handled all input blocks? */ - cbnz w2, 0b - - /* store new state */ -3: st1 {v8.2d-v11.2d}, [x0] - mov w0, w2 - ret -SYM_FUNC_END(__sha512_ce_transform) diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c deleted file mode 100644 index 6fb3001fa2c9..000000000000 --- a/arch/arm64/crypto/sha512-ce-glue.c +++ /dev/null @@ -1,96 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * sha512-ce-glue.c - SHA-384/SHA-512 using ARMv8 Crypto Extensions - * - * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <asm/neon.h> -#include <crypto/internal/hash.h> -#include <crypto/sha2.h> -#include <crypto/sha512_base.h> -#include <linux/cpufeature.h> -#include <linux/kernel.h> -#include <linux/module.h> - -MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash using ARMv8 Crypto Extensions"); -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("sha384"); -MODULE_ALIAS_CRYPTO("sha512"); - -asmlinkage int __sha512_ce_transform(struct sha512_state *sst, u8 const *src, - int blocks); - -static void sha512_ce_transform(struct sha512_state *sst, u8 const *src, - int blocks) -{ - do { - int rem; - - kernel_neon_begin(); - rem = __sha512_ce_transform(sst, src, blocks); - kernel_neon_end(); - src += (blocks - rem) * SHA512_BLOCK_SIZE; - blocks = rem; - } while (blocks); -} - -static int sha512_ce_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha512_base_do_update_blocks(desc, data, len, - sha512_ce_transform); -} - -static int sha512_ce_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - sha512_base_do_finup(desc, data, len, sha512_ce_transform); - return sha512_base_finish(desc, out); -} - -static struct shash_alg algs[] = { { - .init = sha384_base_init, - .update = sha512_ce_update, - .finup = sha512_ce_finup, - .descsize = SHA512_STATE_SIZE, - .digestsize = SHA384_DIGEST_SIZE, - .base.cra_name = "sha384", - .base.cra_driver_name = "sha384-ce", - .base.cra_priority = 200, - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .base.cra_blocksize = SHA512_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -}, { - .init = sha512_base_init, - .update = sha512_ce_update, - .finup = sha512_ce_finup, - .descsize = SHA512_STATE_SIZE, - .digestsize = SHA512_DIGEST_SIZE, - .base.cra_name = "sha512", - .base.cra_driver_name = "sha512-ce", - .base.cra_priority = 200, - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .base.cra_blocksize = SHA512_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -} }; - -static int __init sha512_ce_mod_init(void) -{ - return crypto_register_shashes(algs, ARRAY_SIZE(algs)); -} - -static void __exit sha512_ce_mod_fini(void) -{ - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); -} - -module_cpu_feature_match(SHA512, sha512_ce_mod_init); -module_exit(sha512_ce_mod_fini); diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c deleted file mode 100644 index 15aa9d8b7b2c..000000000000 --- a/arch/arm64/crypto/sha512-glue.c +++ /dev/null @@ -1,83 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Linux/arm64 port of the OpenSSL SHA512 implementation for AArch64 - * - * Copyright (c) 2016 Linaro Ltd. <ard.biesheuvel@linaro.org> - */ - -#include <crypto/internal/hash.h> -#include <crypto/sha2.h> -#include <crypto/sha512_base.h> -#include <linux/kernel.h> -#include <linux/module.h> - -MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash for arm64"); -MODULE_AUTHOR("Andy Polyakov <appro@openssl.org>"); -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("sha384"); -MODULE_ALIAS_CRYPTO("sha512"); - -asmlinkage void sha512_blocks_arch(u64 *digest, const void *data, - unsigned int num_blks); - -static void sha512_arm64_transform(struct sha512_state *sst, u8 const *src, - int blocks) -{ - sha512_blocks_arch(sst->state, src, blocks); -} - -static int sha512_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha512_base_do_update_blocks(desc, data, len, - sha512_arm64_transform); -} - -static int sha512_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - sha512_base_do_finup(desc, data, len, sha512_arm64_transform); - return sha512_base_finish(desc, out); -} - -static struct shash_alg algs[] = { { - .digestsize = SHA512_DIGEST_SIZE, - .init = sha512_base_init, - .update = sha512_update, - .finup = sha512_finup, - .descsize = SHA512_STATE_SIZE, - .base.cra_name = "sha512", - .base.cra_driver_name = "sha512-arm64", - .base.cra_priority = 150, - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .base.cra_blocksize = SHA512_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -}, { - .digestsize = SHA384_DIGEST_SIZE, - .init = sha384_base_init, - .update = sha512_update, - .finup = sha512_finup, - .descsize = SHA512_STATE_SIZE, - .base.cra_name = "sha384", - .base.cra_driver_name = "sha384-arm64", - .base.cra_priority = 150, - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | - CRYPTO_AHASH_ALG_FINUP_MAX, - .base.cra_blocksize = SHA384_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -} }; - -static int __init sha512_mod_init(void) -{ - return crypto_register_shashes(algs, ARRAY_SIZE(algs)); -} - -static void __exit sha512_mod_fini(void) -{ - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); -} - -module_init(sha512_mod_init); -module_exit(sha512_mod_fini); diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h index a407f9cd549e..c07a58b96329 100644 --- a/arch/arm64/include/asm/acpi.h +++ b/arch/arm64/include/asm/acpi.h @@ -150,7 +150,7 @@ acpi_set_mailbox_entry(int cpu, struct acpi_madt_generic_interrupt *processor) {} #endif -static inline const char *acpi_get_enable_method(int cpu) +static __always_inline const char *acpi_get_enable_method(int cpu) { if (acpi_psci_present()) return "psci"; diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index ad63457a05c5..c56c21bb1eec 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -41,6 +41,11 @@ /* * Save/restore interrupts. */ + .macro save_and_disable_daif, flags + mrs \flags, daif + msr daifset, #0xf + .endm + .macro save_and_disable_irq, flags mrs \flags, daif msr daifset, #3 diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index ba5df0df02a4..9f38340d24c2 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -287,17 +287,6 @@ .Lskip_fgt2_\@: .endm -.macro __init_el2_gcs - mrs_s x1, SYS_ID_AA64PFR1_EL1 - ubfx x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4 - cbz x1, .Lskip_gcs_\@ - - /* Ensure GCS is not enabled when we start trying to do BLs */ - msr_s SYS_GCSCR_EL1, xzr - msr_s SYS_GCSCRE0_EL1, xzr -.Lskip_gcs_\@: -.endm - /** * Initialize EL2 registers to sane values. This should be called early on all * cores that were booted in EL2. Note that everything gets initialised as @@ -319,7 +308,6 @@ __init_el2_cptr __init_el2_fgt __init_el2_fgt2 - __init_el2_gcs .endm #ifndef __KVM_NVHE_HYPERVISOR__ @@ -371,6 +359,13 @@ msr_s SYS_MPAMHCR_EL2, xzr // clear TRAP_MPAMIDR_EL1 -> EL2 .Lskip_mpam_\@: + check_override id_aa64pfr1, ID_AA64PFR1_EL1_GCS_SHIFT, .Linit_gcs_\@, .Lskip_gcs_\@, x1, x2 + +.Linit_gcs_\@: + msr_s SYS_GCSCR_EL1, xzr + msr_s SYS_GCSCRE0_EL1, xzr + +.Lskip_gcs_\@: check_override id_aa64pfr0, ID_AA64PFR0_EL1_SVE_SHIFT, .Linit_sve_\@, .Lskip_sve_\@, x1, x2 .Linit_sve_\@: /* SVE register access */ diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index bd020fc28aa9..0720898f563e 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -561,68 +561,6 @@ static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu) vcpu_set_flag((v), e); \ } while (0) -#define __build_check_all_or_none(r, bits) \ - BUILD_BUG_ON(((r) & (bits)) && ((r) & (bits)) != (bits)) - -#define __cpacr_to_cptr_clr(clr, set) \ - ({ \ - u64 cptr = 0; \ - \ - if ((set) & CPACR_EL1_FPEN) \ - cptr |= CPTR_EL2_TFP; \ - if ((set) & CPACR_EL1_ZEN) \ - cptr |= CPTR_EL2_TZ; \ - if ((set) & CPACR_EL1_SMEN) \ - cptr |= CPTR_EL2_TSM; \ - if ((clr) & CPACR_EL1_TTA) \ - cptr |= CPTR_EL2_TTA; \ - if ((clr) & CPTR_EL2_TAM) \ - cptr |= CPTR_EL2_TAM; \ - if ((clr) & CPTR_EL2_TCPAC) \ - cptr |= CPTR_EL2_TCPAC; \ - \ - cptr; \ - }) - -#define __cpacr_to_cptr_set(clr, set) \ - ({ \ - u64 cptr = 0; \ - \ - if ((clr) & CPACR_EL1_FPEN) \ - cptr |= CPTR_EL2_TFP; \ - if ((clr) & CPACR_EL1_ZEN) \ - cptr |= CPTR_EL2_TZ; \ - if ((clr) & CPACR_EL1_SMEN) \ - cptr |= CPTR_EL2_TSM; \ - if ((set) & CPACR_EL1_TTA) \ - cptr |= CPTR_EL2_TTA; \ - if ((set) & CPTR_EL2_TAM) \ - cptr |= CPTR_EL2_TAM; \ - if ((set) & CPTR_EL2_TCPAC) \ - cptr |= CPTR_EL2_TCPAC; \ - \ - cptr; \ - }) - -#define cpacr_clear_set(clr, set) \ - do { \ - BUILD_BUG_ON((set) & CPTR_VHE_EL2_RES0); \ - BUILD_BUG_ON((clr) & CPACR_EL1_E0POE); \ - __build_check_all_or_none((clr), CPACR_EL1_FPEN); \ - __build_check_all_or_none((set), CPACR_EL1_FPEN); \ - __build_check_all_or_none((clr), CPACR_EL1_ZEN); \ - __build_check_all_or_none((set), CPACR_EL1_ZEN); \ - __build_check_all_or_none((clr), CPACR_EL1_SMEN); \ - __build_check_all_or_none((set), CPACR_EL1_SMEN); \ - \ - if (has_vhe() || has_hvhe()) \ - sysreg_clear_set(cpacr_el1, clr, set); \ - else \ - sysreg_clear_set(cptr_el2, \ - __cpacr_to_cptr_clr(clr, set), \ - __cpacr_to_cptr_set(clr, set));\ - } while (0) - /* * Returns a 'sanitised' view of CPTR_EL2, translating from nVHE to the VHE * format if E2H isn't set. diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 5ccca509dff1..3e41a880b062 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1289,9 +1289,8 @@ void kvm_arm_resume_guest(struct kvm *kvm); }) /* - * The couple of isb() below are there to guarantee the same behaviour - * on VHE as on !VHE, where the eret to EL1 acts as a context - * synchronization event. + * The isb() below is there to guarantee the same behaviour on VHE as on !VHE, + * where the eret to EL1 acts as a context synchronization event. */ #define kvm_call_hyp(f, ...) \ do { \ @@ -1309,7 +1308,6 @@ void kvm_arm_resume_guest(struct kvm *kvm); \ if (has_vhe()) { \ ret = f(__VA_ARGS__); \ - isb(); \ } else { \ ret = kvm_call_hyp_nvhe(f, ##__VA_ARGS__); \ } \ @@ -1482,7 +1480,6 @@ int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm, struct reg_mask_range *range); /* Guest/host FPSIMD coordination helpers */ -int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 2920b0a51403..a2faf0049dab 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -34,7 +34,7 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ syscall.o proton-pack.o idle.o patching.o pi/ \ - rsi.o + rsi.o jump_label.o obj-$(CONFIG_COMPAT) += sys32.o signal32.o \ sys_compat.o @@ -47,7 +47,6 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_CPU_PM) += sleep.o suspend.o -obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_EFI) += efi.o efi-rt-wrapper.o obj-$(CONFIG_PCI) += pci.o diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index b34044e20128..e151585c6cca 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -3135,6 +3135,13 @@ static bool has_sve_feature(const struct arm64_cpu_capabilities *cap, int scope) } #endif +#ifdef CONFIG_ARM64_SME +static bool has_sme_feature(const struct arm64_cpu_capabilities *cap, int scope) +{ + return system_supports_sme() && has_user_cpuid_feature(cap, scope); +} +#endif + static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ISAR0_EL1, AES, PMULL, CAP_HWCAP, KERNEL_HWCAP_PMULL), HWCAP_CAP(ID_AA64ISAR0_EL1, AES, AES, CAP_HWCAP, KERNEL_HWCAP_AES), @@ -3223,31 +3230,31 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ISAR2_EL1, BC, IMP, CAP_HWCAP, KERNEL_HWCAP_HBC), #ifdef CONFIG_ARM64_SME HWCAP_CAP(ID_AA64PFR1_EL1, SME, IMP, CAP_HWCAP, KERNEL_HWCAP_SME), - HWCAP_CAP(ID_AA64SMFR0_EL1, FA64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_FA64), - HWCAP_CAP(ID_AA64SMFR0_EL1, LUTv2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_LUTV2), - HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2p2, CAP_HWCAP, KERNEL_HWCAP_SME2P2), - HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2p1, CAP_HWCAP, KERNEL_HWCAP_SME2P1), - HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2, CAP_HWCAP, KERNEL_HWCAP_SME2), - HWCAP_CAP(ID_AA64SMFR0_EL1, I16I64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64), - HWCAP_CAP(ID_AA64SMFR0_EL1, F64F64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F64F64), - HWCAP_CAP(ID_AA64SMFR0_EL1, I16I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I32), - HWCAP_CAP(ID_AA64SMFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16B16), - HWCAP_CAP(ID_AA64SMFR0_EL1, F16F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F16), - HWCAP_CAP(ID_AA64SMFR0_EL1, F8F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F16), - HWCAP_CAP(ID_AA64SMFR0_EL1, F8F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F32), - HWCAP_CAP(ID_AA64SMFR0_EL1, I8I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32), - HWCAP_CAP(ID_AA64SMFR0_EL1, F16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32), - HWCAP_CAP(ID_AA64SMFR0_EL1, B16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32), - HWCAP_CAP(ID_AA64SMFR0_EL1, BI32I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_BI32I32), - HWCAP_CAP(ID_AA64SMFR0_EL1, F32F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32), - HWCAP_CAP(ID_AA64SMFR0_EL1, SF8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8FMA), - HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP4), - HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP2), - HWCAP_CAP(ID_AA64SMFR0_EL1, SBitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SBITPERM), - HWCAP_CAP(ID_AA64SMFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_AES), - HWCAP_CAP(ID_AA64SMFR0_EL1, SFEXPA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SFEXPA), - HWCAP_CAP(ID_AA64SMFR0_EL1, STMOP, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_STMOP), - HWCAP_CAP(ID_AA64SMFR0_EL1, SMOP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SMOP4), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, FA64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_FA64), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, LUTv2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_LUTV2), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMEver, SME2p2, CAP_HWCAP, KERNEL_HWCAP_SME2P2), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMEver, SME2p1, CAP_HWCAP, KERNEL_HWCAP_SME2P1), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMEver, SME2, CAP_HWCAP, KERNEL_HWCAP_SME2), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, I16I64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F64F64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F64F64), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, I16I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16B16), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F16F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F16), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F8F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F16), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F8F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, I8I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, B16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, BI32I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_BI32I32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, F32F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SF8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8FMA), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SF8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP4), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SF8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP2), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SBitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SBITPERM), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_AES), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SFEXPA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SFEXPA), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, STMOP, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_STMOP), + HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMOP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SMOP4), #endif /* CONFIG_ARM64_SME */ HWCAP_CAP(ID_AA64FPFR0_EL1, F8CVT, IMP, CAP_HWCAP, KERNEL_HWCAP_F8CVT), HWCAP_CAP(ID_AA64FPFR0_EL1, F8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_F8FMA), diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 3857fd7ee8d4..62230d6dd919 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -15,6 +15,7 @@ #include <asm/efi.h> #include <asm/stacktrace.h> +#include <asm/vmap_stack.h> static bool region_is_misaligned(const efi_memory_desc_t *md) { @@ -214,9 +215,13 @@ static int __init arm64_efi_rt_init(void) if (!efi_enabled(EFI_RUNTIME_SERVICES)) return 0; - p = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, GFP_KERNEL, - NUMA_NO_NODE, &&l); -l: if (!p) { + if (!IS_ENABLED(CONFIG_VMAP_STACK)) { + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); + return -ENOMEM; + } + + p = arch_alloc_vmap_stack(THREAD_SIZE, NUMA_NO_NODE); + if (!p) { pr_warn("Failed to allocate EFI runtime stack\n"); clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return -ENOMEM; diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 5ae2a34b50bd..fe62218b8e0e 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -614,7 +614,7 @@ SYM_CODE_END(ret_to_kernel) SYM_CODE_START_LOCAL(ret_to_user) ldr x19, [tsk, #TSK_TI_FLAGS] // re-check for single-step enable_step_tsk x19, x2 -#ifdef CONFIG_GCC_PLUGIN_STACKLEAK +#ifdef CONFIG_KSTACK_ERASE bl stackleak_erase_on_task_stack #endif kernel_exit 0 @@ -825,6 +825,7 @@ SYM_CODE_END(__bp_harden_el1_vectors) * */ SYM_FUNC_START(cpu_switch_to) + save_and_disable_daif x11 mov x10, #THREAD_CPU_CONTEXT add x8, x0, x10 mov x9, sp @@ -848,6 +849,7 @@ SYM_FUNC_START(cpu_switch_to) ptrauth_keys_install_kernel x1, x8, x9, x10 scs_save x0 scs_load_current + restore_irq x11 ret SYM_FUNC_END(cpu_switch_to) NOKPROBE(cpu_switch_to) @@ -874,6 +876,7 @@ NOKPROBE(ret_from_fork) * Calls func(regs) using this CPU's irq stack and shadow irq stack. */ SYM_FUNC_START(call_on_irq_stack) + save_and_disable_daif x9 #ifdef CONFIG_SHADOW_CALL_STACK get_current_task x16 scs_save x16 @@ -888,8 +891,10 @@ SYM_FUNC_START(call_on_irq_stack) /* Move to the new stack and call the function there */ add sp, x16, #IRQ_STACK_SIZE + restore_irq x9 blr x1 + save_and_disable_daif x9 /* * Restore the SP from the FP, and restore the FP and LR from the frame * record. @@ -897,6 +902,7 @@ SYM_FUNC_START(call_on_irq_stack) mov sp, x29 ldp x29, x30, [sp], #16 scs_load_current + restore_irq x9 ret SYM_FUNC_END(call_on_irq_stack) NOKPROBE(call_on_irq_stack) diff --git a/arch/arm64/kernel/pi/Makefile b/arch/arm64/kernel/pi/Makefile index 4d11a8c29181..f440bf57b1a5 100644 --- a/arch/arm64/kernel/pi/Makefile +++ b/arch/arm64/kernel/pi/Makefile @@ -2,7 +2,7 @@ # Copyright 2022 Google LLC KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \ - -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_STACKLEAK_PLUGIN) \ + -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_KSTACK_ERASE) \ $(DISABLE_LATENT_ENTROPY_PLUGIN) \ $(call cc-option,-mbranch-protection=none) \ -I$(srctree)/scripts/dtc/libfdt -fno-stack-protector \ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 5954cec19660..08b7042a2e2d 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -673,6 +673,11 @@ static void permission_overlay_switch(struct task_struct *next) current->thread.por_el0 = read_sysreg_s(SYS_POR_EL0); if (current->thread.por_el0 != next->thread.por_el0) { write_sysreg_s(next->thread.por_el0, SYS_POR_EL0); + /* + * No ISB required as we can tolerate spurious Overlay faults - + * the fault handler will check again based on the new value + * of POR_EL0. + */ } } diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index ee94b72bf8fb..4b001121c72d 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1586,7 +1586,7 @@ enum aarch64_regset { static const struct user_regset aarch64_regsets[] = { [REGSET_GPR] = { - .core_note_type = NT_PRSTATUS, + USER_REGSET_NOTE_TYPE(PRSTATUS), .n = sizeof(struct user_pt_regs) / sizeof(u64), .size = sizeof(u64), .align = sizeof(u64), @@ -1594,7 +1594,7 @@ static const struct user_regset aarch64_regsets[] = { .set = gpr_set }, [REGSET_FPR] = { - .core_note_type = NT_PRFPREG, + USER_REGSET_NOTE_TYPE(PRFPREG), .n = sizeof(struct user_fpsimd_state) / sizeof(u32), /* * We pretend we have 32-bit registers because the fpsr and @@ -1607,7 +1607,7 @@ static const struct user_regset aarch64_regsets[] = { .set = fpr_set }, [REGSET_TLS] = { - .core_note_type = NT_ARM_TLS, + USER_REGSET_NOTE_TYPE(ARM_TLS), .n = 2, .size = sizeof(void *), .align = sizeof(void *), @@ -1616,7 +1616,7 @@ static const struct user_regset aarch64_regsets[] = { }, #ifdef CONFIG_HAVE_HW_BREAKPOINT [REGSET_HW_BREAK] = { - .core_note_type = NT_ARM_HW_BREAK, + USER_REGSET_NOTE_TYPE(ARM_HW_BREAK), .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), @@ -1624,7 +1624,7 @@ static const struct user_regset aarch64_regsets[] = { .set = hw_break_set, }, [REGSET_HW_WATCH] = { - .core_note_type = NT_ARM_HW_WATCH, + USER_REGSET_NOTE_TYPE(ARM_HW_WATCH), .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), @@ -1633,7 +1633,7 @@ static const struct user_regset aarch64_regsets[] = { }, #endif [REGSET_SYSTEM_CALL] = { - .core_note_type = NT_ARM_SYSTEM_CALL, + USER_REGSET_NOTE_TYPE(ARM_SYSTEM_CALL), .n = 1, .size = sizeof(int), .align = sizeof(int), @@ -1641,7 +1641,7 @@ static const struct user_regset aarch64_regsets[] = { .set = system_call_set, }, [REGSET_FPMR] = { - .core_note_type = NT_ARM_FPMR, + USER_REGSET_NOTE_TYPE(ARM_FPMR), .n = 1, .size = sizeof(u64), .align = sizeof(u64), @@ -1650,7 +1650,7 @@ static const struct user_regset aarch64_regsets[] = { }, #ifdef CONFIG_ARM64_SVE [REGSET_SVE] = { /* Scalable Vector Extension */ - .core_note_type = NT_ARM_SVE, + USER_REGSET_NOTE_TYPE(ARM_SVE), .n = DIV_ROUND_UP(SVE_PT_SIZE(ARCH_SVE_VQ_MAX, SVE_PT_REGS_SVE), SVE_VQ_BYTES), @@ -1662,7 +1662,7 @@ static const struct user_regset aarch64_regsets[] = { #endif #ifdef CONFIG_ARM64_SME [REGSET_SSVE] = { /* Streaming mode SVE */ - .core_note_type = NT_ARM_SSVE, + USER_REGSET_NOTE_TYPE(ARM_SSVE), .n = DIV_ROUND_UP(SVE_PT_SIZE(SME_VQ_MAX, SVE_PT_REGS_SVE), SVE_VQ_BYTES), .size = SVE_VQ_BYTES, @@ -1671,7 +1671,7 @@ static const struct user_regset aarch64_regsets[] = { .set = ssve_set, }, [REGSET_ZA] = { /* SME ZA */ - .core_note_type = NT_ARM_ZA, + USER_REGSET_NOTE_TYPE(ARM_ZA), /* * ZA is a single register but it's variably sized and * the ptrace core requires that the size of any data @@ -1687,7 +1687,7 @@ static const struct user_regset aarch64_regsets[] = { .set = za_set, }, [REGSET_ZT] = { /* SME ZT */ - .core_note_type = NT_ARM_ZT, + USER_REGSET_NOTE_TYPE(ARM_ZT), .n = 1, .size = ZT_SIG_REG_BYTES, .align = sizeof(u64), @@ -1697,7 +1697,7 @@ static const struct user_regset aarch64_regsets[] = { #endif #ifdef CONFIG_ARM64_PTR_AUTH [REGSET_PAC_MASK] = { - .core_note_type = NT_ARM_PAC_MASK, + USER_REGSET_NOTE_TYPE(ARM_PAC_MASK), .n = sizeof(struct user_pac_mask) / sizeof(u64), .size = sizeof(u64), .align = sizeof(u64), @@ -1705,7 +1705,7 @@ static const struct user_regset aarch64_regsets[] = { /* this cannot be set dynamically */ }, [REGSET_PAC_ENABLED_KEYS] = { - .core_note_type = NT_ARM_PAC_ENABLED_KEYS, + USER_REGSET_NOTE_TYPE(ARM_PAC_ENABLED_KEYS), .n = 1, .size = sizeof(long), .align = sizeof(long), @@ -1714,7 +1714,7 @@ static const struct user_regset aarch64_regsets[] = { }, #ifdef CONFIG_CHECKPOINT_RESTORE [REGSET_PACA_KEYS] = { - .core_note_type = NT_ARM_PACA_KEYS, + USER_REGSET_NOTE_TYPE(ARM_PACA_KEYS), .n = sizeof(struct user_pac_address_keys) / sizeof(__uint128_t), .size = sizeof(__uint128_t), .align = sizeof(__uint128_t), @@ -1722,7 +1722,7 @@ static const struct user_regset aarch64_regsets[] = { .set = pac_address_keys_set, }, [REGSET_PACG_KEYS] = { - .core_note_type = NT_ARM_PACG_KEYS, + USER_REGSET_NOTE_TYPE(ARM_PACG_KEYS), .n = sizeof(struct user_pac_generic_keys) / sizeof(__uint128_t), .size = sizeof(__uint128_t), .align = sizeof(__uint128_t), @@ -1733,7 +1733,7 @@ static const struct user_regset aarch64_regsets[] = { #endif #ifdef CONFIG_ARM64_TAGGED_ADDR_ABI [REGSET_TAGGED_ADDR_CTRL] = { - .core_note_type = NT_ARM_TAGGED_ADDR_CTRL, + USER_REGSET_NOTE_TYPE(ARM_TAGGED_ADDR_CTRL), .n = 1, .size = sizeof(long), .align = sizeof(long), @@ -1743,7 +1743,7 @@ static const struct user_regset aarch64_regsets[] = { #endif #ifdef CONFIG_ARM64_POE [REGSET_POE] = { - .core_note_type = NT_ARM_POE, + USER_REGSET_NOTE_TYPE(ARM_POE), .n = 1, .size = sizeof(long), .align = sizeof(long), @@ -1753,7 +1753,7 @@ static const struct user_regset aarch64_regsets[] = { #endif #ifdef CONFIG_ARM64_GCS [REGSET_GCS] = { - .core_note_type = NT_ARM_GCS, + USER_REGSET_NOTE_TYPE(ARM_GCS), .n = sizeof(struct user_gcs) / sizeof(u64), .size = sizeof(u64), .align = sizeof(u64), @@ -1943,7 +1943,7 @@ static int compat_tls_set(struct task_struct *target, static const struct user_regset aarch32_regsets[] = { [REGSET_COMPAT_GPR] = { - .core_note_type = NT_PRSTATUS, + USER_REGSET_NOTE_TYPE(PRSTATUS), .n = COMPAT_ELF_NGREG, .size = sizeof(compat_elf_greg_t), .align = sizeof(compat_elf_greg_t), @@ -1951,7 +1951,7 @@ static const struct user_regset aarch32_regsets[] = { .set = compat_gpr_set }, [REGSET_COMPAT_VFP] = { - .core_note_type = NT_ARM_VFP, + USER_REGSET_NOTE_TYPE(ARM_VFP), .n = VFP_STATE_SIZE / sizeof(compat_ulong_t), .size = sizeof(compat_ulong_t), .align = sizeof(compat_ulong_t), @@ -1968,7 +1968,7 @@ static const struct user_regset_view user_aarch32_view = { static const struct user_regset aarch32_ptrace_regsets[] = { [REGSET_GPR] = { - .core_note_type = NT_PRSTATUS, + USER_REGSET_NOTE_TYPE(PRSTATUS), .n = COMPAT_ELF_NGREG, .size = sizeof(compat_elf_greg_t), .align = sizeof(compat_elf_greg_t), @@ -1976,7 +1976,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { .set = compat_gpr_set }, [REGSET_FPR] = { - .core_note_type = NT_ARM_VFP, + USER_REGSET_NOTE_TYPE(ARM_VFP), .n = VFP_STATE_SIZE / sizeof(compat_ulong_t), .size = sizeof(compat_ulong_t), .align = sizeof(compat_ulong_t), @@ -1984,7 +1984,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { .set = compat_vfp_set }, [REGSET_TLS] = { - .core_note_type = NT_ARM_TLS, + USER_REGSET_NOTE_TYPE(ARM_TLS), .n = 1, .size = sizeof(compat_ulong_t), .align = sizeof(compat_ulong_t), @@ -1993,7 +1993,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { }, #ifdef CONFIG_HAVE_HW_BREAKPOINT [REGSET_HW_BREAK] = { - .core_note_type = NT_ARM_HW_BREAK, + USER_REGSET_NOTE_TYPE(ARM_HW_BREAK), .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), @@ -2001,7 +2001,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { .set = hw_break_set, }, [REGSET_HW_WATCH] = { - .core_note_type = NT_ARM_HW_WATCH, + USER_REGSET_NOTE_TYPE(ARM_HW_WATCH), .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), @@ -2010,7 +2010,7 @@ static const struct user_regset aarch32_ptrace_regsets[] = { }, #endif [REGSET_SYSTEM_CALL] = { - .core_note_type = NT_ARM_SYSTEM_CALL, + USER_REGSET_NOTE_TYPE(ARM_SYSTEM_CALL), .n = 1, .size = sizeof(int), .align = sizeof(int), diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 3b3f6b56e733..21a795303568 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -1143,7 +1143,7 @@ static inline unsigned int num_other_online_cpus(void) void smp_send_stop(void) { static unsigned long stop_in_progress; - cpumask_t mask; + static cpumask_t mask; unsigned long timeout; /* diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index 5e27e46aa496..7dec05dd33b7 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -36,7 +36,8 @@ ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO # -Wmissing-prototypes and -Wmissing-declarations are removed from # the CFLAGS to make possible to build the kernel with CONFIG_WERROR enabled. CC_FLAGS_REMOVE_VDSO := $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) \ - $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) \ + $(RANDSTRUCT_CFLAGS) $(KSTACK_ERASE_CFLAGS) \ + $(GCC_PLUGINS_CFLAGS) \ $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) \ -Wmissing-prototypes -Wmissing-declarations diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index de2b4e9c9f9f..23dd3f3fc3eb 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -825,10 +825,6 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) if (!kvm_arm_vcpu_is_finalized(vcpu)) return -EPERM; - ret = kvm_arch_vcpu_run_map_fp(vcpu); - if (ret) - return ret; - if (likely(vcpu_has_run_once(vcpu))) return 0; @@ -2129,7 +2125,7 @@ static void cpu_hyp_init(void *discard) static void cpu_hyp_uninit(void *discard) { - if (__this_cpu_read(kvm_hyp_initialized)) { + if (!is_protected_kvm_enabled() && __this_cpu_read(kvm_hyp_initialized)) { cpu_hyp_reset(); __this_cpu_write(kvm_hyp_initialized, 0); } @@ -2345,8 +2341,13 @@ static void __init teardown_hyp_mode(void) free_hyp_pgds(); for_each_possible_cpu(cpu) { + if (per_cpu(kvm_hyp_initialized, cpu)) + continue; + free_pages(per_cpu(kvm_arm_hyp_stack_base, cpu), NVHE_STACK_SHIFT - PAGE_SHIFT); - free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order()); + + if (!kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu]) + continue; if (free_sve) { struct cpu_sve_state *sve_state; @@ -2354,6 +2355,9 @@ static void __init teardown_hyp_mode(void) sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state; free_pages((unsigned long) sve_state, pkvm_host_sve_state_order()); } + + free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order()); + } } @@ -2764,7 +2768,8 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, struct kvm_kernel_irq_routing_entry *new) { - if (new->type != KVM_IRQ_ROUTING_MSI) + if (old->type != KVM_IRQ_ROUTING_MSI || + new->type != KVM_IRQ_ROUTING_MSI) return true; return memcmp(&old->msi, &new->msi, sizeof(new->msi)); diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 8f6c8f57c6b9..15e17aca1dec 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -15,32 +15,6 @@ #include <asm/sysreg.h> /* - * Called on entry to KVM_RUN unless this vcpu previously ran at least - * once and the most recent prior KVM_RUN for this vcpu was called from - * the same task as current (highly likely). - * - * This is guaranteed to execute before kvm_arch_vcpu_load_fp(vcpu), - * such that on entering hyp the relevant parts of current are already - * mapped. - */ -int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu) -{ - struct user_fpsimd_state *fpsimd = ¤t->thread.uw.fpsimd_state; - int ret; - - /* pKVM has its own tracking of the host fpsimd state. */ - if (is_protected_kvm_enabled()) - return 0; - - /* Make sure the host task fpsimd state is visible to hyp: */ - ret = kvm_share_hyp(fpsimd, fpsimd + 1); - if (ret) - return ret; - - return 0; -} - -/* * Prepare vcpu for saving the host's FPSIMD state and loading the guest's. * The actual loading is done by the FPSIMD access trap taken to hyp. * diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 76dfda116e56..2ad57b117385 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -65,6 +65,136 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) } } +static inline void __activate_cptr_traps_nvhe(struct kvm_vcpu *vcpu) +{ + u64 val = CPTR_NVHE_EL2_RES1 | CPTR_EL2_TAM | CPTR_EL2_TTA; + + /* + * Always trap SME since it's not supported in KVM. + * TSM is RES1 if SME isn't implemented. + */ + val |= CPTR_EL2_TSM; + + if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs()) + val |= CPTR_EL2_TZ; + + if (!guest_owns_fp_regs()) + val |= CPTR_EL2_TFP; + + write_sysreg(val, cptr_el2); +} + +static inline void __activate_cptr_traps_vhe(struct kvm_vcpu *vcpu) +{ + /* + * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to + * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2, + * except for some missing controls, such as TAM. + * In this case, CPTR_EL2.TAM has the same position with or without + * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM + * shift value for trapping the AMU accesses. + */ + u64 val = CPTR_EL2_TAM | CPACR_EL1_TTA; + u64 cptr; + + if (guest_owns_fp_regs()) { + val |= CPACR_EL1_FPEN; + if (vcpu_has_sve(vcpu)) + val |= CPACR_EL1_ZEN; + } + + if (!vcpu_has_nv(vcpu)) + goto write; + + /* + * The architecture is a bit crap (what a surprise): an EL2 guest + * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA, + * as they are RES0 in the guest's view. To work around it, trap the + * sucker using the very same bit it can't set... + */ + if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu)) + val |= CPTR_EL2_TCPAC; + + /* + * Layer the guest hypervisor's trap configuration on top of our own if + * we're in a nested context. + */ + if (is_hyp_ctxt(vcpu)) + goto write; + + cptr = vcpu_sanitised_cptr_el2(vcpu); + + /* + * Pay attention, there's some interesting detail here. + * + * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two + * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest): + * + * - CPTR_EL2.xEN = x0, traps are enabled + * - CPTR_EL2.xEN = x1, traps are disabled + * + * In other words, bit[0] determines if guest accesses trap or not. In + * the interest of simplicity, clear the entire field if the guest + * hypervisor has traps enabled to dispel any illusion of something more + * complicated taking place. + */ + if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0))) + val &= ~CPACR_EL1_FPEN; + if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0))) + val &= ~CPACR_EL1_ZEN; + + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) + val |= cptr & CPACR_EL1_E0POE; + + val |= cptr & CPTR_EL2_TCPAC; + +write: + write_sysreg(val, cpacr_el1); +} + +static inline void __activate_cptr_traps(struct kvm_vcpu *vcpu) +{ + if (!guest_owns_fp_regs()) + __activate_traps_fpsimd32(vcpu); + + if (has_vhe() || has_hvhe()) + __activate_cptr_traps_vhe(vcpu); + else + __activate_cptr_traps_nvhe(vcpu); +} + +static inline void __deactivate_cptr_traps_nvhe(struct kvm_vcpu *vcpu) +{ + u64 val = CPTR_NVHE_EL2_RES1; + + if (!cpus_have_final_cap(ARM64_SVE)) + val |= CPTR_EL2_TZ; + if (!cpus_have_final_cap(ARM64_SME)) + val |= CPTR_EL2_TSM; + + write_sysreg(val, cptr_el2); +} + +static inline void __deactivate_cptr_traps_vhe(struct kvm_vcpu *vcpu) +{ + u64 val = CPACR_EL1_FPEN; + + if (cpus_have_final_cap(ARM64_SVE)) + val |= CPACR_EL1_ZEN; + if (cpus_have_final_cap(ARM64_SME)) + val |= CPACR_EL1_SMEN; + + write_sysreg(val, cpacr_el1); +} + +static inline void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) +{ + if (has_vhe() || has_hvhe()) + __deactivate_cptr_traps_vhe(vcpu); + else + __deactivate_cptr_traps_nvhe(vcpu); +} + #define reg_to_fgt_masks(reg) \ ({ \ struct fgt_masks *m; \ @@ -486,11 +616,6 @@ static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) */ if (system_supports_sve()) { __hyp_sve_save_host(); - - /* Re-enable SVE traps if not supported for the guest vcpu. */ - if (!vcpu_has_sve(vcpu)) - cpacr_clear_set(CPACR_EL1_ZEN, 0); - } else { __fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs)); } @@ -541,10 +666,7 @@ static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) /* Valid trap. Switch the context: */ /* First disable enough traps to allow us to update the registers */ - if (sve_guest || (is_protected_kvm_enabled() && system_supports_sve())) - cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN); - else - cpacr_clear_set(0, CPACR_EL1_FPEN); + __deactivate_cptr_traps(vcpu); isb(); /* Write out the host state if it's in the registers */ @@ -566,6 +688,13 @@ static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) *host_data_ptr(fp_owner) = FP_STATE_GUEST_OWNED; + /* + * Re-enable traps necessary for the current state of the guest, e.g. + * those enabled by a guest hypervisor. The ERET to the guest will + * provide the necessary context synchronization. + */ + __activate_cptr_traps(vcpu); + return true; } diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index a76522d63c3e..0b0a68b663d4 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -12,7 +12,7 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS -D__DISABLE_TRACE_MMIO__ ccflags-y += -fno-stack-protector \ -DDISABLE_BRANCH_PROFILING \ - $(DISABLE_STACKLEAK_PLUGIN) + $(DISABLE_KSTACK_ERASE) hostprogs := gen-hyprel HOST_EXTRACFLAGS += -I$(objtree)/include diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index e9198e56e784..3206b2c07f82 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -69,7 +69,10 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) if (!guest_owns_fp_regs()) return; - cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN); + /* + * Traps have been disabled by __deactivate_cptr_traps(), but there + * hasn't necessarily been a context synchronization event yet. + */ isb(); if (vcpu_has_sve(vcpu)) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 95d7534c9679..8957734d6183 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -479,6 +479,7 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) { struct kvm_mem_range cur; kvm_pte_t pte; + u64 granule; s8 level; int ret; @@ -496,18 +497,21 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) return -EPERM; } - do { - u64 granule = kvm_granule_size(level); + for (; level <= KVM_PGTABLE_LAST_LEVEL; level++) { + if (!kvm_level_supports_block_mapping(level)) + continue; + granule = kvm_granule_size(level); cur.start = ALIGN_DOWN(addr, granule); cur.end = cur.start + granule; - level++; - } while ((level <= KVM_PGTABLE_LAST_LEVEL) && - !(kvm_level_supports_block_mapping(level) && - range_included(&cur, range))); + if (!range_included(&cur, range)) + continue; + *range = cur; + return 0; + } - *range = cur; + WARN_ON(1); - return 0; + return -EINVAL; } int host_stage2_idmap_locked(phys_addr_t addr, u64 size, diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 73affe1333a4..0e752b515d0f 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -47,65 +47,6 @@ struct fgt_masks hdfgwtr2_masks; extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); -static void __activate_cptr_traps(struct kvm_vcpu *vcpu) -{ - u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */ - - if (!guest_owns_fp_regs()) - __activate_traps_fpsimd32(vcpu); - - if (has_hvhe()) { - val |= CPACR_EL1_TTA; - - if (guest_owns_fp_regs()) { - val |= CPACR_EL1_FPEN; - if (vcpu_has_sve(vcpu)) - val |= CPACR_EL1_ZEN; - } - - write_sysreg(val, cpacr_el1); - } else { - val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1; - - /* - * Always trap SME since it's not supported in KVM. - * TSM is RES1 if SME isn't implemented. - */ - val |= CPTR_EL2_TSM; - - if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs()) - val |= CPTR_EL2_TZ; - - if (!guest_owns_fp_regs()) - val |= CPTR_EL2_TFP; - - write_sysreg(val, cptr_el2); - } -} - -static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) -{ - if (has_hvhe()) { - u64 val = CPACR_EL1_FPEN; - - if (cpus_have_final_cap(ARM64_SVE)) - val |= CPACR_EL1_ZEN; - if (cpus_have_final_cap(ARM64_SME)) - val |= CPACR_EL1_SMEN; - - write_sysreg(val, cpacr_el1); - } else { - u64 val = CPTR_NVHE_EL2_RES1; - - if (!cpus_have_final_cap(ARM64_SVE)) - val |= CPTR_EL2_TZ; - if (!cpus_have_final_cap(ARM64_SME)) - val |= CPTR_EL2_TSM; - - write_sysreg(val, cptr_el2); - } -} - static void __activate_traps(struct kvm_vcpu *vcpu) { ___activate_traps(vcpu, vcpu->arch.hcr_el2); diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 09df2b42bc1b..477f1580ffea 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -90,87 +90,6 @@ static u64 __compute_hcr(struct kvm_vcpu *vcpu) return hcr | (guest_hcr & ~NV_HCR_GUEST_EXCLUDE); } -static void __activate_cptr_traps(struct kvm_vcpu *vcpu) -{ - u64 cptr; - - /* - * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to - * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2, - * except for some missing controls, such as TAM. - * In this case, CPTR_EL2.TAM has the same position with or without - * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM - * shift value for trapping the AMU accesses. - */ - u64 val = CPACR_EL1_TTA | CPTR_EL2_TAM; - - if (guest_owns_fp_regs()) { - val |= CPACR_EL1_FPEN; - if (vcpu_has_sve(vcpu)) - val |= CPACR_EL1_ZEN; - } else { - __activate_traps_fpsimd32(vcpu); - } - - if (!vcpu_has_nv(vcpu)) - goto write; - - /* - * The architecture is a bit crap (what a surprise): an EL2 guest - * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA, - * as they are RES0 in the guest's view. To work around it, trap the - * sucker using the very same bit it can't set... - */ - if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu)) - val |= CPTR_EL2_TCPAC; - - /* - * Layer the guest hypervisor's trap configuration on top of our own if - * we're in a nested context. - */ - if (is_hyp_ctxt(vcpu)) - goto write; - - cptr = vcpu_sanitised_cptr_el2(vcpu); - - /* - * Pay attention, there's some interesting detail here. - * - * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two - * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest): - * - * - CPTR_EL2.xEN = x0, traps are enabled - * - CPTR_EL2.xEN = x1, traps are disabled - * - * In other words, bit[0] determines if guest accesses trap or not. In - * the interest of simplicity, clear the entire field if the guest - * hypervisor has traps enabled to dispel any illusion of something more - * complicated taking place. - */ - if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0))) - val &= ~CPACR_EL1_FPEN; - if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0))) - val &= ~CPACR_EL1_ZEN; - - if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) - val |= cptr & CPACR_EL1_E0POE; - - val |= cptr & CPTR_EL2_TCPAC; - -write: - write_sysreg(val, cpacr_el1); -} - -static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) -{ - u64 val = CPACR_EL1_FPEN | CPACR_EL1_ZEN_EL1EN; - - if (cpus_have_final_cap(ARM64_SME)) - val |= CPACR_EL1_SMEN_EL1EN; - - write_sysreg(val, cpacr_el1); -} - static void __activate_traps(struct kvm_vcpu *vcpu) { u64 val; @@ -639,10 +558,10 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) host_ctxt = host_data_ptr(host_ctxt); guest_ctxt = &vcpu->arch.ctxt; - sysreg_save_host_state_vhe(host_ctxt); - fpsimd_lazy_switch_to_guest(vcpu); + sysreg_save_host_state_vhe(host_ctxt); + /* * Note that ARM erratum 1165522 requires us to configure both stage 1 * and stage 2 translation for the guest context before we clear @@ -667,15 +586,23 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) __deactivate_traps(vcpu); - fpsimd_lazy_switch_to_host(vcpu); - sysreg_restore_host_state_vhe(host_ctxt); + __debug_switch_to_host(vcpu); + + /* + * Ensure that all system register writes above have taken effect + * before returning to the host. In VHE mode, CPTR traps for + * FPSIMD/SVE/SME also apply to EL2, so FPSIMD/SVE/SME state must be + * manipulated after the ISB. + */ + isb(); + + fpsimd_lazy_switch_to_host(vcpu); + if (guest_owns_fp_regs()) __fpsimd_save_fpexc32(vcpu); - __debug_switch_to_host(vcpu); - return exit_code; } NOKPROBE_SYMBOL(__kvm_vcpu_run_vhe); @@ -705,12 +632,6 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) */ local_daif_restore(DAIF_PROCCTX_NOIRQ); - /* - * When we exit from the guest we change a number of CPU configuration - * parameters, such as traps. We rely on the isb() in kvm_call_hyp*() - * to make sure these changes take effect before running the host or - * additional guests. - */ return ret; } diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 5b191f4dc566..dc1d26559bfa 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1402,6 +1402,21 @@ static void kvm_map_l1_vncr(struct kvm_vcpu *vcpu) } } +#define has_tgran_2(__r, __sz) \ + ({ \ + u64 _s1, _s2, _mmfr0 = __r; \ + \ + _s2 = SYS_FIELD_GET(ID_AA64MMFR0_EL1, \ + TGRAN##__sz##_2, _mmfr0); \ + \ + _s1 = SYS_FIELD_GET(ID_AA64MMFR0_EL1, \ + TGRAN##__sz, _mmfr0); \ + \ + ((_s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_NI && \ + _s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz) || \ + (_s2 == ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz && \ + _s1 != ID_AA64MMFR0_EL1_TGRAN##__sz##_NI)); \ + }) /* * Our emulated CPU doesn't support all the possible features. For the * sake of simplicity (and probably mental sanity), wipe out a number @@ -1411,6 +1426,8 @@ static void kvm_map_l1_vncr(struct kvm_vcpu *vcpu) */ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) { + u64 orig_val = val; + switch (reg) { case SYS_ID_AA64ISAR0_EL1: /* Support everything but TME */ @@ -1480,13 +1497,16 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) */ switch (PAGE_SIZE) { case SZ_4K: - val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, IMP); + if (has_tgran_2(orig_val, 4)) + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, IMP); fallthrough; case SZ_16K: - val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, IMP); + if (has_tgran_2(orig_val, 16)) + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, IMP); fallthrough; case SZ_64K: - val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN64_2, IMP); + if (has_tgran_2(orig_val, 64)) + val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN64_2, IMP); break; } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 76c2f0da821f..c20bd6f21e60 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2624,7 +2624,7 @@ static bool access_mdcr(struct kvm_vcpu *vcpu, */ if (hpmn > vcpu->kvm->arch.nr_pmu_counters) { hpmn = vcpu->kvm->arch.nr_pmu_counters; - u64_replace_bits(val, hpmn, MDCR_EL2_HPMN); + u64p_replace_bits(&val, hpmn, MDCR_EL2_HPMN); } __vcpu_assign_sys_reg(vcpu, MDCR_EL2, val); diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c index d22a8ad7bcc5..679aafe77de2 100644 --- a/arch/arm64/kvm/vgic/vgic-v3-nested.c +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -36,6 +36,11 @@ struct shadow_if { static DEFINE_PER_CPU(struct shadow_if, shadow_if); +static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx) +{ + return hweight16(shadow_if->lr_map & (BIT(idx) - 1)); +} + /* * Nesting GICv3 support * @@ -209,6 +214,29 @@ u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu) return reg; } +static u64 translate_lr_pintid(struct kvm_vcpu *vcpu, u64 lr) +{ + struct vgic_irq *irq; + + if (!(lr & ICH_LR_HW)) + return lr; + + /* We have the HW bit set, check for validity of pINTID */ + irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); + /* If there was no real mapping, nuke the HW bit */ + if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI) + lr &= ~ICH_LR_HW; + + /* Translate the virtual mapping to the real one, even if invalid */ + if (irq) { + lr &= ~ICH_LR_PHYS_ID_MASK; + lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid); + vgic_put_irq(vcpu->kvm, irq); + } + + return lr; +} + /* * For LRs which have HW bit set such as timer interrupts, we modify them to * have the host hardware interrupt number instead of the virtual one programmed @@ -217,58 +245,37 @@ u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu) static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu, struct vgic_v3_cpu_if *s_cpu_if) { - unsigned long lr_map = 0; - int index = 0; + struct shadow_if *shadow_if; + + shadow_if = container_of(s_cpu_if, struct shadow_if, cpuif); + shadow_if->lr_map = 0; for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) { u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); - struct vgic_irq *irq; if (!(lr & ICH_LR_STATE)) - lr = 0; - - if (!(lr & ICH_LR_HW)) - goto next; - - /* We have the HW bit set, check for validity of pINTID */ - irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); - if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI ) { - /* There was no real mapping, so nuke the HW bit */ - lr &= ~ICH_LR_HW; - if (irq) - vgic_put_irq(vcpu->kvm, irq); - goto next; - } - - /* Translate the virtual mapping to the real one */ - lr &= ~ICH_LR_PHYS_ID_MASK; - lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid); + continue; - vgic_put_irq(vcpu->kvm, irq); + lr = translate_lr_pintid(vcpu, lr); -next: - s_cpu_if->vgic_lr[index] = lr; - if (lr) { - lr_map |= BIT(i); - index++; - } + s_cpu_if->vgic_lr[hweight16(shadow_if->lr_map)] = lr; + shadow_if->lr_map |= BIT(i); } - container_of(s_cpu_if, struct shadow_if, cpuif)->lr_map = lr_map; - s_cpu_if->used_lrs = index; + s_cpu_if->used_lrs = hweight16(shadow_if->lr_map); } void vgic_v3_sync_nested(struct kvm_vcpu *vcpu) { struct shadow_if *shadow_if = get_shadow_if(); - int i, index = 0; + int i; for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) { u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); struct vgic_irq *irq; if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE)) - goto next; + continue; /* * If we had a HW lr programmed by the guest hypervisor, we @@ -277,15 +284,13 @@ void vgic_v3_sync_nested(struct kvm_vcpu *vcpu) */ irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */ - goto next; + continue; - lr = __gic_v3_get_lr(index); + lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i)); if (!(lr & ICH_LR_STATE)) irq->active = false; vgic_put_irq(vcpu->kvm, irq); - next: - index++; } } @@ -368,13 +373,11 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu) val = __vcpu_sys_reg(vcpu, ICH_LRN(i)); val &= ~ICH_LR_STATE; - val |= s_cpu_if->vgic_lr[i] & ICH_LR_STATE; + val |= s_cpu_if->vgic_lr[lr_map_idx_to_shadow_idx(shadow_if, i)] & ICH_LR_STATE; __vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val); - s_cpu_if->vgic_lr[i] = 0; } - shadow_if->lr_map = 0; vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0; } @@ -398,9 +401,7 @@ void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu) { bool level; - level = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_En; - if (level) - level &= vgic_v3_get_misr(vcpu); + level = (__vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_En) && vgic_v3_get_misr(vcpu); kvm_vgic_inject_irq(vcpu->kvm, vcpu, vcpu->kvm->arch.vgic.mi_intid, level, vcpu); } diff --git a/arch/arm64/lib/.gitignore b/arch/arm64/lib/.gitignore new file mode 100644 index 000000000000..647d7a922e68 --- /dev/null +++ b/arch/arm64/lib/.gitignore @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only + +# This now-removed directory used to contain generated files. +/crypto/ diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 027bfa9689c6..633e5223d944 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -1,7 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 - -obj-y += crypto/ - lib-y := clear_user.o delay.o copy_from_user.o \ copy_to_user.o copy_page.o \ clear_page.o csum.o insn.o memchr.o memcpy.o \ @@ -16,12 +13,6 @@ endif lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o -obj-$(CONFIG_CRC32_ARCH) += crc32-arm64.o -crc32-arm64-y := crc32.o crc32-core.o - -obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-arm64.o -crc-t10dif-arm64-y := crc-t10dif.o crc-t10dif-core.o - obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o obj-$(CONFIG_ARM64_MTE) += mte.o diff --git a/arch/arm64/lib/crc-t10dif-core.S b/arch/arm64/lib/crc-t10dif-core.S deleted file mode 100644 index 87dd6d46224d..000000000000 --- a/arch/arm64/lib/crc-t10dif-core.S +++ /dev/null @@ -1,469 +0,0 @@ -// -// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions -// -// Copyright (C) 2016 Linaro Ltd -// Copyright (C) 2019-2024 Google LLC -// -// Authors: Ard Biesheuvel <ardb@google.com> -// Eric Biggers <ebiggers@google.com> -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License version 2 as -// published by the Free Software Foundation. -// - -// Derived from the x86 version: -// -// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions -// -// Copyright (c) 2013, Intel Corporation -// -// Authors: -// Erdinc Ozturk <erdinc.ozturk@intel.com> -// Vinodh Gopal <vinodh.gopal@intel.com> -// James Guilford <james.guilford@intel.com> -// Tim Chen <tim.c.chen@linux.intel.com> -// -// This software is available to you under a choice of one of two -// licenses. You may choose to be licensed under the terms of the GNU -// General Public License (GPL) Version 2, available from the file -// COPYING in the main directory of this source tree, or the -// OpenIB.org BSD license below: -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// * Neither the name of the Intel Corporation nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// -// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Reference paper titled "Fast CRC Computation for Generic -// Polynomials Using PCLMULQDQ Instruction" -// URL: http://www.intel.com/content/dam/www/public/us/en/documents -// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf -// - -#include <linux/linkage.h> -#include <asm/assembler.h> - - .text - .arch armv8-a+crypto - - init_crc .req w0 - buf .req x1 - len .req x2 - fold_consts_ptr .req x5 - - fold_consts .req v10 - - t3 .req v17 - t4 .req v18 - t5 .req v19 - t6 .req v20 - t7 .req v21 - t8 .req v22 - - perm .req v27 - - .macro pmull16x64_p64, a16, b64, c64 - pmull2 \c64\().1q, \a16\().2d, \b64\().2d - pmull \b64\().1q, \a16\().1d, \b64\().1d - .endm - - /* - * Pairwise long polynomial multiplication of two 16-bit values - * - * { w0, w1 }, { y0, y1 } - * - * by two 64-bit values - * - * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 } - * - * where each vector element is a byte, ordered from least to most - * significant. - * - * This can be implemented using 8x8 long polynomial multiplication, by - * reorganizing the input so that each pairwise 8x8 multiplication - * produces one of the terms from the decomposition below, and - * combining the results of each rank and shifting them into place. - * - * Rank - * 0 w0*x0 ^ | y0*z0 ^ - * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^ - * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^ - * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^ - * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^ - * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^ - * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^ - * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^ - * 8 w1*x7 << 64 | y1*z7 << 64 - * - * The inputs can be reorganized into - * - * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 } - * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 } - * - * and after performing 8x8->16 bit long polynomial multiplication of - * each of the halves of the first vector with those of the second one, - * we obtain the following four vectors of 16-bit elements: - * - * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 } - * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 } - * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 } - * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 } - * - * Results b and c can be XORed together, as the vector elements have - * matching ranks. Then, the final XOR (*) can be pulled forward, and - * applied between the halves of each of the remaining three vectors, - * which are then shifted into place, and combined to produce two - * 80-bit results. - * - * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent - * to the 64x64 bit one above, but XOR'ing the outputs together will - * produce the expected result, and this is sufficient in the context of - * this algorithm. - */ - .macro pmull16x64_p8, a16, b64, c64 - ext t7.16b, \b64\().16b, \b64\().16b, #1 - tbl t5.16b, {\a16\().16b}, perm.16b - uzp1 t7.16b, \b64\().16b, t7.16b - bl __pmull_p8_16x64 - ext \b64\().16b, t4.16b, t4.16b, #15 - eor \c64\().16b, t8.16b, t5.16b - .endm - -SYM_FUNC_START_LOCAL(__pmull_p8_16x64) - ext t6.16b, t5.16b, t5.16b, #8 - - pmull t3.8h, t7.8b, t5.8b - pmull t4.8h, t7.8b, t6.8b - pmull2 t5.8h, t7.16b, t5.16b - pmull2 t6.8h, t7.16b, t6.16b - - ext t8.16b, t3.16b, t3.16b, #8 - eor t4.16b, t4.16b, t6.16b - ext t7.16b, t5.16b, t5.16b, #8 - ext t6.16b, t4.16b, t4.16b, #8 - eor t8.8b, t8.8b, t3.8b - eor t5.8b, t5.8b, t7.8b - eor t4.8b, t4.8b, t6.8b - ext t5.16b, t5.16b, t5.16b, #14 - ret -SYM_FUNC_END(__pmull_p8_16x64) - - - // Fold reg1, reg2 into the next 32 data bytes, storing the result back - // into reg1, reg2. - .macro fold_32_bytes, p, reg1, reg2 - ldp q11, q12, [buf], #0x20 - - pmull16x64_\p fold_consts, \reg1, v8 - -CPU_LE( rev64 v11.16b, v11.16b ) -CPU_LE( rev64 v12.16b, v12.16b ) - - pmull16x64_\p fold_consts, \reg2, v9 - -CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 ) -CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) - - eor \reg1\().16b, \reg1\().16b, v8.16b - eor \reg2\().16b, \reg2\().16b, v9.16b - eor \reg1\().16b, \reg1\().16b, v11.16b - eor \reg2\().16b, \reg2\().16b, v12.16b - .endm - - // Fold src_reg into dst_reg, optionally loading the next fold constants - .macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts - pmull16x64_\p fold_consts, \src_reg, v8 - .ifnb \load_next_consts - ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - .endif - eor \dst_reg\().16b, \dst_reg\().16b, v8.16b - eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b - .endm - - .macro crc_t10dif_pmull, p - - // For sizes less than 256 bytes, we can't fold 128 bytes at a time. - cmp len, #256 - b.lt .Lless_than_256_bytes_\@ - - adr_l fold_consts_ptr, .Lfold_across_128_bytes_consts - - // Load the first 128 data bytes. Byte swapping is necessary to make - // the bit order match the polynomial coefficient order. - ldp q0, q1, [buf] - ldp q2, q3, [buf, #0x20] - ldp q4, q5, [buf, #0x40] - ldp q6, q7, [buf, #0x60] - add buf, buf, #0x80 -CPU_LE( rev64 v0.16b, v0.16b ) -CPU_LE( rev64 v1.16b, v1.16b ) -CPU_LE( rev64 v2.16b, v2.16b ) -CPU_LE( rev64 v3.16b, v3.16b ) -CPU_LE( rev64 v4.16b, v4.16b ) -CPU_LE( rev64 v5.16b, v5.16b ) -CPU_LE( rev64 v6.16b, v6.16b ) -CPU_LE( rev64 v7.16b, v7.16b ) -CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) -CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 ) -CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 ) -CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 ) -CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 ) -CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 ) -CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 ) -CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) - - // XOR the first 16 data *bits* with the initial CRC value. - movi v8.16b, #0 - mov v8.h[7], init_crc - eor v0.16b, v0.16b, v8.16b - - // Load the constants for folding across 128 bytes. - ld1 {fold_consts.2d}, [fold_consts_ptr] - - // Subtract 128 for the 128 data bytes just consumed. Subtract another - // 128 to simplify the termination condition of the following loop. - sub len, len, #256 - - // While >= 128 data bytes remain (not counting v0-v7), fold the 128 - // bytes v0-v7 into them, storing the result back into v0-v7. -.Lfold_128_bytes_loop_\@: - fold_32_bytes \p, v0, v1 - fold_32_bytes \p, v2, v3 - fold_32_bytes \p, v4, v5 - fold_32_bytes \p, v6, v7 - - subs len, len, #128 - b.ge .Lfold_128_bytes_loop_\@ - - // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7. - - // Fold across 64 bytes. - add fold_consts_ptr, fold_consts_ptr, #16 - ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - fold_16_bytes \p, v0, v4 - fold_16_bytes \p, v1, v5 - fold_16_bytes \p, v2, v6 - fold_16_bytes \p, v3, v7, 1 - // Fold across 32 bytes. - fold_16_bytes \p, v4, v6 - fold_16_bytes \p, v5, v7, 1 - // Fold across 16 bytes. - fold_16_bytes \p, v6, v7 - - // Add 128 to get the correct number of data bytes remaining in 0...127 - // (not counting v7), following the previous extra subtraction by 128. - // Then subtract 16 to simplify the termination condition of the - // following loop. - adds len, len, #(128-16) - - // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7 - // into them, storing the result back into v7. - b.lt .Lfold_16_bytes_loop_done_\@ -.Lfold_16_bytes_loop_\@: - pmull16x64_\p fold_consts, v7, v8 - eor v7.16b, v7.16b, v8.16b - ldr q0, [buf], #16 -CPU_LE( rev64 v0.16b, v0.16b ) -CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) - eor v7.16b, v7.16b, v0.16b - subs len, len, #16 - b.ge .Lfold_16_bytes_loop_\@ - -.Lfold_16_bytes_loop_done_\@: - // Add 16 to get the correct number of data bytes remaining in 0...15 - // (not counting v7), following the previous extra subtraction by 16. - adds len, len, #16 - b.eq .Lreduce_final_16_bytes_\@ - -.Lhandle_partial_segment_\@: - // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first - // 16 bytes are in v7 and the rest are the remaining data in 'buf'. To - // do this without needing a fold constant for each possible 'len', - // redivide the bytes into a first chunk of 'len' bytes and a second - // chunk of 16 bytes, then fold the first chunk into the second. - - // v0 = last 16 original data bytes - add buf, buf, len - ldr q0, [buf, #-16] -CPU_LE( rev64 v0.16b, v0.16b ) -CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) - - // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes. - adr_l x4, .Lbyteshift_table + 16 - sub x4, x4, len - ld1 {v2.16b}, [x4] - tbl v1.16b, {v7.16b}, v2.16b - - // v3 = first chunk: v7 right-shifted by '16-len' bytes. - movi v3.16b, #0x80 - eor v2.16b, v2.16b, v3.16b - tbl v3.16b, {v7.16b}, v2.16b - - // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes. - sshr v2.16b, v2.16b, #7 - - // v2 = second chunk: 'len' bytes from v0 (low-order bytes), - // then '16-len' bytes from v1 (high-order bytes). - bsl v2.16b, v1.16b, v0.16b - - // Fold the first chunk into the second chunk, storing the result in v7. - pmull16x64_\p fold_consts, v3, v0 - eor v7.16b, v3.16b, v0.16b - eor v7.16b, v7.16b, v2.16b - b .Lreduce_final_16_bytes_\@ - -.Lless_than_256_bytes_\@: - // Checksumming a buffer of length 16...255 bytes - - adr_l fold_consts_ptr, .Lfold_across_16_bytes_consts - - // Load the first 16 data bytes. - ldr q7, [buf], #0x10 -CPU_LE( rev64 v7.16b, v7.16b ) -CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) - - // XOR the first 16 data *bits* with the initial CRC value. - movi v0.16b, #0 - mov v0.h[7], init_crc - eor v7.16b, v7.16b, v0.16b - - // Load the fold-across-16-bytes constants. - ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - - cmp len, #16 - b.eq .Lreduce_final_16_bytes_\@ // len == 16 - subs len, len, #32 - b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255 - add len, len, #16 - b .Lhandle_partial_segment_\@ // 17 <= len <= 31 - -.Lreduce_final_16_bytes_\@: - .endm - -// -// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len); -// -// Assumes len >= 16. -// -SYM_FUNC_START(crc_t10dif_pmull_p8) - frame_push 1 - - // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 } - movi perm.4h, #8, lsl #8 - orr perm.2s, #1, lsl #16 - orr perm.2s, #1, lsl #24 - zip1 perm.16b, perm.16b, perm.16b - zip1 perm.16b, perm.16b, perm.16b - - crc_t10dif_pmull p8 - -CPU_LE( rev64 v7.16b, v7.16b ) -CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) - str q7, [x3] - - frame_pop - ret -SYM_FUNC_END(crc_t10dif_pmull_p8) - - .align 5 -// -// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len); -// -// Assumes len >= 16. -// -SYM_FUNC_START(crc_t10dif_pmull_p64) - crc_t10dif_pmull p64 - - // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC. - - movi v2.16b, #0 // init zero register - - // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. - ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - - // Fold the high 64 bits into the low 64 bits, while also multiplying by - // x^64. This produces a 128-bit value congruent to x^64 * M(x) and - // whose low 48 bits are 0. - ext v0.16b, v2.16b, v7.16b, #8 - pmull2 v7.1q, v7.2d, fold_consts.2d // high bits * x^48 * (x^80 mod G(x)) - eor v0.16b, v0.16b, v7.16b // + low bits * x^64 - - // Fold the high 32 bits into the low 96 bits. This produces a 96-bit - // value congruent to x^64 * M(x) and whose low 48 bits are 0. - ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits - mov v0.s[3], v2.s[0] // zero high 32 bits - pmull v1.1q, v1.1d, fold_consts.1d // high 32 bits * x^48 * (x^48 mod G(x)) - eor v0.16b, v0.16b, v1.16b // + low bits - - // Load G(x) and floor(x^48 / G(x)). - ld1 {fold_consts.2d}, [fold_consts_ptr] - - // Use Barrett reduction to compute the final CRC value. - pmull2 v1.1q, v0.2d, fold_consts.2d // high 32 bits * floor(x^48 / G(x)) - ushr v1.2d, v1.2d, #32 // /= x^32 - pmull v1.1q, v1.1d, fold_consts.1d // *= G(x) - ushr v0.2d, v0.2d, #48 - eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits - // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0. - - umov w0, v0.h[0] - ret -SYM_FUNC_END(crc_t10dif_pmull_p64) - - .section ".rodata", "a" - .align 4 - -// Fold constants precomputed from the polynomial 0x18bb7 -// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 -.Lfold_across_128_bytes_consts: - .quad 0x0000000000006123 // x^(8*128) mod G(x) - .quad 0x0000000000002295 // x^(8*128+64) mod G(x) -// .Lfold_across_64_bytes_consts: - .quad 0x0000000000001069 // x^(4*128) mod G(x) - .quad 0x000000000000dd31 // x^(4*128+64) mod G(x) -// .Lfold_across_32_bytes_consts: - .quad 0x000000000000857d // x^(2*128) mod G(x) - .quad 0x0000000000007acc // x^(2*128+64) mod G(x) -.Lfold_across_16_bytes_consts: - .quad 0x000000000000a010 // x^(1*128) mod G(x) - .quad 0x0000000000001faa // x^(1*128+64) mod G(x) -// .Lfinal_fold_consts: - .quad 0x1368000000000000 // x^48 * (x^48 mod G(x)) - .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x)) -// .Lbarrett_reduction_consts: - .quad 0x0000000000018bb7 // G(x) - .quad 0x00000001f65a57f8 // floor(x^48 / G(x)) - -// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - -// len] is the index vector to shift left by 'len' bytes, and is also {0x80, -// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes. -.Lbyteshift_table: - .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87 - .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f - .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 - .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0 diff --git a/arch/arm64/lib/crc-t10dif.c b/arch/arm64/lib/crc-t10dif.c deleted file mode 100644 index c2ffe4fdb59d..000000000000 --- a/arch/arm64/lib/crc-t10dif.c +++ /dev/null @@ -1,73 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions - * - * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> - */ - -#include <linux/cpufeature.h> -#include <linux/crc-t10dif.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/string.h> - -#include <crypto/internal/simd.h> - -#include <asm/neon.h> -#include <asm/simd.h> - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_asimd); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull); - -#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U - -asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len, - u8 out[16]); -asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len); - -u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) -{ - if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) { - if (static_branch_likely(&have_pmull)) { - if (crypto_simd_usable()) { - kernel_neon_begin(); - crc = crc_t10dif_pmull_p64(crc, data, length); - kernel_neon_end(); - return crc; - } - } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && - static_branch_likely(&have_asimd) && - crypto_simd_usable()) { - u8 buf[16]; - - kernel_neon_begin(); - crc_t10dif_pmull_p8(crc, data, length, buf); - kernel_neon_end(); - - return crc_t10dif_generic(0, buf, sizeof(buf)); - } - } - return crc_t10dif_generic(crc, data, length); -} -EXPORT_SYMBOL(crc_t10dif_arch); - -static int __init crc_t10dif_arm64_init(void) -{ - if (cpu_have_named_feature(ASIMD)) { - static_branch_enable(&have_asimd); - if (cpu_have_named_feature(PMULL)) - static_branch_enable(&have_pmull); - } - return 0; -} -subsys_initcall(crc_t10dif_arm64_init); - -static void __exit crc_t10dif_arm64_exit(void) -{ -} -module_exit(crc_t10dif_arm64_exit); - -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_DESCRIPTION("CRC-T10DIF using arm64 NEON and Crypto Extensions"); -MODULE_LICENSE("GPL v2"); diff --git a/arch/arm64/lib/crc32-core.S b/arch/arm64/lib/crc32-core.S deleted file mode 100644 index 68825317460f..000000000000 --- a/arch/arm64/lib/crc32-core.S +++ /dev/null @@ -1,362 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Accelerated CRC32(C) using AArch64 CRC and PMULL instructions - * - * Copyright (C) 2016 - 2018 Linaro Ltd. - * Copyright (C) 2024 Google LLC - * - * Author: Ard Biesheuvel <ardb@kernel.org> - */ - -#include <linux/linkage.h> -#include <asm/assembler.h> - - .cpu generic+crc+crypto - - .macro bitle, reg - .endm - - .macro bitbe, reg - rbit \reg, \reg - .endm - - .macro bytele, reg - .endm - - .macro bytebe, reg - rbit \reg, \reg - lsr \reg, \reg, #24 - .endm - - .macro hwordle, reg -CPU_BE( rev16 \reg, \reg ) - .endm - - .macro hwordbe, reg -CPU_LE( rev \reg, \reg ) - rbit \reg, \reg -CPU_BE( lsr \reg, \reg, #16 ) - .endm - - .macro le, regs:vararg - .irp r, \regs -CPU_BE( rev \r, \r ) - .endr - .endm - - .macro be, regs:vararg - .irp r, \regs -CPU_LE( rev \r, \r ) - .endr - .irp r, \regs - rbit \r, \r - .endr - .endm - - .macro __crc32, c, order=le - bit\order w0 - cmp x2, #16 - b.lt 8f // less than 16 bytes - - and x7, x2, #0x1f - and x2, x2, #~0x1f - cbz x7, 32f // multiple of 32 bytes - - and x8, x7, #0xf - ldp x3, x4, [x1] - add x8, x8, x1 - add x1, x1, x7 - ldp x5, x6, [x8] - \order x3, x4, x5, x6 - - tst x7, #8 - crc32\c\()x w8, w0, x3 - csel x3, x3, x4, eq - csel w0, w0, w8, eq - tst x7, #4 - lsr x4, x3, #32 - crc32\c\()w w8, w0, w3 - csel x3, x3, x4, eq - csel w0, w0, w8, eq - tst x7, #2 - lsr w4, w3, #16 - crc32\c\()h w8, w0, w3 - csel w3, w3, w4, eq - csel w0, w0, w8, eq - tst x7, #1 - crc32\c\()b w8, w0, w3 - csel w0, w0, w8, eq - tst x7, #16 - crc32\c\()x w8, w0, x5 - crc32\c\()x w8, w8, x6 - csel w0, w0, w8, eq - cbz x2, 0f - -32: ldp x3, x4, [x1], #32 - sub x2, x2, #32 - ldp x5, x6, [x1, #-16] - \order x3, x4, x5, x6 - crc32\c\()x w0, w0, x3 - crc32\c\()x w0, w0, x4 - crc32\c\()x w0, w0, x5 - crc32\c\()x w0, w0, x6 - cbnz x2, 32b -0: bit\order w0 - ret - -8: tbz x2, #3, 4f - ldr x3, [x1], #8 - \order x3 - crc32\c\()x w0, w0, x3 -4: tbz x2, #2, 2f - ldr w3, [x1], #4 - \order w3 - crc32\c\()w w0, w0, w3 -2: tbz x2, #1, 1f - ldrh w3, [x1], #2 - hword\order w3 - crc32\c\()h w0, w0, w3 -1: tbz x2, #0, 0f - ldrb w3, [x1] - byte\order w3 - crc32\c\()b w0, w0, w3 -0: bit\order w0 - ret - .endm - - .align 5 -SYM_FUNC_START(crc32_le_arm64) - __crc32 -SYM_FUNC_END(crc32_le_arm64) - - .align 5 -SYM_FUNC_START(crc32c_le_arm64) - __crc32 c -SYM_FUNC_END(crc32c_le_arm64) - - .align 5 -SYM_FUNC_START(crc32_be_arm64) - __crc32 order=be -SYM_FUNC_END(crc32_be_arm64) - - in .req x1 - len .req x2 - - /* - * w0: input CRC at entry, output CRC at exit - * x1: pointer to input buffer - * x2: length of input in bytes - */ - .macro crc4way, insn, table, order=le - bit\order w0 - lsr len, len, #6 // len := # of 64-byte blocks - - /* Process up to 64 blocks of 64 bytes at a time */ -.La\@: mov x3, #64 - cmp len, #64 - csel x3, x3, len, hi // x3 := min(len, 64) - sub len, len, x3 - - /* Divide the input into 4 contiguous blocks */ - add x4, x3, x3, lsl #1 // x4 := 3 * x3 - add x7, in, x3, lsl #4 // x7 := in + 16 * x3 - add x8, in, x3, lsl #5 // x8 := in + 32 * x3 - add x9, in, x4, lsl #4 // x9 := in + 16 * x4 - - /* Load the folding coefficients from the lookup table */ - adr_l x5, \table - 12 // entry 0 omitted - add x5, x5, x4, lsl #2 // x5 += 12 * x3 - ldp s0, s1, [x5] - ldr s2, [x5, #8] - - /* Zero init partial CRCs for this iteration */ - mov w4, wzr - mov w5, wzr - mov w6, wzr - mov x17, xzr - -.Lb\@: sub x3, x3, #1 - \insn w6, w6, x17 - ldp x10, x11, [in], #16 - ldp x12, x13, [x7], #16 - ldp x14, x15, [x8], #16 - ldp x16, x17, [x9], #16 - - \order x10, x11, x12, x13, x14, x15, x16, x17 - - /* Apply the CRC transform to 4 16-byte blocks in parallel */ - \insn w0, w0, x10 - \insn w4, w4, x12 - \insn w5, w5, x14 - \insn w6, w6, x16 - \insn w0, w0, x11 - \insn w4, w4, x13 - \insn w5, w5, x15 - cbnz x3, .Lb\@ - - /* Combine the 4 partial results into w0 */ - mov v3.d[0], x0 - mov v4.d[0], x4 - mov v5.d[0], x5 - pmull v0.1q, v0.1d, v3.1d - pmull v1.1q, v1.1d, v4.1d - pmull v2.1q, v2.1d, v5.1d - eor v0.8b, v0.8b, v1.8b - eor v0.8b, v0.8b, v2.8b - mov x5, v0.d[0] - eor x5, x5, x17 - \insn w0, w6, x5 - - mov in, x9 - cbnz len, .La\@ - - bit\order w0 - ret - .endm - - .align 5 -SYM_FUNC_START(crc32c_le_arm64_4way) - crc4way crc32cx, .L0 -SYM_FUNC_END(crc32c_le_arm64_4way) - - .align 5 -SYM_FUNC_START(crc32_le_arm64_4way) - crc4way crc32x, .L1 -SYM_FUNC_END(crc32_le_arm64_4way) - - .align 5 -SYM_FUNC_START(crc32_be_arm64_4way) - crc4way crc32x, .L1, be -SYM_FUNC_END(crc32_be_arm64_4way) - - .section .rodata, "a", %progbits - .align 6 -.L0: .long 0xddc0152b, 0xba4fc28e, 0x493c7d27 - .long 0x0715ce53, 0x9e4addf8, 0xba4fc28e - .long 0xc96cfdc0, 0x0715ce53, 0xddc0152b - .long 0xab7aff2a, 0x0d3b6092, 0x9e4addf8 - .long 0x299847d5, 0x878a92a7, 0x39d3b296 - .long 0xb6dd949b, 0xab7aff2a, 0x0715ce53 - .long 0xa60ce07b, 0x83348832, 0x47db8317 - .long 0xd270f1a2, 0xb9e02b86, 0x0d3b6092 - .long 0x65863b64, 0xb6dd949b, 0xc96cfdc0 - .long 0xb3e32c28, 0xbac2fd7b, 0x878a92a7 - .long 0xf285651c, 0xce7f39f4, 0xdaece73e - .long 0x271d9844, 0xd270f1a2, 0xab7aff2a - .long 0x6cb08e5c, 0x2b3cac5d, 0x2162d385 - .long 0xcec3662e, 0x1b03397f, 0x83348832 - .long 0x8227bb8a, 0xb3e32c28, 0x299847d5 - .long 0xd7a4825c, 0xdd7e3b0c, 0xb9e02b86 - .long 0xf6076544, 0x10746f3c, 0x18b33a4e - .long 0x98d8d9cb, 0x271d9844, 0xb6dd949b - .long 0x57a3d037, 0x93a5f730, 0x78d9ccb7 - .long 0x3771e98f, 0x6b749fb2, 0xbac2fd7b - .long 0xe0ac139e, 0xcec3662e, 0xa60ce07b - .long 0x6f345e45, 0xe6fc4e6a, 0xce7f39f4 - .long 0xa2b73df1, 0xb0cd4768, 0x61d82e56 - .long 0x86d8e4d2, 0xd7a4825c, 0xd270f1a2 - .long 0xa90fd27a, 0x0167d312, 0xc619809d - .long 0xca6ef3ac, 0x26f6a60a, 0x2b3cac5d - .long 0x4597456a, 0x98d8d9cb, 0x65863b64 - .long 0xc9c8b782, 0x68bce87a, 0x1b03397f - .long 0x62ec6c6d, 0x6956fc3b, 0xebb883bd - .long 0x2342001e, 0x3771e98f, 0xb3e32c28 - .long 0xe8b6368b, 0x2178513a, 0x064f7f26 - .long 0x9ef68d35, 0x170076fa, 0xdd7e3b0c - .long 0x0b0bf8ca, 0x6f345e45, 0xf285651c - .long 0x02ee03b2, 0xff0dba97, 0x10746f3c - .long 0x135c83fd, 0xf872e54c, 0xc7a68855 - .long 0x00bcf5f6, 0x86d8e4d2, 0x271d9844 - .long 0x58ca5f00, 0x5bb8f1bc, 0x8e766a0c - .long 0xded288f8, 0xb3af077a, 0x93a5f730 - .long 0x37170390, 0xca6ef3ac, 0x6cb08e5c - .long 0xf48642e9, 0xdd66cbbb, 0x6b749fb2 - .long 0xb25b29f2, 0xe9e28eb4, 0x1393e203 - .long 0x45cddf4e, 0xc9c8b782, 0xcec3662e - .long 0xdfd94fb2, 0x93e106a4, 0x96c515bb - .long 0x021ac5ef, 0xd813b325, 0xe6fc4e6a - .long 0x8e1450f7, 0x2342001e, 0x8227bb8a - .long 0xe0cdcf86, 0x6d9a4957, 0xb0cd4768 - .long 0x613eee91, 0xd2c3ed1a, 0x39c7ff35 - .long 0xbedc6ba1, 0x9ef68d35, 0xd7a4825c - .long 0x0cd1526a, 0xf2271e60, 0x0ab3844b - .long 0xd6c3a807, 0x2664fd8b, 0x0167d312 - .long 0x1d31175f, 0x02ee03b2, 0xf6076544 - .long 0x4be7fd90, 0x363bd6b3, 0x26f6a60a - .long 0x6eeed1c9, 0x5fabe670, 0xa741c1bf - .long 0xb3a6da94, 0x00bcf5f6, 0x98d8d9cb - .long 0x2e7d11a7, 0x17f27698, 0x49c3cc9c - .long 0x889774e1, 0xaa7c7ad5, 0x68bce87a - .long 0x8a074012, 0xded288f8, 0x57a3d037 - .long 0xbd0bb25f, 0x6d390dec, 0x6956fc3b - .long 0x3be3c09b, 0x6353c1cc, 0x42d98888 - .long 0x465a4eee, 0xf48642e9, 0x3771e98f - .long 0x2e5f3c8c, 0xdd35bc8d, 0xb42ae3d9 - .long 0xa52f58ec, 0x9a5ede41, 0x2178513a - .long 0x47972100, 0x45cddf4e, 0xe0ac139e - .long 0x359674f7, 0xa51b6135, 0x170076fa - -.L1: .long 0xaf449247, 0x81256527, 0xccaa009e - .long 0x57c54819, 0x1d9513d7, 0x81256527 - .long 0x3f41287a, 0x57c54819, 0xaf449247 - .long 0xf5e48c85, 0x910eeec1, 0x1d9513d7 - .long 0x1f0c2cdd, 0x9026d5b1, 0xae0b5394 - .long 0x71d54a59, 0xf5e48c85, 0x57c54819 - .long 0x1c63267b, 0xfe807bbd, 0x0cbec0ed - .long 0xd31343ea, 0xe95c1271, 0x910eeec1 - .long 0xf9d9c7ee, 0x71d54a59, 0x3f41287a - .long 0x9ee62949, 0xcec97417, 0x9026d5b1 - .long 0xa55d1514, 0xf183c71b, 0xd1df2327 - .long 0x21aa2b26, 0xd31343ea, 0xf5e48c85 - .long 0x9d842b80, 0xeea395c4, 0x3c656ced - .long 0xd8110ff1, 0xcd669a40, 0xfe807bbd - .long 0x3f9e9356, 0x9ee62949, 0x1f0c2cdd - .long 0x1d6708a0, 0x0c30f51d, 0xe95c1271 - .long 0xef82aa68, 0xdb3935ea, 0xb918a347 - .long 0xd14bcc9b, 0x21aa2b26, 0x71d54a59 - .long 0x99cce860, 0x356d209f, 0xff6f2fc2 - .long 0xd8af8e46, 0xc352f6de, 0xcec97417 - .long 0xf1996890, 0xd8110ff1, 0x1c63267b - .long 0x631bc508, 0xe95c7216, 0xf183c71b - .long 0x8511c306, 0x8e031a19, 0x9b9bdbd0 - .long 0xdb3839f3, 0x1d6708a0, 0xd31343ea - .long 0x7a92fffb, 0xf7003835, 0x4470ac44 - .long 0x6ce68f2a, 0x00eba0c8, 0xeea395c4 - .long 0x4caaa263, 0xd14bcc9b, 0xf9d9c7ee - .long 0xb46f7cff, 0x9a1b53c8, 0xcd669a40 - .long 0x60290934, 0x81b6f443, 0x6d40f445 - .long 0x8e976a7d, 0xd8af8e46, 0x9ee62949 - .long 0xdcf5088a, 0x9dbdc100, 0x145575d5 - .long 0x1753ab84, 0xbbf2f6d6, 0x0c30f51d - .long 0x255b139e, 0x631bc508, 0xa55d1514 - .long 0xd784eaa8, 0xce26786c, 0xdb3935ea - .long 0x6d2c864a, 0x8068c345, 0x2586d334 - .long 0x02072e24, 0xdb3839f3, 0x21aa2b26 - .long 0x06689b0a, 0x5efd72f5, 0xe0575528 - .long 0x1e52f5ea, 0x4117915b, 0x356d209f - .long 0x1d3d1db6, 0x6ce68f2a, 0x9d842b80 - .long 0x3796455c, 0xb8e0e4a8, 0xc352f6de - .long 0xdf3a4eb3, 0xc55a2330, 0xb84ffa9c - .long 0x28ae0976, 0xb46f7cff, 0xd8110ff1 - .long 0x9764bc8d, 0xd7e7a22c, 0x712510f0 - .long 0x13a13e18, 0x3e9a43cd, 0xe95c7216 - .long 0xb8ee242e, 0x8e976a7d, 0x3f9e9356 - .long 0x0c540e7b, 0x753c81ff, 0x8e031a19 - .long 0x9924c781, 0xb9220208, 0x3edcde65 - .long 0x3954de39, 0x1753ab84, 0x1d6708a0 - .long 0xf32238b5, 0xbec81497, 0x9e70b943 - .long 0xbbd2cd2c, 0x0925d861, 0xf7003835 - .long 0xcc401304, 0xd784eaa8, 0xef82aa68 - .long 0x4987e684, 0x6044fbb0, 0x00eba0c8 - .long 0x3aa11427, 0x18fe3b4a, 0x87441142 - .long 0x297aad60, 0x02072e24, 0xd14bcc9b - .long 0xf60c5e51, 0x6ef6f487, 0x5b7fdd0a - .long 0x632d78c5, 0x3fc33de4, 0x9a1b53c8 - .long 0x25b8822a, 0x1e52f5ea, 0x99cce860 - .long 0xd4fc84bc, 0x1af62fb8, 0x81b6f443 - .long 0x5690aa32, 0xa91fdefb, 0x688a110e - .long 0x1357a093, 0x3796455c, 0xd8af8e46 - .long 0x798fdd33, 0xaaa18a37, 0x357b9517 - .long 0xc2815395, 0x54d42691, 0x9dbdc100 - .long 0x21cfc0f7, 0x28ae0976, 0xf1996890 - .long 0xa0decef3, 0x7b4aa8b7, 0xbbf2f6d6 diff --git a/arch/arm64/lib/crc32.c b/arch/arm64/lib/crc32.c deleted file mode 100644 index ed3acd71178f..000000000000 --- a/arch/arm64/lib/crc32.c +++ /dev/null @@ -1,99 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only - -#include <linux/crc32.h> -#include <linux/linkage.h> -#include <linux/module.h> - -#include <asm/alternative.h> -#include <asm/cpufeature.h> -#include <asm/neon.h> -#include <asm/simd.h> - -#include <crypto/internal/simd.h> - -// The minimum input length to consider the 4-way interleaved code path -static const size_t min_len = 1024; - -asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len); -asmlinkage u32 crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len); -asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len); - -asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len); -asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len); -asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len); - -u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) -{ - if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) - return crc32_le_base(crc, p, len); - - if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { - kernel_neon_begin(); - crc = crc32_le_arm64_4way(crc, p, len); - kernel_neon_end(); - - p += round_down(len, 64); - len %= 64; - - if (!len) - return crc; - } - - return crc32_le_arm64(crc, p, len); -} -EXPORT_SYMBOL(crc32_le_arch); - -u32 crc32c_arch(u32 crc, const u8 *p, size_t len) -{ - if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) - return crc32c_base(crc, p, len); - - if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { - kernel_neon_begin(); - crc = crc32c_le_arm64_4way(crc, p, len); - kernel_neon_end(); - - p += round_down(len, 64); - len %= 64; - - if (!len) - return crc; - } - - return crc32c_le_arm64(crc, p, len); -} -EXPORT_SYMBOL(crc32c_arch); - -u32 crc32_be_arch(u32 crc, const u8 *p, size_t len) -{ - if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) - return crc32_be_base(crc, p, len); - - if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { - kernel_neon_begin(); - crc = crc32_be_arm64_4way(crc, p, len); - kernel_neon_end(); - - p += round_down(len, 64); - len %= 64; - - if (!len) - return crc; - } - - return crc32_be_arm64(crc, p, len); -} -EXPORT_SYMBOL(crc32_be_arch); - -u32 crc32_optimizations(void) -{ - if (alternative_has_cap_likely(ARM64_HAS_CRC32)) - return CRC32_LE_OPTIMIZATION | - CRC32_BE_OPTIMIZATION | - CRC32C_OPTIMIZATION; - return 0; -} -EXPORT_SYMBOL(crc32_optimizations); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("arm64-optimized CRC32 functions"); diff --git a/arch/arm64/lib/crypto/.gitignore b/arch/arm64/lib/crypto/.gitignore deleted file mode 100644 index 12d74d8b03d0..000000000000 --- a/arch/arm64/lib/crypto/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -poly1305-core.S -sha256-core.S diff --git a/arch/arm64/lib/crypto/Kconfig b/arch/arm64/lib/crypto/Kconfig deleted file mode 100644 index 129a7685cb4c..000000000000 --- a/arch/arm64/lib/crypto/Kconfig +++ /dev/null @@ -1,20 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_CHACHA20_NEON - tristate - depends on KERNEL_MODE_NEON - default CRYPTO_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_NEON - tristate - depends on KERNEL_MODE_NEON - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 - -config CRYPTO_SHA256_ARM64 - tristate - default CRYPTO_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD diff --git a/arch/arm64/lib/crypto/Makefile b/arch/arm64/lib/crypto/Makefile deleted file mode 100644 index 946c09903711..000000000000 --- a/arch/arm64/lib/crypto/Makefile +++ /dev/null @@ -1,24 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o -chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o - -obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o -poly1305-neon-y := poly1305-core.o poly1305-glue.o -AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_block_init_arch -AFLAGS_poly1305-core.o += -Dpoly1305_emit=poly1305_emit_arch - -obj-$(CONFIG_CRYPTO_SHA256_ARM64) += sha256-arm64.o -sha256-arm64-y := sha256.o sha256-core.o -sha256-arm64-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o - -quiet_cmd_perlasm = PERLASM $@ - cmd_perlasm = $(PERL) $(<) void $(@) - -$(obj)/%-core.S: $(src)/%-armv8.pl - $(call cmd,perlasm) - -$(obj)/sha256-core.S: $(src)/sha2-armv8.pl - $(call cmd,perlasm) - -clean-files += poly1305-core.S sha256-core.S diff --git a/arch/arm64/lib/crypto/chacha-neon-core.S b/arch/arm64/lib/crypto/chacha-neon-core.S deleted file mode 100644 index 80079586ecc7..000000000000 --- a/arch/arm64/lib/crypto/chacha-neon-core.S +++ /dev/null @@ -1,805 +0,0 @@ -/* - * ChaCha/HChaCha NEON helper functions - * - * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Originally based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include <linux/linkage.h> -#include <asm/assembler.h> -#include <asm/cache.h> - - .text - .align 6 - -/* - * chacha_permute - permute one block - * - * Permute one 64-byte block where the state matrix is stored in the four NEON - * registers v0-v3. It performs matrix operations on four words in parallel, - * but requires shuffling to rearrange the words after each round. - * - * The round count is given in w3. - * - * Clobbers: w3, x10, v4, v12 - */ -SYM_FUNC_START_LOCAL(chacha_permute) - - adr_l x10, ROT8 - ld1 {v12.4s}, [x10] - -.Ldoubleround: - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - rev32 v3.8h, v3.8h - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #12 - sri v1.4s, v4.4s, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - tbl v3.16b, {v3.16b}, v12.16b - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #7 - sri v1.4s, v4.4s, #25 - - // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - ext v1.16b, v1.16b, v1.16b, #4 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - ext v2.16b, v2.16b, v2.16b, #8 - // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - ext v3.16b, v3.16b, v3.16b, #12 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - rev32 v3.8h, v3.8h - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #12 - sri v1.4s, v4.4s, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - tbl v3.16b, {v3.16b}, v12.16b - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #7 - sri v1.4s, v4.4s, #25 - - // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - ext v1.16b, v1.16b, v1.16b, #12 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - ext v2.16b, v2.16b, v2.16b, #8 - // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - ext v3.16b, v3.16b, v3.16b, #4 - - subs w3, w3, #2 - b.ne .Ldoubleround - - ret -SYM_FUNC_END(chacha_permute) - -SYM_FUNC_START(chacha_block_xor_neon) - // x0: Input state matrix, s - // x1: 1 data block output, o - // x2: 1 data block input, i - // w3: nrounds - - stp x29, x30, [sp, #-16]! - mov x29, sp - - // x0..3 = s0..3 - ld1 {v0.4s-v3.4s}, [x0] - ld1 {v8.4s-v11.4s}, [x0] - - bl chacha_permute - - ld1 {v4.16b-v7.16b}, [x2] - - // o0 = i0 ^ (x0 + s0) - add v0.4s, v0.4s, v8.4s - eor v0.16b, v0.16b, v4.16b - - // o1 = i1 ^ (x1 + s1) - add v1.4s, v1.4s, v9.4s - eor v1.16b, v1.16b, v5.16b - - // o2 = i2 ^ (x2 + s2) - add v2.4s, v2.4s, v10.4s - eor v2.16b, v2.16b, v6.16b - - // o3 = i3 ^ (x3 + s3) - add v3.4s, v3.4s, v11.4s - eor v3.16b, v3.16b, v7.16b - - st1 {v0.16b-v3.16b}, [x1] - - ldp x29, x30, [sp], #16 - ret -SYM_FUNC_END(chacha_block_xor_neon) - -SYM_FUNC_START(hchacha_block_neon) - // x0: Input state matrix, s - // x1: output (8 32-bit words) - // w2: nrounds - - stp x29, x30, [sp, #-16]! - mov x29, sp - - ld1 {v0.4s-v3.4s}, [x0] - - mov w3, w2 - bl chacha_permute - - st1 {v0.4s}, [x1], #16 - st1 {v3.4s}, [x1] - - ldp x29, x30, [sp], #16 - ret -SYM_FUNC_END(hchacha_block_neon) - - a0 .req w12 - a1 .req w13 - a2 .req w14 - a3 .req w15 - a4 .req w16 - a5 .req w17 - a6 .req w19 - a7 .req w20 - a8 .req w21 - a9 .req w22 - a10 .req w23 - a11 .req w24 - a12 .req w25 - a13 .req w26 - a14 .req w27 - a15 .req w28 - - .align 6 -SYM_FUNC_START(chacha_4block_xor_neon) - frame_push 10 - - // x0: Input state matrix, s - // x1: 4 data blocks output, o - // x2: 4 data blocks input, i - // w3: nrounds - // x4: byte count - - adr_l x10, .Lpermute - and x5, x4, #63 - add x10, x10, x5 - - // - // This function encrypts four consecutive ChaCha blocks by loading - // the state matrix in NEON registers four times. The algorithm performs - // each operation on the corresponding word of each state matrix, hence - // requires no word shuffling. For final XORing step we transpose the - // matrix by interleaving 32- and then 64-bit words, which allows us to - // do XOR in NEON registers. - // - // At the same time, a fifth block is encrypted in parallel using - // scalar registers - // - adr_l x9, CTRINC // ... and ROT8 - ld1 {v30.4s-v31.4s}, [x9] - - // x0..15[0-3] = s0..3[0..3] - add x8, x0, #16 - ld4r { v0.4s- v3.4s}, [x0] - ld4r { v4.4s- v7.4s}, [x8], #16 - ld4r { v8.4s-v11.4s}, [x8], #16 - ld4r {v12.4s-v15.4s}, [x8] - - mov a0, v0.s[0] - mov a1, v1.s[0] - mov a2, v2.s[0] - mov a3, v3.s[0] - mov a4, v4.s[0] - mov a5, v5.s[0] - mov a6, v6.s[0] - mov a7, v7.s[0] - mov a8, v8.s[0] - mov a9, v9.s[0] - mov a10, v10.s[0] - mov a11, v11.s[0] - mov a12, v12.s[0] - mov a13, v13.s[0] - mov a14, v14.s[0] - mov a15, v15.s[0] - - // x12 += counter values 1-4 - add v12.4s, v12.4s, v30.4s - -.Ldoubleround4: - // x0 += x4, x12 = rotl32(x12 ^ x0, 16) - // x1 += x5, x13 = rotl32(x13 ^ x1, 16) - // x2 += x6, x14 = rotl32(x14 ^ x2, 16) - // x3 += x7, x15 = rotl32(x15 ^ x3, 16) - add v0.4s, v0.4s, v4.4s - add a0, a0, a4 - add v1.4s, v1.4s, v5.4s - add a1, a1, a5 - add v2.4s, v2.4s, v6.4s - add a2, a2, a6 - add v3.4s, v3.4s, v7.4s - add a3, a3, a7 - - eor v12.16b, v12.16b, v0.16b - eor a12, a12, a0 - eor v13.16b, v13.16b, v1.16b - eor a13, a13, a1 - eor v14.16b, v14.16b, v2.16b - eor a14, a14, a2 - eor v15.16b, v15.16b, v3.16b - eor a15, a15, a3 - - rev32 v12.8h, v12.8h - ror a12, a12, #16 - rev32 v13.8h, v13.8h - ror a13, a13, #16 - rev32 v14.8h, v14.8h - ror a14, a14, #16 - rev32 v15.8h, v15.8h - ror a15, a15, #16 - - // x8 += x12, x4 = rotl32(x4 ^ x8, 12) - // x9 += x13, x5 = rotl32(x5 ^ x9, 12) - // x10 += x14, x6 = rotl32(x6 ^ x10, 12) - // x11 += x15, x7 = rotl32(x7 ^ x11, 12) - add v8.4s, v8.4s, v12.4s - add a8, a8, a12 - add v9.4s, v9.4s, v13.4s - add a9, a9, a13 - add v10.4s, v10.4s, v14.4s - add a10, a10, a14 - add v11.4s, v11.4s, v15.4s - add a11, a11, a15 - - eor v16.16b, v4.16b, v8.16b - eor a4, a4, a8 - eor v17.16b, v5.16b, v9.16b - eor a5, a5, a9 - eor v18.16b, v6.16b, v10.16b - eor a6, a6, a10 - eor v19.16b, v7.16b, v11.16b - eor a7, a7, a11 - - shl v4.4s, v16.4s, #12 - shl v5.4s, v17.4s, #12 - shl v6.4s, v18.4s, #12 - shl v7.4s, v19.4s, #12 - - sri v4.4s, v16.4s, #20 - ror a4, a4, #20 - sri v5.4s, v17.4s, #20 - ror a5, a5, #20 - sri v6.4s, v18.4s, #20 - ror a6, a6, #20 - sri v7.4s, v19.4s, #20 - ror a7, a7, #20 - - // x0 += x4, x12 = rotl32(x12 ^ x0, 8) - // x1 += x5, x13 = rotl32(x13 ^ x1, 8) - // x2 += x6, x14 = rotl32(x14 ^ x2, 8) - // x3 += x7, x15 = rotl32(x15 ^ x3, 8) - add v0.4s, v0.4s, v4.4s - add a0, a0, a4 - add v1.4s, v1.4s, v5.4s - add a1, a1, a5 - add v2.4s, v2.4s, v6.4s - add a2, a2, a6 - add v3.4s, v3.4s, v7.4s - add a3, a3, a7 - - eor v12.16b, v12.16b, v0.16b - eor a12, a12, a0 - eor v13.16b, v13.16b, v1.16b - eor a13, a13, a1 - eor v14.16b, v14.16b, v2.16b - eor a14, a14, a2 - eor v15.16b, v15.16b, v3.16b - eor a15, a15, a3 - - tbl v12.16b, {v12.16b}, v31.16b - ror a12, a12, #24 - tbl v13.16b, {v13.16b}, v31.16b - ror a13, a13, #24 - tbl v14.16b, {v14.16b}, v31.16b - ror a14, a14, #24 - tbl v15.16b, {v15.16b}, v31.16b - ror a15, a15, #24 - - // x8 += x12, x4 = rotl32(x4 ^ x8, 7) - // x9 += x13, x5 = rotl32(x5 ^ x9, 7) - // x10 += x14, x6 = rotl32(x6 ^ x10, 7) - // x11 += x15, x7 = rotl32(x7 ^ x11, 7) - add v8.4s, v8.4s, v12.4s - add a8, a8, a12 - add v9.4s, v9.4s, v13.4s - add a9, a9, a13 - add v10.4s, v10.4s, v14.4s - add a10, a10, a14 - add v11.4s, v11.4s, v15.4s - add a11, a11, a15 - - eor v16.16b, v4.16b, v8.16b - eor a4, a4, a8 - eor v17.16b, v5.16b, v9.16b - eor a5, a5, a9 - eor v18.16b, v6.16b, v10.16b - eor a6, a6, a10 - eor v19.16b, v7.16b, v11.16b - eor a7, a7, a11 - - shl v4.4s, v16.4s, #7 - shl v5.4s, v17.4s, #7 - shl v6.4s, v18.4s, #7 - shl v7.4s, v19.4s, #7 - - sri v4.4s, v16.4s, #25 - ror a4, a4, #25 - sri v5.4s, v17.4s, #25 - ror a5, a5, #25 - sri v6.4s, v18.4s, #25 - ror a6, a6, #25 - sri v7.4s, v19.4s, #25 - ror a7, a7, #25 - - // x0 += x5, x15 = rotl32(x15 ^ x0, 16) - // x1 += x6, x12 = rotl32(x12 ^ x1, 16) - // x2 += x7, x13 = rotl32(x13 ^ x2, 16) - // x3 += x4, x14 = rotl32(x14 ^ x3, 16) - add v0.4s, v0.4s, v5.4s - add a0, a0, a5 - add v1.4s, v1.4s, v6.4s - add a1, a1, a6 - add v2.4s, v2.4s, v7.4s - add a2, a2, a7 - add v3.4s, v3.4s, v4.4s - add a3, a3, a4 - - eor v15.16b, v15.16b, v0.16b - eor a15, a15, a0 - eor v12.16b, v12.16b, v1.16b - eor a12, a12, a1 - eor v13.16b, v13.16b, v2.16b - eor a13, a13, a2 - eor v14.16b, v14.16b, v3.16b - eor a14, a14, a3 - - rev32 v15.8h, v15.8h - ror a15, a15, #16 - rev32 v12.8h, v12.8h - ror a12, a12, #16 - rev32 v13.8h, v13.8h - ror a13, a13, #16 - rev32 v14.8h, v14.8h - ror a14, a14, #16 - - // x10 += x15, x5 = rotl32(x5 ^ x10, 12) - // x11 += x12, x6 = rotl32(x6 ^ x11, 12) - // x8 += x13, x7 = rotl32(x7 ^ x8, 12) - // x9 += x14, x4 = rotl32(x4 ^ x9, 12) - add v10.4s, v10.4s, v15.4s - add a10, a10, a15 - add v11.4s, v11.4s, v12.4s - add a11, a11, a12 - add v8.4s, v8.4s, v13.4s - add a8, a8, a13 - add v9.4s, v9.4s, v14.4s - add a9, a9, a14 - - eor v16.16b, v5.16b, v10.16b - eor a5, a5, a10 - eor v17.16b, v6.16b, v11.16b - eor a6, a6, a11 - eor v18.16b, v7.16b, v8.16b - eor a7, a7, a8 - eor v19.16b, v4.16b, v9.16b - eor a4, a4, a9 - - shl v5.4s, v16.4s, #12 - shl v6.4s, v17.4s, #12 - shl v7.4s, v18.4s, #12 - shl v4.4s, v19.4s, #12 - - sri v5.4s, v16.4s, #20 - ror a5, a5, #20 - sri v6.4s, v17.4s, #20 - ror a6, a6, #20 - sri v7.4s, v18.4s, #20 - ror a7, a7, #20 - sri v4.4s, v19.4s, #20 - ror a4, a4, #20 - - // x0 += x5, x15 = rotl32(x15 ^ x0, 8) - // x1 += x6, x12 = rotl32(x12 ^ x1, 8) - // x2 += x7, x13 = rotl32(x13 ^ x2, 8) - // x3 += x4, x14 = rotl32(x14 ^ x3, 8) - add v0.4s, v0.4s, v5.4s - add a0, a0, a5 - add v1.4s, v1.4s, v6.4s - add a1, a1, a6 - add v2.4s, v2.4s, v7.4s - add a2, a2, a7 - add v3.4s, v3.4s, v4.4s - add a3, a3, a4 - - eor v15.16b, v15.16b, v0.16b - eor a15, a15, a0 - eor v12.16b, v12.16b, v1.16b - eor a12, a12, a1 - eor v13.16b, v13.16b, v2.16b - eor a13, a13, a2 - eor v14.16b, v14.16b, v3.16b - eor a14, a14, a3 - - tbl v15.16b, {v15.16b}, v31.16b - ror a15, a15, #24 - tbl v12.16b, {v12.16b}, v31.16b - ror a12, a12, #24 - tbl v13.16b, {v13.16b}, v31.16b - ror a13, a13, #24 - tbl v14.16b, {v14.16b}, v31.16b - ror a14, a14, #24 - - // x10 += x15, x5 = rotl32(x5 ^ x10, 7) - // x11 += x12, x6 = rotl32(x6 ^ x11, 7) - // x8 += x13, x7 = rotl32(x7 ^ x8, 7) - // x9 += x14, x4 = rotl32(x4 ^ x9, 7) - add v10.4s, v10.4s, v15.4s - add a10, a10, a15 - add v11.4s, v11.4s, v12.4s - add a11, a11, a12 - add v8.4s, v8.4s, v13.4s - add a8, a8, a13 - add v9.4s, v9.4s, v14.4s - add a9, a9, a14 - - eor v16.16b, v5.16b, v10.16b - eor a5, a5, a10 - eor v17.16b, v6.16b, v11.16b - eor a6, a6, a11 - eor v18.16b, v7.16b, v8.16b - eor a7, a7, a8 - eor v19.16b, v4.16b, v9.16b - eor a4, a4, a9 - - shl v5.4s, v16.4s, #7 - shl v6.4s, v17.4s, #7 - shl v7.4s, v18.4s, #7 - shl v4.4s, v19.4s, #7 - - sri v5.4s, v16.4s, #25 - ror a5, a5, #25 - sri v6.4s, v17.4s, #25 - ror a6, a6, #25 - sri v7.4s, v18.4s, #25 - ror a7, a7, #25 - sri v4.4s, v19.4s, #25 - ror a4, a4, #25 - - subs w3, w3, #2 - b.ne .Ldoubleround4 - - ld4r {v16.4s-v19.4s}, [x0], #16 - ld4r {v20.4s-v23.4s}, [x0], #16 - - // x12 += counter values 0-3 - add v12.4s, v12.4s, v30.4s - - // x0[0-3] += s0[0] - // x1[0-3] += s0[1] - // x2[0-3] += s0[2] - // x3[0-3] += s0[3] - add v0.4s, v0.4s, v16.4s - mov w6, v16.s[0] - mov w7, v17.s[0] - add v1.4s, v1.4s, v17.4s - mov w8, v18.s[0] - mov w9, v19.s[0] - add v2.4s, v2.4s, v18.4s - add a0, a0, w6 - add a1, a1, w7 - add v3.4s, v3.4s, v19.4s - add a2, a2, w8 - add a3, a3, w9 -CPU_BE( rev a0, a0 ) -CPU_BE( rev a1, a1 ) -CPU_BE( rev a2, a2 ) -CPU_BE( rev a3, a3 ) - - ld4r {v24.4s-v27.4s}, [x0], #16 - ld4r {v28.4s-v31.4s}, [x0] - - // x4[0-3] += s1[0] - // x5[0-3] += s1[1] - // x6[0-3] += s1[2] - // x7[0-3] += s1[3] - add v4.4s, v4.4s, v20.4s - mov w6, v20.s[0] - mov w7, v21.s[0] - add v5.4s, v5.4s, v21.4s - mov w8, v22.s[0] - mov w9, v23.s[0] - add v6.4s, v6.4s, v22.4s - add a4, a4, w6 - add a5, a5, w7 - add v7.4s, v7.4s, v23.4s - add a6, a6, w8 - add a7, a7, w9 -CPU_BE( rev a4, a4 ) -CPU_BE( rev a5, a5 ) -CPU_BE( rev a6, a6 ) -CPU_BE( rev a7, a7 ) - - // x8[0-3] += s2[0] - // x9[0-3] += s2[1] - // x10[0-3] += s2[2] - // x11[0-3] += s2[3] - add v8.4s, v8.4s, v24.4s - mov w6, v24.s[0] - mov w7, v25.s[0] - add v9.4s, v9.4s, v25.4s - mov w8, v26.s[0] - mov w9, v27.s[0] - add v10.4s, v10.4s, v26.4s - add a8, a8, w6 - add a9, a9, w7 - add v11.4s, v11.4s, v27.4s - add a10, a10, w8 - add a11, a11, w9 -CPU_BE( rev a8, a8 ) -CPU_BE( rev a9, a9 ) -CPU_BE( rev a10, a10 ) -CPU_BE( rev a11, a11 ) - - // x12[0-3] += s3[0] - // x13[0-3] += s3[1] - // x14[0-3] += s3[2] - // x15[0-3] += s3[3] - add v12.4s, v12.4s, v28.4s - mov w6, v28.s[0] - mov w7, v29.s[0] - add v13.4s, v13.4s, v29.4s - mov w8, v30.s[0] - mov w9, v31.s[0] - add v14.4s, v14.4s, v30.4s - add a12, a12, w6 - add a13, a13, w7 - add v15.4s, v15.4s, v31.4s - add a14, a14, w8 - add a15, a15, w9 -CPU_BE( rev a12, a12 ) -CPU_BE( rev a13, a13 ) -CPU_BE( rev a14, a14 ) -CPU_BE( rev a15, a15 ) - - // interleave 32-bit words in state n, n+1 - ldp w6, w7, [x2], #64 - zip1 v16.4s, v0.4s, v1.4s - ldp w8, w9, [x2, #-56] - eor a0, a0, w6 - zip2 v17.4s, v0.4s, v1.4s - eor a1, a1, w7 - zip1 v18.4s, v2.4s, v3.4s - eor a2, a2, w8 - zip2 v19.4s, v2.4s, v3.4s - eor a3, a3, w9 - ldp w6, w7, [x2, #-48] - zip1 v20.4s, v4.4s, v5.4s - ldp w8, w9, [x2, #-40] - eor a4, a4, w6 - zip2 v21.4s, v4.4s, v5.4s - eor a5, a5, w7 - zip1 v22.4s, v6.4s, v7.4s - eor a6, a6, w8 - zip2 v23.4s, v6.4s, v7.4s - eor a7, a7, w9 - ldp w6, w7, [x2, #-32] - zip1 v24.4s, v8.4s, v9.4s - ldp w8, w9, [x2, #-24] - eor a8, a8, w6 - zip2 v25.4s, v8.4s, v9.4s - eor a9, a9, w7 - zip1 v26.4s, v10.4s, v11.4s - eor a10, a10, w8 - zip2 v27.4s, v10.4s, v11.4s - eor a11, a11, w9 - ldp w6, w7, [x2, #-16] - zip1 v28.4s, v12.4s, v13.4s - ldp w8, w9, [x2, #-8] - eor a12, a12, w6 - zip2 v29.4s, v12.4s, v13.4s - eor a13, a13, w7 - zip1 v30.4s, v14.4s, v15.4s - eor a14, a14, w8 - zip2 v31.4s, v14.4s, v15.4s - eor a15, a15, w9 - - add x3, x2, x4 - sub x3, x3, #128 // start of last block - - subs x5, x4, #128 - csel x2, x2, x3, ge - - // interleave 64-bit words in state n, n+2 - zip1 v0.2d, v16.2d, v18.2d - zip2 v4.2d, v16.2d, v18.2d - stp a0, a1, [x1], #64 - zip1 v8.2d, v17.2d, v19.2d - zip2 v12.2d, v17.2d, v19.2d - stp a2, a3, [x1, #-56] - - subs x6, x4, #192 - ld1 {v16.16b-v19.16b}, [x2], #64 - csel x2, x2, x3, ge - - zip1 v1.2d, v20.2d, v22.2d - zip2 v5.2d, v20.2d, v22.2d - stp a4, a5, [x1, #-48] - zip1 v9.2d, v21.2d, v23.2d - zip2 v13.2d, v21.2d, v23.2d - stp a6, a7, [x1, #-40] - - subs x7, x4, #256 - ld1 {v20.16b-v23.16b}, [x2], #64 - csel x2, x2, x3, ge - - zip1 v2.2d, v24.2d, v26.2d - zip2 v6.2d, v24.2d, v26.2d - stp a8, a9, [x1, #-32] - zip1 v10.2d, v25.2d, v27.2d - zip2 v14.2d, v25.2d, v27.2d - stp a10, a11, [x1, #-24] - - subs x8, x4, #320 - ld1 {v24.16b-v27.16b}, [x2], #64 - csel x2, x2, x3, ge - - zip1 v3.2d, v28.2d, v30.2d - zip2 v7.2d, v28.2d, v30.2d - stp a12, a13, [x1, #-16] - zip1 v11.2d, v29.2d, v31.2d - zip2 v15.2d, v29.2d, v31.2d - stp a14, a15, [x1, #-8] - - tbnz x5, #63, .Lt128 - ld1 {v28.16b-v31.16b}, [x2] - - // xor with corresponding input, write to output - eor v16.16b, v16.16b, v0.16b - eor v17.16b, v17.16b, v1.16b - eor v18.16b, v18.16b, v2.16b - eor v19.16b, v19.16b, v3.16b - - tbnz x6, #63, .Lt192 - - eor v20.16b, v20.16b, v4.16b - eor v21.16b, v21.16b, v5.16b - eor v22.16b, v22.16b, v6.16b - eor v23.16b, v23.16b, v7.16b - - st1 {v16.16b-v19.16b}, [x1], #64 - tbnz x7, #63, .Lt256 - - eor v24.16b, v24.16b, v8.16b - eor v25.16b, v25.16b, v9.16b - eor v26.16b, v26.16b, v10.16b - eor v27.16b, v27.16b, v11.16b - - st1 {v20.16b-v23.16b}, [x1], #64 - tbnz x8, #63, .Lt320 - - eor v28.16b, v28.16b, v12.16b - eor v29.16b, v29.16b, v13.16b - eor v30.16b, v30.16b, v14.16b - eor v31.16b, v31.16b, v15.16b - - st1 {v24.16b-v27.16b}, [x1], #64 - st1 {v28.16b-v31.16b}, [x1] - -.Lout: frame_pop - ret - - // fewer than 192 bytes of in/output -.Lt192: cbz x5, 1f // exactly 128 bytes? - ld1 {v28.16b-v31.16b}, [x10] - add x5, x5, x1 - tbl v28.16b, {v4.16b-v7.16b}, v28.16b - tbl v29.16b, {v4.16b-v7.16b}, v29.16b - tbl v30.16b, {v4.16b-v7.16b}, v30.16b - tbl v31.16b, {v4.16b-v7.16b}, v31.16b - -0: eor v20.16b, v20.16b, v28.16b - eor v21.16b, v21.16b, v29.16b - eor v22.16b, v22.16b, v30.16b - eor v23.16b, v23.16b, v31.16b - st1 {v20.16b-v23.16b}, [x5] // overlapping stores -1: st1 {v16.16b-v19.16b}, [x1] - b .Lout - - // fewer than 128 bytes of in/output -.Lt128: ld1 {v28.16b-v31.16b}, [x10] - add x5, x5, x1 - sub x1, x1, #64 - tbl v28.16b, {v0.16b-v3.16b}, v28.16b - tbl v29.16b, {v0.16b-v3.16b}, v29.16b - tbl v30.16b, {v0.16b-v3.16b}, v30.16b - tbl v31.16b, {v0.16b-v3.16b}, v31.16b - ld1 {v16.16b-v19.16b}, [x1] // reload first output block - b 0b - - // fewer than 256 bytes of in/output -.Lt256: cbz x6, 2f // exactly 192 bytes? - ld1 {v4.16b-v7.16b}, [x10] - add x6, x6, x1 - tbl v0.16b, {v8.16b-v11.16b}, v4.16b - tbl v1.16b, {v8.16b-v11.16b}, v5.16b - tbl v2.16b, {v8.16b-v11.16b}, v6.16b - tbl v3.16b, {v8.16b-v11.16b}, v7.16b - - eor v28.16b, v28.16b, v0.16b - eor v29.16b, v29.16b, v1.16b - eor v30.16b, v30.16b, v2.16b - eor v31.16b, v31.16b, v3.16b - st1 {v28.16b-v31.16b}, [x6] // overlapping stores -2: st1 {v20.16b-v23.16b}, [x1] - b .Lout - - // fewer than 320 bytes of in/output -.Lt320: cbz x7, 3f // exactly 256 bytes? - ld1 {v4.16b-v7.16b}, [x10] - add x7, x7, x1 - tbl v0.16b, {v12.16b-v15.16b}, v4.16b - tbl v1.16b, {v12.16b-v15.16b}, v5.16b - tbl v2.16b, {v12.16b-v15.16b}, v6.16b - tbl v3.16b, {v12.16b-v15.16b}, v7.16b - - eor v28.16b, v28.16b, v0.16b - eor v29.16b, v29.16b, v1.16b - eor v30.16b, v30.16b, v2.16b - eor v31.16b, v31.16b, v3.16b - st1 {v28.16b-v31.16b}, [x7] // overlapping stores -3: st1 {v24.16b-v27.16b}, [x1] - b .Lout -SYM_FUNC_END(chacha_4block_xor_neon) - - .section ".rodata", "a", %progbits - .align L1_CACHE_SHIFT -.Lpermute: - .set .Li, 0 - .rept 128 - .byte (.Li - 64) - .set .Li, .Li + 1 - .endr - -CTRINC: .word 1, 2, 3, 4 -ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f diff --git a/arch/arm64/lib/crypto/chacha-neon-glue.c b/arch/arm64/lib/crypto/chacha-neon-glue.c deleted file mode 100644 index d0188f974ca5..000000000000 --- a/arch/arm64/lib/crypto/chacha-neon-glue.c +++ /dev/null @@ -1,119 +0,0 @@ -/* - * ChaCha and HChaCha functions (ARM64 optimized) - * - * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include <crypto/chacha.h> -#include <crypto/internal/simd.h> -#include <linux/jump_label.h> -#include <linux/kernel.h> -#include <linux/module.h> - -#include <asm/hwcap.h> -#include <asm/neon.h> -#include <asm/simd.h> - -asmlinkage void chacha_block_xor_neon(const struct chacha_state *state, - u8 *dst, const u8 *src, int nrounds); -asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state, - u8 *dst, const u8 *src, - int nrounds, int bytes); -asmlinkage void hchacha_block_neon(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); - -static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src, - int bytes, int nrounds) -{ - while (bytes > 0) { - int l = min(bytes, CHACHA_BLOCK_SIZE * 5); - - if (l <= CHACHA_BLOCK_SIZE) { - u8 buf[CHACHA_BLOCK_SIZE]; - - memcpy(buf, src, l); - chacha_block_xor_neon(state, buf, buf, nrounds); - memcpy(dst, buf, l); - state->x[12] += 1; - break; - } - chacha_4block_xor_neon(state, dst, src, nrounds, l); - bytes -= l; - src += l; - dst += l; - state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); - } -} - -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) { - hchacha_block_generic(state, out, nrounds); - } else { - kernel_neon_begin(); - hchacha_block_neon(state, out, nrounds); - kernel_neon_end(); - } -} -EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE || - !crypto_simd_usable()) - return chacha_crypt_generic(state, dst, src, bytes, nrounds); - - do { - unsigned int todo = min_t(unsigned int, bytes, SZ_4K); - - kernel_neon_begin(); - chacha_doneon(state, dst, src, todo, nrounds); - kernel_neon_end(); - - bytes -= todo; - src += todo; - dst += todo; - } while (bytes); -} -EXPORT_SYMBOL(chacha_crypt_arch); - -bool chacha_is_arch_optimized(void) -{ - return static_key_enabled(&have_neon); -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - -static int __init chacha_simd_mod_init(void) -{ - if (cpu_have_named_feature(ASIMD)) - static_branch_enable(&have_neon); - return 0; -} -subsys_initcall(chacha_simd_mod_init); - -static void __exit chacha_simd_mod_exit(void) -{ -} -module_exit(chacha_simd_mod_exit); - -MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM64 optimized)"); -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_LICENSE("GPL v2"); diff --git a/arch/arm64/lib/crypto/poly1305-armv8.pl b/arch/arm64/lib/crypto/poly1305-armv8.pl deleted file mode 100644 index 22c9069c0650..000000000000 --- a/arch/arm64/lib/crypto/poly1305-armv8.pl +++ /dev/null @@ -1,917 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause -# -# ==================================================================== -# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL -# project. -# ==================================================================== -# -# This module implements Poly1305 hash for ARMv8. -# -# June 2015 -# -# Numbers are cycles per processed byte with poly1305_blocks alone. -# -# IALU/gcc-4.9 NEON -# -# Apple A7 1.86/+5% 0.72 -# Cortex-A53 2.69/+58% 1.47 -# Cortex-A57 2.70/+7% 1.14 -# Denver 1.64/+50% 1.18(*) -# X-Gene 2.13/+68% 2.27 -# Mongoose 1.77/+75% 1.12 -# Kryo 2.70/+55% 1.13 -# ThunderX2 1.17/+95% 1.36 -# -# (*) estimate based on resources availability is less than 1.0, -# i.e. measured result is worse than expected, presumably binary -# translator is not almighty; - -$flavour=shift; -$output=shift; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3)); -my ($mac,$nonce)=($inp,$len); - -my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14)); - -$code.=<<___; -#ifndef __KERNEL__ -# include "arm_arch.h" -.extern OPENSSL_armcap_P -#endif - -.text - -// forward "declarations" are required for Apple -.globl poly1305_blocks -.globl poly1305_emit - -.globl poly1305_init -.type poly1305_init,%function -.align 5 -poly1305_init: - cmp $inp,xzr - stp xzr,xzr,[$ctx] // zero hash value - stp xzr,xzr,[$ctx,#16] // [along with is_base2_26] - - csel x0,xzr,x0,eq - b.eq .Lno_key - -#ifndef __KERNEL__ - adrp x17,OPENSSL_armcap_P - ldr w17,[x17,#:lo12:OPENSSL_armcap_P] -#endif - - ldp $r0,$r1,[$inp] // load key - mov $s1,#0xfffffffc0fffffff - movk $s1,#0x0fff,lsl#48 -#ifdef __AARCH64EB__ - rev $r0,$r0 // flip bytes - rev $r1,$r1 -#endif - and $r0,$r0,$s1 // &=0ffffffc0fffffff - and $s1,$s1,#-4 - and $r1,$r1,$s1 // &=0ffffffc0ffffffc - mov w#$s1,#-1 - stp $r0,$r1,[$ctx,#32] // save key value - str w#$s1,[$ctx,#48] // impossible key power value - -#ifndef __KERNEL__ - tst w17,#ARMV7_NEON - - adr $d0,.Lpoly1305_blocks - adr $r0,.Lpoly1305_blocks_neon - adr $d1,.Lpoly1305_emit - - csel $d0,$d0,$r0,eq - -# ifdef __ILP32__ - stp w#$d0,w#$d1,[$len] -# else - stp $d0,$d1,[$len] -# endif -#endif - mov x0,#1 -.Lno_key: - ret -.size poly1305_init,.-poly1305_init - -.type poly1305_blocks,%function -.align 5 -poly1305_blocks: -.Lpoly1305_blocks: - ands $len,$len,#-16 - b.eq .Lno_data - - ldp $h0,$h1,[$ctx] // load hash value - ldp $h2,x17,[$ctx,#16] // [along with is_base2_26] - ldp $r0,$r1,[$ctx,#32] // load key value - -#ifdef __AARCH64EB__ - lsr $d0,$h0,#32 - mov w#$d1,w#$h0 - lsr $d2,$h1,#32 - mov w15,w#$h1 - lsr x16,$h2,#32 -#else - mov w#$d0,w#$h0 - lsr $d1,$h0,#32 - mov w#$d2,w#$h1 - lsr x15,$h1,#32 - mov w16,w#$h2 -#endif - - add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64 - lsr $d1,$d2,#12 - adds $d0,$d0,$d2,lsl#52 - add $d1,$d1,x15,lsl#14 - adc $d1,$d1,xzr - lsr $d2,x16,#24 - adds $d1,$d1,x16,lsl#40 - adc $d2,$d2,xzr - - cmp x17,#0 // is_base2_26? - add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) - csel $h0,$h0,$d0,eq // choose between radixes - csel $h1,$h1,$d1,eq - csel $h2,$h2,$d2,eq - -.Loop: - ldp $t0,$t1,[$inp],#16 // load input - sub $len,$len,#16 -#ifdef __AARCH64EB__ - rev $t0,$t0 - rev $t1,$t1 -#endif - adds $h0,$h0,$t0 // accumulate input - adcs $h1,$h1,$t1 - - mul $d0,$h0,$r0 // h0*r0 - adc $h2,$h2,$padbit - umulh $d1,$h0,$r0 - - mul $t0,$h1,$s1 // h1*5*r1 - umulh $t1,$h1,$s1 - - adds $d0,$d0,$t0 - mul $t0,$h0,$r1 // h0*r1 - adc $d1,$d1,$t1 - umulh $d2,$h0,$r1 - - adds $d1,$d1,$t0 - mul $t0,$h1,$r0 // h1*r0 - adc $d2,$d2,xzr - umulh $t1,$h1,$r0 - - adds $d1,$d1,$t0 - mul $t0,$h2,$s1 // h2*5*r1 - adc $d2,$d2,$t1 - mul $t1,$h2,$r0 // h2*r0 - - adds $d1,$d1,$t0 - adc $d2,$d2,$t1 - - and $t0,$d2,#-4 // final reduction - and $h2,$d2,#3 - add $t0,$t0,$d2,lsr#2 - adds $h0,$d0,$t0 - adcs $h1,$d1,xzr - adc $h2,$h2,xzr - - cbnz $len,.Loop - - stp $h0,$h1,[$ctx] // store hash value - stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26] - -.Lno_data: - ret -.size poly1305_blocks,.-poly1305_blocks - -.type poly1305_emit,%function -.align 5 -poly1305_emit: -.Lpoly1305_emit: - ldp $h0,$h1,[$ctx] // load hash base 2^64 - ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26] - ldp $t0,$t1,[$nonce] // load nonce - -#ifdef __AARCH64EB__ - lsr $d0,$h0,#32 - mov w#$d1,w#$h0 - lsr $d2,$h1,#32 - mov w15,w#$h1 - lsr x16,$h2,#32 -#else - mov w#$d0,w#$h0 - lsr $d1,$h0,#32 - mov w#$d2,w#$h1 - lsr x15,$h1,#32 - mov w16,w#$h2 -#endif - - add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64 - lsr $d1,$d2,#12 - adds $d0,$d0,$d2,lsl#52 - add $d1,$d1,x15,lsl#14 - adc $d1,$d1,xzr - lsr $d2,x16,#24 - adds $d1,$d1,x16,lsl#40 - adc $d2,$d2,xzr - - cmp $r0,#0 // is_base2_26? - csel $h0,$h0,$d0,eq // choose between radixes - csel $h1,$h1,$d1,eq - csel $h2,$h2,$d2,eq - - adds $d0,$h0,#5 // compare to modulus - adcs $d1,$h1,xzr - adc $d2,$h2,xzr - - tst $d2,#-4 // see if it's carried/borrowed - - csel $h0,$h0,$d0,eq - csel $h1,$h1,$d1,eq - -#ifdef __AARCH64EB__ - ror $t0,$t0,#32 // flip nonce words - ror $t1,$t1,#32 -#endif - adds $h0,$h0,$t0 // accumulate nonce - adc $h1,$h1,$t1 -#ifdef __AARCH64EB__ - rev $h0,$h0 // flip output bytes - rev $h1,$h1 -#endif - stp $h0,$h1,[$mac] // write result - - ret -.size poly1305_emit,.-poly1305_emit -___ -my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8)); -my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13)); -my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18)); -my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23)); -my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28)); -my ($T0,$T1,$MASK) = map("v$_",(29..31)); - -my ($in2,$zeros)=("x16","x17"); -my $is_base2_26 = $zeros; # borrow - -$code.=<<___; -.type poly1305_mult,%function -.align 5 -poly1305_mult: - mul $d0,$h0,$r0 // h0*r0 - umulh $d1,$h0,$r0 - - mul $t0,$h1,$s1 // h1*5*r1 - umulh $t1,$h1,$s1 - - adds $d0,$d0,$t0 - mul $t0,$h0,$r1 // h0*r1 - adc $d1,$d1,$t1 - umulh $d2,$h0,$r1 - - adds $d1,$d1,$t0 - mul $t0,$h1,$r0 // h1*r0 - adc $d2,$d2,xzr - umulh $t1,$h1,$r0 - - adds $d1,$d1,$t0 - mul $t0,$h2,$s1 // h2*5*r1 - adc $d2,$d2,$t1 - mul $t1,$h2,$r0 // h2*r0 - - adds $d1,$d1,$t0 - adc $d2,$d2,$t1 - - and $t0,$d2,#-4 // final reduction - and $h2,$d2,#3 - add $t0,$t0,$d2,lsr#2 - adds $h0,$d0,$t0 - adcs $h1,$d1,xzr - adc $h2,$h2,xzr - - ret -.size poly1305_mult,.-poly1305_mult - -.type poly1305_splat,%function -.align 4 -poly1305_splat: - and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26 - ubfx x13,$h0,#26,#26 - extr x14,$h1,$h0,#52 - and x14,x14,#0x03ffffff - ubfx x15,$h1,#14,#26 - extr x16,$h2,$h1,#40 - - str w12,[$ctx,#16*0] // r0 - add w12,w13,w13,lsl#2 // r1*5 - str w13,[$ctx,#16*1] // r1 - add w13,w14,w14,lsl#2 // r2*5 - str w12,[$ctx,#16*2] // s1 - str w14,[$ctx,#16*3] // r2 - add w14,w15,w15,lsl#2 // r3*5 - str w13,[$ctx,#16*4] // s2 - str w15,[$ctx,#16*5] // r3 - add w15,w16,w16,lsl#2 // r4*5 - str w14,[$ctx,#16*6] // s3 - str w16,[$ctx,#16*7] // r4 - str w15,[$ctx,#16*8] // s4 - - ret -.size poly1305_splat,.-poly1305_splat - -#ifdef __KERNEL__ -.globl poly1305_blocks_neon -#endif -.type poly1305_blocks_neon,%function -.align 5 -poly1305_blocks_neon: -.Lpoly1305_blocks_neon: - ldr $is_base2_26,[$ctx,#24] - cmp $len,#128 - b.lo .Lpoly1305_blocks - - .inst 0xd503233f // paciasp - stp x29,x30,[sp,#-80]! - add x29,sp,#0 - - stp d8,d9,[sp,#16] // meet ABI requirements - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] - - cbz $is_base2_26,.Lbase2_64_neon - - ldp w10,w11,[$ctx] // load hash value base 2^26 - ldp w12,w13,[$ctx,#8] - ldr w14,[$ctx,#16] - - tst $len,#31 - b.eq .Leven_neon - - ldp $r0,$r1,[$ctx,#32] // load key value - - add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 - lsr $h1,x12,#12 - adds $h0,$h0,x12,lsl#52 - add $h1,$h1,x13,lsl#14 - adc $h1,$h1,xzr - lsr $h2,x14,#24 - adds $h1,$h1,x14,lsl#40 - adc $d2,$h2,xzr // can be partially reduced... - - ldp $d0,$d1,[$inp],#16 // load input - sub $len,$len,#16 - add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) - -#ifdef __AARCH64EB__ - rev $d0,$d0 - rev $d1,$d1 -#endif - adds $h0,$h0,$d0 // accumulate input - adcs $h1,$h1,$d1 - adc $h2,$h2,$padbit - - bl poly1305_mult - - and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 - ubfx x11,$h0,#26,#26 - extr x12,$h1,$h0,#52 - and x12,x12,#0x03ffffff - ubfx x13,$h1,#14,#26 - extr x14,$h2,$h1,#40 - - b .Leven_neon - -.align 4 -.Lbase2_64_neon: - ldp $r0,$r1,[$ctx,#32] // load key value - - ldp $h0,$h1,[$ctx] // load hash value base 2^64 - ldr $h2,[$ctx,#16] - - tst $len,#31 - b.eq .Linit_neon - - ldp $d0,$d1,[$inp],#16 // load input - sub $len,$len,#16 - add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) -#ifdef __AARCH64EB__ - rev $d0,$d0 - rev $d1,$d1 -#endif - adds $h0,$h0,$d0 // accumulate input - adcs $h1,$h1,$d1 - adc $h2,$h2,$padbit - - bl poly1305_mult - -.Linit_neon: - ldr w17,[$ctx,#48] // first table element - and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 - ubfx x11,$h0,#26,#26 - extr x12,$h1,$h0,#52 - and x12,x12,#0x03ffffff - ubfx x13,$h1,#14,#26 - extr x14,$h2,$h1,#40 - - cmp w17,#-1 // is value impossible? - b.ne .Leven_neon - - fmov ${H0},x10 - fmov ${H1},x11 - fmov ${H2},x12 - fmov ${H3},x13 - fmov ${H4},x14 - - ////////////////////////////////// initialize r^n table - mov $h0,$r0 // r^1 - add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) - mov $h1,$r1 - mov $h2,xzr - add $ctx,$ctx,#48+12 - bl poly1305_splat - - bl poly1305_mult // r^2 - sub $ctx,$ctx,#4 - bl poly1305_splat - - bl poly1305_mult // r^3 - sub $ctx,$ctx,#4 - bl poly1305_splat - - bl poly1305_mult // r^4 - sub $ctx,$ctx,#4 - bl poly1305_splat - sub $ctx,$ctx,#48 // restore original $ctx - b .Ldo_neon - -.align 4 -.Leven_neon: - fmov ${H0},x10 - fmov ${H1},x11 - fmov ${H2},x12 - fmov ${H3},x13 - fmov ${H4},x14 - -.Ldo_neon: - ldp x8,x12,[$inp,#32] // inp[2:3] - subs $len,$len,#64 - ldp x9,x13,[$inp,#48] - add $in2,$inp,#96 - adrp $zeros,.Lzeros - add $zeros,$zeros,#:lo12:.Lzeros - - lsl $padbit,$padbit,#24 - add x15,$ctx,#48 - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - and x5,x9,#0x03ffffff - ubfx x6,x8,#26,#26 - ubfx x7,x9,#26,#26 - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - extr x8,x12,x8,#52 - extr x9,x13,x9,#52 - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - fmov $IN23_0,x4 - and x8,x8,#0x03ffffff - and x9,x9,#0x03ffffff - ubfx x10,x12,#14,#26 - ubfx x11,x13,#14,#26 - add x12,$padbit,x12,lsr#40 - add x13,$padbit,x13,lsr#40 - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - fmov $IN23_1,x6 - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - fmov $IN23_2,x8 - fmov $IN23_3,x10 - fmov $IN23_4,x12 - - ldp x8,x12,[$inp],#16 // inp[0:1] - ldp x9,x13,[$inp],#48 - - ld1 {$R0,$R1,$S1,$R2},[x15],#64 - ld1 {$S2,$R3,$S3,$R4},[x15],#64 - ld1 {$S4},[x15] - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - and x5,x9,#0x03ffffff - ubfx x6,x8,#26,#26 - ubfx x7,x9,#26,#26 - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - extr x8,x12,x8,#52 - extr x9,x13,x9,#52 - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - fmov $IN01_0,x4 - and x8,x8,#0x03ffffff - and x9,x9,#0x03ffffff - ubfx x10,x12,#14,#26 - ubfx x11,x13,#14,#26 - add x12,$padbit,x12,lsr#40 - add x13,$padbit,x13,lsr#40 - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - fmov $IN01_1,x6 - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - movi $MASK.2d,#-1 - fmov $IN01_2,x8 - fmov $IN01_3,x10 - fmov $IN01_4,x12 - ushr $MASK.2d,$MASK.2d,#38 - - b.ls .Lskip_loop - -.align 4 -.Loop_neon: - //////////////////////////////////////////////////////////////// - // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 - // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r - // \___________________/ - // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 - // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r - // \___________________/ \____________________/ - // - // Note that we start with inp[2:3]*r^2. This is because it - // doesn't depend on reduction in previous iteration. - //////////////////////////////////////////////////////////////// - // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 - // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 - // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 - // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 - // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 - - subs $len,$len,#64 - umull $ACC4,$IN23_0,${R4}[2] - csel $in2,$zeros,$in2,lo - umull $ACC3,$IN23_0,${R3}[2] - umull $ACC2,$IN23_0,${R2}[2] - ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) - umull $ACC1,$IN23_0,${R1}[2] - ldp x9,x13,[$in2],#48 - umull $ACC0,$IN23_0,${R0}[2] -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - - umlal $ACC4,$IN23_1,${R3}[2] - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - umlal $ACC3,$IN23_1,${R2}[2] - and x5,x9,#0x03ffffff - umlal $ACC2,$IN23_1,${R1}[2] - ubfx x6,x8,#26,#26 - umlal $ACC1,$IN23_1,${R0}[2] - ubfx x7,x9,#26,#26 - umlal $ACC0,$IN23_1,${S4}[2] - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - - umlal $ACC4,$IN23_2,${R2}[2] - extr x8,x12,x8,#52 - umlal $ACC3,$IN23_2,${R1}[2] - extr x9,x13,x9,#52 - umlal $ACC2,$IN23_2,${R0}[2] - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - umlal $ACC1,$IN23_2,${S4}[2] - fmov $IN23_0,x4 - umlal $ACC0,$IN23_2,${S3}[2] - and x8,x8,#0x03ffffff - - umlal $ACC4,$IN23_3,${R1}[2] - and x9,x9,#0x03ffffff - umlal $ACC3,$IN23_3,${R0}[2] - ubfx x10,x12,#14,#26 - umlal $ACC2,$IN23_3,${S4}[2] - ubfx x11,x13,#14,#26 - umlal $ACC1,$IN23_3,${S3}[2] - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - umlal $ACC0,$IN23_3,${S2}[2] - fmov $IN23_1,x6 - - add $IN01_2,$IN01_2,$H2 - add x12,$padbit,x12,lsr#40 - umlal $ACC4,$IN23_4,${R0}[2] - add x13,$padbit,x13,lsr#40 - umlal $ACC3,$IN23_4,${S4}[2] - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - umlal $ACC2,$IN23_4,${S3}[2] - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - umlal $ACC1,$IN23_4,${S2}[2] - fmov $IN23_2,x8 - umlal $ACC0,$IN23_4,${S1}[2] - fmov $IN23_3,x10 - - //////////////////////////////////////////////////////////////// - // (hash+inp[0:1])*r^4 and accumulate - - add $IN01_0,$IN01_0,$H0 - fmov $IN23_4,x12 - umlal $ACC3,$IN01_2,${R1}[0] - ldp x8,x12,[$inp],#16 // inp[0:1] - umlal $ACC0,$IN01_2,${S3}[0] - ldp x9,x13,[$inp],#48 - umlal $ACC4,$IN01_2,${R2}[0] - umlal $ACC1,$IN01_2,${S4}[0] - umlal $ACC2,$IN01_2,${R0}[0] -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - - add $IN01_1,$IN01_1,$H1 - umlal $ACC3,$IN01_0,${R3}[0] - umlal $ACC4,$IN01_0,${R4}[0] - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - umlal $ACC2,$IN01_0,${R2}[0] - and x5,x9,#0x03ffffff - umlal $ACC0,$IN01_0,${R0}[0] - ubfx x6,x8,#26,#26 - umlal $ACC1,$IN01_0,${R1}[0] - ubfx x7,x9,#26,#26 - - add $IN01_3,$IN01_3,$H3 - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - umlal $ACC3,$IN01_1,${R2}[0] - extr x8,x12,x8,#52 - umlal $ACC4,$IN01_1,${R3}[0] - extr x9,x13,x9,#52 - umlal $ACC0,$IN01_1,${S4}[0] - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - umlal $ACC2,$IN01_1,${R1}[0] - fmov $IN01_0,x4 - umlal $ACC1,$IN01_1,${R0}[0] - and x8,x8,#0x03ffffff - - add $IN01_4,$IN01_4,$H4 - and x9,x9,#0x03ffffff - umlal $ACC3,$IN01_3,${R0}[0] - ubfx x10,x12,#14,#26 - umlal $ACC0,$IN01_3,${S2}[0] - ubfx x11,x13,#14,#26 - umlal $ACC4,$IN01_3,${R1}[0] - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - umlal $ACC1,$IN01_3,${S3}[0] - fmov $IN01_1,x6 - umlal $ACC2,$IN01_3,${S4}[0] - add x12,$padbit,x12,lsr#40 - - umlal $ACC3,$IN01_4,${S4}[0] - add x13,$padbit,x13,lsr#40 - umlal $ACC0,$IN01_4,${S1}[0] - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - umlal $ACC4,$IN01_4,${R0}[0] - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - umlal $ACC1,$IN01_4,${S2}[0] - fmov $IN01_2,x8 - umlal $ACC2,$IN01_4,${S3}[0] - fmov $IN01_3,x10 - fmov $IN01_4,x12 - - ///////////////////////////////////////////////////////////////// - // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein - // and P. Schwabe - // - // [see discussion in poly1305-armv4 module] - - ushr $T0.2d,$ACC3,#26 - xtn $H3,$ACC3 - ushr $T1.2d,$ACC0,#26 - and $ACC0,$ACC0,$MASK.2d - add $ACC4,$ACC4,$T0.2d // h3 -> h4 - bic $H3,#0xfc,lsl#24 // &=0x03ffffff - add $ACC1,$ACC1,$T1.2d // h0 -> h1 - - ushr $T0.2d,$ACC4,#26 - xtn $H4,$ACC4 - ushr $T1.2d,$ACC1,#26 - xtn $H1,$ACC1 - bic $H4,#0xfc,lsl#24 - add $ACC2,$ACC2,$T1.2d // h1 -> h2 - - add $ACC0,$ACC0,$T0.2d - shl $T0.2d,$T0.2d,#2 - shrn $T1.2s,$ACC2,#26 - xtn $H2,$ACC2 - add $ACC0,$ACC0,$T0.2d // h4 -> h0 - bic $H1,#0xfc,lsl#24 - add $H3,$H3,$T1.2s // h2 -> h3 - bic $H2,#0xfc,lsl#24 - - shrn $T0.2s,$ACC0,#26 - xtn $H0,$ACC0 - ushr $T1.2s,$H3,#26 - bic $H3,#0xfc,lsl#24 - bic $H0,#0xfc,lsl#24 - add $H1,$H1,$T0.2s // h0 -> h1 - add $H4,$H4,$T1.2s // h3 -> h4 - - b.hi .Loop_neon - -.Lskip_loop: - dup $IN23_2,${IN23_2}[0] - add $IN01_2,$IN01_2,$H2 - - //////////////////////////////////////////////////////////////// - // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 - - adds $len,$len,#32 - b.ne .Long_tail - - dup $IN23_2,${IN01_2}[0] - add $IN23_0,$IN01_0,$H0 - add $IN23_3,$IN01_3,$H3 - add $IN23_1,$IN01_1,$H1 - add $IN23_4,$IN01_4,$H4 - -.Long_tail: - dup $IN23_0,${IN23_0}[0] - umull2 $ACC0,$IN23_2,${S3} - umull2 $ACC3,$IN23_2,${R1} - umull2 $ACC4,$IN23_2,${R2} - umull2 $ACC2,$IN23_2,${R0} - umull2 $ACC1,$IN23_2,${S4} - - dup $IN23_1,${IN23_1}[0] - umlal2 $ACC0,$IN23_0,${R0} - umlal2 $ACC2,$IN23_0,${R2} - umlal2 $ACC3,$IN23_0,${R3} - umlal2 $ACC4,$IN23_0,${R4} - umlal2 $ACC1,$IN23_0,${R1} - - dup $IN23_3,${IN23_3}[0] - umlal2 $ACC0,$IN23_1,${S4} - umlal2 $ACC3,$IN23_1,${R2} - umlal2 $ACC2,$IN23_1,${R1} - umlal2 $ACC4,$IN23_1,${R3} - umlal2 $ACC1,$IN23_1,${R0} - - dup $IN23_4,${IN23_4}[0] - umlal2 $ACC3,$IN23_3,${R0} - umlal2 $ACC4,$IN23_3,${R1} - umlal2 $ACC0,$IN23_3,${S2} - umlal2 $ACC1,$IN23_3,${S3} - umlal2 $ACC2,$IN23_3,${S4} - - umlal2 $ACC3,$IN23_4,${S4} - umlal2 $ACC0,$IN23_4,${S1} - umlal2 $ACC4,$IN23_4,${R0} - umlal2 $ACC1,$IN23_4,${S2} - umlal2 $ACC2,$IN23_4,${S3} - - b.eq .Lshort_tail - - //////////////////////////////////////////////////////////////// - // (hash+inp[0:1])*r^4:r^3 and accumulate - - add $IN01_0,$IN01_0,$H0 - umlal $ACC3,$IN01_2,${R1} - umlal $ACC0,$IN01_2,${S3} - umlal $ACC4,$IN01_2,${R2} - umlal $ACC1,$IN01_2,${S4} - umlal $ACC2,$IN01_2,${R0} - - add $IN01_1,$IN01_1,$H1 - umlal $ACC3,$IN01_0,${R3} - umlal $ACC0,$IN01_0,${R0} - umlal $ACC4,$IN01_0,${R4} - umlal $ACC1,$IN01_0,${R1} - umlal $ACC2,$IN01_0,${R2} - - add $IN01_3,$IN01_3,$H3 - umlal $ACC3,$IN01_1,${R2} - umlal $ACC0,$IN01_1,${S4} - umlal $ACC4,$IN01_1,${R3} - umlal $ACC1,$IN01_1,${R0} - umlal $ACC2,$IN01_1,${R1} - - add $IN01_4,$IN01_4,$H4 - umlal $ACC3,$IN01_3,${R0} - umlal $ACC0,$IN01_3,${S2} - umlal $ACC4,$IN01_3,${R1} - umlal $ACC1,$IN01_3,${S3} - umlal $ACC2,$IN01_3,${S4} - - umlal $ACC3,$IN01_4,${S4} - umlal $ACC0,$IN01_4,${S1} - umlal $ACC4,$IN01_4,${R0} - umlal $ACC1,$IN01_4,${S2} - umlal $ACC2,$IN01_4,${S3} - -.Lshort_tail: - //////////////////////////////////////////////////////////////// - // horizontal add - - addp $ACC3,$ACC3,$ACC3 - ldp d8,d9,[sp,#16] // meet ABI requirements - addp $ACC0,$ACC0,$ACC0 - ldp d10,d11,[sp,#32] - addp $ACC4,$ACC4,$ACC4 - ldp d12,d13,[sp,#48] - addp $ACC1,$ACC1,$ACC1 - ldp d14,d15,[sp,#64] - addp $ACC2,$ACC2,$ACC2 - ldr x30,[sp,#8] - - //////////////////////////////////////////////////////////////// - // lazy reduction, but without narrowing - - ushr $T0.2d,$ACC3,#26 - and $ACC3,$ACC3,$MASK.2d - ushr $T1.2d,$ACC0,#26 - and $ACC0,$ACC0,$MASK.2d - - add $ACC4,$ACC4,$T0.2d // h3 -> h4 - add $ACC1,$ACC1,$T1.2d // h0 -> h1 - - ushr $T0.2d,$ACC4,#26 - and $ACC4,$ACC4,$MASK.2d - ushr $T1.2d,$ACC1,#26 - and $ACC1,$ACC1,$MASK.2d - add $ACC2,$ACC2,$T1.2d // h1 -> h2 - - add $ACC0,$ACC0,$T0.2d - shl $T0.2d,$T0.2d,#2 - ushr $T1.2d,$ACC2,#26 - and $ACC2,$ACC2,$MASK.2d - add $ACC0,$ACC0,$T0.2d // h4 -> h0 - add $ACC3,$ACC3,$T1.2d // h2 -> h3 - - ushr $T0.2d,$ACC0,#26 - and $ACC0,$ACC0,$MASK.2d - ushr $T1.2d,$ACC3,#26 - and $ACC3,$ACC3,$MASK.2d - add $ACC1,$ACC1,$T0.2d // h0 -> h1 - add $ACC4,$ACC4,$T1.2d // h3 -> h4 - - //////////////////////////////////////////////////////////////// - // write the result, can be partially reduced - - st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16 - mov x4,#1 - st1 {$ACC4}[0],[$ctx] - str x4,[$ctx,#8] // set is_base2_26 - - ldr x29,[sp],#80 - .inst 0xd50323bf // autiasp - ret -.size poly1305_blocks_neon,.-poly1305_blocks_neon - -.pushsection .rodata -.align 5 -.Lzeros: -.long 0,0,0,0,0,0,0,0 -.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm" -.popsection - -.align 2 -#if !defined(__KERNEL__) && !defined(_WIN64) -.comm OPENSSL_armcap_P,4,4 -.hidden OPENSSL_armcap_P -#endif -___ - -foreach (split("\n",$code)) { - s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or - s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or - (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or - (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or - (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or - (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or - (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1)); - - s/\.[124]([sd])\[/.$1\[/; - s/w#x([0-9]+)/w$1/g; - - print $_,"\n"; -} -close STDOUT; diff --git a/arch/arm64/lib/crypto/poly1305-glue.c b/arch/arm64/lib/crypto/poly1305-glue.c deleted file mode 100644 index c9a74766785b..000000000000 --- a/arch/arm64/lib/crypto/poly1305-glue.c +++ /dev/null @@ -1,73 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64 - * - * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org> - */ - -#include <asm/hwcap.h> -#include <asm/neon.h> -#include <crypto/internal/poly1305.h> -#include <linux/cpufeature.h> -#include <linux/jump_label.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/unaligned.h> - -asmlinkage void poly1305_block_init_arch( - struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); -asmlinkage void poly1305_blocks(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, - u8 digest[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -EXPORT_SYMBOL_GPL(poly1305_emit_arch); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); - -void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, - unsigned int len, u32 padbit) -{ - len = round_down(len, POLY1305_BLOCK_SIZE); - if (static_branch_likely(&have_neon)) { - do { - unsigned int todo = min_t(unsigned int, len, SZ_4K); - - kernel_neon_begin(); - poly1305_blocks_neon(state, src, todo, padbit); - kernel_neon_end(); - - len -= todo; - src += todo; - } while (len); - } else - poly1305_blocks(state, src, len, padbit); -} -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); - -bool poly1305_is_arch_optimized(void) -{ - /* We always can use at least the ARM64 scalar implementation. */ - return true; -} -EXPORT_SYMBOL(poly1305_is_arch_optimized); - -static int __init neon_poly1305_mod_init(void) -{ - if (cpu_have_named_feature(ASIMD)) - static_branch_enable(&have_neon); - return 0; -} -subsys_initcall(neon_poly1305_mod_init); - -static void __exit neon_poly1305_mod_exit(void) -{ -} -module_exit(neon_poly1305_mod_exit); - -MODULE_DESCRIPTION("Poly1305 authenticator (ARM64 optimized)"); -MODULE_LICENSE("GPL v2"); diff --git a/arch/arm64/lib/crypto/sha2-armv8.pl b/arch/arm64/lib/crypto/sha2-armv8.pl deleted file mode 100644 index 4aebd20c498b..000000000000 --- a/arch/arm64/lib/crypto/sha2-armv8.pl +++ /dev/null @@ -1,786 +0,0 @@ -#! /usr/bin/env perl -# SPDX-License-Identifier: GPL-2.0 - -# This code is taken from the OpenSSL project but the author (Andy Polyakov) -# has relicensed it under the GPLv2. Therefore this program is free software; -# you can redistribute it and/or modify it under the terms of the GNU General -# Public License version 2 as published by the Free Software Foundation. -# -# The original headers, including the original license headers, are -# included below for completeness. - -# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. -# -# Licensed under the OpenSSL license (the "License"). You may not use -# this file except in compliance with the License. You can obtain a copy -# in the file LICENSE in the source distribution or at -# https://www.openssl.org/source/license.html - -# ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# SHA256/512 for ARMv8. -# -# Performance in cycles per processed byte and improvement coefficient -# over code generated with "default" compiler: -# -# SHA256-hw SHA256(*) SHA512 -# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) -# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) -# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) -# Denver 2.01 10.5 (+26%) 6.70 (+8%) -# X-Gene 20.0 (+100%) 12.8 (+300%(***)) -# Mongoose 2.36 13.0 (+50%) 8.36 (+33%) -# -# (*) Software SHA256 results are of lesser relevance, presented -# mostly for informational purposes. -# (**) The result is a trade-off: it's possible to improve it by -# 10% (or by 1 cycle per round), but at the cost of 20% loss -# on Cortex-A53 (or by 4 cycles per round). -# (***) Super-impressive coefficients over gcc-generated code are -# indication of some compiler "pathology", most notably code -# generated with -mgeneral-regs-only is significantly faster -# and the gap is only 40-90%. -# -# October 2016. -# -# Originally it was reckoned that it makes no sense to implement NEON -# version of SHA256 for 64-bit processors. This is because performance -# improvement on most wide-spread Cortex-A5x processors was observed -# to be marginal, same on Cortex-A53 and ~10% on A57. But then it was -# observed that 32-bit NEON SHA256 performs significantly better than -# 64-bit scalar version on *some* of the more recent processors. As -# result 64-bit NEON version of SHA256 was added to provide best -# all-round performance. For example it executes ~30% faster on X-Gene -# and Mongoose. [For reference, NEON version of SHA512 is bound to -# deliver much less improvement, likely *negative* on Cortex-A5x. -# Which is why NEON support is limited to SHA256.] - -$output=pop; -$flavour=pop; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open OUT,"| \"$^X\" $xlate $flavour $output"; - *STDOUT=*OUT; -} else { - open STDOUT,">$output"; -} - -if ($output =~ /512/) { - $BITS=512; - $SZ=8; - @Sigma0=(28,34,39); - @Sigma1=(14,18,41); - @sigma0=(1, 8, 7); - @sigma1=(19,61, 6); - $rounds=80; - $reg_t="x"; -} else { - $BITS=256; - $SZ=4; - @Sigma0=( 2,13,22); - @Sigma1=( 6,11,25); - @sigma0=( 7,18, 3); - @sigma1=(17,19,10); - $rounds=64; - $reg_t="w"; -} - -$func="sha${BITS}_blocks_arch"; - -($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); - -@X=map("$reg_t$_",(3..15,0..2)); -@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27)); -($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28)); - -sub BODY_00_xx { -my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; -my $j=($i+1)&15; -my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]); - $T0=@X[$i+3] if ($i<11); - -$code.=<<___ if ($i<16); -#ifndef __AARCH64EB__ - rev @X[$i],@X[$i] // $i -#endif -___ -$code.=<<___ if ($i<13 && ($i&1)); - ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ -___ -$code.=<<___ if ($i==13); - ldp @X[14],@X[15],[$inp] -___ -$code.=<<___ if ($i>=14); - ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`] -___ -$code.=<<___ if ($i>0 && $i<16); - add $a,$a,$t1 // h+=Sigma0(a) -___ -$code.=<<___ if ($i>=11); - str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`] -___ -# While ARMv8 specifies merged rotate-n-logical operation such as -# 'eor x,y,z,ror#n', it was found to negatively affect performance -# on Apple A7. The reason seems to be that it requires even 'y' to -# be available earlier. This means that such merged instruction is -# not necessarily best choice on critical path... On the other hand -# Cortex-A5x handles merged instructions much better than disjoint -# rotate and logical... See (**) footnote above. -$code.=<<___ if ($i<15); - ror $t0,$e,#$Sigma1[0] - add $h,$h,$t2 // h+=K[i] - eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]` - and $t1,$f,$e - bic $t2,$g,$e - add $h,$h,@X[$i&15] // h+=X[i] - orr $t1,$t1,$t2 // Ch(e,f,g) - eor $t2,$a,$b // a^b, b^c in next round - eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e) - ror $T0,$a,#$Sigma0[0] - add $h,$h,$t1 // h+=Ch(e,f,g) - eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]` - add $h,$h,$t0 // h+=Sigma1(e) - and $t3,$t3,$t2 // (b^c)&=(a^b) - add $d,$d,$h // d+=h - eor $t3,$t3,$b // Maj(a,b,c) - eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a) - add $h,$h,$t3 // h+=Maj(a,b,c) - ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round - //add $h,$h,$t1 // h+=Sigma0(a) -___ -$code.=<<___ if ($i>=15); - ror $t0,$e,#$Sigma1[0] - add $h,$h,$t2 // h+=K[i] - ror $T1,@X[($j+1)&15],#$sigma0[0] - and $t1,$f,$e - ror $T2,@X[($j+14)&15],#$sigma1[0] - bic $t2,$g,$e - ror $T0,$a,#$Sigma0[0] - add $h,$h,@X[$i&15] // h+=X[i] - eor $t0,$t0,$e,ror#$Sigma1[1] - eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1] - orr $t1,$t1,$t2 // Ch(e,f,g) - eor $t2,$a,$b // a^b, b^c in next round - eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e) - eor $T0,$T0,$a,ror#$Sigma0[1] - add $h,$h,$t1 // h+=Ch(e,f,g) - and $t3,$t3,$t2 // (b^c)&=(a^b) - eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1] - eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1]) - add $h,$h,$t0 // h+=Sigma1(e) - eor $t3,$t3,$b // Maj(a,b,c) - eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a) - eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14]) - add @X[$j],@X[$j],@X[($j+9)&15] - add $d,$d,$h // d+=h - add $h,$h,$t3 // h+=Maj(a,b,c) - ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round - add @X[$j],@X[$j],$T1 - add $h,$h,$t1 // h+=Sigma0(a) - add @X[$j],@X[$j],$T2 -___ - ($t2,$t3)=($t3,$t2); -} - -$code.=<<___; -#ifndef __KERNEL__ -# include "arm_arch.h" -#endif - -.text - -.extern OPENSSL_armcap_P -.globl $func -.type $func,%function -.align 6 -$func: -___ -$code.=<<___ if ($SZ==4); -#ifndef __KERNEL__ -# ifdef __ILP32__ - ldrsw x16,.LOPENSSL_armcap_P -# else - ldr x16,.LOPENSSL_armcap_P -# endif - adr x17,.LOPENSSL_armcap_P - add x16,x16,x17 - ldr w16,[x16] - tst w16,#ARMV8_SHA256 - b.ne .Lv8_entry - tst w16,#ARMV7_NEON - b.ne .Lneon_entry -#endif -___ -$code.=<<___; - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#4*$SZ - - ldp $A,$B,[$ctx] // load context - ldp $C,$D,[$ctx,#2*$SZ] - ldp $E,$F,[$ctx,#4*$SZ] - add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input - ldp $G,$H,[$ctx,#6*$SZ] - adr $Ktbl,.LK$BITS - stp $ctx,$num,[x29,#96] - -.Loop: - ldp @X[0],@X[1],[$inp],#2*$SZ - ldr $t2,[$Ktbl],#$SZ // *K++ - eor $t3,$B,$C // magic seed - str $inp,[x29,#112] -___ -for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } -$code.=".Loop_16_xx:\n"; -for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } -$code.=<<___; - cbnz $t2,.Loop_16_xx - - ldp $ctx,$num,[x29,#96] - ldr $inp,[x29,#112] - sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind - - ldp @X[0],@X[1],[$ctx] - ldp @X[2],@X[3],[$ctx,#2*$SZ] - add $inp,$inp,#14*$SZ // advance input pointer - ldp @X[4],@X[5],[$ctx,#4*$SZ] - add $A,$A,@X[0] - ldp @X[6],@X[7],[$ctx,#6*$SZ] - add $B,$B,@X[1] - add $C,$C,@X[2] - add $D,$D,@X[3] - stp $A,$B,[$ctx] - add $E,$E,@X[4] - add $F,$F,@X[5] - stp $C,$D,[$ctx,#2*$SZ] - add $G,$G,@X[6] - add $H,$H,@X[7] - cmp $inp,$num - stp $E,$F,[$ctx,#4*$SZ] - stp $G,$H,[$ctx,#6*$SZ] - b.ne .Loop - - ldp x19,x20,[x29,#16] - add sp,sp,#4*$SZ - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#128 - ret -.size $func,.-$func - -.align 6 -.type .LK$BITS,%object -.LK$BITS: -___ -$code.=<<___ if ($SZ==8); - .quad 0x428a2f98d728ae22,0x7137449123ef65cd - .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc - .quad 0x3956c25bf348b538,0x59f111f1b605d019 - .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 - .quad 0xd807aa98a3030242,0x12835b0145706fbe - .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 - .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 - .quad 0x9bdc06a725c71235,0xc19bf174cf692694 - .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 - .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 - .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 - .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 - .quad 0x983e5152ee66dfab,0xa831c66d2db43210 - .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 - .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 - .quad 0x06ca6351e003826f,0x142929670a0e6e70 - .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 - .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df - .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 - .quad 0x81c2c92e47edaee6,0x92722c851482353b - .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 - .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 - .quad 0xd192e819d6ef5218,0xd69906245565a910 - .quad 0xf40e35855771202a,0x106aa07032bbd1b8 - .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 - .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 - .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb - .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 - .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 - .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec - .quad 0x90befffa23631e28,0xa4506cebde82bde9 - .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b - .quad 0xca273eceea26619c,0xd186b8c721c0c207 - .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 - .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 - .quad 0x113f9804bef90dae,0x1b710b35131c471b - .quad 0x28db77f523047d84,0x32caab7b40c72493 - .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c - .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a - .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 - .quad 0 // terminator -___ -$code.=<<___ if ($SZ==4); - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - .long 0 //terminator -___ -$code.=<<___; -.size .LK$BITS,.-.LK$BITS -#ifndef __KERNEL__ -.align 3 -.LOPENSSL_armcap_P: -# ifdef __ILP32__ - .long OPENSSL_armcap_P-. -# else - .quad OPENSSL_armcap_P-. -# endif -#endif -.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" -.align 2 -___ - -if ($SZ==4) { -my $Ktbl="x3"; - -my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); -my @MSG=map("v$_.16b",(4..7)); -my ($W0,$W1)=("v16.4s","v17.4s"); -my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); - -$code.=<<___; -#ifndef __KERNEL__ -.type sha256_block_armv8,%function -.align 6 -sha256_block_armv8: -.Lv8_entry: - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - ld1.32 {$ABCD,$EFGH},[$ctx] - adr $Ktbl,.LK256 - -.Loop_hw: - ld1 {@MSG[0]-@MSG[3]},[$inp],#64 - sub $num,$num,#1 - ld1.32 {$W0},[$Ktbl],#16 - rev32 @MSG[0],@MSG[0] - rev32 @MSG[1],@MSG[1] - rev32 @MSG[2],@MSG[2] - rev32 @MSG[3],@MSG[3] - orr $ABCD_SAVE,$ABCD,$ABCD // offload - orr $EFGH_SAVE,$EFGH,$EFGH -___ -for($i=0;$i<12;$i++) { -$code.=<<___; - ld1.32 {$W1},[$Ktbl],#16 - add.i32 $W0,$W0,@MSG[0] - sha256su0 @MSG[0],@MSG[1] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - sha256su1 @MSG[0],@MSG[2],@MSG[3] -___ - ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); -} -$code.=<<___; - ld1.32 {$W1},[$Ktbl],#16 - add.i32 $W0,$W0,@MSG[0] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - - ld1.32 {$W0},[$Ktbl],#16 - add.i32 $W1,$W1,@MSG[1] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W1 - sha256h2 $EFGH,$abcd,$W1 - - ld1.32 {$W1},[$Ktbl] - add.i32 $W0,$W0,@MSG[2] - sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - - add.i32 $W1,$W1,@MSG[3] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W1 - sha256h2 $EFGH,$abcd,$W1 - - add.i32 $ABCD,$ABCD,$ABCD_SAVE - add.i32 $EFGH,$EFGH,$EFGH_SAVE - - cbnz $num,.Loop_hw - - st1.32 {$ABCD,$EFGH},[$ctx] - - ldr x29,[sp],#16 - ret -.size sha256_block_armv8,.-sha256_block_armv8 -#endif -___ -} - -if ($SZ==4) { ######################################### NEON stuff # -# You'll surely note a lot of similarities with sha256-armv4 module, -# and of course it's not a coincidence. sha256-armv4 was used as -# initial template, but was adapted for ARMv8 instruction set and -# extensively re-tuned for all-round performance. - -my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); -my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); -my $Ktbl="x16"; -my $Xfer="x17"; -my @X = map("q$_",(0..3)); -my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); -my $j=0; - -sub AUTOLOAD() # thunk [simplified] x86-style perlasm -{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; - my $arg = pop; - $arg = "#$arg" if ($arg*1 eq $arg); - $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; -} - -sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } -sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } -sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } - -sub Xupdate() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); - my ($a,$b,$c,$d,$e,$f,$g,$h); - - &ext_8 ($T0,@X[0],@X[1],4); # X[1..4] - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &ext_8 ($T3,@X[2],@X[3],4); # X[9..12] - eval(shift(@insns)); - eval(shift(@insns)); - &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15] - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T2,$T0,$sigma0[0]); - eval(shift(@insns)); - &ushr_32 ($T1,$T0,$sigma0[2]); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12] - eval(shift(@insns)); - &sli_32 ($T2,$T0,32-$sigma0[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T3,$T0,$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T1,$T1,$T2); - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T3,$T0,32-$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T4,$T7,$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T4,$T7,32-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T5,$T7,$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T3,$T7,$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &sli_u32 ($T3,$T7,32-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T5,$T5,$T4); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T6,@X[0],$sigma1[0]); - eval(shift(@insns)); - &ushr_32 ($T7,@X[0],$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T6,@X[0],32-$sigma1[0]); - eval(shift(@insns)); - &ushr_32 ($T5,@X[0],$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T7,$T7,$T6); - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T5,@X[0],32-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &ld1_32 ("{$T0}","[$Ktbl], #16"); - eval(shift(@insns)); - &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17]) - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T5,$T5,$T5); - eval(shift(@insns)); - eval(shift(@insns)); - &mov (&Dhi($T5), &Dlo($T7)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 ($T0,$T0,@X[0]); - while($#insns>=1) { eval(shift(@insns)); } - &st1_32 ("{$T0}","[$Xfer], #16"); - eval(shift(@insns)); - - push(@X,shift(@X)); # "rotate" X[] -} - -sub Xpreload() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); - my ($a,$b,$c,$d,$e,$f,$g,$h); - - eval(shift(@insns)); - eval(shift(@insns)); - &ld1_8 ("{@X[0]}","[$inp],#16"); - eval(shift(@insns)); - eval(shift(@insns)); - &ld1_32 ("{$T0}","[$Ktbl],#16"); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &rev32 (@X[0],@X[0]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 ($T0,$T0,@X[0]); - foreach (@insns) { eval; } # remaining instructions - &st1_32 ("{$T0}","[$Xfer], #16"); - - push(@X,shift(@X)); # "rotate" X[] -} - -sub body_00_15 () { - ( - '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. - '&add ($h,$h,$t1)', # h+=X[i]+K[i] - '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past - '&and ($t1,$f,$e)', - '&bic ($t4,$g,$e)', - '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', - '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past - '&orr ($t1,$t1,$t4)', # Ch(e,f,g) - '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) - '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', - '&add ($h,$h,$t1)', # h+=Ch(e,f,g) - '&ror ($t0,$t0,"#$Sigma1[0]")', - '&eor ($t2,$a,$b)', # a^b, b^c in next round - '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) - '&add ($h,$h,$t0)', # h+=Sigma1(e) - '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. - '&ldr ($t1,"[$Ktbl]") if ($j==15);'. - '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) - '&ror ($t4,$t4,"#$Sigma0[0]")', - '&add ($d,$d,$h)', # d+=h - '&eor ($t3,$t3,$b)', # Maj(a,b,c) - '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' - ) -} - -$code.=<<___; -#ifdef __KERNEL__ -.globl sha256_block_neon -#endif -.type sha256_block_neon,%function -.align 4 -sha256_block_neon: -.Lneon_entry: - stp x29, x30, [sp, #-16]! - mov x29, sp - sub sp,sp,#16*4 - - adr $Ktbl,.LK256 - add $num,$inp,$num,lsl#6 // len to point at the end of inp - - ld1.8 {@X[0]},[$inp], #16 - ld1.8 {@X[1]},[$inp], #16 - ld1.8 {@X[2]},[$inp], #16 - ld1.8 {@X[3]},[$inp], #16 - ld1.32 {$T0},[$Ktbl], #16 - ld1.32 {$T1},[$Ktbl], #16 - ld1.32 {$T2},[$Ktbl], #16 - ld1.32 {$T3},[$Ktbl], #16 - rev32 @X[0],@X[0] // yes, even on - rev32 @X[1],@X[1] // big-endian - rev32 @X[2],@X[2] - rev32 @X[3],@X[3] - mov $Xfer,sp - add.32 $T0,$T0,@X[0] - add.32 $T1,$T1,@X[1] - add.32 $T2,$T2,@X[2] - st1.32 {$T0-$T1},[$Xfer], #32 - add.32 $T3,$T3,@X[3] - st1.32 {$T2-$T3},[$Xfer] - sub $Xfer,$Xfer,#32 - - ldp $A,$B,[$ctx] - ldp $C,$D,[$ctx,#8] - ldp $E,$F,[$ctx,#16] - ldp $G,$H,[$ctx,#24] - ldr $t1,[sp,#0] - mov $t2,wzr - eor $t3,$B,$C - mov $t4,wzr - b .L_00_48 - -.align 4 -.L_00_48: -___ - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); -$code.=<<___; - cmp $t1,#0 // check for K256 terminator - ldr $t1,[sp,#0] - sub $Xfer,$Xfer,#64 - bne .L_00_48 - - sub $Ktbl,$Ktbl,#256 // rewind $Ktbl - cmp $inp,$num - mov $Xfer, #64 - csel $Xfer, $Xfer, xzr, eq - sub $inp,$inp,$Xfer // avoid SEGV - mov $Xfer,sp -___ - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); -$code.=<<___; - add $A,$A,$t4 // h+=Sigma0(a) from the past - ldp $t0,$t1,[$ctx,#0] - add $A,$A,$t2 // h+=Maj(a,b,c) from the past - ldp $t2,$t3,[$ctx,#8] - add $A,$A,$t0 // accumulate - add $B,$B,$t1 - ldp $t0,$t1,[$ctx,#16] - add $C,$C,$t2 - add $D,$D,$t3 - ldp $t2,$t3,[$ctx,#24] - add $E,$E,$t0 - add $F,$F,$t1 - ldr $t1,[sp,#0] - stp $A,$B,[$ctx,#0] - add $G,$G,$t2 - mov $t2,wzr - stp $C,$D,[$ctx,#8] - add $H,$H,$t3 - stp $E,$F,[$ctx,#16] - eor $t3,$B,$C - stp $G,$H,[$ctx,#24] - mov $t4,wzr - mov $Xfer,sp - b.ne .L_00_48 - - ldr x29,[x29] - add sp,sp,#16*4+16 - ret -.size sha256_block_neon,.-sha256_block_neon -___ -} - -$code.=<<___; -#ifndef __KERNEL__ -.comm OPENSSL_armcap_P,4,4 -#endif -___ - -{ my %opcode = ( - "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, - "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); - - sub unsha256 { - my ($mnemonic,$arg)=@_; - - $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o - && - sprintf ".inst\t0x%08x\t//%s %s", - $opcode{$mnemonic}|$1|($2<<5)|($3<<16), - $mnemonic,$arg; - } -} - -open SELF,$0; -while(<SELF>) { - next if (/^#!/); - last if (!s/^#/\/\// and !/^$/); - print; -} -close SELF; - -foreach(split("\n",$code)) { - - s/\`([^\`]*)\`/eval($1)/ge; - - s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; - - s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers - - s/\.[ui]?8(\s)/$1/; - s/\.\w?32\b// and s/\.16b/\.4s/g; - m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; - - print $_,"\n"; -} - -close STDOUT; diff --git a/arch/arm64/lib/crypto/sha256-ce.S b/arch/arm64/lib/crypto/sha256-ce.S deleted file mode 100644 index f3e21c6d87d2..000000000000 --- a/arch/arm64/lib/crypto/sha256-ce.S +++ /dev/null @@ -1,136 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions - * - * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> - */ - -#include <linux/linkage.h> -#include <asm/assembler.h> - - .text - .arch armv8-a+crypto - - dga .req q20 - dgav .req v20 - dgb .req q21 - dgbv .req v21 - - t0 .req v22 - t1 .req v23 - - dg0q .req q24 - dg0v .req v24 - dg1q .req q25 - dg1v .req v25 - dg2q .req q26 - dg2v .req v26 - - .macro add_only, ev, rc, s0 - mov dg2v.16b, dg0v.16b - .ifeq \ev - add t1.4s, v\s0\().4s, \rc\().4s - sha256h dg0q, dg1q, t0.4s - sha256h2 dg1q, dg2q, t0.4s - .else - .ifnb \s0 - add t0.4s, v\s0\().4s, \rc\().4s - .endif - sha256h dg0q, dg1q, t1.4s - sha256h2 dg1q, dg2q, t1.4s - .endif - .endm - - .macro add_update, ev, rc, s0, s1, s2, s3 - sha256su0 v\s0\().4s, v\s1\().4s - add_only \ev, \rc, \s1 - sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s - .endm - - /* - * The SHA-256 round constants - */ - .section ".rodata", "a" - .align 4 -.Lsha2_rcon: - .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 - .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 - .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 - .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 - .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc - .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da - .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 - .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 - .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 - .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 - .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 - .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 - .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 - .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 - .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 - .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 - - /* - * size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS], - * const u8 *data, size_t nblocks); - */ - .text -SYM_FUNC_START(__sha256_ce_transform) - /* load round constants */ - adr_l x8, .Lsha2_rcon - ld1 { v0.4s- v3.4s}, [x8], #64 - ld1 { v4.4s- v7.4s}, [x8], #64 - ld1 { v8.4s-v11.4s}, [x8], #64 - ld1 {v12.4s-v15.4s}, [x8] - - /* load state */ - ld1 {dgav.4s, dgbv.4s}, [x0] - - /* load input */ -0: ld1 {v16.4s-v19.4s}, [x1], #64 - sub x2, x2, #1 - -CPU_LE( rev32 v16.16b, v16.16b ) -CPU_LE( rev32 v17.16b, v17.16b ) -CPU_LE( rev32 v18.16b, v18.16b ) -CPU_LE( rev32 v19.16b, v19.16b ) - - add t0.4s, v16.4s, v0.4s - mov dg0v.16b, dgav.16b - mov dg1v.16b, dgbv.16b - - add_update 0, v1, 16, 17, 18, 19 - add_update 1, v2, 17, 18, 19, 16 - add_update 0, v3, 18, 19, 16, 17 - add_update 1, v4, 19, 16, 17, 18 - - add_update 0, v5, 16, 17, 18, 19 - add_update 1, v6, 17, 18, 19, 16 - add_update 0, v7, 18, 19, 16, 17 - add_update 1, v8, 19, 16, 17, 18 - - add_update 0, v9, 16, 17, 18, 19 - add_update 1, v10, 17, 18, 19, 16 - add_update 0, v11, 18, 19, 16, 17 - add_update 1, v12, 19, 16, 17, 18 - - add_only 0, v13, 17 - add_only 1, v14, 18 - add_only 0, v15, 19 - add_only 1 - - /* update state */ - add dgav.4s, dgav.4s, dg0v.4s - add dgbv.4s, dgbv.4s, dg1v.4s - - /* return early if voluntary preemption is needed */ - cond_yield 1f, x5, x6 - - /* handled all input blocks? */ - cbnz x2, 0b - - /* store new state */ -1: st1 {dgav.4s, dgbv.4s}, [x0] - mov x0, x2 - ret -SYM_FUNC_END(__sha256_ce_transform) diff --git a/arch/arm64/lib/crypto/sha256.c b/arch/arm64/lib/crypto/sha256.c deleted file mode 100644 index bcf7a3adc0c4..000000000000 --- a/arch/arm64/lib/crypto/sha256.c +++ /dev/null @@ -1,75 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SHA-256 optimized for ARM64 - * - * Copyright 2025 Google LLC - */ -#include <asm/neon.h> -#include <crypto/internal/sha2.h> -#include <linux/kernel.h> -#include <linux/module.h> - -asmlinkage void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); -EXPORT_SYMBOL_GPL(sha256_blocks_arch); -asmlinkage void sha256_block_neon(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); -asmlinkage size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); - -void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks) -{ - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - static_branch_likely(&have_neon)) { - if (static_branch_likely(&have_ce)) { - do { - size_t rem; - - kernel_neon_begin(); - rem = __sha256_ce_transform(state, - data, nblocks); - kernel_neon_end(); - data += (nblocks - rem) * SHA256_BLOCK_SIZE; - nblocks = rem; - } while (nblocks); - } else { - kernel_neon_begin(); - sha256_block_neon(state, data, nblocks); - kernel_neon_end(); - } - } else { - sha256_blocks_arch(state, data, nblocks); - } -} -EXPORT_SYMBOL_GPL(sha256_blocks_simd); - -bool sha256_is_arch_optimized(void) -{ - /* We always can use at least the ARM64 scalar implementation. */ - return true; -} -EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); - -static int __init sha256_arm64_mod_init(void) -{ - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - cpu_have_named_feature(ASIMD)) { - static_branch_enable(&have_neon); - if (cpu_have_named_feature(SHA2)) - static_branch_enable(&have_ce); - } - return 0; -} -subsys_initcall(sha256_arm64_mod_init); - -static void __exit sha256_arm64_mod_exit(void) -{ -} -module_exit(sha256_arm64_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA-256 optimized for ARM64"); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index ec0a337891dd..11eb8d1adc84 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -487,17 +487,29 @@ static void do_bad_area(unsigned long far, unsigned long esr, } } -static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma, - unsigned int mm_flags) +static bool fault_from_pkey(struct vm_area_struct *vma, unsigned int mm_flags) { - unsigned long iss2 = ESR_ELx_ISS2(esr); - if (!system_supports_poe()) return false; - if (esr_fsc_is_permission_fault(esr) && (iss2 & ESR_ELx_Overlay)) - return true; - + /* + * We do not check whether an Overlay fault has occurred because we + * cannot make a decision based solely on its value: + * + * - If Overlay is set, a fault did occur due to POE, but it may be + * spurious in those cases where we update POR_EL0 without ISB (e.g. + * on context-switch). We would then need to manually check POR_EL0 + * against vma_pkey(vma), which is exactly what + * arch_vma_access_permitted() does. + * + * - If Overlay is not set, we may still need to report a pkey fault. + * This is the case if an access was made within a mapping but with no + * page mapped, and POR_EL0 forbids the access (according to + * vma_pkey()). Such access will result in a SIGSEGV regardless + * because core code checks arch_vma_access_permitted(), but in order + * to report the correct error code - SEGV_PKUERR - we must handle + * that case here. + */ return !arch_vma_access_permitted(vma, mm_flags & FAULT_FLAG_WRITE, mm_flags & FAULT_FLAG_INSTRUCTION, @@ -635,7 +647,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, goto bad_area; } - if (fault_from_pkey(esr, vma, mm_flags)) { + if (fault_from_pkey(vma, mm_flags)) { pkey = vma_pkey(vma); vma_end_read(vma); fault = 0; @@ -679,7 +691,7 @@ retry: goto bad_area; } - if (fault_from_pkey(esr, vma, mm_flags)) { + if (fault_from_pkey(vma, mm_flags)) { pkey = vma_pkey(vma); mmap_read_unlock(mm); fault = 0; diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 80d470aa469d..54dccfd6aa11 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -518,7 +518,6 @@ alternative_else_nop_endif msr REG_PIR_EL1, x0 orr tcr2, tcr2, TCR2_EL1_PIE - msr REG_TCR2_EL1, x0 .Lskip_indirection: diff --git a/arch/arm64/tools/syscall_32.tbl b/arch/arm64/tools/syscall_32.tbl index 0765b3a8d6d6..8d9088bc577d 100644 --- a/arch/arm64/tools/syscall_32.tbl +++ b/arch/arm64/tools/syscall_32.tbl @@ -479,3 +479,5 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr |