Diffstat (limited to 'arch/arm')
-rw-r--r--  arch/arm/Kconfig | 6
-rw-r--r--  arch/arm/Makefile | 2
-rw-r--r--  arch/arm/boot/compressed/Makefile | 2
-rw-r--r--  arch/arm/boot/dts/allwinner/sun8i-v3s.dtsi | 2
-rw-r--r--  arch/arm/configs/exynos_defconfig | 2
-rw-r--r--  arch/arm/configs/milbeaut_m10v_defconfig | 3
-rw-r--r--  arch/arm/configs/multi_v7_defconfig | 9
-rw-r--r--  arch/arm/configs/omap2plus_defconfig | 2
-rw-r--r--  arch/arm/configs/pxa_defconfig | 2
-rw-r--r--  arch/arm/crypto/Kconfig | 41
-rw-r--r--  arch/arm/crypto/Makefile | 21
-rw-r--r--  arch/arm/crypto/sha1-armv4-large.S | 507
-rw-r--r--  arch/arm/crypto/sha1-armv7-neon.S | 634
-rw-r--r--  arch/arm/crypto/sha1-ce-core.S | 123
-rw-r--r--  arch/arm/crypto/sha1-ce-glue.c | 72
-rw-r--r--  arch/arm/crypto/sha1_glue.c | 75
-rw-r--r--  arch/arm/crypto/sha1_neon_glue.c | 83
-rw-r--r--  arch/arm/crypto/sha512-armv4.pl | 657
-rw-r--r--  arch/arm/crypto/sha512-glue.c | 110
-rw-r--r--  arch/arm/crypto/sha512-neon-glue.c | 75
-rw-r--r--  arch/arm/crypto/sha512.h | 3
-rw-r--r--  arch/arm/kernel/entry-common.S | 2
-rw-r--r--  arch/arm/kernel/ptrace.c | 6
-rw-r--r--  arch/arm/lib/.gitignore | 4
-rw-r--r--  arch/arm/lib/Makefile | 8
-rw-r--r--  arch/arm/lib/crc-t10dif-core.S | 468
-rw-r--r--  arch/arm/lib/crc-t10dif.c | 72
-rw-r--r--  arch/arm/lib/crc32-core.S | 306
-rw-r--r--  arch/arm/lib/crc32.c | 123
-rw-r--r--  arch/arm/lib/crypto/.gitignore | 3
-rw-r--r--  arch/arm/lib/crypto/Kconfig | 31
-rw-r--r--  arch/arm/lib/crypto/Makefile | 32
-rw-r--r--  arch/arm/lib/crypto/blake2s-core.S | 306
-rw-r--r--  arch/arm/lib/crypto/blake2s-glue.c | 7
-rw-r--r--  arch/arm/lib/crypto/chacha-glue.c | 138
-rw-r--r--  arch/arm/lib/crypto/chacha-neon-core.S | 643
-rw-r--r--  arch/arm/lib/crypto/chacha-scalar-core.S | 444
-rw-r--r--  arch/arm/lib/crypto/poly1305-armv4.pl | 1236
-rw-r--r--  arch/arm/lib/crypto/poly1305-glue.c | 80
-rw-r--r--  arch/arm/lib/crypto/sha256-armv4.pl | 724
-rw-r--r--  arch/arm/lib/crypto/sha256-ce.S | 123
-rw-r--r--  arch/arm/lib/crypto/sha256.c | 64
-rw-r--r--  arch/arm/mach-omap1/board-ams-delta.c | 42
-rw-r--r--  arch/arm/mach-s3c/mach-crag6410.c | 17
-rw-r--r--  arch/arm/mach-sa1100/assabet.c | 2
-rw-r--r--  arch/arm/mach-sa1100/neponset.c | 2
-rw-r--r--  arch/arm/mm/cache-feroceon-l2.c | 2
-rw-r--r--  arch/arm/mm/cache-tauros2.c | 2
-rw-r--r--  arch/arm/tools/syscall.tbl | 2
-rw-r--r--  arch/arm/vdso/Makefile | 2
50 files changed, 54 insertions, 7268 deletions
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 3072731fe09c..a6e80653abd1 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -8,8 +8,6 @@ config ARM
select ARCH_HAS_CACHE_LINE_SIZE if OF
select ARCH_HAS_CPU_CACHE_ALIASING
select ARCH_HAS_CPU_FINALIZE_INIT if MMU
- select ARCH_HAS_CRC32 if KERNEL_MODE_NEON
- select ARCH_HAS_CRC_T10DIF if KERNEL_MODE_NEON
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL if MMU
select ARCH_HAS_DMA_ALLOC if MMU
@@ -87,11 +85,11 @@ config ARM
select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL
select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN
+ select HAVE_ARCH_KSTACK_ERASE
select HAVE_ARCH_MMAP_RND_BITS if MMU
select HAVE_ARCH_PFN_VALID
select HAVE_ARCH_SECCOMP
select HAVE_ARCH_SECCOMP_FILTER if AEABI && !OABI_COMPAT
- select HAVE_ARCH_STACKLEAK
select HAVE_ARCH_THREAD_STRUCT_WHITELIST
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE if ARM_LPAE
@@ -121,7 +119,7 @@ config ARM
select HAVE_KERNEL_XZ
select HAVE_KPROBES if !XIP_KERNEL && !CPU_ENDIAN_BE32 && !CPU_V7M
select HAVE_KRETPROBES if HAVE_KPROBES
- select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if (LD_VERSION >= 23600 || LD_CAN_USE_KEEP_IN_OVERLAY)
+ select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if (LD_VERSION >= 23600 || LD_IS_LLD) && LD_CAN_USE_KEEP_IN_OVERLAY
select HAVE_MOD_ARCH_SPECIFIC
select HAVE_NMI
select HAVE_OPTPROBES if !THUMB2_KERNEL
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 4808d3ed98e4..e31e95ffd33f 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -149,7 +149,7 @@ endif
# Need -Uarm for gcc < 3.x
KBUILD_CPPFLAGS +=$(cpp-y)
KBUILD_CFLAGS +=$(CFLAGS_ABI) $(CFLAGS_ISA) $(arch-y) $(tune-y) $(call cc-option,-mshort-load-bytes,$(call cc-option,-malignment-traps,)) -msoft-float -Uarm
-KBUILD_AFLAGS +=$(CFLAGS_ABI) $(AFLAGS_ISA) -Wa,$(arch-y) $(tune-y) -include asm/unified.h -msoft-float
+KBUILD_AFLAGS +=$(CFLAGS_ABI) $(AFLAGS_ISA) -Wa,$(arch-y) $(tune-y) -include $(srctree)/arch/arm/include/asm/unified.h -msoft-float
KBUILD_RUSTFLAGS += --target=arm-unknown-linux-gnueabi
CHECKFLAGS += -D__arm__
diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile
index d61369b1eabe..a159120d1e42 100644
--- a/arch/arm/boot/compressed/Makefile
+++ b/arch/arm/boot/compressed/Makefile
@@ -9,7 +9,6 @@ OBJS =
HEAD = head.o
OBJS += misc.o decompress.o
-CFLAGS_decompress.o += $(DISABLE_STACKLEAK_PLUGIN)
ifeq ($(CONFIG_DEBUG_UNCOMPRESS),y)
OBJS += debug.o
AFLAGS_head.o += -DDEBUG
@@ -96,6 +95,7 @@ KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
ccflags-y := -fpic $(call cc-option,-mno-single-pic-base,) -fno-builtin \
-I$(srctree)/scripts/dtc/libfdt -fno-stack-protector \
+ $(DISABLE_KSTACK_ERASE) \
-I$(obj)
ccflags-remove-$(CONFIG_FUNCTION_TRACER) += -pg
asflags-y := -DZIMAGE
diff --git a/arch/arm/boot/dts/allwinner/sun8i-v3s.dtsi b/arch/arm/boot/dts/allwinner/sun8i-v3s.dtsi
index f909b1d4dbca..e82cf312da25 100644
--- a/arch/arm/boot/dts/allwinner/sun8i-v3s.dtsi
+++ b/arch/arm/boot/dts/allwinner/sun8i-v3s.dtsi
@@ -652,7 +652,7 @@
reg = <0x01cb4000 0x3000>;
interrupts = <GIC_SPI 84 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&ccu CLK_BUS_CSI>,
- <&ccu CLK_CSI1_SCLK>,
+ <&ccu CLK_CSI_SCLK>,
<&ccu CLK_DRAM_CSI>;
clock-names = "bus", "mod", "ram";
resets = <&ccu RST_BUS_CSI>;
diff --git a/arch/arm/configs/exynos_defconfig b/arch/arm/configs/exynos_defconfig
index f71af368674c..6915c766923a 100644
--- a/arch/arm/configs/exynos_defconfig
+++ b/arch/arm/configs/exynos_defconfig
@@ -363,8 +363,6 @@ CONFIG_CRYPTO_USER_API_HASH=m
CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
-CONFIG_CRYPTO_SHA1_ARM_NEON=m
-CONFIG_CRYPTO_SHA512_ARM=m
CONFIG_CRYPTO_AES_ARM_BS=m
CONFIG_CRYPTO_CHACHA20_NEON=m
CONFIG_CRYPTO_DEV_EXYNOS_RNG=y
diff --git a/arch/arm/configs/milbeaut_m10v_defconfig b/arch/arm/configs/milbeaut_m10v_defconfig
index 242e7d5a3f68..a3be0b2ede09 100644
--- a/arch/arm/configs/milbeaut_m10v_defconfig
+++ b/arch/arm/configs/milbeaut_m10v_defconfig
@@ -98,9 +98,6 @@ CONFIG_CRYPTO_SELFTESTS=y
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_SEQIV=m
CONFIG_CRYPTO_GHASH_ARM_CE=m
-CONFIG_CRYPTO_SHA1_ARM_NEON=m
-CONFIG_CRYPTO_SHA1_ARM_CE=m
-CONFIG_CRYPTO_SHA512_ARM=m
CONFIG_CRYPTO_AES_ARM=m
CONFIG_CRYPTO_AES_ARM_BS=m
CONFIG_CRYPTO_AES_ARM_CE=m
diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig
index 50c170b4619f..c75808b94be6 100644
--- a/arch/arm/configs/multi_v7_defconfig
+++ b/arch/arm/configs/multi_v7_defconfig
@@ -791,8 +791,11 @@ CONFIG_SND=m
CONFIG_SND_HDA_TEGRA=m
CONFIG_SND_HDA_INPUT_BEEP=y
CONFIG_SND_HDA_PATCH_LOADER=y
-CONFIG_SND_HDA_CODEC_REALTEK=m
+CONFIG_SND_HDA_CODEC_REALTEK=y
+CONFIG_SND_HDA_CODEC_REALTEK_LIB=m
+CONFIG_SND_HDA_CODEC_ALC269=m
CONFIG_SND_HDA_CODEC_HDMI=m
+CONFIG_SND_HDA_CODEC_HDMI_TEGRA=m
CONFIG_SND_USB_AUDIO=m
CONFIG_SND_SOC=m
CONFIG_SND_ATMEL_SOC=m
@@ -1280,9 +1283,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
CONFIG_CRYPTO_GHASH_ARM_CE=m
-CONFIG_CRYPTO_SHA1_ARM_NEON=m
-CONFIG_CRYPTO_SHA1_ARM_CE=m
-CONFIG_CRYPTO_SHA512_ARM=m
CONFIG_CRYPTO_AES_ARM=m
CONFIG_CRYPTO_AES_ARM_BS=m
CONFIG_CRYPTO_AES_ARM_CE=m
@@ -1298,7 +1298,6 @@ CONFIG_CRYPTO_DEV_MARVELL_CESA=m
CONFIG_CRYPTO_DEV_QCE=m
CONFIG_CRYPTO_DEV_QCOM_RNG=m
CONFIG_CRYPTO_DEV_ROCKCHIP=m
-CONFIG_CRYPTO_DEV_STM32_CRC=m
CONFIG_CRYPTO_DEV_STM32_HASH=m
CONFIG_CRYPTO_DEV_STM32_CRYP=m
CONFIG_CMA_SIZE_MBYTES=64
diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig
index 9f9780c8e62a..046467637901 100644
--- a/arch/arm/configs/omap2plus_defconfig
+++ b/arch/arm/configs/omap2plus_defconfig
@@ -704,8 +704,6 @@ CONFIG_NLS_ISO8859_1=y
CONFIG_SECURITY=y
CONFIG_CRYPTO_MICHAEL_MIC=y
CONFIG_CRYPTO_GHASH_ARM_CE=m
-CONFIG_CRYPTO_SHA1_ARM_NEON=m
-CONFIG_CRYPTO_SHA512_ARM=m
CONFIG_CRYPTO_AES_ARM=m
CONFIG_CRYPTO_AES_ARM_BS=m
CONFIG_CRYPTO_CHACHA20_NEON=m
diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig
index ff29c5b0e9c9..1a80602c1284 100644
--- a/arch/arm/configs/pxa_defconfig
+++ b/arch/arm/configs/pxa_defconfig
@@ -658,8 +658,6 @@ CONFIG_CRYPTO_ANUBIS=m
CONFIG_CRYPTO_XCBC=m
CONFIG_CRYPTO_DEFLATE=y
CONFIG_CRYPTO_LZO=y
-CONFIG_CRYPTO_SHA1_ARM=m
-CONFIG_CRYPTO_SHA512_ARM=m
CONFIG_CRYPTO_AES_ARM=m
CONFIG_FONTS=y
CONFIG_FONT_8x8=y
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 7efb9a8596e4..1e5f3cdf691c 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -62,47 +62,6 @@ config CRYPTO_BLAKE2B_NEON
much faster than the SHA-2 family and slightly faster than
SHA-1.
-config CRYPTO_SHA1_ARM
- tristate "Hash functions: SHA-1"
- select CRYPTO_SHA1
- select CRYPTO_HASH
- help
- SHA-1 secure hash algorithm (FIPS 180)
-
- Architecture: arm
-
-config CRYPTO_SHA1_ARM_NEON
- tristate "Hash functions: SHA-1 (NEON)"
- depends on KERNEL_MODE_NEON
- select CRYPTO_SHA1_ARM
- select CRYPTO_SHA1
- select CRYPTO_HASH
- help
- SHA-1 secure hash algorithm (FIPS 180)
-
- Architecture: arm using
- - NEON (Advanced SIMD) extensions
-
-config CRYPTO_SHA1_ARM_CE
- tristate "Hash functions: SHA-1 (ARMv8 Crypto Extensions)"
- depends on KERNEL_MODE_NEON
- select CRYPTO_SHA1_ARM
- select CRYPTO_HASH
- help
- SHA-1 secure hash algorithm (FIPS 180)
-
- Architecture: arm using ARMv8 Crypto Extensions
-
-config CRYPTO_SHA512_ARM
- tristate "Hash functions: SHA-384 and SHA-512 (NEON)"
- select CRYPTO_HASH
- depends on !CPU_V7M
- help
- SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
-
- Architecture: arm using
- - NEON (Advanced SIMD) extensions
-
config CRYPTO_AES_ARM
tristate "Ciphers: AES"
select CRYPTO_ALGAPI
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 8479137c6e80..4f23999ae17d 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -5,38 +5,17 @@
obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
-obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
-obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
-obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o
obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
aes-arm-y := aes-cipher-core.o aes-cipher-glue.o
aes-arm-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
-sha1-arm-y := sha1-armv4-large.o sha1_glue.o
-sha1-arm-neon-y := sha1-armv7-neon.o sha1_neon_glue.o
-sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o
-sha512-arm-y := sha512-core.o sha512-glue.o $(sha512-arm-neon-y)
blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o
-sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o
aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
curve25519-neon-y := curve25519-core.o curve25519-glue.o
-
-quiet_cmd_perl = PERL $@
- cmd_perl = $(PERL) $(<) > $(@)
-
-$(obj)/%-core.S: $(src)/%-armv4.pl
- $(call cmd,perl)
-
-clean-files += sha512-core.S
-
-aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1
-
-AFLAGS_sha512-core.o += $(aflags-thumb2-y)
diff --git a/arch/arm/crypto/sha1-armv4-large.S b/arch/arm/crypto/sha1-armv4-large.S
deleted file mode 100644
index 1c8b685149f2..000000000000
--- a/arch/arm/crypto/sha1-armv4-large.S
+++ /dev/null
@@ -1,507 +0,0 @@
-#define __ARM_ARCH__ __LINUX_ARM_ARCH__
-@ SPDX-License-Identifier: GPL-2.0
-
-@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
-@ has relicensed it under the GPLv2. Therefore this program is free software;
-@ you can redistribute it and/or modify it under the terms of the GNU General
-@ Public License version 2 as published by the Free Software Foundation.
-@
-@ The original headers, including the original license headers, are
-@ included below for completeness.
-
-@ ====================================================================
-@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-@ project. The module is, however, dual licensed under OpenSSL and
-@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see https://www.openssl.org/~appro/cryptogams/.
-@ ====================================================================
-
-@ sha1_block procedure for ARMv4.
-@
-@ January 2007.
-
-@ Size/performance trade-off
-@ ====================================================================
-@ impl size in bytes comp cycles[*] measured performance
-@ ====================================================================
-@ thumb 304 3212 4420
-@ armv4-small 392/+29% 1958/+64% 2250/+96%
-@ armv4-compact 740/+89% 1552/+26% 1840/+22%
-@ armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
-@ full unroll ~5100/+260% ~1260/+4% ~1300/+5%
-@ ====================================================================
-@ thumb = same as 'small' but in Thumb instructions[**] and
-@ with recurring code in two private functions;
-@ small = detached Xload/update, loops are folded;
-@ compact = detached Xload/update, 5x unroll;
-@ large = interleaved Xload/update, 5x unroll;
-@ full unroll = interleaved Xload/update, full unroll, estimated[!];
-@
-@ [*] Manually counted instructions in "grand" loop body. Measured
-@ performance is affected by prologue and epilogue overhead,
-@ i-cache availability, branch penalties, etc.
-@ [**] While each Thumb instruction is twice smaller, they are not as
-@ diverse as ARM ones: e.g., there are only two arithmetic
-@ instructions with 3 arguments, no [fixed] rotate, addressing
-@ modes are limited. As result it takes more instructions to do
-@ the same job in Thumb, therefore the code is never twice as
-@ small and always slower.
-@ [***] which is also ~35% better than compiler generated code. Dual-
-@ issue Cortex A8 core was measured to process input block in
-@ ~990 cycles.
-
-@ August 2010.
-@
-@ Rescheduling for dual-issue pipeline resulted in 13% improvement on
-@ Cortex A8 core and in absolute terms ~870 cycles per input block
-@ [or 13.6 cycles per byte].
-
-@ February 2011.
-@
-@ Profiler-assisted and platform-specific optimization resulted in 10%
-@ improvement on Cortex A8 core and 12.2 cycles per byte.
-
-#include <linux/linkage.h>
-
-.text
-
-.align 2
-ENTRY(sha1_block_data_order)
- stmdb sp!,{r4-r12,lr}
- add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
- ldmia r0,{r3,r4,r5,r6,r7}
-.Lloop:
- ldr r8,.LK_00_19
- mov r14,sp
- sub sp,sp,#15*4
- mov r5,r5,ror#30
- mov r6,r6,ror#30
- mov r7,r7,ror#30 @ [6]
-.L_00_15:
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r7,r8,r7,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r5,r6 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r7,r8,r7,ror#2 @ E+=K_00_19
- eor r10,r5,r6 @ F_xx_xx
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r4,r10,ror#2
- add r7,r7,r9 @ E+=X[i]
- eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r7,r7,r10 @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r6,r8,r6,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r4,r5 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r6,r8,r6,ror#2 @ E+=K_00_19
- eor r10,r4,r5 @ F_xx_xx
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r3,r10,ror#2
- add r6,r6,r9 @ E+=X[i]
- eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r6,r6,r10 @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r5,r8,r5,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r3,r4 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r5,r8,r5,ror#2 @ E+=K_00_19
- eor r10,r3,r4 @ F_xx_xx
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r7,r10,ror#2
- add r5,r5,r9 @ E+=X[i]
- eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r5,r5,r10 @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r4,r8,r4,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r7,r3 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r4,r8,r4,ror#2 @ E+=K_00_19
- eor r10,r7,r3 @ F_xx_xx
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r6,r10,ror#2
- add r4,r4,r9 @ E+=X[i]
- eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r4,r4,r10 @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r3,r8,r3,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r6,r7 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r3,r8,r3,ror#2 @ E+=K_00_19
- eor r10,r6,r7 @ F_xx_xx
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r5,r10,ror#2
- add r3,r3,r9 @ E+=X[i]
- eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r3,r3,r10 @ E+=F_00_19(B,C,D)
- cmp r14,sp
- bne .L_00_15 @ [((11+4)*5+2)*3]
- sub sp,sp,#25*4
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r7,r8,r7,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r5,r6 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r7,r8,r7,ror#2 @ E+=K_00_19
- eor r10,r5,r6 @ F_xx_xx
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r4,r10,ror#2
- add r7,r7,r9 @ E+=X[i]
- eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r7,r7,r10 @ E+=F_00_19(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r6,r8,r6,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r4,r5 @ F_xx_xx
- mov r9,r9,ror#31
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r3,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r6,r6,r9 @ E+=X[i]
- eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
- add r6,r6,r10 @ E+=F_00_19(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r5,r8,r5,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r3,r4 @ F_xx_xx
- mov r9,r9,ror#31
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r7,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r5,r5,r9 @ E+=X[i]
- eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
- add r5,r5,r10 @ E+=F_00_19(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r4,r8,r4,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r7,r3 @ F_xx_xx
- mov r9,r9,ror#31
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r6,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r4,r4,r9 @ E+=X[i]
- eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
- add r4,r4,r10 @ E+=F_00_19(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r3,r8,r3,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r6,r7 @ F_xx_xx
- mov r9,r9,ror#31
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r5,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r3,r3,r9 @ E+=X[i]
- eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
- add r3,r3,r10 @ E+=F_00_19(B,C,D)
-
- ldr r8,.LK_20_39 @ [+15+16*4]
- cmn sp,#0 @ [+3], clear carry to denote 20_39
-.L_20_39_or_60_79:
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r7,r8,r7,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r5,r6 @ F_xx_xx
- mov r9,r9,ror#31
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r4,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r7,r7,r9 @ E+=X[i]
- add r7,r7,r10 @ E+=F_20_39(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r6,r8,r6,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r4,r5 @ F_xx_xx
- mov r9,r9,ror#31
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r3,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r6,r6,r9 @ E+=X[i]
- add r6,r6,r10 @ E+=F_20_39(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r5,r8,r5,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r3,r4 @ F_xx_xx
- mov r9,r9,ror#31
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r7,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r5,r5,r9 @ E+=X[i]
- add r5,r5,r10 @ E+=F_20_39(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r4,r8,r4,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r7,r3 @ F_xx_xx
- mov r9,r9,ror#31
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r6,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r4,r4,r9 @ E+=X[i]
- add r4,r4,r10 @ E+=F_20_39(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r3,r8,r3,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r6,r7 @ F_xx_xx
- mov r9,r9,ror#31
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r5,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r3,r3,r9 @ E+=X[i]
- add r3,r3,r10 @ E+=F_20_39(B,C,D)
- ARM( teq r14,sp ) @ preserve carry
- THUMB( mov r11,sp )
- THUMB( teq r14,r11 ) @ preserve carry
- bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
- bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
-
- ldr r8,.LK_40_59
- sub sp,sp,#20*4 @ [+2]
-.L_40_59:
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r7,r8,r7,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r5,r6 @ F_xx_xx
- mov r9,r9,ror#31
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r4,r10,ror#2 @ F_xx_xx
- and r11,r5,r6 @ F_xx_xx
- add r7,r7,r9 @ E+=X[i]
- add r7,r7,r10 @ E+=F_40_59(B,C,D)
- add r7,r7,r11,ror#2
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r6,r8,r6,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r4,r5 @ F_xx_xx
- mov r9,r9,ror#31
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r3,r10,ror#2 @ F_xx_xx
- and r11,r4,r5 @ F_xx_xx
- add r6,r6,r9 @ E+=X[i]
- add r6,r6,r10 @ E+=F_40_59(B,C,D)
- add r6,r6,r11,ror#2
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r5,r8,r5,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r3,r4 @ F_xx_xx
- mov r9,r9,ror#31
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r7,r10,ror#2 @ F_xx_xx
- and r11,r3,r4 @ F_xx_xx
- add r5,r5,r9 @ E+=X[i]
- add r5,r5,r10 @ E+=F_40_59(B,C,D)
- add r5,r5,r11,ror#2
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r4,r8,r4,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r7,r3 @ F_xx_xx
- mov r9,r9,ror#31
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r6,r10,ror#2 @ F_xx_xx
- and r11,r7,r3 @ F_xx_xx
- add r4,r4,r9 @ E+=X[i]
- add r4,r4,r10 @ E+=F_40_59(B,C,D)
- add r4,r4,r11,ror#2
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r3,r8,r3,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r6,r7 @ F_xx_xx
- mov r9,r9,ror#31
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r5,r10,ror#2 @ F_xx_xx
- and r11,r6,r7 @ F_xx_xx
- add r3,r3,r9 @ E+=X[i]
- add r3,r3,r10 @ E+=F_40_59(B,C,D)
- add r3,r3,r11,ror#2
- cmp r14,sp
- bne .L_40_59 @ [+((12+5)*5+2)*4]
-
- ldr r8,.LK_60_79
- sub sp,sp,#20*4
- cmp sp,#0 @ set carry to denote 60_79
- b .L_20_39_or_60_79 @ [+4], spare 300 bytes
-.L_done:
- add sp,sp,#80*4 @ "deallocate" stack frame
- ldmia r0,{r8,r9,r10,r11,r12}
- add r3,r8,r3
- add r4,r9,r4
- add r5,r10,r5,ror#2
- add r6,r11,r6,ror#2
- add r7,r12,r7,ror#2
- stmia r0,{r3,r4,r5,r6,r7}
- teq r1,r2
- bne .Lloop @ [+18], total 1307
-
- ldmia sp!,{r4-r12,pc}
-.align 2
-.LK_00_19: .word 0x5a827999
-.LK_20_39: .word 0x6ed9eba1
-.LK_40_59: .word 0x8f1bbcdc
-.LK_60_79: .word 0xca62c1d6
-ENDPROC(sha1_block_data_order)
-.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
-.align 2
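
The file removed above is CRYPTOGAMS' unrolled implementation of the FIPS 180 SHA-1 block function: the .LK_00_19 through .LK_60_79 words are the four stage constants, and the F_00_19/F_20_39/F_40_59 annotations mark the three boolean round functions. As a reference for what the unrolled rounds compute, a minimal portable C sketch of that structure follows; it is illustrative only, and names such as sha1_block and rol32 are invented for the sketch rather than taken from the kernel sources.

#include <stdint.h>

static uint32_t rol32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

/* One 64-byte block; state[] holds the five 32-bit chaining words. */
static void sha1_block(uint32_t state[5], const uint8_t in[64])
{
    static const uint32_t K[4] = {
        0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6  /* .LK_00_19 .. .LK_60_79 */
    };
    uint32_t w[80], a, b, c, d, e, f, t;
    int i;

    for (i = 0; i < 16; i++)        /* big-endian message load */
        w[i] = (uint32_t)in[4 * i] << 24 | (uint32_t)in[4 * i + 1] << 16 |
               (uint32_t)in[4 * i + 2] << 8 | in[4 * i + 3];
    for (; i < 80; i++)             /* message schedule */
        w[i] = rol32(w[i - 3] ^ w[i - 8] ^ w[i - 14] ^ w[i - 16], 1);

    a = state[0]; b = state[1]; c = state[2]; d = state[3]; e = state[4];

    for (i = 0; i < 80; i++) {
        if (i < 20)
            f = (b & c) | (~b & d);            /* F_00_19: choose  */
        else if (i < 40)
            f = b ^ c ^ d;                     /* F_20_39: parity  */
        else if (i < 60)
            f = (b & c) | (b & d) | (c & d);   /* F_40_59: majority */
        else
            f = b ^ c ^ d;                     /* F_60_79: parity  */
        t = rol32(a, 5) + f + e + K[i / 20] + w[i];
        e = d; d = c; c = rol32(b, 30); b = a; a = t;
    }

    state[0] += a; state[1] += b; state[2] += c; state[3] += d; state[4] += e;
}

The assembly above interleaves the message load and schedule (the ldrb/orr or rev sequences feeding r9) with the round updates rather than precomputing w[] up front, which is the "interleaved Xload/update" trade-off described in its header comments.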
diff --git a/arch/arm/crypto/sha1-armv7-neon.S b/arch/arm/crypto/sha1-armv7-neon.S
deleted file mode 100644
index 28d816a6a530..000000000000
--- a/arch/arm/crypto/sha1-armv7-neon.S
+++ /dev/null
@@ -1,634 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
- *
- * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-.syntax unified
-.fpu neon
-
-.text
-
-
-/* Context structure */
-
-#define state_h0 0
-#define state_h1 4
-#define state_h2 8
-#define state_h3 12
-#define state_h4 16
-
-
-/* Constants */
-
-#define K1 0x5A827999
-#define K2 0x6ED9EBA1
-#define K3 0x8F1BBCDC
-#define K4 0xCA62C1D6
-.align 4
-.LK_VEC:
-.LK1: .long K1, K1, K1, K1
-.LK2: .long K2, K2, K2, K2
-.LK3: .long K3, K3, K3, K3
-.LK4: .long K4, K4, K4, K4
-
-
-/* Register macros */
-
-#define RSTATE r0
-#define RDATA r1
-#define RNBLKS r2
-#define ROLDSTACK r3
-#define RWK lr
-
-#define _a r4
-#define _b r5
-#define _c r6
-#define _d r7
-#define _e r8
-
-#define RT0 r9
-#define RT1 r10
-#define RT2 r11
-#define RT3 r12
-
-#define W0 q0
-#define W1 q7
-#define W2 q2
-#define W3 q3
-#define W4 q4
-#define W5 q6
-#define W6 q5
-#define W7 q1
-
-#define tmp0 q8
-#define tmp1 q9
-#define tmp2 q10
-#define tmp3 q11
-
-#define qK1 q12
-#define qK2 q13
-#define qK3 q14
-#define qK4 q15
-
-#ifdef CONFIG_CPU_BIG_ENDIAN
-#define ARM_LE(code...)
-#else
-#define ARM_LE(code...) code
-#endif
-
-/* Round function macros. */
-
-#define WK_offs(i) (((i) & 15) * 4)
-
-#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ldr RT3, [sp, WK_offs(i)]; \
- pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- bic RT0, d, b; \
- add e, e, a, ror #(32 - 5); \
- and RT1, c, b; \
- pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- add RT0, RT0, RT3; \
- add e, e, RT1; \
- ror b, #(32 - 30); \
- pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- add e, e, RT0;
-
-#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ldr RT3, [sp, WK_offs(i)]; \
- pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- eor RT0, d, b; \
- add e, e, a, ror #(32 - 5); \
- eor RT0, RT0, c; \
- pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- add e, e, RT3; \
- ror b, #(32 - 30); \
- pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- add e, e, RT0; \
-
-#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ldr RT3, [sp, WK_offs(i)]; \
- pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- eor RT0, b, c; \
- and RT1, b, c; \
- add e, e, a, ror #(32 - 5); \
- pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- and RT0, RT0, d; \
- add RT1, RT1, RT3; \
- add e, e, RT0; \
- ror b, #(32 - 30); \
- pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- add e, e, RT1;
-
-#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
-
-#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
-
-#define R(a,b,c,d,e,f,i) \
- _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
-
-#define dummy(...)
-
-
-/* Input expansion macros. */
-
-/********* Precalc macros for rounds 0-15 *************************************/
-
-#define W_PRECALC_00_15() \
- add RWK, sp, #(WK_offs(0)); \
- \
- vld1.32 {W0, W7}, [RDATA]!; \
- ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \
- vld1.32 {W6, W5}, [RDATA]!; \
- vadd.u32 tmp0, W0, curK; \
- ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \
- ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \
- vadd.u32 tmp1, W7, curK; \
- ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \
- vadd.u32 tmp2, W6, curK; \
- vst1.32 {tmp0, tmp1}, [RWK]!; \
- vadd.u32 tmp3, W5, curK; \
- vst1.32 {tmp2, tmp3}, [RWK]; \
-
-#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vld1.32 {W0, W7}, [RDATA]!; \
-
-#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- add RWK, sp, #(WK_offs(0)); \
-
-#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \
-
-#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vld1.32 {W6, W5}, [RDATA]!; \
-
-#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vadd.u32 tmp0, W0, curK; \
-
-#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \
-
-#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \
-
-#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vadd.u32 tmp1, W7, curK; \
-
-#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \
-
-#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vadd.u32 tmp2, W6, curK; \
-
-#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vst1.32 {tmp0, tmp1}, [RWK]!; \
-
-#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vadd.u32 tmp3, W5, curK; \
-
-#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vst1.32 {tmp2, tmp3}, [RWK]; \
-
-
-/********* Precalc macros for rounds 16-31 ************************************/
-
-#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor tmp0, tmp0; \
- vext.8 W, W_m16, W_m12, #8; \
-
-#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- add RWK, sp, #(WK_offs(i)); \
- vext.8 tmp0, W_m04, tmp0, #4; \
-
-#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor tmp0, tmp0, W_m16; \
- veor.32 W, W, W_m08; \
-
-#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor tmp1, tmp1; \
- veor W, W, tmp0; \
-
-#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vshl.u32 tmp0, W, #1; \
-
-#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vext.8 tmp1, tmp1, W, #(16-12); \
- vshr.u32 W, W, #31; \
-
-#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vorr tmp0, tmp0, W; \
- vshr.u32 W, tmp1, #30; \
-
-#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vshl.u32 tmp1, tmp1, #2; \
-
-#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor tmp0, tmp0, W; \
-
-#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor W, tmp0, tmp1; \
-
-#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vadd.u32 tmp0, W, curK; \
-
-#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vst1.32 {tmp0}, [RWK];
-
-
-/********* Precalc macros for rounds 32-79 ************************************/
-
-#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor W, W_m28; \
-
-#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vext.8 tmp0, W_m08, W_m04, #8; \
-
-#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor W, W_m16; \
-
-#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor W, tmp0; \
-
-#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- add RWK, sp, #(WK_offs(i&~3)); \
-
-#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vshl.u32 tmp1, W, #2; \
-
-#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vshr.u32 tmp0, W, #30; \
-
-#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vorr W, tmp0, tmp1; \
-
-#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vadd.u32 tmp0, W, curK; \
-
-#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vst1.32 {tmp0}, [RWK];
-
-
-/*
- * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
- *
- * unsigned int
- * sha1_transform_neon (void *ctx, const unsigned char *data,
- * unsigned int nblks)
- */
-.align 3
-ENTRY(sha1_transform_neon)
- /* input:
- * r0: ctx, CTX
- * r1: data (64*nblks bytes)
- * r2: nblks
- */
-
- cmp RNBLKS, #0;
- beq .Ldo_nothing;
-
- push {r4-r12, lr};
- /*vpush {q4-q7};*/
-
- adr RT3, .LK_VEC;
-
- mov ROLDSTACK, sp;
-
- /* Align stack. */
- sub RT0, sp, #(16*4);
- and RT0, #(~(16-1));
- mov sp, RT0;
-
- vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */
-
- /* Get the values of the chaining variables. */
- ldm RSTATE, {_a-_e};
-
- vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */
-
-#undef curK
-#define curK qK1
- /* Precalc 0-15. */
- W_PRECALC_00_15();
-
-.Loop:
- /* Transform 0-15 + Precalc 16-31. */
- _R( _a, _b, _c, _d, _e, F1, 0,
- WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
- W4, W5, W6, W7, W0, _, _, _ );
- _R( _e, _a, _b, _c, _d, F1, 1,
- WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
- W4, W5, W6, W7, W0, _, _, _ );
- _R( _d, _e, _a, _b, _c, F1, 2,
- WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
- W4, W5, W6, W7, W0, _, _, _ );
- _R( _c, _d, _e, _a, _b, F1, 3,
- WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
- W4, W5, W6, W7, W0, _, _, _ );
-
-#undef curK
-#define curK qK2
- _R( _b, _c, _d, _e, _a, F1, 4,
- WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
- W3, W4, W5, W6, W7, _, _, _ );
- _R( _a, _b, _c, _d, _e, F1, 5,
- WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
- W3, W4, W5, W6, W7, _, _, _ );
- _R( _e, _a, _b, _c, _d, F1, 6,
- WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
- W3, W4, W5, W6, W7, _, _, _ );
- _R( _d, _e, _a, _b, _c, F1, 7,
- WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
- W3, W4, W5, W6, W7, _, _, _ );
-
- _R( _c, _d, _e, _a, _b, F1, 8,
- WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
- W2, W3, W4, W5, W6, _, _, _ );
- _R( _b, _c, _d, _e, _a, F1, 9,
- WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
- W2, W3, W4, W5, W6, _, _, _ );
- _R( _a, _b, _c, _d, _e, F1, 10,
- WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
- W2, W3, W4, W5, W6, _, _, _ );
- _R( _e, _a, _b, _c, _d, F1, 11,
- WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
- W2, W3, W4, W5, W6, _, _, _ );
-
- _R( _d, _e, _a, _b, _c, F1, 12,
- WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
- W1, W2, W3, W4, W5, _, _, _ );
- _R( _c, _d, _e, _a, _b, F1, 13,
- WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
- W1, W2, W3, W4, W5, _, _, _ );
- _R( _b, _c, _d, _e, _a, F1, 14,
- WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
- W1, W2, W3, W4, W5, _, _, _ );
- _R( _a, _b, _c, _d, _e, F1, 15,
- WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
- W1, W2, W3, W4, W5, _, _, _ );
-
- /* Transform 16-63 + Precalc 32-79. */
- _R( _e, _a, _b, _c, _d, F1, 16,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
- W0, W1, W2, W3, W4, W5, W6, W7);
- _R( _d, _e, _a, _b, _c, F1, 17,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
- W0, W1, W2, W3, W4, W5, W6, W7);
- _R( _c, _d, _e, _a, _b, F1, 18,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 32,
- W0, W1, W2, W3, W4, W5, W6, W7);
- _R( _b, _c, _d, _e, _a, F1, 19,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 32,
- W0, W1, W2, W3, W4, W5, W6, W7);
-
- _R( _a, _b, _c, _d, _e, F2, 20,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
- W7, W0, W1, W2, W3, W4, W5, W6);
- _R( _e, _a, _b, _c, _d, F2, 21,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
- W7, W0, W1, W2, W3, W4, W5, W6);
- _R( _d, _e, _a, _b, _c, F2, 22,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 36,
- W7, W0, W1, W2, W3, W4, W5, W6);
- _R( _c, _d, _e, _a, _b, F2, 23,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 36,
- W7, W0, W1, W2, W3, W4, W5, W6);
-
-#undef curK
-#define curK qK3
- _R( _b, _c, _d, _e, _a, F2, 24,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
- W6, W7, W0, W1, W2, W3, W4, W5);
- _R( _a, _b, _c, _d, _e, F2, 25,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
- W6, W7, W0, W1, W2, W3, W4, W5);
- _R( _e, _a, _b, _c, _d, F2, 26,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 40,
- W6, W7, W0, W1, W2, W3, W4, W5);
- _R( _d, _e, _a, _b, _c, F2, 27,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 40,
- W6, W7, W0, W1, W2, W3, W4, W5);
-
- _R( _c, _d, _e, _a, _b, F2, 28,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
- W5, W6, W7, W0, W1, W2, W3, W4);
- _R( _b, _c, _d, _e, _a, F2, 29,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
- W5, W6, W7, W0, W1, W2, W3, W4);
- _R( _a, _b, _c, _d, _e, F2, 30,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 44,
- W5, W6, W7, W0, W1, W2, W3, W4);
- _R( _e, _a, _b, _c, _d, F2, 31,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 44,
- W5, W6, W7, W0, W1, W2, W3, W4);
-
- _R( _d, _e, _a, _b, _c, F2, 32,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
- W4, W5, W6, W7, W0, W1, W2, W3);
- _R( _c, _d, _e, _a, _b, F2, 33,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
- W4, W5, W6, W7, W0, W1, W2, W3);
- _R( _b, _c, _d, _e, _a, F2, 34,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 48,
- W4, W5, W6, W7, W0, W1, W2, W3);
- _R( _a, _b, _c, _d, _e, F2, 35,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 48,
- W4, W5, W6, W7, W0, W1, W2, W3);
-
- _R( _e, _a, _b, _c, _d, F2, 36,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
- W3, W4, W5, W6, W7, W0, W1, W2);
- _R( _d, _e, _a, _b, _c, F2, 37,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
- W3, W4, W5, W6, W7, W0, W1, W2);
- _R( _c, _d, _e, _a, _b, F2, 38,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 52,
- W3, W4, W5, W6, W7, W0, W1, W2);
- _R( _b, _c, _d, _e, _a, F2, 39,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 52,
- W3, W4, W5, W6, W7, W0, W1, W2);
-
- _R( _a, _b, _c, _d, _e, F3, 40,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
- W2, W3, W4, W5, W6, W7, W0, W1);
- _R( _e, _a, _b, _c, _d, F3, 41,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
- W2, W3, W4, W5, W6, W7, W0, W1);
- _R( _d, _e, _a, _b, _c, F3, 42,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 56,
- W2, W3, W4, W5, W6, W7, W0, W1);
- _R( _c, _d, _e, _a, _b, F3, 43,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 56,
- W2, W3, W4, W5, W6, W7, W0, W1);
-
-#undef curK
-#define curK qK4
- _R( _b, _c, _d, _e, _a, F3, 44,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
- W1, W2, W3, W4, W5, W6, W7, W0);
- _R( _a, _b, _c, _d, _e, F3, 45,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
- W1, W2, W3, W4, W5, W6, W7, W0);
- _R( _e, _a, _b, _c, _d, F3, 46,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 60,
- W1, W2, W3, W4, W5, W6, W7, W0);
- _R( _d, _e, _a, _b, _c, F3, 47,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 60,
- W1, W2, W3, W4, W5, W6, W7, W0);
-
- _R( _c, _d, _e, _a, _b, F3, 48,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
- W0, W1, W2, W3, W4, W5, W6, W7);
- _R( _b, _c, _d, _e, _a, F3, 49,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
- W0, W1, W2, W3, W4, W5, W6, W7);
- _R( _a, _b, _c, _d, _e, F3, 50,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 64,
- W0, W1, W2, W3, W4, W5, W6, W7);
- _R( _e, _a, _b, _c, _d, F3, 51,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 64,
- W0, W1, W2, W3, W4, W5, W6, W7);
-
- _R( _d, _e, _a, _b, _c, F3, 52,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
- W7, W0, W1, W2, W3, W4, W5, W6);
- _R( _c, _d, _e, _a, _b, F3, 53,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
- W7, W0, W1, W2, W3, W4, W5, W6);
- _R( _b, _c, _d, _e, _a, F3, 54,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 68,
- W7, W0, W1, W2, W3, W4, W5, W6);
- _R( _a, _b, _c, _d, _e, F3, 55,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 68,
- W7, W0, W1, W2, W3, W4, W5, W6);
-
- _R( _e, _a, _b, _c, _d, F3, 56,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
- W6, W7, W0, W1, W2, W3, W4, W5);
- _R( _d, _e, _a, _b, _c, F3, 57,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
- W6, W7, W0, W1, W2, W3, W4, W5);
- _R( _c, _d, _e, _a, _b, F3, 58,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 72,
- W6, W7, W0, W1, W2, W3, W4, W5);
- _R( _b, _c, _d, _e, _a, F3, 59,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 72,
- W6, W7, W0, W1, W2, W3, W4, W5);
-
- subs RNBLKS, #1;
-
- _R( _a, _b, _c, _d, _e, F4, 60,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
- W5, W6, W7, W0, W1, W2, W3, W4);
- _R( _e, _a, _b, _c, _d, F4, 61,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
- W5, W6, W7, W0, W1, W2, W3, W4);
- _R( _d, _e, _a, _b, _c, F4, 62,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 76,
- W5, W6, W7, W0, W1, W2, W3, W4);
- _R( _c, _d, _e, _a, _b, F4, 63,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 76,
- W5, W6, W7, W0, W1, W2, W3, W4);
-
- beq .Lend;
-
- /* Transform 64-79 + Precalc 0-15 of next block. */
-#undef curK
-#define curK qK1
- _R( _b, _c, _d, _e, _a, F4, 64,
- WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _a, _b, _c, _d, _e, F4, 65,
- WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _e, _a, _b, _c, _d, F4, 66,
- WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _d, _e, _a, _b, _c, F4, 67,
- WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );
-
- _R( _c, _d, _e, _a, _b, F4, 68,
- dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _b, _c, _d, _e, _a, F4, 69,
- dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _a, _b, _c, _d, _e, F4, 70,
- WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _e, _a, _b, _c, _d, F4, 71,
- WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );
-
- _R( _d, _e, _a, _b, _c, F4, 72,
- dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _c, _d, _e, _a, _b, F4, 73,
- dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _b, _c, _d, _e, _a, F4, 74,
- WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _a, _b, _c, _d, _e, F4, 75,
- WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );
-
- _R( _e, _a, _b, _c, _d, F4, 76,
- WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _d, _e, _a, _b, _c, F4, 77,
- WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _c, _d, _e, _a, _b, F4, 78,
- WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _b, _c, _d, _e, _a, F4, 79,
- WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );
-
- /* Update the chaining variables. */
- ldm RSTATE, {RT0-RT3};
- add _a, RT0;
- ldr RT0, [RSTATE, #state_h4];
- add _b, RT1;
- add _c, RT2;
- add _d, RT3;
- add _e, RT0;
- stm RSTATE, {_a-_e};
-
- b .Loop;
-
-.Lend:
- /* Transform 64-79 */
- R( _b, _c, _d, _e, _a, F4, 64 );
- R( _a, _b, _c, _d, _e, F4, 65 );
- R( _e, _a, _b, _c, _d, F4, 66 );
- R( _d, _e, _a, _b, _c, F4, 67 );
- R( _c, _d, _e, _a, _b, F4, 68 );
- R( _b, _c, _d, _e, _a, F4, 69 );
- R( _a, _b, _c, _d, _e, F4, 70 );
- R( _e, _a, _b, _c, _d, F4, 71 );
- R( _d, _e, _a, _b, _c, F4, 72 );
- R( _c, _d, _e, _a, _b, F4, 73 );
- R( _b, _c, _d, _e, _a, F4, 74 );
- R( _a, _b, _c, _d, _e, F4, 75 );
- R( _e, _a, _b, _c, _d, F4, 76 );
- R( _d, _e, _a, _b, _c, F4, 77 );
- R( _c, _d, _e, _a, _b, F4, 78 );
- R( _b, _c, _d, _e, _a, F4, 79 );
-
- mov sp, ROLDSTACK;
-
- /* Update the chaining variables. */
- ldm RSTATE, {RT0-RT3};
- add _a, RT0;
- ldr RT0, [RSTATE, #state_h4];
- add _b, RT1;
- add _c, RT2;
- add _d, RT3;
- /*vpop {q4-q7};*/
- add _e, RT0;
- stm RSTATE, {_a-_e};
-
- pop {r4-r12, pc};
-
-.Ldo_nothing:
- bx lr
-ENDPROC(sha1_transform_neon)
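
The W_PRECALC_16_31 and W_PRECALC_32_79 macros in the file removed above vectorize the standard FIPS 180 SHA-1 message schedule four words at a time and pre-add the current round constant, so each scalar round macro only loads a finished W[i]+K value from the stack slot WK_offs(i). A scalar C sketch of what that precalculation produces is shown below; the function name sha1_schedule_with_k is made up for illustration and is not part of the kernel code.

#include <stdint.h>

static uint32_t rol32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

/*
 * Scalar form of the schedule the NEON precalc macros evaluate four words at
 * a time: each expanded word is a rotate-left-by-1 of the XOR of four earlier
 * words, and the per-round constant is folded in up front (the vadd.u32 with
 * curK) so the round code just adds the stored value.
 */
static void sha1_schedule_with_k(const uint32_t w0[16], const uint32_t K[4],
                                 uint32_t wk[80])
{
    uint32_t w[80];
    int t;

    for (t = 0; t < 16; t++)
        w[t] = w0[t];
    for (t = 16; t < 80; t++)
        w[t] = rol32(w[t - 3] ^ w[t - 8] ^ w[t - 14] ^ w[t - 16], 1);
    for (t = 0; t < 80; t++)
        wk[t] = w[t] + K[t / 20];   /* value the rounds fetch via WK_offs(t) */
}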
diff --git a/arch/arm/crypto/sha1-ce-core.S b/arch/arm/crypto/sha1-ce-core.S
deleted file mode 100644
index 8a702e051738..000000000000
--- a/arch/arm/crypto/sha1-ce-core.S
+++ /dev/null
@@ -1,123 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2015 Linaro Ltd.
- * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .text
- .arch armv8-a
- .fpu crypto-neon-fp-armv8
-
- k0 .req q0
- k1 .req q1
- k2 .req q2
- k3 .req q3
-
- ta0 .req q4
- ta1 .req q5
- tb0 .req q5
- tb1 .req q4
-
- dga .req q6
- dgb .req q7
- dgbs .req s28
-
- dg0 .req q12
- dg1a0 .req q13
- dg1a1 .req q14
- dg1b0 .req q14
- dg1b1 .req q13
-
- .macro add_only, op, ev, rc, s0, dg1
- .ifnb \s0
- vadd.u32 tb\ev, q\s0, \rc
- .endif
- sha1h.32 dg1b\ev, dg0
- .ifb \dg1
- sha1\op\().32 dg0, dg1a\ev, ta\ev
- .else
- sha1\op\().32 dg0, \dg1, ta\ev
- .endif
- .endm
-
- .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1
- sha1su0.32 q\s0, q\s1, q\s2
- add_only \op, \ev, \rc, \s1, \dg1
- sha1su1.32 q\s0, q\s3
- .endm
-
- .align 6
-.Lsha1_rcon:
- .word 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999
- .word 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1
- .word 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc
- .word 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6
-
- /*
- * void sha1_ce_transform(struct sha1_state *sst, u8 const *src,
- * int blocks);
- */
-ENTRY(sha1_ce_transform)
- /* load round constants */
- adr ip, .Lsha1_rcon
- vld1.32 {k0-k1}, [ip, :128]!
- vld1.32 {k2-k3}, [ip, :128]
-
- /* load state */
- vld1.32 {dga}, [r0]
- vldr dgbs, [r0, #16]
-
- /* load input */
-0: vld1.32 {q8-q9}, [r1]!
- vld1.32 {q10-q11}, [r1]!
- subs r2, r2, #1
-
-#ifndef CONFIG_CPU_BIG_ENDIAN
- vrev32.8 q8, q8
- vrev32.8 q9, q9
- vrev32.8 q10, q10
- vrev32.8 q11, q11
-#endif
-
- vadd.u32 ta0, q8, k0
- vmov dg0, dga
-
- add_update c, 0, k0, 8, 9, 10, 11, dgb
- add_update c, 1, k0, 9, 10, 11, 8
- add_update c, 0, k0, 10, 11, 8, 9
- add_update c, 1, k0, 11, 8, 9, 10
- add_update c, 0, k1, 8, 9, 10, 11
-
- add_update p, 1, k1, 9, 10, 11, 8
- add_update p, 0, k1, 10, 11, 8, 9
- add_update p, 1, k1, 11, 8, 9, 10
- add_update p, 0, k1, 8, 9, 10, 11
- add_update p, 1, k2, 9, 10, 11, 8
-
- add_update m, 0, k2, 10, 11, 8, 9
- add_update m, 1, k2, 11, 8, 9, 10
- add_update m, 0, k2, 8, 9, 10, 11
- add_update m, 1, k2, 9, 10, 11, 8
- add_update m, 0, k3, 10, 11, 8, 9
-
- add_update p, 1, k3, 11, 8, 9, 10
- add_only p, 0, k3, 9
- add_only p, 1, k3, 10
- add_only p, 0, k3, 11
- add_only p, 1
-
- /* update state */
- vadd.u32 dga, dga, dg0
- vadd.u32 dgb, dgb, dg1a0
- bne 0b
-
- /* store new state */
- vst1.32 {dga}, [r0]
- vstr dgbs, [r0, #16]
- bx lr
-ENDPROC(sha1_ce_transform)
diff --git a/arch/arm/crypto/sha1-ce-glue.c b/arch/arm/crypto/sha1-ce-glue.c
deleted file mode 100644
index fac07a4799de..000000000000
--- a/arch/arm/crypto/sha1-ce-glue.c
+++ /dev/null
@@ -1,72 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/cpufeature.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-
-asmlinkage void sha1_ce_transform(struct sha1_state *sst, u8 const *src,
- int blocks);
-
-static int sha1_ce_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- int remain;
-
- kernel_neon_begin();
- remain = sha1_base_do_update_blocks(desc, data, len, sha1_ce_transform);
- kernel_neon_end();
-
- return remain;
-}
-
-static int sha1_ce_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- kernel_neon_begin();
- sha1_base_do_finup(desc, data, len, sha1_ce_transform);
- kernel_neon_end();
-
- return sha1_base_finish(desc, out);
-}
-
-static struct shash_alg alg = {
- .init = sha1_base_init,
- .update = sha1_ce_update,
- .finup = sha1_ce_finup,
- .descsize = SHA1_STATE_SIZE,
- .digestsize = SHA1_DIGEST_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name = "sha1-ce",
- .cra_priority = 200,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int __init sha1_ce_mod_init(void)
-{
- return crypto_register_shash(&alg);
-}
-
-static void __exit sha1_ce_mod_fini(void)
-{
- crypto_unregister_shash(&alg);
-}
-
-module_cpu_feature_match(SHA1, sha1_ce_mod_init);
-module_exit(sha1_ce_mod_fini);
diff --git a/arch/arm/crypto/sha1_glue.c b/arch/arm/crypto/sha1_glue.c
deleted file mode 100644
index 255da00c7d98..000000000000
--- a/arch/arm/crypto/sha1_glue.c
+++ /dev/null
@@ -1,75 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Cryptographic API.
- * Glue code for the SHA1 Secure Hash Algorithm assembler implementation
- *
- * This file is based on sha1_generic.c and sha1_ssse3_glue.c
- *
- * Copyright (c) Alan Smithee.
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
- * Copyright (c) Mathias Krause <minipli@googlemail.com>
- */
-
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void sha1_block_data_order(struct sha1_state *digest,
- const u8 *data, int rounds);
-
-static int sha1_update_arm(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- /* make sure signature matches sha1_block_fn() */
- BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);
-
- return sha1_base_do_update_blocks(desc, data, len,
- sha1_block_data_order);
-}
-
-static int sha1_finup_arm(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- sha1_base_do_finup(desc, data, len, sha1_block_data_order);
- return sha1_base_finish(desc, out);
-}
-
-static struct shash_alg alg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_base_init,
- .update = sha1_update_arm,
- .finup = sha1_finup_arm,
- .descsize = SHA1_STATE_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name= "sha1-asm",
- .cra_priority = 150,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-
-static int __init sha1_mod_init(void)
-{
- return crypto_register_shash(&alg);
-}
-
-
-static void __exit sha1_mod_fini(void)
-{
- crypto_unregister_shash(&alg);
-}
-
-
-module_init(sha1_mod_init);
-module_exit(sha1_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm (ARM)");
-MODULE_ALIAS_CRYPTO("sha1");
-MODULE_AUTHOR("David McCullough <ucdevel@gmail.com>");
diff --git a/arch/arm/crypto/sha1_neon_glue.c b/arch/arm/crypto/sha1_neon_glue.c
deleted file mode 100644
index d321850f22a6..000000000000
--- a/arch/arm/crypto/sha1_neon_glue.c
+++ /dev/null
@@ -1,83 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Glue code for the SHA1 Secure Hash Algorithm assembler implementation using
- * ARM NEON instructions.
- *
- * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This file is based on sha1_generic.c and sha1_ssse3_glue.c:
- * Copyright (c) Alan Smithee.
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
- * Copyright (c) Mathias Krause <minipli@googlemail.com>
- * Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com>
- */
-
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void sha1_transform_neon(struct sha1_state *state_h,
- const u8 *data, int rounds);
-
-static int sha1_neon_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- int remain;
-
- kernel_neon_begin();
- remain = sha1_base_do_update_blocks(desc, data, len,
- sha1_transform_neon);
- kernel_neon_end();
-
- return remain;
-}
-
-static int sha1_neon_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- kernel_neon_begin();
- sha1_base_do_finup(desc, data, len, sha1_transform_neon);
- kernel_neon_end();
-
- return sha1_base_finish(desc, out);
-}
-
-static struct shash_alg alg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_base_init,
- .update = sha1_neon_update,
- .finup = sha1_neon_finup,
- .descsize = SHA1_STATE_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name = "sha1-neon",
- .cra_priority = 250,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int __init sha1_neon_mod_init(void)
-{
- if (!cpu_has_neon())
- return -ENODEV;
-
- return crypto_register_shash(&alg);
-}
-
-static void __exit sha1_neon_mod_fini(void)
-{
- crypto_unregister_shash(&alg);
-}
-
-module_init(sha1_neon_mod_init);
-module_exit(sha1_neon_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, NEON accelerated");
-MODULE_ALIAS_CRYPTO("sha1");
diff --git a/arch/arm/crypto/sha512-armv4.pl b/arch/arm/crypto/sha512-armv4.pl
deleted file mode 100644
index 2fc3516912fa..000000000000
--- a/arch/arm/crypto/sha512-armv4.pl
+++ /dev/null
@@ -1,657 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0
-
-# This code is taken from the OpenSSL project but the author (Andy Polyakov)
-# has relicensed it under the GPLv2. Therefore this program is free software;
-# you can redistribute it and/or modify it under the terms of the GNU General
-# Public License version 2 as published by the Free Software Foundation.
-#
-# The original headers, including the original license headers, are
-# included below for completeness.
-
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see https://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# SHA512 block procedure for ARMv4. September 2007.
-
-# This code is ~4.5 (four and a half) times faster than code generated
-# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
-# Xscale PXA250 core].
-#
-# July 2010.
-#
-# Rescheduling for dual-issue pipeline resulted in 6% improvement on
-# Cortex A8 core and ~40 cycles per processed byte.
-
-# February 2011.
-#
-# Profiler-assisted and platform-specific optimization resulted in 7%
-# improvement on Coxtex A8 core and ~38 cycles per byte.
-
-# March 2011.
-#
-# Add NEON implementation. On Cortex A8 it was measured to process
-# one byte in 23.3 cycles or ~60% faster than integer-only code.
-
-# August 2012.
-#
-# Improve NEON performance by 12% on Snapdragon S4. In absolute
-# terms it's 22.6 cycles per byte, which is a disappointing result.
-# Technical writers asserted that the 3-way S4 pipeline can sustain
-# multiple NEON instructions per cycle, but dual NEON issue could
-# not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html
-# for further details. On a side note, Cortex-A15 processes one byte in
-# 16 cycles.
-
-# Byte order [in]dependence. =========================================
-#
-# Originally the caller was expected to maintain a specific *dword* order in
-# h[0-7], namely with the most significant dword at the *lower* address, which
-# was reflected in the two parameters below as 0 and 4. Now the caller is
-# expected to maintain native byte order for whole 64-bit values.
-$hi="HI";
-$lo="LO";
-# ====================================================================
-
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-
-$ctx="r0"; # parameter block
-$inp="r1";
-$len="r2";
-
-$Tlo="r3";
-$Thi="r4";
-$Alo="r5";
-$Ahi="r6";
-$Elo="r7";
-$Ehi="r8";
-$t0="r9";
-$t1="r10";
-$t2="r11";
-$t3="r12";
-############ r13 is stack pointer
-$Ktbl="r14";
-############ r15 is program counter
-
-$Aoff=8*0;
-$Boff=8*1;
-$Coff=8*2;
-$Doff=8*3;
-$Eoff=8*4;
-$Foff=8*5;
-$Goff=8*6;
-$Hoff=8*7;
-$Xoff=8*8;
-
-sub BODY_00_15() {
-my $magic = shift;
-$code.=<<___;
- @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
- @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
- @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
- mov $t0,$Elo,lsr#14
- str $Tlo,[sp,#$Xoff+0]
- mov $t1,$Ehi,lsr#14
- str $Thi,[sp,#$Xoff+4]
- eor $t0,$t0,$Ehi,lsl#18
- ldr $t2,[sp,#$Hoff+0] @ h.lo
- eor $t1,$t1,$Elo,lsl#18
- ldr $t3,[sp,#$Hoff+4] @ h.hi
- eor $t0,$t0,$Elo,lsr#18
- eor $t1,$t1,$Ehi,lsr#18
- eor $t0,$t0,$Ehi,lsl#14
- eor $t1,$t1,$Elo,lsl#14
- eor $t0,$t0,$Ehi,lsr#9
- eor $t1,$t1,$Elo,lsr#9
- eor $t0,$t0,$Elo,lsl#23
- eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
- adds $Tlo,$Tlo,$t0
- ldr $t0,[sp,#$Foff+0] @ f.lo
- adc $Thi,$Thi,$t1 @ T += Sigma1(e)
- ldr $t1,[sp,#$Foff+4] @ f.hi
- adds $Tlo,$Tlo,$t2
- ldr $t2,[sp,#$Goff+0] @ g.lo
- adc $Thi,$Thi,$t3 @ T += h
- ldr $t3,[sp,#$Goff+4] @ g.hi
-
- eor $t0,$t0,$t2
- str $Elo,[sp,#$Eoff+0]
- eor $t1,$t1,$t3
- str $Ehi,[sp,#$Eoff+4]
- and $t0,$t0,$Elo
- str $Alo,[sp,#$Aoff+0]
- and $t1,$t1,$Ehi
- str $Ahi,[sp,#$Aoff+4]
- eor $t0,$t0,$t2
- ldr $t2,[$Ktbl,#$lo] @ K[i].lo
- eor $t1,$t1,$t3 @ Ch(e,f,g)
- ldr $t3,[$Ktbl,#$hi] @ K[i].hi
-
- adds $Tlo,$Tlo,$t0
- ldr $Elo,[sp,#$Doff+0] @ d.lo
- adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
- ldr $Ehi,[sp,#$Doff+4] @ d.hi
- adds $Tlo,$Tlo,$t2
- and $t0,$t2,#0xff
- adc $Thi,$Thi,$t3 @ T += K[i]
- adds $Elo,$Elo,$Tlo
- ldr $t2,[sp,#$Boff+0] @ b.lo
- adc $Ehi,$Ehi,$Thi @ d += T
- teq $t0,#$magic
-
- ldr $t3,[sp,#$Coff+0] @ c.lo
-#if __ARM_ARCH__>=7
- it eq @ Thumb2 thing, sanity check in ARM
-#endif
- orreq $Ktbl,$Ktbl,#1
- @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
- @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
- @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
- mov $t0,$Alo,lsr#28
- mov $t1,$Ahi,lsr#28
- eor $t0,$t0,$Ahi,lsl#4
- eor $t1,$t1,$Alo,lsl#4
- eor $t0,$t0,$Ahi,lsr#2
- eor $t1,$t1,$Alo,lsr#2
- eor $t0,$t0,$Alo,lsl#30
- eor $t1,$t1,$Ahi,lsl#30
- eor $t0,$t0,$Ahi,lsr#7
- eor $t1,$t1,$Alo,lsr#7
- eor $t0,$t0,$Alo,lsl#25
- eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
- adds $Tlo,$Tlo,$t0
- and $t0,$Alo,$t2
- adc $Thi,$Thi,$t1 @ T += Sigma0(a)
-
- ldr $t1,[sp,#$Boff+4] @ b.hi
- orr $Alo,$Alo,$t2
- ldr $t2,[sp,#$Coff+4] @ c.hi
- and $Alo,$Alo,$t3
- and $t3,$Ahi,$t1
- orr $Ahi,$Ahi,$t1
- orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
- and $Ahi,$Ahi,$t2
- adds $Alo,$Alo,$Tlo
- orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
- sub sp,sp,#8
- adc $Ahi,$Ahi,$Thi @ h += T
- tst $Ktbl,#1
- add $Ktbl,$Ktbl,#8
-___
-}
-$code=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
-# define VFP_ABI_POP vldmia sp!,{d8-d15}
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-# define VFP_ABI_PUSH
-# define VFP_ABI_POP
-#endif
-
-#ifdef __ARMEL__
-# define LO 0
-# define HI 4
-# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
-#else
-# define HI 0
-# define LO 4
-# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
-#endif
-
-.text
-#if __ARM_ARCH__<7
-.code 32
-#else
-.syntax unified
-# ifdef __thumb2__
-.thumb
-# else
-.code 32
-# endif
-#endif
-
-.type K512,%object
-.align 5
-K512:
-WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
-WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
-WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
-WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
-WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
-WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
-WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
-WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
-WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
-WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
-WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
-WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
-WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
-WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
-WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
-WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
-WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
-WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
-WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
-WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
-WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
-WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
-WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
-WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
-WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
-WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
-WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
-WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
-WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
-WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
-WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
-WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
-WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
-WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
-WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
-WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
-WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
-WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
-WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
-WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
-.size K512,.-K512
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-sha512_block_data_order
-.skip 32-4
-#else
-.skip 32
-#endif
-
-.global sha512_block_data_order
-.type sha512_block_data_order,%function
-sha512_block_data_order:
-.Lsha512_block_data_order:
-#if __ARM_ARCH__<7
- sub r3,pc,#8 @ sha512_block_data_order
-#else
- adr r3,.Lsha512_block_data_order
-#endif
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- ldr r12,.LOPENSSL_armcap
- ldr r12,[r3,r12] @ OPENSSL_armcap_P
- tst r12,#1
- bne .LNEON
-#endif
- add $len,$inp,$len,lsl#7 @ len to point at the end of inp
- stmdb sp!,{r4-r12,lr}
- sub $Ktbl,r3,#672 @ K512
- sub sp,sp,#9*8
-
- ldr $Elo,[$ctx,#$Eoff+$lo]
- ldr $Ehi,[$ctx,#$Eoff+$hi]
- ldr $t0, [$ctx,#$Goff+$lo]
- ldr $t1, [$ctx,#$Goff+$hi]
- ldr $t2, [$ctx,#$Hoff+$lo]
- ldr $t3, [$ctx,#$Hoff+$hi]
-.Loop:
- str $t0, [sp,#$Goff+0]
- str $t1, [sp,#$Goff+4]
- str $t2, [sp,#$Hoff+0]
- str $t3, [sp,#$Hoff+4]
- ldr $Alo,[$ctx,#$Aoff+$lo]
- ldr $Ahi,[$ctx,#$Aoff+$hi]
- ldr $Tlo,[$ctx,#$Boff+$lo]
- ldr $Thi,[$ctx,#$Boff+$hi]
- ldr $t0, [$ctx,#$Coff+$lo]
- ldr $t1, [$ctx,#$Coff+$hi]
- ldr $t2, [$ctx,#$Doff+$lo]
- ldr $t3, [$ctx,#$Doff+$hi]
- str $Tlo,[sp,#$Boff+0]
- str $Thi,[sp,#$Boff+4]
- str $t0, [sp,#$Coff+0]
- str $t1, [sp,#$Coff+4]
- str $t2, [sp,#$Doff+0]
- str $t3, [sp,#$Doff+4]
- ldr $Tlo,[$ctx,#$Foff+$lo]
- ldr $Thi,[$ctx,#$Foff+$hi]
- str $Tlo,[sp,#$Foff+0]
- str $Thi,[sp,#$Foff+4]
-
-.L00_15:
-#if __ARM_ARCH__<7
- ldrb $Tlo,[$inp,#7]
- ldrb $t0, [$inp,#6]
- ldrb $t1, [$inp,#5]
- ldrb $t2, [$inp,#4]
- ldrb $Thi,[$inp,#3]
- ldrb $t3, [$inp,#2]
- orr $Tlo,$Tlo,$t0,lsl#8
- ldrb $t0, [$inp,#1]
- orr $Tlo,$Tlo,$t1,lsl#16
- ldrb $t1, [$inp],#8
- orr $Tlo,$Tlo,$t2,lsl#24
- orr $Thi,$Thi,$t3,lsl#8
- orr $Thi,$Thi,$t0,lsl#16
- orr $Thi,$Thi,$t1,lsl#24
-#else
- ldr $Tlo,[$inp,#4]
- ldr $Thi,[$inp],#8
-#ifdef __ARMEL__
- rev $Tlo,$Tlo
- rev $Thi,$Thi
-#endif
-#endif
-___
- &BODY_00_15(0x94);
-$code.=<<___;
- tst $Ktbl,#1
- beq .L00_15
- ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
- ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
- bic $Ktbl,$Ktbl,#1
-.L16_79:
- @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
- @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
- @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
- mov $Tlo,$t0,lsr#1
- ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
- mov $Thi,$t1,lsr#1
- ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
- eor $Tlo,$Tlo,$t1,lsl#31
- eor $Thi,$Thi,$t0,lsl#31
- eor $Tlo,$Tlo,$t0,lsr#8
- eor $Thi,$Thi,$t1,lsr#8
- eor $Tlo,$Tlo,$t1,lsl#24
- eor $Thi,$Thi,$t0,lsl#24
- eor $Tlo,$Tlo,$t0,lsr#7
- eor $Thi,$Thi,$t1,lsr#7
- eor $Tlo,$Tlo,$t1,lsl#25
-
- @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
- @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
- @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
- mov $t0,$t2,lsr#19
- mov $t1,$t3,lsr#19
- eor $t0,$t0,$t3,lsl#13
- eor $t1,$t1,$t2,lsl#13
- eor $t0,$t0,$t3,lsr#29
- eor $t1,$t1,$t2,lsr#29
- eor $t0,$t0,$t2,lsl#3
- eor $t1,$t1,$t3,lsl#3
- eor $t0,$t0,$t2,lsr#6
- eor $t1,$t1,$t3,lsr#6
- ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
- eor $t0,$t0,$t3,lsl#26
-
- ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
- adds $Tlo,$Tlo,$t0
- ldr $t0,[sp,#`$Xoff+8*16`+0]
- adc $Thi,$Thi,$t1
-
- ldr $t1,[sp,#`$Xoff+8*16`+4]
- adds $Tlo,$Tlo,$t2
- adc $Thi,$Thi,$t3
- adds $Tlo,$Tlo,$t0
- adc $Thi,$Thi,$t1
-___
- &BODY_00_15(0x17);
-$code.=<<___;
-#if __ARM_ARCH__>=7
- ittt eq @ Thumb2 thing, sanity check in ARM
-#endif
- ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
- ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
- beq .L16_79
- bic $Ktbl,$Ktbl,#1
-
- ldr $Tlo,[sp,#$Boff+0]
- ldr $Thi,[sp,#$Boff+4]
- ldr $t0, [$ctx,#$Aoff+$lo]
- ldr $t1, [$ctx,#$Aoff+$hi]
- ldr $t2, [$ctx,#$Boff+$lo]
- ldr $t3, [$ctx,#$Boff+$hi]
- adds $t0,$Alo,$t0
- str $t0, [$ctx,#$Aoff+$lo]
- adc $t1,$Ahi,$t1
- str $t1, [$ctx,#$Aoff+$hi]
- adds $t2,$Tlo,$t2
- str $t2, [$ctx,#$Boff+$lo]
- adc $t3,$Thi,$t3
- str $t3, [$ctx,#$Boff+$hi]
-
- ldr $Alo,[sp,#$Coff+0]
- ldr $Ahi,[sp,#$Coff+4]
- ldr $Tlo,[sp,#$Doff+0]
- ldr $Thi,[sp,#$Doff+4]
- ldr $t0, [$ctx,#$Coff+$lo]
- ldr $t1, [$ctx,#$Coff+$hi]
- ldr $t2, [$ctx,#$Doff+$lo]
- ldr $t3, [$ctx,#$Doff+$hi]
- adds $t0,$Alo,$t0
- str $t0, [$ctx,#$Coff+$lo]
- adc $t1,$Ahi,$t1
- str $t1, [$ctx,#$Coff+$hi]
- adds $t2,$Tlo,$t2
- str $t2, [$ctx,#$Doff+$lo]
- adc $t3,$Thi,$t3
- str $t3, [$ctx,#$Doff+$hi]
-
- ldr $Tlo,[sp,#$Foff+0]
- ldr $Thi,[sp,#$Foff+4]
- ldr $t0, [$ctx,#$Eoff+$lo]
- ldr $t1, [$ctx,#$Eoff+$hi]
- ldr $t2, [$ctx,#$Foff+$lo]
- ldr $t3, [$ctx,#$Foff+$hi]
- adds $Elo,$Elo,$t0
- str $Elo,[$ctx,#$Eoff+$lo]
- adc $Ehi,$Ehi,$t1
- str $Ehi,[$ctx,#$Eoff+$hi]
- adds $t2,$Tlo,$t2
- str $t2, [$ctx,#$Foff+$lo]
- adc $t3,$Thi,$t3
- str $t3, [$ctx,#$Foff+$hi]
-
- ldr $Alo,[sp,#$Goff+0]
- ldr $Ahi,[sp,#$Goff+4]
- ldr $Tlo,[sp,#$Hoff+0]
- ldr $Thi,[sp,#$Hoff+4]
- ldr $t0, [$ctx,#$Goff+$lo]
- ldr $t1, [$ctx,#$Goff+$hi]
- ldr $t2, [$ctx,#$Hoff+$lo]
- ldr $t3, [$ctx,#$Hoff+$hi]
- adds $t0,$Alo,$t0
- str $t0, [$ctx,#$Goff+$lo]
- adc $t1,$Ahi,$t1
- str $t1, [$ctx,#$Goff+$hi]
- adds $t2,$Tlo,$t2
- str $t2, [$ctx,#$Hoff+$lo]
- adc $t3,$Thi,$t3
- str $t3, [$ctx,#$Hoff+$hi]
-
- add sp,sp,#640
- sub $Ktbl,$Ktbl,#640
-
- teq $inp,$len
- bne .Loop
-
- add sp,sp,#8*9 @ destroy frame
-#if __ARM_ARCH__>=5
- ldmia sp!,{r4-r12,pc}
-#else
- ldmia sp!,{r4-r12,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size sha512_block_data_order,.-sha512_block_data_order
-___
-
-{
-my @Sigma0=(28,34,39);
-my @Sigma1=(14,18,41);
-my @sigma0=(1, 8, 7);
-my @sigma1=(19,61,6);
-
-my $Ktbl="r3";
-my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
-
-my @X=map("d$_",(0..15));
-my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
-
-sub NEON_00_15() {
-my $i=shift;
-my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
-my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
-
-$code.=<<___ if ($i<16 || $i&1);
- vshr.u64 $t0,$e,#@Sigma1[0] @ $i
-#if $i<16
- vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
-#endif
- vshr.u64 $t1,$e,#@Sigma1[1]
-#if $i>0
- vadd.i64 $a,$Maj @ h+=Maj from the past
-#endif
- vshr.u64 $t2,$e,#@Sigma1[2]
-___
-$code.=<<___;
- vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
- vsli.64 $t0,$e,#`64-@Sigma1[0]`
- vsli.64 $t1,$e,#`64-@Sigma1[1]`
- vmov $Ch,$e
- vsli.64 $t2,$e,#`64-@Sigma1[2]`
-#if $i<16 && defined(__ARMEL__)
- vrev64.8 @X[$i],@X[$i]
-#endif
- veor $t1,$t0
- vbsl $Ch,$f,$g @ Ch(e,f,g)
- vshr.u64 $t0,$a,#@Sigma0[0]
- veor $t2,$t1 @ Sigma1(e)
- vadd.i64 $T1,$Ch,$h
- vshr.u64 $t1,$a,#@Sigma0[1]
- vsli.64 $t0,$a,#`64-@Sigma0[0]`
- vadd.i64 $T1,$t2
- vshr.u64 $t2,$a,#@Sigma0[2]
- vadd.i64 $K,@X[$i%16]
- vsli.64 $t1,$a,#`64-@Sigma0[1]`
- veor $Maj,$a,$b
- vsli.64 $t2,$a,#`64-@Sigma0[2]`
- veor $h,$t0,$t1
- vadd.i64 $T1,$K
- vbsl $Maj,$c,$b @ Maj(a,b,c)
- veor $h,$t2 @ Sigma0(a)
- vadd.i64 $d,$T1
- vadd.i64 $Maj,$T1
- @ vadd.i64 $h,$Maj
-___
-}
-
-sub NEON_16_79() {
-my $i=shift;
-
-if ($i&1) { &NEON_00_15($i,@_); return; }
-
-# 2x-vectorized, therefore runs every 2nd round
-my @X=map("q$_",(0..7)); # view @X as 128-bit vector
-my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
-my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
-my $e=@_[4]; # $e from NEON_00_15
-$i /= 2;
-$code.=<<___;
- vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
- vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
- vadd.i64 @_[0],d30 @ h+=Maj from the past
- vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
- vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
- vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
- vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
- veor $s1,$t0
- vshr.u64 $t0,$s0,#@sigma0[0]
- veor $s1,$t1 @ sigma1(X[i+14])
- vshr.u64 $t1,$s0,#@sigma0[1]
- vadd.i64 @X[$i%8],$s1
- vshr.u64 $s1,$s0,#@sigma0[2]
- vsli.64 $t0,$s0,#`64-@sigma0[0]`
- vsli.64 $t1,$s0,#`64-@sigma0[1]`
- vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
- veor $s1,$t0
- vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
- vadd.i64 @X[$i%8],$s0
- vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
- veor $s1,$t1 @ sigma0(X[i+1])
- vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
- vadd.i64 @X[$i%8],$s1
-___
- &NEON_00_15(2*$i,@_);
-}
-
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7
-.arch armv7-a
-.fpu neon
-
-.global sha512_block_data_order_neon
-.type sha512_block_data_order_neon,%function
-.align 4
-sha512_block_data_order_neon:
-.LNEON:
- dmb @ errata #451034 on early Cortex A8
- add $len,$inp,$len,lsl#7 @ len to point at the end of inp
- VFP_ABI_PUSH
- adr $Ktbl,.Lsha512_block_data_order
- sub $Ktbl,$Ktbl,.Lsha512_block_data_order-K512
- vldmia $ctx,{$A-$H} @ load context
-.Loop_neon:
-___
-for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
-$code.=<<___;
- mov $cnt,#4
-.L16_79_neon:
- subs $cnt,#1
-___
-for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
-$code.=<<___;
- bne .L16_79_neon
-
- vadd.i64 $A,d30 @ h+=Maj from the past
- vldmia $ctx,{d24-d31} @ load context to temp
- vadd.i64 q8,q12 @ vectorized accumulate
- vadd.i64 q9,q13
- vadd.i64 q10,q14
- vadd.i64 q11,q15
- vstmia $ctx,{$A-$H} @ save context
- teq $inp,$len
- sub $Ktbl,#640 @ rewind K512
- bne .Loop_neon
-
- VFP_ABI_POP
- ret @ bx lr
-.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
-#endif
-___
-}
-$code.=<<___;
-.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
-.align 2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm OPENSSL_armcap_P,4,4
-#endif
-___
-
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
-$code =~ s/\bret\b/bx lr/gm;
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/@/ and !/^$/);
- print;
-}
-close SELF;
-
-print $code;
-close STDOUT; # enforce flush
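
The BODY_00_15 and .L16_79 comments in the scalar code above express each 64-bit rotation of Sigma1/sigma0 as a pair of shift/XOR operations on the 32-bit lo/hi halves, since ARMv4 has no 64-bit general registers. The standalone C sketch below checks that decomposition for Sigma1 against a direct 64-bit rotate; the helper names and test values are illustrative only and are not part of the kernel sources.

/* Sketch: verify the 32-bit-half decomposition of Sigma1(e) quoted in
 * BODY_00_15 against a direct 64-bit rotate.  Build with: gcc -O2 sigma1.c
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t ror64(uint64_t x, unsigned int n)
{
        return (x >> n) | (x << (64 - n));
}

/* Sigma1(x) = ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41), computed on 64 bits. */
static uint64_t sigma1_64(uint64_t x)
{
        return ror64(x, 14) ^ ror64(x, 18) ^ ror64(x, 41);
}

/* The same value built from 32-bit halves, as the scalar ARM code does:
 *   LO = lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
 *   HI = hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
 */
static uint64_t sigma1_halves(uint64_t x)
{
        uint32_t lo = (uint32_t)x, hi = (uint32_t)(x >> 32);
        uint32_t rlo, rhi;

        rlo = (lo >> 14 ^ hi << 18) ^ (lo >> 18 ^ hi << 14) ^ (hi >> 9 ^ lo << 23);
        rhi = (hi >> 14 ^ lo << 18) ^ (hi >> 18 ^ lo << 14) ^ (lo >> 9 ^ hi << 23);
        return (uint64_t)rhi << 32 | rlo;
}

int main(void)
{
        uint64_t x = 0x0123456789abcdefULL;     /* arbitrary starting pattern */

        for (int i = 0; i < 1000; i++) {
                assert(sigma1_64(x) == sigma1_halves(x));
                x = x * 6364136223846793005ULL + 1;     /* simple LCG */
        }
        puts("32-bit-half decomposition matches the 64-bit Sigma1");
        return 0;
}
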
diff --git a/arch/arm/crypto/sha512-glue.c b/arch/arm/crypto/sha512-glue.c
deleted file mode 100644
index f8a6480889b1..000000000000
--- a/arch/arm/crypto/sha512-glue.c
+++ /dev/null
@@ -1,110 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * sha512-glue.c - accelerated SHA-384/512 for ARM
- *
- * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha512_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include "sha512.h"
-
-MODULE_DESCRIPTION("Accelerated SHA-384/SHA-512 secure hash for ARM");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-
-MODULE_ALIAS_CRYPTO("sha384");
-MODULE_ALIAS_CRYPTO("sha512");
-MODULE_ALIAS_CRYPTO("sha384-arm");
-MODULE_ALIAS_CRYPTO("sha512-arm");
-
-asmlinkage void sha512_block_data_order(struct sha512_state *state,
- u8 const *src, int blocks);
-
-static int sha512_arm_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha512_base_do_update_blocks(desc, data, len,
- sha512_block_data_order);
-}
-
-static int sha512_arm_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- sha512_base_do_finup(desc, data, len, sha512_block_data_order);
- return sha512_base_finish(desc, out);
-}
-
-static struct shash_alg sha512_arm_algs[] = { {
- .init = sha384_base_init,
- .update = sha512_arm_update,
- .finup = sha512_arm_finup,
- .descsize = SHA512_STATE_SIZE,
- .digestsize = SHA384_DIGEST_SIZE,
- .base = {
- .cra_name = "sha384",
- .cra_driver_name = "sha384-arm",
- .cra_priority = 250,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .init = sha512_base_init,
- .update = sha512_arm_update,
- .finup = sha512_arm_finup,
- .descsize = SHA512_STATE_SIZE,
- .digestsize = SHA512_DIGEST_SIZE,
- .base = {
- .cra_name = "sha512",
- .cra_driver_name = "sha512-arm",
- .cra_priority = 250,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static int __init sha512_arm_mod_init(void)
-{
- int err;
-
- err = crypto_register_shashes(sha512_arm_algs,
- ARRAY_SIZE(sha512_arm_algs));
- if (err)
- return err;
-
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) {
- err = crypto_register_shashes(sha512_neon_algs,
- ARRAY_SIZE(sha512_neon_algs));
- if (err)
- goto err_unregister;
- }
- return 0;
-
-err_unregister:
- crypto_unregister_shashes(sha512_arm_algs,
- ARRAY_SIZE(sha512_arm_algs));
-
- return err;
-}
-
-static void __exit sha512_arm_mod_fini(void)
-{
- crypto_unregister_shashes(sha512_arm_algs,
- ARRAY_SIZE(sha512_arm_algs));
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon())
- crypto_unregister_shashes(sha512_neon_algs,
- ARRAY_SIZE(sha512_neon_algs));
-}
-
-module_init(sha512_arm_mod_init);
-module_exit(sha512_arm_mod_fini);
diff --git a/arch/arm/crypto/sha512-neon-glue.c b/arch/arm/crypto/sha512-neon-glue.c
deleted file mode 100644
index bd528077fefb..000000000000
--- a/arch/arm/crypto/sha512-neon-glue.c
+++ /dev/null
@@ -1,75 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * sha512-neon-glue.c - accelerated SHA-384/512 for ARM NEON
- *
- * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha512_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include "sha512.h"
-
-MODULE_ALIAS_CRYPTO("sha384-neon");
-MODULE_ALIAS_CRYPTO("sha512-neon");
-
-asmlinkage void sha512_block_data_order_neon(struct sha512_state *state,
- const u8 *src, int blocks);
-
-static int sha512_neon_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- int remain;
-
- kernel_neon_begin();
- remain = sha512_base_do_update_blocks(desc, data, len,
- sha512_block_data_order_neon);
- kernel_neon_end();
- return remain;
-}
-
-static int sha512_neon_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- kernel_neon_begin();
- sha512_base_do_finup(desc, data, len, sha512_block_data_order_neon);
- kernel_neon_end();
- return sha512_base_finish(desc, out);
-}
-
-struct shash_alg sha512_neon_algs[] = { {
- .init = sha384_base_init,
- .update = sha512_neon_update,
- .finup = sha512_neon_finup,
- .descsize = SHA512_STATE_SIZE,
- .digestsize = SHA384_DIGEST_SIZE,
- .base = {
- .cra_name = "sha384",
- .cra_driver_name = "sha384-neon",
- .cra_priority = 300,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA384_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
-
- }
-}, {
- .init = sha512_base_init,
- .update = sha512_neon_update,
- .finup = sha512_neon_finup,
- .descsize = SHA512_STATE_SIZE,
- .digestsize = SHA512_DIGEST_SIZE,
- .base = {
- .cra_name = "sha512",
- .cra_driver_name = "sha512-neon",
- .cra_priority = 300,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
diff --git a/arch/arm/crypto/sha512.h b/arch/arm/crypto/sha512.h
deleted file mode 100644
index eeaee52cda69..000000000000
--- a/arch/arm/crypto/sha512.h
+++ /dev/null
@@ -1,3 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-extern struct shash_alg sha512_neon_algs[2];
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index f379c852dcb7..88336a1292bb 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -119,7 +119,7 @@ no_work_pending:
ct_user_enter save = 0
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
bl stackleak_erase_on_task_stack
#endif
restore_user_regs fast = 0, offset = 0
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index c421a899fc84..7951b2c06fec 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -677,7 +677,7 @@ enum arm_regset {
static const struct user_regset arm_regsets[] = {
[REGSET_GPR] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = ELF_NGREG,
.size = sizeof(u32),
.align = sizeof(u32),
@@ -689,7 +689,7 @@ static const struct user_regset arm_regsets[] = {
* For the FPA regs in fpstate, the real fields are a mixture
* of sizes, so pretend that the registers are word-sized:
*/
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(struct user_fp) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -702,7 +702,7 @@ static const struct user_regset arm_regsets[] = {
* Pretend that the VFP regs are word-sized, since the FPSCR is
* a single word dangling at the end of struct user_vfp:
*/
- .core_note_type = NT_ARM_VFP,
+ USER_REGSET_NOTE_TYPE(ARM_VFP),
.n = ARM_VFPREGS_SIZE / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
diff --git a/arch/arm/lib/.gitignore b/arch/arm/lib/.gitignore
new file mode 100644
index 000000000000..647d7a922e68
--- /dev/null
+++ b/arch/arm/lib/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# This now-removed directory used to contain generated files.
+/crypto/
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 91ea0e29107a..0ca5aae1bcc3 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -5,8 +5,6 @@
# Copyright (C) 1995-2000 Russell King
#
-obj-y += crypto/
-
lib-y := changebit.o csumipv6.o csumpartial.o \
csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
delay.o delay-loop.o findbit.o memchr.o memcpy.o \
@@ -47,9 +45,3 @@ ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
endif
obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
-
-obj-$(CONFIG_CRC32_ARCH) += crc32-arm.o
-crc32-arm-y := crc32.o crc32-core.o
-
-obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-arm.o
-crc-t10dif-arm-y := crc-t10dif.o crc-t10dif-core.o
diff --git a/arch/arm/lib/crc-t10dif-core.S b/arch/arm/lib/crc-t10dif-core.S
deleted file mode 100644
index 2bbf2df9c1e2..000000000000
--- a/arch/arm/lib/crc-t10dif-core.S
+++ /dev/null
@@ -1,468 +0,0 @@
-//
-// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
-//
-// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
-// Copyright (C) 2019 Google LLC <ebiggers@google.com>
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License version 2 as
-// published by the Free Software Foundation.
-//
-
-// Derived from the x86 version:
-//
-// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
-//
-// Copyright (c) 2013, Intel Corporation
-//
-// Authors:
-// Erdinc Ozturk <erdinc.ozturk@intel.com>
-// Vinodh Gopal <vinodh.gopal@intel.com>
-// James Guilford <james.guilford@intel.com>
-// Tim Chen <tim.c.chen@linux.intel.com>
-//
-// This software is available to you under a choice of one of two
-// licenses. You may choose to be licensed under the terms of the GNU
-// General Public License (GPL) Version 2, available from the file
-// COPYING in the main directory of this source tree, or the
-// OpenIB.org BSD license below:
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the
-// distribution.
-//
-// * Neither the name of the Intel Corporation nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-//
-// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Reference paper titled "Fast CRC Computation for Generic
-// Polynomials Using PCLMULQDQ Instruction"
-// URL: http://www.intel.com/content/dam/www/public/us/en/documents
-// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
-//
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-#ifdef CONFIG_CPU_ENDIAN_BE8
-#define CPU_LE(code...)
-#else
-#define CPU_LE(code...) code
-#endif
-
- .text
- .arch armv8-a
- .fpu crypto-neon-fp-armv8
-
- init_crc .req r0
- buf .req r1
- len .req r2
-
- fold_consts_ptr .req ip
-
- q0l .req d0
- q0h .req d1
- q1l .req d2
- q1h .req d3
- q2l .req d4
- q2h .req d5
- q3l .req d6
- q3h .req d7
- q4l .req d8
- q4h .req d9
- q5l .req d10
- q5h .req d11
- q6l .req d12
- q6h .req d13
- q7l .req d14
- q7h .req d15
- q8l .req d16
- q8h .req d17
- q9l .req d18
- q9h .req d19
- q10l .req d20
- q10h .req d21
- q11l .req d22
- q11h .req d23
- q12l .req d24
- q12h .req d25
-
- FOLD_CONSTS .req q10
- FOLD_CONST_L .req q10l
- FOLD_CONST_H .req q10h
-
- /*
- * Pairwise long polynomial multiplication of two 16-bit values
- *
- * { w0, w1 }, { y0, y1 }
- *
- * by two 64-bit values
- *
- * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
- *
- * where each vector element is a byte, ordered from least to most
- * significant. The resulting 80-bit vectors are XOR'ed together.
- *
- * This can be implemented using 8x8 long polynomial multiplication, by
- * reorganizing the input so that each pairwise 8x8 multiplication
- * produces one of the terms from the decomposition below, and
- * combining the results of each rank and shifting them into place.
- *
- * Rank
- * 0 w0*x0 ^ | y0*z0 ^
- * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^
- * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^
- * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^
- * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^
- * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^
- * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^
- * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^
- * 8 w1*x7 << 64 | y1*z7 << 64
- *
- * The inputs can be reorganized into
- *
- * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
- * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
- *
- * and after performing 8x8->16 bit long polynomial multiplication of
- * each of the halves of the first vector with those of the second one,
- * we obtain the following four vectors of 16-bit elements:
- *
- * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
- * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
- * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
- * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
- *
- * Results b and c can be XORed together, as the vector elements have
- * matching ranks. Then, the final XOR can be pulled forward, and
- * applied between the halves of each of the remaining three vectors,
- * which are then shifted into place, and XORed together to produce the
- * final 80-bit result.
- */
- .macro pmull16x64_p8, v16, v64
- vext.8 q11, \v64, \v64, #1
- vld1.64 {q12}, [r4, :128]
- vuzp.8 q11, \v64
- vtbl.8 d24, {\v16\()_L-\v16\()_H}, d24
- vtbl.8 d25, {\v16\()_L-\v16\()_H}, d25
- bl __pmull16x64_p8
- veor \v64, q12, q14
- .endm
-
-__pmull16x64_p8:
- vmull.p8 q13, d23, d24
- vmull.p8 q14, d23, d25
- vmull.p8 q15, d22, d24
- vmull.p8 q12, d22, d25
-
- veor q14, q14, q15
- veor d24, d24, d25
- veor d26, d26, d27
- veor d28, d28, d29
- vmov.i32 d25, #0
- vmov.i32 d29, #0
- vext.8 q12, q12, q12, #14
- vext.8 q14, q14, q14, #15
- veor d24, d24, d26
- bx lr
-ENDPROC(__pmull16x64_p8)
-
- .macro pmull16x64_p64, v16, v64
- vmull.p64 q11, \v64\()l, \v16\()_L
- vmull.p64 \v64, \v64\()h, \v16\()_H
- veor \v64, \v64, q11
- .endm
-
- // Fold reg1, reg2 into the next 32 data bytes, storing the result back
- // into reg1, reg2.
- .macro fold_32_bytes, reg1, reg2, p
- vld1.64 {q8-q9}, [buf]!
-
- pmull16x64_\p FOLD_CONST, \reg1
- pmull16x64_\p FOLD_CONST, \reg2
-
-CPU_LE( vrev64.8 q8, q8 )
-CPU_LE( vrev64.8 q9, q9 )
- vswp q8l, q8h
- vswp q9l, q9h
-
- veor.8 \reg1, \reg1, q8
- veor.8 \reg2, \reg2, q9
- .endm
-
- // Fold src_reg into dst_reg, optionally loading the next fold constants
- .macro fold_16_bytes, src_reg, dst_reg, p, load_next_consts
- pmull16x64_\p FOLD_CONST, \src_reg
- .ifnb \load_next_consts
- vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
- .endif
- veor.8 \dst_reg, \dst_reg, \src_reg
- .endm
-
- .macro crct10dif, p
- // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
- cmp len, #256
- blt .Lless_than_256_bytes\@
-
- mov_l fold_consts_ptr, .Lfold_across_128_bytes_consts
-
- // Load the first 128 data bytes. Byte swapping is necessary to make
- // the bit order match the polynomial coefficient order.
- vld1.64 {q0-q1}, [buf]!
- vld1.64 {q2-q3}, [buf]!
- vld1.64 {q4-q5}, [buf]!
- vld1.64 {q6-q7}, [buf]!
-CPU_LE( vrev64.8 q0, q0 )
-CPU_LE( vrev64.8 q1, q1 )
-CPU_LE( vrev64.8 q2, q2 )
-CPU_LE( vrev64.8 q3, q3 )
-CPU_LE( vrev64.8 q4, q4 )
-CPU_LE( vrev64.8 q5, q5 )
-CPU_LE( vrev64.8 q6, q6 )
-CPU_LE( vrev64.8 q7, q7 )
- vswp q0l, q0h
- vswp q1l, q1h
- vswp q2l, q2h
- vswp q3l, q3h
- vswp q4l, q4h
- vswp q5l, q5h
- vswp q6l, q6h
- vswp q7l, q7h
-
- // XOR the first 16 data *bits* with the initial CRC value.
- vmov.i8 q8h, #0
- vmov.u16 q8h[3], init_crc
- veor q0h, q0h, q8h
-
- // Load the constants for folding across 128 bytes.
- vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
-
- // Subtract 128 for the 128 data bytes just consumed. Subtract another
- // 128 to simplify the termination condition of the following loop.
- sub len, len, #256
-
- // While >= 128 data bytes remain (not counting q0-q7), fold the 128
- // bytes q0-q7 into them, storing the result back into q0-q7.
-.Lfold_128_bytes_loop\@:
- fold_32_bytes q0, q1, \p
- fold_32_bytes q2, q3, \p
- fold_32_bytes q4, q5, \p
- fold_32_bytes q6, q7, \p
- subs len, len, #128
- bge .Lfold_128_bytes_loop\@
-
- // Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.
-
- // Fold across 64 bytes.
- vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
- fold_16_bytes q0, q4, \p
- fold_16_bytes q1, q5, \p
- fold_16_bytes q2, q6, \p
- fold_16_bytes q3, q7, \p, 1
- // Fold across 32 bytes.
- fold_16_bytes q4, q6, \p
- fold_16_bytes q5, q7, \p, 1
- // Fold across 16 bytes.
- fold_16_bytes q6, q7, \p
-
- // Add 128 to get the correct number of data bytes remaining in 0...127
- // (not counting q7), following the previous extra subtraction by 128.
- // Then subtract 16 to simplify the termination condition of the
- // following loop.
- adds len, len, #(128-16)
-
- // While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7
- // into them, storing the result back into q7.
- blt .Lfold_16_bytes_loop_done\@
-.Lfold_16_bytes_loop\@:
- pmull16x64_\p FOLD_CONST, q7
- vld1.64 {q0}, [buf]!
-CPU_LE( vrev64.8 q0, q0 )
- vswp q0l, q0h
- veor.8 q7, q7, q0
- subs len, len, #16
- bge .Lfold_16_bytes_loop\@
-
-.Lfold_16_bytes_loop_done\@:
- // Add 16 to get the correct number of data bytes remaining in 0...15
- // (not counting q7), following the previous extra subtraction by 16.
- adds len, len, #16
- beq .Lreduce_final_16_bytes\@
-
-.Lhandle_partial_segment\@:
- // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
- // 16 bytes are in q7 and the rest are the remaining data in 'buf'. To
- // do this without needing a fold constant for each possible 'len',
- // redivide the bytes into a first chunk of 'len' bytes and a second
- // chunk of 16 bytes, then fold the first chunk into the second.
-
- // q0 = last 16 original data bytes
- add buf, buf, len
- sub buf, buf, #16
- vld1.64 {q0}, [buf]
-CPU_LE( vrev64.8 q0, q0 )
- vswp q0l, q0h
-
- // q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
- mov_l r1, .Lbyteshift_table + 16
- sub r1, r1, len
- vld1.8 {q2}, [r1]
- vtbl.8 q1l, {q7l-q7h}, q2l
- vtbl.8 q1h, {q7l-q7h}, q2h
-
- // q3 = first chunk: q7 right-shifted by '16-len' bytes.
- vmov.i8 q3, #0x80
- veor.8 q2, q2, q3
- vtbl.8 q3l, {q7l-q7h}, q2l
- vtbl.8 q3h, {q7l-q7h}, q2h
-
- // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
- vshr.s8 q2, q2, #7
-
- // q2 = second chunk: 'len' bytes from q0 (low-order bytes),
- // then '16-len' bytes from q1 (high-order bytes).
- vbsl.8 q2, q1, q0
-
- // Fold the first chunk into the second chunk, storing the result in q7.
- pmull16x64_\p FOLD_CONST, q3
- veor.8 q7, q3, q2
- b .Lreduce_final_16_bytes\@
-
-.Lless_than_256_bytes\@:
- // Checksumming a buffer of length 16...255 bytes
-
- mov_l fold_consts_ptr, .Lfold_across_16_bytes_consts
-
- // Load the first 16 data bytes.
- vld1.64 {q7}, [buf]!
-CPU_LE( vrev64.8 q7, q7 )
- vswp q7l, q7h
-
- // XOR the first 16 data *bits* with the initial CRC value.
- vmov.i8 q0h, #0
- vmov.u16 q0h[3], init_crc
- veor.8 q7h, q7h, q0h
-
- // Load the fold-across-16-bytes constants.
- vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
-
- cmp len, #16
- beq .Lreduce_final_16_bytes\@ // len == 16
- subs len, len, #32
- addlt len, len, #16
- blt .Lhandle_partial_segment\@ // 17 <= len <= 31
- b .Lfold_16_bytes_loop\@ // 32 <= len <= 255
-
-.Lreduce_final_16_bytes\@:
- .endm
-
-//
-// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
-//
-// Assumes len >= 16.
-//
-ENTRY(crc_t10dif_pmull64)
- crct10dif p64
-
- // Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.
-
- // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
- vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
-
- // Fold the high 64 bits into the low 64 bits, while also multiplying by
- // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
- // whose low 48 bits are 0.
- vmull.p64 q0, q7h, FOLD_CONST_H // high bits * x^48 * (x^80 mod G(x))
- veor.8 q0h, q0h, q7l // + low bits * x^64
-
- // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
- // value congruent to x^64 * M(x) and whose low 48 bits are 0.
- vmov.i8 q1, #0
- vmov s4, s3 // extract high 32 bits
- vmov s3, s5 // zero high 32 bits
- vmull.p64 q1, q1l, FOLD_CONST_L // high 32 bits * x^48 * (x^48 mod G(x))
- veor.8 q0, q0, q1 // + low bits
-
- // Load G(x) and floor(x^48 / G(x)).
- vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]
-
- // Use Barrett reduction to compute the final CRC value.
- vmull.p64 q1, q0h, FOLD_CONST_H // high 32 bits * floor(x^48 / G(x))
- vshr.u64 q1l, q1l, #32 // /= x^32
- vmull.p64 q1, q1l, FOLD_CONST_L // *= G(x)
- vshr.u64 q0l, q0l, #48
- veor.8 q0l, q0l, q1l // + low 16 nonzero bits
- // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of q0.
-
- vmov.u16 r0, q0l[0]
- bx lr
-ENDPROC(crc_t10dif_pmull64)
-
-ENTRY(crc_t10dif_pmull8)
- push {r4, lr}
- mov_l r4, .L16x64perm
-
- crct10dif p8
-
-CPU_LE( vrev64.8 q7, q7 )
- vswp q7l, q7h
- vst1.64 {q7}, [r3, :128]
- pop {r4, pc}
-ENDPROC(crc_t10dif_pmull8)
-
- .section ".rodata", "a"
- .align 4
-
-// Fold constants precomputed from the polynomial 0x18bb7
-// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
-.Lfold_across_128_bytes_consts:
- .quad 0x0000000000006123 // x^(8*128) mod G(x)
- .quad 0x0000000000002295 // x^(8*128+64) mod G(x)
-// .Lfold_across_64_bytes_consts:
- .quad 0x0000000000001069 // x^(4*128) mod G(x)
- .quad 0x000000000000dd31 // x^(4*128+64) mod G(x)
-// .Lfold_across_32_bytes_consts:
- .quad 0x000000000000857d // x^(2*128) mod G(x)
- .quad 0x0000000000007acc // x^(2*128+64) mod G(x)
-.Lfold_across_16_bytes_consts:
- .quad 0x000000000000a010 // x^(1*128) mod G(x)
- .quad 0x0000000000001faa // x^(1*128+64) mod G(x)
-// .Lfinal_fold_consts:
- .quad 0x1368000000000000 // x^48 * (x^48 mod G(x))
- .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x))
-// .Lbarrett_reduction_consts:
- .quad 0x0000000000018bb7 // G(x)
- .quad 0x00000001f65a57f8 // floor(x^48 / G(x))
-
-// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
-// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
-// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
-.Lbyteshift_table:
- .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
- .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
- .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
- .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
-
-.L16x64perm:
- .quad 0x808080800000000, 0x909090901010101
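
The long comment above pmull16x64_p8 describes how, on cores without the 64-bit vmull.p64 instruction, the 16x64-bit carry-less product is rebuilt from 8x8-bit vmull.p8 products grouped by rank. Below is a minimal userspace sketch of that decomposition, checked against a direct bit-by-bit carry-less multiply; it assumes the GCC/Clang unsigned __int128 extension for the 80-bit result, and the function names and test values are illustrative.

/* Sketch: the rank decomposition used by pmull16x64_p8, modelled in C.
 * Build with: gcc -O2 pmull16x64_model.c (uses unsigned __int128).
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef unsigned __int128 u128;

/* One vmull.p8 lane: 8x8 -> 16-bit carry-less multiply. */
static uint16_t clmul8x8(uint8_t a, uint8_t b)
{
        uint16_t r = 0;

        for (int i = 0; i < 8; i++)
                if (b & (1 << i))
                        r ^= (uint16_t)a << i;
        return r;
}

/* Reference: direct 16x64 -> 80-bit carry-less multiply. */
static u128 clmul16x64(uint16_t w, uint64_t x)
{
        u128 r = 0;

        for (int i = 0; i < 16; i++)
                if (w & (1 << i))
                        r ^= (u128)x << i;
        return r;
}

/* The same product built from byte-wise 8x8 products; summed over j, rank k
 * collects w0*x[k] ^ w1*x[k-1], exactly as in the table in the comment. */
static u128 clmul16x64_bytewise(uint16_t w, uint64_t x)
{
        uint8_t w0 = w & 0xff, w1 = w >> 8;
        u128 r = 0;

        for (int j = 0; j < 8; j++) {
                uint8_t xj = (x >> (8 * j)) & 0xff;

                r ^= (u128)clmul8x8(w0, xj) << (8 * j);
                r ^= (u128)clmul8x8(w1, xj) << (8 * j + 8);
        }
        return r;
}

int main(void)
{
        uint16_t w = 0x8bb7;                    /* arbitrary test values */
        uint64_t x = 0x0123456789abcdefULL;

        assert(clmul16x64(w, x) == clmul16x64_bytewise(w, x));
        puts("byte-wise rank decomposition matches the direct carry-less multiply");
        return 0;
}
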
diff --git a/arch/arm/lib/crc-t10dif.c b/arch/arm/lib/crc-t10dif.c
deleted file mode 100644
index 1093f8ec13b0..000000000000
--- a/arch/arm/lib/crc-t10dif.c
+++ /dev/null
@@ -1,72 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
- *
- * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/crc-t10dif.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <crypto/internal/simd.h>
-
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
-
-#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
-
-asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len);
-asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len,
- u8 out[16]);
-
-u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
-{
- if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) {
- if (static_branch_likely(&have_pmull)) {
- if (crypto_simd_usable()) {
- kernel_neon_begin();
- crc = crc_t10dif_pmull64(crc, data, length);
- kernel_neon_end();
- return crc;
- }
- } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE &&
- static_branch_likely(&have_neon) &&
- crypto_simd_usable()) {
- u8 buf[16] __aligned(16);
-
- kernel_neon_begin();
- crc_t10dif_pmull8(crc, data, length, buf);
- kernel_neon_end();
-
- return crc_t10dif_generic(0, buf, sizeof(buf));
- }
- }
- return crc_t10dif_generic(crc, data, length);
-}
-EXPORT_SYMBOL(crc_t10dif_arch);
-
-static int __init crc_t10dif_arm_init(void)
-{
- if (elf_hwcap & HWCAP_NEON) {
- static_branch_enable(&have_neon);
- if (elf_hwcap2 & HWCAP2_PMULL)
- static_branch_enable(&have_pmull);
- }
- return 0;
-}
-subsys_initcall(crc_t10dif_arm_init);
-
-static void __exit crc_t10dif_arm_exit(void)
-{
-}
-module_exit(crc_t10dif_arm_exit);
-
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_DESCRIPTION("Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions");
-MODULE_LICENSE("GPL v2");
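
Both fallback paths above end in crc_t10dif_generic(), a plain software CRC over the polynomial G(x) = 0x18bb7 documented in the assembly. The sketch below is an illustrative model of such a generic CRC-T10DIF (bit-at-a-time, cross-checked against a table-driven variant built from the same polynomial); it is not the kernel's crc_t10dif_generic() itself, and the file and function names are made up for the example.

/* Sketch of a software CRC-T10DIF (poly 0x8bb7, non-reflected, init 0):
 * a bit-at-a-time version cross-checked against a table-driven one.
 * Build with: gcc -O2 crct10dif_model.c
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CRC_T10DIF_POLY 0x8bb7  /* G(x) without the implicit x^16 term */

static uint16_t crc_t10dif_bitwise(uint16_t crc, const uint8_t *p, size_t len)
{
        while (len--) {
                crc ^= (uint16_t)(*p++) << 8;
                for (int i = 0; i < 8; i++)
                        crc = (crc & 0x8000) ? (crc << 1) ^ CRC_T10DIF_POLY
                                             : crc << 1;
        }
        return crc;
}

static uint16_t crc_t10dif_table(uint16_t crc, const uint8_t *p, size_t len)
{
        static uint16_t tab[256];
        static int init;

        if (!init) {            /* build the byte-at-a-time table once */
                for (int b = 0; b < 256; b++) {
                        uint8_t byte = b;

                        tab[b] = crc_t10dif_bitwise(0, &byte, 1);
                }
                init = 1;
        }
        while (len--)
                crc = (crc << 8) ^ tab[(crc >> 8) ^ *p++];
        return crc;
}

int main(void)
{
        const char *msg = "123456789";
        size_t len = strlen(msg);

        assert(crc_t10dif_bitwise(0, (const uint8_t *)msg, len) ==
               crc_t10dif_table(0, (const uint8_t *)msg, len));
        printf("crc16-t10dif(\"%s\") = 0x%04x\n", msg,
               (unsigned)crc_t10dif_bitwise(0, (const uint8_t *)msg, len));
        return 0;
}
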
diff --git a/arch/arm/lib/crc32-core.S b/arch/arm/lib/crc32-core.S
deleted file mode 100644
index 6f674f30c70b..000000000000
--- a/arch/arm/lib/crc32-core.S
+++ /dev/null
@@ -1,306 +0,0 @@
-/*
- * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
- *
- * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-/* GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see http://www.gnu.org/licenses
- *
- * Please visit http://www.xyratex.com/contact if you need additional
- * information or have any questions.
- *
- * GPL HEADER END
- */
-
-/*
- * Copyright 2012 Xyratex Technology Limited
- *
- * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
- * calculation.
- * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
- * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
- * at:
- * https://www.intel.com/products/processor/manuals/
- * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
- * Volume 2B: Instruction Set Reference, N-Z
- *
- * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
- * Alexander Boyko <Alexander_Boyko@xyratex.com>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .text
- .align 6
- .arch armv8-a
- .arch_extension crc
- .fpu crypto-neon-fp-armv8
-
-.Lcrc32_constants:
- /*
- * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
- * #define CONSTANT_R1 0x154442bd4LL
- *
- * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
- * #define CONSTANT_R2 0x1c6e41596LL
- */
- .quad 0x0000000154442bd4
- .quad 0x00000001c6e41596
-
- /*
- * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
- * #define CONSTANT_R3 0x1751997d0LL
- *
- * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
- * #define CONSTANT_R4 0x0ccaa009eLL
- */
- .quad 0x00000001751997d0
- .quad 0x00000000ccaa009e
-
- /*
- * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
- * #define CONSTANT_R5 0x163cd6124LL
- */
- .quad 0x0000000163cd6124
- .quad 0x00000000FFFFFFFF
-
- /*
- * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
- *
- * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
- * = 0x1F7011641LL
- * #define CONSTANT_RU 0x1F7011641LL
- */
- .quad 0x00000001DB710641
- .quad 0x00000001F7011641
-
-.Lcrc32c_constants:
- .quad 0x00000000740eef02
- .quad 0x000000009e4addf8
- .quad 0x00000000f20c0dfe
- .quad 0x000000014cd00bd6
- .quad 0x00000000dd45aab8
- .quad 0x00000000FFFFFFFF
- .quad 0x0000000105ec76f0
- .quad 0x00000000dea713f1
-
- dCONSTANTl .req d0
- dCONSTANTh .req d1
- qCONSTANT .req q0
-
- BUF .req r0
- LEN .req r1
- CRC .req r2
-
- qzr .req q9
-
- /**
- * Calculate crc32
- * BUF - buffer
- * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
- * CRC - initial crc32
- * return the resulting crc32 in r0
- * uint crc32_pmull_le(unsigned char const *buffer,
- * size_t len, uint crc32)
- */
-SYM_FUNC_START(crc32_pmull_le)
- adr r3, .Lcrc32_constants
- b 0f
-SYM_FUNC_END(crc32_pmull_le)
-
-SYM_FUNC_START(crc32c_pmull_le)
- adr r3, .Lcrc32c_constants
-
-0: bic LEN, LEN, #15
- vld1.8 {q1-q2}, [BUF, :128]!
- vld1.8 {q3-q4}, [BUF, :128]!
- vmov.i8 qzr, #0
- vmov.i8 qCONSTANT, #0
- vmov.32 dCONSTANTl[0], CRC
- veor.8 d2, d2, dCONSTANTl
- sub LEN, LEN, #0x40
- cmp LEN, #0x40
- blt less_64
-
- vld1.64 {qCONSTANT}, [r3]
-
-loop_64: /* 64 bytes Full cache line folding */
- sub LEN, LEN, #0x40
-
- vmull.p64 q5, d3, dCONSTANTh
- vmull.p64 q6, d5, dCONSTANTh
- vmull.p64 q7, d7, dCONSTANTh
- vmull.p64 q8, d9, dCONSTANTh
-
- vmull.p64 q1, d2, dCONSTANTl
- vmull.p64 q2, d4, dCONSTANTl
- vmull.p64 q3, d6, dCONSTANTl
- vmull.p64 q4, d8, dCONSTANTl
-
- veor.8 q1, q1, q5
- vld1.8 {q5}, [BUF, :128]!
- veor.8 q2, q2, q6
- vld1.8 {q6}, [BUF, :128]!
- veor.8 q3, q3, q7
- vld1.8 {q7}, [BUF, :128]!
- veor.8 q4, q4, q8
- vld1.8 {q8}, [BUF, :128]!
-
- veor.8 q1, q1, q5
- veor.8 q2, q2, q6
- veor.8 q3, q3, q7
- veor.8 q4, q4, q8
-
- cmp LEN, #0x40
- bge loop_64
-
-less_64: /* Folding cache line into 128bit */
- vldr dCONSTANTl, [r3, #16]
- vldr dCONSTANTh, [r3, #24]
-
- vmull.p64 q5, d3, dCONSTANTh
- vmull.p64 q1, d2, dCONSTANTl
- veor.8 q1, q1, q5
- veor.8 q1, q1, q2
-
- vmull.p64 q5, d3, dCONSTANTh
- vmull.p64 q1, d2, dCONSTANTl
- veor.8 q1, q1, q5
- veor.8 q1, q1, q3
-
- vmull.p64 q5, d3, dCONSTANTh
- vmull.p64 q1, d2, dCONSTANTl
- veor.8 q1, q1, q5
- veor.8 q1, q1, q4
-
- teq LEN, #0
- beq fold_64
-
-loop_16: /* Folding rest buffer into 128bit */
- subs LEN, LEN, #0x10
-
- vld1.8 {q2}, [BUF, :128]!
- vmull.p64 q5, d3, dCONSTANTh
- vmull.p64 q1, d2, dCONSTANTl
- veor.8 q1, q1, q5
- veor.8 q1, q1, q2
-
- bne loop_16
-
-fold_64:
- /* perform the last 64 bit fold, also adds 32 zeroes
- * to the input stream */
- vmull.p64 q2, d2, dCONSTANTh
- vext.8 q1, q1, qzr, #8
- veor.8 q1, q1, q2
-
- /* final 32-bit fold */
- vldr dCONSTANTl, [r3, #32]
- vldr d6, [r3, #40]
- vmov.i8 d7, #0
-
- vext.8 q2, q1, qzr, #4
- vand.8 d2, d2, d6
- vmull.p64 q1, d2, dCONSTANTl
- veor.8 q1, q1, q2
-
- /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
- vldr dCONSTANTl, [r3, #48]
- vldr dCONSTANTh, [r3, #56]
-
- vand.8 q2, q1, q3
- vext.8 q2, qzr, q2, #8
- vmull.p64 q2, d5, dCONSTANTh
- vand.8 q2, q2, q3
- vmull.p64 q2, d4, dCONSTANTl
- veor.8 q1, q1, q2
- vmov r0, s5
-
- bx lr
-SYM_FUNC_END(crc32c_pmull_le)
-
- .macro __crc32, c
- subs ip, r2, #8
- bmi .Ltail\c
-
- tst r1, #3
- bne .Lunaligned\c
-
- teq ip, #0
-.Laligned8\c:
- ldrd r2, r3, [r1], #8
-ARM_BE8(rev r2, r2 )
-ARM_BE8(rev r3, r3 )
- crc32\c\()w r0, r0, r2
- crc32\c\()w r0, r0, r3
- bxeq lr
- subs ip, ip, #8
- bpl .Laligned8\c
-
-.Ltail\c:
- tst ip, #4
- beq 2f
- ldr r3, [r1], #4
-ARM_BE8(rev r3, r3 )
- crc32\c\()w r0, r0, r3
-
-2: tst ip, #2
- beq 1f
- ldrh r3, [r1], #2
-ARM_BE8(rev16 r3, r3 )
- crc32\c\()h r0, r0, r3
-
-1: tst ip, #1
- bxeq lr
- ldrb r3, [r1]
- crc32\c\()b r0, r0, r3
- bx lr
-
-.Lunaligned\c:
- tst r1, #1
- beq 2f
- ldrb r3, [r1], #1
- subs r2, r2, #1
- crc32\c\()b r0, r0, r3
-
- tst r1, #2
- beq 0f
-2: ldrh r3, [r1], #2
- subs r2, r2, #2
-ARM_BE8(rev16 r3, r3 )
- crc32\c\()h r0, r0, r3
-
-0: subs ip, r2, #8
- bpl .Laligned8\c
- b .Ltail\c
- .endm
-
- .align 5
-SYM_FUNC_START(crc32_armv8_le)
- __crc32
-SYM_FUNC_END(crc32_armv8_le)
-
- .align 5
-SYM_FUNC_START(crc32c_armv8_le)
- __crc32 c
-SYM_FUNC_END(crc32c_armv8_le)
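
The __crc32 macro above walks the buffer as an optional unaligned head, 8-byte chunks via the crc32(c)w instructions, and a 4/2/1-byte tail; each crc32w/h/b step corresponds to a reflected CRC update over that many bytes of the operand, least-significant byte first (hence the ARM_BE8 rev fixups). Below is a small sketch of that traversal for plain CRC32 (reflected polynomial 0xEDB88320, as in the constants comment); it models the instructions with byte-wise updates, assumes a little-endian host, and the names are illustrative.

/* Sketch: reflected CRC32 as used by crc32_armv8_le, and a check that the
 * word + tail splitting used by the __crc32 macro gives the same result as a
 * byte-at-a-time pass.  Build with: gcc -O2 crc32_model.c
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc32_byte(uint32_t crc, uint8_t b)
{
        crc ^= b;
        for (int i = 0; i < 8; i++)
                crc = (crc & 1) ? (crc >> 1) ^ 0xEDB88320 : crc >> 1;
        return crc;
}

/* Model of a crc32w step: a CRC update over the 4-byte little-endian
 * representation of the operand (crc32h/crc32b are the 2/1-byte analogues). */
static uint32_t crc32_word(uint32_t crc, uint32_t w)
{
        for (int i = 0; i < 4; i++)
                crc = crc32_byte(crc, (w >> (8 * i)) & 0xff);
        return crc;
}

int main(void)
{
        const uint8_t buf[] = "123456789";
        size_t len = strlen((const char *)buf);
        uint32_t crc = ~0u, ref = ~0u;
        uint32_t w0, w1;

        /* byte-at-a-time reference */
        for (size_t i = 0; i < len; i++)
                ref = crc32_byte(ref, buf[i]);

        /* 8 bytes as two little-endian words, then a 1-byte tail, mirroring
         * the .Laligned8/.Ltail structure of the macro */
        memcpy(&w0, buf, 4);            /* assumes a little-endian host */
        memcpy(&w1, buf + 4, 4);
        crc = crc32_word(crc, w0);
        crc = crc32_word(crc, w1);
        crc = crc32_byte(crc, buf[8]);

        assert(crc == ref);
        printf("crc32(\"123456789\") = 0x%08x\n", (unsigned)~ref); /* canonical check value 0xcbf43926 */
        return 0;
}
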
diff --git a/arch/arm/lib/crc32.c b/arch/arm/lib/crc32.c
deleted file mode 100644
index f2bef8849c7c..000000000000
--- a/arch/arm/lib/crc32.c
+++ /dev/null
@@ -1,123 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
- *
- * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/cpufeature.h>
-#include <linux/crc32.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <crypto/internal/simd.h>
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
-
-#define PMULL_MIN_LEN 64 /* min size of buffer for pmull functions */
-
-asmlinkage u32 crc32_pmull_le(const u8 buf[], u32 len, u32 init_crc);
-asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], u32 len);
-
-asmlinkage u32 crc32c_pmull_le(const u8 buf[], u32 len, u32 init_crc);
-asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], u32 len);
-
-static u32 crc32_le_scalar(u32 crc, const u8 *p, size_t len)
-{
- if (static_branch_likely(&have_crc32))
- return crc32_armv8_le(crc, p, len);
- return crc32_le_base(crc, p, len);
-}
-
-u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
-{
- if (len >= PMULL_MIN_LEN + 15 &&
- static_branch_likely(&have_pmull) && crypto_simd_usable()) {
- size_t n = -(uintptr_t)p & 15;
-
- /* align p to 16-byte boundary */
- if (n) {
- crc = crc32_le_scalar(crc, p, n);
- p += n;
- len -= n;
- }
- n = round_down(len, 16);
- kernel_neon_begin();
- crc = crc32_pmull_le(p, n, crc);
- kernel_neon_end();
- p += n;
- len -= n;
- }
- return crc32_le_scalar(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_le_arch);
-
-static u32 crc32c_scalar(u32 crc, const u8 *p, size_t len)
-{
- if (static_branch_likely(&have_crc32))
- return crc32c_armv8_le(crc, p, len);
- return crc32c_base(crc, p, len);
-}
-
-u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
-{
- if (len >= PMULL_MIN_LEN + 15 &&
- static_branch_likely(&have_pmull) && crypto_simd_usable()) {
- size_t n = -(uintptr_t)p & 15;
-
- /* align p to 16-byte boundary */
- if (n) {
- crc = crc32c_scalar(crc, p, n);
- p += n;
- len -= n;
- }
- n = round_down(len, 16);
- kernel_neon_begin();
- crc = crc32c_pmull_le(p, n, crc);
- kernel_neon_end();
- p += n;
- len -= n;
- }
- return crc32c_scalar(crc, p, len);
-}
-EXPORT_SYMBOL(crc32c_arch);
-
-u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
-{
- return crc32_be_base(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_be_arch);
-
-static int __init crc32_arm_init(void)
-{
- if (elf_hwcap2 & HWCAP2_CRC32)
- static_branch_enable(&have_crc32);
- if (elf_hwcap2 & HWCAP2_PMULL)
- static_branch_enable(&have_pmull);
- return 0;
-}
-subsys_initcall(crc32_arm_init);
-
-static void __exit crc32_arm_exit(void)
-{
-}
-module_exit(crc32_arm_exit);
-
-u32 crc32_optimizations(void)
-{
- if (elf_hwcap2 & (HWCAP2_CRC32 | HWCAP2_PMULL))
- return CRC32_LE_OPTIMIZATION | CRC32C_OPTIMIZATION;
- return 0;
-}
-EXPORT_SYMBOL(crc32_optimizations);
-
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_DESCRIPTION("Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions");
-MODULE_LICENSE("GPL v2");
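
crc32_le_arch() and crc32c_arch() above peel off -(uintptr_t)p & 15 head bytes with the scalar routine so the PMULL path only ever sees a 16-byte-aligned pointer and a multiple-of-16 length. A tiny sketch of that alignment idiom (illustrative addresses only):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* -(uintptr_t)p & 15 is the number of bytes from p up to the next
         * 16-byte boundary (0 when p is already aligned). */
        for (uintptr_t p = 0x1000; p < 0x1010; p++) {
                uintptr_t n = -p & 15;

                assert((p + n) % 16 == 0 && n < 16);
                printf("p=%#lx head=%lu\n", (unsigned long)p, (unsigned long)n);
        }
        return 0;
}
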
diff --git a/arch/arm/lib/crypto/.gitignore b/arch/arm/lib/crypto/.gitignore
deleted file mode 100644
index 12d74d8b03d0..000000000000
--- a/arch/arm/lib/crypto/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-poly1305-core.S
-sha256-core.S
diff --git a/arch/arm/lib/crypto/Kconfig b/arch/arm/lib/crypto/Kconfig
deleted file mode 100644
index d1ad664f0c67..000000000000
--- a/arch/arm/lib/crypto/Kconfig
+++ /dev/null
@@ -1,31 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_BLAKE2S_ARM
- bool "Hash functions: BLAKE2s"
- select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
- help
- BLAKE2s cryptographic hash function (RFC 7693)
-
- Architecture: arm
-
- This is faster than the generic implementations of BLAKE2s and
- BLAKE2b, but slower than the NEON implementation of BLAKE2b.
- There is no NEON implementation of BLAKE2s, since NEON doesn't
- really help with it.
-
-config CRYPTO_CHACHA20_NEON
- tristate
- default CRYPTO_LIB_CHACHA
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
-
-config CRYPTO_POLY1305_ARM
- tristate
- default CRYPTO_LIB_POLY1305
- select CRYPTO_ARCH_HAVE_LIB_POLY1305
-
-config CRYPTO_SHA256_ARM
- tristate
- depends on !CPU_V7M
- default CRYPTO_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
diff --git a/arch/arm/lib/crypto/Makefile b/arch/arm/lib/crypto/Makefile
deleted file mode 100644
index 431f77c3ff6f..000000000000
--- a/arch/arm/lib/crypto/Makefile
+++ /dev/null
@@ -1,32 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o
-libblake2s-arm-y := blake2s-core.o blake2s-glue.o
-
-obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
-chacha-neon-y := chacha-scalar-core.o chacha-glue.o
-chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
-
-obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
-poly1305-arm-y := poly1305-core.o poly1305-glue.o
-
-obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
-sha256-arm-y := sha256.o sha256-core.o
-sha256-arm-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o
-
-quiet_cmd_perl = PERL $@
- cmd_perl = $(PERL) $(<) > $(@)
-
-$(obj)/%-core.S: $(src)/%-armv4.pl
- $(call cmd,perl)
-
-clean-files += poly1305-core.S sha256-core.S
-
-aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1
-
-# massage the perlasm code a bit so we only get the NEON routine if we need it
-poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
-poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
-AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y)
-
-AFLAGS_sha256-core.o += $(aflags-thumb2-y)
diff --git a/arch/arm/lib/crypto/blake2s-core.S b/arch/arm/lib/crypto/blake2s-core.S
deleted file mode 100644
index df40e46601f1..000000000000
--- a/arch/arm/lib/crypto/blake2s-core.S
+++ /dev/null
@@ -1,306 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * BLAKE2s digest algorithm, ARM scalar implementation
- *
- * Copyright 2020 Google LLC
- *
- * Author: Eric Biggers <ebiggers@google.com>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- // Registers used to hold message words temporarily. There aren't
- // enough ARM registers to hold the whole message block, so we have to
- // load the words on-demand.
- M_0 .req r12
- M_1 .req r14
-
-// The BLAKE2s initialization vector
-.Lblake2s_IV:
- .word 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
- .word 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
-
-.macro __ldrd a, b, src, offset
-#if __LINUX_ARM_ARCH__ >= 6
- ldrd \a, \b, [\src, #\offset]
-#else
- ldr \a, [\src, #\offset]
- ldr \b, [\src, #\offset + 4]
-#endif
-.endm
-
-.macro __strd a, b, dst, offset
-#if __LINUX_ARM_ARCH__ >= 6
- strd \a, \b, [\dst, #\offset]
-#else
- str \a, [\dst, #\offset]
- str \b, [\dst, #\offset + 4]
-#endif
-.endm
-
-.macro _le32_bswap a, tmp
-#ifdef __ARMEB__
- rev_l \a, \tmp
-#endif
-.endm
-
-.macro _le32_bswap_8x a, b, c, d, e, f, g, h, tmp
- _le32_bswap \a, \tmp
- _le32_bswap \b, \tmp
- _le32_bswap \c, \tmp
- _le32_bswap \d, \tmp
- _le32_bswap \e, \tmp
- _le32_bswap \f, \tmp
- _le32_bswap \g, \tmp
- _le32_bswap \h, \tmp
-.endm
-
-// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
-// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
-// columns/diagonals. s0-s1 are the word offsets to the message words the first
-// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
-// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
-//
-// Note that to save instructions, the rotations don't happen when the
-// pseudocode says they should, but rather they are delayed until the values are
-// used. See the comment above _blake2s_round().
-.macro _blake2s_quarterround a0, b0, c0, d0, a1, b1, c1, d1, s0, s1, s2, s3
-
- ldr M_0, [sp, #32 + 4 * \s0]
- ldr M_1, [sp, #32 + 4 * \s2]
-
- // a += b + m[blake2s_sigma[r][2*i + 0]];
- add \a0, \a0, \b0, ror #brot
- add \a1, \a1, \b1, ror #brot
- add \a0, \a0, M_0
- add \a1, \a1, M_1
-
- // d = ror32(d ^ a, 16);
- eor \d0, \a0, \d0, ror #drot
- eor \d1, \a1, \d1, ror #drot
-
- // c += d;
- add \c0, \c0, \d0, ror #16
- add \c1, \c1, \d1, ror #16
-
- // b = ror32(b ^ c, 12);
- eor \b0, \c0, \b0, ror #brot
- eor \b1, \c1, \b1, ror #brot
-
- ldr M_0, [sp, #32 + 4 * \s1]
- ldr M_1, [sp, #32 + 4 * \s3]
-
- // a += b + m[blake2s_sigma[r][2*i + 1]];
- add \a0, \a0, \b0, ror #12
- add \a1, \a1, \b1, ror #12
- add \a0, \a0, M_0
- add \a1, \a1, M_1
-
- // d = ror32(d ^ a, 8);
- eor \d0, \a0, \d0, ror#16
- eor \d1, \a1, \d1, ror#16
-
- // c += d;
- add \c0, \c0, \d0, ror#8
- add \c1, \c1, \d1, ror#8
-
- // b = ror32(b ^ c, 7);
- eor \b0, \c0, \b0, ror#12
- eor \b1, \c1, \b1, ror#12
-.endm
-
-// Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9]
-// are in r0..r9. The stack pointer points to 8 bytes of scratch space for
-// spilling v[8..9], then to v[9..15], then to the message block. r10-r12 and
-// r14 are free to use. The macro arguments s0-s15 give the order in which the
-// message words are used in this round.
-//
-// All rotates are performed using the implicit rotate operand accepted by the
-// 'add' and 'eor' instructions. This is faster than using explicit rotate
-// instructions. To make this work, we allow the values in the second and last
-// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
-// wrong rotation amount. The rotation amount is then fixed up just in time
-// when the values are used. 'brot' is the number of bits the values in row 'b'
-// need to be rotated right to arrive at the correct values, and 'drot'
-// similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
-// that they end up as (7, 8) after every round.
-.macro _blake2s_round s0, s1, s2, s3, s4, s5, s6, s7, \
- s8, s9, s10, s11, s12, s13, s14, s15
-
- // Mix first two columns:
- // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
- __ldrd r10, r11, sp, 16 // load v[12] and v[13]
- _blake2s_quarterround r0, r4, r8, r10, r1, r5, r9, r11, \
- \s0, \s1, \s2, \s3
- __strd r8, r9, sp, 0
- __strd r10, r11, sp, 16
-
- // Mix second two columns:
- // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
- __ldrd r8, r9, sp, 8 // load v[10] and v[11]
- __ldrd r10, r11, sp, 24 // load v[14] and v[15]
- _blake2s_quarterround r2, r6, r8, r10, r3, r7, r9, r11, \
- \s4, \s5, \s6, \s7
- str r10, [sp, #24] // store v[14]
- // v[10], v[11], and v[15] are used below, so no need to store them yet.
-
- .set brot, 7
- .set drot, 8
-
- // Mix first two diagonals:
- // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
- ldr r10, [sp, #16] // load v[12]
- _blake2s_quarterround r0, r5, r8, r11, r1, r6, r9, r10, \
- \s8, \s9, \s10, \s11
- __strd r8, r9, sp, 8
- str r11, [sp, #28]
- str r10, [sp, #16]
-
- // Mix second two diagonals:
- // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
- __ldrd r8, r9, sp, 0 // load v[8] and v[9]
- __ldrd r10, r11, sp, 20 // load v[13] and v[14]
- _blake2s_quarterround r2, r7, r8, r10, r3, r4, r9, r11, \
- \s12, \s13, \s14, \s15
- __strd r10, r11, sp, 20
-.endm
-
-//
-// void blake2s_compress(struct blake2s_state *state,
-// const u8 *block, size_t nblocks, u32 inc);
-//
-// Only the first three fields of struct blake2s_state are used:
-// u32 h[8]; (inout)
-// u32 t[2]; (inout)
-// u32 f[2]; (in)
-//
- .align 5
-ENTRY(blake2s_compress)
- push {r0-r2,r4-r11,lr} // keep this an even number
-
-.Lnext_block:
- // r0 is 'state'
- // r1 is 'block'
- // r3 is 'inc'
-
- // Load and increment the counter t[0..1].
- __ldrd r10, r11, r0, 32
- adds r10, r10, r3
- adc r11, r11, #0
- __strd r10, r11, r0, 32
-
- // _blake2s_round is very short on registers, so copy the message block
- // to the stack to save a register during the rounds. This also has the
- // advantage that misalignment only needs to be dealt with in one place.
- sub sp, sp, #64
- mov r12, sp
- tst r1, #3
- bne .Lcopy_block_misaligned
- ldmia r1!, {r2-r9}
- _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14
- stmia r12!, {r2-r9}
- ldmia r1!, {r2-r9}
- _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14
- stmia r12, {r2-r9}
-.Lcopy_block_done:
- str r1, [sp, #68] // Update message pointer
-
- // Calculate v[8..15]. Push v[9..15] onto the stack, and leave space
- // for spilling v[8..9]. Leave v[8..9] in r8-r9.
- mov r14, r0 // r14 = state
- adr r12, .Lblake2s_IV
- ldmia r12!, {r8-r9} // load IV[0..1]
- __ldrd r0, r1, r14, 40 // load f[0..1]
- ldm r12, {r2-r7} // load IV[3..7]
- eor r4, r4, r10 // v[12] = IV[4] ^ t[0]
- eor r5, r5, r11 // v[13] = IV[5] ^ t[1]
- eor r6, r6, r0 // v[14] = IV[6] ^ f[0]
- eor r7, r7, r1 // v[15] = IV[7] ^ f[1]
- push {r2-r7} // push v[9..15]
- sub sp, sp, #8 // leave space for v[8..9]
-
- // Load h[0..7] == v[0..7].
- ldm r14, {r0-r7}
-
- // Execute the rounds. Each round is provided the order in which it
- // needs to use the message words.
- .set brot, 0
- .set drot, 0
- _blake2s_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
- _blake2s_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
- _blake2s_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
- _blake2s_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
- _blake2s_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
- _blake2s_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
- _blake2s_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
- _blake2s_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
- _blake2s_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
- _blake2s_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
-
- // Fold the final state matrix into the hash chaining value:
- //
- // for (i = 0; i < 8; i++)
- // h[i] ^= v[i] ^ v[i + 8];
- //
- ldr r14, [sp, #96] // r14 = &h[0]
- add sp, sp, #8 // v[8..9] are already loaded.
- pop {r10-r11} // load v[10..11]
- eor r0, r0, r8
- eor r1, r1, r9
- eor r2, r2, r10
- eor r3, r3, r11
- ldm r14, {r8-r11} // load h[0..3]
- eor r0, r0, r8
- eor r1, r1, r9
- eor r2, r2, r10
- eor r3, r3, r11
- stmia r14!, {r0-r3} // store new h[0..3]
- ldm r14, {r0-r3} // load old h[4..7]
- pop {r8-r11} // load v[12..15]
- eor r0, r0, r4, ror #brot
- eor r1, r1, r5, ror #brot
- eor r2, r2, r6, ror #brot
- eor r3, r3, r7, ror #brot
- eor r0, r0, r8, ror #drot
- eor r1, r1, r9, ror #drot
- eor r2, r2, r10, ror #drot
- eor r3, r3, r11, ror #drot
- add sp, sp, #64 // skip copy of message block
- stm r14, {r0-r3} // store new h[4..7]
-
- // Advance to the next block, if there is one. Note that if there are
- // multiple blocks, then 'inc' (the counter increment amount) must be
- // 64. So we can simply set it to 64 without re-loading it.
- ldm sp, {r0, r1, r2} // load (state, block, nblocks)
- mov r3, #64 // set 'inc'
- subs r2, r2, #1 // nblocks--
- str r2, [sp, #8]
- bne .Lnext_block // nblocks != 0?
-
- pop {r0-r2,r4-r11,pc}
-
- // The next message block (pointed to by r1) isn't 4-byte aligned, so it
- // can't be loaded using ldmia. Copy it to the stack buffer (pointed to
- // by r12) using an alternative method. r2-r9 are free to use.
-.Lcopy_block_misaligned:
- mov r2, #64
-1:
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
- ldr r3, [r1], #4
- _le32_bswap r3, r4
-#else
- ldrb r3, [r1, #0]
- ldrb r4, [r1, #1]
- ldrb r5, [r1, #2]
- ldrb r6, [r1, #3]
- add r1, r1, #4
- orr r3, r3, r4, lsl #8
- orr r3, r3, r5, lsl #16
- orr r3, r3, r6, lsl #24
-#endif
- subs r2, r2, #4
- str r3, [r12], #4
- bne 1b
- b .Lcopy_block_done
-ENDPROC(blake2s_compress)
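For reference, the quarterround that the deleted _blake2s_quarterround macro encodes (with its deferred rotations) is the standard BLAKE2s G function. Below is a minimal, self-contained C sketch of the plain reference form, for readers following the assembly; it is illustrative only and not part of the kernel sources.

#include <stdint.h>
#include <stdio.h>

static inline uint32_t ror32(uint32_t v, int n)
{
        return (v >> n) | (v << (32 - n));
}

/*
 * One BLAKE2s quarterround ("G"), written as the comments in the deleted
 * assembly describe it, without the deferred-rotation optimization.
 * m0 and m1 are the two message words selected by the sigma permutation.
 */
static void blake2s_g(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
                      uint32_t m0, uint32_t m1)
{
        *a += *b + m0;
        *d = ror32(*d ^ *a, 16);
        *c += *d;
        *b = ror32(*b ^ *c, 12);
        *a += *b + m1;
        *d = ror32(*d ^ *a, 8);
        *c += *d;
        *b = ror32(*b ^ *c, 7);
}

int main(void)
{
        /* Arbitrary inputs, just to exercise the function. */
        uint32_t a = 0x6A09E667, b = 0xBB67AE85, c = 0x3C6EF372, d = 0xA54FF53A;

        blake2s_g(&a, &b, &c, &d, 0x01234567, 0x89ABCDEF);
        printf("%08x %08x %08x %08x\n", a, b, c, d);
        return 0;
}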
diff --git a/arch/arm/lib/crypto/blake2s-glue.c b/arch/arm/lib/crypto/blake2s-glue.c
deleted file mode 100644
index 0238a70d9581..000000000000
--- a/arch/arm/lib/crypto/blake2s-glue.c
+++ /dev/null
@@ -1,7 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#include <crypto/internal/blake2s.h>
-#include <linux/module.h>
-
-/* defined in blake2s-core.S */
-EXPORT_SYMBOL(blake2s_compress);
diff --git a/arch/arm/lib/crypto/chacha-glue.c b/arch/arm/lib/crypto/chacha-glue.c
deleted file mode 100644
index 88ec96415283..000000000000
--- a/arch/arm/lib/crypto/chacha-glue.c
+++ /dev/null
@@ -1,138 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * ChaCha and HChaCha functions (ARM optimized)
- *
- * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <crypto/chacha.h>
-#include <crypto/internal/simd.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include <asm/cputype.h>
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
- u8 *dst, const u8 *src, int nrounds);
-asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- int nrounds, unsigned int nbytes);
-asmlinkage void hchacha_block_arm(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds);
-asmlinkage void hchacha_block_neon(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds);
-
-asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
- const struct chacha_state *state, int nrounds);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);
-
-static inline bool neon_usable(void)
-{
- return static_branch_likely(&use_neon) && crypto_simd_usable();
-}
-
-static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- u8 buf[CHACHA_BLOCK_SIZE];
-
- while (bytes > CHACHA_BLOCK_SIZE) {
- unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
-
- chacha_4block_xor_neon(state, dst, src, nrounds, l);
- bytes -= l;
- src += l;
- dst += l;
- state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
- }
- if (bytes) {
- const u8 *s = src;
- u8 *d = dst;
-
- if (bytes != CHACHA_BLOCK_SIZE)
- s = d = memcpy(buf, src, bytes);
- chacha_block_xor_neon(state, d, s, nrounds);
- if (d != dst)
- memcpy(dst, buf, bytes);
- state->x[12]++;
- }
-}
-
-void hchacha_block_arch(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds)
-{
- if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
- hchacha_block_arm(state, out, nrounds);
- } else {
- kernel_neon_begin();
- hchacha_block_neon(state, out, nrounds);
- kernel_neon_end();
- }
-}
-EXPORT_SYMBOL(hchacha_block_arch);
-
-void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
- bytes <= CHACHA_BLOCK_SIZE) {
- chacha_doarm(dst, src, bytes, state, nrounds);
- state->x[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
- return;
- }
-
- do {
- unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
-
- kernel_neon_begin();
- chacha_doneon(state, dst, src, todo, nrounds);
- kernel_neon_end();
-
- bytes -= todo;
- src += todo;
- dst += todo;
- } while (bytes);
-}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-bool chacha_is_arch_optimized(void)
-{
- /* We always can use at least the ARM scalar implementation. */
- return true;
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-
-static int __init chacha_arm_mod_init(void)
-{
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
- switch (read_cpuid_part()) {
- case ARM_CPU_PART_CORTEX_A7:
- case ARM_CPU_PART_CORTEX_A5:
- /*
-			 * The Cortex-A7 and Cortex-A5 do not perform well with
-			 * the NEON implementation, but they do incredibly well
-			 * with the scalar one and use less power.
- */
- break;
- default:
- static_branch_enable(&use_neon);
- }
- }
- return 0;
-}
-subsys_initcall(chacha_arm_mod_init);
-
-static void __exit chacha_arm_mod_exit(void)
-{
-}
-module_exit(chacha_arm_mod_exit);
-
-MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM optimized)");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
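The deleted glue code above keeps each kernel_neon_begin()/kernel_neon_end() section bounded to at most 4 KiB of input and advances the 32-bit block counter (state word 12) by the number of 64-byte blocks consumed. The following is a rough userspace C sketch of that bookkeeping only; process_chunk() is a hypothetical stand-in for the real keystream/XOR routine, and the names are made up for illustration.

#include <stdint.h>
#include <stdio.h>

#define CHACHA_BLOCK_SIZE 64
#define CHUNK_MAX         4096      /* mirrors the SZ_4K bound used above */

/* Hypothetical stand-in: the real code XORs the data with keystream here. */
static void process_chunk(const uint8_t *src, uint8_t *dst, unsigned int len)
{
        (void)src; (void)dst; (void)len;
}

/*
 * Process 'bytes' of input in bounded chunks, advancing the block counter
 * in state[12] by the number of 64-byte blocks each chunk consumes.
 */
static void crypt_chunked(uint32_t state[16], uint8_t *dst,
                          const uint8_t *src, unsigned int bytes)
{
        while (bytes) {
                unsigned int todo = bytes < CHUNK_MAX ? bytes : CHUNK_MAX;

                process_chunk(src, dst, todo);
                state[12] += (todo + CHACHA_BLOCK_SIZE - 1) / CHACHA_BLOCK_SIZE;
                src += todo;
                dst += todo;
                bytes -= todo;
        }
}

int main(void)
{
        static uint8_t buf[10000];
        uint32_t state[16] = { 0 };

        crypt_chunked(state, buf, buf, sizeof(buf));
        printf("blocks consumed: %u\n", (unsigned int)state[12]);  /* 157 */
        return 0;
}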
diff --git a/arch/arm/lib/crypto/chacha-neon-core.S b/arch/arm/lib/crypto/chacha-neon-core.S
deleted file mode 100644
index ddd62b6294a5..000000000000
--- a/arch/arm/lib/crypto/chacha-neon-core.S
+++ /dev/null
@@ -1,643 +0,0 @@
-/*
- * ChaCha/HChaCha NEON helper functions
- *
- * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
- /*
- * NEON doesn't have a rotate instruction. The alternatives are, more or less:
- *
- * (a) vshl.u32 + vsri.u32 (needs temporary register)
- * (b) vshl.u32 + vshr.u32 + vorr (needs temporary register)
- * (c) vrev32.16 (16-bit rotations only)
- * (d) vtbl.8 + vtbl.8 (multiple of 8 bits rotations only,
- * needs index vector)
- *
- * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations,
- * the only choices are (a) and (b). We use (a) since it takes two-thirds the
- * cycles of (b) on both Cortex-A7 and Cortex-A53.
- *
- * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
- * and doesn't need a temporary register.
- *
- * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence
- * is twice as fast as (a), even when doing (a) on multiple registers
- * simultaneously to eliminate the stall between vshl and vsri. Also, it
- * parallelizes better when temporary registers are scarce.
- *
- * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
- * (a), so the need to load the rotation table actually makes the vtbl method
- * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it
- * seems to be a good compromise to get a more significant speed boost on some
- * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
- */
-
-#include <linux/linkage.h>
-#include <asm/cache.h>
-
- .text
- .fpu neon
- .align 5
-
-/*
- * chacha_permute - permute one block
- *
- * Permute one 64-byte block where the state matrix is stored in the four NEON
- * registers q0-q3. It performs matrix operations on four words in parallel,
- * but requires shuffling to rearrange the words after each round.
- *
- * The round count is given in r3.
- *
- * Clobbers: r3, ip, q4-q5
- */
-chacha_permute:
-
- adr ip, .Lrol8_table
- vld1.8 {d10}, [ip, :64]
-
-.Ldoubleround:
- // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vadd.i32 q0, q0, q1
- veor q3, q3, q0
- vrev32.16 q3, q3
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vadd.i32 q2, q2, q3
- veor q4, q1, q2
- vshl.u32 q1, q4, #12
- vsri.u32 q1, q4, #20
-
- // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vadd.i32 q0, q0, q1
- veor q3, q3, q0
- vtbl.8 d6, {d6}, d10
- vtbl.8 d7, {d7}, d10
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vadd.i32 q2, q2, q3
- veor q4, q1, q2
- vshl.u32 q1, q4, #7
- vsri.u32 q1, q4, #25
-
- // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- vext.8 q1, q1, q1, #4
- // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vext.8 q2, q2, q2, #8
- // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- vext.8 q3, q3, q3, #12
-
- // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vadd.i32 q0, q0, q1
- veor q3, q3, q0
- vrev32.16 q3, q3
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vadd.i32 q2, q2, q3
- veor q4, q1, q2
- vshl.u32 q1, q4, #12
- vsri.u32 q1, q4, #20
-
- // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vadd.i32 q0, q0, q1
- veor q3, q3, q0
- vtbl.8 d6, {d6}, d10
- vtbl.8 d7, {d7}, d10
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vadd.i32 q2, q2, q3
- veor q4, q1, q2
- vshl.u32 q1, q4, #7
- vsri.u32 q1, q4, #25
-
- // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- vext.8 q1, q1, q1, #12
- // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vext.8 q2, q2, q2, #8
- // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- vext.8 q3, q3, q3, #4
-
- subs r3, r3, #2
- bne .Ldoubleround
-
- bx lr
-ENDPROC(chacha_permute)
-
-ENTRY(chacha_block_xor_neon)
- // r0: Input state matrix, s
- // r1: 1 data block output, o
- // r2: 1 data block input, i
- // r3: nrounds
- push {lr}
-
- // x0..3 = s0..3
- add ip, r0, #0x20
- vld1.32 {q0-q1}, [r0]
- vld1.32 {q2-q3}, [ip]
-
- vmov q8, q0
- vmov q9, q1
- vmov q10, q2
- vmov q11, q3
-
- bl chacha_permute
-
- add ip, r2, #0x20
- vld1.8 {q4-q5}, [r2]
- vld1.8 {q6-q7}, [ip]
-
- // o0 = i0 ^ (x0 + s0)
- vadd.i32 q0, q0, q8
- veor q0, q0, q4
-
- // o1 = i1 ^ (x1 + s1)
- vadd.i32 q1, q1, q9
- veor q1, q1, q5
-
- // o2 = i2 ^ (x2 + s2)
- vadd.i32 q2, q2, q10
- veor q2, q2, q6
-
- // o3 = i3 ^ (x3 + s3)
- vadd.i32 q3, q3, q11
- veor q3, q3, q7
-
- add ip, r1, #0x20
- vst1.8 {q0-q1}, [r1]
- vst1.8 {q2-q3}, [ip]
-
- pop {pc}
-ENDPROC(chacha_block_xor_neon)
-
-ENTRY(hchacha_block_neon)
- // r0: Input state matrix, s
- // r1: output (8 32-bit words)
- // r2: nrounds
- push {lr}
-
- vld1.32 {q0-q1}, [r0]!
- vld1.32 {q2-q3}, [r0]
-
- mov r3, r2
- bl chacha_permute
-
- vst1.32 {q0}, [r1]!
- vst1.32 {q3}, [r1]
-
- pop {pc}
-ENDPROC(hchacha_block_neon)
-
- .align 4
-.Lctrinc: .word 0, 1, 2, 3
-.Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6
-
- .align 5
-ENTRY(chacha_4block_xor_neon)
- push {r4, lr}
- mov r4, sp // preserve the stack pointer
- sub ip, sp, #0x20 // allocate a 32 byte buffer
- bic ip, ip, #0x1f // aligned to 32 bytes
- mov sp, ip
-
- // r0: Input state matrix, s
- // r1: 4 data blocks output, o
- // r2: 4 data blocks input, i
- // r3: nrounds
-
- //
- // This function encrypts four consecutive ChaCha blocks by loading
- // the state matrix in NEON registers four times. The algorithm performs
- // each operation on the corresponding word of each state matrix, hence
- // requires no word shuffling. The words are re-interleaved before the
- // final addition of the original state and the XORing step.
- //
-
- // x0..15[0-3] = s0..15[0-3]
- add ip, r0, #0x20
- vld1.32 {q0-q1}, [r0]
- vld1.32 {q2-q3}, [ip]
-
- adr lr, .Lctrinc
- vdup.32 q15, d7[1]
- vdup.32 q14, d7[0]
- vld1.32 {q4}, [lr, :128]
- vdup.32 q13, d6[1]
- vdup.32 q12, d6[0]
- vdup.32 q11, d5[1]
- vdup.32 q10, d5[0]
- vadd.u32 q12, q12, q4 // x12 += counter values 0-3
- vdup.32 q9, d4[1]
- vdup.32 q8, d4[0]
- vdup.32 q7, d3[1]
- vdup.32 q6, d3[0]
- vdup.32 q5, d2[1]
- vdup.32 q4, d2[0]
- vdup.32 q3, d1[1]
- vdup.32 q2, d1[0]
- vdup.32 q1, d0[1]
- vdup.32 q0, d0[0]
-
- adr ip, .Lrol8_table
- b 1f
-
-.Ldoubleround4:
- vld1.32 {q8-q9}, [sp, :256]
-1:
- // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
- // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
- // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
- // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
- vadd.i32 q0, q0, q4
- vadd.i32 q1, q1, q5
- vadd.i32 q2, q2, q6
- vadd.i32 q3, q3, q7
-
- veor q12, q12, q0
- veor q13, q13, q1
- veor q14, q14, q2
- veor q15, q15, q3
-
- vrev32.16 q12, q12
- vrev32.16 q13, q13
- vrev32.16 q14, q14
- vrev32.16 q15, q15
-
- // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
- // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
- // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
- // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
- vadd.i32 q8, q8, q12
- vadd.i32 q9, q9, q13
- vadd.i32 q10, q10, q14
- vadd.i32 q11, q11, q15
-
- vst1.32 {q8-q9}, [sp, :256]
-
- veor q8, q4, q8
- veor q9, q5, q9
- vshl.u32 q4, q8, #12
- vshl.u32 q5, q9, #12
- vsri.u32 q4, q8, #20
- vsri.u32 q5, q9, #20
-
- veor q8, q6, q10
- veor q9, q7, q11
- vshl.u32 q6, q8, #12
- vshl.u32 q7, q9, #12
- vsri.u32 q6, q8, #20
- vsri.u32 q7, q9, #20
-
- // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
- // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
- // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
- // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
- vld1.8 {d16}, [ip, :64]
- vadd.i32 q0, q0, q4
- vadd.i32 q1, q1, q5
- vadd.i32 q2, q2, q6
- vadd.i32 q3, q3, q7
-
- veor q12, q12, q0
- veor q13, q13, q1
- veor q14, q14, q2
- veor q15, q15, q3
-
- vtbl.8 d24, {d24}, d16
- vtbl.8 d25, {d25}, d16
- vtbl.8 d26, {d26}, d16
- vtbl.8 d27, {d27}, d16
- vtbl.8 d28, {d28}, d16
- vtbl.8 d29, {d29}, d16
- vtbl.8 d30, {d30}, d16
- vtbl.8 d31, {d31}, d16
-
- vld1.32 {q8-q9}, [sp, :256]
-
- // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
- // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
- // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
- // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
- vadd.i32 q8, q8, q12
- vadd.i32 q9, q9, q13
- vadd.i32 q10, q10, q14
- vadd.i32 q11, q11, q15
-
- vst1.32 {q8-q9}, [sp, :256]
-
- veor q8, q4, q8
- veor q9, q5, q9
- vshl.u32 q4, q8, #7
- vshl.u32 q5, q9, #7
- vsri.u32 q4, q8, #25
- vsri.u32 q5, q9, #25
-
- veor q8, q6, q10
- veor q9, q7, q11
- vshl.u32 q6, q8, #7
- vshl.u32 q7, q9, #7
- vsri.u32 q6, q8, #25
- vsri.u32 q7, q9, #25
-
- vld1.32 {q8-q9}, [sp, :256]
-
- // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
- // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
- // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
- // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
- vadd.i32 q0, q0, q5
- vadd.i32 q1, q1, q6
- vadd.i32 q2, q2, q7
- vadd.i32 q3, q3, q4
-
- veor q15, q15, q0
- veor q12, q12, q1
- veor q13, q13, q2
- veor q14, q14, q3
-
- vrev32.16 q15, q15
- vrev32.16 q12, q12
- vrev32.16 q13, q13
- vrev32.16 q14, q14
-
- // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
- // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
- // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
- // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
- vadd.i32 q10, q10, q15
- vadd.i32 q11, q11, q12
- vadd.i32 q8, q8, q13
- vadd.i32 q9, q9, q14
-
- vst1.32 {q8-q9}, [sp, :256]
-
- veor q8, q7, q8
- veor q9, q4, q9
- vshl.u32 q7, q8, #12
- vshl.u32 q4, q9, #12
- vsri.u32 q7, q8, #20
- vsri.u32 q4, q9, #20
-
- veor q8, q5, q10
- veor q9, q6, q11
- vshl.u32 q5, q8, #12
- vshl.u32 q6, q9, #12
- vsri.u32 q5, q8, #20
- vsri.u32 q6, q9, #20
-
- // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
- // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
- // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
- // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
- vld1.8 {d16}, [ip, :64]
- vadd.i32 q0, q0, q5
- vadd.i32 q1, q1, q6
- vadd.i32 q2, q2, q7
- vadd.i32 q3, q3, q4
-
- veor q15, q15, q0
- veor q12, q12, q1
- veor q13, q13, q2
- veor q14, q14, q3
-
- vtbl.8 d30, {d30}, d16
- vtbl.8 d31, {d31}, d16
- vtbl.8 d24, {d24}, d16
- vtbl.8 d25, {d25}, d16
- vtbl.8 d26, {d26}, d16
- vtbl.8 d27, {d27}, d16
- vtbl.8 d28, {d28}, d16
- vtbl.8 d29, {d29}, d16
-
- vld1.32 {q8-q9}, [sp, :256]
-
- // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
- // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
- // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
- // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
- vadd.i32 q10, q10, q15
- vadd.i32 q11, q11, q12
- vadd.i32 q8, q8, q13
- vadd.i32 q9, q9, q14
-
- vst1.32 {q8-q9}, [sp, :256]
-
- veor q8, q7, q8
- veor q9, q4, q9
- vshl.u32 q7, q8, #7
- vshl.u32 q4, q9, #7
- vsri.u32 q7, q8, #25
- vsri.u32 q4, q9, #25
-
- veor q8, q5, q10
- veor q9, q6, q11
- vshl.u32 q5, q8, #7
- vshl.u32 q6, q9, #7
- vsri.u32 q5, q8, #25
- vsri.u32 q6, q9, #25
-
- subs r3, r3, #2
- bne .Ldoubleround4
-
- // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
- // x8..9[0-3] are on the stack.
-
- // Re-interleave the words in the first two rows of each block (x0..7).
- // Also add the counter values 0-3 to x12[0-3].
- vld1.32 {q8}, [lr, :128] // load counter values 0-3
- vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1)
- vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3)
- vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5)
- vzip.32 q6, q7 // => (6 7 6 7) (6 7 6 7)
- vadd.u32 q12, q8 // x12 += counter values 0-3
- vswp d1, d4
- vswp d3, d6
- vld1.32 {q8-q9}, [r0]! // load s0..7
- vswp d9, d12
- vswp d11, d14
-
- // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
- // after XORing the first 32 bytes.
- vswp q1, q4
-
- // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)
-
- // x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block)
- vadd.u32 q0, q0, q8
- vadd.u32 q2, q2, q8
- vadd.u32 q4, q4, q8
- vadd.u32 q3, q3, q8
-
- // x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block)
- vadd.u32 q1, q1, q9
- vadd.u32 q6, q6, q9
- vadd.u32 q5, q5, q9
- vadd.u32 q7, q7, q9
-
- // XOR first 32 bytes using keystream from first two rows of first block
- vld1.8 {q8-q9}, [r2]!
- veor q8, q8, q0
- veor q9, q9, q1
- vst1.8 {q8-q9}, [r1]!
-
- // Re-interleave the words in the last two rows of each block (x8..15).
- vld1.32 {q8-q9}, [sp, :256]
- mov sp, r4 // restore original stack pointer
- ldr r4, [r4, #8] // load number of bytes
- vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13)
- vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15)
- vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9)
- vzip.32 q10, q11 // => (10 11 10 11) (10 11 10 11)
- vld1.32 {q0-q1}, [r0] // load s8..15
- vswp d25, d28
- vswp d27, d30
- vswp d17, d20
- vswp d19, d22
-
- // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
-
- // x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block)
- vadd.u32 q8, q8, q0
- vadd.u32 q10, q10, q0
- vadd.u32 q9, q9, q0
- vadd.u32 q11, q11, q0
-
- // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
- vadd.u32 q12, q12, q1
- vadd.u32 q14, q14, q1
- vadd.u32 q13, q13, q1
- vadd.u32 q15, q15, q1
-
- // XOR the rest of the data with the keystream
-
- vld1.8 {q0-q1}, [r2]!
- subs r4, r4, #96
- veor q0, q0, q8
- veor q1, q1, q12
- ble .Lle96
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
- subs r4, r4, #32
- veor q0, q0, q2
- veor q1, q1, q6
- ble .Lle128
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
- subs r4, r4, #32
- veor q0, q0, q10
- veor q1, q1, q14
- ble .Lle160
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
- subs r4, r4, #32
- veor q0, q0, q4
- veor q1, q1, q5
- ble .Lle192
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
- subs r4, r4, #32
- veor q0, q0, q9
- veor q1, q1, q13
- ble .Lle224
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
- subs r4, r4, #32
- veor q0, q0, q3
- veor q1, q1, q7
- blt .Llt256
-.Lout:
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]
- veor q0, q0, q11
- veor q1, q1, q15
- vst1.8 {q0-q1}, [r1]
-
- pop {r4, pc}
-
-.Lle192:
- vmov q4, q9
- vmov q5, q13
-
-.Lle160:
- // nothing to do
-
-.Lfinalblock:
- // Process the final block if processing less than 4 full blocks.
- // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
- // previous 32 byte output block that still needs to be written at
- // [r1] in q0-q1.
- beq .Lfullblock
-
-.Lpartialblock:
- adr lr, .Lpermute + 32
- add r2, r2, r4
- add lr, lr, r4
- add r4, r4, r1
-
- vld1.8 {q2-q3}, [lr]
- vld1.8 {q6-q7}, [r2]
-
- add r4, r4, #32
-
- vtbl.8 d4, {q4-q5}, d4
- vtbl.8 d5, {q4-q5}, d5
- vtbl.8 d6, {q4-q5}, d6
- vtbl.8 d7, {q4-q5}, d7
-
- veor q6, q6, q2
- veor q7, q7, q3
-
- vst1.8 {q6-q7}, [r4] // overlapping stores
- vst1.8 {q0-q1}, [r1]
- pop {r4, pc}
-
-.Lfullblock:
- vmov q11, q4
- vmov q15, q5
- b .Lout
-.Lle96:
- vmov q4, q2
- vmov q5, q6
- b .Lfinalblock
-.Lle128:
- vmov q4, q10
- vmov q5, q14
- b .Lfinalblock
-.Lle224:
- vmov q4, q3
- vmov q5, q7
- b .Lfinalblock
-.Llt256:
- vmov q4, q11
- vmov q5, q15
- b .Lpartialblock
-ENDPROC(chacha_4block_xor_neon)
-
- .align L1_CACHE_SHIFT
-.Lpermute:
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
- .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
- .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
- .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
- .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
- .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
- .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
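The rotation amounts that the deleted NEON code implements with vrev32.16 (16 bits), vshl.u32 + vsri.u32 (12 and 7 bits) and vtbl.8 (8 bits) are those of the standard ChaCha quarter round. As a minimal C reference for one double round, assuming the usual "expand 32-byte k" constants in the first row of the state, something like the sketch below applies (illustrative only, not the kernel code):

#include <stdint.h>
#include <stdio.h>

static inline uint32_t rol32(uint32_t v, int n)
{
        return (v << n) | (v >> (32 - n));
}

/* Reference ChaCha quarter round: rotations by 16, 12, 8 and 7 bits. */
static void chacha_qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
        *a += *b; *d = rol32(*d ^ *a, 16);
        *c += *d; *b = rol32(*b ^ *c, 12);
        *a += *b; *d = rol32(*d ^ *a, 8);
        *c += *d; *b = rol32(*b ^ *c, 7);
}

int main(void)
{
        /* First row holds the ChaCha constants; key/counter/nonce left zero. */
        uint32_t x[16] = { 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 };
        int i;

        /* One double round: four column rounds, then four diagonal rounds. */
        for (i = 0; i < 4; i++)
                chacha_qr(&x[i], &x[4 + i], &x[8 + i], &x[12 + i]);
        chacha_qr(&x[0], &x[5], &x[10], &x[15]);
        chacha_qr(&x[1], &x[6], &x[11], &x[12]);
        chacha_qr(&x[2], &x[7], &x[8],  &x[13]);
        chacha_qr(&x[3], &x[4], &x[9],  &x[14]);

        printf("x[0] after one double round: %08x\n", x[0]);
        return 0;
}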
diff --git a/arch/arm/lib/crypto/chacha-scalar-core.S b/arch/arm/lib/crypto/chacha-scalar-core.S
deleted file mode 100644
index 4951df05c158..000000000000
--- a/arch/arm/lib/crypto/chacha-scalar-core.S
+++ /dev/null
@@ -1,444 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2018 Google, Inc.
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-/*
- * Design notes:
- *
- * 16 registers would be needed to hold the state matrix, but only 14 are
- * available because 'sp' and 'pc' cannot be used. So we spill the elements
- * (x8, x9) to the stack and swap them out with (x10, x11). This adds one
- * 'ldrd' and one 'strd' instruction per round.
- *
- * All rotates are performed using the implicit rotate operand accepted by the
- * 'add' and 'eor' instructions. This is faster than using explicit rotate
- * instructions. To make this work, we allow the values in the second and last
- * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
- * wrong rotation amount. The rotation amount is then fixed up just in time
- * when the values are used. 'brot' is the number of bits the values in row 'b'
- * need to be rotated right to arrive at the correct values, and 'drot'
- * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
- * that they end up as (25, 24) after every round.
- */
-
- // ChaCha state registers
- X0 .req r0
- X1 .req r1
- X2 .req r2
- X3 .req r3
- X4 .req r4
- X5 .req r5
- X6 .req r6
- X7 .req r7
- X8_X10 .req r8 // shared by x8 and x10
- X9_X11 .req r9 // shared by x9 and x11
- X12 .req r10
- X13 .req r11
- X14 .req r12
- X15 .req r14
-
-.macro _le32_bswap_4x a, b, c, d, tmp
-#ifdef __ARMEB__
- rev_l \a, \tmp
- rev_l \b, \tmp
- rev_l \c, \tmp
- rev_l \d, \tmp
-#endif
-.endm
-
-.macro __ldrd a, b, src, offset
-#if __LINUX_ARM_ARCH__ >= 6
- ldrd \a, \b, [\src, #\offset]
-#else
- ldr \a, [\src, #\offset]
- ldr \b, [\src, #\offset + 4]
-#endif
-.endm
-
-.macro __strd a, b, dst, offset
-#if __LINUX_ARM_ARCH__ >= 6
- strd \a, \b, [\dst, #\offset]
-#else
- str \a, [\dst, #\offset]
- str \b, [\dst, #\offset + 4]
-#endif
-.endm
-
-.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2
-
- // a += b; d ^= a; d = rol(d, 16);
- add \a1, \a1, \b1, ror #brot
- add \a2, \a2, \b2, ror #brot
- eor \d1, \a1, \d1, ror #drot
- eor \d2, \a2, \d2, ror #drot
- // drot == 32 - 16 == 16
-
- // c += d; b ^= c; b = rol(b, 12);
- add \c1, \c1, \d1, ror #16
- add \c2, \c2, \d2, ror #16
- eor \b1, \c1, \b1, ror #brot
- eor \b2, \c2, \b2, ror #brot
- // brot == 32 - 12 == 20
-
- // a += b; d ^= a; d = rol(d, 8);
- add \a1, \a1, \b1, ror #20
- add \a2, \a2, \b2, ror #20
- eor \d1, \a1, \d1, ror #16
- eor \d2, \a2, \d2, ror #16
- // drot == 32 - 8 == 24
-
- // c += d; b ^= c; b = rol(b, 7);
- add \c1, \c1, \d1, ror #24
- add \c2, \c2, \d2, ror #24
- eor \b1, \c1, \b1, ror #20
- eor \b2, \c2, \b2, ror #20
- // brot == 32 - 7 == 25
-.endm
-
-.macro _doubleround
-
- // column round
-
- // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
- _halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13
-
- // save (x8, x9); restore (x10, x11)
- __strd X8_X10, X9_X11, sp, 0
- __ldrd X8_X10, X9_X11, sp, 8
-
- // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
- _halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15
-
- .set brot, 25
- .set drot, 24
-
- // diagonal round
-
- // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
- _halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12
-
- // save (x10, x11); restore (x8, x9)
- __strd X8_X10, X9_X11, sp, 8
- __ldrd X8_X10, X9_X11, sp, 0
-
- // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
- _halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14
-.endm
-
-.macro _chacha_permute nrounds
- .set brot, 0
- .set drot, 0
- .rept \nrounds / 2
- _doubleround
- .endr
-.endm
-
-.macro _chacha nrounds
-
-.Lnext_block\@:
- // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
- // Registers contain x0-x9,x12-x15.
-
- // Do the core ChaCha permutation to update x0-x15.
- _chacha_permute \nrounds
-
- add sp, #8
- // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
- // Registers contain x0-x9,x12-x15.
- // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
-
- // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
- push {X8_X10, X9_X11, X12, X13, X14, X15}
-
- // Load (OUT, IN, LEN).
- ldr r14, [sp, #96]
- ldr r12, [sp, #100]
- ldr r11, [sp, #104]
-
- orr r10, r14, r12
-
- // Use slow path if fewer than 64 bytes remain.
- cmp r11, #64
- blt .Lxor_slowpath\@
-
- // Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
- // ARMv6+, since ldmia and stmia (used below) still require alignment.
- tst r10, #3
- bne .Lxor_slowpath\@
-
- // Fast path: XOR 64 bytes of aligned data.
-
- // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
- // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
- // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
-
- // x0-x3
- __ldrd r8, r9, sp, 32
- __ldrd r10, r11, sp, 40
- add X0, X0, r8
- add X1, X1, r9
- add X2, X2, r10
- add X3, X3, r11
- _le32_bswap_4x X0, X1, X2, X3, r8
- ldmia r12!, {r8-r11}
- eor X0, X0, r8
- eor X1, X1, r9
- eor X2, X2, r10
- eor X3, X3, r11
- stmia r14!, {X0-X3}
-
- // x4-x7
- __ldrd r8, r9, sp, 48
- __ldrd r10, r11, sp, 56
- add X4, r8, X4, ror #brot
- add X5, r9, X5, ror #brot
- ldmia r12!, {X0-X3}
- add X6, r10, X6, ror #brot
- add X7, r11, X7, ror #brot
- _le32_bswap_4x X4, X5, X6, X7, r8
- eor X4, X4, X0
- eor X5, X5, X1
- eor X6, X6, X2
- eor X7, X7, X3
- stmia r14!, {X4-X7}
-
- // x8-x15
- pop {r0-r7} // (x8-x9,x12-x15,x10-x11)
- __ldrd r8, r9, sp, 32
- __ldrd r10, r11, sp, 40
- add r0, r0, r8 // x8
- add r1, r1, r9 // x9
- add r6, r6, r10 // x10
- add r7, r7, r11 // x11
- _le32_bswap_4x r0, r1, r6, r7, r8
- ldmia r12!, {r8-r11}
- eor r0, r0, r8 // x8
- eor r1, r1, r9 // x9
- eor r6, r6, r10 // x10
- eor r7, r7, r11 // x11
- stmia r14!, {r0,r1,r6,r7}
- ldmia r12!, {r0,r1,r6,r7}
- __ldrd r8, r9, sp, 48
- __ldrd r10, r11, sp, 56
- add r2, r8, r2, ror #drot // x12
- add r3, r9, r3, ror #drot // x13
- add r4, r10, r4, ror #drot // x14
- add r5, r11, r5, ror #drot // x15
- _le32_bswap_4x r2, r3, r4, r5, r9
- ldr r9, [sp, #72] // load LEN
- eor r2, r2, r0 // x12
- eor r3, r3, r1 // x13
- eor r4, r4, r6 // x14
- eor r5, r5, r7 // x15
- subs r9, #64 // decrement and check LEN
- stmia r14!, {r2-r5}
-
- beq .Ldone\@
-
-.Lprepare_for_next_block\@:
-
- // Stack: x0-x15 OUT IN LEN
-
- // Increment block counter (x12)
- add r8, #1
-
- // Store updated (OUT, IN, LEN)
- str r14, [sp, #64]
- str r12, [sp, #68]
- str r9, [sp, #72]
-
- mov r14, sp
-
- // Store updated block counter (x12)
- str r8, [sp, #48]
-
- sub sp, #16
-
- // Reload state and do next block
- ldmia r14!, {r0-r11} // load x0-x11
- __strd r10, r11, sp, 8 // store x10-x11 before state
- ldmia r14, {r10-r12,r14} // load x12-x15
- b .Lnext_block\@
-
-.Lxor_slowpath\@:
- // Slow path: < 64 bytes remaining, or unaligned input or output buffer.
- // We handle it by storing the 64 bytes of keystream to the stack, then
- // XOR-ing the needed portion with the data.
-
- // Allocate keystream buffer
- sub sp, #64
- mov r14, sp
-
- // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
- // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
- // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
-
- // Save keystream for x0-x3
- __ldrd r8, r9, sp, 96
- __ldrd r10, r11, sp, 104
- add X0, X0, r8
- add X1, X1, r9
- add X2, X2, r10
- add X3, X3, r11
- _le32_bswap_4x X0, X1, X2, X3, r8
- stmia r14!, {X0-X3}
-
- // Save keystream for x4-x7
- __ldrd r8, r9, sp, 112
- __ldrd r10, r11, sp, 120
- add X4, r8, X4, ror #brot
- add X5, r9, X5, ror #brot
- add X6, r10, X6, ror #brot
- add X7, r11, X7, ror #brot
- _le32_bswap_4x X4, X5, X6, X7, r8
- add r8, sp, #64
- stmia r14!, {X4-X7}
-
- // Save keystream for x8-x15
- ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11)
- __ldrd r8, r9, sp, 128
- __ldrd r10, r11, sp, 136
- add r0, r0, r8 // x8
- add r1, r1, r9 // x9
- add r6, r6, r10 // x10
- add r7, r7, r11 // x11
- _le32_bswap_4x r0, r1, r6, r7, r8
- stmia r14!, {r0,r1,r6,r7}
- __ldrd r8, r9, sp, 144
- __ldrd r10, r11, sp, 152
- add r2, r8, r2, ror #drot // x12
- add r3, r9, r3, ror #drot // x13
- add r4, r10, r4, ror #drot // x14
- add r5, r11, r5, ror #drot // x15
- _le32_bswap_4x r2, r3, r4, r5, r9
- stmia r14, {r2-r5}
-
- // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
- // Registers: r8 is block counter, r12 is IN.
-
- ldr r9, [sp, #168] // LEN
- ldr r14, [sp, #160] // OUT
- cmp r9, #64
- mov r0, sp
- movle r1, r9
- movgt r1, #64
- // r1 is number of bytes to XOR, in range [1, 64]
-
-.if __LINUX_ARM_ARCH__ < 6
- orr r2, r12, r14
- tst r2, #3 // IN or OUT misaligned?
- bne .Lxor_next_byte\@
-.endif
-
- // XOR a word at a time
-.rept 16
- subs r1, #4
- blt .Lxor_words_done\@
- ldr r2, [r12], #4
- ldr r3, [r0], #4
- eor r2, r2, r3
- str r2, [r14], #4
-.endr
- b .Lxor_slowpath_done\@
-.Lxor_words_done\@:
- ands r1, r1, #3
- beq .Lxor_slowpath_done\@
-
- // XOR a byte at a time
-.Lxor_next_byte\@:
- ldrb r2, [r12], #1
- ldrb r3, [r0], #1
- eor r2, r2, r3
- strb r2, [r14], #1
- subs r1, #1
- bne .Lxor_next_byte\@
-
-.Lxor_slowpath_done\@:
- subs r9, #64
- add sp, #96
- bgt .Lprepare_for_next_block\@
-
-.Ldone\@:
-.endm // _chacha
-
-/*
- * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
- * const struct chacha_state *state, int nrounds);
- */
-ENTRY(chacha_doarm)
- cmp r2, #0 // len == 0?
- reteq lr
-
- ldr ip, [sp]
- cmp ip, #12
-
- push {r0-r2,r4-r11,lr}
-
- // Push state x0-x15 onto stack.
- // Also store an extra copy of x10-x11 just before the state.
-
- add X12, r3, #48
- ldm X12, {X12,X13,X14,X15}
- push {X12,X13,X14,X15}
- sub sp, sp, #64
-
- __ldrd X8_X10, X9_X11, r3, 40
- __strd X8_X10, X9_X11, sp, 8
- __strd X8_X10, X9_X11, sp, 56
- ldm r3, {X0-X9_X11}
- __strd X0, X1, sp, 16
- __strd X2, X3, sp, 24
- __strd X4, X5, sp, 32
- __strd X6, X7, sp, 40
- __strd X8_X10, X9_X11, sp, 48
-
- beq 1f
- _chacha 20
-
-0: add sp, #76
- pop {r4-r11, pc}
-
-1: _chacha 12
- b 0b
-ENDPROC(chacha_doarm)
-
-/*
- * void hchacha_block_arm(const struct chacha_state *state,
- * u32 out[HCHACHA_OUT_WORDS], int nrounds);
- */
-ENTRY(hchacha_block_arm)
- push {r1,r4-r11,lr}
-
- cmp r2, #12 // ChaCha12 ?
-
- mov r14, r0
- ldmia r14!, {r0-r11} // load x0-x11
- push {r10-r11} // store x10-x11 to stack
- ldm r14, {r10-r12,r14} // load x12-x15
- sub sp, #8
-
- beq 1f
- _chacha_permute 20
-
- // Skip over (unused0-unused1, x10-x11)
-0: add sp, #16
-
- // Fix up rotations of x12-x15
- ror X12, X12, #drot
- ror X13, X13, #drot
- pop {r4} // load 'out'
- ror X14, X14, #drot
- ror X15, X15, #drot
-
- // Store (x0-x3,x12-x15) to 'out'
- stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}
-
- pop {r4-r11,pc}
-
-1: _chacha_permute 12
- b 0b
-ENDPROC(hchacha_block_arm)
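The design notes at the top of the deleted file depend on the identity that deferring a rotation into the shifted operand of a later 'add' or 'eor' is equivalent to rotating eagerly. A tiny C check of that identity for the 12-bit rotate step, where brot becomes 32 - 12 = 20 exactly as the comments state, could look like this (illustrative only):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static inline uint32_t rol32(uint32_t v, int n)
{
        return (v << n) | (v >> (32 - n));
}

static inline uint32_t ror32(uint32_t v, int n)
{
        return (v >> n) | (v << (32 - n));
}

int main(void)
{
        uint32_t b = 0xCAFEBABE, c = 0x8BADF00D;

        /* Eager form: rotate immediately, as the reference code does. */
        uint32_t eager = rol32(b ^ c, 12);

        /*
         * Deferred form, as the assembly does: keep the unrotated value and
         * remember that it must still be rotated right by brot = 32 - 12 = 20
         * bits before its next use (applied via the 'ror #brot' operand).
         */
        uint32_t raw = b ^ c;
        int brot = 20;

        assert(ror32(raw, brot) == eager);
        printf("deferred rotation matches eager rotation\n");
        return 0;
}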
diff --git a/arch/arm/lib/crypto/poly1305-armv4.pl b/arch/arm/lib/crypto/poly1305-armv4.pl
deleted file mode 100644
index d57c6e2fc84a..000000000000
--- a/arch/arm/lib/crypto/poly1305-armv4.pl
+++ /dev/null
@@ -1,1236 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
-#
-# ====================================================================
-# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
-# project.
-# ====================================================================
-#
-# IALU(*)/gcc-4.4 NEON
-#
-# ARM11xx(ARMv6) 7.78/+100% -
-# Cortex-A5 6.35/+130% 3.00
-# Cortex-A8 6.25/+115% 2.36
-# Cortex-A9 5.10/+95% 2.55
-# Cortex-A15 3.85/+85% 1.25(**)
-# Snapdragon S4 5.70/+100% 1.48(**)
-#
-# (*)	this is for -march=armv6, i.e. with a bunch of ldrb loads of data;
-# (**)	these are trade-off results; they can be improved by ~8%, but at
-#	the cost of a 15/12% regression on Cortex-A5/A7. It is even possible
-#	to improve the Cortex-A9 result, but then A5/A7 lose more than 20%;
-
-$flavour = shift;
-if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
-else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
-
-if ($flavour && $flavour ne "void") {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
- die "can't locate arm-xlate.pl";
-
- open STDOUT,"| \"$^X\" $xlate $flavour $output";
-} else {
- open STDOUT,">$output";
-}
-
-($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
-
-$code.=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
-# define poly1305_init poly1305_block_init_arch
-# define poly1305_blocks poly1305_blocks_arm
-# define poly1305_emit poly1305_emit_arch
-.globl poly1305_blocks_neon
-#endif
-
-#if defined(__thumb2__)
-.syntax unified
-.thumb
-#else
-.code 32
-#endif
-
-.text
-
-.globl poly1305_emit
-.globl poly1305_blocks
-.globl poly1305_init
-.type poly1305_init,%function
-.align 5
-poly1305_init:
-.Lpoly1305_init:
- stmdb sp!,{r4-r11}
-
- eor r3,r3,r3
- cmp $inp,#0
- str r3,[$ctx,#0] @ zero hash value
- str r3,[$ctx,#4]
- str r3,[$ctx,#8]
- str r3,[$ctx,#12]
- str r3,[$ctx,#16]
- str r3,[$ctx,#36] @ clear is_base2_26
- add $ctx,$ctx,#20
-
-#ifdef __thumb2__
- it eq
-#endif
- moveq r0,#0
- beq .Lno_key
-
-#if __ARM_MAX_ARCH__>=7
- mov r3,#-1
- str r3,[$ctx,#28] @ impossible key power value
-# ifndef __KERNEL__
- adr r11,.Lpoly1305_init
- ldr r12,.LOPENSSL_armcap
-# endif
-#endif
- ldrb r4,[$inp,#0]
- mov r10,#0x0fffffff
- ldrb r5,[$inp,#1]
- and r3,r10,#-4 @ 0x0ffffffc
- ldrb r6,[$inp,#2]
- ldrb r7,[$inp,#3]
- orr r4,r4,r5,lsl#8
- ldrb r5,[$inp,#4]
- orr r4,r4,r6,lsl#16
- ldrb r6,[$inp,#5]
- orr r4,r4,r7,lsl#24
- ldrb r7,[$inp,#6]
- and r4,r4,r10
-
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-# if !defined(_WIN32)
- ldr r12,[r11,r12] @ OPENSSL_armcap_P
-# endif
-# if defined(__APPLE__) || defined(_WIN32)
- ldr r12,[r12]
-# endif
-#endif
- ldrb r8,[$inp,#7]
- orr r5,r5,r6,lsl#8
- ldrb r6,[$inp,#8]
- orr r5,r5,r7,lsl#16
- ldrb r7,[$inp,#9]
- orr r5,r5,r8,lsl#24
- ldrb r8,[$inp,#10]
- and r5,r5,r3
-
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- tst r12,#ARMV7_NEON @ check for NEON
-# ifdef __thumb2__
- adr r9,.Lpoly1305_blocks_neon
- adr r11,.Lpoly1305_blocks
- it ne
- movne r11,r9
- adr r12,.Lpoly1305_emit
- orr r11,r11,#1 @ thumb-ify addresses
- orr r12,r12,#1
-# else
- add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
- ite eq
- addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
- addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
-# endif
-#endif
- ldrb r9,[$inp,#11]
- orr r6,r6,r7,lsl#8
- ldrb r7,[$inp,#12]
- orr r6,r6,r8,lsl#16
- ldrb r8,[$inp,#13]
- orr r6,r6,r9,lsl#24
- ldrb r9,[$inp,#14]
- and r6,r6,r3
-
- ldrb r10,[$inp,#15]
- orr r7,r7,r8,lsl#8
- str r4,[$ctx,#0]
- orr r7,r7,r9,lsl#16
- str r5,[$ctx,#4]
- orr r7,r7,r10,lsl#24
- str r6,[$ctx,#8]
- and r7,r7,r3
- str r7,[$ctx,#12]
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- stmia r2,{r11,r12} @ fill functions table
- mov r0,#1
-#else
- mov r0,#0
-#endif
-.Lno_key:
- ldmia sp!,{r4-r11}
-#if __ARM_ARCH__>=5
- ret @ bx lr
-#else
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size poly1305_init,.-poly1305_init
-___
-{
-my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
-my ($s1,$s2,$s3)=($r1,$r2,$r3);
-
-$code.=<<___;
-.type poly1305_blocks,%function
-.align 5
-poly1305_blocks:
-.Lpoly1305_blocks:
- stmdb sp!,{r3-r11,lr}
-
- ands $len,$len,#-16
- beq .Lno_data
-
- add $len,$len,$inp @ end pointer
- sub sp,sp,#32
-
-#if __ARM_ARCH__<7
- ldmia $ctx,{$h0-$r3} @ load context
- add $ctx,$ctx,#20
- str $len,[sp,#16] @ offload stuff
- str $ctx,[sp,#12]
-#else
- ldr lr,[$ctx,#36] @ is_base2_26
- ldmia $ctx!,{$h0-$h4} @ load hash value
- str $len,[sp,#16] @ offload stuff
- str $ctx,[sp,#12]
-
- adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
- mov $r1,$h1,lsr#6
- adcs $r1,$r1,$h2,lsl#20
- mov $r2,$h2,lsr#12
- adcs $r2,$r2,$h3,lsl#14
- mov $r3,$h3,lsr#18
- adcs $r3,$r3,$h4,lsl#8
- mov $len,#0
- teq lr,#0
- str $len,[$ctx,#16] @ clear is_base2_26
- adc $len,$len,$h4,lsr#24
-
- itttt ne
- movne $h0,$r0 @ choose between radixes
- movne $h1,$r1
- movne $h2,$r2
- movne $h3,$r3
- ldmia $ctx,{$r0-$r3} @ load key
- it ne
- movne $h4,$len
-#endif
-
- mov lr,$inp
- cmp $padbit,#0
- str $r1,[sp,#20]
- str $r2,[sp,#24]
- str $r3,[sp,#28]
- b .Loop
-
-.align 4
-.Loop:
-#if __ARM_ARCH__<7
- ldrb r0,[lr],#16 @ load input
-# ifdef __thumb2__
- it hi
-# endif
- addhi $h4,$h4,#1 @ 1<<128
- ldrb r1,[lr,#-15]
- ldrb r2,[lr,#-14]
- ldrb r3,[lr,#-13]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-12]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-11]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-10]
- adds $h0,$h0,r3 @ accumulate input
-
- ldrb r3,[lr,#-9]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-8]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-7]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-6]
- adcs $h1,$h1,r3
-
- ldrb r3,[lr,#-5]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-4]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-3]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-2]
- adcs $h2,$h2,r3
-
- ldrb r3,[lr,#-1]
- orr r1,r0,r1,lsl#8
- str lr,[sp,#8] @ offload input pointer
- orr r2,r1,r2,lsl#16
- add $s1,$r1,$r1,lsr#2
- orr r3,r2,r3,lsl#24
-#else
- ldr r0,[lr],#16 @ load input
- it hi
- addhi $h4,$h4,#1 @ padbit
- ldr r1,[lr,#-12]
- ldr r2,[lr,#-8]
- ldr r3,[lr,#-4]
-# ifdef __ARMEB__
- rev r0,r0
- rev r1,r1
- rev r2,r2
- rev r3,r3
-# endif
- adds $h0,$h0,r0 @ accumulate input
- str lr,[sp,#8] @ offload input pointer
- adcs $h1,$h1,r1
- add $s1,$r1,$r1,lsr#2
- adcs $h2,$h2,r2
-#endif
- add $s2,$r2,$r2,lsr#2
- adcs $h3,$h3,r3
- add $s3,$r3,$r3,lsr#2
-
- umull r2,r3,$h1,$r0
- adc $h4,$h4,#0
- umull r0,r1,$h0,$r0
- umlal r2,r3,$h4,$s1
- umlal r0,r1,$h3,$s1
- ldr $r1,[sp,#20] @ reload $r1
- umlal r2,r3,$h2,$s3
- umlal r0,r1,$h1,$s3
- umlal r2,r3,$h3,$s2
- umlal r0,r1,$h2,$s2
- umlal r2,r3,$h0,$r1
- str r0,[sp,#0] @ future $h0
- mul r0,$s2,$h4
- ldr $r2,[sp,#24] @ reload $r2
- adds r2,r2,r1 @ d1+=d0>>32
- eor r1,r1,r1
- adc lr,r3,#0 @ future $h2
- str r2,[sp,#4] @ future $h1
-
- mul r2,$s3,$h4
- eor r3,r3,r3
- umlal r0,r1,$h3,$s3
- ldr $r3,[sp,#28] @ reload $r3
- umlal r2,r3,$h3,$r0
- umlal r0,r1,$h2,$r0
- umlal r2,r3,$h2,$r1
- umlal r0,r1,$h1,$r1
- umlal r2,r3,$h1,$r2
- umlal r0,r1,$h0,$r2
- umlal r2,r3,$h0,$r3
- ldr $h0,[sp,#0]
- mul $h4,$r0,$h4
- ldr $h1,[sp,#4]
-
- adds $h2,lr,r0 @ d2+=d1>>32
- ldr lr,[sp,#8] @ reload input pointer
- adc r1,r1,#0
- adds $h3,r2,r1 @ d3+=d2>>32
- ldr r0,[sp,#16] @ reload end pointer
- adc r3,r3,#0
- add $h4,$h4,r3 @ h4+=d3>>32
-
- and r1,$h4,#-4
- and $h4,$h4,#3
- add r1,r1,r1,lsr#2 @ *=5
- adds $h0,$h0,r1
- adcs $h1,$h1,#0
- adcs $h2,$h2,#0
- adcs $h3,$h3,#0
- adc $h4,$h4,#0
-
- cmp r0,lr @ done yet?
- bhi .Loop
-
- ldr $ctx,[sp,#12]
- add sp,sp,#32
- stmdb $ctx,{$h0-$h4} @ store the result
-
-.Lno_data:
-#if __ARM_ARCH__>=5
- ldmia sp!,{r3-r11,pc}
-#else
- ldmia sp!,{r3-r11,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size poly1305_blocks,.-poly1305_blocks
-___
-}
-{
-my ($ctx,$mac,$nonce)=map("r$_",(0..2));
-my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
-my $g4=$ctx;
-
-$code.=<<___;
-.type poly1305_emit,%function
-.align 5
-poly1305_emit:
-.Lpoly1305_emit:
- stmdb sp!,{r4-r11}
-
- ldmia $ctx,{$h0-$h4}
-
-#if __ARM_ARCH__>=7
- ldr ip,[$ctx,#36] @ is_base2_26
-
- adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
- mov $g1,$h1,lsr#6
- adcs $g1,$g1,$h2,lsl#20
- mov $g2,$h2,lsr#12
- adcs $g2,$g2,$h3,lsl#14
- mov $g3,$h3,lsr#18
- adcs $g3,$g3,$h4,lsl#8
- mov $g4,#0
- adc $g4,$g4,$h4,lsr#24
-
- tst ip,ip
- itttt ne
- movne $h0,$g0
- movne $h1,$g1
- movne $h2,$g2
- movne $h3,$g3
- it ne
- movne $h4,$g4
-#endif
-
- adds $g0,$h0,#5 @ compare to modulus
- adcs $g1,$h1,#0
- adcs $g2,$h2,#0
- adcs $g3,$h3,#0
- adc $g4,$h4,#0
- tst $g4,#4 @ did it carry/borrow?
-
-#ifdef __thumb2__
- it ne
-#endif
- movne $h0,$g0
- ldr $g0,[$nonce,#0]
-#ifdef __thumb2__
- it ne
-#endif
- movne $h1,$g1
- ldr $g1,[$nonce,#4]
-#ifdef __thumb2__
- it ne
-#endif
- movne $h2,$g2
- ldr $g2,[$nonce,#8]
-#ifdef __thumb2__
- it ne
-#endif
- movne $h3,$g3
- ldr $g3,[$nonce,#12]
-
- adds $h0,$h0,$g0
- adcs $h1,$h1,$g1
- adcs $h2,$h2,$g2
- adc $h3,$h3,$g3
-
-#if __ARM_ARCH__>=7
-# ifdef __ARMEB__
- rev $h0,$h0
- rev $h1,$h1
- rev $h2,$h2
- rev $h3,$h3
-# endif
- str $h0,[$mac,#0]
- str $h1,[$mac,#4]
- str $h2,[$mac,#8]
- str $h3,[$mac,#12]
-#else
- strb $h0,[$mac,#0]
- mov $h0,$h0,lsr#8
- strb $h1,[$mac,#4]
- mov $h1,$h1,lsr#8
- strb $h2,[$mac,#8]
- mov $h2,$h2,lsr#8
- strb $h3,[$mac,#12]
- mov $h3,$h3,lsr#8
-
- strb $h0,[$mac,#1]
- mov $h0,$h0,lsr#8
- strb $h1,[$mac,#5]
- mov $h1,$h1,lsr#8
- strb $h2,[$mac,#9]
- mov $h2,$h2,lsr#8
- strb $h3,[$mac,#13]
- mov $h3,$h3,lsr#8
-
- strb $h0,[$mac,#2]
- mov $h0,$h0,lsr#8
- strb $h1,[$mac,#6]
- mov $h1,$h1,lsr#8
- strb $h2,[$mac,#10]
- mov $h2,$h2,lsr#8
- strb $h3,[$mac,#14]
- mov $h3,$h3,lsr#8
-
- strb $h0,[$mac,#3]
- strb $h1,[$mac,#7]
- strb $h2,[$mac,#11]
- strb $h3,[$mac,#15]
-#endif
- ldmia sp!,{r4-r11}
-#if __ARM_ARCH__>=5
- ret @ bx lr
-#else
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size poly1305_emit,.-poly1305_emit
-___
-{
-my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
-my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
-my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
-
-my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
-
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7
-.fpu neon
-
-.type poly1305_init_neon,%function
-.align 5
-poly1305_init_neon:
-.Lpoly1305_init_neon:
- ldr r3,[$ctx,#48] @ first table element
- cmp r3,#-1 @ is value impossible?
- bne .Lno_init_neon
-
- ldr r4,[$ctx,#20] @ load key base 2^32
- ldr r5,[$ctx,#24]
- ldr r6,[$ctx,#28]
- ldr r7,[$ctx,#32]
-
- and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
- mov r3,r4,lsr#26
- mov r4,r5,lsr#20
- orr r3,r3,r5,lsl#6
- mov r5,r6,lsr#14
- orr r4,r4,r6,lsl#12
- mov r6,r7,lsr#8
- orr r5,r5,r7,lsl#18
- and r3,r3,#0x03ffffff
- and r4,r4,#0x03ffffff
- and r5,r5,#0x03ffffff
-
- vdup.32 $R0,r2 @ r^1 in both lanes
- add r2,r3,r3,lsl#2 @ *5
- vdup.32 $R1,r3
- add r3,r4,r4,lsl#2
- vdup.32 $S1,r2
- vdup.32 $R2,r4
- add r4,r5,r5,lsl#2
- vdup.32 $S2,r3
- vdup.32 $R3,r5
- add r5,r6,r6,lsl#2
- vdup.32 $S3,r4
- vdup.32 $R4,r6
- vdup.32 $S4,r5
-
- mov $zeros,#2 @ counter
-
-.Lsquare_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
-
- vmull.u32 $D0,$R0,${R0}[1]
- vmull.u32 $D1,$R1,${R0}[1]
- vmull.u32 $D2,$R2,${R0}[1]
- vmull.u32 $D3,$R3,${R0}[1]
- vmull.u32 $D4,$R4,${R0}[1]
-
- vmlal.u32 $D0,$R4,${S1}[1]
- vmlal.u32 $D1,$R0,${R1}[1]
- vmlal.u32 $D2,$R1,${R1}[1]
- vmlal.u32 $D3,$R2,${R1}[1]
- vmlal.u32 $D4,$R3,${R1}[1]
-
- vmlal.u32 $D0,$R3,${S2}[1]
- vmlal.u32 $D1,$R4,${S2}[1]
- vmlal.u32 $D3,$R1,${R2}[1]
- vmlal.u32 $D2,$R0,${R2}[1]
- vmlal.u32 $D4,$R2,${R2}[1]
-
- vmlal.u32 $D0,$R2,${S3}[1]
- vmlal.u32 $D3,$R0,${R3}[1]
- vmlal.u32 $D1,$R3,${S3}[1]
- vmlal.u32 $D2,$R4,${S3}[1]
- vmlal.u32 $D4,$R1,${R3}[1]
-
- vmlal.u32 $D3,$R4,${S4}[1]
- vmlal.u32 $D0,$R1,${S4}[1]
- vmlal.u32 $D1,$R2,${S4}[1]
- vmlal.u32 $D2,$R3,${S4}[1]
- vmlal.u32 $D4,$R0,${R4}[1]
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- @ and P. Schwabe
- @
- @ H0>>+H1>>+H2>>+H3>>+H4
- @ H3>>+H4>>*5+H0>>+H1
- @
- @ Trivia.
- @
- @ Result of multiplication of n-bit number by m-bit number is
-	@ n+m bits wide. However! Even though 2^n is an (n+1)-bit number,
- @ m-bit number multiplied by 2^n is still n+m bits wide.
- @
- @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
- @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
- @ one is n+1 bits wide.
- @
- @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
- @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
- @ can be 27. However! In cases when their width exceeds 26 bits
- @ they are limited by 2^26+2^6. This in turn means that *sum*
- @ of the products with these values can still be viewed as sum
- @ of 52-bit numbers as long as the amount of addends is not a
- @ power of 2. For example,
- @
- @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
- @
- @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
- @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
-	@ 8 * (2^52) or 2^55. However, the value is then multiplied
-	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
- @ which is less than 32 * (2^52) or 2^57. And when processing
- @ data we are looking at triple as many addends...
- @
- @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
- @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
- @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
- @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
- @ instruction accepts 2x32-bit input and writes 2x64-bit result.
-	@ This means that the result of the reduction has to be compressed upon
- @ loop wrap-around. This can be done in the process of reduction
- @ to minimize amount of instructions [as well as amount of
- @ 128-bit instructions, which benefits low-end processors], but
- @ one has to watch for H2 (which is narrower than H0) and 5*H4
- @ not being wider than 58 bits, so that result of right shift
- @ by 26 bits fits in 32 bits. This is also useful on x86,
-	@ because it allows using paddd in place of paddq, which
- @ benefits Atom, where paddq is ridiculously slow.
-
- vshr.u64 $T0,$D3,#26
- vmovn.i64 $D3#lo,$D3
- vshr.u64 $T1,$D0,#26
- vmovn.i64 $D0#lo,$D0
- vadd.i64 $D4,$D4,$T0 @ h3 -> h4
- vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
- vadd.i64 $D1,$D1,$T1 @ h0 -> h1
- vbic.i32 $D0#lo,#0xfc000000
-
- vshrn.u64 $T0#lo,$D4,#26
- vmovn.i64 $D4#lo,$D4
- vshr.u64 $T1,$D1,#26
- vmovn.i64 $D1#lo,$D1
- vadd.i64 $D2,$D2,$T1 @ h1 -> h2
- vbic.i32 $D4#lo,#0xfc000000
- vbic.i32 $D1#lo,#0xfc000000
-
- vadd.i32 $D0#lo,$D0#lo,$T0#lo
- vshl.u32 $T0#lo,$T0#lo,#2
- vshrn.u64 $T1#lo,$D2,#26
- vmovn.i64 $D2#lo,$D2
- vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
- vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
- vbic.i32 $D2#lo,#0xfc000000
-
- vshr.u32 $T0#lo,$D0#lo,#26
- vbic.i32 $D0#lo,#0xfc000000
- vshr.u32 $T1#lo,$D3#lo,#26
- vbic.i32 $D3#lo,#0xfc000000
- vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
- vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
-
- subs $zeros,$zeros,#1
- beq .Lsquare_break_neon
-
- add $tbl0,$ctx,#(48+0*9*4)
- add $tbl1,$ctx,#(48+1*9*4)
-
- vtrn.32 $R0,$D0#lo @ r^2:r^1
- vtrn.32 $R2,$D2#lo
- vtrn.32 $R3,$D3#lo
- vtrn.32 $R1,$D1#lo
- vtrn.32 $R4,$D4#lo
-
- vshl.u32 $S2,$R2,#2 @ *5
- vshl.u32 $S3,$R3,#2
- vshl.u32 $S1,$R1,#2
- vshl.u32 $S4,$R4,#2
- vadd.i32 $S2,$S2,$R2
- vadd.i32 $S1,$S1,$R1
- vadd.i32 $S3,$S3,$R3
- vadd.i32 $S4,$S4,$R4
-
- vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
- vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
- vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vst1.32 {${S4}[0]},[$tbl0,:32]
- vst1.32 {${S4}[1]},[$tbl1,:32]
-
- b .Lsquare_neon
-
-.align 4
-.Lsquare_break_neon:
- add $tbl0,$ctx,#(48+2*4*9)
- add $tbl1,$ctx,#(48+3*4*9)
-
- vmov $R0,$D0#lo @ r^4:r^3
- vshl.u32 $S1,$D1#lo,#2 @ *5
- vmov $R1,$D1#lo
- vshl.u32 $S2,$D2#lo,#2
- vmov $R2,$D2#lo
- vshl.u32 $S3,$D3#lo,#2
- vmov $R3,$D3#lo
- vshl.u32 $S4,$D4#lo,#2
- vmov $R4,$D4#lo
- vadd.i32 $S1,$S1,$D1#lo
- vadd.i32 $S2,$S2,$D2#lo
- vadd.i32 $S3,$S3,$D3#lo
- vadd.i32 $S4,$S4,$D4#lo
-
- vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
- vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
- vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vst1.32 {${S4}[0]},[$tbl0]
- vst1.32 {${S4}[1]},[$tbl1]
-
-.Lno_init_neon:
- ret @ bx lr
-.size poly1305_init_neon,.-poly1305_init_neon
-
-.type poly1305_blocks_neon,%function
-.align 5
-poly1305_blocks_neon:
-.Lpoly1305_blocks_neon:
- ldr ip,[$ctx,#36] @ is_base2_26
-
- cmp $len,#64
- blo .Lpoly1305_blocks
-
- stmdb sp!,{r4-r7}
- vstmdb sp!,{d8-d15} @ ABI specification says so
-
- tst ip,ip @ is_base2_26?
- bne .Lbase2_26_neon
-
- stmdb sp!,{r1-r3,lr}
- bl .Lpoly1305_init_neon
-
- ldr r4,[$ctx,#0] @ load hash value base 2^32
- ldr r5,[$ctx,#4]
- ldr r6,[$ctx,#8]
- ldr r7,[$ctx,#12]
- ldr ip,[$ctx,#16]
-
- and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
- mov r3,r4,lsr#26
- veor $D0#lo,$D0#lo,$D0#lo
- mov r4,r5,lsr#20
- orr r3,r3,r5,lsl#6
- veor $D1#lo,$D1#lo,$D1#lo
- mov r5,r6,lsr#14
- orr r4,r4,r6,lsl#12
- veor $D2#lo,$D2#lo,$D2#lo
- mov r6,r7,lsr#8
- orr r5,r5,r7,lsl#18
- veor $D3#lo,$D3#lo,$D3#lo
- and r3,r3,#0x03ffffff
- orr r6,r6,ip,lsl#24
- veor $D4#lo,$D4#lo,$D4#lo
- and r4,r4,#0x03ffffff
- mov r1,#1
- and r5,r5,#0x03ffffff
- str r1,[$ctx,#36] @ set is_base2_26
-
- vmov.32 $D0#lo[0],r2
- vmov.32 $D1#lo[0],r3
- vmov.32 $D2#lo[0],r4
- vmov.32 $D3#lo[0],r5
- vmov.32 $D4#lo[0],r6
- adr $zeros,.Lzeros
-
- ldmia sp!,{r1-r3,lr}
- b .Lhash_loaded
-
-.align 4
-.Lbase2_26_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ load hash value
-
- veor $D0#lo,$D0#lo,$D0#lo
- veor $D1#lo,$D1#lo,$D1#lo
- veor $D2#lo,$D2#lo,$D2#lo
- veor $D3#lo,$D3#lo,$D3#lo
- veor $D4#lo,$D4#lo,$D4#lo
- vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
- adr $zeros,.Lzeros
- vld1.32 {$D4#lo[0]},[$ctx]
- sub $ctx,$ctx,#16 @ rewind
-
-.Lhash_loaded:
- add $in2,$inp,#32
- mov $padbit,$padbit,lsl#24
- tst $len,#31
- beq .Leven
-
- vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
- vmov.32 $H4#lo[0],$padbit
- sub $len,$len,#16
- add $in2,$inp,#32
-
-# ifdef __ARMEB__
- vrev32.8 $H0,$H0
- vrev32.8 $H3,$H3
- vrev32.8 $H1,$H1
- vrev32.8 $H2,$H2
-# endif
- vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
- vshl.u32 $H3#lo,$H3#lo,#18
-
- vsri.u32 $H3#lo,$H2#lo,#14
- vshl.u32 $H2#lo,$H2#lo,#12
- vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi
-
- vbic.i32 $H3#lo,#0xfc000000
- vsri.u32 $H2#lo,$H1#lo,#20
- vshl.u32 $H1#lo,$H1#lo,#6
-
- vbic.i32 $H2#lo,#0xfc000000
- vsri.u32 $H1#lo,$H0#lo,#26
- vadd.i32 $H3#hi,$H3#lo,$D3#lo
-
- vbic.i32 $H0#lo,#0xfc000000
- vbic.i32 $H1#lo,#0xfc000000
- vadd.i32 $H2#hi,$H2#lo,$D2#lo
-
- vadd.i32 $H0#hi,$H0#lo,$D0#lo
- vadd.i32 $H1#hi,$H1#lo,$D1#lo
-
- mov $tbl1,$zeros
- add $tbl0,$ctx,#48
-
- cmp $len,$len
- b .Long_tail
-
-.align 4
-.Leven:
- subs $len,$len,#64
- it lo
- movlo $in2,$zeros
-
- vmov.i32 $H4,#1<<24 @ padbit, yes, always
- vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
- add $inp,$inp,#64
- vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
- add $in2,$in2,#64
- itt hi
- addhi $tbl1,$ctx,#(48+1*9*4)
- addhi $tbl0,$ctx,#(48+3*9*4)
-
-# ifdef __ARMEB__
- vrev32.8 $H0,$H0
- vrev32.8 $H3,$H3
- vrev32.8 $H1,$H1
- vrev32.8 $H2,$H2
-# endif
- vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
- vshl.u32 $H3,$H3,#18
-
- vsri.u32 $H3,$H2,#14
- vshl.u32 $H2,$H2,#12
-
- vbic.i32 $H3,#0xfc000000
- vsri.u32 $H2,$H1,#20
- vshl.u32 $H1,$H1,#6
-
- vbic.i32 $H2,#0xfc000000
- vsri.u32 $H1,$H0,#26
-
- vbic.i32 $H0,#0xfc000000
- vbic.i32 $H1,#0xfc000000
-
- bls .Lskip_loop
-
- vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
- vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
- vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- b .Loop_neon
-
-.align 5
-.Loop_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
- @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
- @ \___________________/
- @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
- @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
- @ \___________________/ \____________________/
- @
- @ Note that we start with inp[2:3]*r^2. This is because it
- @ doesn't depend on reduction in previous iteration.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ inp[2:3]*r^2
-
- vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
- vmull.u32 $D2,$H2#hi,${R0}[1]
- vadd.i32 $H0#lo,$H0#lo,$D0#lo
- vmull.u32 $D0,$H0#hi,${R0}[1]
- vadd.i32 $H3#lo,$H3#lo,$D3#lo
- vmull.u32 $D3,$H3#hi,${R0}[1]
- vmlal.u32 $D2,$H1#hi,${R1}[1]
- vadd.i32 $H1#lo,$H1#lo,$D1#lo
- vmull.u32 $D1,$H1#hi,${R0}[1]
-
- vadd.i32 $H4#lo,$H4#lo,$D4#lo
- vmull.u32 $D4,$H4#hi,${R0}[1]
- subs $len,$len,#64
- vmlal.u32 $D0,$H4#hi,${S1}[1]
- it lo
- movlo $in2,$zeros
- vmlal.u32 $D3,$H2#hi,${R1}[1]
- vld1.32 ${S4}[1],[$tbl1,:32]
- vmlal.u32 $D1,$H0#hi,${R1}[1]
- vmlal.u32 $D4,$H3#hi,${R1}[1]
-
- vmlal.u32 $D0,$H3#hi,${S2}[1]
- vmlal.u32 $D3,$H1#hi,${R2}[1]
- vmlal.u32 $D4,$H2#hi,${R2}[1]
- vmlal.u32 $D1,$H4#hi,${S2}[1]
- vmlal.u32 $D2,$H0#hi,${R2}[1]
-
- vmlal.u32 $D3,$H0#hi,${R3}[1]
- vmlal.u32 $D0,$H2#hi,${S3}[1]
- vmlal.u32 $D4,$H1#hi,${R3}[1]
- vmlal.u32 $D1,$H3#hi,${S3}[1]
- vmlal.u32 $D2,$H4#hi,${S3}[1]
-
- vmlal.u32 $D3,$H4#hi,${S4}[1]
- vmlal.u32 $D0,$H1#hi,${S4}[1]
- vmlal.u32 $D4,$H0#hi,${R4}[1]
- vmlal.u32 $D1,$H2#hi,${S4}[1]
- vmlal.u32 $D2,$H3#hi,${S4}[1]
-
- vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
- add $in2,$in2,#64
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ (hash+inp[0:1])*r^4 and accumulate
-
- vmlal.u32 $D3,$H3#lo,${R0}[0]
- vmlal.u32 $D0,$H0#lo,${R0}[0]
- vmlal.u32 $D4,$H4#lo,${R0}[0]
- vmlal.u32 $D1,$H1#lo,${R0}[0]
- vmlal.u32 $D2,$H2#lo,${R0}[0]
- vld1.32 ${S4}[0],[$tbl0,:32]
-
- vmlal.u32 $D3,$H2#lo,${R1}[0]
- vmlal.u32 $D0,$H4#lo,${S1}[0]
- vmlal.u32 $D4,$H3#lo,${R1}[0]
- vmlal.u32 $D1,$H0#lo,${R1}[0]
- vmlal.u32 $D2,$H1#lo,${R1}[0]
-
- vmlal.u32 $D3,$H1#lo,${R2}[0]
- vmlal.u32 $D0,$H3#lo,${S2}[0]
- vmlal.u32 $D4,$H2#lo,${R2}[0]
- vmlal.u32 $D1,$H4#lo,${S2}[0]
- vmlal.u32 $D2,$H0#lo,${R2}[0]
-
- vmlal.u32 $D3,$H0#lo,${R3}[0]
- vmlal.u32 $D0,$H2#lo,${S3}[0]
- vmlal.u32 $D4,$H1#lo,${R3}[0]
- vmlal.u32 $D1,$H3#lo,${S3}[0]
- vmlal.u32 $D3,$H4#lo,${S4}[0]
-
- vmlal.u32 $D2,$H4#lo,${S3}[0]
- vmlal.u32 $D0,$H1#lo,${S4}[0]
- vmlal.u32 $D4,$H0#lo,${R4}[0]
- vmov.i32 $H4,#1<<24 @ padbit, yes, always
- vmlal.u32 $D1,$H2#lo,${S4}[0]
- vmlal.u32 $D2,$H3#lo,${S4}[0]
-
- vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
- add $inp,$inp,#64
-# ifdef __ARMEB__
- vrev32.8 $H0,$H0
- vrev32.8 $H1,$H1
- vrev32.8 $H2,$H2
- vrev32.8 $H3,$H3
-# endif
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction interleaved with base 2^32 -> base 2^26 of
- @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
-
- vshr.u64 $T0,$D3,#26
- vmovn.i64 $D3#lo,$D3
- vshr.u64 $T1,$D0,#26
- vmovn.i64 $D0#lo,$D0
- vadd.i64 $D4,$D4,$T0 @ h3 -> h4
- vbic.i32 $D3#lo,#0xfc000000
- vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
- vadd.i64 $D1,$D1,$T1 @ h0 -> h1
- vshl.u32 $H3,$H3,#18
- vbic.i32 $D0#lo,#0xfc000000
-
- vshrn.u64 $T0#lo,$D4,#26
- vmovn.i64 $D4#lo,$D4
- vshr.u64 $T1,$D1,#26
- vmovn.i64 $D1#lo,$D1
- vadd.i64 $D2,$D2,$T1 @ h1 -> h2
- vsri.u32 $H3,$H2,#14
- vbic.i32 $D4#lo,#0xfc000000
- vshl.u32 $H2,$H2,#12
- vbic.i32 $D1#lo,#0xfc000000
-
- vadd.i32 $D0#lo,$D0#lo,$T0#lo
- vshl.u32 $T0#lo,$T0#lo,#2
- vbic.i32 $H3,#0xfc000000
- vshrn.u64 $T1#lo,$D2,#26
- vmovn.i64 $D2#lo,$D2
- vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec]
- vsri.u32 $H2,$H1,#20
- vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
- vshl.u32 $H1,$H1,#6
- vbic.i32 $D2#lo,#0xfc000000
- vbic.i32 $H2,#0xfc000000
-
- vshrn.u64 $T0#lo,$D0,#26 @ re-narrow
- vmovn.i64 $D0#lo,$D0
- vsri.u32 $H1,$H0,#26
- vbic.i32 $H0,#0xfc000000
- vshr.u32 $T1#lo,$D3#lo,#26
- vbic.i32 $D3#lo,#0xfc000000
- vbic.i32 $D0#lo,#0xfc000000
- vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
- vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
- vbic.i32 $H1,#0xfc000000
-
- bhi .Loop_neon
-
-.Lskip_loop:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
- add $tbl1,$ctx,#(48+0*9*4)
- add $tbl0,$ctx,#(48+1*9*4)
- adds $len,$len,#32
- it ne
- movne $len,#0
- bne .Long_tail
-
- vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
- vadd.i32 $H0#hi,$H0#lo,$D0#lo
- vadd.i32 $H3#hi,$H3#lo,$D3#lo
- vadd.i32 $H1#hi,$H1#lo,$D1#lo
- vadd.i32 $H4#hi,$H4#lo,$D4#lo
-
-.Long_tail:
- vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
- vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2
-
- vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
- vmull.u32 $D2,$H2#hi,$R0
- vadd.i32 $H0#lo,$H0#lo,$D0#lo
- vmull.u32 $D0,$H0#hi,$R0
- vadd.i32 $H3#lo,$H3#lo,$D3#lo
- vmull.u32 $D3,$H3#hi,$R0
- vadd.i32 $H1#lo,$H1#lo,$D1#lo
- vmull.u32 $D1,$H1#hi,$R0
- vadd.i32 $H4#lo,$H4#lo,$D4#lo
- vmull.u32 $D4,$H4#hi,$R0
-
- vmlal.u32 $D0,$H4#hi,$S1
- vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vmlal.u32 $D3,$H2#hi,$R1
- vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vmlal.u32 $D1,$H0#hi,$R1
- vmlal.u32 $D4,$H3#hi,$R1
- vmlal.u32 $D2,$H1#hi,$R1
-
- vmlal.u32 $D3,$H1#hi,$R2
- vld1.32 ${S4}[1],[$tbl1,:32]
- vmlal.u32 $D0,$H3#hi,$S2
- vld1.32 ${S4}[0],[$tbl0,:32]
- vmlal.u32 $D4,$H2#hi,$R2
- vmlal.u32 $D1,$H4#hi,$S2
- vmlal.u32 $D2,$H0#hi,$R2
-
- vmlal.u32 $D3,$H0#hi,$R3
- it ne
- addne $tbl1,$ctx,#(48+2*9*4)
- vmlal.u32 $D0,$H2#hi,$S3
- it ne
- addne $tbl0,$ctx,#(48+3*9*4)
- vmlal.u32 $D4,$H1#hi,$R3
- vmlal.u32 $D1,$H3#hi,$S3
- vmlal.u32 $D2,$H4#hi,$S3
-
- vmlal.u32 $D3,$H4#hi,$S4
- vorn $MASK,$MASK,$MASK @ all-ones, can be redundant
- vmlal.u32 $D0,$H1#hi,$S4
- vshr.u64 $MASK,$MASK,#38
- vmlal.u32 $D4,$H0#hi,$R4
- vmlal.u32 $D1,$H2#hi,$S4
- vmlal.u32 $D2,$H3#hi,$S4
-
- beq .Lshort_tail
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ (hash+inp[0:1])*r^4:r^3 and accumulate
-
- vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
- vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
-
- vmlal.u32 $D2,$H2#lo,$R0
- vmlal.u32 $D0,$H0#lo,$R0
- vmlal.u32 $D3,$H3#lo,$R0
- vmlal.u32 $D1,$H1#lo,$R0
- vmlal.u32 $D4,$H4#lo,$R0
-
- vmlal.u32 $D0,$H4#lo,$S1
- vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vmlal.u32 $D3,$H2#lo,$R1
- vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vmlal.u32 $D1,$H0#lo,$R1
- vmlal.u32 $D4,$H3#lo,$R1
- vmlal.u32 $D2,$H1#lo,$R1
-
- vmlal.u32 $D3,$H1#lo,$R2
- vld1.32 ${S4}[1],[$tbl1,:32]
- vmlal.u32 $D0,$H3#lo,$S2
- vld1.32 ${S4}[0],[$tbl0,:32]
- vmlal.u32 $D4,$H2#lo,$R2
- vmlal.u32 $D1,$H4#lo,$S2
- vmlal.u32 $D2,$H0#lo,$R2
-
- vmlal.u32 $D3,$H0#lo,$R3
- vmlal.u32 $D0,$H2#lo,$S3
- vmlal.u32 $D4,$H1#lo,$R3
- vmlal.u32 $D1,$H3#lo,$S3
- vmlal.u32 $D2,$H4#lo,$S3
-
- vmlal.u32 $D3,$H4#lo,$S4
- vorn $MASK,$MASK,$MASK @ all-ones
- vmlal.u32 $D0,$H1#lo,$S4
- vshr.u64 $MASK,$MASK,#38
- vmlal.u32 $D4,$H0#lo,$R4
- vmlal.u32 $D1,$H2#lo,$S4
- vmlal.u32 $D2,$H3#lo,$S4
-
-.Lshort_tail:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ horizontal addition
-
- vadd.i64 $D3#lo,$D3#lo,$D3#hi
- vadd.i64 $D0#lo,$D0#lo,$D0#hi
- vadd.i64 $D4#lo,$D4#lo,$D4#hi
- vadd.i64 $D1#lo,$D1#lo,$D1#hi
- vadd.i64 $D2#lo,$D2#lo,$D2#hi
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction, but without narrowing
-
- vshr.u64 $T0,$D3,#26
- vand.i64 $D3,$D3,$MASK
- vshr.u64 $T1,$D0,#26
- vand.i64 $D0,$D0,$MASK
- vadd.i64 $D4,$D4,$T0 @ h3 -> h4
- vadd.i64 $D1,$D1,$T1 @ h0 -> h1
-
- vshr.u64 $T0,$D4,#26
- vand.i64 $D4,$D4,$MASK
- vshr.u64 $T1,$D1,#26
- vand.i64 $D1,$D1,$MASK
- vadd.i64 $D2,$D2,$T1 @ h1 -> h2
-
- vadd.i64 $D0,$D0,$T0
- vshl.u64 $T0,$T0,#2
- vshr.u64 $T1,$D2,#26
- vand.i64 $D2,$D2,$MASK
- vadd.i64 $D0,$D0,$T0 @ h4 -> h0
- vadd.i64 $D3,$D3,$T1 @ h2 -> h3
-
- vshr.u64 $T0,$D0,#26
- vand.i64 $D0,$D0,$MASK
- vshr.u64 $T1,$D3,#26
- vand.i64 $D3,$D3,$MASK
- vadd.i64 $D1,$D1,$T0 @ h0 -> h1
- vadd.i64 $D4,$D4,$T1 @ h3 -> h4
-
- cmp $len,#0
- bne .Leven
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ store hash value
-
- vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
- vst1.32 {$D4#lo[0]},[$ctx]
-
- vldmia sp!,{d8-d15} @ epilogue
- ldmia sp!,{r4-r7}
- ret @ bx lr
-.size poly1305_blocks_neon,.-poly1305_blocks_neon
-
-.align 5
-.Lzeros:
-.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-#ifndef __KERNEL__
-.LOPENSSL_armcap:
-# ifdef _WIN32
-.word OPENSSL_armcap_P
-# else
-.word OPENSSL_armcap_P-.Lpoly1305_init
-# endif
-.comm OPENSSL_armcap_P,4,4
-.hidden OPENSSL_armcap_P
-#endif
-#endif
-___
-} }
-$code.=<<___;
-.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
-.align 2
-___
-
-foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval $1/geo;
-
- s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
- s/\bret\b/bx lr/go or
- s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
-
- print $_,"\n";
-}
-close STDOUT; # enforce flush
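
Note (editorial, not part of the patch): the NEON routine deleted above keeps the Poly1305 accumulator as five 26-bit limbs and performs the lazy reduction its comments describe (h3 -> h4, h0 -> h1, ..., h4 -> h0). A minimal scalar C sketch of that carry chain follows for orientation; the function name is hypothetical, and the assembly interleaves these steps with the base 2^32 -> 2^26 input conversion rather than running them back to back.

#include <linux/types.h>

/* Illustrative only: one lazy-reduction pass over five 26-bit limbs,
 * mirroring the carry chain annotated in the removed poly1305-armv4.pl. */
static inline void poly1305_lazy_reduce(u64 h[5])
{
	u64 c;

	c = h[3] >> 26; h[3] &= 0x3ffffff; h[4] += c;		/* h3 -> h4 */
	c = h[0] >> 26; h[0] &= 0x3ffffff; h[1] += c;		/* h0 -> h1 */
	c = h[4] >> 26; h[4] &= 0x3ffffff; h[0] += c * 5;	/* h4 -> h0 */
	c = h[1] >> 26; h[1] &= 0x3ffffff; h[2] += c;		/* h1 -> h2 */
	c = h[2] >> 26; h[2] &= 0x3ffffff; h[3] += c;		/* h2 -> h3 */
	c = h[0] >> 26; h[0] &= 0x3ffffff; h[1] += c;		/* h0 -> h1, again */
	c = h[3] >> 26; h[3] &= 0x3ffffff; h[4] += c;		/* h3 -> h4, again */
}

The multiply by 5 on the h4 carry is what the paired vadd/vshl#2/vadd sequence computes in the assembly: the carry out of 2^130 wraps as 5, since 2^130 is congruent to 5 modulo 2^130 - 5.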
diff --git a/arch/arm/lib/crypto/poly1305-glue.c b/arch/arm/lib/crypto/poly1305-glue.c
deleted file mode 100644
index 2603b0771f2c..000000000000
--- a/arch/arm/lib/crypto/poly1305-glue.c
+++ /dev/null
@@ -1,80 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
- *
- * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <crypto/internal/poly1305.h>
-#include <linux/cpufeature.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/unaligned.h>
-
-asmlinkage void poly1305_block_init_arch(
- struct poly1305_block_state *state,
- const u8 raw_key[POLY1305_BLOCK_SIZE]);
-EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
-asmlinkage void poly1305_blocks_arm(struct poly1305_block_state *state,
- const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state,
- const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_emit_arch(const struct poly1305_state *state,
- u8 digest[POLY1305_DIGEST_SIZE],
- const u32 nonce[4]);
-EXPORT_SYMBOL_GPL(poly1305_emit_arch);
-
-void __weak poly1305_blocks_neon(struct poly1305_block_state *state,
- const u8 *src, u32 len, u32 hibit)
-{
-}
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-
-void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
- unsigned int len, u32 padbit)
-{
- len = round_down(len, POLY1305_BLOCK_SIZE);
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
- static_branch_likely(&have_neon)) {
- do {
- unsigned int todo = min_t(unsigned int, len, SZ_4K);
-
- kernel_neon_begin();
- poly1305_blocks_neon(state, src, todo, padbit);
- kernel_neon_end();
-
- len -= todo;
- src += todo;
- } while (len);
- } else
- poly1305_blocks_arm(state, src, len, padbit);
-}
-EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
-
-bool poly1305_is_arch_optimized(void)
-{
- /* We always can use at least the ARM scalar implementation. */
- return true;
-}
-EXPORT_SYMBOL(poly1305_is_arch_optimized);
-
-static int __init arm_poly1305_mod_init(void)
-{
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
- (elf_hwcap & HWCAP_NEON))
- static_branch_enable(&have_neon);
- return 0;
-}
-subsys_initcall(arm_poly1305_mod_init);
-
-static void __exit arm_poly1305_mod_exit(void)
-{
-}
-module_exit(arm_poly1305_mod_exit);
-
-MODULE_DESCRIPTION("Accelerated Poly1305 transform for ARM");
-MODULE_LICENSE("GPL v2");
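
Note (editorial, not part of the patch): the deleted glue above backed the generic Poly1305 library interface by exporting the *_arch entry points. A hedged usage sketch of that consumer side is shown below; the function names come from <crypto/poly1305.h>, but exactly how they route to poly1305_block_init_arch()/poly1305_blocks_arch()/poly1305_emit_arch() depends on the kernel configuration, and the demo function itself is hypothetical.

#include <crypto/poly1305.h>

/* Illustrative consumer of the Poly1305 library API (not from this patch). */
static void poly1305_mac_demo(const u8 key[POLY1305_KEY_SIZE],
			      const u8 *msg, unsigned int len,
			      u8 mac[POLY1305_DIGEST_SIZE])
{
	struct poly1305_desc_ctx ctx;

	poly1305_init(&ctx, key);		/* keyed setup */
	poly1305_update(&ctx, msg, len);	/* block processing, arch-accelerated when available */
	poly1305_final(&ctx, mac);		/* emit the 16-byte tag */
}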
diff --git a/arch/arm/lib/crypto/sha256-armv4.pl b/arch/arm/lib/crypto/sha256-armv4.pl
deleted file mode 100644
index 8122db7fd599..000000000000
--- a/arch/arm/lib/crypto/sha256-armv4.pl
+++ /dev/null
@@ -1,724 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0
-
-# This code is taken from the OpenSSL project but the author (Andy Polyakov)
-# has relicensed it under the GPLv2. Therefore this program is free software;
-# you can redistribute it and/or modify it under the terms of the GNU General
-# Public License version 2 as published by the Free Software Foundation.
-#
-# The original headers, including the original license headers, are
-# included below for completeness.
-
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see https://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# SHA256 block procedure for ARMv4. May 2007.
-
-# Performance is ~2x better than gcc 3.4 generated code and in "abso-
-# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
-# byte [on single-issue Xscale PXA250 core].
-
-# July 2010.
-#
-# Rescheduling for dual-issue pipeline resulted in 22% improvement on
-# Cortex A8 core and ~20 cycles per processed byte.
-
-# February 2011.
-#
-# Profiler-assisted and platform-specific optimization resulted in 16%
-# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
-
-# September 2013.
-#
-# Add NEON implementation. On Cortex A8 it was measured to process one
-# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
-# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
-# code (meaning that latter performs sub-optimally, nothing was done
-# about it).
-
-# May 2014.
-#
-# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
-
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-
-$ctx="r0"; $t0="r0";
-$inp="r1"; $t4="r1";
-$len="r2"; $t1="r2";
-$T1="r3"; $t3="r3";
-$A="r4";
-$B="r5";
-$C="r6";
-$D="r7";
-$E="r8";
-$F="r9";
-$G="r10";
-$H="r11";
-@V=($A,$B,$C,$D,$E,$F,$G,$H);
-$t2="r12";
-$Ktbl="r14";
-
-@Sigma0=( 2,13,22);
-@Sigma1=( 6,11,25);
-@sigma0=( 7,18, 3);
-@sigma1=(17,19,10);
-
-sub BODY_00_15 {
-my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
-
-$code.=<<___ if ($i<16);
-#if __ARM_ARCH__>=7
- @ ldr $t1,[$inp],#4 @ $i
-# if $i==15
- str $inp,[sp,#17*4] @ make room for $t4
-# endif
- eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
- add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
- eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
-# ifndef __ARMEB__
- rev $t1,$t1
-# endif
-#else
- @ ldrb $t1,[$inp,#3] @ $i
- add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
- ldrb $t2,[$inp,#2]
- ldrb $t0,[$inp,#1]
- orr $t1,$t1,$t2,lsl#8
- ldrb $t2,[$inp],#4
- orr $t1,$t1,$t0,lsl#16
-# if $i==15
- str $inp,[sp,#17*4] @ make room for $t4
-# endif
- eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
- orr $t1,$t1,$t2,lsl#24
- eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
-#endif
-___
-$code.=<<___;
- ldr $t2,[$Ktbl],#4 @ *K256++
- add $h,$h,$t1 @ h+=X[i]
- str $t1,[sp,#`$i%16`*4]
- eor $t1,$f,$g
- add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
- and $t1,$t1,$e
- add $h,$h,$t2 @ h+=K256[i]
- eor $t1,$t1,$g @ Ch(e,f,g)
- eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
- add $h,$h,$t1 @ h+=Ch(e,f,g)
-#if $i==31
- and $t2,$t2,#0xff
- cmp $t2,#0xf2 @ done?
-#endif
-#if $i<15
-# if __ARM_ARCH__>=7
- ldr $t1,[$inp],#4 @ prefetch
-# else
- ldrb $t1,[$inp,#3]
-# endif
- eor $t2,$a,$b @ a^b, b^c in next round
-#else
- ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
- eor $t2,$a,$b @ a^b, b^c in next round
- ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
-#endif
- eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
- and $t3,$t3,$t2 @ (b^c)&=(a^b)
- add $d,$d,$h @ d+=h
- eor $t3,$t3,$b @ Maj(a,b,c)
- add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
- @ add $h,$h,$t3 @ h+=Maj(a,b,c)
-___
- ($t2,$t3)=($t3,$t2);
-}
-
-sub BODY_16_XX {
-my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
-
-$code.=<<___;
- @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
- @ ldr $t4,[sp,#`($i+14)%16`*4]
- mov $t0,$t1,ror#$sigma0[0]
- add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
- mov $t2,$t4,ror#$sigma1[0]
- eor $t0,$t0,$t1,ror#$sigma0[1]
- eor $t2,$t2,$t4,ror#$sigma1[1]
- eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
- ldr $t1,[sp,#`($i+0)%16`*4]
- eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
- ldr $t4,[sp,#`($i+9)%16`*4]
-
- add $t2,$t2,$t0
- eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
- add $t1,$t1,$t2
- eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
- add $t1,$t1,$t4 @ X[i]
-___
- &BODY_00_15(@_);
-}
-
-$code=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-#endif
-
-.text
-#if __ARM_ARCH__<7
-.code 32
-#else
-.syntax unified
-# ifdef __thumb2__
-.thumb
-# else
-.code 32
-# endif
-#endif
-
-.type K256,%object
-.align 5
-K256:
-.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-.size K256,.-K256
-.word 0 @ terminator
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-sha256_blocks_arch
-#endif
-.align 5
-
-.global sha256_blocks_arch
-.type sha256_blocks_arch,%function
-sha256_blocks_arch:
-.Lsha256_blocks_arch:
-#if __ARM_ARCH__<7
- sub r3,pc,#8 @ sha256_blocks_arch
-#else
- adr r3,.Lsha256_blocks_arch
-#endif
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- ldr r12,.LOPENSSL_armcap
- ldr r12,[r3,r12] @ OPENSSL_armcap_P
- tst r12,#ARMV8_SHA256
- bne .LARMv8
- tst r12,#ARMV7_NEON
- bne .LNEON
-#endif
- add $len,$inp,$len,lsl#6 @ len to point at the end of inp
- stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
- ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
- sub $Ktbl,r3,#256+32 @ K256
- sub sp,sp,#16*4 @ alloca(X[16])
-.Loop:
-# if __ARM_ARCH__>=7
- ldr $t1,[$inp],#4
-# else
- ldrb $t1,[$inp,#3]
-# endif
- eor $t3,$B,$C @ magic
- eor $t2,$t2,$t2
-___
-for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
-$code.=".Lrounds_16_xx:\n";
-for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
-$code.=<<___;
-#if __ARM_ARCH__>=7
- ite eq @ Thumb2 thing, sanity check in ARM
-#endif
- ldreq $t3,[sp,#16*4] @ pull ctx
- bne .Lrounds_16_xx
-
- add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
- ldr $t0,[$t3,#0]
- ldr $t1,[$t3,#4]
- ldr $t2,[$t3,#8]
- add $A,$A,$t0
- ldr $t0,[$t3,#12]
- add $B,$B,$t1
- ldr $t1,[$t3,#16]
- add $C,$C,$t2
- ldr $t2,[$t3,#20]
- add $D,$D,$t0
- ldr $t0,[$t3,#24]
- add $E,$E,$t1
- ldr $t1,[$t3,#28]
- add $F,$F,$t2
- ldr $inp,[sp,#17*4] @ pull inp
- ldr $t2,[sp,#18*4] @ pull inp+len
- add $G,$G,$t0
- add $H,$H,$t1
- stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
- cmp $inp,$t2
- sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
- bne .Loop
-
- add sp,sp,#`16+3`*4 @ destroy frame
-#if __ARM_ARCH__>=5
- ldmia sp!,{r4-r11,pc}
-#else
- ldmia sp!,{r4-r11,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size sha256_blocks_arch,.-sha256_blocks_arch
-___
-######################################################################
-# NEON stuff
-#
-{{{
-my @X=map("q$_",(0..3));
-my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
-my $Xfer=$t4;
-my $j=0;
-
-sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
-sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
-
-sub AUTOLOAD() # thunk [simplified] x86-style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
- my $arg = pop;
- $arg = "#$arg" if ($arg*1 eq $arg);
- $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
-}
-
-sub Xupdate()
-{ use integer;
- my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body);
- my ($a,$b,$c,$d,$e,$f,$g,$h);
-
- &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T2,$T0,$sigma0[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T1,$T0,$sigma0[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T2,$T0,32-$sigma0[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T3,$T0,$sigma0[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T1,$T1,$T2);
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T3,$T0,32-$sigma0[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T1,$T1,$T3); # sigma0(X[1..4])
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T5,$T5,$T4);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T5,$T5,$T4); # sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T5,$T5,$T4);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vld1_32 ("{$T0}","[$Ktbl,:128]!");
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T5,$T5,$T4); # sigma1(X[16..17])
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 ($T0,$T0,@X[0]);
- while($#insns>=2) { eval(shift(@insns)); }
- &vst1_32 ("{$T0}","[$Xfer,:128]!");
- eval(shift(@insns));
- eval(shift(@insns));
-
- push(@X,shift(@X)); # "rotate" X[]
-}
-
-sub Xpreload()
-{ use integer;
- my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body);
- my ($a,$b,$c,$d,$e,$f,$g,$h);
-
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vld1_32 ("{$T0}","[$Ktbl,:128]!");
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vrev32_8 (@X[0],@X[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 ($T0,$T0,@X[0]);
- foreach (@insns) { eval; } # remaining instructions
- &vst1_32 ("{$T0}","[$Xfer,:128]!");
-
- push(@X,shift(@X)); # "rotate" X[]
-}
-
-sub body_00_15 () {
- (
- '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
- '&add ($h,$h,$t1)', # h+=X[i]+K[i]
- '&eor ($t1,$f,$g)',
- '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
- '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
- '&and ($t1,$t1,$e)',
- '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
- '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
- '&eor ($t1,$t1,$g)', # Ch(e,f,g)
- '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
- '&eor ($t2,$a,$b)', # a^b, b^c in next round
- '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
- '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
- '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
- '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
- '&ldr ($t1,"[sp,#64]") if ($j==31)',
- '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
- '&add ($d,$d,$h)', # d+=h
- '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
- '&eor ($t3,$t3,$b)', # Maj(a,b,c)
- '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
- )
-}
-
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7
-.arch armv7-a
-.fpu neon
-
-.global sha256_block_data_order_neon
-.type sha256_block_data_order_neon,%function
-.align 4
-sha256_block_data_order_neon:
-.LNEON:
- stmdb sp!,{r4-r12,lr}
-
- sub $H,sp,#16*4+16
- adr $Ktbl,.Lsha256_blocks_arch
- sub $Ktbl,$Ktbl,#.Lsha256_blocks_arch-K256
- bic $H,$H,#15 @ align for 128-bit stores
- mov $t2,sp
- mov sp,$H @ alloca
- add $len,$inp,$len,lsl#6 @ len to point at the end of inp
-
- vld1.8 {@X[0]},[$inp]!
- vld1.8 {@X[1]},[$inp]!
- vld1.8 {@X[2]},[$inp]!
- vld1.8 {@X[3]},[$inp]!
- vld1.32 {$T0},[$Ktbl,:128]!
- vld1.32 {$T1},[$Ktbl,:128]!
- vld1.32 {$T2},[$Ktbl,:128]!
- vld1.32 {$T3},[$Ktbl,:128]!
- vrev32.8 @X[0],@X[0] @ yes, even on
- str $ctx,[sp,#64]
- vrev32.8 @X[1],@X[1] @ big-endian
- str $inp,[sp,#68]
- mov $Xfer,sp
- vrev32.8 @X[2],@X[2]
- str $len,[sp,#72]
- vrev32.8 @X[3],@X[3]
- str $t2,[sp,#76] @ save original sp
- vadd.i32 $T0,$T0,@X[0]
- vadd.i32 $T1,$T1,@X[1]
- vst1.32 {$T0},[$Xfer,:128]!
- vadd.i32 $T2,$T2,@X[2]
- vst1.32 {$T1},[$Xfer,:128]!
- vadd.i32 $T3,$T3,@X[3]
- vst1.32 {$T2},[$Xfer,:128]!
- vst1.32 {$T3},[$Xfer,:128]!
-
- ldmia $ctx,{$A-$H}
- sub $Xfer,$Xfer,#64
- ldr $t1,[sp,#0]
- eor $t2,$t2,$t2
- eor $t3,$B,$C
- b .L_00_48
-
-.align 4
-.L_00_48:
-___
- &Xupdate(\&body_00_15);
- &Xupdate(\&body_00_15);
- &Xupdate(\&body_00_15);
- &Xupdate(\&body_00_15);
-$code.=<<___;
- teq $t1,#0 @ check for K256 terminator
- ldr $t1,[sp,#0]
- sub $Xfer,$Xfer,#64
- bne .L_00_48
-
- ldr $inp,[sp,#68]
- ldr $t0,[sp,#72]
- sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
- teq $inp,$t0
- it eq
- subeq $inp,$inp,#64 @ avoid SEGV
- vld1.8 {@X[0]},[$inp]! @ load next input block
- vld1.8 {@X[1]},[$inp]!
- vld1.8 {@X[2]},[$inp]!
- vld1.8 {@X[3]},[$inp]!
- it ne
- strne $inp,[sp,#68]
- mov $Xfer,sp
-___
- &Xpreload(\&body_00_15);
- &Xpreload(\&body_00_15);
- &Xpreload(\&body_00_15);
- &Xpreload(\&body_00_15);
-$code.=<<___;
- ldr $t0,[$t1,#0]
- add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
- ldr $t2,[$t1,#4]
- ldr $t3,[$t1,#8]
- ldr $t4,[$t1,#12]
- add $A,$A,$t0 @ accumulate
- ldr $t0,[$t1,#16]
- add $B,$B,$t2
- ldr $t2,[$t1,#20]
- add $C,$C,$t3
- ldr $t3,[$t1,#24]
- add $D,$D,$t4
- ldr $t4,[$t1,#28]
- add $E,$E,$t0
- str $A,[$t1],#4
- add $F,$F,$t2
- str $B,[$t1],#4
- add $G,$G,$t3
- str $C,[$t1],#4
- add $H,$H,$t4
- str $D,[$t1],#4
- stmia $t1,{$E-$H}
-
- ittte ne
- movne $Xfer,sp
- ldrne $t1,[sp,#0]
- eorne $t2,$t2,$t2
- ldreq sp,[sp,#76] @ restore original sp
- itt ne
- eorne $t3,$B,$C
- bne .L_00_48
-
- ldmia sp!,{r4-r12,pc}
-.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
-#endif
-___
-}}}
-######################################################################
-# ARMv8 stuff
-#
-{{{
-my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
-my @MSG=map("q$_",(8..11));
-my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
-my $Ktbl="r3";
-
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-
-# ifdef __thumb2__
-# define INST(a,b,c,d) .byte c,d|0xc,a,b
-# else
-# define INST(a,b,c,d) .byte a,b,c,d
-# endif
-
-.type sha256_block_data_order_armv8,%function
-.align 5
-sha256_block_data_order_armv8:
-.LARMv8:
- vld1.32 {$ABCD,$EFGH},[$ctx]
-# ifdef __thumb2__
- adr $Ktbl,.LARMv8
- sub $Ktbl,$Ktbl,#.LARMv8-K256
-# else
- adrl $Ktbl,K256
-# endif
- add $len,$inp,$len,lsl#6 @ len to point at the end of inp
-
-.Loop_v8:
- vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
- vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
- vld1.32 {$W0},[$Ktbl]!
- vrev32.8 @MSG[0],@MSG[0]
- vrev32.8 @MSG[1],@MSG[1]
- vrev32.8 @MSG[2],@MSG[2]
- vrev32.8 @MSG[3],@MSG[3]
- vmov $ABCD_SAVE,$ABCD @ offload
- vmov $EFGH_SAVE,$EFGH
- teq $inp,$len
-___
-for($i=0;$i<12;$i++) {
-$code.=<<___;
- vld1.32 {$W1},[$Ktbl]!
- vadd.i32 $W0,$W0,@MSG[0]
- sha256su0 @MSG[0],@MSG[1]
- vmov $abcd,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
- sha256su1 @MSG[0],@MSG[2],@MSG[3]
-___
- ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
-}
-$code.=<<___;
- vld1.32 {$W1},[$Ktbl]!
- vadd.i32 $W0,$W0,@MSG[0]
- vmov $abcd,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
-
- vld1.32 {$W0},[$Ktbl]!
- vadd.i32 $W1,$W1,@MSG[1]
- vmov $abcd,$ABCD
- sha256h $ABCD,$EFGH,$W1
- sha256h2 $EFGH,$abcd,$W1
-
- vld1.32 {$W1},[$Ktbl]
- vadd.i32 $W0,$W0,@MSG[2]
- sub $Ktbl,$Ktbl,#256-16 @ rewind
- vmov $abcd,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
-
- vadd.i32 $W1,$W1,@MSG[3]
- vmov $abcd,$ABCD
- sha256h $ABCD,$EFGH,$W1
- sha256h2 $EFGH,$abcd,$W1
-
- vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
- vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
- it ne
- bne .Loop_v8
-
- vst1.32 {$ABCD,$EFGH},[$ctx]
-
- ret @ bx lr
-.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
-#endif
-___
-}}}
-$code.=<<___;
-.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
-.align 2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm OPENSSL_armcap_P,4,4
-#endif
-___
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/@/ and !/^$/);
- print;
-}
-close SELF;
-
-{ my %opcode = (
- "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
- "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
-
- sub unsha256 {
- my ($mnemonic,$arg)=@_;
-
- if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
- my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
- |(($2&7)<<17)|(($2&8)<<4)
- |(($3&7)<<1) |(($3&8)<<2);
- # since ARMv7 instructions are always encoded little-endian.
- # correct solution is to use .inst directive, but older
- # assemblers don't implement it:-(
- sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
- $word&0xff,($word>>8)&0xff,
- ($word>>16)&0xff,($word>>24)&0xff,
- $mnemonic,$arg;
- }
- }
-}
-
-foreach (split($/,$code)) {
-
- s/\`([^\`]*)\`/eval $1/geo;
-
- s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
-
- s/\bret\b/bx lr/go or
- s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
-
- print $_,"\n";
-}
-
-close STDOUT; # enforce flush
diff --git a/arch/arm/lib/crypto/sha256-ce.S b/arch/arm/lib/crypto/sha256-ce.S
deleted file mode 100644
index ac2c9b01b22d..000000000000
--- a/arch/arm/lib/crypto/sha256-ce.S
+++ /dev/null
@@ -1,123 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * sha256-ce.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2015 Linaro Ltd.
- * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .text
- .arch armv8-a
- .fpu crypto-neon-fp-armv8
-
- k0 .req q7
- k1 .req q8
- rk .req r3
-
- ta0 .req q9
- ta1 .req q10
- tb0 .req q10
- tb1 .req q9
-
- dga .req q11
- dgb .req q12
-
- dg0 .req q13
- dg1 .req q14
- dg2 .req q15
-
- .macro add_only, ev, s0
- vmov dg2, dg0
- .ifnb \s0
- vld1.32 {k\ev}, [rk, :128]!
- .endif
- sha256h.32 dg0, dg1, tb\ev
- sha256h2.32 dg1, dg2, tb\ev
- .ifnb \s0
- vadd.u32 ta\ev, q\s0, k\ev
- .endif
- .endm
-
- .macro add_update, ev, s0, s1, s2, s3
- sha256su0.32 q\s0, q\s1
- add_only \ev, \s1
- sha256su1.32 q\s0, q\s2, q\s3
- .endm
-
- .align 6
-.Lsha256_rcon:
- .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
- .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
- .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
- .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
- .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
- .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
- .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
- .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
- .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
- .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
- .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
- .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
- .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
- .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
- .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
- .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-
- /*
- * void sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
- * const u8 *data, size_t nblocks);
- */
-ENTRY(sha256_ce_transform)
- /* load state */
- vld1.32 {dga-dgb}, [r0]
-
- /* load input */
-0: vld1.32 {q0-q1}, [r1]!
- vld1.32 {q2-q3}, [r1]!
- subs r2, r2, #1
-
-#ifndef CONFIG_CPU_BIG_ENDIAN
- vrev32.8 q0, q0
- vrev32.8 q1, q1
- vrev32.8 q2, q2
- vrev32.8 q3, q3
-#endif
-
- /* load first round constant */
- adr rk, .Lsha256_rcon
- vld1.32 {k0}, [rk, :128]!
-
- vadd.u32 ta0, q0, k0
- vmov dg0, dga
- vmov dg1, dgb
-
- add_update 1, 0, 1, 2, 3
- add_update 0, 1, 2, 3, 0
- add_update 1, 2, 3, 0, 1
- add_update 0, 3, 0, 1, 2
- add_update 1, 0, 1, 2, 3
- add_update 0, 1, 2, 3, 0
- add_update 1, 2, 3, 0, 1
- add_update 0, 3, 0, 1, 2
- add_update 1, 0, 1, 2, 3
- add_update 0, 1, 2, 3, 0
- add_update 1, 2, 3, 0, 1
- add_update 0, 3, 0, 1, 2
-
- add_only 1, 1
- add_only 0, 2
- add_only 1, 3
- add_only 0
-
- /* update state */
- vadd.u32 dga, dga, dg0
- vadd.u32 dgb, dgb, dg1
- bne 0b
-
- /* store new state */
- vst1.32 {dga-dgb}, [r0]
- bx lr
-ENDPROC(sha256_ce_transform)
diff --git a/arch/arm/lib/crypto/sha256.c b/arch/arm/lib/crypto/sha256.c
deleted file mode 100644
index 109192e54b0f..000000000000
--- a/arch/arm/lib/crypto/sha256.c
+++ /dev/null
@@ -1,64 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 optimized for ARM
- *
- * Copyright 2025 Google LLC
- */
-#include <asm/neon.h>
-#include <crypto/internal/sha2.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-asmlinkage void sha256_block_data_order_neon(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-asmlinkage void sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
-
-void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks)
-{
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
- static_branch_likely(&have_neon)) {
- kernel_neon_begin();
- if (static_branch_likely(&have_ce))
- sha256_ce_transform(state, data, nblocks);
- else
- sha256_block_data_order_neon(state, data, nblocks);
- kernel_neon_end();
- } else {
- sha256_blocks_arch(state, data, nblocks);
- }
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_simd);
-
-bool sha256_is_arch_optimized(void)
-{
- /* We always can use at least the ARM scalar implementation. */
- return true;
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
-static int __init sha256_arm_mod_init(void)
-{
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
- static_branch_enable(&have_neon);
- if (elf_hwcap2 & HWCAP2_SHA2)
- static_branch_enable(&have_ce);
- }
- return 0;
-}
-subsys_initcall(sha256_arm_mod_init);
-
-static void __exit sha256_arm_mod_exit(void)
-{
-}
-module_exit(sha256_arm_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-256 optimized for ARM");
diff --git a/arch/arm/mach-omap1/board-ams-delta.c b/arch/arm/mach-omap1/board-ams-delta.c
index 0daf6c5b5c1c..16392720296c 100644
--- a/arch/arm/mach-omap1/board-ams-delta.c
+++ b/arch/arm/mach-omap1/board-ams-delta.c
@@ -19,6 +19,7 @@
#include <linux/mtd/nand-gpio.h>
#include <linux/mtd/partitions.h>
#include <linux/platform_device.h>
+#include <linux/property.h>
#include <linux/regulator/consumer.h>
#include <linux/regulator/fixed.h>
#include <linux/regulator/machine.h>
@@ -175,20 +176,18 @@ static struct resource latch1_resources[] = {
#define LATCH1_LABEL "latch1"
-static struct bgpio_pdata latch1_pdata = {
- .label = LATCH1_LABEL,
- .base = -1,
- .ngpio = LATCH1_NGPIO,
+static const struct property_entry latch1_gpio_props[] = {
+ PROPERTY_ENTRY_STRING("label", LATCH1_LABEL),
+ PROPERTY_ENTRY_U32("ngpios", LATCH1_NGPIO),
+ { }
};
-static struct platform_device latch1_gpio_device = {
+static const struct platform_device_info latch1_gpio_devinfo = {
.name = "basic-mmio-gpio",
.id = 0,
- .resource = latch1_resources,
- .num_resources = ARRAY_SIZE(latch1_resources),
- .dev = {
- .platform_data = &latch1_pdata,
- },
+ .res = latch1_resources,
+ .num_res = ARRAY_SIZE(latch1_resources),
+ .properties = latch1_gpio_props,
};
#define LATCH1_PIN_LED_CAMERA 0
@@ -213,20 +212,18 @@ static struct resource latch2_resources[] = {
#define LATCH2_LABEL "latch2"
-static struct bgpio_pdata latch2_pdata = {
- .label = LATCH2_LABEL,
- .base = -1,
- .ngpio = LATCH2_NGPIO,
+static const struct property_entry latch2_gpio_props[] = {
+ PROPERTY_ENTRY_STRING("label", LATCH2_LABEL),
+ PROPERTY_ENTRY_U32("ngpios", LATCH2_NGPIO),
+ { }
};
-static struct platform_device latch2_gpio_device = {
+static struct platform_device_info latch2_gpio_devinfo = {
.name = "basic-mmio-gpio",
.id = 1,
- .resource = latch2_resources,
- .num_resources = ARRAY_SIZE(latch2_resources),
- .dev = {
- .platform_data = &latch2_pdata,
- },
+ .res = latch2_resources,
+ .num_res = ARRAY_SIZE(latch2_resources),
+ .properties = latch2_gpio_props,
};
#define LATCH2_PIN_LCD_VBLEN 0
@@ -542,8 +539,6 @@ static struct gpiod_lookup_table keybrd_pwr_gpio_table = {
};
static struct platform_device *ams_delta_devices[] __initdata = {
- &latch1_gpio_device,
- &latch2_gpio_device,
&ams_delta_kp_device,
&ams_delta_audio_device,
&ams_delta_serio_device,
@@ -697,6 +692,9 @@ static void __init ams_delta_init(void)
omap1_usb_init(&ams_delta_usb_config);
platform_add_devices(ams_delta_devices, ARRAY_SIZE(ams_delta_devices));
+ platform_device_register_full(&latch1_gpio_devinfo);
+ platform_device_register_full(&latch2_gpio_devinfo);
+
/*
* As soon as regulator consumers have been registered, assign their
* dev_names to consumer supply entries of respective regulators.
diff --git a/arch/arm/mach-s3c/mach-crag6410.c b/arch/arm/mach-s3c/mach-crag6410.c
index e5df2cb51ab2..028169c7debf 100644
--- a/arch/arm/mach-s3c/mach-crag6410.c
+++ b/arch/arm/mach-s3c/mach-crag6410.c
@@ -252,14 +252,17 @@ static struct resource crag6410_mmgpio_resource[] = {
[0] = DEFINE_RES_MEM_NAMED(S3C64XX_PA_XM0CSN4, 1, "dat"),
};
-static struct platform_device crag6410_mmgpio = {
+static const struct property_entry crag6410_mmgpio_props[] = {
+ PROPERTY_ENTRY_U32("gpio-mmio,base", MMGPIO_GPIO_BASE),
+ { }
+};
+
+static struct platform_device_info crag6410_mmgpio_devinfo = {
.name = "basic-mmio-gpio",
.id = -1,
- .resource = crag6410_mmgpio_resource,
- .num_resources = ARRAY_SIZE(crag6410_mmgpio_resource),
- .dev.platform_data = &(struct bgpio_pdata) {
- .base = MMGPIO_GPIO_BASE,
- },
+ .res = crag6410_mmgpio_resource,
+ .num_res = ARRAY_SIZE(crag6410_mmgpio_resource),
+ .properties = crag6410_mmgpio_props,
};
static struct platform_device speyside_device = {
@@ -373,7 +376,6 @@ static struct platform_device *crag6410_devices[] __initdata = {
&crag6410_gpio_keydev,
&crag6410_dm9k_device,
&s3c64xx_device_spi0,
- &crag6410_mmgpio,
&crag6410_lcd_powerdev,
&crag6410_backlight_device,
&speyside_device,
@@ -871,6 +873,7 @@ static void __init crag6410_machine_init(void)
pwm_add_table(crag6410_pwm_lookup, ARRAY_SIZE(crag6410_pwm_lookup));
platform_add_devices(crag6410_devices, ARRAY_SIZE(crag6410_devices));
+ platform_device_register_full(&crag6410_mmgpio_devinfo);
gpio_led_register_device(-1, &gpio_leds_pdata);
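
Note (editorial, not part of the patch): in both board files the bgpio_pdata platform data is replaced by software-node properties attached through platform_device_info, so the basic-mmio-gpio driver now discovers "label", "ngpios" and "gpio-mmio,base" via the device property API. A rough consumer-side sketch of that mechanism is shown below; the function and default values are illustrative and this is not the actual gpio-mmio parsing code.

#include <linux/device.h>
#include <linux/property.h>

/* Illustrative: reading back properties registered through
 * platform_device_register_full(); error handling is abbreviated. */
static int demo_read_mmio_gpio_props(struct device *dev)
{
	const char *label;
	u32 ngpios, base;

	if (device_property_read_string(dev, "label", &label))
		label = dev_name(dev);			/* property absent */
	if (device_property_read_u32(dev, "ngpios", &ngpios))
		return -EINVAL;
	if (device_property_read_u32(dev, "gpio-mmio,base", &base))
		base = 0;				/* property absent */

	dev_info(dev, "%s: %u lines, base %u\n", label, ngpios, base);
	return 0;
}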
diff --git a/arch/arm/mach-sa1100/assabet.c b/arch/arm/mach-sa1100/assabet.c
index 2b833aa0212b..bad8aa661e9d 100644
--- a/arch/arm/mach-sa1100/assabet.c
+++ b/arch/arm/mach-sa1100/assabet.c
@@ -80,7 +80,7 @@ void ASSABET_BCR_frob(unsigned int mask, unsigned int val)
{
unsigned long m = mask, v = val;
- assabet_bcr_gc->set_multiple(assabet_bcr_gc, &m, &v);
+ assabet_bcr_gc->set_multiple_rv(assabet_bcr_gc, &m, &v);
}
EXPORT_SYMBOL(ASSABET_BCR_frob);
diff --git a/arch/arm/mach-sa1100/neponset.c b/arch/arm/mach-sa1100/neponset.c
index 88fe79f0a4ed..6516598c8a71 100644
--- a/arch/arm/mach-sa1100/neponset.c
+++ b/arch/arm/mach-sa1100/neponset.c
@@ -126,7 +126,7 @@ void neponset_ncr_frob(unsigned int mask, unsigned int val)
unsigned long m = mask, v = val;
if (nep)
- n->gpio[0]->set_multiple(n->gpio[0], &m, &v);
+ n->gpio[0]->set_multiple_rv(n->gpio[0], &m, &v);
else
WARN(1, "nep unset\n");
}
diff --git a/arch/arm/mm/cache-feroceon-l2.c b/arch/arm/mm/cache-feroceon-l2.c
index 25dbd84a1aaf..2bfefb252ffd 100644
--- a/arch/arm/mm/cache-feroceon-l2.c
+++ b/arch/arm/mm/cache-feroceon-l2.c
@@ -295,7 +295,7 @@ static inline u32 read_extra_features(void)
return u;
}
-static inline void write_extra_features(u32 u)
+static inline void __init write_extra_features(u32 u)
{
__asm__("mcr p15, 1, %0, c15, c1, 0" : : "r" (u));
}
diff --git a/arch/arm/mm/cache-tauros2.c b/arch/arm/mm/cache-tauros2.c
index b1e1aba602f7..bfe166ccace0 100644
--- a/arch/arm/mm/cache-tauros2.c
+++ b/arch/arm/mm/cache-tauros2.c
@@ -177,7 +177,7 @@ static inline void __init write_actlr(u32 actlr)
__asm__("mcr p15, 0, %0, c1, c0, 1\n" : : "r" (actlr));
}
-static void enable_extra_feature(unsigned int features)
+static void __init enable_extra_feature(unsigned int features)
{
u32 u;
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 27c1d5ebcd91..b07e699aaa3c 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -482,3 +482,5 @@
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr
diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile
index cb044bfd145d..cf8cd39ab804 100644
--- a/arch/arm/vdso/Makefile
+++ b/arch/arm/vdso/Makefile
@@ -26,7 +26,7 @@ CPPFLAGS_vdso.lds += -P -C -U$(ARCH)
CFLAGS_REMOVE_vdso.o = -pg
# Force -O2 to avoid libgcc dependencies
-CFLAGS_REMOVE_vgettimeofday.o = -pg -Os $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS)
+CFLAGS_REMOVE_vgettimeofday.o = -pg -Os $(RANDSTRUCT_CFLAGS) $(KSTACK_ERASE_CFLAGS) $(GCC_PLUGINS_CFLAGS)
ifeq ($(c-gettimeofday-y),)
CFLAGS_vgettimeofday.o = -O2
else