Diffstat (limited to 'arch/arm')
-rw-r--r--  arch/arm/Kconfig | 6
-rw-r--r--  arch/arm/Makefile | 2
-rw-r--r--  arch/arm/boot/compressed/Makefile | 2
-rw-r--r--  arch/arm/boot/dts/allwinner/sun8i-v3s.dtsi | 2
-rw-r--r--  arch/arm/configs/exynos_defconfig | 2
-rw-r--r--  arch/arm/configs/milbeaut_m10v_defconfig | 3
-rw-r--r--  arch/arm/configs/multi_v7_defconfig | 9
-rw-r--r--  arch/arm/configs/omap2plus_defconfig | 2
-rw-r--r--  arch/arm/configs/pxa_defconfig | 2
-rw-r--r--  arch/arm/crypto/Kconfig | 41
-rw-r--r--  arch/arm/crypto/Makefile | 21
-rw-r--r--  arch/arm/crypto/sha1-armv4-large.S | 507
-rw-r--r--  arch/arm/crypto/sha1-armv7-neon.S | 634
-rw-r--r--  arch/arm/crypto/sha1-ce-core.S | 123
-rw-r--r--  arch/arm/crypto/sha1-ce-glue.c | 72
-rw-r--r--  arch/arm/crypto/sha1_glue.c | 75
-rw-r--r--  arch/arm/crypto/sha1_neon_glue.c | 83
-rw-r--r--  arch/arm/crypto/sha512-armv4.pl | 657
-rw-r--r--  arch/arm/crypto/sha512-glue.c | 110
-rw-r--r--  arch/arm/crypto/sha512-neon-glue.c | 75
-rw-r--r--  arch/arm/crypto/sha512.h | 3
-rw-r--r--  arch/arm/kernel/entry-common.S | 2
-rw-r--r--  arch/arm/kernel/ptrace.c | 6
-rw-r--r--  arch/arm/lib/.gitignore | 4
-rw-r--r--  arch/arm/lib/Makefile | 8
-rw-r--r--  arch/arm/lib/crc-t10dif-core.S | 468
-rw-r--r--  arch/arm/lib/crc-t10dif.c | 72
-rw-r--r--  arch/arm/lib/crc32-core.S | 306
-rw-r--r--  arch/arm/lib/crc32.c | 123
-rw-r--r--  arch/arm/lib/crypto/.gitignore | 3
-rw-r--r--  arch/arm/lib/crypto/Kconfig | 31
-rw-r--r--  arch/arm/lib/crypto/Makefile | 32
-rw-r--r--  arch/arm/lib/crypto/blake2s-core.S | 306
-rw-r--r--  arch/arm/lib/crypto/blake2s-glue.c | 7
-rw-r--r--  arch/arm/lib/crypto/chacha-glue.c | 138
-rw-r--r--  arch/arm/lib/crypto/chacha-neon-core.S | 643
-rw-r--r--  arch/arm/lib/crypto/chacha-scalar-core.S | 444
-rw-r--r--  arch/arm/lib/crypto/poly1305-armv4.pl | 1236
-rw-r--r--  arch/arm/lib/crypto/poly1305-glue.c | 80
-rw-r--r--  arch/arm/lib/crypto/sha256-armv4.pl | 724
-rw-r--r--  arch/arm/lib/crypto/sha256-ce.S | 123
-rw-r--r--  arch/arm/lib/crypto/sha256.c | 64
-rw-r--r--  arch/arm/mach-omap1/board-ams-delta.c | 42
-rw-r--r--  arch/arm/mach-s3c/mach-crag6410.c | 17
-rw-r--r--  arch/arm/mach-sa1100/assabet.c | 2
-rw-r--r--  arch/arm/mach-sa1100/neponset.c | 2
-rw-r--r--  arch/arm/mm/cache-feroceon-l2.c | 2
-rw-r--r--  arch/arm/mm/cache-tauros2.c | 2
-rw-r--r--  arch/arm/tools/syscall.tbl | 2
-rw-r--r--  arch/arm/vdso/Makefile | 2
50 files changed, 54 insertions, 7268 deletions
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 3072731fe09c..a6e80653abd1 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -8,8 +8,6 @@ config ARM
select ARCH_HAS_CACHE_LINE_SIZE if OF
select ARCH_HAS_CPU_CACHE_ALIASING
select ARCH_HAS_CPU_FINALIZE_INIT if MMU
- select ARCH_HAS_CRC32 if KERNEL_MODE_NEON
- select ARCH_HAS_CRC_T10DIF if KERNEL_MODE_NEON
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL if MMU
select ARCH_HAS_DMA_ALLOC if MMU
@@ -87,11 +85,11 @@ config ARM
select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL
select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN
+ select HAVE_ARCH_KSTACK_ERASE
select HAVE_ARCH_MMAP_RND_BITS if MMU
select HAVE_ARCH_PFN_VALID
select HAVE_ARCH_SECCOMP
select HAVE_ARCH_SECCOMP_FILTER if AEABI && !OABI_COMPAT
- select HAVE_ARCH_STACKLEAK
select HAVE_ARCH_THREAD_STRUCT_WHITELIST
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE if ARM_LPAE
@@ -121,7 +119,7 @@ config ARM
select HAVE_KERNEL_XZ
select HAVE_KPROBES if !XIP_KERNEL && !CPU_ENDIAN_BE32 && !CPU_V7M
select HAVE_KRETPROBES if HAVE_KPROBES
- select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if (LD_VERSION >= 23600 || LD_CAN_USE_KEEP_IN_OVERLAY)
+ select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if (LD_VERSION >= 23600 || LD_IS_LLD) && LD_CAN_USE_KEEP_IN_OVERLAY
select HAVE_MOD_ARCH_SPECIFIC
select HAVE_NMI
select HAVE_OPTPROBES if !THUMB2_KERNEL
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 4808d3ed98e4..e31e95ffd33f 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -149,7 +149,7 @@ endif
# Need -Uarm for gcc < 3.x
KBUILD_CPPFLAGS +=$(cpp-y)
KBUILD_CFLAGS +=$(CFLAGS_ABI) $(CFLAGS_ISA) $(arch-y) $(tune-y) $(call cc-option,-mshort-load-bytes,$(call cc-option,-malignment-traps,)) -msoft-float -Uarm
-KBUILD_AFLAGS +=$(CFLAGS_ABI) $(AFLAGS_ISA) -Wa,$(arch-y) $(tune-y) -include asm/unified.h -msoft-float
+KBUILD_AFLAGS +=$(CFLAGS_ABI) $(AFLAGS_ISA) -Wa,$(arch-y) $(tune-y) -include $(srctree)/arch/arm/include/asm/unified.h -msoft-float
KBUILD_RUSTFLAGS += --target=arm-unknown-linux-gnueabi
CHECKFLAGS += -D__arm__
diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile
index d61369b1eabe..a159120d1e42 100644
--- a/arch/arm/boot/compressed/Makefile
+++ b/arch/arm/boot/compressed/Makefile
@@ -9,7 +9,6 @@ OBJS =
HEAD = head.o
OBJS += misc.o decompress.o
-CFLAGS_decompress.o += $(DISABLE_STACKLEAK_PLUGIN)
ifeq ($(CONFIG_DEBUG_UNCOMPRESS),y)
OBJS += debug.o
AFLAGS_head.o += -DDEBUG
@@ -96,6 +95,7 @@ KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
ccflags-y := -fpic $(call cc-option,-mno-single-pic-base,) -fno-builtin \
-I$(srctree)/scripts/dtc/libfdt -fno-stack-protector \
+ $(DISABLE_KSTACK_ERASE) \
-I$(obj)
ccflags-remove-$(CONFIG_FUNCTION_TRACER) += -pg
asflags-y := -DZIMAGE
diff --git a/arch/arm/boot/dts/allwinner/sun8i-v3s.dtsi b/arch/arm/boot/dts/allwinner/sun8i-v3s.dtsi
index f909b1d4dbca..e82cf312da25 100644
--- a/arch/arm/boot/dts/allwinner/sun8i-v3s.dtsi
+++ b/arch/arm/boot/dts/allwinner/sun8i-v3s.dtsi
@@ -652,7 +652,7 @@
reg = <0x01cb4000 0x3000>;
interrupts = <GIC_SPI 84 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&ccu CLK_BUS_CSI>,
- <&ccu CLK_CSI1_SCLK>,
+ <&ccu CLK_CSI_SCLK>,
<&ccu CLK_DRAM_CSI>;
clock-names = "bus", "mod", "ram";
resets = <&ccu RST_BUS_CSI>;
diff --git a/arch/arm/configs/exynos_defconfig b/arch/arm/configs/exynos_defconfig
index f71af368674c..6915c766923a 100644
--- a/arch/arm/configs/exynos_defconfig
+++ b/arch/arm/configs/exynos_defconfig
@@ -363,8 +363,6 @@ CONFIG_CRYPTO_USER_API_HASH=m
CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
-CONFIG_CRYPTO_SHA1_ARM_NEON=m
-CONFIG_CRYPTO_SHA512_ARM=m
CONFIG_CRYPTO_AES_ARM_BS=m
CONFIG_CRYPTO_CHACHA20_NEON=m
CONFIG_CRYPTO_DEV_EXYNOS_RNG=y
diff --git a/arch/arm/configs/milbeaut_m10v_defconfig b/arch/arm/configs/milbeaut_m10v_defconfig
index 242e7d5a3f68..a3be0b2ede09 100644
--- a/arch/arm/configs/milbeaut_m10v_defconfig
+++ b/arch/arm/configs/milbeaut_m10v_defconfig
@@ -98,9 +98,6 @@ CONFIG_CRYPTO_SELFTESTS=y
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_SEQIV=m
CONFIG_CRYPTO_GHASH_ARM_CE=m
-CONFIG_CRYPTO_SHA1_ARM_NEON=m
-CONFIG_CRYPTO_SHA1_ARM_CE=m
-CONFIG_CRYPTO_SHA512_ARM=m
CONFIG_CRYPTO_AES_ARM=m
CONFIG_CRYPTO_AES_ARM_BS=m
CONFIG_CRYPTO_AES_ARM_CE=m
diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig
index 50c170b4619f..c75808b94be6 100644
--- a/arch/arm/configs/multi_v7_defconfig
+++ b/arch/arm/configs/multi_v7_defconfig
@@ -791,8 +791,11 @@ CONFIG_SND=m
CONFIG_SND_HDA_TEGRA=m
CONFIG_SND_HDA_INPUT_BEEP=y
CONFIG_SND_HDA_PATCH_LOADER=y
-CONFIG_SND_HDA_CODEC_REALTEK=m
+CONFIG_SND_HDA_CODEC_REALTEK=y
+CONFIG_SND_HDA_CODEC_REALTEK_LIB=m
+CONFIG_SND_HDA_CODEC_ALC269=m
CONFIG_SND_HDA_CODEC_HDMI=m
+CONFIG_SND_HDA_CODEC_HDMI_TEGRA=m
CONFIG_SND_USB_AUDIO=m
CONFIG_SND_SOC=m
CONFIG_SND_ATMEL_SOC=m
@@ -1280,9 +1283,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
CONFIG_CRYPTO_GHASH_ARM_CE=m
-CONFIG_CRYPTO_SHA1_ARM_NEON=m
-CONFIG_CRYPTO_SHA1_ARM_CE=m
-CONFIG_CRYPTO_SHA512_ARM=m
CONFIG_CRYPTO_AES_ARM=m
CONFIG_CRYPTO_AES_ARM_BS=m
CONFIG_CRYPTO_AES_ARM_CE=m
@@ -1298,7 +1298,6 @@ CONFIG_CRYPTO_DEV_MARVELL_CESA=m
CONFIG_CRYPTO_DEV_QCE=m
CONFIG_CRYPTO_DEV_QCOM_RNG=m
CONFIG_CRYPTO_DEV_ROCKCHIP=m
-CONFIG_CRYPTO_DEV_STM32_CRC=m
CONFIG_CRYPTO_DEV_STM32_HASH=m
CONFIG_CRYPTO_DEV_STM32_CRYP=m
CONFIG_CMA_SIZE_MBYTES=64
diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig
index 9f9780c8e62a..046467637901 100644
--- a/arch/arm/configs/omap2plus_defconfig
+++ b/arch/arm/configs/omap2plus_defconfig
@@ -704,8 +704,6 @@ CONFIG_NLS_ISO8859_1=y
CONFIG_SECURITY=y
CONFIG_CRYPTO_MICHAEL_MIC=y
CONFIG_CRYPTO_GHASH_ARM_CE=m
-CONFIG_CRYPTO_SHA1_ARM_NEON=m
-CONFIG_CRYPTO_SHA512_ARM=m
CONFIG_CRYPTO_AES_ARM=m
CONFIG_CRYPTO_AES_ARM_BS=m
CONFIG_CRYPTO_CHACHA20_NEON=m
diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig
index ff29c5b0e9c9..1a80602c1284 100644
--- a/arch/arm/configs/pxa_defconfig
+++ b/arch/arm/configs/pxa_defconfig
@@ -658,8 +658,6 @@ CONFIG_CRYPTO_ANUBIS=m
CONFIG_CRYPTO_XCBC=m
CONFIG_CRYPTO_DEFLATE=y
CONFIG_CRYPTO_LZO=y
-CONFIG_CRYPTO_SHA1_ARM=m
-CONFIG_CRYPTO_SHA512_ARM=m
CONFIG_CRYPTO_AES_ARM=m
CONFIG_FONTS=y
CONFIG_FONT_8x8=y
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 7efb9a8596e4..1e5f3cdf691c 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -62,47 +62,6 @@ config CRYPTO_BLAKE2B_NEON
much faster than the SHA-2 family and slightly faster than
SHA-1.
-config CRYPTO_SHA1_ARM
- tristate "Hash functions: SHA-1"
- select CRYPTO_SHA1
- select CRYPTO_HASH
- help
- SHA-1 secure hash algorithm (FIPS 180)
-
- Architecture: arm
-
-config CRYPTO_SHA1_ARM_NEON
- tristate "Hash functions: SHA-1 (NEON)"
- depends on KERNEL_MODE_NEON
- select CRYPTO_SHA1_ARM
- select CRYPTO_SHA1
- select CRYPTO_HASH
- help
- SHA-1 secure hash algorithm (FIPS 180)
-
- Architecture: arm using
- - NEON (Advanced SIMD) extensions
-
-config CRYPTO_SHA1_ARM_CE
- tristate "Hash functions: SHA-1 (ARMv8 Crypto Extensions)"
- depends on KERNEL_MODE_NEON
- select CRYPTO_SHA1_ARM
- select CRYPTO_HASH
- help
- SHA-1 secure hash algorithm (FIPS 180)
-
- Architecture: arm using ARMv8 Crypto Extensions
-
-config CRYPTO_SHA512_ARM
- tristate "Hash functions: SHA-384 and SHA-512 (NEON)"
- select CRYPTO_HASH
- depends on !CPU_V7M
- help
- SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
-
- Architecture: arm using
- - NEON (Advanced SIMD) extensions
-
config CRYPTO_AES_ARM
tristate "Ciphers: AES"
select CRYPTO_ALGAPI
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 8479137c6e80..4f23999ae17d 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -5,38 +5,17 @@
obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
-obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
-obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
-obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o
obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
aes-arm-y := aes-cipher-core.o aes-cipher-glue.o
aes-arm-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
-sha1-arm-y := sha1-armv4-large.o sha1_glue.o
-sha1-arm-neon-y := sha1-armv7-neon.o sha1_neon_glue.o
-sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o
-sha512-arm-y := sha512-core.o sha512-glue.o $(sha512-arm-neon-y)
blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o
-sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o
aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
curve25519-neon-y := curve25519-core.o curve25519-glue.o
-
-quiet_cmd_perl = PERL $@
- cmd_perl = $(PERL) $(<) > $(@)
-
-$(obj)/%-core.S: $(src)/%-armv4.pl
- $(call cmd,perl)
-
-clean-files += sha512-core.S
-
-aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1
-
-AFLAGS_sha512-core.o += $(aflags-thumb2-y)
diff --git a/arch/arm/crypto/sha1-armv4-large.S b/arch/arm/crypto/sha1-armv4-large.S
deleted file mode 100644
index 1c8b685149f2..000000000000
--- a/arch/arm/crypto/sha1-armv4-large.S
+++ /dev/null
@@ -1,507 +0,0 @@
-#define __ARM_ARCH__ __LINUX_ARM_ARCH__
-@ SPDX-License-Identifier: GPL-2.0
-
-@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
-@ has relicensed it under the GPLv2. Therefore this program is free software;
-@ you can redistribute it and/or modify it under the terms of the GNU General
-@ Public License version 2 as published by the Free Software Foundation.
-@
-@ The original headers, including the original license headers, are
-@ included below for completeness.
-
-@ ====================================================================
-@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-@ project. The module is, however, dual licensed under OpenSSL and
-@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see https://www.openssl.org/~appro/cryptogams/.
-@ ====================================================================
-
-@ sha1_block procedure for ARMv4.
-@
-@ January 2007.
-
-@ Size/performance trade-off
-@ ====================================================================
-@ impl size in bytes comp cycles[*] measured performance
-@ ====================================================================
-@ thumb 304 3212 4420
-@ armv4-small 392/+29% 1958/+64% 2250/+96%
-@ armv4-compact 740/+89% 1552/+26% 1840/+22%
-@ armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
-@ full unroll ~5100/+260% ~1260/+4% ~1300/+5%
-@ ====================================================================
-@ thumb = same as 'small' but in Thumb instructions[**] and
-@ with recurring code in two private functions;
-@ small = detached Xload/update, loops are folded;
-@ compact = detached Xload/update, 5x unroll;
-@ large = interleaved Xload/update, 5x unroll;
-@ full unroll = interleaved Xload/update, full unroll, estimated[!];
-@
-@ [*] Manually counted instructions in "grand" loop body. Measured
-@ performance is affected by prologue and epilogue overhead,
-@ i-cache availability, branch penalties, etc.
-@ [**] While each Thumb instruction is twice smaller, they are not as
-@ diverse as ARM ones: e.g., there are only two arithmetic
-@ instructions with 3 arguments, no [fixed] rotate, addressing
-@ modes are limited. As result it takes more instructions to do
-@ the same job in Thumb, therefore the code is never twice as
-@ small and always slower.
-@ [***] which is also ~35% better than compiler generated code. Dual-
-@ issue Cortex A8 core was measured to process input block in
-@ ~990 cycles.
-
-@ August 2010.
-@
-@ Rescheduling for dual-issue pipeline resulted in 13% improvement on
-@ Cortex A8 core and in absolute terms ~870 cycles per input block
-@ [or 13.6 cycles per byte].
-
-@ February 2011.
-@
-@ Profiler-assisted and platform-specific optimization resulted in 10%
-@ improvement on Cortex A8 core and 12.2 cycles per byte.
-
-#include <linux/linkage.h>
-
-.text
-
-.align 2
-ENTRY(sha1_block_data_order)
- stmdb sp!,{r4-r12,lr}
- add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
- ldmia r0,{r3,r4,r5,r6,r7}
-.Lloop:
- ldr r8,.LK_00_19
- mov r14,sp
- sub sp,sp,#15*4
- mov r5,r5,ror#30
- mov r6,r6,ror#30
- mov r7,r7,ror#30 @ [6]
-.L_00_15:
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r7,r8,r7,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r5,r6 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r7,r8,r7,ror#2 @ E+=K_00_19
- eor r10,r5,r6 @ F_xx_xx
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r4,r10,ror#2
- add r7,r7,r9 @ E+=X[i]
- eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r7,r7,r10 @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r6,r8,r6,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r4,r5 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r6,r8,r6,ror#2 @ E+=K_00_19
- eor r10,r4,r5 @ F_xx_xx
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r3,r10,ror#2
- add r6,r6,r9 @ E+=X[i]
- eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r6,r6,r10 @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r5,r8,r5,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r3,r4 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r5,r8,r5,ror#2 @ E+=K_00_19
- eor r10,r3,r4 @ F_xx_xx
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r7,r10,ror#2
- add r5,r5,r9 @ E+=X[i]
- eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r5,r5,r10 @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r4,r8,r4,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r7,r3 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r4,r8,r4,ror#2 @ E+=K_00_19
- eor r10,r7,r3 @ F_xx_xx
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r6,r10,ror#2
- add r4,r4,r9 @ E+=X[i]
- eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r4,r4,r10 @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r3,r8,r3,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r6,r7 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r3,r8,r3,ror#2 @ E+=K_00_19
- eor r10,r6,r7 @ F_xx_xx
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r5,r10,ror#2
- add r3,r3,r9 @ E+=X[i]
- eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r3,r3,r10 @ E+=F_00_19(B,C,D)
- cmp r14,sp
- bne .L_00_15 @ [((11+4)*5+2)*3]
- sub sp,sp,#25*4
-#if __ARM_ARCH__<7
- ldrb r10,[r1,#2]
- ldrb r9,[r1,#3]
- ldrb r11,[r1,#1]
- add r7,r8,r7,ror#2 @ E+=K_00_19
- ldrb r12,[r1],#4
- orr r9,r9,r10,lsl#8
- eor r10,r5,r6 @ F_xx_xx
- orr r9,r9,r11,lsl#16
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- orr r9,r9,r12,lsl#24
-#else
- ldr r9,[r1],#4 @ handles unaligned
- add r7,r8,r7,ror#2 @ E+=K_00_19
- eor r10,r5,r6 @ F_xx_xx
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
-#ifdef __ARMEL__
- rev r9,r9 @ byte swap
-#endif
-#endif
- and r10,r4,r10,ror#2
- add r7,r7,r9 @ E+=X[i]
- eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
- str r9,[r14,#-4]!
- add r7,r7,r10 @ E+=F_00_19(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r6,r8,r6,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r4,r5 @ F_xx_xx
- mov r9,r9,ror#31
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r3,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r6,r6,r9 @ E+=X[i]
- eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
- add r6,r6,r10 @ E+=F_00_19(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r5,r8,r5,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r3,r4 @ F_xx_xx
- mov r9,r9,ror#31
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r7,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r5,r5,r9 @ E+=X[i]
- eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
- add r5,r5,r10 @ E+=F_00_19(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r4,r8,r4,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r7,r3 @ F_xx_xx
- mov r9,r9,ror#31
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r6,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r4,r4,r9 @ E+=X[i]
- eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
- add r4,r4,r10 @ E+=F_00_19(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r3,r8,r3,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r6,r7 @ F_xx_xx
- mov r9,r9,ror#31
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r5,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r3,r3,r9 @ E+=X[i]
- eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
- add r3,r3,r10 @ E+=F_00_19(B,C,D)
-
- ldr r8,.LK_20_39 @ [+15+16*4]
- cmn sp,#0 @ [+3], clear carry to denote 20_39
-.L_20_39_or_60_79:
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r7,r8,r7,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r5,r6 @ F_xx_xx
- mov r9,r9,ror#31
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r4,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r7,r7,r9 @ E+=X[i]
- add r7,r7,r10 @ E+=F_20_39(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r6,r8,r6,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r4,r5 @ F_xx_xx
- mov r9,r9,ror#31
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r3,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r6,r6,r9 @ E+=X[i]
- add r6,r6,r10 @ E+=F_20_39(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r5,r8,r5,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r3,r4 @ F_xx_xx
- mov r9,r9,ror#31
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r7,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r5,r5,r9 @ E+=X[i]
- add r5,r5,r10 @ E+=F_20_39(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r4,r8,r4,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r7,r3 @ F_xx_xx
- mov r9,r9,ror#31
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r6,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r4,r4,r9 @ E+=X[i]
- add r4,r4,r10 @ E+=F_20_39(B,C,D)
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r3,r8,r3,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r6,r7 @ F_xx_xx
- mov r9,r9,ror#31
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- eor r10,r5,r10,ror#2 @ F_xx_xx
- @ F_xx_xx
- add r3,r3,r9 @ E+=X[i]
- add r3,r3,r10 @ E+=F_20_39(B,C,D)
- ARM( teq r14,sp ) @ preserve carry
- THUMB( mov r11,sp )
- THUMB( teq r14,r11 ) @ preserve carry
- bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
- bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
-
- ldr r8,.LK_40_59
- sub sp,sp,#20*4 @ [+2]
-.L_40_59:
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r7,r8,r7,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r5,r6 @ F_xx_xx
- mov r9,r9,ror#31
- add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r4,r10,ror#2 @ F_xx_xx
- and r11,r5,r6 @ F_xx_xx
- add r7,r7,r9 @ E+=X[i]
- add r7,r7,r10 @ E+=F_40_59(B,C,D)
- add r7,r7,r11,ror#2
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r6,r8,r6,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r4,r5 @ F_xx_xx
- mov r9,r9,ror#31
- add r6,r6,r7,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r3,r10,ror#2 @ F_xx_xx
- and r11,r4,r5 @ F_xx_xx
- add r6,r6,r9 @ E+=X[i]
- add r6,r6,r10 @ E+=F_40_59(B,C,D)
- add r6,r6,r11,ror#2
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r5,r8,r5,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r3,r4 @ F_xx_xx
- mov r9,r9,ror#31
- add r5,r5,r6,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r7,r10,ror#2 @ F_xx_xx
- and r11,r3,r4 @ F_xx_xx
- add r5,r5,r9 @ E+=X[i]
- add r5,r5,r10 @ E+=F_40_59(B,C,D)
- add r5,r5,r11,ror#2
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r4,r8,r4,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r7,r3 @ F_xx_xx
- mov r9,r9,ror#31
- add r4,r4,r5,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r6,r10,ror#2 @ F_xx_xx
- and r11,r7,r3 @ F_xx_xx
- add r4,r4,r9 @ E+=X[i]
- add r4,r4,r10 @ E+=F_40_59(B,C,D)
- add r4,r4,r11,ror#2
- ldr r9,[r14,#15*4]
- ldr r10,[r14,#13*4]
- ldr r11,[r14,#7*4]
- add r3,r8,r3,ror#2 @ E+=K_xx_xx
- ldr r12,[r14,#2*4]
- eor r9,r9,r10
- eor r11,r11,r12 @ 1 cycle stall
- eor r10,r6,r7 @ F_xx_xx
- mov r9,r9,ror#31
- add r3,r3,r4,ror#27 @ E+=ROR(A,27)
- eor r9,r9,r11,ror#31
- str r9,[r14,#-4]!
- and r10,r5,r10,ror#2 @ F_xx_xx
- and r11,r6,r7 @ F_xx_xx
- add r3,r3,r9 @ E+=X[i]
- add r3,r3,r10 @ E+=F_40_59(B,C,D)
- add r3,r3,r11,ror#2
- cmp r14,sp
- bne .L_40_59 @ [+((12+5)*5+2)*4]
-
- ldr r8,.LK_60_79
- sub sp,sp,#20*4
- cmp sp,#0 @ set carry to denote 60_79
- b .L_20_39_or_60_79 @ [+4], spare 300 bytes
-.L_done:
- add sp,sp,#80*4 @ "deallocate" stack frame
- ldmia r0,{r8,r9,r10,r11,r12}
- add r3,r8,r3
- add r4,r9,r4
- add r5,r10,r5,ror#2
- add r6,r11,r6,ror#2
- add r7,r12,r7,ror#2
- stmia r0,{r3,r4,r5,r6,r7}
- teq r1,r2
- bne .Lloop @ [+18], total 1307
-
- ldmia sp!,{r4-r12,pc}
-.align 2
-.LK_00_19: .word 0x5a827999
-.LK_20_39: .word 0x6ed9eba1
-.LK_40_59: .word 0x8f1bbcdc
-.LK_60_79: .word 0xca62c1d6
-ENDPROC(sha1_block_data_order)
-.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
-.align 2
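
The file removed above is CRYPTOGAMS' unrolled implementation of the FIPS 180 SHA-1 block function: the .LK_00_19 through .LK_60_79 words are the four stage constants, and the F_00_19/F_20_39/F_40_59 annotations mark the three boolean round functions. As a reference for what the unrolled rounds compute, a minimal portable C sketch of that structure follows; it is illustrative only, and names such as sha1_block and rol32 are invented for the sketch rather than taken from the kernel sources.

#include <stdint.h>

static uint32_t rol32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

/* One 64-byte block; state[] holds the five 32-bit chaining words. */
static void sha1_block(uint32_t state[5], const uint8_t in[64])
{
    static const uint32_t K[4] = {
        0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6  /* .LK_00_19 .. .LK_60_79 */
    };
    uint32_t w[80], a, b, c, d, e, f, t;
    int i;

    for (i = 0; i < 16; i++)        /* big-endian message load */
        w[i] = (uint32_t)in[4 * i] << 24 | (uint32_t)in[4 * i + 1] << 16 |
               (uint32_t)in[4 * i + 2] << 8 | in[4 * i + 3];
    for (; i < 80; i++)             /* message schedule */
        w[i] = rol32(w[i - 3] ^ w[i - 8] ^ w[i - 14] ^ w[i - 16], 1);

    a = state[0]; b = state[1]; c = state[2]; d = state[3]; e = state[4];

    for (i = 0; i < 80; i++) {
        if (i < 20)
            f = (b & c) | (~b & d);            /* F_00_19: choose  */
        else if (i < 40)
            f = b ^ c ^ d;                     /* F_20_39: parity  */
        else if (i < 60)
            f = (b & c) | (b & d) | (c & d);   /* F_40_59: majority */
        else
            f = b ^ c ^ d;                     /* F_60_79: parity  */
        t = rol32(a, 5) + f + e + K[i / 20] + w[i];
        e = d; d = c; c = rol32(b, 30); b = a; a = t;
    }

    state[0] += a; state[1] += b; state[2] += c; state[3] += d; state[4] += e;
}

The assembly above interleaves the message load and schedule (the ldrb/orr or rev sequences feeding r9) with the round updates rather than precomputing w[] up front, which is the "interleaved Xload/update" trade-off described in its header comments.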
diff --git a/arch/arm/crypto/sha1-armv7-neon.S b/arch/arm/crypto/sha1-armv7-neon.S
deleted file mode 100644
index 28d816a6a530..000000000000
--- a/arch/arm/crypto/sha1-armv7-neon.S
+++ /dev/null
@@ -1,634 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
- *
- * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-.syntax unified
-.fpu neon
-
-.text
-
-
-/* Context structure */
-
-#define state_h0 0
-#define state_h1 4
-#define state_h2 8
-#define state_h3 12
-#define state_h4 16
-
-
-/* Constants */
-
-#define K1 0x5A827999
-#define K2 0x6ED9EBA1
-#define K3 0x8F1BBCDC
-#define K4 0xCA62C1D6
-.align 4
-.LK_VEC:
-.LK1: .long K1, K1, K1, K1
-.LK2: .long K2, K2, K2, K2
-.LK3: .long K3, K3, K3, K3
-.LK4: .long K4, K4, K4, K4
-
-
-/* Register macros */
-
-#define RSTATE r0
-#define RDATA r1
-#define RNBLKS r2
-#define ROLDSTACK r3
-#define RWK lr
-
-#define _a r4
-#define _b r5
-#define _c r6
-#define _d r7
-#define _e r8
-
-#define RT0 r9
-#define RT1 r10
-#define RT2 r11
-#define RT3 r12
-
-#define W0 q0
-#define W1 q7
-#define W2 q2
-#define W3 q3
-#define W4 q4
-#define W5 q6
-#define W6 q5
-#define W7 q1
-
-#define tmp0 q8
-#define tmp1 q9
-#define tmp2 q10
-#define tmp3 q11
-
-#define qK1 q12
-#define qK2 q13
-#define qK3 q14
-#define qK4 q15
-
-#ifdef CONFIG_CPU_BIG_ENDIAN
-#define ARM_LE(code...)
-#else
-#define ARM_LE(code...) code
-#endif
-
-/* Round function macros. */
-
-#define WK_offs(i) (((i) & 15) * 4)
-
-#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ldr RT3, [sp, WK_offs(i)]; \
- pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- bic RT0, d, b; \
- add e, e, a, ror #(32 - 5); \
- and RT1, c, b; \
- pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- add RT0, RT0, RT3; \
- add e, e, RT1; \
- ror b, #(32 - 30); \
- pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- add e, e, RT0;
-
-#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ldr RT3, [sp, WK_offs(i)]; \
- pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- eor RT0, d, b; \
- add e, e, a, ror #(32 - 5); \
- eor RT0, RT0, c; \
- pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- add e, e, RT3; \
- ror b, #(32 - 30); \
- pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- add e, e, RT0; \
-
-#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ldr RT3, [sp, WK_offs(i)]; \
- pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- eor RT0, b, c; \
- and RT1, b, c; \
- add e, e, a, ror #(32 - 5); \
- pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- and RT0, RT0, d; \
- add RT1, RT1, RT3; \
- add e, e, RT0; \
- ror b, #(32 - 30); \
- pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
- add e, e, RT1;
-
-#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
-
-#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
-
-#define R(a,b,c,d,e,f,i) \
- _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
- W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
-
-#define dummy(...)
-
-
-/* Input expansion macros. */
-
-/********* Precalc macros for rounds 0-15 *************************************/
-
-#define W_PRECALC_00_15() \
- add RWK, sp, #(WK_offs(0)); \
- \
- vld1.32 {W0, W7}, [RDATA]!; \
- ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \
- vld1.32 {W6, W5}, [RDATA]!; \
- vadd.u32 tmp0, W0, curK; \
- ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \
- ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \
- vadd.u32 tmp1, W7, curK; \
- ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \
- vadd.u32 tmp2, W6, curK; \
- vst1.32 {tmp0, tmp1}, [RWK]!; \
- vadd.u32 tmp3, W5, curK; \
- vst1.32 {tmp2, tmp3}, [RWK]; \
-
-#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vld1.32 {W0, W7}, [RDATA]!; \
-
-#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- add RWK, sp, #(WK_offs(0)); \
-
-#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \
-
-#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vld1.32 {W6, W5}, [RDATA]!; \
-
-#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vadd.u32 tmp0, W0, curK; \
-
-#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \
-
-#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \
-
-#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vadd.u32 tmp1, W7, curK; \
-
-#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \
-
-#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vadd.u32 tmp2, W6, curK; \
-
-#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vst1.32 {tmp0, tmp1}, [RWK]!; \
-
-#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vadd.u32 tmp3, W5, curK; \
-
-#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vst1.32 {tmp2, tmp3}, [RWK]; \
-
-
-/********* Precalc macros for rounds 16-31 ************************************/
-
-#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor tmp0, tmp0; \
- vext.8 W, W_m16, W_m12, #8; \
-
-#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- add RWK, sp, #(WK_offs(i)); \
- vext.8 tmp0, W_m04, tmp0, #4; \
-
-#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor tmp0, tmp0, W_m16; \
- veor.32 W, W, W_m08; \
-
-#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor tmp1, tmp1; \
- veor W, W, tmp0; \
-
-#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vshl.u32 tmp0, W, #1; \
-
-#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vext.8 tmp1, tmp1, W, #(16-12); \
- vshr.u32 W, W, #31; \
-
-#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vorr tmp0, tmp0, W; \
- vshr.u32 W, tmp1, #30; \
-
-#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vshl.u32 tmp1, tmp1, #2; \
-
-#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor tmp0, tmp0, W; \
-
-#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor W, tmp0, tmp1; \
-
-#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vadd.u32 tmp0, W, curK; \
-
-#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vst1.32 {tmp0}, [RWK];
-
-
-/********* Precalc macros for rounds 32-79 ************************************/
-
-#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor W, W_m28; \
-
-#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vext.8 tmp0, W_m08, W_m04, #8; \
-
-#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor W, W_m16; \
-
-#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- veor W, tmp0; \
-
-#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- add RWK, sp, #(WK_offs(i&~3)); \
-
-#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vshl.u32 tmp1, W, #2; \
-
-#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vshr.u32 tmp0, W, #30; \
-
-#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vorr W, tmp0, tmp1; \
-
-#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vadd.u32 tmp0, W, curK; \
-
-#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- vst1.32 {tmp0}, [RWK];
-
-
-/*
- * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
- *
- * unsigned int
- * sha1_transform_neon (void *ctx, const unsigned char *data,
- * unsigned int nblks)
- */
-.align 3
-ENTRY(sha1_transform_neon)
- /* input:
- * r0: ctx, CTX
- * r1: data (64*nblks bytes)
- * r2: nblks
- */
-
- cmp RNBLKS, #0;
- beq .Ldo_nothing;
-
- push {r4-r12, lr};
- /*vpush {q4-q7};*/
-
- adr RT3, .LK_VEC;
-
- mov ROLDSTACK, sp;
-
- /* Align stack. */
- sub RT0, sp, #(16*4);
- and RT0, #(~(16-1));
- mov sp, RT0;
-
- vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */
-
- /* Get the values of the chaining variables. */
- ldm RSTATE, {_a-_e};
-
- vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */
-
-#undef curK
-#define curK qK1
- /* Precalc 0-15. */
- W_PRECALC_00_15();
-
-.Loop:
- /* Transform 0-15 + Precalc 16-31. */
- _R( _a, _b, _c, _d, _e, F1, 0,
- WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
- W4, W5, W6, W7, W0, _, _, _ );
- _R( _e, _a, _b, _c, _d, F1, 1,
- WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
- W4, W5, W6, W7, W0, _, _, _ );
- _R( _d, _e, _a, _b, _c, F1, 2,
- WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
- W4, W5, W6, W7, W0, _, _, _ );
- _R( _c, _d, _e, _a, _b, F1, 3,
- WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
- W4, W5, W6, W7, W0, _, _, _ );
-
-#undef curK
-#define curK qK2
- _R( _b, _c, _d, _e, _a, F1, 4,
- WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
- W3, W4, W5, W6, W7, _, _, _ );
- _R( _a, _b, _c, _d, _e, F1, 5,
- WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
- W3, W4, W5, W6, W7, _, _, _ );
- _R( _e, _a, _b, _c, _d, F1, 6,
- WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
- W3, W4, W5, W6, W7, _, _, _ );
- _R( _d, _e, _a, _b, _c, F1, 7,
- WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
- W3, W4, W5, W6, W7, _, _, _ );
-
- _R( _c, _d, _e, _a, _b, F1, 8,
- WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
- W2, W3, W4, W5, W6, _, _, _ );
- _R( _b, _c, _d, _e, _a, F1, 9,
- WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
- W2, W3, W4, W5, W6, _, _, _ );
- _R( _a, _b, _c, _d, _e, F1, 10,
- WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
- W2, W3, W4, W5, W6, _, _, _ );
- _R( _e, _a, _b, _c, _d, F1, 11,
- WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
- W2, W3, W4, W5, W6, _, _, _ );
-
- _R( _d, _e, _a, _b, _c, F1, 12,
- WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
- W1, W2, W3, W4, W5, _, _, _ );
- _R( _c, _d, _e, _a, _b, F1, 13,
- WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
- W1, W2, W3, W4, W5, _, _, _ );
- _R( _b, _c, _d, _e, _a, F1, 14,
- WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
- W1, W2, W3, W4, W5, _, _, _ );
- _R( _a, _b, _c, _d, _e, F1, 15,
- WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
- W1, W2, W3, W4, W5, _, _, _ );
-
- /* Transform 16-63 + Precalc 32-79. */
- _R( _e, _a, _b, _c, _d, F1, 16,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
- W0, W1, W2, W3, W4, W5, W6, W7);
- _R( _d, _e, _a, _b, _c, F1, 17,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
- W0, W1, W2, W3, W4, W5, W6, W7);
- _R( _c, _d, _e, _a, _b, F1, 18,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 32,
- W0, W1, W2, W3, W4, W5, W6, W7);
- _R( _b, _c, _d, _e, _a, F1, 19,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 32,
- W0, W1, W2, W3, W4, W5, W6, W7);
-
- _R( _a, _b, _c, _d, _e, F2, 20,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
- W7, W0, W1, W2, W3, W4, W5, W6);
- _R( _e, _a, _b, _c, _d, F2, 21,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
- W7, W0, W1, W2, W3, W4, W5, W6);
- _R( _d, _e, _a, _b, _c, F2, 22,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 36,
- W7, W0, W1, W2, W3, W4, W5, W6);
- _R( _c, _d, _e, _a, _b, F2, 23,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 36,
- W7, W0, W1, W2, W3, W4, W5, W6);
-
-#undef curK
-#define curK qK3
- _R( _b, _c, _d, _e, _a, F2, 24,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
- W6, W7, W0, W1, W2, W3, W4, W5);
- _R( _a, _b, _c, _d, _e, F2, 25,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
- W6, W7, W0, W1, W2, W3, W4, W5);
- _R( _e, _a, _b, _c, _d, F2, 26,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 40,
- W6, W7, W0, W1, W2, W3, W4, W5);
- _R( _d, _e, _a, _b, _c, F2, 27,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 40,
- W6, W7, W0, W1, W2, W3, W4, W5);
-
- _R( _c, _d, _e, _a, _b, F2, 28,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
- W5, W6, W7, W0, W1, W2, W3, W4);
- _R( _b, _c, _d, _e, _a, F2, 29,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
- W5, W6, W7, W0, W1, W2, W3, W4);
- _R( _a, _b, _c, _d, _e, F2, 30,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 44,
- W5, W6, W7, W0, W1, W2, W3, W4);
- _R( _e, _a, _b, _c, _d, F2, 31,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 44,
- W5, W6, W7, W0, W1, W2, W3, W4);
-
- _R( _d, _e, _a, _b, _c, F2, 32,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
- W4, W5, W6, W7, W0, W1, W2, W3);
- _R( _c, _d, _e, _a, _b, F2, 33,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
- W4, W5, W6, W7, W0, W1, W2, W3);
- _R( _b, _c, _d, _e, _a, F2, 34,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 48,
- W4, W5, W6, W7, W0, W1, W2, W3);
- _R( _a, _b, _c, _d, _e, F2, 35,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 48,
- W4, W5, W6, W7, W0, W1, W2, W3);
-
- _R( _e, _a, _b, _c, _d, F2, 36,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
- W3, W4, W5, W6, W7, W0, W1, W2);
- _R( _d, _e, _a, _b, _c, F2, 37,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
- W3, W4, W5, W6, W7, W0, W1, W2);
- _R( _c, _d, _e, _a, _b, F2, 38,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 52,
- W3, W4, W5, W6, W7, W0, W1, W2);
- _R( _b, _c, _d, _e, _a, F2, 39,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 52,
- W3, W4, W5, W6, W7, W0, W1, W2);
-
- _R( _a, _b, _c, _d, _e, F3, 40,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
- W2, W3, W4, W5, W6, W7, W0, W1);
- _R( _e, _a, _b, _c, _d, F3, 41,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
- W2, W3, W4, W5, W6, W7, W0, W1);
- _R( _d, _e, _a, _b, _c, F3, 42,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 56,
- W2, W3, W4, W5, W6, W7, W0, W1);
- _R( _c, _d, _e, _a, _b, F3, 43,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 56,
- W2, W3, W4, W5, W6, W7, W0, W1);
-
-#undef curK
-#define curK qK4
- _R( _b, _c, _d, _e, _a, F3, 44,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
- W1, W2, W3, W4, W5, W6, W7, W0);
- _R( _a, _b, _c, _d, _e, F3, 45,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
- W1, W2, W3, W4, W5, W6, W7, W0);
- _R( _e, _a, _b, _c, _d, F3, 46,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 60,
- W1, W2, W3, W4, W5, W6, W7, W0);
- _R( _d, _e, _a, _b, _c, F3, 47,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 60,
- W1, W2, W3, W4, W5, W6, W7, W0);
-
- _R( _c, _d, _e, _a, _b, F3, 48,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
- W0, W1, W2, W3, W4, W5, W6, W7);
- _R( _b, _c, _d, _e, _a, F3, 49,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
- W0, W1, W2, W3, W4, W5, W6, W7);
- _R( _a, _b, _c, _d, _e, F3, 50,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 64,
- W0, W1, W2, W3, W4, W5, W6, W7);
- _R( _e, _a, _b, _c, _d, F3, 51,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 64,
- W0, W1, W2, W3, W4, W5, W6, W7);
-
- _R( _d, _e, _a, _b, _c, F3, 52,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
- W7, W0, W1, W2, W3, W4, W5, W6);
- _R( _c, _d, _e, _a, _b, F3, 53,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
- W7, W0, W1, W2, W3, W4, W5, W6);
- _R( _b, _c, _d, _e, _a, F3, 54,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 68,
- W7, W0, W1, W2, W3, W4, W5, W6);
- _R( _a, _b, _c, _d, _e, F3, 55,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 68,
- W7, W0, W1, W2, W3, W4, W5, W6);
-
- _R( _e, _a, _b, _c, _d, F3, 56,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
- W6, W7, W0, W1, W2, W3, W4, W5);
- _R( _d, _e, _a, _b, _c, F3, 57,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
- W6, W7, W0, W1, W2, W3, W4, W5);
- _R( _c, _d, _e, _a, _b, F3, 58,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 72,
- W6, W7, W0, W1, W2, W3, W4, W5);
- _R( _b, _c, _d, _e, _a, F3, 59,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 72,
- W6, W7, W0, W1, W2, W3, W4, W5);
-
- subs RNBLKS, #1;
-
- _R( _a, _b, _c, _d, _e, F4, 60,
- WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
- W5, W6, W7, W0, W1, W2, W3, W4);
- _R( _e, _a, _b, _c, _d, F4, 61,
- WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
- W5, W6, W7, W0, W1, W2, W3, W4);
- _R( _d, _e, _a, _b, _c, F4, 62,
- WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 76,
- W5, W6, W7, W0, W1, W2, W3, W4);
- _R( _c, _d, _e, _a, _b, F4, 63,
- WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 76,
- W5, W6, W7, W0, W1, W2, W3, W4);
-
- beq .Lend;
-
- /* Transform 64-79 + Precalc 0-15 of next block. */
-#undef curK
-#define curK qK1
- _R( _b, _c, _d, _e, _a, F4, 64,
- WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _a, _b, _c, _d, _e, F4, 65,
- WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _e, _a, _b, _c, _d, F4, 66,
- WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _d, _e, _a, _b, _c, F4, 67,
- WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );
-
- _R( _c, _d, _e, _a, _b, F4, 68,
- dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _b, _c, _d, _e, _a, F4, 69,
- dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _a, _b, _c, _d, _e, F4, 70,
- WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _e, _a, _b, _c, _d, F4, 71,
- WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );
-
- _R( _d, _e, _a, _b, _c, F4, 72,
- dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _c, _d, _e, _a, _b, F4, 73,
- dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _b, _c, _d, _e, _a, F4, 74,
- WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _a, _b, _c, _d, _e, F4, 75,
- WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );
-
- _R( _e, _a, _b, _c, _d, F4, 76,
- WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _d, _e, _a, _b, _c, F4, 77,
- WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _c, _d, _e, _a, _b, F4, 78,
- WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
- _R( _b, _c, _d, _e, _a, F4, 79,
- WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );
-
- /* Update the chaining variables. */
- ldm RSTATE, {RT0-RT3};
- add _a, RT0;
- ldr RT0, [RSTATE, #state_h4];
- add _b, RT1;
- add _c, RT2;
- add _d, RT3;
- add _e, RT0;
- stm RSTATE, {_a-_e};
-
- b .Loop;
-
-.Lend:
- /* Transform 64-79 */
- R( _b, _c, _d, _e, _a, F4, 64 );
- R( _a, _b, _c, _d, _e, F4, 65 );
- R( _e, _a, _b, _c, _d, F4, 66 );
- R( _d, _e, _a, _b, _c, F4, 67 );
- R( _c, _d, _e, _a, _b, F4, 68 );
- R( _b, _c, _d, _e, _a, F4, 69 );
- R( _a, _b, _c, _d, _e, F4, 70 );
- R( _e, _a, _b, _c, _d, F4, 71 );
- R( _d, _e, _a, _b, _c, F4, 72 );
- R( _c, _d, _e, _a, _b, F4, 73 );
- R( _b, _c, _d, _e, _a, F4, 74 );
- R( _a, _b, _c, _d, _e, F4, 75 );
- R( _e, _a, _b, _c, _d, F4, 76 );
- R( _d, _e, _a, _b, _c, F4, 77 );
- R( _c, _d, _e, _a, _b, F4, 78 );
- R( _b, _c, _d, _e, _a, F4, 79 );
-
- mov sp, ROLDSTACK;
-
- /* Update the chaining variables. */
- ldm RSTATE, {RT0-RT3};
- add _a, RT0;
- ldr RT0, [RSTATE, #state_h4];
- add _b, RT1;
- add _c, RT2;
- add _d, RT3;
- /*vpop {q4-q7};*/
- add _e, RT0;
- stm RSTATE, {_a-_e};
-
- pop {r4-r12, pc};
-
-.Ldo_nothing:
- bx lr
-ENDPROC(sha1_transform_neon)
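
The W_PRECALC_16_31 and W_PRECALC_32_79 macros in the file removed above vectorize the standard FIPS 180 SHA-1 message schedule four words at a time and pre-add the current round constant, so each scalar round macro only loads a finished W[i]+K value from the stack slot WK_offs(i). A scalar C sketch of what that precalculation produces is shown below; the function name sha1_schedule_with_k is made up for illustration and is not part of the kernel code.

#include <stdint.h>

static uint32_t rol32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

/*
 * Scalar form of the schedule the NEON precalc macros evaluate four words at
 * a time: each expanded word is a rotate-left-by-1 of the XOR of four earlier
 * words, and the per-round constant is folded in up front (the vadd.u32 with
 * curK) so the round code just adds the stored value.
 */
static void sha1_schedule_with_k(const uint32_t w0[16], const uint32_t K[4],
                                 uint32_t wk[80])
{
    uint32_t w[80];
    int t;

    for (t = 0; t < 16; t++)
        w[t] = w0[t];
    for (t = 16; t < 80; t++)
        w[t] = rol32(w[t - 3] ^ w[t - 8] ^ w[t - 14] ^ w[t - 16], 1);
    for (t = 0; t < 80; t++)
        wk[t] = w[t] + K[t / 20];   /* value the rounds fetch via WK_offs(t) */
}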
diff --git a/arch/arm/crypto/sha1-ce-core.S b/arch/arm/crypto/sha1-ce-core.S
deleted file mode 100644
index 8a702e051738..000000000000
--- a/arch/arm/crypto/sha1-ce-core.S
+++ /dev/null
@@ -1,123 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2015 Linaro Ltd.
- * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .text
- .arch armv8-a
- .fpu crypto-neon-fp-armv8
-
- k0 .req q0
- k1 .req q1
- k2 .req q2
- k3 .req q3
-
- ta0 .req q4
- ta1 .req q5
- tb0 .req q5
- tb1 .req q4
-
- dga .req q6
- dgb .req q7
- dgbs .req s28
-
- dg0 .req q12
- dg1a0 .req q13
- dg1a1 .req q14
- dg1b0 .req q14
- dg1b1 .req q13
-
- .macro add_only, op, ev, rc, s0, dg1
- .ifnb \s0
- vadd.u32 tb\ev, q\s0, \rc
- .endif
- sha1h.32 dg1b\ev, dg0
- .ifb \dg1
- sha1\op\().32 dg0, dg1a\ev, ta\ev
- .else
- sha1\op\().32 dg0, \dg1, ta\ev
- .endif
- .endm
-
- .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1
- sha1su0.32 q\s0, q\s1, q\s2
- add_only \op, \ev, \rc, \s1, \dg1
- sha1su1.32 q\s0, q\s3
- .endm
-
- .align 6
-.Lsha1_rcon:
- .word 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999
- .word 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1
- .word 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc
- .word 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6
-
- /*
- * void sha1_ce_transform(struct sha1_state *sst, u8 const *src,
- * int blocks);
- */
-ENTRY(sha1_ce_transform)
- /* load round constants */
- adr ip, .Lsha1_rcon
- vld1.32 {k0-k1}, [ip, :128]!
- vld1.32 {k2-k3}, [ip, :128]
-
- /* load state */
- vld1.32 {dga}, [r0]
- vldr dgbs, [r0, #16]
-
- /* load input */
-0: vld1.32 {q8-q9}, [r1]!
- vld1.32 {q10-q11}, [r1]!
- subs r2, r2, #1
-
-#ifndef CONFIG_CPU_BIG_ENDIAN
- vrev32.8 q8, q8
- vrev32.8 q9, q9
- vrev32.8 q10, q10
- vrev32.8 q11, q11
-#endif
-
- vadd.u32 ta0, q8, k0
- vmov dg0, dga
-
- add_update c, 0, k0, 8, 9, 10, 11, dgb
- add_update c, 1, k0, 9, 10, 11, 8
- add_update c, 0, k0, 10, 11, 8, 9
- add_update c, 1, k0, 11, 8, 9, 10
- add_update c, 0, k1, 8, 9, 10, 11
-
- add_update p, 1, k1, 9, 10, 11, 8
- add_update p, 0, k1, 10, 11, 8, 9
- add_update p, 1, k1, 11, 8, 9, 10
- add_update p, 0, k1, 8, 9, 10, 11
- add_update p, 1, k2, 9, 10, 11, 8
-
- add_update m, 0, k2, 10, 11, 8, 9
- add_update m, 1, k2, 11, 8, 9, 10
- add_update m, 0, k2, 8, 9, 10, 11
- add_update m, 1, k2, 9, 10, 11, 8
- add_update m, 0, k3, 10, 11, 8, 9
-
- add_update p, 1, k3, 11, 8, 9, 10
- add_only p, 0, k3, 9
- add_only p, 1, k3, 10
- add_only p, 0, k3, 11
- add_only p, 1
-
- /* update state */
- vadd.u32 dga, dga, dg0
- vadd.u32 dgb, dgb, dg1a0
- bne 0b
-
- /* store new state */
- vst1.32 {dga}, [r0]
- vstr dgbs, [r0, #16]
- bx lr
-ENDPROC(sha1_ce_transform)
diff --git a/arch/arm/crypto/sha1-ce-glue.c b/arch/arm/crypto/sha1-ce-glue.c
deleted file mode 100644
index fac07a4799de..000000000000
--- a/arch/arm/crypto/sha1-ce-glue.c
+++ /dev/null
@@ -1,72 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/cpufeature.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-
-asmlinkage void sha1_ce_transform(struct sha1_state *sst, u8 const *src,
- int blocks);
-
-static int sha1_ce_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- int remain;
-
- kernel_neon_begin();
- remain = sha1_base_do_update_blocks(desc, data, len, sha1_ce_transform);
- kernel_neon_end();
-
- return remain;
-}
-
-static int sha1_ce_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- kernel_neon_begin();
- sha1_base_do_finup(desc, data, len, sha1_ce_transform);
- kernel_neon_end();
-
- return sha1_base_finish(desc, out);
-}
-
-static struct shash_alg alg = {
- .init = sha1_base_init,
- .update = sha1_ce_update,
- .finup = sha1_ce_finup,
- .descsize = SHA1_STATE_SIZE,
- .digestsize = SHA1_DIGEST_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name = "sha1-ce",
- .cra_priority = 200,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int __init sha1_ce_mod_init(void)
-{
- return crypto_register_shash(&alg);
-}
-
-static void __exit sha1_ce_mod_fini(void)
-{
- crypto_unregister_shash(&alg);
-}
-
-module_cpu_feature_match(SHA1, sha1_ce_mod_init);
-module_exit(sha1_ce_mod_fini);
diff --git a/arch/arm/crypto/sha1_glue.c b/arch/arm/crypto/sha1_glue.c
deleted file mode 100644
index 255da00c7d98..000000000000
--- a/arch/arm/crypto/sha1_glue.c
+++ /dev/null
@@ -1,75 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Cryptographic API.
- * Glue code for the SHA1 Secure Hash Algorithm assembler implementation
- *
- * This file is based on sha1_generic.c and sha1_ssse3_glue.c
- *
- * Copyright (c) Alan Smithee.
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
- * Copyright (c) Mathias Krause <minipli@googlemail.com>
- */
-
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void sha1_block_data_order(struct sha1_state *digest,
- const u8 *data, int rounds);
-
-static int sha1_update_arm(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- /* make sure signature matches sha1_block_fn() */
- BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);
-
- return sha1_base_do_update_blocks(desc, data, len,
- sha1_block_data_order);
-}
-
-static int sha1_finup_arm(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- sha1_base_do_finup(desc, data, len, sha1_block_data_order);
- return sha1_base_finish(desc, out);
-}
-
-static struct shash_alg alg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_base_init,
- .update = sha1_update_arm,
- .finup = sha1_finup_arm,
- .descsize = SHA1_STATE_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name= "sha1-asm",
- .cra_priority = 150,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-
-static int __init sha1_mod_init(void)
-{
- return crypto_register_shash(&alg);
-}
-
-
-static void __exit sha1_mod_fini(void)
-{
- crypto_unregister_shash(&alg);
-}
-
-
-module_init(sha1_mod_init);
-module_exit(sha1_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm (ARM)");
-MODULE_ALIAS_CRYPTO("sha1");
-MODULE_AUTHOR("David McCullough <ucdevel@gmail.com>");
diff --git a/arch/arm/crypto/sha1_neon_glue.c b/arch/arm/crypto/sha1_neon_glue.c
deleted file mode 100644
index d321850f22a6..000000000000
--- a/arch/arm/crypto/sha1_neon_glue.c
+++ /dev/null
@@ -1,83 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Glue code for the SHA1 Secure Hash Algorithm assembler implementation using
- * ARM NEON instructions.
- *
- * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This file is based on sha1_generic.c and sha1_ssse3_glue.c:
- * Copyright (c) Alan Smithee.
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
- * Copyright (c) Mathias Krause <minipli@googlemail.com>
- * Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com>
- */
-
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void sha1_transform_neon(struct sha1_state *state_h,
- const u8 *data, int rounds);
-
-static int sha1_neon_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- int remain;
-
- kernel_neon_begin();
- remain = sha1_base_do_update_blocks(desc, data, len,
- sha1_transform_neon);
- kernel_neon_end();
-
- return remain;
-}
-
-static int sha1_neon_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- kernel_neon_begin();
- sha1_base_do_finup(desc, data, len, sha1_transform_neon);
- kernel_neon_end();
-
- return sha1_base_finish(desc, out);
-}
-
-static struct shash_alg alg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_base_init,
- .update = sha1_neon_update,
- .finup = sha1_neon_finup,
- .descsize = SHA1_STATE_SIZE,
- .base = {
- .cra_name = "sha1",
- .cra_driver_name = "sha1-neon",
- .cra_priority = 250,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int __init sha1_neon_mod_init(void)
-{
- if (!cpu_has_neon())
- return -ENODEV;
-
- return crypto_register_shash(&alg);
-}
-
-static void __exit sha1_neon_mod_fini(void)
-{
- crypto_unregister_shash(&alg);
-}
-
-module_init(sha1_neon_mod_init);
-module_exit(sha1_neon_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, NEON accelerated");
-MODULE_ALIAS_CRYPTO("sha1");
diff --git a/arch/arm/crypto/sha512-armv4.pl b/arch/arm/crypto/sha512-armv4.pl
deleted file mode 100644
index 2fc3516912fa..000000000000
--- a/arch/arm/crypto/sha512-armv4.pl
+++ /dev/null
@@ -1,657 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0
-
-# This code is taken from the OpenSSL project but the author (Andy Polyakov)
-# has relicensed it under the GPLv2. Therefore this program is free software;
-# you can redistribute it and/or modify it under the terms of the GNU General
-# Public License version 2 as published by the Free Software Foundation.
-#
-# The original headers, including the original license headers, are
-# included below for completeness.
-
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see https://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# SHA512 block procedure for ARMv4. September 2007.
-
-# This code is ~4.5 (four and a half) times faster than code generated
-# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
-# Xscale PXA250 core].
-#
-# July 2010.
-#
-# Rescheduling for dual-issue pipeline resulted in 6% improvement on
-# Cortex A8 core and ~40 cycles per processed byte.
-
-# February 2011.
-#
-# Profiler-assisted and platform-specific optimization resulted in 7%
-# improvement on Coxtex A8 core and ~38 cycles per byte.
-
-# March 2011.
-#
-# Add NEON implementation. On Cortex A8 it was measured to process
-# one byte in 23.3 cycles or ~60% faster than integer-only code.
-
-# August 2012.
-#
-# Improve NEON performance by 12% on Snapdragon S4. In absolute
-# terms it's 22.6 cycles per byte, which is a disappointing result.
-# Technical writers asserted that the 3-way S4 pipeline can sustain
-# multiple NEON instructions per cycle, but dual NEON issue could
-# not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html
-# for further details. On a side note, Cortex-A15 processes one byte in
-# 16 cycles.
-
-# Byte order [in]dependence. =========================================
-#
-# Originally the caller was expected to maintain a specific *dword* order in
-# h[0-7], namely with the most significant dword at the *lower* address, which
-# was reflected in the two parameters below as 0 and 4. Now the caller is
-# expected to maintain native byte order for whole 64-bit values.
-$hi="HI";
-$lo="LO";
-# ====================================================================
-
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-
-$ctx="r0"; # parameter block
-$inp="r1";
-$len="r2";
-
-$Tlo="r3";
-$Thi="r4";
-$Alo="r5";
-$Ahi="r6";
-$Elo="r7";
-$Ehi="r8";
-$t0="r9";
-$t1="r10";
-$t2="r11";
-$t3="r12";
-############ r13 is stack pointer
-$Ktbl="r14";
-############ r15 is program counter
-
-$Aoff=8*0;
-$Boff=8*1;
-$Coff=8*2;
-$Doff=8*3;
-$Eoff=8*4;
-$Foff=8*5;
-$Goff=8*6;
-$Hoff=8*7;
-$Xoff=8*8;
-
-sub BODY_00_15() {
-my $magic = shift;
-$code.=<<___;
- @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
- @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
- @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
- mov $t0,$Elo,lsr#14
- str $Tlo,[sp,#$Xoff+0]
- mov $t1,$Ehi,lsr#14
- str $Thi,[sp,#$Xoff+4]
- eor $t0,$t0,$Ehi,lsl#18
- ldr $t2,[sp,#$Hoff+0] @ h.lo
- eor $t1,$t1,$Elo,lsl#18
- ldr $t3,[sp,#$Hoff+4] @ h.hi
- eor $t0,$t0,$Elo,lsr#18
- eor $t1,$t1,$Ehi,lsr#18
- eor $t0,$t0,$Ehi,lsl#14
- eor $t1,$t1,$Elo,lsl#14
- eor $t0,$t0,$Ehi,lsr#9
- eor $t1,$t1,$Elo,lsr#9
- eor $t0,$t0,$Elo,lsl#23
- eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
- adds $Tlo,$Tlo,$t0
- ldr $t0,[sp,#$Foff+0] @ f.lo
- adc $Thi,$Thi,$t1 @ T += Sigma1(e)
- ldr $t1,[sp,#$Foff+4] @ f.hi
- adds $Tlo,$Tlo,$t2
- ldr $t2,[sp,#$Goff+0] @ g.lo
- adc $Thi,$Thi,$t3 @ T += h
- ldr $t3,[sp,#$Goff+4] @ g.hi
-
- eor $t0,$t0,$t2
- str $Elo,[sp,#$Eoff+0]
- eor $t1,$t1,$t3
- str $Ehi,[sp,#$Eoff+4]
- and $t0,$t0,$Elo
- str $Alo,[sp,#$Aoff+0]
- and $t1,$t1,$Ehi
- str $Ahi,[sp,#$Aoff+4]
- eor $t0,$t0,$t2
- ldr $t2,[$Ktbl,#$lo] @ K[i].lo
- eor $t1,$t1,$t3 @ Ch(e,f,g)
- ldr $t3,[$Ktbl,#$hi] @ K[i].hi
-
- adds $Tlo,$Tlo,$t0
- ldr $Elo,[sp,#$Doff+0] @ d.lo
- adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
- ldr $Ehi,[sp,#$Doff+4] @ d.hi
- adds $Tlo,$Tlo,$t2
- and $t0,$t2,#0xff
- adc $Thi,$Thi,$t3 @ T += K[i]
- adds $Elo,$Elo,$Tlo
- ldr $t2,[sp,#$Boff+0] @ b.lo
- adc $Ehi,$Ehi,$Thi @ d += T
- teq $t0,#$magic
-
- ldr $t3,[sp,#$Coff+0] @ c.lo
-#if __ARM_ARCH__>=7
- it eq @ Thumb2 thing, sanity check in ARM
-#endif
- orreq $Ktbl,$Ktbl,#1
- @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
- @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
- @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
- mov $t0,$Alo,lsr#28
- mov $t1,$Ahi,lsr#28
- eor $t0,$t0,$Ahi,lsl#4
- eor $t1,$t1,$Alo,lsl#4
- eor $t0,$t0,$Ahi,lsr#2
- eor $t1,$t1,$Alo,lsr#2
- eor $t0,$t0,$Alo,lsl#30
- eor $t1,$t1,$Ahi,lsl#30
- eor $t0,$t0,$Ahi,lsr#7
- eor $t1,$t1,$Alo,lsr#7
- eor $t0,$t0,$Alo,lsl#25
- eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
- adds $Tlo,$Tlo,$t0
- and $t0,$Alo,$t2
- adc $Thi,$Thi,$t1 @ T += Sigma0(a)
-
- ldr $t1,[sp,#$Boff+4] @ b.hi
- orr $Alo,$Alo,$t2
- ldr $t2,[sp,#$Coff+4] @ c.hi
- and $Alo,$Alo,$t3
- and $t3,$Ahi,$t1
- orr $Ahi,$Ahi,$t1
- orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
- and $Ahi,$Ahi,$t2
- adds $Alo,$Alo,$Tlo
- orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
- sub sp,sp,#8
- adc $Ahi,$Ahi,$Thi @ h += T
- tst $Ktbl,#1
- add $Ktbl,$Ktbl,#8
-___
-}
-$code=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
-# define VFP_ABI_POP vldmia sp!,{d8-d15}
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-# define VFP_ABI_PUSH
-# define VFP_ABI_POP
-#endif
-
-#ifdef __ARMEL__
-# define LO 0
-# define HI 4
-# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
-#else
-# define HI 0
-# define LO 4
-# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
-#endif
-
-.text
-#if __ARM_ARCH__<7
-.code 32
-#else
-.syntax unified
-# ifdef __thumb2__
-.thumb
-# else
-.code 32
-# endif
-#endif
-
-.type K512,%object
-.align 5
-K512:
-WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
-WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
-WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
-WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
-WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
-WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
-WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
-WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
-WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
-WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
-WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
-WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
-WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
-WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
-WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
-WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
-WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
-WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
-WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
-WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
-WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
-WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
-WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
-WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
-WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
-WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
-WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
-WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
-WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
-WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
-WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
-WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
-WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
-WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
-WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
-WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
-WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
-WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
-WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
-WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
-.size K512,.-K512
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-sha512_block_data_order
-.skip 32-4
-#else
-.skip 32
-#endif
-
-.global sha512_block_data_order
-.type sha512_block_data_order,%function
-sha512_block_data_order:
-.Lsha512_block_data_order:
-#if __ARM_ARCH__<7
- sub r3,pc,#8 @ sha512_block_data_order
-#else
- adr r3,.Lsha512_block_data_order
-#endif
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- ldr r12,.LOPENSSL_armcap
- ldr r12,[r3,r12] @ OPENSSL_armcap_P
- tst r12,#1
- bne .LNEON
-#endif
- add $len,$inp,$len,lsl#7 @ len to point at the end of inp
- stmdb sp!,{r4-r12,lr}
- sub $Ktbl,r3,#672 @ K512
- sub sp,sp,#9*8
-
- ldr $Elo,[$ctx,#$Eoff+$lo]
- ldr $Ehi,[$ctx,#$Eoff+$hi]
- ldr $t0, [$ctx,#$Goff+$lo]
- ldr $t1, [$ctx,#$Goff+$hi]
- ldr $t2, [$ctx,#$Hoff+$lo]
- ldr $t3, [$ctx,#$Hoff+$hi]
-.Loop:
- str $t0, [sp,#$Goff+0]
- str $t1, [sp,#$Goff+4]
- str $t2, [sp,#$Hoff+0]
- str $t3, [sp,#$Hoff+4]
- ldr $Alo,[$ctx,#$Aoff+$lo]
- ldr $Ahi,[$ctx,#$Aoff+$hi]
- ldr $Tlo,[$ctx,#$Boff+$lo]
- ldr $Thi,[$ctx,#$Boff+$hi]
- ldr $t0, [$ctx,#$Coff+$lo]
- ldr $t1, [$ctx,#$Coff+$hi]
- ldr $t2, [$ctx,#$Doff+$lo]
- ldr $t3, [$ctx,#$Doff+$hi]
- str $Tlo,[sp,#$Boff+0]
- str $Thi,[sp,#$Boff+4]
- str $t0, [sp,#$Coff+0]
- str $t1, [sp,#$Coff+4]
- str $t2, [sp,#$Doff+0]
- str $t3, [sp,#$Doff+4]
- ldr $Tlo,[$ctx,#$Foff+$lo]
- ldr $Thi,[$ctx,#$Foff+$hi]
- str $Tlo,[sp,#$Foff+0]
- str $Thi,[sp,#$Foff+4]
-
-.L00_15:
-#if __ARM_ARCH__<7
- ldrb $Tlo,[$inp,#7]
- ldrb $t0, [$inp,#6]
- ldrb $t1, [$inp,#5]
- ldrb $t2, [$inp,#4]
- ldrb $Thi,[$inp,#3]
- ldrb $t3, [$inp,#2]
- orr $Tlo,$Tlo,$t0,lsl#8
- ldrb $t0, [$inp,#1]
- orr $Tlo,$Tlo,$t1,lsl#16
- ldrb $t1, [$inp],#8
- orr $Tlo,$Tlo,$t2,lsl#24
- orr $Thi,$Thi,$t3,lsl#8
- orr $Thi,$Thi,$t0,lsl#16
- orr $Thi,$Thi,$t1,lsl#24
-#else
- ldr $Tlo,[$inp,#4]
- ldr $Thi,[$inp],#8
-#ifdef __ARMEL__
- rev $Tlo,$Tlo
- rev $Thi,$Thi
-#endif
-#endif
-___
- &BODY_00_15(0x94);
-$code.=<<___;
- tst $Ktbl,#1
- beq .L00_15
- ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
- ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
- bic $Ktbl,$Ktbl,#1
-.L16_79:
- @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
- @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
- @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
- mov $Tlo,$t0,lsr#1
- ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
- mov $Thi,$t1,lsr#1
- ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
- eor $Tlo,$Tlo,$t1,lsl#31
- eor $Thi,$Thi,$t0,lsl#31
- eor $Tlo,$Tlo,$t0,lsr#8
- eor $Thi,$Thi,$t1,lsr#8
- eor $Tlo,$Tlo,$t1,lsl#24
- eor $Thi,$Thi,$t0,lsl#24
- eor $Tlo,$Tlo,$t0,lsr#7
- eor $Thi,$Thi,$t1,lsr#7
- eor $Tlo,$Tlo,$t1,lsl#25
-
- @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
- @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
- @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
- mov $t0,$t2,lsr#19
- mov $t1,$t3,lsr#19
- eor $t0,$t0,$t3,lsl#13
- eor $t1,$t1,$t2,lsl#13
- eor $t0,$t0,$t3,lsr#29
- eor $t1,$t1,$t2,lsr#29
- eor $t0,$t0,$t2,lsl#3
- eor $t1,$t1,$t3,lsl#3
- eor $t0,$t0,$t2,lsr#6
- eor $t1,$t1,$t3,lsr#6
- ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
- eor $t0,$t0,$t3,lsl#26
-
- ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
- adds $Tlo,$Tlo,$t0
- ldr $t0,[sp,#`$Xoff+8*16`+0]
- adc $Thi,$Thi,$t1
-
- ldr $t1,[sp,#`$Xoff+8*16`+4]
- adds $Tlo,$Tlo,$t2
- adc $Thi,$Thi,$t3
- adds $Tlo,$Tlo,$t0
- adc $Thi,$Thi,$t1
-___
- &BODY_00_15(0x17);
-$code.=<<___;
-#if __ARM_ARCH__>=7
- ittt eq @ Thumb2 thing, sanity check in ARM
-#endif
- ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
- ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
- beq .L16_79
- bic $Ktbl,$Ktbl,#1
-
- ldr $Tlo,[sp,#$Boff+0]
- ldr $Thi,[sp,#$Boff+4]
- ldr $t0, [$ctx,#$Aoff+$lo]
- ldr $t1, [$ctx,#$Aoff+$hi]
- ldr $t2, [$ctx,#$Boff+$lo]
- ldr $t3, [$ctx,#$Boff+$hi]
- adds $t0,$Alo,$t0
- str $t0, [$ctx,#$Aoff+$lo]
- adc $t1,$Ahi,$t1
- str $t1, [$ctx,#$Aoff+$hi]
- adds $t2,$Tlo,$t2
- str $t2, [$ctx,#$Boff+$lo]
- adc $t3,$Thi,$t3
- str $t3, [$ctx,#$Boff+$hi]
-
- ldr $Alo,[sp,#$Coff+0]
- ldr $Ahi,[sp,#$Coff+4]
- ldr $Tlo,[sp,#$Doff+0]
- ldr $Thi,[sp,#$Doff+4]
- ldr $t0, [$ctx,#$Coff+$lo]
- ldr $t1, [$ctx,#$Coff+$hi]
- ldr $t2, [$ctx,#$Doff+$lo]
- ldr $t3, [$ctx,#$Doff+$hi]
- adds $t0,$Alo,$t0
- str $t0, [$ctx,#$Coff+$lo]
- adc $t1,$Ahi,$t1
- str $t1, [$ctx,#$Coff+$hi]
- adds $t2,$Tlo,$t2
- str $t2, [$ctx,#$Doff+$lo]
- adc $t3,$Thi,$t3
- str $t3, [$ctx,#$Doff+$hi]
-
- ldr $Tlo,[sp,#$Foff+0]
- ldr $Thi,[sp,#$Foff+4]
- ldr $t0, [$ctx,#$Eoff+$lo]
- ldr $t1, [$ctx,#$Eoff+$hi]
- ldr $t2, [$ctx,#$Foff+$lo]
- ldr $t3, [$ctx,#$Foff+$hi]
- adds $Elo,$Elo,$t0
- str $Elo,[$ctx,#$Eoff+$lo]
- adc $Ehi,$Ehi,$t1
- str $Ehi,[$ctx,#$Eoff+$hi]
- adds $t2,$Tlo,$t2
- str $t2, [$ctx,#$Foff+$lo]
- adc $t3,$Thi,$t3
- str $t3, [$ctx,#$Foff+$hi]
-
- ldr $Alo,[sp,#$Goff+0]
- ldr $Ahi,[sp,#$Goff+4]
- ldr $Tlo,[sp,#$Hoff+0]
- ldr $Thi,[sp,#$Hoff+4]
- ldr $t0, [$ctx,#$Goff+$lo]
- ldr $t1, [$ctx,#$Goff+$hi]
- ldr $t2, [$ctx,#$Hoff+$lo]
- ldr $t3, [$ctx,#$Hoff+$hi]
- adds $t0,$Alo,$t0
- str $t0, [$ctx,#$Goff+$lo]
- adc $t1,$Ahi,$t1
- str $t1, [$ctx,#$Goff+$hi]
- adds $t2,$Tlo,$t2
- str $t2, [$ctx,#$Hoff+$lo]
- adc $t3,$Thi,$t3
- str $t3, [$ctx,#$Hoff+$hi]
-
- add sp,sp,#640
- sub $Ktbl,$Ktbl,#640
-
- teq $inp,$len
- bne .Loop
-
- add sp,sp,#8*9 @ destroy frame
-#if __ARM_ARCH__>=5
- ldmia sp!,{r4-r12,pc}
-#else
- ldmia sp!,{r4-r12,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size sha512_block_data_order,.-sha512_block_data_order
-___
-
-{
-my @Sigma0=(28,34,39);
-my @Sigma1=(14,18,41);
-my @sigma0=(1, 8, 7);
-my @sigma1=(19,61,6);
-
-my $Ktbl="r3";
-my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
-
-my @X=map("d$_",(0..15));
-my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
-
-sub NEON_00_15() {
-my $i=shift;
-my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
-my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
-
-$code.=<<___ if ($i<16 || $i&1);
- vshr.u64 $t0,$e,#@Sigma1[0] @ $i
-#if $i<16
- vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
-#endif
- vshr.u64 $t1,$e,#@Sigma1[1]
-#if $i>0
- vadd.i64 $a,$Maj @ h+=Maj from the past
-#endif
- vshr.u64 $t2,$e,#@Sigma1[2]
-___
-$code.=<<___;
- vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
- vsli.64 $t0,$e,#`64-@Sigma1[0]`
- vsli.64 $t1,$e,#`64-@Sigma1[1]`
- vmov $Ch,$e
- vsli.64 $t2,$e,#`64-@Sigma1[2]`
-#if $i<16 && defined(__ARMEL__)
- vrev64.8 @X[$i],@X[$i]
-#endif
- veor $t1,$t0
- vbsl $Ch,$f,$g @ Ch(e,f,g)
- vshr.u64 $t0,$a,#@Sigma0[0]
- veor $t2,$t1 @ Sigma1(e)
- vadd.i64 $T1,$Ch,$h
- vshr.u64 $t1,$a,#@Sigma0[1]
- vsli.64 $t0,$a,#`64-@Sigma0[0]`
- vadd.i64 $T1,$t2
- vshr.u64 $t2,$a,#@Sigma0[2]
- vadd.i64 $K,@X[$i%16]
- vsli.64 $t1,$a,#`64-@Sigma0[1]`
- veor $Maj,$a,$b
- vsli.64 $t2,$a,#`64-@Sigma0[2]`
- veor $h,$t0,$t1
- vadd.i64 $T1,$K
- vbsl $Maj,$c,$b @ Maj(a,b,c)
- veor $h,$t2 @ Sigma0(a)
- vadd.i64 $d,$T1
- vadd.i64 $Maj,$T1
- @ vadd.i64 $h,$Maj
-___
-}
-
-sub NEON_16_79() {
-my $i=shift;
-
-if ($i&1) { &NEON_00_15($i,@_); return; }
-
-# 2x-vectorized, therefore runs every 2nd round
-my @X=map("q$_",(0..7)); # view @X as 128-bit vector
-my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
-my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
-my $e=@_[4]; # $e from NEON_00_15
-$i /= 2;
-$code.=<<___;
- vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
- vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
- vadd.i64 @_[0],d30 @ h+=Maj from the past
- vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
- vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
- vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
- vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
- veor $s1,$t0
- vshr.u64 $t0,$s0,#@sigma0[0]
- veor $s1,$t1 @ sigma1(X[i+14])
- vshr.u64 $t1,$s0,#@sigma0[1]
- vadd.i64 @X[$i%8],$s1
- vshr.u64 $s1,$s0,#@sigma0[2]
- vsli.64 $t0,$s0,#`64-@sigma0[0]`
- vsli.64 $t1,$s0,#`64-@sigma0[1]`
- vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
- veor $s1,$t0
- vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
- vadd.i64 @X[$i%8],$s0
- vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
- veor $s1,$t1 @ sigma0(X[i+1])
- vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
- vadd.i64 @X[$i%8],$s1
-___
- &NEON_00_15(2*$i,@_);
-}
-
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7
-.arch armv7-a
-.fpu neon
-
-.global sha512_block_data_order_neon
-.type sha512_block_data_order_neon,%function
-.align 4
-sha512_block_data_order_neon:
-.LNEON:
- dmb @ errata #451034 on early Cortex A8
- add $len,$inp,$len,lsl#7 @ len to point at the end of inp
- VFP_ABI_PUSH
- adr $Ktbl,.Lsha512_block_data_order
- sub $Ktbl,$Ktbl,.Lsha512_block_data_order-K512
- vldmia $ctx,{$A-$H} @ load context
-.Loop_neon:
-___
-for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
-$code.=<<___;
- mov $cnt,#4
-.L16_79_neon:
- subs $cnt,#1
-___
-for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
-$code.=<<___;
- bne .L16_79_neon
-
- vadd.i64 $A,d30 @ h+=Maj from the past
- vldmia $ctx,{d24-d31} @ load context to temp
- vadd.i64 q8,q12 @ vectorized accumulate
- vadd.i64 q9,q13
- vadd.i64 q10,q14
- vadd.i64 q11,q15
- vstmia $ctx,{$A-$H} @ save context
- teq $inp,$len
- sub $Ktbl,#640 @ rewind K512
- bne .Loop_neon
-
- VFP_ABI_POP
- ret @ bx lr
-.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
-#endif
-___
-}
-$code.=<<___;
-.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
-.align 2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm OPENSSL_armcap_P,4,4
-#endif
-___
-
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
-$code =~ s/\bret\b/bx lr/gm;
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/@/ and !/^$/);
- print;
-}
-close SELF;
-
-print $code;
-close STDOUT; # enforce flush
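
The BODY_00_15 and .L16_79 comments in the scalar code above express each 64-bit rotation of Sigma1/sigma0 as a pair of shift/XOR operations on the 32-bit lo/hi halves, since ARMv4 has no 64-bit general registers. The standalone C sketch below checks that decomposition for Sigma1 against a direct 64-bit rotate; the helper names and test values are illustrative only and are not part of the kernel sources.

/* Sketch: verify the 32-bit-half decomposition of Sigma1(e) quoted in
 * BODY_00_15 against a direct 64-bit rotate.  Build with: gcc -O2 sigma1.c
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t ror64(uint64_t x, unsigned int n)
{
        return (x >> n) | (x << (64 - n));
}

/* Sigma1(x) = ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41), computed on 64 bits. */
static uint64_t sigma1_64(uint64_t x)
{
        return ror64(x, 14) ^ ror64(x, 18) ^ ror64(x, 41);
}

/* The same value built from 32-bit halves, as the scalar ARM code does:
 *   LO = lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
 *   HI = hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
 */
static uint64_t sigma1_halves(uint64_t x)
{
        uint32_t lo = (uint32_t)x, hi = (uint32_t)(x >> 32);
        uint32_t rlo, rhi;

        rlo = (lo >> 14 ^ hi << 18) ^ (lo >> 18 ^ hi << 14) ^ (hi >> 9 ^ lo << 23);
        rhi = (hi >> 14 ^ lo << 18) ^ (hi >> 18 ^ lo << 14) ^ (lo >> 9 ^ hi << 23);
        return (uint64_t)rhi << 32 | rlo;
}

int main(void)
{
        uint64_t x = 0x0123456789abcdefULL;     /* arbitrary starting pattern */

        for (int i = 0; i < 1000; i++) {
                assert(sigma1_64(x) == sigma1_halves(x));
                x = x * 6364136223846793005ULL + 1;     /* simple LCG */
        }
        puts("32-bit-half decomposition matches the 64-bit Sigma1");
        return 0;
}
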
diff --git a/arch/arm/crypto/sha512-glue.c b/arch/arm/crypto/sha512-glue.c
deleted file mode 100644
index f8a6480889b1..000000000000
--- a/arch/arm/crypto/sha512-glue.c
+++ /dev/null
@@ -1,110 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * sha512-glue.c - accelerated SHA-384/512 for ARM
- *
- * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha512_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include "sha512.h"
-
-MODULE_DESCRIPTION("Accelerated SHA-384/SHA-512 secure hash for ARM");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-
-MODULE_ALIAS_CRYPTO("sha384");
-MODULE_ALIAS_CRYPTO("sha512");
-MODULE_ALIAS_CRYPTO("sha384-arm");
-MODULE_ALIAS_CRYPTO("sha512-arm");
-
-asmlinkage void sha512_block_data_order(struct sha512_state *state,
- u8 const *src, int blocks);
-
-static int sha512_arm_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return sha512_base_do_update_blocks(desc, data, len,
- sha512_block_data_order);
-}
-
-static int sha512_arm_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- sha512_base_do_finup(desc, data, len, sha512_block_data_order);
- return sha512_base_finish(desc, out);
-}
-
-static struct shash_alg sha512_arm_algs[] = { {
- .init = sha384_base_init,
- .update = sha512_arm_update,
- .finup = sha512_arm_finup,
- .descsize = SHA512_STATE_SIZE,
- .digestsize = SHA384_DIGEST_SIZE,
- .base = {
- .cra_name = "sha384",
- .cra_driver_name = "sha384-arm",
- .cra_priority = 250,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .init = sha512_base_init,
- .update = sha512_arm_update,
- .finup = sha512_arm_finup,
- .descsize = SHA512_STATE_SIZE,
- .digestsize = SHA512_DIGEST_SIZE,
- .base = {
- .cra_name = "sha512",
- .cra_driver_name = "sha512-arm",
- .cra_priority = 250,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static int __init sha512_arm_mod_init(void)
-{
- int err;
-
- err = crypto_register_shashes(sha512_arm_algs,
- ARRAY_SIZE(sha512_arm_algs));
- if (err)
- return err;
-
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) {
- err = crypto_register_shashes(sha512_neon_algs,
- ARRAY_SIZE(sha512_neon_algs));
- if (err)
- goto err_unregister;
- }
- return 0;
-
-err_unregister:
- crypto_unregister_shashes(sha512_arm_algs,
- ARRAY_SIZE(sha512_arm_algs));
-
- return err;
-}
-
-static void __exit sha512_arm_mod_fini(void)
-{
- crypto_unregister_shashes(sha512_arm_algs,
- ARRAY_SIZE(sha512_arm_algs));
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon())
- crypto_unregister_shashes(sha512_neon_algs,
- ARRAY_SIZE(sha512_neon_algs));
-}
-
-module_init(sha512_arm_mod_init);
-module_exit(sha512_arm_mod_fini);
diff --git a/arch/arm/crypto/sha512-neon-glue.c b/arch/arm/crypto/sha512-neon-glue.c
deleted file mode 100644
index bd528077fefb..000000000000
--- a/arch/arm/crypto/sha512-neon-glue.c
+++ /dev/null
@@ -1,75 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * sha512-neon-glue.c - accelerated SHA-384/512 for ARM NEON
- *
- * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha512_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include "sha512.h"
-
-MODULE_ALIAS_CRYPTO("sha384-neon");
-MODULE_ALIAS_CRYPTO("sha512-neon");
-
-asmlinkage void sha512_block_data_order_neon(struct sha512_state *state,
- const u8 *src, int blocks);
-
-static int sha512_neon_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- int remain;
-
- kernel_neon_begin();
- remain = sha512_base_do_update_blocks(desc, data, len,
- sha512_block_data_order_neon);
- kernel_neon_end();
- return remain;
-}
-
-static int sha512_neon_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- kernel_neon_begin();
- sha512_base_do_finup(desc, data, len, sha512_block_data_order_neon);
- kernel_neon_end();
- return sha512_base_finish(desc, out);
-}
-
-struct shash_alg sha512_neon_algs[] = { {
- .init = sha384_base_init,
- .update = sha512_neon_update,
- .finup = sha512_neon_finup,
- .descsize = SHA512_STATE_SIZE,
- .digestsize = SHA384_DIGEST_SIZE,
- .base = {
- .cra_name = "sha384",
- .cra_driver_name = "sha384-neon",
- .cra_priority = 300,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA384_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
-
- }
-}, {
- .init = sha512_base_init,
- .update = sha512_neon_update,
- .finup = sha512_neon_finup,
- .descsize = SHA512_STATE_SIZE,
- .digestsize = SHA512_DIGEST_SIZE,
- .base = {
- .cra_name = "sha512",
- .cra_driver_name = "sha512-neon",
- .cra_priority = 300,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
diff --git a/arch/arm/crypto/sha512.h b/arch/arm/crypto/sha512.h
deleted file mode 100644
index eeaee52cda69..000000000000
--- a/arch/arm/crypto/sha512.h
+++ /dev/null
@@ -1,3 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-extern struct shash_alg sha512_neon_algs[2];
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index f379c852dcb7..88336a1292bb 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -119,7 +119,7 @@ no_work_pending:
ct_user_enter save = 0
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
bl stackleak_erase_on_task_stack
#endif
restore_user_regs fast = 0, offset = 0
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index c421a899fc84..7951b2c06fec 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -677,7 +677,7 @@ enum arm_regset {
static const struct user_regset arm_regsets[] = {
[REGSET_GPR] = {
- .core_note_type = NT_PRSTATUS,
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
.n = ELF_NGREG,
.size = sizeof(u32),
.align = sizeof(u32),
@@ -689,7 +689,7 @@ static const struct user_regset arm_regsets[] = {
* For the FPA regs in fpstate, the real fields are a mixture
* of sizes, so pretend that the registers are word-sized:
*/
- .core_note_type = NT_PRFPREG,
+ USER_REGSET_NOTE_TYPE(PRFPREG),
.n = sizeof(struct user_fp) / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
@@ -702,7 +702,7 @@ static const struct user_regset arm_regsets[] = {
* Pretend that the VFP regs are word-sized, since the FPSCR is
* a single word dangling at the end of struct user_vfp:
*/
- .core_note_type = NT_ARM_VFP,
+ USER_REGSET_NOTE_TYPE(ARM_VFP),
.n = ARM_VFPREGS_SIZE / sizeof(u32),
.size = sizeof(u32),
.align = sizeof(u32),
diff --git a/arch/arm/lib/.gitignore b/arch/arm/lib/.gitignore
new file mode 100644
index 000000000000..647d7a922e68
--- /dev/null
+++ b/arch/arm/lib/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# This now-removed directory used to contain generated files.
+/crypto/
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 91ea0e29107a..0ca5aae1bcc3 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -5,8 +5,6 @@
# Copyright (C) 1995-2000 Russell King
#
-obj-y += crypto/
-
lib-y := changebit.o csumipv6.o csumpartial.o \
csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
delay.o delay-loop.o findbit.o memchr.o memcpy.o \
@@ -47,9 +45,3 @@ ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
endif
obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
-
-obj-$(CONFIG_CRC32_ARCH) += crc32-arm.o
-crc32-arm-y := crc32.o crc32-core.o
-
-obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-arm.o
-crc-t10dif-arm-y := crc-t10dif.o crc-t10dif-core.o
diff --git a/arch/arm/lib/crc-t10dif-core.S b/arch/arm/lib/crc-t10dif-core.S
deleted file mode 100644
index 2bbf2df9c1e2..000000000000
--- a/arch/arm/lib/crc-t10dif-core.S
+++ /dev/null
@@ -1,468 +0,0 @@
-//
-// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
-//
-// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
-// Copyright (C) 2019 Google LLC <ebiggers@google.com>
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License version 2 as
-// published by the Free Software Foundation.
-//
-
-// Derived from the x86 version:
-//
-// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
-//
-// Copyright (c) 2013, Intel Corporation
-//
-// Authors:
-// Erdinc Ozturk <erdinc.ozturk@intel.com>
-// Vinodh Gopal <vinodh.gopal@intel.com>
-// James Guilford <james.guilford@intel.com>
-// Tim Chen <tim.c.chen@linux.intel.com>
-//
-// This software is available to you under a choice of one of two
-// licenses. You may choose to be licensed under the terms of the GNU
-// General Public License (GPL) Version 2, available from the file
-// COPYING in the main directory of this source tree, or the
-// OpenIB.org BSD license below:
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the
-// distribution.
-//
-// * Neither the name of the Intel Corporation nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-//
-// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Reference paper titled "Fast CRC Computation for Generic
-// Polynomials Using PCLMULQDQ Instruction"
-// URL: http://www.intel.com/content/dam/www/public/us/en/documents
-// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
-//
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-#ifdef CONFIG_CPU_ENDIAN_BE8
-#define CPU_LE(code...)
-#else
-#define CPU_LE(code...) code
-#endif
-
- .text
- .arch armv8-a
- .fpu crypto-neon-fp-armv8
-
- init_crc .req r0
- buf .req r1
- len .req r2
-
- fold_consts_ptr .req ip
-
- q0l .req d0
- q0h .req d1
- q1l .req d2
- q1h .req d3
- q2l .req d4
- q2h .req d5
- q3l .req d6
- q3h .req d7
- q4l .req d8
- q4h .req d9
- q5l .req d10
- q5h .req d11
- q6l .req d12
- q6h .req d13
- q7l .req d14
- q7h .req d15
- q8l .req d16
- q8h .req d17
- q9l .req d18
- q9h .req d19
- q10l .req d20
- q10h .req d21
- q11l .req d22
- q11h .req d23
- q12l .req d24
- q12h .req d25
-
- FOLD_CONSTS .req q10
- FOLD_CONST_L .req q10l
- FOLD_CONST_H .req q10h
-
- /*
- * Pairwise long polynomial multiplication of two 16-bit values
- *
- * { w0, w1 }, { y0, y1 }
- *
- * by two 64-bit values
- *
- * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
- *
- * where each vector element is a byte, ordered from least to most
- * significant. The resulting 80-bit vectors are XOR'ed together.
- *
- * This can be implemented using 8x8 long polynomial multiplication, by
- * reorganizing the input so that each pairwise 8x8 multiplication
- * produces one of the terms from the decomposition below, and
- * combining the results of each rank and shifting them into place.
- *
- * Rank
- * 0 w0*x0 ^ | y0*z0 ^
- * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^
- * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^
- * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^
- * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^
- * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^
- * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^
- * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^
- * 8 w1*x7 << 64 | y1*z7 << 64
- *
- * The inputs can be reorganized into
- *
- * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
- * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
- *
- * and after performing 8x8->16 bit long polynomial multiplication of
- * each of the halves of the first vector with those of the second one,
- * we obtain the following four vectors of 16-bit elements:
- *
- * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
- * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
- * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
- * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
- *
- * Results b and c can be XORed together, as the vector elements have
- * matching ranks. Then, the final XOR can be pulled forward, and
- * applied between the halves of each of the remaining three vectors,
- * which are then shifted into place, and XORed together to produce the
- * final 80-bit result.
- */
- .macro pmull16x64_p8, v16, v64
- vext.8 q11, \v64, \v64, #1
- vld1.64 {q12}, [r4, :128]
- vuzp.8 q11, \v64
- vtbl.8 d24, {\v16\()_L-\v16\()_H}, d24
- vtbl.8 d25, {\v16\()_L-\v16\()_H}, d25
- bl __pmull16x64_p8
- veor \v64, q12, q14
- .endm
-
-__pmull16x64_p8:
- vmull.p8 q13, d23, d24
- vmull.p8 q14, d23, d25
- vmull.p8 q15, d22, d24
- vmull.p8 q12, d22, d25
-
- veor q14, q14, q15
- veor d24, d24, d25
- veor d26, d26, d27
- veor d28, d28, d29
- vmov.i32 d25, #0
- vmov.i32 d29, #0
- vext.8 q12, q12, q12, #14
- vext.8 q14, q14, q14, #15
- veor d24, d24, d26
- bx lr
-ENDPROC(__pmull16x64_p8)
-
- .macro pmull16x64_p64, v16, v64
- vmull.p64 q11, \v64\()l, \v16\()_L
- vmull.p64 \v64, \v64\()h, \v16\()_H
- veor \v64, \v64, q11
- .endm
-
- // Fold reg1, reg2 into the next 32 data bytes, storing the result back
- // into reg1, reg2.
- .macro fold_32_bytes, reg1, reg2, p
- vld1.64 {q8-q9}, [buf]!
-
- pmull16x64_\p FOLD_CONST, \reg1
- pmull16x64_\p FOLD_CONST, \reg2
-
-CPU_LE( vrev64.8 q8, q8 )
-CPU_LE( vrev64.8 q9, q9 )
- vswp q8l, q8h
- vswp q9l, q9h
-
- veor.8 \reg1, \reg1, q8
- veor.8 \reg2, \reg2, q9
- .endm
-
- // Fold src_reg into dst_reg, optionally loading the next fold constants
- .macro fold_16_bytes, src_reg, dst_reg, p, load_next_consts
- pmull16x64_\p FOLD_CONST, \src_reg
- .ifnb \load_next_consts
- vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
- .endif
- veor.8 \dst_reg, \dst_reg, \src_reg
- .endm
-
- .macro crct10dif, p
- // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
- cmp len, #256
- blt .Lless_than_256_bytes\@
-
- mov_l fold_consts_ptr, .Lfold_across_128_bytes_consts
-
- // Load the first 128 data bytes. Byte swapping is necessary to make
- // the bit order match the polynomial coefficient order.
- vld1.64 {q0-q1}, [buf]!
- vld1.64 {q2-q3}, [buf]!
- vld1.64 {q4-q5}, [buf]!
- vld1.64 {q6-q7}, [buf]!
-CPU_LE( vrev64.8 q0, q0 )
-CPU_LE( vrev64.8 q1, q1 )
-CPU_LE( vrev64.8 q2, q2 )
-CPU_LE( vrev64.8 q3, q3 )
-CPU_LE( vrev64.8 q4, q4 )
-CPU_LE( vrev64.8 q5, q5 )
-CPU_LE( vrev64.8 q6, q6 )
-CPU_LE( vrev64.8 q7, q7 )
- vswp q0l, q0h
- vswp q1l, q1h
- vswp q2l, q2h
- vswp q3l, q3h
- vswp q4l, q4h
- vswp q5l, q5h
- vswp q6l, q6h
- vswp q7l, q7h
-
- // XOR the first 16 data *bits* with the initial CRC value.
- vmov.i8 q8h, #0
- vmov.u16 q8h[3], init_crc
- veor q0h, q0h, q8h
-
- // Load the constants for folding across 128 bytes.
- vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
-
- // Subtract 128 for the 128 data bytes just consumed. Subtract another
- // 128 to simplify the termination condition of the following loop.
- sub len, len, #256
-
- // While >= 128 data bytes remain (not counting q0-q7), fold the 128
- // bytes q0-q7 into them, storing the result back into q0-q7.
-.Lfold_128_bytes_loop\@:
- fold_32_bytes q0, q1, \p
- fold_32_bytes q2, q3, \p
- fold_32_bytes q4, q5, \p
- fold_32_bytes q6, q7, \p
- subs len, len, #128
- bge .Lfold_128_bytes_loop\@
-
- // Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.
-
- // Fold across 64 bytes.
- vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
- fold_16_bytes q0, q4, \p
- fold_16_bytes q1, q5, \p
- fold_16_bytes q2, q6, \p
- fold_16_bytes q3, q7, \p, 1
- // Fold across 32 bytes.
- fold_16_bytes q4, q6, \p
- fold_16_bytes q5, q7, \p, 1
- // Fold across 16 bytes.
- fold_16_bytes q6, q7, \p
-
- // Add 128 to get the correct number of data bytes remaining in 0...127
- // (not counting q7), following the previous extra subtraction by 128.
- // Then subtract 16 to simplify the termination condition of the
- // following loop.
- adds len, len, #(128-16)
-
- // While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7
- // into them, storing the result back into q7.
- blt .Lfold_16_bytes_loop_done\@
-.Lfold_16_bytes_loop\@:
- pmull16x64_\p FOLD_CONST, q7
- vld1.64 {q0}, [buf]!
-CPU_LE( vrev64.8 q0, q0 )
- vswp q0l, q0h
- veor.8 q7, q7, q0
- subs len, len, #16
- bge .Lfold_16_bytes_loop\@
-
-.Lfold_16_bytes_loop_done\@:
- // Add 16 to get the correct number of data bytes remaining in 0...15
- // (not counting q7), following the previous extra subtraction by 16.
- adds len, len, #16
- beq .Lreduce_final_16_bytes\@
-
-.Lhandle_partial_segment\@:
- // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
- // 16 bytes are in q7 and the rest are the remaining data in 'buf'. To
- // do this without needing a fold constant for each possible 'len',
- // redivide the bytes into a first chunk of 'len' bytes and a second
- // chunk of 16 bytes, then fold the first chunk into the second.
-
- // q0 = last 16 original data bytes
- add buf, buf, len
- sub buf, buf, #16
- vld1.64 {q0}, [buf]
-CPU_LE( vrev64.8 q0, q0 )
- vswp q0l, q0h
-
- // q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
- mov_l r1, .Lbyteshift_table + 16
- sub r1, r1, len
- vld1.8 {q2}, [r1]
- vtbl.8 q1l, {q7l-q7h}, q2l
- vtbl.8 q1h, {q7l-q7h}, q2h
-
- // q3 = first chunk: q7 right-shifted by '16-len' bytes.
- vmov.i8 q3, #0x80
- veor.8 q2, q2, q3
- vtbl.8 q3l, {q7l-q7h}, q2l
- vtbl.8 q3h, {q7l-q7h}, q2h
-
- // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
- vshr.s8 q2, q2, #7
-
- // q2 = second chunk: 'len' bytes from q0 (low-order bytes),
- // then '16-len' bytes from q1 (high-order bytes).
- vbsl.8 q2, q1, q0
-
- // Fold the first chunk into the second chunk, storing the result in q7.
- pmull16x64_\p FOLD_CONST, q3
- veor.8 q7, q3, q2
- b .Lreduce_final_16_bytes\@
-
-.Lless_than_256_bytes\@:
- // Checksumming a buffer of length 16...255 bytes
-
- mov_l fold_consts_ptr, .Lfold_across_16_bytes_consts
-
- // Load the first 16 data bytes.
- vld1.64 {q7}, [buf]!
-CPU_LE( vrev64.8 q7, q7 )
- vswp q7l, q7h
-
- // XOR the first 16 data *bits* with the initial CRC value.
- vmov.i8 q0h, #0
- vmov.u16 q0h[3], init_crc
- veor.8 q7h, q7h, q0h
-
- // Load the fold-across-16-bytes constants.
- vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
-
- cmp len, #16
- beq .Lreduce_final_16_bytes\@ // len == 16
- subs len, len, #32
- addlt len, len, #16
- blt .Lhandle_partial_segment\@ // 17 <= len <= 31
- b .Lfold_16_bytes_loop\@ // 32 <= len <= 255
-
-.Lreduce_final_16_bytes\@:
- .endm
-
-//
-// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
-//
-// Assumes len >= 16.
-//
-ENTRY(crc_t10dif_pmull64)
- crct10dif p64
-
- // Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.
-
- // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
- vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
-
- // Fold the high 64 bits into the low 64 bits, while also multiplying by
- // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
- // whose low 48 bits are 0.
- vmull.p64 q0, q7h, FOLD_CONST_H // high bits * x^48 * (x^80 mod G(x))
- veor.8 q0h, q0h, q7l // + low bits * x^64
-
- // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
- // value congruent to x^64 * M(x) and whose low 48 bits are 0.
- vmov.i8 q1, #0
- vmov s4, s3 // extract high 32 bits
- vmov s3, s5 // zero high 32 bits
- vmull.p64 q1, q1l, FOLD_CONST_L // high 32 bits * x^48 * (x^48 mod G(x))
- veor.8 q0, q0, q1 // + low bits
-
- // Load G(x) and floor(x^48 / G(x)).
- vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]
-
- // Use Barrett reduction to compute the final CRC value.
- vmull.p64 q1, q0h, FOLD_CONST_H // high 32 bits * floor(x^48 / G(x))
- vshr.u64 q1l, q1l, #32 // /= x^32
- vmull.p64 q1, q1l, FOLD_CONST_L // *= G(x)
- vshr.u64 q0l, q0l, #48
- veor.8 q0l, q0l, q1l // + low 16 nonzero bits
- // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of q0.
-
- vmov.u16 r0, q0l[0]
- bx lr
-ENDPROC(crc_t10dif_pmull64)
-
-ENTRY(crc_t10dif_pmull8)
- push {r4, lr}
- mov_l r4, .L16x64perm
-
- crct10dif p8
-
-CPU_LE( vrev64.8 q7, q7 )
- vswp q7l, q7h
- vst1.64 {q7}, [r3, :128]
- pop {r4, pc}
-ENDPROC(crc_t10dif_pmull8)
-
- .section ".rodata", "a"
- .align 4
-
-// Fold constants precomputed from the polynomial 0x18bb7
-// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
-.Lfold_across_128_bytes_consts:
- .quad 0x0000000000006123 // x^(8*128) mod G(x)
- .quad 0x0000000000002295 // x^(8*128+64) mod G(x)
-// .Lfold_across_64_bytes_consts:
- .quad 0x0000000000001069 // x^(4*128) mod G(x)
- .quad 0x000000000000dd31 // x^(4*128+64) mod G(x)
-// .Lfold_across_32_bytes_consts:
- .quad 0x000000000000857d // x^(2*128) mod G(x)
- .quad 0x0000000000007acc // x^(2*128+64) mod G(x)
-.Lfold_across_16_bytes_consts:
- .quad 0x000000000000a010 // x^(1*128) mod G(x)
- .quad 0x0000000000001faa // x^(1*128+64) mod G(x)
-// .Lfinal_fold_consts:
- .quad 0x1368000000000000 // x^48 * (x^48 mod G(x))
- .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x))
-// .Lbarrett_reduction_consts:
- .quad 0x0000000000018bb7 // G(x)
- .quad 0x00000001f65a57f8 // floor(x^48 / G(x))
-
-// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
-// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
-// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
-.Lbyteshift_table:
- .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
- .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
- .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
- .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
-
-.L16x64perm:
- .quad 0x808080800000000, 0x909090901010101
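
The long comment above pmull16x64_p8 describes how, on cores without the 64-bit vmull.p64 instruction, the 16x64-bit carry-less product is rebuilt from 8x8-bit vmull.p8 products grouped by rank. Below is a minimal userspace sketch of that decomposition, checked against a direct bit-by-bit carry-less multiply; it assumes the GCC/Clang unsigned __int128 extension for the 80-bit result, and the function names and test values are illustrative.

/* Sketch: the rank decomposition used by pmull16x64_p8, modelled in C.
 * Build with: gcc -O2 pmull16x64_model.c (uses unsigned __int128).
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef unsigned __int128 u128;

/* One vmull.p8 lane: 8x8 -> 16-bit carry-less multiply. */
static uint16_t clmul8x8(uint8_t a, uint8_t b)
{
        uint16_t r = 0;

        for (int i = 0; i < 8; i++)
                if (b & (1 << i))
                        r ^= (uint16_t)a << i;
        return r;
}

/* Reference: direct 16x64 -> 80-bit carry-less multiply. */
static u128 clmul16x64(uint16_t w, uint64_t x)
{
        u128 r = 0;

        for (int i = 0; i < 16; i++)
                if (w & (1 << i))
                        r ^= (u128)x << i;
        return r;
}

/* The same product built from byte-wise 8x8 products; summed over j, rank k
 * collects w0*x[k] ^ w1*x[k-1], exactly as in the table in the comment. */
static u128 clmul16x64_bytewise(uint16_t w, uint64_t x)
{
        uint8_t w0 = w & 0xff, w1 = w >> 8;
        u128 r = 0;

        for (int j = 0; j < 8; j++) {
                uint8_t xj = (x >> (8 * j)) & 0xff;

                r ^= (u128)clmul8x8(w0, xj) << (8 * j);
                r ^= (u128)clmul8x8(w1, xj) << (8 * j + 8);
        }
        return r;
}

int main(void)
{
        uint16_t w = 0x8bb7;                    /* arbitrary test values */
        uint64_t x = 0x0123456789abcdefULL;

        assert(clmul16x64(w, x) == clmul16x64_bytewise(w, x));
        puts("byte-wise rank decomposition matches the direct carry-less multiply");
        return 0;
}
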
diff --git a/arch/arm/lib/crc-t10dif.c b/arch/arm/lib/crc-t10dif.c
deleted file mode 100644
index 1093f8ec13b0..000000000000
--- a/arch/arm/lib/crc-t10dif.c
+++ /dev/null
@@ -1,72 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
- *
- * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/crc-t10dif.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <crypto/internal/simd.h>
-
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
-
-#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
-
-asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len);
-asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len,
- u8 out[16]);
-
-u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
-{
- if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) {
- if (static_branch_likely(&have_pmull)) {
- if (crypto_simd_usable()) {
- kernel_neon_begin();
- crc = crc_t10dif_pmull64(crc, data, length);
- kernel_neon_end();
- return crc;
- }
- } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE &&
- static_branch_likely(&have_neon) &&
- crypto_simd_usable()) {
- u8 buf[16] __aligned(16);
-
- kernel_neon_begin();
- crc_t10dif_pmull8(crc, data, length, buf);
- kernel_neon_end();
-
- return crc_t10dif_generic(0, buf, sizeof(buf));
- }
- }
- return crc_t10dif_generic(crc, data, length);
-}
-EXPORT_SYMBOL(crc_t10dif_arch);
-
-static int __init crc_t10dif_arm_init(void)
-{
- if (elf_hwcap & HWCAP_NEON) {
- static_branch_enable(&have_neon);
- if (elf_hwcap2 & HWCAP2_PMULL)
- static_branch_enable(&have_pmull);
- }
- return 0;
-}
-subsys_initcall(crc_t10dif_arm_init);
-
-static void __exit crc_t10dif_arm_exit(void)
-{
-}
-module_exit(crc_t10dif_arm_exit);
-
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_DESCRIPTION("Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions");
-MODULE_LICENSE("GPL v2");
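
Both fallback paths above end in crc_t10dif_generic(), a plain software CRC over the polynomial G(x) = 0x18bb7 documented in the assembly. The sketch below is an illustrative model of such a generic CRC-T10DIF (bit-at-a-time, cross-checked against a table-driven variant built from the same polynomial); it is not the kernel's crc_t10dif_generic() itself, and the file and function names are made up for the example.

/* Sketch of a software CRC-T10DIF (poly 0x8bb7, non-reflected, init 0):
 * a bit-at-a-time version cross-checked against a table-driven one.
 * Build with: gcc -O2 crct10dif_model.c
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CRC_T10DIF_POLY 0x8bb7  /* G(x) without the implicit x^16 term */

static uint16_t crc_t10dif_bitwise(uint16_t crc, const uint8_t *p, size_t len)
{
        while (len--) {
                crc ^= (uint16_t)(*p++) << 8;
                for (int i = 0; i < 8; i++)
                        crc = (crc & 0x8000) ? (crc << 1) ^ CRC_T10DIF_POLY
                                             : crc << 1;
        }
        return crc;
}

static uint16_t crc_t10dif_table(uint16_t crc, const uint8_t *p, size_t len)
{
        static uint16_t tab[256];
        static int init;

        if (!init) {            /* build the byte-at-a-time table once */
                for (int b = 0; b < 256; b++) {
                        uint8_t byte = b;

                        tab[b] = crc_t10dif_bitwise(0, &byte, 1);
                }
                init = 1;
        }
        while (len--)
                crc = (crc << 8) ^ tab[(crc >> 8) ^ *p++];
        return crc;
}

int main(void)
{
        const char *msg = "123456789";
        size_t len = strlen(msg);

        assert(crc_t10dif_bitwise(0, (const uint8_t *)msg, len) ==
               crc_t10dif_table(0, (const uint8_t *)msg, len));
        printf("crc16-t10dif(\"%s\") = 0x%04x\n", msg,
               (unsigned)crc_t10dif_bitwise(0, (const uint8_t *)msg, len));
        return 0;
}
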
diff --git a/arch/arm/lib/crc32-core.S b/arch/arm/lib/crc32-core.S
deleted file mode 100644
index 6f674f30c70b..000000000000
--- a/arch/arm/lib/crc32-core.S
+++ /dev/null
@@ -1,306 +0,0 @@
-/*
- * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
- *
- * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-/* GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see http://www.gnu.org/licenses
- *
- * Please visit http://www.xyratex.com/contact if you need additional
- * information or have any questions.
- *
- * GPL HEADER END
- */
-
-/*
- * Copyright 2012 Xyratex Technology Limited
- *
- * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
- * calculation.
- * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
- * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
- * at:
- * https://www.intel.com/products/processor/manuals/
- * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
- * Volume 2B: Instruction Set Reference, N-Z
- *
- * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
- * Alexander Boyko <Alexander_Boyko@xyratex.com>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .text
- .align 6
- .arch armv8-a
- .arch_extension crc
- .fpu crypto-neon-fp-armv8
-
-.Lcrc32_constants:
- /*
- * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
- * #define CONSTANT_R1 0x154442bd4LL
- *
- * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
- * #define CONSTANT_R2 0x1c6e41596LL
- */
- .quad 0x0000000154442bd4
- .quad 0x00000001c6e41596
-
- /*
- * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
- * #define CONSTANT_R3 0x1751997d0LL
- *
- * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
- * #define CONSTANT_R4 0x0ccaa009eLL
- */
- .quad 0x00000001751997d0
- .quad 0x00000000ccaa009e
-
- /*
- * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
- * #define CONSTANT_R5 0x163cd6124LL
- */
- .quad 0x0000000163cd6124
- .quad 0x00000000FFFFFFFF
-
- /*
- * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
- *
- * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
- * = 0x1F7011641LL
- * #define CONSTANT_RU 0x1F7011641LL
- */
- .quad 0x00000001DB710641
- .quad 0x00000001F7011641
-
-.Lcrc32c_constants:
- .quad 0x00000000740eef02
- .quad 0x000000009e4addf8
- .quad 0x00000000f20c0dfe
- .quad 0x000000014cd00bd6
- .quad 0x00000000dd45aab8
- .quad 0x00000000FFFFFFFF
- .quad 0x0000000105ec76f0
- .quad 0x00000000dea713f1
-
- dCONSTANTl .req d0
- dCONSTANTh .req d1
- qCONSTANT .req q0
-
- BUF .req r0
- LEN .req r1
- CRC .req r2
-
- qzr .req q9
-
- /**
- * Calculate crc32
- * BUF - buffer
- * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
- * CRC - initial crc32
- * return the resulting crc32 in r0
- * uint crc32_pmull_le(unsigned char const *buffer,
- * size_t len, uint crc32)
- */
-SYM_FUNC_START(crc32_pmull_le)
- adr r3, .Lcrc32_constants
- b 0f
-SYM_FUNC_END(crc32_pmull_le)
-
-SYM_FUNC_START(crc32c_pmull_le)
- adr r3, .Lcrc32c_constants
-
-0: bic LEN, LEN, #15
- vld1.8 {q1-q2}, [BUF, :128]!
- vld1.8 {q3-q4}, [BUF, :128]!
- vmov.i8 qzr, #0
- vmov.i8 qCONSTANT, #0
- vmov.32 dCONSTANTl[0], CRC
- veor.8 d2, d2, dCONSTANTl
- sub LEN, LEN, #0x40
- cmp LEN, #0x40
- blt less_64
-
- vld1.64 {qCONSTANT}, [r3]
-
-loop_64: /* 64 bytes Full cache line folding */
- sub LEN, LEN, #0x40
-
- vmull.p64 q5, d3, dCONSTANTh
- vmull.p64 q6, d5, dCONSTANTh
- vmull.p64 q7, d7, dCONSTANTh
- vmull.p64 q8, d9, dCONSTANTh
-
- vmull.p64 q1, d2, dCONSTANTl
- vmull.p64 q2, d4, dCONSTANTl
- vmull.p64 q3, d6, dCONSTANTl
- vmull.p64 q4, d8, dCONSTANTl
-
- veor.8 q1, q1, q5
- vld1.8 {q5}, [BUF, :128]!
- veor.8 q2, q2, q6
- vld1.8 {q6}, [BUF, :128]!
- veor.8 q3, q3, q7
- vld1.8 {q7}, [BUF, :128]!
- veor.8 q4, q4, q8
- vld1.8 {q8}, [BUF, :128]!
-
- veor.8 q1, q1, q5
- veor.8 q2, q2, q6
- veor.8 q3, q3, q7
- veor.8 q4, q4, q8
-
- cmp LEN, #0x40
- bge loop_64
-
-less_64: /* Folding cache line into 128bit */
- vldr dCONSTANTl, [r3, #16]
- vldr dCONSTANTh, [r3, #24]
-
- vmull.p64 q5, d3, dCONSTANTh
- vmull.p64 q1, d2, dCONSTANTl
- veor.8 q1, q1, q5
- veor.8 q1, q1, q2
-
- vmull.p64 q5, d3, dCONSTANTh
- vmull.p64 q1, d2, dCONSTANTl
- veor.8 q1, q1, q5
- veor.8 q1, q1, q3
-
- vmull.p64 q5, d3, dCONSTANTh
- vmull.p64 q1, d2, dCONSTANTl
- veor.8 q1, q1, q5
- veor.8 q1, q1, q4
-
- teq LEN, #0
- beq fold_64
-
-loop_16: /* Folding rest buffer into 128bit */
- subs LEN, LEN, #0x10
-
- vld1.8 {q2}, [BUF, :128]!
- vmull.p64 q5, d3, dCONSTANTh
- vmull.p64 q1, d2, dCONSTANTl
- veor.8 q1, q1, q5
- veor.8 q1, q1, q2
-
- bne loop_16
-
-fold_64:
- /* perform the last 64 bit fold, also adds 32 zeroes
- * to the input stream */
- vmull.p64 q2, d2, dCONSTANTh
- vext.8 q1, q1, qzr, #8
- veor.8 q1, q1, q2
-
- /* final 32-bit fold */
- vldr dCONSTANTl, [r3, #32]
- vldr d6, [r3, #40]
- vmov.i8 d7, #0
-
- vext.8 q2, q1, qzr, #4
- vand.8 d2, d2, d6
- vmull.p64 q1, d2, dCONSTANTl
- veor.8 q1, q1, q2
-
- /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
- vldr dCONSTANTl, [r3, #48]
- vldr dCONSTANTh, [r3, #56]
-
- vand.8 q2, q1, q3
- vext.8 q2, qzr, q2, #8
- vmull.p64 q2, d5, dCONSTANTh
- vand.8 q2, q2, q3
- vmull.p64 q2, d4, dCONSTANTl
- veor.8 q1, q1, q2
- vmov r0, s5
-
- bx lr
-SYM_FUNC_END(crc32c_pmull_le)
-
- .macro __crc32, c
- subs ip, r2, #8
- bmi .Ltail\c
-
- tst r1, #3
- bne .Lunaligned\c
-
- teq ip, #0
-.Laligned8\c:
- ldrd r2, r3, [r1], #8
-ARM_BE8(rev r2, r2 )
-ARM_BE8(rev r3, r3 )
- crc32\c\()w r0, r0, r2
- crc32\c\()w r0, r0, r3
- bxeq lr
- subs ip, ip, #8
- bpl .Laligned8\c
-
-.Ltail\c:
- tst ip, #4
- beq 2f
- ldr r3, [r1], #4
-ARM_BE8(rev r3, r3 )
- crc32\c\()w r0, r0, r3
-
-2: tst ip, #2
- beq 1f
- ldrh r3, [r1], #2
-ARM_BE8(rev16 r3, r3 )
- crc32\c\()h r0, r0, r3
-
-1: tst ip, #1
- bxeq lr
- ldrb r3, [r1]
- crc32\c\()b r0, r0, r3
- bx lr
-
-.Lunaligned\c:
- tst r1, #1
- beq 2f
- ldrb r3, [r1], #1
- subs r2, r2, #1
- crc32\c\()b r0, r0, r3
-
- tst r1, #2
- beq 0f
-2: ldrh r3, [r1], #2
- subs r2, r2, #2
-ARM_BE8(rev16 r3, r3 )
- crc32\c\()h r0, r0, r3
-
-0: subs ip, r2, #8
- bpl .Laligned8\c
- b .Ltail\c
- .endm
-
- .align 5
-SYM_FUNC_START(crc32_armv8_le)
- __crc32
-SYM_FUNC_END(crc32_armv8_le)
-
- .align 5
-SYM_FUNC_START(crc32c_armv8_le)
- __crc32 c
-SYM_FUNC_END(crc32c_armv8_le)
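
The __crc32 macro above walks the buffer as an optional unaligned head, 8-byte chunks via the crc32(c)w instructions, and a 4/2/1-byte tail; each crc32w/h/b step corresponds to a reflected CRC update over that many bytes of the operand, least-significant byte first (hence the ARM_BE8 rev fixups). Below is a small sketch of that traversal for plain CRC32 (reflected polynomial 0xEDB88320, as in the constants comment); it models the instructions with byte-wise updates, assumes a little-endian host, and the names are illustrative.

/* Sketch: reflected CRC32 as used by crc32_armv8_le, and a check that the
 * word + tail splitting used by the __crc32 macro gives the same result as a
 * byte-at-a-time pass.  Build with: gcc -O2 crc32_model.c
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc32_byte(uint32_t crc, uint8_t b)
{
        crc ^= b;
        for (int i = 0; i < 8; i++)
                crc = (crc & 1) ? (crc >> 1) ^ 0xEDB88320 : crc >> 1;
        return crc;
}

/* Model of a crc32w step: a CRC update over the 4-byte little-endian
 * representation of the operand (crc32h/crc32b are the 2/1-byte analogues). */
static uint32_t crc32_word(uint32_t crc, uint32_t w)
{
        for (int i = 0; i < 4; i++)
                crc = crc32_byte(crc, (w >> (8 * i)) & 0xff);
        return crc;
}

int main(void)
{
        const uint8_t buf[] = "123456789";
        size_t len = strlen((const char *)buf);
        uint32_t crc = ~0u, ref = ~0u;
        uint32_t w0, w1;

        /* byte-at-a-time reference */
        for (size_t i = 0; i < len; i++)
                ref = crc32_byte(ref, buf[i]);

        /* 8 bytes as two little-endian words, then a 1-byte tail, mirroring
         * the .Laligned8/.Ltail structure of the macro */
        memcpy(&w0, buf, 4);            /* assumes a little-endian host */
        memcpy(&w1, buf + 4, 4);
        crc = crc32_word(crc, w0);
        crc = crc32_word(crc, w1);
        crc = crc32_byte(crc, buf[8]);

        assert(crc == ref);
        printf("crc32(\"123456789\") = 0x%08x\n", (unsigned)~ref); /* canonical check value 0xcbf43926 */
        return 0;
}
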
diff --git a/arch/arm/lib/crc32.c b/arch/arm/lib/crc32.c
deleted file mode 100644
index f2bef8849c7c..000000000000
--- a/arch/arm/lib/crc32.c
+++ /dev/null
@@ -1,123 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
- *
- * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/cpufeature.h>
-#include <linux/crc32.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <crypto/internal/simd.h>
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
-
-#define PMULL_MIN_LEN 64 /* min size of buffer for pmull functions */
-
-asmlinkage u32 crc32_pmull_le(const u8 buf[], u32 len, u32 init_crc);
-asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], u32 len);
-
-asmlinkage u32 crc32c_pmull_le(const u8 buf[], u32 len, u32 init_crc);
-asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], u32 len);
-
-static u32 crc32_le_scalar(u32 crc, const u8 *p, size_t len)
-{
- if (static_branch_likely(&have_crc32))
- return crc32_armv8_le(crc, p, len);
- return crc32_le_base(crc, p, len);
-}
-
-u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
-{
- if (len >= PMULL_MIN_LEN + 15 &&
- static_branch_likely(&have_pmull) && crypto_simd_usable()) {
- size_t n = -(uintptr_t)p & 15;
-
- /* align p to 16-byte boundary */
- if (n) {
- crc = crc32_le_scalar(crc, p, n);
- p += n;
- len -= n;
- }
- n = round_down(len, 16);
- kernel_neon_begin();
- crc = crc32_pmull_le(p, n, crc);
- kernel_neon_end();
- p += n;
- len -= n;
- }
- return crc32_le_scalar(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_le_arch);
-
-static u32 crc32c_scalar(u32 crc, const u8 *p, size_t len)
-{
- if (static_branch_likely(&have_crc32))
- return crc32c_armv8_le(crc, p, len);
- return crc32c_base(crc, p, len);
-}
-
-u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
-{
- if (len >= PMULL_MIN_LEN + 15 &&
- static_branch_likely(&have_pmull) && crypto_simd_usable()) {
- size_t n = -(uintptr_t)p & 15;
-
- /* align p to 16-byte boundary */
- if (n) {
- crc = crc32c_scalar(crc, p, n);
- p += n;
- len -= n;
- }
- n = round_down(len, 16);
- kernel_neon_begin();
- crc = crc32c_pmull_le(p, n, crc);
- kernel_neon_end();
- p += n;
- len -= n;
- }
- return crc32c_scalar(crc, p, len);
-}
-EXPORT_SYMBOL(crc32c_arch);
-
-u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
-{
- return crc32_be_base(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_be_arch);
-
-static int __init crc32_arm_init(void)
-{
- if (elf_hwcap2 & HWCAP2_CRC32)
- static_branch_enable(&have_crc32);
- if (elf_hwcap2 & HWCAP2_PMULL)
- static_branch_enable(&have_pmull);
- return 0;
-}
-subsys_initcall(crc32_arm_init);
-
-static void __exit crc32_arm_exit(void)
-{
-}
-module_exit(crc32_arm_exit);
-
-u32 crc32_optimizations(void)
-{
- if (elf_hwcap2 & (HWCAP2_CRC32 | HWCAP2_PMULL))
- return CRC32_LE_OPTIMIZATION | CRC32C_OPTIMIZATION;
- return 0;
-}
-EXPORT_SYMBOL(crc32_optimizations);
-
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_DESCRIPTION("Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions");
-MODULE_LICENSE("GPL v2");
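
crc32_le_arch() and crc32c_arch() above peel off -(uintptr_t)p & 15 head bytes with the scalar routine so the PMULL path only ever sees a 16-byte-aligned pointer and a multiple-of-16 length. A tiny sketch of that alignment idiom (illustrative addresses only):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* -(uintptr_t)p & 15 is the number of bytes from p up to the next
         * 16-byte boundary (0 when p is already aligned). */
        for (uintptr_t p = 0x1000; p < 0x1010; p++) {
                uintptr_t n = -p & 15;

                assert((p + n) % 16 == 0 && n < 16);
                printf("p=%#lx head=%lu\n", (unsigned long)p, (unsigned long)n);
        }
        return 0;
}
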
diff --git a/arch/arm/lib/crypto/.gitignore b/arch/arm/lib/crypto/.gitignore
deleted file mode 100644
index 12d74d8b03d0..000000000000
--- a/arch/arm/lib/crypto/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-poly1305-core.S
-sha256-core.S
diff --git a/arch/arm/lib/crypto/Kconfig b/arch/arm/lib/crypto/Kconfig
deleted file mode 100644
index d1ad664f0c67..000000000000
--- a/arch/arm/lib/crypto/Kconfig
+++ /dev/null
@@ -1,31 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_BLAKE2S_ARM
- bool "Hash functions: BLAKE2s"
- select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
- help
- BLAKE2s cryptographic hash function (RFC 7693)
-
- Architecture: arm
-
- This is faster than the generic implementations of BLAKE2s and
- BLAKE2b, but slower than the NEON implementation of BLAKE2b.
- There is no NEON implementation of BLAKE2s, since NEON doesn't
- really help with it.
-
-config CRYPTO_CHACHA20_NEON
- tristate
- default CRYPTO_LIB_CHACHA
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
-
-config CRYPTO_POLY1305_ARM
- tristate
- default CRYPTO_LIB_POLY1305
- select CRYPTO_ARCH_HAVE_LIB_POLY1305
-
-config CRYPTO_SHA256_ARM
- tristate
- depends on !CPU_V7M
- default CRYPTO_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256
- select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
diff --git a/arch/arm/lib/crypto/Makefile b/arch/arm/lib/crypto/Makefile
deleted file mode 100644
index 431f77c3ff6f..000000000000
--- a/arch/arm/lib/crypto/Makefile
+++ /dev/null
@@ -1,32 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o
-libblake2s-arm-y := blake2s-core.o blake2s-glue.o
-
-obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
-chacha-neon-y := chacha-scalar-core.o chacha-glue.o
-chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
-
-obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
-poly1305-arm-y := poly1305-core.o poly1305-glue.o
-
-obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
-sha256-arm-y := sha256.o sha256-core.o
-sha256-arm-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o
-
-quiet_cmd_perl = PERL $@
- cmd_perl = $(PERL) $(<) > $(@)
-
-$(obj)/%-core.S: $(src)/%-armv4.pl
- $(call cmd,perl)
-
-clean-files += poly1305-core.S sha256-core.S
-
-aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1
-
-# massage the perlasm code a bit so we only get the NEON routine if we need it
-poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
-poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
-AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y)
-
-AFLAGS_sha256-core.o += $(aflags-thumb2-y)
diff --git a/arch/arm/lib/crypto/blake2s-core.S b/arch/arm/lib/crypto/blake2s-core.S
deleted file mode 100644
index df40e46601f1..000000000000
--- a/arch/arm/lib/crypto/blake2s-core.S
+++ /dev/null
@@ -1,306 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * BLAKE2s digest algorithm, ARM scalar implementation
- *
- * Copyright 2020 Google LLC
- *
- * Author: Eric Biggers <ebiggers@google.com>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- // Registers used to hold message words temporarily. There aren't
- // enough ARM registers to hold the whole message block, so we have to
- // load the words on-demand.
- M_0 .req r12
- M_1 .req r14
-
-// The BLAKE2s initialization vector
-.Lblake2s_IV:
- .word 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
- .word 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
-
-.macro __ldrd a, b, src, offset
-#if __LINUX_ARM_ARCH__ >= 6
- ldrd \a, \b, [\src, #\offset]
-#else
- ldr \a, [\src, #\offset]
- ldr \b, [\src, #\offset + 4]
-#endif
-.endm
-
-.macro __strd a, b, dst, offset
-#if __LINUX_ARM_ARCH__ >= 6
- strd \a, \b, [\dst, #\offset]
-#else
- str \a, [\dst, #\offset]
- str \b, [\dst, #\offset + 4]
-#endif
-.endm
-
-.macro _le32_bswap a, tmp
-#ifdef __ARMEB__
- rev_l \a, \tmp
-#endif
-.endm
-
-.macro _le32_bswap_8x a, b, c, d, e, f, g, h, tmp
- _le32_bswap \a, \tmp
- _le32_bswap \b, \tmp
- _le32_bswap \c, \tmp
- _le32_bswap \d, \tmp
- _le32_bswap \e, \tmp
- _le32_bswap \f, \tmp
- _le32_bswap \g, \tmp
- _le32_bswap \h, \tmp
-.endm
-
-// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
-// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
-// columns/diagonals. s0-s1 are the word offsets to the message words the first
-// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
-// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
-//
-// Note that to save instructions, the rotations don't happen when the
-// pseudocode says they should, but rather they are delayed until the values are
-// used. See the comment above _blake2s_round().
-.macro _blake2s_quarterround a0, b0, c0, d0, a1, b1, c1, d1, s0, s1, s2, s3
-
- ldr M_0, [sp, #32 + 4 * \s0]
- ldr M_1, [sp, #32 + 4 * \s2]
-
- // a += b + m[blake2s_sigma[r][2*i + 0]];
- add \a0, \a0, \b0, ror #brot
- add \a1, \a1, \b1, ror #brot
- add \a0, \a0, M_0
- add \a1, \a1, M_1
-
- // d = ror32(d ^ a, 16);
- eor \d0, \a0, \d0, ror #drot
- eor \d1, \a1, \d1, ror #drot
-
- // c += d;
- add \c0, \c0, \d0, ror #16
- add \c1, \c1, \d1, ror #16
-
- // b = ror32(b ^ c, 12);
- eor \b0, \c0, \b0, ror #brot
- eor \b1, \c1, \b1, ror #brot
-
- ldr M_0, [sp, #32 + 4 * \s1]
- ldr M_1, [sp, #32 + 4 * \s3]
-
- // a += b + m[blake2s_sigma[r][2*i + 1]];
- add \a0, \a0, \b0, ror #12
- add \a1, \a1, \b1, ror #12
- add \a0, \a0, M_0
- add \a1, \a1, M_1
-
- // d = ror32(d ^ a, 8);
- eor \d0, \a0, \d0, ror#16
- eor \d1, \a1, \d1, ror#16
-
- // c += d;
- add \c0, \c0, \d0, ror#8
- add \c1, \c1, \d1, ror#8
-
- // b = ror32(b ^ c, 7);
- eor \b0, \c0, \b0, ror#12
- eor \b1, \c1, \b1, ror#12
-.endm
-
-// Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9]
-// are in r0..r9. The stack pointer points to 8 bytes of scratch space for
-// spilling v[8..9], then to v[9..15], then to the message block. r10-r12 and
-// r14 are free to use. The macro arguments s0-s15 give the order in which the
-// message words are used in this round.
-//
-// All rotates are performed using the implicit rotate operand accepted by the
-// 'add' and 'eor' instructions. This is faster than using explicit rotate
-// instructions. To make this work, we allow the values in the second and last
-// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
-// wrong rotation amount. The rotation amount is then fixed up just in time
-// when the values are used. 'brot' is the number of bits the values in row 'b'
-// need to be rotated right to arrive at the correct values, and 'drot'
-// similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
-// that they end up as (7, 8) after every round.
-.macro _blake2s_round s0, s1, s2, s3, s4, s5, s6, s7, \
- s8, s9, s10, s11, s12, s13, s14, s15
-
- // Mix first two columns:
- // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
- __ldrd r10, r11, sp, 16 // load v[12] and v[13]
- _blake2s_quarterround r0, r4, r8, r10, r1, r5, r9, r11, \
- \s0, \s1, \s2, \s3
- __strd r8, r9, sp, 0
- __strd r10, r11, sp, 16
-
- // Mix second two columns:
- // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
- __ldrd r8, r9, sp, 8 // load v[10] and v[11]
- __ldrd r10, r11, sp, 24 // load v[14] and v[15]
- _blake2s_quarterround r2, r6, r8, r10, r3, r7, r9, r11, \
- \s4, \s5, \s6, \s7
- str r10, [sp, #24] // store v[14]
- // v[10], v[11], and v[15] are used below, so no need to store them yet.
-
- .set brot, 7
- .set drot, 8
-
- // Mix first two diagonals:
- // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
- ldr r10, [sp, #16] // load v[12]
- _blake2s_quarterround r0, r5, r8, r11, r1, r6, r9, r10, \
- \s8, \s9, \s10, \s11
- __strd r8, r9, sp, 8
- str r11, [sp, #28]
- str r10, [sp, #16]
-
- // Mix second two diagonals:
- // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
- __ldrd r8, r9, sp, 0 // load v[8] and v[9]
- __ldrd r10, r11, sp, 20 // load v[13] and v[14]
- _blake2s_quarterround r2, r7, r8, r10, r3, r4, r9, r11, \
- \s12, \s13, \s14, \s15
- __strd r10, r11, sp, 20
-.endm
-
-//
-// void blake2s_compress(struct blake2s_state *state,
-// const u8 *block, size_t nblocks, u32 inc);
-//
-// Only the first three fields of struct blake2s_state are used:
-// u32 h[8]; (inout)
-// u32 t[2]; (inout)
-// u32 f[2]; (in)
-//
- .align 5
-ENTRY(blake2s_compress)
- push {r0-r2,r4-r11,lr} // keep this an even number
-
-.Lnext_block:
- // r0 is 'state'
- // r1 is 'block'
- // r3 is 'inc'
-
- // Load and increment the counter t[0..1].
- __ldrd r10, r11, r0, 32
- adds r10, r10, r3
- adc r11, r11, #0
- __strd r10, r11, r0, 32
-
- // _blake2s_round is very short on registers, so copy the message block
- // to the stack to save a register during the rounds. This also has the
- // advantage that misalignment only needs to be dealt with in one place.
- sub sp, sp, #64
- mov r12, sp
- tst r1, #3
- bne .Lcopy_block_misaligned
- ldmia r1!, {r2-r9}
- _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14
- stmia r12!, {r2-r9}
- ldmia r1!, {r2-r9}
- _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14
- stmia r12, {r2-r9}
-.Lcopy_block_done:
- str r1, [sp, #68] // Update message pointer
-
- // Calculate v[8..15]. Push v[9..15] onto the stack, and leave space
- // for spilling v[8..9]. Leave v[8..9] in r8-r9.
- mov r14, r0 // r14 = state
- adr r12, .Lblake2s_IV
- ldmia r12!, {r8-r9} // load IV[0..1]
- __ldrd r0, r1, r14, 40 // load f[0..1]
- ldm r12, {r2-r7} // load IV[3..7]
- eor r4, r4, r10 // v[12] = IV[4] ^ t[0]
- eor r5, r5, r11 // v[13] = IV[5] ^ t[1]
- eor r6, r6, r0 // v[14] = IV[6] ^ f[0]
- eor r7, r7, r1 // v[15] = IV[7] ^ f[1]
- push {r2-r7} // push v[9..15]
- sub sp, sp, #8 // leave space for v[8..9]
-
- // Load h[0..7] == v[0..7].
- ldm r14, {r0-r7}
-
- // Execute the rounds. Each round is provided the order in which it
- // needs to use the message words.
- .set brot, 0
- .set drot, 0
- _blake2s_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
- _blake2s_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
- _blake2s_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
- _blake2s_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
- _blake2s_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
- _blake2s_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
- _blake2s_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
- _blake2s_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
- _blake2s_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
- _blake2s_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
-
- // Fold the final state matrix into the hash chaining value:
- //
- // for (i = 0; i < 8; i++)
- // h[i] ^= v[i] ^ v[i + 8];
- //
- ldr r14, [sp, #96] // r14 = &h[0]
- add sp, sp, #8 // v[8..9] are already loaded.
- pop {r10-r11} // load v[10..11]
- eor r0, r0, r8
- eor r1, r1, r9
- eor r2, r2, r10
- eor r3, r3, r11
- ldm r14, {r8-r11} // load h[0..3]
- eor r0, r0, r8
- eor r1, r1, r9
- eor r2, r2, r10
- eor r3, r3, r11
- stmia r14!, {r0-r3} // store new h[0..3]
- ldm r14, {r0-r3} // load old h[4..7]
- pop {r8-r11} // load v[12..15]
- eor r0, r0, r4, ror #brot
- eor r1, r1, r5, ror #brot
- eor r2, r2, r6, ror #brot
- eor r3, r3, r7, ror #brot
- eor r0, r0, r8, ror #drot
- eor r1, r1, r9, ror #drot
- eor r2, r2, r10, ror #drot
- eor r3, r3, r11, ror #drot
- add sp, sp, #64 // skip copy of message block
- stm r14, {r0-r3} // store new h[4..7]
-
- // Advance to the next block, if there is one. Note that if there are
- // multiple blocks, then 'inc' (the counter increment amount) must be
- // 64. So we can simply set it to 64 without re-loading it.
- ldm sp, {r0, r1, r2} // load (state, block, nblocks)
- mov r3, #64 // set 'inc'
- subs r2, r2, #1 // nblocks--
- str r2, [sp, #8]
- bne .Lnext_block // nblocks != 0?
-
- pop {r0-r2,r4-r11,pc}
-
- // The next message block (pointed to by r1) isn't 4-byte aligned, so it
- // can't be loaded using ldmia. Copy it to the stack buffer (pointed to
- // by r12) using an alternative method. r2-r9 are free to use.
-.Lcopy_block_misaligned:
- mov r2, #64
-1:
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
- ldr r3, [r1], #4
- _le32_bswap r3, r4
-#else
- ldrb r3, [r1, #0]
- ldrb r4, [r1, #1]
- ldrb r5, [r1, #2]
- ldrb r6, [r1, #3]
- add r1, r1, #4
- orr r3, r3, r4, lsl #8
- orr r3, r3, r5, lsl #16
- orr r3, r3, r6, lsl #24
-#endif
- subs r2, r2, #4
- str r3, [r12], #4
- bne 1b
- b .Lcopy_block_done
-ENDPROC(blake2s_compress)
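For reference, the quarterround that the deleted _blake2s_quarterround macro encodes (with its deferred rotations) is the standard BLAKE2s G function. Below is a minimal, self-contained C sketch of the plain reference form, for readers following the assembly; it is illustrative only and not part of the kernel sources.

#include <stdint.h>
#include <stdio.h>

static inline uint32_t ror32(uint32_t v, int n)
{
        return (v >> n) | (v << (32 - n));
}

/*
 * One BLAKE2s quarterround ("G"), written as the comments in the deleted
 * assembly describe it, without the deferred-rotation optimization.
 * m0 and m1 are the two message words selected by the sigma permutation.
 */
static void blake2s_g(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
                      uint32_t m0, uint32_t m1)
{
        *a += *b + m0;
        *d = ror32(*d ^ *a, 16);
        *c += *d;
        *b = ror32(*b ^ *c, 12);
        *a += *b + m1;
        *d = ror32(*d ^ *a, 8);
        *c += *d;
        *b = ror32(*b ^ *c, 7);
}

int main(void)
{
        /* Arbitrary inputs, just to exercise the function. */
        uint32_t a = 0x6A09E667, b = 0xBB67AE85, c = 0x3C6EF372, d = 0xA54FF53A;

        blake2s_g(&a, &b, &c, &d, 0x01234567, 0x89ABCDEF);
        printf("%08x %08x %08x %08x\n", a, b, c, d);
        return 0;
}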
diff --git a/arch/arm/lib/crypto/blake2s-glue.c b/arch/arm/lib/crypto/blake2s-glue.c
deleted file mode 100644
index 0238a70d9581..000000000000
--- a/arch/arm/lib/crypto/blake2s-glue.c
+++ /dev/null
@@ -1,7 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#include <crypto/internal/blake2s.h>
-#include <linux/module.h>
-
-/* defined in blake2s-core.S */
-EXPORT_SYMBOL(blake2s_compress);
diff --git a/arch/arm/lib/crypto/chacha-glue.c b/arch/arm/lib/crypto/chacha-glue.c
deleted file mode 100644
index 88ec96415283..000000000000
--- a/arch/arm/lib/crypto/chacha-glue.c
+++ /dev/null
@@ -1,138 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * ChaCha and HChaCha functions (ARM optimized)
- *
- * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <crypto/chacha.h>
-#include <crypto/internal/simd.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include <asm/cputype.h>
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
- u8 *dst, const u8 *src, int nrounds);
-asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
- u8 *dst, const u8 *src,
- int nrounds, unsigned int nbytes);
-asmlinkage void hchacha_block_arm(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds);
-asmlinkage void hchacha_block_neon(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds);
-
-asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
- const struct chacha_state *state, int nrounds);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);
-
-static inline bool neon_usable(void)
-{
- return static_branch_likely(&use_neon) && crypto_simd_usable();
-}
-
-static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- u8 buf[CHACHA_BLOCK_SIZE];
-
- while (bytes > CHACHA_BLOCK_SIZE) {
- unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
-
- chacha_4block_xor_neon(state, dst, src, nrounds, l);
- bytes -= l;
- src += l;
- dst += l;
- state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
- }
- if (bytes) {
- const u8 *s = src;
- u8 *d = dst;
-
- if (bytes != CHACHA_BLOCK_SIZE)
- s = d = memcpy(buf, src, bytes);
- chacha_block_xor_neon(state, d, s, nrounds);
- if (d != dst)
- memcpy(dst, buf, bytes);
- state->x[12]++;
- }
-}
-
-void hchacha_block_arch(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds)
-{
- if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
- hchacha_block_arm(state, out, nrounds);
- } else {
- kernel_neon_begin();
- hchacha_block_neon(state, out, nrounds);
- kernel_neon_end();
- }
-}
-EXPORT_SYMBOL(hchacha_block_arch);
-
-void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
- bytes <= CHACHA_BLOCK_SIZE) {
- chacha_doarm(dst, src, bytes, state, nrounds);
- state->x[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
- return;
- }
-
- do {
- unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
-
- kernel_neon_begin();
- chacha_doneon(state, dst, src, todo, nrounds);
- kernel_neon_end();
-
- bytes -= todo;
- src += todo;
- dst += todo;
- } while (bytes);
-}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-bool chacha_is_arch_optimized(void)
-{
- /* We always can use at least the ARM scalar implementation. */
- return true;
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-
-static int __init chacha_arm_mod_init(void)
-{
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
- switch (read_cpuid_part()) {
- case ARM_CPU_PART_CORTEX_A7:
- case ARM_CPU_PART_CORTEX_A5:
- /*
-			 * The Cortex-A7 and Cortex-A5 do not perform well with
-			 * the NEON implementation, but they do incredibly well
-			 * with the scalar one and use less power.
- */
- break;
- default:
- static_branch_enable(&use_neon);
- }
- }
- return 0;
-}
-subsys_initcall(chacha_arm_mod_init);
-
-static void __exit chacha_arm_mod_exit(void)
-{
-}
-module_exit(chacha_arm_mod_exit);
-
-MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM optimized)");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
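The deleted glue code above keeps each kernel_neon_begin()/kernel_neon_end() section bounded to at most 4 KiB of input and advances the 32-bit block counter (state word 12) by the number of 64-byte blocks consumed. The following is a rough userspace C sketch of that bookkeeping only; process_chunk() is a hypothetical stand-in for the real keystream/XOR routine, and the names are made up for illustration.

#include <stdint.h>
#include <stdio.h>

#define CHACHA_BLOCK_SIZE 64
#define CHUNK_MAX         4096      /* mirrors the SZ_4K bound used above */

/* Hypothetical stand-in: the real code XORs the data with keystream here. */
static void process_chunk(const uint8_t *src, uint8_t *dst, unsigned int len)
{
        (void)src; (void)dst; (void)len;
}

/*
 * Process 'bytes' of input in bounded chunks, advancing the block counter
 * in state[12] by the number of 64-byte blocks each chunk consumes.
 */
static void crypt_chunked(uint32_t state[16], uint8_t *dst,
                          const uint8_t *src, unsigned int bytes)
{
        while (bytes) {
                unsigned int todo = bytes < CHUNK_MAX ? bytes : CHUNK_MAX;

                process_chunk(src, dst, todo);
                state[12] += (todo + CHACHA_BLOCK_SIZE - 1) / CHACHA_BLOCK_SIZE;
                src += todo;
                dst += todo;
                bytes -= todo;
        }
}

int main(void)
{
        static uint8_t buf[10000];
        uint32_t state[16] = { 0 };

        crypt_chunked(state, buf, buf, sizeof(buf));
        printf("blocks consumed: %u\n", (unsigned int)state[12]);  /* 157 */
        return 0;
}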
diff --git a/arch/arm/lib/crypto/chacha-neon-core.S b/arch/arm/lib/crypto/chacha-neon-core.S
deleted file mode 100644
index ddd62b6294a5..000000000000
--- a/arch/arm/lib/crypto/chacha-neon-core.S
+++ /dev/null
@@ -1,643 +0,0 @@
-/*
- * ChaCha/HChaCha NEON helper functions
- *
- * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
- /*
- * NEON doesn't have a rotate instruction. The alternatives are, more or less:
- *
- * (a) vshl.u32 + vsri.u32 (needs temporary register)
- * (b) vshl.u32 + vshr.u32 + vorr (needs temporary register)
- * (c) vrev32.16 (16-bit rotations only)
- * (d) vtbl.8 + vtbl.8 (multiple of 8 bits rotations only,
- * needs index vector)
- *
- * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations,
- * the only choices are (a) and (b). We use (a) since it takes two-thirds the
- * cycles of (b) on both Cortex-A7 and Cortex-A53.
- *
- * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
- * and doesn't need a temporary register.
- *
- * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence
- * is twice as fast as (a), even when doing (a) on multiple registers
- * simultaneously to eliminate the stall between vshl and vsri. Also, it
- * parallelizes better when temporary registers are scarce.
- *
- * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
- * (a), so the need to load the rotation table actually makes the vtbl method
- * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it
- * seems to be a good compromise to get a more significant speed boost on some
- * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
- */
-
-#include <linux/linkage.h>
-#include <asm/cache.h>
-
- .text
- .fpu neon
- .align 5
-
-/*
- * chacha_permute - permute one block
- *
- * Permute one 64-byte block where the state matrix is stored in the four NEON
- * registers q0-q3. It performs matrix operations on four words in parallel,
- * but requires shuffling to rearrange the words after each round.
- *
- * The round count is given in r3.
- *
- * Clobbers: r3, ip, q4-q5
- */
-chacha_permute:
-
- adr ip, .Lrol8_table
- vld1.8 {d10}, [ip, :64]
-
-.Ldoubleround:
- // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vadd.i32 q0, q0, q1
- veor q3, q3, q0
- vrev32.16 q3, q3
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vadd.i32 q2, q2, q3
- veor q4, q1, q2
- vshl.u32 q1, q4, #12
- vsri.u32 q1, q4, #20
-
- // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vadd.i32 q0, q0, q1
- veor q3, q3, q0
- vtbl.8 d6, {d6}, d10
- vtbl.8 d7, {d7}, d10
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vadd.i32 q2, q2, q3
- veor q4, q1, q2
- vshl.u32 q1, q4, #7
- vsri.u32 q1, q4, #25
-
- // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- vext.8 q1, q1, q1, #4
- // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vext.8 q2, q2, q2, #8
- // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- vext.8 q3, q3, q3, #12
-
- // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vadd.i32 q0, q0, q1
- veor q3, q3, q0
- vrev32.16 q3, q3
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vadd.i32 q2, q2, q3
- veor q4, q1, q2
- vshl.u32 q1, q4, #12
- vsri.u32 q1, q4, #20
-
- // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vadd.i32 q0, q0, q1
- veor q3, q3, q0
- vtbl.8 d6, {d6}, d10
- vtbl.8 d7, {d7}, d10
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vadd.i32 q2, q2, q3
- veor q4, q1, q2
- vshl.u32 q1, q4, #7
- vsri.u32 q1, q4, #25
-
- // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- vext.8 q1, q1, q1, #12
- // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vext.8 q2, q2, q2, #8
- // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- vext.8 q3, q3, q3, #4
-
- subs r3, r3, #2
- bne .Ldoubleround
-
- bx lr
-ENDPROC(chacha_permute)
-
-ENTRY(chacha_block_xor_neon)
- // r0: Input state matrix, s
- // r1: 1 data block output, o
- // r2: 1 data block input, i
- // r3: nrounds
- push {lr}
-
- // x0..3 = s0..3
- add ip, r0, #0x20
- vld1.32 {q0-q1}, [r0]
- vld1.32 {q2-q3}, [ip]
-
- vmov q8, q0
- vmov q9, q1
- vmov q10, q2
- vmov q11, q3
-
- bl chacha_permute
-
- add ip, r2, #0x20
- vld1.8 {q4-q5}, [r2]
- vld1.8 {q6-q7}, [ip]
-
- // o0 = i0 ^ (x0 + s0)
- vadd.i32 q0, q0, q8
- veor q0, q0, q4
-
- // o1 = i1 ^ (x1 + s1)
- vadd.i32 q1, q1, q9
- veor q1, q1, q5
-
- // o2 = i2 ^ (x2 + s2)
- vadd.i32 q2, q2, q10
- veor q2, q2, q6
-
- // o3 = i3 ^ (x3 + s3)
- vadd.i32 q3, q3, q11
- veor q3, q3, q7
-
- add ip, r1, #0x20
- vst1.8 {q0-q1}, [r1]
- vst1.8 {q2-q3}, [ip]
-
- pop {pc}
-ENDPROC(chacha_block_xor_neon)
-
-ENTRY(hchacha_block_neon)
- // r0: Input state matrix, s
- // r1: output (8 32-bit words)
- // r2: nrounds
- push {lr}
-
- vld1.32 {q0-q1}, [r0]!
- vld1.32 {q2-q3}, [r0]
-
- mov r3, r2
- bl chacha_permute
-
- vst1.32 {q0}, [r1]!
- vst1.32 {q3}, [r1]
-
- pop {pc}
-ENDPROC(hchacha_block_neon)
-
- .align 4
-.Lctrinc: .word 0, 1, 2, 3
-.Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6
-
- .align 5
-ENTRY(chacha_4block_xor_neon)
- push {r4, lr}
- mov r4, sp // preserve the stack pointer
- sub ip, sp, #0x20 // allocate a 32 byte buffer
- bic ip, ip, #0x1f // aligned to 32 bytes
- mov sp, ip
-
- // r0: Input state matrix, s
- // r1: 4 data blocks output, o
- // r2: 4 data blocks input, i
- // r3: nrounds
-
- //
- // This function encrypts four consecutive ChaCha blocks by loading
- // the state matrix in NEON registers four times. The algorithm performs
- // each operation on the corresponding word of each state matrix, hence
- // requires no word shuffling. The words are re-interleaved before the
- // final addition of the original state and the XORing step.
- //
-
- // x0..15[0-3] = s0..15[0-3]
- add ip, r0, #0x20
- vld1.32 {q0-q1}, [r0]
- vld1.32 {q2-q3}, [ip]
-
- adr lr, .Lctrinc
- vdup.32 q15, d7[1]
- vdup.32 q14, d7[0]
- vld1.32 {q4}, [lr, :128]
- vdup.32 q13, d6[1]
- vdup.32 q12, d6[0]
- vdup.32 q11, d5[1]
- vdup.32 q10, d5[0]
- vadd.u32 q12, q12, q4 // x12 += counter values 0-3
- vdup.32 q9, d4[1]
- vdup.32 q8, d4[0]
- vdup.32 q7, d3[1]
- vdup.32 q6, d3[0]
- vdup.32 q5, d2[1]
- vdup.32 q4, d2[0]
- vdup.32 q3, d1[1]
- vdup.32 q2, d1[0]
- vdup.32 q1, d0[1]
- vdup.32 q0, d0[0]
-
- adr ip, .Lrol8_table
- b 1f
-
-.Ldoubleround4:
- vld1.32 {q8-q9}, [sp, :256]
-1:
- // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
- // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
- // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
- // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
- vadd.i32 q0, q0, q4
- vadd.i32 q1, q1, q5
- vadd.i32 q2, q2, q6
- vadd.i32 q3, q3, q7
-
- veor q12, q12, q0
- veor q13, q13, q1
- veor q14, q14, q2
- veor q15, q15, q3
-
- vrev32.16 q12, q12
- vrev32.16 q13, q13
- vrev32.16 q14, q14
- vrev32.16 q15, q15
-
- // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
- // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
- // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
- // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
- vadd.i32 q8, q8, q12
- vadd.i32 q9, q9, q13
- vadd.i32 q10, q10, q14
- vadd.i32 q11, q11, q15
-
- vst1.32 {q8-q9}, [sp, :256]
-
- veor q8, q4, q8
- veor q9, q5, q9
- vshl.u32 q4, q8, #12
- vshl.u32 q5, q9, #12
- vsri.u32 q4, q8, #20
- vsri.u32 q5, q9, #20
-
- veor q8, q6, q10
- veor q9, q7, q11
- vshl.u32 q6, q8, #12
- vshl.u32 q7, q9, #12
- vsri.u32 q6, q8, #20
- vsri.u32 q7, q9, #20
-
- // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
- // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
- // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
- // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
- vld1.8 {d16}, [ip, :64]
- vadd.i32 q0, q0, q4
- vadd.i32 q1, q1, q5
- vadd.i32 q2, q2, q6
- vadd.i32 q3, q3, q7
-
- veor q12, q12, q0
- veor q13, q13, q1
- veor q14, q14, q2
- veor q15, q15, q3
-
- vtbl.8 d24, {d24}, d16
- vtbl.8 d25, {d25}, d16
- vtbl.8 d26, {d26}, d16
- vtbl.8 d27, {d27}, d16
- vtbl.8 d28, {d28}, d16
- vtbl.8 d29, {d29}, d16
- vtbl.8 d30, {d30}, d16
- vtbl.8 d31, {d31}, d16
-
- vld1.32 {q8-q9}, [sp, :256]
-
- // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
- // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
- // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
- // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
- vadd.i32 q8, q8, q12
- vadd.i32 q9, q9, q13
- vadd.i32 q10, q10, q14
- vadd.i32 q11, q11, q15
-
- vst1.32 {q8-q9}, [sp, :256]
-
- veor q8, q4, q8
- veor q9, q5, q9
- vshl.u32 q4, q8, #7
- vshl.u32 q5, q9, #7
- vsri.u32 q4, q8, #25
- vsri.u32 q5, q9, #25
-
- veor q8, q6, q10
- veor q9, q7, q11
- vshl.u32 q6, q8, #7
- vshl.u32 q7, q9, #7
- vsri.u32 q6, q8, #25
- vsri.u32 q7, q9, #25
-
- vld1.32 {q8-q9}, [sp, :256]
-
- // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
- // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
- // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
- // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
- vadd.i32 q0, q0, q5
- vadd.i32 q1, q1, q6
- vadd.i32 q2, q2, q7
- vadd.i32 q3, q3, q4
-
- veor q15, q15, q0
- veor q12, q12, q1
- veor q13, q13, q2
- veor q14, q14, q3
-
- vrev32.16 q15, q15
- vrev32.16 q12, q12
- vrev32.16 q13, q13
- vrev32.16 q14, q14
-
- // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
- // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
- // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
- // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
- vadd.i32 q10, q10, q15
- vadd.i32 q11, q11, q12
- vadd.i32 q8, q8, q13
- vadd.i32 q9, q9, q14
-
- vst1.32 {q8-q9}, [sp, :256]
-
- veor q8, q7, q8
- veor q9, q4, q9
- vshl.u32 q7, q8, #12
- vshl.u32 q4, q9, #12
- vsri.u32 q7, q8, #20
- vsri.u32 q4, q9, #20
-
- veor q8, q5, q10
- veor q9, q6, q11
- vshl.u32 q5, q8, #12
- vshl.u32 q6, q9, #12
- vsri.u32 q5, q8, #20
- vsri.u32 q6, q9, #20
-
- // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
- // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
- // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
- // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
- vld1.8 {d16}, [ip, :64]
- vadd.i32 q0, q0, q5
- vadd.i32 q1, q1, q6
- vadd.i32 q2, q2, q7
- vadd.i32 q3, q3, q4
-
- veor q15, q15, q0
- veor q12, q12, q1
- veor q13, q13, q2
- veor q14, q14, q3
-
- vtbl.8 d30, {d30}, d16
- vtbl.8 d31, {d31}, d16
- vtbl.8 d24, {d24}, d16
- vtbl.8 d25, {d25}, d16
- vtbl.8 d26, {d26}, d16
- vtbl.8 d27, {d27}, d16
- vtbl.8 d28, {d28}, d16
- vtbl.8 d29, {d29}, d16
-
- vld1.32 {q8-q9}, [sp, :256]
-
- // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
- // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
- // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
- // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
- vadd.i32 q10, q10, q15
- vadd.i32 q11, q11, q12
- vadd.i32 q8, q8, q13
- vadd.i32 q9, q9, q14
-
- vst1.32 {q8-q9}, [sp, :256]
-
- veor q8, q7, q8
- veor q9, q4, q9
- vshl.u32 q7, q8, #7
- vshl.u32 q4, q9, #7
- vsri.u32 q7, q8, #25
- vsri.u32 q4, q9, #25
-
- veor q8, q5, q10
- veor q9, q6, q11
- vshl.u32 q5, q8, #7
- vshl.u32 q6, q9, #7
- vsri.u32 q5, q8, #25
- vsri.u32 q6, q9, #25
-
- subs r3, r3, #2
- bne .Ldoubleround4
-
- // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
- // x8..9[0-3] are on the stack.
-
- // Re-interleave the words in the first two rows of each block (x0..7).
- // Also add the counter values 0-3 to x12[0-3].
- vld1.32 {q8}, [lr, :128] // load counter values 0-3
- vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1)
- vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3)
- vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5)
- vzip.32 q6, q7 // => (6 7 6 7) (6 7 6 7)
- vadd.u32 q12, q8 // x12 += counter values 0-3
- vswp d1, d4
- vswp d3, d6
- vld1.32 {q8-q9}, [r0]! // load s0..7
- vswp d9, d12
- vswp d11, d14
-
- // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
- // after XORing the first 32 bytes.
- vswp q1, q4
-
- // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)
-
- // x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block)
- vadd.u32 q0, q0, q8
- vadd.u32 q2, q2, q8
- vadd.u32 q4, q4, q8
- vadd.u32 q3, q3, q8
-
- // x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block)
- vadd.u32 q1, q1, q9
- vadd.u32 q6, q6, q9
- vadd.u32 q5, q5, q9
- vadd.u32 q7, q7, q9
-
- // XOR first 32 bytes using keystream from first two rows of first block
- vld1.8 {q8-q9}, [r2]!
- veor q8, q8, q0
- veor q9, q9, q1
- vst1.8 {q8-q9}, [r1]!
-
- // Re-interleave the words in the last two rows of each block (x8..15).
- vld1.32 {q8-q9}, [sp, :256]
- mov sp, r4 // restore original stack pointer
- ldr r4, [r4, #8] // load number of bytes
- vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13)
- vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15)
- vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9)
- vzip.32 q10, q11 // => (10 11 10 11) (10 11 10 11)
- vld1.32 {q0-q1}, [r0] // load s8..15
- vswp d25, d28
- vswp d27, d30
- vswp d17, d20
- vswp d19, d22
-
- // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
-
- // x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block)
- vadd.u32 q8, q8, q0
- vadd.u32 q10, q10, q0
- vadd.u32 q9, q9, q0
- vadd.u32 q11, q11, q0
-
- // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
- vadd.u32 q12, q12, q1
- vadd.u32 q14, q14, q1
- vadd.u32 q13, q13, q1
- vadd.u32 q15, q15, q1
-
- // XOR the rest of the data with the keystream
-
- vld1.8 {q0-q1}, [r2]!
- subs r4, r4, #96
- veor q0, q0, q8
- veor q1, q1, q12
- ble .Lle96
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
- subs r4, r4, #32
- veor q0, q0, q2
- veor q1, q1, q6
- ble .Lle128
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
- subs r4, r4, #32
- veor q0, q0, q10
- veor q1, q1, q14
- ble .Lle160
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
- subs r4, r4, #32
- veor q0, q0, q4
- veor q1, q1, q5
- ble .Lle192
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
- subs r4, r4, #32
- veor q0, q0, q9
- veor q1, q1, q13
- ble .Lle224
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
- subs r4, r4, #32
- veor q0, q0, q3
- veor q1, q1, q7
- blt .Llt256
-.Lout:
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]
- veor q0, q0, q11
- veor q1, q1, q15
- vst1.8 {q0-q1}, [r1]
-
- pop {r4, pc}
-
-.Lle192:
- vmov q4, q9
- vmov q5, q13
-
-.Lle160:
- // nothing to do
-
-.Lfinalblock:
- // Process the final block if processing less than 4 full blocks.
- // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
- // previous 32 byte output block that still needs to be written at
- // [r1] in q0-q1.
- beq .Lfullblock
-
-.Lpartialblock:
- adr lr, .Lpermute + 32
- add r2, r2, r4
- add lr, lr, r4
- add r4, r4, r1
-
- vld1.8 {q2-q3}, [lr]
- vld1.8 {q6-q7}, [r2]
-
- add r4, r4, #32
-
- vtbl.8 d4, {q4-q5}, d4
- vtbl.8 d5, {q4-q5}, d5
- vtbl.8 d6, {q4-q5}, d6
- vtbl.8 d7, {q4-q5}, d7
-
- veor q6, q6, q2
- veor q7, q7, q3
-
- vst1.8 {q6-q7}, [r4] // overlapping stores
- vst1.8 {q0-q1}, [r1]
- pop {r4, pc}
-
-.Lfullblock:
- vmov q11, q4
- vmov q15, q5
- b .Lout
-.Lle96:
- vmov q4, q2
- vmov q5, q6
- b .Lfinalblock
-.Lle128:
- vmov q4, q10
- vmov q5, q14
- b .Lfinalblock
-.Lle224:
- vmov q4, q3
- vmov q5, q7
- b .Lfinalblock
-.Llt256:
- vmov q4, q11
- vmov q5, q15
- b .Lpartialblock
-ENDPROC(chacha_4block_xor_neon)
-
- .align L1_CACHE_SHIFT
-.Lpermute:
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
- .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
- .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
- .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
- .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
- .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
- .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
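The rotation amounts that the deleted NEON code implements with vrev32.16 (16 bits), vshl.u32 + vsri.u32 (12 and 7 bits) and vtbl.8 (8 bits) are those of the standard ChaCha quarter round. As a minimal C reference for one double round, assuming the usual "expand 32-byte k" constants in the first row of the state, something like the sketch below applies (illustrative only, not the kernel code):

#include <stdint.h>
#include <stdio.h>

static inline uint32_t rol32(uint32_t v, int n)
{
        return (v << n) | (v >> (32 - n));
}

/* Reference ChaCha quarter round: rotations by 16, 12, 8 and 7 bits. */
static void chacha_qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
        *a += *b; *d = rol32(*d ^ *a, 16);
        *c += *d; *b = rol32(*b ^ *c, 12);
        *a += *b; *d = rol32(*d ^ *a, 8);
        *c += *d; *b = rol32(*b ^ *c, 7);
}

int main(void)
{
        /* First row holds the ChaCha constants; key/counter/nonce left zero. */
        uint32_t x[16] = { 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 };
        int i;

        /* One double round: four column rounds, then four diagonal rounds. */
        for (i = 0; i < 4; i++)
                chacha_qr(&x[i], &x[4 + i], &x[8 + i], &x[12 + i]);
        chacha_qr(&x[0], &x[5], &x[10], &x[15]);
        chacha_qr(&x[1], &x[6], &x[11], &x[12]);
        chacha_qr(&x[2], &x[7], &x[8],  &x[13]);
        chacha_qr(&x[3], &x[4], &x[9],  &x[14]);

        printf("x[0] after one double round: %08x\n", x[0]);
        return 0;
}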
diff --git a/arch/arm/lib/crypto/chacha-scalar-core.S b/arch/arm/lib/crypto/chacha-scalar-core.S
deleted file mode 100644
index 4951df05c158..000000000000
--- a/arch/arm/lib/crypto/chacha-scalar-core.S
+++ /dev/null
@@ -1,444 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2018 Google, Inc.
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-/*
- * Design notes:
- *
- * 16 registers would be needed to hold the state matrix, but only 14 are
- * available because 'sp' and 'pc' cannot be used. So we spill the elements
- * (x8, x9) to the stack and swap them out with (x10, x11). This adds one
- * 'ldrd' and one 'strd' instruction per round.
- *
- * All rotates are performed using the implicit rotate operand accepted by the
- * 'add' and 'eor' instructions. This is faster than using explicit rotate
- * instructions. To make this work, we allow the values in the second and last
- * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
- * wrong rotation amount. The rotation amount is then fixed up just in time
- * when the values are used. 'brot' is the number of bits the values in row 'b'
- * need to be rotated right to arrive at the correct values, and 'drot'
- * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
- * that they end up as (25, 24) after every round.
- */
-
- // ChaCha state registers
- X0 .req r0
- X1 .req r1
- X2 .req r2
- X3 .req r3
- X4 .req r4
- X5 .req r5
- X6 .req r6
- X7 .req r7
- X8_X10 .req r8 // shared by x8 and x10
- X9_X11 .req r9 // shared by x9 and x11
- X12 .req r10
- X13 .req r11
- X14 .req r12
- X15 .req r14
-
-.macro _le32_bswap_4x a, b, c, d, tmp
-#ifdef __ARMEB__
- rev_l \a, \tmp
- rev_l \b, \tmp
- rev_l \c, \tmp
- rev_l \d, \tmp
-#endif
-.endm
-
-.macro __ldrd a, b, src, offset
-#if __LINUX_ARM_ARCH__ >= 6
- ldrd \a, \b, [\src, #\offset]
-#else
- ldr \a, [\src, #\offset]
- ldr \b, [\src, #\offset + 4]
-#endif
-.endm
-
-.macro __strd a, b, dst, offset
-#if __LINUX_ARM_ARCH__ >= 6
- strd \a, \b, [\dst, #\offset]
-#else
- str \a, [\dst, #\offset]
- str \b, [\dst, #\offset + 4]
-#endif
-.endm
-
-.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2
-
- // a += b; d ^= a; d = rol(d, 16);
- add \a1, \a1, \b1, ror #brot
- add \a2, \a2, \b2, ror #brot
- eor \d1, \a1, \d1, ror #drot
- eor \d2, \a2, \d2, ror #drot
- // drot == 32 - 16 == 16
-
- // c += d; b ^= c; b = rol(b, 12);
- add \c1, \c1, \d1, ror #16
- add \c2, \c2, \d2, ror #16
- eor \b1, \c1, \b1, ror #brot
- eor \b2, \c2, \b2, ror #brot
- // brot == 32 - 12 == 20
-
- // a += b; d ^= a; d = rol(d, 8);
- add \a1, \a1, \b1, ror #20
- add \a2, \a2, \b2, ror #20
- eor \d1, \a1, \d1, ror #16
- eor \d2, \a2, \d2, ror #16
- // drot == 32 - 8 == 24
-
- // c += d; b ^= c; b = rol(b, 7);
- add \c1, \c1, \d1, ror #24
- add \c2, \c2, \d2, ror #24
- eor \b1, \c1, \b1, ror #20
- eor \b2, \c2, \b2, ror #20
- // brot == 32 - 7 == 25
-.endm
-
-.macro _doubleround
-
- // column round
-
- // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
- _halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13
-
- // save (x8, x9); restore (x10, x11)
- __strd X8_X10, X9_X11, sp, 0
- __ldrd X8_X10, X9_X11, sp, 8
-
- // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
- _halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15
-
- .set brot, 25
- .set drot, 24
-
- // diagonal round
-
- // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
- _halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12
-
- // save (x10, x11); restore (x8, x9)
- __strd X8_X10, X9_X11, sp, 8
- __ldrd X8_X10, X9_X11, sp, 0
-
- // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
- _halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14
-.endm
-
-.macro _chacha_permute nrounds
- .set brot, 0
- .set drot, 0
- .rept \nrounds / 2
- _doubleround
- .endr
-.endm
-
-.macro _chacha nrounds
-
-.Lnext_block\@:
- // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
- // Registers contain x0-x9,x12-x15.
-
- // Do the core ChaCha permutation to update x0-x15.
- _chacha_permute \nrounds
-
- add sp, #8
- // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
- // Registers contain x0-x9,x12-x15.
- // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
-
- // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
- push {X8_X10, X9_X11, X12, X13, X14, X15}
-
- // Load (OUT, IN, LEN).
- ldr r14, [sp, #96]
- ldr r12, [sp, #100]
- ldr r11, [sp, #104]
-
- orr r10, r14, r12
-
- // Use slow path if fewer than 64 bytes remain.
- cmp r11, #64
- blt .Lxor_slowpath\@
-
- // Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
- // ARMv6+, since ldmia and stmia (used below) still require alignment.
- tst r10, #3
- bne .Lxor_slowpath\@
-
- // Fast path: XOR 64 bytes of aligned data.
-
- // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
- // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
- // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
-
- // x0-x3
- __ldrd r8, r9, sp, 32
- __ldrd r10, r11, sp, 40
- add X0, X0, r8
- add X1, X1, r9
- add X2, X2, r10
- add X3, X3, r11
- _le32_bswap_4x X0, X1, X2, X3, r8
- ldmia r12!, {r8-r11}
- eor X0, X0, r8
- eor X1, X1, r9
- eor X2, X2, r10
- eor X3, X3, r11
- stmia r14!, {X0-X3}
-
- // x4-x7
- __ldrd r8, r9, sp, 48
- __ldrd r10, r11, sp, 56
- add X4, r8, X4, ror #brot
- add X5, r9, X5, ror #brot
- ldmia r12!, {X0-X3}
- add X6, r10, X6, ror #brot
- add X7, r11, X7, ror #brot
- _le32_bswap_4x X4, X5, X6, X7, r8
- eor X4, X4, X0
- eor X5, X5, X1
- eor X6, X6, X2
- eor X7, X7, X3
- stmia r14!, {X4-X7}
-
- // x8-x15
- pop {r0-r7} // (x8-x9,x12-x15,x10-x11)
- __ldrd r8, r9, sp, 32
- __ldrd r10, r11, sp, 40
- add r0, r0, r8 // x8
- add r1, r1, r9 // x9
- add r6, r6, r10 // x10
- add r7, r7, r11 // x11
- _le32_bswap_4x r0, r1, r6, r7, r8
- ldmia r12!, {r8-r11}
- eor r0, r0, r8 // x8
- eor r1, r1, r9 // x9
- eor r6, r6, r10 // x10
- eor r7, r7, r11 // x11
- stmia r14!, {r0,r1,r6,r7}
- ldmia r12!, {r0,r1,r6,r7}
- __ldrd r8, r9, sp, 48
- __ldrd r10, r11, sp, 56
- add r2, r8, r2, ror #drot // x12
- add r3, r9, r3, ror #drot // x13
- add r4, r10, r4, ror #drot // x14
- add r5, r11, r5, ror #drot // x15
- _le32_bswap_4x r2, r3, r4, r5, r9
- ldr r9, [sp, #72] // load LEN
- eor r2, r2, r0 // x12
- eor r3, r3, r1 // x13
- eor r4, r4, r6 // x14
- eor r5, r5, r7 // x15
- subs r9, #64 // decrement and check LEN
- stmia r14!, {r2-r5}
-
- beq .Ldone\@
-
-.Lprepare_for_next_block\@:
-
- // Stack: x0-x15 OUT IN LEN
-
- // Increment block counter (x12)
- add r8, #1
-
- // Store updated (OUT, IN, LEN)
- str r14, [sp, #64]
- str r12, [sp, #68]
- str r9, [sp, #72]
-
- mov r14, sp
-
- // Store updated block counter (x12)
- str r8, [sp, #48]
-
- sub sp, #16
-
- // Reload state and do next block
- ldmia r14!, {r0-r11} // load x0-x11
- __strd r10, r11, sp, 8 // store x10-x11 before state
- ldmia r14, {r10-r12,r14} // load x12-x15
- b .Lnext_block\@
-
-.Lxor_slowpath\@:
- // Slow path: < 64 bytes remaining, or unaligned input or output buffer.
- // We handle it by storing the 64 bytes of keystream to the stack, then
- // XOR-ing the needed portion with the data.
-
- // Allocate keystream buffer
- sub sp, #64
- mov r14, sp
-
- // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
- // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
- // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
-
- // Save keystream for x0-x3
- __ldrd r8, r9, sp, 96
- __ldrd r10, r11, sp, 104
- add X0, X0, r8
- add X1, X1, r9
- add X2, X2, r10
- add X3, X3, r11
- _le32_bswap_4x X0, X1, X2, X3, r8
- stmia r14!, {X0-X3}
-
- // Save keystream for x4-x7
- __ldrd r8, r9, sp, 112
- __ldrd r10, r11, sp, 120
- add X4, r8, X4, ror #brot
- add X5, r9, X5, ror #brot
- add X6, r10, X6, ror #brot
- add X7, r11, X7, ror #brot
- _le32_bswap_4x X4, X5, X6, X7, r8
- add r8, sp, #64
- stmia r14!, {X4-X7}
-
- // Save keystream for x8-x15
- ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11)
- __ldrd r8, r9, sp, 128
- __ldrd r10, r11, sp, 136
- add r0, r0, r8 // x8
- add r1, r1, r9 // x9
- add r6, r6, r10 // x10
- add r7, r7, r11 // x11
- _le32_bswap_4x r0, r1, r6, r7, r8
- stmia r14!, {r0,r1,r6,r7}
- __ldrd r8, r9, sp, 144
- __ldrd r10, r11, sp, 152
- add r2, r8, r2, ror #drot // x12
- add r3, r9, r3, ror #drot // x13
- add r4, r10, r4, ror #drot // x14
- add r5, r11, r5, ror #drot // x15
- _le32_bswap_4x r2, r3, r4, r5, r9
- stmia r14, {r2-r5}
-
- // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
- // Registers: r8 is block counter, r12 is IN.
-
- ldr r9, [sp, #168] // LEN
- ldr r14, [sp, #160] // OUT
- cmp r9, #64
- mov r0, sp
- movle r1, r9
- movgt r1, #64
- // r1 is number of bytes to XOR, in range [1, 64]
-
-.if __LINUX_ARM_ARCH__ < 6
- orr r2, r12, r14
- tst r2, #3 // IN or OUT misaligned?
- bne .Lxor_next_byte\@
-.endif
-
- // XOR a word at a time
-.rept 16
- subs r1, #4
- blt .Lxor_words_done\@
- ldr r2, [r12], #4
- ldr r3, [r0], #4
- eor r2, r2, r3
- str r2, [r14], #4
-.endr
- b .Lxor_slowpath_done\@
-.Lxor_words_done\@:
- ands r1, r1, #3
- beq .Lxor_slowpath_done\@
-
- // XOR a byte at a time
-.Lxor_next_byte\@:
- ldrb r2, [r12], #1
- ldrb r3, [r0], #1
- eor r2, r2, r3
- strb r2, [r14], #1
- subs r1, #1
- bne .Lxor_next_byte\@
-
-.Lxor_slowpath_done\@:
- subs r9, #64
- add sp, #96
- bgt .Lprepare_for_next_block\@
-
-.Ldone\@:
-.endm // _chacha
-
-/*
- * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
- * const struct chacha_state *state, int nrounds);
- */
-ENTRY(chacha_doarm)
- cmp r2, #0 // len == 0?
- reteq lr
-
- ldr ip, [sp]
- cmp ip, #12
-
- push {r0-r2,r4-r11,lr}
-
- // Push state x0-x15 onto stack.
- // Also store an extra copy of x10-x11 just before the state.
-
- add X12, r3, #48
- ldm X12, {X12,X13,X14,X15}
- push {X12,X13,X14,X15}
- sub sp, sp, #64
-
- __ldrd X8_X10, X9_X11, r3, 40
- __strd X8_X10, X9_X11, sp, 8
- __strd X8_X10, X9_X11, sp, 56
- ldm r3, {X0-X9_X11}
- __strd X0, X1, sp, 16
- __strd X2, X3, sp, 24
- __strd X4, X5, sp, 32
- __strd X6, X7, sp, 40
- __strd X8_X10, X9_X11, sp, 48
-
- beq 1f
- _chacha 20
-
-0: add sp, #76
- pop {r4-r11, pc}
-
-1: _chacha 12
- b 0b
-ENDPROC(chacha_doarm)
-
-/*
- * void hchacha_block_arm(const struct chacha_state *state,
- * u32 out[HCHACHA_OUT_WORDS], int nrounds);
- */
-ENTRY(hchacha_block_arm)
- push {r1,r4-r11,lr}
-
- cmp r2, #12 // ChaCha12 ?
-
- mov r14, r0
- ldmia r14!, {r0-r11} // load x0-x11
- push {r10-r11} // store x10-x11 to stack
- ldm r14, {r10-r12,r14} // load x12-x15
- sub sp, #8
-
- beq 1f
- _chacha_permute 20
-
- // Skip over (unused0-unused1, x10-x11)
-0: add sp, #16
-
- // Fix up rotations of x12-x15
- ror X12, X12, #drot
- ror X13, X13, #drot
- pop {r4} // load 'out'
- ror X14, X14, #drot
- ror X15, X15, #drot
-
- // Store (x0-x3,x12-x15) to 'out'
- stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}
-
- pop {r4-r11,pc}
-
-1: _chacha_permute 12
- b 0b
-ENDPROC(hchacha_block_arm)
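The design notes at the top of the deleted file depend on the identity that deferring a rotation into the shifted operand of a later 'add' or 'eor' is equivalent to rotating eagerly. A tiny C check of that identity for the 12-bit rotate step, where brot becomes 32 - 12 = 20 exactly as the comments state, could look like this (illustrative only):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static inline uint32_t rol32(uint32_t v, int n)
{
        return (v << n) | (v >> (32 - n));
}

static inline uint32_t ror32(uint32_t v, int n)
{
        return (v >> n) | (v << (32 - n));
}

int main(void)
{
        uint32_t b = 0xCAFEBABE, c = 0x8BADF00D;

        /* Eager form: rotate immediately, as the reference code does. */
        uint32_t eager = rol32(b ^ c, 12);

        /*
         * Deferred form, as the assembly does: keep the unrotated value and
         * remember that it must still be rotated right by brot = 32 - 12 = 20
         * bits before its next use (applied via the 'ror #brot' operand).
         */
        uint32_t raw = b ^ c;
        int brot = 20;

        assert(ror32(raw, brot) == eager);
        printf("deferred rotation matches eager rotation\n");
        return 0;
}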
diff --git a/arch/arm/lib/crypto/poly1305-armv4.pl b/arch/arm/lib/crypto/poly1305-armv4.pl
deleted file mode 100644
index d57c6e2fc84a..000000000000
--- a/arch/arm/lib/crypto/poly1305-armv4.pl
+++ /dev/null
@@ -1,1236 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
-#
-# ====================================================================
-# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
-# project.
-# ====================================================================
-#
-# IALU(*)/gcc-4.4 NEON
-#
-# ARM11xx(ARMv6) 7.78/+100% -
-# Cortex-A5 6.35/+130% 3.00
-# Cortex-A8 6.25/+115% 2.36
-# Cortex-A9 5.10/+95% 2.55
-# Cortex-A15 3.85/+85% 1.25(**)
-# Snapdragon S4 5.70/+100% 1.48(**)
-#
-# (*)	this is for -march=armv6, i.e. with a bunch of ldrb loads of data;
-# (**)	these are trade-off results; they can be improved by ~8%, but at
-#	the cost of a 15/12% regression on Cortex-A5/A7. It is even possible
-#	to improve the Cortex-A9 result, but then A5/A7 lose more than 20%;
-
-$flavour = shift;
-if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
-else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
-
-if ($flavour && $flavour ne "void") {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
- die "can't locate arm-xlate.pl";
-
- open STDOUT,"| \"$^X\" $xlate $flavour $output";
-} else {
- open STDOUT,">$output";
-}
-
-($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
-
-$code.=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
-# define poly1305_init poly1305_block_init_arch
-# define poly1305_blocks poly1305_blocks_arm
-# define poly1305_emit poly1305_emit_arch
-.globl poly1305_blocks_neon
-#endif
-
-#if defined(__thumb2__)
-.syntax unified
-.thumb
-#else
-.code 32
-#endif
-
-.text
-
-.globl poly1305_emit
-.globl poly1305_blocks
-.globl poly1305_init
-.type poly1305_init,%function
-.align 5
-poly1305_init:
-.Lpoly1305_init:
- stmdb sp!,{r4-r11}
-
- eor r3,r3,r3
- cmp $inp,#0
- str r3,[$ctx,#0] @ zero hash value
- str r3,[$ctx,#4]
- str r3,[$ctx,#8]
- str r3,[$ctx,#12]
- str r3,[$ctx,#16]
- str r3,[$ctx,#36] @ clear is_base2_26
- add $ctx,$ctx,#20
-
-#ifdef __thumb2__
- it eq
-#endif
- moveq r0,#0
- beq .Lno_key
-
-#if __ARM_MAX_ARCH__>=7
- mov r3,#-1
- str r3,[$ctx,#28] @ impossible key power value
-# ifndef __KERNEL__
- adr r11,.Lpoly1305_init
- ldr r12,.LOPENSSL_armcap
-# endif
-#endif
- ldrb r4,[$inp,#0]
- mov r10,#0x0fffffff
- ldrb r5,[$inp,#1]
- and r3,r10,#-4 @ 0x0ffffffc
- ldrb r6,[$inp,#2]
- ldrb r7,[$inp,#3]
- orr r4,r4,r5,lsl#8
- ldrb r5,[$inp,#4]
- orr r4,r4,r6,lsl#16
- ldrb r6,[$inp,#5]
- orr r4,r4,r7,lsl#24
- ldrb r7,[$inp,#6]
- and r4,r4,r10
-
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-# if !defined(_WIN32)
- ldr r12,[r11,r12] @ OPENSSL_armcap_P
-# endif
-# if defined(__APPLE__) || defined(_WIN32)
- ldr r12,[r12]
-# endif
-#endif
- ldrb r8,[$inp,#7]
- orr r5,r5,r6,lsl#8
- ldrb r6,[$inp,#8]
- orr r5,r5,r7,lsl#16
- ldrb r7,[$inp,#9]
- orr r5,r5,r8,lsl#24
- ldrb r8,[$inp,#10]
- and r5,r5,r3
-
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- tst r12,#ARMV7_NEON @ check for NEON
-# ifdef __thumb2__
- adr r9,.Lpoly1305_blocks_neon
- adr r11,.Lpoly1305_blocks
- it ne
- movne r11,r9
- adr r12,.Lpoly1305_emit
- orr r11,r11,#1 @ thumb-ify addresses
- orr r12,r12,#1
-# else
- add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
- ite eq
- addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
- addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
-# endif
-#endif
- ldrb r9,[$inp,#11]
- orr r6,r6,r7,lsl#8
- ldrb r7,[$inp,#12]
- orr r6,r6,r8,lsl#16
- ldrb r8,[$inp,#13]
- orr r6,r6,r9,lsl#24
- ldrb r9,[$inp,#14]
- and r6,r6,r3
-
- ldrb r10,[$inp,#15]
- orr r7,r7,r8,lsl#8
- str r4,[$ctx,#0]
- orr r7,r7,r9,lsl#16
- str r5,[$ctx,#4]
- orr r7,r7,r10,lsl#24
- str r6,[$ctx,#8]
- and r7,r7,r3
- str r7,[$ctx,#12]
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- stmia r2,{r11,r12} @ fill functions table
- mov r0,#1
-#else
- mov r0,#0
-#endif
-.Lno_key:
- ldmia sp!,{r4-r11}
-#if __ARM_ARCH__>=5
- ret @ bx lr
-#else
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size poly1305_init,.-poly1305_init
-___
-{
-my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
-my ($s1,$s2,$s3)=($r1,$r2,$r3);
-
-$code.=<<___;
-.type poly1305_blocks,%function
-.align 5
-poly1305_blocks:
-.Lpoly1305_blocks:
- stmdb sp!,{r3-r11,lr}
-
- ands $len,$len,#-16
- beq .Lno_data
-
- add $len,$len,$inp @ end pointer
- sub sp,sp,#32
-
-#if __ARM_ARCH__<7
- ldmia $ctx,{$h0-$r3} @ load context
- add $ctx,$ctx,#20
- str $len,[sp,#16] @ offload stuff
- str $ctx,[sp,#12]
-#else
- ldr lr,[$ctx,#36] @ is_base2_26
- ldmia $ctx!,{$h0-$h4} @ load hash value
- str $len,[sp,#16] @ offload stuff
- str $ctx,[sp,#12]
-
- adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
- mov $r1,$h1,lsr#6
- adcs $r1,$r1,$h2,lsl#20
- mov $r2,$h2,lsr#12
- adcs $r2,$r2,$h3,lsl#14
- mov $r3,$h3,lsr#18
- adcs $r3,$r3,$h4,lsl#8
- mov $len,#0
- teq lr,#0
- str $len,[$ctx,#16] @ clear is_base2_26
- adc $len,$len,$h4,lsr#24
-
- itttt ne
- movne $h0,$r0 @ choose between radixes
- movne $h1,$r1
- movne $h2,$r2
- movne $h3,$r3
- ldmia $ctx,{$r0-$r3} @ load key
- it ne
- movne $h4,$len
-#endif
-
- mov lr,$inp
- cmp $padbit,#0
- str $r1,[sp,#20]
- str $r2,[sp,#24]
- str $r3,[sp,#28]
- b .Loop
-
-.align 4
-.Loop:
-#if __ARM_ARCH__<7
- ldrb r0,[lr],#16 @ load input
-# ifdef __thumb2__
- it hi
-# endif
- addhi $h4,$h4,#1 @ 1<<128
- ldrb r1,[lr,#-15]
- ldrb r2,[lr,#-14]
- ldrb r3,[lr,#-13]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-12]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-11]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-10]
- adds $h0,$h0,r3 @ accumulate input
-
- ldrb r3,[lr,#-9]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-8]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-7]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-6]
- adcs $h1,$h1,r3
-
- ldrb r3,[lr,#-5]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-4]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-3]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-2]
- adcs $h2,$h2,r3
-
- ldrb r3,[lr,#-1]
- orr r1,r0,r1,lsl#8
- str lr,[sp,#8] @ offload input pointer
- orr r2,r1,r2,lsl#16
- add $s1,$r1,$r1,lsr#2
- orr r3,r2,r3,lsl#24
-#else
- ldr r0,[lr],#16 @ load input
- it hi
- addhi $h4,$h4,#1 @ padbit
- ldr r1,[lr,#-12]
- ldr r2,[lr,#-8]
- ldr r3,[lr,#-4]
-# ifdef __ARMEB__
- rev r0,r0
- rev r1,r1
- rev r2,r2
- rev r3,r3
-# endif
- adds $h0,$h0,r0 @ accumulate input
- str lr,[sp,#8] @ offload input pointer
- adcs $h1,$h1,r1
- add $s1,$r1,$r1,lsr#2
- adcs $h2,$h2,r2
-#endif
- add $s2,$r2,$r2,lsr#2
- adcs $h3,$h3,r3
- add $s3,$r3,$r3,lsr#2
-
- umull r2,r3,$h1,$r0
- adc $h4,$h4,#0
- umull r0,r1,$h0,$r0
- umlal r2,r3,$h4,$s1
- umlal r0,r1,$h3,$s1
- ldr $r1,[sp,#20] @ reload $r1
- umlal r2,r3,$h2,$s3
- umlal r0,r1,$h1,$s3
- umlal r2,r3,$h3,$s2
- umlal r0,r1,$h2,$s2
- umlal r2,r3,$h0,$r1
- str r0,[sp,#0] @ future $h0
- mul r0,$s2,$h4
- ldr $r2,[sp,#24] @ reload $r2
- adds r2,r2,r1 @ d1+=d0>>32
- eor r1,r1,r1
- adc lr,r3,#0 @ future $h2
- str r2,[sp,#4] @ future $h1
-
- mul r2,$s3,$h4
- eor r3,r3,r3
- umlal r0,r1,$h3,$s3
- ldr $r3,[sp,#28] @ reload $r3
- umlal r2,r3,$h3,$r0
- umlal r0,r1,$h2,$r0
- umlal r2,r3,$h2,$r1
- umlal r0,r1,$h1,$r1
- umlal r2,r3,$h1,$r2
- umlal r0,r1,$h0,$r2
- umlal r2,r3,$h0,$r3
- ldr $h0,[sp,#0]
- mul $h4,$r0,$h4
- ldr $h1,[sp,#4]
-
- adds $h2,lr,r0 @ d2+=d1>>32
- ldr lr,[sp,#8] @ reload input pointer
- adc r1,r1,#0
- adds $h3,r2,r1 @ d3+=d2>>32
- ldr r0,[sp,#16] @ reload end pointer
- adc r3,r3,#0
- add $h4,$h4,r3 @ h4+=d3>>32
-
- and r1,$h4,#-4
- and $h4,$h4,#3
- add r1,r1,r1,lsr#2 @ *=5
- adds $h0,$h0,r1
- adcs $h1,$h1,#0
- adcs $h2,$h2,#0
- adcs $h3,$h3,#0
- adc $h4,$h4,#0
-
- cmp r0,lr @ done yet?
- bhi .Loop
-
- ldr $ctx,[sp,#12]
- add sp,sp,#32
- stmdb $ctx,{$h0-$h4} @ store the result
-
-.Lno_data:
-#if __ARM_ARCH__>=5
- ldmia sp!,{r3-r11,pc}
-#else
- ldmia sp!,{r3-r11,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size poly1305_blocks,.-poly1305_blocks
-___
-}
-{
-my ($ctx,$mac,$nonce)=map("r$_",(0..2));
-my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
-my $g4=$ctx;
-
-$code.=<<___;
-.type poly1305_emit,%function
-.align 5
-poly1305_emit:
-.Lpoly1305_emit:
- stmdb sp!,{r4-r11}
-
- ldmia $ctx,{$h0-$h4}
-
-#if __ARM_ARCH__>=7
- ldr ip,[$ctx,#36] @ is_base2_26
-
- adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
- mov $g1,$h1,lsr#6
- adcs $g1,$g1,$h2,lsl#20
- mov $g2,$h2,lsr#12
- adcs $g2,$g2,$h3,lsl#14
- mov $g3,$h3,lsr#18
- adcs $g3,$g3,$h4,lsl#8
- mov $g4,#0
- adc $g4,$g4,$h4,lsr#24
-
- tst ip,ip
- itttt ne
- movne $h0,$g0
- movne $h1,$g1
- movne $h2,$g2
- movne $h3,$g3
- it ne
- movne $h4,$g4
-#endif
-
- adds $g0,$h0,#5 @ compare to modulus
- adcs $g1,$h1,#0
- adcs $g2,$h2,#0
- adcs $g3,$h3,#0
- adc $g4,$h4,#0
- tst $g4,#4 @ did it carry/borrow?
-
-#ifdef __thumb2__
- it ne
-#endif
- movne $h0,$g0
- ldr $g0,[$nonce,#0]
-#ifdef __thumb2__
- it ne
-#endif
- movne $h1,$g1
- ldr $g1,[$nonce,#4]
-#ifdef __thumb2__
- it ne
-#endif
- movne $h2,$g2
- ldr $g2,[$nonce,#8]
-#ifdef __thumb2__
- it ne
-#endif
- movne $h3,$g3
- ldr $g3,[$nonce,#12]
-
- adds $h0,$h0,$g0
- adcs $h1,$h1,$g1
- adcs $h2,$h2,$g2
- adc $h3,$h3,$g3
-
-#if __ARM_ARCH__>=7
-# ifdef __ARMEB__
- rev $h0,$h0
- rev $h1,$h1
- rev $h2,$h2
- rev $h3,$h3
-# endif
- str $h0,[$mac,#0]
- str $h1,[$mac,#4]
- str $h2,[$mac,#8]
- str $h3,[$mac,#12]
-#else
- strb $h0,[$mac,#0]
- mov $h0,$h0,lsr#8
- strb $h1,[$mac,#4]
- mov $h1,$h1,lsr#8
- strb $h2,[$mac,#8]
- mov $h2,$h2,lsr#8
- strb $h3,[$mac,#12]
- mov $h3,$h3,lsr#8
-
- strb $h0,[$mac,#1]
- mov $h0,$h0,lsr#8
- strb $h1,[$mac,#5]
- mov $h1,$h1,lsr#8
- strb $h2,[$mac,#9]
- mov $h2,$h2,lsr#8
- strb $h3,[$mac,#13]
- mov $h3,$h3,lsr#8
-
- strb $h0,[$mac,#2]
- mov $h0,$h0,lsr#8
- strb $h1,[$mac,#6]
- mov $h1,$h1,lsr#8
- strb $h2,[$mac,#10]
- mov $h2,$h2,lsr#8
- strb $h3,[$mac,#14]
- mov $h3,$h3,lsr#8
-
- strb $h0,[$mac,#3]
- strb $h1,[$mac,#7]
- strb $h2,[$mac,#11]
- strb $h3,[$mac,#15]
-#endif
- ldmia sp!,{r4-r11}
-#if __ARM_ARCH__>=5
- ret @ bx lr
-#else
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size poly1305_emit,.-poly1305_emit
-___
-{
-my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
-my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
-my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
-
-my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
-
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7
-.fpu neon
-
-.type poly1305_init_neon,%function
-.align 5
-poly1305_init_neon:
-.Lpoly1305_init_neon:
- ldr r3,[$ctx,#48] @ first table element
- cmp r3,#-1 @ is value impossible?
- bne .Lno_init_neon
-
- ldr r4,[$ctx,#20] @ load key base 2^32
- ldr r5,[$ctx,#24]
- ldr r6,[$ctx,#28]
- ldr r7,[$ctx,#32]
-
- and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
- mov r3,r4,lsr#26
- mov r4,r5,lsr#20
- orr r3,r3,r5,lsl#6
- mov r5,r6,lsr#14
- orr r4,r4,r6,lsl#12
- mov r6,r7,lsr#8
- orr r5,r5,r7,lsl#18
- and r3,r3,#0x03ffffff
- and r4,r4,#0x03ffffff
- and r5,r5,#0x03ffffff
-
- vdup.32 $R0,r2 @ r^1 in both lanes
- add r2,r3,r3,lsl#2 @ *5
- vdup.32 $R1,r3
- add r3,r4,r4,lsl#2
- vdup.32 $S1,r2
- vdup.32 $R2,r4
- add r4,r5,r5,lsl#2
- vdup.32 $S2,r3
- vdup.32 $R3,r5
- add r5,r6,r6,lsl#2
- vdup.32 $S3,r4
- vdup.32 $R4,r6
- vdup.32 $S4,r5
-
- mov $zeros,#2 @ counter
-
-.Lsquare_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
-
- vmull.u32 $D0,$R0,${R0}[1]
- vmull.u32 $D1,$R1,${R0}[1]
- vmull.u32 $D2,$R2,${R0}[1]
- vmull.u32 $D3,$R3,${R0}[1]
- vmull.u32 $D4,$R4,${R0}[1]
-
- vmlal.u32 $D0,$R4,${S1}[1]
- vmlal.u32 $D1,$R0,${R1}[1]
- vmlal.u32 $D2,$R1,${R1}[1]
- vmlal.u32 $D3,$R2,${R1}[1]
- vmlal.u32 $D4,$R3,${R1}[1]
-
- vmlal.u32 $D0,$R3,${S2}[1]
- vmlal.u32 $D1,$R4,${S2}[1]
- vmlal.u32 $D3,$R1,${R2}[1]
- vmlal.u32 $D2,$R0,${R2}[1]
- vmlal.u32 $D4,$R2,${R2}[1]
-
- vmlal.u32 $D0,$R2,${S3}[1]
- vmlal.u32 $D3,$R0,${R3}[1]
- vmlal.u32 $D1,$R3,${S3}[1]
- vmlal.u32 $D2,$R4,${S3}[1]
- vmlal.u32 $D4,$R1,${R3}[1]
-
- vmlal.u32 $D3,$R4,${S4}[1]
- vmlal.u32 $D0,$R1,${S4}[1]
- vmlal.u32 $D1,$R2,${S4}[1]
- vmlal.u32 $D2,$R3,${S4}[1]
- vmlal.u32 $D4,$R0,${R4}[1]
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- @ and P. Schwabe
- @
- @ H0>>+H1>>+H2>>+H3>>+H4
- @ H3>>+H4>>*5+H0>>+H1
- @
- @ Trivia.
- @
- @ Result of multiplication of n-bit number by m-bit number is
-	@ n+m bits wide. However! Even though 2^n is an (n+1)-bit number,
- @ m-bit number multiplied by 2^n is still n+m bits wide.
- @
- @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
- @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
- @ one is n+1 bits wide.
- @
- @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
- @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
- @ can be 27. However! In cases when their width exceeds 26 bits
- @ they are limited by 2^26+2^6. This in turn means that *sum*
- @ of the products with these values can still be viewed as sum
- @ of 52-bit numbers as long as the amount of addends is not a
- @ power of 2. For example,
- @
- @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
- @
- @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
- @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
-	@ 8 * (2^52) or 2^55. However, the value is then multiplied
-	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
- @ which is less than 32 * (2^52) or 2^57. And when processing
- @ data we are looking at triple as many addends...
- @
- @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
- @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
- @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
- @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
- @ instruction accepts 2x32-bit input and writes 2x64-bit result.
-	@ This means that the result of the reduction has to be compressed upon
- @ loop wrap-around. This can be done in the process of reduction
- @ to minimize amount of instructions [as well as amount of
- @ 128-bit instructions, which benefits low-end processors], but
- @ one has to watch for H2 (which is narrower than H0) and 5*H4
- @ not being wider than 58 bits, so that result of right shift
- @ by 26 bits fits in 32 bits. This is also useful on x86,
-	@ because it allows using paddd in place of paddq, which
- @ benefits Atom, where paddq is ridiculously slow.
-
- vshr.u64 $T0,$D3,#26
- vmovn.i64 $D3#lo,$D3
- vshr.u64 $T1,$D0,#26
- vmovn.i64 $D0#lo,$D0
- vadd.i64 $D4,$D4,$T0 @ h3 -> h4
- vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
- vadd.i64 $D1,$D1,$T1 @ h0 -> h1
- vbic.i32 $D0#lo,#0xfc000000
-
- vshrn.u64 $T0#lo,$D4,#26
- vmovn.i64 $D4#lo,$D4
- vshr.u64 $T1,$D1,#26
- vmovn.i64 $D1#lo,$D1
- vadd.i64 $D2,$D2,$T1 @ h1 -> h2
- vbic.i32 $D4#lo,#0xfc000000
- vbic.i32 $D1#lo,#0xfc000000
-
- vadd.i32 $D0#lo,$D0#lo,$T0#lo
- vshl.u32 $T0#lo,$T0#lo,#2
- vshrn.u64 $T1#lo,$D2,#26
- vmovn.i64 $D2#lo,$D2
- vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
- vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
- vbic.i32 $D2#lo,#0xfc000000
-
- vshr.u32 $T0#lo,$D0#lo,#26
- vbic.i32 $D0#lo,#0xfc000000
- vshr.u32 $T1#lo,$D3#lo,#26
- vbic.i32 $D3#lo,#0xfc000000
- vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
- vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
-
- subs $zeros,$zeros,#1
- beq .Lsquare_break_neon
-
- add $tbl0,$ctx,#(48+0*9*4)
- add $tbl1,$ctx,#(48+1*9*4)
-
- vtrn.32 $R0,$D0#lo @ r^2:r^1
- vtrn.32 $R2,$D2#lo
- vtrn.32 $R3,$D3#lo
- vtrn.32 $R1,$D1#lo
- vtrn.32 $R4,$D4#lo
-
- vshl.u32 $S2,$R2,#2 @ *5
- vshl.u32 $S3,$R3,#2
- vshl.u32 $S1,$R1,#2
- vshl.u32 $S4,$R4,#2
- vadd.i32 $S2,$S2,$R2
- vadd.i32 $S1,$S1,$R1
- vadd.i32 $S3,$S3,$R3
- vadd.i32 $S4,$S4,$R4
-
- vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
- vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
- vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vst1.32 {${S4}[0]},[$tbl0,:32]
- vst1.32 {${S4}[1]},[$tbl1,:32]
-
- b .Lsquare_neon
-
-.align 4
-.Lsquare_break_neon:
- add $tbl0,$ctx,#(48+2*4*9)
- add $tbl1,$ctx,#(48+3*4*9)
-
- vmov $R0,$D0#lo @ r^4:r^3
- vshl.u32 $S1,$D1#lo,#2 @ *5
- vmov $R1,$D1#lo
- vshl.u32 $S2,$D2#lo,#2
- vmov $R2,$D2#lo
- vshl.u32 $S3,$D3#lo,#2
- vmov $R3,$D3#lo
- vshl.u32 $S4,$D4#lo,#2
- vmov $R4,$D4#lo
- vadd.i32 $S1,$S1,$D1#lo
- vadd.i32 $S2,$S2,$D2#lo
- vadd.i32 $S3,$S3,$D3#lo
- vadd.i32 $S4,$S4,$D4#lo
-
- vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
- vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
- vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vst1.32 {${S4}[0]},[$tbl0]
- vst1.32 {${S4}[1]},[$tbl1]
-
-.Lno_init_neon:
- ret @ bx lr
-.size poly1305_init_neon,.-poly1305_init_neon
-
-.type poly1305_blocks_neon,%function
-.align 5
-poly1305_blocks_neon:
-.Lpoly1305_blocks_neon:
- ldr ip,[$ctx,#36] @ is_base2_26
-
- cmp $len,#64
- blo .Lpoly1305_blocks
-
- stmdb sp!,{r4-r7}
- vstmdb sp!,{d8-d15} @ ABI specification says so
-
- tst ip,ip @ is_base2_26?
- bne .Lbase2_26_neon
-
- stmdb sp!,{r1-r3,lr}
- bl .Lpoly1305_init_neon
-
- ldr r4,[$ctx,#0] @ load hash value base 2^32
- ldr r5,[$ctx,#4]
- ldr r6,[$ctx,#8]
- ldr r7,[$ctx,#12]
- ldr ip,[$ctx,#16]
-
- and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
- mov r3,r4,lsr#26
- veor $D0#lo,$D0#lo,$D0#lo
- mov r4,r5,lsr#20
- orr r3,r3,r5,lsl#6
- veor $D1#lo,$D1#lo,$D1#lo
- mov r5,r6,lsr#14
- orr r4,r4,r6,lsl#12
- veor $D2#lo,$D2#lo,$D2#lo
- mov r6,r7,lsr#8
- orr r5,r5,r7,lsl#18
- veor $D3#lo,$D3#lo,$D3#lo
- and r3,r3,#0x03ffffff
- orr r6,r6,ip,lsl#24
- veor $D4#lo,$D4#lo,$D4#lo
- and r4,r4,#0x03ffffff
- mov r1,#1
- and r5,r5,#0x03ffffff
- str r1,[$ctx,#36] @ set is_base2_26
-
- vmov.32 $D0#lo[0],r2
- vmov.32 $D1#lo[0],r3
- vmov.32 $D2#lo[0],r4
- vmov.32 $D3#lo[0],r5
- vmov.32 $D4#lo[0],r6
- adr $zeros,.Lzeros
-
- ldmia sp!,{r1-r3,lr}
- b .Lhash_loaded
-
-.align 4
-.Lbase2_26_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ load hash value
-
- veor $D0#lo,$D0#lo,$D0#lo
- veor $D1#lo,$D1#lo,$D1#lo
- veor $D2#lo,$D2#lo,$D2#lo
- veor $D3#lo,$D3#lo,$D3#lo
- veor $D4#lo,$D4#lo,$D4#lo
- vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
- adr $zeros,.Lzeros
- vld1.32 {$D4#lo[0]},[$ctx]
- sub $ctx,$ctx,#16 @ rewind
-
-.Lhash_loaded:
- add $in2,$inp,#32
- mov $padbit,$padbit,lsl#24
- tst $len,#31
- beq .Leven
-
- vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
- vmov.32 $H4#lo[0],$padbit
- sub $len,$len,#16
- add $in2,$inp,#32
-
-# ifdef __ARMEB__
- vrev32.8 $H0,$H0
- vrev32.8 $H3,$H3
- vrev32.8 $H1,$H1
- vrev32.8 $H2,$H2
-# endif
- vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
- vshl.u32 $H3#lo,$H3#lo,#18
-
- vsri.u32 $H3#lo,$H2#lo,#14
- vshl.u32 $H2#lo,$H2#lo,#12
- vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi
-
- vbic.i32 $H3#lo,#0xfc000000
- vsri.u32 $H2#lo,$H1#lo,#20
- vshl.u32 $H1#lo,$H1#lo,#6
-
- vbic.i32 $H2#lo,#0xfc000000
- vsri.u32 $H1#lo,$H0#lo,#26
- vadd.i32 $H3#hi,$H3#lo,$D3#lo
-
- vbic.i32 $H0#lo,#0xfc000000
- vbic.i32 $H1#lo,#0xfc000000
- vadd.i32 $H2#hi,$H2#lo,$D2#lo
-
- vadd.i32 $H0#hi,$H0#lo,$D0#lo
- vadd.i32 $H1#hi,$H1#lo,$D1#lo
-
- mov $tbl1,$zeros
- add $tbl0,$ctx,#48
-
- cmp $len,$len
- b .Long_tail
-
-.align 4
-.Leven:
- subs $len,$len,#64
- it lo
- movlo $in2,$zeros
-
- vmov.i32 $H4,#1<<24 @ padbit, yes, always
- vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
- add $inp,$inp,#64
- vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
- add $in2,$in2,#64
- itt hi
- addhi $tbl1,$ctx,#(48+1*9*4)
- addhi $tbl0,$ctx,#(48+3*9*4)
-
-# ifdef __ARMEB__
- vrev32.8 $H0,$H0
- vrev32.8 $H3,$H3
- vrev32.8 $H1,$H1
- vrev32.8 $H2,$H2
-# endif
- vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
- vshl.u32 $H3,$H3,#18
-
- vsri.u32 $H3,$H2,#14
- vshl.u32 $H2,$H2,#12
-
- vbic.i32 $H3,#0xfc000000
- vsri.u32 $H2,$H1,#20
- vshl.u32 $H1,$H1,#6
-
- vbic.i32 $H2,#0xfc000000
- vsri.u32 $H1,$H0,#26
-
- vbic.i32 $H0,#0xfc000000
- vbic.i32 $H1,#0xfc000000
-
- bls .Lskip_loop
-
- vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
- vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
- vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- b .Loop_neon
-
-.align 5
-.Loop_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
- @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
- @ \___________________/
- @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
- @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
- @ \___________________/ \____________________/
- @
- @ Note that we start with inp[2:3]*r^2. This is because it
- @ doesn't depend on reduction in previous iteration.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ inp[2:3]*r^2
-
- vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
- vmull.u32 $D2,$H2#hi,${R0}[1]
- vadd.i32 $H0#lo,$H0#lo,$D0#lo
- vmull.u32 $D0,$H0#hi,${R0}[1]
- vadd.i32 $H3#lo,$H3#lo,$D3#lo
- vmull.u32 $D3,$H3#hi,${R0}[1]
- vmlal.u32 $D2,$H1#hi,${R1}[1]
- vadd.i32 $H1#lo,$H1#lo,$D1#lo
- vmull.u32 $D1,$H1#hi,${R0}[1]
-
- vadd.i32 $H4#lo,$H4#lo,$D4#lo
- vmull.u32 $D4,$H4#hi,${R0}[1]
- subs $len,$len,#64
- vmlal.u32 $D0,$H4#hi,${S1}[1]
- it lo
- movlo $in2,$zeros
- vmlal.u32 $D3,$H2#hi,${R1}[1]
- vld1.32 ${S4}[1],[$tbl1,:32]
- vmlal.u32 $D1,$H0#hi,${R1}[1]
- vmlal.u32 $D4,$H3#hi,${R1}[1]
-
- vmlal.u32 $D0,$H3#hi,${S2}[1]
- vmlal.u32 $D3,$H1#hi,${R2}[1]
- vmlal.u32 $D4,$H2#hi,${R2}[1]
- vmlal.u32 $D1,$H4#hi,${S2}[1]
- vmlal.u32 $D2,$H0#hi,${R2}[1]
-
- vmlal.u32 $D3,$H0#hi,${R3}[1]
- vmlal.u32 $D0,$H2#hi,${S3}[1]
- vmlal.u32 $D4,$H1#hi,${R3}[1]
- vmlal.u32 $D1,$H3#hi,${S3}[1]
- vmlal.u32 $D2,$H4#hi,${S3}[1]
-
- vmlal.u32 $D3,$H4#hi,${S4}[1]
- vmlal.u32 $D0,$H1#hi,${S4}[1]
- vmlal.u32 $D4,$H0#hi,${R4}[1]
- vmlal.u32 $D1,$H2#hi,${S4}[1]
- vmlal.u32 $D2,$H3#hi,${S4}[1]
-
- vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
- add $in2,$in2,#64
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ (hash+inp[0:1])*r^4 and accumulate
-
- vmlal.u32 $D3,$H3#lo,${R0}[0]
- vmlal.u32 $D0,$H0#lo,${R0}[0]
- vmlal.u32 $D4,$H4#lo,${R0}[0]
- vmlal.u32 $D1,$H1#lo,${R0}[0]
- vmlal.u32 $D2,$H2#lo,${R0}[0]
- vld1.32 ${S4}[0],[$tbl0,:32]
-
- vmlal.u32 $D3,$H2#lo,${R1}[0]
- vmlal.u32 $D0,$H4#lo,${S1}[0]
- vmlal.u32 $D4,$H3#lo,${R1}[0]
- vmlal.u32 $D1,$H0#lo,${R1}[0]
- vmlal.u32 $D2,$H1#lo,${R1}[0]
-
- vmlal.u32 $D3,$H1#lo,${R2}[0]
- vmlal.u32 $D0,$H3#lo,${S2}[0]
- vmlal.u32 $D4,$H2#lo,${R2}[0]
- vmlal.u32 $D1,$H4#lo,${S2}[0]
- vmlal.u32 $D2,$H0#lo,${R2}[0]
-
- vmlal.u32 $D3,$H0#lo,${R3}[0]
- vmlal.u32 $D0,$H2#lo,${S3}[0]
- vmlal.u32 $D4,$H1#lo,${R3}[0]
- vmlal.u32 $D1,$H3#lo,${S3}[0]
- vmlal.u32 $D3,$H4#lo,${S4}[0]
-
- vmlal.u32 $D2,$H4#lo,${S3}[0]
- vmlal.u32 $D0,$H1#lo,${S4}[0]
- vmlal.u32 $D4,$H0#lo,${R4}[0]
- vmov.i32 $H4,#1<<24 @ padbit, yes, always
- vmlal.u32 $D1,$H2#lo,${S4}[0]
- vmlal.u32 $D2,$H3#lo,${S4}[0]
-
- vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
- add $inp,$inp,#64
-# ifdef __ARMEB__
- vrev32.8 $H0,$H0
- vrev32.8 $H1,$H1
- vrev32.8 $H2,$H2
- vrev32.8 $H3,$H3
-# endif
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction interleaved with base 2^32 -> base 2^26 of
- @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
-
- vshr.u64 $T0,$D3,#26
- vmovn.i64 $D3#lo,$D3
- vshr.u64 $T1,$D0,#26
- vmovn.i64 $D0#lo,$D0
- vadd.i64 $D4,$D4,$T0 @ h3 -> h4
- vbic.i32 $D3#lo,#0xfc000000
- vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
- vadd.i64 $D1,$D1,$T1 @ h0 -> h1
- vshl.u32 $H3,$H3,#18
- vbic.i32 $D0#lo,#0xfc000000
-
- vshrn.u64 $T0#lo,$D4,#26
- vmovn.i64 $D4#lo,$D4
- vshr.u64 $T1,$D1,#26
- vmovn.i64 $D1#lo,$D1
- vadd.i64 $D2,$D2,$T1 @ h1 -> h2
- vsri.u32 $H3,$H2,#14
- vbic.i32 $D4#lo,#0xfc000000
- vshl.u32 $H2,$H2,#12
- vbic.i32 $D1#lo,#0xfc000000
-
- vadd.i32 $D0#lo,$D0#lo,$T0#lo
- vshl.u32 $T0#lo,$T0#lo,#2
- vbic.i32 $H3,#0xfc000000
- vshrn.u64 $T1#lo,$D2,#26
- vmovn.i64 $D2#lo,$D2
- vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec]
- vsri.u32 $H2,$H1,#20
- vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
- vshl.u32 $H1,$H1,#6
- vbic.i32 $D2#lo,#0xfc000000
- vbic.i32 $H2,#0xfc000000
-
- vshrn.u64 $T0#lo,$D0,#26 @ re-narrow
- vmovn.i64 $D0#lo,$D0
- vsri.u32 $H1,$H0,#26
- vbic.i32 $H0,#0xfc000000
- vshr.u32 $T1#lo,$D3#lo,#26
- vbic.i32 $D3#lo,#0xfc000000
- vbic.i32 $D0#lo,#0xfc000000
- vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
- vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
- vbic.i32 $H1,#0xfc000000
-
- bhi .Loop_neon
-
-.Lskip_loop:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
- add $tbl1,$ctx,#(48+0*9*4)
- add $tbl0,$ctx,#(48+1*9*4)
- adds $len,$len,#32
- it ne
- movne $len,#0
- bne .Long_tail
-
- vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
- vadd.i32 $H0#hi,$H0#lo,$D0#lo
- vadd.i32 $H3#hi,$H3#lo,$D3#lo
- vadd.i32 $H1#hi,$H1#lo,$D1#lo
- vadd.i32 $H4#hi,$H4#lo,$D4#lo
-
-.Long_tail:
- vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
- vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2
-
- vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
- vmull.u32 $D2,$H2#hi,$R0
- vadd.i32 $H0#lo,$H0#lo,$D0#lo
- vmull.u32 $D0,$H0#hi,$R0
- vadd.i32 $H3#lo,$H3#lo,$D3#lo
- vmull.u32 $D3,$H3#hi,$R0
- vadd.i32 $H1#lo,$H1#lo,$D1#lo
- vmull.u32 $D1,$H1#hi,$R0
- vadd.i32 $H4#lo,$H4#lo,$D4#lo
- vmull.u32 $D4,$H4#hi,$R0
-
- vmlal.u32 $D0,$H4#hi,$S1
- vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vmlal.u32 $D3,$H2#hi,$R1
- vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vmlal.u32 $D1,$H0#hi,$R1
- vmlal.u32 $D4,$H3#hi,$R1
- vmlal.u32 $D2,$H1#hi,$R1
-
- vmlal.u32 $D3,$H1#hi,$R2
- vld1.32 ${S4}[1],[$tbl1,:32]
- vmlal.u32 $D0,$H3#hi,$S2
- vld1.32 ${S4}[0],[$tbl0,:32]
- vmlal.u32 $D4,$H2#hi,$R2
- vmlal.u32 $D1,$H4#hi,$S2
- vmlal.u32 $D2,$H0#hi,$R2
-
- vmlal.u32 $D3,$H0#hi,$R3
- it ne
- addne $tbl1,$ctx,#(48+2*9*4)
- vmlal.u32 $D0,$H2#hi,$S3
- it ne
- addne $tbl0,$ctx,#(48+3*9*4)
- vmlal.u32 $D4,$H1#hi,$R3
- vmlal.u32 $D1,$H3#hi,$S3
- vmlal.u32 $D2,$H4#hi,$S3
-
- vmlal.u32 $D3,$H4#hi,$S4
- vorn $MASK,$MASK,$MASK @ all-ones, can be redundant
- vmlal.u32 $D0,$H1#hi,$S4
- vshr.u64 $MASK,$MASK,#38
- vmlal.u32 $D4,$H0#hi,$R4
- vmlal.u32 $D1,$H2#hi,$S4
- vmlal.u32 $D2,$H3#hi,$S4
-
- beq .Lshort_tail
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ (hash+inp[0:1])*r^4:r^3 and accumulate
-
- vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
- vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
-
- vmlal.u32 $D2,$H2#lo,$R0
- vmlal.u32 $D0,$H0#lo,$R0
- vmlal.u32 $D3,$H3#lo,$R0
- vmlal.u32 $D1,$H1#lo,$R0
- vmlal.u32 $D4,$H4#lo,$R0
-
- vmlal.u32 $D0,$H4#lo,$S1
- vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vmlal.u32 $D3,$H2#lo,$R1
- vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vmlal.u32 $D1,$H0#lo,$R1
- vmlal.u32 $D4,$H3#lo,$R1
- vmlal.u32 $D2,$H1#lo,$R1
-
- vmlal.u32 $D3,$H1#lo,$R2
- vld1.32 ${S4}[1],[$tbl1,:32]
- vmlal.u32 $D0,$H3#lo,$S2
- vld1.32 ${S4}[0],[$tbl0,:32]
- vmlal.u32 $D4,$H2#lo,$R2
- vmlal.u32 $D1,$H4#lo,$S2
- vmlal.u32 $D2,$H0#lo,$R2
-
- vmlal.u32 $D3,$H0#lo,$R3
- vmlal.u32 $D0,$H2#lo,$S3
- vmlal.u32 $D4,$H1#lo,$R3
- vmlal.u32 $D1,$H3#lo,$S3
- vmlal.u32 $D2,$H4#lo,$S3
-
- vmlal.u32 $D3,$H4#lo,$S4
- vorn $MASK,$MASK,$MASK @ all-ones
- vmlal.u32 $D0,$H1#lo,$S4
- vshr.u64 $MASK,$MASK,#38
- vmlal.u32 $D4,$H0#lo,$R4
- vmlal.u32 $D1,$H2#lo,$S4
- vmlal.u32 $D2,$H3#lo,$S4
-
-.Lshort_tail:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ horizontal addition
-
- vadd.i64 $D3#lo,$D3#lo,$D3#hi
- vadd.i64 $D0#lo,$D0#lo,$D0#hi
- vadd.i64 $D4#lo,$D4#lo,$D4#hi
- vadd.i64 $D1#lo,$D1#lo,$D1#hi
- vadd.i64 $D2#lo,$D2#lo,$D2#hi
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction, but without narrowing
-
- vshr.u64 $T0,$D3,#26
- vand.i64 $D3,$D3,$MASK
- vshr.u64 $T1,$D0,#26
- vand.i64 $D0,$D0,$MASK
- vadd.i64 $D4,$D4,$T0 @ h3 -> h4
- vadd.i64 $D1,$D1,$T1 @ h0 -> h1
-
- vshr.u64 $T0,$D4,#26
- vand.i64 $D4,$D4,$MASK
- vshr.u64 $T1,$D1,#26
- vand.i64 $D1,$D1,$MASK
- vadd.i64 $D2,$D2,$T1 @ h1 -> h2
-
- vadd.i64 $D0,$D0,$T0
- vshl.u64 $T0,$T0,#2
- vshr.u64 $T1,$D2,#26
- vand.i64 $D2,$D2,$MASK
- vadd.i64 $D0,$D0,$T0 @ h4 -> h0
- vadd.i64 $D3,$D3,$T1 @ h2 -> h3
-
- vshr.u64 $T0,$D0,#26
- vand.i64 $D0,$D0,$MASK
- vshr.u64 $T1,$D3,#26
- vand.i64 $D3,$D3,$MASK
- vadd.i64 $D1,$D1,$T0 @ h0 -> h1
- vadd.i64 $D4,$D4,$T1 @ h3 -> h4
-
- cmp $len,#0
- bne .Leven
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ store hash value
-
- vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
- vst1.32 {$D4#lo[0]},[$ctx]
-
- vldmia sp!,{d8-d15} @ epilogue
- ldmia sp!,{r4-r7}
- ret @ bx lr
-.size poly1305_blocks_neon,.-poly1305_blocks_neon
-
-.align 5
-.Lzeros:
-.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-#ifndef __KERNEL__
-.LOPENSSL_armcap:
-# ifdef _WIN32
-.word OPENSSL_armcap_P
-# else
-.word OPENSSL_armcap_P-.Lpoly1305_init
-# endif
-.comm OPENSSL_armcap_P,4,4
-.hidden OPENSSL_armcap_P
-#endif
-#endif
-___
-} }
-$code.=<<___;
-.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
-.align 2
-___
-
-foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval $1/geo;
-
- s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
- s/\bret\b/bx lr/go or
- s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
-
- print $_,"\n";
-}
-close STDOUT; # enforce flush
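
Note (editorial, not part of the patch): the NEON routine deleted above keeps the Poly1305 accumulator as five 26-bit limbs and performs the lazy reduction its comments describe (h3 -> h4, h0 -> h1, ..., h4 -> h0). A minimal scalar C sketch of that carry chain follows for orientation; the function name is hypothetical, and the assembly interleaves these steps with the base 2^32 -> 2^26 input conversion rather than running them back to back.

#include <linux/types.h>

/* Illustrative only: one lazy-reduction pass over five 26-bit limbs,
 * mirroring the carry chain annotated in the removed poly1305-armv4.pl. */
static inline void poly1305_lazy_reduce(u64 h[5])
{
	u64 c;

	c = h[3] >> 26; h[3] &= 0x3ffffff; h[4] += c;		/* h3 -> h4 */
	c = h[0] >> 26; h[0] &= 0x3ffffff; h[1] += c;		/* h0 -> h1 */
	c = h[4] >> 26; h[4] &= 0x3ffffff; h[0] += c * 5;	/* h4 -> h0 */
	c = h[1] >> 26; h[1] &= 0x3ffffff; h[2] += c;		/* h1 -> h2 */
	c = h[2] >> 26; h[2] &= 0x3ffffff; h[3] += c;		/* h2 -> h3 */
	c = h[0] >> 26; h[0] &= 0x3ffffff; h[1] += c;		/* h0 -> h1, again */
	c = h[3] >> 26; h[3] &= 0x3ffffff; h[4] += c;		/* h3 -> h4, again */
}

The multiply by 5 on the h4 carry is what the paired vadd/vshl#2/vadd sequence computes in the assembly: the carry out of 2^130 wraps as 5, since 2^130 is congruent to 5 modulo 2^130 - 5.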
diff --git a/arch/arm/lib/crypto/poly1305-glue.c b/arch/arm/lib/crypto/poly1305-glue.c
deleted file mode 100644
index 2603b0771f2c..000000000000
--- a/arch/arm/lib/crypto/poly1305-glue.c
+++ /dev/null
@@ -1,80 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
- *
- * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <crypto/internal/poly1305.h>
-#include <linux/cpufeature.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/unaligned.h>
-
-asmlinkage void poly1305_block_init_arch(
- struct poly1305_block_state *state,
- const u8 raw_key[POLY1305_BLOCK_SIZE]);
-EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
-asmlinkage void poly1305_blocks_arm(struct poly1305_block_state *state,
- const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state,
- const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_emit_arch(const struct poly1305_state *state,
- u8 digest[POLY1305_DIGEST_SIZE],
- const u32 nonce[4]);
-EXPORT_SYMBOL_GPL(poly1305_emit_arch);
-
-void __weak poly1305_blocks_neon(struct poly1305_block_state *state,
- const u8 *src, u32 len, u32 hibit)
-{
-}
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-
-void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
- unsigned int len, u32 padbit)
-{
- len = round_down(len, POLY1305_BLOCK_SIZE);
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
- static_branch_likely(&have_neon)) {
- do {
- unsigned int todo = min_t(unsigned int, len, SZ_4K);
-
- kernel_neon_begin();
- poly1305_blocks_neon(state, src, todo, padbit);
- kernel_neon_end();
-
- len -= todo;
- src += todo;
- } while (len);
- } else
- poly1305_blocks_arm(state, src, len, padbit);
-}
-EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
-
-bool poly1305_is_arch_optimized(void)
-{
- /* We always can use at least the ARM scalar implementation. */
- return true;
-}
-EXPORT_SYMBOL(poly1305_is_arch_optimized);
-
-static int __init arm_poly1305_mod_init(void)
-{
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
- (elf_hwcap & HWCAP_NEON))
- static_branch_enable(&have_neon);
- return 0;
-}
-subsys_initcall(arm_poly1305_mod_init);
-
-static void __exit arm_poly1305_mod_exit(void)
-{
-}
-module_exit(arm_poly1305_mod_exit);
-
-MODULE_DESCRIPTION("Accelerated Poly1305 transform for ARM");
-MODULE_LICENSE("GPL v2");
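
Note (editorial, not part of the patch): the deleted glue above backed the generic Poly1305 library interface by exporting the *_arch entry points. A hedged usage sketch of that consumer side is shown below; the function names come from <crypto/poly1305.h>, but exactly how they route to poly1305_block_init_arch()/poly1305_blocks_arch()/poly1305_emit_arch() depends on the kernel configuration, and the demo function itself is hypothetical.

#include <crypto/poly1305.h>

/* Illustrative consumer of the Poly1305 library API (not from this patch). */
static void poly1305_mac_demo(const u8 key[POLY1305_KEY_SIZE],
			      const u8 *msg, unsigned int len,
			      u8 mac[POLY1305_DIGEST_SIZE])
{
	struct poly1305_desc_ctx ctx;

	poly1305_init(&ctx, key);		/* keyed setup */
	poly1305_update(&ctx, msg, len);	/* block processing, arch-accelerated when available */
	poly1305_final(&ctx, mac);		/* emit the 16-byte tag */
}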
diff --git a/arch/arm/lib/crypto/sha256-armv4.pl b/arch/arm/lib/crypto/sha256-armv4.pl
deleted file mode 100644
index 8122db7fd599..000000000000
--- a/arch/arm/lib/crypto/sha256-armv4.pl
+++ /dev/null
@@ -1,724 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0
-
-# This code is taken from the OpenSSL project but the author (Andy Polyakov)
-# has relicensed it under the GPLv2. Therefore this program is free software;
-# you can redistribute it and/or modify it under the terms of the GNU General
-# Public License version 2 as published by the Free Software Foundation.
-#
-# The original headers, including the original license headers, are
-# included below for completeness.
-
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see https://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# SHA256 block procedure for ARMv4. May 2007.
-
-# Performance is ~2x better than gcc 3.4 generated code and in "abso-
-# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
-# byte [on single-issue Xscale PXA250 core].
-
-# July 2010.
-#
-# Rescheduling for dual-issue pipeline resulted in 22% improvement on
-# Cortex A8 core and ~20 cycles per processed byte.
-
-# February 2011.
-#
-# Profiler-assisted and platform-specific optimization resulted in 16%
-# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
-
-# September 2013.
-#
-# Add NEON implementation. On Cortex A8 it was measured to process one
-# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
-# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
-# code (meaning that latter performs sub-optimally, nothing was done
-# about it).
-
-# May 2014.
-#
-# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
-
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-
-$ctx="r0"; $t0="r0";
-$inp="r1"; $t4="r1";
-$len="r2"; $t1="r2";
-$T1="r3"; $t3="r3";
-$A="r4";
-$B="r5";
-$C="r6";
-$D="r7";
-$E="r8";
-$F="r9";
-$G="r10";
-$H="r11";
-@V=($A,$B,$C,$D,$E,$F,$G,$H);
-$t2="r12";
-$Ktbl="r14";
-
-@Sigma0=( 2,13,22);
-@Sigma1=( 6,11,25);
-@sigma0=( 7,18, 3);
-@sigma1=(17,19,10);
-
-sub BODY_00_15 {
-my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
-
-$code.=<<___ if ($i<16);
-#if __ARM_ARCH__>=7
- @ ldr $t1,[$inp],#4 @ $i
-# if $i==15
- str $inp,[sp,#17*4] @ make room for $t4
-# endif
- eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
- add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
- eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
-# ifndef __ARMEB__
- rev $t1,$t1
-# endif
-#else
- @ ldrb $t1,[$inp,#3] @ $i
- add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
- ldrb $t2,[$inp,#2]
- ldrb $t0,[$inp,#1]
- orr $t1,$t1,$t2,lsl#8
- ldrb $t2,[$inp],#4
- orr $t1,$t1,$t0,lsl#16
-# if $i==15
- str $inp,[sp,#17*4] @ make room for $t4
-# endif
- eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
- orr $t1,$t1,$t2,lsl#24
- eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
-#endif
-___
-$code.=<<___;
- ldr $t2,[$Ktbl],#4 @ *K256++
- add $h,$h,$t1 @ h+=X[i]
- str $t1,[sp,#`$i%16`*4]
- eor $t1,$f,$g
- add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
- and $t1,$t1,$e
- add $h,$h,$t2 @ h+=K256[i]
- eor $t1,$t1,$g @ Ch(e,f,g)
- eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
- add $h,$h,$t1 @ h+=Ch(e,f,g)
-#if $i==31
- and $t2,$t2,#0xff
- cmp $t2,#0xf2 @ done?
-#endif
-#if $i<15
-# if __ARM_ARCH__>=7
- ldr $t1,[$inp],#4 @ prefetch
-# else
- ldrb $t1,[$inp,#3]
-# endif
- eor $t2,$a,$b @ a^b, b^c in next round
-#else
- ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
- eor $t2,$a,$b @ a^b, b^c in next round
- ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
-#endif
- eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
- and $t3,$t3,$t2 @ (b^c)&=(a^b)
- add $d,$d,$h @ d+=h
- eor $t3,$t3,$b @ Maj(a,b,c)
- add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
- @ add $h,$h,$t3 @ h+=Maj(a,b,c)
-___
- ($t2,$t3)=($t3,$t2);
-}
-
-sub BODY_16_XX {
-my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
-
-$code.=<<___;
- @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
- @ ldr $t4,[sp,#`($i+14)%16`*4]
- mov $t0,$t1,ror#$sigma0[0]
- add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
- mov $t2,$t4,ror#$sigma1[0]
- eor $t0,$t0,$t1,ror#$sigma0[1]
- eor $t2,$t2,$t4,ror#$sigma1[1]
- eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
- ldr $t1,[sp,#`($i+0)%16`*4]
- eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
- ldr $t4,[sp,#`($i+9)%16`*4]
-
- add $t2,$t2,$t0
- eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
- add $t1,$t1,$t2
- eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
- add $t1,$t1,$t4 @ X[i]
-___
- &BODY_00_15(@_);
-}
-
-$code=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-#endif
-
-.text
-#if __ARM_ARCH__<7
-.code 32
-#else
-.syntax unified
-# ifdef __thumb2__
-.thumb
-# else
-.code 32
-# endif
-#endif
-
-.type K256,%object
-.align 5
-K256:
-.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-.size K256,.-K256
-.word 0 @ terminator
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-sha256_blocks_arch
-#endif
-.align 5
-
-.global sha256_blocks_arch
-.type sha256_blocks_arch,%function
-sha256_blocks_arch:
-.Lsha256_blocks_arch:
-#if __ARM_ARCH__<7
- sub r3,pc,#8 @ sha256_blocks_arch
-#else
- adr r3,.Lsha256_blocks_arch
-#endif
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- ldr r12,.LOPENSSL_armcap
- ldr r12,[r3,r12] @ OPENSSL_armcap_P
- tst r12,#ARMV8_SHA256
- bne .LARMv8
- tst r12,#ARMV7_NEON
- bne .LNEON
-#endif
- add $len,$inp,$len,lsl#6 @ len to point at the end of inp
- stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
- ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
- sub $Ktbl,r3,#256+32 @ K256
- sub sp,sp,#16*4 @ alloca(X[16])
-.Loop:
-# if __ARM_ARCH__>=7
- ldr $t1,[$inp],#4
-# else
- ldrb $t1,[$inp,#3]
-# endif
- eor $t3,$B,$C @ magic
- eor $t2,$t2,$t2
-___
-for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
-$code.=".Lrounds_16_xx:\n";
-for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
-$code.=<<___;
-#if __ARM_ARCH__>=7
- ite eq @ Thumb2 thing, sanity check in ARM
-#endif
- ldreq $t3,[sp,#16*4] @ pull ctx
- bne .Lrounds_16_xx
-
- add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
- ldr $t0,[$t3,#0]
- ldr $t1,[$t3,#4]
- ldr $t2,[$t3,#8]
- add $A,$A,$t0
- ldr $t0,[$t3,#12]
- add $B,$B,$t1
- ldr $t1,[$t3,#16]
- add $C,$C,$t2
- ldr $t2,[$t3,#20]
- add $D,$D,$t0
- ldr $t0,[$t3,#24]
- add $E,$E,$t1
- ldr $t1,[$t3,#28]
- add $F,$F,$t2
- ldr $inp,[sp,#17*4] @ pull inp
- ldr $t2,[sp,#18*4] @ pull inp+len
- add $G,$G,$t0
- add $H,$H,$t1
- stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
- cmp $inp,$t2
- sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
- bne .Loop
-
- add sp,sp,#`16+3`*4 @ destroy frame
-#if __ARM_ARCH__>=5
- ldmia sp!,{r4-r11,pc}
-#else
- ldmia sp!,{r4-r11,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size sha256_blocks_arch,.-sha256_blocks_arch
-___
-######################################################################
-# NEON stuff
-#
-{{{
-my @X=map("q$_",(0..3));
-my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
-my $Xfer=$t4;
-my $j=0;
-
-sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
-sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
-
-sub AUTOLOAD() # thunk [simplified] x86-style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
- my $arg = pop;
- $arg = "#$arg" if ($arg*1 eq $arg);
- $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
-}
-
-sub Xupdate()
-{ use integer;
- my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body);
- my ($a,$b,$c,$d,$e,$f,$g,$h);
-
- &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T2,$T0,$sigma0[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T1,$T0,$sigma0[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T2,$T0,32-$sigma0[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T3,$T0,$sigma0[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T1,$T1,$T2);
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T3,$T0,32-$sigma0[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T1,$T1,$T3); # sigma0(X[1..4])
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T5,$T5,$T4);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T5,$T5,$T4); # sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T5,$T5,$T4);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vld1_32 ("{$T0}","[$Ktbl,:128]!");
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T5,$T5,$T4); # sigma1(X[16..17])
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 ($T0,$T0,@X[0]);
- while($#insns>=2) { eval(shift(@insns)); }
- &vst1_32 ("{$T0}","[$Xfer,:128]!");
- eval(shift(@insns));
- eval(shift(@insns));
-
- push(@X,shift(@X)); # "rotate" X[]
-}
-
-sub Xpreload()
-{ use integer;
- my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body);
- my ($a,$b,$c,$d,$e,$f,$g,$h);
-
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vld1_32 ("{$T0}","[$Ktbl,:128]!");
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vrev32_8 (@X[0],@X[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 ($T0,$T0,@X[0]);
- foreach (@insns) { eval; } # remaining instructions
- &vst1_32 ("{$T0}","[$Xfer,:128]!");
-
- push(@X,shift(@X)); # "rotate" X[]
-}
-
-sub body_00_15 () {
- (
- '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
- '&add ($h,$h,$t1)', # h+=X[i]+K[i]
- '&eor ($t1,$f,$g)',
- '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
- '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
- '&and ($t1,$t1,$e)',
- '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
- '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
- '&eor ($t1,$t1,$g)', # Ch(e,f,g)
- '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
- '&eor ($t2,$a,$b)', # a^b, b^c in next round
- '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
- '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
- '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
- '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
- '&ldr ($t1,"[sp,#64]") if ($j==31)',
- '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
- '&add ($d,$d,$h)', # d+=h
- '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
- '&eor ($t3,$t3,$b)', # Maj(a,b,c)
- '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
- )
-}
-
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7
-.arch armv7-a
-.fpu neon
-
-.global sha256_block_data_order_neon
-.type sha256_block_data_order_neon,%function
-.align 4
-sha256_block_data_order_neon:
-.LNEON:
- stmdb sp!,{r4-r12,lr}
-
- sub $H,sp,#16*4+16
- adr $Ktbl,.Lsha256_blocks_arch
- sub $Ktbl,$Ktbl,#.Lsha256_blocks_arch-K256
- bic $H,$H,#15 @ align for 128-bit stores
- mov $t2,sp
- mov sp,$H @ alloca
- add $len,$inp,$len,lsl#6 @ len to point at the end of inp
-
- vld1.8 {@X[0]},[$inp]!
- vld1.8 {@X[1]},[$inp]!
- vld1.8 {@X[2]},[$inp]!
- vld1.8 {@X[3]},[$inp]!
- vld1.32 {$T0},[$Ktbl,:128]!
- vld1.32 {$T1},[$Ktbl,:128]!
- vld1.32 {$T2},[$Ktbl,:128]!
- vld1.32 {$T3},[$Ktbl,:128]!
- vrev32.8 @X[0],@X[0] @ yes, even on
- str $ctx,[sp,#64]
- vrev32.8 @X[1],@X[1] @ big-endian
- str $inp,[sp,#68]
- mov $Xfer,sp
- vrev32.8 @X[2],@X[2]
- str $len,[sp,#72]
- vrev32.8 @X[3],@X[3]
- str $t2,[sp,#76] @ save original sp
- vadd.i32 $T0,$T0,@X[0]
- vadd.i32 $T1,$T1,@X[1]
- vst1.32 {$T0},[$Xfer,:128]!
- vadd.i32 $T2,$T2,@X[2]
- vst1.32 {$T1},[$Xfer,:128]!
- vadd.i32 $T3,$T3,@X[3]
- vst1.32 {$T2},[$Xfer,:128]!
- vst1.32 {$T3},[$Xfer,:128]!
-
- ldmia $ctx,{$A-$H}
- sub $Xfer,$Xfer,#64
- ldr $t1,[sp,#0]
- eor $t2,$t2,$t2
- eor $t3,$B,$C
- b .L_00_48
-
-.align 4
-.L_00_48:
-___
- &Xupdate(\&body_00_15);
- &Xupdate(\&body_00_15);
- &Xupdate(\&body_00_15);
- &Xupdate(\&body_00_15);
-$code.=<<___;
- teq $t1,#0 @ check for K256 terminator
- ldr $t1,[sp,#0]
- sub $Xfer,$Xfer,#64
- bne .L_00_48
-
- ldr $inp,[sp,#68]
- ldr $t0,[sp,#72]
- sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
- teq $inp,$t0
- it eq
- subeq $inp,$inp,#64 @ avoid SEGV
- vld1.8 {@X[0]},[$inp]! @ load next input block
- vld1.8 {@X[1]},[$inp]!
- vld1.8 {@X[2]},[$inp]!
- vld1.8 {@X[3]},[$inp]!
- it ne
- strne $inp,[sp,#68]
- mov $Xfer,sp
-___
- &Xpreload(\&body_00_15);
- &Xpreload(\&body_00_15);
- &Xpreload(\&body_00_15);
- &Xpreload(\&body_00_15);
-$code.=<<___;
- ldr $t0,[$t1,#0]
- add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
- ldr $t2,[$t1,#4]
- ldr $t3,[$t1,#8]
- ldr $t4,[$t1,#12]
- add $A,$A,$t0 @ accumulate
- ldr $t0,[$t1,#16]
- add $B,$B,$t2
- ldr $t2,[$t1,#20]
- add $C,$C,$t3
- ldr $t3,[$t1,#24]
- add $D,$D,$t4
- ldr $t4,[$t1,#28]
- add $E,$E,$t0
- str $A,[$t1],#4
- add $F,$F,$t2
- str $B,[$t1],#4
- add $G,$G,$t3
- str $C,[$t1],#4
- add $H,$H,$t4
- str $D,[$t1],#4
- stmia $t1,{$E-$H}
-
- ittte ne
- movne $Xfer,sp
- ldrne $t1,[sp,#0]
- eorne $t2,$t2,$t2
- ldreq sp,[sp,#76] @ restore original sp
- itt ne
- eorne $t3,$B,$C
- bne .L_00_48
-
- ldmia sp!,{r4-r12,pc}
-.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
-#endif
-___
-}}}
-######################################################################
-# ARMv8 stuff
-#
-{{{
-my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
-my @MSG=map("q$_",(8..11));
-my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
-my $Ktbl="r3";
-
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-
-# ifdef __thumb2__
-# define INST(a,b,c,d) .byte c,d|0xc,a,b
-# else
-# define INST(a,b,c,d) .byte a,b,c,d
-# endif
-
-.type sha256_block_data_order_armv8,%function
-.align 5
-sha256_block_data_order_armv8:
-.LARMv8:
- vld1.32 {$ABCD,$EFGH},[$ctx]
-# ifdef __thumb2__
- adr $Ktbl,.LARMv8
- sub $Ktbl,$Ktbl,#.LARMv8-K256
-# else
- adrl $Ktbl,K256
-# endif
- add $len,$inp,$len,lsl#6 @ len to point at the end of inp
-
-.Loop_v8:
- vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
- vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
- vld1.32 {$W0},[$Ktbl]!
- vrev32.8 @MSG[0],@MSG[0]
- vrev32.8 @MSG[1],@MSG[1]
- vrev32.8 @MSG[2],@MSG[2]
- vrev32.8 @MSG[3],@MSG[3]
- vmov $ABCD_SAVE,$ABCD @ offload
- vmov $EFGH_SAVE,$EFGH
- teq $inp,$len
-___
-for($i=0;$i<12;$i++) {
-$code.=<<___;
- vld1.32 {$W1},[$Ktbl]!
- vadd.i32 $W0,$W0,@MSG[0]
- sha256su0 @MSG[0],@MSG[1]
- vmov $abcd,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
- sha256su1 @MSG[0],@MSG[2],@MSG[3]
-___
- ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
-}
-$code.=<<___;
- vld1.32 {$W1},[$Ktbl]!
- vadd.i32 $W0,$W0,@MSG[0]
- vmov $abcd,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
-
- vld1.32 {$W0},[$Ktbl]!
- vadd.i32 $W1,$W1,@MSG[1]
- vmov $abcd,$ABCD
- sha256h $ABCD,$EFGH,$W1
- sha256h2 $EFGH,$abcd,$W1
-
- vld1.32 {$W1},[$Ktbl]
- vadd.i32 $W0,$W0,@MSG[2]
- sub $Ktbl,$Ktbl,#256-16 @ rewind
- vmov $abcd,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
-
- vadd.i32 $W1,$W1,@MSG[3]
- vmov $abcd,$ABCD
- sha256h $ABCD,$EFGH,$W1
- sha256h2 $EFGH,$abcd,$W1
-
- vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
- vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
- it ne
- bne .Loop_v8
-
- vst1.32 {$ABCD,$EFGH},[$ctx]
-
- ret @ bx lr
-.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
-#endif
-___
-}}}
-$code.=<<___;
-.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
-.align 2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm OPENSSL_armcap_P,4,4
-#endif
-___
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/@/ and !/^$/);
- print;
-}
-close SELF;
-
-{ my %opcode = (
- "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
- "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
-
- sub unsha256 {
- my ($mnemonic,$arg)=@_;
-
- if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
- my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
- |(($2&7)<<17)|(($2&8)<<4)
- |(($3&7)<<1) |(($3&8)<<2);
- # since ARMv7 instructions are always encoded little-endian.
- # correct solution is to use .inst directive, but older
- # assemblers don't implement it:-(
- sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
- $word&0xff,($word>>8)&0xff,
- ($word>>16)&0xff,($word>>24)&0xff,
- $mnemonic,$arg;
- }
- }
-}
-
-foreach (split($/,$code)) {
-
- s/\`([^\`]*)\`/eval $1/geo;
-
- s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
-
- s/\bret\b/bx lr/go or
- s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
-
- print $_,"\n";
-}
-
-close STDOUT; # enforce flush
diff --git a/arch/arm/lib/crypto/sha256-ce.S b/arch/arm/lib/crypto/sha256-ce.S
deleted file mode 100644
index ac2c9b01b22d..000000000000
--- a/arch/arm/lib/crypto/sha256-ce.S
+++ /dev/null
@@ -1,123 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * sha256-ce.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2015 Linaro Ltd.
- * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .text
- .arch armv8-a
- .fpu crypto-neon-fp-armv8
-
- k0 .req q7
- k1 .req q8
- rk .req r3
-
- ta0 .req q9
- ta1 .req q10
- tb0 .req q10
- tb1 .req q9
-
- dga .req q11
- dgb .req q12
-
- dg0 .req q13
- dg1 .req q14
- dg2 .req q15
-
- .macro add_only, ev, s0
- vmov dg2, dg0
- .ifnb \s0
- vld1.32 {k\ev}, [rk, :128]!
- .endif
- sha256h.32 dg0, dg1, tb\ev
- sha256h2.32 dg1, dg2, tb\ev
- .ifnb \s0
- vadd.u32 ta\ev, q\s0, k\ev
- .endif
- .endm
-
- .macro add_update, ev, s0, s1, s2, s3
- sha256su0.32 q\s0, q\s1
- add_only \ev, \s1
- sha256su1.32 q\s0, q\s2, q\s3
- .endm
-
- .align 6
-.Lsha256_rcon:
- .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
- .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
- .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
- .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
- .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
- .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
- .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
- .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
- .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
- .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
- .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
- .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
- .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
- .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
- .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
- .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-
- /*
- * void sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
- * const u8 *data, size_t nblocks);
- */
-ENTRY(sha256_ce_transform)
- /* load state */
- vld1.32 {dga-dgb}, [r0]
-
- /* load input */
-0: vld1.32 {q0-q1}, [r1]!
- vld1.32 {q2-q3}, [r1]!
- subs r2, r2, #1
-
-#ifndef CONFIG_CPU_BIG_ENDIAN
- vrev32.8 q0, q0
- vrev32.8 q1, q1
- vrev32.8 q2, q2
- vrev32.8 q3, q3
-#endif
-
- /* load first round constant */
- adr rk, .Lsha256_rcon
- vld1.32 {k0}, [rk, :128]!
-
- vadd.u32 ta0, q0, k0
- vmov dg0, dga
- vmov dg1, dgb
-
- add_update 1, 0, 1, 2, 3
- add_update 0, 1, 2, 3, 0
- add_update 1, 2, 3, 0, 1
- add_update 0, 3, 0, 1, 2
- add_update 1, 0, 1, 2, 3
- add_update 0, 1, 2, 3, 0
- add_update 1, 2, 3, 0, 1
- add_update 0, 3, 0, 1, 2
- add_update 1, 0, 1, 2, 3
- add_update 0, 1, 2, 3, 0
- add_update 1, 2, 3, 0, 1
- add_update 0, 3, 0, 1, 2
-
- add_only 1, 1
- add_only 0, 2
- add_only 1, 3
- add_only 0
-
- /* update state */
- vadd.u32 dga, dga, dg0
- vadd.u32 dgb, dgb, dg1
- bne 0b
-
- /* store new state */
- vst1.32 {dga-dgb}, [r0]
- bx lr
-ENDPROC(sha256_ce_transform)
diff --git a/arch/arm/lib/crypto/sha256.c b/arch/arm/lib/crypto/sha256.c
deleted file mode 100644
index 109192e54b0f..000000000000
--- a/arch/arm/lib/crypto/sha256.c
+++ /dev/null
@@ -1,64 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 optimized for ARM
- *
- * Copyright 2025 Google LLC
- */
-#include <asm/neon.h>
-#include <crypto/internal/sha2.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-asmlinkage void sha256_block_data_order_neon(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-asmlinkage void sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
-
-void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks)
-{
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
- static_branch_likely(&have_neon)) {
- kernel_neon_begin();
- if (static_branch_likely(&have_ce))
- sha256_ce_transform(state, data, nblocks);
- else
- sha256_block_data_order_neon(state, data, nblocks);
- kernel_neon_end();
- } else {
- sha256_blocks_arch(state, data, nblocks);
- }
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_simd);
-
-bool sha256_is_arch_optimized(void)
-{
- /* We always can use at least the ARM scalar implementation. */
- return true;
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
-static int __init sha256_arm_mod_init(void)
-{
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
- static_branch_enable(&have_neon);
- if (elf_hwcap2 & HWCAP2_SHA2)
- static_branch_enable(&have_ce);
- }
- return 0;
-}
-subsys_initcall(sha256_arm_mod_init);
-
-static void __exit sha256_arm_mod_exit(void)
-{
-}
-module_exit(sha256_arm_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-256 optimized for ARM");
diff --git a/arch/arm/mach-omap1/board-ams-delta.c b/arch/arm/mach-omap1/board-ams-delta.c
index 0daf6c5b5c1c..16392720296c 100644
--- a/arch/arm/mach-omap1/board-ams-delta.c
+++ b/arch/arm/mach-omap1/board-ams-delta.c
@@ -19,6 +19,7 @@
#include <linux/mtd/nand-gpio.h>
#include <linux/mtd/partitions.h>
#include <linux/platform_device.h>
+#include <linux/property.h>
#include <linux/regulator/consumer.h>
#include <linux/regulator/fixed.h>
#include <linux/regulator/machine.h>
@@ -175,20 +176,18 @@ static struct resource latch1_resources[] = {
#define LATCH1_LABEL "latch1"
-static struct bgpio_pdata latch1_pdata = {
- .label = LATCH1_LABEL,
- .base = -1,
- .ngpio = LATCH1_NGPIO,
+static const struct property_entry latch1_gpio_props[] = {
+ PROPERTY_ENTRY_STRING("label", LATCH1_LABEL),
+ PROPERTY_ENTRY_U32("ngpios", LATCH1_NGPIO),
+ { }
};
-static struct platform_device latch1_gpio_device = {
+static const struct platform_device_info latch1_gpio_devinfo = {
.name = "basic-mmio-gpio",
.id = 0,
- .resource = latch1_resources,
- .num_resources = ARRAY_SIZE(latch1_resources),
- .dev = {
- .platform_data = &latch1_pdata,
- },
+ .res = latch1_resources,
+ .num_res = ARRAY_SIZE(latch1_resources),
+ .properties = latch1_gpio_props,
};
#define LATCH1_PIN_LED_CAMERA 0
@@ -213,20 +212,18 @@ static struct resource latch2_resources[] = {
#define LATCH2_LABEL "latch2"
-static struct bgpio_pdata latch2_pdata = {
- .label = LATCH2_LABEL,
- .base = -1,
- .ngpio = LATCH2_NGPIO,
+static const struct property_entry latch2_gpio_props[] = {
+ PROPERTY_ENTRY_STRING("label", LATCH2_LABEL),
+ PROPERTY_ENTRY_U32("ngpios", LATCH2_NGPIO),
+ { }
};
-static struct platform_device latch2_gpio_device = {
+static struct platform_device_info latch2_gpio_devinfo = {
.name = "basic-mmio-gpio",
.id = 1,
- .resource = latch2_resources,
- .num_resources = ARRAY_SIZE(latch2_resources),
- .dev = {
- .platform_data = &latch2_pdata,
- },
+ .res = latch2_resources,
+ .num_res = ARRAY_SIZE(latch2_resources),
+ .properties = latch2_gpio_props,
};
#define LATCH2_PIN_LCD_VBLEN 0
@@ -542,8 +539,6 @@ static struct gpiod_lookup_table keybrd_pwr_gpio_table = {
};
static struct platform_device *ams_delta_devices[] __initdata = {
- &latch1_gpio_device,
- &latch2_gpio_device,
&ams_delta_kp_device,
&ams_delta_audio_device,
&ams_delta_serio_device,
@@ -697,6 +692,9 @@ static void __init ams_delta_init(void)
omap1_usb_init(&ams_delta_usb_config);
platform_add_devices(ams_delta_devices, ARRAY_SIZE(ams_delta_devices));
+ platform_device_register_full(&latch1_gpio_devinfo);
+ platform_device_register_full(&latch2_gpio_devinfo);
+
/*
* As soon as regulator consumers have been registered, assign their
* dev_names to consumer supply entries of respective regulators.
diff --git a/arch/arm/mach-s3c/mach-crag6410.c b/arch/arm/mach-s3c/mach-crag6410.c
index e5df2cb51ab2..028169c7debf 100644
--- a/arch/arm/mach-s3c/mach-crag6410.c
+++ b/arch/arm/mach-s3c/mach-crag6410.c
@@ -252,14 +252,17 @@ static struct resource crag6410_mmgpio_resource[] = {
[0] = DEFINE_RES_MEM_NAMED(S3C64XX_PA_XM0CSN4, 1, "dat"),
};
-static struct platform_device crag6410_mmgpio = {
+static const struct property_entry crag6410_mmgpio_props[] = {
+ PROPERTY_ENTRY_U32("gpio-mmio,base", MMGPIO_GPIO_BASE),
+ { }
+};
+
+static struct platform_device_info crag6410_mmgpio_devinfo = {
.name = "basic-mmio-gpio",
.id = -1,
- .resource = crag6410_mmgpio_resource,
- .num_resources = ARRAY_SIZE(crag6410_mmgpio_resource),
- .dev.platform_data = &(struct bgpio_pdata) {
- .base = MMGPIO_GPIO_BASE,
- },
+ .res = crag6410_mmgpio_resource,
+ .num_res = ARRAY_SIZE(crag6410_mmgpio_resource),
+ .properties = crag6410_mmgpio_props,
};
static struct platform_device speyside_device = {
@@ -373,7 +376,6 @@ static struct platform_device *crag6410_devices[] __initdata = {
&crag6410_gpio_keydev,
&crag6410_dm9k_device,
&s3c64xx_device_spi0,
- &crag6410_mmgpio,
&crag6410_lcd_powerdev,
&crag6410_backlight_device,
&speyside_device,
@@ -871,6 +873,7 @@ static void __init crag6410_machine_init(void)
pwm_add_table(crag6410_pwm_lookup, ARRAY_SIZE(crag6410_pwm_lookup));
platform_add_devices(crag6410_devices, ARRAY_SIZE(crag6410_devices));
+ platform_device_register_full(&crag6410_mmgpio_devinfo);
gpio_led_register_device(-1, &gpio_leds_pdata);
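
Note (editorial, not part of the patch): in both board files the bgpio_pdata platform data is replaced by software-node properties attached through platform_device_info, so the basic-mmio-gpio driver now discovers "label", "ngpios" and "gpio-mmio,base" via the device property API. A rough consumer-side sketch of that mechanism is shown below; the function and default values are illustrative and this is not the actual gpio-mmio parsing code.

#include <linux/device.h>
#include <linux/property.h>

/* Illustrative: reading back properties registered through
 * platform_device_register_full(); error handling is abbreviated. */
static int demo_read_mmio_gpio_props(struct device *dev)
{
	const char *label;
	u32 ngpios, base;

	if (device_property_read_string(dev, "label", &label))
		label = dev_name(dev);			/* property absent */
	if (device_property_read_u32(dev, "ngpios", &ngpios))
		return -EINVAL;
	if (device_property_read_u32(dev, "gpio-mmio,base", &base))
		base = 0;				/* property absent */

	dev_info(dev, "%s: %u lines, base %u\n", label, ngpios, base);
	return 0;
}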
diff --git a/arch/arm/mach-sa1100/assabet.c b/arch/arm/mach-sa1100/assabet.c
index 2b833aa0212b..bad8aa661e9d 100644
--- a/arch/arm/mach-sa1100/assabet.c
+++ b/arch/arm/mach-sa1100/assabet.c
@@ -80,7 +80,7 @@ void ASSABET_BCR_frob(unsigned int mask, unsigned int val)
{
unsigned long m = mask, v = val;
- assabet_bcr_gc->set_multiple(assabet_bcr_gc, &m, &v);
+ assabet_bcr_gc->set_multiple_rv(assabet_bcr_gc, &m, &v);
}
EXPORT_SYMBOL(ASSABET_BCR_frob);
diff --git a/arch/arm/mach-sa1100/neponset.c b/arch/arm/mach-sa1100/neponset.c
index 88fe79f0a4ed..6516598c8a71 100644
--- a/arch/arm/mach-sa1100/neponset.c
+++ b/arch/arm/mach-sa1100/neponset.c
@@ -126,7 +126,7 @@ void neponset_ncr_frob(unsigned int mask, unsigned int val)
unsigned long m = mask, v = val;
if (nep)
- n->gpio[0]->set_multiple(n->gpio[0], &m, &v);
+ n->gpio[0]->set_multiple_rv(n->gpio[0], &m, &v);
else
WARN(1, "nep unset\n");
}
diff --git a/arch/arm/mm/cache-feroceon-l2.c b/arch/arm/mm/cache-feroceon-l2.c
index 25dbd84a1aaf..2bfefb252ffd 100644
--- a/arch/arm/mm/cache-feroceon-l2.c
+++ b/arch/arm/mm/cache-feroceon-l2.c
@@ -295,7 +295,7 @@ static inline u32 read_extra_features(void)
return u;
}
-static inline void write_extra_features(u32 u)
+static inline void __init write_extra_features(u32 u)
{
__asm__("mcr p15, 1, %0, c15, c1, 0" : : "r" (u));
}
diff --git a/arch/arm/mm/cache-tauros2.c b/arch/arm/mm/cache-tauros2.c
index b1e1aba602f7..bfe166ccace0 100644
--- a/arch/arm/mm/cache-tauros2.c
+++ b/arch/arm/mm/cache-tauros2.c
@@ -177,7 +177,7 @@ static inline void __init write_actlr(u32 actlr)
__asm__("mcr p15, 0, %0, c1, c0, 1\n" : : "r" (actlr));
}
-static void enable_extra_feature(unsigned int features)
+static void __init enable_extra_feature(unsigned int features)
{
u32 u;
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 27c1d5ebcd91..b07e699aaa3c 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -482,3 +482,5 @@
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr
diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile
index cb044bfd145d..cf8cd39ab804 100644
--- a/arch/arm/vdso/Makefile
+++ b/arch/arm/vdso/Makefile
@@ -26,7 +26,7 @@ CPPFLAGS_vdso.lds += -P -C -U$(ARCH)
CFLAGS_REMOVE_vdso.o = -pg
# Force -O2 to avoid libgcc dependencies
-CFLAGS_REMOVE_vgettimeofday.o = -pg -Os $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS)
+CFLAGS_REMOVE_vgettimeofday.o = -pg -Os $(RANDSTRUCT_CFLAGS) $(KSTACK_ERASE_CFLAGS) $(GCC_PLUGINS_CFLAGS)
ifeq ($(c-gettimeofday-y),)
CFLAGS_vgettimeofday.o = -O2
else