Diffstat (limited to 'lib/crypto')
149 files changed, 33316 insertions, 257 deletions
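The help texts added to lib/crypto/Kconfig in this patch describe the intended usage model: a module that calls the SHA-1 or SHA-2 library functions from <crypto/sha1.h> or <crypto/sha2.h> selects the corresponding CRYPTO_LIB_* option, and the new CRYPTO_LIB_*_ARCH bools then pull in the arch-optimized code by default on supported architectures. As a rough, illustrative sketch only (EXAMPLE_FEATURE is a hypothetical symbol, not part of this commit), a consumer's Kconfig entry would look something like:

# Hypothetical consumer entry -- not part of this patch
config EXAMPLE_FEATURE
	tristate "Example feature using the SHA-2 library"
	select CRYPTO_LIB_SHA256
	select CRYPTO_LIB_SHA512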
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 1ec1466108cc..c2b65b6a9bb6 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -2,6 +2,9 @@
 
 menu "Crypto library routines"
 
+config CRYPTO_HASH_INFO
+	bool
+
 config CRYPTO_LIB_UTILS
 	tristate
 
@@ -136,6 +139,20 @@ config CRYPTO_LIB_CHACHA20POLY1305
 
 config CRYPTO_LIB_SHA1
 	tristate
+	help
+	  The SHA-1 library functions. Select this if your module uses any of
+	  the functions from <crypto/sha1.h>.
+
+config CRYPTO_LIB_SHA1_ARCH
+	bool
+	depends on CRYPTO_LIB_SHA1 && !UML
+	default y if ARM
+	default y if ARM64 && KERNEL_MODE_NEON
+	default y if MIPS && CPU_CAVIUM_OCTEON
+	default y if PPC
+	default y if S390
+	default y if SPARC64
+	default y if X86_64
 
 config CRYPTO_LIB_SHA256
 	tristate
@@ -144,56 +161,62 @@ config CRYPTO_LIB_SHA256
 	  by either the generic implementation or an arch-specific one, if one
 	  is available and enabled.
 
-config CRYPTO_ARCH_HAVE_LIB_SHA256
+config CRYPTO_LIB_SHA256_ARCH
 	bool
+	depends on CRYPTO_LIB_SHA256 && !UML
+	default y if ARM && !CPU_V7M
+	default y if ARM64
+	default y if MIPS && CPU_CAVIUM_OCTEON
+	default y if PPC && SPE
+	default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+	default y if S390
+	default y if SPARC64
+	default y if X86_64
+
+config CRYPTO_LIB_SHA512
+	tristate
 	help
-	  Declares whether the architecture provides an arch-specific
-	  accelerated implementation of the SHA-256 library interface.
+	  The SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 library functions.
+	  Select this if your module uses any of these functions from
+	  <crypto/sha2.h>.
 
-config CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
+config CRYPTO_LIB_SHA512_ARCH
 	bool
-	help
-	  Declares whether the architecture provides an arch-specific
-	  accelerated implementation of the SHA-256 library interface
-	  that is SIMD-based and therefore not usable in hardirq
-	  context.
-
-config CRYPTO_LIB_SHA256_GENERIC
-	tristate
-	default CRYPTO_LIB_SHA256 if !CRYPTO_ARCH_HAVE_LIB_SHA256
-	help
-	  This symbol can be selected by arch implementations of the SHA-256
-	  library interface that require the generic code as a fallback, e.g.,
-	  for SIMD implementations. If no arch specific implementation is
-	  enabled, this implementation serves the users of CRYPTO_LIB_SHA256.
+	depends on CRYPTO_LIB_SHA512 && !UML
+	default y if ARM && !CPU_V7M
+	default y if ARM64
+	default y if MIPS && CPU_CAVIUM_OCTEON
+	default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+	default y if S390
+	default y if SPARC64
+	default y if X86_64
 
 config CRYPTO_LIB_SM3
 	tristate
 
+source "lib/crypto/tests/Kconfig"
+
 if !KMSAN # avoid false positives from assembly
 if ARM
-source "arch/arm/lib/crypto/Kconfig"
+source "lib/crypto/arm/Kconfig"
 endif
 if ARM64
-source "arch/arm64/lib/crypto/Kconfig"
+source "lib/crypto/arm64/Kconfig"
 endif
 if MIPS
-source "arch/mips/lib/crypto/Kconfig"
+source "lib/crypto/mips/Kconfig"
 endif
 if PPC
-source "arch/powerpc/lib/crypto/Kconfig"
+source "lib/crypto/powerpc/Kconfig"
 endif
 if RISCV
-source "arch/riscv/lib/crypto/Kconfig"
+source "lib/crypto/riscv/Kconfig"
 endif
 if S390
-source "arch/s390/lib/crypto/Kconfig"
-endif
-if SPARC
-source "arch/sparc/lib/crypto/Kconfig"
+source "lib/crypto/s390/Kconfig"
 endif
 if X86
-source "arch/x86/lib/crypto/Kconfig"
+source "lib/crypto/x86/Kconfig"
 endif
 endif
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index b0c0f8aea269..e4151be2ebd4 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -1,5 +1,17 @@
 # SPDX-License-Identifier: GPL-2.0
 
+aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1
+
+quiet_cmd_perlasm = PERLASM $@
+      cmd_perlasm = $(PERL) $(<) > $(@)
+
+quiet_cmd_perlasm_with_args = PERLASM $@
+      cmd_perlasm_with_args = $(PERL) $(<) void $(@)
+
+obj-$(CONFIG_KUNIT) += tests/
+
+obj-$(CONFIG_CRYPTO_HASH_INFO) += hash_info.o
+
 obj-$(CONFIG_CRYPTO_LIB_UTILS) += libcryptoutils.o
 libcryptoutils-y := memneq.o utils.o
 
@@ -55,14 +67,91 @@ libpoly1305-generic-y := poly1305-donna32.o
 libpoly1305-generic-$(CONFIG_ARCH_SUPPORTS_INT128) := poly1305-donna64.o
 libpoly1305-generic-y += poly1305-generic.o
 
-obj-$(CONFIG_CRYPTO_LIB_SHA1) += libsha1.o
-libsha1-y := sha1.o
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_SHA1) += libsha1.o
+libsha1-y := sha1.o
+ifeq ($(CONFIG_CRYPTO_LIB_SHA1_ARCH),y)
+CFLAGS_sha1.o += -I$(src)/$(SRCARCH)
+ifeq ($(CONFIG_ARM),y)
+libsha1-y += arm/sha1-armv4-large.o
+libsha1-$(CONFIG_KERNEL_MODE_NEON) += arm/sha1-armv7-neon.o \
+				      arm/sha1-ce-core.o
+endif
+libsha1-$(CONFIG_ARM64) += arm64/sha1-ce-core.o
+ifeq ($(CONFIG_PPC),y)
+libsha1-y += powerpc/sha1-powerpc-asm.o
+libsha1-$(CONFIG_SPE) += powerpc/sha1-spe-asm.o
+endif
+libsha1-$(CONFIG_SPARC) += sparc/sha1_asm.o
+libsha1-$(CONFIG_X86) += x86/sha1-ssse3-and-avx.o \
+			 x86/sha1-avx2-asm.o \
+			 x86/sha1-ni-asm.o
+endif # CONFIG_CRYPTO_LIB_SHA1_ARCH
+
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_SHA256) += libsha256.o
+libsha256-y := sha256.o
+ifeq ($(CONFIG_CRYPTO_LIB_SHA256_ARCH),y)
+CFLAGS_sha256.o += -I$(src)/$(SRCARCH)
+
+ifeq ($(CONFIG_ARM),y)
+libsha256-y += arm/sha256-ce.o arm/sha256-core.o
+$(obj)/arm/sha256-core.S: $(src)/arm/sha256-armv4.pl
+	$(call cmd,perlasm)
+clean-files += arm/sha256-core.S
+AFLAGS_arm/sha256-core.o += $(aflags-thumb2-y)
+endif
+
+ifeq ($(CONFIG_ARM64),y)
+libsha256-y += arm64/sha256-core.o
+$(obj)/arm64/sha256-core.S: $(src)/arm64/sha2-armv8.pl
+	$(call cmd,perlasm_with_args)
+clean-files += arm64/sha256-core.S
+libsha256-$(CONFIG_KERNEL_MODE_NEON) += arm64/sha256-ce.o
+endif
+
+libsha256-$(CONFIG_PPC) += powerpc/sha256-spe-asm.o
+libsha256-$(CONFIG_RISCV) += riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.o
+libsha256-$(CONFIG_SPARC) += sparc/sha256_asm.o
+libsha256-$(CONFIG_X86) += x86/sha256-ssse3-asm.o \
+			   x86/sha256-avx-asm.o \
+			   x86/sha256-avx2-asm.o \
+			   x86/sha256-ni-asm.o
+endif # CONFIG_CRYPTO_LIB_SHA256_ARCH
+
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_SHA512) += libsha512.o
+libsha512-y := sha512.o
+ifeq ($(CONFIG_CRYPTO_LIB_SHA512_ARCH),y)
+CFLAGS_sha512.o += -I$(src)/$(SRCARCH)
+
+ifeq ($(CONFIG_ARM),y)
+libsha512-y += arm/sha512-core.o
+$(obj)/arm/sha512-core.S: $(src)/arm/sha512-armv4.pl
+	$(call cmd,perlasm)
+clean-files += arm/sha512-core.S
+AFLAGS_arm/sha512-core.o += $(aflags-thumb2-y)
+endif
+
+ifeq ($(CONFIG_ARM64),y)
+libsha512-y += arm64/sha512-core.o
+$(obj)/arm64/sha512-core.S: $(src)/arm64/sha2-armv8.pl
+	$(call cmd,perlasm_with_args)
+clean-files += arm64/sha512-core.S
+libsha512-$(CONFIG_KERNEL_MODE_NEON) += arm64/sha512-ce-core.o
+endif
+
-obj-$(CONFIG_CRYPTO_LIB_SHA256) += libsha256.o
-libsha256-y := sha256.o
+libsha512-$(CONFIG_RISCV) += riscv/sha512-riscv64-zvknhb-zvkb.o
+libsha512-$(CONFIG_SPARC) += sparc/sha512_asm.o
+libsha512-$(CONFIG_X86) += x86/sha512-ssse3-asm.o \
+			   x86/sha512-avx-asm.o \
+			   x86/sha512-avx2-asm.o
+endif # CONFIG_CRYPTO_LIB_SHA512_ARCH
 
-obj-$(CONFIG_CRYPTO_LIB_SHA256_GENERIC) += libsha256-generic.o
-libsha256-generic-y := sha256-generic.o
+################################################################################
 
 obj-$(CONFIG_MPILIB) += mpi/
 
@@ -70,3 +159,11 @@ obj-$(CONFIG_CRYPTO_SELFTESTS_FULL) += simd.o
 
 obj-$(CONFIG_CRYPTO_LIB_SM3) += libsm3.o
 libsm3-y := sm3.o
+
+obj-$(CONFIG_ARM) += arm/
+obj-$(CONFIG_ARM64) += arm64/
+obj-$(CONFIG_MIPS) += mips/
+obj-$(CONFIG_PPC) += powerpc/
+obj-$(CONFIG_RISCV) += riscv/
+obj-$(CONFIG_S390) += s390/
+obj-$(CONFIG_X86) += x86/
diff --git a/lib/crypto/aes.c b/lib/crypto/aes.c
index eafe14d021f5..b57fda3460f1 100644
--- a/lib/crypto/aes.c
+++ b/lib/crypto/aes.c
@@ -5,6 +5,7 @@
 
 #include <crypto/aes.h>
 #include <linux/crypto.h>
+#include <linux/export.h>
 #include <linux/module.h>
 #include <linux/unaligned.h>
 
diff --git a/lib/crypto/aescfb.c b/lib/crypto/aescfb.c
index 2f09ae92ffa0..0f294c8cbf3c 100644
--- a/lib/crypto/aescfb.c
+++ b/lib/crypto/aescfb.c
@@ -5,11 +5,10 @@
  * Copyright 2023 Google LLC
  */
 
-#include <linux/module.h>
-
-#include <crypto/algapi.h>
 #include <crypto/aes.h>
-
+#include <crypto/algapi.h>
+#include <linux/export.h>
+#include <linux/module.h>
 #include <asm/irqflags.h>
 
 static void aescfb_encrypt_block(const struct crypto_aes_ctx *ctx, void *dst,
diff --git a/lib/crypto/aesgcm.c b/lib/crypto/aesgcm.c
index faa4dee9bb1b..ac0b2fcfd606 100644
--- a/lib/crypto/aesgcm.c
+++ b/lib/crypto/aesgcm.c
@@ -5,12 +5,11 @@
  * Copyright 2022 Google LLC
  */
 
-#include <linux/module.h>
-
 #include <crypto/algapi.h>
 #include <crypto/gcm.h>
 #include <crypto/ghash.h>
-
+#include <linux/export.h>
+#include <linux/module.h>
 #include <asm/irqflags.h>
 
 static void aesgcm_encrypt_block(const struct crypto_aes_ctx *ctx, void *dst,
diff --git a/lib/crypto/arc4.c b/lib/crypto/arc4.c
index 838812d18216..4e950e1e66d0 100644
--- a/lib/crypto/arc4.c
+++ b/lib/crypto/arc4.c
@@ -8,6 +8,7 @@
  */
 
 #include <crypto/arc4.h>
+#include <linux/export.h>
 #include <linux/module.h>
 
 int arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len)
diff --git a/lib/crypto/arm/.gitignore b/lib/crypto/arm/.gitignore
new file mode 100644
index 000000000000..f6c4e8ef80da
--- /dev/null
+++ b/lib/crypto/arm/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+poly1305-core.S
+sha256-core.S +sha512-core.S diff --git a/lib/crypto/arm/Kconfig b/lib/crypto/arm/Kconfig new file mode 100644 index 000000000000..e8444fd0aae3 --- /dev/null +++ b/lib/crypto/arm/Kconfig @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_BLAKE2S_ARM + bool "Hash functions: BLAKE2s" + select CRYPTO_ARCH_HAVE_LIB_BLAKE2S + help + BLAKE2s cryptographic hash function (RFC 7693) + + Architecture: arm + + This is faster than the generic implementations of BLAKE2s and + BLAKE2b, but slower than the NEON implementation of BLAKE2b. + There is no NEON implementation of BLAKE2s, since NEON doesn't + really help with it. + +config CRYPTO_CHACHA20_NEON + tristate + default CRYPTO_LIB_CHACHA + select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_POLY1305_ARM + tristate + default CRYPTO_LIB_POLY1305 + select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/arm/Makefile b/lib/crypto/arm/Makefile new file mode 100644 index 000000000000..4c042a4c77ed --- /dev/null +++ b/lib/crypto/arm/Makefile @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o +libblake2s-arm-y := blake2s-core.o blake2s-glue.o + +obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o +chacha-neon-y := chacha-scalar-core.o chacha-glue.o +chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o + +obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o +poly1305-arm-y := poly1305-core.o poly1305-glue.o + +quiet_cmd_perl = PERL $@ + cmd_perl = $(PERL) $(<) > $(@) + +$(obj)/%-core.S: $(src)/%-armv4.pl + $(call cmd,perl) + +clean-files += poly1305-core.S + +aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1 + +# massage the perlasm code a bit so we only get the NEON routine if we need it +poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5 +poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7 +AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y) diff --git a/lib/crypto/arm/blake2s-core.S b/lib/crypto/arm/blake2s-core.S new file mode 100644 index 000000000000..df40e46601f1 --- /dev/null +++ b/lib/crypto/arm/blake2s-core.S @@ -0,0 +1,306 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * BLAKE2s digest algorithm, ARM scalar implementation + * + * Copyright 2020 Google LLC + * + * Author: Eric Biggers <ebiggers@google.com> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + // Registers used to hold message words temporarily. There aren't + // enough ARM registers to hold the whole message block, so we have to + // load the words on-demand. 
+ M_0 .req r12 + M_1 .req r14 + +// The BLAKE2s initialization vector +.Lblake2s_IV: + .word 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A + .word 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 + +.macro __ldrd a, b, src, offset +#if __LINUX_ARM_ARCH__ >= 6 + ldrd \a, \b, [\src, #\offset] +#else + ldr \a, [\src, #\offset] + ldr \b, [\src, #\offset + 4] +#endif +.endm + +.macro __strd a, b, dst, offset +#if __LINUX_ARM_ARCH__ >= 6 + strd \a, \b, [\dst, #\offset] +#else + str \a, [\dst, #\offset] + str \b, [\dst, #\offset + 4] +#endif +.endm + +.macro _le32_bswap a, tmp +#ifdef __ARMEB__ + rev_l \a, \tmp +#endif +.endm + +.macro _le32_bswap_8x a, b, c, d, e, f, g, h, tmp + _le32_bswap \a, \tmp + _le32_bswap \b, \tmp + _le32_bswap \c, \tmp + _le32_bswap \d, \tmp + _le32_bswap \e, \tmp + _le32_bswap \f, \tmp + _le32_bswap \g, \tmp + _le32_bswap \h, \tmp +.endm + +// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals. +// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two +// columns/diagonals. s0-s1 are the word offsets to the message words the first +// column/diagonal needs, and likewise s2-s3 for the second column/diagonal. +// M_0 and M_1 are free to use, and the message block can be found at sp + 32. +// +// Note that to save instructions, the rotations don't happen when the +// pseudocode says they should, but rather they are delayed until the values are +// used. See the comment above _blake2s_round(). +.macro _blake2s_quarterround a0, b0, c0, d0, a1, b1, c1, d1, s0, s1, s2, s3 + + ldr M_0, [sp, #32 + 4 * \s0] + ldr M_1, [sp, #32 + 4 * \s2] + + // a += b + m[blake2s_sigma[r][2*i + 0]]; + add \a0, \a0, \b0, ror #brot + add \a1, \a1, \b1, ror #brot + add \a0, \a0, M_0 + add \a1, \a1, M_1 + + // d = ror32(d ^ a, 16); + eor \d0, \a0, \d0, ror #drot + eor \d1, \a1, \d1, ror #drot + + // c += d; + add \c0, \c0, \d0, ror #16 + add \c1, \c1, \d1, ror #16 + + // b = ror32(b ^ c, 12); + eor \b0, \c0, \b0, ror #brot + eor \b1, \c1, \b1, ror #brot + + ldr M_0, [sp, #32 + 4 * \s1] + ldr M_1, [sp, #32 + 4 * \s3] + + // a += b + m[blake2s_sigma[r][2*i + 1]]; + add \a0, \a0, \b0, ror #12 + add \a1, \a1, \b1, ror #12 + add \a0, \a0, M_0 + add \a1, \a1, M_1 + + // d = ror32(d ^ a, 8); + eor \d0, \a0, \d0, ror#16 + eor \d1, \a1, \d1, ror#16 + + // c += d; + add \c0, \c0, \d0, ror#8 + add \c1, \c1, \d1, ror#8 + + // b = ror32(b ^ c, 7); + eor \b0, \c0, \b0, ror#12 + eor \b1, \c1, \b1, ror#12 +.endm + +// Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9] +// are in r0..r9. The stack pointer points to 8 bytes of scratch space for +// spilling v[8..9], then to v[9..15], then to the message block. r10-r12 and +// r14 are free to use. The macro arguments s0-s15 give the order in which the +// message words are used in this round. +// +// All rotates are performed using the implicit rotate operand accepted by the +// 'add' and 'eor' instructions. This is faster than using explicit rotate +// instructions. To make this work, we allow the values in the second and last +// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the +// wrong rotation amount. The rotation amount is then fixed up just in time +// when the values are used. 'brot' is the number of bits the values in row 'b' +// need to be rotated right to arrive at the correct values, and 'drot' +// similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such +// that they end up as (7, 8) after every round. 
+.macro _blake2s_round s0, s1, s2, s3, s4, s5, s6, s7, \ + s8, s9, s10, s11, s12, s13, s14, s15 + + // Mix first two columns: + // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]). + __ldrd r10, r11, sp, 16 // load v[12] and v[13] + _blake2s_quarterround r0, r4, r8, r10, r1, r5, r9, r11, \ + \s0, \s1, \s2, \s3 + __strd r8, r9, sp, 0 + __strd r10, r11, sp, 16 + + // Mix second two columns: + // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]). + __ldrd r8, r9, sp, 8 // load v[10] and v[11] + __ldrd r10, r11, sp, 24 // load v[14] and v[15] + _blake2s_quarterround r2, r6, r8, r10, r3, r7, r9, r11, \ + \s4, \s5, \s6, \s7 + str r10, [sp, #24] // store v[14] + // v[10], v[11], and v[15] are used below, so no need to store them yet. + + .set brot, 7 + .set drot, 8 + + // Mix first two diagonals: + // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]). + ldr r10, [sp, #16] // load v[12] + _blake2s_quarterround r0, r5, r8, r11, r1, r6, r9, r10, \ + \s8, \s9, \s10, \s11 + __strd r8, r9, sp, 8 + str r11, [sp, #28] + str r10, [sp, #16] + + // Mix second two diagonals: + // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]). + __ldrd r8, r9, sp, 0 // load v[8] and v[9] + __ldrd r10, r11, sp, 20 // load v[13] and v[14] + _blake2s_quarterround r2, r7, r8, r10, r3, r4, r9, r11, \ + \s12, \s13, \s14, \s15 + __strd r10, r11, sp, 20 +.endm + +// +// void blake2s_compress(struct blake2s_state *state, +// const u8 *block, size_t nblocks, u32 inc); +// +// Only the first three fields of struct blake2s_state are used: +// u32 h[8]; (inout) +// u32 t[2]; (inout) +// u32 f[2]; (in) +// + .align 5 +ENTRY(blake2s_compress) + push {r0-r2,r4-r11,lr} // keep this an even number + +.Lnext_block: + // r0 is 'state' + // r1 is 'block' + // r3 is 'inc' + + // Load and increment the counter t[0..1]. + __ldrd r10, r11, r0, 32 + adds r10, r10, r3 + adc r11, r11, #0 + __strd r10, r11, r0, 32 + + // _blake2s_round is very short on registers, so copy the message block + // to the stack to save a register during the rounds. This also has the + // advantage that misalignment only needs to be dealt with in one place. + sub sp, sp, #64 + mov r12, sp + tst r1, #3 + bne .Lcopy_block_misaligned + ldmia r1!, {r2-r9} + _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14 + stmia r12!, {r2-r9} + ldmia r1!, {r2-r9} + _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14 + stmia r12, {r2-r9} +.Lcopy_block_done: + str r1, [sp, #68] // Update message pointer + + // Calculate v[8..15]. Push v[9..15] onto the stack, and leave space + // for spilling v[8..9]. Leave v[8..9] in r8-r9. + mov r14, r0 // r14 = state + adr r12, .Lblake2s_IV + ldmia r12!, {r8-r9} // load IV[0..1] + __ldrd r0, r1, r14, 40 // load f[0..1] + ldm r12, {r2-r7} // load IV[3..7] + eor r4, r4, r10 // v[12] = IV[4] ^ t[0] + eor r5, r5, r11 // v[13] = IV[5] ^ t[1] + eor r6, r6, r0 // v[14] = IV[6] ^ f[0] + eor r7, r7, r1 // v[15] = IV[7] ^ f[1] + push {r2-r7} // push v[9..15] + sub sp, sp, #8 // leave space for v[8..9] + + // Load h[0..7] == v[0..7]. + ldm r14, {r0-r7} + + // Execute the rounds. Each round is provided the order in which it + // needs to use the message words. 
+ .set brot, 0 + .set drot, 0 + _blake2s_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + _blake2s_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 + _blake2s_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 + _blake2s_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 + _blake2s_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 + _blake2s_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 + _blake2s_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 + _blake2s_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 + _blake2s_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 + _blake2s_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 + + // Fold the final state matrix into the hash chaining value: + // + // for (i = 0; i < 8; i++) + // h[i] ^= v[i] ^ v[i + 8]; + // + ldr r14, [sp, #96] // r14 = &h[0] + add sp, sp, #8 // v[8..9] are already loaded. + pop {r10-r11} // load v[10..11] + eor r0, r0, r8 + eor r1, r1, r9 + eor r2, r2, r10 + eor r3, r3, r11 + ldm r14, {r8-r11} // load h[0..3] + eor r0, r0, r8 + eor r1, r1, r9 + eor r2, r2, r10 + eor r3, r3, r11 + stmia r14!, {r0-r3} // store new h[0..3] + ldm r14, {r0-r3} // load old h[4..7] + pop {r8-r11} // load v[12..15] + eor r0, r0, r4, ror #brot + eor r1, r1, r5, ror #brot + eor r2, r2, r6, ror #brot + eor r3, r3, r7, ror #brot + eor r0, r0, r8, ror #drot + eor r1, r1, r9, ror #drot + eor r2, r2, r10, ror #drot + eor r3, r3, r11, ror #drot + add sp, sp, #64 // skip copy of message block + stm r14, {r0-r3} // store new h[4..7] + + // Advance to the next block, if there is one. Note that if there are + // multiple blocks, then 'inc' (the counter increment amount) must be + // 64. So we can simply set it to 64 without re-loading it. + ldm sp, {r0, r1, r2} // load (state, block, nblocks) + mov r3, #64 // set 'inc' + subs r2, r2, #1 // nblocks-- + str r2, [sp, #8] + bne .Lnext_block // nblocks != 0? + + pop {r0-r2,r4-r11,pc} + + // The next message block (pointed to by r1) isn't 4-byte aligned, so it + // can't be loaded using ldmia. Copy it to the stack buffer (pointed to + // by r12) using an alternative method. r2-r9 are free to use. +.Lcopy_block_misaligned: + mov r2, #64 +1: +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + ldr r3, [r1], #4 + _le32_bswap r3, r4 +#else + ldrb r3, [r1, #0] + ldrb r4, [r1, #1] + ldrb r5, [r1, #2] + ldrb r6, [r1, #3] + add r1, r1, #4 + orr r3, r3, r4, lsl #8 + orr r3, r3, r5, lsl #16 + orr r3, r3, r6, lsl #24 +#endif + subs r2, r2, #4 + str r3, [r12], #4 + bne 1b + b .Lcopy_block_done +ENDPROC(blake2s_compress) diff --git a/lib/crypto/arm/blake2s-glue.c b/lib/crypto/arm/blake2s-glue.c new file mode 100644 index 000000000000..0238a70d9581 --- /dev/null +++ b/lib/crypto/arm/blake2s-glue.c @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <crypto/internal/blake2s.h> +#include <linux/module.h> + +/* defined in blake2s-core.S */ +EXPORT_SYMBOL(blake2s_compress); diff --git a/lib/crypto/arm/chacha-glue.c b/lib/crypto/arm/chacha-glue.c new file mode 100644 index 000000000000..88ec96415283 --- /dev/null +++ b/lib/crypto/arm/chacha-glue.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ChaCha and HChaCha functions (ARM optimized) + * + * Copyright (C) 2016-2019 Linaro, Ltd. 
<ard.biesheuvel@linaro.org> + * Copyright (C) 2015 Martin Willi + */ + +#include <crypto/chacha.h> +#include <crypto/internal/simd.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#include <asm/cputype.h> +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> + +asmlinkage void chacha_block_xor_neon(const struct chacha_state *state, + u8 *dst, const u8 *src, int nrounds); +asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state, + u8 *dst, const u8 *src, + int nrounds, unsigned int nbytes); +asmlinkage void hchacha_block_arm(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); +asmlinkage void hchacha_block_neon(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); + +asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes, + const struct chacha_state *state, int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon); + +static inline bool neon_usable(void) +{ + return static_branch_likely(&use_neon) && crypto_simd_usable(); +} + +static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + u8 buf[CHACHA_BLOCK_SIZE]; + + while (bytes > CHACHA_BLOCK_SIZE) { + unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U); + + chacha_4block_xor_neon(state, dst, src, nrounds, l); + bytes -= l; + src += l; + dst += l; + state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); + } + if (bytes) { + const u8 *s = src; + u8 *d = dst; + + if (bytes != CHACHA_BLOCK_SIZE) + s = d = memcpy(buf, src, bytes); + chacha_block_xor_neon(state, d, s, nrounds); + if (d != dst) + memcpy(dst, buf, bytes); + state->x[12]++; + } +} + +void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) { + hchacha_block_arm(state, out, nrounds); + } else { + kernel_neon_begin(); + hchacha_block_neon(state, out, nrounds); + kernel_neon_end(); + } +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() || + bytes <= CHACHA_BLOCK_SIZE) { + chacha_doarm(dst, src, bytes, state, nrounds); + state->x[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE); + return; + } + + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + kernel_neon_begin(); + chacha_doneon(state, dst, src, todo, nrounds); + kernel_neon_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +bool chacha_is_arch_optimized(void) +{ + /* We always can use at least the ARM scalar implementation. */ + return true; +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +static int __init chacha_arm_mod_init(void) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) { + switch (read_cpuid_part()) { + case ARM_CPU_PART_CORTEX_A7: + case ARM_CPU_PART_CORTEX_A5: + /* + * The Cortex-A7 and Cortex-A5 do not perform well with + * the NEON implementation but do incredibly with the + * scalar one and use less power. 
+ */ + break; + default: + static_branch_enable(&use_neon); + } + } + return 0; +} +subsys_initcall(chacha_arm_mod_init); + +static void __exit chacha_arm_mod_exit(void) +{ +} +module_exit(chacha_arm_mod_exit); + +MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM optimized)"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm/chacha-neon-core.S b/lib/crypto/arm/chacha-neon-core.S new file mode 100644 index 000000000000..ddd62b6294a5 --- /dev/null +++ b/lib/crypto/arm/chacha-neon-core.S @@ -0,0 +1,643 @@ +/* + * ChaCha/HChaCha NEON helper functions + * + * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + + /* + * NEON doesn't have a rotate instruction. The alternatives are, more or less: + * + * (a) vshl.u32 + vsri.u32 (needs temporary register) + * (b) vshl.u32 + vshr.u32 + vorr (needs temporary register) + * (c) vrev32.16 (16-bit rotations only) + * (d) vtbl.8 + vtbl.8 (multiple of 8 bits rotations only, + * needs index vector) + * + * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations, + * the only choices are (a) and (b). We use (a) since it takes two-thirds the + * cycles of (b) on both Cortex-A7 and Cortex-A53. + * + * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest + * and doesn't need a temporary register. + * + * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence + * is twice as fast as (a), even when doing (a) on multiple registers + * simultaneously to eliminate the stall between vshl and vsri. Also, it + * parallelizes better when temporary registers are scarce. + * + * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as + * (a), so the need to load the rotation table actually makes the vtbl method + * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it + * seems to be a good compromise to get a more significant speed boost on some + * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7. + */ + +#include <linux/linkage.h> +#include <asm/cache.h> + + .text + .fpu neon + .align 5 + +/* + * chacha_permute - permute one block + * + * Permute one 64-byte block where the state matrix is stored in the four NEON + * registers q0-q3. It performs matrix operations on four words in parallel, + * but requires shuffling to rearrange the words after each round. + * + * The round count is given in r3. 
+ * + * Clobbers: r3, ip, q4-q5 + */ +chacha_permute: + + adr ip, .Lrol8_table + vld1.8 {d10}, [ip, :64] + +.Ldoubleround: + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vrev32.16 q3, q3 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #12 + vsri.u32 q1, q4, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vtbl.8 d6, {d6}, d10 + vtbl.8 d7, {d7}, d10 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #7 + vsri.u32 q1, q4, #25 + + // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vext.8 q1, q1, q1, #4 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vext.8 q2, q2, q2, #8 + // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vext.8 q3, q3, q3, #12 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vrev32.16 q3, q3 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #12 + vsri.u32 q1, q4, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vtbl.8 d6, {d6}, d10 + vtbl.8 d7, {d7}, d10 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #7 + vsri.u32 q1, q4, #25 + + // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vext.8 q1, q1, q1, #12 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vext.8 q2, q2, q2, #8 + // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vext.8 q3, q3, q3, #4 + + subs r3, r3, #2 + bne .Ldoubleround + + bx lr +ENDPROC(chacha_permute) + +ENTRY(chacha_block_xor_neon) + // r0: Input state matrix, s + // r1: 1 data block output, o + // r2: 1 data block input, i + // r3: nrounds + push {lr} + + // x0..3 = s0..3 + add ip, r0, #0x20 + vld1.32 {q0-q1}, [r0] + vld1.32 {q2-q3}, [ip] + + vmov q8, q0 + vmov q9, q1 + vmov q10, q2 + vmov q11, q3 + + bl chacha_permute + + add ip, r2, #0x20 + vld1.8 {q4-q5}, [r2] + vld1.8 {q6-q7}, [ip] + + // o0 = i0 ^ (x0 + s0) + vadd.i32 q0, q0, q8 + veor q0, q0, q4 + + // o1 = i1 ^ (x1 + s1) + vadd.i32 q1, q1, q9 + veor q1, q1, q5 + + // o2 = i2 ^ (x2 + s2) + vadd.i32 q2, q2, q10 + veor q2, q2, q6 + + // o3 = i3 ^ (x3 + s3) + vadd.i32 q3, q3, q11 + veor q3, q3, q7 + + add ip, r1, #0x20 + vst1.8 {q0-q1}, [r1] + vst1.8 {q2-q3}, [ip] + + pop {pc} +ENDPROC(chacha_block_xor_neon) + +ENTRY(hchacha_block_neon) + // r0: Input state matrix, s + // r1: output (8 32-bit words) + // r2: nrounds + push {lr} + + vld1.32 {q0-q1}, [r0]! + vld1.32 {q2-q3}, [r0] + + mov r3, r2 + bl chacha_permute + + vst1.32 {q0}, [r1]! + vst1.32 {q3}, [r1] + + pop {pc} +ENDPROC(hchacha_block_neon) + + .align 4 +.Lctrinc: .word 0, 1, 2, 3 +.Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6 + + .align 5 +ENTRY(chacha_4block_xor_neon) + push {r4, lr} + mov r4, sp // preserve the stack pointer + sub ip, sp, #0x20 // allocate a 32 byte buffer + bic ip, ip, #0x1f // aligned to 32 bytes + mov sp, ip + + // r0: Input state matrix, s + // r1: 4 data blocks output, o + // r2: 4 data blocks input, i + // r3: nrounds + + // + // This function encrypts four consecutive ChaCha blocks by loading + // the state matrix in NEON registers four times. The algorithm performs + // each operation on the corresponding word of each state matrix, hence + // requires no word shuffling. The words are re-interleaved before the + // final addition of the original state and the XORing step. 
+ // + + // x0..15[0-3] = s0..15[0-3] + add ip, r0, #0x20 + vld1.32 {q0-q1}, [r0] + vld1.32 {q2-q3}, [ip] + + adr lr, .Lctrinc + vdup.32 q15, d7[1] + vdup.32 q14, d7[0] + vld1.32 {q4}, [lr, :128] + vdup.32 q13, d6[1] + vdup.32 q12, d6[0] + vdup.32 q11, d5[1] + vdup.32 q10, d5[0] + vadd.u32 q12, q12, q4 // x12 += counter values 0-3 + vdup.32 q9, d4[1] + vdup.32 q8, d4[0] + vdup.32 q7, d3[1] + vdup.32 q6, d3[0] + vdup.32 q5, d2[1] + vdup.32 q4, d2[0] + vdup.32 q3, d1[1] + vdup.32 q2, d1[0] + vdup.32 q1, d0[1] + vdup.32 q0, d0[0] + + adr ip, .Lrol8_table + b 1f + +.Ldoubleround4: + vld1.32 {q8-q9}, [sp, :256] +1: + // x0 += x4, x12 = rotl32(x12 ^ x0, 16) + // x1 += x5, x13 = rotl32(x13 ^ x1, 16) + // x2 += x6, x14 = rotl32(x14 ^ x2, 16) + // x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vadd.i32 q0, q0, q4 + vadd.i32 q1, q1, q5 + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + + veor q12, q12, q0 + veor q13, q13, q1 + veor q14, q14, q2 + veor q15, q15, q3 + + vrev32.16 q12, q12 + vrev32.16 q13, q13 + vrev32.16 q14, q14 + vrev32.16 q15, q15 + + // x8 += x12, x4 = rotl32(x4 ^ x8, 12) + // x9 += x13, x5 = rotl32(x5 ^ x9, 12) + // x10 += x14, x6 = rotl32(x6 ^ x10, 12) + // x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vadd.i32 q10, q10, q14 + vadd.i32 q11, q11, q15 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q4, q8 + veor q9, q5, q9 + vshl.u32 q4, q8, #12 + vshl.u32 q5, q9, #12 + vsri.u32 q4, q8, #20 + vsri.u32 q5, q9, #20 + + veor q8, q6, q10 + veor q9, q7, q11 + vshl.u32 q6, q8, #12 + vshl.u32 q7, q9, #12 + vsri.u32 q6, q8, #20 + vsri.u32 q7, q9, #20 + + // x0 += x4, x12 = rotl32(x12 ^ x0, 8) + // x1 += x5, x13 = rotl32(x13 ^ x1, 8) + // x2 += x6, x14 = rotl32(x14 ^ x2, 8) + // x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vld1.8 {d16}, [ip, :64] + vadd.i32 q0, q0, q4 + vadd.i32 q1, q1, q5 + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + + veor q12, q12, q0 + veor q13, q13, q1 + veor q14, q14, q2 + veor q15, q15, q3 + + vtbl.8 d24, {d24}, d16 + vtbl.8 d25, {d25}, d16 + vtbl.8 d26, {d26}, d16 + vtbl.8 d27, {d27}, d16 + vtbl.8 d28, {d28}, d16 + vtbl.8 d29, {d29}, d16 + vtbl.8 d30, {d30}, d16 + vtbl.8 d31, {d31}, d16 + + vld1.32 {q8-q9}, [sp, :256] + + // x8 += x12, x4 = rotl32(x4 ^ x8, 7) + // x9 += x13, x5 = rotl32(x5 ^ x9, 7) + // x10 += x14, x6 = rotl32(x6 ^ x10, 7) + // x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vadd.i32 q10, q10, q14 + vadd.i32 q11, q11, q15 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q4, q8 + veor q9, q5, q9 + vshl.u32 q4, q8, #7 + vshl.u32 q5, q9, #7 + vsri.u32 q4, q8, #25 + vsri.u32 q5, q9, #25 + + veor q8, q6, q10 + veor q9, q7, q11 + vshl.u32 q6, q8, #7 + vshl.u32 q7, q9, #7 + vsri.u32 q6, q8, #25 + vsri.u32 q7, q9, #25 + + vld1.32 {q8-q9}, [sp, :256] + + // x0 += x5, x15 = rotl32(x15 ^ x0, 16) + // x1 += x6, x12 = rotl32(x12 ^ x1, 16) + // x2 += x7, x13 = rotl32(x13 ^ x2, 16) + // x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vadd.i32 q0, q0, q5 + vadd.i32 q1, q1, q6 + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q4 + + veor q15, q15, q0 + veor q12, q12, q1 + veor q13, q13, q2 + veor q14, q14, q3 + + vrev32.16 q15, q15 + vrev32.16 q12, q12 + vrev32.16 q13, q13 + vrev32.16 q14, q14 + + // x10 += x15, x5 = rotl32(x5 ^ x10, 12) + // x11 += x12, x6 = rotl32(x6 ^ x11, 12) + // x8 += x13, x7 = rotl32(x7 ^ x8, 12) + // x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vadd.i32 q10, q10, q15 + vadd.i32 q11, q11, q12 + vadd.i32 q8, q8, q13 + vadd.i32 q9, q9, q14 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q7, q8 + veor q9, q4, q9 + vshl.u32 
q7, q8, #12 + vshl.u32 q4, q9, #12 + vsri.u32 q7, q8, #20 + vsri.u32 q4, q9, #20 + + veor q8, q5, q10 + veor q9, q6, q11 + vshl.u32 q5, q8, #12 + vshl.u32 q6, q9, #12 + vsri.u32 q5, q8, #20 + vsri.u32 q6, q9, #20 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 8) + // x1 += x6, x12 = rotl32(x12 ^ x1, 8) + // x2 += x7, x13 = rotl32(x13 ^ x2, 8) + // x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vld1.8 {d16}, [ip, :64] + vadd.i32 q0, q0, q5 + vadd.i32 q1, q1, q6 + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q4 + + veor q15, q15, q0 + veor q12, q12, q1 + veor q13, q13, q2 + veor q14, q14, q3 + + vtbl.8 d30, {d30}, d16 + vtbl.8 d31, {d31}, d16 + vtbl.8 d24, {d24}, d16 + vtbl.8 d25, {d25}, d16 + vtbl.8 d26, {d26}, d16 + vtbl.8 d27, {d27}, d16 + vtbl.8 d28, {d28}, d16 + vtbl.8 d29, {d29}, d16 + + vld1.32 {q8-q9}, [sp, :256] + + // x10 += x15, x5 = rotl32(x5 ^ x10, 7) + // x11 += x12, x6 = rotl32(x6 ^ x11, 7) + // x8 += x13, x7 = rotl32(x7 ^ x8, 7) + // x9 += x14, x4 = rotl32(x4 ^ x9, 7) + vadd.i32 q10, q10, q15 + vadd.i32 q11, q11, q12 + vadd.i32 q8, q8, q13 + vadd.i32 q9, q9, q14 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q7, q8 + veor q9, q4, q9 + vshl.u32 q7, q8, #7 + vshl.u32 q4, q9, #7 + vsri.u32 q7, q8, #25 + vsri.u32 q4, q9, #25 + + veor q8, q5, q10 + veor q9, q6, q11 + vshl.u32 q5, q8, #7 + vshl.u32 q6, q9, #7 + vsri.u32 q5, q8, #25 + vsri.u32 q6, q9, #25 + + subs r3, r3, #2 + bne .Ldoubleround4 + + // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15. + // x8..9[0-3] are on the stack. + + // Re-interleave the words in the first two rows of each block (x0..7). + // Also add the counter values 0-3 to x12[0-3]. + vld1.32 {q8}, [lr, :128] // load counter values 0-3 + vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1) + vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3) + vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5) + vzip.32 q6, q7 // => (6 7 6 7) (6 7 6 7) + vadd.u32 q12, q8 // x12 += counter values 0-3 + vswp d1, d4 + vswp d3, d6 + vld1.32 {q8-q9}, [r0]! // load s0..7 + vswp d9, d12 + vswp d11, d14 + + // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1) + // after XORing the first 32 bytes. + vswp q1, q4 + + // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7) + + // x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block) + vadd.u32 q0, q0, q8 + vadd.u32 q2, q2, q8 + vadd.u32 q4, q4, q8 + vadd.u32 q3, q3, q8 + + // x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block) + vadd.u32 q1, q1, q9 + vadd.u32 q6, q6, q9 + vadd.u32 q5, q5, q9 + vadd.u32 q7, q7, q9 + + // XOR first 32 bytes using keystream from first two rows of first block + vld1.8 {q8-q9}, [r2]! + veor q8, q8, q0 + veor q9, q9, q1 + vst1.8 {q8-q9}, [r1]! + + // Re-interleave the words in the last two rows of each block (x8..15). 
+ vld1.32 {q8-q9}, [sp, :256] + mov sp, r4 // restore original stack pointer + ldr r4, [r4, #8] // load number of bytes + vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13) + vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15) + vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9) + vzip.32 q10, q11 // => (10 11 10 11) (10 11 10 11) + vld1.32 {q0-q1}, [r0] // load s8..15 + vswp d25, d28 + vswp d27, d30 + vswp d17, d20 + vswp d19, d22 + + // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15) + + // x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block) + vadd.u32 q8, q8, q0 + vadd.u32 q10, q10, q0 + vadd.u32 q9, q9, q0 + vadd.u32 q11, q11, q0 + + // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block) + vadd.u32 q12, q12, q1 + vadd.u32 q14, q14, q1 + vadd.u32 q13, q13, q1 + vadd.u32 q15, q15, q1 + + // XOR the rest of the data with the keystream + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #96 + veor q0, q0, q8 + veor q1, q1, q12 + ble .Lle96 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #32 + veor q0, q0, q2 + veor q1, q1, q6 + ble .Lle128 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #32 + veor q0, q0, q10 + veor q1, q1, q14 + ble .Lle160 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #32 + veor q0, q0, q4 + veor q1, q1, q5 + ble .Lle192 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #32 + veor q0, q0, q9 + veor q1, q1, q13 + ble .Lle224 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #32 + veor q0, q0, q3 + veor q1, q1, q7 + blt .Llt256 +.Lout: + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2] + veor q0, q0, q11 + veor q1, q1, q15 + vst1.8 {q0-q1}, [r1] + + pop {r4, pc} + +.Lle192: + vmov q4, q9 + vmov q5, q13 + +.Lle160: + // nothing to do + +.Lfinalblock: + // Process the final block if processing less than 4 full blocks. + // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the + // previous 32 byte output block that still needs to be written at + // [r1] in q0-q1. + beq .Lfullblock + +.Lpartialblock: + adr lr, .Lpermute + 32 + add r2, r2, r4 + add lr, lr, r4 + add r4, r4, r1 + + vld1.8 {q2-q3}, [lr] + vld1.8 {q6-q7}, [r2] + + add r4, r4, #32 + + vtbl.8 d4, {q4-q5}, d4 + vtbl.8 d5, {q4-q5}, d5 + vtbl.8 d6, {q4-q5}, d6 + vtbl.8 d7, {q4-q5}, d7 + + veor q6, q6, q2 + veor q7, q7, q3 + + vst1.8 {q6-q7}, [r4] // overlapping stores + vst1.8 {q0-q1}, [r1] + pop {r4, pc} + +.Lfullblock: + vmov q11, q4 + vmov q15, q5 + b .Lout +.Lle96: + vmov q4, q2 + vmov q5, q6 + b .Lfinalblock +.Lle128: + vmov q4, q10 + vmov q5, q14 + b .Lfinalblock +.Lle224: + vmov q4, q3 + vmov q5, q7 + b .Lfinalblock +.Llt256: + vmov q4, q11 + vmov q5, q15 + b .Lpartialblock +ENDPROC(chacha_4block_xor_neon) + + .align L1_CACHE_SHIFT +.Lpermute: + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 + .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 + .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f diff --git a/lib/crypto/arm/chacha-scalar-core.S b/lib/crypto/arm/chacha-scalar-core.S new file mode 100644 index 000000000000..4951df05c158 --- /dev/null +++ b/lib/crypto/arm/chacha-scalar-core.S @@ -0,0 +1,444 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2018 Google, Inc. 
+ */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * Design notes: + * + * 16 registers would be needed to hold the state matrix, but only 14 are + * available because 'sp' and 'pc' cannot be used. So we spill the elements + * (x8, x9) to the stack and swap them out with (x10, x11). This adds one + * 'ldrd' and one 'strd' instruction per round. + * + * All rotates are performed using the implicit rotate operand accepted by the + * 'add' and 'eor' instructions. This is faster than using explicit rotate + * instructions. To make this work, we allow the values in the second and last + * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the + * wrong rotation amount. The rotation amount is then fixed up just in time + * when the values are used. 'brot' is the number of bits the values in row 'b' + * need to be rotated right to arrive at the correct values, and 'drot' + * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such + * that they end up as (25, 24) after every round. + */ + + // ChaCha state registers + X0 .req r0 + X1 .req r1 + X2 .req r2 + X3 .req r3 + X4 .req r4 + X5 .req r5 + X6 .req r6 + X7 .req r7 + X8_X10 .req r8 // shared by x8 and x10 + X9_X11 .req r9 // shared by x9 and x11 + X12 .req r10 + X13 .req r11 + X14 .req r12 + X15 .req r14 + +.macro _le32_bswap_4x a, b, c, d, tmp +#ifdef __ARMEB__ + rev_l \a, \tmp + rev_l \b, \tmp + rev_l \c, \tmp + rev_l \d, \tmp +#endif +.endm + +.macro __ldrd a, b, src, offset +#if __LINUX_ARM_ARCH__ >= 6 + ldrd \a, \b, [\src, #\offset] +#else + ldr \a, [\src, #\offset] + ldr \b, [\src, #\offset + 4] +#endif +.endm + +.macro __strd a, b, dst, offset +#if __LINUX_ARM_ARCH__ >= 6 + strd \a, \b, [\dst, #\offset] +#else + str \a, [\dst, #\offset] + str \b, [\dst, #\offset + 4] +#endif +.endm + +.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2 + + // a += b; d ^= a; d = rol(d, 16); + add \a1, \a1, \b1, ror #brot + add \a2, \a2, \b2, ror #brot + eor \d1, \a1, \d1, ror #drot + eor \d2, \a2, \d2, ror #drot + // drot == 32 - 16 == 16 + + // c += d; b ^= c; b = rol(b, 12); + add \c1, \c1, \d1, ror #16 + add \c2, \c2, \d2, ror #16 + eor \b1, \c1, \b1, ror #brot + eor \b2, \c2, \b2, ror #brot + // brot == 32 - 12 == 20 + + // a += b; d ^= a; d = rol(d, 8); + add \a1, \a1, \b1, ror #20 + add \a2, \a2, \b2, ror #20 + eor \d1, \a1, \d1, ror #16 + eor \d2, \a2, \d2, ror #16 + // drot == 32 - 8 == 24 + + // c += d; b ^= c; b = rol(b, 7); + add \c1, \c1, \d1, ror #24 + add \c2, \c2, \d2, ror #24 + eor \b1, \c1, \b1, ror #20 + eor \b2, \c2, \b2, ror #20 + // brot == 32 - 7 == 25 +.endm + +.macro _doubleround + + // column round + + // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13) + _halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13 + + // save (x8, x9); restore (x10, x11) + __strd X8_X10, X9_X11, sp, 0 + __ldrd X8_X10, X9_X11, sp, 8 + + // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15) + _halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15 + + .set brot, 25 + .set drot, 24 + + // diagonal round + + // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12) + _halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12 + + // save (x10, x11); restore (x8, x9) + __strd X8_X10, X9_X11, sp, 8 + __ldrd X8_X10, X9_X11, sp, 0 + + // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14) + _halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14 +.endm + +.macro _chacha_permute nrounds + .set brot, 0 + .set drot, 0 + .rept \nrounds / 2 + _doubleround + .endr +.endm + +.macro _chacha nrounds + 
+.Lnext_block\@: + // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN + // Registers contain x0-x9,x12-x15. + + // Do the core ChaCha permutation to update x0-x15. + _chacha_permute \nrounds + + add sp, #8 + // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN + // Registers contain x0-x9,x12-x15. + // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. + + // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15). + push {X8_X10, X9_X11, X12, X13, X14, X15} + + // Load (OUT, IN, LEN). + ldr r14, [sp, #96] + ldr r12, [sp, #100] + ldr r11, [sp, #104] + + orr r10, r14, r12 + + // Use slow path if fewer than 64 bytes remain. + cmp r11, #64 + blt .Lxor_slowpath\@ + + // Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on + // ARMv6+, since ldmia and stmia (used below) still require alignment. + tst r10, #3 + bne .Lxor_slowpath\@ + + // Fast path: XOR 64 bytes of aligned data. + + // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN + // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT. + // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. + + // x0-x3 + __ldrd r8, r9, sp, 32 + __ldrd r10, r11, sp, 40 + add X0, X0, r8 + add X1, X1, r9 + add X2, X2, r10 + add X3, X3, r11 + _le32_bswap_4x X0, X1, X2, X3, r8 + ldmia r12!, {r8-r11} + eor X0, X0, r8 + eor X1, X1, r9 + eor X2, X2, r10 + eor X3, X3, r11 + stmia r14!, {X0-X3} + + // x4-x7 + __ldrd r8, r9, sp, 48 + __ldrd r10, r11, sp, 56 + add X4, r8, X4, ror #brot + add X5, r9, X5, ror #brot + ldmia r12!, {X0-X3} + add X6, r10, X6, ror #brot + add X7, r11, X7, ror #brot + _le32_bswap_4x X4, X5, X6, X7, r8 + eor X4, X4, X0 + eor X5, X5, X1 + eor X6, X6, X2 + eor X7, X7, X3 + stmia r14!, {X4-X7} + + // x8-x15 + pop {r0-r7} // (x8-x9,x12-x15,x10-x11) + __ldrd r8, r9, sp, 32 + __ldrd r10, r11, sp, 40 + add r0, r0, r8 // x8 + add r1, r1, r9 // x9 + add r6, r6, r10 // x10 + add r7, r7, r11 // x11 + _le32_bswap_4x r0, r1, r6, r7, r8 + ldmia r12!, {r8-r11} + eor r0, r0, r8 // x8 + eor r1, r1, r9 // x9 + eor r6, r6, r10 // x10 + eor r7, r7, r11 // x11 + stmia r14!, {r0,r1,r6,r7} + ldmia r12!, {r0,r1,r6,r7} + __ldrd r8, r9, sp, 48 + __ldrd r10, r11, sp, 56 + add r2, r8, r2, ror #drot // x12 + add r3, r9, r3, ror #drot // x13 + add r4, r10, r4, ror #drot // x14 + add r5, r11, r5, ror #drot // x15 + _le32_bswap_4x r2, r3, r4, r5, r9 + ldr r9, [sp, #72] // load LEN + eor r2, r2, r0 // x12 + eor r3, r3, r1 // x13 + eor r4, r4, r6 // x14 + eor r5, r5, r7 // x15 + subs r9, #64 // decrement and check LEN + stmia r14!, {r2-r5} + + beq .Ldone\@ + +.Lprepare_for_next_block\@: + + // Stack: x0-x15 OUT IN LEN + + // Increment block counter (x12) + add r8, #1 + + // Store updated (OUT, IN, LEN) + str r14, [sp, #64] + str r12, [sp, #68] + str r9, [sp, #72] + + mov r14, sp + + // Store updated block counter (x12) + str r8, [sp, #48] + + sub sp, #16 + + // Reload state and do next block + ldmia r14!, {r0-r11} // load x0-x11 + __strd r10, r11, sp, 8 // store x10-x11 before state + ldmia r14, {r10-r12,r14} // load x12-x15 + b .Lnext_block\@ + +.Lxor_slowpath\@: + // Slow path: < 64 bytes remaining, or unaligned input or output buffer. + // We handle it by storing the 64 bytes of keystream to the stack, then + // XOR-ing the needed portion with the data. + + // Allocate keystream buffer + sub sp, #64 + mov r14, sp + + // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN + // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0. 
+ // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. + + // Save keystream for x0-x3 + __ldrd r8, r9, sp, 96 + __ldrd r10, r11, sp, 104 + add X0, X0, r8 + add X1, X1, r9 + add X2, X2, r10 + add X3, X3, r11 + _le32_bswap_4x X0, X1, X2, X3, r8 + stmia r14!, {X0-X3} + + // Save keystream for x4-x7 + __ldrd r8, r9, sp, 112 + __ldrd r10, r11, sp, 120 + add X4, r8, X4, ror #brot + add X5, r9, X5, ror #brot + add X6, r10, X6, ror #brot + add X7, r11, X7, ror #brot + _le32_bswap_4x X4, X5, X6, X7, r8 + add r8, sp, #64 + stmia r14!, {X4-X7} + + // Save keystream for x8-x15 + ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11) + __ldrd r8, r9, sp, 128 + __ldrd r10, r11, sp, 136 + add r0, r0, r8 // x8 + add r1, r1, r9 // x9 + add r6, r6, r10 // x10 + add r7, r7, r11 // x11 + _le32_bswap_4x r0, r1, r6, r7, r8 + stmia r14!, {r0,r1,r6,r7} + __ldrd r8, r9, sp, 144 + __ldrd r10, r11, sp, 152 + add r2, r8, r2, ror #drot // x12 + add r3, r9, r3, ror #drot // x13 + add r4, r10, r4, ror #drot // x14 + add r5, r11, r5, ror #drot // x15 + _le32_bswap_4x r2, r3, r4, r5, r9 + stmia r14, {r2-r5} + + // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN + // Registers: r8 is block counter, r12 is IN. + + ldr r9, [sp, #168] // LEN + ldr r14, [sp, #160] // OUT + cmp r9, #64 + mov r0, sp + movle r1, r9 + movgt r1, #64 + // r1 is number of bytes to XOR, in range [1, 64] + +.if __LINUX_ARM_ARCH__ < 6 + orr r2, r12, r14 + tst r2, #3 // IN or OUT misaligned? + bne .Lxor_next_byte\@ +.endif + + // XOR a word at a time +.rept 16 + subs r1, #4 + blt .Lxor_words_done\@ + ldr r2, [r12], #4 + ldr r3, [r0], #4 + eor r2, r2, r3 + str r2, [r14], #4 +.endr + b .Lxor_slowpath_done\@ +.Lxor_words_done\@: + ands r1, r1, #3 + beq .Lxor_slowpath_done\@ + + // XOR a byte at a time +.Lxor_next_byte\@: + ldrb r2, [r12], #1 + ldrb r3, [r0], #1 + eor r2, r2, r3 + strb r2, [r14], #1 + subs r1, #1 + bne .Lxor_next_byte\@ + +.Lxor_slowpath_done\@: + subs r9, #64 + add sp, #96 + bgt .Lprepare_for_next_block\@ + +.Ldone\@: +.endm // _chacha + +/* + * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes, + * const struct chacha_state *state, int nrounds); + */ +ENTRY(chacha_doarm) + cmp r2, #0 // len == 0? + reteq lr + + ldr ip, [sp] + cmp ip, #12 + + push {r0-r2,r4-r11,lr} + + // Push state x0-x15 onto stack. + // Also store an extra copy of x10-x11 just before the state. + + add X12, r3, #48 + ldm X12, {X12,X13,X14,X15} + push {X12,X13,X14,X15} + sub sp, sp, #64 + + __ldrd X8_X10, X9_X11, r3, 40 + __strd X8_X10, X9_X11, sp, 8 + __strd X8_X10, X9_X11, sp, 56 + ldm r3, {X0-X9_X11} + __strd X0, X1, sp, 16 + __strd X2, X3, sp, 24 + __strd X4, X5, sp, 32 + __strd X6, X7, sp, 40 + __strd X8_X10, X9_X11, sp, 48 + + beq 1f + _chacha 20 + +0: add sp, #76 + pop {r4-r11, pc} + +1: _chacha 12 + b 0b +ENDPROC(chacha_doarm) + +/* + * void hchacha_block_arm(const struct chacha_state *state, + * u32 out[HCHACHA_OUT_WORDS], int nrounds); + */ +ENTRY(hchacha_block_arm) + push {r1,r4-r11,lr} + + cmp r2, #12 // ChaCha12 ? 
+ + mov r14, r0 + ldmia r14!, {r0-r11} // load x0-x11 + push {r10-r11} // store x10-x11 to stack + ldm r14, {r10-r12,r14} // load x12-x15 + sub sp, #8 + + beq 1f + _chacha_permute 20 + + // Skip over (unused0-unused1, x10-x11) +0: add sp, #16 + + // Fix up rotations of x12-x15 + ror X12, X12, #drot + ror X13, X13, #drot + pop {r4} // load 'out' + ror X14, X14, #drot + ror X15, X15, #drot + + // Store (x0-x3,x12-x15) to 'out' + stm r4, {X0,X1,X2,X3,X12,X13,X14,X15} + + pop {r4-r11,pc} + +1: _chacha_permute 12 + b 0b +ENDPROC(hchacha_block_arm) diff --git a/lib/crypto/arm/poly1305-armv4.pl b/lib/crypto/arm/poly1305-armv4.pl new file mode 100644 index 000000000000..dd7a996361a7 --- /dev/null +++ b/lib/crypto/arm/poly1305-armv4.pl @@ -0,0 +1,1236 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# IALU(*)/gcc-4.4 NEON +# +# ARM11xx(ARMv6) 7.78/+100% - +# Cortex-A5 6.35/+130% 3.00 +# Cortex-A8 6.25/+115% 2.36 +# Cortex-A9 5.10/+95% 2.55 +# Cortex-A15 3.85/+85% 1.25(**) +# Snapdragon S4 5.70/+100% 1.48(**) +# +# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data; +# (**) these are trade-off results, they can be improved by ~8% but at +# the cost of 15/12% regression on Cortex-A5/A7, it's even possible +# to improve Cortex-A9 result, but then A5/A7 loose more than 20%; + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($ctx,$inp,$len,$padbit)=map("r$_",(0..3)); + +$code.=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ +# define poly1305_init poly1305_block_init_arch +# define poly1305_blocks poly1305_blocks_arm +# define poly1305_emit poly1305_emit_arch +#endif + +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.text + +.globl poly1305_emit +.globl poly1305_blocks +.globl poly1305_init +.type poly1305_init,%function +.align 5 +poly1305_init: +.Lpoly1305_init: + stmdb sp!,{r4-r11} + + eor r3,r3,r3 + cmp $inp,#0 + str r3,[$ctx,#0] @ zero hash value + str r3,[$ctx,#4] + str r3,[$ctx,#8] + str r3,[$ctx,#12] + str r3,[$ctx,#16] + str r3,[$ctx,#36] @ clear is_base2_26 + add $ctx,$ctx,#20 + +#ifdef __thumb2__ + it eq +#endif + moveq r0,#0 + beq .Lno_key + +#if __ARM_MAX_ARCH__>=7 + mov r3,#-1 + str r3,[$ctx,#28] @ impossible key power value +# ifndef __KERNEL__ + adr r11,.Lpoly1305_init + ldr r12,.LOPENSSL_armcap +# endif +#endif + ldrb r4,[$inp,#0] + mov r10,#0x0fffffff + ldrb r5,[$inp,#1] + and r3,r10,#-4 @ 0x0ffffffc + ldrb r6,[$inp,#2] + ldrb r7,[$inp,#3] + orr r4,r4,r5,lsl#8 + ldrb r5,[$inp,#4] + orr r4,r4,r6,lsl#16 + ldrb r6,[$inp,#5] + orr r4,r4,r7,lsl#24 + ldrb r7,[$inp,#6] + and r4,r4,r10 + +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +# if !defined(_WIN32) + ldr r12,[r11,r12] @ OPENSSL_armcap_P +# endif +# if defined(__APPLE__) || defined(_WIN32) + ldr r12,[r12] +# 
endif +#endif + ldrb r8,[$inp,#7] + orr r5,r5,r6,lsl#8 + ldrb r6,[$inp,#8] + orr r5,r5,r7,lsl#16 + ldrb r7,[$inp,#9] + orr r5,r5,r8,lsl#24 + ldrb r8,[$inp,#10] + and r5,r5,r3 + +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + tst r12,#ARMV7_NEON @ check for NEON +# ifdef __thumb2__ + adr r9,.Lpoly1305_blocks_neon + adr r11,.Lpoly1305_blocks + it ne + movne r11,r9 + adr r12,.Lpoly1305_emit + orr r11,r11,#1 @ thumb-ify addresses + orr r12,r12,#1 +# else + add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init) + ite eq + addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init) + addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init) +# endif +#endif + ldrb r9,[$inp,#11] + orr r6,r6,r7,lsl#8 + ldrb r7,[$inp,#12] + orr r6,r6,r8,lsl#16 + ldrb r8,[$inp,#13] + orr r6,r6,r9,lsl#24 + ldrb r9,[$inp,#14] + and r6,r6,r3 + + ldrb r10,[$inp,#15] + orr r7,r7,r8,lsl#8 + str r4,[$ctx,#0] + orr r7,r7,r9,lsl#16 + str r5,[$ctx,#4] + orr r7,r7,r10,lsl#24 + str r6,[$ctx,#8] + and r7,r7,r3 + str r7,[$ctx,#12] +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + stmia r2,{r11,r12} @ fill functions table + mov r0,#1 +#else + mov r0,#0 +#endif +.Lno_key: + ldmia sp!,{r4-r11} +#if __ARM_ARCH__>=5 + ret @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_init,.-poly1305_init +___ +{ +my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12)); +my ($s1,$s2,$s3)=($r1,$r2,$r3); + +$code.=<<___; +.type poly1305_blocks,%function +.align 5 +poly1305_blocks: +.Lpoly1305_blocks: + stmdb sp!,{r3-r11,lr} + + ands $len,$len,#-16 + beq .Lno_data + + add $len,$len,$inp @ end pointer + sub sp,sp,#32 + +#if __ARM_ARCH__<7 + ldmia $ctx,{$h0-$r3} @ load context + add $ctx,$ctx,#20 + str $len,[sp,#16] @ offload stuff + str $ctx,[sp,#12] +#else + ldr lr,[$ctx,#36] @ is_base2_26 + ldmia $ctx!,{$h0-$h4} @ load hash value + str $len,[sp,#16] @ offload stuff + str $ctx,[sp,#12] + + adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 + mov $r1,$h1,lsr#6 + adcs $r1,$r1,$h2,lsl#20 + mov $r2,$h2,lsr#12 + adcs $r2,$r2,$h3,lsl#14 + mov $r3,$h3,lsr#18 + adcs $r3,$r3,$h4,lsl#8 + mov $len,#0 + teq lr,#0 + str $len,[$ctx,#16] @ clear is_base2_26 + adc $len,$len,$h4,lsr#24 + + itttt ne + movne $h0,$r0 @ choose between radixes + movne $h1,$r1 + movne $h2,$r2 + movne $h3,$r3 + ldmia $ctx,{$r0-$r3} @ load key + it ne + movne $h4,$len +#endif + + mov lr,$inp + cmp $padbit,#0 + str $r1,[sp,#20] + str $r2,[sp,#24] + str $r3,[sp,#28] + b .Loop + +.align 4 +.Loop: +#if __ARM_ARCH__<7 + ldrb r0,[lr],#16 @ load input +# ifdef __thumb2__ + it hi +# endif + addhi $h4,$h4,#1 @ 1<<128 + ldrb r1,[lr,#-15] + ldrb r2,[lr,#-14] + ldrb r3,[lr,#-13] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-12] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-11] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-10] + adds $h0,$h0,r3 @ accumulate input + + ldrb r3,[lr,#-9] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-8] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-7] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-6] + adcs $h1,$h1,r3 + + ldrb r3,[lr,#-5] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-4] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-3] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-2] + adcs $h2,$h2,r3 + + ldrb r3,[lr,#-1] + orr r1,r0,r1,lsl#8 + str lr,[sp,#8] @ offload input pointer + orr r2,r1,r2,lsl#16 + add $s1,$r1,$r1,lsr#2 + orr r3,r2,r3,lsl#24 +#else + ldr r0,[lr],#16 @ load input + it hi + addhi $h4,$h4,#1 @ padbit + ldr r1,[lr,#-12] + ldr r2,[lr,#-8] + ldr r3,[lr,#-4] +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif + adds $h0,$h0,r0 @ 
accumulate input + str lr,[sp,#8] @ offload input pointer + adcs $h1,$h1,r1 + add $s1,$r1,$r1,lsr#2 + adcs $h2,$h2,r2 +#endif + add $s2,$r2,$r2,lsr#2 + adcs $h3,$h3,r3 + add $s3,$r3,$r3,lsr#2 + + umull r2,r3,$h1,$r0 + adc $h4,$h4,#0 + umull r0,r1,$h0,$r0 + umlal r2,r3,$h4,$s1 + umlal r0,r1,$h3,$s1 + ldr $r1,[sp,#20] @ reload $r1 + umlal r2,r3,$h2,$s3 + umlal r0,r1,$h1,$s3 + umlal r2,r3,$h3,$s2 + umlal r0,r1,$h2,$s2 + umlal r2,r3,$h0,$r1 + str r0,[sp,#0] @ future $h0 + mul r0,$s2,$h4 + ldr $r2,[sp,#24] @ reload $r2 + adds r2,r2,r1 @ d1+=d0>>32 + eor r1,r1,r1 + adc lr,r3,#0 @ future $h2 + str r2,[sp,#4] @ future $h1 + + mul r2,$s3,$h4 + eor r3,r3,r3 + umlal r0,r1,$h3,$s3 + ldr $r3,[sp,#28] @ reload $r3 + umlal r2,r3,$h3,$r0 + umlal r0,r1,$h2,$r0 + umlal r2,r3,$h2,$r1 + umlal r0,r1,$h1,$r1 + umlal r2,r3,$h1,$r2 + umlal r0,r1,$h0,$r2 + umlal r2,r3,$h0,$r3 + ldr $h0,[sp,#0] + mul $h4,$r0,$h4 + ldr $h1,[sp,#4] + + adds $h2,lr,r0 @ d2+=d1>>32 + ldr lr,[sp,#8] @ reload input pointer + adc r1,r1,#0 + adds $h3,r2,r1 @ d3+=d2>>32 + ldr r0,[sp,#16] @ reload end pointer + adc r3,r3,#0 + add $h4,$h4,r3 @ h4+=d3>>32 + + and r1,$h4,#-4 + and $h4,$h4,#3 + add r1,r1,r1,lsr#2 @ *=5 + adds $h0,$h0,r1 + adcs $h1,$h1,#0 + adcs $h2,$h2,#0 + adcs $h3,$h3,#0 + adc $h4,$h4,#0 + + cmp r0,lr @ done yet? + bhi .Loop + + ldr $ctx,[sp,#12] + add sp,sp,#32 + stmdb $ctx,{$h0-$h4} @ store the result + +.Lno_data: +#if __ARM_ARCH__>=5 + ldmia sp!,{r3-r11,pc} +#else + ldmia sp!,{r3-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_blocks,.-poly1305_blocks +___ +} +{ +my ($ctx,$mac,$nonce)=map("r$_",(0..2)); +my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11)); +my $g4=$ctx; + +$code.=<<___; +.type poly1305_emit,%function +.align 5 +poly1305_emit: +.Lpoly1305_emit: + stmdb sp!,{r4-r11} + + ldmia $ctx,{$h0-$h4} + +#if __ARM_ARCH__>=7 + ldr ip,[$ctx,#36] @ is_base2_26 + + adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 + mov $g1,$h1,lsr#6 + adcs $g1,$g1,$h2,lsl#20 + mov $g2,$h2,lsr#12 + adcs $g2,$g2,$h3,lsl#14 + mov $g3,$h3,lsr#18 + adcs $g3,$g3,$h4,lsl#8 + mov $g4,#0 + adc $g4,$g4,$h4,lsr#24 + + tst ip,ip + itttt ne + movne $h0,$g0 + movne $h1,$g1 + movne $h2,$g2 + movne $h3,$g3 + it ne + movne $h4,$g4 +#endif + + adds $g0,$h0,#5 @ compare to modulus + adcs $g1,$h1,#0 + adcs $g2,$h2,#0 + adcs $g3,$h3,#0 + adc $g4,$h4,#0 + tst $g4,#4 @ did it carry/borrow? 
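
The sequence above is the standard Poly1305 finalization: compute g = h + 5 and test bit 130 of the result (the "tst #4" on the top limb); if it is set then h >= 2^130 - 5, so g replaces h, and the 128-bit nonce is then added mod 2^128 to form the tag. A minimal scalar C sketch of the same steps, assuming five 32-bit accumulator limbs as in this code; the function name is made up and this is illustrative only, not part of the patch:

	/* Sketch of the poly1305_emit tail: reduce the 130-bit accumulator
	 * h (five 32-bit limbs, h[4] holding the top bits) mod 2^130 - 5,
	 * then add the 128-bit nonce mod 2^128 and emit little-endian.
	 */
	#include <stdint.h>

	static void poly1305_emit_sketch(const uint32_t h[5],
					 const uint32_t nonce[4], uint8_t mac[16])
	{
		uint32_t g[5];
		uint64_t c = 5;
		int i;

		for (i = 0; i < 5; i++) {		/* g = h + 5 */
			c += h[i];
			g[i] = (uint32_t)c;
			c >>= 32;
		}
		/* bit 130 of g set means h >= 2^130 - 5: keep g instead of h */
		const uint32_t *src = (g[4] & 4) ? g : h;

		c = 0;					/* tag = (src + nonce) mod 2^128 */
		for (i = 0; i < 4; i++) {
			c += (uint64_t)src[i] + nonce[i];
			mac[4 * i + 0] = (uint8_t)c;
			mac[4 * i + 1] = (uint8_t)(c >> 8);
			mac[4 * i + 2] = (uint8_t)(c >> 16);
			mac[4 * i + 3] = (uint8_t)(c >> 24);
			c >>= 32;
		}
	}
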
+ +#ifdef __thumb2__ + it ne +#endif + movne $h0,$g0 + ldr $g0,[$nonce,#0] +#ifdef __thumb2__ + it ne +#endif + movne $h1,$g1 + ldr $g1,[$nonce,#4] +#ifdef __thumb2__ + it ne +#endif + movne $h2,$g2 + ldr $g2,[$nonce,#8] +#ifdef __thumb2__ + it ne +#endif + movne $h3,$g3 + ldr $g3,[$nonce,#12] + + adds $h0,$h0,$g0 + adcs $h1,$h1,$g1 + adcs $h2,$h2,$g2 + adc $h3,$h3,$g3 + +#if __ARM_ARCH__>=7 +# ifdef __ARMEB__ + rev $h0,$h0 + rev $h1,$h1 + rev $h2,$h2 + rev $h3,$h3 +# endif + str $h0,[$mac,#0] + str $h1,[$mac,#4] + str $h2,[$mac,#8] + str $h3,[$mac,#12] +#else + strb $h0,[$mac,#0] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#4] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#8] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#12] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#1] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#5] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#9] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#13] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#2] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#6] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#10] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#14] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#3] + strb $h1,[$mac,#7] + strb $h2,[$mac,#11] + strb $h3,[$mac,#15] +#endif + ldmia sp!,{r4-r11} +#if __ARM_ARCH__>=5 + ret @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_emit,.-poly1305_emit +___ +{ +my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9)); +my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14)); +my ($T0,$T1,$MASK) = map("q$_",(15,4,0)); + +my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7)); + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.fpu neon + +.type poly1305_init_neon,%function +.align 5 +poly1305_init_neon: +.Lpoly1305_init_neon: + ldr r3,[$ctx,#48] @ first table element + cmp r3,#-1 @ is value impossible? 
+ bne .Lno_init_neon + + ldr r4,[$ctx,#20] @ load key base 2^32 + ldr r5,[$ctx,#24] + ldr r6,[$ctx,#28] + ldr r7,[$ctx,#32] + + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 + mov r3,r4,lsr#26 + mov r4,r5,lsr#20 + orr r3,r3,r5,lsl#6 + mov r5,r6,lsr#14 + orr r4,r4,r6,lsl#12 + mov r6,r7,lsr#8 + orr r5,r5,r7,lsl#18 + and r3,r3,#0x03ffffff + and r4,r4,#0x03ffffff + and r5,r5,#0x03ffffff + + vdup.32 $R0,r2 @ r^1 in both lanes + add r2,r3,r3,lsl#2 @ *5 + vdup.32 $R1,r3 + add r3,r4,r4,lsl#2 + vdup.32 $S1,r2 + vdup.32 $R2,r4 + add r4,r5,r5,lsl#2 + vdup.32 $S2,r3 + vdup.32 $R3,r5 + add r5,r6,r6,lsl#2 + vdup.32 $S3,r4 + vdup.32 $R4,r6 + vdup.32 $S4,r5 + + mov $zeros,#2 @ counter + +.Lsquare_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + + vmull.u32 $D0,$R0,${R0}[1] + vmull.u32 $D1,$R1,${R0}[1] + vmull.u32 $D2,$R2,${R0}[1] + vmull.u32 $D3,$R3,${R0}[1] + vmull.u32 $D4,$R4,${R0}[1] + + vmlal.u32 $D0,$R4,${S1}[1] + vmlal.u32 $D1,$R0,${R1}[1] + vmlal.u32 $D2,$R1,${R1}[1] + vmlal.u32 $D3,$R2,${R1}[1] + vmlal.u32 $D4,$R3,${R1}[1] + + vmlal.u32 $D0,$R3,${S2}[1] + vmlal.u32 $D1,$R4,${S2}[1] + vmlal.u32 $D3,$R1,${R2}[1] + vmlal.u32 $D2,$R0,${R2}[1] + vmlal.u32 $D4,$R2,${R2}[1] + + vmlal.u32 $D0,$R2,${S3}[1] + vmlal.u32 $D3,$R0,${R3}[1] + vmlal.u32 $D1,$R3,${S3}[1] + vmlal.u32 $D2,$R4,${S3}[1] + vmlal.u32 $D4,$R1,${R3}[1] + + vmlal.u32 $D3,$R4,${S4}[1] + vmlal.u32 $D0,$R1,${S4}[1] + vmlal.u32 $D1,$R2,${S4}[1] + vmlal.u32 $D2,$R3,${S4}[1] + vmlal.u32 $D4,$R0,${R4}[1] + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + @ and P. Schwabe + @ + @ H0>>+H1>>+H2>>+H3>>+H4 + @ H3>>+H4>>*5+H0>>+H1 + @ + @ Trivia. + @ + @ Result of multiplication of n-bit number by m-bit number is + @ n+m bits wide. However! Even though 2^n is a n+1-bit number, + @ m-bit number multiplied by 2^n is still n+m bits wide. + @ + @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2, + @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit + @ one is n+1 bits wide. + @ + @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that + @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4 + @ can be 27. However! In cases when their width exceeds 26 bits + @ they are limited by 2^26+2^6. This in turn means that *sum* + @ of the products with these values can still be viewed as sum + @ of 52-bit numbers as long as the amount of addends is not a + @ power of 2. For example, + @ + @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4, + @ + @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or + @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than + @ 8 * (2^52) or 2^55. However, the value is then multiplied by + @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), + @ which is less than 32 * (2^52) or 2^57. And when processing + @ data we are looking at triple as many addends... + @ + @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and + @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the + @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while + @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32 + @ instruction accepts 2x32-bit input and writes 2x64-bit result. 
+ @ This means that result of reduction have to be compressed upon + @ loop wrap-around. This can be done in the process of reduction + @ to minimize amount of instructions [as well as amount of + @ 128-bit instructions, which benefits low-end processors], but + @ one has to watch for H2 (which is narrower than H0) and 5*H4 + @ not being wider than 58 bits, so that result of right shift + @ by 26 bits fits in 32 bits. This is also useful on x86, + @ because it allows to use paddd in place for paddq, which + @ benefits Atom, where paddq is ridiculously slow. + + vshr.u64 $T0,$D3,#26 + vmovn.i64 $D3#lo,$D3 + vshr.u64 $T1,$D0,#26 + vmovn.i64 $D0#lo,$D0 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + vbic.i32 $D0#lo,#0xfc000000 + + vshrn.u64 $T0#lo,$D4,#26 + vmovn.i64 $D4#lo,$D4 + vshr.u64 $T1,$D1,#26 + vmovn.i64 $D1#lo,$D1 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + vbic.i32 $D4#lo,#0xfc000000 + vbic.i32 $D1#lo,#0xfc000000 + + vadd.i32 $D0#lo,$D0#lo,$T0#lo + vshl.u32 $T0#lo,$T0#lo,#2 + vshrn.u64 $T1#lo,$D2,#26 + vmovn.i64 $D2#lo,$D2 + vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0 + vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 + vbic.i32 $D2#lo,#0xfc000000 + + vshr.u32 $T0#lo,$D0#lo,#26 + vbic.i32 $D0#lo,#0xfc000000 + vshr.u32 $T1#lo,$D3#lo,#26 + vbic.i32 $D3#lo,#0xfc000000 + vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 + vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 + + subs $zeros,$zeros,#1 + beq .Lsquare_break_neon + + add $tbl0,$ctx,#(48+0*9*4) + add $tbl1,$ctx,#(48+1*9*4) + + vtrn.32 $R0,$D0#lo @ r^2:r^1 + vtrn.32 $R2,$D2#lo + vtrn.32 $R3,$D3#lo + vtrn.32 $R1,$D1#lo + vtrn.32 $R4,$D4#lo + + vshl.u32 $S2,$R2,#2 @ *5 + vshl.u32 $S3,$R3,#2 + vshl.u32 $S1,$R1,#2 + vshl.u32 $S4,$R4,#2 + vadd.i32 $S2,$S2,$R2 + vadd.i32 $S1,$S1,$R1 + vadd.i32 $S3,$S3,$R3 + vadd.i32 $S4,$S4,$R4 + + vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! + vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! + vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vst1.32 {${S4}[0]},[$tbl0,:32] + vst1.32 {${S4}[1]},[$tbl1,:32] + + b .Lsquare_neon + +.align 4 +.Lsquare_break_neon: + add $tbl0,$ctx,#(48+2*4*9) + add $tbl1,$ctx,#(48+3*4*9) + + vmov $R0,$D0#lo @ r^4:r^3 + vshl.u32 $S1,$D1#lo,#2 @ *5 + vmov $R1,$D1#lo + vshl.u32 $S2,$D2#lo,#2 + vmov $R2,$D2#lo + vshl.u32 $S3,$D3#lo,#2 + vmov $R3,$D3#lo + vshl.u32 $S4,$D4#lo,#2 + vmov $R4,$D4#lo + vadd.i32 $S1,$S1,$D1#lo + vadd.i32 $S2,$S2,$D2#lo + vadd.i32 $S3,$S3,$D3#lo + vadd.i32 $S4,$S4,$D4#lo + + vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! + vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! + vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vst1.32 {${S4}[0]},[$tbl0] + vst1.32 {${S4}[1]},[$tbl1] + +.Lno_init_neon: + ret @ bx lr +.size poly1305_init_neon,.-poly1305_init_neon + +.globl poly1305_blocks_neon +.type poly1305_blocks_neon,%function +.align 5 +poly1305_blocks_neon: +.Lpoly1305_blocks_neon: + ldr ip,[$ctx,#36] @ is_base2_26 + + cmp $len,#64 + blo .Lpoly1305_blocks + + stmdb sp!,{r4-r7} + vstmdb sp!,{d8-d15} @ ABI specification says so + + tst ip,ip @ is_base2_26? 
+ bne .Lbase2_26_neon + + stmdb sp!,{r1-r3,lr} + bl .Lpoly1305_init_neon + + ldr r4,[$ctx,#0] @ load hash value base 2^32 + ldr r5,[$ctx,#4] + ldr r6,[$ctx,#8] + ldr r7,[$ctx,#12] + ldr ip,[$ctx,#16] + + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 + mov r3,r4,lsr#26 + veor $D0#lo,$D0#lo,$D0#lo + mov r4,r5,lsr#20 + orr r3,r3,r5,lsl#6 + veor $D1#lo,$D1#lo,$D1#lo + mov r5,r6,lsr#14 + orr r4,r4,r6,lsl#12 + veor $D2#lo,$D2#lo,$D2#lo + mov r6,r7,lsr#8 + orr r5,r5,r7,lsl#18 + veor $D3#lo,$D3#lo,$D3#lo + and r3,r3,#0x03ffffff + orr r6,r6,ip,lsl#24 + veor $D4#lo,$D4#lo,$D4#lo + and r4,r4,#0x03ffffff + mov r1,#1 + and r5,r5,#0x03ffffff + str r1,[$ctx,#36] @ set is_base2_26 + + vmov.32 $D0#lo[0],r2 + vmov.32 $D1#lo[0],r3 + vmov.32 $D2#lo[0],r4 + vmov.32 $D3#lo[0],r5 + vmov.32 $D4#lo[0],r6 + adr $zeros,.Lzeros + + ldmia sp!,{r1-r3,lr} + b .Lhash_loaded + +.align 4 +.Lbase2_26_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ load hash value + + veor $D0#lo,$D0#lo,$D0#lo + veor $D1#lo,$D1#lo,$D1#lo + veor $D2#lo,$D2#lo,$D2#lo + veor $D3#lo,$D3#lo,$D3#lo + veor $D4#lo,$D4#lo,$D4#lo + vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! + adr $zeros,.Lzeros + vld1.32 {$D4#lo[0]},[$ctx] + sub $ctx,$ctx,#16 @ rewind + +.Lhash_loaded: + add $in2,$inp,#32 + mov $padbit,$padbit,lsl#24 + tst $len,#31 + beq .Leven + + vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]! + vmov.32 $H4#lo[0],$padbit + sub $len,$len,#16 + add $in2,$inp,#32 + +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H3,$H3 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 +# endif + vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26 + vshl.u32 $H3#lo,$H3#lo,#18 + + vsri.u32 $H3#lo,$H2#lo,#14 + vshl.u32 $H2#lo,$H2#lo,#12 + vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi + + vbic.i32 $H3#lo,#0xfc000000 + vsri.u32 $H2#lo,$H1#lo,#20 + vshl.u32 $H1#lo,$H1#lo,#6 + + vbic.i32 $H2#lo,#0xfc000000 + vsri.u32 $H1#lo,$H0#lo,#26 + vadd.i32 $H3#hi,$H3#lo,$D3#lo + + vbic.i32 $H0#lo,#0xfc000000 + vbic.i32 $H1#lo,#0xfc000000 + vadd.i32 $H2#hi,$H2#lo,$D2#lo + + vadd.i32 $H0#hi,$H0#lo,$D0#lo + vadd.i32 $H1#hi,$H1#lo,$D1#lo + + mov $tbl1,$zeros + add $tbl0,$ctx,#48 + + cmp $len,$len + b .Long_tail + +.align 4 +.Leven: + subs $len,$len,#64 + it lo + movlo $in2,$zeros + + vmov.i32 $H4,#1<<24 @ padbit, yes, always + vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] + add $inp,$inp,#64 + vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) + add $in2,$in2,#64 + itt hi + addhi $tbl1,$ctx,#(48+1*9*4) + addhi $tbl0,$ctx,#(48+3*9*4) + +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H3,$H3 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 +# endif + vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 + vshl.u32 $H3,$H3,#18 + + vsri.u32 $H3,$H2,#14 + vshl.u32 $H2,$H2,#12 + + vbic.i32 $H3,#0xfc000000 + vsri.u32 $H2,$H1,#20 + vshl.u32 $H1,$H1,#6 + + vbic.i32 $H2,#0xfc000000 + vsri.u32 $H1,$H0,#26 + + vbic.i32 $H0,#0xfc000000 + vbic.i32 $H1,#0xfc000000 + + bls .Lskip_loop + + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 
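
At this point the message words have been re-split from base 2^32 into the base 2^26 limbs the NEON code works in (the vsri/vshl sequence above, and the scalar mov/orr version earlier in this function). A scalar C sketch of that radix conversion, illustrative only and not part of the patch:

	/* Split four little-endian 32-bit message words plus the pad bit
	 * into five 26-bit limbs (base 2^26), mirroring the vsri/vshl
	 * conversion above.  Illustrative only.
	 */
	#include <stdint.h>

	static void poly1305_radix_2_26(const uint32_t w[4], uint32_t padbit,
					uint32_t h[5])
	{
		h[0] =  w[0]                     & 0x03ffffff;
		h[1] = (w[0] >> 26 | w[1] <<  6) & 0x03ffffff;
		h[2] = (w[1] >> 20 | w[2] << 12) & 0x03ffffff;
		h[3] = (w[2] >> 14 | w[3] << 18) & 0x03ffffff;
		h[4] =  w[3] >>  8 | padbit << 24;	/* padbit is 0 or 1 */
	}
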
+ b .Loop_neon + +.align 5 +.Loop_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + @ \___________________/ + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + @ \___________________/ \____________________/ + @ + @ Note that we start with inp[2:3]*r^2. This is because it + @ doesn't depend on reduction in previous iteration. + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ inp[2:3]*r^2 + + vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1] + vmull.u32 $D2,$H2#hi,${R0}[1] + vadd.i32 $H0#lo,$H0#lo,$D0#lo + vmull.u32 $D0,$H0#hi,${R0}[1] + vadd.i32 $H3#lo,$H3#lo,$D3#lo + vmull.u32 $D3,$H3#hi,${R0}[1] + vmlal.u32 $D2,$H1#hi,${R1}[1] + vadd.i32 $H1#lo,$H1#lo,$D1#lo + vmull.u32 $D1,$H1#hi,${R0}[1] + + vadd.i32 $H4#lo,$H4#lo,$D4#lo + vmull.u32 $D4,$H4#hi,${R0}[1] + subs $len,$len,#64 + vmlal.u32 $D0,$H4#hi,${S1}[1] + it lo + movlo $in2,$zeros + vmlal.u32 $D3,$H2#hi,${R1}[1] + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D1,$H0#hi,${R1}[1] + vmlal.u32 $D4,$H3#hi,${R1}[1] + + vmlal.u32 $D0,$H3#hi,${S2}[1] + vmlal.u32 $D3,$H1#hi,${R2}[1] + vmlal.u32 $D4,$H2#hi,${R2}[1] + vmlal.u32 $D1,$H4#hi,${S2}[1] + vmlal.u32 $D2,$H0#hi,${R2}[1] + + vmlal.u32 $D3,$H0#hi,${R3}[1] + vmlal.u32 $D0,$H2#hi,${S3}[1] + vmlal.u32 $D4,$H1#hi,${R3}[1] + vmlal.u32 $D1,$H3#hi,${S3}[1] + vmlal.u32 $D2,$H4#hi,${S3}[1] + + vmlal.u32 $D3,$H4#hi,${S4}[1] + vmlal.u32 $D0,$H1#hi,${S4}[1] + vmlal.u32 $D4,$H0#hi,${R4}[1] + vmlal.u32 $D1,$H2#hi,${S4}[1] + vmlal.u32 $D2,$H3#hi,${S4}[1] + + vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) + add $in2,$in2,#64 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ (hash+inp[0:1])*r^4 and accumulate + + vmlal.u32 $D3,$H3#lo,${R0}[0] + vmlal.u32 $D0,$H0#lo,${R0}[0] + vmlal.u32 $D4,$H4#lo,${R0}[0] + vmlal.u32 $D1,$H1#lo,${R0}[0] + vmlal.u32 $D2,$H2#lo,${R0}[0] + vld1.32 ${S4}[0],[$tbl0,:32] + + vmlal.u32 $D3,$H2#lo,${R1}[0] + vmlal.u32 $D0,$H4#lo,${S1}[0] + vmlal.u32 $D4,$H3#lo,${R1}[0] + vmlal.u32 $D1,$H0#lo,${R1}[0] + vmlal.u32 $D2,$H1#lo,${R1}[0] + + vmlal.u32 $D3,$H1#lo,${R2}[0] + vmlal.u32 $D0,$H3#lo,${S2}[0] + vmlal.u32 $D4,$H2#lo,${R2}[0] + vmlal.u32 $D1,$H4#lo,${S2}[0] + vmlal.u32 $D2,$H0#lo,${R2}[0] + + vmlal.u32 $D3,$H0#lo,${R3}[0] + vmlal.u32 $D0,$H2#lo,${S3}[0] + vmlal.u32 $D4,$H1#lo,${R3}[0] + vmlal.u32 $D1,$H3#lo,${S3}[0] + vmlal.u32 $D3,$H4#lo,${S4}[0] + + vmlal.u32 $D2,$H4#lo,${S3}[0] + vmlal.u32 $D0,$H1#lo,${S4}[0] + vmlal.u32 $D4,$H0#lo,${R4}[0] + vmov.i32 $H4,#1<<24 @ padbit, yes, always + vmlal.u32 $D1,$H2#lo,${S4}[0] + vmlal.u32 $D2,$H3#lo,${S4}[0] + + vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] + add $inp,$inp,#64 +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 + vrev32.8 $H3,$H3 +# endif + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction interleaved with base 2^32 -> base 2^26 of + @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4. 
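
The reduction that the comment above describes is easier to follow in scalar form: each 64-bit limb sum is split into a 26-bit limb plus a carry into the next limb, and the carry out of the top limb wraps around multiplied by 5, since 2^130 = 5 mod (2^130 - 5). A scalar sketch of the carry chain implemented by the vshr/vadd/vbic instructions below, in the same order of carries but without the interleaved radix conversion; illustrative only, not part of the patch:

	/* Lazy reduction of the 64-bit limb sums d[0..4] back to base-2^26
	 * limbs.  The carry out of d[4] folds into d[0] as 5*carry because
	 * 2^130 == 5 (mod 2^130 - 5).  Illustrative only.
	 */
	#include <stdint.h>

	static void poly1305_lazy_reduce(uint64_t d[5])
	{
		uint64_t c;

		d[4] += d[3] >> 26;  d[3] &= 0x03ffffff;	/* h3 -> h4 */
		d[1] += d[0] >> 26;  d[0] &= 0x03ffffff;	/* h0 -> h1 */
		d[2] += d[1] >> 26;  d[1] &= 0x03ffffff;	/* h1 -> h2 */
		c     = d[4] >> 26;  d[4] &= 0x03ffffff;
		d[0] += c + (c << 2);				/* h4 -> h0, times 5 */
		d[3] += d[2] >> 26;  d[2] &= 0x03ffffff;	/* h2 -> h3 */
		d[1] += d[0] >> 26;  d[0] &= 0x03ffffff;	/* h0 -> h1 */
		d[4] += d[3] >> 26;  d[3] &= 0x03ffffff;	/* h3 -> h4 */
	}
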
+ + vshr.u64 $T0,$D3,#26 + vmovn.i64 $D3#lo,$D3 + vshr.u64 $T1,$D0,#26 + vmovn.i64 $D0#lo,$D0 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vbic.i32 $D3#lo,#0xfc000000 + vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + vshl.u32 $H3,$H3,#18 + vbic.i32 $D0#lo,#0xfc000000 + + vshrn.u64 $T0#lo,$D4,#26 + vmovn.i64 $D4#lo,$D4 + vshr.u64 $T1,$D1,#26 + vmovn.i64 $D1#lo,$D1 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + vsri.u32 $H3,$H2,#14 + vbic.i32 $D4#lo,#0xfc000000 + vshl.u32 $H2,$H2,#12 + vbic.i32 $D1#lo,#0xfc000000 + + vadd.i32 $D0#lo,$D0#lo,$T0#lo + vshl.u32 $T0#lo,$T0#lo,#2 + vbic.i32 $H3,#0xfc000000 + vshrn.u64 $T1#lo,$D2,#26 + vmovn.i64 $D2#lo,$D2 + vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec] + vsri.u32 $H2,$H1,#20 + vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 + vshl.u32 $H1,$H1,#6 + vbic.i32 $D2#lo,#0xfc000000 + vbic.i32 $H2,#0xfc000000 + + vshrn.u64 $T0#lo,$D0,#26 @ re-narrow + vmovn.i64 $D0#lo,$D0 + vsri.u32 $H1,$H0,#26 + vbic.i32 $H0,#0xfc000000 + vshr.u32 $T1#lo,$D3#lo,#26 + vbic.i32 $D3#lo,#0xfc000000 + vbic.i32 $D0#lo,#0xfc000000 + vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 + vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 + vbic.i32 $H1,#0xfc000000 + + bhi .Loop_neon + +.Lskip_loop: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + add $tbl1,$ctx,#(48+0*9*4) + add $tbl0,$ctx,#(48+1*9*4) + adds $len,$len,#32 + it ne + movne $len,#0 + bne .Long_tail + + vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi + vadd.i32 $H0#hi,$H0#lo,$D0#lo + vadd.i32 $H3#hi,$H3#lo,$D3#lo + vadd.i32 $H1#hi,$H1#lo,$D1#lo + vadd.i32 $H4#hi,$H4#lo,$D4#lo + +.Long_tail: + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2 + + vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant + vmull.u32 $D2,$H2#hi,$R0 + vadd.i32 $H0#lo,$H0#lo,$D0#lo + vmull.u32 $D0,$H0#hi,$R0 + vadd.i32 $H3#lo,$H3#lo,$D3#lo + vmull.u32 $D3,$H3#hi,$R0 + vadd.i32 $H1#lo,$H1#lo,$D1#lo + vmull.u32 $D1,$H1#hi,$R0 + vadd.i32 $H4#lo,$H4#lo,$D4#lo + vmull.u32 $D4,$H4#hi,$R0 + + vmlal.u32 $D0,$H4#hi,$S1 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vmlal.u32 $D3,$H2#hi,$R1 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vmlal.u32 $D1,$H0#hi,$R1 + vmlal.u32 $D4,$H3#hi,$R1 + vmlal.u32 $D2,$H1#hi,$R1 + + vmlal.u32 $D3,$H1#hi,$R2 + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D0,$H3#hi,$S2 + vld1.32 ${S4}[0],[$tbl0,:32] + vmlal.u32 $D4,$H2#hi,$R2 + vmlal.u32 $D1,$H4#hi,$S2 + vmlal.u32 $D2,$H0#hi,$R2 + + vmlal.u32 $D3,$H0#hi,$R3 + it ne + addne $tbl1,$ctx,#(48+2*9*4) + vmlal.u32 $D0,$H2#hi,$S3 + it ne + addne $tbl0,$ctx,#(48+3*9*4) + vmlal.u32 $D4,$H1#hi,$R3 + vmlal.u32 $D1,$H3#hi,$S3 + vmlal.u32 $D2,$H4#hi,$S3 + + vmlal.u32 $D3,$H4#hi,$S4 + vorn $MASK,$MASK,$MASK @ all-ones, can be redundant + vmlal.u32 $D0,$H1#hi,$S4 + vshr.u64 $MASK,$MASK,#38 + vmlal.u32 $D4,$H0#hi,$R4 + vmlal.u32 $D1,$H2#hi,$S4 + vmlal.u32 $D2,$H3#hi,$S4 + + beq .Lshort_tail + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ (hash+inp[0:1])*r^4:r^3 and accumulate + + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 + + vmlal.u32 $D2,$H2#lo,$R0 + vmlal.u32 $D0,$H0#lo,$R0 + vmlal.u32 $D3,$H3#lo,$R0 + vmlal.u32 $D1,$H1#lo,$R0 + vmlal.u32 $D4,$H4#lo,$R0 + + vmlal.u32 $D0,$H4#lo,$S1 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 
+ vmlal.u32 $D3,$H2#lo,$R1 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vmlal.u32 $D1,$H0#lo,$R1 + vmlal.u32 $D4,$H3#lo,$R1 + vmlal.u32 $D2,$H1#lo,$R1 + + vmlal.u32 $D3,$H1#lo,$R2 + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D0,$H3#lo,$S2 + vld1.32 ${S4}[0],[$tbl0,:32] + vmlal.u32 $D4,$H2#lo,$R2 + vmlal.u32 $D1,$H4#lo,$S2 + vmlal.u32 $D2,$H0#lo,$R2 + + vmlal.u32 $D3,$H0#lo,$R3 + vmlal.u32 $D0,$H2#lo,$S3 + vmlal.u32 $D4,$H1#lo,$R3 + vmlal.u32 $D1,$H3#lo,$S3 + vmlal.u32 $D2,$H4#lo,$S3 + + vmlal.u32 $D3,$H4#lo,$S4 + vorn $MASK,$MASK,$MASK @ all-ones + vmlal.u32 $D0,$H1#lo,$S4 + vshr.u64 $MASK,$MASK,#38 + vmlal.u32 $D4,$H0#lo,$R4 + vmlal.u32 $D1,$H2#lo,$S4 + vmlal.u32 $D2,$H3#lo,$S4 + +.Lshort_tail: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ horizontal addition + + vadd.i64 $D3#lo,$D3#lo,$D3#hi + vadd.i64 $D0#lo,$D0#lo,$D0#hi + vadd.i64 $D4#lo,$D4#lo,$D4#hi + vadd.i64 $D1#lo,$D1#lo,$D1#hi + vadd.i64 $D2#lo,$D2#lo,$D2#hi + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction, but without narrowing + + vshr.u64 $T0,$D3,#26 + vand.i64 $D3,$D3,$MASK + vshr.u64 $T1,$D0,#26 + vand.i64 $D0,$D0,$MASK + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + + vshr.u64 $T0,$D4,#26 + vand.i64 $D4,$D4,$MASK + vshr.u64 $T1,$D1,#26 + vand.i64 $D1,$D1,$MASK + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + + vadd.i64 $D0,$D0,$T0 + vshl.u64 $T0,$T0,#2 + vshr.u64 $T1,$D2,#26 + vand.i64 $D2,$D2,$MASK + vadd.i64 $D0,$D0,$T0 @ h4 -> h0 + vadd.i64 $D3,$D3,$T1 @ h2 -> h3 + + vshr.u64 $T0,$D0,#26 + vand.i64 $D0,$D0,$MASK + vshr.u64 $T1,$D3,#26 + vand.i64 $D3,$D3,$MASK + vadd.i64 $D1,$D1,$T0 @ h0 -> h1 + vadd.i64 $D4,$D4,$T1 @ h3 -> h4 + + cmp $len,#0 + bne .Leven + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ store hash value + + vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! + vst1.32 {$D4#lo[0]},[$ctx] + + vldmia sp!,{d8-d15} @ epilogue + ldmia sp!,{r4-r7} + ret @ bx lr +.size poly1305_blocks_neon,.-poly1305_blocks_neon + +.align 5 +.Lzeros: +.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +#ifndef __KERNEL__ +.LOPENSSL_armcap: +# ifdef _WIN32 +.word OPENSSL_armcap_P +# else +.word OPENSSL_armcap_P-.Lpoly1305_init +# endif +.comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P +#endif +#endif +___ +} } +$code.=<<___; +.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm" +.align 2 +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or + s/\bret\b/bx lr/go or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} +close STDOUT; # enforce flush diff --git a/lib/crypto/arm/poly1305-glue.c b/lib/crypto/arm/poly1305-glue.c new file mode 100644 index 000000000000..2d86c78af883 --- /dev/null +++ b/lib/crypto/arm/poly1305-glue.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM + * + * Copyright (C) 2019 Linaro Ltd. 
<ard.biesheuvel@linaro.org> + */ + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> +#include <crypto/internal/poly1305.h> +#include <linux/cpufeature.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/unaligned.h> + +asmlinkage void poly1305_block_init_arch( + struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +EXPORT_SYMBOL_GPL(poly1305_block_init_arch); +asmlinkage void poly1305_blocks_arm(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +EXPORT_SYMBOL_GPL(poly1305_emit_arch); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, + unsigned int len, u32 padbit) +{ + len = round_down(len, POLY1305_BLOCK_SIZE); + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon) && likely(may_use_simd())) { + do { + unsigned int todo = min_t(unsigned int, len, SZ_4K); + + kernel_neon_begin(); + poly1305_blocks_neon(state, src, todo, padbit); + kernel_neon_end(); + + len -= todo; + src += todo; + } while (len); + } else + poly1305_blocks_arm(state, src, len, padbit); +} +EXPORT_SYMBOL_GPL(poly1305_blocks_arch); + +bool poly1305_is_arch_optimized(void) +{ + /* We always can use at least the ARM scalar implementation. */ + return true; +} +EXPORT_SYMBOL(poly1305_is_arch_optimized); + +static int __init arm_poly1305_mod_init(void) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + (elf_hwcap & HWCAP_NEON)) + static_branch_enable(&have_neon); + return 0; +} +subsys_initcall(arm_poly1305_mod_init); + +static void __exit arm_poly1305_mod_exit(void) +{ +} +module_exit(arm_poly1305_mod_exit); + +MODULE_DESCRIPTION("Accelerated Poly1305 transform for ARM"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm/sha1-armv4-large.S b/lib/crypto/arm/sha1-armv4-large.S new file mode 100644 index 000000000000..1c8b685149f2 --- /dev/null +++ b/lib/crypto/arm/sha1-armv4-large.S @@ -0,0 +1,507 @@ +#define __ARM_ARCH__ __LINUX_ARM_ARCH__ +@ SPDX-License-Identifier: GPL-2.0 + +@ This code is taken from the OpenSSL project but the author (Andy Polyakov) +@ has relicensed it under the GPLv2. Therefore this program is free software; +@ you can redistribute it and/or modify it under the terms of the GNU General +@ Public License version 2 as published by the Free Software Foundation. +@ +@ The original headers, including the original license headers, are +@ included below for completeness. + +@ ==================================================================== +@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see https://www.openssl.org/~appro/cryptogams/. +@ ==================================================================== + +@ sha1_block procedure for ARMv4. +@ +@ January 2007. 
+ +@ Size/performance trade-off +@ ==================================================================== +@ impl size in bytes comp cycles[*] measured performance +@ ==================================================================== +@ thumb 304 3212 4420 +@ armv4-small 392/+29% 1958/+64% 2250/+96% +@ armv4-compact 740/+89% 1552/+26% 1840/+22% +@ armv4-large 1420/+92% 1307/+19% 1370/+34%[***] +@ full unroll ~5100/+260% ~1260/+4% ~1300/+5% +@ ==================================================================== +@ thumb = same as 'small' but in Thumb instructions[**] and +@ with recurring code in two private functions; +@ small = detached Xload/update, loops are folded; +@ compact = detached Xload/update, 5x unroll; +@ large = interleaved Xload/update, 5x unroll; +@ full unroll = interleaved Xload/update, full unroll, estimated[!]; +@ +@ [*] Manually counted instructions in "grand" loop body. Measured +@ performance is affected by prologue and epilogue overhead, +@ i-cache availability, branch penalties, etc. +@ [**] While each Thumb instruction is twice smaller, they are not as +@ diverse as ARM ones: e.g., there are only two arithmetic +@ instructions with 3 arguments, no [fixed] rotate, addressing +@ modes are limited. As result it takes more instructions to do +@ the same job in Thumb, therefore the code is never twice as +@ small and always slower. +@ [***] which is also ~35% better than compiler generated code. Dual- +@ issue Cortex A8 core was measured to process input block in +@ ~990 cycles. + +@ August 2010. +@ +@ Rescheduling for dual-issue pipeline resulted in 13% improvement on +@ Cortex A8 core and in absolute terms ~870 cycles per input block +@ [or 13.6 cycles per byte]. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 10% +@ improvement on Cortex A8 core and 12.2 cycles per byte. + +#include <linux/linkage.h> + +.text + +.align 2 +ENTRY(sha1_block_data_order) + stmdb sp!,{r4-r12,lr} + add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 + ldmia r0,{r3,r4,r5,r6,r7} +.Lloop: + ldr r8,.LK_00_19 + mov r14,sp + sub sp,sp,#15*4 + mov r5,r5,ror#30 + mov r6,r6,ror#30 + mov r7,r7,ror#30 @ [6] +.L_00_15: +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r7,r8,r7,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r5,r6 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r7,r8,r7,ror#2 @ E+=K_00_19 + eor r10,r5,r6 @ F_xx_xx + add r7,r7,r3,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r4,r10,ror#2 + add r7,r7,r9 @ E+=X[i] + eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r7,r7,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r6,r8,r6,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r4,r5 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r6,r8,r6,ror#2 @ E+=K_00_19 + eor r10,r4,r5 @ F_xx_xx + add r6,r6,r7,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r3,r10,ror#2 + add r6,r6,r9 @ E+=X[i] + eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! 
+ add r6,r6,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r5,r8,r5,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r3,r4 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r5,r8,r5,ror#2 @ E+=K_00_19 + eor r10,r3,r4 @ F_xx_xx + add r5,r5,r6,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r7,r10,ror#2 + add r5,r5,r9 @ E+=X[i] + eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r5,r5,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r4,r8,r4,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r7,r3 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r4,r8,r4,ror#2 @ E+=K_00_19 + eor r10,r7,r3 @ F_xx_xx + add r4,r4,r5,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r6,r10,ror#2 + add r4,r4,r9 @ E+=X[i] + eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r4,r4,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r3,r8,r3,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r6,r7 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r3,r8,r3,ror#2 @ E+=K_00_19 + eor r10,r6,r7 @ F_xx_xx + add r3,r3,r4,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r5,r10,ror#2 + add r3,r3,r9 @ E+=X[i] + eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r3,r3,r10 @ E+=F_00_19(B,C,D) + cmp r14,sp + bne .L_00_15 @ [((11+4)*5+2)*3] + sub sp,sp,#25*4 +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r7,r8,r7,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r5,r6 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r7,r8,r7,ror#2 @ E+=K_00_19 + eor r10,r5,r6 @ F_xx_xx + add r7,r7,r3,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r4,r10,ror#2 + add r7,r7,r9 @ E+=X[i] + eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r7,r7,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r3,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) + add r6,r6,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
+ and r10,r7,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) + add r5,r5,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r6,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) + add r4,r4,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r5,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) + add r3,r3,r10 @ E+=F_00_19(B,C,D) + + ldr r8,.LK_20_39 @ [+15+16*4] + cmn sp,#0 @ [+3], clear carry to denote 20_39 +.L_20_39_or_60_79: + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r7,r8,r7,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r5,r6 @ F_xx_xx + mov r9,r9,ror#31 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r4,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r7,r7,r9 @ E+=X[i] + add r7,r7,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r3,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + add r6,r6,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r7,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + add r5,r5,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r6,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + add r4,r4,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
+ eor r10,r5,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + add r3,r3,r10 @ E+=F_20_39(B,C,D) + ARM( teq r14,sp ) @ preserve carry + THUMB( mov r11,sp ) + THUMB( teq r14,r11 ) @ preserve carry + bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] + bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes + + ldr r8,.LK_40_59 + sub sp,sp,#20*4 @ [+2] +.L_40_59: + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r7,r8,r7,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r5,r6 @ F_xx_xx + mov r9,r9,ror#31 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r4,r10,ror#2 @ F_xx_xx + and r11,r5,r6 @ F_xx_xx + add r7,r7,r9 @ E+=X[i] + add r7,r7,r10 @ E+=F_40_59(B,C,D) + add r7,r7,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r3,r10,ror#2 @ F_xx_xx + and r11,r4,r5 @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + add r6,r6,r10 @ E+=F_40_59(B,C,D) + add r6,r6,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r7,r10,ror#2 @ F_xx_xx + and r11,r3,r4 @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + add r5,r5,r10 @ E+=F_40_59(B,C,D) + add r5,r5,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r6,r10,ror#2 @ F_xx_xx + and r11,r7,r3 @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + add r4,r4,r10 @ E+=F_40_59(B,C,D) + add r4,r4,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
+ and r10,r5,r10,ror#2 @ F_xx_xx + and r11,r6,r7 @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + add r3,r3,r10 @ E+=F_40_59(B,C,D) + add r3,r3,r11,ror#2 + cmp r14,sp + bne .L_40_59 @ [+((12+5)*5+2)*4] + + ldr r8,.LK_60_79 + sub sp,sp,#20*4 + cmp sp,#0 @ set carry to denote 60_79 + b .L_20_39_or_60_79 @ [+4], spare 300 bytes +.L_done: + add sp,sp,#80*4 @ "deallocate" stack frame + ldmia r0,{r8,r9,r10,r11,r12} + add r3,r8,r3 + add r4,r9,r4 + add r5,r10,r5,ror#2 + add r6,r11,r6,ror#2 + add r7,r12,r7,ror#2 + stmia r0,{r3,r4,r5,r6,r7} + teq r1,r2 + bne .Lloop @ [+18], total 1307 + + ldmia sp!,{r4-r12,pc} +.align 2 +.LK_00_19: .word 0x5a827999 +.LK_20_39: .word 0x6ed9eba1 +.LK_40_59: .word 0x8f1bbcdc +.LK_60_79: .word 0xca62c1d6 +ENDPROC(sha1_block_data_order) +.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>" +.align 2 diff --git a/lib/crypto/arm/sha1-armv7-neon.S b/lib/crypto/arm/sha1-armv7-neon.S new file mode 100644 index 000000000000..6edba3ab62e8 --- /dev/null +++ b/lib/crypto/arm/sha1-armv7-neon.S @@ -0,0 +1,633 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function + * + * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +.syntax unified +.fpu neon + +.text + + +/* Context structure */ + +#define state_h0 0 +#define state_h1 4 +#define state_h2 8 +#define state_h3 12 +#define state_h4 16 + + +/* Constants */ + +#define K1 0x5A827999 +#define K2 0x6ED9EBA1 +#define K3 0x8F1BBCDC +#define K4 0xCA62C1D6 +.align 4 +.LK_VEC: +.LK1: .long K1, K1, K1, K1 +.LK2: .long K2, K2, K2, K2 +.LK3: .long K3, K3, K3, K3 +.LK4: .long K4, K4, K4, K4 + + +/* Register macros */ + +#define RSTATE r0 +#define RDATA r1 +#define RNBLKS r2 +#define ROLDSTACK r3 +#define RWK lr + +#define _a r4 +#define _b r5 +#define _c r6 +#define _d r7 +#define _e r8 + +#define RT0 r9 +#define RT1 r10 +#define RT2 r11 +#define RT3 r12 + +#define W0 q0 +#define W1 q7 +#define W2 q2 +#define W3 q3 +#define W4 q4 +#define W5 q6 +#define W6 q5 +#define W7 q1 + +#define tmp0 q8 +#define tmp1 q9 +#define tmp2 q10 +#define tmp3 q11 + +#define qK1 q12 +#define qK2 q13 +#define qK3 q14 +#define qK4 q15 + +#ifdef CONFIG_CPU_BIG_ENDIAN +#define ARM_LE(code...) +#else +#define ARM_LE(code...) code +#endif + +/* Round function macros. 
*/ + +#define WK_offs(i) (((i) & 15) * 4) + +#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ldr RT3, [sp, WK_offs(i)]; \ + pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + bic RT0, d, b; \ + add e, e, a, ror #(32 - 5); \ + and RT1, c, b; \ + pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add RT0, RT0, RT3; \ + add e, e, RT1; \ + ror b, #(32 - 30); \ + pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add e, e, RT0; + +#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ldr RT3, [sp, WK_offs(i)]; \ + pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + eor RT0, d, b; \ + add e, e, a, ror #(32 - 5); \ + eor RT0, RT0, c; \ + pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add e, e, RT3; \ + ror b, #(32 - 30); \ + pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add e, e, RT0; \ + +#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ldr RT3, [sp, WK_offs(i)]; \ + pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + eor RT0, b, c; \ + and RT1, b, c; \ + add e, e, a, ror #(32 - 5); \ + pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + and RT0, RT0, d; \ + add RT1, RT1, RT3; \ + add e, e, RT0; \ + ror b, #(32 - 30); \ + pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add e, e, RT1; + +#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) + +#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) + +#define R(a,b,c,d,e,f,i) \ + _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) + +#define dummy(...) + + +/* Input expansion macros. 
*/ + +/********* Precalc macros for rounds 0-15 *************************************/ + +#define W_PRECALC_00_15() \ + add RWK, sp, #(WK_offs(0)); \ + \ + vld1.32 {W0, W7}, [RDATA]!; \ + ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \ + vld1.32 {W6, W5}, [RDATA]!; \ + vadd.u32 tmp0, W0, curK; \ + ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \ + ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \ + vadd.u32 tmp1, W7, curK; \ + ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \ + vadd.u32 tmp2, W6, curK; \ + vst1.32 {tmp0, tmp1}, [RWK]!; \ + vadd.u32 tmp3, W5, curK; \ + vst1.32 {tmp2, tmp3}, [RWK]; \ + +#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vld1.32 {W0, W7}, [RDATA]!; \ + +#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + add RWK, sp, #(WK_offs(0)); \ + +#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \ + +#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vld1.32 {W6, W5}, [RDATA]!; \ + +#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp0, W0, curK; \ + +#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \ + +#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \ + +#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp1, W7, curK; \ + +#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \ + +#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp2, W6, curK; \ + +#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vst1.32 {tmp0, tmp1}, [RWK]!; \ + +#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp3, W5, curK; \ + +#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vst1.32 {tmp2, tmp3}, [RWK]; \ + + +/********* Precalc macros for rounds 16-31 ************************************/ + +#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor tmp0, tmp0; \ + vext.8 W, W_m16, W_m12, #8; \ + +#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + add RWK, sp, #(WK_offs(i)); \ + vext.8 tmp0, W_m04, tmp0, #4; \ + +#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor tmp0, tmp0, W_m16; \ + veor.32 W, W, W_m08; \ + +#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor tmp1, tmp1; \ + veor W, W, tmp0; \ + +#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vshl.u32 tmp0, W, #1; \ + +#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vext.8 tmp1, tmp1, W, #(16-12); \ + vshr.u32 W, W, #31; \ + +#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vorr tmp0, tmp0, W; \ + vshr.u32 W, tmp1, #30; \ + +#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vshl.u32 tmp1, tmp1, #2; \ + +#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor tmp0, tmp0, W; \ + +#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor W, tmp0, tmp1; \ + +#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp0, W, curK; \ + +#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + 
vst1.32 {tmp0}, [RWK]; + + +/********* Precalc macros for rounds 32-79 ************************************/ + +#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor W, W_m28; \ + +#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vext.8 tmp0, W_m08, W_m04, #8; \ + +#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor W, W_m16; \ + +#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor W, tmp0; \ + +#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + add RWK, sp, #(WK_offs(i&~3)); \ + +#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vshl.u32 tmp1, W, #2; \ + +#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vshr.u32 tmp0, W, #30; \ + +#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vorr W, tmp0, tmp1; \ + +#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp0, W, curK; \ + +#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vst1.32 {tmp0}, [RWK]; + + +/* + * Transform nblocks*64 bytes (nblocks*16 32-bit words) at DATA. + * + * void sha1_transform_neon(struct sha1_block_state *state, + * const u8 *data, size_t nblocks); + */ +.align 3 +ENTRY(sha1_transform_neon) + /* input: + * r0: state + * r1: data (64*nblocks bytes) + * r2: nblocks + */ + + cmp RNBLKS, #0; + beq .Ldo_nothing; + + push {r4-r12, lr}; + /*vpush {q4-q7};*/ + + adr RT3, .LK_VEC; + + mov ROLDSTACK, sp; + + /* Align stack. */ + sub RT0, sp, #(16*4); + and RT0, #(~(16-1)); + mov sp, RT0; + + vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */ + + /* Get the values of the chaining variables. */ + ldm RSTATE, {_a-_e}; + + vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */ + +#undef curK +#define curK qK1 + /* Precalc 0-15. */ + W_PRECALC_00_15(); + +.Loop: + /* Transform 0-15 + Precalc 16-31. 
*/ + _R( _a, _b, _c, _d, _e, F1, 0, + WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16, + W4, W5, W6, W7, W0, _, _, _ ); + _R( _e, _a, _b, _c, _d, F1, 1, + WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16, + W4, W5, W6, W7, W0, _, _, _ ); + _R( _d, _e, _a, _b, _c, F1, 2, + WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16, + W4, W5, W6, W7, W0, _, _, _ ); + _R( _c, _d, _e, _a, _b, F1, 3, + WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16, + W4, W5, W6, W7, W0, _, _, _ ); + +#undef curK +#define curK qK2 + _R( _b, _c, _d, _e, _a, F1, 4, + WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20, + W3, W4, W5, W6, W7, _, _, _ ); + _R( _a, _b, _c, _d, _e, F1, 5, + WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20, + W3, W4, W5, W6, W7, _, _, _ ); + _R( _e, _a, _b, _c, _d, F1, 6, + WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20, + W3, W4, W5, W6, W7, _, _, _ ); + _R( _d, _e, _a, _b, _c, F1, 7, + WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20, + W3, W4, W5, W6, W7, _, _, _ ); + + _R( _c, _d, _e, _a, _b, F1, 8, + WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24, + W2, W3, W4, W5, W6, _, _, _ ); + _R( _b, _c, _d, _e, _a, F1, 9, + WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24, + W2, W3, W4, W5, W6, _, _, _ ); + _R( _a, _b, _c, _d, _e, F1, 10, + WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24, + W2, W3, W4, W5, W6, _, _, _ ); + _R( _e, _a, _b, _c, _d, F1, 11, + WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24, + W2, W3, W4, W5, W6, _, _, _ ); + + _R( _d, _e, _a, _b, _c, F1, 12, + WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28, + W1, W2, W3, W4, W5, _, _, _ ); + _R( _c, _d, _e, _a, _b, F1, 13, + WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28, + W1, W2, W3, W4, W5, _, _, _ ); + _R( _b, _c, _d, _e, _a, F1, 14, + WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28, + W1, W2, W3, W4, W5, _, _, _ ); + _R( _a, _b, _c, _d, _e, F1, 15, + WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28, + W1, W2, W3, W4, W5, _, _, _ ); + + /* Transform 16-63 + Precalc 32-79. 
*/ + _R( _e, _a, _b, _c, _d, F1, 16, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _d, _e, _a, _b, _c, F1, 17, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _c, _d, _e, _a, _b, F1, 18, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 32, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _b, _c, _d, _e, _a, F1, 19, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 32, + W0, W1, W2, W3, W4, W5, W6, W7); + + _R( _a, _b, _c, _d, _e, F2, 20, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _e, _a, _b, _c, _d, F2, 21, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _d, _e, _a, _b, _c, F2, 22, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 36, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _c, _d, _e, _a, _b, F2, 23, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 36, + W7, W0, W1, W2, W3, W4, W5, W6); + +#undef curK +#define curK qK3 + _R( _b, _c, _d, _e, _a, F2, 24, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _a, _b, _c, _d, _e, F2, 25, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _e, _a, _b, _c, _d, F2, 26, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 40, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _d, _e, _a, _b, _c, F2, 27, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 40, + W6, W7, W0, W1, W2, W3, W4, W5); + + _R( _c, _d, _e, _a, _b, F2, 28, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _b, _c, _d, _e, _a, F2, 29, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _a, _b, _c, _d, _e, F2, 30, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 44, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _e, _a, _b, _c, _d, F2, 31, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 44, + W5, W6, W7, W0, W1, W2, W3, W4); + + _R( _d, _e, _a, _b, _c, F2, 32, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48, + W4, W5, W6, W7, W0, W1, W2, W3); + _R( _c, _d, _e, _a, _b, F2, 33, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48, + W4, W5, W6, W7, W0, W1, W2, W3); + _R( _b, _c, _d, _e, _a, F2, 34, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 48, + W4, W5, W6, W7, W0, W1, W2, W3); + _R( _a, _b, _c, _d, _e, F2, 35, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 48, + W4, W5, W6, W7, W0, W1, W2, W3); + + _R( _e, _a, _b, _c, _d, F2, 36, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52, + W3, W4, W5, W6, W7, W0, W1, W2); + _R( _d, _e, _a, _b, _c, F2, 37, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52, + W3, W4, W5, W6, W7, W0, W1, W2); + _R( _c, _d, _e, _a, _b, F2, 38, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 52, + W3, W4, W5, W6, W7, W0, W1, W2); + _R( _b, _c, _d, _e, _a, F2, 39, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 52, + W3, W4, W5, W6, W7, W0, W1, W2); + + _R( _a, _b, _c, _d, _e, F3, 40, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56, + W2, W3, W4, W5, W6, W7, W0, W1); + _R( _e, _a, _b, _c, _d, F3, 41, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56, + W2, W3, W4, W5, W6, W7, W0, W1); + _R( _d, _e, _a, _b, _c, F3, 42, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 56, + W2, W3, W4, W5, W6, W7, W0, W1); + _R( _c, _d, _e, _a, _b, F3, 43, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 56, + W2, W3, W4, W5, W6, W7, W0, W1); + +#undef curK +#define curK qK4 + _R( _b, _c, _d, 
_e, _a, F3, 44, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60, + W1, W2, W3, W4, W5, W6, W7, W0); + _R( _a, _b, _c, _d, _e, F3, 45, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60, + W1, W2, W3, W4, W5, W6, W7, W0); + _R( _e, _a, _b, _c, _d, F3, 46, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 60, + W1, W2, W3, W4, W5, W6, W7, W0); + _R( _d, _e, _a, _b, _c, F3, 47, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 60, + W1, W2, W3, W4, W5, W6, W7, W0); + + _R( _c, _d, _e, _a, _b, F3, 48, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _b, _c, _d, _e, _a, F3, 49, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _a, _b, _c, _d, _e, F3, 50, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 64, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _e, _a, _b, _c, _d, F3, 51, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 64, + W0, W1, W2, W3, W4, W5, W6, W7); + + _R( _d, _e, _a, _b, _c, F3, 52, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _c, _d, _e, _a, _b, F3, 53, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _b, _c, _d, _e, _a, F3, 54, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 68, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _a, _b, _c, _d, _e, F3, 55, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 68, + W7, W0, W1, W2, W3, W4, W5, W6); + + _R( _e, _a, _b, _c, _d, F3, 56, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _d, _e, _a, _b, _c, F3, 57, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _c, _d, _e, _a, _b, F3, 58, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 72, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _b, _c, _d, _e, _a, F3, 59, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 72, + W6, W7, W0, W1, W2, W3, W4, W5); + + subs RNBLKS, #1; + + _R( _a, _b, _c, _d, _e, F4, 60, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _e, _a, _b, _c, _d, F4, 61, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _d, _e, _a, _b, _c, F4, 62, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 76, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _c, _d, _e, _a, _b, F4, 63, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 76, + W5, W6, W7, W0, W1, W2, W3, W4); + + beq .Lend; + + /* Transform 64-79 + Precalc 0-15 of next block. 
*/ +#undef curK +#define curK qK1 + _R( _b, _c, _d, _e, _a, F4, 64, + WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _a, _b, _c, _d, _e, F4, 65, + WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _e, _a, _b, _c, _d, F4, 66, + WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _d, _e, _a, _b, _c, F4, 67, + WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + + _R( _c, _d, _e, _a, _b, F4, 68, + dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _b, _c, _d, _e, _a, F4, 69, + dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _a, _b, _c, _d, _e, F4, 70, + WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _e, _a, _b, _c, _d, F4, 71, + WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + + _R( _d, _e, _a, _b, _c, F4, 72, + dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _c, _d, _e, _a, _b, F4, 73, + dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _b, _c, _d, _e, _a, F4, 74, + WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _a, _b, _c, _d, _e, F4, 75, + WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + + _R( _e, _a, _b, _c, _d, F4, 76, + WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _d, _e, _a, _b, _c, F4, 77, + WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _c, _d, _e, _a, _b, F4, 78, + WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _b, _c, _d, _e, _a, F4, 79, + WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ ); + + /* Update the chaining variables. */ + ldm RSTATE, {RT0-RT3}; + add _a, RT0; + ldr RT0, [RSTATE, #state_h4]; + add _b, RT1; + add _c, RT2; + add _d, RT3; + add _e, RT0; + stm RSTATE, {_a-_e}; + + b .Loop; + +.Lend: + /* Transform 64-79 */ + R( _b, _c, _d, _e, _a, F4, 64 ); + R( _a, _b, _c, _d, _e, F4, 65 ); + R( _e, _a, _b, _c, _d, F4, 66 ); + R( _d, _e, _a, _b, _c, F4, 67 ); + R( _c, _d, _e, _a, _b, F4, 68 ); + R( _b, _c, _d, _e, _a, F4, 69 ); + R( _a, _b, _c, _d, _e, F4, 70 ); + R( _e, _a, _b, _c, _d, F4, 71 ); + R( _d, _e, _a, _b, _c, F4, 72 ); + R( _c, _d, _e, _a, _b, F4, 73 ); + R( _b, _c, _d, _e, _a, F4, 74 ); + R( _a, _b, _c, _d, _e, F4, 75 ); + R( _e, _a, _b, _c, _d, F4, 76 ); + R( _d, _e, _a, _b, _c, F4, 77 ); + R( _c, _d, _e, _a, _b, F4, 78 ); + R( _b, _c, _d, _e, _a, F4, 79 ); + + mov sp, ROLDSTACK; + + /* Update the chaining variables. */ + ldm RSTATE, {RT0-RT3}; + add _a, RT0; + ldr RT0, [RSTATE, #state_h4]; + add _b, RT1; + add _c, RT2; + add _d, RT3; + /*vpop {q4-q7};*/ + add _e, RT0; + stm RSTATE, {_a-_e}; + + pop {r4-r12, pc}; + +.Ldo_nothing: + bx lr +ENDPROC(sha1_transform_neon) diff --git a/lib/crypto/arm/sha1-ce-core.S b/lib/crypto/arm/sha1-ce-core.S new file mode 100644 index 000000000000..2de40dd25e47 --- /dev/null +++ b/lib/crypto/arm/sha1-ce-core.S @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions + * + * Copyright (C) 2015 Linaro Ltd. 
+ * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .arch armv8-a + .fpu crypto-neon-fp-armv8 + + k0 .req q0 + k1 .req q1 + k2 .req q2 + k3 .req q3 + + ta0 .req q4 + ta1 .req q5 + tb0 .req q5 + tb1 .req q4 + + dga .req q6 + dgb .req q7 + dgbs .req s28 + + dg0 .req q12 + dg1a0 .req q13 + dg1a1 .req q14 + dg1b0 .req q14 + dg1b1 .req q13 + + .macro add_only, op, ev, rc, s0, dg1 + .ifnb \s0 + vadd.u32 tb\ev, q\s0, \rc + .endif + sha1h.32 dg1b\ev, dg0 + .ifb \dg1 + sha1\op\().32 dg0, dg1a\ev, ta\ev + .else + sha1\op\().32 dg0, \dg1, ta\ev + .endif + .endm + + .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1 + sha1su0.32 q\s0, q\s1, q\s2 + add_only \op, \ev, \rc, \s1, \dg1 + sha1su1.32 q\s0, q\s3 + .endm + + .align 6 +.Lsha1_rcon: + .word 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999 + .word 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1 + .word 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc + .word 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6 + + /* + * void sha1_ce_transform(struct sha1_block_state *state, + * const u8 *data, size_t nblocks); + */ +ENTRY(sha1_ce_transform) + /* load round constants */ + adr ip, .Lsha1_rcon + vld1.32 {k0-k1}, [ip, :128]! + vld1.32 {k2-k3}, [ip, :128] + + /* load state */ + vld1.32 {dga}, [r0] + vldr dgbs, [r0, #16] + + /* load input */ +0: vld1.32 {q8-q9}, [r1]! + vld1.32 {q10-q11}, [r1]! + subs r2, r2, #1 + +#ifndef CONFIG_CPU_BIG_ENDIAN + vrev32.8 q8, q8 + vrev32.8 q9, q9 + vrev32.8 q10, q10 + vrev32.8 q11, q11 +#endif + + vadd.u32 ta0, q8, k0 + vmov dg0, dga + + add_update c, 0, k0, 8, 9, 10, 11, dgb + add_update c, 1, k0, 9, 10, 11, 8 + add_update c, 0, k0, 10, 11, 8, 9 + add_update c, 1, k0, 11, 8, 9, 10 + add_update c, 0, k1, 8, 9, 10, 11 + + add_update p, 1, k1, 9, 10, 11, 8 + add_update p, 0, k1, 10, 11, 8, 9 + add_update p, 1, k1, 11, 8, 9, 10 + add_update p, 0, k1, 8, 9, 10, 11 + add_update p, 1, k2, 9, 10, 11, 8 + + add_update m, 0, k2, 10, 11, 8, 9 + add_update m, 1, k2, 11, 8, 9, 10 + add_update m, 0, k2, 8, 9, 10, 11 + add_update m, 1, k2, 9, 10, 11, 8 + add_update m, 0, k3, 10, 11, 8, 9 + + add_update p, 1, k3, 11, 8, 9, 10 + add_only p, 0, k3, 9 + add_only p, 1, k3, 10 + add_only p, 0, k3, 11 + add_only p, 1 + + /* update state */ + vadd.u32 dga, dga, dg0 + vadd.u32 dgb, dgb, dg1a0 + bne 0b + + /* store new state */ + vst1.32 {dga}, [r0] + vstr dgbs, [r0, #16] + bx lr +ENDPROC(sha1_ce_transform) diff --git a/lib/crypto/arm/sha1.h b/lib/crypto/arm/sha1.h new file mode 100644 index 000000000000..fa1e92419000 --- /dev/null +++ b/lib/crypto/arm/sha1.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-1 optimized for ARM + * + * Copyright 2025 Google LLC + */ +#include <asm/neon.h> +#include <asm/simd.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); + +asmlinkage void sha1_block_data_order(struct sha1_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha1_transform_neon(struct sha1_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha1_ce_transform(struct sha1_block_state *state, + const u8 *data, size_t nblocks); + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon) && likely(may_use_simd())) { + kernel_neon_begin(); + if (static_branch_likely(&have_ce)) + sha1_ce_transform(state, data, nblocks); + else + 
sha1_transform_neon(state, data, nblocks); + kernel_neon_end(); + } else { + sha1_block_data_order(state, data, nblocks); + } +} + +#ifdef CONFIG_KERNEL_MODE_NEON +#define sha1_mod_init_arch sha1_mod_init_arch +static inline void sha1_mod_init_arch(void) +{ + if (elf_hwcap & HWCAP_NEON) { + static_branch_enable(&have_neon); + if (elf_hwcap2 & HWCAP2_SHA1) + static_branch_enable(&have_ce); + } +} +#endif /* CONFIG_KERNEL_MODE_NEON */ diff --git a/lib/crypto/arm/sha256-armv4.pl b/lib/crypto/arm/sha256-armv4.pl new file mode 100644 index 000000000000..f3a2b54efd4e --- /dev/null +++ b/lib/crypto/arm/sha256-armv4.pl @@ -0,0 +1,724 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 + +# This code is taken from the OpenSSL project but the author (Andy Polyakov) +# has relicensed it under the GPLv2. Therefore this program is free software; +# you can redistribute it and/or modify it under the terms of the GNU General +# Public License version 2 as published by the Free Software Foundation. +# +# The original headers, including the original license headers, are +# included below for completeness. + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see https://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA256 block procedure for ARMv4. May 2007. + +# Performance is ~2x better than gcc 3.4 generated code and in "abso- +# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per +# byte [on single-issue Xscale PXA250 core]. + +# July 2010. +# +# Rescheduling for dual-issue pipeline resulted in 22% improvement on +# Cortex A8 core and ~20 cycles per processed byte. + +# February 2011. +# +# Profiler-assisted and platform-specific optimization resulted in 16% +# improvement on Cortex A8 core and ~15.4 cycles per processed byte. + +# September 2013. +# +# Add NEON implementation. On Cortex A8 it was measured to process one +# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon +# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only +# code (meaning that latter performs sub-optimally, nothing was done +# about it). + +# May 2014. +# +# Add ARMv8 code path performing at 2.0 cpb on Apple A7. 
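
Editor's reference, not part of the patch: the BODY_00_15/BODY_16_XX generators below emit one SHA-256 round per invocation, using the rotation constants @Sigma0=(2,13,22), @Sigma1=(6,11,25), @sigma0=(7,18,3), @sigma1=(17,19,10). A minimal C rendering of a single compression round (standard FIPS 180-4 form; function and variable names are illustrative) looks roughly like this:

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n)
{
	return (x >> n) | (x << (32 - n));
}

/* One SHA-256 round: s[0..7] = a..h, w = message word W[i], k = K256[i] */
static void sha256_round_sketch(uint32_t s[8], uint32_t w, uint32_t k)
{
	uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
	uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
	uint32_t S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25); /* Sigma1(e) */
	uint32_t ch  = g ^ (e & (f ^ g));                         /* Ch(e,f,g) */
	uint32_t t1  = h + S1 + ch + k + w;
	uint32_t S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22); /* Sigma0(a) */
	uint32_t maj = (a & b) | (c & (a | b));                   /* Maj(a,b,c) */

	/* rotate the working variables: e' = d + t1, a' = t1 + Sigma0(a) + Maj */
	s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
	s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + (S0 + maj);
}

The perlasm interleaves exactly these operations with the message-schedule loads, which is why the generated code is harder to read than the straight-line form above.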
+ +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$ctx="r0"; $t0="r0"; +$inp="r1"; $t4="r1"; +$len="r2"; $t1="r2"; +$T1="r3"; $t3="r3"; +$A="r4"; +$B="r5"; +$C="r6"; +$D="r7"; +$E="r8"; +$F="r9"; +$G="r10"; +$H="r11"; +@V=($A,$B,$C,$D,$E,$F,$G,$H); +$t2="r12"; +$Ktbl="r14"; + +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); + +sub BODY_00_15 { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___ if ($i<16); +#if __ARM_ARCH__>=7 + @ ldr $t1,[$inp],#4 @ $i +# if $i==15 + str $inp,[sp,#17*4] @ make room for $t4 +# endif + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) +# ifndef __ARMEB__ + rev $t1,$t1 +# endif +#else + @ ldrb $t1,[$inp,#3] @ $i + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + ldrb $t2,[$inp,#2] + ldrb $t0,[$inp,#1] + orr $t1,$t1,$t2,lsl#8 + ldrb $t2,[$inp],#4 + orr $t1,$t1,$t0,lsl#16 +# if $i==15 + str $inp,[sp,#17*4] @ make room for $t4 +# endif + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` + orr $t1,$t1,$t2,lsl#24 + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) +#endif +___ +$code.=<<___; + ldr $t2,[$Ktbl],#4 @ *K256++ + add $h,$h,$t1 @ h+=X[i] + str $t1,[sp,#`$i%16`*4] + eor $t1,$f,$g + add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e) + and $t1,$t1,$e + add $h,$h,$t2 @ h+=K256[i] + eor $t1,$t1,$g @ Ch(e,f,g) + eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]` + add $h,$h,$t1 @ h+=Ch(e,f,g) +#if $i==31 + and $t2,$t2,#0xff + cmp $t2,#0xf2 @ done? +#endif +#if $i<15 +# if __ARM_ARCH__>=7 + ldr $t1,[$inp],#4 @ prefetch +# else + ldrb $t1,[$inp,#3] +# endif + eor $t2,$a,$b @ a^b, b^c in next round +#else + ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx + eor $t2,$a,$b @ a^b, b^c in next round + ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx +#endif + eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a) + and $t3,$t3,$t2 @ (b^c)&=(a^b) + add $d,$d,$h @ d+=h + eor $t3,$t3,$b @ Maj(a,b,c) + add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a) + @ add $h,$h,$t3 @ h+=Maj(a,b,c) +___ + ($t2,$t3)=($t3,$t2); +} + +sub BODY_16_XX { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___; + @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i + @ ldr $t4,[sp,#`($i+14)%16`*4] + mov $t0,$t1,ror#$sigma0[0] + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + mov $t2,$t4,ror#$sigma1[0] + eor $t0,$t0,$t1,ror#$sigma0[1] + eor $t2,$t2,$t4,ror#$sigma1[1] + eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) + ldr $t1,[sp,#`($i+0)%16`*4] + eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14]) + ldr $t4,[sp,#`($i+9)%16`*4] + + add $t2,$t2,$t0 + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15 + add $t1,$t1,$t2 + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) + add $t1,$t1,$t4 @ X[i] +___ + &BODY_00_15(@_); +} + +$code=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +#endif + +.text +#if __ARM_ARCH__<7 +.code 32 +#else +.syntax unified +# ifdef __thumb2__ +.thumb +# else +.code 32 +# endif +#endif + +.type K256,%object +.align 5 +K256: +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.word 
0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.size K256,.-K256 +.word 0 @ terminator +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-sha256_block_data_order +#endif +.align 5 + +.global sha256_block_data_order +.type sha256_block_data_order,%function +sha256_block_data_order: +.Lsha256_block_data_order: +#if __ARM_ARCH__<7 + sub r3,pc,#8 @ sha256_block_data_order +#else + adr r3,.Lsha256_block_data_order +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P + tst r12,#ARMV8_SHA256 + bne .LARMv8 + tst r12,#ARMV7_NEON + bne .LNEON +#endif + add $len,$inp,$len,lsl#6 @ len to point at the end of inp + stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} + ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} + sub $Ktbl,r3,#256+32 @ K256 + sub sp,sp,#16*4 @ alloca(X[16]) +.Loop: +# if __ARM_ARCH__>=7 + ldr $t1,[$inp],#4 +# else + ldrb $t1,[$inp,#3] +# endif + eor $t3,$B,$C @ magic + eor $t2,$t2,$t2 +___ +for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=".Lrounds_16_xx:\n"; +for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; +#if __ARM_ARCH__>=7 + ite eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq $t3,[sp,#16*4] @ pull ctx + bne .Lrounds_16_xx + + add $A,$A,$t2 @ h+=Maj(a,b,c) from the past + ldr $t0,[$t3,#0] + ldr $t1,[$t3,#4] + ldr $t2,[$t3,#8] + add $A,$A,$t0 + ldr $t0,[$t3,#12] + add $B,$B,$t1 + ldr $t1,[$t3,#16] + add $C,$C,$t2 + ldr $t2,[$t3,#20] + add $D,$D,$t0 + ldr $t0,[$t3,#24] + add $E,$E,$t1 + ldr $t1,[$t3,#28] + add $F,$F,$t2 + ldr $inp,[sp,#17*4] @ pull inp + ldr $t2,[sp,#18*4] @ pull inp+len + add $G,$G,$t0 + add $H,$H,$t1 + stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H} + cmp $inp,$t2 + sub $Ktbl,$Ktbl,#256 @ rewind Ktbl + bne .Loop + + add sp,sp,#`16+3`*4 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size sha256_block_data_order,.-sha256_block_data_order +___ +###################################################################### +# NEON stuff +# +{{{ +my @X=map("q$_",(0..3)); +my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25"); +my $Xfer=$t4; +my $j=0; + +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub Xupdate() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + &vext_8 ($T0,@X[0],@X[1],4); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vext_8 ($T1,@X[2],@X[3],4); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T2,$T0,$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 
($T1,$T0,$sigma0[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T2,$T0,32-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T3,$T0,$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T1,$T1,$T2); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T3,$T0,32-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T1,$T1,$T3); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vld1_32 ("{$T0}","[$Ktbl,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); # sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 ($T0,$T0,@X[0]); + while($#insns>=2) { eval(shift(@insns)); } + &vst1_32 ("{$T0}","[$Xfer,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub Xpreload() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vld1_32 ("{$T0}","[$Ktbl,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vrev32_8 (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 ($T0,$T0,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &vst1_32 ("{$T0}","[$Xfer,:128]!"); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. 
+ '&add ($h,$h,$t1)', # h+=X[i]+K[i] + '&eor ($t1,$f,$g)', + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past + '&and ($t1,$t1,$e)', + '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) + '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', + '&eor ($t1,$t1,$g)', # Ch(e,f,g) + '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e) + '&eor ($t2,$a,$b)', # a^b, b^c in next round + '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) + '&add ($h,$h,$t1)', # h+=Ch(e,f,g) + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. + '&ldr ($t1,"[$Ktbl]") if ($j==15);'. + '&ldr ($t1,"[sp,#64]") if ($j==31)', + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) + '&add ($d,$d,$h)', # d+=h + '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a) + '&eor ($t3,$t3,$b)', # Maj(a,b,c) + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' + ) +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.global sha256_block_data_order_neon +.type sha256_block_data_order_neon,%function +.align 4 +sha256_block_data_order_neon: +.LNEON: + stmdb sp!,{r4-r12,lr} + + sub $H,sp,#16*4+16 + adr $Ktbl,.Lsha256_block_data_order + sub $Ktbl,$Ktbl,#.Lsha256_block_data_order-K256 + bic $H,$H,#15 @ align for 128-bit stores + mov $t2,sp + mov sp,$H @ alloca + add $len,$inp,$len,lsl#6 @ len to point at the end of inp + + vld1.8 {@X[0]},[$inp]! + vld1.8 {@X[1]},[$inp]! + vld1.8 {@X[2]},[$inp]! + vld1.8 {@X[3]},[$inp]! + vld1.32 {$T0},[$Ktbl,:128]! + vld1.32 {$T1},[$Ktbl,:128]! + vld1.32 {$T2},[$Ktbl,:128]! + vld1.32 {$T3},[$Ktbl,:128]! + vrev32.8 @X[0],@X[0] @ yes, even on + str $ctx,[sp,#64] + vrev32.8 @X[1],@X[1] @ big-endian + str $inp,[sp,#68] + mov $Xfer,sp + vrev32.8 @X[2],@X[2] + str $len,[sp,#72] + vrev32.8 @X[3],@X[3] + str $t2,[sp,#76] @ save original sp + vadd.i32 $T0,$T0,@X[0] + vadd.i32 $T1,$T1,@X[1] + vst1.32 {$T0},[$Xfer,:128]! + vadd.i32 $T2,$T2,@X[2] + vst1.32 {$T1},[$Xfer,:128]! + vadd.i32 $T3,$T3,@X[3] + vst1.32 {$T2},[$Xfer,:128]! + vst1.32 {$T3},[$Xfer,:128]! + + ldmia $ctx,{$A-$H} + sub $Xfer,$Xfer,#64 + ldr $t1,[sp,#0] + eor $t2,$t2,$t2 + eor $t3,$B,$C + b .L_00_48 + +.align 4 +.L_00_48: +___ + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); +$code.=<<___; + teq $t1,#0 @ check for K256 terminator + ldr $t1,[sp,#0] + sub $Xfer,$Xfer,#64 + bne .L_00_48 + + ldr $inp,[sp,#68] + ldr $t0,[sp,#72] + sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl + teq $inp,$t0 + it eq + subeq $inp,$inp,#64 @ avoid SEGV + vld1.8 {@X[0]},[$inp]! @ load next input block + vld1.8 {@X[1]},[$inp]! + vld1.8 {@X[2]},[$inp]! + vld1.8 {@X[3]},[$inp]! 
+ it ne + strne $inp,[sp,#68] + mov $Xfer,sp +___ + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); +$code.=<<___; + ldr $t0,[$t1,#0] + add $A,$A,$t2 @ h+=Maj(a,b,c) from the past + ldr $t2,[$t1,#4] + ldr $t3,[$t1,#8] + ldr $t4,[$t1,#12] + add $A,$A,$t0 @ accumulate + ldr $t0,[$t1,#16] + add $B,$B,$t2 + ldr $t2,[$t1,#20] + add $C,$C,$t3 + ldr $t3,[$t1,#24] + add $D,$D,$t4 + ldr $t4,[$t1,#28] + add $E,$E,$t0 + str $A,[$t1],#4 + add $F,$F,$t2 + str $B,[$t1],#4 + add $G,$G,$t3 + str $C,[$t1],#4 + add $H,$H,$t4 + str $D,[$t1],#4 + stmia $t1,{$E-$H} + + ittte ne + movne $Xfer,sp + ldrne $t1,[sp,#0] + eorne $t2,$t2,$t2 + ldreq sp,[sp,#76] @ restore original sp + itt ne + eorne $t3,$B,$C + bne .L_00_48 + + ldmia sp!,{r4-r12,pc} +.size sha256_block_data_order_neon,.-sha256_block_data_order_neon +#endif +___ +}}} +###################################################################### +# ARMv8 stuff +# +{{{ +my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2)); +my @MSG=map("q$_",(8..11)); +my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15)); +my $Ktbl="r3"; + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + +# ifdef __thumb2__ +# define INST(a,b,c,d) .byte c,d|0xc,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d +# endif + +.type sha256_block_data_order_armv8,%function +.align 5 +sha256_block_data_order_armv8: +.LARMv8: + vld1.32 {$ABCD,$EFGH},[$ctx] +# ifdef __thumb2__ + adr $Ktbl,.LARMv8 + sub $Ktbl,$Ktbl,#.LARMv8-K256 +# else + adrl $Ktbl,K256 +# endif + add $len,$inp,$len,lsl#6 @ len to point at the end of inp + +.Loop_v8: + vld1.8 {@MSG[0]-@MSG[1]},[$inp]! + vld1.8 {@MSG[2]-@MSG[3]},[$inp]! + vld1.32 {$W0},[$Ktbl]! + vrev32.8 @MSG[0],@MSG[0] + vrev32.8 @MSG[1],@MSG[1] + vrev32.8 @MSG[2],@MSG[2] + vrev32.8 @MSG[3],@MSG[3] + vmov $ABCD_SAVE,$ABCD @ offload + vmov $EFGH_SAVE,$EFGH + teq $inp,$len +___ +for($i=0;$i<12;$i++) { +$code.=<<___; + vld1.32 {$W1},[$Ktbl]! + vadd.i32 $W0,$W0,@MSG[0] + sha256su0 @MSG[0],@MSG[1] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + sha256su1 @MSG[0],@MSG[2],@MSG[3] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); +} +$code.=<<___; + vld1.32 {$W1},[$Ktbl]! + vadd.i32 $W0,$W0,@MSG[0] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + vld1.32 {$W0},[$Ktbl]! 
+ vadd.i32 $W1,$W1,@MSG[1] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + vld1.32 {$W1},[$Ktbl] + vadd.i32 $W0,$W0,@MSG[2] + sub $Ktbl,$Ktbl,#256-16 @ rewind + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + vadd.i32 $W1,$W1,@MSG[3] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + vadd.i32 $ABCD,$ABCD,$ABCD_SAVE + vadd.i32 $EFGH,$EFGH,$EFGH_SAVE + it ne + bne .Loop_v8 + + vst1.32 {$ABCD,$EFGH},[$ctx] + + ret @ bx lr +.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8 +#endif +___ +}}} +$code.=<<___; +.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.comm OPENSSL_armcap_P,4,4 +#endif +___ + +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + +{ my %opcode = ( + "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40, + "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 ); + + sub unsha256 { + my ($mnemonic,$arg)=@_; + + if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) { + my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) + |(($2&7)<<17)|(($2&8)<<4) + |(($3&7)<<1) |(($3&8)<<2); + # since ARMv7 instructions are always encoded little-endian. + # correct solution is to use .inst directive, but older + # assemblers don't implement it:-( + sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s", + $word&0xff,($word>>8)&0xff, + ($word>>16)&0xff,($word>>24)&0xff, + $mnemonic,$arg; + } + } +} + +foreach (split($/,$code)) { + + s/\`([^\`]*)\`/eval $1/geo; + + s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo; + + s/\bret\b/bx lr/go or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} + +close STDOUT; # enforce flush diff --git a/lib/crypto/arm/sha256-ce.S b/lib/crypto/arm/sha256-ce.S new file mode 100644 index 000000000000..7481ac8e6c0d --- /dev/null +++ b/lib/crypto/arm/sha256-ce.S @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sha256-ce.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions + * + * Copyright (C) 2015 Linaro Ltd. + * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .arch armv8-a + .fpu crypto-neon-fp-armv8 + + k0 .req q7 + k1 .req q8 + rk .req r3 + + ta0 .req q9 + ta1 .req q10 + tb0 .req q10 + tb1 .req q9 + + dga .req q11 + dgb .req q12 + + dg0 .req q13 + dg1 .req q14 + dg2 .req q15 + + .macro add_only, ev, s0 + vmov dg2, dg0 + .ifnb \s0 + vld1.32 {k\ev}, [rk, :128]! 
+ .endif + sha256h.32 dg0, dg1, tb\ev + sha256h2.32 dg1, dg2, tb\ev + .ifnb \s0 + vadd.u32 ta\ev, q\s0, k\ev + .endif + .endm + + .macro add_update, ev, s0, s1, s2, s3 + sha256su0.32 q\s0, q\s1 + add_only \ev, \s1 + sha256su1.32 q\s0, q\s2, q\s3 + .endm + + .align 6 +.Lsha256_rcon: + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + + /* + * void sha256_ce_transform(struct sha256_block_state *state, + * const u8 *data, size_t nblocks); + */ +ENTRY(sha256_ce_transform) + /* load state */ + vld1.32 {dga-dgb}, [r0] + + /* load input */ +0: vld1.32 {q0-q1}, [r1]! + vld1.32 {q2-q3}, [r1]! + subs r2, r2, #1 + +#ifndef CONFIG_CPU_BIG_ENDIAN + vrev32.8 q0, q0 + vrev32.8 q1, q1 + vrev32.8 q2, q2 + vrev32.8 q3, q3 +#endif + + /* load first round constant */ + adr rk, .Lsha256_rcon + vld1.32 {k0}, [rk, :128]! + + vadd.u32 ta0, q0, k0 + vmov dg0, dga + vmov dg1, dgb + + add_update 1, 0, 1, 2, 3 + add_update 0, 1, 2, 3, 0 + add_update 1, 2, 3, 0, 1 + add_update 0, 3, 0, 1, 2 + add_update 1, 0, 1, 2, 3 + add_update 0, 1, 2, 3, 0 + add_update 1, 2, 3, 0, 1 + add_update 0, 3, 0, 1, 2 + add_update 1, 0, 1, 2, 3 + add_update 0, 1, 2, 3, 0 + add_update 1, 2, 3, 0, 1 + add_update 0, 3, 0, 1, 2 + + add_only 1, 1 + add_only 0, 2 + add_only 1, 3 + add_only 0 + + /* update state */ + vadd.u32 dga, dga, dg0 + vadd.u32 dgb, dgb, dg1 + bne 0b + + /* store new state */ + vst1.32 {dga-dgb}, [r0] + bx lr +ENDPROC(sha256_ce_transform) diff --git a/lib/crypto/arm/sha256.h b/lib/crypto/arm/sha256.h new file mode 100644 index 000000000000..da75cbdc51d4 --- /dev/null +++ b/lib/crypto/arm/sha256.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-256 optimized for ARM + * + * Copyright 2025 Google LLC + */ +#include <asm/neon.h> +#include <crypto/internal/simd.h> + +asmlinkage void sha256_block_data_order(struct sha256_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha256_block_data_order_neon(struct sha256_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha256_ce_transform(struct sha256_block_state *state, + const u8 *data, size_t nblocks); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); + +static void sha256_blocks(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon) && crypto_simd_usable()) { + kernel_neon_begin(); + if (static_branch_likely(&have_ce)) + sha256_ce_transform(state, data, nblocks); + else + sha256_block_data_order_neon(state, data, nblocks); + kernel_neon_end(); + } else { + sha256_block_data_order(state, data, nblocks); + } +} + +#ifdef 
CONFIG_KERNEL_MODE_NEON +#define sha256_mod_init_arch sha256_mod_init_arch +static inline void sha256_mod_init_arch(void) +{ + if (elf_hwcap & HWCAP_NEON) { + static_branch_enable(&have_neon); + if (elf_hwcap2 & HWCAP2_SHA2) + static_branch_enable(&have_ce); + } +} +#endif /* CONFIG_KERNEL_MODE_NEON */ diff --git a/lib/crypto/arm/sha512-armv4.pl b/lib/crypto/arm/sha512-armv4.pl new file mode 100644 index 000000000000..2fc3516912fa --- /dev/null +++ b/lib/crypto/arm/sha512-armv4.pl @@ -0,0 +1,657 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 + +# This code is taken from the OpenSSL project but the author (Andy Polyakov) +# has relicensed it under the GPLv2. Therefore this program is free software; +# you can redistribute it and/or modify it under the terms of the GNU General +# Public License version 2 as published by the Free Software Foundation. +# +# The original headers, including the original license headers, are +# included below for completeness. + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see https://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA512 block procedure for ARMv4. September 2007. + +# This code is ~4.5 (four and a half) times faster than code generated +# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue +# Xscale PXA250 core]. +# +# July 2010. +# +# Rescheduling for dual-issue pipeline resulted in 6% improvement on +# Cortex A8 core and ~40 cycles per processed byte. + +# February 2011. +# +# Profiler-assisted and platform-specific optimization resulted in 7% +# improvement on Coxtex A8 core and ~38 cycles per byte. + +# March 2011. +# +# Add NEON implementation. On Cortex A8 it was measured to process +# one byte in 23.3 cycles or ~60% faster than integer-only code. + +# August 2012. +# +# Improve NEON performance by 12% on Snapdragon S4. In absolute +# terms it's 22.6 cycles per byte, which is disappointing result. +# Technical writers asserted that 3-way S4 pipeline can sustain +# multiple NEON instructions per cycle, but dual NEON issue could +# not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html +# for further details. On side note Cortex-A15 processes one byte in +# 16 cycles. + +# Byte order [in]dependence. ========================================= +# +# Originally caller was expected to maintain specific *dword* order in +# h[0-7], namely with most significant dword at *lower* address, which +# was reflected in below two parameters as 0 and 4. Now caller is +# expected to maintain native byte order for whole 64-bit values. 
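
Editor's illustration, not part of the patch: the scalar code below keeps each 64-bit quantity as a hi:lo pair of 32-bit registers, so every 64-bit rotation is evaluated half by half, exactly as spelled out in the "@ LO"/"@ HI" comments of BODY_00_15. For example, Sigma1(x) = ROTR64(x,14) ^ ROTR64(x,18) ^ ROTR64(x,41) decomposes as follows (function name is illustrative; XOR and OR coincide here because the shifted halves never overlap):

#include <stdint.h>

static void sigma1_512_split(uint32_t hi, uint32_t lo,
			     uint32_t *out_hi, uint32_t *out_lo)
{
	/* ROTR64 by n < 32:  lo' = lo>>n | hi<<(32-n),  hi' = hi>>n | lo<<(32-n) */
	/* ROTR64 by 41 = 32+9: the halves swap first, then rotate by 9 */
	*out_lo = ((lo >> 14) | (hi << 18)) ^
		  ((lo >> 18) | (hi << 14)) ^
		  ((hi >>  9) | (lo << 23));
	*out_hi = ((hi >> 14) | (lo << 18)) ^
		  ((hi >> 18) | (lo << 14)) ^
		  ((lo >>  9) | (hi << 23));
}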
+$hi="HI"; +$lo="LO"; +# ==================================================================== + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$ctx="r0"; # parameter block +$inp="r1"; +$len="r2"; + +$Tlo="r3"; +$Thi="r4"; +$Alo="r5"; +$Ahi="r6"; +$Elo="r7"; +$Ehi="r8"; +$t0="r9"; +$t1="r10"; +$t2="r11"; +$t3="r12"; +############ r13 is stack pointer +$Ktbl="r14"; +############ r15 is program counter + +$Aoff=8*0; +$Boff=8*1; +$Coff=8*2; +$Doff=8*3; +$Eoff=8*4; +$Foff=8*5; +$Goff=8*6; +$Hoff=8*7; +$Xoff=8*8; + +sub BODY_00_15() { +my $magic = shift; +$code.=<<___; + @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov $t0,$Elo,lsr#14 + str $Tlo,[sp,#$Xoff+0] + mov $t1,$Ehi,lsr#14 + str $Thi,[sp,#$Xoff+4] + eor $t0,$t0,$Ehi,lsl#18 + ldr $t2,[sp,#$Hoff+0] @ h.lo + eor $t1,$t1,$Elo,lsl#18 + ldr $t3,[sp,#$Hoff+4] @ h.hi + eor $t0,$t0,$Elo,lsr#18 + eor $t1,$t1,$Ehi,lsr#18 + eor $t0,$t0,$Ehi,lsl#14 + eor $t1,$t1,$Elo,lsl#14 + eor $t0,$t0,$Ehi,lsr#9 + eor $t1,$t1,$Elo,lsr#9 + eor $t0,$t0,$Elo,lsl#23 + eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e) + adds $Tlo,$Tlo,$t0 + ldr $t0,[sp,#$Foff+0] @ f.lo + adc $Thi,$Thi,$t1 @ T += Sigma1(e) + ldr $t1,[sp,#$Foff+4] @ f.hi + adds $Tlo,$Tlo,$t2 + ldr $t2,[sp,#$Goff+0] @ g.lo + adc $Thi,$Thi,$t3 @ T += h + ldr $t3,[sp,#$Goff+4] @ g.hi + + eor $t0,$t0,$t2 + str $Elo,[sp,#$Eoff+0] + eor $t1,$t1,$t3 + str $Ehi,[sp,#$Eoff+4] + and $t0,$t0,$Elo + str $Alo,[sp,#$Aoff+0] + and $t1,$t1,$Ehi + str $Ahi,[sp,#$Aoff+4] + eor $t0,$t0,$t2 + ldr $t2,[$Ktbl,#$lo] @ K[i].lo + eor $t1,$t1,$t3 @ Ch(e,f,g) + ldr $t3,[$Ktbl,#$hi] @ K[i].hi + + adds $Tlo,$Tlo,$t0 + ldr $Elo,[sp,#$Doff+0] @ d.lo + adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) + ldr $Ehi,[sp,#$Doff+4] @ d.hi + adds $Tlo,$Tlo,$t2 + and $t0,$t2,#0xff + adc $Thi,$Thi,$t3 @ T += K[i] + adds $Elo,$Elo,$Tlo + ldr $t2,[sp,#$Boff+0] @ b.lo + adc $Ehi,$Ehi,$Thi @ d += T + teq $t0,#$magic + + ldr $t3,[sp,#$Coff+0] @ c.lo +#if __ARM_ARCH__>=7 + it eq @ Thumb2 thing, sanity check in ARM +#endif + orreq $Ktbl,$Ktbl,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov $t0,$Alo,lsr#28 + mov $t1,$Ahi,lsr#28 + eor $t0,$t0,$Ahi,lsl#4 + eor $t1,$t1,$Alo,lsl#4 + eor $t0,$t0,$Ahi,lsr#2 + eor $t1,$t1,$Alo,lsr#2 + eor $t0,$t0,$Alo,lsl#30 + eor $t1,$t1,$Ahi,lsl#30 + eor $t0,$t0,$Ahi,lsr#7 + eor $t1,$t1,$Alo,lsr#7 + eor $t0,$t0,$Alo,lsl#25 + eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a) + adds $Tlo,$Tlo,$t0 + and $t0,$Alo,$t2 + adc $Thi,$Thi,$t1 @ T += Sigma0(a) + + ldr $t1,[sp,#$Boff+4] @ b.hi + orr $Alo,$Alo,$t2 + ldr $t2,[sp,#$Coff+4] @ c.hi + and $Alo,$Alo,$t3 + and $t3,$Ahi,$t1 + orr $Ahi,$Ahi,$t1 + orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo + and $Ahi,$Ahi,$t2 + adds $Alo,$Alo,$Tlo + orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc $Ahi,$Ahi,$Thi @ h += T + tst $Ktbl,#1 + add $Ktbl,$Ktbl,#8 +___ +} +$code=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +# define VFP_ABI_PUSH +# define VFP_ABI_POP +#endif + +#ifdef __ARMEL__ +# define LO 0 +# define HI 4 +# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 +#else +# define HI 0 +# define LO 4 +# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 +#endif + +.text +#if __ARM_ARCH__<7 
+.code 32 +#else +.syntax unified +# ifdef __thumb2__ +.thumb +# else +.code 32 +# endif +#endif + +.type K512,%object +.align 5 +K512: +WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) +WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) +WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) +WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) +WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) +WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) +WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) +WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) +WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) +WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) +WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) +WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) +WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) +WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) +WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) +WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) +WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) +WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) +WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) +WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) +WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) +WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) +WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) +WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) +WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) +WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) +WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) +WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) +WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) +WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) +WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) +WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) +WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) +WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) +WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) +WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) +WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) +WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) +WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) +WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) +.size K512,.-K512 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-sha512_block_data_order +.skip 32-4 +#else +.skip 32 +#endif + +.global sha512_block_data_order +.type sha512_block_data_order,%function +sha512_block_data_order: +.Lsha512_block_data_order: +#if __ARM_ARCH__<7 + sub r3,pc,#8 @ sha512_block_data_order +#else + adr r3,.Lsha512_block_data_order +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P + tst r12,#1 + bne .LNEON +#endif + add $len,$inp,$len,lsl#7 @ len to point at the end of inp + stmdb sp!,{r4-r12,lr} + sub $Ktbl,r3,#672 @ K512 + sub sp,sp,#9*8 + + ldr $Elo,[$ctx,#$Eoff+$lo] + ldr $Ehi,[$ctx,#$Eoff+$hi] + ldr $t0, [$ctx,#$Goff+$lo] + ldr $t1, [$ctx,#$Goff+$hi] + ldr $t2, [$ctx,#$Hoff+$lo] + ldr $t3, [$ctx,#$Hoff+$hi] +.Loop: + str $t0, [sp,#$Goff+0] + str $t1, [sp,#$Goff+4] + str $t2, [sp,#$Hoff+0] + str $t3, [sp,#$Hoff+4] + ldr $Alo,[$ctx,#$Aoff+$lo] + ldr $Ahi,[$ctx,#$Aoff+$hi] + ldr $Tlo,[$ctx,#$Boff+$lo] + ldr $Thi,[$ctx,#$Boff+$hi] + ldr $t0, [$ctx,#$Coff+$lo] + ldr $t1, [$ctx,#$Coff+$hi] + ldr $t2, [$ctx,#$Doff+$lo] + ldr $t3, [$ctx,#$Doff+$hi] + str $Tlo,[sp,#$Boff+0] + str $Thi,[sp,#$Boff+4] + 
str $t0, [sp,#$Coff+0] + str $t1, [sp,#$Coff+4] + str $t2, [sp,#$Doff+0] + str $t3, [sp,#$Doff+4] + ldr $Tlo,[$ctx,#$Foff+$lo] + ldr $Thi,[$ctx,#$Foff+$hi] + str $Tlo,[sp,#$Foff+0] + str $Thi,[sp,#$Foff+4] + +.L00_15: +#if __ARM_ARCH__<7 + ldrb $Tlo,[$inp,#7] + ldrb $t0, [$inp,#6] + ldrb $t1, [$inp,#5] + ldrb $t2, [$inp,#4] + ldrb $Thi,[$inp,#3] + ldrb $t3, [$inp,#2] + orr $Tlo,$Tlo,$t0,lsl#8 + ldrb $t0, [$inp,#1] + orr $Tlo,$Tlo,$t1,lsl#16 + ldrb $t1, [$inp],#8 + orr $Tlo,$Tlo,$t2,lsl#24 + orr $Thi,$Thi,$t3,lsl#8 + orr $Thi,$Thi,$t0,lsl#16 + orr $Thi,$Thi,$t1,lsl#24 +#else + ldr $Tlo,[$inp,#4] + ldr $Thi,[$inp],#8 +#ifdef __ARMEL__ + rev $Tlo,$Tlo + rev $Thi,$Thi +#endif +#endif +___ + &BODY_00_15(0x94); +$code.=<<___; + tst $Ktbl,#1 + beq .L00_15 + ldr $t0,[sp,#`$Xoff+8*(16-1)`+0] + ldr $t1,[sp,#`$Xoff+8*(16-1)`+4] + bic $Ktbl,$Ktbl,#1 +.L16_79: + @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) + @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 + @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 + mov $Tlo,$t0,lsr#1 + ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] + mov $Thi,$t1,lsr#1 + ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] + eor $Tlo,$Tlo,$t1,lsl#31 + eor $Thi,$Thi,$t0,lsl#31 + eor $Tlo,$Tlo,$t0,lsr#8 + eor $Thi,$Thi,$t1,lsr#8 + eor $Tlo,$Tlo,$t1,lsl#24 + eor $Thi,$Thi,$t0,lsl#24 + eor $Tlo,$Tlo,$t0,lsr#7 + eor $Thi,$Thi,$t1,lsr#7 + eor $Tlo,$Tlo,$t1,lsl#25 + + @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) + @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 + @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 + mov $t0,$t2,lsr#19 + mov $t1,$t3,lsr#19 + eor $t0,$t0,$t3,lsl#13 + eor $t1,$t1,$t2,lsl#13 + eor $t0,$t0,$t3,lsr#29 + eor $t1,$t1,$t2,lsr#29 + eor $t0,$t0,$t2,lsl#3 + eor $t1,$t1,$t3,lsl#3 + eor $t0,$t0,$t2,lsr#6 + eor $t1,$t1,$t3,lsr#6 + ldr $t2,[sp,#`$Xoff+8*(16-9)`+0] + eor $t0,$t0,$t3,lsl#26 + + ldr $t3,[sp,#`$Xoff+8*(16-9)`+4] + adds $Tlo,$Tlo,$t0 + ldr $t0,[sp,#`$Xoff+8*16`+0] + adc $Thi,$Thi,$t1 + + ldr $t1,[sp,#`$Xoff+8*16`+4] + adds $Tlo,$Tlo,$t2 + adc $Thi,$Thi,$t3 + adds $Tlo,$Tlo,$t0 + adc $Thi,$Thi,$t1 +___ + &BODY_00_15(0x17); +$code.=<<___; +#if __ARM_ARCH__>=7 + ittt eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0] + ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4] + beq .L16_79 + bic $Ktbl,$Ktbl,#1 + + ldr $Tlo,[sp,#$Boff+0] + ldr $Thi,[sp,#$Boff+4] + ldr $t0, [$ctx,#$Aoff+$lo] + ldr $t1, [$ctx,#$Aoff+$hi] + ldr $t2, [$ctx,#$Boff+$lo] + ldr $t3, [$ctx,#$Boff+$hi] + adds $t0,$Alo,$t0 + str $t0, [$ctx,#$Aoff+$lo] + adc $t1,$Ahi,$t1 + str $t1, [$ctx,#$Aoff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Boff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Boff+$hi] + + ldr $Alo,[sp,#$Coff+0] + ldr $Ahi,[sp,#$Coff+4] + ldr $Tlo,[sp,#$Doff+0] + ldr $Thi,[sp,#$Doff+4] + ldr $t0, [$ctx,#$Coff+$lo] + ldr $t1, [$ctx,#$Coff+$hi] + ldr $t2, [$ctx,#$Doff+$lo] + ldr $t3, [$ctx,#$Doff+$hi] + adds $t0,$Alo,$t0 + str $t0, [$ctx,#$Coff+$lo] + adc $t1,$Ahi,$t1 + str $t1, [$ctx,#$Coff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Doff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Doff+$hi] + + ldr $Tlo,[sp,#$Foff+0] + ldr $Thi,[sp,#$Foff+4] + ldr $t0, [$ctx,#$Eoff+$lo] + ldr $t1, [$ctx,#$Eoff+$hi] + ldr $t2, [$ctx,#$Foff+$lo] + ldr $t3, [$ctx,#$Foff+$hi] + adds $Elo,$Elo,$t0 + str $Elo,[$ctx,#$Eoff+$lo] + adc $Ehi,$Ehi,$t1 + str $Ehi,[$ctx,#$Eoff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Foff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Foff+$hi] + + ldr $Alo,[sp,#$Goff+0] + ldr $Ahi,[sp,#$Goff+4] + ldr $Tlo,[sp,#$Hoff+0] + ldr $Thi,[sp,#$Hoff+4] + ldr $t0, [$ctx,#$Goff+$lo] + ldr $t1, 
[$ctx,#$Goff+$hi] + ldr $t2, [$ctx,#$Hoff+$lo] + ldr $t3, [$ctx,#$Hoff+$hi] + adds $t0,$Alo,$t0 + str $t0, [$ctx,#$Goff+$lo] + adc $t1,$Ahi,$t1 + str $t1, [$ctx,#$Goff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Hoff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Hoff+$hi] + + add sp,sp,#640 + sub $Ktbl,$Ktbl,#640 + + teq $inp,$len + bne .Loop + + add sp,sp,#8*9 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size sha512_block_data_order,.-sha512_block_data_order +___ + +{ +my @Sigma0=(28,34,39); +my @Sigma1=(14,18,41); +my @sigma0=(1, 8, 7); +my @sigma1=(19,61,6); + +my $Ktbl="r3"; +my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch + +my @X=map("d$_",(0..15)); +my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23)); + +sub NEON_00_15() { +my $i=shift; +my ($a,$b,$c,$d,$e,$f,$g,$h)=@_; +my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps + +$code.=<<___ if ($i<16 || $i&1); + vshr.u64 $t0,$e,#@Sigma1[0] @ $i +#if $i<16 + vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned +#endif + vshr.u64 $t1,$e,#@Sigma1[1] +#if $i>0 + vadd.i64 $a,$Maj @ h+=Maj from the past +#endif + vshr.u64 $t2,$e,#@Sigma1[2] +___ +$code.=<<___; + vld1.64 {$K},[$Ktbl,:64]! @ K[i++] + vsli.64 $t0,$e,#`64-@Sigma1[0]` + vsli.64 $t1,$e,#`64-@Sigma1[1]` + vmov $Ch,$e + vsli.64 $t2,$e,#`64-@Sigma1[2]` +#if $i<16 && defined(__ARMEL__) + vrev64.8 @X[$i],@X[$i] +#endif + veor $t1,$t0 + vbsl $Ch,$f,$g @ Ch(e,f,g) + vshr.u64 $t0,$a,#@Sigma0[0] + veor $t2,$t1 @ Sigma1(e) + vadd.i64 $T1,$Ch,$h + vshr.u64 $t1,$a,#@Sigma0[1] + vsli.64 $t0,$a,#`64-@Sigma0[0]` + vadd.i64 $T1,$t2 + vshr.u64 $t2,$a,#@Sigma0[2] + vadd.i64 $K,@X[$i%16] + vsli.64 $t1,$a,#`64-@Sigma0[1]` + veor $Maj,$a,$b + vsli.64 $t2,$a,#`64-@Sigma0[2]` + veor $h,$t0,$t1 + vadd.i64 $T1,$K + vbsl $Maj,$c,$b @ Maj(a,b,c) + veor $h,$t2 @ Sigma0(a) + vadd.i64 $d,$T1 + vadd.i64 $Maj,$T1 + @ vadd.i64 $h,$Maj +___ +} + +sub NEON_16_79() { +my $i=shift; + +if ($i&1) { &NEON_00_15($i,@_); return; } + +# 2x-vectorized, therefore runs every 2nd round +my @X=map("q$_",(0..7)); # view @X as 128-bit vector +my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps +my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15 +my $e=@_[4]; # $e from NEON_00_15 +$i /= 2; +$code.=<<___; + vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0] + vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1] + vadd.i64 @_[0],d30 @ h+=Maj from the past + vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2] + vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]` + vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1] + vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]` + veor $s1,$t0 + vshr.u64 $t0,$s0,#@sigma0[0] + veor $s1,$t1 @ sigma1(X[i+14]) + vshr.u64 $t1,$s0,#@sigma0[1] + vadd.i64 @X[$i%8],$s1 + vshr.u64 $s1,$s0,#@sigma0[2] + vsli.64 $t0,$s0,#`64-@sigma0[0]` + vsli.64 $t1,$s0,#`64-@sigma0[1]` + vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9] + veor $s1,$t0 + vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15 + vadd.i64 @X[$i%8],$s0 + vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15 + veor $s1,$t1 @ sigma0(X[i+1]) + vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15 + vadd.i64 @X[$i%8],$s1 +___ + &NEON_00_15(2*$i,@_); +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.global sha512_block_data_order_neon +.type sha512_block_data_order_neon,%function +.align 4 +sha512_block_data_order_neon: +.LNEON: + dmb @ errata #451034 on early Cortex A8 + add $len,$inp,$len,lsl#7 @ len to point at 
the end of inp + VFP_ABI_PUSH + adr $Ktbl,.Lsha512_block_data_order + sub $Ktbl,$Ktbl,.Lsha512_block_data_order-K512 + vldmia $ctx,{$A-$H} @ load context +.Loop_neon: +___ +for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + mov $cnt,#4 +.L16_79_neon: + subs $cnt,#1 +___ +for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + bne .L16_79_neon + + vadd.i64 $A,d30 @ h+=Maj from the past + vldmia $ctx,{d24-d31} @ load context to temp + vadd.i64 q8,q12 @ vectorized accumulate + vadd.i64 q9,q13 + vadd.i64 q10,q14 + vadd.i64 q11,q15 + vstmia $ctx,{$A-$H} @ save context + teq $inp,$len + sub $Ktbl,#640 @ rewind K512 + bne .Loop_neon + + VFP_ABI_POP + ret @ bx lr +.size sha512_block_data_order_neon,.-sha512_block_data_order_neon +#endif +___ +} +$code.=<<___; +.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.comm OPENSSL_armcap_P,4,4 +#endif +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 +$code =~ s/\bret\b/bx lr/gm; + +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + +print $code; +close STDOUT; # enforce flush diff --git a/lib/crypto/arm/sha512.h b/lib/crypto/arm/sha512.h new file mode 100644 index 000000000000..f147b6490d6c --- /dev/null +++ b/lib/crypto/arm/sha512.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * arm32-optimized SHA-512 block function + * + * Copyright 2025 Google LLC + */ + +#include <asm/neon.h> +#include <crypto/internal/simd.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +asmlinkage void sha512_block_data_order(struct sha512_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha512_block_data_order_neon(struct sha512_block_state *state, + const u8 *data, size_t nblocks); + +static void sha512_blocks(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon) && likely(crypto_simd_usable())) { + kernel_neon_begin(); + sha512_block_data_order_neon(state, data, nblocks); + kernel_neon_end(); + } else { + sha512_block_data_order(state, data, nblocks); + } +} + +#ifdef CONFIG_KERNEL_MODE_NEON +#define sha512_mod_init_arch sha512_mod_init_arch +static inline void sha512_mod_init_arch(void) +{ + if (cpu_has_neon()) + static_branch_enable(&have_neon); +} +#endif /* CONFIG_KERNEL_MODE_NEON */ diff --git a/lib/crypto/arm64/.gitignore b/lib/crypto/arm64/.gitignore new file mode 100644 index 000000000000..f6c4e8ef80da --- /dev/null +++ b/lib/crypto/arm64/.gitignore @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +poly1305-core.S +sha256-core.S +sha512-core.S diff --git a/lib/crypto/arm64/Kconfig b/lib/crypto/arm64/Kconfig new file mode 100644 index 000000000000..0b903ef524d8 --- /dev/null +++ b/lib/crypto/arm64/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_CHACHA20_NEON + tristate + depends on KERNEL_MODE_NEON + default CRYPTO_LIB_CHACHA + select CRYPTO_LIB_CHACHA_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_POLY1305_NEON + tristate + depends on KERNEL_MODE_NEON + default CRYPTO_LIB_POLY1305 + select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/arm64/Makefile b/lib/crypto/arm64/Makefile new file mode 100644 index 000000000000..6207088397a7 --- /dev/null +++ 
b/lib/crypto/arm64/Makefile @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o +chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o + +obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o +poly1305-neon-y := poly1305-core.o poly1305-glue.o +AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_block_init_arch +AFLAGS_poly1305-core.o += -Dpoly1305_emit=poly1305_emit_arch + +quiet_cmd_perlasm = PERLASM $@ + cmd_perlasm = $(PERL) $(<) void $(@) + +$(obj)/%-core.S: $(src)/%-armv8.pl + $(call cmd,perlasm) + +clean-files += poly1305-core.S diff --git a/lib/crypto/arm64/chacha-neon-core.S b/lib/crypto/arm64/chacha-neon-core.S new file mode 100644 index 000000000000..80079586ecc7 --- /dev/null +++ b/lib/crypto/arm64/chacha-neon-core.S @@ -0,0 +1,805 @@ +/* + * ChaCha/HChaCha NEON helper functions + * + * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Originally based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> +#include <asm/cache.h> + + .text + .align 6 + +/* + * chacha_permute - permute one block + * + * Permute one 64-byte block where the state matrix is stored in the four NEON + * registers v0-v3. It performs matrix operations on four words in parallel, + * but requires shuffling to rearrange the words after each round. + * + * The round count is given in w3. 
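 + * (Descriptive note, not in the original patch: each .Ldoubleround pass is
 + * the standard ChaCha quarter-round -- a += b; d = rotl32(d ^ a, 16);
 + * c += d; b = rotl32(b ^ c, 12); a += b; d = rotl32(d ^ a, 8); c += d;
 + * b = rotl32(b ^ c, 7) -- applied to all four columns at once, one column
 + * per 32-bit lane of v0-v3, and then to the diagonals after the ext-based
 + * lane rotations.)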
+ * + * Clobbers: w3, x10, v4, v12 + */ +SYM_FUNC_START_LOCAL(chacha_permute) + + adr_l x10, ROT8 + ld1 {v12.4s}, [x10] + +.Ldoubleround: + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + rev32 v3.8h, v3.8h + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #12 + sri v1.4s, v4.4s, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + tbl v3.16b, {v3.16b}, v12.16b + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #7 + sri v1.4s, v4.4s, #25 + + // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + ext v1.16b, v1.16b, v1.16b, #4 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + ext v2.16b, v2.16b, v2.16b, #8 + // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + ext v3.16b, v3.16b, v3.16b, #12 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + rev32 v3.8h, v3.8h + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #12 + sri v1.4s, v4.4s, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + tbl v3.16b, {v3.16b}, v12.16b + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #7 + sri v1.4s, v4.4s, #25 + + // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + ext v1.16b, v1.16b, v1.16b, #12 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + ext v2.16b, v2.16b, v2.16b, #8 + // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + ext v3.16b, v3.16b, v3.16b, #4 + + subs w3, w3, #2 + b.ne .Ldoubleround + + ret +SYM_FUNC_END(chacha_permute) + +SYM_FUNC_START(chacha_block_xor_neon) + // x0: Input state matrix, s + // x1: 1 data block output, o + // x2: 1 data block input, i + // w3: nrounds + + stp x29, x30, [sp, #-16]! + mov x29, sp + + // x0..3 = s0..3 + ld1 {v0.4s-v3.4s}, [x0] + ld1 {v8.4s-v11.4s}, [x0] + + bl chacha_permute + + ld1 {v4.16b-v7.16b}, [x2] + + // o0 = i0 ^ (x0 + s0) + add v0.4s, v0.4s, v8.4s + eor v0.16b, v0.16b, v4.16b + + // o1 = i1 ^ (x1 + s1) + add v1.4s, v1.4s, v9.4s + eor v1.16b, v1.16b, v5.16b + + // o2 = i2 ^ (x2 + s2) + add v2.4s, v2.4s, v10.4s + eor v2.16b, v2.16b, v6.16b + + // o3 = i3 ^ (x3 + s3) + add v3.4s, v3.4s, v11.4s + eor v3.16b, v3.16b, v7.16b + + st1 {v0.16b-v3.16b}, [x1] + + ldp x29, x30, [sp], #16 + ret +SYM_FUNC_END(chacha_block_xor_neon) + +SYM_FUNC_START(hchacha_block_neon) + // x0: Input state matrix, s + // x1: output (8 32-bit words) + // w2: nrounds + + stp x29, x30, [sp, #-16]! + mov x29, sp + + ld1 {v0.4s-v3.4s}, [x0] + + mov w3, w2 + bl chacha_permute + + st1 {v0.4s}, [x1], #16 + st1 {v3.4s}, [x1] + + ldp x29, x30, [sp], #16 + ret +SYM_FUNC_END(hchacha_block_neon) + + a0 .req w12 + a1 .req w13 + a2 .req w14 + a3 .req w15 + a4 .req w16 + a5 .req w17 + a6 .req w19 + a7 .req w20 + a8 .req w21 + a9 .req w22 + a10 .req w23 + a11 .req w24 + a12 .req w25 + a13 .req w26 + a14 .req w27 + a15 .req w28 + + .align 6 +SYM_FUNC_START(chacha_4block_xor_neon) + frame_push 10 + + // x0: Input state matrix, s + // x1: 4 data blocks output, o + // x2: 4 data blocks input, i + // w3: nrounds + // x4: byte count + + adr_l x10, .Lpermute + and x5, x4, #63 + add x10, x10, x5 + + // + // This function encrypts four consecutive ChaCha blocks by loading + // the state matrix in NEON registers four times. 
The algorithm performs + // each operation on the corresponding word of each state matrix, hence + // requires no word shuffling. For final XORing step we transpose the + // matrix by interleaving 32- and then 64-bit words, which allows us to + // do XOR in NEON registers. + // + // At the same time, a fifth block is encrypted in parallel using + // scalar registers + // + adr_l x9, CTRINC // ... and ROT8 + ld1 {v30.4s-v31.4s}, [x9] + + // x0..15[0-3] = s0..3[0..3] + add x8, x0, #16 + ld4r { v0.4s- v3.4s}, [x0] + ld4r { v4.4s- v7.4s}, [x8], #16 + ld4r { v8.4s-v11.4s}, [x8], #16 + ld4r {v12.4s-v15.4s}, [x8] + + mov a0, v0.s[0] + mov a1, v1.s[0] + mov a2, v2.s[0] + mov a3, v3.s[0] + mov a4, v4.s[0] + mov a5, v5.s[0] + mov a6, v6.s[0] + mov a7, v7.s[0] + mov a8, v8.s[0] + mov a9, v9.s[0] + mov a10, v10.s[0] + mov a11, v11.s[0] + mov a12, v12.s[0] + mov a13, v13.s[0] + mov a14, v14.s[0] + mov a15, v15.s[0] + + // x12 += counter values 1-4 + add v12.4s, v12.4s, v30.4s + +.Ldoubleround4: + // x0 += x4, x12 = rotl32(x12 ^ x0, 16) + // x1 += x5, x13 = rotl32(x13 ^ x1, 16) + // x2 += x6, x14 = rotl32(x14 ^ x2, 16) + // x3 += x7, x15 = rotl32(x15 ^ x3, 16) + add v0.4s, v0.4s, v4.4s + add a0, a0, a4 + add v1.4s, v1.4s, v5.4s + add a1, a1, a5 + add v2.4s, v2.4s, v6.4s + add a2, a2, a6 + add v3.4s, v3.4s, v7.4s + add a3, a3, a7 + + eor v12.16b, v12.16b, v0.16b + eor a12, a12, a0 + eor v13.16b, v13.16b, v1.16b + eor a13, a13, a1 + eor v14.16b, v14.16b, v2.16b + eor a14, a14, a2 + eor v15.16b, v15.16b, v3.16b + eor a15, a15, a3 + + rev32 v12.8h, v12.8h + ror a12, a12, #16 + rev32 v13.8h, v13.8h + ror a13, a13, #16 + rev32 v14.8h, v14.8h + ror a14, a14, #16 + rev32 v15.8h, v15.8h + ror a15, a15, #16 + + // x8 += x12, x4 = rotl32(x4 ^ x8, 12) + // x9 += x13, x5 = rotl32(x5 ^ x9, 12) + // x10 += x14, x6 = rotl32(x6 ^ x10, 12) + // x11 += x15, x7 = rotl32(x7 ^ x11, 12) + add v8.4s, v8.4s, v12.4s + add a8, a8, a12 + add v9.4s, v9.4s, v13.4s + add a9, a9, a13 + add v10.4s, v10.4s, v14.4s + add a10, a10, a14 + add v11.4s, v11.4s, v15.4s + add a11, a11, a15 + + eor v16.16b, v4.16b, v8.16b + eor a4, a4, a8 + eor v17.16b, v5.16b, v9.16b + eor a5, a5, a9 + eor v18.16b, v6.16b, v10.16b + eor a6, a6, a10 + eor v19.16b, v7.16b, v11.16b + eor a7, a7, a11 + + shl v4.4s, v16.4s, #12 + shl v5.4s, v17.4s, #12 + shl v6.4s, v18.4s, #12 + shl v7.4s, v19.4s, #12 + + sri v4.4s, v16.4s, #20 + ror a4, a4, #20 + sri v5.4s, v17.4s, #20 + ror a5, a5, #20 + sri v6.4s, v18.4s, #20 + ror a6, a6, #20 + sri v7.4s, v19.4s, #20 + ror a7, a7, #20 + + // x0 += x4, x12 = rotl32(x12 ^ x0, 8) + // x1 += x5, x13 = rotl32(x13 ^ x1, 8) + // x2 += x6, x14 = rotl32(x14 ^ x2, 8) + // x3 += x7, x15 = rotl32(x15 ^ x3, 8) + add v0.4s, v0.4s, v4.4s + add a0, a0, a4 + add v1.4s, v1.4s, v5.4s + add a1, a1, a5 + add v2.4s, v2.4s, v6.4s + add a2, a2, a6 + add v3.4s, v3.4s, v7.4s + add a3, a3, a7 + + eor v12.16b, v12.16b, v0.16b + eor a12, a12, a0 + eor v13.16b, v13.16b, v1.16b + eor a13, a13, a1 + eor v14.16b, v14.16b, v2.16b + eor a14, a14, a2 + eor v15.16b, v15.16b, v3.16b + eor a15, a15, a3 + + tbl v12.16b, {v12.16b}, v31.16b + ror a12, a12, #24 + tbl v13.16b, {v13.16b}, v31.16b + ror a13, a13, #24 + tbl v14.16b, {v14.16b}, v31.16b + ror a14, a14, #24 + tbl v15.16b, {v15.16b}, v31.16b + ror a15, a15, #24 + + // x8 += x12, x4 = rotl32(x4 ^ x8, 7) + // x9 += x13, x5 = rotl32(x5 ^ x9, 7) + // x10 += x14, x6 = rotl32(x6 ^ x10, 7) + // x11 += x15, x7 = rotl32(x7 ^ x11, 7) + add v8.4s, v8.4s, v12.4s + add a8, a8, a12 + add v9.4s, v9.4s, v13.4s + add a9, a9, a13 
+ add v10.4s, v10.4s, v14.4s + add a10, a10, a14 + add v11.4s, v11.4s, v15.4s + add a11, a11, a15 + + eor v16.16b, v4.16b, v8.16b + eor a4, a4, a8 + eor v17.16b, v5.16b, v9.16b + eor a5, a5, a9 + eor v18.16b, v6.16b, v10.16b + eor a6, a6, a10 + eor v19.16b, v7.16b, v11.16b + eor a7, a7, a11 + + shl v4.4s, v16.4s, #7 + shl v5.4s, v17.4s, #7 + shl v6.4s, v18.4s, #7 + shl v7.4s, v19.4s, #7 + + sri v4.4s, v16.4s, #25 + ror a4, a4, #25 + sri v5.4s, v17.4s, #25 + ror a5, a5, #25 + sri v6.4s, v18.4s, #25 + ror a6, a6, #25 + sri v7.4s, v19.4s, #25 + ror a7, a7, #25 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 16) + // x1 += x6, x12 = rotl32(x12 ^ x1, 16) + // x2 += x7, x13 = rotl32(x13 ^ x2, 16) + // x3 += x4, x14 = rotl32(x14 ^ x3, 16) + add v0.4s, v0.4s, v5.4s + add a0, a0, a5 + add v1.4s, v1.4s, v6.4s + add a1, a1, a6 + add v2.4s, v2.4s, v7.4s + add a2, a2, a7 + add v3.4s, v3.4s, v4.4s + add a3, a3, a4 + + eor v15.16b, v15.16b, v0.16b + eor a15, a15, a0 + eor v12.16b, v12.16b, v1.16b + eor a12, a12, a1 + eor v13.16b, v13.16b, v2.16b + eor a13, a13, a2 + eor v14.16b, v14.16b, v3.16b + eor a14, a14, a3 + + rev32 v15.8h, v15.8h + ror a15, a15, #16 + rev32 v12.8h, v12.8h + ror a12, a12, #16 + rev32 v13.8h, v13.8h + ror a13, a13, #16 + rev32 v14.8h, v14.8h + ror a14, a14, #16 + + // x10 += x15, x5 = rotl32(x5 ^ x10, 12) + // x11 += x12, x6 = rotl32(x6 ^ x11, 12) + // x8 += x13, x7 = rotl32(x7 ^ x8, 12) + // x9 += x14, x4 = rotl32(x4 ^ x9, 12) + add v10.4s, v10.4s, v15.4s + add a10, a10, a15 + add v11.4s, v11.4s, v12.4s + add a11, a11, a12 + add v8.4s, v8.4s, v13.4s + add a8, a8, a13 + add v9.4s, v9.4s, v14.4s + add a9, a9, a14 + + eor v16.16b, v5.16b, v10.16b + eor a5, a5, a10 + eor v17.16b, v6.16b, v11.16b + eor a6, a6, a11 + eor v18.16b, v7.16b, v8.16b + eor a7, a7, a8 + eor v19.16b, v4.16b, v9.16b + eor a4, a4, a9 + + shl v5.4s, v16.4s, #12 + shl v6.4s, v17.4s, #12 + shl v7.4s, v18.4s, #12 + shl v4.4s, v19.4s, #12 + + sri v5.4s, v16.4s, #20 + ror a5, a5, #20 + sri v6.4s, v17.4s, #20 + ror a6, a6, #20 + sri v7.4s, v18.4s, #20 + ror a7, a7, #20 + sri v4.4s, v19.4s, #20 + ror a4, a4, #20 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 8) + // x1 += x6, x12 = rotl32(x12 ^ x1, 8) + // x2 += x7, x13 = rotl32(x13 ^ x2, 8) + // x3 += x4, x14 = rotl32(x14 ^ x3, 8) + add v0.4s, v0.4s, v5.4s + add a0, a0, a5 + add v1.4s, v1.4s, v6.4s + add a1, a1, a6 + add v2.4s, v2.4s, v7.4s + add a2, a2, a7 + add v3.4s, v3.4s, v4.4s + add a3, a3, a4 + + eor v15.16b, v15.16b, v0.16b + eor a15, a15, a0 + eor v12.16b, v12.16b, v1.16b + eor a12, a12, a1 + eor v13.16b, v13.16b, v2.16b + eor a13, a13, a2 + eor v14.16b, v14.16b, v3.16b + eor a14, a14, a3 + + tbl v15.16b, {v15.16b}, v31.16b + ror a15, a15, #24 + tbl v12.16b, {v12.16b}, v31.16b + ror a12, a12, #24 + tbl v13.16b, {v13.16b}, v31.16b + ror a13, a13, #24 + tbl v14.16b, {v14.16b}, v31.16b + ror a14, a14, #24 + + // x10 += x15, x5 = rotl32(x5 ^ x10, 7) + // x11 += x12, x6 = rotl32(x6 ^ x11, 7) + // x8 += x13, x7 = rotl32(x7 ^ x8, 7) + // x9 += x14, x4 = rotl32(x4 ^ x9, 7) + add v10.4s, v10.4s, v15.4s + add a10, a10, a15 + add v11.4s, v11.4s, v12.4s + add a11, a11, a12 + add v8.4s, v8.4s, v13.4s + add a8, a8, a13 + add v9.4s, v9.4s, v14.4s + add a9, a9, a14 + + eor v16.16b, v5.16b, v10.16b + eor a5, a5, a10 + eor v17.16b, v6.16b, v11.16b + eor a6, a6, a11 + eor v18.16b, v7.16b, v8.16b + eor a7, a7, a8 + eor v19.16b, v4.16b, v9.16b + eor a4, a4, a9 + + shl v5.4s, v16.4s, #7 + shl v6.4s, v17.4s, #7 + shl v7.4s, v18.4s, #7 + shl v4.4s, v19.4s, #7 + + sri v5.4s, v16.4s, #25 + ror a5, a5, 
#25 + sri v6.4s, v17.4s, #25 + ror a6, a6, #25 + sri v7.4s, v18.4s, #25 + ror a7, a7, #25 + sri v4.4s, v19.4s, #25 + ror a4, a4, #25 + + subs w3, w3, #2 + b.ne .Ldoubleround4 + + ld4r {v16.4s-v19.4s}, [x0], #16 + ld4r {v20.4s-v23.4s}, [x0], #16 + + // x12 += counter values 0-3 + add v12.4s, v12.4s, v30.4s + + // x0[0-3] += s0[0] + // x1[0-3] += s0[1] + // x2[0-3] += s0[2] + // x3[0-3] += s0[3] + add v0.4s, v0.4s, v16.4s + mov w6, v16.s[0] + mov w7, v17.s[0] + add v1.4s, v1.4s, v17.4s + mov w8, v18.s[0] + mov w9, v19.s[0] + add v2.4s, v2.4s, v18.4s + add a0, a0, w6 + add a1, a1, w7 + add v3.4s, v3.4s, v19.4s + add a2, a2, w8 + add a3, a3, w9 +CPU_BE( rev a0, a0 ) +CPU_BE( rev a1, a1 ) +CPU_BE( rev a2, a2 ) +CPU_BE( rev a3, a3 ) + + ld4r {v24.4s-v27.4s}, [x0], #16 + ld4r {v28.4s-v31.4s}, [x0] + + // x4[0-3] += s1[0] + // x5[0-3] += s1[1] + // x6[0-3] += s1[2] + // x7[0-3] += s1[3] + add v4.4s, v4.4s, v20.4s + mov w6, v20.s[0] + mov w7, v21.s[0] + add v5.4s, v5.4s, v21.4s + mov w8, v22.s[0] + mov w9, v23.s[0] + add v6.4s, v6.4s, v22.4s + add a4, a4, w6 + add a5, a5, w7 + add v7.4s, v7.4s, v23.4s + add a6, a6, w8 + add a7, a7, w9 +CPU_BE( rev a4, a4 ) +CPU_BE( rev a5, a5 ) +CPU_BE( rev a6, a6 ) +CPU_BE( rev a7, a7 ) + + // x8[0-3] += s2[0] + // x9[0-3] += s2[1] + // x10[0-3] += s2[2] + // x11[0-3] += s2[3] + add v8.4s, v8.4s, v24.4s + mov w6, v24.s[0] + mov w7, v25.s[0] + add v9.4s, v9.4s, v25.4s + mov w8, v26.s[0] + mov w9, v27.s[0] + add v10.4s, v10.4s, v26.4s + add a8, a8, w6 + add a9, a9, w7 + add v11.4s, v11.4s, v27.4s + add a10, a10, w8 + add a11, a11, w9 +CPU_BE( rev a8, a8 ) +CPU_BE( rev a9, a9 ) +CPU_BE( rev a10, a10 ) +CPU_BE( rev a11, a11 ) + + // x12[0-3] += s3[0] + // x13[0-3] += s3[1] + // x14[0-3] += s3[2] + // x15[0-3] += s3[3] + add v12.4s, v12.4s, v28.4s + mov w6, v28.s[0] + mov w7, v29.s[0] + add v13.4s, v13.4s, v29.4s + mov w8, v30.s[0] + mov w9, v31.s[0] + add v14.4s, v14.4s, v30.4s + add a12, a12, w6 + add a13, a13, w7 + add v15.4s, v15.4s, v31.4s + add a14, a14, w8 + add a15, a15, w9 +CPU_BE( rev a12, a12 ) +CPU_BE( rev a13, a13 ) +CPU_BE( rev a14, a14 ) +CPU_BE( rev a15, a15 ) + + // interleave 32-bit words in state n, n+1 + ldp w6, w7, [x2], #64 + zip1 v16.4s, v0.4s, v1.4s + ldp w8, w9, [x2, #-56] + eor a0, a0, w6 + zip2 v17.4s, v0.4s, v1.4s + eor a1, a1, w7 + zip1 v18.4s, v2.4s, v3.4s + eor a2, a2, w8 + zip2 v19.4s, v2.4s, v3.4s + eor a3, a3, w9 + ldp w6, w7, [x2, #-48] + zip1 v20.4s, v4.4s, v5.4s + ldp w8, w9, [x2, #-40] + eor a4, a4, w6 + zip2 v21.4s, v4.4s, v5.4s + eor a5, a5, w7 + zip1 v22.4s, v6.4s, v7.4s + eor a6, a6, w8 + zip2 v23.4s, v6.4s, v7.4s + eor a7, a7, w9 + ldp w6, w7, [x2, #-32] + zip1 v24.4s, v8.4s, v9.4s + ldp w8, w9, [x2, #-24] + eor a8, a8, w6 + zip2 v25.4s, v8.4s, v9.4s + eor a9, a9, w7 + zip1 v26.4s, v10.4s, v11.4s + eor a10, a10, w8 + zip2 v27.4s, v10.4s, v11.4s + eor a11, a11, w9 + ldp w6, w7, [x2, #-16] + zip1 v28.4s, v12.4s, v13.4s + ldp w8, w9, [x2, #-8] + eor a12, a12, w6 + zip2 v29.4s, v12.4s, v13.4s + eor a13, a13, w7 + zip1 v30.4s, v14.4s, v15.4s + eor a14, a14, w8 + zip2 v31.4s, v14.4s, v15.4s + eor a15, a15, w9 + + add x3, x2, x4 + sub x3, x3, #128 // start of last block + + subs x5, x4, #128 + csel x2, x2, x3, ge + + // interleave 64-bit words in state n, n+2 + zip1 v0.2d, v16.2d, v18.2d + zip2 v4.2d, v16.2d, v18.2d + stp a0, a1, [x1], #64 + zip1 v8.2d, v17.2d, v19.2d + zip2 v12.2d, v17.2d, v19.2d + stp a2, a3, [x1, #-56] + + subs x6, x4, #192 + ld1 {v16.16b-v19.16b}, [x2], #64 + csel x2, x2, x3, ge + + zip1 v1.2d, v20.2d, v22.2d + 
zip2 v5.2d, v20.2d, v22.2d + stp a4, a5, [x1, #-48] + zip1 v9.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + stp a6, a7, [x1, #-40] + + subs x7, x4, #256 + ld1 {v20.16b-v23.16b}, [x2], #64 + csel x2, x2, x3, ge + + zip1 v2.2d, v24.2d, v26.2d + zip2 v6.2d, v24.2d, v26.2d + stp a8, a9, [x1, #-32] + zip1 v10.2d, v25.2d, v27.2d + zip2 v14.2d, v25.2d, v27.2d + stp a10, a11, [x1, #-24] + + subs x8, x4, #320 + ld1 {v24.16b-v27.16b}, [x2], #64 + csel x2, x2, x3, ge + + zip1 v3.2d, v28.2d, v30.2d + zip2 v7.2d, v28.2d, v30.2d + stp a12, a13, [x1, #-16] + zip1 v11.2d, v29.2d, v31.2d + zip2 v15.2d, v29.2d, v31.2d + stp a14, a15, [x1, #-8] + + tbnz x5, #63, .Lt128 + ld1 {v28.16b-v31.16b}, [x2] + + // xor with corresponding input, write to output + eor v16.16b, v16.16b, v0.16b + eor v17.16b, v17.16b, v1.16b + eor v18.16b, v18.16b, v2.16b + eor v19.16b, v19.16b, v3.16b + + tbnz x6, #63, .Lt192 + + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v6.16b + eor v23.16b, v23.16b, v7.16b + + st1 {v16.16b-v19.16b}, [x1], #64 + tbnz x7, #63, .Lt256 + + eor v24.16b, v24.16b, v8.16b + eor v25.16b, v25.16b, v9.16b + eor v26.16b, v26.16b, v10.16b + eor v27.16b, v27.16b, v11.16b + + st1 {v20.16b-v23.16b}, [x1], #64 + tbnz x8, #63, .Lt320 + + eor v28.16b, v28.16b, v12.16b + eor v29.16b, v29.16b, v13.16b + eor v30.16b, v30.16b, v14.16b + eor v31.16b, v31.16b, v15.16b + + st1 {v24.16b-v27.16b}, [x1], #64 + st1 {v28.16b-v31.16b}, [x1] + +.Lout: frame_pop + ret + + // fewer than 192 bytes of in/output +.Lt192: cbz x5, 1f // exactly 128 bytes? + ld1 {v28.16b-v31.16b}, [x10] + add x5, x5, x1 + tbl v28.16b, {v4.16b-v7.16b}, v28.16b + tbl v29.16b, {v4.16b-v7.16b}, v29.16b + tbl v30.16b, {v4.16b-v7.16b}, v30.16b + tbl v31.16b, {v4.16b-v7.16b}, v31.16b + +0: eor v20.16b, v20.16b, v28.16b + eor v21.16b, v21.16b, v29.16b + eor v22.16b, v22.16b, v30.16b + eor v23.16b, v23.16b, v31.16b + st1 {v20.16b-v23.16b}, [x5] // overlapping stores +1: st1 {v16.16b-v19.16b}, [x1] + b .Lout + + // fewer than 128 bytes of in/output +.Lt128: ld1 {v28.16b-v31.16b}, [x10] + add x5, x5, x1 + sub x1, x1, #64 + tbl v28.16b, {v0.16b-v3.16b}, v28.16b + tbl v29.16b, {v0.16b-v3.16b}, v29.16b + tbl v30.16b, {v0.16b-v3.16b}, v30.16b + tbl v31.16b, {v0.16b-v3.16b}, v31.16b + ld1 {v16.16b-v19.16b}, [x1] // reload first output block + b 0b + + // fewer than 256 bytes of in/output +.Lt256: cbz x6, 2f // exactly 192 bytes? + ld1 {v4.16b-v7.16b}, [x10] + add x6, x6, x1 + tbl v0.16b, {v8.16b-v11.16b}, v4.16b + tbl v1.16b, {v8.16b-v11.16b}, v5.16b + tbl v2.16b, {v8.16b-v11.16b}, v6.16b + tbl v3.16b, {v8.16b-v11.16b}, v7.16b + + eor v28.16b, v28.16b, v0.16b + eor v29.16b, v29.16b, v1.16b + eor v30.16b, v30.16b, v2.16b + eor v31.16b, v31.16b, v3.16b + st1 {v28.16b-v31.16b}, [x6] // overlapping stores +2: st1 {v20.16b-v23.16b}, [x1] + b .Lout + + // fewer than 320 bytes of in/output +.Lt320: cbz x7, 3f // exactly 256 bytes? 
+ ld1 {v4.16b-v7.16b}, [x10] + add x7, x7, x1 + tbl v0.16b, {v12.16b-v15.16b}, v4.16b + tbl v1.16b, {v12.16b-v15.16b}, v5.16b + tbl v2.16b, {v12.16b-v15.16b}, v6.16b + tbl v3.16b, {v12.16b-v15.16b}, v7.16b + + eor v28.16b, v28.16b, v0.16b + eor v29.16b, v29.16b, v1.16b + eor v30.16b, v30.16b, v2.16b + eor v31.16b, v31.16b, v3.16b + st1 {v28.16b-v31.16b}, [x7] // overlapping stores +3: st1 {v24.16b-v27.16b}, [x1] + b .Lout +SYM_FUNC_END(chacha_4block_xor_neon) + + .section ".rodata", "a", %progbits + .align L1_CACHE_SHIFT +.Lpermute: + .set .Li, 0 + .rept 128 + .byte (.Li - 64) + .set .Li, .Li + 1 + .endr + +CTRINC: .word 1, 2, 3, 4 +ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f diff --git a/lib/crypto/arm64/chacha-neon-glue.c b/lib/crypto/arm64/chacha-neon-glue.c new file mode 100644 index 000000000000..d0188f974ca5 --- /dev/null +++ b/lib/crypto/arm64/chacha-neon-glue.c @@ -0,0 +1,119 @@ +/* + * ChaCha and HChaCha functions (ARM64 optimized) + * + * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <crypto/chacha.h> +#include <crypto/internal/simd.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> + +asmlinkage void chacha_block_xor_neon(const struct chacha_state *state, + u8 *dst, const u8 *src, int nrounds); +asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state, + u8 *dst, const u8 *src, + int nrounds, int bytes); +asmlinkage void hchacha_block_neon(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src, + int bytes, int nrounds) +{ + while (bytes > 0) { + int l = min(bytes, CHACHA_BLOCK_SIZE * 5); + + if (l <= CHACHA_BLOCK_SIZE) { + u8 buf[CHACHA_BLOCK_SIZE]; + + memcpy(buf, src, l); + chacha_block_xor_neon(state, buf, buf, nrounds); + memcpy(dst, buf, l); + state->x[12] += 1; + break; + } + chacha_4block_xor_neon(state, dst, src, nrounds, l); + bytes -= l; + src += l; + dst += l; + state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); + } +} + +void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) { + hchacha_block_generic(state, out, nrounds); + } else { + kernel_neon_begin(); + hchacha_block_neon(state, out, nrounds); + kernel_neon_end(); + } +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE || + !crypto_simd_usable()) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + kernel_neon_begin(); + chacha_doneon(state, dst, 
src, todo, nrounds); + kernel_neon_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +bool chacha_is_arch_optimized(void) +{ + return static_key_enabled(&have_neon); +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +static int __init chacha_simd_mod_init(void) +{ + if (cpu_have_named_feature(ASIMD)) + static_branch_enable(&have_neon); + return 0; +} +subsys_initcall(chacha_simd_mod_init); + +static void __exit chacha_simd_mod_exit(void) +{ +} +module_exit(chacha_simd_mod_exit); + +MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM64 optimized)"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm64/poly1305-armv8.pl b/lib/crypto/arm64/poly1305-armv8.pl new file mode 100644 index 000000000000..22c9069c0650 --- /dev/null +++ b/lib/crypto/arm64/poly1305-armv8.pl @@ -0,0 +1,917 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# This module implements Poly1305 hash for ARMv8. +# +# June 2015 +# +# Numbers are cycles per processed byte with poly1305_blocks alone. +# +# IALU/gcc-4.9 NEON +# +# Apple A7 1.86/+5% 0.72 +# Cortex-A53 2.69/+58% 1.47 +# Cortex-A57 2.70/+7% 1.14 +# Denver 1.64/+50% 1.18(*) +# X-Gene 2.13/+68% 2.27 +# Mongoose 1.77/+75% 1.12 +# Kryo 2.70/+55% 1.13 +# ThunderX2 1.17/+95% 1.36 +# +# (*) estimate based on resources availability is less than 1.0, +# i.e. measured result is worse than expected, presumably binary +# translator is not almighty; + +$flavour=shift; +$output=shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3)); +my ($mac,$nonce)=($inp,$len); + +my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14)); + +$code.=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +.extern OPENSSL_armcap_P +#endif + +.text + +// forward "declarations" are required for Apple +.globl poly1305_blocks +.globl poly1305_emit + +.globl poly1305_init +.type poly1305_init,%function +.align 5 +poly1305_init: + cmp $inp,xzr + stp xzr,xzr,[$ctx] // zero hash value + stp xzr,xzr,[$ctx,#16] // [along with is_base2_26] + + csel x0,xzr,x0,eq + b.eq .Lno_key + +#ifndef __KERNEL__ + adrp x17,OPENSSL_armcap_P + ldr w17,[x17,#:lo12:OPENSSL_armcap_P] +#endif + + ldp $r0,$r1,[$inp] // load key + mov $s1,#0xfffffffc0fffffff + movk $s1,#0x0fff,lsl#48 +#ifdef __AARCH64EB__ + rev $r0,$r0 // flip bytes + rev $r1,$r1 +#endif + and $r0,$r0,$s1 // &=0ffffffc0fffffff + and $s1,$s1,#-4 + and $r1,$r1,$s1 // &=0ffffffc0ffffffc + mov w#$s1,#-1 + stp $r0,$r1,[$ctx,#32] // save key value + str w#$s1,[$ctx,#48] // impossible key power value + +#ifndef __KERNEL__ + tst w17,#ARMV7_NEON + + adr $d0,.Lpoly1305_blocks + adr $r0,.Lpoly1305_blocks_neon + adr $d1,.Lpoly1305_emit + + csel $d0,$d0,$r0,eq + +# ifdef __ILP32__ + stp w#$d0,w#$d1,[$len] +# else + stp $d0,$d1,[$len] +# endif +#endif + mov x0,#1 +.Lno_key: + ret +.size poly1305_init,.-poly1305_init + +.type poly1305_blocks,%function +.align 5 
+poly1305_blocks: +.Lpoly1305_blocks: + ands $len,$len,#-16 + b.eq .Lno_data + + ldp $h0,$h1,[$ctx] // load hash value + ldp $h2,x17,[$ctx,#16] // [along with is_base2_26] + ldp $r0,$r1,[$ctx,#32] // load key value + +#ifdef __AARCH64EB__ + lsr $d0,$h0,#32 + mov w#$d1,w#$h0 + lsr $d2,$h1,#32 + mov w15,w#$h1 + lsr x16,$h2,#32 +#else + mov w#$d0,w#$h0 + lsr $d1,$h0,#32 + mov w#$d2,w#$h1 + lsr x15,$h1,#32 + mov w16,w#$h2 +#endif + + add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64 + lsr $d1,$d2,#12 + adds $d0,$d0,$d2,lsl#52 + add $d1,$d1,x15,lsl#14 + adc $d1,$d1,xzr + lsr $d2,x16,#24 + adds $d1,$d1,x16,lsl#40 + adc $d2,$d2,xzr + + cmp x17,#0 // is_base2_26? + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + csel $h0,$h0,$d0,eq // choose between radixes + csel $h1,$h1,$d1,eq + csel $h2,$h2,$d2,eq + +.Loop: + ldp $t0,$t1,[$inp],#16 // load input + sub $len,$len,#16 +#ifdef __AARCH64EB__ + rev $t0,$t0 + rev $t1,$t1 +#endif + adds $h0,$h0,$t0 // accumulate input + adcs $h1,$h1,$t1 + + mul $d0,$h0,$r0 // h0*r0 + adc $h2,$h2,$padbit + umulh $d1,$h0,$r0 + + mul $t0,$h1,$s1 // h1*5*r1 + umulh $t1,$h1,$s1 + + adds $d0,$d0,$t0 + mul $t0,$h0,$r1 // h0*r1 + adc $d1,$d1,$t1 + umulh $d2,$h0,$r1 + + adds $d1,$d1,$t0 + mul $t0,$h1,$r0 // h1*r0 + adc $d2,$d2,xzr + umulh $t1,$h1,$r0 + + adds $d1,$d1,$t0 + mul $t0,$h2,$s1 // h2*5*r1 + adc $d2,$d2,$t1 + mul $t1,$h2,$r0 // h2*r0 + + adds $d1,$d1,$t0 + adc $d2,$d2,$t1 + + and $t0,$d2,#-4 // final reduction + and $h2,$d2,#3 + add $t0,$t0,$d2,lsr#2 + adds $h0,$d0,$t0 + adcs $h1,$d1,xzr + adc $h2,$h2,xzr + + cbnz $len,.Loop + + stp $h0,$h1,[$ctx] // store hash value + stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26] + +.Lno_data: + ret +.size poly1305_blocks,.-poly1305_blocks + +.type poly1305_emit,%function +.align 5 +poly1305_emit: +.Lpoly1305_emit: + ldp $h0,$h1,[$ctx] // load hash base 2^64 + ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26] + ldp $t0,$t1,[$nonce] // load nonce + +#ifdef __AARCH64EB__ + lsr $d0,$h0,#32 + mov w#$d1,w#$h0 + lsr $d2,$h1,#32 + mov w15,w#$h1 + lsr x16,$h2,#32 +#else + mov w#$d0,w#$h0 + lsr $d1,$h0,#32 + mov w#$d2,w#$h1 + lsr x15,$h1,#32 + mov w16,w#$h2 +#endif + + add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64 + lsr $d1,$d2,#12 + adds $d0,$d0,$d2,lsl#52 + add $d1,$d1,x15,lsl#14 + adc $d1,$d1,xzr + lsr $d2,x16,#24 + adds $d1,$d1,x16,lsl#40 + adc $d2,$d2,xzr + + cmp $r0,#0 // is_base2_26? 
+ csel $h0,$h0,$d0,eq // choose between radixes + csel $h1,$h1,$d1,eq + csel $h2,$h2,$d2,eq + + adds $d0,$h0,#5 // compare to modulus + adcs $d1,$h1,xzr + adc $d2,$h2,xzr + + tst $d2,#-4 // see if it's carried/borrowed + + csel $h0,$h0,$d0,eq + csel $h1,$h1,$d1,eq + +#ifdef __AARCH64EB__ + ror $t0,$t0,#32 // flip nonce words + ror $t1,$t1,#32 +#endif + adds $h0,$h0,$t0 // accumulate nonce + adc $h1,$h1,$t1 +#ifdef __AARCH64EB__ + rev $h0,$h0 // flip output bytes + rev $h1,$h1 +#endif + stp $h0,$h1,[$mac] // write result + + ret +.size poly1305_emit,.-poly1305_emit +___ +my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8)); +my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13)); +my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18)); +my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23)); +my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28)); +my ($T0,$T1,$MASK) = map("v$_",(29..31)); + +my ($in2,$zeros)=("x16","x17"); +my $is_base2_26 = $zeros; # borrow + +$code.=<<___; +.type poly1305_mult,%function +.align 5 +poly1305_mult: + mul $d0,$h0,$r0 // h0*r0 + umulh $d1,$h0,$r0 + + mul $t0,$h1,$s1 // h1*5*r1 + umulh $t1,$h1,$s1 + + adds $d0,$d0,$t0 + mul $t0,$h0,$r1 // h0*r1 + adc $d1,$d1,$t1 + umulh $d2,$h0,$r1 + + adds $d1,$d1,$t0 + mul $t0,$h1,$r0 // h1*r0 + adc $d2,$d2,xzr + umulh $t1,$h1,$r0 + + adds $d1,$d1,$t0 + mul $t0,$h2,$s1 // h2*5*r1 + adc $d2,$d2,$t1 + mul $t1,$h2,$r0 // h2*r0 + + adds $d1,$d1,$t0 + adc $d2,$d2,$t1 + + and $t0,$d2,#-4 // final reduction + and $h2,$d2,#3 + add $t0,$t0,$d2,lsr#2 + adds $h0,$d0,$t0 + adcs $h1,$d1,xzr + adc $h2,$h2,xzr + + ret +.size poly1305_mult,.-poly1305_mult + +.type poly1305_splat,%function +.align 4 +poly1305_splat: + and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x13,$h0,#26,#26 + extr x14,$h1,$h0,#52 + and x14,x14,#0x03ffffff + ubfx x15,$h1,#14,#26 + extr x16,$h2,$h1,#40 + + str w12,[$ctx,#16*0] // r0 + add w12,w13,w13,lsl#2 // r1*5 + str w13,[$ctx,#16*1] // r1 + add w13,w14,w14,lsl#2 // r2*5 + str w12,[$ctx,#16*2] // s1 + str w14,[$ctx,#16*3] // r2 + add w14,w15,w15,lsl#2 // r3*5 + str w13,[$ctx,#16*4] // s2 + str w15,[$ctx,#16*5] // r3 + add w15,w16,w16,lsl#2 // r4*5 + str w14,[$ctx,#16*6] // s3 + str w16,[$ctx,#16*7] // r4 + str w15,[$ctx,#16*8] // s4 + + ret +.size poly1305_splat,.-poly1305_splat + +#ifdef __KERNEL__ +.globl poly1305_blocks_neon +#endif +.type poly1305_blocks_neon,%function +.align 5 +poly1305_blocks_neon: +.Lpoly1305_blocks_neon: + ldr $is_base2_26,[$ctx,#24] + cmp $len,#128 + b.lo .Lpoly1305_blocks + + .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + + stp d8,d9,[sp,#16] // meet ABI requirements + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] + + cbz $is_base2_26,.Lbase2_64_neon + + ldp w10,w11,[$ctx] // load hash value base 2^26 + ldp w12,w13,[$ctx,#8] + ldr w14,[$ctx,#16] + + tst $len,#31 + b.eq .Leven_neon + + ldp $r0,$r1,[$ctx,#32] // load key value + + add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 + lsr $h1,x12,#12 + adds $h0,$h0,x12,lsl#52 + add $h1,$h1,x13,lsl#14 + adc $h1,$h1,xzr + lsr $h2,x14,#24 + adds $h1,$h1,x14,lsl#40 + adc $d2,$h2,xzr // can be partially reduced... 
+ + ldp $d0,$d1,[$inp],#16 // load input + sub $len,$len,#16 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + +#ifdef __AARCH64EB__ + rev $d0,$d0 + rev $d1,$d1 +#endif + adds $h0,$h0,$d0 // accumulate input + adcs $h1,$h1,$d1 + adc $h2,$h2,$padbit + + bl poly1305_mult + + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,$h0,#26,#26 + extr x12,$h1,$h0,#52 + and x12,x12,#0x03ffffff + ubfx x13,$h1,#14,#26 + extr x14,$h2,$h1,#40 + + b .Leven_neon + +.align 4 +.Lbase2_64_neon: + ldp $r0,$r1,[$ctx,#32] // load key value + + ldp $h0,$h1,[$ctx] // load hash value base 2^64 + ldr $h2,[$ctx,#16] + + tst $len,#31 + b.eq .Linit_neon + + ldp $d0,$d1,[$inp],#16 // load input + sub $len,$len,#16 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) +#ifdef __AARCH64EB__ + rev $d0,$d0 + rev $d1,$d1 +#endif + adds $h0,$h0,$d0 // accumulate input + adcs $h1,$h1,$d1 + adc $h2,$h2,$padbit + + bl poly1305_mult + +.Linit_neon: + ldr w17,[$ctx,#48] // first table element + and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 + ubfx x11,$h0,#26,#26 + extr x12,$h1,$h0,#52 + and x12,x12,#0x03ffffff + ubfx x13,$h1,#14,#26 + extr x14,$h2,$h1,#40 + + cmp w17,#-1 // is value impossible? + b.ne .Leven_neon + + fmov ${H0},x10 + fmov ${H1},x11 + fmov ${H2},x12 + fmov ${H3},x13 + fmov ${H4},x14 + + ////////////////////////////////// initialize r^n table + mov $h0,$r0 // r^1 + add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) + mov $h1,$r1 + mov $h2,xzr + add $ctx,$ctx,#48+12 + bl poly1305_splat + + bl poly1305_mult // r^2 + sub $ctx,$ctx,#4 + bl poly1305_splat + + bl poly1305_mult // r^3 + sub $ctx,$ctx,#4 + bl poly1305_splat + + bl poly1305_mult // r^4 + sub $ctx,$ctx,#4 + bl poly1305_splat + sub $ctx,$ctx,#48 // restore original $ctx + b .Ldo_neon + +.align 4 +.Leven_neon: + fmov ${H0},x10 + fmov ${H1},x11 + fmov ${H2},x12 + fmov ${H3},x13 + fmov ${H4},x14 + +.Ldo_neon: + ldp x8,x12,[$inp,#32] // inp[2:3] + subs $len,$len,#64 + ldp x9,x13,[$inp,#48] + add $in2,$inp,#96 + adrp $zeros,.Lzeros + add $zeros,$zeros,#:lo12:.Lzeros + + lsl $padbit,$padbit,#24 + add x15,$ctx,#48 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + and x5,x9,#0x03ffffff + ubfx x6,x8,#26,#26 + ubfx x7,x9,#26,#26 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + extr x8,x12,x8,#52 + extr x9,x13,x9,#52 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + fmov $IN23_0,x4 + and x8,x8,#0x03ffffff + and x9,x9,#0x03ffffff + ubfx x10,x12,#14,#26 + ubfx x11,x13,#14,#26 + add x12,$padbit,x12,lsr#40 + add x13,$padbit,x13,lsr#40 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + fmov $IN23_1,x6 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + fmov $IN23_2,x8 + fmov $IN23_3,x10 + fmov $IN23_4,x12 + + ldp x8,x12,[$inp],#16 // inp[0:1] + ldp x9,x13,[$inp],#48 + + ld1 {$R0,$R1,$S1,$R2},[x15],#64 + ld1 {$S2,$R3,$S3,$R4},[x15],#64 + ld1 {$S4},[x15] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + and x5,x9,#0x03ffffff + ubfx x6,x8,#26,#26 + ubfx x7,x9,#26,#26 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + extr x8,x12,x8,#52 + extr x9,x13,x9,#52 + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + fmov $IN01_0,x4 + and x8,x8,#0x03ffffff + and x9,x9,#0x03ffffff + ubfx x10,x12,#14,#26 + ubfx x11,x13,#14,#26 + add x12,$padbit,x12,lsr#40 + add x13,$padbit,x13,lsr#40 + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + fmov $IN01_1,x6 + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + add 
x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + movi $MASK.2d,#-1 + fmov $IN01_2,x8 + fmov $IN01_3,x10 + fmov $IN01_4,x12 + ushr $MASK.2d,$MASK.2d,#38 + + b.ls .Lskip_loop + +.align 4 +.Loop_neon: + //////////////////////////////////////////////////////////////// + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + // \___________________/ + // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + // \___________________/ \____________________/ + // + // Note that we start with inp[2:3]*r^2. This is because it + // doesn't depend on reduction in previous iteration. + //////////////////////////////////////////////////////////////// + // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 + // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 + // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 + // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 + // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 + + subs $len,$len,#64 + umull $ACC4,$IN23_0,${R4}[2] + csel $in2,$zeros,$in2,lo + umull $ACC3,$IN23_0,${R3}[2] + umull $ACC2,$IN23_0,${R2}[2] + ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) + umull $ACC1,$IN23_0,${R1}[2] + ldp x9,x13,[$in2],#48 + umull $ACC0,$IN23_0,${R0}[2] +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + + umlal $ACC4,$IN23_1,${R3}[2] + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + umlal $ACC3,$IN23_1,${R2}[2] + and x5,x9,#0x03ffffff + umlal $ACC2,$IN23_1,${R1}[2] + ubfx x6,x8,#26,#26 + umlal $ACC1,$IN23_1,${R0}[2] + ubfx x7,x9,#26,#26 + umlal $ACC0,$IN23_1,${S4}[2] + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + + umlal $ACC4,$IN23_2,${R2}[2] + extr x8,x12,x8,#52 + umlal $ACC3,$IN23_2,${R1}[2] + extr x9,x13,x9,#52 + umlal $ACC2,$IN23_2,${R0}[2] + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + umlal $ACC1,$IN23_2,${S4}[2] + fmov $IN23_0,x4 + umlal $ACC0,$IN23_2,${S3}[2] + and x8,x8,#0x03ffffff + + umlal $ACC4,$IN23_3,${R1}[2] + and x9,x9,#0x03ffffff + umlal $ACC3,$IN23_3,${R0}[2] + ubfx x10,x12,#14,#26 + umlal $ACC2,$IN23_3,${S4}[2] + ubfx x11,x13,#14,#26 + umlal $ACC1,$IN23_3,${S3}[2] + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + umlal $ACC0,$IN23_3,${S2}[2] + fmov $IN23_1,x6 + + add $IN01_2,$IN01_2,$H2 + add x12,$padbit,x12,lsr#40 + umlal $ACC4,$IN23_4,${R0}[2] + add x13,$padbit,x13,lsr#40 + umlal $ACC3,$IN23_4,${S4}[2] + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + umlal $ACC2,$IN23_4,${S3}[2] + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + umlal $ACC1,$IN23_4,${S2}[2] + fmov $IN23_2,x8 + umlal $ACC0,$IN23_4,${S1}[2] + fmov $IN23_3,x10 + + //////////////////////////////////////////////////////////////// + // (hash+inp[0:1])*r^4 and accumulate + + add $IN01_0,$IN01_0,$H0 + fmov $IN23_4,x12 + umlal $ACC3,$IN01_2,${R1}[0] + ldp x8,x12,[$inp],#16 // inp[0:1] + umlal $ACC0,$IN01_2,${S3}[0] + ldp x9,x13,[$inp],#48 + umlal $ACC4,$IN01_2,${R2}[0] + umlal $ACC1,$IN01_2,${S4}[0] + umlal $ACC2,$IN01_2,${R0}[0] +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 +#endif + + add $IN01_1,$IN01_1,$H1 + umlal $ACC3,$IN01_0,${R3}[0] + umlal $ACC4,$IN01_0,${R4}[0] + and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 + umlal $ACC2,$IN01_0,${R2}[0] + and x5,x9,#0x03ffffff + umlal $ACC0,$IN01_0,${R0}[0] + ubfx x6,x8,#26,#26 + umlal $ACC1,$IN01_0,${R1}[0] + ubfx x7,x9,#26,#26 + + add $IN01_3,$IN01_3,$H3 + add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 + umlal $ACC3,$IN01_1,${R2}[0] + extr x8,x12,x8,#52 + umlal $ACC4,$IN01_1,${R3}[0] + extr 
x9,x13,x9,#52 + umlal $ACC0,$IN01_1,${S4}[0] + add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 + umlal $ACC2,$IN01_1,${R1}[0] + fmov $IN01_0,x4 + umlal $ACC1,$IN01_1,${R0}[0] + and x8,x8,#0x03ffffff + + add $IN01_4,$IN01_4,$H4 + and x9,x9,#0x03ffffff + umlal $ACC3,$IN01_3,${R0}[0] + ubfx x10,x12,#14,#26 + umlal $ACC0,$IN01_3,${S2}[0] + ubfx x11,x13,#14,#26 + umlal $ACC4,$IN01_3,${R1}[0] + add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 + umlal $ACC1,$IN01_3,${S3}[0] + fmov $IN01_1,x6 + umlal $ACC2,$IN01_3,${S4}[0] + add x12,$padbit,x12,lsr#40 + + umlal $ACC3,$IN01_4,${S4}[0] + add x13,$padbit,x13,lsr#40 + umlal $ACC0,$IN01_4,${S1}[0] + add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 + umlal $ACC4,$IN01_4,${R0}[0] + add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 + umlal $ACC1,$IN01_4,${S2}[0] + fmov $IN01_2,x8 + umlal $ACC2,$IN01_4,${S3}[0] + fmov $IN01_3,x10 + fmov $IN01_4,x12 + + ///////////////////////////////////////////////////////////////// + // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + // and P. Schwabe + // + // [see discussion in poly1305-armv4 module] + + ushr $T0.2d,$ACC3,#26 + xtn $H3,$ACC3 + ushr $T1.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + add $ACC4,$ACC4,$T0.2d // h3 -> h4 + bic $H3,#0xfc,lsl#24 // &=0x03ffffff + add $ACC1,$ACC1,$T1.2d // h0 -> h1 + + ushr $T0.2d,$ACC4,#26 + xtn $H4,$ACC4 + ushr $T1.2d,$ACC1,#26 + xtn $H1,$ACC1 + bic $H4,#0xfc,lsl#24 + add $ACC2,$ACC2,$T1.2d // h1 -> h2 + + add $ACC0,$ACC0,$T0.2d + shl $T0.2d,$T0.2d,#2 + shrn $T1.2s,$ACC2,#26 + xtn $H2,$ACC2 + add $ACC0,$ACC0,$T0.2d // h4 -> h0 + bic $H1,#0xfc,lsl#24 + add $H3,$H3,$T1.2s // h2 -> h3 + bic $H2,#0xfc,lsl#24 + + shrn $T0.2s,$ACC0,#26 + xtn $H0,$ACC0 + ushr $T1.2s,$H3,#26 + bic $H3,#0xfc,lsl#24 + bic $H0,#0xfc,lsl#24 + add $H1,$H1,$T0.2s // h0 -> h1 + add $H4,$H4,$T1.2s // h3 -> h4 + + b.hi .Loop_neon + +.Lskip_loop: + dup $IN23_2,${IN23_2}[0] + add $IN01_2,$IN01_2,$H2 + + //////////////////////////////////////////////////////////////// + // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + adds $len,$len,#32 + b.ne .Long_tail + + dup $IN23_2,${IN01_2}[0] + add $IN23_0,$IN01_0,$H0 + add $IN23_3,$IN01_3,$H3 + add $IN23_1,$IN01_1,$H1 + add $IN23_4,$IN01_4,$H4 + +.Long_tail: + dup $IN23_0,${IN23_0}[0] + umull2 $ACC0,$IN23_2,${S3} + umull2 $ACC3,$IN23_2,${R1} + umull2 $ACC4,$IN23_2,${R2} + umull2 $ACC2,$IN23_2,${R0} + umull2 $ACC1,$IN23_2,${S4} + + dup $IN23_1,${IN23_1}[0] + umlal2 $ACC0,$IN23_0,${R0} + umlal2 $ACC2,$IN23_0,${R2} + umlal2 $ACC3,$IN23_0,${R3} + umlal2 $ACC4,$IN23_0,${R4} + umlal2 $ACC1,$IN23_0,${R1} + + dup $IN23_3,${IN23_3}[0] + umlal2 $ACC0,$IN23_1,${S4} + umlal2 $ACC3,$IN23_1,${R2} + umlal2 $ACC2,$IN23_1,${R1} + umlal2 $ACC4,$IN23_1,${R3} + umlal2 $ACC1,$IN23_1,${R0} + + dup $IN23_4,${IN23_4}[0] + umlal2 $ACC3,$IN23_3,${R0} + umlal2 $ACC4,$IN23_3,${R1} + umlal2 $ACC0,$IN23_3,${S2} + umlal2 $ACC1,$IN23_3,${S3} + umlal2 $ACC2,$IN23_3,${S4} + + umlal2 $ACC3,$IN23_4,${S4} + umlal2 $ACC0,$IN23_4,${S1} + umlal2 $ACC4,$IN23_4,${R0} + umlal2 $ACC1,$IN23_4,${S2} + umlal2 $ACC2,$IN23_4,${S3} + + b.eq .Lshort_tail + + //////////////////////////////////////////////////////////////// + // (hash+inp[0:1])*r^4:r^3 and accumulate + + add $IN01_0,$IN01_0,$H0 + umlal $ACC3,$IN01_2,${R1} + umlal $ACC0,$IN01_2,${S3} + umlal $ACC4,$IN01_2,${R2} + umlal $ACC1,$IN01_2,${S4} + umlal $ACC2,$IN01_2,${R0} + + add $IN01_1,$IN01_1,$H1 + umlal $ACC3,$IN01_0,${R3} + umlal $ACC0,$IN01_0,${R0} + umlal $ACC4,$IN01_0,${R4} + umlal $ACC1,$IN01_0,${R1} + umlal $ACC2,$IN01_0,${R2} + + add $IN01_3,$IN01_3,$H3 + 
umlal $ACC3,$IN01_1,${R2} + umlal $ACC0,$IN01_1,${S4} + umlal $ACC4,$IN01_1,${R3} + umlal $ACC1,$IN01_1,${R0} + umlal $ACC2,$IN01_1,${R1} + + add $IN01_4,$IN01_4,$H4 + umlal $ACC3,$IN01_3,${R0} + umlal $ACC0,$IN01_3,${S2} + umlal $ACC4,$IN01_3,${R1} + umlal $ACC1,$IN01_3,${S3} + umlal $ACC2,$IN01_3,${S4} + + umlal $ACC3,$IN01_4,${S4} + umlal $ACC0,$IN01_4,${S1} + umlal $ACC4,$IN01_4,${R0} + umlal $ACC1,$IN01_4,${S2} + umlal $ACC2,$IN01_4,${S3} + +.Lshort_tail: + //////////////////////////////////////////////////////////////// + // horizontal add + + addp $ACC3,$ACC3,$ACC3 + ldp d8,d9,[sp,#16] // meet ABI requirements + addp $ACC0,$ACC0,$ACC0 + ldp d10,d11,[sp,#32] + addp $ACC4,$ACC4,$ACC4 + ldp d12,d13,[sp,#48] + addp $ACC1,$ACC1,$ACC1 + ldp d14,d15,[sp,#64] + addp $ACC2,$ACC2,$ACC2 + ldr x30,[sp,#8] + + //////////////////////////////////////////////////////////////// + // lazy reduction, but without narrowing + + ushr $T0.2d,$ACC3,#26 + and $ACC3,$ACC3,$MASK.2d + ushr $T1.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + + add $ACC4,$ACC4,$T0.2d // h3 -> h4 + add $ACC1,$ACC1,$T1.2d // h0 -> h1 + + ushr $T0.2d,$ACC4,#26 + and $ACC4,$ACC4,$MASK.2d + ushr $T1.2d,$ACC1,#26 + and $ACC1,$ACC1,$MASK.2d + add $ACC2,$ACC2,$T1.2d // h1 -> h2 + + add $ACC0,$ACC0,$T0.2d + shl $T0.2d,$T0.2d,#2 + ushr $T1.2d,$ACC2,#26 + and $ACC2,$ACC2,$MASK.2d + add $ACC0,$ACC0,$T0.2d // h4 -> h0 + add $ACC3,$ACC3,$T1.2d // h2 -> h3 + + ushr $T0.2d,$ACC0,#26 + and $ACC0,$ACC0,$MASK.2d + ushr $T1.2d,$ACC3,#26 + and $ACC3,$ACC3,$MASK.2d + add $ACC1,$ACC1,$T0.2d // h0 -> h1 + add $ACC4,$ACC4,$T1.2d // h3 -> h4 + + //////////////////////////////////////////////////////////////// + // write the result, can be partially reduced + + st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16 + mov x4,#1 + st1 {$ACC4}[0],[$ctx] + str x4,[$ctx,#8] // set is_base2_26 + + ldr x29,[sp],#80 + .inst 0xd50323bf // autiasp + ret +.size poly1305_blocks_neon,.-poly1305_blocks_neon + +.pushsection .rodata +.align 5 +.Lzeros: +.long 0,0,0,0,0,0,0,0 +.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm" +.popsection + +.align 2 +#if !defined(__KERNEL__) && !defined(_WIN64) +.comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P +#endif +___ + +foreach (split("\n",$code)) { + s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or + s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or + (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or + (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or + (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or + (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or + (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1)); + + s/\.[124]([sd])\[/.$1\[/; + s/w#x([0-9]+)/w$1/g; + + print $_,"\n"; +} +close STDOUT; diff --git a/lib/crypto/arm64/poly1305-glue.c b/lib/crypto/arm64/poly1305-glue.c new file mode 100644 index 000000000000..31aea21ce42f --- /dev/null +++ b/lib/crypto/arm64/poly1305-glue.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64 + * + * Copyright (C) 2019 Linaro Ltd. 
<ard.biesheuvel@linaro.org> + */ + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> +#include <crypto/internal/poly1305.h> +#include <linux/cpufeature.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/unaligned.h> + +asmlinkage void poly1305_block_init_arch( + struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +EXPORT_SYMBOL_GPL(poly1305_block_init_arch); +asmlinkage void poly1305_blocks(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +EXPORT_SYMBOL_GPL(poly1305_emit_arch); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, + unsigned int len, u32 padbit) +{ + len = round_down(len, POLY1305_BLOCK_SIZE); + if (static_branch_likely(&have_neon) && likely(may_use_simd())) { + do { + unsigned int todo = min_t(unsigned int, len, SZ_4K); + + kernel_neon_begin(); + poly1305_blocks_neon(state, src, todo, padbit); + kernel_neon_end(); + + len -= todo; + src += todo; + } while (len); + } else + poly1305_blocks(state, src, len, padbit); +} +EXPORT_SYMBOL_GPL(poly1305_blocks_arch); + +bool poly1305_is_arch_optimized(void) +{ + /* We always can use at least the ARM64 scalar implementation. */ + return true; +} +EXPORT_SYMBOL(poly1305_is_arch_optimized); + +static int __init neon_poly1305_mod_init(void) +{ + if (cpu_have_named_feature(ASIMD)) + static_branch_enable(&have_neon); + return 0; +} +subsys_initcall(neon_poly1305_mod_init); + +static void __exit neon_poly1305_mod_exit(void) +{ +} +module_exit(neon_poly1305_mod_exit); + +MODULE_DESCRIPTION("Poly1305 authenticator (ARM64 optimized)"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm64/sha1-ce-core.S b/lib/crypto/arm64/sha1-ce-core.S new file mode 100644 index 000000000000..21efbbafd7d6 --- /dev/null +++ b/lib/crypto/arm64/sha1-ce-core.S @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions + * + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .arch armv8-a+crypto + + k0 .req v0 + k1 .req v1 + k2 .req v2 + k3 .req v3 + + t0 .req v4 + t1 .req v5 + + dga .req q6 + dgav .req v6 + dgb .req s7 + dgbv .req v7 + + dg0q .req q12 + dg0s .req s12 + dg0v .req v12 + dg1s .req s13 + dg1v .req v13 + dg2s .req s14 + + .macro add_only, op, ev, rc, s0, dg1 + .ifc \ev, ev + add t1.4s, v\s0\().4s, \rc\().4s + sha1h dg2s, dg0s + .ifnb \dg1 + sha1\op dg0q, \dg1, t0.4s + .else + sha1\op dg0q, dg1s, t0.4s + .endif + .else + .ifnb \s0 + add t0.4s, v\s0\().4s, \rc\().4s + .endif + sha1h dg1s, dg0s + sha1\op dg0q, dg2s, t1.4s + .endif + .endm + + .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1 + sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s + add_only \op, \ev, \rc, \s1, \dg1 + sha1su1 v\s0\().4s, v\s3\().4s + .endm + + .macro loadrc, k, val, tmp + movz \tmp, :abs_g0_nc:\val + movk \tmp, :abs_g1:\val + dup \k, \tmp + .endm + + /* + * size_t __sha1_ce_transform(struct sha1_block_state *state, + * const u8 *data, size_t nblocks); + */ +SYM_FUNC_START(__sha1_ce_transform) + /* load round constants */ + loadrc k0.4s, 0x5a827999, w6 + loadrc k1.4s, 
0x6ed9eba1, w6 + loadrc k2.4s, 0x8f1bbcdc, w6 + loadrc k3.4s, 0xca62c1d6, w6 + + /* load state */ + ld1 {dgav.4s}, [x0] + ldr dgb, [x0, #16] + + /* load input */ +0: ld1 {v8.4s-v11.4s}, [x1], #64 + sub x2, x2, #1 + +CPU_LE( rev32 v8.16b, v8.16b ) +CPU_LE( rev32 v9.16b, v9.16b ) +CPU_LE( rev32 v10.16b, v10.16b ) +CPU_LE( rev32 v11.16b, v11.16b ) + + add t0.4s, v8.4s, k0.4s + mov dg0v.16b, dgav.16b + + add_update c, ev, k0, 8, 9, 10, 11, dgb + add_update c, od, k0, 9, 10, 11, 8 + add_update c, ev, k0, 10, 11, 8, 9 + add_update c, od, k0, 11, 8, 9, 10 + add_update c, ev, k1, 8, 9, 10, 11 + + add_update p, od, k1, 9, 10, 11, 8 + add_update p, ev, k1, 10, 11, 8, 9 + add_update p, od, k1, 11, 8, 9, 10 + add_update p, ev, k1, 8, 9, 10, 11 + add_update p, od, k2, 9, 10, 11, 8 + + add_update m, ev, k2, 10, 11, 8, 9 + add_update m, od, k2, 11, 8, 9, 10 + add_update m, ev, k2, 8, 9, 10, 11 + add_update m, od, k2, 9, 10, 11, 8 + add_update m, ev, k3, 10, 11, 8, 9 + + add_update p, od, k3, 11, 8, 9, 10 + add_only p, ev, k3, 9 + add_only p, od, k3, 10 + add_only p, ev, k3, 11 + add_only p, od + + /* update state */ + add dgbv.2s, dgbv.2s, dg1v.2s + add dgav.4s, dgav.4s, dg0v.4s + + /* return early if voluntary preemption is needed */ + cond_yield 1f, x5, x6 + + /* handled all input blocks? */ + cbnz x2, 0b + + /* store new state */ +1: st1 {dgav.4s}, [x0] + str dgb, [x0, #16] + mov x0, x2 + ret +SYM_FUNC_END(__sha1_ce_transform) diff --git a/lib/crypto/arm64/sha1.h b/lib/crypto/arm64/sha1.h new file mode 100644 index 000000000000..f822563538cc --- /dev/null +++ b/lib/crypto/arm64/sha1.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-1 optimized for ARM64 + * + * Copyright 2025 Google LLC + */ +#include <asm/neon.h> +#include <asm/simd.h> +#include <linux/cpufeature.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); + +asmlinkage size_t __sha1_ce_transform(struct sha1_block_state *state, + const u8 *data, size_t nblocks); + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_ce) && likely(may_use_simd())) { + do { + size_t rem; + + kernel_neon_begin(); + rem = __sha1_ce_transform(state, data, nblocks); + kernel_neon_end(); + data += (nblocks - rem) * SHA1_BLOCK_SIZE; + nblocks = rem; + } while (nblocks); + } else { + sha1_blocks_generic(state, data, nblocks); + } +} + +#define sha1_mod_init_arch sha1_mod_init_arch +static inline void sha1_mod_init_arch(void) +{ + if (cpu_have_named_feature(SHA1)) + static_branch_enable(&have_ce); +} diff --git a/lib/crypto/arm64/sha2-armv8.pl b/lib/crypto/arm64/sha2-armv8.pl new file mode 100644 index 000000000000..35ec9ae99fe1 --- /dev/null +++ b/lib/crypto/arm64/sha2-armv8.pl @@ -0,0 +1,786 @@ +#! /usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 + +# This code is taken from the OpenSSL project but the author (Andy Polyakov) +# has relicensed it under the GPLv2. Therefore this program is free software; +# you can redistribute it and/or modify it under the terms of the GNU General +# Public License version 2 as published by the Free Software Foundation. +# +# The original headers, including the original license headers, are +# included below for completeness. + +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA256/512 for ARMv8. +# +# Performance in cycles per processed byte and improvement coefficient +# over code generated with "default" compiler: +# +# SHA256-hw SHA256(*) SHA512 +# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +# Denver 2.01 10.5 (+26%) 6.70 (+8%) +# X-Gene 20.0 (+100%) 12.8 (+300%(***)) +# Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +# +# (*) Software SHA256 results are of lesser relevance, presented +# mostly for informational purposes. +# (**) The result is a trade-off: it's possible to improve it by +# 10% (or by 1 cycle per round), but at the cost of 20% loss +# on Cortex-A53 (or by 4 cycles per round). +# (***) Super-impressive coefficients over gcc-generated code are +# indication of some compiler "pathology", most notably code +# generated with -mgeneral-regs-only is significantly faster +# and the gap is only 40-90%. +# +# October 2016. +# +# Originally it was reckoned that it makes no sense to implement NEON +# version of SHA256 for 64-bit processors. This is because performance +# improvement on most wide-spread Cortex-A5x processors was observed +# to be marginal, same on Cortex-A53 and ~10% on A57. But then it was +# observed that 32-bit NEON SHA256 performs significantly better than +# 64-bit scalar version on *some* of the more recent processors. As +# result 64-bit NEON version of SHA256 was added to provide best +# all-round performance. For example it executes ~30% faster on X-Gene +# and Mongoose. [For reference, NEON version of SHA512 is bound to +# deliver much less improvement, likely *negative* on Cortex-A5x. +# Which is why NEON support is limited to SHA256.] 
+ +$output=pop; +$flavour=pop; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open OUT,"| \"$^X\" $xlate $flavour $output"; + *STDOUT=*OUT; +} else { + open STDOUT,">$output"; +} + +if ($output =~ /512/) { + $BITS=512; + $SZ=8; + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=(1, 8, 7); + @sigma1=(19,61, 6); + $rounds=80; + $reg_t="x"; +} else { + $BITS=256; + $SZ=4; + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 7,18, 3); + @sigma1=(17,19,10); + $rounds=64; + $reg_t="w"; +} + +$func="sha${BITS}_block_data_order"; + +($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); + +@X=map("$reg_t$_",(3..15,0..2)); +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27)); +($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28)); + +sub BODY_00_xx { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +my $j=($i+1)&15; +my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]); + $T0=@X[$i+3] if ($i<11); + +$code.=<<___ if ($i<16); +#ifndef __AARCH64EB__ + rev @X[$i],@X[$i] // $i +#endif +___ +$code.=<<___ if ($i<13 && ($i&1)); + ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ +___ +$code.=<<___ if ($i==13); + ldp @X[14],@X[15],[$inp] +___ +$code.=<<___ if ($i>=14); + ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`] +___ +$code.=<<___ if ($i>0 && $i<16); + add $a,$a,$t1 // h+=Sigma0(a) +___ +$code.=<<___ if ($i>=11); + str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`] +___ +# While ARMv8 specifies merged rotate-n-logical operation such as +# 'eor x,y,z,ror#n', it was found to negatively affect performance +# on Apple A7. The reason seems to be that it requires even 'y' to +# be available earlier. This means that such merged instruction is +# not necessarily best choice on critical path... On the other hand +# Cortex-A5x handles merged instructions much better than disjoint +# rotate and logical... See (**) footnote above. 
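For reference, the scalar round bodies emitted just below are the standard SHA-2 rounds from FIPS 180-4; the Sigma/Ch/Maj terms named in the assembly comments are the usual ones. A minimal C sketch of one SHA-256 round follows (illustrative only, not part of this patch; function names are hypothetical, and SHA-512 has the same shape with 64-bit words and the $SZ==8 rotation counts):

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

/* One SHA-256 round; rotation counts match @Sigma0=(2,13,22), @Sigma1=(6,11,25). */
static void sha256_round(uint32_t s[8], uint32_t k, uint32_t w)
{
	uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
	uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
	uint32_t ch  = (e & f) ^ (~e & g);                        /* Ch(e,f,g), same as (f&e)|(g&~e) */
	uint32_t maj = (a & b) ^ (a & c) ^ (b & c);               /* Maj(a,b,c) */
	uint32_t S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25); /* Sigma1(e) */
	uint32_t S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22); /* Sigma0(a) */
	uint32_t t1  = h + S1 + ch + k + w;                       /* h += K[i] + X[i] + Sigma1 + Ch */
	uint32_t t2  = S0 + maj;

	s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;              /* d += h */
	s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;             /* h += Sigma0(a) + Maj(a,b,c) */
}

The generated assembly computes the same terms but defers some of the final additions (for example h+=Sigma0(a)) into the following round, which is what the "from the past" comments refer to.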
+$code.=<<___ if ($i<15); + ror $t0,$e,#$Sigma1[0] + add $h,$h,$t2 // h+=K[i] + eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]` + and $t1,$f,$e + bic $t2,$g,$e + add $h,$h,@X[$i&15] // h+=X[i] + orr $t1,$t1,$t2 // Ch(e,f,g) + eor $t2,$a,$b // a^b, b^c in next round + eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e) + ror $T0,$a,#$Sigma0[0] + add $h,$h,$t1 // h+=Ch(e,f,g) + eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]` + add $h,$h,$t0 // h+=Sigma1(e) + and $t3,$t3,$t2 // (b^c)&=(a^b) + add $d,$d,$h // d+=h + eor $t3,$t3,$b // Maj(a,b,c) + eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a) + add $h,$h,$t3 // h+=Maj(a,b,c) + ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round + //add $h,$h,$t1 // h+=Sigma0(a) +___ +$code.=<<___ if ($i>=15); + ror $t0,$e,#$Sigma1[0] + add $h,$h,$t2 // h+=K[i] + ror $T1,@X[($j+1)&15],#$sigma0[0] + and $t1,$f,$e + ror $T2,@X[($j+14)&15],#$sigma1[0] + bic $t2,$g,$e + ror $T0,$a,#$Sigma0[0] + add $h,$h,@X[$i&15] // h+=X[i] + eor $t0,$t0,$e,ror#$Sigma1[1] + eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1] + orr $t1,$t1,$t2 // Ch(e,f,g) + eor $t2,$a,$b // a^b, b^c in next round + eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e) + eor $T0,$T0,$a,ror#$Sigma0[1] + add $h,$h,$t1 // h+=Ch(e,f,g) + and $t3,$t3,$t2 // (b^c)&=(a^b) + eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1] + eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1]) + add $h,$h,$t0 // h+=Sigma1(e) + eor $t3,$t3,$b // Maj(a,b,c) + eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a) + eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14]) + add @X[$j],@X[$j],@X[($j+9)&15] + add $d,$d,$h // d+=h + add $h,$h,$t3 // h+=Maj(a,b,c) + ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round + add @X[$j],@X[$j],$T1 + add $h,$h,$t1 // h+=Sigma0(a) + add @X[$j],@X[$j],$T2 +___ + ($t2,$t3)=($t3,$t2); +} + +$code.=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +#endif + +.text + +.extern OPENSSL_armcap_P +.globl $func +.type $func,%function +.align 6 +$func: +___ +$code.=<<___ if ($SZ==4); +#ifndef __KERNEL__ +# ifdef __ILP32__ + ldrsw x16,.LOPENSSL_armcap_P +# else + ldr x16,.LOPENSSL_armcap_P +# endif + adr x17,.LOPENSSL_armcap_P + add x16,x16,x17 + ldr w16,[x16] + tst w16,#ARMV8_SHA256 + b.ne .Lv8_entry + tst w16,#ARMV7_NEON + b.ne .Lneon_entry +#endif +___ +$code.=<<___; + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*$SZ + + ldp $A,$B,[$ctx] // load context + ldp $C,$D,[$ctx,#2*$SZ] + ldp $E,$F,[$ctx,#4*$SZ] + add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input + ldp $G,$H,[$ctx,#6*$SZ] + adr $Ktbl,.LK$BITS + stp $ctx,$num,[x29,#96] + +.Loop: + ldp @X[0],@X[1],[$inp],#2*$SZ + ldr $t2,[$Ktbl],#$SZ // *K++ + eor $t3,$B,$C // magic seed + str $inp,[x29,#112] +___ +for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=".Loop_16_xx:\n"; +for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + cbnz $t2,.Loop_16_xx + + ldp $ctx,$num,[x29,#96] + ldr $inp,[x29,#112] + sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind + + ldp @X[0],@X[1],[$ctx] + ldp @X[2],@X[3],[$ctx,#2*$SZ] + add $inp,$inp,#14*$SZ // advance input pointer + ldp @X[4],@X[5],[$ctx,#4*$SZ] + add $A,$A,@X[0] + ldp @X[6],@X[7],[$ctx,#6*$SZ] + add $B,$B,@X[1] + add $C,$C,@X[2] + add $D,$D,@X[3] + stp $A,$B,[$ctx] + add $E,$E,@X[4] + add $F,$F,@X[5] + stp $C,$D,[$ctx,#2*$SZ] + add $G,$G,@X[6] + add $H,$H,@X[7] + cmp $inp,$num + stp $E,$F,[$ctx,#4*$SZ] + stp $G,$H,[$ctx,#6*$SZ] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*$SZ + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + ret +.size $func,.-$func + +.align 6 +.type .LK$BITS,%object +.LK$BITS: +___ +$code.=<<___ if ($SZ==8); + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + .quad 0 // terminator +___ +$code.=<<___ if ($SZ==4); + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 
0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0 //terminator +___ +$code.=<<___; +.size .LK$BITS,.-.LK$BITS +#ifndef __KERNEL__ +.align 3 +.LOPENSSL_armcap_P: +# ifdef __ILP32__ + .long OPENSSL_armcap_P-. +# else + .quad OPENSSL_armcap_P-. +# endif +#endif +.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +___ + +if ($SZ==4) { +my $Ktbl="x3"; + +my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); +my @MSG=map("v$_.16b",(4..7)); +my ($W0,$W1)=("v16.4s","v17.4s"); +my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); + +$code.=<<___; +#ifndef __KERNEL__ +.type sha256_block_armv8,%function +.align 6 +sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1.32 {$ABCD,$EFGH},[$ctx] + adr $Ktbl,.LK256 + +.Loop_hw: + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 + sub $num,$num,#1 + ld1.32 {$W0},[$Ktbl],#16 + rev32 @MSG[0],@MSG[0] + rev32 @MSG[1],@MSG[1] + rev32 @MSG[2],@MSG[2] + rev32 @MSG[3],@MSG[3] + orr $ABCD_SAVE,$ABCD,$ABCD // offload + orr $EFGH_SAVE,$EFGH,$EFGH +___ +for($i=0;$i<12;$i++) { +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + sha256su0 @MSG[0],@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + sha256su1 @MSG[0],@MSG[2],@MSG[3] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); +} +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + ld1.32 {$W0},[$Ktbl],#16 + add.i32 $W1,$W1,@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + ld1.32 {$W1},[$Ktbl] + add.i32 $W0,$W0,@MSG[2] + sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + add.i32 $W1,$W1,@MSG[3] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + add.i32 $ABCD,$ABCD,$ABCD_SAVE + add.i32 $EFGH,$EFGH,$EFGH_SAVE + + cbnz $num,.Loop_hw + + st1.32 {$ABCD,$EFGH},[$ctx] + + ldr x29,[sp],#16 + ret +.size sha256_block_armv8,.-sha256_block_armv8 +#endif +___ +} + +if ($SZ==4) { ######################################### NEON stuff # +# You'll surely note a lot of similarities with sha256-armv4 module, +# and of course it's not a coincidence. sha256-armv4 was used as +# initial template, but was adapted for ARMv8 instruction set and +# extensively re-tuned for all-round performance. 
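The NEON Xupdate transform defined below evaluates four message-schedule words per call, interleaving the vector instructions with the scalar round bodies through the eval(shift(@insns)) pattern. For orientation, here is a C sketch of the scalar SHA-256 message expansion it parallelizes (illustrative only, not part of this patch; the rotation and shift counts correspond to @sigma0=(7,18,3) and @sigma1=(17,19,10) above):

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

/* Scalar SHA-256 message schedule: expand the 16 input words to W[0..63]. */
static void sha256_schedule(uint32_t W[64])
{
	for (int t = 16; t < 64; t++) {
		uint32_t s0 = ror32(W[t - 15], 7) ^ ror32(W[t - 15], 18) ^ (W[t - 15] >> 3);  /* sigma0 */
		uint32_t s1 = ror32(W[t - 2], 17) ^ ror32(W[t - 2], 19) ^ (W[t - 2] >> 10);   /* sigma1 */

		W[t] = W[t - 16] + s0 + W[t - 7] + s1;
	}
}

In the vector code the rotates are built from ushr/sli pairs, the round constant is added to the freshly expanded words, and the result is staged on the stack through $Xfer for the scalar rounds to consume.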
+ +my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); +my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); +my $Ktbl="x16"; +my $Xfer="x17"; +my @X = map("q$_",(0..3)); +my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); +my $j=0; + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } +sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } +sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } + +sub Xupdate() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + &ext_8 ($T0,@X[0],@X[1],4); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &ext_8 ($T3,@X[2],@X[3],4); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15] + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T2,$T0,$sigma0[0]); + eval(shift(@insns)); + &ushr_32 ($T1,$T0,$sigma0[2]); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12] + eval(shift(@insns)); + &sli_32 ($T2,$T0,32-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T3,$T0,$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T1,$T1,$T2); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T3,$T0,32-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T4,$T7,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T4,$T7,32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T5,$T7,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T3,$T7,$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &sli_u32 ($T3,$T7,32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T6,@X[0],$sigma1[0]); + eval(shift(@insns)); + &ushr_32 ($T7,@X[0],$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T6,@X[0],32-$sigma1[0]); + eval(shift(@insns)); + &ushr_32 ($T5,@X[0],$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T7,$T7,$T6); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T5,@X[0],32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_32 ("{$T0}","[$Ktbl], #16"); + eval(shift(@insns)); + &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T5); + eval(shift(@insns)); + eval(shift(@insns)); + &mov (&Dhi($T5), &Dlo($T7)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 ($T0,$T0,@X[0]); + while($#insns>=1) { eval(shift(@insns)); } + &st1_32 ("{$T0}","[$Xfer], #16"); + eval(shift(@insns)); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub 
Xpreload() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_8 ("{@X[0]}","[$inp],#16"); + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_32 ("{$T0}","[$Ktbl],#16"); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &rev32 (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 ($T0,$T0,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &st1_32 ("{$T0}","[$Xfer], #16"); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. + '&add ($h,$h,$t1)', # h+=X[i]+K[i] + '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past + '&and ($t1,$f,$e)', + '&bic ($t4,$g,$e)', + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past + '&orr ($t1,$t1,$t4)', # Ch(e,f,g) + '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) + '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', + '&add ($h,$h,$t1)', # h+=Ch(e,f,g) + '&ror ($t0,$t0,"#$Sigma1[0]")', + '&eor ($t2,$a,$b)', # a^b, b^c in next round + '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) + '&add ($h,$h,$t0)', # h+=Sigma1(e) + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. + '&ldr ($t1,"[$Ktbl]") if ($j==15);'. + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) + '&ror ($t4,$t4,"#$Sigma0[0]")', + '&add ($d,$d,$h)', # d+=h + '&eor ($t3,$t3,$b)', # Maj(a,b,c) + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' + ) +} + +$code.=<<___; +#ifdef __KERNEL__ +.globl sha256_block_neon +#endif +.type sha256_block_neon,%function +.align 4 +sha256_block_neon: +.Lneon_entry: + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + sub sp,sp,#16*4 + + adr $Ktbl,.LK256 + add $num,$inp,$num,lsl#6 // len to point at the end of inp + + ld1.8 {@X[0]},[$inp], #16 + ld1.8 {@X[1]},[$inp], #16 + ld1.8 {@X[2]},[$inp], #16 + ld1.8 {@X[3]},[$inp], #16 + ld1.32 {$T0},[$Ktbl], #16 + ld1.32 {$T1},[$Ktbl], #16 + ld1.32 {$T2},[$Ktbl], #16 + ld1.32 {$T3},[$Ktbl], #16 + rev32 @X[0],@X[0] // yes, even on + rev32 @X[1],@X[1] // big-endian + rev32 @X[2],@X[2] + rev32 @X[3],@X[3] + mov $Xfer,sp + add.32 $T0,$T0,@X[0] + add.32 $T1,$T1,@X[1] + add.32 $T2,$T2,@X[2] + st1.32 {$T0-$T1},[$Xfer], #32 + add.32 $T3,$T3,@X[3] + st1.32 {$T2-$T3},[$Xfer] + sub $Xfer,$Xfer,#32 + + ldp $A,$B,[$ctx] + ldp $C,$D,[$ctx,#8] + ldp $E,$F,[$ctx,#16] + ldp $G,$H,[$ctx,#24] + ldr $t1,[sp,#0] + mov $t2,wzr + eor $t3,$B,$C + mov $t4,wzr + b .L_00_48 + +.align 4 +.L_00_48: +___ + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); +$code.=<<___; + cmp $t1,#0 // check for K256 terminator + ldr $t1,[sp,#0] + sub $Xfer,$Xfer,#64 + bne .L_00_48 + + sub $Ktbl,$Ktbl,#256 // rewind $Ktbl + cmp $inp,$num + mov $Xfer, #64 + csel $Xfer, $Xfer, xzr, eq + sub $inp,$inp,$Xfer // avoid SEGV + mov $Xfer,sp +___ + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); +$code.=<<___; + add $A,$A,$t4 // h+=Sigma0(a) from the past + ldp $t0,$t1,[$ctx,#0] + add $A,$A,$t2 // h+=Maj(a,b,c) from the past + ldp $t2,$t3,[$ctx,#8] + add $A,$A,$t0 // accumulate + add $B,$B,$t1 + ldp $t0,$t1,[$ctx,#16] + add $C,$C,$t2 + add $D,$D,$t3 + ldp $t2,$t3,[$ctx,#24] + add $E,$E,$t0 + add $F,$F,$t1 + ldr $t1,[sp,#0] + stp $A,$B,[$ctx,#0] + add $G,$G,$t2 + mov $t2,wzr + stp $C,$D,[$ctx,#8] + add $H,$H,$t3 + stp $E,$F,[$ctx,#16] + eor $t3,$B,$C + stp $G,$H,[$ctx,#24] + mov $t4,wzr + mov $Xfer,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret +.size sha256_block_neon,.-sha256_block_neon +___ +} + +$code.=<<___; +#ifndef __KERNEL__ +.comm OPENSSL_armcap_P,4,4 +#endif +___ + +{ my %opcode = ( + "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, + "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); + + sub unsha256 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + +foreach(split("\n",$code)) { + + s/\`([^\`]*)\`/eval($1)/ge; + + s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; + + s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers + + s/\.[ui]?8(\s)/$1/; + s/\.\w?32\b// and s/\.16b/\.4s/g; + m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; + + print $_,"\n"; +} + +close STDOUT; diff --git a/lib/crypto/arm64/sha256-ce.S b/lib/crypto/arm64/sha256-ce.S new file mode 100644 index 000000000000..b99d9589c421 --- /dev/null +++ b/lib/crypto/arm64/sha256-ce.S @@ -0,0 +1,136 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions + * + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .arch armv8-a+crypto + + dga .req q20 + dgav .req v20 + dgb .req q21 + dgbv .req v21 + + t0 .req v22 + t1 .req v23 + + dg0q .req q24 + dg0v .req v24 + dg1q .req q25 + dg1v .req v25 + dg2q .req q26 + dg2v .req v26 + + .macro add_only, ev, rc, s0 + mov dg2v.16b, dg0v.16b + .ifeq \ev + add 
t1.4s, v\s0\().4s, \rc\().4s + sha256h dg0q, dg1q, t0.4s + sha256h2 dg1q, dg2q, t0.4s + .else + .ifnb \s0 + add t0.4s, v\s0\().4s, \rc\().4s + .endif + sha256h dg0q, dg1q, t1.4s + sha256h2 dg1q, dg2q, t1.4s + .endif + .endm + + .macro add_update, ev, rc, s0, s1, s2, s3 + sha256su0 v\s0\().4s, v\s1\().4s + add_only \ev, \rc, \s1 + sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s + .endm + + /* + * The SHA-256 round constants + */ + .section ".rodata", "a" + .align 4 +.Lsha2_rcon: + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + + /* + * size_t __sha256_ce_transform(struct sha256_block_state *state, + * const u8 *data, size_t nblocks); + */ + .text +SYM_FUNC_START(__sha256_ce_transform) + /* load round constants */ + adr_l x8, .Lsha2_rcon + ld1 { v0.4s- v3.4s}, [x8], #64 + ld1 { v4.4s- v7.4s}, [x8], #64 + ld1 { v8.4s-v11.4s}, [x8], #64 + ld1 {v12.4s-v15.4s}, [x8] + + /* load state */ + ld1 {dgav.4s, dgbv.4s}, [x0] + + /* load input */ +0: ld1 {v16.4s-v19.4s}, [x1], #64 + sub x2, x2, #1 + +CPU_LE( rev32 v16.16b, v16.16b ) +CPU_LE( rev32 v17.16b, v17.16b ) +CPU_LE( rev32 v18.16b, v18.16b ) +CPU_LE( rev32 v19.16b, v19.16b ) + + add t0.4s, v16.4s, v0.4s + mov dg0v.16b, dgav.16b + mov dg1v.16b, dgbv.16b + + add_update 0, v1, 16, 17, 18, 19 + add_update 1, v2, 17, 18, 19, 16 + add_update 0, v3, 18, 19, 16, 17 + add_update 1, v4, 19, 16, 17, 18 + + add_update 0, v5, 16, 17, 18, 19 + add_update 1, v6, 17, 18, 19, 16 + add_update 0, v7, 18, 19, 16, 17 + add_update 1, v8, 19, 16, 17, 18 + + add_update 0, v9, 16, 17, 18, 19 + add_update 1, v10, 17, 18, 19, 16 + add_update 0, v11, 18, 19, 16, 17 + add_update 1, v12, 19, 16, 17, 18 + + add_only 0, v13, 17 + add_only 1, v14, 18 + add_only 0, v15, 19 + add_only 1 + + /* update state */ + add dgav.4s, dgav.4s, dg0v.4s + add dgbv.4s, dgbv.4s, dg1v.4s + + /* return early if voluntary preemption is needed */ + cond_yield 1f, x5, x6 + + /* handled all input blocks? 
*/ + cbnz x2, 0b + + /* store new state */ +1: st1 {dgav.4s, dgbv.4s}, [x0] + mov x0, x2 + ret +SYM_FUNC_END(__sha256_ce_transform) diff --git a/lib/crypto/arm64/sha256.h b/lib/crypto/arm64/sha256.h new file mode 100644 index 000000000000..a211966c124a --- /dev/null +++ b/lib/crypto/arm64/sha256.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-256 optimized for ARM64 + * + * Copyright 2025 Google LLC + */ +#include <asm/neon.h> +#include <crypto/internal/simd.h> +#include <linux/cpufeature.h> + +asmlinkage void sha256_block_data_order(struct sha256_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha256_block_neon(struct sha256_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage size_t __sha256_ce_transform(struct sha256_block_state *state, + const u8 *data, size_t nblocks); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); + +static void sha256_blocks(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon) && crypto_simd_usable()) { + if (static_branch_likely(&have_ce)) { + do { + size_t rem; + + kernel_neon_begin(); + rem = __sha256_ce_transform(state, + data, nblocks); + kernel_neon_end(); + data += (nblocks - rem) * SHA256_BLOCK_SIZE; + nblocks = rem; + } while (nblocks); + } else { + kernel_neon_begin(); + sha256_block_neon(state, data, nblocks); + kernel_neon_end(); + } + } else { + sha256_block_data_order(state, data, nblocks); + } +} + +#ifdef CONFIG_KERNEL_MODE_NEON +#define sha256_mod_init_arch sha256_mod_init_arch +static inline void sha256_mod_init_arch(void) +{ + if (cpu_have_named_feature(ASIMD)) { + static_branch_enable(&have_neon); + if (cpu_have_named_feature(SHA2)) + static_branch_enable(&have_ce); + } +} +#endif /* CONFIG_KERNEL_MODE_NEON */ diff --git a/lib/crypto/arm64/sha512-ce-core.S b/lib/crypto/arm64/sha512-ce-core.S new file mode 100644 index 000000000000..22f1ded89bc8 --- /dev/null +++ b/lib/crypto/arm64/sha512-ce-core.S @@ -0,0 +1,197 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * sha512-ce-core.S - core SHA-384/SHA-512 transform using v8 Crypto Extensions + * + * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + /* + * We have to specify the "sha3" feature here, since the GNU and clang + * assemblers both consider the SHA-512 instructions to be part of the + * "sha3" feature. (Except binutils 2.30 through 2.42, which used + * "sha2". But "sha3" implies "sha2", so "sha3" still works in those + * versions.) "sha3" doesn't make a lot of sense, since SHA-512 is part + * of the SHA-2 family of algorithms, and also the Arm Architecture + * Reference Manual defines FEAT_SHA512 and FEAT_SHA3 separately. + * Regardless, we must use "sha3" to be compatible with the assemblers. 
+ */ + .arch armv8-a+sha3 + + /* + * The SHA-512 round constants + */ + .section ".rodata", "a" + .align 4 +.Lsha512_rcon: + .quad 0x428a2f98d728ae22, 0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538, 0x59f111f1b605d019 + .quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242, 0x12835b0145706fbe + .quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235, 0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 + .quad 0x983e5152ee66dfab, 0xa831c66d2db43210 + .quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725 + .quad 0x06ca6351e003826f, 0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df + .quad 0x650a73548baf63de, 0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6, 0x92722c851482353b + .quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791, 0xc76c51a30654be30 + .quad 0xd192e819d6ef5218, 0xd69906245565a910 + .quad 0xf40e35855771202a, 0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc, 0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec + .quad 0x90befffa23631e28, 0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b + .quad 0xca273eceea26619c, 0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae, 0x1b710b35131c471b + .quad 0x28db77f523047d84, 0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 + + .macro dround, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4 + .ifnb \rc1 + ld1 {v\rc1\().2d}, [x4], #16 + .endif + add v5.2d, v\rc0\().2d, v\in0\().2d + ext v6.16b, v\i2\().16b, v\i3\().16b, #8 + ext v5.16b, v5.16b, v5.16b, #8 + ext v7.16b, v\i1\().16b, v\i2\().16b, #8 + add v\i3\().2d, v\i3\().2d, v5.2d + .ifnb \in1 + ext v5.16b, v\in3\().16b, v\in4\().16b, #8 + sha512su0 v\in0\().2d, v\in1\().2d + .endif + sha512h q\i3, q6, v7.2d + .ifnb \in1 + sha512su1 v\in0\().2d, v\in2\().2d, v5.2d + .endif + add v\i4\().2d, v\i1\().2d, v\i3\().2d + sha512h2 q\i3, q\i1, v\i0\().2d + .endm + + /* + * size_t __sha512_ce_transform(struct sha512_block_state *state, + * const u8 *data, size_t nblocks); + */ + .text +SYM_FUNC_START(__sha512_ce_transform) + /* load state */ + ld1 {v8.2d-v11.2d}, [x0] + + /* load first 4 round constants */ + adr_l x3, .Lsha512_rcon + ld1 {v20.2d-v23.2d}, [x3], #64 + + /* load input */ +0: ld1 {v12.2d-v15.2d}, [x1], #64 + ld1 {v16.2d-v19.2d}, [x1], #64 + sub x2, x2, #1 + +CPU_LE( rev64 v12.16b, v12.16b ) +CPU_LE( rev64 v13.16b, v13.16b ) +CPU_LE( rev64 v14.16b, v14.16b ) +CPU_LE( rev64 v15.16b, v15.16b ) +CPU_LE( rev64 v16.16b, v16.16b ) +CPU_LE( rev64 v17.16b, v17.16b ) +CPU_LE( rev64 v18.16b, v18.16b ) +CPU_LE( rev64 v19.16b, v19.16b ) + + mov x4, x3 // rc pointer + + mov v0.16b, v8.16b + mov v1.16b, v9.16b + mov v2.16b, v10.16b + mov v3.16b, v11.16b + + // v0 ab cd -- ef gh ab + // v1 cd -- ef gh ab cd + // v2 ef gh ab cd -- ef + // v3 gh ab cd -- ef gh + // v4 -- ef gh ab cd -- + + dround 0, 1, 
2, 3, 4, 20, 24, 12, 13, 19, 16, 17 + dround 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18 + dround 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19 + dround 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12 + dround 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13 + + dround 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14 + dround 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15 + dround 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16 + dround 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17 + dround 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18 + + dround 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19 + dround 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12 + dround 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13 + dround 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14 + dround 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15 + + dround 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16 + dround 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17 + dround 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18 + dround 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19 + dround 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12 + + dround 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13 + dround 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14 + dround 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15 + dround 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16 + dround 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17 + + dround 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18 + dround 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19 + dround 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12 + dround 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13 + dround 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14 + + dround 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15 + dround 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16 + dround 2, 3, 1, 4, 0, 28, 24, 12 + dround 4, 2, 0, 1, 3, 29, 25, 13 + dround 1, 4, 3, 0, 2, 30, 26, 14 + + dround 0, 1, 2, 3, 4, 31, 27, 15 + dround 3, 0, 4, 2, 1, 24, , 16 + dround 2, 3, 1, 4, 0, 25, , 17 + dround 4, 2, 0, 1, 3, 26, , 18 + dround 1, 4, 3, 0, 2, 27, , 19 + + /* update state */ + add v8.2d, v8.2d, v0.2d + add v9.2d, v9.2d, v1.2d + add v10.2d, v10.2d, v2.2d + add v11.2d, v11.2d, v3.2d + + cond_yield 3f, x4, x5 + /* handled all input blocks? 
*/ + cbnz x2, 0b + + /* store new state */ +3: st1 {v8.2d-v11.2d}, [x0] + mov x0, x2 + ret +SYM_FUNC_END(__sha512_ce_transform) diff --git a/lib/crypto/arm64/sha512.h b/lib/crypto/arm64/sha512.h new file mode 100644 index 000000000000..6abb40b467f2 --- /dev/null +++ b/lib/crypto/arm64/sha512.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * arm64-optimized SHA-512 block function + * + * Copyright 2025 Google LLC + */ + +#include <asm/neon.h> +#include <crypto/internal/simd.h> +#include <linux/cpufeature.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha512_insns); + +asmlinkage void sha512_block_data_order(struct sha512_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage size_t __sha512_ce_transform(struct sha512_block_state *state, + const u8 *data, size_t nblocks); + +static void sha512_blocks(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_sha512_insns) && + likely(crypto_simd_usable())) { + do { + size_t rem; + + kernel_neon_begin(); + rem = __sha512_ce_transform(state, data, nblocks); + kernel_neon_end(); + data += (nblocks - rem) * SHA512_BLOCK_SIZE; + nblocks = rem; + } while (nblocks); + } else { + sha512_block_data_order(state, data, nblocks); + } +} + +#ifdef CONFIG_KERNEL_MODE_NEON +#define sha512_mod_init_arch sha512_mod_init_arch +static inline void sha512_mod_init_arch(void) +{ + if (cpu_have_named_feature(SHA512)) + static_branch_enable(&have_sha512_insns); +} +#endif /* CONFIG_KERNEL_MODE_NEON */ diff --git a/lib/crypto/blake2s-generic.c b/lib/crypto/blake2s-generic.c index 09682136b57c..9828176a2efe 100644 --- a/lib/crypto/blake2s-generic.c +++ b/lib/crypto/blake2s-generic.c @@ -9,11 +9,12 @@ */ #include <crypto/internal/blake2s.h> -#include <linux/types.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/init.h> #include <linux/bug.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/types.h> #include <linux/unaligned.h> static const u8 blake2s_sigma[10][16] = { diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c index b0f9a678300b..f6ec68c3dcda 100644 --- a/lib/crypto/blake2s.c +++ b/lib/crypto/blake2s.c @@ -9,12 +9,13 @@ */ #include <crypto/internal/blake2s.h> -#include <linux/types.h> -#include <linux/string.h> +#include <linux/bug.h> +#include <linux/export.h> +#include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/init.h> -#include <linux/bug.h> +#include <linux/string.h> +#include <linux/types.h> static inline void blake2s_set_lastblock(struct blake2s_state *state) { diff --git a/lib/crypto/chacha.c b/lib/crypto/chacha.c index ced87dd31a97..77f68de71066 100644 --- a/lib/crypto/chacha.c +++ b/lib/crypto/chacha.c @@ -5,13 +5,13 @@ * Copyright (C) 2015 Martin Willi */ +#include <crypto/chacha.h> +#include <linux/bitops.h> #include <linux/bug.h> -#include <linux/kernel.h> #include <linux/export.h> -#include <linux/bitops.h> +#include <linux/kernel.h> #include <linux/string.h> #include <linux/unaligned.h> -#include <crypto/chacha.h> static void chacha_permute(struct chacha_state *state, int nrounds) { diff --git a/lib/crypto/chacha20poly1305.c b/lib/crypto/chacha20poly1305.c index e29eed49a5a1..0b49d6aedefd 100644 --- a/lib/crypto/chacha20poly1305.c +++ b/lib/crypto/chacha20poly1305.c @@ -7,16 +7,16 @@ * Information: https://tools.ietf.org/html/rfc8439 */ -#include 
<crypto/chacha20poly1305.h> #include <crypto/chacha.h> +#include <crypto/chacha20poly1305.h> #include <crypto/poly1305.h> #include <crypto/utils.h> - -#include <linux/unaligned.h> -#include <linux/kernel.h> +#include <linux/export.h> #include <linux/init.h> +#include <linux/kernel.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/unaligned.h> static void chacha_load_key(u32 *k, const u8 *in) { diff --git a/lib/crypto/curve25519-generic.c b/lib/crypto/curve25519-generic.c index de7c99172fa2..f8aa70c9f559 100644 --- a/lib/crypto/curve25519-generic.c +++ b/lib/crypto/curve25519-generic.c @@ -10,6 +10,7 @@ */ #include <crypto/curve25519.h> +#include <linux/export.h> #include <linux/module.h> const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 }; diff --git a/lib/crypto/des.c b/lib/crypto/des.c index d3423b34a8e9..a906070136dc 100644 --- a/lib/crypto/des.c +++ b/lib/crypto/des.c @@ -7,21 +7,20 @@ * Copyright (c) 2005 Dag Arne Osvik <da@osvik.no> */ +#include <crypto/des.h> +#include <crypto/internal/des.h> #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/crypto.h> #include <linux/errno.h> +#include <linux/export.h> #include <linux/fips.h> #include <linux/init.h> #include <linux/module.h> #include <linux/string.h> #include <linux/types.h> - #include <linux/unaligned.h> -#include <crypto/des.h> -#include <crypto/internal/des.h> - #define ROL(x, r) ((x) = rol32((x), (r))) #define ROR(x, r) ((x) = ror32((x), (r))) diff --git a/lib/crypto/gf128mul.c b/lib/crypto/gf128mul.c index fbe72cb3453a..2a34590fe3f1 100644 --- a/lib/crypto/gf128mul.c +++ b/lib/crypto/gf128mul.c @@ -49,6 +49,7 @@ */ #include <crypto/gf128mul.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> diff --git a/lib/crypto/hash_info.c b/lib/crypto/hash_info.c new file mode 100644 index 000000000000..9a467638c971 --- /dev/null +++ b/lib/crypto/hash_info.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Hash Info: Hash algorithms information + * + * Copyright (c) 2013 Dmitry Kasatkin <d.kasatkin@samsung.com> + */ + +#include <linux/export.h> +#include <crypto/hash_info.h> + +const char *const hash_algo_name[HASH_ALGO__LAST] = { + [HASH_ALGO_MD4] = "md4", + [HASH_ALGO_MD5] = "md5", + [HASH_ALGO_SHA1] = "sha1", + [HASH_ALGO_RIPE_MD_160] = "rmd160", + [HASH_ALGO_SHA256] = "sha256", + [HASH_ALGO_SHA384] = "sha384", + [HASH_ALGO_SHA512] = "sha512", + [HASH_ALGO_SHA224] = "sha224", + [HASH_ALGO_RIPE_MD_128] = "rmd128", + [HASH_ALGO_RIPE_MD_256] = "rmd256", + [HASH_ALGO_RIPE_MD_320] = "rmd320", + [HASH_ALGO_WP_256] = "wp256", + [HASH_ALGO_WP_384] = "wp384", + [HASH_ALGO_WP_512] = "wp512", + [HASH_ALGO_TGR_128] = "tgr128", + [HASH_ALGO_TGR_160] = "tgr160", + [HASH_ALGO_TGR_192] = "tgr192", + [HASH_ALGO_SM3_256] = "sm3", + [HASH_ALGO_STREEBOG_256] = "streebog256", + [HASH_ALGO_STREEBOG_512] = "streebog512", + [HASH_ALGO_SHA3_256] = "sha3-256", + [HASH_ALGO_SHA3_384] = "sha3-384", + [HASH_ALGO_SHA3_512] = "sha3-512", +}; +EXPORT_SYMBOL_GPL(hash_algo_name); + +const int hash_digest_size[HASH_ALGO__LAST] = { + [HASH_ALGO_MD4] = MD5_DIGEST_SIZE, + [HASH_ALGO_MD5] = MD5_DIGEST_SIZE, + [HASH_ALGO_SHA1] = SHA1_DIGEST_SIZE, + [HASH_ALGO_RIPE_MD_160] = RMD160_DIGEST_SIZE, + [HASH_ALGO_SHA256] = SHA256_DIGEST_SIZE, + [HASH_ALGO_SHA384] = SHA384_DIGEST_SIZE, + [HASH_ALGO_SHA512] = SHA512_DIGEST_SIZE, + [HASH_ALGO_SHA224] = SHA224_DIGEST_SIZE, + [HASH_ALGO_RIPE_MD_128] = RMD128_DIGEST_SIZE, + [HASH_ALGO_RIPE_MD_256] = 
RMD256_DIGEST_SIZE, + [HASH_ALGO_RIPE_MD_320] = RMD320_DIGEST_SIZE, + [HASH_ALGO_WP_256] = WP256_DIGEST_SIZE, + [HASH_ALGO_WP_384] = WP384_DIGEST_SIZE, + [HASH_ALGO_WP_512] = WP512_DIGEST_SIZE, + [HASH_ALGO_TGR_128] = TGR128_DIGEST_SIZE, + [HASH_ALGO_TGR_160] = TGR160_DIGEST_SIZE, + [HASH_ALGO_TGR_192] = TGR192_DIGEST_SIZE, + [HASH_ALGO_SM3_256] = SM3256_DIGEST_SIZE, + [HASH_ALGO_STREEBOG_256] = STREEBOG256_DIGEST_SIZE, + [HASH_ALGO_STREEBOG_512] = STREEBOG512_DIGEST_SIZE, + [HASH_ALGO_SHA3_256] = SHA3_256_DIGEST_SIZE, + [HASH_ALGO_SHA3_384] = SHA3_384_DIGEST_SIZE, + [HASH_ALGO_SHA3_512] = SHA3_512_DIGEST_SIZE, +}; +EXPORT_SYMBOL_GPL(hash_digest_size); diff --git a/lib/crypto/libchacha.c b/lib/crypto/libchacha.c index ebcca381e248..26862ad90a96 100644 --- a/lib/crypto/libchacha.c +++ b/lib/crypto/libchacha.c @@ -5,12 +5,11 @@ * Copyright (C) 2015 Martin Willi */ -#include <linux/kernel.h> -#include <linux/export.h> -#include <linux/module.h> - #include <crypto/algapi.h> // for crypto_xor_cpy #include <crypto/chacha.h> +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/module.h> void chacha_crypt_generic(struct chacha_state *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds) diff --git a/lib/crypto/memneq.c b/lib/crypto/memneq.c index a2afd10349c9..44daacb8cb51 100644 --- a/lib/crypto/memneq.c +++ b/lib/crypto/memneq.c @@ -59,9 +59,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include <linux/unaligned.h> #include <crypto/algapi.h> +#include <linux/export.h> #include <linux/module.h> +#include <linux/unaligned.h> /* Generic path for arbitrary size */ static inline unsigned long diff --git a/lib/crypto/mips/.gitignore b/lib/crypto/mips/.gitignore new file mode 100644 index 000000000000..0d47d4f21c6d --- /dev/null +++ b/lib/crypto/mips/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +poly1305-core.S diff --git a/lib/crypto/mips/Kconfig b/lib/crypto/mips/Kconfig new file mode 100644 index 000000000000..0670a170c1be --- /dev/null +++ b/lib/crypto/mips/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_CHACHA_MIPS + tristate + depends on CPU_MIPS32_R2 + default CRYPTO_LIB_CHACHA + select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_POLY1305_MIPS + tristate + default CRYPTO_LIB_POLY1305 + select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/mips/Makefile b/lib/crypto/mips/Makefile new file mode 100644 index 000000000000..804488c7aded --- /dev/null +++ b/lib/crypto/mips/Makefile @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o +chacha-mips-y := chacha-core.o chacha-glue.o +AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots + +obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o +poly1305-mips-y := poly1305-core.o poly1305-glue.o + +perlasm-flavour-$(CONFIG_32BIT) := o32 +perlasm-flavour-$(CONFIG_64BIT) := 64 + +quiet_cmd_perlasm = PERLASM $@ + cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@) + +$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE + $(call if_changed,perlasm) + +targets += poly1305-core.S diff --git a/lib/crypto/mips/chacha-core.S b/lib/crypto/mips/chacha-core.S new file mode 100644 index 000000000000..706aeb850fb0 --- /dev/null +++ b/lib/crypto/mips/chacha-core.S @@ -0,0 +1,491 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* + * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved. + * Copyright (C) 2015-2019 Jason A. 
Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#define MASK_U32 0x3c +#define CHACHA20_BLOCK_SIZE 64 +#define STACK_SIZE 32 + +#define X0 $t0 +#define X1 $t1 +#define X2 $t2 +#define X3 $t3 +#define X4 $t4 +#define X5 $t5 +#define X6 $t6 +#define X7 $t7 +#define X8 $t8 +#define X9 $t9 +#define X10 $v1 +#define X11 $s6 +#define X12 $s5 +#define X13 $s4 +#define X14 $s3 +#define X15 $s2 +/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */ +#define T0 $s1 +#define T1 $s0 +#define T(n) T ## n +#define X(n) X ## n + +/* Input arguments */ +#define STATE $a0 +#define OUT $a1 +#define IN $a2 +#define BYTES $a3 + +/* Output argument */ +/* NONCE[0] is kept in a register and not in memory. + * We don't want to touch original value in memory. + * Must be incremented every loop iteration. + */ +#define NONCE_0 $v0 + +/* SAVED_X and SAVED_CA are set in the jump table. + * Use regs which are overwritten on exit else we don't leak clear data. + * They are used to handling the last bytes which are not multiple of 4. + */ +#define SAVED_X X15 +#define SAVED_CA $s7 + +#define IS_UNALIGNED $s7 + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define MSB 0 +#define LSB 3 +#define CPU_TO_LE32(n) \ + wsbh n, n; \ + rotr n, 16; +#else +#define MSB 3 +#define LSB 0 +#define CPU_TO_LE32(n) +#endif + +#define FOR_EACH_WORD(x) \ + x( 0); \ + x( 1); \ + x( 2); \ + x( 3); \ + x( 4); \ + x( 5); \ + x( 6); \ + x( 7); \ + x( 8); \ + x( 9); \ + x(10); \ + x(11); \ + x(12); \ + x(13); \ + x(14); \ + x(15); + +#define FOR_EACH_WORD_REV(x) \ + x(15); \ + x(14); \ + x(13); \ + x(12); \ + x(11); \ + x(10); \ + x( 9); \ + x( 8); \ + x( 7); \ + x( 6); \ + x( 5); \ + x( 4); \ + x( 3); \ + x( 2); \ + x( 1); \ + x( 0); + +#define PLUS_ONE_0 1 +#define PLUS_ONE_1 2 +#define PLUS_ONE_2 3 +#define PLUS_ONE_3 4 +#define PLUS_ONE_4 5 +#define PLUS_ONE_5 6 +#define PLUS_ONE_6 7 +#define PLUS_ONE_7 8 +#define PLUS_ONE_8 9 +#define PLUS_ONE_9 10 +#define PLUS_ONE_10 11 +#define PLUS_ONE_11 12 +#define PLUS_ONE_12 13 +#define PLUS_ONE_13 14 +#define PLUS_ONE_14 15 +#define PLUS_ONE_15 16 +#define PLUS_ONE(x) PLUS_ONE_ ## x +#define _CONCAT3(a,b,c) a ## b ## c +#define CONCAT3(a,b,c) _CONCAT3(a,b,c) + +#define STORE_UNALIGNED(x) \ +CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \ + .if (x != 12); \ + lw T0, (x*4)(STATE); \ + .endif; \ + lwl T1, (x*4)+MSB ## (IN); \ + lwr T1, (x*4)+LSB ## (IN); \ + .if (x == 12); \ + addu X ## x, NONCE_0; \ + .else; \ + addu X ## x, T0; \ + .endif; \ + CPU_TO_LE32(X ## x); \ + xor X ## x, T1; \ + swl X ## x, (x*4)+MSB ## (OUT); \ + swr X ## x, (x*4)+LSB ## (OUT); + +#define STORE_ALIGNED(x) \ +CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \ + .if (x != 12); \ + lw T0, (x*4)(STATE); \ + .endif; \ + lw T1, (x*4) ## (IN); \ + .if (x == 12); \ + addu X ## x, NONCE_0; \ + .else; \ + addu X ## x, T0; \ + .endif; \ + CPU_TO_LE32(X ## x); \ + xor X ## x, T1; \ + sw X ## x, (x*4) ## (OUT); + +/* Jump table macro. + * Used for setup and handling the last bytes, which are not multiple of 4. + * X15 is free to store Xn + * Every jumptable entry must be equal in size. 
+ */ +#define JMPTBL_ALIGNED(x) \ +.Lchacha_mips_jmptbl_aligned_ ## x: ; \ + .set noreorder; \ + b .Lchacha_mips_xor_aligned_ ## x ## _b; \ + .if (x == 12); \ + addu SAVED_X, X ## x, NONCE_0; \ + .else; \ + addu SAVED_X, X ## x, SAVED_CA; \ + .endif; \ + .set reorder + +#define JMPTBL_UNALIGNED(x) \ +.Lchacha_mips_jmptbl_unaligned_ ## x: ; \ + .set noreorder; \ + b .Lchacha_mips_xor_unaligned_ ## x ## _b; \ + .if (x == 12); \ + addu SAVED_X, X ## x, NONCE_0; \ + .else; \ + addu SAVED_X, X ## x, SAVED_CA; \ + .endif; \ + .set reorder + +#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \ + addu X(A), X(K); \ + addu X(B), X(L); \ + addu X(C), X(M); \ + addu X(D), X(N); \ + xor X(V), X(A); \ + xor X(W), X(B); \ + xor X(Y), X(C); \ + xor X(Z), X(D); \ + rotr X(V), 32 - S; \ + rotr X(W), 32 - S; \ + rotr X(Y), 32 - S; \ + rotr X(Z), 32 - S; + +.text +.set reorder +.set noat +.globl chacha_crypt_arch +.ent chacha_crypt_arch +chacha_crypt_arch: + .frame $sp, STACK_SIZE, $ra + + /* Load number of rounds */ + lw $at, 16($sp) + + addiu $sp, -STACK_SIZE + + /* Return bytes = 0. */ + beqz BYTES, .Lchacha_mips_end + + lw NONCE_0, 48(STATE) + + /* Save s0-s7 */ + sw $s0, 0($sp) + sw $s1, 4($sp) + sw $s2, 8($sp) + sw $s3, 12($sp) + sw $s4, 16($sp) + sw $s5, 20($sp) + sw $s6, 24($sp) + sw $s7, 28($sp) + + /* Test IN or OUT is unaligned. + * IS_UNALIGNED = ( IN | OUT ) & 0x00000003 + */ + or IS_UNALIGNED, IN, OUT + andi IS_UNALIGNED, 0x3 + + b .Lchacha_rounds_start + +.align 4 +.Loop_chacha_rounds: + addiu IN, CHACHA20_BLOCK_SIZE + addiu OUT, CHACHA20_BLOCK_SIZE + addiu NONCE_0, 1 + +.Lchacha_rounds_start: + lw X0, 0(STATE) + lw X1, 4(STATE) + lw X2, 8(STATE) + lw X3, 12(STATE) + + lw X4, 16(STATE) + lw X5, 20(STATE) + lw X6, 24(STATE) + lw X7, 28(STATE) + lw X8, 32(STATE) + lw X9, 36(STATE) + lw X10, 40(STATE) + lw X11, 44(STATE) + + move X12, NONCE_0 + lw X13, 52(STATE) + lw X14, 56(STATE) + lw X15, 60(STATE) + +.Loop_chacha_xor_rounds: + addiu $at, -2 + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); + bnez $at, .Loop_chacha_xor_rounds + + addiu BYTES, -(CHACHA20_BLOCK_SIZE) + + /* Is data src/dst unaligned? Jump */ + bnez IS_UNALIGNED, .Loop_chacha_unaligned + + /* Set number rounds here to fill delayslot. */ + lw $at, (STACK_SIZE+16)($sp) + + /* BYTES < 0, it has no full block. */ + bltz BYTES, .Lchacha_mips_no_full_block_aligned + + FOR_EACH_WORD_REV(STORE_ALIGNED) + + /* BYTES > 0? Loop again. */ + bgtz BYTES, .Loop_chacha_rounds + + /* Place this here to fill delay slot */ + addiu NONCE_0, 1 + + /* BYTES < 0? 
Handle last bytes */ + bltz BYTES, .Lchacha_mips_xor_bytes + +.Lchacha_mips_xor_done: + /* Restore used registers */ + lw $s0, 0($sp) + lw $s1, 4($sp) + lw $s2, 8($sp) + lw $s3, 12($sp) + lw $s4, 16($sp) + lw $s5, 20($sp) + lw $s6, 24($sp) + lw $s7, 28($sp) + + /* Write NONCE_0 back to right location in state */ + sw NONCE_0, 48(STATE) + +.Lchacha_mips_end: + addiu $sp, STACK_SIZE + jr $ra + +.Lchacha_mips_no_full_block_aligned: + /* Restore the offset on BYTES */ + addiu BYTES, CHACHA20_BLOCK_SIZE + + /* Get number of full WORDS */ + andi $at, BYTES, MASK_U32 + + /* Load upper half of jump table addr */ + lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0) + + /* Calculate lower half jump table offset */ + ins T0, $at, 1, 6 + + /* Add offset to STATE */ + addu T1, STATE, $at + + /* Add lower half jump table addr */ + addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0) + + /* Read value from STATE */ + lw SAVED_CA, 0(T1) + + /* Store remaining bytecounter as negative value */ + subu BYTES, $at, BYTES + + jr T0 + + /* Jump table */ + FOR_EACH_WORD(JMPTBL_ALIGNED) + + +.Loop_chacha_unaligned: + /* Set number rounds here to fill delayslot. */ + lw $at, (STACK_SIZE+16)($sp) + + /* BYTES > 0, it has no full block. */ + bltz BYTES, .Lchacha_mips_no_full_block_unaligned + + FOR_EACH_WORD_REV(STORE_UNALIGNED) + + /* BYTES > 0? Loop again. */ + bgtz BYTES, .Loop_chacha_rounds + + /* Write NONCE_0 back to right location in state */ + sw NONCE_0, 48(STATE) + + .set noreorder + /* Fall through to byte handling */ + bgez BYTES, .Lchacha_mips_xor_done +.Lchacha_mips_xor_unaligned_0_b: +.Lchacha_mips_xor_aligned_0_b: + /* Place this here to fill delay slot */ + addiu NONCE_0, 1 + .set reorder + +.Lchacha_mips_xor_bytes: + addu IN, $at + addu OUT, $at + /* First byte */ + lbu T1, 0(IN) + addiu $at, BYTES, 1 + xor T1, SAVED_X + sb T1, 0(OUT) + beqz $at, .Lchacha_mips_xor_done + /* Second byte */ + lbu T1, 1(IN) + addiu $at, BYTES, 2 + rotr SAVED_X, 8 + xor T1, SAVED_X + sb T1, 1(OUT) + beqz $at, .Lchacha_mips_xor_done + /* Third byte */ + lbu T1, 2(IN) + rotr SAVED_X, 8 + xor T1, SAVED_X + sb T1, 2(OUT) + b .Lchacha_mips_xor_done + +.Lchacha_mips_no_full_block_unaligned: + /* Restore the offset on BYTES */ + addiu BYTES, CHACHA20_BLOCK_SIZE + + /* Get number of full WORDS */ + andi $at, BYTES, MASK_U32 + + /* Load upper half of jump table addr */ + lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0) + + /* Calculate lower half jump table offset */ + ins T0, $at, 1, 6 + + /* Add offset to STATE */ + addu T1, STATE, $at + + /* Add lower half jump table addr */ + addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0) + + /* Read value from STATE */ + lw SAVED_CA, 0(T1) + + /* Store remaining bytecounter as negative value */ + subu BYTES, $at, BYTES + + jr T0 + + /* Jump table */ + FOR_EACH_WORD(JMPTBL_UNALIGNED) +.end chacha_crypt_arch +.set at + +/* Input arguments + * STATE $a0 + * OUT $a1 + * NROUND $a2 + */ + +#undef X12 +#undef X13 +#undef X14 +#undef X15 + +#define X12 $a3 +#define X13 $at +#define X14 $v0 +#define X15 STATE + +.set noat +.globl hchacha_block_arch +.ent hchacha_block_arch +hchacha_block_arch: + .frame $sp, STACK_SIZE, $ra + + addiu $sp, -STACK_SIZE + + /* Save X11(s6) */ + sw X11, 0($sp) + + lw X0, 0(STATE) + lw X1, 4(STATE) + lw X2, 8(STATE) + lw X3, 12(STATE) + lw X4, 16(STATE) + lw X5, 20(STATE) + lw X6, 24(STATE) + lw X7, 28(STATE) + lw X8, 32(STATE) + lw X9, 36(STATE) + lw X10, 40(STATE) + lw X11, 44(STATE) + lw X12, 48(STATE) + lw X13, 52(STATE) + lw X14, 56(STATE) + lw X15, 60(STATE) + 
+.Loop_hchacha_xor_rounds: + addiu $a2, -2 + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); + bnez $a2, .Loop_hchacha_xor_rounds + + /* Restore used register */ + lw X11, 0($sp) + + sw X0, 0(OUT) + sw X1, 4(OUT) + sw X2, 8(OUT) + sw X3, 12(OUT) + sw X12, 16(OUT) + sw X13, 20(OUT) + sw X14, 24(OUT) + sw X15, 28(OUT) + + addiu $sp, STACK_SIZE + jr $ra +.end hchacha_block_arch +.set at diff --git a/lib/crypto/mips/chacha-glue.c b/lib/crypto/mips/chacha-glue.c new file mode 100644 index 000000000000..88c097594eb0 --- /dev/null +++ b/lib/crypto/mips/chacha-glue.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ChaCha and HChaCha functions (MIPS optimized) + * + * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org> + */ + +#include <crypto/chacha.h> +#include <linux/kernel.h> +#include <linux/module.h> + +asmlinkage void chacha_crypt_arch(struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int bytes, int nrounds); +EXPORT_SYMBOL(chacha_crypt_arch); + +asmlinkage void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); +EXPORT_SYMBOL(hchacha_block_arch); + +bool chacha_is_arch_optimized(void) +{ + return true; +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +MODULE_DESCRIPTION("ChaCha and HChaCha functions (MIPS optimized)"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/mips/poly1305-glue.c b/lib/crypto/mips/poly1305-glue.c new file mode 100644 index 000000000000..764a38a65200 --- /dev/null +++ b/lib/crypto/mips/poly1305-glue.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS + * + * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org> + */ + +#include <crypto/internal/poly1305.h> +#include <linux/cpufeature.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/unaligned.h> + +asmlinkage void poly1305_block_init_arch( + struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +EXPORT_SYMBOL_GPL(poly1305_block_init_arch); +asmlinkage void poly1305_blocks_arch(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +EXPORT_SYMBOL_GPL(poly1305_blocks_arch); +asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +EXPORT_SYMBOL_GPL(poly1305_emit_arch); + +bool poly1305_is_arch_optimized(void) +{ + return true; +} +EXPORT_SYMBOL(poly1305_is_arch_optimized); + +MODULE_DESCRIPTION("Poly1305 transform (MIPS accelerated"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/mips/poly1305-mips.pl b/lib/crypto/mips/poly1305-mips.pl new file mode 100644 index 000000000000..399f10c3e385 --- /dev/null +++ b/lib/crypto/mips/poly1305-mips.pl @@ -0,0 +1,1273 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL +# project. +# ==================================================================== + +# Poly1305 hash for MIPS. 
+# +# May 2016 +# +# Numbers are cycles per processed byte with poly1305_blocks alone. +# +# IALU/gcc +# R1x000 ~5.5/+130% (big-endian) +# Octeon II 2.50/+70% (little-endian) +# +# March 2019 +# +# Add 32-bit code path. +# +# October 2019 +# +# Modulo-scheduling reduction allows to omit dependency chain at the +# end of inner loop and improve performance. Also optimize MIPS32R2 +# code path for MIPS 1004K core. Per René von Dorst's suggestions. +# +# IALU/gcc +# R1x000 ~9.8/? (big-endian) +# Octeon II 3.65/+140% (little-endian) +# MT7621/1004K 4.75/? (little-endian) +# +###################################################################### +# There is a number of MIPS ABI in use, O32 and N32/64 are most +# widely used. Then there is a new contender: NUBI. It appears that if +# one picks the latter, it's possible to arrange code in ABI neutral +# manner. Therefore let's stick to NUBI register layout: +# +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); +# +# The return value is placed in $a0. Following coding rules facilitate +# interoperability: +# +# - never ever touch $tp, "thread pointer", former $gp [o32 can be +# excluded from the rule, because it's specified volatile]; +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting +# old code]; +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; +# +# For reference here is register layout for N32/64 MIPS ABIs: +# +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +# +# <appro@openssl.org> +# +###################################################################### + +$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64 + +$v0 = ($flavour =~ /nubi/i) ? 
$a0 : $t0; + +if ($flavour =~ /64|n32/i) {{{ +###################################################################### +# 64-bit code path +# + +my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); +my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1); + +$code.=<<___; +#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\ + defined(_MIPS_ARCH_MIPS64R6)) \\ + && !defined(_MIPS_ARCH_MIPS64R2) +# define _MIPS_ARCH_MIPS64R2 +#endif + +#if defined(_MIPS_ARCH_MIPS64R6) +# define dmultu(rs,rt) +# define mflo(rd,rs,rt) dmulu rd,rs,rt +# define mfhi(rd,rs,rt) dmuhu rd,rs,rt +#else +# define dmultu(rs,rt) dmultu rs,rt +# define mflo(rd,rs,rt) mflo rd +# define mfhi(rd,rs,rt) mfhi rd +#endif + +#ifdef __KERNEL__ +# define poly1305_init poly1305_block_init_arch +# define poly1305_blocks poly1305_blocks_arch +# define poly1305_emit poly1305_emit_arch +#endif + +#if defined(__MIPSEB__) && !defined(MIPSEB) +# define MIPSEB +#endif + +#ifdef MIPSEB +# define MSB 0 +# define LSB 7 +#else +# define MSB 7 +# define LSB 0 +#endif + +.text +.set noat +.set noreorder + +.align 5 +.globl poly1305_init +.ent poly1305_init +poly1305_init: + .frame $sp,0,$ra + .set reorder + + sd $zero,0($ctx) + sd $zero,8($ctx) + sd $zero,16($ctx) + + beqz $inp,.Lno_key + +#if defined(_MIPS_ARCH_MIPS64R6) + andi $tmp0,$inp,7 # $inp % 8 + dsubu $inp,$inp,$tmp0 # align $inp + sll $tmp0,$tmp0,3 # byte to bit offset + ld $in0,0($inp) + ld $in1,8($inp) + beqz $tmp0,.Laligned_key + ld $tmp2,16($inp) + + subu $tmp1,$zero,$tmp0 +# ifdef MIPSEB + dsllv $in0,$in0,$tmp0 + dsrlv $tmp3,$in1,$tmp1 + dsllv $in1,$in1,$tmp0 + dsrlv $tmp2,$tmp2,$tmp1 +# else + dsrlv $in0,$in0,$tmp0 + dsllv $tmp3,$in1,$tmp1 + dsrlv $in1,$in1,$tmp0 + dsllv $tmp2,$tmp2,$tmp1 +# endif + or $in0,$in0,$tmp3 + or $in1,$in1,$tmp2 +.Laligned_key: +#else + ldl $in0,0+MSB($inp) + ldl $in1,8+MSB($inp) + ldr $in0,0+LSB($inp) + ldr $in1,8+LSB($inp) +#endif +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS64R2) + dsbh $in0,$in0 # byte swap + dsbh $in1,$in1 + dshd $in0,$in0 + dshd $in1,$in1 +# else + ori $tmp0,$zero,0xFF + dsll $tmp2,$tmp0,32 + or $tmp0,$tmp2 # 0x000000FF000000FF + + and $tmp1,$in0,$tmp0 # byte swap + and $tmp3,$in1,$tmp0 + dsrl $tmp2,$in0,24 + dsrl $tmp4,$in1,24 + dsll $tmp1,24 + dsll $tmp3,24 + and $tmp2,$tmp0 + and $tmp4,$tmp0 + dsll $tmp0,8 # 0x0000FF000000FF00 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + and $tmp2,$in0,$tmp0 + and $tmp4,$in1,$tmp0 + dsrl $in0,8 + dsrl $in1,8 + dsll $tmp2,8 + dsll $tmp4,8 + and $in0,$tmp0 + and $in1,$tmp0 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + or $in0,$tmp1 + or $in1,$tmp3 + dsrl $tmp1,$in0,32 + dsrl $tmp3,$in1,32 + dsll $in0,32 + dsll $in1,32 + or $in0,$tmp1 + or $in1,$tmp3 +# endif +#endif + li $tmp0,1 + dsll $tmp0,32 # 0x0000000100000000 + daddiu $tmp0,-63 # 0x00000000ffffffc1 + dsll $tmp0,28 # 0x0ffffffc10000000 + daddiu $tmp0,-1 # 0x0ffffffc0fffffff + + and $in0,$tmp0 + daddiu $tmp0,-3 # 0x0ffffffc0ffffffc + and $in1,$tmp0 + + sd $in0,24($ctx) + dsrl $tmp0,$in1,2 + sd $in1,32($ctx) + daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2) + sd $tmp0,40($ctx) + +.Lno_key: + li $v0,0 # return 0 + jr $ra +.end poly1305_init +___ +{ +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 
"0x0003f000" : "0x00030000"; + +my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) = + ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2); +my ($shr,$shl) = ($s6,$s7); # used on R6 + +$code.=<<___; +.align 5 +.globl poly1305_blocks +.ent poly1305_blocks +poly1305_blocks: + .set noreorder + dsrl $len,4 # number of complete blocks + bnez $len,poly1305_blocks_internal + nop + jr $ra + nop +.end poly1305_blocks + +.align 5 +.ent poly1305_blocks_internal +poly1305_blocks_internal: + .set noreorder +#if defined(_MIPS_ARCH_MIPS64R6) + .frame $sp,8*8,$ra + .mask $SAVED_REGS_MASK|0x000c0000,-8 + dsubu $sp,8*8 + sd $s7,56($sp) + sd $s6,48($sp) +#else + .frame $sp,6*8,$ra + .mask $SAVED_REGS_MASK,-8 + dsubu $sp,6*8 +#endif + sd $s5,40($sp) + sd $s4,32($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + sd $s3,24($sp) + sd $s2,16($sp) + sd $s1,8($sp) + sd $s0,0($sp) +___ +$code.=<<___; + .set reorder + +#if defined(_MIPS_ARCH_MIPS64R6) + andi $shr,$inp,7 + dsubu $inp,$inp,$shr # align $inp + sll $shr,$shr,3 # byte to bit offset + subu $shl,$zero,$shr +#endif + + ld $h0,0($ctx) # load hash value + ld $h1,8($ctx) + ld $h2,16($ctx) + + ld $r0,24($ctx) # load key + ld $r1,32($ctx) + ld $rs1,40($ctx) + + dsll $len,4 + daddu $len,$inp # end of buffer + b .Loop + +.align 4 +.Loop: +#if defined(_MIPS_ARCH_MIPS64R6) + ld $in0,0($inp) # load input + ld $in1,8($inp) + beqz $shr,.Laligned_inp + + ld $tmp2,16($inp) +# ifdef MIPSEB + dsllv $in0,$in0,$shr + dsrlv $tmp3,$in1,$shl + dsllv $in1,$in1,$shr + dsrlv $tmp2,$tmp2,$shl +# else + dsrlv $in0,$in0,$shr + dsllv $tmp3,$in1,$shl + dsrlv $in1,$in1,$shr + dsllv $tmp2,$tmp2,$shl +# endif + or $in0,$in0,$tmp3 + or $in1,$in1,$tmp2 +.Laligned_inp: +#else + ldl $in0,0+MSB($inp) # load input + ldl $in1,8+MSB($inp) + ldr $in0,0+LSB($inp) + ldr $in1,8+LSB($inp) +#endif + daddiu $inp,16 +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS64R2) + dsbh $in0,$in0 # byte swap + dsbh $in1,$in1 + dshd $in0,$in0 + dshd $in1,$in1 +# else + ori $tmp0,$zero,0xFF + dsll $tmp2,$tmp0,32 + or $tmp0,$tmp2 # 0x000000FF000000FF + + and $tmp1,$in0,$tmp0 # byte swap + and $tmp3,$in1,$tmp0 + dsrl $tmp2,$in0,24 + dsrl $tmp4,$in1,24 + dsll $tmp1,24 + dsll $tmp3,24 + and $tmp2,$tmp0 + and $tmp4,$tmp0 + dsll $tmp0,8 # 0x0000FF000000FF00 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + and $tmp2,$in0,$tmp0 + and $tmp4,$in1,$tmp0 + dsrl $in0,8 + dsrl $in1,8 + dsll $tmp2,8 + dsll $tmp4,8 + and $in0,$tmp0 + and $in1,$tmp0 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + or $in0,$tmp1 + or $in1,$tmp3 + dsrl $tmp1,$in0,32 + dsrl $tmp3,$in1,32 + dsll $in0,32 + dsll $in1,32 + or $in0,$tmp1 + or $in1,$tmp3 +# endif +#endif + dsrl $tmp1,$h2,2 # modulo-scheduled reduction + andi $h2,$h2,3 + dsll $tmp0,$tmp1,2 + + daddu $d0,$h0,$in0 # accumulate input + daddu $tmp1,$tmp0 + sltu $tmp0,$d0,$h0 + daddu $d0,$d0,$tmp1 # ... 
and residue + sltu $tmp1,$d0,$tmp1 + daddu $d1,$h1,$in1 + daddu $tmp0,$tmp1 + sltu $tmp1,$d1,$h1 + daddu $d1,$tmp0 + + dmultu ($r0,$d0) # h0*r0 + daddu $d2,$h2,$padbit + sltu $tmp0,$d1,$tmp0 + mflo ($h0,$r0,$d0) + mfhi ($h1,$r0,$d0) + + dmultu ($rs1,$d1) # h1*5*r1 + daddu $d2,$tmp1 + daddu $d2,$tmp0 + mflo ($tmp0,$rs1,$d1) + mfhi ($tmp1,$rs1,$d1) + + dmultu ($r1,$d0) # h0*r1 + mflo ($tmp2,$r1,$d0) + mfhi ($h2,$r1,$d0) + daddu $h0,$tmp0 + daddu $h1,$tmp1 + sltu $tmp0,$h0,$tmp0 + + dmultu ($r0,$d1) # h1*r0 + daddu $h1,$tmp0 + daddu $h1,$tmp2 + mflo ($tmp0,$r0,$d1) + mfhi ($tmp1,$r0,$d1) + + dmultu ($rs1,$d2) # h2*5*r1 + sltu $tmp2,$h1,$tmp2 + daddu $h2,$tmp2 + mflo ($tmp2,$rs1,$d2) + + dmultu ($r0,$d2) # h2*r0 + daddu $h1,$tmp0 + daddu $h2,$tmp1 + mflo ($tmp3,$r0,$d2) + sltu $tmp0,$h1,$tmp0 + daddu $h2,$tmp0 + + daddu $h1,$tmp2 + sltu $tmp2,$h1,$tmp2 + daddu $h2,$tmp2 + daddu $h2,$tmp3 + + bne $inp,$len,.Loop + + sd $h0,0($ctx) # store hash value + sd $h1,8($ctx) + sd $h2,16($ctx) + + .set noreorder +#if defined(_MIPS_ARCH_MIPS64R6) + ld $s7,56($sp) + ld $s6,48($sp) +#endif + ld $s5,40($sp) # epilogue + ld $s4,32($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue + ld $s3,24($sp) + ld $s2,16($sp) + ld $s1,8($sp) + ld $s0,0($sp) +___ +$code.=<<___; + jr $ra +#if defined(_MIPS_ARCH_MIPS64R6) + daddu $sp,8*8 +#else + daddu $sp,6*8 +#endif +.end poly1305_blocks_internal +___ +} +{ +my ($ctx,$mac,$nonce) = ($a0,$a1,$a2); + +$code.=<<___; +.align 5 +.globl poly1305_emit +.ent poly1305_emit +poly1305_emit: + .frame $sp,0,$ra + .set reorder + + ld $tmp2,16($ctx) + ld $tmp0,0($ctx) + ld $tmp1,8($ctx) + + li $in0,-4 # final reduction + dsrl $in1,$tmp2,2 + and $in0,$tmp2 + andi $tmp2,$tmp2,3 + daddu $in0,$in1 + + daddu $tmp0,$tmp0,$in0 + sltu $in1,$tmp0,$in0 + daddiu $in0,$tmp0,5 # compare to modulus + daddu $tmp1,$tmp1,$in1 + sltiu $tmp3,$in0,5 + sltu $tmp4,$tmp1,$in1 + daddu $in1,$tmp1,$tmp3 + daddu $tmp2,$tmp2,$tmp4 + sltu $tmp3,$in1,$tmp3 + daddu $tmp2,$tmp2,$tmp3 + + dsrl $tmp2,2 # see if it carried/borrowed + dsubu $tmp2,$zero,$tmp2 + + xor $in0,$tmp0 + xor $in1,$tmp1 + and $in0,$tmp2 + and $in1,$tmp2 + xor $in0,$tmp0 + xor $in1,$tmp1 + + lwu $tmp0,0($nonce) # load nonce + lwu $tmp1,4($nonce) + lwu $tmp2,8($nonce) + lwu $tmp3,12($nonce) + dsll $tmp1,32 + dsll $tmp3,32 + or $tmp0,$tmp1 + or $tmp2,$tmp3 + + daddu $in0,$tmp0 # accumulate nonce + daddu $in1,$tmp2 + sltu $tmp0,$in0,$tmp0 + daddu $in1,$tmp0 + + dsrl $tmp0,$in0,8 # write mac value + dsrl $tmp1,$in0,16 + dsrl $tmp2,$in0,24 + sb $in0,0($mac) + dsrl $tmp3,$in0,32 + sb $tmp0,1($mac) + dsrl $tmp0,$in0,40 + sb $tmp1,2($mac) + dsrl $tmp1,$in0,48 + sb $tmp2,3($mac) + dsrl $tmp2,$in0,56 + sb $tmp3,4($mac) + dsrl $tmp3,$in1,8 + sb $tmp0,5($mac) + dsrl $tmp0,$in1,16 + sb $tmp1,6($mac) + dsrl $tmp1,$in1,24 + sb $tmp2,7($mac) + + sb $in1,8($mac) + dsrl $tmp2,$in1,32 + sb $tmp3,9($mac) + dsrl $tmp3,$in1,40 + sb $tmp0,10($mac) + dsrl $tmp0,$in1,48 + sb $tmp1,11($mac) + dsrl $tmp1,$in1,56 + sb $tmp2,12($mac) + sb $tmp3,13($mac) + sb $tmp0,14($mac) + sb $tmp1,15($mac) + + jr $ra +.end poly1305_emit +.rdata +.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm" +.align 2 +___ +} +}}} else {{{ +###################################################################### +# 32-bit code path +# + +my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); +my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) = + ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2); + +$code.=<<___; +#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\ + 
defined(_MIPS_ARCH_MIPS32R6)) \\ + && !defined(_MIPS_ARCH_MIPS32R2) +# define _MIPS_ARCH_MIPS32R2 +#endif + +#if defined(_MIPS_ARCH_MIPS32R6) +# define multu(rs,rt) +# define mflo(rd,rs,rt) mulu rd,rs,rt +# define mfhi(rd,rs,rt) muhu rd,rs,rt +#else +# define multu(rs,rt) multu rs,rt +# define mflo(rd,rs,rt) mflo rd +# define mfhi(rd,rs,rt) mfhi rd +#endif + +#ifdef __KERNEL__ +# define poly1305_init poly1305_block_init_arch +# define poly1305_blocks poly1305_blocks_arch +# define poly1305_emit poly1305_emit_arch +#endif + +#if defined(__MIPSEB__) && !defined(MIPSEB) +# define MIPSEB +#endif + +#ifdef MIPSEB +# define MSB 0 +# define LSB 3 +#else +# define MSB 3 +# define LSB 0 +#endif + +.text +.set noat +.set noreorder + +.align 5 +.globl poly1305_init +.ent poly1305_init +poly1305_init: + .frame $sp,0,$ra + .set reorder + + sw $zero,0($ctx) + sw $zero,4($ctx) + sw $zero,8($ctx) + sw $zero,12($ctx) + sw $zero,16($ctx) + + beqz $inp,.Lno_key + +#if defined(_MIPS_ARCH_MIPS32R6) + andi $tmp0,$inp,3 # $inp % 4 + subu $inp,$inp,$tmp0 # align $inp + sll $tmp0,$tmp0,3 # byte to bit offset + lw $in0,0($inp) + lw $in1,4($inp) + lw $in2,8($inp) + lw $in3,12($inp) + beqz $tmp0,.Laligned_key + + lw $tmp2,16($inp) + subu $tmp1,$zero,$tmp0 +# ifdef MIPSEB + sllv $in0,$in0,$tmp0 + srlv $tmp3,$in1,$tmp1 + sllv $in1,$in1,$tmp0 + or $in0,$in0,$tmp3 + srlv $tmp3,$in2,$tmp1 + sllv $in2,$in2,$tmp0 + or $in1,$in1,$tmp3 + srlv $tmp3,$in3,$tmp1 + sllv $in3,$in3,$tmp0 + or $in2,$in2,$tmp3 + srlv $tmp2,$tmp2,$tmp1 + or $in3,$in3,$tmp2 +# else + srlv $in0,$in0,$tmp0 + sllv $tmp3,$in1,$tmp1 + srlv $in1,$in1,$tmp0 + or $in0,$in0,$tmp3 + sllv $tmp3,$in2,$tmp1 + srlv $in2,$in2,$tmp0 + or $in1,$in1,$tmp3 + sllv $tmp3,$in3,$tmp1 + srlv $in3,$in3,$tmp0 + or $in2,$in2,$tmp3 + sllv $tmp2,$tmp2,$tmp1 + or $in3,$in3,$tmp2 +# endif +.Laligned_key: +#else + lwl $in0,0+MSB($inp) + lwl $in1,4+MSB($inp) + lwl $in2,8+MSB($inp) + lwl $in3,12+MSB($inp) + lwr $in0,0+LSB($inp) + lwr $in1,4+LSB($inp) + lwr $in2,8+LSB($inp) + lwr $in3,12+LSB($inp) +#endif +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS32R2) + wsbh $in0,$in0 # byte swap + wsbh $in1,$in1 + wsbh $in2,$in2 + wsbh $in3,$in3 + rotr $in0,$in0,16 + rotr $in1,$in1,16 + rotr $in2,$in2,16 + rotr $in3,$in3,16 +# else + srl $tmp0,$in0,24 # byte swap + srl $tmp1,$in0,8 + andi $tmp2,$in0,0xFF00 + sll $in0,$in0,24 + andi $tmp1,0xFF00 + sll $tmp2,$tmp2,8 + or $in0,$tmp0 + srl $tmp0,$in1,24 + or $tmp1,$tmp2 + srl $tmp2,$in1,8 + or $in0,$tmp1 + andi $tmp1,$in1,0xFF00 + sll $in1,$in1,24 + andi $tmp2,0xFF00 + sll $tmp1,$tmp1,8 + or $in1,$tmp0 + srl $tmp0,$in2,24 + or $tmp2,$tmp1 + srl $tmp1,$in2,8 + or $in1,$tmp2 + andi $tmp2,$in2,0xFF00 + sll $in2,$in2,24 + andi $tmp1,0xFF00 + sll $tmp2,$tmp2,8 + or $in2,$tmp0 + srl $tmp0,$in3,24 + or $tmp1,$tmp2 + srl $tmp2,$in3,8 + or $in2,$tmp1 + andi $tmp1,$in3,0xFF00 + sll $in3,$in3,24 + andi $tmp2,0xFF00 + sll $tmp1,$tmp1,8 + or $in3,$tmp0 + or $tmp2,$tmp1 + or $in3,$tmp2 +# endif +#endif + lui $tmp0,0x0fff + ori $tmp0,0xffff # 0x0fffffff + and $in0,$in0,$tmp0 + subu $tmp0,3 # 0x0ffffffc + and $in1,$in1,$tmp0 + and $in2,$in2,$tmp0 + and $in3,$in3,$tmp0 + + sw $in0,20($ctx) + sw $in1,24($ctx) + sw $in2,28($ctx) + sw $in3,32($ctx) + + srl $tmp1,$in1,2 + srl $tmp2,$in2,2 + srl $tmp3,$in3,2 + addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2) + addu $in2,$in2,$tmp2 + addu $in3,$in3,$tmp3 + sw $in1,36($ctx) + sw $in2,40($ctx) + sw $in3,44($ctx) +.Lno_key: + li $v0,0 + jr $ra +.end poly1305_init +___ +{ +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 
"0x00fff000" : "0x00ff0000"; + +my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) = + ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11); +my ($d0,$d1,$d2,$d3) = + ($a4,$a5,$a6,$a7); +my $shr = $t2; # used on R6 +my $one = $t2; # used on R2 + +$code.=<<___; +.globl poly1305_blocks +.align 5 +.ent poly1305_blocks +poly1305_blocks: + .frame $sp,16*4,$ra + .mask $SAVED_REGS_MASK,-4 + .set noreorder + subu $sp, $sp,4*12 + sw $s11,4*11($sp) + sw $s10,4*10($sp) + sw $s9, 4*9($sp) + sw $s8, 4*8($sp) + sw $s7, 4*7($sp) + sw $s6, 4*6($sp) + sw $s5, 4*5($sp) + sw $s4, 4*4($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + sw $s3, 4*3($sp) + sw $s2, 4*2($sp) + sw $s1, 4*1($sp) + sw $s0, 4*0($sp) +___ +$code.=<<___; + .set reorder + + srl $len,4 # number of complete blocks + li $one,1 + beqz $len,.Labort + +#if defined(_MIPS_ARCH_MIPS32R6) + andi $shr,$inp,3 + subu $inp,$inp,$shr # align $inp + sll $shr,$shr,3 # byte to bit offset +#endif + + lw $h0,0($ctx) # load hash value + lw $h1,4($ctx) + lw $h2,8($ctx) + lw $h3,12($ctx) + lw $h4,16($ctx) + + lw $r0,20($ctx) # load key + lw $r1,24($ctx) + lw $r2,28($ctx) + lw $r3,32($ctx) + lw $rs1,36($ctx) + lw $rs2,40($ctx) + lw $rs3,44($ctx) + + sll $len,4 + addu $len,$len,$inp # end of buffer + b .Loop + +.align 4 +.Loop: +#if defined(_MIPS_ARCH_MIPS32R6) + lw $d0,0($inp) # load input + lw $d1,4($inp) + lw $d2,8($inp) + lw $d3,12($inp) + beqz $shr,.Laligned_inp + + lw $t0,16($inp) + subu $t1,$zero,$shr +# ifdef MIPSEB + sllv $d0,$d0,$shr + srlv $at,$d1,$t1 + sllv $d1,$d1,$shr + or $d0,$d0,$at + srlv $at,$d2,$t1 + sllv $d2,$d2,$shr + or $d1,$d1,$at + srlv $at,$d3,$t1 + sllv $d3,$d3,$shr + or $d2,$d2,$at + srlv $t0,$t0,$t1 + or $d3,$d3,$t0 +# else + srlv $d0,$d0,$shr + sllv $at,$d1,$t1 + srlv $d1,$d1,$shr + or $d0,$d0,$at + sllv $at,$d2,$t1 + srlv $d2,$d2,$shr + or $d1,$d1,$at + sllv $at,$d3,$t1 + srlv $d3,$d3,$shr + or $d2,$d2,$at + sllv $t0,$t0,$t1 + or $d3,$d3,$t0 +# endif +.Laligned_inp: +#else + lwl $d0,0+MSB($inp) # load input + lwl $d1,4+MSB($inp) + lwl $d2,8+MSB($inp) + lwl $d3,12+MSB($inp) + lwr $d0,0+LSB($inp) + lwr $d1,4+LSB($inp) + lwr $d2,8+LSB($inp) + lwr $d3,12+LSB($inp) +#endif +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS32R2) + wsbh $d0,$d0 # byte swap + wsbh $d1,$d1 + wsbh $d2,$d2 + wsbh $d3,$d3 + rotr $d0,$d0,16 + rotr $d1,$d1,16 + rotr $d2,$d2,16 + rotr $d3,$d3,16 +# else + srl $at,$d0,24 # byte swap + srl $t0,$d0,8 + andi $t1,$d0,0xFF00 + sll $d0,$d0,24 + andi $t0,0xFF00 + sll $t1,$t1,8 + or $d0,$at + srl $at,$d1,24 + or $t0,$t1 + srl $t1,$d1,8 + or $d0,$t0 + andi $t0,$d1,0xFF00 + sll $d1,$d1,24 + andi $t1,0xFF00 + sll $t0,$t0,8 + or $d1,$at + srl $at,$d2,24 + or $t1,$t0 + srl $t0,$d2,8 + or $d1,$t1 + andi $t1,$d2,0xFF00 + sll $d2,$d2,24 + andi $t0,0xFF00 + sll $t1,$t1,8 + or $d2,$at + srl $at,$d3,24 + or $t0,$t1 + srl $t1,$d3,8 + or $d2,$t0 + andi $t0,$d3,0xFF00 + sll $d3,$d3,24 + andi $t1,0xFF00 + sll $t0,$t0,8 + or $d3,$at + or $t1,$t0 + or $d3,$t1 +# endif +#endif + srl $t0,$h4,2 # modulo-scheduled reduction + andi $h4,$h4,3 + sll $at,$t0,2 + + addu $d0,$d0,$h0 # accumulate input + addu $t0,$t0,$at + sltu $h0,$d0,$h0 + addu $d0,$d0,$t0 # ... 
and residue + sltu $at,$d0,$t0 + + addu $d1,$d1,$h1 + addu $h0,$h0,$at # carry + sltu $h1,$d1,$h1 + addu $d1,$d1,$h0 + sltu $h0,$d1,$h0 + + addu $d2,$d2,$h2 + addu $h1,$h1,$h0 # carry + sltu $h2,$d2,$h2 + addu $d2,$d2,$h1 + sltu $h1,$d2,$h1 + + addu $d3,$d3,$h3 + addu $h2,$h2,$h1 # carry + sltu $h3,$d3,$h3 + addu $d3,$d3,$h2 + +#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6) + multu $r0,$d0 # d0*r0 + sltu $h2,$d3,$h2 + maddu $rs3,$d1 # d1*s3 + addu $h3,$h3,$h2 # carry + maddu $rs2,$d2 # d2*s2 + addu $h4,$h4,$padbit + maddu $rs1,$d3 # d3*s1 + addu $h4,$h4,$h3 + mfhi $at + mflo $h0 + + multu $r1,$d0 # d0*r1 + maddu $r0,$d1 # d1*r0 + maddu $rs3,$d2 # d2*s3 + maddu $rs2,$d3 # d3*s2 + maddu $rs1,$h4 # h4*s1 + maddu $at,$one # hi*1 + mfhi $at + mflo $h1 + + multu $r2,$d0 # d0*r2 + maddu $r1,$d1 # d1*r1 + maddu $r0,$d2 # d2*r0 + maddu $rs3,$d3 # d3*s3 + maddu $rs2,$h4 # h4*s2 + maddu $at,$one # hi*1 + mfhi $at + mflo $h2 + + mul $t0,$r0,$h4 # h4*r0 + + multu $r3,$d0 # d0*r3 + maddu $r2,$d1 # d1*r2 + maddu $r1,$d2 # d2*r1 + maddu $r0,$d3 # d3*r0 + maddu $rs3,$h4 # h4*s3 + maddu $at,$one # hi*1 + mfhi $at + mflo $h3 + + addiu $inp,$inp,16 + + addu $h4,$t0,$at +#else + multu ($r0,$d0) # d0*r0 + mflo ($h0,$r0,$d0) + mfhi ($h1,$r0,$d0) + + sltu $h2,$d3,$h2 + addu $h3,$h3,$h2 # carry + + multu ($rs3,$d1) # d1*s3 + mflo ($at,$rs3,$d1) + mfhi ($t0,$rs3,$d1) + + addu $h4,$h4,$padbit + addiu $inp,$inp,16 + addu $h4,$h4,$h3 + + multu ($rs2,$d2) # d2*s2 + mflo ($a3,$rs2,$d2) + mfhi ($t1,$rs2,$d2) + addu $h0,$h0,$at + addu $h1,$h1,$t0 + multu ($rs1,$d3) # d3*s1 + sltu $at,$h0,$at + addu $h1,$h1,$at + + mflo ($at,$rs1,$d3) + mfhi ($t0,$rs1,$d3) + addu $h0,$h0,$a3 + addu $h1,$h1,$t1 + multu ($r1,$d0) # d0*r1 + sltu $a3,$h0,$a3 + addu $h1,$h1,$a3 + + + mflo ($a3,$r1,$d0) + mfhi ($h2,$r1,$d0) + addu $h0,$h0,$at + addu $h1,$h1,$t0 + multu ($r0,$d1) # d1*r0 + sltu $at,$h0,$at + addu $h1,$h1,$at + + mflo ($at,$r0,$d1) + mfhi ($t0,$r0,$d1) + addu $h1,$h1,$a3 + sltu $a3,$h1,$a3 + multu ($rs3,$d2) # d2*s3 + addu $h2,$h2,$a3 + + mflo ($a3,$rs3,$d2) + mfhi ($t1,$rs3,$d2) + addu $h1,$h1,$at + addu $h2,$h2,$t0 + multu ($rs2,$d3) # d3*s2 + sltu $at,$h1,$at + addu $h2,$h2,$at + + mflo ($at,$rs2,$d3) + mfhi ($t0,$rs2,$d3) + addu $h1,$h1,$a3 + addu $h2,$h2,$t1 + multu ($rs1,$h4) # h4*s1 + sltu $a3,$h1,$a3 + addu $h2,$h2,$a3 + + mflo ($a3,$rs1,$h4) + addu $h1,$h1,$at + addu $h2,$h2,$t0 + multu ($r2,$d0) # d0*r2 + sltu $at,$h1,$at + addu $h2,$h2,$at + + + mflo ($at,$r2,$d0) + mfhi ($h3,$r2,$d0) + addu $h1,$h1,$a3 + sltu $a3,$h1,$a3 + multu ($r1,$d1) # d1*r1 + addu $h2,$h2,$a3 + + mflo ($a3,$r1,$d1) + mfhi ($t1,$r1,$d1) + addu $h2,$h2,$at + sltu $at,$h2,$at + multu ($r0,$d2) # d2*r0 + addu $h3,$h3,$at + + mflo ($at,$r0,$d2) + mfhi ($t0,$r0,$d2) + addu $h2,$h2,$a3 + addu $h3,$h3,$t1 + multu ($rs3,$d3) # d3*s3 + sltu $a3,$h2,$a3 + addu $h3,$h3,$a3 + + mflo ($a3,$rs3,$d3) + mfhi ($t1,$rs3,$d3) + addu $h2,$h2,$at + addu $h3,$h3,$t0 + multu ($rs2,$h4) # h4*s2 + sltu $at,$h2,$at + addu $h3,$h3,$at + + mflo ($at,$rs2,$h4) + addu $h2,$h2,$a3 + addu $h3,$h3,$t1 + multu ($r3,$d0) # d0*r3 + sltu $a3,$h2,$a3 + addu $h3,$h3,$a3 + + + mflo ($a3,$r3,$d0) + mfhi ($t1,$r3,$d0) + addu $h2,$h2,$at + sltu $at,$h2,$at + multu ($r2,$d1) # d1*r2 + addu $h3,$h3,$at + + mflo ($at,$r2,$d1) + mfhi ($t0,$r2,$d1) + addu $h3,$h3,$a3 + sltu $a3,$h3,$a3 + multu ($r0,$d3) # d3*r0 + addu $t1,$t1,$a3 + + mflo ($a3,$r0,$d3) + mfhi ($d3,$r0,$d3) + addu $h3,$h3,$at + addu $t1,$t1,$t0 + multu ($r1,$d2) # d2*r1 + sltu $at,$h3,$at + addu $t1,$t1,$at + 
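+# (Note: each multiply above is started well before its mflo/mfhi result is
+# consumed, so the carry additions fill the multiplier latency; the R6 forms
+# of the macros expand to mulu/muhu instead of HI/LO accesses.)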
+ mflo ($at,$r1,$d2) + mfhi ($t0,$r1,$d2) + addu $h3,$h3,$a3 + addu $t1,$t1,$d3 + multu ($rs3,$h4) # h4*s3 + sltu $a3,$h3,$a3 + addu $t1,$t1,$a3 + + mflo ($a3,$rs3,$h4) + addu $h3,$h3,$at + addu $t1,$t1,$t0 + multu ($r0,$h4) # h4*r0 + sltu $at,$h3,$at + addu $t1,$t1,$at + + + mflo ($h4,$r0,$h4) + addu $h3,$h3,$a3 + sltu $a3,$h3,$a3 + addu $t1,$t1,$a3 + addu $h4,$h4,$t1 + + li $padbit,1 # if we loop, padbit is 1 +#endif + bne $inp,$len,.Loop + + sw $h0,0($ctx) # store hash value + sw $h1,4($ctx) + sw $h2,8($ctx) + sw $h3,12($ctx) + sw $h4,16($ctx) + + .set noreorder +.Labort: + lw $s11,4*11($sp) + lw $s10,4*10($sp) + lw $s9, 4*9($sp) + lw $s8, 4*8($sp) + lw $s7, 4*7($sp) + lw $s6, 4*6($sp) + lw $s5, 4*5($sp) + lw $s4, 4*4($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + lw $s3, 4*3($sp) + lw $s2, 4*2($sp) + lw $s1, 4*1($sp) + lw $s0, 4*0($sp) +___ +$code.=<<___; + jr $ra + addu $sp,$sp,4*12 +.end poly1305_blocks +___ +} +{ +my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3); + +$code.=<<___; +.align 5 +.globl poly1305_emit +.ent poly1305_emit +poly1305_emit: + .frame $sp,0,$ra + .set reorder + + lw $tmp4,16($ctx) + lw $tmp0,0($ctx) + lw $tmp1,4($ctx) + lw $tmp2,8($ctx) + lw $tmp3,12($ctx) + + li $in0,-4 # final reduction + srl $ctx,$tmp4,2 + and $in0,$in0,$tmp4 + andi $tmp4,$tmp4,3 + addu $ctx,$ctx,$in0 + + addu $tmp0,$tmp0,$ctx + sltu $ctx,$tmp0,$ctx + addiu $in0,$tmp0,5 # compare to modulus + addu $tmp1,$tmp1,$ctx + sltiu $in1,$in0,5 + sltu $ctx,$tmp1,$ctx + addu $in1,$in1,$tmp1 + addu $tmp2,$tmp2,$ctx + sltu $in2,$in1,$tmp1 + sltu $ctx,$tmp2,$ctx + addu $in2,$in2,$tmp2 + addu $tmp3,$tmp3,$ctx + sltu $in3,$in2,$tmp2 + sltu $ctx,$tmp3,$ctx + addu $in3,$in3,$tmp3 + addu $tmp4,$tmp4,$ctx + sltu $ctx,$in3,$tmp3 + addu $ctx,$tmp4 + + srl $ctx,2 # see if it carried/borrowed + subu $ctx,$zero,$ctx + + xor $in0,$tmp0 + xor $in1,$tmp1 + xor $in2,$tmp2 + xor $in3,$tmp3 + and $in0,$ctx + and $in1,$ctx + and $in2,$ctx + and $in3,$ctx + xor $in0,$tmp0 + xor $in1,$tmp1 + xor $in2,$tmp2 + xor $in3,$tmp3 + + lw $tmp0,0($nonce) # load nonce + lw $tmp1,4($nonce) + lw $tmp2,8($nonce) + lw $tmp3,12($nonce) + + addu $in0,$tmp0 # accumulate nonce + sltu $ctx,$in0,$tmp0 + + addu $in1,$tmp1 + sltu $tmp1,$in1,$tmp1 + addu $in1,$ctx + sltu $ctx,$in1,$ctx + addu $ctx,$tmp1 + + addu $in2,$tmp2 + sltu $tmp2,$in2,$tmp2 + addu $in2,$ctx + sltu $ctx,$in2,$ctx + addu $ctx,$tmp2 + + addu $in3,$tmp3 + addu $in3,$ctx + + srl $tmp0,$in0,8 # write mac value + srl $tmp1,$in0,16 + srl $tmp2,$in0,24 + sb $in0, 0($mac) + sb $tmp0,1($mac) + srl $tmp0,$in1,8 + sb $tmp1,2($mac) + srl $tmp1,$in1,16 + sb $tmp2,3($mac) + srl $tmp2,$in1,24 + sb $in1, 4($mac) + sb $tmp0,5($mac) + srl $tmp0,$in2,8 + sb $tmp1,6($mac) + srl $tmp1,$in2,16 + sb $tmp2,7($mac) + srl $tmp2,$in2,24 + sb $in2, 8($mac) + sb $tmp0,9($mac) + srl $tmp0,$in3,8 + sb $tmp1,10($mac) + srl $tmp1,$in3,16 + sb $tmp2,11($mac) + srl $tmp2,$in3,24 + sb $in3, 12($mac) + sb $tmp0,13($mac) + sb $tmp1,14($mac) + sb $tmp2,15($mac) + + jr $ra +.end poly1305_emit +.rdata +.asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm" +.align 2 +___ +} +}}} + +$output=pop and open STDOUT,">$output"; +print $code; +close STDOUT; diff --git a/lib/crypto/mips/sha1.h b/lib/crypto/mips/sha1.h new file mode 100644 index 000000000000..ba1965002e4a --- /dev/null +++ b/lib/crypto/mips/sha1.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Cryptographic API. + * + * SHA1 Secure Hash Algorithm. 
+ * + * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>. + * + * Based on crypto/sha1_generic.c, which is: + * + * Copyright (c) Alan Smithee. + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> + */ + +#include <asm/octeon/crypto.h> +#include <asm/octeon/octeon.h> + +/* + * We pass everything as 64-bit. OCTEON can handle misaligned data. + */ + +static void octeon_sha1_store_hash(struct sha1_block_state *state) +{ + u64 *hash = (u64 *)&state->h[0]; + union { + u32 word[2]; + u64 dword; + } hash_tail = { { state->h[4], } }; + + write_octeon_64bit_hash_dword(hash[0], 0); + write_octeon_64bit_hash_dword(hash[1], 1); + write_octeon_64bit_hash_dword(hash_tail.dword, 2); + memzero_explicit(&hash_tail.word[0], sizeof(hash_tail.word[0])); +} + +static void octeon_sha1_read_hash(struct sha1_block_state *state) +{ + u64 *hash = (u64 *)&state->h[0]; + union { + u32 word[2]; + u64 dword; + } hash_tail; + + hash[0] = read_octeon_64bit_hash_dword(0); + hash[1] = read_octeon_64bit_hash_dword(1); + hash_tail.dword = read_octeon_64bit_hash_dword(2); + state->h[4] = hash_tail.word[0]; + memzero_explicit(&hash_tail.dword, sizeof(hash_tail.dword)); +} + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + struct octeon_cop2_state cop2_state; + unsigned long flags; + + if (!octeon_has_crypto()) + return sha1_blocks_generic(state, data, nblocks); + + flags = octeon_crypto_enable(&cop2_state); + octeon_sha1_store_hash(state); + + do { + const u64 *block = (const u64 *)data; + + write_octeon_64bit_block_dword(block[0], 0); + write_octeon_64bit_block_dword(block[1], 1); + write_octeon_64bit_block_dword(block[2], 2); + write_octeon_64bit_block_dword(block[3], 3); + write_octeon_64bit_block_dword(block[4], 4); + write_octeon_64bit_block_dword(block[5], 5); + write_octeon_64bit_block_dword(block[6], 6); + octeon_sha1_start(block[7]); + + data += SHA1_BLOCK_SIZE; + } while (--nblocks); + + octeon_sha1_read_hash(state); + octeon_crypto_disable(&cop2_state, flags); +} diff --git a/lib/crypto/mips/sha256.h b/lib/crypto/mips/sha256.h new file mode 100644 index 000000000000..ccccfd131634 --- /dev/null +++ b/lib/crypto/mips/sha256.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-256 Secure Hash Algorithm. + * + * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>. + * + * Based on crypto/sha256_generic.c, which is: + * + * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> + * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com> + */ + +#include <asm/octeon/crypto.h> +#include <asm/octeon/octeon.h> + +/* + * We pass everything as 64-bit. OCTEON can handle misaligned data. 
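+ * The (const u64 *) casts in sha256_blocks() below rely on this: the hash
+ * state and each 64-byte block are handed to the COP2 unit as 64-bit words
+ * read straight from memory, with no alignment fixup.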
+ */ + +static void sha256_blocks(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + struct octeon_cop2_state cop2_state; + u64 *state64 = (u64 *)state; + unsigned long flags; + + if (!octeon_has_crypto()) + return sha256_blocks_generic(state, data, nblocks); + + flags = octeon_crypto_enable(&cop2_state); + write_octeon_64bit_hash_dword(state64[0], 0); + write_octeon_64bit_hash_dword(state64[1], 1); + write_octeon_64bit_hash_dword(state64[2], 2); + write_octeon_64bit_hash_dword(state64[3], 3); + + do { + const u64 *block = (const u64 *)data; + + write_octeon_64bit_block_dword(block[0], 0); + write_octeon_64bit_block_dword(block[1], 1); + write_octeon_64bit_block_dword(block[2], 2); + write_octeon_64bit_block_dword(block[3], 3); + write_octeon_64bit_block_dword(block[4], 4); + write_octeon_64bit_block_dword(block[5], 5); + write_octeon_64bit_block_dword(block[6], 6); + octeon_sha256_start(block[7]); + + data += SHA256_BLOCK_SIZE; + } while (--nblocks); + + state64[0] = read_octeon_64bit_hash_dword(0); + state64[1] = read_octeon_64bit_hash_dword(1); + state64[2] = read_octeon_64bit_hash_dword(2); + state64[3] = read_octeon_64bit_hash_dword(3); + octeon_crypto_disable(&cop2_state, flags); +} diff --git a/lib/crypto/mips/sha512.h b/lib/crypto/mips/sha512.h new file mode 100644 index 000000000000..b3ffbc1e8ca8 --- /dev/null +++ b/lib/crypto/mips/sha512.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Cryptographic API. + * + * SHA-512 and SHA-384 Secure Hash Algorithm. + * + * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>. + * + * Based on crypto/sha512_generic.c, which is: + * + * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) 2003 Kyle McMartin <kyle@debian.org> + */ + +#include <asm/octeon/crypto.h> +#include <asm/octeon/octeon.h> + +/* + * We pass everything as 64-bit. OCTEON can handle misaligned data. 
+ */ + +static void sha512_blocks(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + struct octeon_cop2_state cop2_state; + unsigned long flags; + + if (!octeon_has_crypto()) + return sha512_blocks_generic(state, data, nblocks); + + flags = octeon_crypto_enable(&cop2_state); + write_octeon_64bit_hash_sha512(state->h[0], 0); + write_octeon_64bit_hash_sha512(state->h[1], 1); + write_octeon_64bit_hash_sha512(state->h[2], 2); + write_octeon_64bit_hash_sha512(state->h[3], 3); + write_octeon_64bit_hash_sha512(state->h[4], 4); + write_octeon_64bit_hash_sha512(state->h[5], 5); + write_octeon_64bit_hash_sha512(state->h[6], 6); + write_octeon_64bit_hash_sha512(state->h[7], 7); + + do { + const u64 *block = (const u64 *)data; + + write_octeon_64bit_block_sha512(block[0], 0); + write_octeon_64bit_block_sha512(block[1], 1); + write_octeon_64bit_block_sha512(block[2], 2); + write_octeon_64bit_block_sha512(block[3], 3); + write_octeon_64bit_block_sha512(block[4], 4); + write_octeon_64bit_block_sha512(block[5], 5); + write_octeon_64bit_block_sha512(block[6], 6); + write_octeon_64bit_block_sha512(block[7], 7); + write_octeon_64bit_block_sha512(block[8], 8); + write_octeon_64bit_block_sha512(block[9], 9); + write_octeon_64bit_block_sha512(block[10], 10); + write_octeon_64bit_block_sha512(block[11], 11); + write_octeon_64bit_block_sha512(block[12], 12); + write_octeon_64bit_block_sha512(block[13], 13); + write_octeon_64bit_block_sha512(block[14], 14); + octeon_sha512_start(block[15]); + + data += SHA512_BLOCK_SIZE; + } while (--nblocks); + + state->h[0] = read_octeon_64bit_hash_sha512(0); + state->h[1] = read_octeon_64bit_hash_sha512(1); + state->h[2] = read_octeon_64bit_hash_sha512(2); + state->h[3] = read_octeon_64bit_hash_sha512(3); + state->h[4] = read_octeon_64bit_hash_sha512(4); + state->h[5] = read_octeon_64bit_hash_sha512(5); + state->h[6] = read_octeon_64bit_hash_sha512(6); + state->h[7] = read_octeon_64bit_hash_sha512(7); + octeon_crypto_disable(&cop2_state, flags); +} diff --git a/lib/crypto/mpi/mpi-add.c b/lib/crypto/mpi/mpi-add.c index 3015140d4860..c0375c1672fa 100644 --- a/lib/crypto/mpi/mpi-add.c +++ b/lib/crypto/mpi/mpi-add.c @@ -11,6 +11,8 @@ * to avoid revealing of sensitive data due to paging etc. */ +#include <linux/export.h> + #include "mpi-internal.h" int mpi_add(MPI w, MPI u, MPI v) diff --git a/lib/crypto/mpi/mpi-bit.c b/lib/crypto/mpi/mpi-bit.c index 934d81311360..b3d0e7ddbc03 100644 --- a/lib/crypto/mpi/mpi-bit.c +++ b/lib/crypto/mpi/mpi-bit.c @@ -18,6 +18,8 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ +#include <linux/export.h> + #include "mpi-internal.h" #include "longlong.h" diff --git a/lib/crypto/mpi/mpi-cmp.c b/lib/crypto/mpi/mpi-cmp.c index ceaebe181cd7..b42929296bce 100644 --- a/lib/crypto/mpi/mpi-cmp.c +++ b/lib/crypto/mpi/mpi-cmp.c @@ -18,6 +18,8 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ +#include <linux/export.h> + #include "mpi-internal.h" int mpi_cmp_ui(MPI u, unsigned long v) diff --git a/lib/crypto/mpi/mpi-mul.c b/lib/crypto/mpi/mpi-mul.c index 7e6ff1ce3e9b..d79f186ad90b 100644 --- a/lib/crypto/mpi/mpi-mul.c +++ b/lib/crypto/mpi/mpi-mul.c @@ -11,6 +11,8 @@ * to avoid revealing of sensitive data due to paging etc. 
*/ +#include <linux/export.h> + #include "mpi-internal.h" int mpi_mul(MPI w, MPI u, MPI v) diff --git a/lib/crypto/mpi/mpi-pow.c b/lib/crypto/mpi/mpi-pow.c index 67fbd4c2503d..9e695a3bda3a 100644 --- a/lib/crypto/mpi/mpi-pow.c +++ b/lib/crypto/mpi/mpi-pow.c @@ -13,8 +13,10 @@ * however I decided to publish this code under the plain GPL. */ +#include <linux/export.h> #include <linux/sched.h> #include <linux/string.h> + #include "mpi-internal.h" #include "longlong.h" diff --git a/lib/crypto/mpi/mpi-sub-ui.c b/lib/crypto/mpi/mpi-sub-ui.c index b41b082b5f3e..0edcdbd24833 100644 --- a/lib/crypto/mpi/mpi-sub-ui.c +++ b/lib/crypto/mpi/mpi-sub-ui.c @@ -32,6 +32,8 @@ * see https://www.gnu.org/licenses/. */ +#include <linux/export.h> + #include "mpi-internal.h" int mpi_sub_ui(MPI w, MPI u, unsigned long vval) diff --git a/lib/crypto/mpi/mpicoder.c b/lib/crypto/mpi/mpicoder.c index dde01030807d..47f6939599b3 100644 --- a/lib/crypto/mpi/mpicoder.c +++ b/lib/crypto/mpi/mpicoder.c @@ -19,8 +19,9 @@ */ #include <linux/bitops.h> -#include <linux/count_zeros.h> #include <linux/byteorder/generic.h> +#include <linux/count_zeros.h> +#include <linux/export.h> #include <linux/scatterlist.h> #include <linux/string.h> #include "mpi-internal.h" diff --git a/lib/crypto/mpi/mpiutil.c b/lib/crypto/mpi/mpiutil.c index 979ece5a81d2..7f2db830f404 100644 --- a/lib/crypto/mpi/mpiutil.c +++ b/lib/crypto/mpi/mpiutil.c @@ -18,6 +18,8 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ +#include <linux/export.h> + #include "mpi-internal.h" /**************** diff --git a/lib/crypto/poly1305-donna32.c b/lib/crypto/poly1305-donna32.c index 0a4a2d99e365..b66131b3f6d4 100644 --- a/lib/crypto/poly1305-donna32.c +++ b/lib/crypto/poly1305-donna32.c @@ -6,9 +6,10 @@ * public domain. */ +#include <crypto/internal/poly1305.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/unaligned.h> -#include <crypto/internal/poly1305.h> void poly1305_core_setkey(struct poly1305_core_key *key, const u8 raw_key[POLY1305_BLOCK_SIZE]) diff --git a/lib/crypto/poly1305-donna64.c b/lib/crypto/poly1305-donna64.c index 530287531b2e..8a72a5a84944 100644 --- a/lib/crypto/poly1305-donna64.c +++ b/lib/crypto/poly1305-donna64.c @@ -6,9 +6,10 @@ * public domain. 
*/ +#include <crypto/internal/poly1305.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/unaligned.h> -#include <crypto/internal/poly1305.h> void poly1305_core_setkey(struct poly1305_core_key *key, const u8 raw_key[POLY1305_BLOCK_SIZE]) diff --git a/lib/crypto/poly1305-generic.c b/lib/crypto/poly1305-generic.c index a73f700fa1fb..71a16c5c538b 100644 --- a/lib/crypto/poly1305-generic.c +++ b/lib/crypto/poly1305-generic.c @@ -8,6 +8,7 @@ */ #include <crypto/internal/poly1305.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/module.h> diff --git a/lib/crypto/poly1305.c b/lib/crypto/poly1305.c index 5f2f2af3b59f..a6dc182b6c22 100644 --- a/lib/crypto/poly1305.c +++ b/lib/crypto/poly1305.c @@ -9,6 +9,7 @@ #include <crypto/internal/blockhash.h> #include <crypto/internal/poly1305.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/string.h> diff --git a/lib/crypto/powerpc/Kconfig b/lib/crypto/powerpc/Kconfig new file mode 100644 index 000000000000..2eaeb7665a6a --- /dev/null +++ b/lib/crypto/powerpc/Kconfig @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_CHACHA20_P10 + tristate + depends on PPC64 && CPU_LITTLE_ENDIAN && VSX + default CRYPTO_LIB_CHACHA + select CRYPTO_LIB_CHACHA_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_POLY1305_P10 + tristate + depends on PPC64 && CPU_LITTLE_ENDIAN && VSX + depends on BROKEN # Needs to be fixed to work in softirq context + default CRYPTO_LIB_POLY1305 + select CRYPTO_ARCH_HAVE_LIB_POLY1305 + select CRYPTO_LIB_POLY1305_GENERIC diff --git a/lib/crypto/powerpc/Makefile b/lib/crypto/powerpc/Makefile new file mode 100644 index 000000000000..5709ae14258a --- /dev/null +++ b/lib/crypto/powerpc/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o +chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o + +obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o +poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o diff --git a/lib/crypto/powerpc/chacha-p10-glue.c b/lib/crypto/powerpc/chacha-p10-glue.c new file mode 100644 index 000000000000..fcd23c6f1590 --- /dev/null +++ b/lib/crypto/powerpc/chacha-p10-glue.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * ChaCha stream cipher (P10 accelerated) + * + * Copyright 2023- IBM Corp. All rights reserved. 
+ */ + +#include <crypto/chacha.h> +#include <crypto/internal/simd.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/cpufeature.h> +#include <linux/sizes.h> +#include <asm/simd.h> +#include <asm/switch_to.h> + +asmlinkage void chacha_p10le_8x(const struct chacha_state *state, u8 *dst, + const u8 *src, unsigned int len, int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10); + +static void vsx_begin(void) +{ + preempt_disable(); + enable_kernel_vsx(); +} + +static void vsx_end(void) +{ + disable_kernel_vsx(); + preempt_enable(); +} + +static void chacha_p10_do_8x(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + unsigned int l = bytes & ~0x0FF; + + if (l > 0) { + chacha_p10le_8x(state, dst, src, l, nrounds); + bytes -= l; + src += l; + dst += l; + state->x[12] += l / CHACHA_BLOCK_SIZE; + } + + if (bytes > 0) + chacha_crypt_generic(state, dst, src, bytes, nrounds); +} + +void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + hchacha_block_generic(state, out, nrounds); +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + if (!static_branch_likely(&have_p10) || bytes <= CHACHA_BLOCK_SIZE || + !crypto_simd_usable()) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + vsx_begin(); + chacha_p10_do_8x(state, dst, src, todo, nrounds); + vsx_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +bool chacha_is_arch_optimized(void) +{ + return static_key_enabled(&have_p10); +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +static int __init chacha_p10_init(void) +{ + if (cpu_has_feature(CPU_FTR_ARCH_31)) + static_branch_enable(&have_p10); + return 0; +} +subsys_initcall(chacha_p10_init); + +static void __exit chacha_p10_exit(void) +{ +} +module_exit(chacha_p10_exit); + +MODULE_DESCRIPTION("ChaCha stream cipher (P10 accelerated)"); +MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/powerpc/chacha-p10le-8x.S b/lib/crypto/powerpc/chacha-p10le-8x.S new file mode 100644 index 000000000000..b29562bd5d40 --- /dev/null +++ b/lib/crypto/powerpc/chacha-p10le-8x.S @@ -0,0 +1,840 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +# +# Accelerated chacha20 implementation for ppc64le. +# +# Copyright 2023- IBM Corp. All rights reserved +# +#=================================================================================== +# Written by Danny Tsen <dtsen@us.ibm.com> +# +# do rounds, 8 quarter rounds +# 1. a += b; d ^= a; d <<<= 16; +# 2. c += d; b ^= c; b <<<= 12; +# 3. a += b; d ^= a; d <<<= 8; +# 4. c += d; b ^= c; b <<<= 7 +# +# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16 +# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12 +# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8 +# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7 +# +# 4 blocks (a b c d) +# +# a0 b0 c0 d0 +# a1 b1 c1 d1 +# ... +# a4 b4 c4 d4 +# ... +# a8 b8 c8 d8 +# ... +# a12 b12 c12 d12 +# a13 ... +# a14 ... 
+# a15 b15 c15 d15 +# +# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) +# Diagnal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) +# + +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/asm-compat.h> +#include <linux/linkage.h> + +.machine "any" +.text + +.macro SAVE_GPR GPR OFFSET FRAME + std \GPR,\OFFSET(\FRAME) +.endm + +.macro SAVE_VRS VRS OFFSET FRAME + li 16, \OFFSET + stvx \VRS, 16, \FRAME +.endm + +.macro SAVE_VSX VSX OFFSET FRAME + li 16, \OFFSET + stxvx \VSX, 16, \FRAME +.endm + +.macro RESTORE_GPR GPR OFFSET FRAME + ld \GPR,\OFFSET(\FRAME) +.endm + +.macro RESTORE_VRS VRS OFFSET FRAME + li 16, \OFFSET + lvx \VRS, 16, \FRAME +.endm + +.macro RESTORE_VSX VSX OFFSET FRAME + li 16, \OFFSET + lxvx \VSX, 16, \FRAME +.endm + +.macro SAVE_REGS + mflr 0 + std 0, 16(1) + stdu 1,-752(1) + + SAVE_GPR 14, 112, 1 + SAVE_GPR 15, 120, 1 + SAVE_GPR 16, 128, 1 + SAVE_GPR 17, 136, 1 + SAVE_GPR 18, 144, 1 + SAVE_GPR 19, 152, 1 + SAVE_GPR 20, 160, 1 + SAVE_GPR 21, 168, 1 + SAVE_GPR 22, 176, 1 + SAVE_GPR 23, 184, 1 + SAVE_GPR 24, 192, 1 + SAVE_GPR 25, 200, 1 + SAVE_GPR 26, 208, 1 + SAVE_GPR 27, 216, 1 + SAVE_GPR 28, 224, 1 + SAVE_GPR 29, 232, 1 + SAVE_GPR 30, 240, 1 + SAVE_GPR 31, 248, 1 + + addi 9, 1, 256 + SAVE_VRS 20, 0, 9 + SAVE_VRS 21, 16, 9 + SAVE_VRS 22, 32, 9 + SAVE_VRS 23, 48, 9 + SAVE_VRS 24, 64, 9 + SAVE_VRS 25, 80, 9 + SAVE_VRS 26, 96, 9 + SAVE_VRS 27, 112, 9 + SAVE_VRS 28, 128, 9 + SAVE_VRS 29, 144, 9 + SAVE_VRS 30, 160, 9 + SAVE_VRS 31, 176, 9 + + SAVE_VSX 14, 192, 9 + SAVE_VSX 15, 208, 9 + SAVE_VSX 16, 224, 9 + SAVE_VSX 17, 240, 9 + SAVE_VSX 18, 256, 9 + SAVE_VSX 19, 272, 9 + SAVE_VSX 20, 288, 9 + SAVE_VSX 21, 304, 9 + SAVE_VSX 22, 320, 9 + SAVE_VSX 23, 336, 9 + SAVE_VSX 24, 352, 9 + SAVE_VSX 25, 368, 9 + SAVE_VSX 26, 384, 9 + SAVE_VSX 27, 400, 9 + SAVE_VSX 28, 416, 9 + SAVE_VSX 29, 432, 9 + SAVE_VSX 30, 448, 9 + SAVE_VSX 31, 464, 9 +.endm # SAVE_REGS + +.macro RESTORE_REGS + addi 9, 1, 256 + RESTORE_VRS 20, 0, 9 + RESTORE_VRS 21, 16, 9 + RESTORE_VRS 22, 32, 9 + RESTORE_VRS 23, 48, 9 + RESTORE_VRS 24, 64, 9 + RESTORE_VRS 25, 80, 9 + RESTORE_VRS 26, 96, 9 + RESTORE_VRS 27, 112, 9 + RESTORE_VRS 28, 128, 9 + RESTORE_VRS 29, 144, 9 + RESTORE_VRS 30, 160, 9 + RESTORE_VRS 31, 176, 9 + + RESTORE_VSX 14, 192, 9 + RESTORE_VSX 15, 208, 9 + RESTORE_VSX 16, 224, 9 + RESTORE_VSX 17, 240, 9 + RESTORE_VSX 18, 256, 9 + RESTORE_VSX 19, 272, 9 + RESTORE_VSX 20, 288, 9 + RESTORE_VSX 21, 304, 9 + RESTORE_VSX 22, 320, 9 + RESTORE_VSX 23, 336, 9 + RESTORE_VSX 24, 352, 9 + RESTORE_VSX 25, 368, 9 + RESTORE_VSX 26, 384, 9 + RESTORE_VSX 27, 400, 9 + RESTORE_VSX 28, 416, 9 + RESTORE_VSX 29, 432, 9 + RESTORE_VSX 30, 448, 9 + RESTORE_VSX 31, 464, 9 + + RESTORE_GPR 14, 112, 1 + RESTORE_GPR 15, 120, 1 + RESTORE_GPR 16, 128, 1 + RESTORE_GPR 17, 136, 1 + RESTORE_GPR 18, 144, 1 + RESTORE_GPR 19, 152, 1 + RESTORE_GPR 20, 160, 1 + RESTORE_GPR 21, 168, 1 + RESTORE_GPR 22, 176, 1 + RESTORE_GPR 23, 184, 1 + RESTORE_GPR 24, 192, 1 + RESTORE_GPR 25, 200, 1 + RESTORE_GPR 26, 208, 1 + RESTORE_GPR 27, 216, 1 + RESTORE_GPR 28, 224, 1 + RESTORE_GPR 29, 232, 1 + RESTORE_GPR 30, 240, 1 + RESTORE_GPR 31, 248, 1 + + addi 1, 1, 752 + ld 0, 16(1) + mtlr 0 +.endm # RESTORE_REGS + +.macro QT_loop_8x + # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) + xxlor 0, 32+25, 32+25 + xxlor 32+25, 20, 20 + vadduwm 0, 0, 4 + vadduwm 1, 1, 5 + vadduwm 2, 2, 6 + vadduwm 3, 3, 7 + vadduwm 16, 16, 20 + vadduwm 17, 17, 21 + vadduwm 18, 18, 22 
+ vadduwm 19, 19, 23 + + vpermxor 12, 12, 0, 25 + vpermxor 13, 13, 1, 25 + vpermxor 14, 14, 2, 25 + vpermxor 15, 15, 3, 25 + vpermxor 28, 28, 16, 25 + vpermxor 29, 29, 17, 25 + vpermxor 30, 30, 18, 25 + vpermxor 31, 31, 19, 25 + xxlor 32+25, 0, 0 + vadduwm 8, 8, 12 + vadduwm 9, 9, 13 + vadduwm 10, 10, 14 + vadduwm 11, 11, 15 + vadduwm 24, 24, 28 + vadduwm 25, 25, 29 + vadduwm 26, 26, 30 + vadduwm 27, 27, 31 + vxor 4, 4, 8 + vxor 5, 5, 9 + vxor 6, 6, 10 + vxor 7, 7, 11 + vxor 20, 20, 24 + vxor 21, 21, 25 + vxor 22, 22, 26 + vxor 23, 23, 27 + + xxlor 0, 32+25, 32+25 + xxlor 32+25, 21, 21 + vrlw 4, 4, 25 # + vrlw 5, 5, 25 + vrlw 6, 6, 25 + vrlw 7, 7, 25 + vrlw 20, 20, 25 # + vrlw 21, 21, 25 + vrlw 22, 22, 25 + vrlw 23, 23, 25 + xxlor 32+25, 0, 0 + vadduwm 0, 0, 4 + vadduwm 1, 1, 5 + vadduwm 2, 2, 6 + vadduwm 3, 3, 7 + vadduwm 16, 16, 20 + vadduwm 17, 17, 21 + vadduwm 18, 18, 22 + vadduwm 19, 19, 23 + + xxlor 0, 32+25, 32+25 + xxlor 32+25, 22, 22 + vpermxor 12, 12, 0, 25 + vpermxor 13, 13, 1, 25 + vpermxor 14, 14, 2, 25 + vpermxor 15, 15, 3, 25 + vpermxor 28, 28, 16, 25 + vpermxor 29, 29, 17, 25 + vpermxor 30, 30, 18, 25 + vpermxor 31, 31, 19, 25 + xxlor 32+25, 0, 0 + vadduwm 8, 8, 12 + vadduwm 9, 9, 13 + vadduwm 10, 10, 14 + vadduwm 11, 11, 15 + vadduwm 24, 24, 28 + vadduwm 25, 25, 29 + vadduwm 26, 26, 30 + vadduwm 27, 27, 31 + xxlor 0, 32+28, 32+28 + xxlor 32+28, 23, 23 + vxor 4, 4, 8 + vxor 5, 5, 9 + vxor 6, 6, 10 + vxor 7, 7, 11 + vxor 20, 20, 24 + vxor 21, 21, 25 + vxor 22, 22, 26 + vxor 23, 23, 27 + vrlw 4, 4, 28 # + vrlw 5, 5, 28 + vrlw 6, 6, 28 + vrlw 7, 7, 28 + vrlw 20, 20, 28 # + vrlw 21, 21, 28 + vrlw 22, 22, 28 + vrlw 23, 23, 28 + xxlor 32+28, 0, 0 + + # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) + xxlor 0, 32+25, 32+25 + xxlor 32+25, 20, 20 + vadduwm 0, 0, 5 + vadduwm 1, 1, 6 + vadduwm 2, 2, 7 + vadduwm 3, 3, 4 + vadduwm 16, 16, 21 + vadduwm 17, 17, 22 + vadduwm 18, 18, 23 + vadduwm 19, 19, 20 + + vpermxor 15, 15, 0, 25 + vpermxor 12, 12, 1, 25 + vpermxor 13, 13, 2, 25 + vpermxor 14, 14, 3, 25 + vpermxor 31, 31, 16, 25 + vpermxor 28, 28, 17, 25 + vpermxor 29, 29, 18, 25 + vpermxor 30, 30, 19, 25 + + xxlor 32+25, 0, 0 + vadduwm 10, 10, 15 + vadduwm 11, 11, 12 + vadduwm 8, 8, 13 + vadduwm 9, 9, 14 + vadduwm 26, 26, 31 + vadduwm 27, 27, 28 + vadduwm 24, 24, 29 + vadduwm 25, 25, 30 + vxor 5, 5, 10 + vxor 6, 6, 11 + vxor 7, 7, 8 + vxor 4, 4, 9 + vxor 21, 21, 26 + vxor 22, 22, 27 + vxor 23, 23, 24 + vxor 20, 20, 25 + + xxlor 0, 32+25, 32+25 + xxlor 32+25, 21, 21 + vrlw 5, 5, 25 + vrlw 6, 6, 25 + vrlw 7, 7, 25 + vrlw 4, 4, 25 + vrlw 21, 21, 25 + vrlw 22, 22, 25 + vrlw 23, 23, 25 + vrlw 20, 20, 25 + xxlor 32+25, 0, 0 + + vadduwm 0, 0, 5 + vadduwm 1, 1, 6 + vadduwm 2, 2, 7 + vadduwm 3, 3, 4 + vadduwm 16, 16, 21 + vadduwm 17, 17, 22 + vadduwm 18, 18, 23 + vadduwm 19, 19, 20 + + xxlor 0, 32+25, 32+25 + xxlor 32+25, 22, 22 + vpermxor 15, 15, 0, 25 + vpermxor 12, 12, 1, 25 + vpermxor 13, 13, 2, 25 + vpermxor 14, 14, 3, 25 + vpermxor 31, 31, 16, 25 + vpermxor 28, 28, 17, 25 + vpermxor 29, 29, 18, 25 + vpermxor 30, 30, 19, 25 + xxlor 32+25, 0, 0 + + vadduwm 10, 10, 15 + vadduwm 11, 11, 12 + vadduwm 8, 8, 13 + vadduwm 9, 9, 14 + vadduwm 26, 26, 31 + vadduwm 27, 27, 28 + vadduwm 24, 24, 29 + vadduwm 25, 25, 30 + + xxlor 0, 32+28, 32+28 + xxlor 32+28, 23, 23 + vxor 5, 5, 10 + vxor 6, 6, 11 + vxor 7, 7, 8 + vxor 4, 4, 9 + vxor 21, 21, 26 + vxor 22, 22, 27 + vxor 23, 23, 24 + vxor 20, 20, 25 + vrlw 5, 5, 28 + vrlw 6, 6, 28 + vrlw 7, 7, 28 + vrlw 4, 4, 28 + vrlw 21, 21, 
28 + vrlw 22, 22, 28 + vrlw 23, 23, 28 + vrlw 20, 20, 28 + xxlor 32+28, 0, 0 +.endm + +.macro QT_loop_4x + # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) + vadduwm 0, 0, 4 + vadduwm 1, 1, 5 + vadduwm 2, 2, 6 + vadduwm 3, 3, 7 + vpermxor 12, 12, 0, 20 + vpermxor 13, 13, 1, 20 + vpermxor 14, 14, 2, 20 + vpermxor 15, 15, 3, 20 + vadduwm 8, 8, 12 + vadduwm 9, 9, 13 + vadduwm 10, 10, 14 + vadduwm 11, 11, 15 + vxor 4, 4, 8 + vxor 5, 5, 9 + vxor 6, 6, 10 + vxor 7, 7, 11 + vrlw 4, 4, 21 + vrlw 5, 5, 21 + vrlw 6, 6, 21 + vrlw 7, 7, 21 + vadduwm 0, 0, 4 + vadduwm 1, 1, 5 + vadduwm 2, 2, 6 + vadduwm 3, 3, 7 + vpermxor 12, 12, 0, 22 + vpermxor 13, 13, 1, 22 + vpermxor 14, 14, 2, 22 + vpermxor 15, 15, 3, 22 + vadduwm 8, 8, 12 + vadduwm 9, 9, 13 + vadduwm 10, 10, 14 + vadduwm 11, 11, 15 + vxor 4, 4, 8 + vxor 5, 5, 9 + vxor 6, 6, 10 + vxor 7, 7, 11 + vrlw 4, 4, 23 + vrlw 5, 5, 23 + vrlw 6, 6, 23 + vrlw 7, 7, 23 + + # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) + vadduwm 0, 0, 5 + vadduwm 1, 1, 6 + vadduwm 2, 2, 7 + vadduwm 3, 3, 4 + vpermxor 15, 15, 0, 20 + vpermxor 12, 12, 1, 20 + vpermxor 13, 13, 2, 20 + vpermxor 14, 14, 3, 20 + vadduwm 10, 10, 15 + vadduwm 11, 11, 12 + vadduwm 8, 8, 13 + vadduwm 9, 9, 14 + vxor 5, 5, 10 + vxor 6, 6, 11 + vxor 7, 7, 8 + vxor 4, 4, 9 + vrlw 5, 5, 21 + vrlw 6, 6, 21 + vrlw 7, 7, 21 + vrlw 4, 4, 21 + vadduwm 0, 0, 5 + vadduwm 1, 1, 6 + vadduwm 2, 2, 7 + vadduwm 3, 3, 4 + vpermxor 15, 15, 0, 22 + vpermxor 12, 12, 1, 22 + vpermxor 13, 13, 2, 22 + vpermxor 14, 14, 3, 22 + vadduwm 10, 10, 15 + vadduwm 11, 11, 12 + vadduwm 8, 8, 13 + vadduwm 9, 9, 14 + vxor 5, 5, 10 + vxor 6, 6, 11 + vxor 7, 7, 8 + vxor 4, 4, 9 + vrlw 5, 5, 23 + vrlw 6, 6, 23 + vrlw 7, 7, 23 + vrlw 4, 4, 23 +.endm + +# Transpose +.macro TP_4x a0 a1 a2 a3 + xxmrghw 10, 32+\a0, 32+\a1 # a0, a1, b0, b1 + xxmrghw 11, 32+\a2, 32+\a3 # a2, a3, b2, b3 + xxmrglw 12, 32+\a0, 32+\a1 # c0, c1, d0, d1 + xxmrglw 13, 32+\a2, 32+\a3 # c2, c3, d2, d3 + xxpermdi 32+\a0, 10, 11, 0 # a0, a1, a2, a3 + xxpermdi 32+\a1, 10, 11, 3 # b0, b1, b2, b3 + xxpermdi 32+\a2, 12, 13, 0 # c0, c1, c2, c3 + xxpermdi 32+\a3, 12, 13, 3 # d0, d1, d2, d3 +.endm + +# key stream = working state + state +.macro Add_state S + vadduwm \S+0, \S+0, 16-\S + vadduwm \S+4, \S+4, 17-\S + vadduwm \S+8, \S+8, 18-\S + vadduwm \S+12, \S+12, 19-\S + + vadduwm \S+1, \S+1, 16-\S + vadduwm \S+5, \S+5, 17-\S + vadduwm \S+9, \S+9, 18-\S + vadduwm \S+13, \S+13, 19-\S + + vadduwm \S+2, \S+2, 16-\S + vadduwm \S+6, \S+6, 17-\S + vadduwm \S+10, \S+10, 18-\S + vadduwm \S+14, \S+14, 19-\S + + vadduwm \S+3, \S+3, 16-\S + vadduwm \S+7, \S+7, 17-\S + vadduwm \S+11, \S+11, 18-\S + vadduwm \S+15, \S+15, 19-\S +.endm + +# +# write 256 bytes +# +.macro Write_256 S + add 9, 14, 5 + add 16, 14, 4 + lxvw4x 0, 0, 9 + lxvw4x 1, 17, 9 + lxvw4x 2, 18, 9 + lxvw4x 3, 19, 9 + lxvw4x 4, 20, 9 + lxvw4x 5, 21, 9 + lxvw4x 6, 22, 9 + lxvw4x 7, 23, 9 + lxvw4x 8, 24, 9 + lxvw4x 9, 25, 9 + lxvw4x 10, 26, 9 + lxvw4x 11, 27, 9 + lxvw4x 12, 28, 9 + lxvw4x 13, 29, 9 + lxvw4x 14, 30, 9 + lxvw4x 15, 31, 9 + + xxlxor \S+32, \S+32, 0 + xxlxor \S+36, \S+36, 1 + xxlxor \S+40, \S+40, 2 + xxlxor \S+44, \S+44, 3 + xxlxor \S+33, \S+33, 4 + xxlxor \S+37, \S+37, 5 + xxlxor \S+41, \S+41, 6 + xxlxor \S+45, \S+45, 7 + xxlxor \S+34, \S+34, 8 + xxlxor \S+38, \S+38, 9 + xxlxor \S+42, \S+42, 10 + xxlxor \S+46, \S+46, 11 + xxlxor \S+35, \S+35, 12 + xxlxor \S+39, \S+39, 13 + xxlxor \S+43, \S+43, 14 + xxlxor \S+47, \S+47, 15 + + stxvw4x \S+32, 0, 16 + stxvw4x \S+36, 17, 
16 + stxvw4x \S+40, 18, 16 + stxvw4x \S+44, 19, 16 + + stxvw4x \S+33, 20, 16 + stxvw4x \S+37, 21, 16 + stxvw4x \S+41, 22, 16 + stxvw4x \S+45, 23, 16 + + stxvw4x \S+34, 24, 16 + stxvw4x \S+38, 25, 16 + stxvw4x \S+42, 26, 16 + stxvw4x \S+46, 27, 16 + + stxvw4x \S+35, 28, 16 + stxvw4x \S+39, 29, 16 + stxvw4x \S+43, 30, 16 + stxvw4x \S+47, 31, 16 + +.endm + +# +# void chacha_p10le_8x(const struct chacha_state *state, u8 *dst, const u8 *src, +# unsigned int len, int nrounds); +# +SYM_FUNC_START(chacha_p10le_8x) +.align 5 + cmpdi 6, 0 + ble Out_no_chacha + + SAVE_REGS + + # r17 - r31 mainly for Write_256 macro. + li 17, 16 + li 18, 32 + li 19, 48 + li 20, 64 + li 21, 80 + li 22, 96 + li 23, 112 + li 24, 128 + li 25, 144 + li 26, 160 + li 27, 176 + li 28, 192 + li 29, 208 + li 30, 224 + li 31, 240 + + mr 15, 6 # len + li 14, 0 # offset to inp and outp + + lxvw4x 48, 0, 3 # vr16, constants + lxvw4x 49, 17, 3 # vr17, key 1 + lxvw4x 50, 18, 3 # vr18, key 2 + lxvw4x 51, 19, 3 # vr19, counter, nonce + + # create (0, 1, 2, 3) counters + vspltisw 0, 0 + vspltisw 1, 1 + vspltisw 2, 2 + vspltisw 3, 3 + vmrghw 4, 0, 1 + vmrglw 5, 2, 3 + vsldoi 30, 4, 5, 8 # vr30 counter, 4 (0, 1, 2, 3) + + vspltisw 21, 12 + vspltisw 23, 7 + + addis 11, 2, permx@toc@ha + addi 11, 11, permx@toc@l + lxvw4x 32+20, 0, 11 + lxvw4x 32+22, 17, 11 + + sradi 8, 7, 1 + + mtctr 8 + + # save constants to vsx + xxlor 16, 48, 48 + xxlor 17, 49, 49 + xxlor 18, 50, 50 + xxlor 19, 51, 51 + + vspltisw 25, 4 + vspltisw 26, 8 + + xxlor 25, 32+26, 32+26 + xxlor 24, 32+25, 32+25 + + vadduwm 31, 30, 25 # counter = (0, 1, 2, 3) + (4, 4, 4, 4) + xxlor 30, 32+30, 32+30 + xxlor 31, 32+31, 32+31 + + xxlor 20, 32+20, 32+20 + xxlor 21, 32+21, 32+21 + xxlor 22, 32+22, 32+22 + xxlor 23, 32+23, 32+23 + + cmpdi 6, 512 + blt Loop_last + +Loop_8x: + xxspltw 32+0, 16, 0 + xxspltw 32+1, 16, 1 + xxspltw 32+2, 16, 2 + xxspltw 32+3, 16, 3 + + xxspltw 32+4, 17, 0 + xxspltw 32+5, 17, 1 + xxspltw 32+6, 17, 2 + xxspltw 32+7, 17, 3 + xxspltw 32+8, 18, 0 + xxspltw 32+9, 18, 1 + xxspltw 32+10, 18, 2 + xxspltw 32+11, 18, 3 + xxspltw 32+12, 19, 0 + xxspltw 32+13, 19, 1 + xxspltw 32+14, 19, 2 + xxspltw 32+15, 19, 3 + vadduwm 12, 12, 30 # increase counter + + xxspltw 32+16, 16, 0 + xxspltw 32+17, 16, 1 + xxspltw 32+18, 16, 2 + xxspltw 32+19, 16, 3 + + xxspltw 32+20, 17, 0 + xxspltw 32+21, 17, 1 + xxspltw 32+22, 17, 2 + xxspltw 32+23, 17, 3 + xxspltw 32+24, 18, 0 + xxspltw 32+25, 18, 1 + xxspltw 32+26, 18, 2 + xxspltw 32+27, 18, 3 + xxspltw 32+28, 19, 0 + xxspltw 32+29, 19, 1 + vadduwm 28, 28, 31 # increase counter + xxspltw 32+30, 19, 2 + xxspltw 32+31, 19, 3 + +.align 5 +quarter_loop_8x: + QT_loop_8x + + bdnz quarter_loop_8x + + xxlor 0, 32+30, 32+30 + xxlor 32+30, 30, 30 + vadduwm 12, 12, 30 + xxlor 32+30, 0, 0 + TP_4x 0, 1, 2, 3 + TP_4x 4, 5, 6, 7 + TP_4x 8, 9, 10, 11 + TP_4x 12, 13, 14, 15 + + xxlor 0, 48, 48 + xxlor 1, 49, 49 + xxlor 2, 50, 50 + xxlor 3, 51, 51 + xxlor 48, 16, 16 + xxlor 49, 17, 17 + xxlor 50, 18, 18 + xxlor 51, 19, 19 + Add_state 0 + xxlor 48, 0, 0 + xxlor 49, 1, 1 + xxlor 50, 2, 2 + xxlor 51, 3, 3 + Write_256 0 + addi 14, 14, 256 # offset +=256 + addi 15, 15, -256 # len -=256 + + xxlor 5, 32+31, 32+31 + xxlor 32+31, 31, 31 + vadduwm 28, 28, 31 + xxlor 32+31, 5, 5 + TP_4x 16+0, 16+1, 16+2, 16+3 + TP_4x 16+4, 16+5, 16+6, 16+7 + TP_4x 16+8, 16+9, 16+10, 16+11 + TP_4x 16+12, 16+13, 16+14, 16+15 + + xxlor 32, 16, 16 + xxlor 33, 17, 17 + xxlor 34, 18, 18 + xxlor 35, 19, 19 + Add_state 16 + Write_256 16 + addi 14, 14, 256 # offset +=256 + addi 15, 15, -256 # 
len +=256 + + xxlor 32+24, 24, 24 + xxlor 32+25, 25, 25 + xxlor 32+30, 30, 30 + vadduwm 30, 30, 25 + vadduwm 31, 30, 24 + xxlor 30, 32+30, 32+30 + xxlor 31, 32+31, 32+31 + + cmpdi 15, 0 + beq Out_loop + + cmpdi 15, 512 + blt Loop_last + + mtctr 8 + b Loop_8x + +Loop_last: + lxvw4x 48, 0, 3 # vr16, constants + lxvw4x 49, 17, 3 # vr17, key 1 + lxvw4x 50, 18, 3 # vr18, key 2 + lxvw4x 51, 19, 3 # vr19, counter, nonce + + vspltisw 21, 12 + vspltisw 23, 7 + addis 11, 2, permx@toc@ha + addi 11, 11, permx@toc@l + lxvw4x 32+20, 0, 11 + lxvw4x 32+22, 17, 11 + + sradi 8, 7, 1 + mtctr 8 + +Loop_4x: + vspltw 0, 16, 0 + vspltw 1, 16, 1 + vspltw 2, 16, 2 + vspltw 3, 16, 3 + + vspltw 4, 17, 0 + vspltw 5, 17, 1 + vspltw 6, 17, 2 + vspltw 7, 17, 3 + vspltw 8, 18, 0 + vspltw 9, 18, 1 + vspltw 10, 18, 2 + vspltw 11, 18, 3 + vspltw 12, 19, 0 + vadduwm 12, 12, 30 # increase counter + vspltw 13, 19, 1 + vspltw 14, 19, 2 + vspltw 15, 19, 3 + +.align 5 +quarter_loop: + QT_loop_4x + + bdnz quarter_loop + + vadduwm 12, 12, 30 + TP_4x 0, 1, 2, 3 + TP_4x 4, 5, 6, 7 + TP_4x 8, 9, 10, 11 + TP_4x 12, 13, 14, 15 + + Add_state 0 + Write_256 0 + addi 14, 14, 256 # offset += 256 + addi 15, 15, -256 # len += 256 + + # Update state counter + vspltisw 25, 4 + vadduwm 30, 30, 25 + + cmpdi 15, 0 + beq Out_loop + cmpdi 15, 256 + blt Out_loop + + mtctr 8 + b Loop_4x + +Out_loop: + RESTORE_REGS + blr + +Out_no_chacha: + li 3, 0 + blr +SYM_FUNC_END(chacha_p10le_8x) + +SYM_DATA_START_LOCAL(PERMX) +.align 5 +permx: +.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd +.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc +SYM_DATA_END(PERMX) diff --git a/lib/crypto/powerpc/poly1305-p10-glue.c b/lib/crypto/powerpc/poly1305-p10-glue.c new file mode 100644 index 000000000000..3f1664a724b6 --- /dev/null +++ b/lib/crypto/powerpc/poly1305-p10-glue.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Poly1305 authenticator algorithm, RFC7539. + * + * Copyright 2023- IBM Corp. All rights reserved. 
+ */ +#include <asm/switch_to.h> +#include <crypto/internal/poly1305.h> +#include <linux/cpufeature.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/unaligned.h> + +asmlinkage void poly1305_p10le_4blocks(struct poly1305_block_state *state, const u8 *m, u32 mlen); +asmlinkage void poly1305_64s(struct poly1305_block_state *state, const u8 *m, u32 mlen, int highbit); +asmlinkage void poly1305_emit_64(const struct poly1305_state *state, const u32 nonce[4], u8 digest[POLY1305_DIGEST_SIZE]); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10); + +static void vsx_begin(void) +{ + preempt_disable(); + enable_kernel_vsx(); +} + +static void vsx_end(void) +{ + disable_kernel_vsx(); + preempt_enable(); +} + +void poly1305_block_init_arch(struct poly1305_block_state *dctx, + const u8 raw_key[POLY1305_BLOCK_SIZE]) +{ + if (!static_key_enabled(&have_p10)) + return poly1305_block_init_generic(dctx, raw_key); + + dctx->h = (struct poly1305_state){}; + dctx->core_r.key.r64[0] = get_unaligned_le64(raw_key + 0); + dctx->core_r.key.r64[1] = get_unaligned_le64(raw_key + 8); +} +EXPORT_SYMBOL_GPL(poly1305_block_init_arch); + +void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, + unsigned int len, u32 padbit) +{ + if (!static_key_enabled(&have_p10)) + return poly1305_blocks_generic(state, src, len, padbit); + vsx_begin(); + if (len >= POLY1305_BLOCK_SIZE * 4) { + poly1305_p10le_4blocks(state, src, len); + src += len - (len % (POLY1305_BLOCK_SIZE * 4)); + len %= POLY1305_BLOCK_SIZE * 4; + } + while (len >= POLY1305_BLOCK_SIZE) { + poly1305_64s(state, src, POLY1305_BLOCK_SIZE, padbit); + len -= POLY1305_BLOCK_SIZE; + src += POLY1305_BLOCK_SIZE; + } + vsx_end(); +} +EXPORT_SYMBOL_GPL(poly1305_blocks_arch); + +void poly1305_emit_arch(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]) +{ + if (!static_key_enabled(&have_p10)) + return poly1305_emit_generic(state, digest, nonce); + poly1305_emit_64(state, nonce, digest); +} +EXPORT_SYMBOL_GPL(poly1305_emit_arch); + +bool poly1305_is_arch_optimized(void) +{ + return static_key_enabled(&have_p10); +} +EXPORT_SYMBOL(poly1305_is_arch_optimized); + +static int __init poly1305_p10_init(void) +{ + if (cpu_has_feature(CPU_FTR_ARCH_31)) + static_branch_enable(&have_p10); + return 0; +} +subsys_initcall(poly1305_p10_init); + +static void __exit poly1305_p10_exit(void) +{ +} +module_exit(poly1305_p10_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>"); +MODULE_DESCRIPTION("Optimized Poly1305 for P10"); diff --git a/lib/crypto/powerpc/poly1305-p10le_64.S b/lib/crypto/powerpc/poly1305-p10le_64.S new file mode 100644 index 000000000000..a3c1987f1ecd --- /dev/null +++ b/lib/crypto/powerpc/poly1305-p10le_64.S @@ -0,0 +1,1075 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +# +# Accelerated poly1305 implementation for ppc64le. +# +# Copyright 2023- IBM Corp. All rights reserved +# +#=================================================================================== +# Written by Danny Tsen <dtsen@us.ibm.com> +# +# Poly1305 - this version mainly using vector/VSX/Scalar +# - 26 bits limbs +# - Handle multiple 64 byte blcok. 
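+# - Each 64-byte chunk is four 16-byte blocks folded together using the
+#   precomputed powers r^4, r^3, r^2, r described below.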
+# +# Block size 16 bytes +# key = (r, s) +# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF +# p = 2^130 - 5 +# a += m +# a = (r + a) % p +# a += s +# +# Improve performance by breaking down polynominal to the sum of products with +# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r +# +# 07/22/21 - this revison based on the above sum of products. Setup r^4, r^3, r^2, r and s3, s2, s1, s0 +# to 9 vectors for multiplications. +# +# setup r^4, r^3, r^2, r vectors +# vs [r^1, r^3, r^2, r^4] +# vs0 = [r0,.....] +# vs1 = [r1,.....] +# vs2 = [r2,.....] +# vs3 = [r3,.....] +# vs4 = [r4,.....] +# vs5 = [r1*5,...] +# vs6 = [r2*5,...] +# vs7 = [r2*5,...] +# vs8 = [r4*5,...] +# +# Each word in a vector consists a member of a "r/s" in [a * r/s]. +# +# r0, r4*5, r3*5, r2*5, r1*5; +# r1, r0, r4*5, r3*5, r2*5; +# r2, r1, r0, r4*5, r3*5; +# r3, r2, r1, r0, r4*5; +# r4, r3, r2, r1, r0 ; +# +# +# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m) +# k = 32 bytes key +# r3 = k (r, s) +# r4 = mlen +# r5 = m +# +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/asm-compat.h> +#include <linux/linkage.h> + +.machine "any" + +.text + +.macro SAVE_GPR GPR OFFSET FRAME + std \GPR,\OFFSET(\FRAME) +.endm + +.macro SAVE_VRS VRS OFFSET FRAME + li 16, \OFFSET + stvx \VRS, 16, \FRAME +.endm + +.macro SAVE_VSX VSX OFFSET FRAME + li 16, \OFFSET + stxvx \VSX, 16, \FRAME +.endm + +.macro RESTORE_GPR GPR OFFSET FRAME + ld \GPR,\OFFSET(\FRAME) +.endm + +.macro RESTORE_VRS VRS OFFSET FRAME + li 16, \OFFSET + lvx \VRS, 16, \FRAME +.endm + +.macro RESTORE_VSX VSX OFFSET FRAME + li 16, \OFFSET + lxvx \VSX, 16, \FRAME +.endm + +.macro SAVE_REGS + mflr 0 + std 0, 16(1) + stdu 1,-752(1) + + SAVE_GPR 14, 112, 1 + SAVE_GPR 15, 120, 1 + SAVE_GPR 16, 128, 1 + SAVE_GPR 17, 136, 1 + SAVE_GPR 18, 144, 1 + SAVE_GPR 19, 152, 1 + SAVE_GPR 20, 160, 1 + SAVE_GPR 21, 168, 1 + SAVE_GPR 22, 176, 1 + SAVE_GPR 23, 184, 1 + SAVE_GPR 24, 192, 1 + SAVE_GPR 25, 200, 1 + SAVE_GPR 26, 208, 1 + SAVE_GPR 27, 216, 1 + SAVE_GPR 28, 224, 1 + SAVE_GPR 29, 232, 1 + SAVE_GPR 30, 240, 1 + SAVE_GPR 31, 248, 1 + + addi 9, 1, 256 + SAVE_VRS 20, 0, 9 + SAVE_VRS 21, 16, 9 + SAVE_VRS 22, 32, 9 + SAVE_VRS 23, 48, 9 + SAVE_VRS 24, 64, 9 + SAVE_VRS 25, 80, 9 + SAVE_VRS 26, 96, 9 + SAVE_VRS 27, 112, 9 + SAVE_VRS 28, 128, 9 + SAVE_VRS 29, 144, 9 + SAVE_VRS 30, 160, 9 + SAVE_VRS 31, 176, 9 + + SAVE_VSX 14, 192, 9 + SAVE_VSX 15, 208, 9 + SAVE_VSX 16, 224, 9 + SAVE_VSX 17, 240, 9 + SAVE_VSX 18, 256, 9 + SAVE_VSX 19, 272, 9 + SAVE_VSX 20, 288, 9 + SAVE_VSX 21, 304, 9 + SAVE_VSX 22, 320, 9 + SAVE_VSX 23, 336, 9 + SAVE_VSX 24, 352, 9 + SAVE_VSX 25, 368, 9 + SAVE_VSX 26, 384, 9 + SAVE_VSX 27, 400, 9 + SAVE_VSX 28, 416, 9 + SAVE_VSX 29, 432, 9 + SAVE_VSX 30, 448, 9 + SAVE_VSX 31, 464, 9 +.endm # SAVE_REGS + +.macro RESTORE_REGS + addi 9, 1, 256 + RESTORE_VRS 20, 0, 9 + RESTORE_VRS 21, 16, 9 + RESTORE_VRS 22, 32, 9 + RESTORE_VRS 23, 48, 9 + RESTORE_VRS 24, 64, 9 + RESTORE_VRS 25, 80, 9 + RESTORE_VRS 26, 96, 9 + RESTORE_VRS 27, 112, 9 + RESTORE_VRS 28, 128, 9 + RESTORE_VRS 29, 144, 9 + RESTORE_VRS 30, 160, 9 + RESTORE_VRS 31, 176, 9 + + RESTORE_VSX 14, 192, 9 + RESTORE_VSX 15, 208, 9 + RESTORE_VSX 16, 224, 9 + RESTORE_VSX 17, 240, 9 + RESTORE_VSX 18, 256, 9 + RESTORE_VSX 19, 272, 9 + RESTORE_VSX 20, 288, 9 + RESTORE_VSX 21, 304, 9 + RESTORE_VSX 22, 320, 9 + RESTORE_VSX 23, 336, 9 + RESTORE_VSX 24, 352, 9 + RESTORE_VSX 25, 368, 9 + RESTORE_VSX 26, 384, 9 + RESTORE_VSX 27, 400, 9 + RESTORE_VSX 28, 416, 9 + RESTORE_VSX 29, 432, 9 + RESTORE_VSX 30, 448, 9 + 
RESTORE_VSX 31, 464, 9 + + RESTORE_GPR 14, 112, 1 + RESTORE_GPR 15, 120, 1 + RESTORE_GPR 16, 128, 1 + RESTORE_GPR 17, 136, 1 + RESTORE_GPR 18, 144, 1 + RESTORE_GPR 19, 152, 1 + RESTORE_GPR 20, 160, 1 + RESTORE_GPR 21, 168, 1 + RESTORE_GPR 22, 176, 1 + RESTORE_GPR 23, 184, 1 + RESTORE_GPR 24, 192, 1 + RESTORE_GPR 25, 200, 1 + RESTORE_GPR 26, 208, 1 + RESTORE_GPR 27, 216, 1 + RESTORE_GPR 28, 224, 1 + RESTORE_GPR 29, 232, 1 + RESTORE_GPR 30, 240, 1 + RESTORE_GPR 31, 248, 1 + + addi 1, 1, 752 + ld 0, 16(1) + mtlr 0 +.endm # RESTORE_REGS + +# +# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5; +# p[1] = a0*r1 + a1*r0 + a2*r4*5 + a3*r3*5 + a4*r2*5; +# p[2] = a0*r2 + a1*r1 + a2*r0 + a3*r4*5 + a4*r3*5; +# p[3] = a0*r3 + a1*r2 + a2*r1 + a3*r0 + a4*r4*5; +# p[4] = a0*r4 + a1*r3 + a2*r2 + a3*r1 + a4*r0 ; +# +# [r^2, r^3, r^1, r^4] +# [m3, m2, m4, m1] +# +# multiply odd and even words +.macro mul_odd + vmulouw 14, 4, 26 + vmulouw 10, 5, 3 + vmulouw 11, 6, 2 + vmulouw 12, 7, 1 + vmulouw 13, 8, 0 + vmulouw 15, 4, 27 + vaddudm 14, 14, 10 + vaddudm 14, 14, 11 + vmulouw 10, 5, 26 + vmulouw 11, 6, 3 + vaddudm 14, 14, 12 + vaddudm 14, 14, 13 # x0 + vaddudm 15, 15, 10 + vaddudm 15, 15, 11 + vmulouw 12, 7, 2 + vmulouw 13, 8, 1 + vaddudm 15, 15, 12 + vaddudm 15, 15, 13 # x1 + vmulouw 16, 4, 28 + vmulouw 10, 5, 27 + vmulouw 11, 6, 26 + vaddudm 16, 16, 10 + vaddudm 16, 16, 11 + vmulouw 12, 7, 3 + vmulouw 13, 8, 2 + vaddudm 16, 16, 12 + vaddudm 16, 16, 13 # x2 + vmulouw 17, 4, 29 + vmulouw 10, 5, 28 + vmulouw 11, 6, 27 + vaddudm 17, 17, 10 + vaddudm 17, 17, 11 + vmulouw 12, 7, 26 + vmulouw 13, 8, 3 + vaddudm 17, 17, 12 + vaddudm 17, 17, 13 # x3 + vmulouw 18, 4, 30 + vmulouw 10, 5, 29 + vmulouw 11, 6, 28 + vaddudm 18, 18, 10 + vaddudm 18, 18, 11 + vmulouw 12, 7, 27 + vmulouw 13, 8, 26 + vaddudm 18, 18, 12 + vaddudm 18, 18, 13 # x4 +.endm + +.macro mul_even + vmuleuw 9, 4, 26 + vmuleuw 10, 5, 3 + vmuleuw 11, 6, 2 + vmuleuw 12, 7, 1 + vmuleuw 13, 8, 0 + vaddudm 14, 14, 9 + vaddudm 14, 14, 10 + vaddudm 14, 14, 11 + vaddudm 14, 14, 12 + vaddudm 14, 14, 13 # x0 + + vmuleuw 9, 4, 27 + vmuleuw 10, 5, 26 + vmuleuw 11, 6, 3 + vmuleuw 12, 7, 2 + vmuleuw 13, 8, 1 + vaddudm 15, 15, 9 + vaddudm 15, 15, 10 + vaddudm 15, 15, 11 + vaddudm 15, 15, 12 + vaddudm 15, 15, 13 # x1 + + vmuleuw 9, 4, 28 + vmuleuw 10, 5, 27 + vmuleuw 11, 6, 26 + vmuleuw 12, 7, 3 + vmuleuw 13, 8, 2 + vaddudm 16, 16, 9 + vaddudm 16, 16, 10 + vaddudm 16, 16, 11 + vaddudm 16, 16, 12 + vaddudm 16, 16, 13 # x2 + + vmuleuw 9, 4, 29 + vmuleuw 10, 5, 28 + vmuleuw 11, 6, 27 + vmuleuw 12, 7, 26 + vmuleuw 13, 8, 3 + vaddudm 17, 17, 9 + vaddudm 17, 17, 10 + vaddudm 17, 17, 11 + vaddudm 17, 17, 12 + vaddudm 17, 17, 13 # x3 + + vmuleuw 9, 4, 30 + vmuleuw 10, 5, 29 + vmuleuw 11, 6, 28 + vmuleuw 12, 7, 27 + vmuleuw 13, 8, 26 + vaddudm 18, 18, 9 + vaddudm 18, 18, 10 + vaddudm 18, 18, 11 + vaddudm 18, 18, 12 + vaddudm 18, 18, 13 # x4 +.endm + +# +# poly1305_setup_r +# +# setup r^4, r^3, r^2, r vectors +# [r, r^3, r^2, r^4] +# vs0 = [r0,...] +# vs1 = [r1,...] +# vs2 = [r2,...] +# vs3 = [r3,...] +# vs4 = [r4,...] +# vs5 = [r4*5,...] +# vs6 = [r3*5,...] +# vs7 = [r2*5,...] +# vs8 = [r1*5,...] 
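+#
+# Each row below lists the multipliers applied to limbs a0..a4 to produce
+# one limb of the product, matching the p[0]..p[4] formulas above mul_odd.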
+# +# r0, r4*5, r3*5, r2*5, r1*5; +# r1, r0, r4*5, r3*5, r2*5; +# r2, r1, r0, r4*5, r3*5; +# r3, r2, r1, r0, r4*5; +# r4, r3, r2, r1, r0 ; +# +.macro poly1305_setup_r + + # save r + xxlor 26, 58, 58 + xxlor 27, 59, 59 + xxlor 28, 60, 60 + xxlor 29, 61, 61 + xxlor 30, 62, 62 + + xxlxor 31, 31, 31 + +# [r, r^3, r^2, r^4] + # compute r^2 + vmr 4, 26 + vmr 5, 27 + vmr 6, 28 + vmr 7, 29 + vmr 8, 30 + bl do_mul # r^2 r^1 + xxpermdi 58, 58, 36, 0x3 # r0 + xxpermdi 59, 59, 37, 0x3 # r1 + xxpermdi 60, 60, 38, 0x3 # r2 + xxpermdi 61, 61, 39, 0x3 # r3 + xxpermdi 62, 62, 40, 0x3 # r4 + xxpermdi 36, 36, 36, 0x3 + xxpermdi 37, 37, 37, 0x3 + xxpermdi 38, 38, 38, 0x3 + xxpermdi 39, 39, 39, 0x3 + xxpermdi 40, 40, 40, 0x3 + vspltisb 13, 2 + vsld 9, 27, 13 + vsld 10, 28, 13 + vsld 11, 29, 13 + vsld 12, 30, 13 + vaddudm 0, 9, 27 + vaddudm 1, 10, 28 + vaddudm 2, 11, 29 + vaddudm 3, 12, 30 + + bl do_mul # r^4 r^3 + vmrgow 26, 26, 4 + vmrgow 27, 27, 5 + vmrgow 28, 28, 6 + vmrgow 29, 29, 7 + vmrgow 30, 30, 8 + vspltisb 13, 2 + vsld 9, 27, 13 + vsld 10, 28, 13 + vsld 11, 29, 13 + vsld 12, 30, 13 + vaddudm 0, 9, 27 + vaddudm 1, 10, 28 + vaddudm 2, 11, 29 + vaddudm 3, 12, 30 + + # r^2 r^4 + xxlor 0, 58, 58 + xxlor 1, 59, 59 + xxlor 2, 60, 60 + xxlor 3, 61, 61 + xxlor 4, 62, 62 + xxlor 5, 32, 32 + xxlor 6, 33, 33 + xxlor 7, 34, 34 + xxlor 8, 35, 35 + + vspltw 9, 26, 3 + vspltw 10, 26, 2 + vmrgow 26, 10, 9 + vspltw 9, 27, 3 + vspltw 10, 27, 2 + vmrgow 27, 10, 9 + vspltw 9, 28, 3 + vspltw 10, 28, 2 + vmrgow 28, 10, 9 + vspltw 9, 29, 3 + vspltw 10, 29, 2 + vmrgow 29, 10, 9 + vspltw 9, 30, 3 + vspltw 10, 30, 2 + vmrgow 30, 10, 9 + + vsld 9, 27, 13 + vsld 10, 28, 13 + vsld 11, 29, 13 + vsld 12, 30, 13 + vaddudm 0, 9, 27 + vaddudm 1, 10, 28 + vaddudm 2, 11, 29 + vaddudm 3, 12, 30 +.endm + +SYM_FUNC_START_LOCAL(do_mul) + mul_odd + + # do reduction ( h %= p ) + # carry reduction + vspltisb 9, 2 + vsrd 10, 14, 31 + vsrd 11, 17, 31 + vand 7, 17, 25 + vand 4, 14, 25 + vaddudm 18, 18, 11 + vsrd 12, 18, 31 + vaddudm 15, 15, 10 + + vsrd 11, 15, 31 + vand 8, 18, 25 + vand 5, 15, 25 + vaddudm 4, 4, 12 + vsld 10, 12, 9 + vaddudm 6, 16, 11 + + vsrd 13, 6, 31 + vand 6, 6, 25 + vaddudm 4, 4, 10 + vsrd 10, 4, 31 + vaddudm 7, 7, 13 + + vsrd 11, 7, 31 + vand 7, 7, 25 + vand 4, 4, 25 + vaddudm 5, 5, 10 + vaddudm 8, 8, 11 + blr +SYM_FUNC_END(do_mul) + +# +# init key +# +.macro do_poly1305_init + addis 10, 2, rmask@toc@ha + addi 10, 10, rmask@toc@l + + ld 11, 0(10) + ld 12, 8(10) + + li 14, 16 + li 15, 32 + addis 10, 2, cnum@toc@ha + addi 10, 10, cnum@toc@l + lvx 25, 0, 10 # v25 - mask + lvx 31, 14, 10 # v31 = 1a + lvx 19, 15, 10 # v19 = 1 << 24 + lxv 24, 48(10) # vs24 + lxv 25, 64(10) # vs25 + + # initialize + # load key from r3 to vectors + ld 9, 24(3) + ld 10, 32(3) + and. 9, 9, 11 + and. 
10, 10, 12 + + # break 26 bits + extrdi 14, 9, 26, 38 + extrdi 15, 9, 26, 12 + extrdi 16, 9, 12, 0 + mtvsrdd 58, 0, 14 + insrdi 16, 10, 14, 38 + mtvsrdd 59, 0, 15 + extrdi 17, 10, 26, 24 + mtvsrdd 60, 0, 16 + extrdi 18, 10, 24, 0 + mtvsrdd 61, 0, 17 + mtvsrdd 62, 0, 18 + + # r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5 + li 9, 5 + mtvsrdd 36, 0, 9 + vmulouw 0, 27, 4 # v0 = rr0 + vmulouw 1, 28, 4 # v1 = rr1 + vmulouw 2, 29, 4 # v2 = rr2 + vmulouw 3, 30, 4 # v3 = rr3 +.endm + +# +# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m) +# k = 32 bytes key +# r3 = k (r, s) +# r4 = mlen +# r5 = m +# +SYM_FUNC_START(poly1305_p10le_4blocks) +.align 5 + cmpdi 5, 64 + blt Out_no_poly1305 + + SAVE_REGS + + do_poly1305_init + + li 21, 0 # counter to message + + poly1305_setup_r + + # load previous H state + # break/convert r6 to 26 bits + ld 9, 0(3) + ld 10, 8(3) + ld 19, 16(3) + sldi 19, 19, 24 + mtvsrdd 41, 0, 19 + extrdi 14, 9, 26, 38 + extrdi 15, 9, 26, 12 + extrdi 16, 9, 12, 0 + mtvsrdd 36, 0, 14 + insrdi 16, 10, 14, 38 + mtvsrdd 37, 0, 15 + extrdi 17, 10, 26, 24 + mtvsrdd 38, 0, 16 + extrdi 18, 10, 24, 0 + mtvsrdd 39, 0, 17 + mtvsrdd 40, 0, 18 + vor 8, 8, 9 + + # input m1 m2 + add 20, 4, 21 + xxlor 49, 24, 24 + xxlor 50, 25, 25 + lxvw4x 43, 0, 20 + addi 17, 20, 16 + lxvw4x 44, 0, 17 + vperm 14, 11, 12, 17 + vperm 15, 11, 12, 18 + vand 9, 14, 25 # a0 + vsrd 10, 14, 31 # >> 26 + vsrd 11, 10, 31 # 12 bits left + vand 10, 10, 25 # a1 + vspltisb 13, 12 + vand 16, 15, 25 + vsld 12, 16, 13 + vor 11, 11, 12 + vand 11, 11, 25 # a2 + vspltisb 13, 14 + vsrd 12, 15, 13 # >> 14 + vsrd 13, 12, 31 # >> 26, a4 + vand 12, 12, 25 # a3 + + vaddudm 20, 4, 9 + vaddudm 21, 5, 10 + vaddudm 22, 6, 11 + vaddudm 23, 7, 12 + vaddudm 24, 8, 13 + + # m3 m4 + addi 17, 17, 16 + lxvw4x 43, 0, 17 + addi 17, 17, 16 + lxvw4x 44, 0, 17 + vperm 14, 11, 12, 17 + vperm 15, 11, 12, 18 + vand 9, 14, 25 # a0 + vsrd 10, 14, 31 # >> 26 + vsrd 11, 10, 31 # 12 bits left + vand 10, 10, 25 # a1 + vspltisb 13, 12 + vand 16, 15, 25 + vsld 12, 16, 13 + vspltisb 13, 14 + vor 11, 11, 12 + vand 11, 11, 25 # a2 + vsrd 12, 15, 13 # >> 14 + vsrd 13, 12, 31 # >> 26, a4 + vand 12, 12, 25 # a3 + + # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1] + vmrgow 4, 9, 20 + vmrgow 5, 10, 21 + vmrgow 6, 11, 22 + vmrgow 7, 12, 23 + vmrgow 8, 13, 24 + vaddudm 8, 8, 19 + + addi 5, 5, -64 # len -= 64 + addi 21, 21, 64 # offset += 64 + + li 9, 64 + divdu 31, 5, 9 + + cmpdi 31, 0 + ble Skip_block_loop + + mtctr 31 + +# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r +# Rewrite the polynominal sum of product as follows, +# h1 = (h0 + m1) * r^2, h2 = (h0 + m2) * r^2 +# h3 = (h1 + m3) * r^2, h4 = (h2 + m4) * r^2 --> (h0 + m1) r*4 + (h3 + m3) r^2, (h0 + m2) r^4 + (h0 + m4) r^2 +# .... 
Repeat +# h5 = (h3 + m5) * r^2, h6 = (h4 + m6) * r^2 --> +# h7 = (h5 + m7) * r^2, h8 = (h6 + m8) * r^1 --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r +# +loop_4blocks: + + # Multiply odd words and even words + mul_odd + mul_even + # carry reduction + vspltisb 9, 2 + vsrd 10, 14, 31 + vsrd 11, 17, 31 + vand 7, 17, 25 + vand 4, 14, 25 + vaddudm 18, 18, 11 + vsrd 12, 18, 31 + vaddudm 15, 15, 10 + + vsrd 11, 15, 31 + vand 8, 18, 25 + vand 5, 15, 25 + vaddudm 4, 4, 12 + vsld 10, 12, 9 + vaddudm 6, 16, 11 + + vsrd 13, 6, 31 + vand 6, 6, 25 + vaddudm 4, 4, 10 + vsrd 10, 4, 31 + vaddudm 7, 7, 13 + + vsrd 11, 7, 31 + vand 7, 7, 25 + vand 4, 4, 25 + vaddudm 5, 5, 10 + vaddudm 8, 8, 11 + + # input m1 m2 m3 m4 + add 20, 4, 21 + xxlor 49, 24, 24 + xxlor 50, 25, 25 + lxvw4x 43, 0, 20 + addi 17, 20, 16 + lxvw4x 44, 0, 17 + vperm 14, 11, 12, 17 + vperm 15, 11, 12, 18 + addi 17, 17, 16 + lxvw4x 43, 0, 17 + addi 17, 17, 16 + lxvw4x 44, 0, 17 + vperm 17, 11, 12, 17 + vperm 18, 11, 12, 18 + + vand 20, 14, 25 # a0 + vand 9, 17, 25 # a0 + vsrd 21, 14, 31 # >> 26 + vsrd 22, 21, 31 # 12 bits left + vsrd 10, 17, 31 # >> 26 + vsrd 11, 10, 31 # 12 bits left + + vand 21, 21, 25 # a1 + vand 10, 10, 25 # a1 + + vspltisb 13, 12 + vand 16, 15, 25 + vsld 23, 16, 13 + vor 22, 22, 23 + vand 22, 22, 25 # a2 + vand 16, 18, 25 + vsld 12, 16, 13 + vor 11, 11, 12 + vand 11, 11, 25 # a2 + vspltisb 13, 14 + vsrd 23, 15, 13 # >> 14 + vsrd 24, 23, 31 # >> 26, a4 + vand 23, 23, 25 # a3 + vsrd 12, 18, 13 # >> 14 + vsrd 13, 12, 31 # >> 26, a4 + vand 12, 12, 25 # a3 + + vaddudm 4, 4, 20 + vaddudm 5, 5, 21 + vaddudm 6, 6, 22 + vaddudm 7, 7, 23 + vaddudm 8, 8, 24 + + # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1] + vmrgow 4, 9, 4 + vmrgow 5, 10, 5 + vmrgow 6, 11, 6 + vmrgow 7, 12, 7 + vmrgow 8, 13, 8 + vaddudm 8, 8, 19 + + addi 5, 5, -64 # len -= 64 + addi 21, 21, 64 # offset += 64 + + bdnz loop_4blocks + +Skip_block_loop: + xxlor 58, 0, 0 + xxlor 59, 1, 1 + xxlor 60, 2, 2 + xxlor 61, 3, 3 + xxlor 62, 4, 4 + xxlor 32, 5, 5 + xxlor 33, 6, 6 + xxlor 34, 7, 7 + xxlor 35, 8, 8 + + # Multiply odd words and even words + mul_odd + mul_even + + # Sum the products. 
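+	# (The two doubleword halves of each product accumulator v14-v18 are
+	# folded here into the single 5-limb result in v4-v8 before the final
+	# carry reduction.)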
+ xxpermdi 41, 31, 46, 0 + xxpermdi 42, 31, 47, 0 + vaddudm 4, 14, 9 + xxpermdi 36, 31, 36, 3 + vaddudm 5, 15, 10 + xxpermdi 37, 31, 37, 3 + xxpermdi 43, 31, 48, 0 + vaddudm 6, 16, 11 + xxpermdi 38, 31, 38, 3 + xxpermdi 44, 31, 49, 0 + vaddudm 7, 17, 12 + xxpermdi 39, 31, 39, 3 + xxpermdi 45, 31, 50, 0 + vaddudm 8, 18, 13 + xxpermdi 40, 31, 40, 3 + + # carry reduction + vspltisb 9, 2 + vsrd 10, 4, 31 + vsrd 11, 7, 31 + vand 7, 7, 25 + vand 4, 4, 25 + vaddudm 8, 8, 11 + vsrd 12, 8, 31 + vaddudm 5, 5, 10 + + vsrd 11, 5, 31 + vand 8, 8, 25 + vand 5, 5, 25 + vaddudm 4, 4, 12 + vsld 10, 12, 9 + vaddudm 6, 6, 11 + + vsrd 13, 6, 31 + vand 6, 6, 25 + vaddudm 4, 4, 10 + vsrd 10, 4, 31 + vaddudm 7, 7, 13 + + vsrd 11, 7, 31 + vand 7, 7, 25 + vand 4, 4, 25 + vaddudm 5, 5, 10 + vsrd 10, 5, 31 + vand 5, 5, 25 + vaddudm 6, 6, 10 + vaddudm 8, 8, 11 + + b do_final_update + +do_final_update: + # combine 26 bit limbs + # v4, v5, v6, v7 and v8 are 26 bit vectors + vsld 5, 5, 31 + vor 20, 4, 5 + vspltisb 11, 12 + vsrd 12, 6, 11 + vsld 6, 6, 31 + vsld 6, 6, 31 + vor 20, 20, 6 + vspltisb 11, 14 + vsld 7, 7, 11 + vor 21, 7, 12 + mfvsrld 16, 40 # save last 2 bytes + vsld 8, 8, 11 + vsld 8, 8, 31 + vor 21, 21, 8 + mfvsrld 17, 52 + mfvsrld 19, 53 + srdi 16, 16, 24 + + std 17, 0(3) + std 19, 8(3) + stw 16, 16(3) + +Out_loop: + li 3, 0 + + RESTORE_REGS + + blr + +Out_no_poly1305: + li 3, 0 + blr +SYM_FUNC_END(poly1305_p10le_4blocks) + +# +# ======================================================================= +# The following functions implement 64 x 64 bits multiplication poly1305. +# +SYM_FUNC_START_LOCAL(Poly1305_init_64) + # mask 0x0FFFFFFC0FFFFFFC + # mask 0x0FFFFFFC0FFFFFFF + addis 10, 2, rmask@toc@ha + addi 10, 10, rmask@toc@l + ld 11, 0(10) + ld 12, 8(10) + + # initialize + # load key from r3 + ld 9, 24(3) + ld 10, 32(3) + and. 9, 9, 11 # cramp mask r0 + and. 10, 10, 12 # cramp mask r1 + + srdi 21, 10, 2 + add 19, 21, 10 # s1: r19 - (r1 >> 2) *5 + + # setup r and s + li 25, 0 + mtvsrdd 32+0, 9, 19 # r0, s1 + mtvsrdd 32+1, 10, 9 # r1, r0 + mtvsrdd 32+2, 19, 25 # s1 + mtvsrdd 32+3, 9, 25 # r0 + + blr +SYM_FUNC_END(Poly1305_init_64) + +# Poly1305_mult +# v6 = (h0, h1), v8 = h2 +# v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0 +# +# Output: v7, v10, v11 +# +SYM_FUNC_START_LOCAL(Poly1305_mult) + # + # d0 = h0 * r0 + h1 * s1 + vmsumudm 7, 6, 0, 9 # h0 * r0, h1 * s1 + + # d1 = h0 * r1 + h1 * r0 + h2 * s1 + vmsumudm 11, 6, 1, 9 # h0 * r1, h1 * r0 + vmsumudm 10, 8, 2, 11 # d1 += h2 * s1 + + # d2 = r0 + vmsumudm 11, 8, 3, 9 # d2 = h2 * r0 + blr +SYM_FUNC_END(Poly1305_mult) + +# +# carry reduction +# h %=p +# +# Input: v7, v10, v11 +# Output: r27, r28, r29 +# +SYM_FUNC_START_LOCAL(Carry_reduction) + mfvsrld 27, 32+7 + mfvsrld 28, 32+10 + mfvsrld 29, 32+11 + mfvsrd 20, 32+7 # h0.h + mfvsrd 21, 32+10 # h1.h + + addc 28, 28, 20 + adde 29, 29, 21 + srdi 22, 29, 0x2 + sldi 23, 22, 0x2 + add 23, 23, 22 # (h2 & 3) * 5 + addc 27, 27, 23 # h0 + addze 28, 28 # h1 + andi. 
29, 29, 0x3 # h2 + blr +SYM_FUNC_END(Carry_reduction) + +# +# poly1305 multiplication +# h *= r, h %= p +# d0 = h0 * r0 + h1 * s1 +# d1 = h0 * r1 + h1 * r0 + h2 * s1 +# d2 = h0 * r0 +# +# +# unsigned int poly1305_test_64s(unisgned char *state, const byte *src, size_t len, highbit) +# - no highbit if final leftover block (highbit = 0) +# +SYM_FUNC_START(poly1305_64s) + cmpdi 5, 0 + ble Out_no_poly1305_64 + + mflr 0 + std 0, 16(1) + stdu 1,-400(1) + + SAVE_GPR 14, 112, 1 + SAVE_GPR 15, 120, 1 + SAVE_GPR 16, 128, 1 + SAVE_GPR 17, 136, 1 + SAVE_GPR 18, 144, 1 + SAVE_GPR 19, 152, 1 + SAVE_GPR 20, 160, 1 + SAVE_GPR 21, 168, 1 + SAVE_GPR 22, 176, 1 + SAVE_GPR 23, 184, 1 + SAVE_GPR 24, 192, 1 + SAVE_GPR 25, 200, 1 + SAVE_GPR 26, 208, 1 + SAVE_GPR 27, 216, 1 + SAVE_GPR 28, 224, 1 + SAVE_GPR 29, 232, 1 + SAVE_GPR 30, 240, 1 + SAVE_GPR 31, 248, 1 + + # Init poly1305 + bl Poly1305_init_64 + + li 25, 0 # offset to inp and outp + + add 11, 25, 4 + + # load h + # h0, h1, h2? + ld 27, 0(3) + ld 28, 8(3) + lwz 29, 16(3) + + li 30, 16 + divdu 31, 5, 30 + + mtctr 31 + + mr 24, 6 # highbit + +Loop_block_64: + vxor 9, 9, 9 + + ld 20, 0(11) + ld 21, 8(11) + addi 11, 11, 16 + + addc 27, 27, 20 + adde 28, 28, 21 + adde 29, 29, 24 + + li 22, 0 + mtvsrdd 32+6, 27, 28 # h0, h1 + mtvsrdd 32+8, 29, 22 # h2 + + bl Poly1305_mult + + bl Carry_reduction + + bdnz Loop_block_64 + + std 27, 0(3) + std 28, 8(3) + stw 29, 16(3) + + li 3, 0 + + RESTORE_GPR 14, 112, 1 + RESTORE_GPR 15, 120, 1 + RESTORE_GPR 16, 128, 1 + RESTORE_GPR 17, 136, 1 + RESTORE_GPR 18, 144, 1 + RESTORE_GPR 19, 152, 1 + RESTORE_GPR 20, 160, 1 + RESTORE_GPR 21, 168, 1 + RESTORE_GPR 22, 176, 1 + RESTORE_GPR 23, 184, 1 + RESTORE_GPR 24, 192, 1 + RESTORE_GPR 25, 200, 1 + RESTORE_GPR 26, 208, 1 + RESTORE_GPR 27, 216, 1 + RESTORE_GPR 28, 224, 1 + RESTORE_GPR 29, 232, 1 + RESTORE_GPR 30, 240, 1 + RESTORE_GPR 31, 248, 1 + + addi 1, 1, 400 + ld 0, 16(1) + mtlr 0 + + blr + +Out_no_poly1305_64: + li 3, 0 + blr +SYM_FUNC_END(poly1305_64s) + +# +# Input: r3 = h, r4 = s, r5 = mac +# mac = h + s +# +SYM_FUNC_START(poly1305_emit_64) + ld 10, 0(3) + ld 11, 8(3) + ld 12, 16(3) + + # compare modulus + # h + 5 + (-p) + mr 6, 10 + mr 7, 11 + mr 8, 12 + addic. 6, 6, 5 + addze 7, 7 + addze 8, 8 + srdi 9, 8, 2 # overflow? + cmpdi 9, 0 + beq Skip_h64 + mr 10, 6 + mr 11, 7 + mr 12, 8 + +Skip_h64: + ld 6, 0(4) + ld 7, 8(4) + addc 10, 10, 6 + adde 11, 11, 7 + addze 12, 12 + + std 10, 0(5) + std 11, 8(5) + blr +SYM_FUNC_END(poly1305_emit_64) + +SYM_DATA_START_LOCAL(RMASK) +.align 5 +rmask: +.byte 0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f +cnum: +.long 0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000 +.long 0x1a, 0x00, 0x1a, 0x00 +.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 +.long 0x00010203, 0x04050607, 0x10111213, 0x14151617 +.long 0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f +SYM_DATA_END(RMASK) diff --git a/lib/crypto/powerpc/sha1-powerpc-asm.S b/lib/crypto/powerpc/sha1-powerpc-asm.S new file mode 100644 index 000000000000..f0d5ed557ab1 --- /dev/null +++ b/lib/crypto/powerpc/sha1-powerpc-asm.S @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * SHA-1 implementation for PowerPC. 
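+ *
+ * Exports powerpc_sha_transform(), a scalar single-block compression
+ * routine; it is used by lib/crypto/powerpc/sha1.h when CONFIG_SPE is
+ * not enabled.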
+ * + * Copyright (C) 2005 Paul Mackerras <paulus@samba.org> + */ + +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/asm-compat.h> + +#ifdef __BIG_ENDIAN__ +#define LWZ(rt, d, ra) \ + lwz rt,d(ra) +#else +#define LWZ(rt, d, ra) \ + li rt,d; \ + lwbrx rt,rt,ra +#endif + +/* + * We roll the registers for T, A, B, C, D, E around on each + * iteration; T on iteration t is A on iteration t+1, and so on. + * We use registers 7 - 12 for this. + */ +#define RT(t) ((((t)+5)%6)+7) +#define RA(t) ((((t)+4)%6)+7) +#define RB(t) ((((t)+3)%6)+7) +#define RC(t) ((((t)+2)%6)+7) +#define RD(t) ((((t)+1)%6)+7) +#define RE(t) ((((t)+0)%6)+7) + +/* We use registers 16 - 31 for the W values */ +#define W(t) (((t)%16)+16) + +#define LOADW(t) \ + LWZ(W(t),(t)*4,r4) + +#define STEPD0_LOAD(t) \ + andc r0,RD(t),RB(t); \ + and r6,RB(t),RC(t); \ + rotlwi RT(t),RA(t),5; \ + or r6,r6,r0; \ + add r0,RE(t),r15; \ + add RT(t),RT(t),r6; \ + add r14,r0,W(t); \ + LWZ(W((t)+4),((t)+4)*4,r4); \ + rotlwi RB(t),RB(t),30; \ + add RT(t),RT(t),r14 + +#define STEPD0_UPDATE(t) \ + and r6,RB(t),RC(t); \ + andc r0,RD(t),RB(t); \ + rotlwi RT(t),RA(t),5; \ + rotlwi RB(t),RB(t),30; \ + or r6,r6,r0; \ + add r0,RE(t),r15; \ + xor r5,W((t)+4-3),W((t)+4-8); \ + add RT(t),RT(t),r6; \ + xor W((t)+4),W((t)+4-16),W((t)+4-14); \ + add r0,r0,W(t); \ + xor W((t)+4),W((t)+4),r5; \ + add RT(t),RT(t),r0; \ + rotlwi W((t)+4),W((t)+4),1 + +#define STEPD1(t) \ + xor r6,RB(t),RC(t); \ + rotlwi RT(t),RA(t),5; \ + rotlwi RB(t),RB(t),30; \ + xor r6,r6,RD(t); \ + add r0,RE(t),r15; \ + add RT(t),RT(t),r6; \ + add r0,r0,W(t); \ + add RT(t),RT(t),r0 + +#define STEPD1_UPDATE(t) \ + xor r6,RB(t),RC(t); \ + rotlwi RT(t),RA(t),5; \ + rotlwi RB(t),RB(t),30; \ + xor r6,r6,RD(t); \ + add r0,RE(t),r15; \ + xor r5,W((t)+4-3),W((t)+4-8); \ + add RT(t),RT(t),r6; \ + xor W((t)+4),W((t)+4-16),W((t)+4-14); \ + add r0,r0,W(t); \ + xor W((t)+4),W((t)+4),r5; \ + add RT(t),RT(t),r0; \ + rotlwi W((t)+4),W((t)+4),1 + +#define STEPD2_UPDATE(t) \ + and r6,RB(t),RC(t); \ + and r0,RB(t),RD(t); \ + rotlwi RT(t),RA(t),5; \ + or r6,r6,r0; \ + rotlwi RB(t),RB(t),30; \ + and r0,RC(t),RD(t); \ + xor r5,W((t)+4-3),W((t)+4-8); \ + or r6,r6,r0; \ + xor W((t)+4),W((t)+4-16),W((t)+4-14); \ + add r0,RE(t),r15; \ + add RT(t),RT(t),r6; \ + add r0,r0,W(t); \ + xor W((t)+4),W((t)+4),r5; \ + add RT(t),RT(t),r0; \ + rotlwi W((t)+4),W((t)+4),1 + +#define STEP0LD4(t) \ + STEPD0_LOAD(t); \ + STEPD0_LOAD((t)+1); \ + STEPD0_LOAD((t)+2); \ + STEPD0_LOAD((t)+3) + +#define STEPUP4(t, fn) \ + STEP##fn##_UPDATE(t); \ + STEP##fn##_UPDATE((t)+1); \ + STEP##fn##_UPDATE((t)+2); \ + STEP##fn##_UPDATE((t)+3) + +#define STEPUP20(t, fn) \ + STEPUP4(t, fn); \ + STEPUP4((t)+4, fn); \ + STEPUP4((t)+8, fn); \ + STEPUP4((t)+12, fn); \ + STEPUP4((t)+16, fn) + +_GLOBAL(powerpc_sha_transform) + PPC_STLU r1,-INT_FRAME_SIZE(r1) + SAVE_GPRS(14, 31, r1) + + /* Load up A - E */ + lwz RA(0),0(r3) /* A */ + lwz RB(0),4(r3) /* B */ + lwz RC(0),8(r3) /* C */ + lwz RD(0),12(r3) /* D */ + lwz RE(0),16(r3) /* E */ + + LOADW(0) + LOADW(1) + LOADW(2) + LOADW(3) + + lis r15,0x5a82 /* K0-19 */ + ori r15,r15,0x7999 + STEP0LD4(0) + STEP0LD4(4) + STEP0LD4(8) + STEPUP4(12, D0) + STEPUP4(16, D0) + + lis r15,0x6ed9 /* K20-39 */ + ori r15,r15,0xeba1 + STEPUP20(20, D1) + + lis r15,0x8f1b /* K40-59 */ + ori r15,r15,0xbcdc + STEPUP20(40, D2) + + lis r15,0xca62 /* K60-79 */ + ori r15,r15,0xc1d6 + STEPUP4(60, D1) + STEPUP4(64, D1) + STEPUP4(68, D1) + STEPUP4(72, D1) + lwz r20,16(r3) + STEPD1(76) + lwz r19,12(r3) + STEPD1(77) + lwz 
r18,8(r3) + STEPD1(78) + lwz r17,4(r3) + STEPD1(79) + + lwz r16,0(r3) + add r20,RE(80),r20 + add RD(0),RD(80),r19 + add RC(0),RC(80),r18 + add RB(0),RB(80),r17 + add RA(0),RA(80),r16 + mr RE(0),r20 + stw RA(0),0(r3) + stw RB(0),4(r3) + stw RC(0),8(r3) + stw RD(0),12(r3) + stw RE(0),16(r3) + + REST_GPRS(14, 31, r1) + addi r1,r1,INT_FRAME_SIZE + blr diff --git a/lib/crypto/powerpc/sha1-spe-asm.S b/lib/crypto/powerpc/sha1-spe-asm.S new file mode 100644 index 000000000000..0f447523be5e --- /dev/null +++ b/lib/crypto/powerpc/sha1-spe-asm.S @@ -0,0 +1,294 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Fast SHA-1 implementation for SPE instruction set (PPC) + * + * This code makes use of the SPE SIMD instruction set as defined in + * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf + * Implementation is based on optimization guide notes from + * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf + * + * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> + */ + +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> + +#define rHP r3 /* pointer to hash value */ +#define rWP r4 /* pointer to input */ +#define rKP r5 /* pointer to constants */ + +#define rW0 r14 /* 64 bit round words */ +#define rW1 r15 +#define rW2 r16 +#define rW3 r17 +#define rW4 r18 +#define rW5 r19 +#define rW6 r20 +#define rW7 r21 + +#define rH0 r6 /* 32 bit hash values */ +#define rH1 r7 +#define rH2 r8 +#define rH3 r9 +#define rH4 r10 + +#define rT0 r22 /* 64 bit temporary */ +#define rT1 r0 /* 32 bit temporaries */ +#define rT2 r11 +#define rT3 r12 + +#define rK r23 /* 64 bit constant in volatile register */ + +#define LOAD_K01 + +#define LOAD_K11 \ + evlwwsplat rK,0(rKP); + +#define LOAD_K21 \ + evlwwsplat rK,4(rKP); + +#define LOAD_K31 \ + evlwwsplat rK,8(rKP); + +#define LOAD_K41 \ + evlwwsplat rK,12(rKP); + +#define INITIALIZE \ + stwu r1,-128(r1); /* create stack frame */ \ + evstdw r14,8(r1); /* We must save non volatile */ \ + evstdw r15,16(r1); /* registers. Take the chance */ \ + evstdw r16,24(r1); /* and save the SPE part too */ \ + evstdw r17,32(r1); \ + evstdw r18,40(r1); \ + evstdw r19,48(r1); \ + evstdw r20,56(r1); \ + evstdw r21,64(r1); \ + evstdw r22,72(r1); \ + evstdw r23,80(r1); + + +#define FINALIZE \ + evldw r14,8(r1); /* restore SPE registers */ \ + evldw r15,16(r1); \ + evldw r16,24(r1); \ + evldw r17,32(r1); \ + evldw r18,40(r1); \ + evldw r19,48(r1); \ + evldw r20,56(r1); \ + evldw r21,64(r1); \ + evldw r22,72(r1); \ + evldw r23,80(r1); \ + xor r0,r0,r0; \ + stw r0,8(r1); /* Delete sensitive data */ \ + stw r0,16(r1); /* that we might have pushed */ \ + stw r0,24(r1); /* from other context that runs */ \ + stw r0,32(r1); /* the same code. 
Assume that */ \ + stw r0,40(r1); /* the lower part of the GPRs */ \ + stw r0,48(r1); /* were already overwritten on */ \ + stw r0,56(r1); /* the way down to here */ \ + stw r0,64(r1); \ + stw r0,72(r1); \ + stw r0,80(r1); \ + addi r1,r1,128; /* cleanup stack frame */ + +#ifdef __BIG_ENDIAN__ +#define LOAD_DATA(reg, off) \ + lwz reg,off(rWP); /* load data */ +#define NEXT_BLOCK \ + addi rWP,rWP,64; /* increment per block */ +#else +#define LOAD_DATA(reg, off) \ + lwbrx reg,0,rWP; /* load data */ \ + addi rWP,rWP,4; /* increment per word */ +#define NEXT_BLOCK /* nothing to do */ +#endif + +#define R_00_15(a, b, c, d, e, w0, w1, k, off) \ + LOAD_DATA(w0, off) /* 1: W */ \ + and rT2,b,c; /* 1: F' = B and C */ \ + LOAD_K##k##1 \ + andc rT1,d,b; /* 1: F" = ~B and D */ \ + rotrwi rT0,a,27; /* 1: A' = A rotl 5 */ \ + or rT2,rT2,rT1; /* 1: F = F' or F" */ \ + add e,e,rT0; /* 1: E = E + A' */ \ + rotrwi b,b,2; /* 1: B = B rotl 30 */ \ + add e,e,w0; /* 1: E = E + W */ \ + LOAD_DATA(w1, off+4) /* 2: W */ \ + add e,e,rT2; /* 1: E = E + F */ \ + and rT1,a,b; /* 2: F' = B and C */ \ + add e,e,rK; /* 1: E = E + K */ \ + andc rT2,c,a; /* 2: F" = ~B and D */ \ + add d,d,rK; /* 2: E = E + K */ \ + or rT2,rT2,rT1; /* 2: F = F' or F" */ \ + rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \ + add d,d,w1; /* 2: E = E + W */ \ + rotrwi a,a,2; /* 2: B = B rotl 30 */ \ + add d,d,rT0; /* 2: E = E + A' */ \ + evmergelo w1,w1,w0; /* mix W[0]/W[1] */ \ + add d,d,rT2 /* 2: E = E + F */ + +#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \ + and rT2,b,c; /* 1: F' = B and C */ \ + evmergelohi rT0,w7,w6; /* W[-3] */ \ + andc rT1,d,b; /* 1: F" = ~B and D */ \ + evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \ + or rT1,rT1,rT2; /* 1: F = F' or F" */ \ + evxor w0,w0,w4; /* W = W xor W[-8] */ \ + add e,e,rT1; /* 1: E = E + F */ \ + evxor w0,w0,w1; /* W = W xor W[-14] */ \ + rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \ + evrlwi w0,w0,1; /* W = W rotl 1 */ \ + add e,e,rT2; /* 1: E = E + A' */ \ + evaddw rT0,w0,rK; /* WK = W + K */ \ + rotrwi b,b,2; /* 1: B = B rotl 30 */ \ + LOAD_K##k##1 \ + evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \ + add e,e,rT0; /* 1: E = E + WK */ \ + add d,d,rT1; /* 2: E = E + WK */ \ + and rT2,a,b; /* 2: F' = B and C */ \ + andc rT1,c,a; /* 2: F" = ~B and D */ \ + rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \ + or rT1,rT1,rT2; /* 2: F = F' or F" */ \ + add d,d,rT0; /* 2: E = E + A' */ \ + rotrwi a,a,2; /* 2: B = B rotl 30 */ \ + add d,d,rT1 /* 2: E = E + F */ + +#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \ + evmergelohi rT0,w7,w6; /* W[-3] */ \ + xor rT2,b,c; /* 1: F' = B xor C */ \ + evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \ + xor rT2,rT2,d; /* 1: F = F' xor D */ \ + evxor w0,w0,w4; /* W = W xor W[-8] */ \ + add e,e,rT2; /* 1: E = E + F */ \ + evxor w0,w0,w1; /* W = W xor W[-14] */ \ + rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \ + evrlwi w0,w0,1; /* W = W rotl 1 */ \ + add e,e,rT2; /* 1: E = E + A' */ \ + evaddw rT0,w0,rK; /* WK = W + K */ \ + rotrwi b,b,2; /* 1: B = B rotl 30 */ \ + LOAD_K##k##1 \ + evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \ + add e,e,rT0; /* 1: E = E + WK */ \ + xor rT2,a,b; /* 2: F' = B xor C */ \ + add d,d,rT1; /* 2: E = E + WK */ \ + xor rT2,rT2,c; /* 2: F = F' xor D */ \ + rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \ + add d,d,rT2; /* 2: E = E + F */ \ + rotrwi a,a,2; /* 2: B = B rotl 30 */ \ + add d,d,rT0 /* 2: E = E + A' */ + +#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \ + and rT2,b,c; /* 1: F' = B and C */ \ + evmergelohi rT0,w7,w6; /* W[-3] */ \ + or rT1,b,c; /* 1: F" = B or C 
*/ \ + evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \ + and rT1,d,rT1; /* 1: F" = F" and D */ \ + evxor w0,w0,w4; /* W = W xor W[-8] */ \ + or rT2,rT2,rT1; /* 1: F = F' or F" */ \ + evxor w0,w0,w1; /* W = W xor W[-14] */ \ + add e,e,rT2; /* 1: E = E + F */ \ + evrlwi w0,w0,1; /* W = W rotl 1 */ \ + rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \ + evaddw rT0,w0,rK; /* WK = W + K */ \ + add e,e,rT2; /* 1: E = E + A' */ \ + LOAD_K##k##1 \ + evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \ + rotrwi b,b,2; /* 1: B = B rotl 30 */ \ + add e,e,rT0; /* 1: E = E + WK */ \ + and rT2,a,b; /* 2: F' = B and C */ \ + or rT0,a,b; /* 2: F" = B or C */ \ + add d,d,rT1; /* 2: E = E + WK */ \ + and rT0,c,rT0; /* 2: F" = F" and D */ \ + rotrwi a,a,2; /* 2: B = B rotl 30 */ \ + or rT2,rT2,rT0; /* 2: F = F' or F" */ \ + rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \ + add d,d,rT2; /* 2: E = E + F */ \ + add d,d,rT0 /* 2: E = E + A' */ + +#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \ + R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) + +_GLOBAL(ppc_spe_sha1_transform) + INITIALIZE + + lwz rH0,0(rHP) + lwz rH1,4(rHP) + mtctr r5 + lwz rH2,8(rHP) + lis rKP,PPC_SPE_SHA1_K@h + lwz rH3,12(rHP) + ori rKP,rKP,PPC_SPE_SHA1_K@l + lwz rH4,16(rHP) + +ppc_spe_sha1_main: + R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0) + R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8) + R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16) + R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24) + R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32) + R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40) + R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48) + R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56) + + R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0) + R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2) + + R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0) + R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0) + R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0) + R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0) + R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0) + R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0) + R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0) + R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0) + R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0) + R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3) + + R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0) + R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0) + R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0) + R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0) + R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0) + R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0) + R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0) + R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0) + R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0) + R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4) + + R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0) + R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0) + R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0) + R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0) + R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0) + R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0) + R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0) + lwz rT3,0(rHP) + R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, 
rW1, rW3, rW4, 0) + lwz rW1,4(rHP) + R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0) + lwz rW2,8(rHP) + R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0) + lwz rW3,12(rHP) + NEXT_BLOCK + lwz rW4,16(rHP) + + add rH0,rH0,rT3 + stw rH0,0(rHP) + add rH1,rH1,rW1 + stw rH1,4(rHP) + add rH2,rH2,rW2 + stw rH2,8(rHP) + add rH3,rH3,rW3 + stw rH3,12(rHP) + add rH4,rH4,rW4 + stw rH4,16(rHP) + + bdnz ppc_spe_sha1_main + + FINALIZE + blr + +.data +.align 4 +PPC_SPE_SHA1_K: + .long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6 diff --git a/lib/crypto/powerpc/sha1.h b/lib/crypto/powerpc/sha1.h new file mode 100644 index 000000000000..e2c010f0370b --- /dev/null +++ b/lib/crypto/powerpc/sha1.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-1 optimized for PowerPC + * + * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> + */ + +#include <asm/switch_to.h> +#include <linux/preempt.h> + +#ifdef CONFIG_SPE +/* + * MAX_BYTES defines the number of bytes that are allowed to be processed + * between preempt_disable() and preempt_enable(). SHA1 takes ~1000 + * operations per 64 bytes. e500 cores can issue two arithmetic instructions + * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2). + * Thus 2KB of input data will need an estimated maximum of 18,000 cycles. + * Headroom for cache misses included. Even with the low end model clocked + * at 667 MHz this equals to a critical time window of less than 27us. + * + */ +#define MAX_BYTES 2048 + +asmlinkage void ppc_spe_sha1_transform(struct sha1_block_state *state, + const u8 *data, u32 nblocks); + +static void spe_begin(void) +{ + /* We just start SPE operations and will save SPE registers later. */ + preempt_disable(); + enable_kernel_spe(); +} + +static void spe_end(void) +{ + disable_kernel_spe(); + /* reenable preemption */ + preempt_enable(); +} + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + do { + u32 unit = min_t(size_t, nblocks, MAX_BYTES / SHA1_BLOCK_SIZE); + + spe_begin(); + ppc_spe_sha1_transform(state, data, unit); + spe_end(); + + data += unit * SHA1_BLOCK_SIZE; + nblocks -= unit; + } while (nblocks); +} +#else /* CONFIG_SPE */ +asmlinkage void powerpc_sha_transform(struct sha1_block_state *state, + const u8 data[SHA1_BLOCK_SIZE]); + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + do { + powerpc_sha_transform(state, data); + data += SHA1_BLOCK_SIZE; + } while (--nblocks); +} +#endif /* !CONFIG_SPE */ diff --git a/lib/crypto/powerpc/sha256-spe-asm.S b/lib/crypto/powerpc/sha256-spe-asm.S new file mode 100644 index 000000000000..cd99d71dae34 --- /dev/null +++ b/lib/crypto/powerpc/sha256-spe-asm.S @@ -0,0 +1,318 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Fast SHA-256 implementation for SPE instruction set (PPC) + * + * This code makes use of the SPE SIMD instruction set as defined in + * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf + * Implementation is based on optimization guide notes from + * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf + * + * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> + */ + +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> + +#define rHP r3 /* pointer to hash values in memory */ +#define rKP r24 /* pointer to round constants */ +#define rWP r4 /* pointer to input data */ + +#define rH0 r5 /* 8 32 bit hash values in 8 registers */ +#define rH1 r6 +#define rH2 r7 +#define 
rH3 r8 +#define rH4 r9 +#define rH5 r10 +#define rH6 r11 +#define rH7 r12 + +#define rW0 r14 /* 64 bit registers. 16 words in 8 registers */ +#define rW1 r15 +#define rW2 r16 +#define rW3 r17 +#define rW4 r18 +#define rW5 r19 +#define rW6 r20 +#define rW7 r21 + +#define rT0 r22 /* 64 bit temporaries */ +#define rT1 r23 +#define rT2 r0 /* 32 bit temporaries */ +#define rT3 r25 + +#define CMP_KN_LOOP +#define CMP_KC_LOOP \ + cmpwi rT1,0; + +#define INITIALIZE \ + stwu r1,-128(r1); /* create stack frame */ \ + evstdw r14,8(r1); /* We must save non volatile */ \ + evstdw r15,16(r1); /* registers. Take the chance */ \ + evstdw r16,24(r1); /* and save the SPE part too */ \ + evstdw r17,32(r1); \ + evstdw r18,40(r1); \ + evstdw r19,48(r1); \ + evstdw r20,56(r1); \ + evstdw r21,64(r1); \ + evstdw r22,72(r1); \ + evstdw r23,80(r1); \ + stw r24,88(r1); /* save normal registers */ \ + stw r25,92(r1); + + +#define FINALIZE \ + evldw r14,8(r1); /* restore SPE registers */ \ + evldw r15,16(r1); \ + evldw r16,24(r1); \ + evldw r17,32(r1); \ + evldw r18,40(r1); \ + evldw r19,48(r1); \ + evldw r20,56(r1); \ + evldw r21,64(r1); \ + evldw r22,72(r1); \ + evldw r23,80(r1); \ + lwz r24,88(r1); /* restore normal registers */ \ + lwz r25,92(r1); \ + xor r0,r0,r0; \ + stw r0,8(r1); /* Delete sensitive data */ \ + stw r0,16(r1); /* that we might have pushed */ \ + stw r0,24(r1); /* from other context that runs */ \ + stw r0,32(r1); /* the same code. Assume that */ \ + stw r0,40(r1); /* the lower part of the GPRs */ \ + stw r0,48(r1); /* was already overwritten on */ \ + stw r0,56(r1); /* the way down to here */ \ + stw r0,64(r1); \ + stw r0,72(r1); \ + stw r0,80(r1); \ + addi r1,r1,128; /* cleanup stack frame */ + +#ifdef __BIG_ENDIAN__ +#define LOAD_DATA(reg, off) \ + lwz reg,off(rWP); /* load data */ +#define NEXT_BLOCK \ + addi rWP,rWP,64; /* increment per block */ +#else +#define LOAD_DATA(reg, off) \ + lwbrx reg,0,rWP; /* load data */ \ + addi rWP,rWP,4; /* increment per word */ +#define NEXT_BLOCK /* nothing to do */ +#endif + +#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \ + LOAD_DATA(w, off) /* 1: W */ \ + rotrwi rT0,e,6; /* 1: S1 = e rotr 6 */ \ + rotrwi rT1,e,11; /* 1: S1' = e rotr 11 */ \ + rotrwi rT2,e,25; /* 1: S1" = e rotr 25 */ \ + xor rT0,rT0,rT1; /* 1: S1 = S1 xor S1' */ \ + and rT3,e,f; /* 1: ch = e and f */ \ + xor rT0,rT0,rT2; /* 1: S1 = S1 xor S1" */ \ + andc rT1,g,e; /* 1: ch' = ~e and g */ \ + lwz rT2,off(rKP); /* 1: K */ \ + xor rT3,rT3,rT1; /* 1: ch = ch xor ch' */ \ + add h,h,rT0; /* 1: temp1 = h + S1 */ \ + add rT3,rT3,w; /* 1: temp1' = ch + w */ \ + rotrwi rT0,a,2; /* 1: S0 = a rotr 2 */ \ + add h,h,rT3; /* 1: temp1 = temp1 + temp1' */ \ + rotrwi rT1,a,13; /* 1: S0' = a rotr 13 */ \ + add h,h,rT2; /* 1: temp1 = temp1 + K */ \ + rotrwi rT3,a,22; /* 1: S0" = a rotr 22 */ \ + xor rT0,rT0,rT1; /* 1: S0 = S0 xor S0' */ \ + add d,d,h; /* 1: d = d + temp1 */ \ + xor rT3,rT0,rT3; /* 1: S0 = S0 xor S0" */ \ + evmergelo w,w,w; /* shift W */ \ + or rT2,a,b; /* 1: maj = a or b */ \ + and rT1,a,b; /* 1: maj' = a and b */ \ + and rT2,rT2,c; /* 1: maj = maj and c */ \ + LOAD_DATA(w, off+4) /* 2: W */ \ + or rT2,rT1,rT2; /* 1: maj = maj or maj' */ \ + rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \ + add rT3,rT3,rT2; /* 1: temp2 = S0 + maj */ \ + rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \ + add h,h,rT3; /* 1: h = temp1 + temp2 */ \ + rotrwi rT2,d,25; /* 2: S1" = e rotr 25 */ \ + xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \ + and rT3,d,e; /* 2: ch = e and f */ \ + xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ 
\ + andc rT1,f,d; /* 2: ch' = ~e and g */ \ + lwz rT2,off+4(rKP); /* 2: K */ \ + xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \ + add g,g,rT0; /* 2: temp1 = h + S1 */ \ + add rT3,rT3,w; /* 2: temp1' = ch + w */ \ + rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \ + add g,g,rT3; /* 2: temp1 = temp1 + temp1' */ \ + rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \ + add g,g,rT2; /* 2: temp1 = temp1 + K */ \ + rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \ + xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \ + or rT2,h,a; /* 2: maj = a or b */ \ + xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \ + and rT1,h,a; /* 2: maj' = a and b */ \ + and rT2,rT2,b; /* 2: maj = maj and c */ \ + add c,c,g; /* 2: d = d + temp1 */ \ + or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \ + add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \ + add g,g,rT3 /* 2: h = temp1 + temp2 */ + +#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \ + rotrwi rT2,e,6; /* 1: S1 = e rotr 6 */ \ + evmergelohi rT0,w0,w1; /* w[-15] */ \ + rotrwi rT3,e,11; /* 1: S1' = e rotr 11 */ \ + evsrwiu rT1,rT0,3; /* s0 = w[-15] >> 3 */ \ + xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \ + evrlwi rT0,rT0,25; /* s0' = w[-15] rotr 7 */ \ + rotrwi rT3,e,25; /* 1: S1' = e rotr 25 */ \ + evxor rT1,rT1,rT0; /* s0 = s0 xor s0' */ \ + xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \ + evrlwi rT0,rT0,21; /* s0' = w[-15] rotr 18 */ \ + add h,h,rT2; /* 1: temp1 = h + S1 */ \ + evxor rT0,rT0,rT1; /* s0 = s0 xor s0' */ \ + and rT2,e,f; /* 1: ch = e and f */ \ + evaddw w0,w0,rT0; /* w = w[-16] + s0 */ \ + andc rT3,g,e; /* 1: ch' = ~e and g */ \ + evsrwiu rT0,w7,10; /* s1 = w[-2] >> 10 */ \ + xor rT2,rT2,rT3; /* 1: ch = ch xor ch' */ \ + evrlwi rT1,w7,15; /* s1' = w[-2] rotr 17 */ \ + add h,h,rT2; /* 1: temp1 = temp1 + ch */ \ + evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \ + rotrwi rT2,a,2; /* 1: S0 = a rotr 2 */ \ + evrlwi rT1,w7,13; /* s1' = w[-2] rotr 19 */ \ + rotrwi rT3,a,13; /* 1: S0' = a rotr 13 */ \ + evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \ + xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \ + evldw rT1,off(rKP); /* k */ \ + rotrwi rT3,a,22; /* 1: S0' = a rotr 22 */ \ + evaddw w0,w0,rT0; /* w = w + s1 */ \ + xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \ + evmergelohi rT0,w4,w5; /* w[-7] */ \ + and rT3,a,b; /* 1: maj = a and b */ \ + evaddw w0,w0,rT0; /* w = w + w[-7] */ \ + CMP_K##k##_LOOP \ + add rT2,rT2,rT3; /* 1: temp2 = S0 + maj */ \ + evaddw rT1,rT1,w0; /* wk = w + k */ \ + xor rT3,a,b; /* 1: maj = a xor b */ \ + evmergehi rT0,rT1,rT1; /* wk1/wk2 */ \ + and rT3,rT3,c; /* 1: maj = maj and c */ \ + add h,h,rT0; /* 1: temp1 = temp1 + wk */ \ + add rT2,rT2,rT3; /* 1: temp2 = temp2 + maj */ \ + add g,g,rT1; /* 2: temp1 = temp1 + wk */ \ + add d,d,h; /* 1: d = d + temp1 */ \ + rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \ + add h,h,rT2; /* 1: h = temp1 + temp2 */ \ + rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \ + rotrwi rT2,d,25; /* 2: S" = e rotr 25 */ \ + xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \ + and rT3,d,e; /* 2: ch = e and f */ \ + xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \ + andc rT1,f,d; /* 2: ch' = ~e and g */ \ + add g,g,rT0; /* 2: temp1 = h + S1 */ \ + xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \ + rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \ + add g,g,rT3; /* 2: temp1 = temp1 + ch */ \ + rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \ + rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \ + xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \ + or rT2,h,a; /* 2: maj = a or b */ \ + and rT1,h,a; /* 2: maj' = a and b */ \ + and rT2,rT2,b; /* 2: maj = maj and c */ \ + xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \ + 
or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \ + add c,c,g; /* 2: d = d + temp1 */ \ + add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \ + add g,g,rT3 /* 2: h = temp1 + temp2 */ + +_GLOBAL(ppc_spe_sha256_transform) + INITIALIZE + + mtctr r5 + lwz rH0,0(rHP) + lwz rH1,4(rHP) + lwz rH2,8(rHP) + lwz rH3,12(rHP) + lwz rH4,16(rHP) + lwz rH5,20(rHP) + lwz rH6,24(rHP) + lwz rH7,28(rHP) + +ppc_spe_sha256_main: + lis rKP,PPC_SPE_SHA256_K@ha + addi rKP,rKP,PPC_SPE_SHA256_K@l + + R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0) + R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8) + R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16) + R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24) + R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32) + R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40) + R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48) + R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56) +ppc_spe_sha256_16_rounds: + addi rKP,rKP,64 + R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, + rW0, rW1, rW4, rW5, rW7, N, 0) + R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, + rW1, rW2, rW5, rW6, rW0, N, 8) + R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, + rW2, rW3, rW6, rW7, rW1, N, 16) + R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, + rW3, rW4, rW7, rW0, rW2, N, 24) + R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, + rW4, rW5, rW0, rW1, rW3, N, 32) + R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, + rW5, rW6, rW1, rW2, rW4, N, 40) + R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, + rW6, rW7, rW2, rW3, rW5, N, 48) + R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, + rW7, rW0, rW3, rW4, rW6, C, 56) + bt gt,ppc_spe_sha256_16_rounds + + lwz rW0,0(rHP) + NEXT_BLOCK + lwz rW1,4(rHP) + lwz rW2,8(rHP) + lwz rW3,12(rHP) + lwz rW4,16(rHP) + lwz rW5,20(rHP) + lwz rW6,24(rHP) + lwz rW7,28(rHP) + + add rH0,rH0,rW0 + stw rH0,0(rHP) + add rH1,rH1,rW1 + stw rH1,4(rHP) + add rH2,rH2,rW2 + stw rH2,8(rHP) + add rH3,rH3,rW3 + stw rH3,12(rHP) + add rH4,rH4,rW4 + stw rH4,16(rHP) + add rH5,rH5,rW5 + stw rH5,20(rHP) + add rH6,rH6,rW6 + stw rH6,24(rHP) + add rH7,rH7,rW7 + stw rH7,28(rHP) + + bdnz ppc_spe_sha256_main + + FINALIZE + blr + +.data +.align 5 +PPC_SPE_SHA256_K: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 diff --git a/lib/crypto/powerpc/sha256.h b/lib/crypto/powerpc/sha256.h new file mode 100644 index 000000000000..50d355441c7e --- /dev/null +++ b/lib/crypto/powerpc/sha256.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-256 Secure Hash Algorithm, SPE optimized + * + * Based on generic implementation. The assembler module takes care + * about the SPE registers so it can run from interrupt context. 
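+ *
+ * The assembler entry point is ppc_spe_sha256_transform() from
+ * sha256-spe-asm.S; sha256_blocks() below passes it at most MAX_BYTES
+ * of input per spe_begin()/spe_end() section to bound the time spent
+ * with preemption disabled.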
+ * + * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> + */ + +#include <asm/switch_to.h> +#include <linux/preempt.h> + +/* + * MAX_BYTES defines the number of bytes that are allowed to be processed + * between preempt_disable() and preempt_enable(). SHA256 takes ~2,000 + * operations per 64 bytes. e500 cores can issue two arithmetic instructions + * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2). + * Thus 1KB of input data will need an estimated maximum of 18,000 cycles. + * Headroom for cache misses included. Even with the low end model clocked + * at 667 MHz this equals to a critical time window of less than 27us. + * + */ +#define MAX_BYTES 1024 + +extern void ppc_spe_sha256_transform(struct sha256_block_state *state, + const u8 *src, u32 blocks); + +static void spe_begin(void) +{ + /* We just start SPE operations and will save SPE registers later. */ + preempt_disable(); + enable_kernel_spe(); +} + +static void spe_end(void) +{ + disable_kernel_spe(); + /* reenable preemption */ + preempt_enable(); +} + +static void sha256_blocks(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + do { + /* cut input data into smaller blocks */ + u32 unit = min_t(size_t, nblocks, + MAX_BYTES / SHA256_BLOCK_SIZE); + + spe_begin(); + ppc_spe_sha256_transform(state, data, unit); + spe_end(); + + data += unit * SHA256_BLOCK_SIZE; + nblocks -= unit; + } while (nblocks); +} diff --git a/lib/crypto/riscv/Kconfig b/lib/crypto/riscv/Kconfig new file mode 100644 index 000000000000..bc7a43f33eb3 --- /dev/null +++ b/lib/crypto/riscv/Kconfig @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_CHACHA_RISCV64 + tristate + depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + default CRYPTO_LIB_CHACHA + select CRYPTO_ARCH_HAVE_LIB_CHACHA + select CRYPTO_LIB_CHACHA_GENERIC diff --git a/lib/crypto/riscv/Makefile b/lib/crypto/riscv/Makefile new file mode 100644 index 000000000000..e27b78f317fc --- /dev/null +++ b/lib/crypto/riscv/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_CHACHA_RISCV64) += chacha-riscv64.o +chacha-riscv64-y := chacha-riscv64-glue.o chacha-riscv64-zvkb.o diff --git a/lib/crypto/riscv/chacha-riscv64-glue.c b/lib/crypto/riscv/chacha-riscv64-glue.c new file mode 100644 index 000000000000..8c3f11d79be3 --- /dev/null +++ b/lib/crypto/riscv/chacha-riscv64-glue.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * ChaCha stream cipher (RISC-V optimized) + * + * Copyright (C) 2023 SiFive, Inc. 
+ * Author: Jerry Shih <jerry.shih@sifive.com> + */ + +#include <asm/simd.h> +#include <asm/vector.h> +#include <crypto/chacha.h> +#include <crypto/internal/simd.h> +#include <linux/linkage.h> +#include <linux/module.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_zvkb); + +asmlinkage void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out, + size_t nblocks, int nrounds); + +void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + hchacha_block_generic(state, out, nrounds); +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + u8 block_buffer[CHACHA_BLOCK_SIZE]; + unsigned int full_blocks = bytes / CHACHA_BLOCK_SIZE; + unsigned int tail_bytes = bytes % CHACHA_BLOCK_SIZE; + + if (!static_branch_likely(&use_zvkb) || !crypto_simd_usable()) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + kernel_vector_begin(); + if (full_blocks) { + chacha_zvkb(state, src, dst, full_blocks, nrounds); + src += full_blocks * CHACHA_BLOCK_SIZE; + dst += full_blocks * CHACHA_BLOCK_SIZE; + } + if (tail_bytes) { + memcpy(block_buffer, src, tail_bytes); + chacha_zvkb(state, block_buffer, block_buffer, 1, nrounds); + memcpy(dst, block_buffer, tail_bytes); + } + kernel_vector_end(); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +bool chacha_is_arch_optimized(void) +{ + return static_key_enabled(&use_zvkb); +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +static int __init riscv64_chacha_mod_init(void) +{ + if (riscv_isa_extension_available(NULL, ZVKB) && + riscv_vector_vlen() >= 128) + static_branch_enable(&use_zvkb); + return 0; +} +subsys_initcall(riscv64_chacha_mod_init); + +static void __exit riscv64_chacha_mod_exit(void) +{ +} +module_exit(riscv64_chacha_mod_exit); + +MODULE_DESCRIPTION("ChaCha stream cipher (RISC-V optimized)"); +MODULE_AUTHOR("Jerry Shih <jerry.shih@sifive.com>"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/riscv/chacha-riscv64-zvkb.S b/lib/crypto/riscv/chacha-riscv64-zvkb.S new file mode 100644 index 000000000000..b777d0b4e379 --- /dev/null +++ b/lib/crypto/riscv/chacha-riscv64-zvkb.S @@ -0,0 +1,297 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// This file is dual-licensed, meaning that you can use it under your +// choice of either of the following two licenses: +// +// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You can obtain +// a copy in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// or +// +// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com> +// Copyright 2024 Google LLC +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The generated code of this file depends on the following RISC-V extensions: +// - RV64I +// - RISC-V Vector ('V') with VLEN >= 128 +// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb') + +#include <linux/linkage.h> + +.text +.option arch, +zvkb + +#define STATEP a0 +#define INP a1 +#define OUTP a2 +#define NBLOCKS a3 +#define NROUNDS a4 + +#define CONSTS0 a5 +#define CONSTS1 a6 +#define CONSTS2 a7 +#define CONSTS3 t0 +#define TMP t1 +#define VL t2 +#define STRIDE t3 +#define ROUND_CTR t4 +#define KEY0 s0 +#define KEY1 s1 +#define KEY2 s2 +#define KEY3 s3 +#define KEY4 s4 +#define KEY5 s5 +#define KEY6 s6 +#define KEY7 s7 +#define COUNTER s8 +#define NONCE0 s9 +#define NONCE1 s10 +#define NONCE2 s11 + +.macro chacha_round a0, b0, c0, d0, a1, b1, c1, d1, \ + a2, b2, c2, d2, a3, b3, c3, d3 + // a += b; d ^= a; d = rol(d, 16); + vadd.vv \a0, \a0, \b0 + vadd.vv \a1, \a1, \b1 + vadd.vv \a2, \a2, \b2 + vadd.vv \a3, \a3, \b3 + vxor.vv \d0, \d0, \a0 + vxor.vv \d1, \d1, \a1 + vxor.vv \d2, \d2, \a2 + vxor.vv \d3, \d3, \a3 + vror.vi \d0, \d0, 32 - 16 + vror.vi \d1, \d1, 32 - 16 + vror.vi \d2, \d2, 32 - 16 + vror.vi \d3, \d3, 32 - 16 + + // c += d; b ^= c; b = rol(b, 12); + vadd.vv \c0, \c0, \d0 + vadd.vv \c1, \c1, \d1 + vadd.vv \c2, \c2, \d2 + vadd.vv \c3, \c3, \d3 + vxor.vv \b0, \b0, \c0 + vxor.vv \b1, \b1, \c1 + vxor.vv \b2, \b2, \c2 + vxor.vv \b3, \b3, \c3 + vror.vi \b0, \b0, 32 - 12 + vror.vi \b1, \b1, 32 - 12 + vror.vi \b2, \b2, 32 - 12 + vror.vi \b3, \b3, 32 - 12 + + // a += b; d ^= a; d = rol(d, 8); + vadd.vv \a0, \a0, \b0 + vadd.vv \a1, \a1, \b1 + vadd.vv \a2, \a2, \b2 + vadd.vv \a3, \a3, \b3 + vxor.vv \d0, \d0, \a0 + vxor.vv \d1, \d1, \a1 + vxor.vv \d2, \d2, \a2 + vxor.vv \d3, \d3, \a3 + vror.vi \d0, \d0, 32 - 8 + vror.vi \d1, \d1, 32 - 8 + vror.vi \d2, \d2, 32 - 8 + vror.vi \d3, \d3, 32 - 8 + + // c += d; b ^= c; b = rol(b, 7); + vadd.vv \c0, \c0, \d0 + vadd.vv \c1, \c1, \d1 + vadd.vv \c2, \c2, \d2 + vadd.vv \c3, \c3, \d3 + vxor.vv \b0, \b0, \c0 + vxor.vv \b1, \b1, \c1 + vxor.vv \b2, \b2, \c2 + vxor.vv \b3, \b3, \c3 + vror.vi \b0, \b0, 32 - 7 + vror.vi \b1, \b1, 32 - 7 + vror.vi \b2, \b2, 32 - 7 + vror.vi \b3, \b3, 32 - 7 +.endm + +// void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out, +// size_t nblocks, int nrounds); +// +// |nblocks| is the number of 64-byte blocks to process, and must be nonzero. +// +// |state| gives the ChaCha state matrix, including the 32-bit counter in +// state->x[12] following the RFC7539 convention; note that this differs from +// the original Salsa20 paper which uses a 64-bit counter in state->x[12..13]. +// The updated 32-bit counter is written back to state->x[12] before returning. 
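+//
+// For reference, the words of the state matrix are laid out as follows
+// (matching the loads of CONSTS*, KEY*, COUNTER, and NONCE* below):
+//
+//	x[0..3]    "expand 32-byte k" constants
+//	x[4..11]   256-bit key
+//	x[12]      32-bit block counter
+//	x[13..15]  96-bit nonce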
+SYM_FUNC_START(chacha_zvkb) + addi sp, sp, -96 + sd s0, 0(sp) + sd s1, 8(sp) + sd s2, 16(sp) + sd s3, 24(sp) + sd s4, 32(sp) + sd s5, 40(sp) + sd s6, 48(sp) + sd s7, 56(sp) + sd s8, 64(sp) + sd s9, 72(sp) + sd s10, 80(sp) + sd s11, 88(sp) + + li STRIDE, 64 + + // Set up the initial state matrix in scalar registers. + lw CONSTS0, 0(STATEP) + lw CONSTS1, 4(STATEP) + lw CONSTS2, 8(STATEP) + lw CONSTS3, 12(STATEP) + lw KEY0, 16(STATEP) + lw KEY1, 20(STATEP) + lw KEY2, 24(STATEP) + lw KEY3, 28(STATEP) + lw KEY4, 32(STATEP) + lw KEY5, 36(STATEP) + lw KEY6, 40(STATEP) + lw KEY7, 44(STATEP) + lw COUNTER, 48(STATEP) + lw NONCE0, 52(STATEP) + lw NONCE1, 56(STATEP) + lw NONCE2, 60(STATEP) + +.Lblock_loop: + // Set vl to the number of blocks to process in this iteration. + vsetvli VL, NBLOCKS, e32, m1, ta, ma + + // Set up the initial state matrix for the next VL blocks in v0-v15. + // v{i} holds the i'th 32-bit word of the state matrix for all blocks. + // Note that only the counter word, at index 12, differs across blocks. + vmv.v.x v0, CONSTS0 + vmv.v.x v1, CONSTS1 + vmv.v.x v2, CONSTS2 + vmv.v.x v3, CONSTS3 + vmv.v.x v4, KEY0 + vmv.v.x v5, KEY1 + vmv.v.x v6, KEY2 + vmv.v.x v7, KEY3 + vmv.v.x v8, KEY4 + vmv.v.x v9, KEY5 + vmv.v.x v10, KEY6 + vmv.v.x v11, KEY7 + vid.v v12 + vadd.vx v12, v12, COUNTER + vmv.v.x v13, NONCE0 + vmv.v.x v14, NONCE1 + vmv.v.x v15, NONCE2 + + // Load the first half of the input data for each block into v16-v23. + // v{16+i} holds the i'th 32-bit word for all blocks. + vlsseg8e32.v v16, (INP), STRIDE + + mv ROUND_CTR, NROUNDS +.Lnext_doubleround: + addi ROUND_CTR, ROUND_CTR, -2 + // column round + chacha_round v0, v4, v8, v12, v1, v5, v9, v13, \ + v2, v6, v10, v14, v3, v7, v11, v15 + // diagonal round + chacha_round v0, v5, v10, v15, v1, v6, v11, v12, \ + v2, v7, v8, v13, v3, v4, v9, v14 + bnez ROUND_CTR, .Lnext_doubleround + + // Load the second half of the input data for each block into v24-v31. + // v{24+i} holds the {8+i}'th 32-bit word for all blocks. + addi TMP, INP, 32 + vlsseg8e32.v v24, (TMP), STRIDE + + // Finalize the first half of the keystream for each block. + vadd.vx v0, v0, CONSTS0 + vadd.vx v1, v1, CONSTS1 + vadd.vx v2, v2, CONSTS2 + vadd.vx v3, v3, CONSTS3 + vadd.vx v4, v4, KEY0 + vadd.vx v5, v5, KEY1 + vadd.vx v6, v6, KEY2 + vadd.vx v7, v7, KEY3 + + // Encrypt/decrypt the first half of the data for each block. + vxor.vv v16, v16, v0 + vxor.vv v17, v17, v1 + vxor.vv v18, v18, v2 + vxor.vv v19, v19, v3 + vxor.vv v20, v20, v4 + vxor.vv v21, v21, v5 + vxor.vv v22, v22, v6 + vxor.vv v23, v23, v7 + + // Store the first half of the output data for each block. + vssseg8e32.v v16, (OUTP), STRIDE + + // Finalize the second half of the keystream for each block. + vadd.vx v8, v8, KEY4 + vadd.vx v9, v9, KEY5 + vadd.vx v10, v10, KEY6 + vadd.vx v11, v11, KEY7 + vid.v v0 + vadd.vx v12, v12, COUNTER + vadd.vx v13, v13, NONCE0 + vadd.vx v14, v14, NONCE1 + vadd.vx v15, v15, NONCE2 + vadd.vv v12, v12, v0 + + // Encrypt/decrypt the second half of the data for each block. + vxor.vv v24, v24, v8 + vxor.vv v25, v25, v9 + vxor.vv v26, v26, v10 + vxor.vv v27, v27, v11 + vxor.vv v29, v29, v13 + vxor.vv v28, v28, v12 + vxor.vv v30, v30, v14 + vxor.vv v31, v31, v15 + + // Store the second half of the output data for each block. + addi TMP, OUTP, 32 + vssseg8e32.v v24, (TMP), STRIDE + + // Update the counter, the remaining number of blocks, and the input and + // output pointers according to the number of blocks processed (VL). 
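+	// Equivalently, in C: counter += vl; nblocks -= vl;
+	// in += vl * 64; out += vl * 64;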
+ add COUNTER, COUNTER, VL + sub NBLOCKS, NBLOCKS, VL + slli TMP, VL, 6 + add OUTP, OUTP, TMP + add INP, INP, TMP + bnez NBLOCKS, .Lblock_loop + + sw COUNTER, 48(STATEP) + ld s0, 0(sp) + ld s1, 8(sp) + ld s2, 16(sp) + ld s3, 24(sp) + ld s4, 32(sp) + ld s5, 40(sp) + ld s6, 48(sp) + ld s7, 56(sp) + ld s8, 64(sp) + ld s9, 72(sp) + ld s10, 80(sp) + ld s11, 88(sp) + addi sp, sp, 96 + ret +SYM_FUNC_END(chacha_zvkb) diff --git a/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S b/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S new file mode 100644 index 000000000000..1618d1220a6e --- /dev/null +++ b/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S @@ -0,0 +1,225 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// This file is dual-licensed, meaning that you can use it under your +// choice of either of the following two licenses: +// +// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You can obtain +// a copy in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// or +// +// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu> +// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com> +// Copyright 2024 Google LLC +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +// The generated code of this file depends on the following RISC-V extensions: +// - RV64I +// - RISC-V Vector ('V') with VLEN >= 128 +// - RISC-V Vector SHA-2 Secure Hash extension ('Zvknha' or 'Zvknhb') +// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb') + +#include <linux/linkage.h> + +.text +.option arch, +zvknha, +zvkb + +#define STATEP a0 +#define DATA a1 +#define NUM_BLOCKS a2 + +#define STATEP_C a3 + +#define MASK v0 +#define INDICES v1 +#define W0 v2 +#define W1 v3 +#define W2 v4 +#define W3 v5 +#define VTMP v6 +#define FEBA v7 +#define HGDC v8 +#define K0 v10 +#define K1 v11 +#define K2 v12 +#define K3 v13 +#define K4 v14 +#define K5 v15 +#define K6 v16 +#define K7 v17 +#define K8 v18 +#define K9 v19 +#define K10 v20 +#define K11 v21 +#define K12 v22 +#define K13 v23 +#define K14 v24 +#define K15 v25 +#define PREV_FEBA v26 +#define PREV_HGDC v27 + +// Do 4 rounds of SHA-256. w0 contains the current 4 message schedule words. +// +// If not all the message schedule words have been computed yet, then this also +// computes 4 more message schedule words. w1-w3 contain the next 3 groups of 4 +// message schedule words; this macro computes the group after w3 and writes it +// to w0. This means that the next (w0, w1, w2, w3) is the current (w1, w2, w3, +// w0), so the caller must cycle through the registers accordingly. +.macro sha256_4rounds last, k, w0, w1, w2, w3 + vadd.vv VTMP, \k, \w0 + vsha2cl.vv HGDC, FEBA, VTMP + vsha2ch.vv FEBA, HGDC, VTMP +.if !\last + vmerge.vvm VTMP, \w2, \w1, MASK + vsha2ms.vv \w0, VTMP, \w3 +.endif +.endm + +.macro sha256_16rounds last, k0, k1, k2, k3 + sha256_4rounds \last, \k0, W0, W1, W2, W3 + sha256_4rounds \last, \k1, W1, W2, W3, W0 + sha256_4rounds \last, \k2, W2, W3, W0, W1 + sha256_4rounds \last, \k3, W3, W0, W1, W2 +.endm + +// void sha256_transform_zvknha_or_zvknhb_zvkb(struct sha256_block_state *state, +// const u8 *data, size_t nblocks); +SYM_FUNC_START(sha256_transform_zvknha_or_zvknhb_zvkb) + + // Load the round constants into K0-K15. + vsetivli zero, 4, e32, m1, ta, ma + la t0, K256 + vle32.v K0, (t0) + addi t0, t0, 16 + vle32.v K1, (t0) + addi t0, t0, 16 + vle32.v K2, (t0) + addi t0, t0, 16 + vle32.v K3, (t0) + addi t0, t0, 16 + vle32.v K4, (t0) + addi t0, t0, 16 + vle32.v K5, (t0) + addi t0, t0, 16 + vle32.v K6, (t0) + addi t0, t0, 16 + vle32.v K7, (t0) + addi t0, t0, 16 + vle32.v K8, (t0) + addi t0, t0, 16 + vle32.v K9, (t0) + addi t0, t0, 16 + vle32.v K10, (t0) + addi t0, t0, 16 + vle32.v K11, (t0) + addi t0, t0, 16 + vle32.v K12, (t0) + addi t0, t0, 16 + vle32.v K13, (t0) + addi t0, t0, 16 + vle32.v K14, (t0) + addi t0, t0, 16 + vle32.v K15, (t0) + + // Setup mask for the vmerge to replace the first word (idx==0) in + // message scheduling. There are 4 words, so an 8-bit mask suffices. + vsetivli zero, 1, e8, m1, ta, ma + vmv.v.i MASK, 0x01 + + // Load the state. The state is stored as {a,b,c,d,e,f,g,h}, but we + // need {f,e,b,a},{h,g,d,c}. The dst vtype is e32m1 and the index vtype + // is e8mf4. We use index-load with the i8 indices {20, 16, 4, 0}, + // loaded using the 32-bit little endian value 0x00041014. + li t0, 0x00041014 + vsetivli zero, 1, e32, m1, ta, ma + vmv.v.x INDICES, t0 + addi STATEP_C, STATEP, 8 + vsetivli zero, 4, e32, m1, ta, ma + vluxei8.v FEBA, (STATEP), INDICES + vluxei8.v HGDC, (STATEP_C), INDICES + +.Lnext_block: + addi NUM_BLOCKS, NUM_BLOCKS, -1 + + // Save the previous state, as it's needed later. 
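+	// (The Davies-Meyer feed-forward adds it back in after the 64 rounds.)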
+ vmv.v.v PREV_FEBA, FEBA + vmv.v.v PREV_HGDC, HGDC + + // Load the next 512-bit message block and endian-swap each 32-bit word. + vle32.v W0, (DATA) + vrev8.v W0, W0 + addi DATA, DATA, 16 + vle32.v W1, (DATA) + vrev8.v W1, W1 + addi DATA, DATA, 16 + vle32.v W2, (DATA) + vrev8.v W2, W2 + addi DATA, DATA, 16 + vle32.v W3, (DATA) + vrev8.v W3, W3 + addi DATA, DATA, 16 + + // Do the 64 rounds of SHA-256. + sha256_16rounds 0, K0, K1, K2, K3 + sha256_16rounds 0, K4, K5, K6, K7 + sha256_16rounds 0, K8, K9, K10, K11 + sha256_16rounds 1, K12, K13, K14, K15 + + // Add the previous state. + vadd.vv FEBA, FEBA, PREV_FEBA + vadd.vv HGDC, HGDC, PREV_HGDC + + // Repeat if more blocks remain. + bnez NUM_BLOCKS, .Lnext_block + + // Store the new state and return. + vsuxei8.v FEBA, (STATEP), INDICES + vsuxei8.v HGDC, (STATEP_C), INDICES + ret +SYM_FUNC_END(sha256_transform_zvknha_or_zvknhb_zvkb) + +.section ".rodata" +.p2align 2 +.type K256, @object +K256: + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +.size K256, . - K256 diff --git a/lib/crypto/riscv/sha256.h b/lib/crypto/riscv/sha256.h new file mode 100644 index 000000000000..c0f79c18f119 --- /dev/null +++ b/lib/crypto/riscv/sha256.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-256 (RISC-V accelerated) + * + * Copyright (C) 2022 VRULL GmbH + * Author: Heiko Stuebner <heiko.stuebner@vrull.eu> + * + * Copyright (C) 2023 SiFive, Inc. + * Author: Jerry Shih <jerry.shih@sifive.com> + */ + +#include <asm/vector.h> +#include <crypto/internal/simd.h> + +asmlinkage void +sha256_transform_zvknha_or_zvknhb_zvkb(struct sha256_block_state *state, + const u8 *data, size_t nblocks); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions); + +static void sha256_blocks(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_extensions) && crypto_simd_usable()) { + kernel_vector_begin(); + sha256_transform_zvknha_or_zvknhb_zvkb(state, data, nblocks); + kernel_vector_end(); + } else { + sha256_blocks_generic(state, data, nblocks); + } +} + +#define sha256_mod_init_arch sha256_mod_init_arch +static inline void sha256_mod_init_arch(void) +{ + /* Both zvknha and zvknhb provide the SHA-256 instructions. 
*/ + if ((riscv_isa_extension_available(NULL, ZVKNHA) || + riscv_isa_extension_available(NULL, ZVKNHB)) && + riscv_isa_extension_available(NULL, ZVKB) && + riscv_vector_vlen() >= 128) + static_branch_enable(&have_extensions); +} diff --git a/lib/crypto/riscv/sha512-riscv64-zvknhb-zvkb.S b/lib/crypto/riscv/sha512-riscv64-zvknhb-zvkb.S new file mode 100644 index 000000000000..b41eebf60546 --- /dev/null +++ b/lib/crypto/riscv/sha512-riscv64-zvknhb-zvkb.S @@ -0,0 +1,203 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// This file is dual-licensed, meaning that you can use it under your +// choice of either of the following two licenses: +// +// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You can obtain +// a copy in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// or +// +// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu> +// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com> +// Copyright 2024 Google LLC +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The generated code of this file depends on the following RISC-V extensions: +// - RV64I +// - RISC-V Vector ('V') with VLEN >= 128 +// - RISC-V Vector SHA-2 Secure Hash extension ('Zvknhb') +// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb') + +#include <linux/linkage.h> + +.text +.option arch, +zvknhb, +zvkb + +#define STATEP a0 +#define DATA a1 +#define NUM_BLOCKS a2 + +#define STATEP_C a3 +#define K a4 + +#define MASK v0 +#define INDICES v1 +#define W0 v10 // LMUL=2 +#define W1 v12 // LMUL=2 +#define W2 v14 // LMUL=2 +#define W3 v16 // LMUL=2 +#define VTMP v20 // LMUL=2 +#define FEBA v22 // LMUL=2 +#define HGDC v24 // LMUL=2 +#define PREV_FEBA v26 // LMUL=2 +#define PREV_HGDC v28 // LMUL=2 + +// Do 4 rounds of SHA-512. w0 contains the current 4 message schedule words. +// +// If not all the message schedule words have been computed yet, then this also +// computes 4 more message schedule words. w1-w3 contain the next 3 groups of 4 +// message schedule words; this macro computes the group after w3 and writes it +// to w0. 
This means that the next (w0, w1, w2, w3) is the current (w1, w2, w3, +// w0), so the caller must cycle through the registers accordingly. +.macro sha512_4rounds last, w0, w1, w2, w3 + vle64.v VTMP, (K) + addi K, K, 32 + vadd.vv VTMP, VTMP, \w0 + vsha2cl.vv HGDC, FEBA, VTMP + vsha2ch.vv FEBA, HGDC, VTMP +.if !\last + vmerge.vvm VTMP, \w2, \w1, MASK + vsha2ms.vv \w0, VTMP, \w3 +.endif +.endm + +.macro sha512_16rounds last + sha512_4rounds \last, W0, W1, W2, W3 + sha512_4rounds \last, W1, W2, W3, W0 + sha512_4rounds \last, W2, W3, W0, W1 + sha512_4rounds \last, W3, W0, W1, W2 +.endm + +// void sha512_transform_zvknhb_zvkb(struct sha512_block_state *state, +// const u8 *data, size_t nblocks); +SYM_FUNC_START(sha512_transform_zvknhb_zvkb) + + // Setup mask for the vmerge to replace the first word (idx==0) in + // message scheduling. There are 4 words, so an 8-bit mask suffices. + vsetivli zero, 1, e8, m1, ta, ma + vmv.v.i MASK, 0x01 + + // Load the state. The state is stored as {a,b,c,d,e,f,g,h}, but we + // need {f,e,b,a},{h,g,d,c}. The dst vtype is e64m2 and the index vtype + // is e8mf4. We use index-load with the i8 indices {40, 32, 8, 0}, + // loaded using the 32-bit little endian value 0x00082028. + li t0, 0x00082028 + vsetivli zero, 1, e32, m1, ta, ma + vmv.v.x INDICES, t0 + addi STATEP_C, STATEP, 16 + vsetivli zero, 4, e64, m2, ta, ma + vluxei8.v FEBA, (STATEP), INDICES + vluxei8.v HGDC, (STATEP_C), INDICES + +.Lnext_block: + la K, K512 + addi NUM_BLOCKS, NUM_BLOCKS, -1 + + // Save the previous state, as it's needed later. + vmv.v.v PREV_FEBA, FEBA + vmv.v.v PREV_HGDC, HGDC + + // Load the next 1024-bit message block and endian-swap each 64-bit word + vle64.v W0, (DATA) + vrev8.v W0, W0 + addi DATA, DATA, 32 + vle64.v W1, (DATA) + vrev8.v W1, W1 + addi DATA, DATA, 32 + vle64.v W2, (DATA) + vrev8.v W2, W2 + addi DATA, DATA, 32 + vle64.v W3, (DATA) + vrev8.v W3, W3 + addi DATA, DATA, 32 + + // Do the 80 rounds of SHA-512. + sha512_16rounds 0 + sha512_16rounds 0 + sha512_16rounds 0 + sha512_16rounds 0 + sha512_16rounds 1 + + // Add the previous state. + vadd.vv FEBA, FEBA, PREV_FEBA + vadd.vv HGDC, HGDC, PREV_HGDC + + // Repeat if more blocks remain. + bnez NUM_BLOCKS, .Lnext_block + + // Store the new state and return. 
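+	// The stores below reuse INDICES: 0x00082028 is, as little-endian
+	// bytes, {0x28, 0x20, 0x08, 0x00} = {40, 32, 8, 0}, so FEBA scatters
+	// {f,e,b,a} back to byte offsets 40/32/8/0 of the state and HGDC
+	// (based at STATEP + 16) scatters {h,g,d,c} to 56/48/24/16, which
+	// restores the natural a..h order of the state words.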
+ vsuxei8.v FEBA, (STATEP), INDICES + vsuxei8.v HGDC, (STATEP_C), INDICES + ret +SYM_FUNC_END(sha512_transform_zvknhb_zvkb) + +.section ".rodata" +.p2align 3 +.type K512, @object +K512: + .dword 0x428a2f98d728ae22, 0x7137449123ef65cd + .dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc + .dword 0x3956c25bf348b538, 0x59f111f1b605d019 + .dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 + .dword 0xd807aa98a3030242, 0x12835b0145706fbe + .dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 + .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1 + .dword 0x9bdc06a725c71235, 0xc19bf174cf692694 + .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3 + .dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 + .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483 + .dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 + .dword 0x983e5152ee66dfab, 0xa831c66d2db43210 + .dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4 + .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725 + .dword 0x06ca6351e003826f, 0x142929670a0e6e70 + .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926 + .dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df + .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8 + .dword 0x81c2c92e47edaee6, 0x92722c851482353b + .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001 + .dword 0xc24b8b70d0f89791, 0xc76c51a30654be30 + .dword 0xd192e819d6ef5218, 0xd69906245565a910 + .dword 0xf40e35855771202a, 0x106aa07032bbd1b8 + .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53 + .dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 + .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb + .dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 + .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60 + .dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec + .dword 0x90befffa23631e28, 0xa4506cebde82bde9 + .dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b + .dword 0xca273eceea26619c, 0xd186b8c721c0c207 + .dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 + .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6 + .dword 0x113f9804bef90dae, 0x1b710b35131c471b + .dword 0x28db77f523047d84, 0x32caab7b40c72493 + .dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c + .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a + .dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 +.size K512, . - K512 diff --git a/lib/crypto/riscv/sha512.h b/lib/crypto/riscv/sha512.h new file mode 100644 index 000000000000..9d0abede322f --- /dev/null +++ b/lib/crypto/riscv/sha512.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-512 and SHA-384 using the RISC-V vector crypto extensions + * + * Copyright (C) 2023 VRULL GmbH + * Author: Heiko Stuebner <heiko.stuebner@vrull.eu> + * + * Copyright (C) 2023 SiFive, Inc. 
+ * Author: Jerry Shih <jerry.shih@sifive.com> + */ + +#include <asm/simd.h> +#include <asm/vector.h> +#include <crypto/internal/simd.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions); + +asmlinkage void sha512_transform_zvknhb_zvkb(struct sha512_block_state *state, + const u8 *data, size_t nblocks); + +static void sha512_blocks(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_extensions) && + likely(crypto_simd_usable())) { + kernel_vector_begin(); + sha512_transform_zvknhb_zvkb(state, data, nblocks); + kernel_vector_end(); + } else { + sha512_blocks_generic(state, data, nblocks); + } +} + +#define sha512_mod_init_arch sha512_mod_init_arch +static inline void sha512_mod_init_arch(void) +{ + if (riscv_isa_extension_available(NULL, ZVKNHB) && + riscv_isa_extension_available(NULL, ZVKB) && + riscv_vector_vlen() >= 128) + static_branch_enable(&have_extensions); +} diff --git a/lib/crypto/s390/Kconfig b/lib/crypto/s390/Kconfig new file mode 100644 index 000000000000..069b355fe51a --- /dev/null +++ b/lib/crypto/s390/Kconfig @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_CHACHA_S390 + tristate + default CRYPTO_LIB_CHACHA + select CRYPTO_LIB_CHACHA_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CHACHA diff --git a/lib/crypto/s390/Makefile b/lib/crypto/s390/Makefile new file mode 100644 index 000000000000..06c2cf77178e --- /dev/null +++ b/lib/crypto/s390/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o +chacha_s390-y := chacha-glue.o chacha-s390.o diff --git a/lib/crypto/s390/chacha-glue.c b/lib/crypto/s390/chacha-glue.c new file mode 100644 index 000000000000..f95ba3483bbc --- /dev/null +++ b/lib/crypto/s390/chacha-glue.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ChaCha stream cipher (s390 optimized) + * + * Copyright IBM Corp. 
2021 + */ + +#define KMSG_COMPONENT "chacha_s390" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <crypto/chacha.h> +#include <linux/cpufeature.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sizes.h> +#include <asm/fpu.h> +#include "chacha-s390.h" + +void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + /* TODO: implement hchacha_block_arch() in assembly */ + hchacha_block_generic(state, out, nrounds); +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + /* s390 chacha20 implementation has 20 rounds hard-coded, + * it cannot handle a block of data or less, but otherwise + * it can handle data of arbitrary size + */ + if (bytes <= CHACHA_BLOCK_SIZE || nrounds != 20 || !cpu_has_vx()) { + chacha_crypt_generic(state, dst, src, bytes, nrounds); + } else { + DECLARE_KERNEL_FPU_ONSTACK32(vxstate); + + kernel_fpu_begin(&vxstate, KERNEL_VXR); + chacha20_vx(dst, src, bytes, &state->x[4], &state->x[12]); + kernel_fpu_end(&vxstate, KERNEL_VXR); + + state->x[12] += round_up(bytes, CHACHA_BLOCK_SIZE) / + CHACHA_BLOCK_SIZE; + } +} +EXPORT_SYMBOL(chacha_crypt_arch); + +bool chacha_is_arch_optimized(void) +{ + return cpu_has_vx(); +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +MODULE_DESCRIPTION("ChaCha stream cipher (s390 optimized)"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/s390/chacha-s390.S b/lib/crypto/s390/chacha-s390.S new file mode 100644 index 000000000000..63f3102678c0 --- /dev/null +++ b/lib/crypto/s390/chacha-s390.S @@ -0,0 +1,908 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Original implementation written by Andy Polyakov, @dot-asm. + * This is an adaptation of the original code for kernel use. + * + * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved. 
+ */ + +#include <linux/linkage.h> +#include <asm/nospec-insn.h> +#include <asm/fpu-insn.h> + +#define SP %r15 +#define FRAME (16 * 8 + 4 * 8) + + .data + .balign 32 + +SYM_DATA_START_LOCAL(sigma) + .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral + .long 1,0,0,0 + .long 2,0,0,0 + .long 3,0,0,0 + .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap + + .long 0,1,2,3 + .long 0x61707865,0x61707865,0x61707865,0x61707865 # smashed sigma + .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e + .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32 + .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574 +SYM_DATA_END(sigma) + + .previous + + GEN_BR_THUNK %r14 + + .text + +############################################################################# +# void chacha20_vx_4x(u8 *out, counst u8 *inp, size_t len, +# counst u32 *key, const u32 *counter) + +#define OUT %r2 +#define INP %r3 +#define LEN %r4 +#define KEY %r5 +#define COUNTER %r6 + +#define BEPERM %v31 +#define CTR %v26 + +#define K0 %v16 +#define K1 %v17 +#define K2 %v18 +#define K3 %v19 + +#define XA0 %v0 +#define XA1 %v1 +#define XA2 %v2 +#define XA3 %v3 + +#define XB0 %v4 +#define XB1 %v5 +#define XB2 %v6 +#define XB3 %v7 + +#define XC0 %v8 +#define XC1 %v9 +#define XC2 %v10 +#define XC3 %v11 + +#define XD0 %v12 +#define XD1 %v13 +#define XD2 %v14 +#define XD3 %v15 + +#define XT0 %v27 +#define XT1 %v28 +#define XT2 %v29 +#define XT3 %v30 + +SYM_FUNC_START(chacha20_vx_4x) + stmg %r6,%r7,6*8(SP) + + larl %r7,sigma + lhi %r0,10 + lhi %r1,0 + + VL K0,0,,%r7 # load sigma + VL K1,0,,KEY # load key + VL K2,16,,KEY + VL K3,0,,COUNTER # load counter + + VL BEPERM,0x40,,%r7 + VL CTR,0x50,,%r7 + + VLM XA0,XA3,0x60,%r7,4 # load [smashed] sigma + + VREPF XB0,K1,0 # smash the key + VREPF XB1,K1,1 + VREPF XB2,K1,2 + VREPF XB3,K1,3 + + VREPF XD0,K3,0 + VREPF XD1,K3,1 + VREPF XD2,K3,2 + VREPF XD3,K3,3 + VAF XD0,XD0,CTR + + VREPF XC0,K2,0 + VREPF XC1,K2,1 + VREPF XC2,K2,2 + VREPF XC3,K2,3 + +.Loop_4x: + VAF XA0,XA0,XB0 + VX XD0,XD0,XA0 + VERLLF XD0,XD0,16 + + VAF XA1,XA1,XB1 + VX XD1,XD1,XA1 + VERLLF XD1,XD1,16 + + VAF XA2,XA2,XB2 + VX XD2,XD2,XA2 + VERLLF XD2,XD2,16 + + VAF XA3,XA3,XB3 + VX XD3,XD3,XA3 + VERLLF XD3,XD3,16 + + VAF XC0,XC0,XD0 + VX XB0,XB0,XC0 + VERLLF XB0,XB0,12 + + VAF XC1,XC1,XD1 + VX XB1,XB1,XC1 + VERLLF XB1,XB1,12 + + VAF XC2,XC2,XD2 + VX XB2,XB2,XC2 + VERLLF XB2,XB2,12 + + VAF XC3,XC3,XD3 + VX XB3,XB3,XC3 + VERLLF XB3,XB3,12 + + VAF XA0,XA0,XB0 + VX XD0,XD0,XA0 + VERLLF XD0,XD0,8 + + VAF XA1,XA1,XB1 + VX XD1,XD1,XA1 + VERLLF XD1,XD1,8 + + VAF XA2,XA2,XB2 + VX XD2,XD2,XA2 + VERLLF XD2,XD2,8 + + VAF XA3,XA3,XB3 + VX XD3,XD3,XA3 + VERLLF XD3,XD3,8 + + VAF XC0,XC0,XD0 + VX XB0,XB0,XC0 + VERLLF XB0,XB0,7 + + VAF XC1,XC1,XD1 + VX XB1,XB1,XC1 + VERLLF XB1,XB1,7 + + VAF XC2,XC2,XD2 + VX XB2,XB2,XC2 + VERLLF XB2,XB2,7 + + VAF XC3,XC3,XD3 + VX XB3,XB3,XC3 + VERLLF XB3,XB3,7 + + VAF XA0,XA0,XB1 + VX XD3,XD3,XA0 + VERLLF XD3,XD3,16 + + VAF XA1,XA1,XB2 + VX XD0,XD0,XA1 + VERLLF XD0,XD0,16 + + VAF XA2,XA2,XB3 + VX XD1,XD1,XA2 + VERLLF XD1,XD1,16 + + VAF XA3,XA3,XB0 + VX XD2,XD2,XA3 + VERLLF XD2,XD2,16 + + VAF XC2,XC2,XD3 + VX XB1,XB1,XC2 + VERLLF XB1,XB1,12 + + VAF XC3,XC3,XD0 + VX XB2,XB2,XC3 + VERLLF XB2,XB2,12 + + VAF XC0,XC0,XD1 + VX XB3,XB3,XC0 + VERLLF XB3,XB3,12 + + VAF XC1,XC1,XD2 + VX XB0,XB0,XC1 + VERLLF XB0,XB0,12 + + VAF XA0,XA0,XB1 + VX XD3,XD3,XA0 + VERLLF XD3,XD3,8 + + VAF XA1,XA1,XB2 + VX XD0,XD0,XA1 + VERLLF XD0,XD0,8 + + VAF XA2,XA2,XB3 + VX XD1,XD1,XA2 + VERLLF XD1,XD1,8 + + VAF XA3,XA3,XB0 + VX XD2,XD2,XA3 + 
VERLLF XD2,XD2,8 + + VAF XC2,XC2,XD3 + VX XB1,XB1,XC2 + VERLLF XB1,XB1,7 + + VAF XC3,XC3,XD0 + VX XB2,XB2,XC3 + VERLLF XB2,XB2,7 + + VAF XC0,XC0,XD1 + VX XB3,XB3,XC0 + VERLLF XB3,XB3,7 + + VAF XC1,XC1,XD2 + VX XB0,XB0,XC1 + VERLLF XB0,XB0,7 + brct %r0,.Loop_4x + + VAF XD0,XD0,CTR + + VMRHF XT0,XA0,XA1 # transpose data + VMRHF XT1,XA2,XA3 + VMRLF XT2,XA0,XA1 + VMRLF XT3,XA2,XA3 + VPDI XA0,XT0,XT1,0b0000 + VPDI XA1,XT0,XT1,0b0101 + VPDI XA2,XT2,XT3,0b0000 + VPDI XA3,XT2,XT3,0b0101 + + VMRHF XT0,XB0,XB1 + VMRHF XT1,XB2,XB3 + VMRLF XT2,XB0,XB1 + VMRLF XT3,XB2,XB3 + VPDI XB0,XT0,XT1,0b0000 + VPDI XB1,XT0,XT1,0b0101 + VPDI XB2,XT2,XT3,0b0000 + VPDI XB3,XT2,XT3,0b0101 + + VMRHF XT0,XC0,XC1 + VMRHF XT1,XC2,XC3 + VMRLF XT2,XC0,XC1 + VMRLF XT3,XC2,XC3 + VPDI XC0,XT0,XT1,0b0000 + VPDI XC1,XT0,XT1,0b0101 + VPDI XC2,XT2,XT3,0b0000 + VPDI XC3,XT2,XT3,0b0101 + + VMRHF XT0,XD0,XD1 + VMRHF XT1,XD2,XD3 + VMRLF XT2,XD0,XD1 + VMRLF XT3,XD2,XD3 + VPDI XD0,XT0,XT1,0b0000 + VPDI XD1,XT0,XT1,0b0101 + VPDI XD2,XT2,XT3,0b0000 + VPDI XD3,XT2,XT3,0b0101 + + VAF XA0,XA0,K0 + VAF XB0,XB0,K1 + VAF XC0,XC0,K2 + VAF XD0,XD0,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + + VAF XA0,XA1,K0 + VAF XB0,XB1,K1 + VAF XC0,XC1,K2 + VAF XD0,XD1,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_4x + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_4x + + VAF XA0,XA2,K0 + VAF XB0,XB2,K1 + VAF XC0,XC2,K2 + VAF XD0,XD2,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_4x + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_4x + + VAF XA0,XA3,K0 + VAF XB0,XB3,K1 + VAF XC0,XC3,K2 + VAF XD0,XD3,K3 + + VPERM XA0,XA0,XA0,BEPERM + VPERM XB0,XB0,XB0,BEPERM + VPERM XC0,XC0,XC0,BEPERM + VPERM XD0,XD0,XD0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_4x + + VLM XT0,XT3,0,INP,0 + + VX XT0,XT0,XA0 + VX XT1,XT1,XB0 + VX XT2,XT2,XC0 + VX XT3,XT3,XD0 + + VSTM XT0,XT3,0,OUT,0 + +.Ldone_4x: + lmg %r6,%r7,6*8(SP) + BR_EX %r14 + +.Ltail_4x: + VLR XT0,XC0 + VLR XT1,XD0 + + VST XA0,8*8+0x00,,SP + VST XB0,8*8+0x10,,SP + VST XT0,8*8+0x20,,SP + VST XT1,8*8+0x30,,SP + + lghi %r1,0 + +.Loop_tail_4x: + llgc %r5,0(%r1,INP) + llgc %r6,8*8(%r1,SP) + xr %r6,%r5 + stc %r6,0(%r1,OUT) + la %r1,1(%r1) + brct LEN,.Loop_tail_4x + + lmg %r6,%r7,6*8(SP) + BR_EX %r14 +SYM_FUNC_END(chacha20_vx_4x) + +#undef OUT +#undef INP +#undef LEN +#undef KEY +#undef COUNTER + +#undef BEPERM + +#undef K0 +#undef K1 +#undef K2 +#undef K3 + + +############################################################################# +# void chacha20_vx(u8 *out, counst u8 *inp, size_t len, +# counst u32 *key, const u32 *counter) + +#define OUT %r2 +#define INP %r3 +#define LEN %r4 +#define KEY %r5 +#define COUNTER %r6 + +#define BEPERM %v31 + +#define K0 %v27 +#define K1 %v24 +#define K2 %v25 +#define K3 %v26 + +#define A0 %v0 +#define B0 %v1 +#define C0 %v2 +#define D0 %v3 + +#define A1 %v4 +#define B1 %v5 +#define C1 %v6 
+#define D1 %v7 + +#define A2 %v8 +#define B2 %v9 +#define C2 %v10 +#define D2 %v11 + +#define A3 %v12 +#define B3 %v13 +#define C3 %v14 +#define D3 %v15 + +#define A4 %v16 +#define B4 %v17 +#define C4 %v18 +#define D4 %v19 + +#define A5 %v20 +#define B5 %v21 +#define C5 %v22 +#define D5 %v23 + +#define T0 %v27 +#define T1 %v28 +#define T2 %v29 +#define T3 %v30 + +SYM_FUNC_START(chacha20_vx) + clgfi LEN,256 + jle chacha20_vx_4x + stmg %r6,%r7,6*8(SP) + + lghi %r1,-FRAME + lgr %r0,SP + la SP,0(%r1,SP) + stg %r0,0(SP) # back-chain + + larl %r7,sigma + lhi %r0,10 + + VLM K1,K2,0,KEY,0 # load key + VL K3,0,,COUNTER # load counter + + VLM K0,BEPERM,0,%r7,4 # load sigma, increments, ... + +.Loop_outer_vx: + VLR A0,K0 + VLR B0,K1 + VLR A1,K0 + VLR B1,K1 + VLR A2,K0 + VLR B2,K1 + VLR A3,K0 + VLR B3,K1 + VLR A4,K0 + VLR B4,K1 + VLR A5,K0 + VLR B5,K1 + + VLR D0,K3 + VAF D1,K3,T1 # K[3]+1 + VAF D2,K3,T2 # K[3]+2 + VAF D3,K3,T3 # K[3]+3 + VAF D4,D2,T2 # K[3]+4 + VAF D5,D2,T3 # K[3]+5 + + VLR C0,K2 + VLR C1,K2 + VLR C2,K2 + VLR C3,K2 + VLR C4,K2 + VLR C5,K2 + + VLR T1,D1 + VLR T2,D2 + VLR T3,D3 + +.Loop_vx: + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,16 + VERLLF D1,D1,16 + VERLLF D2,D2,16 + VERLLF D3,D3,16 + VERLLF D4,D4,16 + VERLLF D5,D5,16 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX B5,B5,C5 + VERLLF B0,B0,12 + VERLLF B1,B1,12 + VERLLF B2,B2,12 + VERLLF B3,B3,12 + VERLLF B4,B4,12 + VERLLF B5,B5,12 + + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,8 + VERLLF D1,D1,8 + VERLLF D2,D2,8 + VERLLF D3,D3,8 + VERLLF D4,D4,8 + VERLLF D5,D5,8 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX B5,B5,C5 + VERLLF B0,B0,7 + VERLLF B1,B1,7 + VERLLF B2,B2,7 + VERLLF B3,B3,7 + VERLLF B4,B4,7 + VERLLF B5,B5,7 + + VSLDB C0,C0,C0,8 + VSLDB C1,C1,C1,8 + VSLDB C2,C2,C2,8 + VSLDB C3,C3,C3,8 + VSLDB C4,C4,C4,8 + VSLDB C5,C5,C5,8 + VSLDB B0,B0,B0,4 + VSLDB B1,B1,B1,4 + VSLDB B2,B2,B2,4 + VSLDB B3,B3,B3,4 + VSLDB B4,B4,B4,4 + VSLDB B5,B5,B5,4 + VSLDB D0,D0,D0,12 + VSLDB D1,D1,D1,12 + VSLDB D2,D2,D2,12 + VSLDB D3,D3,D3,12 + VSLDB D4,D4,D4,12 + VSLDB D5,D5,D5,12 + + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,16 + VERLLF D1,D1,16 + VERLLF D2,D2,16 + VERLLF D3,D3,16 + VERLLF D4,D4,16 + VERLLF D5,D5,16 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX B5,B5,C5 + VERLLF B0,B0,12 + VERLLF B1,B1,12 + VERLLF B2,B2,12 + VERLLF B3,B3,12 + VERLLF B4,B4,12 + VERLLF B5,B5,12 + + VAF A0,A0,B0 + VAF A1,A1,B1 + VAF A2,A2,B2 + VAF A3,A3,B3 + VAF A4,A4,B4 + VAF A5,A5,B5 + VX D0,D0,A0 + VX D1,D1,A1 + VX D2,D2,A2 + VX D3,D3,A3 + VX D4,D4,A4 + VX D5,D5,A5 + VERLLF D0,D0,8 + VERLLF D1,D1,8 + VERLLF D2,D2,8 + VERLLF D3,D3,8 + VERLLF D4,D4,8 + VERLLF D5,D5,8 + + VAF C0,C0,D0 + VAF C1,C1,D1 + VAF C2,C2,D2 + VAF C3,C3,D3 + VAF C4,C4,D4 + VAF C5,C5,D5 + VX B0,B0,C0 + VX B1,B1,C1 + VX 
B2,B2,C2 + VX B3,B3,C3 + VX B4,B4,C4 + VX B5,B5,C5 + VERLLF B0,B0,7 + VERLLF B1,B1,7 + VERLLF B2,B2,7 + VERLLF B3,B3,7 + VERLLF B4,B4,7 + VERLLF B5,B5,7 + + VSLDB C0,C0,C0,8 + VSLDB C1,C1,C1,8 + VSLDB C2,C2,C2,8 + VSLDB C3,C3,C3,8 + VSLDB C4,C4,C4,8 + VSLDB C5,C5,C5,8 + VSLDB B0,B0,B0,12 + VSLDB B1,B1,B1,12 + VSLDB B2,B2,B2,12 + VSLDB B3,B3,B3,12 + VSLDB B4,B4,B4,12 + VSLDB B5,B5,B5,12 + VSLDB D0,D0,D0,4 + VSLDB D1,D1,D1,4 + VSLDB D2,D2,D2,4 + VSLDB D3,D3,D3,4 + VSLDB D4,D4,D4,4 + VSLDB D5,D5,D5,4 + brct %r0,.Loop_vx + + VAF A0,A0,K0 + VAF B0,B0,K1 + VAF C0,C0,K2 + VAF D0,D0,K3 + VAF A1,A1,K0 + VAF D1,D1,T1 # +K[3]+1 + + VPERM A0,A0,A0,BEPERM + VPERM B0,B0,B0,BEPERM + VPERM C0,C0,C0,BEPERM + VPERM D0,D0,D0,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VAF D2,D2,T2 # +K[3]+2 + VAF D3,D3,T3 # +K[3]+3 + VLM T0,T3,0,INP,0 + + VX A0,A0,T0 + VX B0,B0,T1 + VX C0,C0,T2 + VX D0,D0,T3 + + VLM K0,T3,0,%r7,4 # re-load sigma and increments + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF B1,B1,K1 + VAF C1,C1,K2 + + VPERM A0,A1,A1,BEPERM + VPERM B0,B1,B1,BEPERM + VPERM C0,C1,C1,BEPERM + VPERM D0,D1,D1,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A2,A2,K0 + VAF B2,B2,K1 + VAF C2,C2,K2 + + VPERM A0,A2,A2,BEPERM + VPERM B0,B2,B2,BEPERM + VPERM C0,C2,C2,BEPERM + VPERM D0,D2,D2,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A3,A3,K0 + VAF B3,B3,K1 + VAF C3,C3,K2 + VAF D2,K3,T3 # K[3]+3 + + VPERM A0,A3,A3,BEPERM + VPERM B0,B3,B3,BEPERM + VPERM C0,C3,C3,BEPERM + VPERM D0,D3,D3,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VAF D3,D2,T1 # K[3]+4 + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A4,A4,K0 + VAF B4,B4,K1 + VAF C4,C4,K2 + VAF D4,D4,D3 # +K[3]+4 + VAF D3,D3,T1 # K[3]+5 + VAF K3,D2,T3 # K[3]+=6 + + VPERM A0,A4,A4,BEPERM + VPERM B0,B4,B4,BEPERM + VPERM C0,C4,C4,BEPERM + VPERM D0,D4,D4,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + aghi LEN,-0x40 + je .Ldone_vx + + VAF A5,A5,K0 + VAF B5,B5,K1 + VAF C5,C5,K2 + VAF D5,D5,D3 # +K[3]+5 + + VPERM A0,A5,A5,BEPERM + VPERM B0,B5,B5,BEPERM + VPERM C0,C5,C5,BEPERM + VPERM D0,D5,D5,BEPERM + + clgfi LEN,0x40 + jl .Ltail_vx + + VLM A1,D1,0,INP,0 + + VX A0,A0,A1 + VX B0,B0,B1 + VX C0,C0,C1 + VX D0,D0,D1 + + VSTM A0,D0,0,OUT,0 + + la INP,0x40(INP) + la OUT,0x40(OUT) + lhi %r0,10 + aghi LEN,-0x40 + jne .Loop_outer_vx + +.Ldone_vx: + lmg %r6,%r7,FRAME+6*8(SP) + la SP,FRAME(SP) + BR_EX %r14 + +.Ltail_vx: + VSTM A0,D0,8*8,SP,3 + lghi %r1,0 + +.Loop_tail_vx: + llgc %r5,0(%r1,INP) + llgc %r6,8*8(%r1,SP) + xr %r6,%r5 + stc %r6,0(%r1,OUT) + la %r1,1(%r1) + brct LEN,.Loop_tail_vx + + lmg %r6,%r7,FRAME+6*8(SP) + la SP,FRAME(SP) + BR_EX %r14 +SYM_FUNC_END(chacha20_vx) + +.previous diff --git a/lib/crypto/s390/chacha-s390.h b/lib/crypto/s390/chacha-s390.h new file mode 100644 index 000000000000..733744ce30f5 --- /dev/null +++ b/lib/crypto/s390/chacha-s390.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ 
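+/*
+ * Used by chacha-glue.c, which passes the key words at &state->x[4] and the
+ * counter/nonce words at &state->x[12], then advances x[12] itself by
+ * round_up(bytes, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE.  chacha20_vx_4x()
+ * in chacha-s390.S is reached internally (for requests of at most 256 bytes)
+ * and therefore needs no prototype here.
+ */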
+/* + * s390 ChaCha stream cipher. + * + * Copyright IBM Corp. 2021 + */ + +#ifndef _CHACHA_S390_H +#define _CHACHA_S390_H + +void chacha20_vx(u8 *out, const u8 *inp, size_t len, const u32 *key, + const u32 *counter); + +#endif /* _CHACHA_S390_H */ diff --git a/lib/crypto/s390/sha1.h b/lib/crypto/s390/sha1.h new file mode 100644 index 000000000000..08bd138e881c --- /dev/null +++ b/lib/crypto/s390/sha1.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-1 optimized using the CP Assist for Cryptographic Functions (CPACF) + * + * Copyright 2025 Google LLC + */ +#include <asm/cpacf.h> +#include <linux/cpufeature.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha1); + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_cpacf_sha1)) + cpacf_kimd(CPACF_KIMD_SHA_1, state, data, + nblocks * SHA1_BLOCK_SIZE); + else + sha1_blocks_generic(state, data, nblocks); +} + +#define sha1_mod_init_arch sha1_mod_init_arch +static inline void sha1_mod_init_arch(void) +{ + if (cpu_have_feature(S390_CPU_FEATURE_MSA) && + cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_1)) + static_branch_enable(&have_cpacf_sha1); +} diff --git a/lib/crypto/s390/sha256.h b/lib/crypto/s390/sha256.h new file mode 100644 index 000000000000..70a81cbc06b2 --- /dev/null +++ b/lib/crypto/s390/sha256.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-256 optimized using the CP Assist for Cryptographic Functions (CPACF) + * + * Copyright 2025 Google LLC + */ +#include <asm/cpacf.h> +#include <linux/cpufeature.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha256); + +static void sha256_blocks(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_cpacf_sha256)) + cpacf_kimd(CPACF_KIMD_SHA_256, state, data, + nblocks * SHA256_BLOCK_SIZE); + else + sha256_blocks_generic(state, data, nblocks); +} + +#define sha256_mod_init_arch sha256_mod_init_arch +static inline void sha256_mod_init_arch(void) +{ + if (cpu_have_feature(S390_CPU_FEATURE_MSA) && + cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256)) + static_branch_enable(&have_cpacf_sha256); +} diff --git a/lib/crypto/s390/sha512.h b/lib/crypto/s390/sha512.h new file mode 100644 index 000000000000..24744651550c --- /dev/null +++ b/lib/crypto/s390/sha512.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-512 optimized using the CP Assist for Cryptographic Functions (CPACF) + * + * Copyright 2025 Google LLC + */ +#include <asm/cpacf.h> +#include <linux/cpufeature.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha512); + +static void sha512_blocks(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_cpacf_sha512)) + cpacf_kimd(CPACF_KIMD_SHA_512, state, data, + nblocks * SHA512_BLOCK_SIZE); + else + sha512_blocks_generic(state, data, nblocks); +} + +#define sha512_mod_init_arch sha512_mod_init_arch +static inline void sha512_mod_init_arch(void) +{ + if (cpu_have_feature(S390_CPU_FEATURE_MSA) && + cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_512)) + static_branch_enable(&have_cpacf_sha512); +} diff --git a/lib/crypto/sha1.c b/lib/crypto/sha1.c index ebb60519ae93..5904e4ae85d2 100644 --- a/lib/crypto/sha1.c +++ b/lib/crypto/sha1.c @@ -1,18 +1,21 @@ // SPDX-License-Identifier: GPL-2.0 /* - * SHA1 routine optimized to do word accesses rather than byte accesses, - * and to avoid unnecessary copies into 
the context array. - * - * This was based on the git SHA1 implementation. + * SHA-1 and HMAC-SHA1 library functions */ -#include <linux/kernel.h> +#include <crypto/hmac.h> +#include <crypto/sha1.h> +#include <linux/bitops.h> #include <linux/export.h> +#include <linux/kernel.h> #include <linux/module.h> -#include <linux/bitops.h> #include <linux/string.h> -#include <crypto/sha1.h> #include <linux/unaligned.h> +#include <linux/wordpart.h> + +static const struct sha1_block_state sha1_iv = { + .h = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, +}; /* * If you have 32 registers or more, the compiler can (and should) @@ -124,10 +127,10 @@ void sha1_transform(__u32 *digest, const char *data, __u32 *array) EXPORT_SYMBOL(sha1_transform); /** - * sha1_init - initialize the vectors for a SHA1 digest + * sha1_init_raw - initialize the vectors for a SHA1 digest * @buf: vector to initialize */ -void sha1_init(__u32 *buf) +void sha1_init_raw(__u32 *buf) { buf[0] = 0x67452301; buf[1] = 0xefcdab89; @@ -135,7 +138,211 @@ void sha1_init(__u32 *buf) buf[3] = 0x10325476; buf[4] = 0xc3d2e1f0; } -EXPORT_SYMBOL(sha1_init); +EXPORT_SYMBOL(sha1_init_raw); + +static void __maybe_unused sha1_blocks_generic(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + u32 workspace[SHA1_WORKSPACE_WORDS]; + + do { + sha1_transform(state->h, data, workspace); + data += SHA1_BLOCK_SIZE; + } while (--nblocks); + + memzero_explicit(workspace, sizeof(workspace)); +} + +#ifdef CONFIG_CRYPTO_LIB_SHA1_ARCH +#include "sha1.h" /* $(SRCARCH)/sha1.h */ +#else +#define sha1_blocks sha1_blocks_generic +#endif + +void sha1_init(struct sha1_ctx *ctx) +{ + ctx->state = sha1_iv; + ctx->bytecount = 0; +} +EXPORT_SYMBOL_GPL(sha1_init); + +void sha1_update(struct sha1_ctx *ctx, const u8 *data, size_t len) +{ + size_t partial = ctx->bytecount % SHA1_BLOCK_SIZE; + + ctx->bytecount += len; + + if (partial + len >= SHA1_BLOCK_SIZE) { + size_t nblocks; + + if (partial) { + size_t l = SHA1_BLOCK_SIZE - partial; + + memcpy(&ctx->buf[partial], data, l); + data += l; + len -= l; + + sha1_blocks(&ctx->state, ctx->buf, 1); + } + + nblocks = len / SHA1_BLOCK_SIZE; + len %= SHA1_BLOCK_SIZE; + + if (nblocks) { + sha1_blocks(&ctx->state, data, nblocks); + data += nblocks * SHA1_BLOCK_SIZE; + } + partial = 0; + } + if (len) + memcpy(&ctx->buf[partial], data, len); +} +EXPORT_SYMBOL_GPL(sha1_update); + +static void __sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]) +{ + u64 bitcount = ctx->bytecount << 3; + size_t partial = ctx->bytecount % SHA1_BLOCK_SIZE; + + ctx->buf[partial++] = 0x80; + if (partial > SHA1_BLOCK_SIZE - 8) { + memset(&ctx->buf[partial], 0, SHA1_BLOCK_SIZE - partial); + sha1_blocks(&ctx->state, ctx->buf, 1); + partial = 0; + } + memset(&ctx->buf[partial], 0, SHA1_BLOCK_SIZE - 8 - partial); + *(__be64 *)&ctx->buf[SHA1_BLOCK_SIZE - 8] = cpu_to_be64(bitcount); + sha1_blocks(&ctx->state, ctx->buf, 1); + + for (size_t i = 0; i < SHA1_DIGEST_SIZE; i += 4) + put_unaligned_be32(ctx->state.h[i / 4], out + i); +} + +void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]) +{ + __sha1_final(ctx, out); + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL_GPL(sha1_final); + +void sha1(const u8 *data, size_t len, u8 out[SHA1_DIGEST_SIZE]) +{ + struct sha1_ctx ctx; + + sha1_init(&ctx); + sha1_update(&ctx, data, len); + sha1_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(sha1); + +static void __hmac_sha1_preparekey(struct sha1_block_state *istate, + struct sha1_block_state *ostate, + const u8 *raw_key, size_t raw_key_len) +{ + 
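+	/*
+	 * Standard HMAC key preparation (RFC 2104): a key longer than the
+	 * 64-byte SHA-1 block size is first hashed down to 20 bytes, and a
+	 * shorter key is zero-padded.  The padded key is then XORed with the
+	 * ipad (0x36) and opad (0x5c) patterns, and one SHA-1 block is
+	 * computed over each result to precompute the inner and outer states.
+	 */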
union { + u8 b[SHA1_BLOCK_SIZE]; + unsigned long w[SHA1_BLOCK_SIZE / sizeof(unsigned long)]; + } derived_key = { 0 }; + + if (unlikely(raw_key_len > SHA1_BLOCK_SIZE)) + sha1(raw_key, raw_key_len, derived_key.b); + else + memcpy(derived_key.b, raw_key, raw_key_len); + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE); + *istate = sha1_iv; + sha1_blocks(istate, derived_key.b, 1); + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^ + HMAC_IPAD_VALUE); + *ostate = sha1_iv; + sha1_blocks(ostate, derived_key.b, 1); + + memzero_explicit(&derived_key, sizeof(derived_key)); +} + +void hmac_sha1_preparekey(struct hmac_sha1_key *key, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha1_preparekey(&key->istate, &key->ostate, + raw_key, raw_key_len); +} +EXPORT_SYMBOL_GPL(hmac_sha1_preparekey); + +void hmac_sha1_init(struct hmac_sha1_ctx *ctx, const struct hmac_sha1_key *key) +{ + ctx->sha_ctx.state = key->istate; + ctx->sha_ctx.bytecount = SHA1_BLOCK_SIZE; + ctx->ostate = key->ostate; +} +EXPORT_SYMBOL_GPL(hmac_sha1_init); + +void hmac_sha1_init_usingrawkey(struct hmac_sha1_ctx *ctx, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha1_preparekey(&ctx->sha_ctx.state, &ctx->ostate, + raw_key, raw_key_len); + ctx->sha_ctx.bytecount = SHA1_BLOCK_SIZE; +} +EXPORT_SYMBOL_GPL(hmac_sha1_init_usingrawkey); + +void hmac_sha1_final(struct hmac_sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]) +{ + /* Generate the padded input for the outer hash in ctx->sha_ctx.buf. */ + __sha1_final(&ctx->sha_ctx, ctx->sha_ctx.buf); + memset(&ctx->sha_ctx.buf[SHA1_DIGEST_SIZE], 0, + SHA1_BLOCK_SIZE - SHA1_DIGEST_SIZE); + ctx->sha_ctx.buf[SHA1_DIGEST_SIZE] = 0x80; + *(__be32 *)&ctx->sha_ctx.buf[SHA1_BLOCK_SIZE - 4] = + cpu_to_be32(8 * (SHA1_BLOCK_SIZE + SHA1_DIGEST_SIZE)); + + /* Compute the outer hash, which gives the HMAC value. 
*/ + sha1_blocks(&ctx->ostate, ctx->sha_ctx.buf, 1); + for (size_t i = 0; i < SHA1_DIGEST_SIZE; i += 4) + put_unaligned_be32(ctx->ostate.h[i / 4], out + i); + + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL_GPL(hmac_sha1_final); + +void hmac_sha1(const struct hmac_sha1_key *key, + const u8 *data, size_t data_len, u8 out[SHA1_DIGEST_SIZE]) +{ + struct hmac_sha1_ctx ctx; + + hmac_sha1_init(&ctx, key); + hmac_sha1_update(&ctx, data, data_len); + hmac_sha1_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha1); + +void hmac_sha1_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[SHA1_DIGEST_SIZE]) +{ + struct hmac_sha1_ctx ctx; + + hmac_sha1_init_usingrawkey(&ctx, raw_key, raw_key_len); + hmac_sha1_update(&ctx, data, data_len); + hmac_sha1_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha1_usingrawkey); + +#ifdef sha1_mod_init_arch +static int __init sha1_mod_init(void) +{ + sha1_mod_init_arch(); + return 0; +} +subsys_initcall(sha1_mod_init); + +static void __exit sha1_mod_exit(void) +{ +} +module_exit(sha1_mod_exit); +#endif -MODULE_DESCRIPTION("SHA-1 Algorithm"); +MODULE_DESCRIPTION("SHA-1 and HMAC-SHA1 library functions"); MODULE_LICENSE("GPL"); diff --git a/lib/crypto/sha256-generic.c b/lib/crypto/sha256-generic.c deleted file mode 100644 index a16ad4f25ebb..000000000000 --- a/lib/crypto/sha256-generic.c +++ /dev/null @@ -1,137 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SHA-256, as specified in - * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf - * - * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>. - * - * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> - * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> - * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> - * Copyright (c) 2014 Red Hat Inc. 
- */ - -#include <crypto/internal/sha2.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/unaligned.h> - -static const u32 SHA256_K[] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, -}; - -static inline u32 Ch(u32 x, u32 y, u32 z) -{ - return z ^ (x & (y ^ z)); -} - -static inline u32 Maj(u32 x, u32 y, u32 z) -{ - return (x & y) | (z & (x | y)); -} - -#define e0(x) (ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22)) -#define e1(x) (ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25)) -#define s0(x) (ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3)) -#define s1(x) (ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10)) - -static inline void LOAD_OP(int I, u32 *W, const u8 *input) -{ - W[I] = get_unaligned_be32((__u32 *)input + I); -} - -static inline void BLEND_OP(int I, u32 *W) -{ - W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16]; -} - -#define SHA256_ROUND(i, a, b, c, d, e, f, g, h) do { \ - u32 t1, t2; \ - t1 = h + e1(e) + Ch(e, f, g) + SHA256_K[i] + W[i]; \ - t2 = e0(a) + Maj(a, b, c); \ - d += t1; \ - h = t1 + t2; \ -} while (0) - -static void sha256_block_generic(u32 state[SHA256_STATE_WORDS], - const u8 *input, u32 W[64]) -{ - u32 a, b, c, d, e, f, g, h; - int i; - - /* load the input */ - for (i = 0; i < 16; i += 8) { - LOAD_OP(i + 0, W, input); - LOAD_OP(i + 1, W, input); - LOAD_OP(i + 2, W, input); - LOAD_OP(i + 3, W, input); - LOAD_OP(i + 4, W, input); - LOAD_OP(i + 5, W, input); - LOAD_OP(i + 6, W, input); - LOAD_OP(i + 7, W, input); - } - - /* now blend */ - for (i = 16; i < 64; i += 8) { - BLEND_OP(i + 0, W); - BLEND_OP(i + 1, W); - BLEND_OP(i + 2, W); - BLEND_OP(i + 3, W); - BLEND_OP(i + 4, W); - BLEND_OP(i + 5, W); - BLEND_OP(i + 6, W); - BLEND_OP(i + 7, W); - } - - /* load the state into our registers */ - a = state[0]; b = state[1]; c = state[2]; d = state[3]; - e = state[4]; f = state[5]; g = state[6]; h = state[7]; - - /* now iterate */ - for (i = 0; i < 64; i += 8) { - SHA256_ROUND(i + 0, a, b, c, d, e, f, g, h); - SHA256_ROUND(i + 1, h, a, b, c, d, e, f, g); - SHA256_ROUND(i + 2, g, h, a, b, c, d, e, f); - SHA256_ROUND(i + 3, f, g, h, a, b, c, d, e); - SHA256_ROUND(i + 4, e, f, g, h, a, b, c, d); - SHA256_ROUND(i + 5, d, e, f, g, h, a, b, c); - SHA256_ROUND(i + 6, c, d, e, f, g, h, a, b); - SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a); - } - - state[0] += a; state[1] += b; state[2] += c; state[3] += d; - state[4] += e; state[5] += f; state[6] += g; state[7] += h; -} - -void sha256_blocks_generic(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks) -{ - u32 W[64]; - - do { - sha256_block_generic(state, data, W); - data += SHA256_BLOCK_SIZE; - } while (--nblocks); - - memzero_explicit(W, sizeof(W)); -} -EXPORT_SYMBOL_GPL(sha256_blocks_generic); - -MODULE_DESCRIPTION("SHA-256 Algorithm (generic 
implementation)"); -MODULE_LICENSE("GPL"); diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c index 107e5162507a..8fa15165d23e 100644 --- a/lib/crypto/sha256.c +++ b/lib/crypto/sha256.c @@ -1,83 +1,436 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * SHA-256, as specified in - * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf - * - * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>. + * SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions * * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2014 Red Hat Inc. + * Copyright 2025 Google LLC */ -#include <crypto/internal/blockhash.h> -#include <crypto/internal/sha2.h> +#include <crypto/hmac.h> +#include <crypto/sha2.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/string.h> +#include <linux/unaligned.h> +#include <linux/wordpart.h> -/* - * If __DISABLE_EXPORTS is defined, then this file is being compiled for a - * pre-boot environment. In that case, ignore the kconfig options, pull the - * generic code into the same translation unit, and use that only. - */ -#ifdef __DISABLE_EXPORTS -#include "sha256-generic.c" +static const struct sha256_block_state sha224_iv = { + .h = { + SHA224_H0, SHA224_H1, SHA224_H2, SHA224_H3, + SHA224_H4, SHA224_H5, SHA224_H6, SHA224_H7, + }, +}; + +static const struct sha256_block_state sha256_iv = { + .h = { + SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, + SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, + }, +}; + +static const u32 sha256_K[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, + 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, + 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, + 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, + 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, + 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, +}; + +#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) +#define Maj(x, y, z) (((x) & (y)) | ((z) & ((x) | (y)))) +#define e0(x) (ror32((x), 2) ^ ror32((x), 13) ^ ror32((x), 22)) +#define e1(x) (ror32((x), 6) ^ ror32((x), 11) ^ ror32((x), 25)) +#define s0(x) (ror32((x), 7) ^ ror32((x), 18) ^ ((x) >> 3)) +#define s1(x) (ror32((x), 17) ^ ror32((x), 19) ^ ((x) >> 10)) + +static inline void LOAD_OP(int I, u32 *W, const u8 *input) +{ + W[I] = get_unaligned_be32((__u32 *)input + I); +} + +static inline void BLEND_OP(int I, u32 *W) +{ + W[I] = s1(W[I - 2]) + W[I - 7] + s0(W[I - 15]) + W[I - 16]; +} + +#define SHA256_ROUND(i, a, b, c, d, e, f, g, h) \ + do { \ + u32 t1, t2; \ + t1 = h + e1(e) + Ch(e, f, g) + sha256_K[i] + W[i]; \ + t2 = e0(a) + Maj(a, b, c); \ + d += t1; \ + h = t1 + t2; \ + } while (0) + +static void sha256_block_generic(struct sha256_block_state *state, + const u8 *input, u32 W[64]) +{ + u32 a, b, c, d, e, f, g, h; + int i; + + /* load the input */ + for (i = 0; i < 16; i += 8) { + LOAD_OP(i + 0, W, input); + LOAD_OP(i + 1, W, input); + LOAD_OP(i + 2, W, 
input); + LOAD_OP(i + 3, W, input); + LOAD_OP(i + 4, W, input); + LOAD_OP(i + 5, W, input); + LOAD_OP(i + 6, W, input); + LOAD_OP(i + 7, W, input); + } + + /* now blend */ + for (i = 16; i < 64; i += 8) { + BLEND_OP(i + 0, W); + BLEND_OP(i + 1, W); + BLEND_OP(i + 2, W); + BLEND_OP(i + 3, W); + BLEND_OP(i + 4, W); + BLEND_OP(i + 5, W); + BLEND_OP(i + 6, W); + BLEND_OP(i + 7, W); + } + + /* load the state into our registers */ + a = state->h[0]; + b = state->h[1]; + c = state->h[2]; + d = state->h[3]; + e = state->h[4]; + f = state->h[5]; + g = state->h[6]; + h = state->h[7]; + + /* now iterate */ + for (i = 0; i < 64; i += 8) { + SHA256_ROUND(i + 0, a, b, c, d, e, f, g, h); + SHA256_ROUND(i + 1, h, a, b, c, d, e, f, g); + SHA256_ROUND(i + 2, g, h, a, b, c, d, e, f); + SHA256_ROUND(i + 3, f, g, h, a, b, c, d, e); + SHA256_ROUND(i + 4, e, f, g, h, a, b, c, d); + SHA256_ROUND(i + 5, d, e, f, g, h, a, b, c); + SHA256_ROUND(i + 6, c, d, e, f, g, h, a, b); + SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a); + } + + state->h[0] += a; + state->h[1] += b; + state->h[2] += c; + state->h[3] += d; + state->h[4] += e; + state->h[5] += f; + state->h[6] += g; + state->h[7] += h; +} + +static void __maybe_unused +sha256_blocks_generic(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + u32 W[64]; + + do { + sha256_block_generic(state, data, W); + data += SHA256_BLOCK_SIZE; + } while (--nblocks); + + memzero_explicit(W, sizeof(W)); +} + +#if defined(CONFIG_CRYPTO_LIB_SHA256_ARCH) && !defined(__DISABLE_EXPORTS) +#include "sha256.h" /* $(SRCARCH)/sha256.h */ +#else +#define sha256_blocks sha256_blocks_generic #endif -static inline bool sha256_purgatory(void) +static void __sha256_init(struct __sha256_ctx *ctx, + const struct sha256_block_state *iv, + u64 initial_bytecount) { - return __is_defined(__DISABLE_EXPORTS); + ctx->state = *iv; + ctx->bytecount = initial_bytecount; } -static inline void sha256_blocks(u32 state[SHA256_STATE_WORDS], const u8 *data, - size_t nblocks) +void sha224_init(struct sha224_ctx *ctx) { - sha256_choose_blocks(state, data, nblocks, sha256_purgatory(), false); + __sha256_init(&ctx->ctx, &sha224_iv, 0); } +EXPORT_SYMBOL_GPL(sha224_init); -void sha256_update(struct sha256_state *sctx, const u8 *data, size_t len) +void sha256_init(struct sha256_ctx *ctx) { - size_t partial = sctx->count % SHA256_BLOCK_SIZE; + __sha256_init(&ctx->ctx, &sha256_iv, 0); +} +EXPORT_SYMBOL_GPL(sha256_init); + +void __sha256_update(struct __sha256_ctx *ctx, const u8 *data, size_t len) +{ + size_t partial = ctx->bytecount % SHA256_BLOCK_SIZE; + + ctx->bytecount += len; + + if (partial + len >= SHA256_BLOCK_SIZE) { + size_t nblocks; - sctx->count += len; - BLOCK_HASH_UPDATE_BLOCKS(sha256_blocks, sctx->ctx.state, data, len, - SHA256_BLOCK_SIZE, sctx->buf, partial); + if (partial) { + size_t l = SHA256_BLOCK_SIZE - partial; + + memcpy(&ctx->buf[partial], data, l); + data += l; + len -= l; + + sha256_blocks(&ctx->state, ctx->buf, 1); + } + + nblocks = len / SHA256_BLOCK_SIZE; + len %= SHA256_BLOCK_SIZE; + + if (nblocks) { + sha256_blocks(&ctx->state, data, nblocks); + data += nblocks * SHA256_BLOCK_SIZE; + } + partial = 0; + } + if (len) + memcpy(&ctx->buf[partial], data, len); } -EXPORT_SYMBOL(sha256_update); +EXPORT_SYMBOL(__sha256_update); -static inline void __sha256_final(struct sha256_state *sctx, u8 *out, - size_t digest_size) +static void __sha256_final(struct __sha256_ctx *ctx, + u8 *out, size_t digest_size) { - size_t partial = sctx->count % SHA256_BLOCK_SIZE; + u64 bitcount = 
ctx->bytecount << 3; + size_t partial = ctx->bytecount % SHA256_BLOCK_SIZE; + + ctx->buf[partial++] = 0x80; + if (partial > SHA256_BLOCK_SIZE - 8) { + memset(&ctx->buf[partial], 0, SHA256_BLOCK_SIZE - partial); + sha256_blocks(&ctx->state, ctx->buf, 1); + partial = 0; + } + memset(&ctx->buf[partial], 0, SHA256_BLOCK_SIZE - 8 - partial); + *(__be64 *)&ctx->buf[SHA256_BLOCK_SIZE - 8] = cpu_to_be64(bitcount); + sha256_blocks(&ctx->state, ctx->buf, 1); - sha256_finup(&sctx->ctx, sctx->buf, partial, out, digest_size, - sha256_purgatory(), false); - memzero_explicit(sctx, sizeof(*sctx)); + for (size_t i = 0; i < digest_size; i += 4) + put_unaligned_be32(ctx->state.h[i / 4], out + i); } -void sha256_final(struct sha256_state *sctx, u8 out[SHA256_DIGEST_SIZE]) +void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]) { - __sha256_final(sctx, out, SHA256_DIGEST_SIZE); + __sha256_final(&ctx->ctx, out, SHA224_DIGEST_SIZE); + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL(sha224_final); + +void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]) +{ + __sha256_final(&ctx->ctx, out, SHA256_DIGEST_SIZE); + memzero_explicit(ctx, sizeof(*ctx)); } EXPORT_SYMBOL(sha256_final); -void sha224_final(struct sha256_state *sctx, u8 out[SHA224_DIGEST_SIZE]) +void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE]) { - __sha256_final(sctx, out, SHA224_DIGEST_SIZE); + struct sha224_ctx ctx; + + sha224_init(&ctx); + sha224_update(&ctx, data, len); + sha224_final(&ctx, out); } -EXPORT_SYMBOL(sha224_final); +EXPORT_SYMBOL(sha224); void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]) { - struct sha256_state sctx; + struct sha256_ctx ctx; - sha256_init(&sctx); - sha256_update(&sctx, data, len); - sha256_final(&sctx, out); + sha256_init(&ctx); + sha256_update(&ctx, data, len); + sha256_final(&ctx, out); } EXPORT_SYMBOL(sha256); -MODULE_DESCRIPTION("SHA-256 Algorithm"); +/* pre-boot environment (as indicated by __DISABLE_EXPORTS) doesn't need HMAC */ +#ifndef __DISABLE_EXPORTS +static void __hmac_sha256_preparekey(struct sha256_block_state *istate, + struct sha256_block_state *ostate, + const u8 *raw_key, size_t raw_key_len, + const struct sha256_block_state *iv) +{ + union { + u8 b[SHA256_BLOCK_SIZE]; + unsigned long w[SHA256_BLOCK_SIZE / sizeof(unsigned long)]; + } derived_key = { 0 }; + + if (unlikely(raw_key_len > SHA256_BLOCK_SIZE)) { + if (iv == &sha224_iv) + sha224(raw_key, raw_key_len, derived_key.b); + else + sha256(raw_key, raw_key_len, derived_key.b); + } else { + memcpy(derived_key.b, raw_key, raw_key_len); + } + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE); + *istate = *iv; + sha256_blocks(istate, derived_key.b, 1); + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^ + HMAC_IPAD_VALUE); + *ostate = *iv; + sha256_blocks(ostate, derived_key.b, 1); + + memzero_explicit(&derived_key, sizeof(derived_key)); +} + +void hmac_sha224_preparekey(struct hmac_sha224_key *key, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha256_preparekey(&key->key.istate, &key->key.ostate, + raw_key, raw_key_len, &sha224_iv); +} +EXPORT_SYMBOL_GPL(hmac_sha224_preparekey); + +void hmac_sha256_preparekey(struct hmac_sha256_key *key, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha256_preparekey(&key->key.istate, &key->key.ostate, + raw_key, raw_key_len, &sha256_iv); +} +EXPORT_SYMBOL_GPL(hmac_sha256_preparekey); + +void __hmac_sha256_init(struct 
__hmac_sha256_ctx *ctx, + const struct __hmac_sha256_key *key) +{ + __sha256_init(&ctx->sha_ctx, &key->istate, SHA256_BLOCK_SIZE); + ctx->ostate = key->ostate; +} +EXPORT_SYMBOL_GPL(__hmac_sha256_init); + +void hmac_sha224_init_usingrawkey(struct hmac_sha224_ctx *ctx, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha256_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate, + raw_key, raw_key_len, &sha224_iv); + ctx->ctx.sha_ctx.bytecount = SHA256_BLOCK_SIZE; +} +EXPORT_SYMBOL_GPL(hmac_sha224_init_usingrawkey); + +void hmac_sha256_init_usingrawkey(struct hmac_sha256_ctx *ctx, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha256_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate, + raw_key, raw_key_len, &sha256_iv); + ctx->ctx.sha_ctx.bytecount = SHA256_BLOCK_SIZE; +} +EXPORT_SYMBOL_GPL(hmac_sha256_init_usingrawkey); + +static void __hmac_sha256_final(struct __hmac_sha256_ctx *ctx, + u8 *out, size_t digest_size) +{ + /* Generate the padded input for the outer hash in ctx->sha_ctx.buf. */ + __sha256_final(&ctx->sha_ctx, ctx->sha_ctx.buf, digest_size); + memset(&ctx->sha_ctx.buf[digest_size], 0, + SHA256_BLOCK_SIZE - digest_size); + ctx->sha_ctx.buf[digest_size] = 0x80; + *(__be32 *)&ctx->sha_ctx.buf[SHA256_BLOCK_SIZE - 4] = + cpu_to_be32(8 * (SHA256_BLOCK_SIZE + digest_size)); + + /* Compute the outer hash, which gives the HMAC value. */ + sha256_blocks(&ctx->ostate, ctx->sha_ctx.buf, 1); + for (size_t i = 0; i < digest_size; i += 4) + put_unaligned_be32(ctx->ostate.h[i / 4], out + i); + + memzero_explicit(ctx, sizeof(*ctx)); +} + +void hmac_sha224_final(struct hmac_sha224_ctx *ctx, + u8 out[SHA224_DIGEST_SIZE]) +{ + __hmac_sha256_final(&ctx->ctx, out, SHA224_DIGEST_SIZE); +} +EXPORT_SYMBOL_GPL(hmac_sha224_final); + +void hmac_sha256_final(struct hmac_sha256_ctx *ctx, + u8 out[SHA256_DIGEST_SIZE]) +{ + __hmac_sha256_final(&ctx->ctx, out, SHA256_DIGEST_SIZE); +} +EXPORT_SYMBOL_GPL(hmac_sha256_final); + +void hmac_sha224(const struct hmac_sha224_key *key, + const u8 *data, size_t data_len, u8 out[SHA224_DIGEST_SIZE]) +{ + struct hmac_sha224_ctx ctx; + + hmac_sha224_init(&ctx, key); + hmac_sha224_update(&ctx, data, data_len); + hmac_sha224_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha224); + +void hmac_sha256(const struct hmac_sha256_key *key, + const u8 *data, size_t data_len, u8 out[SHA256_DIGEST_SIZE]) +{ + struct hmac_sha256_ctx ctx; + + hmac_sha256_init(&ctx, key); + hmac_sha256_update(&ctx, data, data_len); + hmac_sha256_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha256); + +void hmac_sha224_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[SHA224_DIGEST_SIZE]) +{ + struct hmac_sha224_ctx ctx; + + hmac_sha224_init_usingrawkey(&ctx, raw_key, raw_key_len); + hmac_sha224_update(&ctx, data, data_len); + hmac_sha224_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha224_usingrawkey); + +void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[SHA256_DIGEST_SIZE]) +{ + struct hmac_sha256_ctx ctx; + + hmac_sha256_init_usingrawkey(&ctx, raw_key, raw_key_len); + hmac_sha256_update(&ctx, data, data_len); + hmac_sha256_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha256_usingrawkey); +#endif /* !__DISABLE_EXPORTS */ + +#ifdef sha256_mod_init_arch +static int __init sha256_mod_init(void) +{ + sha256_mod_init_arch(); + return 0; +} +subsys_initcall(sha256_mod_init); + +static void __exit sha256_mod_exit(void) +{ +} +module_exit(sha256_mod_exit); +#endif + +MODULE_DESCRIPTION("SHA-224, 
SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions"); MODULE_LICENSE("GPL"); diff --git a/lib/crypto/sha512.c b/lib/crypto/sha512.c new file mode 100644 index 000000000000..d8062188be98 --- /dev/null +++ b/lib/crypto/sha512.c @@ -0,0 +1,423 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 library functions + * + * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) 2003 Kyle McMartin <kyle@debian.org> + * Copyright 2025 Google LLC + */ + +#include <crypto/hmac.h> +#include <crypto/sha2.h> +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/overflow.h> +#include <linux/string.h> +#include <linux/unaligned.h> +#include <linux/wordpart.h> + +static const struct sha512_block_state sha384_iv = { + .h = { + SHA384_H0, SHA384_H1, SHA384_H2, SHA384_H3, + SHA384_H4, SHA384_H5, SHA384_H6, SHA384_H7, + }, +}; + +static const struct sha512_block_state sha512_iv = { + .h = { + SHA512_H0, SHA512_H1, SHA512_H2, SHA512_H3, + SHA512_H4, SHA512_H5, SHA512_H6, SHA512_H7, + }, +}; + +static const u64 sha512_K[80] = { + 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL, + 0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, + 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL, + 0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, + 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL, + 0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, + 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL, + 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, + 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL, + 0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, + 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL, + 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, + 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL, + 0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, + 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL, + 0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, + 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL, + 0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, + 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL, + 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, + 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL, + 0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, + 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL, + 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, + 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL, + 0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, + 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL, +}; + +#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) +#define Maj(x, y, z) (((x) & (y)) | ((z) & ((x) | (y)))) +#define e0(x) (ror64((x), 28) ^ ror64((x), 34) ^ ror64((x), 39)) +#define e1(x) (ror64((x), 14) ^ ror64((x), 18) ^ ror64((x), 41)) +#define s0(x) (ror64((x), 1) ^ ror64((x), 8) ^ ((x) >> 7)) +#define s1(x) (ror64((x), 19) ^ ror64((x), 61) ^ ((x) >> 6)) + +static void sha512_block_generic(struct sha512_block_state *state, + const u8 *data) +{ + 
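+	/*
+	 * The message schedule is kept in a rolling 16-word window: every 16
+	 * rounds, W[j & 15] is extended in place using W[(j - 2) & 15],
+	 * W[(j - 7) & 15] and W[(j - 15) & 15] rather than precomputing all
+	 * 80 words up front, so the schedule needs only 128 bytes of stack.
+	 */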
u64 a = state->h[0]; + u64 b = state->h[1]; + u64 c = state->h[2]; + u64 d = state->h[3]; + u64 e = state->h[4]; + u64 f = state->h[5]; + u64 g = state->h[6]; + u64 h = state->h[7]; + u64 t1, t2; + u64 W[16]; + + for (int j = 0; j < 16; j++) + W[j] = get_unaligned_be64(data + j * sizeof(u64)); + + for (int i = 0; i < 80; i += 8) { + if ((i & 15) == 0 && i != 0) { + for (int j = 0; j < 16; j++) { + W[j & 15] += s1(W[(j - 2) & 15]) + + W[(j - 7) & 15] + + s0(W[(j - 15) & 15]); + } + } + t1 = h + e1(e) + Ch(e, f, g) + sha512_K[i] + W[(i & 15)]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; + t1 = g + e1(d) + Ch(d, e, f) + sha512_K[i+1] + W[(i & 15) + 1]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; + t1 = f + e1(c) + Ch(c, d, e) + sha512_K[i+2] + W[(i & 15) + 2]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; + t1 = e + e1(b) + Ch(b, c, d) + sha512_K[i+3] + W[(i & 15) + 3]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; + t1 = d + e1(a) + Ch(a, b, c) + sha512_K[i+4] + W[(i & 15) + 4]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; + t1 = c + e1(h) + Ch(h, a, b) + sha512_K[i+5] + W[(i & 15) + 5]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; + t1 = b + e1(g) + Ch(g, h, a) + sha512_K[i+6] + W[(i & 15) + 6]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; + t1 = a + e1(f) + Ch(f, g, h) + sha512_K[i+7] + W[(i & 15) + 7]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; + } + + state->h[0] += a; + state->h[1] += b; + state->h[2] += c; + state->h[3] += d; + state->h[4] += e; + state->h[5] += f; + state->h[6] += g; + state->h[7] += h; +} + +static void __maybe_unused +sha512_blocks_generic(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + do { + sha512_block_generic(state, data); + data += SHA512_BLOCK_SIZE; + } while (--nblocks); +} + +#ifdef CONFIG_CRYPTO_LIB_SHA512_ARCH +#include "sha512.h" /* $(SRCARCH)/sha512.h */ +#else +#define sha512_blocks sha512_blocks_generic +#endif + +static void __sha512_init(struct __sha512_ctx *ctx, + const struct sha512_block_state *iv, + u64 initial_bytecount) +{ + ctx->state = *iv; + ctx->bytecount_lo = initial_bytecount; + ctx->bytecount_hi = 0; +} + +void sha384_init(struct sha384_ctx *ctx) +{ + __sha512_init(&ctx->ctx, &sha384_iv, 0); +} +EXPORT_SYMBOL_GPL(sha384_init); + +void sha512_init(struct sha512_ctx *ctx) +{ + __sha512_init(&ctx->ctx, &sha512_iv, 0); +} +EXPORT_SYMBOL_GPL(sha512_init); + +void __sha512_update(struct __sha512_ctx *ctx, const u8 *data, size_t len) +{ + size_t partial = ctx->bytecount_lo % SHA512_BLOCK_SIZE; + + if (check_add_overflow(ctx->bytecount_lo, len, &ctx->bytecount_lo)) + ctx->bytecount_hi++; + + if (partial + len >= SHA512_BLOCK_SIZE) { + size_t nblocks; + + if (partial) { + size_t l = SHA512_BLOCK_SIZE - partial; + + memcpy(&ctx->buf[partial], data, l); + data += l; + len -= l; + + sha512_blocks(&ctx->state, ctx->buf, 1); + } + + nblocks = len / SHA512_BLOCK_SIZE; + len %= SHA512_BLOCK_SIZE; + + if (nblocks) { + sha512_blocks(&ctx->state, data, nblocks); + data += nblocks * SHA512_BLOCK_SIZE; + } + partial = 0; + } + if (len) + memcpy(&ctx->buf[partial], data, len); +} +EXPORT_SYMBOL_GPL(__sha512_update); + +static void __sha512_final(struct __sha512_ctx *ctx, + u8 *out, size_t digest_size) +{ + u64 bitcount_hi = (ctx->bytecount_hi << 3) | (ctx->bytecount_lo >> 61); + u64 bitcount_lo = ctx->bytecount_lo << 3; + size_t partial = ctx->bytecount_lo % SHA512_BLOCK_SIZE; + + ctx->buf[partial++] = 0x80; + if (partial > SHA512_BLOCK_SIZE - 16) { + memset(&ctx->buf[partial], 0, 
SHA512_BLOCK_SIZE - partial); + sha512_blocks(&ctx->state, ctx->buf, 1); + partial = 0; + } + memset(&ctx->buf[partial], 0, SHA512_BLOCK_SIZE - 16 - partial); + *(__be64 *)&ctx->buf[SHA512_BLOCK_SIZE - 16] = cpu_to_be64(bitcount_hi); + *(__be64 *)&ctx->buf[SHA512_BLOCK_SIZE - 8] = cpu_to_be64(bitcount_lo); + sha512_blocks(&ctx->state, ctx->buf, 1); + + for (size_t i = 0; i < digest_size; i += 8) + put_unaligned_be64(ctx->state.h[i / 8], out + i); +} + +void sha384_final(struct sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]) +{ + __sha512_final(&ctx->ctx, out, SHA384_DIGEST_SIZE); + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL_GPL(sha384_final); + +void sha512_final(struct sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]) +{ + __sha512_final(&ctx->ctx, out, SHA512_DIGEST_SIZE); + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL_GPL(sha512_final); + +void sha384(const u8 *data, size_t len, u8 out[SHA384_DIGEST_SIZE]) +{ + struct sha384_ctx ctx; + + sha384_init(&ctx); + sha384_update(&ctx, data, len); + sha384_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(sha384); + +void sha512(const u8 *data, size_t len, u8 out[SHA512_DIGEST_SIZE]) +{ + struct sha512_ctx ctx; + + sha512_init(&ctx); + sha512_update(&ctx, data, len); + sha512_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(sha512); + +static void __hmac_sha512_preparekey(struct sha512_block_state *istate, + struct sha512_block_state *ostate, + const u8 *raw_key, size_t raw_key_len, + const struct sha512_block_state *iv) +{ + union { + u8 b[SHA512_BLOCK_SIZE]; + unsigned long w[SHA512_BLOCK_SIZE / sizeof(unsigned long)]; + } derived_key = { 0 }; + + if (unlikely(raw_key_len > SHA512_BLOCK_SIZE)) { + if (iv == &sha384_iv) + sha384(raw_key, raw_key_len, derived_key.b); + else + sha512(raw_key, raw_key_len, derived_key.b); + } else { + memcpy(derived_key.b, raw_key, raw_key_len); + } + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE); + *istate = *iv; + sha512_blocks(istate, derived_key.b, 1); + + for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) + derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^ + HMAC_IPAD_VALUE); + *ostate = *iv; + sha512_blocks(ostate, derived_key.b, 1); + + memzero_explicit(&derived_key, sizeof(derived_key)); +} + +void hmac_sha384_preparekey(struct hmac_sha384_key *key, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha512_preparekey(&key->key.istate, &key->key.ostate, + raw_key, raw_key_len, &sha384_iv); +} +EXPORT_SYMBOL_GPL(hmac_sha384_preparekey); + +void hmac_sha512_preparekey(struct hmac_sha512_key *key, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha512_preparekey(&key->key.istate, &key->key.ostate, + raw_key, raw_key_len, &sha512_iv); +} +EXPORT_SYMBOL_GPL(hmac_sha512_preparekey); + +void __hmac_sha512_init(struct __hmac_sha512_ctx *ctx, + const struct __hmac_sha512_key *key) +{ + __sha512_init(&ctx->sha_ctx, &key->istate, SHA512_BLOCK_SIZE); + ctx->ostate = key->ostate; +} +EXPORT_SYMBOL_GPL(__hmac_sha512_init); + +void hmac_sha384_init_usingrawkey(struct hmac_sha384_ctx *ctx, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha512_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate, + raw_key, raw_key_len, &sha384_iv); + ctx->ctx.sha_ctx.bytecount_lo = SHA512_BLOCK_SIZE; + ctx->ctx.sha_ctx.bytecount_hi = 0; +} +EXPORT_SYMBOL_GPL(hmac_sha384_init_usingrawkey); + +void hmac_sha512_init_usingrawkey(struct hmac_sha512_ctx *ctx, + const u8 *raw_key, size_t raw_key_len) +{ + __hmac_sha512_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate, 
+ raw_key, raw_key_len, &sha512_iv); + ctx->ctx.sha_ctx.bytecount_lo = SHA512_BLOCK_SIZE; + ctx->ctx.sha_ctx.bytecount_hi = 0; +} +EXPORT_SYMBOL_GPL(hmac_sha512_init_usingrawkey); + +static void __hmac_sha512_final(struct __hmac_sha512_ctx *ctx, + u8 *out, size_t digest_size) +{ + /* Generate the padded input for the outer hash in ctx->sha_ctx.buf. */ + __sha512_final(&ctx->sha_ctx, ctx->sha_ctx.buf, digest_size); + memset(&ctx->sha_ctx.buf[digest_size], 0, + SHA512_BLOCK_SIZE - digest_size); + ctx->sha_ctx.buf[digest_size] = 0x80; + *(__be32 *)&ctx->sha_ctx.buf[SHA512_BLOCK_SIZE - 4] = + cpu_to_be32(8 * (SHA512_BLOCK_SIZE + digest_size)); + + /* Compute the outer hash, which gives the HMAC value. */ + sha512_blocks(&ctx->ostate, ctx->sha_ctx.buf, 1); + for (size_t i = 0; i < digest_size; i += 8) + put_unaligned_be64(ctx->ostate.h[i / 8], out + i); + + memzero_explicit(ctx, sizeof(*ctx)); +} + +void hmac_sha384_final(struct hmac_sha384_ctx *ctx, + u8 out[SHA384_DIGEST_SIZE]) +{ + __hmac_sha512_final(&ctx->ctx, out, SHA384_DIGEST_SIZE); +} +EXPORT_SYMBOL_GPL(hmac_sha384_final); + +void hmac_sha512_final(struct hmac_sha512_ctx *ctx, + u8 out[SHA512_DIGEST_SIZE]) +{ + __hmac_sha512_final(&ctx->ctx, out, SHA512_DIGEST_SIZE); +} +EXPORT_SYMBOL_GPL(hmac_sha512_final); + +void hmac_sha384(const struct hmac_sha384_key *key, + const u8 *data, size_t data_len, u8 out[SHA384_DIGEST_SIZE]) +{ + struct hmac_sha384_ctx ctx; + + hmac_sha384_init(&ctx, key); + hmac_sha384_update(&ctx, data, data_len); + hmac_sha384_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha384); + +void hmac_sha512(const struct hmac_sha512_key *key, + const u8 *data, size_t data_len, u8 out[SHA512_DIGEST_SIZE]) +{ + struct hmac_sha512_ctx ctx; + + hmac_sha512_init(&ctx, key); + hmac_sha512_update(&ctx, data, data_len); + hmac_sha512_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha512); + +void hmac_sha384_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[SHA384_DIGEST_SIZE]) +{ + struct hmac_sha384_ctx ctx; + + hmac_sha384_init_usingrawkey(&ctx, raw_key, raw_key_len); + hmac_sha384_update(&ctx, data, data_len); + hmac_sha384_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha384_usingrawkey); + +void hmac_sha512_usingrawkey(const u8 *raw_key, size_t raw_key_len, + const u8 *data, size_t data_len, + u8 out[SHA512_DIGEST_SIZE]) +{ + struct hmac_sha512_ctx ctx; + + hmac_sha512_init_usingrawkey(&ctx, raw_key, raw_key_len); + hmac_sha512_update(&ctx, data, data_len); + hmac_sha512_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(hmac_sha512_usingrawkey); + +#ifdef sha512_mod_init_arch +static int __init sha512_mod_init(void) +{ + sha512_mod_init_arch(); + return 0; +} +subsys_initcall(sha512_mod_init); + +static void __exit sha512_mod_exit(void) +{ +} +module_exit(sha512_mod_exit); +#endif + +MODULE_DESCRIPTION("SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 library functions"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/sm3.c b/lib/crypto/sm3.c index efff0e267d84..c6b9ad8a3ac6 100644 --- a/lib/crypto/sm3.c +++ b/lib/crypto/sm3.c @@ -9,6 +9,7 @@ */ #include <crypto/sm3.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/string.h> diff --git a/lib/crypto/sparc/sha1.h b/lib/crypto/sparc/sha1.h new file mode 100644 index 000000000000..5015f93584b7 --- /dev/null +++ b/lib/crypto/sparc/sha1.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * SHA-1 accelerated using the sparc64 crypto opcodes + * + * Copyright (c) Alan Smithee. 
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> + * Copyright (c) Mathias Krause <minipli@googlemail.com> + */ + +#include <asm/elf.h> +#include <asm/opcodes.h> +#include <asm/pstate.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha1_opcodes); + +asmlinkage void sha1_sparc64_transform(struct sha1_block_state *state, + const u8 *data, size_t nblocks); + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_sha1_opcodes)) + sha1_sparc64_transform(state, data, nblocks); + else + sha1_blocks_generic(state, data, nblocks); +} + +#define sha1_mod_init_arch sha1_mod_init_arch +static inline void sha1_mod_init_arch(void) +{ + unsigned long cfr; + + if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) + return; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + if (!(cfr & CFR_SHA1)) + return; + + static_branch_enable(&have_sha1_opcodes); + pr_info("Using sparc64 sha1 opcode optimized SHA-1 implementation\n"); +} diff --git a/lib/crypto/sparc/sha1_asm.S b/lib/crypto/sparc/sha1_asm.S new file mode 100644 index 000000000000..00b46bac1b08 --- /dev/null +++ b/lib/crypto/sparc/sha1_asm.S @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> +#include <asm/opcodes.h> +#include <asm/visasm.h> + +ENTRY(sha1_sparc64_transform) + /* %o0 = digest, %o1 = data, %o2 = rounds */ + VISEntryHalf + ld [%o0 + 0x00], %f0 + ld [%o0 + 0x04], %f1 + ld [%o0 + 0x08], %f2 + andcc %o1, 0x7, %g0 + ld [%o0 + 0x0c], %f3 + bne,pn %xcc, 10f + ld [%o0 + 0x10], %f4 + +1: + ldd [%o1 + 0x00], %f8 + ldd [%o1 + 0x08], %f10 + ldd [%o1 + 0x10], %f12 + ldd [%o1 + 0x18], %f14 + ldd [%o1 + 0x20], %f16 + ldd [%o1 + 0x28], %f18 + ldd [%o1 + 0x30], %f20 + ldd [%o1 + 0x38], %f22 + + SHA1 + + subcc %o2, 1, %o2 + bne,pt %xcc, 1b + add %o1, 0x40, %o1 + +5: + st %f0, [%o0 + 0x00] + st %f1, [%o0 + 0x04] + st %f2, [%o0 + 0x08] + st %f3, [%o0 + 0x0c] + st %f4, [%o0 + 0x10] + retl + VISExitHalf +10: + alignaddr %o1, %g0, %o1 + + ldd [%o1 + 0x00], %f10 +1: + ldd [%o1 + 0x08], %f12 + ldd [%o1 + 0x10], %f14 + ldd [%o1 + 0x18], %f16 + ldd [%o1 + 0x20], %f18 + ldd [%o1 + 0x28], %f20 + ldd [%o1 + 0x30], %f22 + ldd [%o1 + 0x38], %f24 + ldd [%o1 + 0x40], %f26 + + faligndata %f10, %f12, %f8 + faligndata %f12, %f14, %f10 + faligndata %f14, %f16, %f12 + faligndata %f16, %f18, %f14 + faligndata %f18, %f20, %f16 + faligndata %f20, %f22, %f18 + faligndata %f22, %f24, %f20 + faligndata %f24, %f26, %f22 + + SHA1 + + subcc %o2, 1, %o2 + fsrc2 %f26, %f10 + bne,pt %xcc, 1b + add %o1, 0x40, %o1 + + ba,a,pt %xcc, 5b +ENDPROC(sha1_sparc64_transform) diff --git a/lib/crypto/sparc/sha256.h b/lib/crypto/sparc/sha256.h new file mode 100644 index 000000000000..1d10108eb195 --- /dev/null +++ b/lib/crypto/sparc/sha256.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * SHA-256 accelerated using the sparc64 sha256 opcodes + * + * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> + * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com> + */ + +#include <asm/elf.h> +#include <asm/opcodes.h> +#include <asm/pstate.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_opcodes); + +asmlinkage void sha256_sparc64_transform(struct sha256_block_state *state, + const u8 *data, size_t nblocks); + +static void sha256_blocks(struct 
sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_sha256_opcodes)) + sha256_sparc64_transform(state, data, nblocks); + else + sha256_blocks_generic(state, data, nblocks); +} + +#define sha256_mod_init_arch sha256_mod_init_arch +static inline void sha256_mod_init_arch(void) +{ + unsigned long cfr; + + if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) + return; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + if (!(cfr & CFR_SHA256)) + return; + + static_branch_enable(&have_sha256_opcodes); + pr_info("Using sparc64 sha256 opcode optimized SHA-256/SHA-224 implementation\n"); +} diff --git a/lib/crypto/sparc/sha256_asm.S b/lib/crypto/sparc/sha256_asm.S new file mode 100644 index 000000000000..ddcdd3daf31e --- /dev/null +++ b/lib/crypto/sparc/sha256_asm.S @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> +#include <asm/opcodes.h> +#include <asm/visasm.h> + +ENTRY(sha256_sparc64_transform) + /* %o0 = state, %o1 = data, %o2 = nblocks */ + VISEntryHalf + ld [%o0 + 0x00], %f0 + ld [%o0 + 0x04], %f1 + ld [%o0 + 0x08], %f2 + ld [%o0 + 0x0c], %f3 + ld [%o0 + 0x10], %f4 + ld [%o0 + 0x14], %f5 + andcc %o1, 0x7, %g0 + ld [%o0 + 0x18], %f6 + bne,pn %xcc, 10f + ld [%o0 + 0x1c], %f7 + +1: + ldd [%o1 + 0x00], %f8 + ldd [%o1 + 0x08], %f10 + ldd [%o1 + 0x10], %f12 + ldd [%o1 + 0x18], %f14 + ldd [%o1 + 0x20], %f16 + ldd [%o1 + 0x28], %f18 + ldd [%o1 + 0x30], %f20 + ldd [%o1 + 0x38], %f22 + + SHA256 + + subcc %o2, 1, %o2 + bne,pt %xcc, 1b + add %o1, 0x40, %o1 + +5: + st %f0, [%o0 + 0x00] + st %f1, [%o0 + 0x04] + st %f2, [%o0 + 0x08] + st %f3, [%o0 + 0x0c] + st %f4, [%o0 + 0x10] + st %f5, [%o0 + 0x14] + st %f6, [%o0 + 0x18] + st %f7, [%o0 + 0x1c] + retl + VISExitHalf +10: + alignaddr %o1, %g0, %o1 + + ldd [%o1 + 0x00], %f10 +1: + ldd [%o1 + 0x08], %f12 + ldd [%o1 + 0x10], %f14 + ldd [%o1 + 0x18], %f16 + ldd [%o1 + 0x20], %f18 + ldd [%o1 + 0x28], %f20 + ldd [%o1 + 0x30], %f22 + ldd [%o1 + 0x38], %f24 + ldd [%o1 + 0x40], %f26 + + faligndata %f10, %f12, %f8 + faligndata %f12, %f14, %f10 + faligndata %f14, %f16, %f12 + faligndata %f16, %f18, %f14 + faligndata %f18, %f20, %f16 + faligndata %f20, %f22, %f18 + faligndata %f22, %f24, %f20 + faligndata %f24, %f26, %f22 + + SHA256 + + subcc %o2, 1, %o2 + fsrc2 %f26, %f10 + bne,pt %xcc, 1b + add %o1, 0x40, %o1 + + ba,a,pt %xcc, 5b +ENDPROC(sha256_sparc64_transform) diff --git a/lib/crypto/sparc/sha512.h b/lib/crypto/sparc/sha512.h new file mode 100644 index 000000000000..55303ab6b15f --- /dev/null +++ b/lib/crypto/sparc/sha512.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * SHA-512 accelerated using the sparc64 sha512 opcodes + * + * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) 2003 Kyle McMartin <kyle@debian.org> + */ + +#include <asm/elf.h> +#include <asm/opcodes.h> +#include <asm/pstate.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha512_opcodes); + +asmlinkage void sha512_sparc64_transform(struct sha512_block_state *state, + const u8 *data, size_t nblocks); + +static void sha512_blocks(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_sha512_opcodes)) + sha512_sparc64_transform(state, data, nblocks); + else + sha512_blocks_generic(state, data, nblocks); +} + +#define sha512_mod_init_arch sha512_mod_init_arch +static inline void sha512_mod_init_arch(void) +{ + unsigned long cfr; + + if (!(sparc64_elf_hwcap & 
HWCAP_SPARC_CRYPTO)) + return; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + if (!(cfr & CFR_SHA512)) + return; + + static_branch_enable(&have_sha512_opcodes); + pr_info("Using sparc64 sha512 opcode optimized SHA-512/SHA-384 implementation\n"); +} diff --git a/lib/crypto/sparc/sha512_asm.S b/lib/crypto/sparc/sha512_asm.S new file mode 100644 index 000000000000..9932b4fe1b59 --- /dev/null +++ b/lib/crypto/sparc/sha512_asm.S @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> +#include <asm/opcodes.h> +#include <asm/visasm.h> + +ENTRY(sha512_sparc64_transform) + /* %o0 = digest, %o1 = data, %o2 = rounds */ + VISEntry + ldd [%o0 + 0x00], %f0 + ldd [%o0 + 0x08], %f2 + ldd [%o0 + 0x10], %f4 + ldd [%o0 + 0x18], %f6 + ldd [%o0 + 0x20], %f8 + ldd [%o0 + 0x28], %f10 + andcc %o1, 0x7, %g0 + ldd [%o0 + 0x30], %f12 + bne,pn %xcc, 10f + ldd [%o0 + 0x38], %f14 + +1: + ldd [%o1 + 0x00], %f16 + ldd [%o1 + 0x08], %f18 + ldd [%o1 + 0x10], %f20 + ldd [%o1 + 0x18], %f22 + ldd [%o1 + 0x20], %f24 + ldd [%o1 + 0x28], %f26 + ldd [%o1 + 0x30], %f28 + ldd [%o1 + 0x38], %f30 + ldd [%o1 + 0x40], %f32 + ldd [%o1 + 0x48], %f34 + ldd [%o1 + 0x50], %f36 + ldd [%o1 + 0x58], %f38 + ldd [%o1 + 0x60], %f40 + ldd [%o1 + 0x68], %f42 + ldd [%o1 + 0x70], %f44 + ldd [%o1 + 0x78], %f46 + + SHA512 + + subcc %o2, 1, %o2 + bne,pt %xcc, 1b + add %o1, 0x80, %o1 + +5: + std %f0, [%o0 + 0x00] + std %f2, [%o0 + 0x08] + std %f4, [%o0 + 0x10] + std %f6, [%o0 + 0x18] + std %f8, [%o0 + 0x20] + std %f10, [%o0 + 0x28] + std %f12, [%o0 + 0x30] + std %f14, [%o0 + 0x38] + retl + VISExit +10: + alignaddr %o1, %g0, %o1 + + ldd [%o1 + 0x00], %f18 +1: + ldd [%o1 + 0x08], %f20 + ldd [%o1 + 0x10], %f22 + ldd [%o1 + 0x18], %f24 + ldd [%o1 + 0x20], %f26 + ldd [%o1 + 0x28], %f28 + ldd [%o1 + 0x30], %f30 + ldd [%o1 + 0x38], %f32 + ldd [%o1 + 0x40], %f34 + ldd [%o1 + 0x48], %f36 + ldd [%o1 + 0x50], %f38 + ldd [%o1 + 0x58], %f40 + ldd [%o1 + 0x60], %f42 + ldd [%o1 + 0x68], %f44 + ldd [%o1 + 0x70], %f46 + ldd [%o1 + 0x78], %f48 + ldd [%o1 + 0x80], %f50 + + faligndata %f18, %f20, %f16 + faligndata %f20, %f22, %f18 + faligndata %f22, %f24, %f20 + faligndata %f24, %f26, %f22 + faligndata %f26, %f28, %f24 + faligndata %f28, %f30, %f26 + faligndata %f30, %f32, %f28 + faligndata %f32, %f34, %f30 + faligndata %f34, %f36, %f32 + faligndata %f36, %f38, %f34 + faligndata %f38, %f40, %f36 + faligndata %f40, %f42, %f38 + faligndata %f42, %f44, %f40 + faligndata %f44, %f46, %f42 + faligndata %f46, %f48, %f44 + faligndata %f48, %f50, %f46 + + SHA512 + + subcc %o2, 1, %o2 + fsrc2 %f50, %f18 + bne,pt %xcc, 1b + add %o1, 0x80, %o1 + + ba,a,pt %xcc, 5b +ENDPROC(sha512_sparc64_transform) diff --git a/lib/crypto/tests/Kconfig b/lib/crypto/tests/Kconfig new file mode 100644 index 000000000000..de7e8babb6af --- /dev/null +++ b/lib/crypto/tests/Kconfig @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +config CRYPTO_LIB_POLY1305_KUNIT_TEST + tristate "KUnit tests for Poly1305" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS + select CRYPTO_LIB_BENCHMARK_VISIBLE + select CRYPTO_LIB_POLY1305 + help + KUnit tests for the Poly1305 library functions. + +config CRYPTO_LIB_SHA1_KUNIT_TEST + tristate "KUnit tests for SHA-1" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS + select CRYPTO_LIB_BENCHMARK_VISIBLE + select CRYPTO_LIB_SHA1 + help + KUnit tests for the SHA-1 cryptographic hash function and its + corresponding HMAC. 
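+
+# Illustrative note (not part of this patch): assuming the standard KUnit
+# workflow, a minimal .kunitconfig fragment that builds the hash library
+# KUnit tests, with the optional benchmarks enabled, might look like:
+#
+#	CONFIG_KUNIT=y
+#	CONFIG_CRYPTO_LIB_SHA1_KUNIT_TEST=y
+#	CONFIG_CRYPTO_LIB_SHA256_KUNIT_TEST=y
+#	CONFIG_CRYPTO_LIB_SHA512_KUNIT_TEST=y
+#	CONFIG_CRYPTO_LIB_BENCHMARK=y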
+ +# Option is named *_SHA256_KUNIT_TEST, though both SHA-224 and SHA-256 tests are +# included, for consistency with the naming used elsewhere (e.g. CRYPTO_SHA256). +config CRYPTO_LIB_SHA256_KUNIT_TEST + tristate "KUnit tests for SHA-224 and SHA-256" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS + select CRYPTO_LIB_BENCHMARK_VISIBLE + select CRYPTO_LIB_SHA256 + help + KUnit tests for the SHA-224 and SHA-256 cryptographic hash functions + and their corresponding HMACs. + +# Option is named *_SHA512_KUNIT_TEST, though both SHA-384 and SHA-512 tests are +# included, for consistency with the naming used elsewhere (e.g. CRYPTO_SHA512). +config CRYPTO_LIB_SHA512_KUNIT_TEST + tristate "KUnit tests for SHA-384 and SHA-512" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS + select CRYPTO_LIB_BENCHMARK_VISIBLE + select CRYPTO_LIB_SHA512 + help + KUnit tests for the SHA-384 and SHA-512 cryptographic hash functions + and their corresponding HMACs. + +config CRYPTO_LIB_BENCHMARK_VISIBLE + bool + +config CRYPTO_LIB_BENCHMARK + bool "Include benchmarks in KUnit tests for cryptographic functions" + depends on CRYPTO_LIB_BENCHMARK_VISIBLE + help + Include benchmarks in the KUnit tests for cryptographic functions. + The benchmark results are printed to the kernel log when the + corresponding KUnit test suite runs. + + This is useful for evaluating the performance of the cryptographic + functions. However, it will increase the runtime of the KUnit tests. + + If you're only interested in correctness testing, leave this disabled. diff --git a/lib/crypto/tests/Makefile b/lib/crypto/tests/Makefile new file mode 100644 index 000000000000..8601dccd6fdd --- /dev/null +++ b/lib/crypto/tests/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +obj-$(CONFIG_CRYPTO_LIB_POLY1305_KUNIT_TEST) += poly1305_kunit.o +obj-$(CONFIG_CRYPTO_LIB_SHA1_KUNIT_TEST) += sha1_kunit.o +obj-$(CONFIG_CRYPTO_LIB_SHA256_KUNIT_TEST) += sha224_kunit.o sha256_kunit.o +obj-$(CONFIG_CRYPTO_LIB_SHA512_KUNIT_TEST) += sha384_kunit.o sha512_kunit.o diff --git a/lib/crypto/tests/hash-test-template.h b/lib/crypto/tests/hash-test-template.h new file mode 100644 index 000000000000..f437a0a9ac6c --- /dev/null +++ b/lib/crypto/tests/hash-test-template.h @@ -0,0 +1,683 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Test cases for hash functions, including a benchmark. This is included by + * KUnit test suites that want to use it. See sha512_kunit.c for an example. + * + * Copyright 2025 Google LLC + */ +#include <kunit/test.h> +#include <linux/hrtimer.h> +#include <linux/timekeeping.h> +#include <linux/vmalloc.h> +#include <linux/workqueue.h> + +/* test_buf is a guarded buffer, i.e. &test_buf[TEST_BUF_LEN] is not mapped. */ +#define TEST_BUF_LEN 16384 +static u8 *test_buf; + +static u8 *orig_test_buf; + +static u64 random_seed; + +/* + * This is a simple linear congruential generator. It is used only for testing, + * which does not require cryptographically secure random numbers. A hard-coded + * algorithm is used instead of <linux/prandom.h> so that it matches the + * algorithm used by the test vector generation script. This allows the input + * data in random test vectors to be concisely stored as just the seed. 
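+ * (For reference: the constants used below, multiplier 0x5deece66d
+ * (25214903917), increment 11, and modulus 2**48 with bits 47..16 of the
+ * state returned, are the classic drand48/java.util.Random parameters.)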
+ */ +static u32 rand32(void) +{ + random_seed = (random_seed * 25214903917 + 11) & ((1ULL << 48) - 1); + return random_seed >> 16; +} + +static void rand_bytes(u8 *out, size_t len) +{ + for (size_t i = 0; i < len; i++) + out[i] = rand32(); +} + +static void rand_bytes_seeded_from_len(u8 *out, size_t len) +{ + random_seed = len; + rand_bytes(out, len); +} + +static bool rand_bool(void) +{ + return rand32() % 2; +} + +/* Generate a random length, preferring small lengths. */ +static size_t rand_length(size_t max_len) +{ + size_t len; + + switch (rand32() % 3) { + case 0: + len = rand32() % 128; + break; + case 1: + len = rand32() % 3072; + break; + default: + len = rand32(); + break; + } + return len % (max_len + 1); +} + +static size_t rand_offset(size_t max_offset) +{ + return min(rand32() % 128, max_offset); +} + +static int hash_suite_init(struct kunit_suite *suite) +{ + /* + * Allocate the test buffer using vmalloc() with a page-aligned length + * so that it is immediately followed by a guard page. This allows + * buffer overreads to be detected, even in assembly code. + */ + size_t alloc_len = round_up(TEST_BUF_LEN, PAGE_SIZE); + + orig_test_buf = vmalloc(alloc_len); + if (!orig_test_buf) + return -ENOMEM; + + test_buf = orig_test_buf + alloc_len - TEST_BUF_LEN; + return 0; +} + +static void hash_suite_exit(struct kunit_suite *suite) +{ + vfree(orig_test_buf); + orig_test_buf = NULL; + test_buf = NULL; +} + +/* + * Test the hash function against a list of test vectors. + * + * Note that it's only necessary to run each test vector in one way (e.g., + * one-shot instead of incremental), since consistency between different ways of + * using the APIs is verified by other test cases. + */ +static void test_hash_test_vectors(struct kunit *test) +{ + for (size_t i = 0; i < ARRAY_SIZE(hash_testvecs); i++) { + size_t data_len = hash_testvecs[i].data_len; + u8 actual_hash[HASH_SIZE]; + + KUNIT_ASSERT_LE(test, data_len, TEST_BUF_LEN); + rand_bytes_seeded_from_len(test_buf, data_len); + + HASH(test_buf, data_len, actual_hash); + KUNIT_ASSERT_MEMEQ_MSG( + test, actual_hash, hash_testvecs[i].digest, HASH_SIZE, + "Wrong result with test vector %zu; data_len=%zu", i, + data_len); + } +} + +/* + * Test that the hash function produces correct results for *every* length up to + * 4096 bytes. To do this, generate seeded random data, then calculate a hash + * value for each length 0..4096, then hash the hash values. Verify just the + * final hash value, which should match only when all hash values were correct. + */ +static void test_hash_all_lens_up_to_4096(struct kunit *test) +{ + struct HASH_CTX ctx; + u8 hash[HASH_SIZE]; + + static_assert(TEST_BUF_LEN >= 4096); + rand_bytes_seeded_from_len(test_buf, 4096); + HASH_INIT(&ctx); + for (size_t len = 0; len <= 4096; len++) { + HASH(test_buf, len, hash); + HASH_UPDATE(&ctx, hash, HASH_SIZE); + } + HASH_FINAL(&ctx, hash); + KUNIT_ASSERT_MEMEQ(test, hash, hash_testvec_consolidated, HASH_SIZE); +} + +/* + * Test that the hash function produces the same result with a one-shot + * computation as it does with an incremental computation. 
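+ *
+ * In terms of the macros used by this template, the two results being
+ * compared are, schematically:
+ *
+ *	HASH(data, len, hash1);				// one-shot
+ * versus
+ *	HASH_INIT(&ctx);
+ *	HASH_UPDATE(&ctx, part, part_len);		// once per part
+ *	HASH_FINAL(&ctx, hash2);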
+ */ +static void test_hash_incremental_updates(struct kunit *test) +{ + for (int i = 0; i < 1000; i++) { + size_t total_len, offset; + struct HASH_CTX ctx; + u8 hash1[HASH_SIZE]; + u8 hash2[HASH_SIZE]; + size_t num_parts = 0; + size_t remaining_len, cur_offset; + + total_len = rand_length(TEST_BUF_LEN); + offset = rand_offset(TEST_BUF_LEN - total_len); + rand_bytes(&test_buf[offset], total_len); + + /* Compute the hash value in one shot. */ + HASH(&test_buf[offset], total_len, hash1); + + /* + * Compute the hash value incrementally, using a randomly + * selected sequence of update lengths that sum to total_len. + */ + HASH_INIT(&ctx); + remaining_len = total_len; + cur_offset = offset; + while (rand_bool()) { + size_t part_len = rand_length(remaining_len); + + HASH_UPDATE(&ctx, &test_buf[cur_offset], part_len); + num_parts++; + cur_offset += part_len; + remaining_len -= part_len; + } + if (remaining_len != 0 || rand_bool()) { + HASH_UPDATE(&ctx, &test_buf[cur_offset], remaining_len); + num_parts++; + } + HASH_FINAL(&ctx, hash2); + + /* Verify that the two hash values are the same. */ + KUNIT_ASSERT_MEMEQ_MSG( + test, hash1, hash2, HASH_SIZE, + "Incremental test failed with total_len=%zu num_parts=%zu offset=%zu", + total_len, num_parts, offset); + } +} + +/* + * Test that the hash function does not overrun any buffers. Uses a guard page + * to catch buffer overruns even if they occur in assembly code. + */ +static void test_hash_buffer_overruns(struct kunit *test) +{ + const size_t max_tested_len = TEST_BUF_LEN - sizeof(struct HASH_CTX); + void *const buf_end = &test_buf[TEST_BUF_LEN]; + struct HASH_CTX *guarded_ctx = buf_end - sizeof(*guarded_ctx); + + rand_bytes(test_buf, TEST_BUF_LEN); + + for (int i = 0; i < 100; i++) { + size_t len = rand_length(max_tested_len); + struct HASH_CTX ctx; + u8 hash[HASH_SIZE]; + + /* Check for overruns of the data buffer. */ + HASH(buf_end - len, len, hash); + HASH_INIT(&ctx); + HASH_UPDATE(&ctx, buf_end - len, len); + HASH_FINAL(&ctx, hash); + + /* Check for overruns of the hash value buffer. */ + HASH(test_buf, len, buf_end - HASH_SIZE); + HASH_INIT(&ctx); + HASH_UPDATE(&ctx, test_buf, len); + HASH_FINAL(&ctx, buf_end - HASH_SIZE); + + /* Check for overuns of the hash context. */ + HASH_INIT(guarded_ctx); + HASH_UPDATE(guarded_ctx, test_buf, len); + HASH_FINAL(guarded_ctx, hash); + } +} + +/* + * Test that the caller is permitted to alias the output digest and source data + * buffer, and also modify the source data buffer after it has been used. + */ +static void test_hash_overlaps(struct kunit *test) +{ + const size_t max_tested_len = TEST_BUF_LEN - HASH_SIZE; + struct HASH_CTX ctx; + u8 hash[HASH_SIZE]; + + rand_bytes(test_buf, TEST_BUF_LEN); + + for (int i = 0; i < 100; i++) { + size_t len = rand_length(max_tested_len); + size_t offset = HASH_SIZE + rand_offset(max_tested_len - len); + bool left_end = rand_bool(); + u8 *ovl_hash = left_end ? 
&test_buf[offset] : + &test_buf[offset + len - HASH_SIZE]; + + HASH(&test_buf[offset], len, hash); + HASH(&test_buf[offset], len, ovl_hash); + KUNIT_ASSERT_MEMEQ_MSG( + test, hash, ovl_hash, HASH_SIZE, + "Overlap test 1 failed with len=%zu offset=%zu left_end=%d", + len, offset, left_end); + + /* Repeat the above test, but this time use init+update+final */ + HASH(&test_buf[offset], len, hash); + HASH_INIT(&ctx); + HASH_UPDATE(&ctx, &test_buf[offset], len); + HASH_FINAL(&ctx, ovl_hash); + KUNIT_ASSERT_MEMEQ_MSG( + test, hash, ovl_hash, HASH_SIZE, + "Overlap test 2 failed with len=%zu offset=%zu left_end=%d", + len, offset, left_end); + + /* Test modifying the source data after it was used. */ + HASH(&test_buf[offset], len, hash); + HASH_INIT(&ctx); + HASH_UPDATE(&ctx, &test_buf[offset], len); + rand_bytes(&test_buf[offset], len); + HASH_FINAL(&ctx, ovl_hash); + KUNIT_ASSERT_MEMEQ_MSG( + test, hash, ovl_hash, HASH_SIZE, + "Overlap test 3 failed with len=%zu offset=%zu left_end=%d", + len, offset, left_end); + } +} + +/* + * Test that if the same data is hashed at different alignments in memory, the + * results are the same. + */ +static void test_hash_alignment_consistency(struct kunit *test) +{ + u8 hash1[128 + HASH_SIZE]; + u8 hash2[128 + HASH_SIZE]; + + for (int i = 0; i < 100; i++) { + size_t len = rand_length(TEST_BUF_LEN); + size_t data_offs1 = rand_offset(TEST_BUF_LEN - len); + size_t data_offs2 = rand_offset(TEST_BUF_LEN - len); + size_t hash_offs1 = rand_offset(128); + size_t hash_offs2 = rand_offset(128); + + rand_bytes(&test_buf[data_offs1], len); + HASH(&test_buf[data_offs1], len, &hash1[hash_offs1]); + memmove(&test_buf[data_offs2], &test_buf[data_offs1], len); + HASH(&test_buf[data_offs2], len, &hash2[hash_offs2]); + KUNIT_ASSERT_MEMEQ_MSG( + test, &hash1[hash_offs1], &hash2[hash_offs2], HASH_SIZE, + "Alignment consistency test failed with len=%zu data_offs=(%zu,%zu) hash_offs=(%zu,%zu)", + len, data_offs1, data_offs2, hash_offs1, hash_offs2); + } +} + +/* Test that HASH_FINAL zeroizes the context. 
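+ * Zeroization on finalization matters because the context can hold
+ * sensitive material (for HMAC, key-derived state) that callers reasonably
+ * expect to be wiped once the digest has been produced.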
*/ +static void test_hash_ctx_zeroization(struct kunit *test) +{ + static const u8 zeroes[sizeof(struct HASH_CTX)]; + struct HASH_CTX ctx; + + rand_bytes(test_buf, 128); + HASH_INIT(&ctx); + HASH_UPDATE(&ctx, test_buf, 128); + HASH_FINAL(&ctx, test_buf); + KUNIT_ASSERT_MEMEQ_MSG(test, &ctx, zeroes, sizeof(ctx), + "Hash context was not zeroized by finalization"); +} + +#define IRQ_TEST_HRTIMER_INTERVAL us_to_ktime(5) + +struct hash_irq_test_state { + bool (*func)(void *test_specific_state); + void *test_specific_state; + bool task_func_reported_failure; + bool hardirq_func_reported_failure; + bool softirq_func_reported_failure; + unsigned long hardirq_func_calls; + unsigned long softirq_func_calls; + struct hrtimer timer; + struct work_struct bh_work; +}; + +static enum hrtimer_restart hash_irq_test_timer_func(struct hrtimer *timer) +{ + struct hash_irq_test_state *state = + container_of(timer, typeof(*state), timer); + + WARN_ON_ONCE(!in_hardirq()); + state->hardirq_func_calls++; + + if (!state->func(state->test_specific_state)) + state->hardirq_func_reported_failure = true; + + hrtimer_forward_now(&state->timer, IRQ_TEST_HRTIMER_INTERVAL); + queue_work(system_bh_wq, &state->bh_work); + return HRTIMER_RESTART; +} + +static void hash_irq_test_bh_work_func(struct work_struct *work) +{ + struct hash_irq_test_state *state = + container_of(work, typeof(*state), bh_work); + + WARN_ON_ONCE(!in_serving_softirq()); + state->softirq_func_calls++; + + if (!state->func(state->test_specific_state)) + state->softirq_func_reported_failure = true; +} + +/* + * Helper function which repeatedly runs the given @func in task, softirq, and + * hardirq context concurrently, and reports a failure to KUnit if any + * invocation of @func in any context returns false. @func is passed + * @test_specific_state as its argument. At most 3 invocations of @func will + * run concurrently: one in each of task, softirq, and hardirq context. + * + * The main purpose of this interrupt context testing is to validate fallback + * code paths that run in contexts where the normal code path cannot be used, + * typically due to the FPU or vector registers already being in-use in kernel + * mode. These code paths aren't covered when the test code is executed only by + * the KUnit test runner thread in task context. The reason for the concurrency + * is because merely using hardirq context is not sufficient to reach a fallback + * code path on some architectures; the hardirq actually has to occur while the + * FPU or vector unit was already in-use in kernel mode. + * + * Another purpose of this testing is to detect issues with the architecture's + * irq_fpu_usable() and kernel_fpu_begin/end() or equivalent functions, + * especially in softirq context when the softirq may have interrupted a task + * already using kernel-mode FPU or vector (if the arch didn't prevent that). + * Crypto functions are often executed in softirqs, so this is important. + */ +static void run_irq_test(struct kunit *test, bool (*func)(void *), + int max_iterations, void *test_specific_state) +{ + struct hash_irq_test_state state = { + .func = func, + .test_specific_state = test_specific_state, + }; + unsigned long end_jiffies; + + /* + * Set up a hrtimer (the way we access hardirq context) and a work + * struct for the BH workqueue (the way we access softirq context). 
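+ * The timer uses HRTIMER_MODE_REL_HARD so that its callback is guaranteed
+ * to run in hard interrupt context rather than being deferred to softirq
+ * or a thread (as can happen for normal hrtimers, e.g. on PREEMPT_RT).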
+ */ + hrtimer_setup_on_stack(&state.timer, hash_irq_test_timer_func, + CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); + INIT_WORK_ONSTACK(&state.bh_work, hash_irq_test_bh_work_func); + + /* Run for up to max_iterations or 1 second, whichever comes first. */ + end_jiffies = jiffies + HZ; + hrtimer_start(&state.timer, IRQ_TEST_HRTIMER_INTERVAL, + HRTIMER_MODE_REL_HARD); + for (int i = 0; i < max_iterations && !time_after(jiffies, end_jiffies); + i++) { + if (!func(test_specific_state)) + state.task_func_reported_failure = true; + } + + /* Cancel the timer and work. */ + hrtimer_cancel(&state.timer); + flush_work(&state.bh_work); + + /* Sanity check: the timer and BH functions should have been run. */ + KUNIT_EXPECT_GT_MSG(test, state.hardirq_func_calls, 0, + "Timer function was not called"); + KUNIT_EXPECT_GT_MSG(test, state.softirq_func_calls, 0, + "BH work function was not called"); + + /* Check for incorrect hash values reported from any context. */ + KUNIT_EXPECT_FALSE_MSG( + test, state.task_func_reported_failure, + "Incorrect hash values reported from task context"); + KUNIT_EXPECT_FALSE_MSG( + test, state.hardirq_func_reported_failure, + "Incorrect hash values reported from hardirq context"); + KUNIT_EXPECT_FALSE_MSG( + test, state.softirq_func_reported_failure, + "Incorrect hash values reported from softirq context"); +} + +#define IRQ_TEST_DATA_LEN 256 +#define IRQ_TEST_NUM_BUFFERS 3 /* matches max concurrency level */ + +struct hash_irq_test1_state { + u8 expected_hashes[IRQ_TEST_NUM_BUFFERS][HASH_SIZE]; + atomic_t seqno; +}; + +/* + * Compute the hash of one of the test messages and verify that it matches the + * expected hash from @state->expected_hashes. To increase the chance of + * detecting problems, cycle through multiple messages. + */ +static bool hash_irq_test1_func(void *state_) +{ + struct hash_irq_test1_state *state = state_; + u32 i = (u32)atomic_inc_return(&state->seqno) % IRQ_TEST_NUM_BUFFERS; + u8 actual_hash[HASH_SIZE]; + + HASH(&test_buf[i * IRQ_TEST_DATA_LEN], IRQ_TEST_DATA_LEN, actual_hash); + return memcmp(actual_hash, state->expected_hashes[i], HASH_SIZE) == 0; +} + +/* + * Test that if hashes are computed in task, softirq, and hardirq context + * concurrently, then all results are as expected. + */ +static void test_hash_interrupt_context_1(struct kunit *test) +{ + struct hash_irq_test1_state state = {}; + + /* Prepare some test messages and compute the expected hash of each. */ + rand_bytes(test_buf, IRQ_TEST_NUM_BUFFERS * IRQ_TEST_DATA_LEN); + for (int i = 0; i < IRQ_TEST_NUM_BUFFERS; i++) + HASH(&test_buf[i * IRQ_TEST_DATA_LEN], IRQ_TEST_DATA_LEN, + state.expected_hashes[i]); + + run_irq_test(test, hash_irq_test1_func, 100000, &state); +} + +struct hash_irq_test2_hash_ctx { + struct HASH_CTX hash_ctx; + atomic_t in_use; + int offset; + int step; +}; + +struct hash_irq_test2_state { + struct hash_irq_test2_hash_ctx ctxs[IRQ_TEST_NUM_BUFFERS]; + u8 expected_hash[HASH_SIZE]; + u16 update_lens[32]; + int num_steps; +}; + +static bool hash_irq_test2_func(void *state_) +{ + struct hash_irq_test2_state *state = state_; + struct hash_irq_test2_hash_ctx *ctx; + bool ret = true; + + for (ctx = &state->ctxs[0]; ctx < &state->ctxs[ARRAY_SIZE(state->ctxs)]; + ctx++) { + if (atomic_cmpxchg(&ctx->in_use, 0, 1) == 0) + break; + } + if (WARN_ON_ONCE(ctx == &state->ctxs[ARRAY_SIZE(state->ctxs)])) { + /* + * This should never happen, as the number of contexts is equal + * to the maximum concurrency level of run_irq_test(). 
+ */ + return false; + } + + if (ctx->step == 0) { + /* Init step */ + HASH_INIT(&ctx->hash_ctx); + ctx->offset = 0; + ctx->step++; + } else if (ctx->step < state->num_steps - 1) { + /* Update step */ + HASH_UPDATE(&ctx->hash_ctx, &test_buf[ctx->offset], + state->update_lens[ctx->step - 1]); + ctx->offset += state->update_lens[ctx->step - 1]; + ctx->step++; + } else { + /* Final step */ + u8 actual_hash[HASH_SIZE]; + + if (WARN_ON_ONCE(ctx->offset != TEST_BUF_LEN)) + ret = false; + HASH_FINAL(&ctx->hash_ctx, actual_hash); + if (memcmp(actual_hash, state->expected_hash, HASH_SIZE) != 0) + ret = false; + ctx->step = 0; + } + atomic_set_release(&ctx->in_use, 0); + return ret; +} + +/* + * Test that if hashes are computed in task, softirq, and hardirq context + * concurrently, *including doing different parts of the same incremental + * computation in different contexts*, then all results are as expected. + * Besides detecting bugs similar to those that test_hash_interrupt_context_1 + * can detect, this test case can also detect bugs where hash function + * implementations don't correctly handle these mixed incremental computations. + */ +static void test_hash_interrupt_context_2(struct kunit *test) +{ + struct hash_irq_test2_state *state; + int remaining = TEST_BUF_LEN; + + state = kunit_kzalloc(test, sizeof(*state), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, state); + + rand_bytes(test_buf, TEST_BUF_LEN); + HASH(test_buf, TEST_BUF_LEN, state->expected_hash); + + /* + * Generate a list of update lengths to use. Ensure that it contains + * multiple entries but is limited to a maximum length. + */ + static_assert(TEST_BUF_LEN / 4096 > 1); + for (state->num_steps = 0; + state->num_steps < ARRAY_SIZE(state->update_lens) - 1 && remaining; + state->num_steps++) { + state->update_lens[state->num_steps] = + rand_length(min(remaining, 4096)); + remaining -= state->update_lens[state->num_steps]; + } + if (remaining) + state->update_lens[state->num_steps++] = remaining; + state->num_steps += 2; /* for init and final */ + + run_irq_test(test, hash_irq_test2_func, 250000, state); +} + +#define UNKEYED_HASH_KUNIT_CASES \ + KUNIT_CASE(test_hash_test_vectors), \ + KUNIT_CASE(test_hash_all_lens_up_to_4096), \ + KUNIT_CASE(test_hash_incremental_updates), \ + KUNIT_CASE(test_hash_buffer_overruns), \ + KUNIT_CASE(test_hash_overlaps), \ + KUNIT_CASE(test_hash_alignment_consistency), \ + KUNIT_CASE(test_hash_ctx_zeroization), \ + KUNIT_CASE(test_hash_interrupt_context_1), \ + KUNIT_CASE(test_hash_interrupt_context_2) +/* benchmark_hash is omitted so that the suites can put it last. */ + +#ifdef HMAC +/* + * Test the corresponding HMAC variant. + * + * This test case is fairly short, since HMAC is just a simple C wrapper around + * the underlying unkeyed hash function, which is already well-tested by the + * other test cases. It's not useful to test things like data alignment or + * interrupt context again for HMAC, nor to have a long list of test vectors. + * + * Thus, just do a single consolidated test, which covers all data lengths up to + * 4096 bytes and all key lengths up to 292 bytes. For each data length, select + * a key length, generate the inputs from a seed, and compute the HMAC value. + * Concatenate all these HMAC values together, and compute the HMAC of that. + * Verify that value. If this fails, then the HMAC implementation is wrong. + * This won't show which specific input failed, but that should be fine. Any + * failure would likely be non-input-specific or also show in the unkeyed tests. 
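+ *
+ * For reference, the construction being exercised here is standard HMAC
+ * (RFC 2104): HMAC(K, m) = H((K' ^ opad) || H((K' ^ ipad) || m)), where K'
+ * is the key zero-padded (or first hashed, if longer than a block) to the
+ * hash function's block size.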
+ */ +static void test_hmac(struct kunit *test) +{ + static const u8 zeroes[sizeof(struct HMAC_CTX)]; + u8 *raw_key; + struct HMAC_KEY key; + struct HMAC_CTX ctx; + u8 mac[HASH_SIZE]; + u8 mac2[HASH_SIZE]; + + static_assert(TEST_BUF_LEN >= 4096 + 293); + rand_bytes_seeded_from_len(test_buf, 4096); + raw_key = &test_buf[4096]; + + rand_bytes_seeded_from_len(raw_key, 32); + HMAC_PREPAREKEY(&key, raw_key, 32); + HMAC_INIT(&ctx, &key); + for (size_t data_len = 0; data_len <= 4096; data_len++) { + /* + * Cycle through key lengths as well. Somewhat arbitrarily go + * up to 293, which is somewhat larger than the largest hash + * block size (which is the size at which the key starts being + * hashed down to one block); going higher would not be useful. + * To reduce correlation with data_len, use a prime number here. + */ + size_t key_len = data_len % 293; + + HMAC_UPDATE(&ctx, test_buf, data_len); + + rand_bytes_seeded_from_len(raw_key, key_len); + HMAC_USINGRAWKEY(raw_key, key_len, test_buf, data_len, mac); + HMAC_UPDATE(&ctx, mac, HASH_SIZE); + + /* Verify that HMAC() is consistent with HMAC_USINGRAWKEY(). */ + HMAC_PREPAREKEY(&key, raw_key, key_len); + HMAC(&key, test_buf, data_len, mac2); + KUNIT_ASSERT_MEMEQ_MSG( + test, mac, mac2, HASH_SIZE, + "HMAC gave different results with raw and prepared keys"); + } + HMAC_FINAL(&ctx, mac); + KUNIT_EXPECT_MEMEQ_MSG(test, mac, hmac_testvec_consolidated, HASH_SIZE, + "HMAC gave wrong result"); + KUNIT_EXPECT_MEMEQ_MSG(test, &ctx, zeroes, sizeof(ctx), + "HMAC context was not zeroized by finalization"); +} +#define HASH_KUNIT_CASES UNKEYED_HASH_KUNIT_CASES, KUNIT_CASE(test_hmac) +#else +#define HASH_KUNIT_CASES UNKEYED_HASH_KUNIT_CASES +#endif + +/* Benchmark the hash function on various data lengths. */ +static void benchmark_hash(struct kunit *test) +{ + static const size_t lens_to_test[] = { + 1, 16, 64, 127, 128, 200, 256, + 511, 512, 1024, 3173, 4096, 16384, + }; + u8 hash[HASH_SIZE]; + + if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK)) + kunit_skip(test, "not enabled"); + + /* Warm-up */ + for (size_t i = 0; i < 10000000; i += TEST_BUF_LEN) + HASH(test_buf, TEST_BUF_LEN, hash); + + for (size_t i = 0; i < ARRAY_SIZE(lens_to_test); i++) { + size_t len = lens_to_test[i]; + /* The '+ 128' tries to account for per-message overhead. 
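+ * Each length runs num_iters = 10000000 / (len + 128) iterations, and the
+ * reported figure is (len * num_iters * 1000) / elapsed_ns, i.e. bytes per
+ * microsecond, which is the same as (decimal) MB/s.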
*/ + size_t num_iters = 10000000 / (len + 128); + u64 t; + + KUNIT_ASSERT_LE(test, len, TEST_BUF_LEN); + preempt_disable(); + t = ktime_get_ns(); + for (size_t j = 0; j < num_iters; j++) + HASH(test_buf, len, hash); + t = ktime_get_ns() - t; + preempt_enable(); + kunit_info(test, "len=%zu: %llu MB/s", len, + div64_u64((u64)len * num_iters * 1000, t ?: 1)); + } +} diff --git a/lib/crypto/tests/poly1305-testvecs.h b/lib/crypto/tests/poly1305-testvecs.h new file mode 100644 index 000000000000..ecf8662225c8 --- /dev/null +++ b/lib/crypto/tests/poly1305-testvecs.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py poly1305 */ + +static const struct { + size_t data_len; + u8 digest[POLY1305_DIGEST_SIZE]; +} hash_testvecs[] = { + { + .data_len = 0, + .digest = { + 0xe8, 0x2d, 0x67, 0x2c, 0x01, 0x48, 0xf9, 0xb7, + 0x87, 0x85, 0x3f, 0xcf, 0x18, 0x66, 0x8c, 0xd3, + }, + }, + { + .data_len = 1, + .digest = { + 0xb8, 0xad, 0xca, 0x6b, 0x32, 0xba, 0x34, 0x42, + 0x54, 0x10, 0x28, 0xf5, 0x0f, 0x7e, 0x8e, 0xe3, + }, + }, + { + .data_len = 2, + .digest = { + 0xb8, 0xf7, 0xf4, 0xc2, 0x85, 0x33, 0x80, 0x63, + 0xd1, 0x45, 0xda, 0xf8, 0x7c, 0x79, 0x42, 0xd1, + }, + }, + { + .data_len = 3, + .digest = { + 0xf3, 0x73, 0x7b, 0x60, 0x24, 0xcc, 0x5d, 0x3e, + 0xd1, 0x95, 0x86, 0xce, 0x89, 0x0a, 0x33, 0xba, + }, + }, + { + .data_len = 16, + .digest = { + 0x0a, 0x1a, 0x2d, 0x39, 0xea, 0x49, 0x8f, 0xb7, + 0x90, 0xb6, 0x74, 0x3b, 0x41, 0x3b, 0x96, 0x11, + }, + }, + { + .data_len = 32, + .digest = { + 0x99, 0x05, 0xe3, 0xa7, 0x9e, 0x2a, 0xd2, 0x42, + 0xb9, 0x45, 0x0c, 0x08, 0xe7, 0x10, 0xe4, 0xe1, + }, + }, + { + .data_len = 48, + .digest = { + 0xe1, 0xb2, 0x15, 0xee, 0xa2, 0xf3, 0x04, 0xac, + 0xdd, 0x27, 0x57, 0x95, 0x2f, 0x45, 0xa8, 0xd3, + }, + }, + { + .data_len = 49, + .digest = { + 0x1c, 0xf3, 0xab, 0x39, 0xc0, 0x69, 0x49, 0x69, + 0x89, 0x6f, 0x1f, 0x03, 0x16, 0xe7, 0xc0, 0xf0, + }, + }, + { + .data_len = 63, + .digest = { + 0x30, 0xb0, 0x32, 0x87, 0x51, 0x55, 0x9c, 0x39, + 0x38, 0x42, 0x06, 0xe9, 0x2a, 0x3e, 0x2c, 0x92, + }, + }, + { + .data_len = 64, + .digest = { + 0x2c, 0x04, 0x16, 0x36, 0x55, 0x25, 0x2d, 0xc6, + 0x3d, 0x70, 0x5b, 0x88, 0x46, 0xb6, 0x71, 0x77, + }, + }, + { + .data_len = 65, + .digest = { + 0x03, 0x87, 0xdd, 0xbe, 0xe8, 0x30, 0xf2, 0x15, + 0x40, 0x44, 0x29, 0x7b, 0xb1, 0xe9, 0x9d, 0xe7, + }, + }, + { + .data_len = 127, + .digest = { + 0x29, 0x83, 0x4f, 0xcb, 0x5a, 0x93, 0x25, 0xad, + 0x05, 0xa4, 0xb3, 0x24, 0x77, 0x62, 0x2d, 0x3d, + }, + }, + { + .data_len = 128, + .digest = { + 0x20, 0x0e, 0x2c, 0x05, 0xe2, 0x0b, 0x85, 0xa0, + 0x24, 0x73, 0x7f, 0x65, 0x70, 0x6c, 0x3e, 0xb0, + }, + }, + { + .data_len = 129, + .digest = { + 0xef, 0x2f, 0x98, 0x42, 0xc2, 0x90, 0x55, 0xea, + 0xba, 0x28, 0x76, 0xfd, 0x9e, 0x3e, 0x4d, 0x53, + }, + }, + { + .data_len = 256, + .digest = { + 0x9e, 0x75, 0x4b, 0xc7, 0x69, 0x68, 0x51, 0x90, + 0xdc, 0x29, 0xc8, 0xfa, 0x86, 0xf1, 0xc9, 0xb3, + }, + }, + { + .data_len = 511, + .digest = { + 0x9d, 0x13, 0xf5, 0x54, 0xe6, 0xe3, 0x45, 0x38, + 0x8b, 0x6d, 0x5c, 0xc4, 0x50, 0xeb, 0x90, 0xcb, + }, + }, + { + .data_len = 513, + .digest = { + 0xaa, 0xb2, 0x3e, 0x3c, 0x2a, 0xfc, 0x62, 0x0e, + 0xd4, 0xe6, 0xe5, 0x5c, 0x6b, 0x9f, 0x3d, 0xc7, + }, + }, + { + .data_len = 1000, + .digest = { + 0xd6, 0x8c, 0xea, 0x8a, 0x0f, 0x68, 0xa9, 0xa8, + 0x67, 0x86, 0xf9, 0xc1, 0x4c, 0x26, 0x10, 0x6d, + }, + }, + { + .data_len = 3333, + .digest = { + 0xdc, 0xc1, 0x54, 0xe7, 0x38, 0xc6, 0xdb, 0x24, + 0xa7, 0x0b, 
0xff, 0xd3, 0x1b, 0x93, 0x01, 0xa6, + }, + }, + { + .data_len = 4096, + .digest = { + 0x8f, 0x88, 0x3e, 0x9c, 0x7b, 0x2e, 0x82, 0x5a, + 0x1d, 0x31, 0x82, 0xcc, 0x69, 0xb4, 0x16, 0x26, + }, + }, + { + .data_len = 4128, + .digest = { + 0x23, 0x45, 0x94, 0xa8, 0x11, 0x54, 0x9d, 0xf2, + 0xa1, 0x9a, 0xca, 0xf9, 0x3e, 0x65, 0x52, 0xfd, + }, + }, + { + .data_len = 4160, + .digest = { + 0x7b, 0xfc, 0xa9, 0x1e, 0x03, 0xad, 0xef, 0x03, + 0xe2, 0x20, 0x92, 0xc7, 0x54, 0x83, 0xfa, 0x37, + }, + }, + { + .data_len = 4224, + .digest = { + 0x46, 0xab, 0x8c, 0x75, 0xb3, 0x10, 0xa6, 0x3f, + 0x74, 0x55, 0x42, 0x6d, 0x69, 0x35, 0xd2, 0xf5, + }, + }, + { + .data_len = 16384, + .digest = { + 0xd0, 0xfe, 0x26, 0xc2, 0xca, 0x94, 0x2d, 0x52, + 0x2d, 0xe1, 0x11, 0xdd, 0x42, 0x28, 0x83, 0xa6, + }, + }, +}; + +static const u8 hash_testvec_consolidated[POLY1305_DIGEST_SIZE] = { + 0x9d, 0x07, 0x5d, 0xc9, 0x6c, 0xeb, 0x62, 0x5d, + 0x02, 0x5f, 0xe1, 0xe3, 0xd1, 0x71, 0x69, 0x34, +}; + +static const u8 poly1305_allones_macofmacs[POLY1305_DIGEST_SIZE] = { + 0x0c, 0x26, 0x6b, 0x45, 0x87, 0x06, 0xcf, 0xc4, + 0x3f, 0x70, 0x7d, 0xb3, 0x50, 0xdd, 0x81, 0x25, +}; diff --git a/lib/crypto/tests/poly1305_kunit.c b/lib/crypto/tests/poly1305_kunit.c new file mode 100644 index 000000000000..7ac191bd96b6 --- /dev/null +++ b/lib/crypto/tests/poly1305_kunit.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2025 Google LLC + */ +#include <crypto/poly1305.h> +#include "poly1305-testvecs.h" + +/* + * A fixed key used when presenting Poly1305 as an unkeyed hash function in + * order to reuse hash-test-template.h. At the beginning of the test suite, + * this is initialized to bytes generated from a fixed seed. + */ +static u8 test_key[POLY1305_KEY_SIZE]; + +/* This probably should be in the actual API, but just define it here for now */ +static void poly1305(const u8 key[POLY1305_KEY_SIZE], const u8 *data, + size_t len, u8 out[POLY1305_DIGEST_SIZE]) +{ + struct poly1305_desc_ctx ctx; + + poly1305_init(&ctx, key); + poly1305_update(&ctx, data, len); + poly1305_final(&ctx, out); +} + +static void poly1305_init_withtestkey(struct poly1305_desc_ctx *ctx) +{ + poly1305_init(ctx, test_key); +} + +static void poly1305_withtestkey(const u8 *data, size_t len, + u8 out[POLY1305_DIGEST_SIZE]) +{ + poly1305(test_key, data, len, out); +} + +/* Generate the HASH_KUNIT_CASES using hash-test-template.h. */ +#define HASH poly1305_withtestkey +#define HASH_CTX poly1305_desc_ctx +#define HASH_SIZE POLY1305_DIGEST_SIZE +#define HASH_INIT poly1305_init_withtestkey +#define HASH_UPDATE poly1305_update +#define HASH_FINAL poly1305_final +#include "hash-test-template.h" + +static int poly1305_suite_init(struct kunit_suite *suite) +{ + rand_bytes_seeded_from_len(test_key, POLY1305_KEY_SIZE); + return hash_suite_init(suite); +} + +static void poly1305_suite_exit(struct kunit_suite *suite) +{ + hash_suite_exit(suite); +} + +/* + * Poly1305 test case which uses a key and message consisting only of one bits: + * + * - Using an all-one-bits r_key tests the key clamping. + * - Using an all-one-bits s_key tests carries in implementations of the + * addition mod 2**128 during finalization. + * - Using all-one-bits message, and to a lesser extent r_key, tends to maximize + * any intermediate accumulator values. 
This increases the chance of + * detecting bugs that occur only in rare cases where the accumulator values + * get very large, for example the bug fixed by commit 678cce4019d746da + * ("crypto: x86/poly1305 - fix overflow during partial reduction"). + * + * Accumulator overflow bugs may be specific to particular update lengths (in + * blocks) and/or particular values of the previous acculumator. Note that the + * accumulator starts at 0 which gives the lowest chance of an overflow. Thus, + * a single all-one-bits test vector may be insufficient. + * + * Considering that, do the following test: continuously update a single + * Poly1305 context with all-one-bits data of varying lengths (0, 16, 32, ..., + * 4096 bytes). After each update, generate the MAC from the current context, + * and feed that MAC into a separate Poly1305 context. Repeat that entire + * sequence of updates 32 times without re-initializing either context, + * resulting in a total of 8224 MAC computations from a long-running, cumulative + * context. Finally, generate and verify the MAC of all the MACs. + */ +static void test_poly1305_allones_keys_and_message(struct kunit *test) +{ + struct poly1305_desc_ctx mac_ctx, macofmacs_ctx; + u8 mac[POLY1305_DIGEST_SIZE]; + + static_assert(TEST_BUF_LEN >= 4096); + memset(test_buf, 0xff, 4096); + + poly1305_init(&mac_ctx, test_buf); + poly1305_init(&macofmacs_ctx, test_buf); + for (int i = 0; i < 32; i++) { + for (size_t len = 0; len <= 4096; len += 16) { + struct poly1305_desc_ctx tmp_ctx; + + poly1305_update(&mac_ctx, test_buf, len); + tmp_ctx = mac_ctx; + poly1305_final(&tmp_ctx, mac); + poly1305_update(&macofmacs_ctx, mac, + POLY1305_DIGEST_SIZE); + } + } + poly1305_final(&macofmacs_ctx, mac); + KUNIT_ASSERT_MEMEQ(test, mac, poly1305_allones_macofmacs, + POLY1305_DIGEST_SIZE); +} + +/* + * Poly1305 test case which uses r_key=1, s_key=0, and a 48-byte message + * consisting of three blocks with integer values [2**128 - i, 0, 0]. In this + * case, the result of the polynomial evaluation is 2**130 - i. For small + * values of i, this is very close to the modulus 2**130 - 5, which helps catch + * edge case bugs in the modular reduction logic. + */ +static void test_poly1305_reduction_edge_cases(struct kunit *test) +{ + static const u8 key[POLY1305_KEY_SIZE] = { 1 }; /* r_key=1, s_key=0 */ + u8 data[3 * POLY1305_BLOCK_SIZE] = {}; + u8 expected_mac[POLY1305_DIGEST_SIZE]; + u8 actual_mac[POLY1305_DIGEST_SIZE]; + + for (int i = 1; i <= 10; i++) { + /* Set the first data block to 2**128 - i. */ + data[0] = -i; + memset(&data[1], 0xff, POLY1305_BLOCK_SIZE - 1); + + /* + * Assuming s_key=0, the expected MAC as an integer is + * (2**130 - i mod 2**130 - 5) + 0 mod 2**128. If 1 <= i <= 5, + * that's 5 - i. If 6 <= i <= 10, that's 2**128 - i. + */ + if (i <= 5) { + expected_mac[0] = 5 - i; + memset(&expected_mac[1], 0, POLY1305_DIGEST_SIZE - 1); + } else { + expected_mac[0] = -i; + memset(&expected_mac[1], 0xff, + POLY1305_DIGEST_SIZE - 1); + } + + /* Compute and verify the MAC. 
*/ + poly1305(key, data, sizeof(data), actual_mac); + KUNIT_ASSERT_MEMEQ(test, actual_mac, expected_mac, + POLY1305_DIGEST_SIZE); + } +} + +static struct kunit_case poly1305_test_cases[] = { + HASH_KUNIT_CASES, + KUNIT_CASE(test_poly1305_allones_keys_and_message), + KUNIT_CASE(test_poly1305_reduction_edge_cases), + KUNIT_CASE(benchmark_hash), + {}, +}; + +static struct kunit_suite poly1305_test_suite = { + .name = "poly1305", + .test_cases = poly1305_test_cases, + .suite_init = poly1305_suite_init, + .suite_exit = poly1305_suite_exit, +}; +kunit_test_suite(poly1305_test_suite); + +MODULE_DESCRIPTION("KUnit tests and benchmark for Poly1305"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/tests/sha1-testvecs.h b/lib/crypto/tests/sha1-testvecs.h new file mode 100644 index 000000000000..f5d050122e84 --- /dev/null +++ b/lib/crypto/tests/sha1-testvecs.h @@ -0,0 +1,212 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha1 */ + +static const struct { + size_t data_len; + u8 digest[SHA1_DIGEST_SIZE]; +} hash_testvecs[] = { + { + .data_len = 0, + .digest = { + 0xda, 0x39, 0xa3, 0xee, 0x5e, 0x6b, 0x4b, 0x0d, + 0x32, 0x55, 0xbf, 0xef, 0x95, 0x60, 0x18, 0x90, + 0xaf, 0xd8, 0x07, 0x09, + }, + }, + { + .data_len = 1, + .digest = { + 0x0a, 0xd0, 0x52, 0xdd, 0x9f, 0x32, 0x40, 0x55, + 0x21, 0xe4, 0x3c, 0x6e, 0xbd, 0xc5, 0x2f, 0x5a, + 0x02, 0x54, 0x93, 0xb2, + }, + }, + { + .data_len = 2, + .digest = { + 0x13, 0x83, 0x82, 0x03, 0x23, 0xff, 0x46, 0xd6, + 0x12, 0x7f, 0xad, 0x05, 0x2b, 0xc3, 0x4a, 0x42, + 0x49, 0x6a, 0xf8, 0x84, + }, + }, + { + .data_len = 3, + .digest = { + 0xe4, 0xdf, 0x7b, 0xdc, 0xe8, 0x6e, 0x81, 0x97, + 0x1e, 0x0f, 0xe8, 0x8b, 0x76, 0xa8, 0x59, 0x04, + 0xae, 0x92, 0x1a, 0x7c, + }, + }, + { + .data_len = 16, + .digest = { + 0x8c, 0x1c, 0x30, 0xd8, 0xbc, 0xc4, 0xc3, 0xf5, + 0xf8, 0x83, 0x0d, 0x1e, 0x04, 0x5d, 0x29, 0xb5, + 0x68, 0x89, 0xc1, 0xe9, + }, + }, + { + .data_len = 32, + .digest = { + 0x6c, 0x1d, 0x72, 0x31, 0xa5, 0x03, 0x4f, 0xdc, + 0xff, 0x2d, 0x06, 0x3e, 0x24, 0x26, 0x34, 0x8d, + 0x60, 0xa4, 0x67, 0x16, + }, + }, + { + .data_len = 48, + .digest = { + 0x37, 0x53, 0x33, 0xfa, 0xd0, 0x21, 0xad, 0xe7, + 0xa5, 0x43, 0xf1, 0x94, 0x64, 0x11, 0x47, 0x9c, + 0x72, 0xb5, 0x78, 0xb4, + }, + }, + { + .data_len = 49, + .digest = { + 0x51, 0x5c, 0xd8, 0x5a, 0xa9, 0xde, 0x7b, 0x2a, + 0xa2, 0xff, 0x70, 0x09, 0x56, 0x88, 0x40, 0x2b, + 0x50, 0x93, 0x82, 0x47, + }, + }, + { + .data_len = 63, + .digest = { + 0xbc, 0x9c, 0xab, 0x93, 0x06, 0xd5, 0xdb, 0xac, + 0x2c, 0x33, 0x15, 0x83, 0x56, 0xf6, 0x91, 0x20, + 0x09, 0xc7, 0xb2, 0x6b, + }, + }, + { + .data_len = 64, + .digest = { + 0x26, 0x90, 0x3b, 0x47, 0xe1, 0x92, 0x42, 0xd0, + 0x85, 0x63, 0x2e, 0x6b, 0x68, 0xa4, 0xc4, 0x4c, + 0xe6, 0xf4, 0xb0, 0x52, + }, + }, + { + .data_len = 65, + .digest = { + 0x55, 0x6f, 0x87, 0xdc, 0x34, 0x3d, 0xe2, 0x4f, + 0xc3, 0x81, 0xa4, 0x82, 0x79, 0x84, 0x64, 0x01, + 0x55, 0xa0, 0x1e, 0x36, + }, + }, + { + .data_len = 127, + .digest = { + 0xb7, 0xd5, 0x5f, 0xa4, 0xef, 0xbf, 0x4f, 0x96, + 0x01, 0xc1, 0x06, 0xe3, 0x75, 0xa8, 0x90, 0x92, + 0x4c, 0x5f, 0xf1, 0x21, + }, + }, + { + .data_len = 128, + .digest = { + 0x70, 0x0c, 0xea, 0xa4, 0x93, 0xd0, 0x56, 0xf0, + 0x6f, 0xbb, 0x53, 0x42, 0x5b, 0xe3, 0xf2, 0xb0, + 0x30, 0x66, 0x8e, 0x75, + }, + }, + { + .data_len = 129, + .digest = { + 0x15, 0x01, 0xbc, 0xb0, 0xee, 0xd8, 0xeb, 0xa8, + 0x7d, 0xd9, 0x4d, 0x50, 0x2e, 0x41, 0x30, 0xba, + 0x41, 0xaa, 0x7b, 0x02, + }, + }, + { + .data_len = 256, + .digest = { + 
0x98, 0x05, 0x52, 0xf5, 0x0f, 0xf0, 0xd3, 0x97, + 0x15, 0x8c, 0xa3, 0x9a, 0x2b, 0x4d, 0x67, 0x57, + 0x29, 0xa0, 0xac, 0x61, + }, + }, + { + .data_len = 511, + .digest = { + 0x1f, 0x47, 0xf0, 0xcc, 0xd7, 0xda, 0xa5, 0x3b, + 0x39, 0xb4, 0x5b, 0xa8, 0x33, 0xd4, 0xca, 0x2f, + 0xdd, 0xf2, 0x39, 0x89, + }, + }, + { + .data_len = 513, + .digest = { + 0xb9, 0x75, 0xe6, 0x57, 0x42, 0x7f, 0x8b, 0x0a, + 0xcc, 0x53, 0x10, 0x69, 0x45, 0xac, 0xfd, 0x11, + 0xf7, 0x1f, 0x4e, 0x6f, + }, + }, + { + .data_len = 1000, + .digest = { + 0x63, 0x66, 0xcb, 0x44, 0xc1, 0x2c, 0xa2, 0x06, + 0x5d, 0xb9, 0x8e, 0x31, 0xcb, 0x4f, 0x4e, 0x49, + 0xe0, 0xfb, 0x3c, 0x4e, + }, + }, + { + .data_len = 3333, + .digest = { + 0x35, 0xbc, 0x74, 0xfb, 0x31, 0x9c, 0xd4, 0xdd, + 0xe8, 0x87, 0xa7, 0x56, 0x3b, 0x08, 0xe5, 0x49, + 0xe1, 0xe9, 0xc9, 0xa8, + }, + }, + { + .data_len = 4096, + .digest = { + 0x43, 0x00, 0xea, 0xcd, 0x4e, 0x7c, 0xe9, 0xe4, + 0x32, 0xce, 0x25, 0xa8, 0xcd, 0x20, 0xa8, 0xaa, + 0x7b, 0x63, 0x2c, 0x3c, + }, + }, + { + .data_len = 4128, + .digest = { + 0xd0, 0x67, 0x26, 0x0e, 0x22, 0x72, 0xaa, 0x63, + 0xfc, 0x34, 0x55, 0x07, 0xab, 0xc8, 0x64, 0xb6, + 0xc4, 0xea, 0xd5, 0x7c, + }, + }, + { + .data_len = 4160, + .digest = { + 0x6b, 0xc9, 0x5e, 0xb9, 0x41, 0x19, 0x50, 0x35, + 0xf1, 0x39, 0xfe, 0xd9, 0x72, 0x6d, 0xd0, 0x55, + 0xb8, 0x1f, 0x1a, 0x95, + }, + }, + { + .data_len = 4224, + .digest = { + 0x70, 0x5d, 0x10, 0x2e, 0x4e, 0x44, 0xc9, 0x80, + 0x8f, 0xba, 0x13, 0xbc, 0xd0, 0x77, 0x78, 0xc7, + 0x84, 0xe3, 0x24, 0x43, + }, + }, + { + .data_len = 16384, + .digest = { + 0xa8, 0x82, 0xca, 0x08, 0xb4, 0x84, 0x09, 0x13, + 0xc0, 0x9c, 0x26, 0x18, 0xcf, 0x0f, 0xf3, 0x08, + 0xff, 0xa1, 0xe4, 0x5d, + }, + }, +}; + +static const u8 hash_testvec_consolidated[SHA1_DIGEST_SIZE] = { + 0xe1, 0x72, 0xa5, 0x3c, 0xda, 0xf2, 0xe5, 0x56, + 0xb8, 0xb5, 0x35, 0x6e, 0xce, 0xc8, 0x37, 0x57, + 0x31, 0xb4, 0x05, 0xdd, +}; + +static const u8 hmac_testvec_consolidated[SHA1_DIGEST_SIZE] = { + 0x9d, 0xe5, 0xb1, 0x43, 0x97, 0x95, 0x16, 0x52, + 0xa0, 0x7a, 0xc0, 0xe2, 0xc1, 0x60, 0x64, 0x7c, + 0x24, 0xf9, 0x34, 0xd7, +}; diff --git a/lib/crypto/tests/sha1_kunit.c b/lib/crypto/tests/sha1_kunit.c new file mode 100644 index 000000000000..24ba8d5669c8 --- /dev/null +++ b/lib/crypto/tests/sha1_kunit.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2025 Google LLC + */ +#include <crypto/sha1.h> +#include "sha1-testvecs.h" + +#define HASH sha1 +#define HASH_CTX sha1_ctx +#define HASH_SIZE SHA1_DIGEST_SIZE +#define HASH_INIT sha1_init +#define HASH_UPDATE sha1_update +#define HASH_FINAL sha1_final +#define HMAC_KEY hmac_sha1_key +#define HMAC_CTX hmac_sha1_ctx +#define HMAC_PREPAREKEY hmac_sha1_preparekey +#define HMAC_INIT hmac_sha1_init +#define HMAC_UPDATE hmac_sha1_update +#define HMAC_FINAL hmac_sha1_final +#define HMAC hmac_sha1 +#define HMAC_USINGRAWKEY hmac_sha1_usingrawkey +#include "hash-test-template.h" + +static struct kunit_case hash_test_cases[] = { + HASH_KUNIT_CASES, + KUNIT_CASE(benchmark_hash), + {}, +}; + +static struct kunit_suite hash_test_suite = { + .name = "sha1", + .test_cases = hash_test_cases, + .suite_init = hash_suite_init, + .suite_exit = hash_suite_exit, +}; +kunit_test_suite(hash_test_suite); + +MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-1 and HMAC-SHA1"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/tests/sha224-testvecs.h b/lib/crypto/tests/sha224-testvecs.h new file mode 100644 index 000000000000..523adc178e1a --- /dev/null +++ b/lib/crypto/tests/sha224-testvecs.h @@ -0,0 
+1,238 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha224 */ + +static const struct { + size_t data_len; + u8 digest[SHA224_DIGEST_SIZE]; +} hash_testvecs[] = { + { + .data_len = 0, + .digest = { + 0xd1, 0x4a, 0x02, 0x8c, 0x2a, 0x3a, 0x2b, 0xc9, + 0x47, 0x61, 0x02, 0xbb, 0x28, 0x82, 0x34, 0xc4, + 0x15, 0xa2, 0xb0, 0x1f, 0x82, 0x8e, 0xa6, 0x2a, + 0xc5, 0xb3, 0xe4, 0x2f, + }, + }, + { + .data_len = 1, + .digest = { + 0xe3, 0x4d, 0x79, 0x17, 0x75, 0x35, 0xdc, 0xd2, + 0x27, 0xc9, 0x9d, 0x0b, 0x90, 0x0f, 0x21, 0x5d, + 0x95, 0xfb, 0x9c, 0x6d, 0xa8, 0xec, 0x19, 0x15, + 0x12, 0xef, 0xf5, 0x0f, + }, + }, + { + .data_len = 2, + .digest = { + 0x81, 0xc7, 0x60, 0x0d, 0x6d, 0x13, 0x75, 0x70, + 0x4b, 0xc0, 0xab, 0xea, 0x04, 0xe3, 0x78, 0x7e, + 0x73, 0xb9, 0x0f, 0xb6, 0xae, 0x90, 0xf3, 0x94, + 0xb2, 0x56, 0xda, 0xc8, + }, + }, + { + .data_len = 3, + .digest = { + 0x24, 0xf0, 0x8c, 0x6e, 0x9d, 0xd6, 0x06, 0x80, + 0x0a, 0x03, 0xee, 0x9b, 0x33, 0xec, 0x83, 0x42, + 0x2c, 0x8b, 0xe7, 0xc7, 0xc6, 0x04, 0xfb, 0xc6, + 0xa3, 0x3a, 0x4d, 0xc9, + }, + }, + { + .data_len = 16, + .digest = { + 0x1c, 0x08, 0xa8, 0x55, 0x8f, 0xc6, 0x0a, 0xea, + 0x2f, 0x1b, 0x54, 0xff, 0x8d, 0xd2, 0xa3, 0xc7, + 0x42, 0xc2, 0x93, 0x3d, 0x73, 0x18, 0x84, 0xba, + 0x75, 0x49, 0x34, 0xfd, + }, + }, + { + .data_len = 32, + .digest = { + 0x45, 0xdd, 0xb5, 0xf0, 0x3c, 0xda, 0xe6, 0xd4, + 0x6c, 0x86, 0x91, 0x29, 0x11, 0x2f, 0x88, 0x7d, + 0xd8, 0x3c, 0xa3, 0xd6, 0xdd, 0x1e, 0xac, 0x98, + 0xff, 0xf0, 0x14, 0x69, + }, + }, + { + .data_len = 48, + .digest = { + 0x0b, 0xfb, 0x71, 0x4c, 0x06, 0x7a, 0xd5, 0x89, + 0x76, 0x0a, 0x43, 0x8b, 0x2b, 0x47, 0x12, 0x56, + 0xa7, 0x64, 0x33, 0x1d, 0xd3, 0x44, 0x17, 0x95, + 0x23, 0xe7, 0x53, 0x01, + }, + }, + { + .data_len = 49, + .digest = { + 0xc4, 0xae, 0x9c, 0x33, 0xd5, 0x1d, 0xf4, 0xa7, + 0xfd, 0xb7, 0xd4, 0x6b, 0xc3, 0xeb, 0xa8, 0xbf, + 0xfb, 0x07, 0x89, 0x4b, 0x07, 0x15, 0x22, 0xec, + 0xe1, 0x45, 0x84, 0xba, + }, + }, + { + .data_len = 63, + .digest = { + 0xad, 0x01, 0x34, 0x2a, 0xe2, 0x3b, 0x58, 0x06, + 0x9f, 0x20, 0xc8, 0xfb, 0xf3, 0x20, 0x82, 0xa6, + 0x9f, 0xee, 0x7a, 0xbe, 0xdf, 0xf3, 0x5d, 0x57, + 0x9b, 0xce, 0x79, 0x96, + }, + }, + { + .data_len = 64, + .digest = { + 0xa7, 0xa6, 0x47, 0xf7, 0xed, 0x2a, 0xe5, 0xe3, + 0xc0, 0x1e, 0x7b, 0x40, 0xe4, 0xf7, 0x40, 0x65, + 0x42, 0xc1, 0x6f, 0x7d, 0x8d, 0x0d, 0x17, 0x4f, + 0xd3, 0xbc, 0x0d, 0x85, + }, + }, + { + .data_len = 65, + .digest = { + 0xc4, 0x9c, 0xb5, 0x6a, 0x01, 0x2d, 0x10, 0xa9, + 0x5f, 0xa4, 0x5a, 0xe1, 0xba, 0x40, 0x12, 0x09, + 0x7b, 0xea, 0xdb, 0xa6, 0x7b, 0xcb, 0x56, 0xf0, + 0xfd, 0x5b, 0xe2, 0xe7, + }, + }, + { + .data_len = 127, + .digest = { + 0x14, 0xda, 0x0e, 0x01, 0xca, 0x78, 0x7d, 0x2d, + 0x85, 0xa3, 0xca, 0x0e, 0x80, 0xf9, 0x95, 0x10, + 0xa1, 0x7b, 0xa5, 0xaa, 0xfc, 0x95, 0x05, 0x08, + 0x53, 0xda, 0x52, 0xee, + }, + }, + { + .data_len = 128, + .digest = { + 0xa5, 0x24, 0xc4, 0x54, 0xe1, 0x50, 0xab, 0xee, + 0x22, 0xc1, 0xa7, 0x27, 0x15, 0x2c, 0x6f, 0xf7, + 0x4c, 0x31, 0xe5, 0x15, 0x25, 0x4e, 0x71, 0xc6, + 0x7e, 0xa0, 0x11, 0x5d, + }, + }, + { + .data_len = 129, + .digest = { + 0x73, 0xd0, 0x8c, 0xce, 0xed, 0xed, 0x9f, 0xaa, + 0x21, 0xaf, 0xa2, 0x08, 0x80, 0x16, 0x15, 0x59, + 0x3f, 0x1d, 0x7f, 0x0a, 0x79, 0x3d, 0x7b, 0x58, + 0xf8, 0xc8, 0x5c, 0x27, + }, + }, + { + .data_len = 256, + .digest = { + 0x31, 0xa7, 0xa1, 0xca, 0x49, 0x72, 0x75, 0xcc, + 0x6e, 0x02, 0x9e, 0xad, 0xea, 0x86, 0x5c, 0x91, + 0x02, 0xe4, 0xc9, 0xf9, 0xd3, 0x9e, 0x74, 0x50, + 0xd8, 0x43, 0x6b, 
0x85, + }, + }, + { + .data_len = 511, + .digest = { + 0x40, 0x60, 0x8b, 0xb0, 0x03, 0xa9, 0x75, 0xab, + 0x2d, 0x5b, 0x20, 0x9a, 0x05, 0x72, 0xb7, 0xa8, + 0xce, 0xf2, 0x4f, 0x66, 0x62, 0xe3, 0x7e, 0x24, + 0xd6, 0xe2, 0xea, 0xfa, + }, + }, + { + .data_len = 513, + .digest = { + 0x4f, 0x5f, 0x9f, 0x1e, 0xb3, 0x66, 0x81, 0xdb, + 0x41, 0x5d, 0x65, 0x97, 0x00, 0x8d, 0xdc, 0x62, + 0x03, 0xb0, 0x4d, 0x6b, 0x5c, 0x7f, 0x1e, 0xa0, + 0xfe, 0xfc, 0x0e, 0xb8, + }, + }, + { + .data_len = 1000, + .digest = { + 0x08, 0xa8, 0xa1, 0xc0, 0xd8, 0xf9, 0xb4, 0xaa, + 0x53, 0x22, 0xa1, 0x73, 0x0b, 0x45, 0xa0, 0x20, + 0x72, 0xf3, 0xa9, 0xbc, 0x51, 0xd0, 0x20, 0x79, + 0x69, 0x97, 0xf7, 0xe3, + }, + }, + { + .data_len = 3333, + .digest = { + 0xe8, 0x60, 0x5f, 0xb9, 0x12, 0xe1, 0x6b, 0x24, + 0xc5, 0xe8, 0x43, 0xa9, 0x5c, 0x3f, 0x65, 0xed, + 0xbe, 0xfd, 0x77, 0xf5, 0x47, 0xf2, 0x75, 0x21, + 0xc2, 0x8f, 0x54, 0x8f, + }, + }, + { + .data_len = 4096, + .digest = { + 0xc7, 0xdf, 0x50, 0x16, 0x10, 0x01, 0xb7, 0xdf, + 0x34, 0x1d, 0x18, 0xa2, 0xd5, 0xad, 0x1f, 0x50, + 0xf7, 0xa8, 0x9a, 0x72, 0xfb, 0xfd, 0xd9, 0x1c, + 0x57, 0xac, 0x08, 0x97, + }, + }, + { + .data_len = 4128, + .digest = { + 0xdf, 0x16, 0x76, 0x7f, 0xc0, 0x16, 0x84, 0x63, + 0xac, 0xcf, 0xd0, 0x78, 0x1e, 0x96, 0x67, 0xc5, + 0x3c, 0x06, 0xe9, 0xdb, 0x6e, 0x7d, 0xd0, 0x07, + 0xaa, 0xb1, 0x56, 0xc9, + }, + }, + { + .data_len = 4160, + .digest = { + 0x49, 0xec, 0x5c, 0x18, 0xd7, 0x5b, 0xda, 0xed, + 0x5b, 0x59, 0xde, 0x09, 0x34, 0xb2, 0x49, 0x43, + 0x62, 0x6a, 0x0a, 0x63, 0x6a, 0x51, 0x08, 0x37, + 0x8c, 0xb6, 0x29, 0x84, + }, + }, + { + .data_len = 4224, + .digest = { + 0x3d, 0xc2, 0xc8, 0x43, 0xcf, 0xb7, 0x33, 0x14, + 0x04, 0x93, 0xed, 0xe2, 0xcd, 0x8a, 0x69, 0x5c, + 0x5a, 0xd5, 0x9b, 0x52, 0xdf, 0x48, 0xa7, 0xaa, + 0x28, 0x2b, 0x5d, 0x27, + }, + }, + { + .data_len = 16384, + .digest = { + 0xa7, 0xaf, 0xda, 0x92, 0xe2, 0xe7, 0x61, 0xdc, + 0xa1, 0x32, 0x53, 0x2a, 0x3f, 0x41, 0x5c, 0x7e, + 0xc9, 0x89, 0xda, 0x1c, 0xf7, 0x8d, 0x00, 0xbd, + 0x21, 0x73, 0xb1, 0x69, + }, + }, +}; + +static const u8 hash_testvec_consolidated[SHA224_DIGEST_SIZE] = { + 0x9e, 0xb8, 0x82, 0xab, 0x83, 0x37, 0xe4, 0x63, + 0x84, 0xee, 0x21, 0x15, 0xc2, 0xbb, 0xa3, 0x17, + 0x8f, 0xc4, 0x99, 0x33, 0xa0, 0x2c, 0x9f, 0xec, + 0xca, 0xd0, 0xf3, 0x73, +}; + +static const u8 hmac_testvec_consolidated[SHA224_DIGEST_SIZE] = { + 0x66, 0x34, 0x79, 0x92, 0x47, 0x0e, 0xcd, 0x70, + 0xb0, 0x8b, 0x91, 0xcb, 0x94, 0x2f, 0x67, 0x65, + 0x2f, 0xc9, 0xd2, 0x91, 0x32, 0xaf, 0xf7, 0x5f, + 0xb6, 0x01, 0x5b, 0xf2, +}; diff --git a/lib/crypto/tests/sha224_kunit.c b/lib/crypto/tests/sha224_kunit.c new file mode 100644 index 000000000000..962ad46b9c99 --- /dev/null +++ b/lib/crypto/tests/sha224_kunit.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2025 Google LLC + */ +#include <crypto/sha2.h> +#include "sha224-testvecs.h" + +#define HASH sha224 +#define HASH_CTX sha224_ctx +#define HASH_SIZE SHA224_DIGEST_SIZE +#define HASH_INIT sha224_init +#define HASH_UPDATE sha224_update +#define HASH_FINAL sha224_final +#define HMAC_KEY hmac_sha224_key +#define HMAC_CTX hmac_sha224_ctx +#define HMAC_PREPAREKEY hmac_sha224_preparekey +#define HMAC_INIT hmac_sha224_init +#define HMAC_UPDATE hmac_sha224_update +#define HMAC_FINAL hmac_sha224_final +#define HMAC hmac_sha224 +#define HMAC_USINGRAWKEY hmac_sha224_usingrawkey +#include "hash-test-template.h" + +static struct kunit_case hash_test_cases[] = { + HASH_KUNIT_CASES, + KUNIT_CASE(benchmark_hash), + {}, +}; + +static struct kunit_suite 
hash_test_suite = { + .name = "sha224", + .test_cases = hash_test_cases, + .suite_init = hash_suite_init, + .suite_exit = hash_suite_exit, +}; +kunit_test_suite(hash_test_suite); + +MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-224 and HMAC-SHA224"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/tests/sha256-testvecs.h b/lib/crypto/tests/sha256-testvecs.h new file mode 100644 index 000000000000..2db7b2042034 --- /dev/null +++ b/lib/crypto/tests/sha256-testvecs.h @@ -0,0 +1,238 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha256 */ + +static const struct { + size_t data_len; + u8 digest[SHA256_DIGEST_SIZE]; +} hash_testvecs[] = { + { + .data_len = 0, + .digest = { + 0xe3, 0xb0, 0xc4, 0x42, 0x98, 0xfc, 0x1c, 0x14, + 0x9a, 0xfb, 0xf4, 0xc8, 0x99, 0x6f, 0xb9, 0x24, + 0x27, 0xae, 0x41, 0xe4, 0x64, 0x9b, 0x93, 0x4c, + 0xa4, 0x95, 0x99, 0x1b, 0x78, 0x52, 0xb8, 0x55, + }, + }, + { + .data_len = 1, + .digest = { + 0x45, 0xf8, 0x3d, 0x17, 0xe1, 0x0b, 0x34, 0xfc, + 0xa0, 0x1e, 0xb8, 0xf4, 0x45, 0x4d, 0xac, 0x34, + 0xa7, 0x77, 0xd9, 0x40, 0x4a, 0x46, 0x4e, 0x73, + 0x2c, 0xf4, 0xab, 0xf2, 0xc0, 0xda, 0x94, 0xc4, + }, + }, + { + .data_len = 2, + .digest = { + 0xf9, 0xd3, 0x52, 0x2f, 0xd5, 0xe0, 0x99, 0x15, + 0x1c, 0xd6, 0xa9, 0x24, 0x4f, 0x40, 0xba, 0x25, + 0x33, 0x43, 0x3e, 0xe1, 0x78, 0x6a, 0xfe, 0x7d, + 0x07, 0xe2, 0x29, 0x7b, 0x6d, 0xc5, 0x73, 0xf5, + }, + }, + { + .data_len = 3, + .digest = { + 0x71, 0xf7, 0xa1, 0xef, 0x69, 0x86, 0x0e, 0xe4, + 0x87, 0x25, 0x58, 0x4c, 0x07, 0x2c, 0xfc, 0x60, + 0xc5, 0xf6, 0xe2, 0x44, 0xaa, 0xfb, 0x41, 0xc7, + 0x2b, 0xc5, 0x01, 0x8c, 0x39, 0x98, 0x30, 0x37, + }, + }, + { + .data_len = 16, + .digest = { + 0x09, 0x95, 0x9a, 0xfa, 0x25, 0x18, 0x86, 0x06, + 0xfe, 0x65, 0xc9, 0x2f, 0x91, 0x15, 0x74, 0x06, + 0x6c, 0xbf, 0xef, 0x7b, 0x0b, 0xc7, 0x2c, 0x05, + 0xdd, 0x17, 0x5d, 0x6f, 0x8a, 0xa5, 0xde, 0x3c, + }, + }, + { + .data_len = 32, + .digest = { + 0xe5, 0x52, 0x3c, 0x85, 0xea, 0x1b, 0xe1, 0x6c, + 0xe0, 0xdb, 0xc3, 0xef, 0xf0, 0xca, 0xc2, 0xe1, + 0xb9, 0x36, 0xa1, 0x28, 0xb6, 0x9e, 0xf5, 0x6e, + 0x70, 0xf7, 0xf9, 0xa7, 0x1c, 0xd3, 0x22, 0xd0, + }, + }, + { + .data_len = 48, + .digest = { + 0x5f, 0x84, 0xd4, 0xd7, 0x2e, 0x80, 0x09, 0xef, + 0x1c, 0x77, 0x7c, 0x25, 0x59, 0x63, 0x88, 0x64, + 0xfd, 0x56, 0xea, 0x23, 0xf4, 0x4f, 0x2e, 0x49, + 0xcd, 0xb4, 0xaa, 0xc7, 0x5c, 0x8b, 0x75, 0x84, + }, + }, + { + .data_len = 49, + .digest = { + 0x22, 0x6e, 0xca, 0xda, 0x00, 0x2d, 0x90, 0x96, + 0x24, 0xf8, 0x55, 0x17, 0x11, 0xda, 0x42, 0x1c, + 0x78, 0x4e, 0xbf, 0xd9, 0xc5, 0xcf, 0xf3, 0xe3, + 0xaf, 0xd3, 0x60, 0xcd, 0xaa, 0xe2, 0xc7, 0x22, + }, + }, + { + .data_len = 63, + .digest = { + 0x97, 0xe2, 0x74, 0xdc, 0x6b, 0xa4, 0xaf, 0x32, + 0x3b, 0x50, 0x6d, 0x80, 0xb5, 0xd3, 0x0c, 0x36, + 0xea, 0x3f, 0x5d, 0x36, 0xa7, 0x49, 0x51, 0xf3, + 0xbd, 0x69, 0x68, 0x60, 0x9b, 0xde, 0x73, 0xf5, + }, + }, + { + .data_len = 64, + .digest = { + 0x13, 0x74, 0xb1, 0x72, 0xd6, 0x53, 0x48, 0x28, + 0x42, 0xd8, 0xba, 0x64, 0x20, 0x60, 0xb6, 0x4c, + 0xc3, 0xac, 0x5d, 0x93, 0x8c, 0xb9, 0xd4, 0xcc, + 0xb4, 0x9f, 0x31, 0x1f, 0xeb, 0x68, 0x35, 0x58, + }, + }, + { + .data_len = 65, + .digest = { + 0xda, 0xbe, 0xd7, 0xbc, 0x6e, 0xe6, 0x5a, 0x57, + 0xeb, 0x9a, 0x93, 0xaa, 0x66, 0xd0, 0xe0, 0xc4, + 0x29, 0x7f, 0xe9, 0x3b, 0x8e, 0xdf, 0x81, 0x82, + 0x8d, 0x15, 0x11, 0x59, 0x4e, 0x13, 0xa5, 0x58, + }, + }, + { + .data_len = 127, + .digest = { + 0x8c, 0x1a, 0xba, 0x40, 0x66, 0x94, 0x19, 0xf4, + 0x2e, 0xa2, 0xae, 0x94, 0x53, 0x18, 
0xb6, 0xfd, + 0xa0, 0x12, 0xc5, 0xef, 0xd5, 0xd6, 0x1b, 0xa1, + 0x37, 0xea, 0x19, 0x44, 0x35, 0x54, 0x85, 0x74, + }, + }, + { + .data_len = 128, + .digest = { + 0xfd, 0x07, 0xd8, 0x77, 0x7d, 0x8b, 0x4f, 0xee, + 0x60, 0x60, 0x26, 0xef, 0x2a, 0x86, 0xfb, 0x67, + 0xeb, 0x31, 0x27, 0x03, 0x99, 0x3c, 0xde, 0xe5, + 0x84, 0x72, 0x71, 0x4c, 0x33, 0x7b, 0x87, 0x13, + }, + }, + { + .data_len = 129, + .digest = { + 0x97, 0xc5, 0x58, 0x38, 0x20, 0xc7, 0xde, 0xfa, + 0xdd, 0x9b, 0x10, 0xc6, 0xc2, 0x2f, 0x94, 0xb5, + 0xc0, 0x33, 0xc0, 0x20, 0x1c, 0x2f, 0xb4, 0x28, + 0x5e, 0x36, 0xfa, 0x8c, 0x24, 0x1c, 0x18, 0x27, + }, + }, + { + .data_len = 256, + .digest = { + 0x62, 0x17, 0x84, 0x26, 0x98, 0x30, 0x57, 0xca, + 0x4f, 0x32, 0xd9, 0x09, 0x09, 0x34, 0xe2, 0xcb, + 0x92, 0x45, 0xd5, 0xeb, 0x8b, 0x9b, 0x3c, 0xd8, + 0xaa, 0xc7, 0xd2, 0x2b, 0x04, 0xab, 0xb3, 0x35, + }, + }, + { + .data_len = 511, + .digest = { + 0x7f, 0xe1, 0x09, 0x78, 0x5d, 0x61, 0xfa, 0x5e, + 0x9b, 0x8c, 0xb1, 0xa9, 0x09, 0x69, 0xb4, 0x24, + 0x54, 0xf2, 0x1c, 0xc9, 0x5f, 0xfb, 0x59, 0x9d, + 0x36, 0x1b, 0x37, 0x44, 0xfc, 0x64, 0x79, 0xb6, + }, + }, + { + .data_len = 513, + .digest = { + 0xd2, 0x3b, 0x3a, 0xe7, 0x13, 0x4f, 0xbd, 0x29, + 0x6b, 0xd2, 0x79, 0x26, 0x6c, 0xd2, 0x22, 0x43, + 0x25, 0x34, 0x9b, 0x9b, 0x22, 0xb0, 0x9f, 0x61, + 0x1d, 0xf4, 0xe2, 0x65, 0x68, 0x95, 0x02, 0x6c, + }, + }, + { + .data_len = 1000, + .digest = { + 0x0c, 0x34, 0x53, 0x3f, 0x0f, 0x8a, 0x39, 0x8d, + 0x63, 0xe4, 0x83, 0x6e, 0x11, 0x7d, 0x14, 0x8e, + 0x5b, 0xf0, 0x4d, 0xca, 0x23, 0x24, 0xb5, 0xd2, + 0x13, 0x3f, 0xd9, 0xde, 0x84, 0x74, 0x26, 0x59, + }, + }, + { + .data_len = 3333, + .digest = { + 0xa8, 0xb8, 0x83, 0x01, 0x1b, 0x38, 0x7a, 0xca, + 0x59, 0xe9, 0x5b, 0x37, 0x6a, 0xab, 0xb4, 0x85, + 0x94, 0x73, 0x26, 0x04, 0xef, 0xed, 0xf4, 0x0d, + 0xd6, 0x09, 0x21, 0x09, 0x96, 0x78, 0xe3, 0xcf, + }, + }, + { + .data_len = 4096, + .digest = { + 0x0b, 0x12, 0x66, 0x96, 0x78, 0x4f, 0x2c, 0x35, + 0xa4, 0xed, 0xbc, 0xb8, 0x30, 0xa6, 0x37, 0x9b, + 0x94, 0x13, 0xae, 0x86, 0xf0, 0x20, 0xfb, 0x49, + 0x8f, 0x5d, 0x20, 0x70, 0x60, 0x2b, 0x02, 0x70, + }, + }, + { + .data_len = 4128, + .digest = { + 0xe4, 0xbd, 0xe4, 0x3b, 0x85, 0xf4, 0x6f, 0x11, + 0xad, 0xc4, 0x79, 0xcc, 0x8e, 0x6d, 0x8b, 0x15, + 0xbb, 0xf9, 0xd3, 0x65, 0xe1, 0xf8, 0x8d, 0x22, + 0x65, 0x66, 0x66, 0xb3, 0xf5, 0xd0, 0x9c, 0xaf, + }, + }, + { + .data_len = 4160, + .digest = { + 0x90, 0x5f, 0xe0, 0xfc, 0xb1, 0xdc, 0x38, 0x1b, + 0xe5, 0x37, 0x3f, 0xd2, 0xcc, 0x48, 0xc4, 0xbc, + 0xb4, 0xfd, 0xf7, 0x71, 0x5f, 0x6b, 0xf4, 0xc4, + 0xa6, 0x08, 0x7e, 0xfc, 0x4e, 0x96, 0xf7, 0xc2, + }, + }, + { + .data_len = 4224, + .digest = { + 0x1f, 0x34, 0x0a, 0x3b, 0xdb, 0xf7, 0x7a, 0xdb, + 0x3d, 0x89, 0x85, 0x0c, 0xd2, 0xf0, 0x0c, 0xbd, + 0x25, 0x39, 0x14, 0x06, 0x28, 0x0f, 0x6b, 0x5f, + 0xe3, 0x1f, 0x2a, 0xb6, 0xca, 0x56, 0x41, 0xa1, + }, + }, + { + .data_len = 16384, + .digest = { + 0x7b, 0x01, 0x2d, 0x84, 0x70, 0xee, 0xe0, 0x77, + 0x3c, 0x17, 0x63, 0xfe, 0x40, 0xd7, 0xfd, 0xa1, + 0x75, 0x90, 0xb8, 0x3e, 0x50, 0xcd, 0x06, 0xb7, + 0xb9, 0xb9, 0x2b, 0x91, 0x4f, 0xba, 0xe4, 0x4c, + }, + }, +}; + +static const u8 hash_testvec_consolidated[SHA256_DIGEST_SIZE] = { + 0x78, 0x1c, 0xb1, 0x9f, 0xe5, 0xe1, 0xcb, 0x41, + 0x8b, 0x34, 0x00, 0x33, 0x57, 0xc3, 0x1c, 0x8f, + 0x5c, 0x84, 0xc7, 0x8b, 0x87, 0x6a, 0x13, 0x29, + 0x2f, 0xc4, 0x1a, 0x70, 0x5e, 0x40, 0xf2, 0xfe, +}; + +static const u8 hmac_testvec_consolidated[SHA256_DIGEST_SIZE] = { + 0x56, 0x96, 0x2e, 0x23, 0x3f, 0x94, 0x89, 0x0d, + 0x0f, 0x24, 0x36, 0x2e, 0x19, 0x3d, 0xb5, 0xac, + 0xb8, 0xcd, 
0xf1, 0xc9, 0xca, 0xac, 0xee, 0x9d, + 0x62, 0xe6, 0x81, 0xe5, 0x96, 0xf9, 0x38, 0xf5, +}; diff --git a/lib/crypto/tests/sha256_kunit.c b/lib/crypto/tests/sha256_kunit.c new file mode 100644 index 000000000000..1cd4caee6010 --- /dev/null +++ b/lib/crypto/tests/sha256_kunit.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2025 Google LLC + */ +#include <crypto/sha2.h> +#include "sha256-testvecs.h" + +#define HASH sha256 +#define HASH_CTX sha256_ctx +#define HASH_SIZE SHA256_DIGEST_SIZE +#define HASH_INIT sha256_init +#define HASH_UPDATE sha256_update +#define HASH_FINAL sha256_final +#define HMAC_KEY hmac_sha256_key +#define HMAC_CTX hmac_sha256_ctx +#define HMAC_PREPAREKEY hmac_sha256_preparekey +#define HMAC_INIT hmac_sha256_init +#define HMAC_UPDATE hmac_sha256_update +#define HMAC_FINAL hmac_sha256_final +#define HMAC hmac_sha256 +#define HMAC_USINGRAWKEY hmac_sha256_usingrawkey +#include "hash-test-template.h" + +static struct kunit_case hash_test_cases[] = { + HASH_KUNIT_CASES, + KUNIT_CASE(benchmark_hash), + {}, +}; + +static struct kunit_suite hash_test_suite = { + .name = "sha256", + .test_cases = hash_test_cases, + .suite_init = hash_suite_init, + .suite_exit = hash_suite_exit, +}; +kunit_test_suite(hash_test_suite); + +MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-256 and HMAC-SHA256"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/tests/sha384-testvecs.h b/lib/crypto/tests/sha384-testvecs.h new file mode 100644 index 000000000000..6e2f572dd792 --- /dev/null +++ b/lib/crypto/tests/sha384-testvecs.h @@ -0,0 +1,290 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha384 */ + +static const struct { + size_t data_len; + u8 digest[SHA384_DIGEST_SIZE]; +} hash_testvecs[] = { + { + .data_len = 0, + .digest = { + 0x38, 0xb0, 0x60, 0xa7, 0x51, 0xac, 0x96, 0x38, + 0x4c, 0xd9, 0x32, 0x7e, 0xb1, 0xb1, 0xe3, 0x6a, + 0x21, 0xfd, 0xb7, 0x11, 0x14, 0xbe, 0x07, 0x43, + 0x4c, 0x0c, 0xc7, 0xbf, 0x63, 0xf6, 0xe1, 0xda, + 0x27, 0x4e, 0xde, 0xbf, 0xe7, 0x6f, 0x65, 0xfb, + 0xd5, 0x1a, 0xd2, 0xf1, 0x48, 0x98, 0xb9, 0x5b, + }, + }, + { + .data_len = 1, + .digest = { + 0x07, 0x34, 0x9d, 0x74, 0x48, 0x76, 0xa5, 0x72, + 0x78, 0x02, 0xb8, 0x6e, 0x21, 0x59, 0xb0, 0x75, + 0x09, 0x68, 0x11, 0x39, 0x53, 0x61, 0xee, 0x8d, + 0xf2, 0x01, 0xf3, 0x90, 0x53, 0x7c, 0xd3, 0xde, + 0x13, 0x9f, 0xd2, 0x74, 0x28, 0xfe, 0xe1, 0xc8, + 0x2e, 0x95, 0xc6, 0x7d, 0x69, 0x4d, 0x04, 0xc6, + }, + }, + { + .data_len = 2, + .digest = { + 0xc4, 0xef, 0x6e, 0x8c, 0x19, 0x1c, 0xaa, 0x0e, + 0x86, 0xf2, 0x68, 0xa1, 0xa0, 0x2d, 0x2e, 0xb2, + 0x84, 0xbc, 0x5d, 0x53, 0x31, 0xf8, 0x03, 0x75, + 0x56, 0xf4, 0x8b, 0x23, 0x1a, 0x68, 0x15, 0x9a, + 0x60, 0xb2, 0xec, 0x05, 0xe1, 0xd4, 0x5e, 0x9e, + 0xe8, 0x7c, 0x9d, 0xe4, 0x0f, 0x9c, 0x3a, 0xdd, + }, + }, + { + .data_len = 3, + .digest = { + 0x29, 0xd2, 0x02, 0xa2, 0x77, 0x24, 0xc7, 0xa7, + 0x23, 0x0c, 0x3e, 0x30, 0x56, 0x47, 0xdb, 0x75, + 0xd4, 0x41, 0xf8, 0xb3, 0x8e, 0x26, 0xf6, 0x92, + 0xbc, 0x20, 0x2e, 0x96, 0xcc, 0x81, 0x5f, 0x32, + 0x82, 0x60, 0xe2, 0xcf, 0x23, 0xd7, 0x3c, 0x90, + 0xb2, 0x56, 0x8f, 0xb6, 0x0f, 0xf0, 0x6b, 0x80, + }, + }, + { + .data_len = 16, + .digest = { + 0x21, 0x4c, 0xac, 0xfe, 0xbd, 0x40, 0x74, 0x1f, + 0xa2, 0x2d, 0x2f, 0x35, 0x91, 0xfd, 0xc9, 0x97, + 0x88, 0x12, 0x6c, 0x0c, 0x6e, 0xd8, 0x50, 0x0b, + 0x4b, 0x2c, 0x89, 0xa6, 0xa6, 0x4a, 0xad, 0xd7, + 0x72, 0x62, 0x2c, 0x62, 0x81, 0xcd, 0x24, 0x74, + 0xf5, 0x44, 0x05, 0xa0, 0x97, 0xea, 0xf1, 0x78, + }, + }, + { 
+ .data_len = 32, + .digest = { + 0x06, 0x8b, 0x92, 0x9f, 0x8b, 0x64, 0xb2, 0x80, + 0xde, 0xcc, 0xde, 0xc3, 0x2f, 0x22, 0x27, 0xe8, + 0x3b, 0x6e, 0x16, 0x21, 0x14, 0x81, 0xbe, 0x5b, + 0xa7, 0xa7, 0x14, 0x8a, 0x00, 0x8f, 0x0d, 0x38, + 0x11, 0x63, 0xe8, 0x3e, 0xb9, 0xf1, 0xcf, 0x87, + 0xb1, 0x28, 0xe5, 0xa1, 0x89, 0xa8, 0x7a, 0xde, + }, + }, + { + .data_len = 48, + .digest = { + 0x9e, 0x37, 0x76, 0x62, 0x98, 0x39, 0xbe, 0xfd, + 0x2b, 0x91, 0x20, 0x54, 0x8f, 0x21, 0xe7, 0x30, + 0x0a, 0x01, 0x7a, 0x65, 0x0b, 0xc9, 0xb3, 0x89, + 0x3c, 0xb6, 0xd3, 0xa8, 0xff, 0xc9, 0x1b, 0x5c, + 0xd4, 0xac, 0xb4, 0x7e, 0xba, 0x94, 0xc3, 0x8a, + 0x26, 0x41, 0xf6, 0xd5, 0xed, 0x6f, 0x27, 0xa7, + }, + }, + { + .data_len = 49, + .digest = { + 0x03, 0x1f, 0xef, 0x5a, 0x16, 0x28, 0x78, 0x10, + 0x29, 0xe8, 0xe2, 0xe4, 0x84, 0x36, 0x19, 0x10, + 0xaa, 0xea, 0xde, 0x06, 0x39, 0x5f, 0xb2, 0x36, + 0xca, 0x24, 0x4f, 0x7b, 0x66, 0xf7, 0xe7, 0x31, + 0xf3, 0x9b, 0x74, 0x1e, 0x17, 0x20, 0x88, 0x62, + 0x50, 0xeb, 0x5f, 0x9a, 0xa7, 0x2c, 0xf4, 0xc9, + }, + }, + { + .data_len = 63, + .digest = { + 0x10, 0xce, 0xed, 0x26, 0xb8, 0xac, 0xc1, 0x1b, + 0xe6, 0xb9, 0xeb, 0x7c, 0xae, 0xcd, 0x55, 0x5a, + 0x20, 0x2a, 0x7b, 0x43, 0xe6, 0x3e, 0xf0, 0x3f, + 0xd9, 0x2f, 0x8c, 0x52, 0xe2, 0xf0, 0xb6, 0x24, + 0x2e, 0xa4, 0xac, 0x24, 0x3a, 0x54, 0x99, 0x71, + 0x65, 0xab, 0x97, 0x2d, 0xb6, 0xe6, 0x94, 0x20, + }, + }, + { + .data_len = 64, + .digest = { + 0x24, 0x6d, 0x9f, 0x59, 0x42, 0x36, 0xca, 0x34, + 0x36, 0x41, 0xa2, 0xcd, 0x69, 0xdf, 0x3d, 0xcb, + 0x64, 0x94, 0x54, 0xb2, 0xed, 0xc1, 0x1c, 0x31, + 0xe3, 0x26, 0xcb, 0x71, 0xe6, 0x98, 0xb2, 0x56, + 0x74, 0x30, 0xa9, 0x15, 0x98, 0x9d, 0xb3, 0x07, + 0xcc, 0xa8, 0xcc, 0x6f, 0x42, 0xb0, 0x9d, 0x2b, + }, + }, + { + .data_len = 65, + .digest = { + 0x85, 0x1f, 0xbc, 0x5e, 0x2a, 0x00, 0x7d, 0xc2, + 0x21, 0x4c, 0x28, 0x14, 0xc5, 0xd8, 0x0c, 0xe8, + 0x55, 0xa5, 0xa0, 0x77, 0xda, 0x8f, 0xce, 0xd4, + 0xf0, 0xcb, 0x30, 0xb8, 0x9c, 0x47, 0xe1, 0x33, + 0x92, 0x18, 0xc5, 0x1f, 0xf2, 0xef, 0xb5, 0xe5, + 0xbc, 0x63, 0xa6, 0xe5, 0x9a, 0xc9, 0xcc, 0xf1, + }, + }, + { + .data_len = 127, + .digest = { + 0x26, 0xd2, 0x4c, 0xb6, 0xce, 0xd8, 0x22, 0x2b, + 0x44, 0x10, 0x6f, 0x59, 0xf7, 0x0d, 0xb9, 0x3f, + 0x7d, 0x29, 0x75, 0xf1, 0x71, 0xb2, 0x71, 0x23, + 0xef, 0x68, 0xb7, 0x25, 0xae, 0xb8, 0x45, 0xf8, + 0xa3, 0xb2, 0x2d, 0x7a, 0x83, 0x0a, 0x05, 0x61, + 0xbc, 0x73, 0xf1, 0xf9, 0xba, 0xfb, 0x3d, 0xc2, + }, + }, + { + .data_len = 128, + .digest = { + 0x7c, 0xe5, 0x7f, 0x5e, 0xea, 0xd9, 0x7e, 0x54, + 0x14, 0x30, 0x6f, 0x37, 0x02, 0x71, 0x0f, 0xf1, + 0x14, 0x16, 0xfa, 0xeb, 0x6e, 0x1e, 0xf0, 0xbe, + 0x10, 0xed, 0x01, 0xbf, 0xa0, 0x9d, 0xcb, 0x07, + 0x5f, 0x8b, 0x7f, 0x44, 0xe1, 0xd9, 0x13, 0xf0, + 0x29, 0xa2, 0x54, 0x32, 0xd9, 0xb0, 0x69, 0x69, + }, + }, + { + .data_len = 129, + .digest = { + 0xc5, 0x54, 0x1f, 0xcb, 0x9d, 0x8f, 0xdf, 0xbf, + 0xab, 0x55, 0x92, 0x1d, 0x3b, 0x93, 0x79, 0x26, + 0xdf, 0xba, 0x9a, 0x28, 0xff, 0xa0, 0x6c, 0xae, + 0x7b, 0x53, 0x8d, 0xfa, 0xef, 0x35, 0x88, 0x19, + 0x16, 0xb8, 0x72, 0x86, 0x76, 0x2a, 0xf5, 0xe6, + 0xec, 0xb2, 0xd7, 0xd4, 0xbe, 0x1a, 0xe4, 0x9f, + }, + }, + { + .data_len = 256, + .digest = { + 0x74, 0x9d, 0x77, 0xfb, 0xe8, 0x0f, 0x0c, 0x2d, + 0x86, 0x0d, 0x49, 0xea, 0x2b, 0xd0, 0x13, 0xd1, + 0xe8, 0xb8, 0xe1, 0xa3, 0x7b, 0x48, 0xab, 0x6a, + 0x21, 0x2b, 0x4c, 0x48, 0x32, 0xb5, 0xdc, 0x31, + 0x7f, 0xd0, 0x32, 0x67, 0x9a, 0xc0, 0x85, 0x53, + 0xef, 0xe9, 0xfb, 0xe1, 0x8b, 0xd8, 0xcc, 0xc2, + }, + }, + { + .data_len = 511, + .digest = { + 0x7b, 0xa9, 0xde, 0xa3, 0x07, 0x5c, 0x4c, 0xaa, + 0x31, 
0xc6, 0x9e, 0x55, 0xd4, 0x3f, 0x52, 0xdd, + 0xde, 0x36, 0x70, 0x96, 0x59, 0x6e, 0x90, 0x78, + 0x4c, 0x6a, 0x27, 0xde, 0x83, 0x84, 0xc3, 0x35, + 0x53, 0x76, 0x1d, 0xbf, 0x83, 0x64, 0xcf, 0xf2, + 0xb0, 0x3e, 0x07, 0x27, 0xe4, 0x25, 0x6c, 0x56, + }, + }, + { + .data_len = 513, + .digest = { + 0x53, 0x50, 0xf7, 0x3b, 0x86, 0x1d, 0x7a, 0xe2, + 0x5d, 0x9b, 0x71, 0xfa, 0x25, 0x23, 0x5a, 0xfe, + 0x8c, 0xb9, 0xac, 0x8a, 0x9d, 0x6c, 0x99, 0xbc, + 0x01, 0x9e, 0xa0, 0xd6, 0x3c, 0x03, 0x46, 0x21, + 0xb6, 0xd0, 0xb0, 0xb3, 0x23, 0x23, 0x58, 0xf1, + 0xea, 0x4e, 0xf2, 0x1a, 0x2f, 0x14, 0x2b, 0x5a, + }, + }, + { + .data_len = 1000, + .digest = { + 0x06, 0x03, 0xb3, 0xba, 0x14, 0xe0, 0x28, 0x07, + 0xd5, 0x15, 0x97, 0x1f, 0x87, 0xef, 0x80, 0xba, + 0x48, 0x03, 0xb6, 0xc5, 0x47, 0xca, 0x8c, 0x95, + 0xed, 0x95, 0xfd, 0x27, 0xb6, 0x83, 0xda, 0x6d, + 0xa7, 0xb2, 0x1a, 0xd2, 0xb5, 0x89, 0xbb, 0xb4, + 0x00, 0xbc, 0x86, 0x54, 0x7d, 0x5a, 0x91, 0x63, + }, + }, + { + .data_len = 3333, + .digest = { + 0xd3, 0xe0, 0x6e, 0x7d, 0x80, 0x08, 0x53, 0x07, + 0x8c, 0x0f, 0xc2, 0xce, 0x9f, 0x09, 0x86, 0x31, + 0x28, 0x24, 0x3c, 0x3e, 0x2d, 0x36, 0xb4, 0x28, + 0xc7, 0x1b, 0x70, 0xf9, 0x35, 0x9b, 0x10, 0xfa, + 0xc8, 0x5e, 0x2b, 0x32, 0x7f, 0x65, 0xd2, 0x68, + 0xb2, 0x84, 0x90, 0xf6, 0xc8, 0x6e, 0xb8, 0xdb, + }, + }, + { + .data_len = 4096, + .digest = { + 0x39, 0xeb, 0xc4, 0xb3, 0x08, 0xe2, 0xdd, 0xf3, + 0x9f, 0x5e, 0x44, 0x93, 0x63, 0x8b, 0x39, 0x57, + 0xd7, 0xe8, 0x7e, 0x3d, 0x74, 0xf8, 0xf6, 0xab, + 0xfe, 0x74, 0x51, 0xe4, 0x1b, 0x4a, 0x23, 0xbc, + 0x69, 0xfc, 0xbb, 0xa7, 0x71, 0xa7, 0x86, 0x24, + 0xcc, 0x85, 0x70, 0xf2, 0x31, 0x0d, 0x47, 0xc0, + }, + }, + { + .data_len = 4128, + .digest = { + 0x23, 0xc3, 0x97, 0x06, 0x79, 0xbe, 0x8a, 0xe9, + 0x1f, 0x1a, 0x43, 0xad, 0xe6, 0x76, 0x23, 0x13, + 0x64, 0xae, 0xda, 0xe7, 0x8b, 0x88, 0x96, 0xb6, + 0xa9, 0x1a, 0xb7, 0x80, 0x8e, 0x1c, 0x94, 0x98, + 0x09, 0x08, 0xdb, 0x8e, 0x4d, 0x0a, 0x09, 0x65, + 0xe5, 0x21, 0x1c, 0xd9, 0xab, 0x64, 0xbb, 0xea, + }, + }, + { + .data_len = 4160, + .digest = { + 0x4f, 0x4a, 0x88, 0x9f, 0x40, 0x89, 0xfe, 0xb6, + 0xda, 0x9d, 0xcd, 0xa5, 0x27, 0xd2, 0x29, 0x71, + 0x58, 0x60, 0xd4, 0x55, 0xfe, 0x92, 0xcd, 0x51, + 0x8b, 0xec, 0x3b, 0xd3, 0xd1, 0x3e, 0x8d, 0x36, + 0x7b, 0xb1, 0x41, 0xef, 0xec, 0x9d, 0xdf, 0xcd, + 0x4e, 0xde, 0x5a, 0xe5, 0xe5, 0x16, 0x14, 0x54, + }, + }, + { + .data_len = 4224, + .digest = { + 0xb5, 0xa5, 0x3e, 0x86, 0x39, 0x20, 0x49, 0x4c, + 0xcd, 0xb6, 0xdd, 0x03, 0xfe, 0x36, 0x6e, 0xa6, + 0xfc, 0xff, 0x19, 0x33, 0x0c, 0x52, 0xea, 0x37, + 0x94, 0xda, 0x5b, 0x27, 0xd1, 0x99, 0x5a, 0x89, + 0x40, 0x78, 0xfa, 0x96, 0xb9, 0x2f, 0xa0, 0x48, + 0xc9, 0xf8, 0x5c, 0xf0, 0x95, 0xf4, 0xea, 0x61, + }, + }, + { + .data_len = 16384, + .digest = { + 0x6f, 0x48, 0x6f, 0x21, 0xb9, 0xc1, 0xcc, 0x92, + 0x4e, 0xed, 0x6b, 0xef, 0x51, 0x88, 0xdf, 0xfd, + 0xcb, 0x3d, 0x44, 0x9c, 0x37, 0x85, 0xb4, 0xc5, + 0xeb, 0x60, 0x55, 0x58, 0x01, 0x47, 0xbf, 0x75, + 0x9b, 0xa8, 0x82, 0x8c, 0xec, 0xe8, 0x0e, 0x58, + 0xc1, 0x26, 0xa2, 0x45, 0x87, 0x3e, 0xfb, 0x8d, + }, + }, +}; + +static const u8 hash_testvec_consolidated[SHA384_DIGEST_SIZE] = { + 0xfc, 0xcb, 0xe6, 0x42, 0xf0, 0x9e, 0x2b, 0x77, + 0x7b, 0x62, 0xe8, 0x70, 0x86, 0xbf, 0xaf, 0x3c, + 0xbb, 0x02, 0xd9, 0x86, 0xdc, 0xba, 0xd3, 0xa4, + 0x0d, 0x8d, 0xb3, 0x2d, 0x0b, 0xa3, 0x84, 0x04, + 0x7c, 0x16, 0x37, 0xaf, 0xba, 0x1e, 0xd4, 0x2f, + 0x4c, 0x57, 0x55, 0x86, 0x52, 0x47, 0x9a, 0xec, +}; + +static const u8 hmac_testvec_consolidated[SHA384_DIGEST_SIZE] = { + 0x82, 0xcf, 0x7d, 0x80, 0x71, 0xdb, 0x91, 0x09, + 0x67, 0xe8, 0x44, 
0x4a, 0x0d, 0x03, 0xb1, 0xf9, + 0x62, 0xde, 0x4e, 0xbb, 0x1f, 0x41, 0xcd, 0x62, + 0x39, 0x6b, 0x2d, 0x44, 0x0e, 0xde, 0x98, 0x73, + 0xdd, 0xeb, 0x9d, 0x53, 0xfb, 0xee, 0xd1, 0xc3, + 0x96, 0xdb, 0xfc, 0x2a, 0x38, 0x90, 0x02, 0x53, +}; diff --git a/lib/crypto/tests/sha384_kunit.c b/lib/crypto/tests/sha384_kunit.c new file mode 100644 index 000000000000..e1ef5c995bb6 --- /dev/null +++ b/lib/crypto/tests/sha384_kunit.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2025 Google LLC + */ +#include <crypto/sha2.h> +#include "sha384-testvecs.h" + +#define HASH sha384 +#define HASH_CTX sha384_ctx +#define HASH_SIZE SHA384_DIGEST_SIZE +#define HASH_INIT sha384_init +#define HASH_UPDATE sha384_update +#define HASH_FINAL sha384_final +#define HMAC_KEY hmac_sha384_key +#define HMAC_CTX hmac_sha384_ctx +#define HMAC_PREPAREKEY hmac_sha384_preparekey +#define HMAC_INIT hmac_sha384_init +#define HMAC_UPDATE hmac_sha384_update +#define HMAC_FINAL hmac_sha384_final +#define HMAC hmac_sha384 +#define HMAC_USINGRAWKEY hmac_sha384_usingrawkey +#include "hash-test-template.h" + +static struct kunit_case hash_test_cases[] = { + HASH_KUNIT_CASES, + KUNIT_CASE(benchmark_hash), + {}, +}; + +static struct kunit_suite hash_test_suite = { + .name = "sha384", + .test_cases = hash_test_cases, + .suite_init = hash_suite_init, + .suite_exit = hash_suite_exit, +}; +kunit_test_suite(hash_test_suite); + +MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-384 and HMAC-SHA384"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/tests/sha512-testvecs.h b/lib/crypto/tests/sha512-testvecs.h new file mode 100644 index 000000000000..c506aaf72ba7 --- /dev/null +++ b/lib/crypto/tests/sha512-testvecs.h @@ -0,0 +1,342 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha512 */ + +static const struct { + size_t data_len; + u8 digest[SHA512_DIGEST_SIZE]; +} hash_testvecs[] = { + { + .data_len = 0, + .digest = { + 0xcf, 0x83, 0xe1, 0x35, 0x7e, 0xef, 0xb8, 0xbd, + 0xf1, 0x54, 0x28, 0x50, 0xd6, 0x6d, 0x80, 0x07, + 0xd6, 0x20, 0xe4, 0x05, 0x0b, 0x57, 0x15, 0xdc, + 0x83, 0xf4, 0xa9, 0x21, 0xd3, 0x6c, 0xe9, 0xce, + 0x47, 0xd0, 0xd1, 0x3c, 0x5d, 0x85, 0xf2, 0xb0, + 0xff, 0x83, 0x18, 0xd2, 0x87, 0x7e, 0xec, 0x2f, + 0x63, 0xb9, 0x31, 0xbd, 0x47, 0x41, 0x7a, 0x81, + 0xa5, 0x38, 0x32, 0x7a, 0xf9, 0x27, 0xda, 0x3e, + }, + }, + { + .data_len = 1, + .digest = { + 0x12, 0xf2, 0xb6, 0xec, 0x84, 0xa0, 0x8e, 0xcf, + 0x0d, 0xec, 0x17, 0xfd, 0x1f, 0x91, 0x15, 0x69, + 0xd2, 0xb9, 0x89, 0xff, 0x89, 0x9d, 0xd9, 0x0b, + 0x7a, 0x0f, 0x82, 0x94, 0x57, 0x5b, 0xf3, 0x08, + 0x42, 0x45, 0x23, 0x08, 0x44, 0x54, 0x35, 0x36, + 0xed, 0x4b, 0xb3, 0xa5, 0xbf, 0x17, 0xc1, 0x3c, + 0xdd, 0x25, 0x4a, 0x30, 0x64, 0xed, 0x66, 0x06, + 0x72, 0x05, 0xc2, 0x71, 0x5a, 0x6c, 0xd0, 0x75, + }, + }, + { + .data_len = 2, + .digest = { + 0x01, 0x37, 0x97, 0xcc, 0x0a, 0xcb, 0x61, 0xa4, + 0x93, 0x26, 0x36, 0x4b, 0xd2, 0x27, 0xea, 0xaf, + 0xda, 0xfa, 0x8f, 0x86, 0x12, 0x99, 0x7b, 0xc8, + 0x94, 0xa9, 0x1c, 0x70, 0x3f, 0x43, 0xf3, 0x9a, + 0x02, 0xc5, 0x0d, 0x8e, 0x01, 0xf8, 0x3a, 0xa6, + 0xec, 0xaa, 0xc5, 0xc7, 0x9d, 0x3d, 0x7f, 0x9d, + 0x47, 0x0e, 0x58, 0x2d, 0x9a, 0x2d, 0x51, 0x1d, + 0xc3, 0x77, 0xb2, 0x7f, 0x69, 0x9a, 0xc3, 0x50, + }, + }, + { + .data_len = 3, + .digest = { + 0xd4, 0xa3, 0xc2, 0x50, 0xa0, 0x33, 0xc6, 0xe4, + 0x50, 0x08, 0xea, 0xc9, 0xb8, 0x35, 0x55, 0x34, + 0x61, 0xb8, 0x2e, 0xa2, 0xe5, 0xdc, 0x04, 0x70, + 0x99, 0x86, 0x5f, 0xee, 0x2e, 0x1e, 0xd4, 0x40, + 0xf8, 0x88, 
0x01, 0x84, 0x97, 0x16, 0x6a, 0xd3, + 0x0a, 0xb4, 0xe2, 0x34, 0xca, 0x1f, 0xfc, 0x6b, + 0x61, 0xf3, 0x7f, 0x72, 0xfa, 0x8d, 0x22, 0xcf, + 0x7f, 0x2d, 0x87, 0x9d, 0xbb, 0x59, 0xac, 0x53, + }, + }, + { + .data_len = 16, + .digest = { + 0x3b, 0xae, 0x66, 0x48, 0x0f, 0x35, 0x3c, 0xcd, + 0x92, 0xa9, 0xb6, 0xb9, 0xfe, 0x19, 0x90, 0x92, + 0x52, 0xa5, 0x02, 0xd1, 0x89, 0xcf, 0x98, 0x86, + 0x29, 0x28, 0xab, 0xc4, 0x9e, 0xcc, 0x75, 0x38, + 0x95, 0xa7, 0x59, 0x43, 0xef, 0x8c, 0x3a, 0xeb, + 0x40, 0xa3, 0xbe, 0x2b, 0x75, 0x0d, 0xfd, 0xc3, + 0xaf, 0x69, 0x08, 0xad, 0x9f, 0xc9, 0xf4, 0x96, + 0xa9, 0xc2, 0x2b, 0x1b, 0x66, 0x6f, 0x1d, 0x28, + }, + }, + { + .data_len = 32, + .digest = { + 0x9c, 0xfb, 0x3c, 0x40, 0xd5, 0x3b, 0xc4, 0xff, + 0x07, 0xa7, 0xf0, 0x24, 0xb7, 0xd6, 0x5e, 0x12, + 0x5b, 0x85, 0xb5, 0xa5, 0xe0, 0x82, 0xa6, 0xda, + 0x30, 0x13, 0x2f, 0x1a, 0xe3, 0xd0, 0x55, 0xcb, + 0x14, 0x19, 0xe2, 0x09, 0x91, 0x96, 0x26, 0xf9, + 0x38, 0xd7, 0xfa, 0x4a, 0xfb, 0x2f, 0x6f, 0xc0, + 0xf4, 0x95, 0xc3, 0x40, 0xf6, 0xdb, 0xe7, 0xc2, + 0x79, 0x23, 0xa4, 0x20, 0x96, 0x3a, 0x00, 0xbb, + }, + }, + { + .data_len = 48, + .digest = { + 0x92, 0x1a, 0x21, 0x06, 0x6e, 0x08, 0x84, 0x09, + 0x23, 0x8d, 0x63, 0xec, 0xd6, 0x72, 0xd3, 0x21, + 0x51, 0xe8, 0x65, 0x94, 0xf8, 0x1f, 0x5f, 0xa7, + 0xab, 0x6b, 0xae, 0x1c, 0x2c, 0xaf, 0xf9, 0x0c, + 0x7c, 0x5a, 0x74, 0x1d, 0x90, 0x26, 0x4a, 0xc3, + 0xa1, 0x60, 0xf4, 0x1d, 0xd5, 0x3c, 0x86, 0xe8, + 0x00, 0xb3, 0x99, 0x27, 0xb8, 0x9d, 0x3e, 0x17, + 0x32, 0x5a, 0x34, 0x3e, 0xc2, 0xb2, 0x6e, 0xbd, + }, + }, + { + .data_len = 49, + .digest = { + 0x5a, 0x1f, 0x40, 0x5f, 0xee, 0xf2, 0xdd, 0x67, + 0x01, 0xcb, 0x26, 0x58, 0xf5, 0x1b, 0xe8, 0x7e, + 0xeb, 0x7d, 0x9d, 0xef, 0xd3, 0x55, 0xd6, 0x89, + 0x2e, 0xfc, 0x14, 0xe2, 0x98, 0x4c, 0x31, 0xaa, + 0x69, 0x00, 0xf9, 0x4e, 0xb0, 0x75, 0x1b, 0x71, + 0x93, 0x60, 0xdf, 0xa1, 0xaf, 0xba, 0xc2, 0xd3, + 0x6a, 0x22, 0xa0, 0xff, 0xb5, 0x66, 0x15, 0x66, + 0xd2, 0x24, 0x9a, 0x7e, 0xe4, 0xe5, 0x84, 0xdb, + }, + }, + { + .data_len = 63, + .digest = { + 0x32, 0xbd, 0xcf, 0x72, 0xa9, 0x74, 0x87, 0xe6, + 0x2a, 0x53, 0x7e, 0x6d, 0xac, 0xc2, 0xdd, 0x2c, + 0x87, 0xb3, 0xf7, 0x90, 0x29, 0xc9, 0x16, 0x59, + 0xd2, 0x7e, 0x6e, 0x84, 0x1d, 0xa6, 0xaf, 0x3c, + 0xca, 0xd6, 0x1a, 0x24, 0xa4, 0xcd, 0x59, 0x44, + 0x20, 0xd7, 0xd2, 0x5b, 0x97, 0xda, 0xd5, 0xa9, + 0x23, 0xb1, 0xa4, 0x60, 0xb8, 0x05, 0x98, 0xdc, + 0xef, 0x89, 0x81, 0xe3, 0x3a, 0xf9, 0x24, 0x37, + }, + }, + { + .data_len = 64, + .digest = { + 0x96, 0x3a, 0x1a, 0xdd, 0x1b, 0xeb, 0x1a, 0x55, + 0x24, 0x52, 0x3d, 0xec, 0x9d, 0x52, 0x2e, 0xa6, + 0xfe, 0x81, 0xd6, 0x98, 0xac, 0xcc, 0x60, 0x56, + 0x04, 0x9d, 0xa3, 0xf3, 0x56, 0x05, 0xe4, 0x8a, + 0x61, 0xaf, 0x6f, 0x6e, 0x8e, 0x75, 0x67, 0x3a, + 0xd2, 0xb0, 0x85, 0x2d, 0x17, 0xd2, 0x86, 0x8c, + 0x50, 0x4b, 0xdd, 0xef, 0x35, 0x00, 0xde, 0x29, + 0x3d, 0x4b, 0x04, 0x12, 0x8a, 0x81, 0xe2, 0xcc, + }, + }, + { + .data_len = 65, + .digest = { + 0x9c, 0x6e, 0xf0, 0x6f, 0x71, 0x77, 0xd5, 0xd0, + 0xbb, 0x70, 0x1f, 0xcb, 0xbd, 0xd3, 0xfe, 0x23, + 0x71, 0x78, 0xad, 0x3a, 0xd2, 0x1e, 0x34, 0xf4, + 0x6d, 0xb4, 0xa2, 0x0a, 0x24, 0xcb, 0xe1, 0x99, + 0x07, 0xd0, 0x79, 0x8f, 0x7e, 0x69, 0x31, 0x68, + 0x29, 0xb5, 0x85, 0x82, 0x67, 0xdc, 0x4a, 0x8d, + 0x44, 0x04, 0x02, 0xc0, 0xfb, 0xd2, 0x19, 0x66, + 0x1e, 0x25, 0x8b, 0xd2, 0x5a, 0x59, 0x68, 0xc0, + }, + }, + { + .data_len = 127, + .digest = { + 0xb8, 0x8f, 0xa8, 0x29, 0x4d, 0xcf, 0x5f, 0x73, + 0x3c, 0x55, 0x43, 0xd9, 0x1c, 0xbc, 0x0c, 0x17, + 0x75, 0x0b, 0xc7, 0xb1, 0x1d, 0x9f, 0x7b, 0x2f, + 0x4c, 0x3d, 0x2a, 0x71, 0xfb, 0x1b, 0x0e, 0xca, 
+ 0x4e, 0x96, 0xa0, 0x95, 0xee, 0xf4, 0x93, 0x76, + 0x36, 0xfb, 0x5d, 0xd3, 0x46, 0xc4, 0x1d, 0x41, + 0x32, 0x92, 0x9d, 0xed, 0xdb, 0x7f, 0xfa, 0xb3, + 0x91, 0x61, 0x3e, 0xd6, 0xb2, 0xca, 0x8d, 0x81, + }, + }, + { + .data_len = 128, + .digest = { + 0x54, 0xac, 0x1a, 0xa1, 0xa6, 0xa3, 0x47, 0x2a, + 0x5a, 0xac, 0x1a, 0x3a, 0x4b, 0xa1, 0x11, 0x08, + 0xa7, 0x90, 0xec, 0x52, 0xcb, 0xaf, 0x68, 0x41, + 0x38, 0x44, 0x53, 0x94, 0x93, 0x30, 0xaf, 0x3a, + 0xec, 0x99, 0x3a, 0x7d, 0x2a, 0xd5, 0xb6, 0x05, + 0xf5, 0xa6, 0xbb, 0x9b, 0x82, 0xc2, 0xbd, 0x98, + 0x28, 0x62, 0x98, 0x3e, 0xe4, 0x27, 0x9b, 0xaa, + 0xce, 0x0a, 0x6f, 0xab, 0x1b, 0x16, 0xf3, 0xdd, + }, + }, + { + .data_len = 129, + .digest = { + 0x04, 0x37, 0x60, 0xbc, 0xb3, 0xb1, 0xc6, 0x2d, + 0x42, 0xc5, 0xd7, 0x7e, 0xd9, 0x86, 0x82, 0xe0, + 0xf4, 0x62, 0xad, 0x75, 0x68, 0x0b, 0xc7, 0xa8, + 0xd6, 0x9a, 0x76, 0xe5, 0x29, 0xb8, 0x37, 0x30, + 0x0f, 0xc0, 0xbc, 0x81, 0x94, 0x7c, 0x13, 0xf4, + 0x9c, 0x27, 0xbc, 0x59, 0xa1, 0x70, 0x6a, 0x87, + 0x20, 0x12, 0x0a, 0x2a, 0x62, 0x5e, 0x6f, 0xca, + 0x91, 0x6b, 0x34, 0x7e, 0x4c, 0x0d, 0xf0, 0x6c, + }, + }, + { + .data_len = 256, + .digest = { + 0x4b, 0x7c, 0x1f, 0x53, 0x52, 0xcc, 0x30, 0xed, + 0x91, 0x44, 0x6f, 0x0d, 0xb5, 0x41, 0x79, 0x99, + 0xaf, 0x82, 0x65, 0x52, 0x03, 0xf8, 0x55, 0x74, + 0x7c, 0xd9, 0x41, 0xd6, 0xe8, 0x91, 0xa4, 0x85, + 0xcb, 0x0a, 0x60, 0x08, 0x76, 0x07, 0x60, 0x99, + 0x89, 0x76, 0xba, 0x84, 0xbd, 0x0b, 0xf2, 0xb3, + 0xdc, 0xf3, 0x33, 0xd1, 0x9b, 0x0b, 0x2e, 0x5d, + 0xf6, 0x9d, 0x0f, 0x67, 0xf4, 0x86, 0xb3, 0xd5, + }, + }, + { + .data_len = 511, + .digest = { + 0x7d, 0x83, 0x78, 0x6a, 0x5d, 0x52, 0x42, 0x2a, + 0xb1, 0x97, 0xc6, 0x62, 0xa2, 0x2a, 0x7c, 0x8b, + 0xcd, 0x4f, 0xa4, 0x86, 0x19, 0xa4, 0x5b, 0x1d, + 0xc7, 0x6f, 0x2f, 0x9c, 0x03, 0xc3, 0x45, 0x2e, + 0xa7, 0x8e, 0x38, 0xf2, 0x57, 0x55, 0x89, 0x47, + 0xed, 0xeb, 0x81, 0xe2, 0xe0, 0x55, 0x9f, 0xe6, + 0xca, 0x03, 0x59, 0xd3, 0xd4, 0xba, 0xc9, 0x2d, + 0xaf, 0xbb, 0x62, 0x2e, 0xe6, 0x89, 0xe4, 0x11, + }, + }, + { + .data_len = 513, + .digest = { + 0xe9, 0x14, 0xe7, 0x01, 0xd0, 0x81, 0x09, 0x51, + 0x78, 0x1c, 0x8e, 0x6c, 0x00, 0xd3, 0x28, 0xa0, + 0x2a, 0x7b, 0xd6, 0x25, 0xca, 0xd0, 0xf9, 0xb8, + 0xd8, 0xcf, 0xd0, 0xb7, 0x48, 0x25, 0xb7, 0x6a, + 0x53, 0x8e, 0xf8, 0x52, 0x9c, 0x1f, 0x7d, 0xae, + 0x4c, 0x22, 0xd5, 0x9d, 0xf0, 0xaf, 0x98, 0x91, + 0x19, 0x1f, 0x99, 0xbd, 0xa6, 0xc2, 0x0f, 0x05, + 0xa5, 0x9f, 0x3e, 0x87, 0xed, 0xc3, 0xab, 0x92, + }, + }, + { + .data_len = 1000, + .digest = { + 0x2e, 0xf4, 0x72, 0xd2, 0xd9, 0x4a, 0xd5, 0xf9, + 0x20, 0x03, 0x4a, 0xad, 0xed, 0xa9, 0x1b, 0x64, + 0x73, 0x38, 0xc6, 0x30, 0xa8, 0x7f, 0xf9, 0x3b, + 0x8c, 0xbc, 0xa1, 0x2d, 0x22, 0x7b, 0x84, 0x37, + 0xf5, 0xba, 0xee, 0xf0, 0x80, 0x9d, 0xe3, 0x82, + 0xbd, 0x07, 0x68, 0x15, 0x01, 0x22, 0xf6, 0x88, + 0x07, 0x0b, 0xfd, 0xb7, 0xb1, 0xc0, 0x68, 0x4b, + 0x8d, 0x05, 0xec, 0xfb, 0xcd, 0xde, 0xa4, 0x2a, + }, + }, + { + .data_len = 3333, + .digest = { + 0x73, 0xe3, 0xe5, 0x87, 0x01, 0x0a, 0x29, 0x4d, + 0xad, 0x92, 0x67, 0x64, 0xc7, 0x71, 0x0b, 0x22, + 0x80, 0x8a, 0x6e, 0x8b, 0x20, 0x73, 0xb2, 0xd7, + 0x98, 0x20, 0x35, 0x42, 0x42, 0x5d, 0x85, 0x12, + 0xb0, 0x06, 0x69, 0x63, 0x5f, 0x5b, 0xe7, 0x63, + 0x6f, 0xe6, 0x18, 0xa6, 0xc1, 0xa6, 0xae, 0x27, + 0xa7, 0x6a, 0x73, 0x6b, 0x27, 0xd5, 0x47, 0xe1, + 0xa2, 0x7d, 0xe4, 0x0d, 0xbd, 0x23, 0x7b, 0x7a, + }, + }, + { + .data_len = 4096, + .digest = { + 0x11, 0x5b, 0x77, 0x36, 0x6b, 0x3b, 0xe4, 0x42, + 0xe4, 0x92, 0x23, 0xcb, 0x0c, 0x06, 0xff, 0xb7, + 0x0c, 0x71, 0x64, 0xd9, 0x8a, 0x57, 0x75, 0x7b, + 0xa2, 0xd2, 0x17, 0x19, 
0xbb, 0xb5, 0x3c, 0xb3, + 0x5f, 0xae, 0x35, 0x75, 0x8e, 0xa8, 0x97, 0x43, + 0xce, 0xfe, 0x41, 0x84, 0xfe, 0xcb, 0x18, 0x70, + 0x68, 0x2e, 0x16, 0x19, 0xd5, 0x10, 0x0d, 0x2f, + 0x61, 0x87, 0x79, 0xee, 0x5f, 0x24, 0xdd, 0x76, + }, + }, + { + .data_len = 4128, + .digest = { + 0x9e, 0x96, 0xe1, 0x0a, 0xb2, 0xd5, 0xba, 0xcf, + 0x27, 0xba, 0x6f, 0x85, 0xe9, 0xbf, 0x96, 0xb9, + 0x5a, 0x00, 0x00, 0x06, 0xdc, 0xb7, 0xaf, 0x0a, + 0x8f, 0x1d, 0x31, 0xf6, 0xce, 0xc3, 0x50, 0x2e, + 0x61, 0x3a, 0x8b, 0x28, 0xaf, 0xb2, 0x50, 0x0d, + 0x00, 0x98, 0x02, 0x11, 0x6b, 0xfa, 0x51, 0xc1, + 0xde, 0xe1, 0x34, 0x9f, 0xda, 0x11, 0x63, 0xfa, + 0x0a, 0xa0, 0xa2, 0x67, 0x39, 0xeb, 0x9b, 0xf1, + }, + }, + { + .data_len = 4160, + .digest = { + 0x46, 0x4e, 0x81, 0xd1, 0x08, 0x2a, 0x46, 0x12, + 0x4e, 0xae, 0x1f, 0x5d, 0x57, 0xe5, 0x19, 0xbc, + 0x76, 0x38, 0xb6, 0xa7, 0xe3, 0x72, 0x6d, 0xaf, + 0x80, 0x3b, 0xd0, 0xbc, 0x06, 0xe8, 0xab, 0xab, + 0x86, 0x4b, 0x0b, 0x7a, 0x61, 0xa6, 0x13, 0xff, + 0x64, 0x47, 0x89, 0xb7, 0x63, 0x8a, 0xa5, 0x4c, + 0x9f, 0x52, 0x70, 0xeb, 0x21, 0xe5, 0x2d, 0xe9, + 0xe7, 0xab, 0x1c, 0x0e, 0x74, 0xf5, 0x72, 0xec, + }, + }, + { + .data_len = 4224, + .digest = { + 0xfa, 0x6e, 0xff, 0x3c, 0xc1, 0x98, 0x49, 0x42, + 0x34, 0x67, 0xd4, 0xd3, 0xfa, 0xae, 0x27, 0xe4, + 0x77, 0x11, 0x84, 0xd2, 0x57, 0x99, 0xf8, 0xfd, + 0x41, 0x50, 0x84, 0x80, 0x7f, 0xf7, 0xb2, 0xd3, + 0x88, 0x21, 0x9c, 0xe8, 0xb9, 0x05, 0xd3, 0x48, + 0x64, 0xc5, 0xb7, 0x29, 0xd9, 0x21, 0x17, 0xad, + 0x89, 0x9c, 0x79, 0x55, 0x51, 0x0b, 0x96, 0x3e, + 0x10, 0x40, 0xe1, 0xdd, 0x7b, 0x39, 0x40, 0x86, + }, + }, + { + .data_len = 16384, + .digest = { + 0x41, 0xb3, 0xd2, 0x93, 0xcd, 0x79, 0x84, 0xc2, + 0xf5, 0xea, 0xf3, 0xb3, 0x94, 0x23, 0xaa, 0x76, + 0x87, 0x5f, 0xe3, 0xd2, 0x03, 0xd8, 0x00, 0xbb, + 0xa1, 0x55, 0xe4, 0xcb, 0x16, 0x04, 0x5b, 0xdf, + 0xf8, 0xd2, 0x63, 0x51, 0x02, 0x22, 0xc6, 0x0f, + 0x98, 0x2b, 0x12, 0x52, 0x25, 0x64, 0x93, 0xd9, + 0xab, 0xe9, 0x4d, 0x16, 0x4b, 0xf6, 0x09, 0x83, + 0x5c, 0x63, 0x1c, 0x41, 0x19, 0xf6, 0x76, 0xe3, + }, + }, +}; + +static const u8 hash_testvec_consolidated[SHA512_DIGEST_SIZE] = { + 0x5b, 0x9d, 0xf9, 0xab, 0x8c, 0x8e, 0x52, 0xdb, + 0x02, 0xa0, 0x4c, 0x24, 0x2d, 0xc4, 0xa8, 0x4e, + 0x9c, 0x93, 0x2f, 0x72, 0xa8, 0x75, 0xfb, 0xb5, + 0xdb, 0xef, 0x52, 0xc6, 0xa3, 0xfe, 0xeb, 0x6b, + 0x92, 0x79, 0x18, 0x05, 0xf6, 0xd7, 0xaf, 0x7b, + 0x36, 0xfc, 0x83, 0x2c, 0x7e, 0x7b, 0x59, 0x8b, + 0xf9, 0x81, 0xaa, 0x98, 0x38, 0x11, 0x97, 0x56, + 0x34, 0xe5, 0x2a, 0x4b, 0xf2, 0x9e, 0xf3, 0xdb, +}; + +static const u8 hmac_testvec_consolidated[SHA512_DIGEST_SIZE] = { + 0x40, 0xe7, 0xbc, 0x03, 0xdf, 0x22, 0xd4, 0x76, + 0x66, 0x45, 0xf8, 0x1d, 0x25, 0xdf, 0xbe, 0xa2, + 0x93, 0x06, 0x8c, 0x1d, 0x14, 0x23, 0x9b, 0x5c, + 0xfa, 0xac, 0xdf, 0xbd, 0xa2, 0x24, 0xe5, 0xf7, + 0xdc, 0xf7, 0xae, 0x96, 0xc1, 0x34, 0xe5, 0x24, + 0x16, 0x24, 0xdc, 0xee, 0x4f, 0x62, 0x1c, 0x67, + 0x4e, 0x02, 0x31, 0x4b, 0x9b, 0x65, 0x25, 0xeb, + 0x32, 0x2e, 0x24, 0xfb, 0xcd, 0x2b, 0x59, 0xd8, +}; diff --git a/lib/crypto/tests/sha512_kunit.c b/lib/crypto/tests/sha512_kunit.c new file mode 100644 index 000000000000..8923e2d7d3d4 --- /dev/null +++ b/lib/crypto/tests/sha512_kunit.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2025 Google LLC + */ +#include <crypto/sha2.h> +#include "sha512-testvecs.h" + +#define HASH sha512 +#define HASH_CTX sha512_ctx +#define HASH_SIZE SHA512_DIGEST_SIZE +#define HASH_INIT sha512_init +#define HASH_UPDATE sha512_update +#define HASH_FINAL sha512_final +#define HMAC_KEY hmac_sha512_key +#define 
HMAC_CTX hmac_sha512_ctx +#define HMAC_PREPAREKEY hmac_sha512_preparekey +#define HMAC_INIT hmac_sha512_init +#define HMAC_UPDATE hmac_sha512_update +#define HMAC_FINAL hmac_sha512_final +#define HMAC hmac_sha512 +#define HMAC_USINGRAWKEY hmac_sha512_usingrawkey +#include "hash-test-template.h" + +static struct kunit_case hash_test_cases[] = { + HASH_KUNIT_CASES, + KUNIT_CASE(benchmark_hash), + {}, +}; + +static struct kunit_suite hash_test_suite = { + .name = "sha512", + .test_cases = hash_test_cases, + .suite_init = hash_suite_init, + .suite_exit = hash_suite_exit, +}; +kunit_test_suite(hash_test_suite); + +MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-512 and HMAC-SHA512"); +MODULE_LICENSE("GPL"); diff --git a/lib/crypto/utils.c b/lib/crypto/utils.c index 87da2a6dd161..dec381d5e906 100644 --- a/lib/crypto/utils.c +++ b/lib/crypto/utils.c @@ -5,9 +5,10 @@ * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> */ -#include <linux/unaligned.h> #include <crypto/utils.h> +#include <linux/export.h> #include <linux/module.h> +#include <linux/unaligned.h> /* * XOR @len bytes from @src1 and @src2 together, writing the result to @dst diff --git a/lib/crypto/x86/.gitignore b/lib/crypto/x86/.gitignore new file mode 100644 index 000000000000..580c839bb177 --- /dev/null +++ b/lib/crypto/x86/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +poly1305-x86_64-cryptogams.S diff --git a/lib/crypto/x86/Kconfig b/lib/crypto/x86/Kconfig new file mode 100644 index 000000000000..546fe2afe0b5 --- /dev/null +++ b/lib/crypto/x86/Kconfig @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_BLAKE2S_X86 + bool "Hash functions: BLAKE2s (SSSE3/AVX-512)" + depends on 64BIT + select CRYPTO_LIB_BLAKE2S_GENERIC + select CRYPTO_ARCH_HAVE_LIB_BLAKE2S + help + BLAKE2s cryptographic hash function (RFC 7693) + + Architecture: x86_64 using: + - SSSE3 (Supplemental SSE3) + - AVX-512 (Advanced Vector Extensions-512) + +config CRYPTO_CHACHA20_X86_64 + tristate + depends on 64BIT + default CRYPTO_LIB_CHACHA + select CRYPTO_LIB_CHACHA_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_POLY1305_X86_64 + tristate + depends on 64BIT + default CRYPTO_LIB_POLY1305 + select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/x86/Makefile b/lib/crypto/x86/Makefile new file mode 100644 index 000000000000..c2ff8c5f1046 --- /dev/null +++ b/lib/crypto/x86/Makefile @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o +libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o + +obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o +chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha-avx512vl-x86_64.o chacha_glue.o + +obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o +poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o +targets += poly1305-x86_64-cryptogams.S + +quiet_cmd_perlasm = PERLASM $@ + cmd_perlasm = $(PERL) $< > $@ + +$(obj)/%.S: $(src)/%.pl FORCE + $(call if_changed,perlasm) diff --git a/lib/crypto/x86/blake2s-core.S b/lib/crypto/x86/blake2s-core.S new file mode 100644 index 000000000000..ac1c845445a4 --- /dev/null +++ b/lib/crypto/x86/blake2s-core.S @@ -0,0 +1,252 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. 
+ */ + +#include <linux/linkage.h> + +.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32 +.align 32 +IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667 + .octa 0x5BE0CD191F83D9AB9B05688C510E527F +.section .rodata.cst16.ROT16, "aM", @progbits, 16 +.align 16 +ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302 +.section .rodata.cst16.ROR328, "aM", @progbits, 16 +.align 16 +ROR328: .octa 0x0C0F0E0D080B0A090407060500030201 +.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160 +.align 64 +SIGMA: +.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 +.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7 +.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1 +.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0 +.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8 +.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14 +.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2 +.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6 +.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4 +.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12 +.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640 +.align 64 +SIGMA2: +.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 +.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7 +.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9 +.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5 +.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12 +.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9 +.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0 +.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10 +.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14 +.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9 + +.text +SYM_FUNC_START(blake2s_compress_ssse3) + testq %rdx,%rdx + je .Lendofloop + movdqu (%rdi),%xmm0 + movdqu 0x10(%rdi),%xmm1 + movdqa ROT16(%rip),%xmm12 + movdqa ROR328(%rip),%xmm13 + movdqu 0x20(%rdi),%xmm14 + movq %rcx,%xmm15 + leaq SIGMA+0xa0(%rip),%r8 + jmp .Lbeginofloop + .align 32 +.Lbeginofloop: + movdqa %xmm0,%xmm10 + movdqa %xmm1,%xmm11 + paddq %xmm15,%xmm14 + movdqa IV(%rip),%xmm2 + movdqa %xmm14,%xmm3 + pxor IV+0x10(%rip),%xmm3 + leaq SIGMA(%rip),%rcx +.Lroundloop: + movzbl (%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + movzbl 0x1(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + movzbl 0x2(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + movzbl 0x3(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + punpckldq %xmm5,%xmm4 + punpckldq %xmm7,%xmm6 + punpcklqdq %xmm6,%xmm4 + paddd %xmm4,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movzbl 0x4(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + movzbl 0x5(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + movzbl 0x6(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + movzbl 0x7(%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + punpckldq %xmm6,%xmm5 + punpckldq %xmm4,%xmm7 + punpcklqdq %xmm7,%xmm5 + paddd %xmm5,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x93,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x39,%xmm2,%xmm2 + movzbl 0x8(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + movzbl 0x9(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + movzbl 0xa(%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + movzbl 0xb(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + punpckldq %xmm7,%xmm6 + punpckldq %xmm5,%xmm4 + 
punpcklqdq %xmm4,%xmm6 + paddd %xmm6,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movzbl 0xc(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + movzbl 0xd(%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + movzbl 0xe(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + movzbl 0xf(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + punpckldq %xmm4,%xmm7 + punpckldq %xmm6,%xmm5 + punpcklqdq %xmm5,%xmm7 + paddd %xmm7,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x39,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x93,%xmm2,%xmm2 + addq $0x10,%rcx + cmpq %r8,%rcx + jnz .Lroundloop + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm1 + pxor %xmm10,%xmm0 + pxor %xmm11,%xmm1 + addq $0x40,%rsi + decq %rdx + jnz .Lbeginofloop + movdqu %xmm0,(%rdi) + movdqu %xmm1,0x10(%rdi) + movdqu %xmm14,0x20(%rdi) +.Lendofloop: + RET +SYM_FUNC_END(blake2s_compress_ssse3) + +SYM_FUNC_START(blake2s_compress_avx512) + vmovdqu (%rdi),%xmm0 + vmovdqu 0x10(%rdi),%xmm1 + vmovdqu 0x20(%rdi),%xmm4 + vmovq %rcx,%xmm5 + vmovdqa IV(%rip),%xmm14 + vmovdqa IV+16(%rip),%xmm15 + jmp .Lblake2s_compress_avx512_mainloop +.align 32 +.Lblake2s_compress_avx512_mainloop: + vmovdqa %xmm0,%xmm10 + vmovdqa %xmm1,%xmm11 + vpaddq %xmm5,%xmm4,%xmm4 + vmovdqa %xmm14,%xmm2 + vpxor %xmm15,%xmm4,%xmm3 + vmovdqu (%rsi),%ymm6 + vmovdqu 0x20(%rsi),%ymm7 + addq $0x40,%rsi + leaq SIGMA2(%rip),%rax + movb $0xa,%cl +.Lblake2s_compress_avx512_roundloop: + addq $0x40,%rax + vmovdqa -0x40(%rax),%ymm8 + vmovdqa -0x20(%rax),%ymm9 + vpermi2d %ymm7,%ymm6,%ymm8 + vpermi2d %ymm7,%ymm6,%ymm9 + vmovdqa %ymm8,%ymm6 + vmovdqa %ymm9,%ymm7 + vpaddd %xmm8,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x10,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0xc,%xmm1,%xmm1 + vextracti128 $0x1,%ymm8,%xmm8 + vpaddd %xmm8,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x8,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0x7,%xmm1,%xmm1 + vpshufd $0x93,%xmm0,%xmm0 + vpshufd $0x4e,%xmm3,%xmm3 + vpshufd $0x39,%xmm2,%xmm2 + vpaddd %xmm9,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x10,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0xc,%xmm1,%xmm1 + vextracti128 $0x1,%ymm9,%xmm9 + vpaddd %xmm9,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x8,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0x7,%xmm1,%xmm1 + vpshufd $0x39,%xmm0,%xmm0 + vpshufd $0x4e,%xmm3,%xmm3 + vpshufd $0x93,%xmm2,%xmm2 + decb %cl + jne .Lblake2s_compress_avx512_roundloop + vpxor %xmm10,%xmm0,%xmm0 + vpxor %xmm11,%xmm1,%xmm1 + vpxor %xmm2,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + decq %rdx + jne .Lblake2s_compress_avx512_mainloop + vmovdqu %xmm0,(%rdi) + vmovdqu %xmm1,0x10(%rdi) + vmovdqu %xmm4,0x20(%rdi) + vzeroupper + RET +SYM_FUNC_END(blake2s_compress_avx512) diff --git a/lib/crypto/x86/blake2s-glue.c b/lib/crypto/x86/blake2s-glue.c new file mode 100644 index 000000000000..adc296cd17c9 --- /dev/null +++ b/lib/crypto/x86/blake2s-glue.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 
+ */ + +#include <asm/cpufeature.h> +#include <asm/fpu/api.h> +#include <asm/processor.h> +#include <asm/simd.h> +#include <crypto/internal/blake2s.h> +#include <linux/init.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/sizes.h> + +asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, + const u8 *block, const size_t nblocks, + const u32 inc); +asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, + const u8 *block, const size_t nblocks, + const u32 inc); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); + +void blake2s_compress(struct blake2s_state *state, const u8 *block, + size_t nblocks, const u32 inc) +{ + /* SIMD disables preemption, so relax after processing each page. */ + BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); + + if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) { + blake2s_compress_generic(state, block, nblocks, inc); + return; + } + + do { + const size_t blocks = min_t(size_t, nblocks, + SZ_4K / BLAKE2S_BLOCK_SIZE); + + kernel_fpu_begin(); + if (static_branch_likely(&blake2s_use_avx512)) + blake2s_compress_avx512(state, block, blocks, inc); + else + blake2s_compress_ssse3(state, block, blocks, inc); + kernel_fpu_end(); + + nblocks -= blocks; + block += blocks * BLAKE2S_BLOCK_SIZE; + } while (nblocks); +} +EXPORT_SYMBOL(blake2s_compress); + +static int __init blake2s_mod_init(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + static_branch_enable(&blake2s_use_ssse3); + + if (boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | + XFEATURE_MASK_AVX512, NULL)) + static_branch_enable(&blake2s_use_avx512); + + return 0; +} + +subsys_initcall(blake2s_mod_init); diff --git a/lib/crypto/x86/chacha-avx2-x86_64.S b/lib/crypto/x86/chacha-avx2-x86_64.S new file mode 100644 index 000000000000..f3d8fc018249 --- /dev/null +++ b/lib/crypto/x86/chacha-avx2-x86_64.S @@ -0,0 +1,1021 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * ChaCha 256-bit cipher algorithm, x64 AVX2 functions + * + * Copyright (C) 2015 Martin Willi + */ + +#include <linux/linkage.h> + +.section .rodata.cst32.ROT8, "aM", @progbits, 32 +.align 32 +ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 + .octa 0x0e0d0c0f0a09080b0605040702010003 + +.section .rodata.cst32.ROT16, "aM", @progbits, 32 +.align 32 +ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 + .octa 0x0d0c0f0e09080b0a0504070601000302 + +.section .rodata.cst32.CTRINC, "aM", @progbits, 32 +.align 32 +CTRINC: .octa 0x00000003000000020000000100000000 + .octa 0x00000007000000060000000500000004 + +.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 +.align 32 +CTR2BL: .octa 0x00000000000000000000000000000000 + .octa 0x00000000000000000000000000000001 + +.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 +.align 32 +CTR4BL: .octa 0x00000000000000000000000000000002 + .octa 0x00000000000000000000000000000003 + +.text + +SYM_FUNC_START(chacha_2block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: up to 2 data blocks output, o + # %rdx: up to 2 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts two ChaCha blocks by loading the state + # matrix twice across four AVX registers. 
It performs matrix operations + # on four words in each matrix in parallel, but requires shuffling to + # rearrange the words after each round. + + vzeroupper + + # x0..3[0-2] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + + vmovdqa %ymm0,%ymm8 + vmovdqa %ymm1,%ymm9 + vmovdqa %ymm2,%ymm10 + vmovdqa %ymm3,%ymm11 + + vmovdqa ROT8(%rip),%ymm4 + vmovdqa ROT16(%rip),%ymm5 + + mov %rcx,%rax + +.Ldoubleround: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm5,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm6 + vpslld $12,%ymm6,%ymm6 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm6,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm4,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm7 + vpslld $7,%ymm7,%ymm7 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm5,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm6 + vpslld $12,%ymm6,%ymm6 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm6,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm4,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm7 + vpslld $7,%ymm7,%ymm7 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + + sub $2,%r8d + jnz .Ldoubleround + + # o0 = i0 ^ (x0 + s0) + vpaddd %ymm8,%ymm0,%ymm7 + cmp $0x10,%rax + jl .Lxorpart2 + vpxor 0x00(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x00(%rsi) + vextracti128 $1,%ymm7,%xmm0 + # o1 = i1 ^ (x1 + s1) + vpaddd %ymm9,%ymm1,%ymm7 + cmp $0x20,%rax + jl .Lxorpart2 + vpxor 0x10(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x10(%rsi) + vextracti128 $1,%ymm7,%xmm1 + # o2 = i2 ^ (x2 + s2) + vpaddd %ymm10,%ymm2,%ymm7 + cmp $0x30,%rax + jl .Lxorpart2 + vpxor 0x20(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x20(%rsi) + vextracti128 $1,%ymm7,%xmm2 + # o3 = i3 ^ (x3 + s3) + vpaddd %ymm11,%ymm3,%ymm7 + cmp $0x40,%rax + jl .Lxorpart2 + vpxor 0x30(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x30(%rsi) + vextracti128 $1,%ymm7,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm7 + cmp $0x50,%rax + jl .Lxorpart2 + vpxor 0x40(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x40(%rsi) + + vmovdqa %xmm1,%xmm7 + cmp $0x60,%rax + jl .Lxorpart2 + vpxor 0x50(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x50(%rsi) + + vmovdqa %xmm2,%xmm7 + cmp $0x70,%rax + jl .Lxorpart2 + vpxor 0x60(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x60(%rsi) + + vmovdqa %xmm3,%xmm7 + cmp $0x80,%rax + jl .Lxorpart2 + vpxor 0x70(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x70(%rsi) + +.Ldone2: + vzeroupper + RET + +.Lxorpart2: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and 
$0x0f,%r9 + jz .Ldone2 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%xmm7,%xmm7 + vmovdqa %xmm7,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp + jmp .Ldone2 + +SYM_FUNC_END(chacha_2block_xor_avx2) + +SYM_FUNC_START(chacha_4block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts four ChaCha blocks by loading the state + # matrix four times across eight AVX registers. It performs matrix + # operations on four words in two matrices in parallel, sequentially + # to the operations on the four words of the other two matrices. The + # required word shuffling has a rather high latency, we can do the + # arithmetic on two matrix-pairs without much slowdown. + + vzeroupper + + # x0..3[0-4] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vmovdqa %ymm0,%ymm4 + vmovdqa %ymm1,%ymm5 + vmovdqa %ymm2,%ymm6 + vmovdqa %ymm3,%ymm7 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + vpaddd CTR4BL(%rip),%ymm7,%ymm7 + + vmovdqa %ymm0,%ymm11 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm3,%ymm14 + vmovdqa %ymm7,%ymm15 + + vmovdqa ROT8(%rip),%ymm8 + vmovdqa ROT16(%rip),%ymm9 + + mov %rcx,%rax + +.Ldoubleround4: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm9,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm9,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + vpshufd $0x39,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + vpshufd $0x93,%ymm7,%ymm7 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm9,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm9,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd 
%ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + vpshufd $0x93,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + vpshufd $0x39,%ymm7,%ymm7 + + sub $2,%r8d + jnz .Ldoubleround4 + + # o0 = i0 ^ (x0 + s0), first block + vpaddd %ymm11,%ymm0,%ymm10 + cmp $0x10,%rax + jl .Lxorpart4 + vpxor 0x00(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x00(%rsi) + vextracti128 $1,%ymm10,%xmm0 + # o1 = i1 ^ (x1 + s1), first block + vpaddd %ymm12,%ymm1,%ymm10 + cmp $0x20,%rax + jl .Lxorpart4 + vpxor 0x10(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x10(%rsi) + vextracti128 $1,%ymm10,%xmm1 + # o2 = i2 ^ (x2 + s2), first block + vpaddd %ymm13,%ymm2,%ymm10 + cmp $0x30,%rax + jl .Lxorpart4 + vpxor 0x20(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x20(%rsi) + vextracti128 $1,%ymm10,%xmm2 + # o3 = i3 ^ (x3 + s3), first block + vpaddd %ymm14,%ymm3,%ymm10 + cmp $0x40,%rax + jl .Lxorpart4 + vpxor 0x30(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x30(%rsi) + vextracti128 $1,%ymm10,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm10 + cmp $0x50,%rax + jl .Lxorpart4 + vpxor 0x40(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x40(%rsi) + + vmovdqa %xmm1,%xmm10 + cmp $0x60,%rax + jl .Lxorpart4 + vpxor 0x50(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x50(%rsi) + + vmovdqa %xmm2,%xmm10 + cmp $0x70,%rax + jl .Lxorpart4 + vpxor 0x60(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x60(%rsi) + + vmovdqa %xmm3,%xmm10 + cmp $0x80,%rax + jl .Lxorpart4 + vpxor 0x70(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x70(%rsi) + + # o0 = i0 ^ (x0 + s0), third block + vpaddd %ymm11,%ymm4,%ymm10 + cmp $0x90,%rax + jl .Lxorpart4 + vpxor 0x80(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x80(%rsi) + vextracti128 $1,%ymm10,%xmm4 + # o1 = i1 ^ (x1 + s1), third block + vpaddd %ymm12,%ymm5,%ymm10 + cmp $0xa0,%rax + jl .Lxorpart4 + vpxor 0x90(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x90(%rsi) + vextracti128 $1,%ymm10,%xmm5 + # o2 = i2 ^ (x2 + s2), third block + vpaddd %ymm13,%ymm6,%ymm10 + cmp $0xb0,%rax + jl .Lxorpart4 + vpxor 0xa0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xa0(%rsi) + vextracti128 $1,%ymm10,%xmm6 + # o3 = i3 ^ (x3 + s3), third block + vpaddd %ymm15,%ymm7,%ymm10 + cmp $0xc0,%rax + jl .Lxorpart4 + vpxor 0xb0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xb0(%rsi) + vextracti128 $1,%ymm10,%xmm7 + + # xor and write fourth block + vmovdqa %xmm4,%xmm10 + cmp $0xd0,%rax + jl .Lxorpart4 + vpxor 0xc0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xc0(%rsi) + + vmovdqa %xmm5,%xmm10 + cmp $0xe0,%rax + jl .Lxorpart4 + vpxor 0xd0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xd0(%rsi) + + vmovdqa %xmm6,%xmm10 + cmp $0xf0,%rax + jl .Lxorpart4 + vpxor 0xe0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xe0(%rsi) + + vmovdqa %xmm7,%xmm10 + cmp $0x100,%rax + jl .Lxorpart4 + vpxor 0xf0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xf0(%rsi) + +.Ldone4: + vzeroupper + RET + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone4 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + 
sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%xmm10,%xmm10 + vmovdqa %xmm10,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp + jmp .Ldone4 + +SYM_FUNC_END(chacha_4block_xor_avx2) + +SYM_FUNC_START(chacha_8block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: up to 8 data blocks output, o + # %rdx: up to 8 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts eight consecutive ChaCha blocks by loading + # the state matrix in AVX registers eight times. As we need some + # scratch registers, we save the first four registers on the stack. The + # algorithm performs each operation on the corresponding word of each + # state matrix, hence requires no word shuffling. For final XORing step + # we transpose the matrix by interleaving 32-, 64- and then 128-bit + # words, which allows us to do XOR in AVX registers. 8/16-bit word + # rotation is done with the slightly better performing byte shuffling, + # 7/12-bit word rotation uses traditional shift+OR. + + vzeroupper + # 4 * 32 byte stack, 32-byte aligned + lea 8(%rsp),%r10 + and $~31, %rsp + sub $0x80, %rsp + mov %rcx,%rax + + # x0..15[0-7] = s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpbroadcastd 0x04(%rdi),%ymm1 + vpbroadcastd 0x08(%rdi),%ymm2 + vpbroadcastd 0x0c(%rdi),%ymm3 + vpbroadcastd 0x10(%rdi),%ymm4 + vpbroadcastd 0x14(%rdi),%ymm5 + vpbroadcastd 0x18(%rdi),%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm7 + vpbroadcastd 0x20(%rdi),%ymm8 + vpbroadcastd 0x24(%rdi),%ymm9 + vpbroadcastd 0x28(%rdi),%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm11 + vpbroadcastd 0x30(%rdi),%ymm12 + vpbroadcastd 0x34(%rdi),%ymm13 + vpbroadcastd 0x38(%rdi),%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm15 + # x0..3 on stack + vmovdqa %ymm0,0x00(%rsp) + vmovdqa %ymm1,0x20(%rsp) + vmovdqa %ymm2,0x40(%rsp) + vmovdqa %ymm3,0x60(%rsp) + + vmovdqa CTRINC(%rip),%ymm1 + vmovdqa ROT8(%rip),%ymm2 + vmovdqa ROT16(%rip),%ymm3 + + # x12 += counter values 0-3 + vpaddd %ymm1,%ymm12,%ymm12 + +.Ldoubleround8: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + vpaddd 0x00(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm3,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + vpaddd 0x20(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm3,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + vpaddd 0x40(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm3,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vpaddd 0x60(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm3,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $12,%ymm4,%ymm0 + vpsrld $20,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $12,%ymm5,%ymm0 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $12,%ymm6,%ymm0 + vpsrld $20,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm11,%ymm7,%ymm7 + vpslld $12,%ymm7,%ymm0 + vpsrld $20,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + vpaddd 0x00(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm12,%ymm12 
+ vpshufb %ymm2,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + vpaddd 0x20(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm2,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + vpaddd 0x40(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm2,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vpaddd 0x60(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm2,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm0 + vpsrld $25,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm0 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm0 + vpsrld $25,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm11,%ymm7,%ymm7 + vpslld $7,%ymm7,%ymm0 + vpsrld $25,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + vpaddd 0x00(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm3,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0 + vpaddd 0x20(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm3,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + vpaddd 0x40(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm3,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vpaddd 0x60(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm3,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 12) + vpaddd %ymm15,%ymm10,%ymm10 + vpxor %ymm10,%ymm5,%ymm5 + vpslld $12,%ymm5,%ymm0 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + vpaddd %ymm12,%ymm11,%ymm11 + vpxor %ymm11,%ymm6,%ymm6 + vpslld $12,%ymm6,%ymm0 + vpsrld $20,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + vpaddd %ymm13,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpslld $12,%ymm7,%ymm0 + vpsrld $20,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vpaddd %ymm14,%ymm9,%ymm9 + vpxor %ymm9,%ymm4,%ymm4 + vpslld $12,%ymm4,%ymm0 + vpsrld $20,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + vpaddd 0x00(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm2,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + vpaddd 0x20(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm2,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + vpaddd 0x40(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm2,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vpaddd 0x60(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm2,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + vpaddd %ymm15,%ymm10,%ymm10 + vpxor %ymm10,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm0 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + vpaddd %ymm12,%ymm11,%ymm11 + vpxor %ymm11,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm0 + vpsrld $25,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + vpaddd %ymm13,%ymm8,%ymm8 + 
vpxor %ymm8,%ymm7,%ymm7 + vpslld $7,%ymm7,%ymm0 + vpsrld $25,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + vpaddd %ymm14,%ymm9,%ymm9 + vpxor %ymm9,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm0 + vpsrld $25,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + + sub $2,%r8d + jnz .Ldoubleround8 + + # x0..15[0-3] += s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpaddd 0x00(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpbroadcastd 0x04(%rdi),%ymm0 + vpaddd 0x20(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpbroadcastd 0x08(%rdi),%ymm0 + vpaddd 0x40(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpbroadcastd 0x0c(%rdi),%ymm0 + vpaddd 0x60(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpbroadcastd 0x10(%rdi),%ymm0 + vpaddd %ymm0,%ymm4,%ymm4 + vpbroadcastd 0x14(%rdi),%ymm0 + vpaddd %ymm0,%ymm5,%ymm5 + vpbroadcastd 0x18(%rdi),%ymm0 + vpaddd %ymm0,%ymm6,%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm0 + vpaddd %ymm0,%ymm7,%ymm7 + vpbroadcastd 0x20(%rdi),%ymm0 + vpaddd %ymm0,%ymm8,%ymm8 + vpbroadcastd 0x24(%rdi),%ymm0 + vpaddd %ymm0,%ymm9,%ymm9 + vpbroadcastd 0x28(%rdi),%ymm0 + vpaddd %ymm0,%ymm10,%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm0 + vpaddd %ymm0,%ymm11,%ymm11 + vpbroadcastd 0x30(%rdi),%ymm0 + vpaddd %ymm0,%ymm12,%ymm12 + vpbroadcastd 0x34(%rdi),%ymm0 + vpaddd %ymm0,%ymm13,%ymm13 + vpbroadcastd 0x38(%rdi),%ymm0 + vpaddd %ymm0,%ymm14,%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm0 + vpaddd %ymm0,%ymm15,%ymm15 + + # x12 += counter values 0-3 + vpaddd %ymm1,%ymm12,%ymm12 + + # interleave 32-bit words in state n, n+1 + vmovdqa 0x00(%rsp),%ymm0 + vmovdqa 0x20(%rsp),%ymm1 + vpunpckldq %ymm1,%ymm0,%ymm2 + vpunpckhdq %ymm1,%ymm0,%ymm1 + vmovdqa %ymm2,0x00(%rsp) + vmovdqa %ymm1,0x20(%rsp) + vmovdqa 0x40(%rsp),%ymm0 + vmovdqa 0x60(%rsp),%ymm1 + vpunpckldq %ymm1,%ymm0,%ymm2 + vpunpckhdq %ymm1,%ymm0,%ymm1 + vmovdqa %ymm2,0x40(%rsp) + vmovdqa %ymm1,0x60(%rsp) + vmovdqa %ymm4,%ymm0 + vpunpckldq %ymm5,%ymm0,%ymm4 + vpunpckhdq %ymm5,%ymm0,%ymm5 + vmovdqa %ymm6,%ymm0 + vpunpckldq %ymm7,%ymm0,%ymm6 + vpunpckhdq %ymm7,%ymm0,%ymm7 + vmovdqa %ymm8,%ymm0 + vpunpckldq %ymm9,%ymm0,%ymm8 + vpunpckhdq %ymm9,%ymm0,%ymm9 + vmovdqa %ymm10,%ymm0 + vpunpckldq %ymm11,%ymm0,%ymm10 + vpunpckhdq %ymm11,%ymm0,%ymm11 + vmovdqa %ymm12,%ymm0 + vpunpckldq %ymm13,%ymm0,%ymm12 + vpunpckhdq %ymm13,%ymm0,%ymm13 + vmovdqa %ymm14,%ymm0 + vpunpckldq %ymm15,%ymm0,%ymm14 + vpunpckhdq %ymm15,%ymm0,%ymm15 + + # interleave 64-bit words in state n, n+2 + vmovdqa 0x00(%rsp),%ymm0 + vmovdqa 0x40(%rsp),%ymm2 + vpunpcklqdq %ymm2,%ymm0,%ymm1 + vpunpckhqdq %ymm2,%ymm0,%ymm2 + vmovdqa %ymm1,0x00(%rsp) + vmovdqa %ymm2,0x40(%rsp) + vmovdqa 0x20(%rsp),%ymm0 + vmovdqa 0x60(%rsp),%ymm2 + vpunpcklqdq %ymm2,%ymm0,%ymm1 + vpunpckhqdq %ymm2,%ymm0,%ymm2 + vmovdqa %ymm1,0x20(%rsp) + vmovdqa %ymm2,0x60(%rsp) + vmovdqa %ymm4,%ymm0 + vpunpcklqdq %ymm6,%ymm0,%ymm4 + vpunpckhqdq %ymm6,%ymm0,%ymm6 + vmovdqa %ymm5,%ymm0 + vpunpcklqdq %ymm7,%ymm0,%ymm5 + vpunpckhqdq %ymm7,%ymm0,%ymm7 + vmovdqa %ymm8,%ymm0 + vpunpcklqdq %ymm10,%ymm0,%ymm8 + vpunpckhqdq %ymm10,%ymm0,%ymm10 + vmovdqa %ymm9,%ymm0 + vpunpcklqdq %ymm11,%ymm0,%ymm9 + vpunpckhqdq %ymm11,%ymm0,%ymm11 + vmovdqa %ymm12,%ymm0 + vpunpcklqdq %ymm14,%ymm0,%ymm12 + vpunpckhqdq %ymm14,%ymm0,%ymm14 + vmovdqa %ymm13,%ymm0 + vpunpcklqdq %ymm15,%ymm0,%ymm13 + vpunpckhqdq %ymm15,%ymm0,%ymm15 + + # interleave 128-bit words in state n, n+4 + # xor/write first four blocks + vmovdqa 0x00(%rsp),%ymm1 + vperm2i128 $0x20,%ymm4,%ymm1,%ymm0 + cmp $0x0020,%rax + jl .Lxorpart8 + vpxor 0x0000(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0000(%rsi) + 
vperm2i128 $0x31,%ymm4,%ymm1,%ymm4 + + vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 + cmp $0x0040,%rax + jl .Lxorpart8 + vpxor 0x0020(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0020(%rsi) + vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 + + vmovdqa 0x40(%rsp),%ymm1 + vperm2i128 $0x20,%ymm6,%ymm1,%ymm0 + cmp $0x0060,%rax + jl .Lxorpart8 + vpxor 0x0040(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0040(%rsi) + vperm2i128 $0x31,%ymm6,%ymm1,%ymm6 + + vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 + cmp $0x0080,%rax + jl .Lxorpart8 + vpxor 0x0060(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0060(%rsi) + vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 + + vmovdqa 0x20(%rsp),%ymm1 + vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 + cmp $0x00a0,%rax + jl .Lxorpart8 + vpxor 0x0080(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0080(%rsi) + vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 + + vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 + cmp $0x00c0,%rax + jl .Lxorpart8 + vpxor 0x00a0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00a0(%rsi) + vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 + + vmovdqa 0x60(%rsp),%ymm1 + vperm2i128 $0x20,%ymm7,%ymm1,%ymm0 + cmp $0x00e0,%rax + jl .Lxorpart8 + vpxor 0x00c0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00c0(%rsi) + vperm2i128 $0x31,%ymm7,%ymm1,%ymm7 + + vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 + cmp $0x0100,%rax + jl .Lxorpart8 + vpxor 0x00e0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00e0(%rsi) + vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 + + # xor remaining blocks, write to output + vmovdqa %ymm4,%ymm0 + cmp $0x0120,%rax + jl .Lxorpart8 + vpxor 0x0100(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0100(%rsi) + + vmovdqa %ymm12,%ymm0 + cmp $0x0140,%rax + jl .Lxorpart8 + vpxor 0x0120(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0120(%rsi) + + vmovdqa %ymm6,%ymm0 + cmp $0x0160,%rax + jl .Lxorpart8 + vpxor 0x0140(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0140(%rsi) + + vmovdqa %ymm14,%ymm0 + cmp $0x0180,%rax + jl .Lxorpart8 + vpxor 0x0160(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0160(%rsi) + + vmovdqa %ymm5,%ymm0 + cmp $0x01a0,%rax + jl .Lxorpart8 + vpxor 0x0180(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0180(%rsi) + + vmovdqa %ymm13,%ymm0 + cmp $0x01c0,%rax + jl .Lxorpart8 + vpxor 0x01a0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01a0(%rsi) + + vmovdqa %ymm7,%ymm0 + cmp $0x01e0,%rax + jl .Lxorpart8 + vpxor 0x01c0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01c0(%rsi) + + vmovdqa %ymm15,%ymm0 + cmp $0x0200,%rax + jl .Lxorpart8 + vpxor 0x01e0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01e0(%rsi) + +.Ldone8: + vzeroupper + lea -8(%r10),%rsp + RET + +.Lxorpart8: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x1f,%r9 + jz .Ldone8 + and $~0x1f,%rax + + mov %rsi,%r11 + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + jmp .Ldone8 + +SYM_FUNC_END(chacha_8block_xor_avx2) diff --git a/lib/crypto/x86/chacha-avx512vl-x86_64.S b/lib/crypto/x86/chacha-avx512vl-x86_64.S new file mode 100644 index 000000000000..259383e1ad44 --- /dev/null +++ b/lib/crypto/x86/chacha-avx512vl-x86_64.S @@ -0,0 +1,836 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions + * + * Copyright (C) 2018 Martin Willi + */ + +#include <linux/linkage.h> + +.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 +.align 32 +CTR2BL: .octa 0x00000000000000000000000000000000 + .octa 0x00000000000000000000000000000001 + +.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 +.align 32 +CTR4BL: .octa 0x00000000000000000000000000000002 + .octa 0x00000000000000000000000000000003 + +.section 
.rodata.cst32.CTR8BL, "aM", @progbits, 32 +.align 32 +CTR8BL: .octa 0x00000003000000020000000100000000 + .octa 0x00000007000000060000000500000004 + +.text + +SYM_FUNC_START(chacha_2block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 2 data blocks output, o + # %rdx: up to 2 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts two ChaCha blocks by loading the state + # matrix twice across four AVX registers. It performs matrix operations + # on four words in each matrix in parallel, but requires shuffling to + # rearrange the words after each round. + + vzeroupper + + # x0..3[0-2] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + + vmovdqa %ymm0,%ymm8 + vmovdqa %ymm1,%ymm9 + vmovdqa %ymm2,%ymm10 + vmovdqa %ymm3,%ymm11 + +.Ldoubleround: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + + sub $2,%r8d + jnz .Ldoubleround + + # o0 = i0 ^ (x0 + s0) + vpaddd %ymm8,%ymm0,%ymm7 + cmp $0x10,%rcx + jl .Lxorpart2 + vpxord 0x00(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x00(%rsi) + vextracti128 $1,%ymm7,%xmm0 + # o1 = i1 ^ (x1 + s1) + vpaddd %ymm9,%ymm1,%ymm7 + cmp $0x20,%rcx + jl .Lxorpart2 + vpxord 0x10(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x10(%rsi) + vextracti128 $1,%ymm7,%xmm1 + # o2 = i2 ^ (x2 + s2) + vpaddd %ymm10,%ymm2,%ymm7 + cmp $0x30,%rcx + jl .Lxorpart2 + vpxord 0x20(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x20(%rsi) + vextracti128 $1,%ymm7,%xmm2 + # o3 = i3 ^ (x3 + s3) + vpaddd %ymm11,%ymm3,%ymm7 + cmp $0x40,%rcx + jl .Lxorpart2 + vpxord 0x30(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x30(%rsi) + vextracti128 $1,%ymm7,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm7 + cmp $0x50,%rcx + jl .Lxorpart2 + vpxord 0x40(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x40(%rsi) + + vmovdqa %xmm1,%xmm7 + cmp $0x60,%rcx + jl .Lxorpart2 + vpxord 0x50(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x50(%rsi) + + vmovdqa %xmm2,%xmm7 + cmp $0x70,%rcx + jl .Lxorpart2 + vpxord 0x60(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x60(%rsi) + + vmovdqa %xmm3,%xmm7 + cmp $0x80,%rcx + jl .Lxorpart2 + vpxord 0x70(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x70(%rsi) + 
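The .Lxorpart2 path just below handles a trailing partial block differently from the AVX2 and SSSE3 .Lxorpart code elsewhere in this patch: rather than bouncing the tail through an aligned stack buffer with rep movsb, it builds a byte mask of (1 << (len & 15)) - 1 with shld, loads it into %k1 via kmovq, and lets masked vmovdqu8 loads/stores touch only the valid bytes. A rough C model of that masked-tail idea (xor_tail_masked is an illustrative helper, not part of this patch):

	#include <stddef.h>
	#include <stdint.h>

	/*
	 * XOR the final (len % 16) message bytes with the leftover keystream
	 * bytes held in a 16-byte register; bytes outside the mask are neither
	 * read from src nor written to dst.
	 */
	static void xor_tail_masked(uint8_t *dst, const uint8_t *src,
				    const uint8_t keystream[16], size_t len)
	{
		size_t tail = len & ~(size_t)15;	/* and $~0xf,%r9 */
		uint16_t mask = (1u << (len & 15)) - 1;	/* shld; sub $1; kmovq */
		size_t i;

		for (i = 0; i < 16; i++)
			if (mask & (1u << i))		/* {%k1} byte lanes */
				dst[tail + i] = src[tail + i] ^ keystream[i];
	}

(The len & 15 == 0 case never reaches the mask: the assembly tests it first and jumps straight to .Ldone2.)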
+.Ldone2: + vzeroupper + RET + +.Lxorpart2: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0xf,%rcx + jz .Ldone2 + mov %rax,%r9 + and $~0xf,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} + vpxord %xmm7,%xmm1,%xmm1 + vmovdqu8 %xmm1,(%rsi,%r9){%k1} + + jmp .Ldone2 + +SYM_FUNC_END(chacha_2block_xor_avx512vl) + +SYM_FUNC_START(chacha_4block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts four ChaCha blocks by loading the state + # matrix four times across eight AVX registers. It performs matrix + # operations on four words in two matrices in parallel, sequentially + # to the operations on the four words of the other two matrices. The + # required word shuffling has a rather high latency, we can do the + # arithmetic on two matrix-pairs without much slowdown. + + vzeroupper + + # x0..3[0-4] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vmovdqa %ymm0,%ymm4 + vmovdqa %ymm1,%ymm5 + vmovdqa %ymm2,%ymm6 + vmovdqa %ymm3,%ymm7 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + vpaddd CTR4BL(%rip),%ymm7,%ymm7 + + vmovdqa %ymm0,%ymm11 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm3,%ymm14 + vmovdqa %ymm7,%ymm15 + +.Ldoubleround4: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $16,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + vpshufd $0x39,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + vpshufd $0x93,%ymm7,%ymm7 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $16,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + vpshufd $0x93,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 
+ vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + vpshufd $0x39,%ymm7,%ymm7 + + sub $2,%r8d + jnz .Ldoubleround4 + + # o0 = i0 ^ (x0 + s0), first block + vpaddd %ymm11,%ymm0,%ymm10 + cmp $0x10,%rcx + jl .Lxorpart4 + vpxord 0x00(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x00(%rsi) + vextracti128 $1,%ymm10,%xmm0 + # o1 = i1 ^ (x1 + s1), first block + vpaddd %ymm12,%ymm1,%ymm10 + cmp $0x20,%rcx + jl .Lxorpart4 + vpxord 0x10(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x10(%rsi) + vextracti128 $1,%ymm10,%xmm1 + # o2 = i2 ^ (x2 + s2), first block + vpaddd %ymm13,%ymm2,%ymm10 + cmp $0x30,%rcx + jl .Lxorpart4 + vpxord 0x20(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x20(%rsi) + vextracti128 $1,%ymm10,%xmm2 + # o3 = i3 ^ (x3 + s3), first block + vpaddd %ymm14,%ymm3,%ymm10 + cmp $0x40,%rcx + jl .Lxorpart4 + vpxord 0x30(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x30(%rsi) + vextracti128 $1,%ymm10,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm10 + cmp $0x50,%rcx + jl .Lxorpart4 + vpxord 0x40(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x40(%rsi) + + vmovdqa %xmm1,%xmm10 + cmp $0x60,%rcx + jl .Lxorpart4 + vpxord 0x50(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x50(%rsi) + + vmovdqa %xmm2,%xmm10 + cmp $0x70,%rcx + jl .Lxorpart4 + vpxord 0x60(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x60(%rsi) + + vmovdqa %xmm3,%xmm10 + cmp $0x80,%rcx + jl .Lxorpart4 + vpxord 0x70(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x70(%rsi) + + # o0 = i0 ^ (x0 + s0), third block + vpaddd %ymm11,%ymm4,%ymm10 + cmp $0x90,%rcx + jl .Lxorpart4 + vpxord 0x80(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x80(%rsi) + vextracti128 $1,%ymm10,%xmm4 + # o1 = i1 ^ (x1 + s1), third block + vpaddd %ymm12,%ymm5,%ymm10 + cmp $0xa0,%rcx + jl .Lxorpart4 + vpxord 0x90(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x90(%rsi) + vextracti128 $1,%ymm10,%xmm5 + # o2 = i2 ^ (x2 + s2), third block + vpaddd %ymm13,%ymm6,%ymm10 + cmp $0xb0,%rcx + jl .Lxorpart4 + vpxord 0xa0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xa0(%rsi) + vextracti128 $1,%ymm10,%xmm6 + # o3 = i3 ^ (x3 + s3), third block + vpaddd %ymm15,%ymm7,%ymm10 + cmp $0xc0,%rcx + jl .Lxorpart4 + vpxord 0xb0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xb0(%rsi) + vextracti128 $1,%ymm10,%xmm7 + + # xor and write fourth block + vmovdqa %xmm4,%xmm10 + cmp $0xd0,%rcx + jl .Lxorpart4 + vpxord 0xc0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xc0(%rsi) + + vmovdqa %xmm5,%xmm10 + cmp $0xe0,%rcx + jl .Lxorpart4 + vpxord 0xd0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xd0(%rsi) + + vmovdqa %xmm6,%xmm10 + cmp $0xf0,%rcx + jl .Lxorpart4 + vpxord 0xe0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xe0(%rsi) + + vmovdqa %xmm7,%xmm10 + cmp $0x100,%rcx + jl .Lxorpart4 + vpxord 0xf0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xf0(%rsi) + +.Ldone4: + vzeroupper + RET + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0xf,%rcx + jz .Ldone4 + mov %rax,%r9 + and $~0xf,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} + vpxord %xmm10,%xmm1,%xmm1 + vmovdqu8 %xmm1,(%rsi,%r9){%k1} + + jmp .Ldone4 + +SYM_FUNC_END(chacha_4block_xor_avx512vl) + +SYM_FUNC_START(chacha_8block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 8 data blocks output, o + # %rdx: up to 8 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts eight consecutive ChaCha blocks by loading + # the state matrix in AVX registers eight times. Compared to AVX2, this + # mostly benefits from the new rotate instructions in VL and the + # additional registers. 
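Throughout these files the per-line comments describe the work in terms of the ChaCha quarter-round, e.g. "x0 += x4, x12 = rotl32(x12 ^ x0, 16)". For reference, here is a minimal scalar C sketch of one quarter-round as specified in RFC 7539 (chacha_qr and rotl32 are illustrative names, not kernel helpers); in the eight-block routines each 32-bit lane of a ymm register runs these same four steps for its own block:

	#include <stdint.h>

	static inline uint32_t rotl32(uint32_t v, int n)
	{
		return (v << n) | (v >> (32 - n));
	}

	/* One ChaCha quarter-round on four state words. */
	static void chacha_qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
	{
		*a += *b;  *d = rotl32(*d ^ *a, 16);
		*c += *d;  *b = rotl32(*b ^ *c, 12);
		*a += *b;  *d = rotl32(*d ^ *a, 8);
		*c += *d;  *b = rotl32(*b ^ *c, 7);
	}

With AVX-512VL each of these rotates is a single vprold, whereas the SSSE3/AVX2 routines above need a vpshufb for the 8- and 16-bit rotates and a shift/shift/or sequence (plus a scratch register) for the 7- and 12-bit ones, which is the advantage the header comment above refers to.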
+ + vzeroupper + + # x0..15[0-7] = s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpbroadcastd 0x04(%rdi),%ymm1 + vpbroadcastd 0x08(%rdi),%ymm2 + vpbroadcastd 0x0c(%rdi),%ymm3 + vpbroadcastd 0x10(%rdi),%ymm4 + vpbroadcastd 0x14(%rdi),%ymm5 + vpbroadcastd 0x18(%rdi),%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm7 + vpbroadcastd 0x20(%rdi),%ymm8 + vpbroadcastd 0x24(%rdi),%ymm9 + vpbroadcastd 0x28(%rdi),%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm11 + vpbroadcastd 0x30(%rdi),%ymm12 + vpbroadcastd 0x34(%rdi),%ymm13 + vpbroadcastd 0x38(%rdi),%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm15 + + # x12 += counter values 0-3 + vpaddd CTR8BL(%rip),%ymm12,%ymm12 + + vmovdqa64 %ymm0,%ymm16 + vmovdqa64 %ymm1,%ymm17 + vmovdqa64 %ymm2,%ymm18 + vmovdqa64 %ymm3,%ymm19 + vmovdqa64 %ymm4,%ymm20 + vmovdqa64 %ymm5,%ymm21 + vmovdqa64 %ymm6,%ymm22 + vmovdqa64 %ymm7,%ymm23 + vmovdqa64 %ymm8,%ymm24 + vmovdqa64 %ymm9,%ymm25 + vmovdqa64 %ymm10,%ymm26 + vmovdqa64 %ymm11,%ymm27 + vmovdqa64 %ymm12,%ymm28 + vmovdqa64 %ymm13,%ymm29 + vmovdqa64 %ymm14,%ymm30 + vmovdqa64 %ymm15,%ymm31 + +.Ldoubleround8: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + vpaddd %ymm0,%ymm4,%ymm0 + vpxord %ymm0,%ymm12,%ymm12 + vprold $16,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + vpaddd %ymm1,%ymm5,%ymm1 + vpxord %ymm1,%ymm13,%ymm13 + vprold $16,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + vpaddd %ymm2,%ymm6,%ymm2 + vpxord %ymm2,%ymm14,%ymm14 + vprold $16,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vpaddd %ymm3,%ymm7,%ymm3 + vpxord %ymm3,%ymm15,%ymm15 + vprold $16,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + vpaddd %ymm12,%ymm8,%ymm8 + vpxord %ymm8,%ymm4,%ymm4 + vprold $12,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + vpaddd %ymm13,%ymm9,%ymm9 + vpxord %ymm9,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + vpaddd %ymm14,%ymm10,%ymm10 + vpxord %ymm10,%ymm6,%ymm6 + vprold $12,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vpaddd %ymm15,%ymm11,%ymm11 + vpxord %ymm11,%ymm7,%ymm7 + vprold $12,%ymm7,%ymm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + vpaddd %ymm0,%ymm4,%ymm0 + vpxord %ymm0,%ymm12,%ymm12 + vprold $8,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + vpaddd %ymm1,%ymm5,%ymm1 + vpxord %ymm1,%ymm13,%ymm13 + vprold $8,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + vpaddd %ymm2,%ymm6,%ymm2 + vpxord %ymm2,%ymm14,%ymm14 + vprold $8,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vpaddd %ymm3,%ymm7,%ymm3 + vpxord %ymm3,%ymm15,%ymm15 + vprold $8,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + vpaddd %ymm12,%ymm8,%ymm8 + vpxord %ymm8,%ymm4,%ymm4 + vprold $7,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + vpaddd %ymm13,%ymm9,%ymm9 + vpxord %ymm9,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + vpaddd %ymm14,%ymm10,%ymm10 + vpxord %ymm10,%ymm6,%ymm6 + vprold $7,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vpaddd %ymm15,%ymm11,%ymm11 + vpxord %ymm11,%ymm7,%ymm7 + vprold $7,%ymm7,%ymm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + vpaddd %ymm0,%ymm5,%ymm0 + vpxord %ymm0,%ymm15,%ymm15 + vprold $16,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16) + vpaddd %ymm1,%ymm6,%ymm1 + vpxord %ymm1,%ymm12,%ymm12 + vprold $16,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + vpaddd %ymm2,%ymm7,%ymm2 + vpxord %ymm2,%ymm13,%ymm13 + vprold $16,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vpaddd %ymm3,%ymm4,%ymm3 + vpxord %ymm3,%ymm14,%ymm14 + vprold $16,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ 
x10, 12) + vpaddd %ymm15,%ymm10,%ymm10 + vpxord %ymm10,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + vpaddd %ymm12,%ymm11,%ymm11 + vpxord %ymm11,%ymm6,%ymm6 + vprold $12,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + vpaddd %ymm13,%ymm8,%ymm8 + vpxord %ymm8,%ymm7,%ymm7 + vprold $12,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vpaddd %ymm14,%ymm9,%ymm9 + vpxord %ymm9,%ymm4,%ymm4 + vprold $12,%ymm4,%ymm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + vpaddd %ymm0,%ymm5,%ymm0 + vpxord %ymm0,%ymm15,%ymm15 + vprold $8,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + vpaddd %ymm1,%ymm6,%ymm1 + vpxord %ymm1,%ymm12,%ymm12 + vprold $8,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + vpaddd %ymm2,%ymm7,%ymm2 + vpxord %ymm2,%ymm13,%ymm13 + vprold $8,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vpaddd %ymm3,%ymm4,%ymm3 + vpxord %ymm3,%ymm14,%ymm14 + vprold $8,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + vpaddd %ymm15,%ymm10,%ymm10 + vpxord %ymm10,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + vpaddd %ymm12,%ymm11,%ymm11 + vpxord %ymm11,%ymm6,%ymm6 + vprold $7,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + vpaddd %ymm13,%ymm8,%ymm8 + vpxord %ymm8,%ymm7,%ymm7 + vprold $7,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + vpaddd %ymm14,%ymm9,%ymm9 + vpxord %ymm9,%ymm4,%ymm4 + vprold $7,%ymm4,%ymm4 + + sub $2,%r8d + jnz .Ldoubleround8 + + # x0..15[0-3] += s[0..15] + vpaddd %ymm16,%ymm0,%ymm0 + vpaddd %ymm17,%ymm1,%ymm1 + vpaddd %ymm18,%ymm2,%ymm2 + vpaddd %ymm19,%ymm3,%ymm3 + vpaddd %ymm20,%ymm4,%ymm4 + vpaddd %ymm21,%ymm5,%ymm5 + vpaddd %ymm22,%ymm6,%ymm6 + vpaddd %ymm23,%ymm7,%ymm7 + vpaddd %ymm24,%ymm8,%ymm8 + vpaddd %ymm25,%ymm9,%ymm9 + vpaddd %ymm26,%ymm10,%ymm10 + vpaddd %ymm27,%ymm11,%ymm11 + vpaddd %ymm28,%ymm12,%ymm12 + vpaddd %ymm29,%ymm13,%ymm13 + vpaddd %ymm30,%ymm14,%ymm14 + vpaddd %ymm31,%ymm15,%ymm15 + + # interleave 32-bit words in state n, n+1 + vpunpckldq %ymm1,%ymm0,%ymm16 + vpunpckhdq %ymm1,%ymm0,%ymm17 + vpunpckldq %ymm3,%ymm2,%ymm18 + vpunpckhdq %ymm3,%ymm2,%ymm19 + vpunpckldq %ymm5,%ymm4,%ymm20 + vpunpckhdq %ymm5,%ymm4,%ymm21 + vpunpckldq %ymm7,%ymm6,%ymm22 + vpunpckhdq %ymm7,%ymm6,%ymm23 + vpunpckldq %ymm9,%ymm8,%ymm24 + vpunpckhdq %ymm9,%ymm8,%ymm25 + vpunpckldq %ymm11,%ymm10,%ymm26 + vpunpckhdq %ymm11,%ymm10,%ymm27 + vpunpckldq %ymm13,%ymm12,%ymm28 + vpunpckhdq %ymm13,%ymm12,%ymm29 + vpunpckldq %ymm15,%ymm14,%ymm30 + vpunpckhdq %ymm15,%ymm14,%ymm31 + + # interleave 64-bit words in state n, n+2 + vpunpcklqdq %ymm18,%ymm16,%ymm0 + vpunpcklqdq %ymm19,%ymm17,%ymm1 + vpunpckhqdq %ymm18,%ymm16,%ymm2 + vpunpckhqdq %ymm19,%ymm17,%ymm3 + vpunpcklqdq %ymm22,%ymm20,%ymm4 + vpunpcklqdq %ymm23,%ymm21,%ymm5 + vpunpckhqdq %ymm22,%ymm20,%ymm6 + vpunpckhqdq %ymm23,%ymm21,%ymm7 + vpunpcklqdq %ymm26,%ymm24,%ymm8 + vpunpcklqdq %ymm27,%ymm25,%ymm9 + vpunpckhqdq %ymm26,%ymm24,%ymm10 + vpunpckhqdq %ymm27,%ymm25,%ymm11 + vpunpcklqdq %ymm30,%ymm28,%ymm12 + vpunpcklqdq %ymm31,%ymm29,%ymm13 + vpunpckhqdq %ymm30,%ymm28,%ymm14 + vpunpckhqdq %ymm31,%ymm29,%ymm15 + + # interleave 128-bit words in state n, n+4 + # xor/write first four blocks + vmovdqa64 %ymm0,%ymm16 + vperm2i128 $0x20,%ymm4,%ymm0,%ymm0 + cmp $0x0020,%rcx + jl .Lxorpart8 + vpxord 0x0000(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0000(%rsi) + vmovdqa64 %ymm16,%ymm0 + vperm2i128 $0x31,%ymm4,%ymm0,%ymm4 + + vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 + cmp $0x0040,%rcx + jl .Lxorpart8 + vpxord 0x0020(%rdx),%ymm0,%ymm0 + vmovdqu64 
%ymm0,0x0020(%rsi) + vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 + + vperm2i128 $0x20,%ymm6,%ymm2,%ymm0 + cmp $0x0060,%rcx + jl .Lxorpart8 + vpxord 0x0040(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0040(%rsi) + vperm2i128 $0x31,%ymm6,%ymm2,%ymm6 + + vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 + cmp $0x0080,%rcx + jl .Lxorpart8 + vpxord 0x0060(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0060(%rsi) + vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 + + vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 + cmp $0x00a0,%rcx + jl .Lxorpart8 + vpxord 0x0080(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0080(%rsi) + vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 + + vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 + cmp $0x00c0,%rcx + jl .Lxorpart8 + vpxord 0x00a0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00a0(%rsi) + vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 + + vperm2i128 $0x20,%ymm7,%ymm3,%ymm0 + cmp $0x00e0,%rcx + jl .Lxorpart8 + vpxord 0x00c0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00c0(%rsi) + vperm2i128 $0x31,%ymm7,%ymm3,%ymm7 + + vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 + cmp $0x0100,%rcx + jl .Lxorpart8 + vpxord 0x00e0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00e0(%rsi) + vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 + + # xor remaining blocks, write to output + vmovdqa64 %ymm4,%ymm0 + cmp $0x0120,%rcx + jl .Lxorpart8 + vpxord 0x0100(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0100(%rsi) + + vmovdqa64 %ymm12,%ymm0 + cmp $0x0140,%rcx + jl .Lxorpart8 + vpxord 0x0120(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0120(%rsi) + + vmovdqa64 %ymm6,%ymm0 + cmp $0x0160,%rcx + jl .Lxorpart8 + vpxord 0x0140(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0140(%rsi) + + vmovdqa64 %ymm14,%ymm0 + cmp $0x0180,%rcx + jl .Lxorpart8 + vpxord 0x0160(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0160(%rsi) + + vmovdqa64 %ymm5,%ymm0 + cmp $0x01a0,%rcx + jl .Lxorpart8 + vpxord 0x0180(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0180(%rsi) + + vmovdqa64 %ymm13,%ymm0 + cmp $0x01c0,%rcx + jl .Lxorpart8 + vpxord 0x01a0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01a0(%rsi) + + vmovdqa64 %ymm7,%ymm0 + cmp $0x01e0,%rcx + jl .Lxorpart8 + vpxord 0x01c0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01c0(%rsi) + + vmovdqa64 %ymm15,%ymm0 + cmp $0x0200,%rcx + jl .Lxorpart8 + vpxord 0x01e0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01e0(%rsi) + +.Ldone8: + vzeroupper + RET + +.Lxorpart8: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0x1f,%rcx + jz .Ldone8 + mov %rax,%r9 + and $~0x1f,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z} + vpxord %ymm0,%ymm1,%ymm1 + vmovdqu8 %ymm1,(%rsi,%r9){%k1} + + jmp .Ldone8 + +SYM_FUNC_END(chacha_8block_xor_avx512vl) diff --git a/lib/crypto/x86/chacha-ssse3-x86_64.S b/lib/crypto/x86/chacha-ssse3-x86_64.S new file mode 100644 index 000000000000..7111949cd5b9 --- /dev/null +++ b/lib/crypto/x86/chacha-ssse3-x86_64.S @@ -0,0 +1,791 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions + * + * Copyright (C) 2015 Martin Willi + */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +.section .rodata.cst16.ROT8, "aM", @progbits, 16 +.align 16 +ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 +.section .rodata.cst16.ROT16, "aM", @progbits, 16 +.align 16 +ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 +.section .rodata.cst16.CTRINC, "aM", @progbits, 16 +.align 16 +CTRINC: .octa 0x00000003000000020000000100000000 + +.text + +/* + * chacha_permute - permute one block + * + * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. 
This + * function performs matrix operations on four words in parallel, but requires + * shuffling to rearrange the words after each round. 8/16-bit word rotation is + * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word + * rotation uses traditional shift+OR. + * + * The round count is given in %r8d. + * + * Clobbers: %r8d, %xmm4-%xmm7 + */ +SYM_FUNC_START_LOCAL(chacha_permute) + + movdqa ROT8(%rip),%xmm4 + movdqa ROT16(%rip),%xmm5 + +.Ldoubleround: + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm5,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm6 + pslld $12,%xmm6 + psrld $20,%xmm1 + por %xmm6,%xmm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm4,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm7 + pslld $7,%xmm7 + psrld $25,%xmm1 + por %xmm7,%xmm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + pshufd $0x39,%xmm1,%xmm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + pshufd $0x4e,%xmm2,%xmm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + pshufd $0x93,%xmm3,%xmm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm5,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm6 + pslld $12,%xmm6 + psrld $20,%xmm1 + por %xmm6,%xmm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm4,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm7 + pslld $7,%xmm7 + psrld $25,%xmm1 + por %xmm7,%xmm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + pshufd $0x93,%xmm1,%xmm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + pshufd $0x4e,%xmm2,%xmm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + pshufd $0x39,%xmm3,%xmm3 + + sub $2,%r8d + jnz .Ldoubleround + + RET +SYM_FUNC_END(chacha_permute) + +SYM_FUNC_START(chacha_block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: up to 1 data block output, o + # %rdx: up to 1 data block input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + FRAME_BEGIN + + # x0..3 = s0..3 + movdqu 0x00(%rdi),%xmm0 + movdqu 0x10(%rdi),%xmm1 + movdqu 0x20(%rdi),%xmm2 + movdqu 0x30(%rdi),%xmm3 + movdqa %xmm0,%xmm8 + movdqa %xmm1,%xmm9 + movdqa %xmm2,%xmm10 + movdqa %xmm3,%xmm11 + + mov %rcx,%rax + call chacha_permute + + # o0 = i0 ^ (x0 + s0) + paddd %xmm8,%xmm0 + cmp $0x10,%rax + jl .Lxorpart + movdqu 0x00(%rdx),%xmm4 + pxor %xmm4,%xmm0 + movdqu %xmm0,0x00(%rsi) + # o1 = i1 ^ (x1 + s1) + paddd %xmm9,%xmm1 + movdqa %xmm1,%xmm0 + cmp $0x20,%rax + jl .Lxorpart + movdqu 0x10(%rdx),%xmm0 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x10(%rsi) + # o2 = i2 ^ (x2 + s2) + paddd %xmm10,%xmm2 + movdqa %xmm2,%xmm0 + cmp $0x30,%rax + jl .Lxorpart + movdqu 0x20(%rdx),%xmm0 + pxor %xmm2,%xmm0 + movdqu %xmm0,0x20(%rsi) + # o3 = i3 ^ (x3 + s3) + paddd %xmm11,%xmm3 + movdqa %xmm3,%xmm0 + cmp $0x40,%rax + jl .Lxorpart + movdqu 0x30(%rdx),%xmm0 + pxor %xmm3,%xmm0 + movdqu %xmm0,0x30(%rsi) + +.Ldone: + FRAME_END + RET + +.Lxorpart: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + pxor 0x00(%rsp),%xmm0 + movdqa %xmm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp 
+ jmp .Ldone + +SYM_FUNC_END(chacha_block_xor_ssse3) + +SYM_FUNC_START(hchacha_block_ssse3) + # %rdi: Input state matrix, s + # %rsi: output (8 32-bit words) + # %edx: nrounds + FRAME_BEGIN + + movdqu 0x00(%rdi),%xmm0 + movdqu 0x10(%rdi),%xmm1 + movdqu 0x20(%rdi),%xmm2 + movdqu 0x30(%rdi),%xmm3 + + mov %edx,%r8d + call chacha_permute + + movdqu %xmm0,0x00(%rsi) + movdqu %xmm3,0x10(%rsi) + + FRAME_END + RET +SYM_FUNC_END(hchacha_block_ssse3) + +SYM_FUNC_START(chacha_4block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts four consecutive ChaCha blocks by loading the + # the state matrix in SSE registers four times. As we need some scratch + # registers, we save the first four registers on the stack. The + # algorithm performs each operation on the corresponding word of each + # state matrix, hence requires no word shuffling. For final XORing step + # we transpose the matrix by interleaving 32- and then 64-bit words, + # which allows us to do XOR in SSE registers. 8/16-bit word rotation is + # done with the slightly better performing SSSE3 byte shuffling, + # 7/12-bit word rotation uses traditional shift+OR. + + lea 8(%rsp),%r10 + sub $0x80,%rsp + and $~63,%rsp + mov %rcx,%rax + + # x0..15[0-3] = s0..3[0..3] + movq 0x00(%rdi),%xmm1 + pshufd $0x00,%xmm1,%xmm0 + pshufd $0x55,%xmm1,%xmm1 + movq 0x08(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + movq 0x10(%rdi),%xmm5 + pshufd $0x00,%xmm5,%xmm4 + pshufd $0x55,%xmm5,%xmm5 + movq 0x18(%rdi),%xmm7 + pshufd $0x00,%xmm7,%xmm6 + pshufd $0x55,%xmm7,%xmm7 + movq 0x20(%rdi),%xmm9 + pshufd $0x00,%xmm9,%xmm8 + pshufd $0x55,%xmm9,%xmm9 + movq 0x28(%rdi),%xmm11 + pshufd $0x00,%xmm11,%xmm10 + pshufd $0x55,%xmm11,%xmm11 + movq 0x30(%rdi),%xmm13 + pshufd $0x00,%xmm13,%xmm12 + pshufd $0x55,%xmm13,%xmm13 + movq 0x38(%rdi),%xmm15 + pshufd $0x00,%xmm15,%xmm14 + pshufd $0x55,%xmm15,%xmm15 + # x0..3 on stack + movdqa %xmm0,0x00(%rsp) + movdqa %xmm1,0x10(%rsp) + movdqa %xmm2,0x20(%rsp) + movdqa %xmm3,0x30(%rsp) + + movdqa CTRINC(%rip),%xmm1 + movdqa ROT8(%rip),%xmm2 + movdqa ROT16(%rip),%xmm3 + + # x12 += counter values 0-3 + paddd %xmm1,%xmm12 + +.Ldoubleround4: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm3,%xmm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm3,%xmm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm3,%xmm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm3,%xmm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm4 + por %xmm0,%xmm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm5 + por %xmm0,%xmm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm6 + por %xmm0,%xmm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm7 + por %xmm0,%xmm7 + 
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm2,%xmm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm2,%xmm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm2,%xmm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm2,%xmm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm4 + por %xmm0,%xmm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm5 + por %xmm0,%xmm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm6 + por %xmm0,%xmm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm7 + por %xmm0,%xmm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm3,%xmm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm3,%xmm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm3,%xmm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm3,%xmm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 12) + paddd %xmm15,%xmm10 + pxor %xmm10,%xmm5 + movdqa %xmm5,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm5 + por %xmm0,%xmm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + paddd %xmm12,%xmm11 + pxor %xmm11,%xmm6 + movdqa %xmm6,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm6 + por %xmm0,%xmm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + paddd %xmm13,%xmm8 + pxor %xmm8,%xmm7 + movdqa %xmm7,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm7 + por %xmm0,%xmm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + paddd %xmm14,%xmm9 + pxor %xmm9,%xmm4 + movdqa %xmm4,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm4 + por %xmm0,%xmm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm2,%xmm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm2,%xmm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm2,%xmm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm2,%xmm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + paddd %xmm15,%xmm10 + pxor %xmm10,%xmm5 + movdqa %xmm5,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm5 + por %xmm0,%xmm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + paddd %xmm12,%xmm11 + pxor %xmm11,%xmm6 + movdqa %xmm6,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm6 + por %xmm0,%xmm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + paddd %xmm13,%xmm8 + pxor %xmm8,%xmm7 + movdqa %xmm7,%xmm0 + pslld $7,%xmm0 + psrld 
$25,%xmm7 + por %xmm0,%xmm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + paddd %xmm14,%xmm9 + pxor %xmm9,%xmm4 + movdqa %xmm4,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm4 + por %xmm0,%xmm4 + + sub $2,%r8d + jnz .Ldoubleround4 + + # x0[0-3] += s0[0] + # x1[0-3] += s0[1] + movq 0x00(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd 0x00(%rsp),%xmm2 + movdqa %xmm2,0x00(%rsp) + paddd 0x10(%rsp),%xmm3 + movdqa %xmm3,0x10(%rsp) + # x2[0-3] += s0[2] + # x3[0-3] += s0[3] + movq 0x08(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd 0x20(%rsp),%xmm2 + movdqa %xmm2,0x20(%rsp) + paddd 0x30(%rsp),%xmm3 + movdqa %xmm3,0x30(%rsp) + + # x4[0-3] += s1[0] + # x5[0-3] += s1[1] + movq 0x10(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + # x6[0-3] += s1[2] + # x7[0-3] += s1[3] + movq 0x18(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + + # x8[0-3] += s2[0] + # x9[0-3] += s2[1] + movq 0x20(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm8 + paddd %xmm3,%xmm9 + # x10[0-3] += s2[2] + # x11[0-3] += s2[3] + movq 0x28(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm10 + paddd %xmm3,%xmm11 + + # x12[0-3] += s3[0] + # x13[0-3] += s3[1] + movq 0x30(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm12 + paddd %xmm3,%xmm13 + # x14[0-3] += s3[2] + # x15[0-3] += s3[3] + movq 0x38(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm14 + paddd %xmm3,%xmm15 + + # x12 += counter values 0-3 + paddd %xmm1,%xmm12 + + # interleave 32-bit words in state n, n+1 + movdqa 0x00(%rsp),%xmm0 + movdqa 0x10(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpckldq %xmm1,%xmm2 + punpckhdq %xmm1,%xmm0 + movdqa %xmm2,0x00(%rsp) + movdqa %xmm0,0x10(%rsp) + movdqa 0x20(%rsp),%xmm0 + movdqa 0x30(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpckldq %xmm1,%xmm2 + punpckhdq %xmm1,%xmm0 + movdqa %xmm2,0x20(%rsp) + movdqa %xmm0,0x30(%rsp) + movdqa %xmm4,%xmm0 + punpckldq %xmm5,%xmm4 + punpckhdq %xmm5,%xmm0 + movdqa %xmm0,%xmm5 + movdqa %xmm6,%xmm0 + punpckldq %xmm7,%xmm6 + punpckhdq %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + movdqa %xmm8,%xmm0 + punpckldq %xmm9,%xmm8 + punpckhdq %xmm9,%xmm0 + movdqa %xmm0,%xmm9 + movdqa %xmm10,%xmm0 + punpckldq %xmm11,%xmm10 + punpckhdq %xmm11,%xmm0 + movdqa %xmm0,%xmm11 + movdqa %xmm12,%xmm0 + punpckldq %xmm13,%xmm12 + punpckhdq %xmm13,%xmm0 + movdqa %xmm0,%xmm13 + movdqa %xmm14,%xmm0 + punpckldq %xmm15,%xmm14 + punpckhdq %xmm15,%xmm0 + movdqa %xmm0,%xmm15 + + # interleave 64-bit words in state n, n+2 + movdqa 0x00(%rsp),%xmm0 + movdqa 0x20(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpcklqdq %xmm1,%xmm2 + punpckhqdq %xmm1,%xmm0 + movdqa %xmm2,0x00(%rsp) + movdqa %xmm0,0x20(%rsp) + movdqa 0x10(%rsp),%xmm0 + movdqa 0x30(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpcklqdq %xmm1,%xmm2 + punpckhqdq %xmm1,%xmm0 + movdqa %xmm2,0x10(%rsp) + movdqa %xmm0,0x30(%rsp) + movdqa %xmm4,%xmm0 + punpcklqdq %xmm6,%xmm4 + punpckhqdq %xmm6,%xmm0 + movdqa %xmm0,%xmm6 + movdqa %xmm5,%xmm0 + punpcklqdq %xmm7,%xmm5 + punpckhqdq %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + movdqa %xmm8,%xmm0 + punpcklqdq %xmm10,%xmm8 + punpckhqdq %xmm10,%xmm0 + movdqa %xmm0,%xmm10 + movdqa %xmm9,%xmm0 + punpcklqdq %xmm11,%xmm9 + punpckhqdq %xmm11,%xmm0 + movdqa %xmm0,%xmm11 + movdqa %xmm12,%xmm0 + punpcklqdq %xmm14,%xmm12 + punpckhqdq %xmm14,%xmm0 + movdqa %xmm0,%xmm14 + movdqa %xmm13,%xmm0 + punpcklqdq 
%xmm15,%xmm13 + punpckhqdq %xmm15,%xmm0 + movdqa %xmm0,%xmm15 + + # xor with corresponding input, write to output + movdqa 0x00(%rsp),%xmm0 + cmp $0x10,%rax + jl .Lxorpart4 + movdqu 0x00(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x00(%rsi) + + movdqu %xmm4,%xmm0 + cmp $0x20,%rax + jl .Lxorpart4 + movdqu 0x10(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x10(%rsi) + + movdqu %xmm8,%xmm0 + cmp $0x30,%rax + jl .Lxorpart4 + movdqu 0x20(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x20(%rsi) + + movdqu %xmm12,%xmm0 + cmp $0x40,%rax + jl .Lxorpart4 + movdqu 0x30(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x30(%rsi) + + movdqa 0x20(%rsp),%xmm0 + cmp $0x50,%rax + jl .Lxorpart4 + movdqu 0x40(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x40(%rsi) + + movdqu %xmm6,%xmm0 + cmp $0x60,%rax + jl .Lxorpart4 + movdqu 0x50(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x50(%rsi) + + movdqu %xmm10,%xmm0 + cmp $0x70,%rax + jl .Lxorpart4 + movdqu 0x60(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x60(%rsi) + + movdqu %xmm14,%xmm0 + cmp $0x80,%rax + jl .Lxorpart4 + movdqu 0x70(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x70(%rsi) + + movdqa 0x10(%rsp),%xmm0 + cmp $0x90,%rax + jl .Lxorpart4 + movdqu 0x80(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x80(%rsi) + + movdqu %xmm5,%xmm0 + cmp $0xa0,%rax + jl .Lxorpart4 + movdqu 0x90(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x90(%rsi) + + movdqu %xmm9,%xmm0 + cmp $0xb0,%rax + jl .Lxorpart4 + movdqu 0xa0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xa0(%rsi) + + movdqu %xmm13,%xmm0 + cmp $0xc0,%rax + jl .Lxorpart4 + movdqu 0xb0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xb0(%rsi) + + movdqa 0x30(%rsp),%xmm0 + cmp $0xd0,%rax + jl .Lxorpart4 + movdqu 0xc0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xc0(%rsi) + + movdqu %xmm7,%xmm0 + cmp $0xe0,%rax + jl .Lxorpart4 + movdqu 0xd0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xd0(%rsi) + + movdqu %xmm11,%xmm0 + cmp $0xf0,%rax + jl .Lxorpart4 + movdqu 0xe0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xe0(%rsi) + + movdqu %xmm15,%xmm0 + cmp $0x100,%rax + jl .Lxorpart4 + movdqu 0xf0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xf0(%rsi) + +.Ldone4: + lea -8(%r10),%rsp + RET + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone4 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + pxor 0x00(%rsp),%xmm0 + movdqa %xmm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + jmp .Ldone4 + +SYM_FUNC_END(chacha_4block_xor_ssse3) diff --git a/lib/crypto/x86/chacha_glue.c b/lib/crypto/x86/chacha_glue.c new file mode 100644 index 000000000000..10b2c945f541 --- /dev/null +++ b/lib/crypto/x86/chacha_glue.c @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * ChaCha and HChaCha functions (x86_64 optimized) + * + * Copyright (C) 2015 Martin Willi + */ + +#include <asm/simd.h> +#include <crypto/chacha.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sizes.h> + +asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_4block_xor_ssse3(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void hchacha_block_ssse3(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); + +asmlinkage void chacha_2block_xor_avx2(const struct 
chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_4block_xor_avx2(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_8block_xor_avx2(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); + +asmlinkage void chacha_2block_xor_avx512vl(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_4block_xor_avx512vl(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_8block_xor_avx512vl(const struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int len, int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl); + +static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) +{ + len = min(len, maxblocks * CHACHA_BLOCK_SIZE); + return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; +} + +static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + if (static_branch_likely(&chacha_use_avx512vl)) { + while (bytes >= CHACHA_BLOCK_SIZE * 8) { + chacha_8block_xor_avx512vl(state, dst, src, bytes, + nrounds); + bytes -= CHACHA_BLOCK_SIZE * 8; + src += CHACHA_BLOCK_SIZE * 8; + dst += CHACHA_BLOCK_SIZE * 8; + state->x[12] += 8; + } + if (bytes > CHACHA_BLOCK_SIZE * 4) { + chacha_8block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state->x[12] += chacha_advance(bytes, 8); + return; + } + if (bytes > CHACHA_BLOCK_SIZE * 2) { + chacha_4block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state->x[12] += chacha_advance(bytes, 4); + return; + } + if (bytes) { + chacha_2block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state->x[12] += chacha_advance(bytes, 2); + return; + } + } + + if (static_branch_likely(&chacha_use_avx2)) { + while (bytes >= CHACHA_BLOCK_SIZE * 8) { + chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); + bytes -= CHACHA_BLOCK_SIZE * 8; + src += CHACHA_BLOCK_SIZE * 8; + dst += CHACHA_BLOCK_SIZE * 8; + state->x[12] += 8; + } + if (bytes > CHACHA_BLOCK_SIZE * 4) { + chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); + state->x[12] += chacha_advance(bytes, 8); + return; + } + if (bytes > CHACHA_BLOCK_SIZE * 2) { + chacha_4block_xor_avx2(state, dst, src, bytes, nrounds); + state->x[12] += chacha_advance(bytes, 4); + return; + } + if (bytes > CHACHA_BLOCK_SIZE) { + chacha_2block_xor_avx2(state, dst, src, bytes, nrounds); + state->x[12] += chacha_advance(bytes, 2); + return; + } + } + + while (bytes >= CHACHA_BLOCK_SIZE * 4) { + chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); + bytes -= CHACHA_BLOCK_SIZE * 4; + src += CHACHA_BLOCK_SIZE * 4; + dst += CHACHA_BLOCK_SIZE * 4; + state->x[12] += 4; + } + if (bytes > CHACHA_BLOCK_SIZE) { + chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); + state->x[12] += chacha_advance(bytes, 4); + return; + } + if (bytes) { + chacha_block_xor_ssse3(state, dst, src, bytes, nrounds); + state->x[12]++; + } +} + +void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + if (!static_branch_likely(&chacha_use_simd)) { + hchacha_block_generic(state, out, nrounds); + } else { + kernel_fpu_begin(); + hchacha_block_ssse3(state, out, nrounds); + 
kernel_fpu_end(); + } +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + if (!static_branch_likely(&chacha_use_simd) || + bytes <= CHACHA_BLOCK_SIZE) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + kernel_fpu_begin(); + chacha_dosimd(state, dst, src, todo, nrounds); + kernel_fpu_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +bool chacha_is_arch_optimized(void) +{ + return static_key_enabled(&chacha_use_simd); +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +static int __init chacha_simd_mod_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_SSSE3)) + return 0; + + static_branch_enable(&chacha_use_simd); + + if (boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { + static_branch_enable(&chacha_use_avx2); + + if (boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */ + static_branch_enable(&chacha_use_avx512vl); + } + return 0; +} +subsys_initcall(chacha_simd_mod_init); + +static void __exit chacha_simd_mod_exit(void) +{ +} +module_exit(chacha_simd_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); +MODULE_DESCRIPTION("ChaCha and HChaCha functions (x86_64 optimized)"); diff --git a/lib/crypto/x86/poly1305-x86_64-cryptogams.pl b/lib/crypto/x86/poly1305-x86_64-cryptogams.pl new file mode 100644 index 000000000000..501827254fed --- /dev/null +++ b/lib/crypto/x86/poly1305-x86_64-cryptogams.pl @@ -0,0 +1,4253 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +# +# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. +# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. +# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved. +# +# This code is taken from the OpenSSL project but the author, Andy Polyakov, +# has relicensed it under the licenses specified in the SPDX header above. +# The original headers, including the original license headers, are +# included below for completeness. +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements Poly1305 hash for x86_64. +# +# March 2015 +# +# Initial release. +# +# December 2016 +# +# Add AVX512F+VL+BW code path. +# +# November 2017 +# +# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be +# executed even on Knights Landing. Trigger for modification was +# observation that AVX512 code paths can negatively affect overall +# Skylake-X system performance. Since we are likely to suppress +# AVX512F capability flag [at least on Skylake-X], conversion serves +# as kind of "investment protection". Note that next *lake processor, +# Cannonlake, has AVX512IFMA code path to execute... +# +# Numbers are cycles per processed byte with poly1305_blocks alone, +# measured with rdtsc at fixed clock frequency. 
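An aside on the ChaCha glue shown above: chacha_crypt_arch() caps each kernel_fpu_begin()/kernel_fpu_end() region at 4 KiB of input so FPU sections stay short, and chacha_advance() works out how many 64-byte blocks the counter in state->x[12] must move forward when a multi-block SIMD routine is handed a partial final batch. The following user-space C sketch (illustrative names only, not the kernel code) reproduces that counter arithmetic:

/*
 * Illustrative user-space sketch of the counter bookkeeping done by the
 * ChaCha glue above: a SIMD routine that can emit up to 'maxblocks' blocks
 * may be handed fewer bytes, yet the block counter must still advance by
 * the number of 64-byte blocks actually generated. Not kernel code.
 */
#include <assert.h>
#include <stdio.h>

#define CHACHA_BLOCK_SIZE 64u

static unsigned int advance_blocks(unsigned int len, unsigned int maxblocks)
{
	unsigned int cap = maxblocks * CHACHA_BLOCK_SIZE;

	if (len > cap)			/* never advance past what was emitted */
		len = cap;
	/* round the remaining byte count up to whole blocks */
	return (len + CHACHA_BLOCK_SIZE - 1) / CHACHA_BLOCK_SIZE;
}

int main(void)
{
	assert(advance_blocks(100, 8) == 2);	/* 100 bytes -> 2 blocks */
	assert(advance_blocks(512, 8) == 8);	/* full 8-block batch */
	assert(advance_blocks(130, 2) == 2);	/* capped at the 2-block kernel */
	printf("counter-advance sketch OK\n");
	return 0;
}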
+# +# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512 +# P4 4.46/+120% - +# Core 2 2.41/+90% - +# Westmere 1.88/+120% - +# Sandy Bridge 1.39/+140% 1.10 +# Haswell 1.14/+175% 1.11 0.65 +# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35] +# Silvermont 2.83/+95% - +# Knights L 3.60/? 1.65 1.10 0.41(***) +# Goldmont 1.70/+180% - +# VIA Nano 1.82/+150% - +# Sledgehammer 1.38/+160% - +# Bulldozer 2.30/+130% 0.97 +# Ryzen 1.15/+200% 1.08 1.18 +# +# (*) improvement coefficients relative to clang are more modest and +# are ~50% on most processors, in both cases we are comparing to +# __int128 code; +# (**) SSE2 implementation was attempted, but among non-AVX processors +# it was faster than integer-only code only on older Intel P4 and +# Core processors, 50-30%, less newer processor is, but slower on +# contemporary ones, for example almost 2x slower on Atom, and as +# former are naturally disappearing, SSE2 is deemed unnecessary; +# (***) strangely enough performance seems to vary from core to core, +# listed result is best case; + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); +$kernel=0; $kernel=1 if (!$flavour && !$output); + +if (!$kernel) { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or + die "can't locate x86_64-xlate.pl"; + + open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; + *STDOUT=*OUT; + + if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25); + } + + if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { + $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12); + $avx += 1 if ($1==2.11 && $2>=8); + } + + if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); + } + + if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); + } +} else { + $avx = 4; # The kernel uses ifdefs for this. 
+} + +sub declare_function() { + my ($name, $align, $nargs) = @_; + if($kernel) { + $code .= "SYM_FUNC_START($name)\n"; + $code .= ".L$name:\n"; + } else { + $code .= ".globl $name\n"; + $code .= ".type $name,\@function,$nargs\n"; + $code .= ".align $align\n"; + $code .= "$name:\n"; + } +} + +sub declare_typed_function() { + my ($name, $align, $nargs) = @_; + if($kernel) { + $code .= "SYM_TYPED_FUNC_START($name)\n"; + $code .= ".L$name:\n"; + } else { + $code .= ".globl $name\n"; + $code .= ".type $name,\@function,$nargs\n"; + $code .= ".align $align\n"; + $code .= "$name:\n"; + } +} + +sub end_function() { + my ($name) = @_; + if($kernel) { + $code .= "SYM_FUNC_END($name)\n"; + } else { + $code .= ".size $name,.-$name\n"; + } +} + +$code.=<<___ if $kernel; +#include <linux/cfi_types.h> +___ + +if ($avx) { +$code.=<<___ if $kernel; +.section .rodata +___ +$code.=<<___; +.align 64 +.Lconst: +.Lmask24: +.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 +.L129: +.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 +.Lmask26: +.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 +.Lpermd_avx2: +.long 2,2,2,3,2,0,2,1 +.Lpermd_avx512: +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 + +.L2_44_inp_permd: +.long 0,1,1,2,2,3,7,7 +.L2_44_inp_shift: +.quad 0,12,24,64 +.L2_44_mask: +.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff +.L2_44_shift_rgt: +.quad 44,44,42,64 +.L2_44_shift_lft: +.quad 8,8,10,64 + +.align 64 +.Lx_mask44: +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.Lx_mask42: +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +___ +} +$code.=<<___ if (!$kernel); +.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" +.align 16 +___ + +my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); +my ($mac,$nonce)=($inp,$len); # *_emit arguments +my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13"); +my ($h0,$h1,$h2)=("%r14","%rbx","%r10"); + +sub poly1305_iteration { +# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 +# output: $h0-$h2 *= $r0-$r1 +$code.=<<___; + mulq $h0 # h0*r1 + mov %rax,$d2 + mov $r0,%rax + mov %rdx,$d3 + + mulq $h0 # h0*r0 + mov %rax,$h0 # future $h0 + mov $r0,%rax + mov %rdx,$d1 + + mulq $h1 # h1*r0 + add %rax,$d2 + mov $s1,%rax + adc %rdx,$d3 + + mulq $h1 # h1*s1 + mov $h2,$h1 # borrow $h1 + add %rax,$h0 + adc %rdx,$d1 + + imulq $s1,$h1 # h2*s1 + add $h1,$d2 + mov $d1,$h1 + adc \$0,$d3 + + imulq $r0,$h2 # h2*r0 + add $d2,$h1 + mov \$-4,%rax # mask value + adc $h2,$d3 + + and $d3,%rax # last reduction step + mov $d3,$h2 + shr \$2,$d3 + and \$3,$h2 + add $d3,%rax + add %rax,$h0 + adc \$0,$h1 + adc \$0,$h2 +___ +} + +######################################################################## +# Layout of opaque area is following. 
+# +# unsigned __int64 h[3]; # current hash value base 2^64 +# unsigned __int64 r[2]; # key value base 2^64 + +$code.=<<___; +.text +___ +$code.=<<___ if (!$kernel); +.extern OPENSSL_ia32cap_P + +.globl poly1305_block_init_arch +.hidden poly1305_block_init_arch +.globl poly1305_blocks_x86_64 +.hidden poly1305_blocks_x86_64 +.globl poly1305_emit_x86_64 +.hidden poly1305_emit_x86_64 +___ +&declare_typed_function("poly1305_block_init_arch", 32, 3); +$code.=<<___; + xor %eax,%eax + mov %rax,0($ctx) # initialize hash value + mov %rax,8($ctx) + mov %rax,16($ctx) + + test $inp,$inp + je .Lno_key +___ +$code.=<<___ if (!$kernel); + lea poly1305_blocks_x86_64(%rip),%r10 + lea poly1305_emit_x86_64(%rip),%r11 +___ +$code.=<<___ if (!$kernel && $avx); + mov OPENSSL_ia32cap_P+4(%rip),%r9 + lea poly1305_blocks_avx(%rip),%rax + lea poly1305_emit_avx(%rip),%rcx + bt \$`60-32`,%r9 # AVX? + cmovc %rax,%r10 + cmovc %rcx,%r11 +___ +$code.=<<___ if (!$kernel && $avx>1); + lea poly1305_blocks_avx2(%rip),%rax + bt \$`5+32`,%r9 # AVX2? + cmovc %rax,%r10 +___ +$code.=<<___ if (!$kernel && $avx>3); + mov \$`(1<<31|1<<21|1<<16)`,%rax + shr \$32,%r9 + and %rax,%r9 + cmp %rax,%r9 + je .Linit_base2_44 +___ +$code.=<<___; + mov \$0x0ffffffc0fffffff,%rax + mov \$0x0ffffffc0ffffffc,%rcx + and 0($inp),%rax + and 8($inp),%rcx + mov %rax,24($ctx) + mov %rcx,32($ctx) +___ +$code.=<<___ if (!$kernel && $flavour !~ /elf32/); + mov %r10,0(%rdx) + mov %r11,8(%rdx) +___ +$code.=<<___ if (!$kernel && $flavour =~ /elf32/); + mov %r10d,0(%rdx) + mov %r11d,4(%rdx) +___ +$code.=<<___; + mov \$1,%eax +.Lno_key: + RET +___ +&end_function("poly1305_block_init_arch"); + +&declare_function("poly1305_blocks_x86_64", 32, 4); +$code.=<<___; +.cfi_startproc +.Lblocks: + shr \$4,$len + jz .Lno_data # too short + + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $ctx +.cfi_push $ctx +.Lblocks_body: + + mov $len,%r15 # reassign $len + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + mov 0($ctx),$h0 # load hash value + mov 8($ctx),$h1 + mov 16($ctx),$h2 + + mov $s1,$r1 + shr \$2,$s1 + mov $r1,%rax + add $r1,$s1 # s1 = r1 + (r1 >> 2) + jmp .Loop + +.align 32 +.Loop: + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 +___ + + &poly1305_iteration(); + +$code.=<<___; + mov $r1,%rax + dec %r15 # len-=16 + jnz .Loop + + mov 0(%rsp),$ctx +.cfi_restore $ctx + + mov $h0,0($ctx) # store hash value + mov $h1,8($ctx) + mov $h2,16($ctx) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lno_data: +.Lblocks_epilogue: + RET +.cfi_endproc +___ +&end_function("poly1305_blocks_x86_64"); + +&declare_function("poly1305_emit_x86_64", 32, 3); +$code.=<<___; +.Lemit: + mov 0($ctx),%r8 # load hash value + mov 8($ctx),%r9 + mov 16($ctx),%r10 + + mov %r8,%rax + add \$5,%r8 # compare to modulus + mov %r9,%rcx + adc \$0,%r9 + adc \$0,%r10 + shr \$2,%r10 # did 130-bit value overflow? + cmovnz %r8,%rax + cmovnz %r9,%rcx + + add 0($nonce),%rax # accumulate nonce + adc 8($nonce),%rcx + mov %rax,0($mac) # write result + mov %rcx,8($mac) + + RET +___ +&end_function("poly1305_emit_x86_64"); +if ($avx) { + +######################################################################## +# Layout of opaque area is following. 
+# +# unsigned __int32 h[5]; # current hash value base 2^26 +# unsigned __int32 is_base2_26; +# unsigned __int64 r[2]; # key value base 2^64 +# unsigned __int64 pad; +# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9]; +# +# where r^n are base 2^26 digits of degrees of multiplier key. There are +# 5 digits, but last four are interleaved with multiples of 5, totalling +# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. + +my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) = + map("%xmm$_",(0..15)); + +$code.=<<___; +.type __poly1305_block,\@abi-omnipotent +.align 32 +__poly1305_block: + push $ctx +___ + &poly1305_iteration(); +$code.=<<___; + pop $ctx + RET +.size __poly1305_block,.-__poly1305_block + +.type __poly1305_init_avx,\@abi-omnipotent +.align 32 +__poly1305_init_avx: + push %rbp + mov %rsp,%rbp + mov $r0,$h0 + mov $r1,$h1 + xor $h2,$h2 + + lea 48+64($ctx),$ctx # size optimization + + mov $r1,%rax + call __poly1305_block # r^2 + + mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26 + mov \$0x3ffffff,%edx + mov $h0,$d1 + and $h0#d,%eax + mov $r0,$d2 + and $r0#d,%edx + mov %eax,`16*0+0-64`($ctx) + shr \$26,$d1 + mov %edx,`16*0+4-64`($ctx) + shr \$26,$d2 + + mov \$0x3ffffff,%eax + mov \$0x3ffffff,%edx + and $d1#d,%eax + and $d2#d,%edx + mov %eax,`16*1+0-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov %edx,`16*1+4-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + mov %eax,`16*2+0-64`($ctx) + shr \$26,$d1 + mov %edx,`16*2+4-64`($ctx) + shr \$26,$d2 + + mov $h1,%rax + mov $r1,%rdx + shl \$12,%rax + shl \$12,%rdx + or $d1,%rax + or $d2,%rdx + and \$0x3ffffff,%eax + and \$0x3ffffff,%edx + mov %eax,`16*3+0-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov %edx,`16*3+4-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + mov %eax,`16*4+0-64`($ctx) + mov $h1,$d1 + mov %edx,`16*4+4-64`($ctx) + mov $r1,$d2 + + mov \$0x3ffffff,%eax + mov \$0x3ffffff,%edx + shr \$14,$d1 + shr \$14,$d2 + and $d1#d,%eax + and $d2#d,%edx + mov %eax,`16*5+0-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov %edx,`16*5+4-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + mov %eax,`16*6+0-64`($ctx) + shr \$26,$d1 + mov %edx,`16*6+4-64`($ctx) + shr \$26,$d2 + + mov $h2,%rax + shl \$24,%rax + or %rax,$d1 + mov $d1#d,`16*7+0-64`($ctx) + lea ($d1,$d1,4),$d1 # *5 + mov $d2#d,`16*7+4-64`($ctx) + lea ($d2,$d2,4),$d2 # *5 + mov $d1#d,`16*8+0-64`($ctx) + mov $d2#d,`16*8+4-64`($ctx) + + mov $r1,%rax + call __poly1305_block # r^3 + + mov \$0x3ffffff,%eax # save r^3 base 2^26 + mov $h0,$d1 + and $h0#d,%eax + shr \$26,$d1 + mov %eax,`16*0+12-64`($ctx) + + mov \$0x3ffffff,%edx + and $d1#d,%edx + mov %edx,`16*1+12-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*2+12-64`($ctx) + + mov $h1,%rax + shl \$12,%rax + or $d1,%rax + and \$0x3ffffff,%eax + mov %eax,`16*3+12-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov $h1,$d1 + mov %eax,`16*4+12-64`($ctx) + + mov \$0x3ffffff,%edx + shr \$14,$d1 + and $d1#d,%edx + mov %edx,`16*5+12-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*6+12-64`($ctx) + + mov $h2,%rax + shl \$24,%rax + or %rax,$d1 + mov $d1#d,`16*7+12-64`($ctx) + lea ($d1,$d1,4),$d1 # *5 + mov $d1#d,`16*8+12-64`($ctx) + + mov $r1,%rax + call __poly1305_block # r^4 + + mov \$0x3ffffff,%eax # save r^4 base 2^26 + mov $h0,$d1 + and $h0#d,%eax + shr \$26,$d1 + mov %eax,`16*0+8-64`($ctx) + + mov \$0x3ffffff,%edx + and $d1#d,%edx + mov %edx,`16*1+8-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*2+8-64`($ctx) + + mov $h1,%rax + shl \$12,%rax + or $d1,%rax + and 
\$0x3ffffff,%eax + mov %eax,`16*3+8-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov $h1,$d1 + mov %eax,`16*4+8-64`($ctx) + + mov \$0x3ffffff,%edx + shr \$14,$d1 + and $d1#d,%edx + mov %edx,`16*5+8-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*6+8-64`($ctx) + + mov $h2,%rax + shl \$24,%rax + or %rax,$d1 + mov $d1#d,`16*7+8-64`($ctx) + lea ($d1,$d1,4),$d1 # *5 + mov $d1#d,`16*8+8-64`($ctx) + + lea -48-64($ctx),$ctx # size [de-]optimization + pop %rbp + RET +.size __poly1305_init_avx,.-__poly1305_init_avx +___ + +&declare_function("poly1305_blocks_avx", 32, 4); +$code.=<<___; +.cfi_startproc + mov 20($ctx),%r8d # is_base2_26 + cmp \$128,$len + jae .Lblocks_avx + test %r8d,%r8d + jz .Lblocks + +.Lblocks_avx: + and \$-16,$len + jz .Lno_data_avx + + vzeroupper + + test %r8d,%r8d + jz .Lbase2_64_avx + + test \$31,$len + jz .Leven_avx + + push %rbp +.cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lblocks_avx_body: + + mov $len,%r15 # reassign $len + + mov 0($ctx),$d1 # load hash value + mov 8($ctx),$d2 + mov 16($ctx),$h2#d + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + ################################# base 2^26 -> base 2^64 + mov $d1#d,$h0#d + and \$`-1*(1<<31)`,$d1 + mov $d2,$r1 # borrow $r1 + mov $d2#d,$h1#d + and \$`-1*(1<<31)`,$d2 + + shr \$6,$d1 + shl \$52,$r1 + add $d1,$h0 + shr \$12,$h1 + shr \$18,$d2 + add $r1,$h0 + adc $d2,$h1 + + mov $h2,$d1 + shl \$40,$d1 + shr \$24,$h2 + add $d1,$h1 + adc \$0,$h2 # can be partially reduced... + + mov \$-4,$d2 # ... so reduce + mov $h2,$d1 + and $h2,$d2 + shr \$2,$d1 + and \$3,$h2 + add $d2,$d1 # =*5 + add $d1,$h0 + adc \$0,$h1 + adc \$0,$h2 + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + + call __poly1305_block + + test $padbit,$padbit # if $padbit is zero, + jz .Lstore_base2_64_avx # store hash in base 2^64 format + + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$r0 + mov $h1,$r1 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$r0 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $r0,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$r1 + and \$0x3ffffff,$h1 # h[3] + or $r1,$h2 # h[4] + + sub \$16,%r15 + jz .Lstore_base2_26_avx + + vmovd %rax#d,$H0 + vmovd %rdx#d,$H1 + vmovd $h0#d,$H2 + vmovd $h1#d,$H3 + vmovd $h2#d,$H4 + jmp .Lproceed_avx + +.align 32 +.Lstore_base2_64_avx: + mov $h0,0($ctx) + mov $h1,8($ctx) + mov $h2,16($ctx) # note that is_base2_26 is zeroed + jmp .Ldone_avx + +.align 16 +.Lstore_base2_26_avx: + mov %rax#d,0($ctx) # store hash value base 2^26 + mov %rdx#d,4($ctx) + mov $h0#d,8($ctx) + mov $h1#d,12($ctx) + mov $h2#d,16($ctx) +.align 16 +.Ldone_avx: + pop %r15 +.cfi_restore %r15 + pop %r14 +.cfi_restore %r14 + pop %r13 +.cfi_restore %r13 + pop %r12 +.cfi_restore %r12 + pop %rbx +.cfi_restore %rbx + pop %rbp +.cfi_restore %rbp +.Lno_data_avx: +.Lblocks_avx_epilogue: + RET +.cfi_endproc + +.align 32 +.Lbase2_64_avx: +.cfi_startproc + push %rbp +.cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lbase2_64_avx_body: + + mov $len,%r15 # reassign $len + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + mov 0($ctx),$h0 # load hash value + mov 8($ctx),$h1 + 
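The base 2^26 <-> base 2^64 conversions that bracket the vector paths above are easier to follow in C. A standalone sketch of the same bit shuffling (it assumes fully reduced 26-bit limbs; the real code also copes with a small excess from partial reduction, and the names are local to this sketch):

/*
 * Standalone sketch of the radix conversions used around the vector code:
 * a 130-bit Poly1305 accumulator is kept either as five 26-bit limbs (for
 * the SIMD paths) or as two 64-bit limbs plus a few top bits (for the
 * scalar path). Illustrative only.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MASK26 ((1u << 26) - 1)

/* five 26-bit limbs -> h0 (bits 0..63), h1 (bits 64..127), h2 (bits 128..) */
static void limbs26_to_64(const uint32_t l[5], uint64_t *h0, uint64_t *h1,
			  uint64_t *h2)
{
	*h0 = (uint64_t)l[0] | ((uint64_t)l[1] << 26) | ((uint64_t)l[2] << 52);
	*h1 = ((uint64_t)l[2] >> 12) | ((uint64_t)l[3] << 14) |
	      ((uint64_t)l[4] << 40);
	*h2 = (uint64_t)l[4] >> 24;
}

/* ... and back: h0/h1/h2 -> five 26-bit limbs */
static void limbs64_to_26(uint64_t h0, uint64_t h1, uint64_t h2, uint32_t l[5])
{
	l[0] = h0 & MASK26;
	l[1] = (h0 >> 26) & MASK26;
	l[2] = ((h0 >> 52) | (h1 << 12)) & MASK26;
	l[3] = (h1 >> 14) & MASK26;
	l[4] = (uint32_t)((h1 >> 40) | (h2 << 24));
}

int main(void)
{
	uint32_t in[5] = { 0x3abcdef, 0x1234567, 0x2fedcba, 0x0123456, 0x3ffffff };
	uint32_t out[5];
	uint64_t h0, h1, h2;
	int i;

	limbs26_to_64(in, &h0, &h1, &h2);
	limbs64_to_26(h0, h1, h2, out);
	for (i = 0; i < 5; i++)
		assert(out[i] == in[i]);
	printf("radix 2^26 <-> 2^64 round trip OK\n");
	return 0;
}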
mov 16($ctx),$h2#d + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + + test \$31,$len + jz .Linit_avx + + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + sub \$16,%r15 + + call __poly1305_block + +.Linit_avx: + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$d1 + mov $h1,$d2 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$d1 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $d1,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$d2 + and \$0x3ffffff,$h1 # h[3] + or $d2,$h2 # h[4] + + vmovd %rax#d,$H0 + vmovd %rdx#d,$H1 + vmovd $h0#d,$H2 + vmovd $h1#d,$H3 + vmovd $h2#d,$H4 + movl \$1,20($ctx) # set is_base2_26 + + call __poly1305_init_avx + +.Lproceed_avx: + mov %r15,$len + pop %r15 +.cfi_restore %r15 + pop %r14 +.cfi_restore %r14 + pop %r13 +.cfi_restore %r13 + pop %r12 +.cfi_restore %r12 + pop %rbx +.cfi_restore %rbx + pop %rbp +.cfi_restore %rbp +.Lbase2_64_avx_epilogue: + jmp .Ldo_avx +.cfi_endproc + +.align 32 +.Leven_avx: +.cfi_startproc + vmovd 4*0($ctx),$H0 # load hash value + vmovd 4*1($ctx),$H1 + vmovd 4*2($ctx),$H2 + vmovd 4*3($ctx),$H3 + vmovd 4*4($ctx),$H4 + +.Ldo_avx: +___ +$code.=<<___ if (!$win64); + lea 8(%rsp),%r10 +.cfi_def_cfa_register %r10 + and \$-32,%rsp + sub \$-8,%rsp + lea -0x58(%rsp),%r11 + sub \$0x178,%rsp +___ +$code.=<<___ if ($win64); + lea -0xf8(%rsp),%r11 + sub \$0x218,%rsp + vmovdqa %xmm6,0x50(%r11) + vmovdqa %xmm7,0x60(%r11) + vmovdqa %xmm8,0x70(%r11) + vmovdqa %xmm9,0x80(%r11) + vmovdqa %xmm10,0x90(%r11) + vmovdqa %xmm11,0xa0(%r11) + vmovdqa %xmm12,0xb0(%r11) + vmovdqa %xmm13,0xc0(%r11) + vmovdqa %xmm14,0xd0(%r11) + vmovdqa %xmm15,0xe0(%r11) +.Ldo_avx_body: +___ +$code.=<<___; + sub \$64,$len + lea -32($inp),%rax + cmovc %rax,$inp + + vmovdqu `16*3`($ctx),$D4 # preload r0^2 + lea `16*3+64`($ctx),$ctx # size optimization + lea .Lconst(%rip),%rcx + + ################################################################ + # load input + vmovdqu 16*2($inp),$T0 + vmovdqu 16*3($inp),$T1 + vmovdqa 64(%rcx),$MASK # .Lmask26 + + vpsrldq \$6,$T0,$T2 # splat input + vpsrldq \$6,$T1,$T3 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpunpcklqdq $T3,$T2,$T3 # 2:3 + + vpsrlq \$40,$T4,$T4 # 4 + vpsrlq \$26,$T0,$T1 + vpand $MASK,$T0,$T0 # 0 + vpsrlq \$4,$T3,$T2 + vpand $MASK,$T1,$T1 # 1 + vpsrlq \$30,$T3,$T3 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + jbe .Lskip_loop_avx + + # expand and copy pre-calculated table to stack + vmovdqu `16*1-64`($ctx),$D1 + vmovdqu `16*2-64`($ctx),$D2 + vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 + vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 + vmovdqa $D3,-0x90(%r11) + vmovdqa $D0,0x00(%rsp) + vpshufd \$0xEE,$D1,$D4 + vmovdqu `16*3-64`($ctx),$D0 + vpshufd \$0x44,$D1,$D1 + vmovdqa $D4,-0x80(%r11) + vmovdqa $D1,0x10(%rsp) + vpshufd \$0xEE,$D2,$D3 + vmovdqu `16*4-64`($ctx),$D1 + vpshufd \$0x44,$D2,$D2 + vmovdqa $D3,-0x70(%r11) + vmovdqa $D2,0x20(%rsp) + vpshufd \$0xEE,$D0,$D4 + vmovdqu `16*5-64`($ctx),$D2 + vpshufd \$0x44,$D0,$D0 + vmovdqa $D4,-0x60(%r11) + vmovdqa $D0,0x30(%rsp) + vpshufd \$0xEE,$D1,$D3 + vmovdqu `16*6-64`($ctx),$D0 + vpshufd \$0x44,$D1,$D1 + vmovdqa $D3,-0x50(%r11) + vmovdqa $D1,0x40(%rsp) + vpshufd \$0xEE,$D2,$D4 + vmovdqu `16*7-64`($ctx),$D1 + vpshufd \$0x44,$D2,$D2 + vmovdqa $D4,-0x40(%r11) + vmovdqa $D2,0x50(%rsp) + vpshufd \$0xEE,$D0,$D3 + vmovdqu `16*8-64`($ctx),$D2 + vpshufd \$0x44,$D0,$D0 + vmovdqa 
$D3,-0x30(%r11) + vmovdqa $D0,0x60(%rsp) + vpshufd \$0xEE,$D1,$D4 + vpshufd \$0x44,$D1,$D1 + vmovdqa $D4,-0x20(%r11) + vmovdqa $D1,0x70(%rsp) + vpshufd \$0xEE,$D2,$D3 + vmovdqa 0x00(%rsp),$D4 # preload r0^2 + vpshufd \$0x44,$D2,$D2 + vmovdqa $D3,-0x10(%r11) + vmovdqa $D2,0x80(%rsp) + + jmp .Loop_avx + +.align 32 +.Loop_avx: + ################################################################ + # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + # \___________________/ + # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + # \___________________/ \____________________/ + # + # Note that we start with inp[2:3]*r^2. This is because it + # doesn't depend on reduction in previous iteration. + ################################################################ + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + # + # though note that $Tx and $Hx are "reversed" in this section, + # and $D4 is preloaded with r0^2... + + vpmuludq $T0,$D4,$D0 # d0 = h0*r0 + vpmuludq $T1,$D4,$D1 # d1 = h1*r0 + vmovdqa $H2,0x20(%r11) # offload hash + vpmuludq $T2,$D4,$D2 # d3 = h2*r0 + vmovdqa 0x10(%rsp),$H2 # r1^2 + vpmuludq $T3,$D4,$D3 # d3 = h3*r0 + vpmuludq $T4,$D4,$D4 # d4 = h4*r0 + + vmovdqa $H0,0x00(%r11) # + vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 + vmovdqa $H1,0x10(%r11) # + vpmuludq $T3,$H2,$H1 # h3*r1 + vpaddq $H0,$D0,$D0 # d0 += h4*s1 + vpaddq $H1,$D4,$D4 # d4 += h3*r1 + vmovdqa $H3,0x30(%r11) # + vpmuludq $T2,$H2,$H0 # h2*r1 + vpmuludq $T1,$H2,$H1 # h1*r1 + vpaddq $H0,$D3,$D3 # d3 += h2*r1 + vmovdqa 0x30(%rsp),$H3 # r2^2 + vpaddq $H1,$D2,$D2 # d2 += h1*r1 + vmovdqa $H4,0x40(%r11) # + vpmuludq $T0,$H2,$H2 # h0*r1 + vpmuludq $T2,$H3,$H0 # h2*r2 + vpaddq $H2,$D1,$D1 # d1 += h0*r1 + + vmovdqa 0x40(%rsp),$H4 # s2^2 + vpaddq $H0,$D4,$D4 # d4 += h2*r2 + vpmuludq $T1,$H3,$H1 # h1*r2 + vpmuludq $T0,$H3,$H3 # h0*r2 + vpaddq $H1,$D3,$D3 # d3 += h1*r2 + vmovdqa 0x50(%rsp),$H2 # r3^2 + vpaddq $H3,$D2,$D2 # d2 += h0*r2 + vpmuludq $T4,$H4,$H0 # h4*s2 + vpmuludq $T3,$H4,$H4 # h3*s2 + vpaddq $H0,$D1,$D1 # d1 += h4*s2 + vmovdqa 0x60(%rsp),$H3 # s3^2 + vpaddq $H4,$D0,$D0 # d0 += h3*s2 + + vmovdqa 0x80(%rsp),$H4 # s4^2 + vpmuludq $T1,$H2,$H1 # h1*r3 + vpmuludq $T0,$H2,$H2 # h0*r3 + vpaddq $H1,$D4,$D4 # d4 += h1*r3 + vpaddq $H2,$D3,$D3 # d3 += h0*r3 + vpmuludq $T4,$H3,$H0 # h4*s3 + vpmuludq $T3,$H3,$H1 # h3*s3 + vpaddq $H0,$D2,$D2 # d2 += h4*s3 + vmovdqu 16*0($inp),$H0 # load input + vpaddq $H1,$D1,$D1 # d1 += h3*s3 + vpmuludq $T2,$H3,$H3 # h2*s3 + vpmuludq $T2,$H4,$T2 # h2*s4 + vpaddq $H3,$D0,$D0 # d0 += h2*s3 + + vmovdqu 16*1($inp),$H1 # + vpaddq $T2,$D1,$D1 # d1 += h2*s4 + vpmuludq $T3,$H4,$T3 # h3*s4 + vpmuludq $T4,$H4,$T4 # h4*s4 + vpsrldq \$6,$H0,$H2 # splat input + vpaddq $T3,$D2,$D2 # d2 += h3*s4 + vpaddq $T4,$D3,$D3 # d3 += h4*s4 + vpsrldq \$6,$H1,$H3 # + vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 + vpmuludq $T1,$H4,$T0 # h1*s4 + vpunpckhqdq $H1,$H0,$H4 # 4 + vpaddq $T4,$D4,$D4 # d4 += h0*r4 + vmovdqa -0x90(%r11),$T4 # r0^4 + vpaddq $T0,$D0,$D0 # d0 += h1*s4 + + vpunpcklqdq $H1,$H0,$H0 # 0:1 + vpunpcklqdq $H3,$H2,$H3 # 2:3 + + #vpsrlq \$40,$H4,$H4 # 4 + vpsrldq \$`40/8`,$H4,$H4 # 4 + vpsrlq \$26,$H0,$H1 + vpand $MASK,$H0,$H0 # 0 + vpsrlq \$4,$H3,$H2 + vpand $MASK,$H1,$H1 # 1 + vpand 0(%rcx),$H4,$H4 # .Lmask24 + 
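The d0..d4 formulas in the comments above are the radix-2^26 schoolbook product of h and r with every partial product whose weight reaches 2^130 folded back in times 5 (2^130 == 5 mod 2^130 - 5), which is why the precomputed table interleaves r with 5*r. A standalone C sketch that writes the five sums out explicitly and cross-checks them against the generic folded convolution (illustrative only; limb bounds are assumed, not enforced):

/*
 * Standalone sketch of the limb product used by the vector loops: h and r
 * are five 26-bit limbs, and any term whose weight reaches 2^130 is folded
 * back in multiplied by 5, since 2^130 == 5 (mod 2^130 - 5).
 * Not the kernel implementation.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static void mul_folded(const uint64_t h[5], const uint64_t r[5], uint64_t d[5])
{
	uint64_t s[5];			/* s[i] = 5*r[i]; s[0] is never used */
	int i, j, k;

	for (i = 0; i < 5; i++)
		s[i] = 5 * r[i];

	/* the formulas spelled out in the assembly comments */
	d[0] = h[0]*r[0] + h[1]*s[4] + h[2]*s[3] + h[3]*s[2] + h[4]*s[1];
	d[1] = h[0]*r[1] + h[1]*r[0] + h[2]*s[4] + h[3]*s[3] + h[4]*s[2];
	d[2] = h[0]*r[2] + h[1]*r[1] + h[2]*r[0] + h[3]*s[4] + h[4]*s[3];
	d[3] = h[0]*r[3] + h[1]*r[2] + h[2]*r[1] + h[3]*r[0] + h[4]*s[4];
	d[4] = h[0]*r[4] + h[1]*r[3] + h[2]*r[2] + h[3]*r[1] + h[4]*r[0];

	/* same thing as a generic folded 5x5 convolution, as a self-check */
	for (k = 0; k < 5; k++) {
		uint64_t acc = 0;

		for (i = 0; i < 5; i++) {
			j = (k - i + 5) % 5;
			acc += h[i] * r[j] * (i + j >= 5 ? 5 : 1);
		}
		assert(acc == d[k]);
	}
}

int main(void)
{
	uint64_t h[5] = { 0x2345678, 0x1abcdef, 0x0fedcba, 0x3333333, 0x1111111 };
	uint64_t r[5] = { 0x0ffffff, 0x2222222, 0x0123456, 0x1010101, 0x0abcdef };
	uint64_t d[5];

	mul_folded(h, r, d);
	printf("d0=%llx d4=%llx\n", (unsigned long long)d[0],
	       (unsigned long long)d[4]);
	return 0;
}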
vpsrlq \$30,$H3,$H3 + vpand $MASK,$H2,$H2 # 2 + vpand $MASK,$H3,$H3 # 3 + vpor 32(%rcx),$H4,$H4 # padbit, yes, always + + vpaddq 0x00(%r11),$H0,$H0 # add hash value + vpaddq 0x10(%r11),$H1,$H1 + vpaddq 0x20(%r11),$H2,$H2 + vpaddq 0x30(%r11),$H3,$H3 + vpaddq 0x40(%r11),$H4,$H4 + + lea 16*2($inp),%rax + lea 16*4($inp),$inp + sub \$64,$len + cmovc %rax,$inp + + ################################################################ + # Now we accumulate (inp[0:1]+hash)*r^4 + ################################################################ + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + vpmuludq $H0,$T4,$T0 # h0*r0 + vpmuludq $H1,$T4,$T1 # h1*r0 + vpaddq $T0,$D0,$D0 + vpaddq $T1,$D1,$D1 + vmovdqa -0x80(%r11),$T2 # r1^4 + vpmuludq $H2,$T4,$T0 # h2*r0 + vpmuludq $H3,$T4,$T1 # h3*r0 + vpaddq $T0,$D2,$D2 + vpaddq $T1,$D3,$D3 + vpmuludq $H4,$T4,$T4 # h4*r0 + vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 + vpaddq $T4,$D4,$D4 + + vpaddq $T0,$D0,$D0 # d0 += h4*s1 + vpmuludq $H2,$T2,$T1 # h2*r1 + vpmuludq $H3,$T2,$T0 # h3*r1 + vpaddq $T1,$D3,$D3 # d3 += h2*r1 + vmovdqa -0x60(%r11),$T3 # r2^4 + vpaddq $T0,$D4,$D4 # d4 += h3*r1 + vpmuludq $H1,$T2,$T1 # h1*r1 + vpmuludq $H0,$T2,$T2 # h0*r1 + vpaddq $T1,$D2,$D2 # d2 += h1*r1 + vpaddq $T2,$D1,$D1 # d1 += h0*r1 + + vmovdqa -0x50(%r11),$T4 # s2^4 + vpmuludq $H2,$T3,$T0 # h2*r2 + vpmuludq $H1,$T3,$T1 # h1*r2 + vpaddq $T0,$D4,$D4 # d4 += h2*r2 + vpaddq $T1,$D3,$D3 # d3 += h1*r2 + vmovdqa -0x40(%r11),$T2 # r3^4 + vpmuludq $H0,$T3,$T3 # h0*r2 + vpmuludq $H4,$T4,$T0 # h4*s2 + vpaddq $T3,$D2,$D2 # d2 += h0*r2 + vpaddq $T0,$D1,$D1 # d1 += h4*s2 + vmovdqa -0x30(%r11),$T3 # s3^4 + vpmuludq $H3,$T4,$T4 # h3*s2 + vpmuludq $H1,$T2,$T1 # h1*r3 + vpaddq $T4,$D0,$D0 # d0 += h3*s2 + + vmovdqa -0x10(%r11),$T4 # s4^4 + vpaddq $T1,$D4,$D4 # d4 += h1*r3 + vpmuludq $H0,$T2,$T2 # h0*r3 + vpmuludq $H4,$T3,$T0 # h4*s3 + vpaddq $T2,$D3,$D3 # d3 += h0*r3 + vpaddq $T0,$D2,$D2 # d2 += h4*s3 + vmovdqu 16*2($inp),$T0 # load input + vpmuludq $H3,$T3,$T2 # h3*s3 + vpmuludq $H2,$T3,$T3 # h2*s3 + vpaddq $T2,$D1,$D1 # d1 += h3*s3 + vmovdqu 16*3($inp),$T1 # + vpaddq $T3,$D0,$D0 # d0 += h2*s3 + + vpmuludq $H2,$T4,$H2 # h2*s4 + vpmuludq $H3,$T4,$H3 # h3*s4 + vpsrldq \$6,$T0,$T2 # splat input + vpaddq $H2,$D1,$D1 # d1 += h2*s4 + vpmuludq $H4,$T4,$H4 # h4*s4 + vpsrldq \$6,$T1,$T3 # + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 + vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 + vpmuludq $H1,$T4,$H0 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 + + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpunpcklqdq $T3,$T2,$T3 # 2:3 + + #vpsrlq \$40,$T4,$T4 # 4 + vpsrldq \$`40/8`,$T4,$T4 # 4 + vpsrlq \$26,$T0,$T1 + vmovdqa 0x00(%rsp),$D4 # preload r0^2 + vpand $MASK,$T0,$T0 # 0 + vpsrlq \$4,$T3,$T2 + vpand $MASK,$T1,$T1 # 1 + vpand 0(%rcx),$T4,$T4 # .Lmask24 + vpsrlq \$30,$T3,$T3 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + ################################################################ + # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + # and P. 
Schwabe + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$D1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D0 + vpand $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D0,$H0,$H0 + vpsllq \$2,$D0,$D0 + vpaddq $D0,$H0,$H0 # h4 -> h0 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + ja .Loop_avx + +.Lskip_loop_avx: + ################################################################ + # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 + add \$32,$len + jnz .Long_tail_avx + + vpaddq $H2,$T2,$T2 + vpaddq $H0,$T0,$T0 + vpaddq $H1,$T1,$T1 + vpaddq $H3,$T3,$T3 + vpaddq $H4,$T4,$T4 + +.Long_tail_avx: + vmovdqa $H2,0x20(%r11) + vmovdqa $H0,0x00(%r11) + vmovdqa $H1,0x10(%r11) + vmovdqa $H3,0x30(%r11) + vmovdqa $H4,0x40(%r11) + + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + vpmuludq $T2,$D4,$D2 # d2 = h2*r0 + vpmuludq $T0,$D4,$D0 # d0 = h0*r0 + vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n + vpmuludq $T1,$D4,$D1 # d1 = h1*r0 + vpmuludq $T3,$D4,$D3 # d3 = h3*r0 + vpmuludq $T4,$D4,$D4 # d4 = h4*r0 + + vpmuludq $T3,$H2,$H0 # h3*r1 + vpaddq $H0,$D4,$D4 # d4 += h3*r1 + vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n + vpmuludq $T2,$H2,$H1 # h2*r1 + vpaddq $H1,$D3,$D3 # d3 += h2*r1 + vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n + vpmuludq $T1,$H2,$H0 # h1*r1 + vpaddq $H0,$D2,$D2 # d2 += h1*r1 + vpmuludq $T0,$H2,$H2 # h0*r1 + vpaddq $H2,$D1,$D1 # d1 += h0*r1 + vpmuludq $T4,$H3,$H3 # h4*s1 + vpaddq $H3,$D0,$D0 # d0 += h4*s1 + + vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n + vpmuludq $T2,$H4,$H1 # h2*r2 + vpaddq $H1,$D4,$D4 # d4 += h2*r2 + vpmuludq $T1,$H4,$H0 # h1*r2 + vpaddq $H0,$D3,$D3 # d3 += h1*r2 + vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n + vpmuludq $T0,$H4,$H4 # h0*r2 + vpaddq $H4,$D2,$D2 # d2 += h0*r2 + vpmuludq $T4,$H2,$H1 # h4*s2 + vpaddq $H1,$D1,$D1 # d1 += h4*s2 + vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n + vpmuludq $T3,$H2,$H2 # h3*s2 + vpaddq $H2,$D0,$D0 # d0 += h3*s2 + + vpmuludq $T1,$H3,$H0 # h1*r3 + vpaddq $H0,$D4,$D4 # d4 += h1*r3 + vpmuludq $T0,$H3,$H3 # h0*r3 + vpaddq $H3,$D3,$D3 # d3 += h0*r3 + vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n + vpmuludq $T4,$H4,$H1 # h4*s3 + vpaddq $H1,$D2,$D2 # d2 += h4*s3 + vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n + vpmuludq $T3,$H4,$H0 # h3*s3 + vpaddq $H0,$D1,$D1 # d1 += h3*s3 + vpmuludq $T2,$H4,$H4 # h2*s3 + vpaddq $H4,$D0,$D0 # d0 += h2*s3 + + vpmuludq $T0,$H2,$H2 # h0*r4 + vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 + vpmuludq $T4,$H3,$H1 # h4*s4 + vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 + vpmuludq $T3,$H3,$H0 # h3*s4 + vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 + vpmuludq $T2,$H3,$H1 # h2*s4 + vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 + vpmuludq $T1,$H3,$H3 # h1*s4 + vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4 + + jz .Lshort_tail_avx + + vmovdqu 16*0($inp),$H0 # load input + vmovdqu 16*1($inp),$H1 + + vpsrldq \$6,$H0,$H2 # splat input + vpsrldq \$6,$H1,$H3 + vpunpckhqdq $H1,$H0,$H4 # 4 + vpunpcklqdq $H1,$H0,$H0 # 0:1 + vpunpcklqdq $H3,$H2,$H3 # 2:3 + + vpsrlq \$40,$H4,$H4 # 4 + vpsrlq \$26,$H0,$H1 + vpand $MASK,$H0,$H0 # 0 + 
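The "lazy reduction" above (the reference is to the carry scheme described in "NEON crypto" by Bernstein and Schwabe) never reduces fully: each pass only pushes carries one limb to the left, and the carry out of the top limb re-enters at the bottom multiplied by 5, computed as c + (c << 2). A standalone C sketch of one such pass, in the same carry order as the assembly (illustrative only):

/*
 * One lazy-reduction pass over five 26-bit limbs: carries move one limb at
 * a time, and the carry out of the top limb wraps to the bottom limb times
 * 5 (c + (c << 2)), because 2^130 == 5 (mod 2^130 - 5). The limbs end up
 * slightly above 26 bits, which is fine for the next multiplication.
 * Illustrative only.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MASK26 ((1ull << 26) - 1)

static void lazy_reduce(uint64_t h[5])
{
	uint64_t c, c4;

	c  = h[3] >> 26; h[3] &= MASK26; h[4] += c;		/* h3 -> h4 */
	c  = h[0] >> 26; h[0] &= MASK26; h[1] += c;		/* h0 -> h1 */
	c4 = h[4] >> 26; h[4] &= MASK26;			/* carry out of h4 */
	c  = h[1] >> 26; h[1] &= MASK26; h[2] += c;		/* h1 -> h2 */
	h[0] += c4 + (c4 << 2);					/* h4 -> h0, times 5 */
	c  = h[2] >> 26; h[2] &= MASK26; h[3] += c;		/* h2 -> h3 */
	c  = h[0] >> 26; h[0] &= MASK26; h[1] += c;		/* h0 -> h1 */
	c  = h[3] >> 26; h[3] &= MASK26; h[4] += c;		/* h3 -> h4 */
}

int main(void)
{
	/* unreduced sums as they come out of a limb multiplication */
	uint64_t h[5] = { 0x123456789abcdull, 0x0fedcba98765ull,
			  0x0abcdef012345ull, 0x05555aaaa5555ull,
			  0x0777777777777ull };
	int i;

	lazy_reduce(h);
	for (i = 0; i < 5; i++)
		assert(h[i] < (1ull << 27));	/* small enough for the next round */
	printf("lazy reduction sketch OK\n");
	return 0;
}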
vpsrlq \$4,$H3,$H2 + vpand $MASK,$H1,$H1 # 1 + vpsrlq \$30,$H3,$H3 + vpand $MASK,$H2,$H2 # 2 + vpand $MASK,$H3,$H3 # 3 + vpor 32(%rcx),$H4,$H4 # padbit, yes, always + + vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 + vpaddq 0x00(%r11),$H0,$H0 + vpaddq 0x10(%r11),$H1,$H1 + vpaddq 0x20(%r11),$H2,$H2 + vpaddq 0x30(%r11),$H3,$H3 + vpaddq 0x40(%r11),$H4,$H4 + + ################################################################ + # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate + + vpmuludq $H0,$T4,$T0 # h0*r0 + vpaddq $T0,$D0,$D0 # d0 += h0*r0 + vpmuludq $H1,$T4,$T1 # h1*r0 + vpaddq $T1,$D1,$D1 # d1 += h1*r0 + vpmuludq $H2,$T4,$T0 # h2*r0 + vpaddq $T0,$D2,$D2 # d2 += h2*r0 + vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n + vpmuludq $H3,$T4,$T1 # h3*r0 + vpaddq $T1,$D3,$D3 # d3 += h3*r0 + vpmuludq $H4,$T4,$T4 # h4*r0 + vpaddq $T4,$D4,$D4 # d4 += h4*r0 + + vpmuludq $H3,$T2,$T0 # h3*r1 + vpaddq $T0,$D4,$D4 # d4 += h3*r1 + vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 + vpmuludq $H2,$T2,$T1 # h2*r1 + vpaddq $T1,$D3,$D3 # d3 += h2*r1 + vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 + vpmuludq $H1,$T2,$T0 # h1*r1 + vpaddq $T0,$D2,$D2 # d2 += h1*r1 + vpmuludq $H0,$T2,$T2 # h0*r1 + vpaddq $T2,$D1,$D1 # d1 += h0*r1 + vpmuludq $H4,$T3,$T3 # h4*s1 + vpaddq $T3,$D0,$D0 # d0 += h4*s1 + + vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 + vpmuludq $H2,$T4,$T1 # h2*r2 + vpaddq $T1,$D4,$D4 # d4 += h2*r2 + vpmuludq $H1,$T4,$T0 # h1*r2 + vpaddq $T0,$D3,$D3 # d3 += h1*r2 + vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 + vpmuludq $H0,$T4,$T4 # h0*r2 + vpaddq $T4,$D2,$D2 # d2 += h0*r2 + vpmuludq $H4,$T2,$T1 # h4*s2 + vpaddq $T1,$D1,$D1 # d1 += h4*s2 + vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 + vpmuludq $H3,$T2,$T2 # h3*s2 + vpaddq $T2,$D0,$D0 # d0 += h3*s2 + + vpmuludq $H1,$T3,$T0 # h1*r3 + vpaddq $T0,$D4,$D4 # d4 += h1*r3 + vpmuludq $H0,$T3,$T3 # h0*r3 + vpaddq $T3,$D3,$D3 # d3 += h0*r3 + vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 + vpmuludq $H4,$T4,$T1 # h4*s3 + vpaddq $T1,$D2,$D2 # d2 += h4*s3 + vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 + vpmuludq $H3,$T4,$T0 # h3*s3 + vpaddq $T0,$D1,$D1 # d1 += h3*s3 + vpmuludq $H2,$T4,$T4 # h2*s3 + vpaddq $T4,$D0,$D0 # d0 += h2*s3 + + vpmuludq $H0,$T2,$T2 # h0*r4 + vpaddq $T2,$D4,$D4 # d4 += h0*r4 + vpmuludq $H4,$T3,$T1 # h4*s4 + vpaddq $T1,$D3,$D3 # d3 += h4*s4 + vpmuludq $H3,$T3,$T0 # h3*s4 + vpaddq $T0,$D2,$D2 # d2 += h3*s4 + vpmuludq $H2,$T3,$T1 # h2*s4 + vpaddq $T1,$D1,$D1 # d1 += h2*s4 + vpmuludq $H1,$T3,$T3 # h1*s4 + vpaddq $T3,$D0,$D0 # d0 += h1*s4 + +.Lshort_tail_avx: + ################################################################ + # horizontal addition + + vpsrldq \$8,$D4,$T4 + vpsrldq \$8,$D3,$T3 + vpsrldq \$8,$D1,$T1 + vpsrldq \$8,$D0,$T0 + vpsrldq \$8,$D2,$T2 + vpaddq $T3,$D3,$D3 + vpaddq $T4,$D4,$D4 + vpaddq $T0,$D0,$D0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$D2,$D2 + + ################################################################ + # lazy reduction + + vpsrlq \$26,$D3,$H3 + vpand $MASK,$D3,$D3 + vpaddq $H3,$D4,$D4 # h3 -> h4 + + vpsrlq \$26,$D0,$H0 + vpand $MASK,$D0,$D0 + vpaddq $H0,$D1,$D1 # h0 -> h1 + + vpsrlq \$26,$D4,$H4 + vpand $MASK,$D4,$D4 + + vpsrlq \$26,$D1,$H1 + vpand $MASK,$D1,$D1 + vpaddq $H1,$D2,$D2 # h1 -> h2 + + vpaddq $H4,$D0,$D0 + vpsllq \$2,$H4,$H4 + vpaddq $H4,$D0,$D0 # h4 -> h0 + + vpsrlq \$26,$D2,$H2 + vpand $MASK,$D2,$D2 + vpaddq $H2,$D3,$D3 # h2 -> h3 + + vpsrlq \$26,$D0,$H0 + vpand $MASK,$D0,$D0 + vpaddq $H0,$D1,$D1 # h0 -> h1 + + vpsrlq \$26,$D3,$H3 + vpand $MASK,$D3,$D3 + vpaddq $H3,$D4,$D4 # h3 -> h4 + + vmovd $D0,`4*0-48-64`($ctx) # save partially reduced + 
vmovd $D1,`4*1-48-64`($ctx) + vmovd $D2,`4*2-48-64`($ctx) + vmovd $D3,`4*3-48-64`($ctx) + vmovd $D4,`4*4-48-64`($ctx) +___ +$code.=<<___ if ($win64); + vmovdqa 0x50(%r11),%xmm6 + vmovdqa 0x60(%r11),%xmm7 + vmovdqa 0x70(%r11),%xmm8 + vmovdqa 0x80(%r11),%xmm9 + vmovdqa 0x90(%r11),%xmm10 + vmovdqa 0xa0(%r11),%xmm11 + vmovdqa 0xb0(%r11),%xmm12 + vmovdqa 0xc0(%r11),%xmm13 + vmovdqa 0xd0(%r11),%xmm14 + vmovdqa 0xe0(%r11),%xmm15 + lea 0xf8(%r11),%rsp +.Ldo_avx_epilogue: +___ +$code.=<<___ if (!$win64); + lea -8(%r10),%rsp +.cfi_def_cfa_register %rsp +___ +$code.=<<___; + vzeroupper + RET +.cfi_endproc +___ +&end_function("poly1305_blocks_avx"); + +&declare_function("poly1305_emit_avx", 32, 3); +$code.=<<___; + cmpl \$0,20($ctx) # is_base2_26? + je .Lemit + + mov 0($ctx),%eax # load hash value base 2^26 + mov 4($ctx),%ecx + mov 8($ctx),%r8d + mov 12($ctx),%r11d + mov 16($ctx),%r10d + + shl \$26,%rcx # base 2^26 -> base 2^64 + mov %r8,%r9 + shl \$52,%r8 + add %rcx,%rax + shr \$12,%r9 + add %rax,%r8 # h0 + adc \$0,%r9 + + shl \$14,%r11 + mov %r10,%rax + shr \$24,%r10 + add %r11,%r9 + shl \$40,%rax + add %rax,%r9 # h1 + adc \$0,%r10 # h2 + + mov %r10,%rax # could be partially reduced, so reduce + mov %r10,%rcx + and \$3,%r10 + shr \$2,%rax + and \$-4,%rcx + add %rcx,%rax + add %rax,%r8 + adc \$0,%r9 + adc \$0,%r10 + + mov %r8,%rax + add \$5,%r8 # compare to modulus + mov %r9,%rcx + adc \$0,%r9 + adc \$0,%r10 + shr \$2,%r10 # did 130-bit value overflow? + cmovnz %r8,%rax + cmovnz %r9,%rcx + + add 0($nonce),%rax # accumulate nonce + adc 8($nonce),%rcx + mov %rax,0($mac) # write result + mov %rcx,8($mac) + + RET +___ +&end_function("poly1305_emit_avx"); + +if ($avx>1) { + +my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = + map("%ymm$_",(0..15)); +my $S4=$MASK; + +sub poly1305_blocks_avxN { + my ($avx512) = @_; + my $suffix = $avx512 ? "_avx512" : ""; +$code.=<<___; +.cfi_startproc + mov 20($ctx),%r8d # is_base2_26 + cmp \$128,$len + jae .Lblocks_avx2$suffix + test %r8d,%r8d + jz .Lblocks + +.Lblocks_avx2$suffix: + and \$-16,$len + jz .Lno_data_avx2$suffix + + vzeroupper + + test %r8d,%r8d + jz .Lbase2_64_avx2$suffix + + test \$63,$len + jz .Leven_avx2$suffix + + push %rbp +.cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lblocks_avx2_body$suffix: + + mov $len,%r15 # reassign $len + + mov 0($ctx),$d1 # load hash value + mov 8($ctx),$d2 + mov 16($ctx),$h2#d + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + ################################# base 2^26 -> base 2^64 + mov $d1#d,$h0#d + and \$`-1*(1<<31)`,$d1 + mov $d2,$r1 # borrow $r1 + mov $d2#d,$h1#d + and \$`-1*(1<<31)`,$d2 + + shr \$6,$d1 + shl \$52,$r1 + add $d1,$h0 + shr \$12,$h1 + shr \$18,$d2 + add $r1,$h0 + adc $d2,$h1 + + mov $h2,$d1 + shl \$40,$d1 + shr \$24,$h2 + add $d1,$h1 + adc \$0,$h2 # can be partially reduced... + + mov \$-4,$d2 # ... 
so reduce + mov $h2,$d1 + and $h2,$d2 + shr \$2,$d1 + and \$3,$h2 + add $d2,$d1 # =*5 + add $d1,$h0 + adc \$0,$h1 + adc \$0,$h2 + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + +.Lbase2_26_pre_avx2$suffix: + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + sub \$16,%r15 + + call __poly1305_block + mov $r1,%rax + + test \$63,%r15 + jnz .Lbase2_26_pre_avx2$suffix + + test $padbit,$padbit # if $padbit is zero, + jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format + + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$r0 + mov $h1,$r1 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$r0 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $r0,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$r1 + and \$0x3ffffff,$h1 # h[3] + or $r1,$h2 # h[4] + + test %r15,%r15 + jz .Lstore_base2_26_avx2$suffix + + vmovd %rax#d,%x#$H0 + vmovd %rdx#d,%x#$H1 + vmovd $h0#d,%x#$H2 + vmovd $h1#d,%x#$H3 + vmovd $h2#d,%x#$H4 + jmp .Lproceed_avx2$suffix + +.align 32 +.Lstore_base2_64_avx2$suffix: + mov $h0,0($ctx) + mov $h1,8($ctx) + mov $h2,16($ctx) # note that is_base2_26 is zeroed + jmp .Ldone_avx2$suffix + +.align 16 +.Lstore_base2_26_avx2$suffix: + mov %rax#d,0($ctx) # store hash value base 2^26 + mov %rdx#d,4($ctx) + mov $h0#d,8($ctx) + mov $h1#d,12($ctx) + mov $h2#d,16($ctx) +.align 16 +.Ldone_avx2$suffix: + pop %r15 +.cfi_restore %r15 + pop %r14 +.cfi_restore %r14 + pop %r13 +.cfi_restore %r13 + pop %r12 +.cfi_restore %r12 + pop %rbx +.cfi_restore %rbx + pop %rbp +.cfi_restore %rbp +.Lno_data_avx2$suffix: +.Lblocks_avx2_epilogue$suffix: + RET +.cfi_endproc + +.align 32 +.Lbase2_64_avx2$suffix: +.cfi_startproc + push %rbp +.cfi_push %rbp + mov %rsp,%rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lbase2_64_avx2_body$suffix: + + mov $len,%r15 # reassign $len + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + mov 0($ctx),$h0 # load hash value + mov 8($ctx),$h1 + mov 16($ctx),$h2#d + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + + test \$63,$len + jz .Linit_avx2$suffix + +.Lbase2_64_pre_avx2$suffix: + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + sub \$16,%r15 + + call __poly1305_block + mov $r1,%rax + + test \$63,%r15 + jnz .Lbase2_64_pre_avx2$suffix + +.Linit_avx2$suffix: + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$d1 + mov $h1,$d2 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$d1 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $d1,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$d2 + and \$0x3ffffff,$h1 # h[3] + or $d2,$h2 # h[4] + + vmovd %rax#d,%x#$H0 + vmovd %rdx#d,%x#$H1 + vmovd $h0#d,%x#$H2 + vmovd $h1#d,%x#$H3 + vmovd $h2#d,%x#$H4 + movl \$1,20($ctx) # set is_base2_26 + + call __poly1305_init_avx + +.Lproceed_avx2$suffix: + mov %r15,$len # restore $len +___ +$code.=<<___ if (!$kernel); + mov OPENSSL_ia32cap_P+8(%rip),%r9d + mov \$`(1<<31|1<<30|1<<16)`,%r11d +___ +$code.=<<___; + pop %r15 +.cfi_restore %r15 + pop %r14 +.cfi_restore %r14 + pop %r13 +.cfi_restore %r13 + pop %r12 +.cfi_restore %r12 + pop %rbx +.cfi_restore %rbx + pop %rbp +.cfi_restore %rbp +.Lbase2_64_avx2_epilogue$suffix: + jmp .Ldo_avx2$suffix +.cfi_endproc + +.align 32 +.Leven_avx2$suffix: 
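The poly1305_emit routines earlier in this file finish with the same trick in scalar form: fold any bits at position 130 and above back in times 5, decide whether h is still >= 2^130 - 5 by adding 5 and checking for a carry out of bit 129, then add the 128-bit nonce modulo 2^128. A standalone C sketch of that final step (it uses an if where the assembly uses cmovnz to stay constant-time, and the unsigned __int128 extension only to make the carries readable; illustrative only):

/*
 * Standalone sketch of the tag finalisation done by the poly1305_emit
 * routines: h is held as two 64-bit limbs plus a small top limb h2.
 * Not constant-time and not the kernel implementation.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef unsigned __int128 u128;

static void emit_sketch(uint64_t h0, uint64_t h1, uint64_t h2,
			uint64_t n0, uint64_t n1, uint8_t mac[16])
{
	uint64_t g0, g1, g2, c;
	u128 t;
	int i;

	/* fold bits 130 and up back in: 2^130 == 5 (mod 2^130 - 5) */
	c = (h2 >> 2) + (h2 & ~3ull);			/* 5 * (h2 >> 2) */
	h2 &= 3;
	t = (u128)h0 + c;                   h0 = (uint64_t)t;
	t = (u128)h1 + (uint64_t)(t >> 64); h1 = (uint64_t)t;
	h2 += (uint64_t)(t >> 64);

	/* g = h + 5; a carry out of bit 129 means h >= 2^130 - 5 */
	t = (u128)h0 + 5;                   g0 = (uint64_t)t;
	t = (u128)h1 + (uint64_t)(t >> 64); g1 = (uint64_t)t;
	g2 = h2 + (uint64_t)(t >> 64);
	if (g2 >> 2) {			/* the assembly uses cmovnz here */
		h0 = g0;
		h1 = g1;
	}

	/* tag = (h + nonce) mod 2^128, written little-endian */
	t = (u128)h0 + n0;                  h0 = (uint64_t)t;
	h1 = h1 + n1 + (uint64_t)(t >> 64);
	for (i = 0; i < 8; i++) {
		mac[i]     = (uint8_t)(h0 >> (8 * i));
		mac[8 + i] = (uint8_t)(h1 >> (8 * i));
	}
}

int main(void)
{
	uint8_t mac[16];
	int i;

	/* h = 4*2^128 - 1 = (2^130 - 5) + 4, so the tag must be 4 */
	emit_sketch(~0ull, ~0ull, 3, 0, 0, mac);
	assert(mac[0] == 4);
	for (i = 1; i < 16; i++)
		assert(mac[i] == 0);
	printf("emit sketch OK\n");
	return 0;
}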
+.cfi_startproc +___ +$code.=<<___ if (!$kernel); + mov OPENSSL_ia32cap_P+8(%rip),%r9d +___ +$code.=<<___; + vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 + vmovd 4*1($ctx),%x#$H1 + vmovd 4*2($ctx),%x#$H2 + vmovd 4*3($ctx),%x#$H3 + vmovd 4*4($ctx),%x#$H4 + +.Ldo_avx2$suffix: +___ +$code.=<<___ if (!$kernel && $avx>2); + cmp \$512,$len + jb .Lskip_avx512 + and %r11d,%r9d + test \$`1<<16`,%r9d # check for AVX512F + jnz .Lblocks_avx512 +.Lskip_avx512$suffix: +___ +$code.=<<___ if ($avx > 2 && $avx512 && $kernel); + cmp \$512,$len + jae .Lblocks_avx512 +___ +$code.=<<___ if (!$win64); + lea 8(%rsp),%r10 +.cfi_def_cfa_register %r10 + sub \$0x128,%rsp +___ +$code.=<<___ if ($win64); + lea 8(%rsp),%r10 + sub \$0x1c8,%rsp + vmovdqa %xmm6,-0xb0(%r10) + vmovdqa %xmm7,-0xa0(%r10) + vmovdqa %xmm8,-0x90(%r10) + vmovdqa %xmm9,-0x80(%r10) + vmovdqa %xmm10,-0x70(%r10) + vmovdqa %xmm11,-0x60(%r10) + vmovdqa %xmm12,-0x50(%r10) + vmovdqa %xmm13,-0x40(%r10) + vmovdqa %xmm14,-0x30(%r10) + vmovdqa %xmm15,-0x20(%r10) +.Ldo_avx2_body$suffix: +___ +$code.=<<___; + lea .Lconst(%rip),%rcx + lea 48+64($ctx),$ctx # size optimization + vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 + + # expand and copy pre-calculated table to stack + vmovdqu `16*0-64`($ctx),%x#$T2 + and \$-512,%rsp + vmovdqu `16*1-64`($ctx),%x#$T3 + vmovdqu `16*2-64`($ctx),%x#$T4 + vmovdqu `16*3-64`($ctx),%x#$D0 + vmovdqu `16*4-64`($ctx),%x#$D1 + vmovdqu `16*5-64`($ctx),%x#$D2 + lea 0x90(%rsp),%rax # size optimization + vmovdqu `16*6-64`($ctx),%x#$D3 + vpermd $T2,$T0,$T2 # 00003412 -> 14243444 + vmovdqu `16*7-64`($ctx),%x#$D4 + vpermd $T3,$T0,$T3 + vmovdqu `16*8-64`($ctx),%x#$MASK + vpermd $T4,$T0,$T4 + vmovdqa $T2,0x00(%rsp) + vpermd $D0,$T0,$D0 + vmovdqa $T3,0x20-0x90(%rax) + vpermd $D1,$T0,$D1 + vmovdqa $T4,0x40-0x90(%rax) + vpermd $D2,$T0,$D2 + vmovdqa $D0,0x60-0x90(%rax) + vpermd $D3,$T0,$D3 + vmovdqa $D1,0x80-0x90(%rax) + vpermd $D4,$T0,$D4 + vmovdqa $D2,0xa0-0x90(%rax) + vpermd $MASK,$T0,$MASK + vmovdqa $D3,0xc0-0x90(%rax) + vmovdqa $D4,0xe0-0x90(%rax) + vmovdqa $MASK,0x100-0x90(%rax) + vmovdqa 64(%rcx),$MASK # .Lmask26 + + ################################################################ + # load input + vmovdqu 16*0($inp),%x#$T0 + vmovdqu 16*1($inp),%x#$T1 + vinserti128 \$1,16*2($inp),$T0,$T0 + vinserti128 \$1,16*3($inp),$T1,$T1 + lea 16*4($inp),$inp + + vpsrldq \$6,$T0,$T2 # splat input + vpsrldq \$6,$T1,$T3 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpunpcklqdq $T3,$T2,$T2 # 2:3 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + + vpsrlq \$30,$T2,$T3 + vpsrlq \$4,$T2,$T2 + vpsrlq \$26,$T0,$T1 + vpsrlq \$40,$T4,$T4 # 4 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T0,$T0 # 0 + vpand $MASK,$T1,$T1 # 1 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + vpaddq $H2,$T2,$H2 # accumulate input + sub \$64,$len + jz .Ltail_avx2$suffix + jmp .Loop_avx2$suffix + +.align 32 +.Loop_avx2$suffix: + ################################################################ + # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 + # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 + # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 + # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 + # \________/\__________/ + ################################################################ + #vpaddq $H2,$T2,$H2 # accumulate input + vpaddq $H0,$T0,$H0 + vmovdqa `32*0`(%rsp),$T0 # r0^4 + vpaddq $H1,$T1,$H1 + vmovdqa `32*1`(%rsp),$T1 # r1^4 + vpaddq $H3,$T3,$H3 + vmovdqa `32*3`(%rsp),$T2 # r2^4 + vpaddq $H4,$T4,$H4 + vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 + vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 + + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # 
d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + # + # however, as h2 is "chronologically" first one available pull + # corresponding operations up, so it's + # + # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 + # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 + + vpmuludq $H2,$T0,$D2 # d2 = h2*r0 + vpmuludq $H2,$T1,$D3 # d3 = h2*r1 + vpmuludq $H2,$T2,$D4 # d4 = h2*r2 + vpmuludq $H2,$T3,$D0 # d0 = h2*s3 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + + vpmuludq $H0,$T1,$T4 # h0*r1 + vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp + vpaddq $T4,$D1,$D1 # d1 += h0*r1 + vpaddq $H2,$D2,$D2 # d2 += h1*r1 + vpmuludq $H3,$T1,$T4 # h3*r1 + vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 + vpaddq $T4,$D4,$D4 # d4 += h3*r1 + vpaddq $H2,$D0,$D0 # d0 += h4*s1 + vmovdqa `32*4-0x90`(%rax),$T1 # s2 + + vpmuludq $H0,$T0,$T4 # h0*r0 + vpmuludq $H1,$T0,$H2 # h1*r0 + vpaddq $T4,$D0,$D0 # d0 += h0*r0 + vpaddq $H2,$D1,$D1 # d1 += h1*r0 + vpmuludq $H3,$T0,$T4 # h3*r0 + vpmuludq $H4,$T0,$H2 # h4*r0 + vmovdqu 16*0($inp),%x#$T0 # load input + vpaddq $T4,$D3,$D3 # d3 += h3*r0 + vpaddq $H2,$D4,$D4 # d4 += h4*r0 + vinserti128 \$1,16*2($inp),$T0,$T0 + + vpmuludq $H3,$T1,$T4 # h3*s2 + vpmuludq $H4,$T1,$H2 # h4*s2 + vmovdqu 16*1($inp),%x#$T1 + vpaddq $T4,$D0,$D0 # d0 += h3*s2 + vpaddq $H2,$D1,$D1 # d1 += h4*s2 + vmovdqa `32*5-0x90`(%rax),$H2 # r3 + vpmuludq $H1,$T2,$T4 # h1*r2 + vpmuludq $H0,$T2,$T2 # h0*r2 + vpaddq $T4,$D3,$D3 # d3 += h1*r2 + vpaddq $T2,$D2,$D2 # d2 += h0*r2 + vinserti128 \$1,16*3($inp),$T1,$T1 + lea 16*4($inp),$inp + + vpmuludq $H1,$H2,$T4 # h1*r3 + vpmuludq $H0,$H2,$H2 # h0*r3 + vpsrldq \$6,$T0,$T2 # splat input + vpaddq $T4,$D4,$D4 # d4 += h1*r3 + vpaddq $H2,$D3,$D3 # d3 += h0*r3 + vpmuludq $H3,$T3,$T4 # h3*s3 + vpmuludq $H4,$T3,$H2 # h4*s3 + vpsrldq \$6,$T1,$T3 + vpaddq $T4,$D1,$D1 # d1 += h3*s3 + vpaddq $H2,$D2,$D2 # d2 += h4*s3 + vpunpckhqdq $T1,$T0,$T4 # 4 + + vpmuludq $H3,$S4,$H3 # h3*s4 + vpmuludq $H4,$S4,$H4 # h4*s4 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 + vpunpcklqdq $T3,$T2,$T3 # 2:3 + vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 + vpmuludq $H1,$S4,$H0 # h1*s4 + vmovdqa 64(%rcx),$MASK # .Lmask26 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 + + ################################################################ + # lazy reduction (interleaved with tail of input splat) + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$D1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D4 + vpand $MASK,$H4,$H4 + + vpsrlq \$4,$T3,$T2 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpand $MASK,$T2,$T2 # 2 + vpsrlq \$26,$T0,$T1 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpaddq $T2,$H2,$H2 # modulo-scheduled + vpsrlq \$30,$T3,$T3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$40,$T4,$T4 # 4 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpand $MASK,$T0,$T0 # 0 + vpand $MASK,$T1,$T1 # 1 + vpand $MASK,$T3,$T3 # 3 + vpor 
32(%rcx),$T4,$T4 # padbit, yes, always + + sub \$64,$len + jnz .Loop_avx2$suffix + + .byte 0x66,0x90 +.Ltail_avx2$suffix: + ################################################################ + # while above multiplications were by r^4 in all lanes, in last + # iteration we multiply least significant lane by r^4 and most + # significant one by r, so copy of above except that references + # to the precomputed table are displaced by 4... + + #vpaddq $H2,$T2,$H2 # accumulate input + vpaddq $H0,$T0,$H0 + vmovdqu `32*0+4`(%rsp),$T0 # r0^4 + vpaddq $H1,$T1,$H1 + vmovdqu `32*1+4`(%rsp),$T1 # r1^4 + vpaddq $H3,$T3,$H3 + vmovdqu `32*3+4`(%rsp),$T2 # r2^4 + vpaddq $H4,$T4,$H4 + vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 + vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 + + vpmuludq $H2,$T0,$D2 # d2 = h2*r0 + vpmuludq $H2,$T1,$D3 # d3 = h2*r1 + vpmuludq $H2,$T2,$D4 # d4 = h2*r2 + vpmuludq $H2,$T3,$D0 # d0 = h2*s3 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + + vpmuludq $H0,$T1,$T4 # h0*r1 + vpmuludq $H1,$T1,$H2 # h1*r1 + vpaddq $T4,$D1,$D1 # d1 += h0*r1 + vpaddq $H2,$D2,$D2 # d2 += h1*r1 + vpmuludq $H3,$T1,$T4 # h3*r1 + vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 + vpaddq $T4,$D4,$D4 # d4 += h3*r1 + vpaddq $H2,$D0,$D0 # d0 += h4*s1 + + vpmuludq $H0,$T0,$T4 # h0*r0 + vpmuludq $H1,$T0,$H2 # h1*r0 + vpaddq $T4,$D0,$D0 # d0 += h0*r0 + vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 + vpaddq $H2,$D1,$D1 # d1 += h1*r0 + vpmuludq $H3,$T0,$T4 # h3*r0 + vpmuludq $H4,$T0,$H2 # h4*r0 + vpaddq $T4,$D3,$D3 # d3 += h3*r0 + vpaddq $H2,$D4,$D4 # d4 += h4*r0 + + vpmuludq $H3,$T1,$T4 # h3*s2 + vpmuludq $H4,$T1,$H2 # h4*s2 + vpaddq $T4,$D0,$D0 # d0 += h3*s2 + vpaddq $H2,$D1,$D1 # d1 += h4*s2 + vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 + vpmuludq $H1,$T2,$T4 # h1*r2 + vpmuludq $H0,$T2,$T2 # h0*r2 + vpaddq $T4,$D3,$D3 # d3 += h1*r2 + vpaddq $T2,$D2,$D2 # d2 += h0*r2 + + vpmuludq $H1,$H2,$T4 # h1*r3 + vpmuludq $H0,$H2,$H2 # h0*r3 + vpaddq $T4,$D4,$D4 # d4 += h1*r3 + vpaddq $H2,$D3,$D3 # d3 += h0*r3 + vpmuludq $H3,$T3,$T4 # h3*s3 + vpmuludq $H4,$T3,$H2 # h4*s3 + vpaddq $T4,$D1,$D1 # d1 += h3*s3 + vpaddq $H2,$D2,$D2 # d2 += h4*s3 + + vpmuludq $H3,$S4,$H3 # h3*s4 + vpmuludq $H4,$S4,$H4 # h4*s4 + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 + vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 + vpmuludq $H1,$S4,$H0 # h1*s4 + vmovdqa 64(%rcx),$MASK # .Lmask26 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 + + ################################################################ + # horizontal addition + + vpsrldq \$8,$D1,$T1 + vpsrldq \$8,$H2,$T2 + vpsrldq \$8,$H3,$T3 + vpsrldq \$8,$H4,$T4 + vpsrldq \$8,$H0,$T0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$H2,$H2 + vpaddq $T3,$H3,$H3 + vpaddq $T4,$H4,$H4 + vpaddq $T0,$H0,$H0 + + vpermq \$0x2,$H3,$T3 + vpermq \$0x2,$H4,$T4 + vpermq \$0x2,$H0,$T0 + vpermq \$0x2,$D1,$T1 + vpermq \$0x2,$H2,$T2 + vpaddq $T3,$H3,$H3 + vpaddq $T4,$H4,$H4 + vpaddq $T0,$H0,$H0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$H2,$H2 + + ################################################################ + # lazy reduction + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$D1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D4 + vpand $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 
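The tail comment above is the heart of the vectorisation: the main loop multiplies every lane by r^4 (r^8 on AVX-512), and only the final iteration switches to the per-lane powers r^4, r^3, r^2, r so that the subsequent horizontal addition equals the serial Horner evaluation. A standalone C check of that identity with a toy 31-bit prime standing in for 2^130 - 5 (block padding and real limb arithmetic omitted; illustrative only):

/*
 * Check of the interleaving identity behind the 4-lane tail: evaluating the
 * polynomial serially (Horner) gives the same result as accumulating four
 * lanes with r^4 per step and finishing the last step with r^4, r^3, r^2, r
 * before summing the lanes. Toy modulus, no block padding.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define P 2147483647ull			/* 2^31 - 1, stand-in for 2^130 - 5 */

static uint64_t mulmod(uint64_t a, uint64_t b)
{
	return (a % P) * (b % P) % P;
}

int main(void)
{
	uint64_t m[8] = { 11, 22, 33, 44, 55, 66, 77, 88 };	/* 8 "blocks" */
	uint64_t r = 123456789;
	uint64_t r2 = mulmod(r, r), r3 = mulmod(r2, r), r4 = mulmod(r2, r2);
	uint64_t seq = 0, par = 0, lane[4] = { 0, 0, 0, 0 };
	uint64_t pw[4] = { r4, r3, r2, r };
	int i, j;

	/* serial Horner: every block is accumulated and multiplied by r */
	for (i = 0; i < 8; i++)
		seq = mulmod(seq + m[i], r);

	/* main vector iteration: all four lanes advance by r^4 */
	for (j = 0; j < 4; j++)
		lane[j] = mulmod(lane[j] + m[j], r4);

	/* tail iteration: lane j advances by r^(4-j), then horizontal add */
	for (j = 0; j < 4; j++)
		par = (par + mulmod(lane[j] + m[4 + j], pw[j])) % P;

	assert(par == seq);
	printf("4-lane tail identity OK: %llu\n", (unsigned long long)par);
	return 0;
}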
+ + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced + vmovd %x#$H1,`4*1-48-64`($ctx) + vmovd %x#$H2,`4*2-48-64`($ctx) + vmovd %x#$H3,`4*3-48-64`($ctx) + vmovd %x#$H4,`4*4-48-64`($ctx) +___ +$code.=<<___ if ($win64); + vmovdqa -0xb0(%r10),%xmm6 + vmovdqa -0xa0(%r10),%xmm7 + vmovdqa -0x90(%r10),%xmm8 + vmovdqa -0x80(%r10),%xmm9 + vmovdqa -0x70(%r10),%xmm10 + vmovdqa -0x60(%r10),%xmm11 + vmovdqa -0x50(%r10),%xmm12 + vmovdqa -0x40(%r10),%xmm13 + vmovdqa -0x30(%r10),%xmm14 + vmovdqa -0x20(%r10),%xmm15 + lea -8(%r10),%rsp +.Ldo_avx2_epilogue$suffix: +___ +$code.=<<___ if (!$win64); + lea -8(%r10),%rsp +.cfi_def_cfa_register %rsp +___ +$code.=<<___; + vzeroupper + RET +.cfi_endproc +___ +if($avx > 2 && $avx512) { +my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); +my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); +my $PADBIT="%zmm30"; + +map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain +map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); +map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); +map(s/%y/%z/,($MASK)); + +$code.=<<___; +.cfi_startproc +.Lblocks_avx512: + mov \$15,%eax + kmovw %eax,%k2 +___ +$code.=<<___ if (!$win64); + lea 8(%rsp),%r10 +.cfi_def_cfa_register %r10 + sub \$0x128,%rsp +___ +$code.=<<___ if ($win64); + lea 8(%rsp),%r10 + sub \$0x1c8,%rsp + vmovdqa %xmm6,-0xb0(%r10) + vmovdqa %xmm7,-0xa0(%r10) + vmovdqa %xmm8,-0x90(%r10) + vmovdqa %xmm9,-0x80(%r10) + vmovdqa %xmm10,-0x70(%r10) + vmovdqa %xmm11,-0x60(%r10) + vmovdqa %xmm12,-0x50(%r10) + vmovdqa %xmm13,-0x40(%r10) + vmovdqa %xmm14,-0x30(%r10) + vmovdqa %xmm15,-0x20(%r10) +.Ldo_avx512_body: +___ +$code.=<<___; + lea .Lconst(%rip),%rcx + lea 48+64($ctx),$ctx # size optimization + vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 + + # expand pre-calculated table + vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} + and \$-512,%rsp + vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} + mov \$0x20,%rax + vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} + vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} + vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} + vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} + vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} + vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} + vmovdqu `16*8-64`($ctx),%x#$T4 # ... 
${S4} + vpermd $D0,$T2,$R0 # 00003412 -> 14243444 + vpbroadcastq 64(%rcx),$MASK # .Lmask26 + vpermd $D1,$T2,$R1 + vpermd $T0,$T2,$S1 + vpermd $D2,$T2,$R2 + vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 + vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 + vpermd $T1,$T2,$S2 + vmovdqu64 $R1,0x00(%rsp,%rax){%k2} + vpsrlq \$32,$R1,$T1 + vpermd $D3,$T2,$R3 + vmovdqa64 $S1,0x40(%rsp){%k2} + vpermd $T3,$T2,$S3 + vpermd $D4,$T2,$R4 + vmovdqu64 $R2,0x40(%rsp,%rax){%k2} + vpermd $T4,$T2,$S4 + vmovdqa64 $S2,0x80(%rsp){%k2} + vmovdqu64 $R3,0x80(%rsp,%rax){%k2} + vmovdqa64 $S3,0xc0(%rsp){%k2} + vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} + vmovdqa64 $S4,0x100(%rsp){%k2} + + ################################################################ + # calculate 5th through 8th powers of the key + # + # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 + # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 + # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 + # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 + # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 + + vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 + vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 + vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 + vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 + vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 + vpsrlq \$32,$R2,$T2 + + vpmuludq $T1,$S4,$M0 + vpmuludq $T1,$R0,$M1 + vpmuludq $T1,$R1,$M2 + vpmuludq $T1,$R2,$M3 + vpmuludq $T1,$R3,$M4 + vpsrlq \$32,$R3,$T3 + vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 + vpaddq $M1,$D1,$D1 # d1 += r1'*r0 + vpaddq $M2,$D2,$D2 # d2 += r1'*r1 + vpaddq $M3,$D3,$D3 # d3 += r1'*r2 + vpaddq $M4,$D4,$D4 # d4 += r1'*r3 + + vpmuludq $T2,$S3,$M0 + vpmuludq $T2,$S4,$M1 + vpmuludq $T2,$R1,$M3 + vpmuludq $T2,$R2,$M4 + vpmuludq $T2,$R0,$M2 + vpsrlq \$32,$R4,$T4 + vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 + vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 + vpaddq $M3,$D3,$D3 # d3 += r2'*r1 + vpaddq $M4,$D4,$D4 # d4 += r2'*r2 + vpaddq $M2,$D2,$D2 # d2 += r2'*r0 + + vpmuludq $T3,$S2,$M0 + vpmuludq $T3,$R0,$M3 + vpmuludq $T3,$R1,$M4 + vpmuludq $T3,$S3,$M1 + vpmuludq $T3,$S4,$M2 + vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 + vpaddq $M3,$D3,$D3 # d3 += r3'*r0 + vpaddq $M4,$D4,$D4 # d4 += r3'*r1 + vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 + vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 + + vpmuludq $T4,$S4,$M3 + vpmuludq $T4,$R0,$M4 + vpmuludq $T4,$S1,$M0 + vpmuludq $T4,$S2,$M1 + vpmuludq $T4,$S3,$M2 + vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4 + vpaddq $M4,$D4,$D4 # d4 += r2'*r0 + vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1 + vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2 + vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3 + + ################################################################ + # load input + vmovdqu64 16*0($inp),%z#$T3 + vmovdqu64 16*4($inp),%z#$T4 + lea 16*8($inp),$inp + + ################################################################ + # lazy reduction + + vpsrlq \$26,$D3,$M3 + vpandq $MASK,$D3,$D3 + vpaddq $M3,$D4,$D4 # d3 -> d4 + + vpsrlq \$26,$D0,$M0 + vpandq $MASK,$D0,$D0 + vpaddq $M0,$D1,$D1 # d0 -> d1 + + vpsrlq \$26,$D4,$M4 + vpandq $MASK,$D4,$D4 + + vpsrlq \$26,$D1,$M1 + vpandq $MASK,$D1,$D1 + vpaddq $M1,$D2,$D2 # d1 -> d2 + + vpaddq $M4,$D0,$D0 + vpsllq \$2,$M4,$M4 + vpaddq $M4,$D0,$D0 # d4 -> d0 + + vpsrlq \$26,$D2,$M2 + vpandq $MASK,$D2,$D2 + vpaddq $M2,$D3,$D3 # d2 -> d3 + + vpsrlq \$26,$D0,$M0 + vpandq $MASK,$D0,$D0 + vpaddq $M0,$D1,$D1 # d0 -> d1 + + vpsrlq \$26,$D3,$M3 + vpandq $MASK,$D3,$D3 + vpaddq $M3,$D4,$D4 # d3 -> d4 + + ################################################################ + # at this point we have 14243444 in $R0-$S4 and 05060708 in + # $D0-$D4, ... 
+ + vpunpcklqdq $T4,$T3,$T0 # transpose input + vpunpckhqdq $T4,$T3,$T4 + + # ... since input 64-bit lanes are ordered as 73625140, we could + # "vperm" it to 76543210 (here and in each loop iteration), *or* + # we could just flow along, hence the goal for $R0-$S4 is + # 1858286838784888 ... + + vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: + mov \$0x7777,%eax + kmovw %eax,%k1 + + vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- + vpermd $R1,$M0,$R1 + vpermd $R2,$M0,$R2 + vpermd $R3,$M0,$R3 + vpermd $R4,$M0,$R4 + + vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 + vpermd $D1,$M0,${R1}{%k1} + vpermd $D2,$M0,${R2}{%k1} + vpermd $D3,$M0,${R3}{%k1} + vpermd $D4,$M0,${R4}{%k1} + + vpslld \$2,$R1,$S1 # *5 + vpslld \$2,$R2,$S2 + vpslld \$2,$R3,$S3 + vpslld \$2,$R4,$S4 + vpaddd $R1,$S1,$S1 + vpaddd $R2,$S2,$S2 + vpaddd $R3,$S3,$S3 + vpaddd $R4,$S4,$S4 + + vpbroadcastq 32(%rcx),$PADBIT # .L129 + + vpsrlq \$52,$T0,$T2 # splat input + vpsllq \$12,$T4,$T3 + vporq $T3,$T2,$T2 + vpsrlq \$26,$T0,$T1 + vpsrlq \$14,$T4,$T3 + vpsrlq \$40,$T4,$T4 # 4 + vpandq $MASK,$T2,$T2 # 2 + vpandq $MASK,$T0,$T0 # 0 + #vpandq $MASK,$T1,$T1 # 1 + #vpandq $MASK,$T3,$T3 # 3 + #vporq $PADBIT,$T4,$T4 # padbit, yes, always + + vpaddq $H2,$T2,$H2 # accumulate input + sub \$192,$len + jbe .Ltail_avx512 + jmp .Loop_avx512 + +.align 32 +.Loop_avx512: + ################################################################ + # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 + # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 + # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 + # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 + # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 + # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 + # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 + # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 + # \________/\___________/ + ################################################################ + #vpaddq $H2,$T2,$H2 # accumulate input + + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + # + # however, as h2 is "chronologically" first one available pull + # corresponding operations up, so it's + # + # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 + # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 + # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 + # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 + # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 + + vpmuludq $H2,$R1,$D3 # d3 = h2*r1 + vpaddq $H0,$T0,$H0 + vpmuludq $H2,$R2,$D4 # d4 = h2*r2 + vpandq $MASK,$T1,$T1 # 1 + vpmuludq $H2,$S3,$D0 # d0 = h2*s3 + vpandq $MASK,$T3,$T3 # 3 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + vporq $PADBIT,$T4,$T4 # padbit, yes, always + vpmuludq $H2,$R0,$D2 # d2 = h2*r0 + vpaddq $H1,$T1,$H1 # accumulate input + vpaddq $H3,$T3,$H3 + vpaddq $H4,$T4,$H4 + + vmovdqu64 16*0($inp),$T3 # load input + vmovdqu64 16*4($inp),$T4 + lea 16*8($inp),$inp + vpmuludq $H0,$R3,$M3 + vpmuludq $H0,$R4,$M4 + vpmuludq $H0,$R0,$M0 + vpmuludq $H0,$R1,$M1 + vpaddq $M3,$D3,$D3 # d3 += h0*r3 + vpaddq $M4,$D4,$D4 # d4 += h0*r4 + vpaddq $M0,$D0,$D0 # d0 += h0*r0 + vpaddq $M1,$D1,$D1 # d1 += h0*r1 + + vpmuludq $H1,$R2,$M3 + vpmuludq $H1,$R3,$M4 + vpmuludq $H1,$S4,$M0 + vpmuludq $H0,$R2,$M2 + vpaddq $M3,$D3,$D3 # d3 += h1*r2 + vpaddq $M4,$D4,$D4 # d4 += h1*r3 + vpaddq $M0,$D0,$D0 # d0 += h1*s4 + vpaddq $M2,$D2,$D2 # d2 += h0*r2 + + vpunpcklqdq $T4,$T3,$T0 # transpose input + vpunpckhqdq $T4,$T3,$T4 + + vpmuludq $H3,$R0,$M3 + 
vpmuludq $H3,$R1,$M4 + vpmuludq $H1,$R0,$M1 + vpmuludq $H1,$R1,$M2 + vpaddq $M3,$D3,$D3 # d3 += h3*r0 + vpaddq $M4,$D4,$D4 # d4 += h3*r1 + vpaddq $M1,$D1,$D1 # d1 += h1*r0 + vpaddq $M2,$D2,$D2 # d2 += h1*r1 + + vpmuludq $H4,$S4,$M3 + vpmuludq $H4,$R0,$M4 + vpmuludq $H3,$S2,$M0 + vpmuludq $H3,$S3,$M1 + vpaddq $M3,$D3,$D3 # d3 += h4*s4 + vpmuludq $H3,$S4,$M2 + vpaddq $M4,$D4,$D4 # d4 += h4*r0 + vpaddq $M0,$D0,$D0 # d0 += h3*s2 + vpaddq $M1,$D1,$D1 # d1 += h3*s3 + vpaddq $M2,$D2,$D2 # d2 += h3*s4 + + vpmuludq $H4,$S1,$M0 + vpmuludq $H4,$S2,$M1 + vpmuludq $H4,$S3,$M2 + vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 + vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 + vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 + + ################################################################ + # lazy reduction (interleaved with input splat) + + vpsrlq \$52,$T0,$T2 # splat input + vpsllq \$12,$T4,$T3 + + vpsrlq \$26,$D3,$H3 + vpandq $MASK,$D3,$D3 + vpaddq $H3,$D4,$H4 # h3 -> h4 + + vporq $T3,$T2,$T2 + + vpsrlq \$26,$H0,$D0 + vpandq $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpandq $MASK,$T2,$T2 # 2 + + vpsrlq \$26,$H4,$D4 + vpandq $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpandq $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpaddq $T2,$H2,$H2 # modulo-scheduled + vpsrlq \$26,$T0,$T1 + + vpsrlq \$26,$H2,$D2 + vpandq $MASK,$H2,$H2 + vpaddq $D2,$D3,$H3 # h2 -> h3 + + vpsrlq \$14,$T4,$T3 + + vpsrlq \$26,$H0,$D0 + vpandq $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$40,$T4,$T4 # 4 + + vpsrlq \$26,$H3,$D3 + vpandq $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpandq $MASK,$T0,$T0 # 0 + #vpandq $MASK,$T1,$T1 # 1 + #vpandq $MASK,$T3,$T3 # 3 + #vporq $PADBIT,$T4,$T4 # padbit, yes, always + + sub \$128,$len + ja .Loop_avx512 + +.Ltail_avx512: + ################################################################ + # while above multiplications were by r^8 in all lanes, in last + # iteration we multiply least significant lane by r^8 and most + # significant one by r, that's why table gets shifted... 
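As a side note to the comment above (this is a sketch, not part of the patch): full 128-byte groups multiply every lane by r^8, while the final group must apply descending powers r^8, r^7, ..., r^1 so that the lane sums collapse back to the ordinary Horner evaluation; that is what the "shifted" power table achieves. A minimal scalar model of that algebra, shown as a two-way split with a small toy modulus in place of 2^130 - 5 so it fits in plain 64-bit C (all names here are hypothetical and chosen only for the sketch):

	#include <assert.h>
	#include <stdint.h>

	#define P 1000003ULL	/* toy modulus standing in for 2^130 - 5 */

	/* plain Horner: h = (h + m[i]) * r for every block */
	static uint64_t poly_sequential(const uint64_t *m, int n, uint64_t r)
	{
		uint64_t h = 0;

		for (int i = 0; i < n; i++)
			h = (h + m[i]) % P * r % P;
		return h;
	}

	/* two interleaved streams advanced by r^2; the last iteration uses the
	 * "shifted" powers r^2 and r^1 so the folded result matches Horner.
	 * n is assumed even and >= 2. */
	static uint64_t poly_2way(const uint64_t *m, int n, uint64_t r)
	{
		uint64_t r2 = r * r % P, h0 = 0, h1 = 0;
		int i;

		for (i = 0; i + 2 < n; i += 2) {
			h0 = (h0 + m[i]) % P * r2 % P;
			h1 = (h1 + m[i + 1]) % P * r2 % P;
		}
		h0 = (h0 + m[i]) % P * r2 % P;		/* low stream keeps r^2 */
		h1 = (h1 + m[i + 1]) % P * r % P;	/* high stream drops to r^1 */
		return (h0 + h1) % P;
	}

	int main(void)
	{
		const uint64_t m[8] = { 11, 22, 33, 44, 55, 66, 77, 88 };

		assert(poly_sequential(m, 8, 12345) == poly_2way(m, 8, 12345));
		return 0;
	}

The AVX-512 code below does the same thing eight ways: the table shift hands the tail lanes r^8 down to r^1 instead of r^8 everywhere.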
+ + vpsrlq \$32,$R0,$R0 # 0105020603070408 + vpsrlq \$32,$R1,$R1 + vpsrlq \$32,$R2,$R2 + vpsrlq \$32,$S3,$S3 + vpsrlq \$32,$S4,$S4 + vpsrlq \$32,$R3,$R3 + vpsrlq \$32,$R4,$R4 + vpsrlq \$32,$S1,$S1 + vpsrlq \$32,$S2,$S2 + + ################################################################ + # load either next or last 64 byte of input + lea ($inp,$len),$inp + + #vpaddq $H2,$T2,$H2 # accumulate input + vpaddq $H0,$T0,$H0 + + vpmuludq $H2,$R1,$D3 # d3 = h2*r1 + vpmuludq $H2,$R2,$D4 # d4 = h2*r2 + vpmuludq $H2,$S3,$D0 # d0 = h2*s3 + vpandq $MASK,$T1,$T1 # 1 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + vpandq $MASK,$T3,$T3 # 3 + vpmuludq $H2,$R0,$D2 # d2 = h2*r0 + vporq $PADBIT,$T4,$T4 # padbit, yes, always + vpaddq $H1,$T1,$H1 # accumulate input + vpaddq $H3,$T3,$H3 + vpaddq $H4,$T4,$H4 + + vmovdqu 16*0($inp),%x#$T0 + vpmuludq $H0,$R3,$M3 + vpmuludq $H0,$R4,$M4 + vpmuludq $H0,$R0,$M0 + vpmuludq $H0,$R1,$M1 + vpaddq $M3,$D3,$D3 # d3 += h0*r3 + vpaddq $M4,$D4,$D4 # d4 += h0*r4 + vpaddq $M0,$D0,$D0 # d0 += h0*r0 + vpaddq $M1,$D1,$D1 # d1 += h0*r1 + + vmovdqu 16*1($inp),%x#$T1 + vpmuludq $H1,$R2,$M3 + vpmuludq $H1,$R3,$M4 + vpmuludq $H1,$S4,$M0 + vpmuludq $H0,$R2,$M2 + vpaddq $M3,$D3,$D3 # d3 += h1*r2 + vpaddq $M4,$D4,$D4 # d4 += h1*r3 + vpaddq $M0,$D0,$D0 # d0 += h1*s4 + vpaddq $M2,$D2,$D2 # d2 += h0*r2 + + vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 + vpmuludq $H3,$R0,$M3 + vpmuludq $H3,$R1,$M4 + vpmuludq $H1,$R0,$M1 + vpmuludq $H1,$R1,$M2 + vpaddq $M3,$D3,$D3 # d3 += h3*r0 + vpaddq $M4,$D4,$D4 # d4 += h3*r1 + vpaddq $M1,$D1,$D1 # d1 += h1*r0 + vpaddq $M2,$D2,$D2 # d2 += h1*r1 + + vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 + vpmuludq $H4,$S4,$M3 + vpmuludq $H4,$R0,$M4 + vpmuludq $H3,$S2,$M0 + vpmuludq $H3,$S3,$M1 + vpmuludq $H3,$S4,$M2 + vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 + vpaddq $M4,$D4,$D4 # d4 += h4*r0 + vpaddq $M0,$D0,$D0 # d0 += h3*s2 + vpaddq $M1,$D1,$D1 # d1 += h3*s3 + vpaddq $M2,$D2,$D2 # d2 += h3*s4 + + vpmuludq $H4,$S1,$M0 + vpmuludq $H4,$S2,$M1 + vpmuludq $H4,$S3,$M2 + vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 + vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 + vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 + + ################################################################ + # horizontal addition + + mov \$1,%eax + vpermq \$0xb1,$H3,$D3 + vpermq \$0xb1,$D4,$H4 + vpermq \$0xb1,$H0,$D0 + vpermq \$0xb1,$H1,$D1 + vpermq \$0xb1,$H2,$D2 + vpaddq $D3,$H3,$H3 + vpaddq $D4,$H4,$H4 + vpaddq $D0,$H0,$H0 + vpaddq $D1,$H1,$H1 + vpaddq $D2,$H2,$H2 + + kmovw %eax,%k3 + vpermq \$0x2,$H3,$D3 + vpermq \$0x2,$H4,$D4 + vpermq \$0x2,$H0,$D0 + vpermq \$0x2,$H1,$D1 + vpermq \$0x2,$H2,$D2 + vpaddq $D3,$H3,$H3 + vpaddq $D4,$H4,$H4 + vpaddq $D0,$H0,$H0 + vpaddq $D1,$H1,$H1 + vpaddq $D2,$H2,$H2 + + vextracti64x4 \$0x1,$H3,%y#$D3 + vextracti64x4 \$0x1,$H4,%y#$D4 + vextracti64x4 \$0x1,$H0,%y#$D0 + vextracti64x4 \$0x1,$H1,%y#$D1 + vextracti64x4 \$0x1,$H2,%y#$D2 + vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case + vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 + vpaddq $D0,$H0,${H0}{%k3}{z} + vpaddq $D1,$H1,${H1}{%k3}{z} + vpaddq $D2,$H2,${H2}{%k3}{z} +___ +map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); +map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); +$code.=<<___; + ################################################################ + # lazy reduction (interleaved with input splat) + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpsrldq \$6,$T0,$T2 # splat input + vpsrldq \$6,$T1,$T3 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpunpcklqdq $T3,$T2,$T2 
# 2:3 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D4 + vpand $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpsrlq \$30,$T2,$T3 + vpsrlq \$4,$T2,$T2 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpsrlq \$26,$T0,$T1 + vpsrlq \$40,$T4,$T4 # 4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T0,$T0 # 0 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 + vpand $MASK,$T1,$T1 # 1 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + vpaddq $D3,$H4,$H4 # h3 -> h4 + + lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 + add \$64,$len + jnz .Ltail_avx2$suffix + + vpsubq $T2,$H2,$H2 # undo input accumulation + vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced + vmovd %x#$H1,`4*1-48-64`($ctx) + vmovd %x#$H2,`4*2-48-64`($ctx) + vmovd %x#$H3,`4*3-48-64`($ctx) + vmovd %x#$H4,`4*4-48-64`($ctx) + vzeroall +___ +$code.=<<___ if ($win64); + movdqa -0xb0(%r10),%xmm6 + movdqa -0xa0(%r10),%xmm7 + movdqa -0x90(%r10),%xmm8 + movdqa -0x80(%r10),%xmm9 + movdqa -0x70(%r10),%xmm10 + movdqa -0x60(%r10),%xmm11 + movdqa -0x50(%r10),%xmm12 + movdqa -0x40(%r10),%xmm13 + movdqa -0x30(%r10),%xmm14 + movdqa -0x20(%r10),%xmm15 + lea -8(%r10),%rsp +.Ldo_avx512_epilogue: +___ +$code.=<<___ if (!$win64); + lea -8(%r10),%rsp +.cfi_def_cfa_register %rsp +___ +$code.=<<___; + RET +.cfi_endproc +___ + +} + +} + +&declare_function("poly1305_blocks_avx2", 32, 4); +poly1305_blocks_avxN(0); +&end_function("poly1305_blocks_avx2"); + +####################################################################### +if ($avx>2) { +# On entry we have input length divisible by 64. But since inner loop +# processes 128 bytes per iteration, cases when length is not divisible +# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this +# reason stack layout is kept identical to poly1305_blocks_avx2. If not +# for this tail, we wouldn't have to even allocate stack frame... + +&declare_function("poly1305_blocks_avx512", 32, 4); +poly1305_blocks_avxN(1); +&end_function("poly1305_blocks_avx512"); + +if (!$kernel && $avx>3) { +######################################################################## +# VPMADD52 version using 2^44 radix. +# +# One can argue that base 2^52 would be more natural. Well, even though +# some operations would be more natural, one has to recognize couple of +# things. Base 2^52 doesn't provide advantage over base 2^44 if you look +# at amount of multiply-n-accumulate operations. Secondly, it makes it +# impossible to pre-compute multiples of 5 [referred to as s[]/sN in +# reference implementations], which means that more such operations +# would have to be performed in inner loop, which in turn makes critical +# path longer. In other words, even though base 2^44 reduction might +# look less elegant, overall critical path is actually shorter... + +######################################################################## +# Layout of opaque area is following. 
+# +# unsigned __int64 h[3]; # current hash value base 2^44 +# unsigned __int64 s[2]; # key value*20 base 2^44 +# unsigned __int64 r[3]; # key value base 2^44 +# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; +# # r^n positions reflect +# # placement in register, not +# # memory, R[3] is R[1]*20 + +$code.=<<___; +.type poly1305_init_base2_44,\@function,3 +.align 32 +poly1305_init_base2_44: + xor %eax,%eax + mov %rax,0($ctx) # initialize hash value + mov %rax,8($ctx) + mov %rax,16($ctx) + +.Linit_base2_44: + lea poly1305_blocks_vpmadd52(%rip),%r10 + lea poly1305_emit_base2_44(%rip),%r11 + + mov \$0x0ffffffc0fffffff,%rax + mov \$0x0ffffffc0ffffffc,%rcx + and 0($inp),%rax + mov \$0x00000fffffffffff,%r8 + and 8($inp),%rcx + mov \$0x00000fffffffffff,%r9 + and %rax,%r8 + shrd \$44,%rcx,%rax + mov %r8,40($ctx) # r0 + and %r9,%rax + shr \$24,%rcx + mov %rax,48($ctx) # r1 + lea (%rax,%rax,4),%rax # *5 + mov %rcx,56($ctx) # r2 + shl \$2,%rax # magic <<2 + lea (%rcx,%rcx,4),%rcx # *5 + shl \$2,%rcx # magic <<2 + mov %rax,24($ctx) # s1 + mov %rcx,32($ctx) # s2 + movq \$-1,64($ctx) # write impossible value +___ +$code.=<<___ if ($flavour !~ /elf32/); + mov %r10,0(%rdx) + mov %r11,8(%rdx) +___ +$code.=<<___ if ($flavour =~ /elf32/); + mov %r10d,0(%rdx) + mov %r11d,4(%rdx) +___ +$code.=<<___; + mov \$1,%eax + RET +.size poly1305_init_base2_44,.-poly1305_init_base2_44 +___ +{ +my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17)); +my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21)); +my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52,\@function,4 +.align 32 +poly1305_blocks_vpmadd52: + shr \$4,$len + jz .Lno_data_vpmadd52 # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + + # if powers of the key are not calculated yet, process up to 3 + # blocks with this single-block subroutine, otherwise ensure that + # length is divisible by 2 blocks and pass the rest down to next + # subroutine... + + mov \$3,%rax + mov \$1,%r10 + cmp \$4,$len # is input long + cmovae %r10,%rax + test %r8,%r8 # is power value impossible? + cmovns %r10,%rax + + and $len,%rax # is input of favourable length? 
+ jz .Lblocks_vpmadd52_4x + + sub %rax,$len + mov \$7,%r10d + mov \$1,%r11d + kmovw %r10d,%k7 + lea .L2_44_inp_permd(%rip),%r10 + kmovw %r11d,%k1 + + vmovq $padbit,%x#$PAD + vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd + vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift + vpermq \$0xcf,$PAD,$PAD + vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask + + vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value + vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys + vmovdqu64 32($ctx),${r1r0s2}{%k7}{z} + vmovdqu64 24($ctx),${r0s2s1}{%k7}{z} + + vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt + vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft + + jmp .Loop_vpmadd52 + +.align 32 +.Loop_vpmadd52: + vmovdqu32 0($inp),%x#$T0 # load input as ----3210 + lea 16($inp),$inp + + vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110 + vpsrlvq $inp_shift,$T0,$T0 + vpandq $reduc_mask,$T0,$T0 + vporq $PAD,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo # accumulate input + + vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value + vpermq \$0b01010101,$Dlo,${H1}{%k7}{z} + vpermq \$0b10101010,$Dlo,${H2}{%k7}{z} + + vpxord $Dlo,$Dlo,$Dlo + vpxord $Dhi,$Dhi,$Dhi + + vpmadd52luq $r2r1r0,$H0,$Dlo + vpmadd52huq $r2r1r0,$H0,$Dhi + + vpmadd52luq $r1r0s2,$H1,$Dlo + vpmadd52huq $r1r0s2,$H1,$Dhi + + vpmadd52luq $r0s2s1,$H2,$Dlo + vpmadd52huq $r0s2s1,$H2,$Dhi + + vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword + vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword + vpandq $reduc_mask,$Dlo,$Dlo + + vpaddq $T0,$Dhi,$Dhi + + vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword + + vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-) + + vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word + vpandq $reduc_mask,$Dlo,$Dlo + + vpermq \$0b10010011,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo + + vpermq \$0b10010011,$Dlo,${T0}{%k1}{z} + + vpaddq $T0,$Dlo,$Dlo + vpsllq \$2,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo + + dec %rax # len-=16 + jnz .Loop_vpmadd52 + + vmovdqu64 $Dlo,0($ctx){%k7} # store hash value + + test $len,$len + jnz .Lblocks_vpmadd52_4x + +.Lno_data_vpmadd52: + RET +.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 +___ +} +{ +######################################################################## +# As implied by its name 4x subroutine processes 4 blocks in parallel +# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power +# and is handled in 256-bit %ymm registers. + +my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); +my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); +my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52_4x,\@function,4 +.align 32 +poly1305_blocks_vpmadd52_4x: + shr \$4,$len + jz .Lno_data_vpmadd52_4x # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + +.Lblocks_vpmadd52_4x: + vpbroadcastq $padbit,$PAD + + vmovdqa64 .Lx_mask44(%rip),$mask44 + mov \$5,%eax + vmovdqa64 .Lx_mask42(%rip),$mask42 + kmovw %eax,%k1 # used in 2x path + + test %r8,%r8 # is power value impossible? + js .Linit_vpmadd52 # if it is, then init R[4] + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + + test \$3,$len # is length 4*n+2? + jnz .Lblocks_vpmadd52_2x_do + +.Lblocks_vpmadd52_4x_do: + vpbroadcastq 64($ctx),$R0 # load 4th power of the key + vpbroadcastq 96($ctx),$R1 + vpbroadcastq 128($ctx),$R2 + vpbroadcastq 160($ctx),$S1 + +.Lblocks_vpmadd52_4x_key_loaded: + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + test \$7,$len # is len 8*n? 
+ jz .Lblocks_vpmadd52_8x + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*2($inp),$T3 + lea 16*4($inp),$inp + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as 3-1-2-0 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + sub \$4,$len + jz .Ltail_vpmadd52_4x + jmp .Loop_vpmadd52_4x + ud2 + +.align 32 +.Linit_vpmadd52: + vmovq 24($ctx),%x#$S1 # load key + vmovq 56($ctx),%x#$H2 + vmovq 32($ctx),%x#$S2 + vmovq 40($ctx),%x#$R0 + vmovq 48($ctx),%x#$R1 + + vmovdqa $R0,$H0 + vmovdqa $R1,$H1 + vmovdqa $H2,$R2 + + mov \$2,%eax + +.Lmul_init_vpmadd52: + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + dec %eax + jz .Ldone_init_vpmadd52 + + vpunpcklqdq $R1,$H1,$R1 # 1,2 + vpbroadcastq %x#$H1,%x#$H1 # 2,2 + vpunpcklqdq $R2,$H2,$R2 + vpbroadcastq %x#$H2,%x#$H2 + vpunpcklqdq $R0,$H0,$R0 + vpbroadcastq %x#$H0,%x#$H0 + + vpsllq \$2,$R1,$S1 # S1 = R1*5*4 + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R1,$S1,$S1 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S1,$S1 + vpsllq \$2,$S2,$S2 + + jmp .Lmul_init_vpmadd52 + ud2 + +.align 32 +.Ldone_init_vpmadd52: + vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 + vinserti128 \$1,%x#$R2,$H2,$R2 + vinserti128 \$1,%x#$R0,$H0,$R0 + + vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 + vpermq \$0b11011000,$R2,$R2 + vpermq \$0b11011000,$R0,$R0 + + vpsllq \$2,$R1,$S1 # S1 = R1*5*4 + vpaddq $R1,$S1,$S1 + vpsllq \$2,$S1,$S1 + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + + test \$3,$len # is length 4*n+2? 
+ jnz .Ldone_init_vpmadd52_2x + + vmovdqu64 $R0,64($ctx) # save key powers + vpbroadcastq %x#$R0,$R0 # broadcast 4th power + vmovdqu64 $R1,96($ctx) + vpbroadcastq %x#$R1,$R1 + vmovdqu64 $R2,128($ctx) + vpbroadcastq %x#$R2,$R2 + vmovdqu64 $S1,160($ctx) + vpbroadcastq %x#$S1,$S1 + + jmp .Lblocks_vpmadd52_4x_key_loaded + ud2 + +.align 32 +.Ldone_init_vpmadd52_2x: + vmovdqu64 $R0,64($ctx) # save key powers + vpsrldq \$8,$R0,$R0 # 0-1-0-2 + vmovdqu64 $R1,96($ctx) + vpsrldq \$8,$R1,$R1 + vmovdqu64 $R2,128($ctx) + vpsrldq \$8,$R2,$R2 + vmovdqu64 $S1,160($ctx) + vpsrldq \$8,$S1,$S1 + jmp .Lblocks_vpmadd52_2x_key_loaded + ud2 + +.align 32 +.Lblocks_vpmadd52_2x_do: + vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers + vmovdqu64 160+8($ctx),${S1}{%k1}{z} + vmovdqu64 64+8($ctx),${R0}{%k1}{z} + vmovdqu64 96+8($ctx),${R1}{%k1}{z} + +.Lblocks_vpmadd52_2x_key_loaded: + vmovdqu64 16*0($inp),$T2 # load data + vpxorq $T3,$T3,$T3 + lea 16*2($inp),$inp + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as x-1-x-0 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + jmp .Ltail_vpmadd52_2x + ud2 + +.align 32 +.Loop_vpmadd52_4x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*2($inp),$T3 + lea 16*4($inp),$inp + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction (interleaved with data splat) + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpsrlq \$24,$T3,$T2 + vporq $PAD,$T2,$T2 + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + sub \$4,$len # len-=64 + jnz .Loop_vpmadd52_4x + +.Ltail_vpmadd52_4x: + vmovdqu64 128($ctx),$R2 # load all key powers + vmovdqu64 160($ctx),$S1 + vmovdqu64 64($ctx),$R0 + vmovdqu64 96($ctx),$R1 + +.Ltail_vpmadd52_2x: + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + 
vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # horizontal addition + + mov \$1,%eax + kmovw %eax,%k1 + vpsrldq \$8,$D0lo,$T0 + vpsrldq \$8,$D0hi,$H0 + vpsrldq \$8,$D1lo,$T1 + vpsrldq \$8,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpsrldq \$8,$D2lo,$T2 + vpsrldq \$8,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vpermq \$0x2,$D0lo,$T0 + vpermq \$0x2,$D0hi,$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vpermq \$0x2,$D1lo,$T1 + vpermq \$0x2,$D1hi,$H1 + vpaddq $T0,$D0lo,${D0lo}{%k1}{z} + vpaddq $H0,$D0hi,${D0hi}{%k1}{z} + vpermq \$0x2,$D2lo,$T2 + vpermq \$0x2,$D2hi,$H2 + vpaddq $T1,$D1lo,${D1lo}{%k1}{z} + vpaddq $H1,$D1hi,${D1hi}{%k1}{z} + vpaddq $T2,$D2lo,${D2lo}{%k1}{z} + vpaddq $H2,$D2hi,${D2hi}{%k1}{z} + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + # at this point $len is + # either 4*n+2 or 0... + sub \$2,$len # len-=32 + ja .Lblocks_vpmadd52_4x_do + + vmovq %x#$H0,0($ctx) + vmovq %x#$H1,8($ctx) + vmovq %x#$H2,16($ctx) + vzeroall + +.Lno_data_vpmadd52_4x: + RET +.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x +___ +} +{ +######################################################################## +# As implied by its name 8x subroutine processes 8 blocks in parallel... +# This is intermediate version, as it's used only in cases when input +# length is either 8*n, 8*n+1 or 8*n+2... + +my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); +my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); +my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); +my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52_8x,\@function,4 +.align 32 +poly1305_blocks_vpmadd52_8x: + shr \$4,$len + jz .Lno_data_vpmadd52_8x # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + + vmovdqa64 .Lx_mask44(%rip),$mask44 + vmovdqa64 .Lx_mask42(%rip),$mask42 + + test %r8,%r8 # is power value impossible? 
+ js .Linit_vpmadd52 # if it is, then init R[4] + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + +.Lblocks_vpmadd52_8x: + ################################################################ + # fist we calculate more key powers + + vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers + vmovdqu64 160($ctx),$S1 + vmovdqu64 64($ctx),$R0 + vmovdqu64 96($ctx),$R1 + + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + vpbroadcastq %x#$R2,$RR2 # broadcast 4th power + vpbroadcastq %x#$R0,$RR0 + vpbroadcastq %x#$R1,$RR1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $RR2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $RR2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $RR2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $RR2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $RR2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $RR2,$R0,$D2hi + + vpmadd52luq $RR0,$R0,$D0lo + vpmadd52huq $RR0,$R0,$D0hi + vpmadd52luq $RR0,$R1,$D1lo + vpmadd52huq $RR0,$R1,$D1hi + vpmadd52luq $RR0,$R2,$D2lo + vpmadd52huq $RR0,$R2,$D2hi + + vpmadd52luq $RR1,$S2,$D0lo + vpmadd52huq $RR1,$S2,$D0hi + vpmadd52luq $RR1,$R0,$D1lo + vpmadd52huq $RR1,$R0,$D1hi + vpmadd52luq $RR1,$R1,$D2lo + vpmadd52huq $RR1,$R1,$D2hi + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$RR0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$RR1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$RR2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$RR0,$RR0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$RR0,$RR0 + + vpsrlq \$44,$RR0,$tmp # additional step + vpandq $mask44,$RR0,$RR0 + + vpaddq $tmp,$RR1,$RR1 + + ################################################################ + # At this point Rx holds 1324 powers, RRx - 5768, and the goal + # is 15263748, which reflects how data is loaded... 
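A brief aside on the "S2 = R2*5*4" precomputations that follow (again a sketch, not part of the patch): like the s[] = "key value*20" entries in the base 2^44 layout earlier, the factor 20 comes from how limb products wrap around the modulus in this radix. With 44/44/42-bit limbs the first out-of-range limb position is 2^132, and 2^132 mod (2^130 - 5) = 4*5 = 20, so the high cross terms fold back scaled by 20. A scaled-down model in C, using p = 2^10 - 5 with 4/4/2-bit limbs so the same wrap factor of 20 appears while everything stays in 32-bit arithmetic (hypothetical names, illustration only):

	#include <assert.h>
	#include <stdint.h>

	#define TOY_P ((1u << 10) - 5)		/* toy stand-in for 2^130 - 5 */

	/* limb-wise multiply mod TOY_P with 4/4/2-bit limbs; out-of-range
	 * terms are folded back through the precomputed 20*r values */
	static uint32_t mul_limbwise(uint32_t h, uint32_t r)
	{
		uint32_t h0 = h & 15, h1 = (h >> 4) & 15, h2 = h >> 8;
		uint32_t r0 = r & 15, r1 = (r >> 4) & 15, r2 = r >> 8;
		uint32_t s1 = 20 * r1, s2 = 20 * r2;	/* analogue of s[] = r*20 */

		uint32_t d0 = h0 * r0 + h1 * s2 + h2 * s1;	/* cf. H0*R0 + H1*S2 + H2*S1 */
		uint32_t d1 = h0 * r1 + h1 * r0 + h2 * s2;	/* cf. H0*R1 + H1*R0 + H2*S2 */
		uint32_t d2 = h0 * r2 + h1 * r1 + h2 * r0;	/* cf. H0*R2 + H1*R1 + H2*R0 */

		return (d0 + (d1 << 4) + (d2 << 8)) % TOY_P;
	}

	int main(void)
	{
		for (uint32_t h = 0; h < 1024; h++)
			for (uint32_t r = 0; r < 1024; r++)
				assert(mul_limbwise(h, r) == (h * r) % TOY_P);
		return 0;
	}

The d0/d1/d2 sums mirror the vpmadd52 accumulations in this file; the real code additionally splits each product into 52-bit low/high halves (vpmadd52luq/vpmadd52huq) and reduces them lazily rather than with a single final modulo.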
+ + vpunpcklqdq $R2,$RR2,$T2 # 3748 + vpunpckhqdq $R2,$RR2,$R2 # 1526 + vpunpcklqdq $R0,$RR0,$T0 + vpunpckhqdq $R0,$RR0,$R0 + vpunpcklqdq $R1,$RR1,$T1 + vpunpckhqdq $R1,$RR1,$R1 +___ +######## switch to %zmm +map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); +map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); +map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); +map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); + +$code.=<<___; + vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 + vshufi64x2 \$0x44,$R0,$T0,$RR0 + vshufi64x2 \$0x44,$R1,$T1,$RR1 + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*4($inp),$T3 + lea 16*8($inp),$inp + + vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 + vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 + vpaddq $RR2,$SS2,$SS2 + vpaddq $RR1,$SS1,$SS1 + vpsllq \$2,$SS2,$SS2 + vpsllq \$2,$SS1,$SS1 + + vpbroadcastq $padbit,$PAD + vpbroadcastq %x#$mask44,$mask44 + vpbroadcastq %x#$mask42,$mask42 + + vpbroadcastq %x#$SS1,$S1 # broadcast 8th power + vpbroadcastq %x#$SS2,$S2 + vpbroadcastq %x#$RR0,$R0 + vpbroadcastq %x#$RR1,$R1 + vpbroadcastq %x#$RR2,$R2 + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as 73625140 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + sub \$8,$len + jz .Ltail_vpmadd52_8x + jmp .Loop_vpmadd52_8x + +.align 32 +.Loop_vpmadd52_8x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*4($inp),$T3 + lea 16*8($inp),$inp + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction (interleaved with data splat) + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpsrlq \$24,$T3,$T2 + vporq $PAD,$T2,$T2 + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + sub \$8,$len # len-=128 + jnz .Loop_vpmadd52_8x + +.Ltail_vpmadd52_8x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$SS1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$SS1,$D0hi + vpxorq 
$D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$SS2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$SS2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$RR0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$RR0,$D2hi + + vpmadd52luq $H0,$RR0,$D0lo + vpmadd52huq $H0,$RR0,$D0hi + vpmadd52luq $H0,$RR1,$D1lo + vpmadd52huq $H0,$RR1,$D1hi + vpmadd52luq $H0,$RR2,$D2lo + vpmadd52huq $H0,$RR2,$D2hi + + vpmadd52luq $H1,$SS2,$D0lo + vpmadd52huq $H1,$SS2,$D0hi + vpmadd52luq $H1,$RR0,$D1lo + vpmadd52huq $H1,$RR0,$D1hi + vpmadd52luq $H1,$RR1,$D2lo + vpmadd52huq $H1,$RR1,$D2hi + + ################################################################ + # horizontal addition + + mov \$1,%eax + kmovw %eax,%k1 + vpsrldq \$8,$D0lo,$T0 + vpsrldq \$8,$D0hi,$H0 + vpsrldq \$8,$D1lo,$T1 + vpsrldq \$8,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpsrldq \$8,$D2lo,$T2 + vpsrldq \$8,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vpermq \$0x2,$D0lo,$T0 + vpermq \$0x2,$D0hi,$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vpermq \$0x2,$D1lo,$T1 + vpermq \$0x2,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpermq \$0x2,$D2lo,$T2 + vpermq \$0x2,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vextracti64x4 \$1,$D0lo,%y#$T0 + vextracti64x4 \$1,$D0hi,%y#$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vextracti64x4 \$1,$D1lo,%y#$T1 + vextracti64x4 \$1,$D1hi,%y#$H1 + vextracti64x4 \$1,$D2lo,%y#$T2 + vextracti64x4 \$1,$D2hi,%y#$H2 +___ +######## switch back to %ymm +map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); +map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); +map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); + +$code.=<<___; + vpaddq $T0,$D0lo,${D0lo}{%k1}{z} + vpaddq $H0,$D0hi,${D0hi}{%k1}{z} + vpaddq $T1,$D1lo,${D1lo}{%k1}{z} + vpaddq $H1,$D1hi,${D1hi}{%k1}{z} + vpaddq $T2,$D2lo,${D2lo}{%k1}{z} + vpaddq $H2,$D2hi,${D2hi}{%k1}{z} + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + ################################################################ + + vmovq %x#$H0,0($ctx) + vmovq %x#$H1,8($ctx) + vmovq %x#$H2,16($ctx) + vzeroall + +.Lno_data_vpmadd52_8x: + RET +.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x +___ +} +$code.=<<___; +.type poly1305_emit_base2_44,\@function,3 +.align 32 +poly1305_emit_base2_44: + mov 0($ctx),%r8 # load hash value + mov 8($ctx),%r9 + mov 16($ctx),%r10 + + mov %r9,%rax + shr \$20,%r9 + shl \$44,%rax + mov %r10,%rcx + shr \$40,%r10 + shl \$24,%rcx + + add %rax,%r8 + adc %rcx,%r9 + adc \$0,%r10 + + mov %r8,%rax + add \$5,%r8 # compare to modulus + mov %r9,%rcx + adc \$0,%r9 + adc \$0,%r10 + shr \$2,%r10 # did 130-bit value overflow? 
+ cmovnz %r8,%rax + cmovnz %r9,%rcx + + add 0($nonce),%rax # accumulate nonce + adc 8($nonce),%rcx + mov %rax,0($mac) # write result + mov %rcx,8($mac) + + RET +.size poly1305_emit_base2_44,.-poly1305_emit_base2_44 +___ +} } } +} + +if (!$kernel) +{ # chacha20-poly1305 helpers +my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order +$code.=<<___; +.globl xor128_encrypt_n_pad +.type xor128_encrypt_n_pad,\@abi-omnipotent +.align 16 +xor128_encrypt_n_pad: + sub $otp,$inp + sub $otp,$out + mov $len,%r10 # put len aside + shr \$4,$len # len / 16 + jz .Ltail_enc + nop +.Loop_enc_xmm: + movdqu ($inp,$otp),%xmm0 + pxor ($otp),%xmm0 + movdqu %xmm0,($out,$otp) + movdqa %xmm0,($otp) + lea 16($otp),$otp + dec $len + jnz .Loop_enc_xmm + + and \$15,%r10 # len % 16 + jz .Ldone_enc + +.Ltail_enc: + mov \$16,$len + sub %r10,$len + xor %eax,%eax +.Loop_enc_byte: + mov ($inp,$otp),%al + xor ($otp),%al + mov %al,($out,$otp) + mov %al,($otp) + lea 1($otp),$otp + dec %r10 + jnz .Loop_enc_byte + + xor %eax,%eax +.Loop_enc_pad: + mov %al,($otp) + lea 1($otp),$otp + dec $len + jnz .Loop_enc_pad + +.Ldone_enc: + mov $otp,%rax + RET +.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad + +.globl xor128_decrypt_n_pad +.type xor128_decrypt_n_pad,\@abi-omnipotent +.align 16 +xor128_decrypt_n_pad: + sub $otp,$inp + sub $otp,$out + mov $len,%r10 # put len aside + shr \$4,$len # len / 16 + jz .Ltail_dec + nop +.Loop_dec_xmm: + movdqu ($inp,$otp),%xmm0 + movdqa ($otp),%xmm1 + pxor %xmm0,%xmm1 + movdqu %xmm1,($out,$otp) + movdqa %xmm0,($otp) + lea 16($otp),$otp + dec $len + jnz .Loop_dec_xmm + + pxor %xmm1,%xmm1 + and \$15,%r10 # len % 16 + jz .Ldone_dec + +.Ltail_dec: + mov \$16,$len + sub %r10,$len + xor %eax,%eax + xor %r11d,%r11d +.Loop_dec_byte: + mov ($inp,$otp),%r11b + mov ($otp),%al + xor %r11b,%al + mov %al,($out,$otp) + mov %r11b,($otp) + lea 1($otp),$otp + dec %r10 + jnz .Loop_dec_byte + + xor %eax,%eax +.Loop_dec_pad: + mov %al,($otp) + lea 1($otp),$otp + dec $len + jnz .Loop_dec_pad + +.Ldone_dec: + mov $otp,%rax + RET +.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad +___ +} + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<.Lprologue + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=.Lepilogue + jae .Lcommon_seh_tail + + lea 48(%rax),%rax + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R14 + + jmp 
.Lcommon_seh_tail +.size se_handler,.-se_handler + +.type avx_handler,\@abi-omnipotent +.align 16 +avx_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 208($context),%rax # pull context->R11 + + lea 0x50(%rax),%rsi + lea 0xf8(%rax),%rax + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + RET +.size avx_handler,.-avx_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_poly1305_block_init_arch + .rva .LSEH_end_poly1305_block_init_arch + .rva .LSEH_info_poly1305_block_init_arch + + .rva .LSEH_begin_poly1305_blocks_x86_64 + .rva .LSEH_end_poly1305_blocks_x86_64 + .rva .LSEH_info_poly1305_blocks_x86_64 + + .rva .LSEH_begin_poly1305_emit_x86_64 + .rva .LSEH_end_poly1305_emit_x86_64 + .rva .LSEH_info_poly1305_emit_x86_64 +___ +$code.=<<___ if ($avx); + .rva .LSEH_begin_poly1305_blocks_avx + .rva .Lbase2_64_avx + .rva .LSEH_info_poly1305_blocks_avx_1 + + .rva .Lbase2_64_avx + .rva .Leven_avx + .rva .LSEH_info_poly1305_blocks_avx_2 + + .rva .Leven_avx + .rva .LSEH_end_poly1305_blocks_avx + .rva .LSEH_info_poly1305_blocks_avx_3 + + .rva .LSEH_begin_poly1305_emit_avx + .rva .LSEH_end_poly1305_emit_avx + .rva .LSEH_info_poly1305_emit_avx +___ +$code.=<<___ if ($avx>1); + .rva .LSEH_begin_poly1305_blocks_avx2 + .rva .Lbase2_64_avx2 + .rva .LSEH_info_poly1305_blocks_avx2_1 + + .rva .Lbase2_64_avx2 + .rva .Leven_avx2 + .rva .LSEH_info_poly1305_blocks_avx2_2 + + .rva .Leven_avx2 + .rva .LSEH_end_poly1305_blocks_avx2 + .rva .LSEH_info_poly1305_blocks_avx2_3 +___ +$code.=<<___ if ($avx>2); + .rva .LSEH_begin_poly1305_blocks_avx512 + .rva .LSEH_end_poly1305_blocks_avx512 + .rva .LSEH_info_poly1305_blocks_avx512 +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_poly1305_block_init_arch: + .byte 9,0,0,0 + .rva se_handler + .rva .LSEH_begin_poly1305_block_init_arch,.LSEH_begin_poly1305_block_init_arch + +.LSEH_info_poly1305_blocks_x86_64: + .byte 9,0,0,0 + .rva se_handler + .rva 
.Lblocks_body,.Lblocks_epilogue + +.LSEH_info_poly1305_emit_x86_64: + .byte 9,0,0,0 + .rva se_handler + .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64 +___ +$code.=<<___ if ($avx); +.LSEH_info_poly1305_blocks_avx_1: + .byte 9,0,0,0 + .rva se_handler + .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx_2: + .byte 9,0,0,0 + .rva se_handler + .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx_3: + .byte 9,0,0,0 + .rva avx_handler + .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[] + +.LSEH_info_poly1305_emit_avx: + .byte 9,0,0,0 + .rva se_handler + .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx +___ +$code.=<<___ if ($avx>1); +.LSEH_info_poly1305_blocks_avx2_1: + .byte 9,0,0,0 + .rva se_handler + .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx2_2: + .byte 9,0,0,0 + .rva se_handler + .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx2_3: + .byte 9,0,0,0 + .rva avx_handler + .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] +___ +$code.=<<___ if ($avx>2); +.LSEH_info_poly1305_blocks_avx512: + .byte 9,0,0,0 + .rva avx_handler + .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[] +___ +} + +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + +foreach (split('\n',$code)) { + s/\`([^\`]*)\`/eval($1)/ge; + s/%r([a-z]+)#d/%e$1/g; + s/%r([0-9]+)#d/%r$1d/g; + s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; + + if ($kernel) { + s/(^\.type.*),[0-9]+$/\1/; + s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/; + next if /^\.cfi.*/; + } + + print $_,"\n"; +} +close STDOUT; diff --git a/lib/crypto/x86/poly1305_glue.c b/lib/crypto/x86/poly1305_glue.c new file mode 100644 index 000000000000..856d48fd422b --- /dev/null +++ b/lib/crypto/x86/poly1305_glue.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#include <asm/cpu_device_id.h> +#include <asm/fpu/api.h> +#include <crypto/internal/poly1305.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sizes.h> +#include <linux/unaligned.h> + +struct poly1305_arch_internal { + union { + struct { + u32 h[5]; + u32 is_base2_26; + }; + u64 hs[3]; + }; + u64 r[2]; + u64 pad; + struct { u32 r2, r1, r4, r3; } rn[9]; +}; + +/* + * The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit + * the unfortunate situation of using AVX and then having to go back to scalar + * -- because the user is silly and has called the update function from two + * separate contexts -- then we need to convert back to the original base before + * proceeding. It is possible to reason that the initial reduction below is + * sufficient given the implementation invariants. However, for an avoidance of + * doubt and because this is not performance critical, we do the full reduction + * anyway. 
Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py + */ +static void convert_to_base2_64(void *ctx) +{ + struct poly1305_arch_internal *state = ctx; + u32 cy; + + if (!state->is_base2_26) + return; + + cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy; + cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy; + cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy; + cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy; + state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0]; + state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12); + state->hs[2] = state->h[4] >> 24; + /* Unsigned Less Than: branchlessly produces 1 if a < b, else 0. */ +#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1)) + cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL); + state->hs[2] &= 3; + state->hs[0] += cy; + state->hs[1] += (cy = ULT(state->hs[0], cy)); + state->hs[2] += ULT(state->hs[1], cy); +#undef ULT + state->is_base2_26 = 0; +} + +asmlinkage void poly1305_block_init_arch( + struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +EXPORT_SYMBOL_GPL(poly1305_block_init_arch); +asmlinkage void poly1305_blocks_x86_64(struct poly1305_arch_internal *ctx, + const u8 *inp, + const size_t len, const u32 padbit); +asmlinkage void poly1305_emit_x86_64(const struct poly1305_state *ctx, + u8 mac[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +asmlinkage void poly1305_emit_avx(const struct poly1305_state *ctx, + u8 mac[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +asmlinkage void poly1305_blocks_avx(struct poly1305_arch_internal *ctx, + const u8 *inp, const size_t len, + const u32 padbit); +asmlinkage void poly1305_blocks_avx2(struct poly1305_arch_internal *ctx, + const u8 *inp, const size_t len, + const u32 padbit); +asmlinkage void poly1305_blocks_avx512(struct poly1305_arch_internal *ctx, + const u8 *inp, + const size_t len, const u32 padbit); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); + +void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *inp, + unsigned int len, u32 padbit) +{ + struct poly1305_arch_internal *ctx = + container_of(&state->h.h, struct poly1305_arch_internal, h); + + /* SIMD disables preemption, so relax after processing each page. */ + BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE || + SZ_4K % POLY1305_BLOCK_SIZE); + + /* + * The AVX implementations have significant setup overhead (e.g. key + * power computation, kernel FPU enabling) which makes them slower for + * short messages. Fall back to the scalar implementation for messages + * shorter than 288 bytes, unless the AVX-specific key setup has already + * been performed (indicated by ctx->is_base2_26). 
+ */ + if (!static_branch_likely(&poly1305_use_avx) || + (len < POLY1305_BLOCK_SIZE * 18 && !ctx->is_base2_26) || + unlikely(!irq_fpu_usable())) { + convert_to_base2_64(ctx); + poly1305_blocks_x86_64(ctx, inp, len, padbit); + return; + } + + do { + const unsigned int bytes = min(len, SZ_4K); + + kernel_fpu_begin(); + if (static_branch_likely(&poly1305_use_avx512)) + poly1305_blocks_avx512(ctx, inp, bytes, padbit); + else if (static_branch_likely(&poly1305_use_avx2)) + poly1305_blocks_avx2(ctx, inp, bytes, padbit); + else + poly1305_blocks_avx(ctx, inp, bytes, padbit); + kernel_fpu_end(); + + len -= bytes; + inp += bytes; + } while (len); +} +EXPORT_SYMBOL_GPL(poly1305_blocks_arch); + +void poly1305_emit_arch(const struct poly1305_state *ctx, + u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]) +{ + if (!static_branch_likely(&poly1305_use_avx)) + poly1305_emit_x86_64(ctx, mac, nonce); + else + poly1305_emit_avx(ctx, mac, nonce); +} +EXPORT_SYMBOL_GPL(poly1305_emit_arch); + +bool poly1305_is_arch_optimized(void) +{ + return static_key_enabled(&poly1305_use_avx); +} +EXPORT_SYMBOL(poly1305_is_arch_optimized); + +static int __init poly1305_simd_mod_init(void) +{ + if (boot_cpu_has(X86_FEATURE_AVX) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) + static_branch_enable(&poly1305_use_avx); + if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) + static_branch_enable(&poly1305_use_avx2); + if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX512F) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && + /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ + boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X) + static_branch_enable(&poly1305_use_avx512); + return 0; +} +subsys_initcall(poly1305_simd_mod_init); + +static void __exit poly1305_simd_mod_exit(void) +{ +} +module_exit(poly1305_simd_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>"); +MODULE_DESCRIPTION("Poly1305 authenticator"); diff --git a/lib/crypto/x86/sha1-avx2-asm.S b/lib/crypto/x86/sha1-avx2-asm.S new file mode 100644 index 000000000000..91b2dc6db179 --- /dev/null +++ b/lib/crypto/x86/sha1-avx2-asm.S @@ -0,0 +1,697 @@ +/* + * Implement fast SHA-1 with AVX2 instructions. (x86_64) + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Ilya Albrekht <ilya.albrekht@intel.com> + * Maxim Locktyukhin <maxim.locktyukhin@intel.com> + * Ronen Zohar <ronen.zohar@intel.com> + * Chandramouli Narayanan <mouli@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * SHA-1 implementation with Intel(R) AVX2 instruction set extensions. + * + *This implementation is based on the previous SSSE3 release: + *Visit http://software.intel.com/en-us/articles/ + *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/ + * + * void sha1_transform_avx2(struct sha1_block_state *state, + * const u8 *data, size_t nblocks); + */ + +#include <linux/linkage.h> + +#define CTX %rdi /* arg1 */ +#define BUF %rsi /* arg2 */ +#define CNT %rdx /* arg3 */ + +#define REG_A %ecx +#define REG_B %esi +#define REG_C %edi +#define REG_D %eax +#define REG_E %edx +#define REG_TB %ebx +#define REG_TA %r12d +#define REG_RA %rcx +#define REG_RB %rsi +#define REG_RC %rdi +#define REG_RD %rax +#define REG_RE %rdx +#define REG_RTA %r12 +#define REG_RTB %rbx +#define REG_T1 %r11d +#define xmm_mov vmovups +#define avx2_zeroupper vzeroupper +#define RND_F1 1 +#define RND_F2 2 +#define RND_F3 3 + +.macro REGALLOC + .set A, REG_A + .set B, REG_B + .set C, REG_C + .set D, REG_D + .set E, REG_E + .set TB, REG_TB + .set TA, REG_TA + + .set RA, REG_RA + .set RB, REG_RB + .set RC, REG_RC + .set RD, REG_RD + .set RE, REG_RE + + .set RTA, REG_RTA + .set RTB, REG_RTB + + .set T1, REG_T1 +.endm + +#define HASH_PTR %r9 +#define BLOCKS_CTR %r8 +#define BUFFER_PTR %r10 +#define BUFFER_PTR2 %r13 + +#define PRECALC_BUF %r14 +#define WK_BUF %r15 + +#define W_TMP %xmm0 +#define WY_TMP %ymm0 +#define WY_TMP2 %ymm9 + +# AVX2 variables +#define WY0 %ymm3 +#define WY4 %ymm5 +#define WY08 %ymm7 +#define WY12 %ymm8 +#define WY16 %ymm12 +#define WY20 %ymm13 +#define WY24 %ymm14 +#define WY28 %ymm15 + +#define YMM_SHUFB_BSWAP %ymm10 + +/* + * Keep 2 iterations precalculated at a time: + * - 80 DWORDs per iteration * 2 + */ +#define W_SIZE (80*2*2 +16) + +#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF) +#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF) + + +.macro UPDATE_HASH hash, val + add \hash, \val + mov \val, \hash +.endm + +.macro PRECALC_RESET_WY + .set WY_00, WY0 + .set 
WY_04, WY4 + .set WY_08, WY08 + .set WY_12, WY12 + .set WY_16, WY16 + .set WY_20, WY20 + .set WY_24, WY24 + .set WY_28, WY28 + .set WY_32, WY_00 +.endm + +.macro PRECALC_ROTATE_WY + /* Rotate macros */ + .set WY_32, WY_28 + .set WY_28, WY_24 + .set WY_24, WY_20 + .set WY_20, WY_16 + .set WY_16, WY_12 + .set WY_12, WY_08 + .set WY_08, WY_04 + .set WY_04, WY_00 + .set WY_00, WY_32 + + /* Define register aliases */ + .set WY, WY_00 + .set WY_minus_04, WY_04 + .set WY_minus_08, WY_08 + .set WY_minus_12, WY_12 + .set WY_minus_16, WY_16 + .set WY_minus_20, WY_20 + .set WY_minus_24, WY_24 + .set WY_minus_28, WY_28 + .set WY_minus_32, WY +.endm + +.macro PRECALC_00_15 + .if (i == 0) # Initialize and rotate registers + PRECALC_RESET_WY + PRECALC_ROTATE_WY + .endif + + /* message scheduling pre-compute for rounds 0-15 */ + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + vmovdqu (i * 2)(BUFFER_PTR), W_TMP + .elseif ((i & 7) == 1) + vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\ + WY_TMP, WY_TMP + .elseif ((i & 7) == 2) + vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY + .elseif ((i & 7) == 4) + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP + .elseif ((i & 7) == 7) + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC_16_31 + /* + * message scheduling pre-compute for rounds 16-31 + * calculating last 32 w[i] values in 8 XMM registers + * pre-calculate K+w[i] values and store to mem + * for later load by ALU add instruction + * + * "brute force" vectorization for rounds 16-31 only + * due to w[i]->w[i-3] dependency + */ + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + /* w[i-14] */ + vpalignr $8, WY_minus_16, WY_minus_12, WY + vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */ + .elseif ((i & 7) == 1) + vpxor WY_minus_08, WY, WY + vpxor WY_minus_16, WY_TMP, WY_TMP + .elseif ((i & 7) == 2) + vpxor WY_TMP, WY, WY + vpslldq $12, WY, WY_TMP2 + .elseif ((i & 7) == 3) + vpslld $1, WY, WY_TMP + vpsrld $31, WY, WY + .elseif ((i & 7) == 4) + vpor WY, WY_TMP, WY_TMP + vpslld $2, WY_TMP2, WY + .elseif ((i & 7) == 5) + vpsrld $30, WY_TMP2, WY_TMP2 + vpxor WY, WY_TMP, WY_TMP + .elseif ((i & 7) == 7) + vpxor WY_TMP2, WY_TMP, WY + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC_32_79 + /* + * in SHA-1 specification: + * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 + * instead we do equal: + * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 + * allows more efficient vectorization + * since w[i]=>w[i-3] dependency is broken + */ + + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP + .elseif ((i & 7) == 1) + /* W is W_minus_32 before xor */ + vpxor WY_minus_28, WY, WY + .elseif ((i & 7) == 2) + vpxor WY_minus_16, WY_TMP, WY_TMP + .elseif ((i & 7) == 3) + vpxor WY_TMP, WY, WY + .elseif ((i & 7) == 4) + vpslld $2, WY, WY_TMP + .elseif ((i & 7) == 5) + vpsrld $30, WY, WY + vpor WY, WY_TMP, WY + .elseif ((i & 7) == 7) + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC r, s + .set i, \r + + .if (i < 40) + .set K_XMM, 32*0 + .elseif (i < 80) + .set K_XMM, 32*1 + .elseif (i < 120) + .set K_XMM, 32*2 + .else + .set K_XMM, 32*3 + .endif + + .if (i<32) + PRECALC_00_15 \s + .elseif (i<64) + 
PRECALC_16_31 \s + .elseif (i < 160) + PRECALC_32_79 \s + .endif +.endm + +.macro ROTATE_STATE + .set T_REG, E + .set E, D + .set D, C + .set C, B + .set B, TB + .set TB, A + .set A, T_REG + + .set T_REG, RE + .set RE, RD + .set RD, RC + .set RC, RB + .set RB, RTB + .set RTB, RA + .set RA, T_REG +.endm + +/* Macro relies on saved ROUND_Fx */ + +.macro RND_FUN f, r + .if (\f == RND_F1) + ROUND_F1 \r + .elseif (\f == RND_F2) + ROUND_F2 \r + .elseif (\f == RND_F3) + ROUND_F3 \r + .endif +.endm + +.macro RR r + .set round_id, (\r % 80) + + .if (round_id == 0) /* Precalculate F for first round */ + .set ROUND_FUNC, RND_F1 + mov B, TB + + rorx $(32-30), B, B /* b>>>2 */ + andn D, TB, T1 + and C, TB + xor T1, TB + .endif + + RND_FUN ROUND_FUNC, \r + ROTATE_STATE + + .if (round_id == 18) + .set ROUND_FUNC, RND_F2 + .elseif (round_id == 38) + .set ROUND_FUNC, RND_F3 + .elseif (round_id == 58) + .set ROUND_FUNC, RND_F2 + .endif + + .set round_id, ( (\r+1) % 80) + + RND_FUN ROUND_FUNC, (\r+1) + ROTATE_STATE +.endm + +.macro ROUND_F1 r + add WK(\r), E + + andn C, A, T1 /* ~b&d */ + lea (RE,RTB), E /* Add F from the previous round */ + + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + rorx $(32-30),A, TB /* b>>>2 for next round */ + + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + /* + * Calculate F for the next round + * (b & c) ^ andn[b, d] + */ + and B, A /* b&c */ + xor T1, A /* F1 = (b&c) ^ (~b&d) */ + + lea (RE,RTA), E /* E += A >>> 5 */ +.endm + +.macro ROUND_F2 r + add WK(\r), E + lea (RE,RTB), E /* Add F from the previous round */ + + /* Calculate F for the next round */ + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + .if ((round_id) < 79) + rorx $(32-30), A, TB /* b>>>2 for next round */ + .endif + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + .if ((round_id) < 79) + xor B, A + .endif + + add TA, E /* E += A >>> 5 */ + + .if ((round_id) < 79) + xor C, A + .endif +.endm + +.macro ROUND_F3 r + add WK(\r), E + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + lea (RE,RTB), E /* Add F from the previous round */ + + mov B, T1 + or A, T1 + + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + rorx $(32-30), A, TB /* b>>>2 for next round */ + + /* Calculate F for the next round + * (b and c) or (d and (b or c)) + */ + and C, T1 + and B, A + or T1, A + + add TA, E /* E += A >>> 5 */ + +.endm + +/* Add constant only if (%2 > %3) condition met (uses RTA as temp) + * %1 + %2 >= %3 ? 
%4 : 0 + */ +.macro ADD_IF_GE a, b, c, d + mov \a, RTA + add $\d, RTA + cmp $\c, \b + cmovge RTA, \a +.endm + +/* + * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining + */ +.macro SHA1_PIPELINED_MAIN_BODY + + REGALLOC + + mov (HASH_PTR), A + mov 4(HASH_PTR), B + mov 8(HASH_PTR), C + mov 12(HASH_PTR), D + mov 16(HASH_PTR), E + + mov %rsp, PRECALC_BUF + lea (2*4*80+32)(%rsp), WK_BUF + + # Precalc WK for first 2 blocks + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64 + .set i, 0 + .rept 160 + PRECALC i + .set i, i + 1 + .endr + + /* Go to next block if needed */ + ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128 + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 + xchg WK_BUF, PRECALC_BUF + + .align 32 +.L_loop: + /* + * code loops through more than one block + * we use K_BASE value as a signal of a last block, + * it is set below by: cmovae BUFFER_PTR, K_BASE + */ + test BLOCKS_CTR, BLOCKS_CTR + jnz .L_begin + .align 32 + jmp .L_end + .align 32 +.L_begin: + + /* + * Do first block + * rounds: 0,2,4,6,8 + */ + .set j, 0 + .rept 5 + RR j + .set j, j+2 + .endr + + /* + * rounds: + * 10,12,14,16,18 + * 20,22,24,26,28 + * 30,32,34,36,38 + * 40,42,44,46,48 + * 50,52,54,56,58 + */ + .rept 25 + RR j + .set j, j+2 + .endr + + /* Update Counter */ + sub $1, BLOCKS_CTR + /* Move to the next block only if needed*/ + ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128 + /* + * rounds + * 60,62,64,66,68 + * 70,72,74,76,78 + */ + .rept 10 + RR j + .set j, j+2 + .endr + + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), TB + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E + + test BLOCKS_CTR, BLOCKS_CTR + jz .L_loop + + mov TB, B + + /* Process second block */ + /* + * rounds + * 0+80, 2+80, 4+80, 6+80, 8+80 + * 10+80,12+80,14+80,16+80,18+80 + */ + + .set j, 0 + .rept 10 + RR j+80 + .set j, j+2 + .endr + + /* + * rounds + * 20+80,22+80,24+80,26+80,28+80 + * 30+80,32+80,34+80,36+80,38+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + /* + * rounds + * 40+80,42+80,44+80,46+80,48+80 + * 50+80,52+80,54+80,56+80,58+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + /* update counter */ + sub $1, BLOCKS_CTR + /* Move to the next block only if needed*/ + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 + + /* + * rounds + * 60+80,62+80,64+80,66+80,68+80 + * 70+80,72+80,74+80,76+80,78+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), TB + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E + + /* Reset state for AVX2 reg permutation */ + mov A, TA + mov TB, A + mov C, TB + mov E, C + mov D, B + mov TA, D + + REGALLOC + + xchg WK_BUF, PRECALC_BUF + + jmp .L_loop + + .align 32 +.L_end: + +.endm +/* + * macro implements SHA-1 function's body for several 64-byte blocks + * param: function's name + */ +.macro SHA1_VECTOR_ASM name + SYM_FUNC_START(\name) + + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + RESERVE_STACK = (W_SIZE*4 + 8+24) + + /* Align stack */ + push %rbp + mov %rsp, %rbp + and $~(0x20-1), %rsp + sub $RESERVE_STACK, %rsp + + avx2_zeroupper + + /* Setup initial values */ + mov CTX, HASH_PTR + mov BUF, BUFFER_PTR + + mov BUF, BUFFER_PTR2 + mov CNT, BLOCKS_CTR + + xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP + + SHA1_PIPELINED_MAIN_BODY + + avx2_zeroupper + + mov %rbp, %rsp + pop %rbp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + + RET + + SYM_FUNC_END(\name) +.endm + +.section .rodata + +#define K1 0x5a827999 +#define K2 0x6ed9eba1 +#define K3 
0x8f1bbcdc +#define K4 0xca62c1d6 + +.align 128 +K_XMM_AR: + .long K1, K1, K1, K1 + .long K1, K1, K1, K1 + .long K2, K2, K2, K2 + .long K2, K2, K2, K2 + .long K3, K3, K3, K3 + .long K3, K3, K3, K3 + .long K4, K4, K4, K4 + .long K4, K4, K4, K4 + +BSWAP_SHUFB_CTL: + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f +.text + +SHA1_VECTOR_ASM sha1_transform_avx2 diff --git a/lib/crypto/x86/sha1-ni-asm.S b/lib/crypto/x86/sha1-ni-asm.S new file mode 100644 index 000000000000..428f9b960594 --- /dev/null +++ b/lib/crypto/x86/sha1-ni-asm.S @@ -0,0 +1,152 @@ +/* + * Intel SHA Extensions optimized implementation of a SHA-1 update function + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Sean Gulley <sean.m.gulley@intel.com> + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include <linux/linkage.h> + +#define STATE_PTR %rdi /* 1st arg */ +#define DATA_PTR %rsi /* 2nd arg */ +#define NUM_BLKS %rdx /* 3rd arg */ + +#define ABCD %xmm0 +#define E0 %xmm1 /* Need two E's b/c they ping pong */ +#define E1 %xmm2 +#define MSG0 %xmm3 +#define MSG1 %xmm4 +#define MSG2 %xmm5 +#define MSG3 %xmm6 +#define SHUF_MASK %xmm7 +#define ABCD_SAVED %xmm8 +#define E0_SAVED %xmm9 + +.macro do_4rounds i, m0, m1, m2, m3, e0, e1 +.if \i < 16 + movdqu \i*4(DATA_PTR), \m0 + pshufb SHUF_MASK, \m0 +.endif +.if \i == 0 + paddd \m0, \e0 +.else + sha1nexte \m0, \e0 +.endif + movdqa ABCD, \e1 +.if \i >= 12 && \i < 76 + sha1msg2 \m0, \m1 +.endif + sha1rnds4 $\i / 20, \e0, ABCD +.if \i >= 4 && \i < 68 + sha1msg1 \m0, \m3 +.endif +.if \i >= 8 && \i < 72 + pxor \m0, \m2 +.endif +.endm + +/* + * Intel SHA Extensions optimized implementation of a SHA-1 block function + * + * This function takes a pointer to the current SHA-1 state, a pointer to the + * input data, and the number of 64-byte blocks to process. The number of + * blocks to process is assumed to be nonzero. Once all blocks have been + * processed, the state is updated with the new state. This function only + * processes complete blocks. State initialization, buffering of partial + * blocks, and digest finalization are expected to be handled elsewhere. + * + * void sha1_ni_transform(struct sha1_block_state *state, + * const u8 *data, size_t nblocks) + */ +.text +SYM_FUNC_START(sha1_ni_transform) + + /* Load the initial state from STATE_PTR. */ + pxor E0, E0 + pinsrd $3, 16(STATE_PTR), E0 + movdqu (STATE_PTR), ABCD + pshufd $0x1B, ABCD, ABCD + + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK + +.Lnext_block: + /* Save the state for addition after the rounds. */ + movdqa E0, E0_SAVED + movdqa ABCD, ABCD_SAVED + +.irp i, 0, 16, 32, 48, 64 + do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3, E0, E1 + do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0, E1, E0 + do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1, E0, E1 + do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2, E1, E0 +.endr + + /* Add the previous state (before the rounds) to the current state. */ + sha1nexte E0_SAVED, E0 + paddd ABCD_SAVED, ABCD + + /* Advance to the next block, or break if there are no more blocks. */ + add $64, DATA_PTR + dec NUM_BLKS + jnz .Lnext_block + + /* Store the new state to STATE_PTR. */ + pextrd $3, E0, 16(STATE_PTR) + pshufd $0x1B, ABCD, ABCD + movdqu ABCD, (STATE_PTR) + + RET +SYM_FUNC_END(sha1_ni_transform) + +.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 +.align 16 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x000102030405060708090a0b0c0d0e0f diff --git a/lib/crypto/x86/sha1-ssse3-and-avx.S b/lib/crypto/x86/sha1-ssse3-and-avx.S new file mode 100644 index 000000000000..78b48cb09c5b --- /dev/null +++ b/lib/crypto/x86/sha1-ssse3-and-avx.S @@ -0,0 +1,551 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental + * SSE3 instruction set extensions introduced in Intel Core Microarchitecture + * processors. CPUs supporting Intel(R) AVX extensions will get an additional + * boost. + * + * This work was inspired by the vectorized implementation of Dean Gaudet. + * Additional information on it can be found at: + * http://www.arctic.org/~dean/crypto/sha1.html + * + * It was improved upon with more efficient vectorization of the message + * scheduling. This implementation has also been optimized for all current and + * several future generations of Intel CPUs. 
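The SHA-NI comment above spells out the contract shared by every SHA-1 block routine in this directory (SSSE3/AVX, AVX2, and SHA-NI): consume nblocks complete 64-byte blocks, update only the five-word chaining state, and leave padding and finalization to the generic layer. As a reading aid, a minimal unoptimized C sketch of that contract follows; it is not part of the patch, it assumes struct sha1_block_state is simply the five 32-bit words h[0..4], and its K1..K4 values match the K_XMM_AR tables in the assembly.

#include <stdint.h>
#include <stddef.h>

/* Assumption for this sketch: the state is the five 32-bit chaining words. */
struct sha1_block_state { uint32_t h[5]; };

static inline uint32_t rol32(uint32_t v, int s)
{
        return (v << s) | (v >> (32 - s));
}

static void sha1_blocks_ref(struct sha1_block_state *state,
                            const uint8_t *data, size_t nblocks)
{
        while (nblocks--) {
                uint32_t w[80];
                uint32_t a = state->h[0], b = state->h[1], c = state->h[2];
                uint32_t d = state->h[3], e = state->h[4];
                int i;

                /* Load the block big-endian, then expand the message schedule. */
                for (i = 0; i < 16; i++)
                        w[i] = ((uint32_t)data[4 * i] << 24) |
                               ((uint32_t)data[4 * i + 1] << 16) |
                               ((uint32_t)data[4 * i + 2] << 8) |
                               (uint32_t)data[4 * i + 3];
                for (; i < 80; i++)
                        w[i] = rol32(w[i - 3] ^ w[i - 8] ^ w[i - 14] ^ w[i - 16], 1);

                /* 80 rounds; F1/F2/F3/F4 and K1..K4 as in the assembly
                 * (F3 is written here in its textbook majority form). */
                for (i = 0; i < 80; i++) {
                        uint32_t f, k, t;

                        if (i < 20) {
                                f = (b & c) | (~b & d);          k = 0x5a827999;
                        } else if (i < 40) {
                                f = b ^ c ^ d;                   k = 0x6ed9eba1;
                        } else if (i < 60) {
                                f = (b & c) | (b & d) | (c & d); k = 0x8f1bbcdc;
                        } else {
                                f = b ^ c ^ d;                   k = 0xca62c1d6;
                        }
                        t = rol32(a, 5) + f + e + k + w[i];
                        e = d; d = c; c = rol32(b, 30); b = a; a = t;
                }

                /* Feed-forward, as done by UPDATE_HASH / sha1nexte+paddd. */
                state->h[0] += a; state->h[1] += b; state->h[2] += c;
                state->h[3] += d; state->h[4] += e;
                data += 64;
        }
}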
+ * + * See this article for more information about the implementation details: + * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/ + * + * Copyright (C) 2010, Intel Corp. + * Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com> + * Ronen Zohar <ronen.zohar@intel.com> + * + * Converted to AT&T syntax and adapted for inclusion in the Linux kernel: + * Author: Mathias Krause <minipli@googlemail.com> + */ + +#include <linux/linkage.h> + +#define CTX %rdi // arg1 +#define BUF %rsi // arg2 +#define CNT %rdx // arg3 + +#define REG_A %ecx +#define REG_B %esi +#define REG_C %edi +#define REG_D %r12d +#define REG_E %edx + +#define REG_T1 %eax +#define REG_T2 %ebx + +#define K_BASE %r8 +#define HASH_PTR %r9 +#define BUFFER_PTR %r10 +#define BUFFER_END %r11 + +#define W_TMP1 %xmm0 +#define W_TMP2 %xmm9 + +#define W0 %xmm1 +#define W4 %xmm2 +#define W8 %xmm3 +#define W12 %xmm4 +#define W16 %xmm5 +#define W20 %xmm6 +#define W24 %xmm7 +#define W28 %xmm8 + +#define XMM_SHUFB_BSWAP %xmm10 + +/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */ +#define WK(t) (((t) & 15) * 4)(%rsp) +#define W_PRECALC_AHEAD 16 + +/* + * This macro implements the SHA-1 function's body for single 64-byte block + * param: function's name + */ +.macro SHA1_VECTOR_ASM name + SYM_FUNC_START(\name) + + push %rbx + push %r12 + push %rbp + mov %rsp, %rbp + + sub $64, %rsp # allocate workspace + and $~15, %rsp # align stack + + mov CTX, HASH_PTR + mov BUF, BUFFER_PTR + + shl $6, CNT # multiply by 64 + add BUF, CNT + mov CNT, BUFFER_END + + lea K_XMM_AR(%rip), K_BASE + xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP + + SHA1_PIPELINED_MAIN_BODY + + # cleanup workspace + mov $8, %ecx + mov %rsp, %rdi + xor %eax, %eax + rep stosq + + mov %rbp, %rsp # deallocate workspace + pop %rbp + pop %r12 + pop %rbx + RET + + SYM_FUNC_END(\name) +.endm + +/* + * This macro implements 80 rounds of SHA-1 for one 64-byte block + */ +.macro SHA1_PIPELINED_MAIN_BODY + INIT_REGALLOC + + mov (HASH_PTR), A + mov 4(HASH_PTR), B + mov 8(HASH_PTR), C + mov 12(HASH_PTR), D + mov 16(HASH_PTR), E + + .set i, 0 + .rept W_PRECALC_AHEAD + W_PRECALC i + .set i, (i+1) + .endr + +.align 4 +1: + RR F1,A,B,C,D,E,0 + RR F1,D,E,A,B,C,2 + RR F1,B,C,D,E,A,4 + RR F1,E,A,B,C,D,6 + RR F1,C,D,E,A,B,8 + + RR F1,A,B,C,D,E,10 + RR F1,D,E,A,B,C,12 + RR F1,B,C,D,E,A,14 + RR F1,E,A,B,C,D,16 + RR F1,C,D,E,A,B,18 + + RR F2,A,B,C,D,E,20 + RR F2,D,E,A,B,C,22 + RR F2,B,C,D,E,A,24 + RR F2,E,A,B,C,D,26 + RR F2,C,D,E,A,B,28 + + RR F2,A,B,C,D,E,30 + RR F2,D,E,A,B,C,32 + RR F2,B,C,D,E,A,34 + RR F2,E,A,B,C,D,36 + RR F2,C,D,E,A,B,38 + + RR F3,A,B,C,D,E,40 + RR F3,D,E,A,B,C,42 + RR F3,B,C,D,E,A,44 + RR F3,E,A,B,C,D,46 + RR F3,C,D,E,A,B,48 + + RR F3,A,B,C,D,E,50 + RR F3,D,E,A,B,C,52 + RR F3,B,C,D,E,A,54 + RR F3,E,A,B,C,D,56 + RR F3,C,D,E,A,B,58 + + add $64, BUFFER_PTR # move to the next 64-byte block + cmp BUFFER_END, BUFFER_PTR # if the current is the last one use + cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun + + RR F4,A,B,C,D,E,60 + RR F4,D,E,A,B,C,62 + RR F4,B,C,D,E,A,64 + RR F4,E,A,B,C,D,66 + RR F4,C,D,E,A,B,68 + + RR F4,A,B,C,D,E,70 + RR F4,D,E,A,B,C,72 + RR F4,B,C,D,E,A,74 + RR F4,E,A,B,C,D,76 + RR F4,C,D,E,A,B,78 + + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), B + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E + + RESTORE_RENAMED_REGS + cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end + jne 1b +.endm + +.macro INIT_REGALLOC + .set A, 
REG_A + .set B, REG_B + .set C, REG_C + .set D, REG_D + .set E, REG_E + .set T1, REG_T1 + .set T2, REG_T2 +.endm + +.macro RESTORE_RENAMED_REGS + # order is important (REG_C is where it should be) + mov B, REG_B + mov D, REG_D + mov A, REG_A + mov E, REG_E +.endm + +.macro SWAP_REG_NAMES a, b + .set _T, \a + .set \a, \b + .set \b, _T +.endm + +.macro F1 b, c, d + mov \c, T1 + SWAP_REG_NAMES \c, T1 + xor \d, T1 + and \b, T1 + xor \d, T1 +.endm + +.macro F2 b, c, d + mov \d, T1 + SWAP_REG_NAMES \d, T1 + xor \c, T1 + xor \b, T1 +.endm + +.macro F3 b, c ,d + mov \c, T1 + SWAP_REG_NAMES \c, T1 + mov \b, T2 + or \b, T1 + and \c, T2 + and \d, T1 + or T2, T1 +.endm + +.macro F4 b, c, d + F2 \b, \c, \d +.endm + +.macro UPDATE_HASH hash, val + add \hash, \val + mov \val, \hash +.endm + +/* + * RR does two rounds of SHA-1 back to back with W[] pre-calc + * t1 = F(b, c, d); e += w(i) + * e += t1; b <<= 30; d += w(i+1); + * t1 = F(a, b, c); + * d += t1; a <<= 5; + * e += a; + * t1 = e; a >>= 7; + * t1 <<= 5; + * d += t1; + */ +.macro RR F, a, b, c, d, e, round + add WK(\round), \e + \F \b, \c, \d # t1 = F(b, c, d); + W_PRECALC (\round + W_PRECALC_AHEAD) + rol $30, \b + add T1, \e + add WK(\round + 1), \d + + \F \a, \b, \c + W_PRECALC (\round + W_PRECALC_AHEAD + 1) + rol $5, \a + add \a, \e + add T1, \d + ror $7, \a # (a <<r 5) >>r 7) => a <<r 30) + + mov \e, T1 + SWAP_REG_NAMES \e, T1 + + rol $5, T1 + add T1, \d + + # write: \a, \b + # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c +.endm + +.macro W_PRECALC r + .set i, \r + + .if (i < 20) + .set K_XMM, 0 + .elseif (i < 40) + .set K_XMM, 16 + .elseif (i < 60) + .set K_XMM, 32 + .elseif (i < 80) + .set K_XMM, 48 + .endif + + .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD)))) + .set i, ((\r) % 80) # pre-compute for the next iteration + .if (i == 0) + W_PRECALC_RESET + .endif + W_PRECALC_00_15 + .elseif (i<32) + W_PRECALC_16_31 + .elseif (i < 80) // rounds 32-79 + W_PRECALC_32_79 + .endif +.endm + +.macro W_PRECALC_RESET + .set W, W0 + .set W_minus_04, W4 + .set W_minus_08, W8 + .set W_minus_12, W12 + .set W_minus_16, W16 + .set W_minus_20, W20 + .set W_minus_24, W24 + .set W_minus_28, W28 + .set W_minus_32, W +.endm + +.macro W_PRECALC_ROTATE + .set W_minus_32, W_minus_28 + .set W_minus_28, W_minus_24 + .set W_minus_24, W_minus_20 + .set W_minus_20, W_minus_16 + .set W_minus_16, W_minus_12 + .set W_minus_12, W_minus_08 + .set W_minus_08, W_minus_04 + .set W_minus_04, W + .set W, W_minus_32 +.endm + +.macro W_PRECALC_SSSE3 + +.macro W_PRECALC_00_15 + W_PRECALC_00_15_SSSE3 +.endm +.macro W_PRECALC_16_31 + W_PRECALC_16_31_SSSE3 +.endm +.macro W_PRECALC_32_79 + W_PRECALC_32_79_SSSE3 +.endm + +/* message scheduling pre-compute for rounds 0-15 */ +.macro W_PRECALC_00_15_SSSE3 + .if ((i & 3) == 0) + movdqu (i*4)(BUFFER_PTR), W_TMP1 + .elseif ((i & 3) == 1) + pshufb XMM_SHUFB_BSWAP, W_TMP1 + movdqa W_TMP1, W + .elseif ((i & 3) == 2) + paddd (K_BASE), W_TMP1 + .elseif ((i & 3) == 3) + movdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +/* message scheduling pre-compute for rounds 16-31 + * + * - calculating last 32 w[i] values in 8 XMM registers + * - pre-calculate K+w[i] values and store to mem, for later load by ALU add + * instruction + * + * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3] + * dependency, but improves for 32-79 + */ +.macro W_PRECALC_16_31_SSSE3 + # blended scheduling of vector and scalar instruction streams, one 4-wide + # vector iteration / 4 scalar rounds + .if ((i & 3) == 0) + movdqa W_minus_12, 
W + palignr $8, W_minus_16, W # w[i-14] + movdqa W_minus_04, W_TMP1 + psrldq $4, W_TMP1 # w[i-3] + pxor W_minus_08, W + .elseif ((i & 3) == 1) + pxor W_minus_16, W_TMP1 + pxor W_TMP1, W + movdqa W, W_TMP2 + movdqa W, W_TMP1 + pslldq $12, W_TMP2 + .elseif ((i & 3) == 2) + psrld $31, W + pslld $1, W_TMP1 + por W, W_TMP1 + movdqa W_TMP2, W + psrld $30, W_TMP2 + pslld $2, W + .elseif ((i & 3) == 3) + pxor W, W_TMP1 + pxor W_TMP2, W_TMP1 + movdqa W_TMP1, W + paddd K_XMM(K_BASE), W_TMP1 + movdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +/* message scheduling pre-compute for rounds 32-79 + * + * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 + * instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 + * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken + */ +.macro W_PRECALC_32_79_SSSE3 + .if ((i & 3) == 0) + movdqa W_minus_04, W_TMP1 + pxor W_minus_28, W # W is W_minus_32 before xor + palignr $8, W_minus_08, W_TMP1 + .elseif ((i & 3) == 1) + pxor W_minus_16, W + pxor W_TMP1, W + movdqa W, W_TMP1 + .elseif ((i & 3) == 2) + psrld $30, W + pslld $2, W_TMP1 + por W, W_TMP1 + .elseif ((i & 3) == 3) + movdqa W_TMP1, W + paddd K_XMM(K_BASE), W_TMP1 + movdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.endm // W_PRECALC_SSSE3 + + +#define K1 0x5a827999 +#define K2 0x6ed9eba1 +#define K3 0x8f1bbcdc +#define K4 0xca62c1d6 + +.section .rodata +.align 16 + +K_XMM_AR: + .long K1, K1, K1, K1 + .long K2, K2, K2, K2 + .long K3, K3, K3, K3 + .long K4, K4, K4, K4 + +BSWAP_SHUFB_CTL: + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f + + +.section .text + +W_PRECALC_SSSE3 +.macro xmm_mov a, b + movdqu \a,\b +.endm + +/* + * SSSE3 optimized implementation: + * + * void sha1_transform_ssse3(struct sha1_block_state *state, + * const u8 *data, size_t nblocks); + */ +SHA1_VECTOR_ASM sha1_transform_ssse3 + +.macro W_PRECALC_AVX + +.purgem W_PRECALC_00_15 +.macro W_PRECALC_00_15 + W_PRECALC_00_15_AVX +.endm +.purgem W_PRECALC_16_31 +.macro W_PRECALC_16_31 + W_PRECALC_16_31_AVX +.endm +.purgem W_PRECALC_32_79 +.macro W_PRECALC_32_79 + W_PRECALC_32_79_AVX +.endm + +.macro W_PRECALC_00_15_AVX + .if ((i & 3) == 0) + vmovdqu (i*4)(BUFFER_PTR), W_TMP1 + .elseif ((i & 3) == 1) + vpshufb XMM_SHUFB_BSWAP, W_TMP1, W + .elseif ((i & 3) == 2) + vpaddd (K_BASE), W, W_TMP1 + .elseif ((i & 3) == 3) + vmovdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.macro W_PRECALC_16_31_AVX + .if ((i & 3) == 0) + vpalignr $8, W_minus_16, W_minus_12, W # w[i-14] + vpsrldq $4, W_minus_04, W_TMP1 # w[i-3] + vpxor W_minus_08, W, W + vpxor W_minus_16, W_TMP1, W_TMP1 + .elseif ((i & 3) == 1) + vpxor W_TMP1, W, W + vpslldq $12, W, W_TMP2 + vpslld $1, W, W_TMP1 + .elseif ((i & 3) == 2) + vpsrld $31, W, W + vpor W, W_TMP1, W_TMP1 + vpslld $2, W_TMP2, W + vpsrld $30, W_TMP2, W_TMP2 + .elseif ((i & 3) == 3) + vpxor W, W_TMP1, W_TMP1 + vpxor W_TMP2, W_TMP1, W + vpaddd K_XMM(K_BASE), W, W_TMP1 + vmovdqu W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.macro W_PRECALC_32_79_AVX + .if ((i & 3) == 0) + vpalignr $8, W_minus_08, W_minus_04, W_TMP1 + vpxor W_minus_28, W, W # W is W_minus_32 before xor + .elseif ((i & 3) == 1) + vpxor W_minus_16, W_TMP1, W_TMP1 + vpxor W_TMP1, W, W + .elseif ((i & 3) == 2) + vpslld $2, W, W_TMP1 + vpsrld $30, W, W + vpor W, W_TMP1, W + .elseif ((i & 3) == 3) + vpaddd K_XMM(K_BASE), W, W_TMP1 + vmovdqu W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.endm // W_PRECALC_AVX + +W_PRECALC_AVX 
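W_PRECALC_32_79 above (and PRECALC_32_79 in the AVX2 file) relies on the rewritten schedule recurrence quoted in its comment. The equivalence follows by expanding each of w[i-3], w[i-8], w[i-14] and w[i-16] once with the original recurrence: every generated term except w[i-6], w[i-16], w[i-28] and w[i-32] appears exactly twice and cancels under xor, and the two rol-by-1 steps combine into rol 2. The expansion needs i-16 >= 16, so the rewritten form is valid only from round 32 on, which is why rounds 16-31 keep the original recurrence. A small self-contained check, not part of the patch, that demonstrates the identity:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static inline uint32_t rol32(uint32_t v, int s)
{
        return (v << s) | (v >> (32 - s));
}

int main(void)
{
        uint32_t w[80], v[80];
        int i;

        /* Any initial 16 words will do; use a simple deterministic pattern. */
        for (i = 0; i < 16; i++)
                w[i] = v[i] = 0x01000193u * (i + 1) ^ 0x811c9dc5u;

        /* FIPS 180 recurrence: serial w[i-3] dependency. */
        for (i = 16; i < 80; i++)
                w[i] = rol32(w[i - 3] ^ w[i - 8] ^ w[i - 14] ^ w[i - 16], 1);

        /* Transformed recurrence for rounds >= 32: no w[i-3] term, so four
         * schedule words can be produced per vector operation. */
        for (i = 16; i < 32; i++)
                v[i] = rol32(v[i - 3] ^ v[i - 8] ^ v[i - 14] ^ v[i - 16], 1);
        for (i = 32; i < 80; i++)
                v[i] = rol32(v[i - 6] ^ v[i - 16] ^ v[i - 28] ^ v[i - 32], 2);

        for (i = 0; i < 80; i++)
                assert(w[i] == v[i]);
        puts("schedules match");
        return 0;
}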
+.purgem xmm_mov +.macro xmm_mov a, b + vmovdqu \a,\b +.endm + + +/* AVX optimized implementation: + * void sha1_transform_avx(struct sha1_block_state *state, + * const u8 *data, size_t nblocks); + */ +SHA1_VECTOR_ASM sha1_transform_avx diff --git a/lib/crypto/x86/sha1.h b/lib/crypto/x86/sha1.h new file mode 100644 index 000000000000..e308379d89bc --- /dev/null +++ b/lib/crypto/x86/sha1.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-1 optimized for x86_64 + * + * Copyright 2025 Google LLC + */ +#include <asm/fpu/api.h> +#include <linux/static_call.h> + +DEFINE_STATIC_CALL(sha1_blocks_x86, sha1_blocks_generic); + +#define DEFINE_X86_SHA1_FN(c_fn, asm_fn) \ + asmlinkage void asm_fn(struct sha1_block_state *state, \ + const u8 *data, size_t nblocks); \ + static void c_fn(struct sha1_block_state *state, \ + const u8 *data, size_t nblocks) \ + { \ + if (likely(irq_fpu_usable())) { \ + kernel_fpu_begin(); \ + asm_fn(state, data, nblocks); \ + kernel_fpu_end(); \ + } else { \ + sha1_blocks_generic(state, data, nblocks); \ + } \ + } + +DEFINE_X86_SHA1_FN(sha1_blocks_ssse3, sha1_transform_ssse3); +DEFINE_X86_SHA1_FN(sha1_blocks_avx, sha1_transform_avx); +DEFINE_X86_SHA1_FN(sha1_blocks_ni, sha1_ni_transform); + +#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ + +asmlinkage void sha1_transform_avx2(struct sha1_block_state *state, + const u8 *data, size_t nblocks); +static void sha1_blocks_avx2(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + if (likely(irq_fpu_usable())) { + kernel_fpu_begin(); + /* Select the optimal transform based on the number of blocks */ + if (nblocks >= SHA1_AVX2_BLOCK_OPTSIZE) + sha1_transform_avx2(state, data, nblocks); + else + sha1_transform_avx(state, data, nblocks); + kernel_fpu_end(); + } else { + sha1_blocks_generic(state, data, nblocks); + } +} + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + static_call(sha1_blocks_x86)(state, data, nblocks); +} + +#define sha1_mod_init_arch sha1_mod_init_arch +static inline void sha1_mod_init_arch(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) { + static_call_update(sha1_blocks_x86, sha1_blocks_ni); + } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + NULL) && + boot_cpu_has(X86_FEATURE_AVX)) { + if (boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_BMI1) && + boot_cpu_has(X86_FEATURE_BMI2)) + static_call_update(sha1_blocks_x86, sha1_blocks_avx2); + else + static_call_update(sha1_blocks_x86, sha1_blocks_avx); + } else if (boot_cpu_has(X86_FEATURE_SSSE3)) { + static_call_update(sha1_blocks_x86, sha1_blocks_ssse3); + } +} diff --git a/lib/crypto/x86/sha256-avx-asm.S b/lib/crypto/x86/sha256-avx-asm.S new file mode 100644 index 000000000000..c1aceb3ba3a3 --- /dev/null +++ b/lib/crypto/x86/sha256-avx-asm.S @@ -0,0 +1,493 @@ +######################################################################## +# Implement fast SHA-256 with AVX1 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford <james.guilford@intel.com> +# Kirk Yap <kirk.s.yap@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. 
You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-256 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## +# This code schedules 1 block at a time, with 4 lanes per block +######################################################################## + +#include <linux/linkage.h> + +## assume buffers not aligned +#define VMOVDQ vmovdqu + +################################ Define Macros + +# addm [mem], reg +# Add reg to mem using reg-mem add and store +.macro addm p1 p2 + add \p1, \p2 + mov \p2, \p1 +.endm + + +.macro MY_ROR p1 p2 + shld $(32-(\p1)), \p2, \p2 +.endm + +################################ + +# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask +# Load xmm with mem and byte swap each dword +.macro COPY_XMM_AND_BSWAP p1 p2 p3 + VMOVDQ \p2, \p1 + vpshufb \p3, \p1, \p1 +.endm + +################################ + +X0 = %xmm4 +X1 = %xmm5 +X2 = %xmm6 +X3 = %xmm7 + +XTMP0 = %xmm0 +XTMP1 = %xmm1 +XTMP2 = %xmm2 +XTMP3 = %xmm3 +XTMP4 = %xmm8 +XFER = %xmm9 +XTMP5 = %xmm11 + +SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA +SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00 +BYTE_FLIP_MASK = %xmm13 + +NUM_BLKS = %rdx # 3rd arg +INP = %rsi # 2nd arg +CTX = %rdi # 1st arg + +SRND = %rsi # clobbers INP +c = %ecx +d = %r8d +e = %edx +TBL = %r12 +a = %eax +b = %ebx + +f = %r9d +g = %r10d +h = %r11d + +y0 = %r13d +y1 = %r14d +y2 = %r15d + + +_INP_END_SIZE = 8 +_INP_SIZE = 8 +_XFER_SIZE = 16 +_XMM_SAVE_SIZE = 0 + +_INP_END = 0 +_INP = _INP_END + _INP_END_SIZE +_XFER = _INP + _INP_SIZE +_XMM_SAVE = _XFER + _XFER_SIZE +STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE + +# rotate_Xs +# Rotate values of symbols X0...X3 +.macro rotate_Xs +X_ = X0 +X0 = X1 +X1 = X2 +X2 = X3 +X3 = X_ +.endm + +# ROTATE_ARGS +# Rotate values of symbols a...h +.macro ROTATE_ARGS +TMP_ = h +h = g +g = f +f = e +e = d +d = c +c = b +b = a +a = TMP_ +.endm + +.macro FOUR_ROUNDS_AND_SCHED + ## compute s0 four at a time and s1 two at a time + ## compute W[-16] + W[-7] 4 at a time + + mov e, y0 # y0 = e + MY_ROR (25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] + 
MY_ROR (22-13), y1 # y1 = a >> (22-13) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor a, y1 # y1 = a ^ (a >> (22-13) + xor g, y2 # y2 = f^g + vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16] + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + ## compute s0 + vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor g, y2 # y2 = CH = ((f^g)&e)^g + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y0, y2 # y2 = S1 + CH + add _XFER(%rsp), y2 # y2 = k + w + S1 + CH + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + vpsrld $7, XTMP1, XTMP2 + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + vpslld $(32-7), XTMP1, XTMP3 + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + vpor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS + mov e, y0 # y0 = e + mov a, y1 # y1 = a + MY_ROR (25-11), y0 # y0 = e >> (25-11) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + MY_ROR (22-13), y1 # y1 = a >> (22-13) + vpsrld $18, XTMP1, XTMP2 # + xor a, y1 # y1 = a ^ (a >> (22-13) + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor g, y2 # y2 = f^g + vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + vpslld $(32-18), XTMP1, XTMP1 + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor g, y2 # y2 = CH = ((f^g)&e)^g + vpxor XTMP1, XTMP3, XTMP3 # + add y0, y2 # y2 = S1 + CH + add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + vpxor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + ## compute low s1 + vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS + mov e, y0 # y0 = e + mov a, y1 # y1 = a + MY_ROR (25-11), y0 # y0 = e >> (25-11) + xor e, y0 # y0 = e ^ (e >> (25-11)) + MY_ROR (22-13), y1 # y1 = a >> (22-13) + mov f, y2 # y2 = f + xor a, y1 # y1 = a ^ (a >> (22-13) + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} + xor g, y2 # y2 = f^g + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA} + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA} + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor g, y2 # y2 = CH = ((f^g)&e)^g + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + vpxor XTMP3, XTMP2, XTMP2 # + add y0, y2 # y2 = S1 + CH + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + vpxor 
XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + ## compute high s1 + vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS + mov e, y0 # y0 = e + MY_ROR (25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + MY_ROR (22-13), y1 # y1 = a >> (22-13) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} + xor a, y1 # y1 = a ^ (a >> (22-13) + xor g, y2 # y2 = f^g + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC} + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC} + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor g, y2 # y2 = CH = ((f^g)&e)^g + vpxor XTMP3, XTMP2, XTMP2 + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y0, y2 # y2 = S1 + CH + add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS + rotate_Xs +.endm + +## input is [rsp + _XFER + %1 * 4] +.macro DO_ROUND round + mov e, y0 # y0 = e + MY_ROR (25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + xor e, y0 # y0 = e ^ (e >> (25-11)) + MY_ROR (22-13), y1 # y1 = a >> (22-13) + mov f, y2 # y2 = f + xor a, y1 # y1 = a ^ (a >> (22-13) + MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor g, y2 # y2 = f^g + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + and e, y2 # y2 = (f^g)&e + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor g, y2 # y2 = CH = ((f^g)&e)^g + add y0, y2 # y2 = S1 + CH + MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + offset = \round * 4 + _XFER # + add offset(%rsp), y2 # y2 = k + w + S1 + CH + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS +.endm + +######################################################################## +## void sha256_transform_avx(struct sha256_block_state *state, +## const u8 *data, size_t nblocks); +######################################################################## +.text +SYM_FUNC_START(sha256_transform_avx) + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbp + movq %rsp, %rbp + + 
subq $STACK_SIZE, %rsp # allocate stack space + and $~15, %rsp # align stack pointer + + shl $6, NUM_BLKS # convert to bytes + add INP, NUM_BLKS # pointer to end of data + mov NUM_BLKS, _INP_END(%rsp) + + ## load initial digest + mov 4*0(CTX), a + mov 4*1(CTX), b + mov 4*2(CTX), c + mov 4*3(CTX), d + mov 4*4(CTX), e + mov 4*5(CTX), f + mov 4*6(CTX), g + mov 4*7(CTX), h + + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + vmovdqa _SHUF_00BA(%rip), SHUF_00BA + vmovdqa _SHUF_DC00(%rip), SHUF_DC00 +.Lloop0: + lea K256(%rip), TBL + + ## byte swap first 16 dwords + COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK + + mov INP, _INP(%rsp) + + ## schedule 48 input dwords, by doing 3 rounds of 16 each + mov $3, SRND +.align 16 +.Lloop1: + vpaddd (TBL), X0, XFER + vmovdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddd 1*16(TBL), X0, XFER + vmovdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddd 2*16(TBL), X0, XFER + vmovdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddd 3*16(TBL), X0, XFER + vmovdqa XFER, _XFER(%rsp) + add $4*16, TBL + FOUR_ROUNDS_AND_SCHED + + sub $1, SRND + jne .Lloop1 + + mov $2, SRND +.Lloop2: + vpaddd (TBL), X0, XFER + vmovdqa XFER, _XFER(%rsp) + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + vpaddd 1*16(TBL), X1, XFER + vmovdqa XFER, _XFER(%rsp) + add $2*16, TBL + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + vmovdqa X2, X0 + vmovdqa X3, X1 + + sub $1, SRND + jne .Lloop2 + + addm (4*0)(CTX),a + addm (4*1)(CTX),b + addm (4*2)(CTX),c + addm (4*3)(CTX),d + addm (4*4)(CTX),e + addm (4*5)(CTX),f + addm (4*6)(CTX),g + addm (4*7)(CTX),h + + mov _INP(%rsp), INP + add $64, INP + cmp _INP_END(%rsp), INP + jne .Lloop0 + + mov %rbp, %rsp + popq %rbp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + RET +SYM_FUNC_END(sha256_transform_avx) + +.section .rodata.cst256.K256, "aM", @progbits, 256 +.align 64 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 +.align 16 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 + +.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16 +.align 16 +# shuffle xBxA -> 00BA +_SHUF_00BA: + .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 + +.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16 +.align 16 +# shuffle xDxC -> DC00 +_SHUF_DC00: + .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/lib/crypto/x86/sha256-avx2-asm.S b/lib/crypto/x86/sha256-avx2-asm.S new file mode 100644 index 000000000000..eb8836fb9695 --- /dev/null +++ b/lib/crypto/x86/sha256-avx2-asm.S @@ -0,0 +1,770 @@ 
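The annotated round macros in sha256-avx-asm.S above, and their AVX2 counterparts below, interleave the message schedule with the standard FIPS 180-4 compression. The comments' S0/S1/CH/MAJ and s0/s1 are the usual big and small sigma, choose and majority terms; the assembly computes MAJ as ((a|c)&b)|(a&c), which equals the textbook (a&b)^(a&c)^(b&c), and a few comments write S1 with "&" where the operation is really "^". For cross-checking, here is a compact scalar reference with the same contract as sha256_transform_avx; it is a sketch rather than patch code, it assumes the state is just the eight 32-bit chaining words, and its constants are copied from the K256 table above.

#include <stdint.h>
#include <stddef.h>

/* Assumed state layout: eight 32-bit chaining words h[0..7]. */
struct sha256_block_state { uint32_t h[8]; };

static inline uint32_t ror32(uint32_t v, int s)
{
        return (v >> s) | (v << (32 - s));
}

static const uint32_t K256[64] = {
        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};

static void sha256_blocks_ref(struct sha256_block_state *state,
                              const uint8_t *data, size_t nblocks)
{
        while (nblocks--) {
                uint32_t w[64], s[8];
                int i;

                /* Load big-endian, then expand the schedule: s0/s1 terms. */
                for (i = 0; i < 16; i++)
                        w[i] = ((uint32_t)data[4 * i] << 24) |
                               ((uint32_t)data[4 * i + 1] << 16) |
                               ((uint32_t)data[4 * i + 2] << 8) |
                               (uint32_t)data[4 * i + 3];
                for (; i < 64; i++) {
                        uint32_t s0 = ror32(w[i - 15], 7) ^ ror32(w[i - 15], 18) ^ (w[i - 15] >> 3);
                        uint32_t s1 = ror32(w[i - 2], 17) ^ ror32(w[i - 2], 19) ^ (w[i - 2] >> 10);

                        w[i] = w[i - 16] + s0 + w[i - 7] + s1;
                }

                for (i = 0; i < 8; i++)
                        s[i] = state->h[i];

                /* 64 rounds: S1, CH, S0, MAJ as annotated in the assembly. */
                for (i = 0; i < 64; i++) {
                        uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
                        uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
                        uint32_t S1 = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
                        uint32_t ch = (e & f) ^ (~e & g);
                        uint32_t t1 = h + S1 + ch + K256[i] + w[i];
                        uint32_t S0 = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
                        uint32_t maj = (a & b) ^ (a & c) ^ (b & c);
                        uint32_t t2 = S0 + maj;

                        s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
                        s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
                }

                for (i = 0; i < 8; i++)
                        state->h[i] += s[i];
                data += 64;
        }
}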
+######################################################################## +# Implement fast SHA-256 with AVX2 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford <james.guilford@intel.com> +# Kirk Yap <kirk.s.yap@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-256 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. 
+# +######################################################################## +# This code schedules 2 blocks at a time, with 4 lanes per block +######################################################################## + +#include <linux/linkage.h> + +## assume buffers not aligned +#define VMOVDQ vmovdqu + +################################ Define Macros + +# addm [mem], reg +# Add reg to mem using reg-mem add and store +.macro addm p1 p2 + add \p1, \p2 + mov \p2, \p1 +.endm + +################################ + +X0 = %ymm4 +X1 = %ymm5 +X2 = %ymm6 +X3 = %ymm7 + +# XMM versions of above +XWORD0 = %xmm4 +XWORD1 = %xmm5 +XWORD2 = %xmm6 +XWORD3 = %xmm7 + +XTMP0 = %ymm0 +XTMP1 = %ymm1 +XTMP2 = %ymm2 +XTMP3 = %ymm3 +XTMP4 = %ymm8 +XFER = %ymm9 +XTMP5 = %ymm11 + +SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA +SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00 +BYTE_FLIP_MASK = %ymm13 + +X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK + +NUM_BLKS = %rdx # 3rd arg +INP = %rsi # 2nd arg +CTX = %rdi # 1st arg +c = %ecx +d = %r8d +e = %edx # clobbers NUM_BLKS +y3 = %esi # clobbers INP + +SRND = CTX # SRND is same register as CTX + +a = %eax +b = %ebx +f = %r9d +g = %r10d +h = %r11d +old_h = %r11d + +T1 = %r12d +y0 = %r13d +y1 = %r14d +y2 = %r15d + + +_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round +_XMM_SAVE_SIZE = 0 +_INP_END_SIZE = 8 +_INP_SIZE = 8 +_CTX_SIZE = 8 + +_XFER = 0 +_XMM_SAVE = _XFER + _XFER_SIZE +_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE +_INP = _INP_END + _INP_END_SIZE +_CTX = _INP + _INP_SIZE +STACK_SIZE = _CTX + _CTX_SIZE + +# rotate_Xs +# Rotate values of symbols X0...X3 +.macro rotate_Xs + X_ = X0 + X0 = X1 + X1 = X2 + X2 = X3 + X3 = X_ +.endm + +# ROTATE_ARGS +# Rotate values of symbols a...h +.macro ROTATE_ARGS + old_h = h + TMP_ = h + h = g + g = f + f = e + e = d + d = c + c = b + b = a + a = TMP_ +.endm + +.macro FOUR_ROUNDS_AND_SCHED disp +################################### RND N + 0 ############################ + + mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + + addl \disp(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] + mov f, y2 # y2 = f # CH + rorx $13, a, T1 # T1 = a >> 13 # S0B + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + xor g, y2 # y2 = f^g # CH + vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]# y1 = (e >> 6)# S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + + and e, y2 # y2 = (f^g)&e # CH + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $22, a, y1 # y1 = a >> 22 # S0A + add h, d # d = k + w + h + d # -- + + and b, y3 # y3 = (a|c)&b # MAJA + vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + vpsrld $7, XTMP1, XTMP2 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + + add y0, y2 # y2 = S1 + CH # -- + vpslld $(32-7), XTMP1, XTMP3 + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 + + vpsrld $18, XTMP1, XTMP2 + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + + ROTATE_ARGS + +################################### RND N + 1 ############################ + + mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + offset = \disp + 1*4 + 
addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 + mov f, y2 # y2 = f # CH + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + xor g, y2 # y2 = f^g # CH + + + rorx $6, e, y1 # y1 = (e >> 6) # S1 + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $22, a, y1 # y1 = a >> 22 # S0A + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + w + h + d # -- + + vpslld $(32-18), XTMP1, XTMP1 + and b, y3 # y3 = (a|c)&b # MAJA + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + + vpxor XTMP1, XTMP3, XTMP3 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 + vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} + + + ROTATE_ARGS + +################################### RND N + 2 ############################ + + mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + offset = \disp + 2*4 + addl offset(%rsp, SRND), h # h = k + w + h # -- + + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} + rorx $11, e, y1 # y1 = e >> 11 # S1B + or c, y3 # y3 = a|c # MAJA + mov f, y2 # y2 = f # CH + xor g, y2 # y2 = f^g # CH + + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} + and e, y2 # y2 = (f^g)&e # CH + + rorx $6, e, y1 # y1 = (e >> 6) # S1 + vpxor XTMP3, XTMP2, XTMP2 + add h, d # d = k + w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $22, a, y1 # y1 = a >> 22 # S0A + vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a ,T1 # T1 = (a >> 2) # S0 + vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} + + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1,h # h = k + w + h + S0 # -- + add y2,d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3,h # h = t1 + S0 + MAJ # -- + + + ROTATE_ARGS + +################################### RND N + 3 ############################ + + mov a, y3 # y3 = a # MAJA + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + offset = \disp + 3*4 + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} + mov f, y2 # y2 = f # CH + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + xor g, y2 # y2 = f^g # CH + + + vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + w 
+ h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + vpxor XTMP3, XTMP2, XTMP2 + rorx $22, a, y1 # y1 = a >> 22 # S0A + add y0, y2 # y2 = S1 + CH # -- + + vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + rorx $2, a, T1 # T1 = (a >> 2) # S0 + vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} + + vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + + add y1, h # h = k + w + h + S0 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + ROTATE_ARGS + rotate_Xs +.endm + +.macro DO_4ROUNDS disp +################################### RND N + 0 ########################### + + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + addl \disp(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + ROTATE_ARGS + +################################### RND N + 1 ########################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + offset = 4*1 + \disp + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + ROTATE_ARGS + +################################### RND N + 2 ############################## + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 = (e 
>> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + offset = 4*2 + \disp + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + ROTATE_ARGS + +################################### RND N + 3 ########################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $25, e, y0 # y0 = e >> 25 # S1A + rorx $11, e, y1 # y1 = e >> 11 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 + rorx $6, e, y1 # y1 = (e >> 6) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 + rorx $13, a, T1 # T1 = a >> 13 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $22, a, y1 # y1 = a >> 22 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 + rorx $2, a, T1 # T1 = (a >> 2) # S0 + offset = 4*3 + \disp + addl offset(%rsp, SRND), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3, h # h = t1 + S0 + MAJ # -- + + ROTATE_ARGS + +.endm + +######################################################################## +## void sha256_transform_rorx(struct sha256_block_state *state, +## const u8 *data, size_t nblocks); +######################################################################## +.text +SYM_FUNC_START(sha256_transform_rorx) + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + push %rbp + mov %rsp, %rbp + + subq $STACK_SIZE, %rsp + and $-32, %rsp # align rsp to 32 byte boundary + + shl $6, NUM_BLKS # convert to bytes + lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block + mov NUM_BLKS, _INP_END(%rsp) + + cmp NUM_BLKS, INP + je .Lonly_one_block + + ## load initial digest + mov (CTX), a + mov 4*1(CTX), b + mov 4*2(CTX), c + mov 4*3(CTX), d + mov 4*4(CTX), e + mov 4*5(CTX), f + mov 4*6(CTX), g + mov 4*7(CTX), h + + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + vmovdqa _SHUF_00BA(%rip), SHUF_00BA + vmovdqa _SHUF_DC00(%rip), SHUF_DC00 + + mov CTX, _CTX(%rsp) + +.Lloop0: + ## Load first 16 dwords from two blocks + VMOVDQ 0*32(INP),XTMP0 + VMOVDQ 1*32(INP),XTMP1 + VMOVDQ 2*32(INP),XTMP2 + VMOVDQ 3*32(INP),XTMP3 + + ## byte swap data + vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0 + vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1 + vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2 + vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3 + + ## transpose data into high/low halves + vperm2i128 $0x20, XTMP2, XTMP0, X0 + vperm2i128 $0x31, 
XTMP2, XTMP0, X1 + vperm2i128 $0x20, XTMP3, XTMP1, X2 + vperm2i128 $0x31, XTMP3, XTMP1, X3 + +.Llast_block_enter: + add $64, INP + mov INP, _INP(%rsp) + + ## schedule 48 input dwords, by doing 3 rounds of 12 each + xor SRND, SRND + +.align 16 +.Lloop1: + leaq K256+0*32(%rip), INP ## reuse INP as scratch reg + vpaddd (INP, SRND), X0, XFER + vmovdqa XFER, 0*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED (_XFER + 0*32) + + leaq K256+1*32(%rip), INP + vpaddd (INP, SRND), X0, XFER + vmovdqa XFER, 1*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED (_XFER + 1*32) + + leaq K256+2*32(%rip), INP + vpaddd (INP, SRND), X0, XFER + vmovdqa XFER, 2*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED (_XFER + 2*32) + + leaq K256+3*32(%rip), INP + vpaddd (INP, SRND), X0, XFER + vmovdqa XFER, 3*32+_XFER(%rsp, SRND) + FOUR_ROUNDS_AND_SCHED (_XFER + 3*32) + + add $4*32, SRND + cmp $3*4*32, SRND + jb .Lloop1 + +.Lloop2: + ## Do last 16 rounds with no scheduling + leaq K256+0*32(%rip), INP + vpaddd (INP, SRND), X0, XFER + vmovdqa XFER, 0*32+_XFER(%rsp, SRND) + DO_4ROUNDS (_XFER + 0*32) + + leaq K256+1*32(%rip), INP + vpaddd (INP, SRND), X1, XFER + vmovdqa XFER, 1*32+_XFER(%rsp, SRND) + DO_4ROUNDS (_XFER + 1*32) + add $2*32, SRND + + vmovdqa X2, X0 + vmovdqa X3, X1 + + cmp $4*4*32, SRND + jb .Lloop2 + + mov _CTX(%rsp), CTX + mov _INP(%rsp), INP + + addm (4*0)(CTX),a + addm (4*1)(CTX),b + addm (4*2)(CTX),c + addm (4*3)(CTX),d + addm (4*4)(CTX),e + addm (4*5)(CTX),f + addm (4*6)(CTX),g + addm (4*7)(CTX),h + + cmp _INP_END(%rsp), INP + ja .Ldone_hash + + #### Do second block using previously scheduled results + xor SRND, SRND +.align 16 +.Lloop3: + DO_4ROUNDS (_XFER + 0*32 + 16) + DO_4ROUNDS (_XFER + 1*32 + 16) + add $2*32, SRND + cmp $4*4*32, SRND + jb .Lloop3 + + mov _CTX(%rsp), CTX + mov _INP(%rsp), INP + add $64, INP + + addm (4*0)(CTX),a + addm (4*1)(CTX),b + addm (4*2)(CTX),c + addm (4*3)(CTX),d + addm (4*4)(CTX),e + addm (4*5)(CTX),f + addm (4*6)(CTX),g + addm (4*7)(CTX),h + + cmp _INP_END(%rsp), INP + jb .Lloop0 + ja .Ldone_hash + +.Ldo_last_block: + VMOVDQ 0*16(INP),XWORD0 + VMOVDQ 1*16(INP),XWORD1 + VMOVDQ 2*16(INP),XWORD2 + VMOVDQ 3*16(INP),XWORD3 + + vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0 + vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1 + vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2 + vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3 + + jmp .Llast_block_enter + +.Lonly_one_block: + + ## load initial digest + mov (4*0)(CTX),a + mov (4*1)(CTX),b + mov (4*2)(CTX),c + mov (4*3)(CTX),d + mov (4*4)(CTX),e + mov (4*5)(CTX),f + mov (4*6)(CTX),g + mov (4*7)(CTX),h + + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + vmovdqa _SHUF_00BA(%rip), SHUF_00BA + vmovdqa _SHUF_DC00(%rip), SHUF_DC00 + + mov CTX, _CTX(%rsp) + jmp .Ldo_last_block + +.Ldone_hash: + + mov %rbp, %rsp + pop %rbp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + vzeroupper + RET +SYM_FUNC_END(sha256_transform_rorx) + +.section .rodata.cst512.K256, "aM", @progbits, 512 +.align 64 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 
0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 +.align 32 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203 + +# shuffle xBxA -> 00BA +.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32 +.align 32 +_SHUF_00BA: + .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100 + +# shuffle xDxC -> DC00 +.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32 +.align 32 +_SHUF_DC00: + .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/lib/crypto/x86/sha256-ni-asm.S b/lib/crypto/x86/sha256-ni-asm.S new file mode 100644 index 000000000000..4bd9490ffc66 --- /dev/null +++ b/lib/crypto/x86/sha256-ni-asm.S @@ -0,0 +1,191 @@ +/* + * Intel SHA Extensions optimized implementation of a SHA-256 update function + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Sean Gulley <sean.m.gulley@intel.com> + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/linkage.h> + +#define STATE_PTR %rdi /* 1st arg */ +#define DATA_PTR %rsi /* 2nd arg */ +#define NUM_BLKS %rdx /* 3rd arg */ + +#define SHA256CONSTANTS %rax + +#define MSG %xmm0 /* sha256rnds2 implicit operand */ +#define STATE0 %xmm1 +#define STATE1 %xmm2 +#define MSG0 %xmm3 +#define MSG1 %xmm4 +#define MSG2 %xmm5 +#define MSG3 %xmm6 +#define TMP %xmm7 + +#define SHUF_MASK %xmm8 + +#define ABEF_SAVE %xmm9 +#define CDGH_SAVE %xmm10 + +.macro do_4rounds i, m0, m1, m2, m3 +.if \i < 16 + movdqu \i*4(DATA_PTR), \m0 + pshufb SHUF_MASK, \m0 +.endif + movdqa (\i-32)*4(SHA256CONSTANTS), MSG + paddd \m0, MSG + sha256rnds2 STATE0, STATE1 +.if \i >= 12 && \i < 60 + movdqa \m0, TMP + palignr $4, \m3, TMP + paddd TMP, \m1 + sha256msg2 \m0, \m1 +.endif + punpckhqdq MSG, MSG + sha256rnds2 STATE1, STATE0 +.if \i >= 4 && \i < 52 + sha256msg1 \m0, \m3 +.endif +.endm + +/* + * Intel SHA Extensions optimized implementation of a SHA-256 block function + * + * This function takes a pointer to the current SHA-256 state, a pointer to the + * input data, and the number of 64-byte blocks to process. Once all blocks + * have been processed, the state is updated with the new state. This function + * only processes complete blocks. State initialization, buffering of partial + * blocks, and digest finalization is expected to be handled elsewhere. 
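+ *
+ * Internally, each do_4rounds expansion covers four of the 64 rounds: the
+ * sha256rnds2 instruction is issued twice (two rounds per issue), the first
+ * 16 message dwords are loaded and byte-swapped as they are needed, and
+ * sha256msg1/sha256msg2 extend the message schedule for the later rounds.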
+ * + * void sha256_ni_transform(struct sha256_block_state *state, + * const u8 *data, size_t nblocks); + */ +.text +SYM_FUNC_START(sha256_ni_transform) + + shl $6, NUM_BLKS /* convert to bytes */ + add DATA_PTR, NUM_BLKS /* pointer to end of data */ + + /* + * load initial hash values + * Need to reorder these appropriately + * DCBA, HGFE -> ABEF, CDGH + */ + movdqu 0*16(STATE_PTR), STATE0 /* DCBA */ + movdqu 1*16(STATE_PTR), STATE1 /* HGFE */ + + movdqa STATE0, TMP + punpcklqdq STATE1, STATE0 /* FEBA */ + punpckhqdq TMP, STATE1 /* DCHG */ + pshufd $0x1B, STATE0, STATE0 /* ABEF */ + pshufd $0xB1, STATE1, STATE1 /* CDGH */ + + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK + lea K256+32*4(%rip), SHA256CONSTANTS + +.Lloop0: + /* Save hash values for addition after rounds */ + movdqa STATE0, ABEF_SAVE + movdqa STATE1, CDGH_SAVE + +.irp i, 0, 16, 32, 48 + do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3 + do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0 + do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1 + do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2 +.endr + + /* Add current hash values with previously saved */ + paddd ABEF_SAVE, STATE0 + paddd CDGH_SAVE, STATE1 + + /* Increment data pointer and loop if more to process */ + add $64, DATA_PTR + cmp NUM_BLKS, DATA_PTR + jne .Lloop0 + + /* Write hash values back in the correct order */ + movdqa STATE0, TMP + punpcklqdq STATE1, STATE0 /* GHEF */ + punpckhqdq TMP, STATE1 /* ABCD */ + pshufd $0xB1, STATE0, STATE0 /* HGFE */ + pshufd $0x1B, STATE1, STATE1 /* DCBA */ + + movdqu STATE1, 0*16(STATE_PTR) + movdqu STATE0, 1*16(STATE_PTR) + + RET +SYM_FUNC_END(sha256_ni_transform) + +.section .rodata.cst256.K256, "aM", @progbits, 256 +.align 64 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 +.align 16 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 diff --git a/lib/crypto/x86/sha256-ssse3-asm.S b/lib/crypto/x86/sha256-ssse3-asm.S new file mode 100644 index 000000000000..383b8eec7ebe --- /dev/null +++ b/lib/crypto/x86/sha256-ssse3-asm.S @@ -0,0 +1,505 @@ +######################################################################## +# Implement fast SHA-256 with SSSE3 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford <james.guilford@intel.com> +# Kirk Yap <kirk.s.yap@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. 
You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-256 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## + +#include <linux/linkage.h> + +## assume buffers not aligned +#define MOVDQ movdqu + +################################ Define Macros + +# addm [mem], reg +# Add reg to mem using reg-mem add and store +.macro addm p1 p2 + add \p1, \p2 + mov \p2, \p1 +.endm + +################################ + +# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask +# Load xmm with mem and byte swap each dword +.macro COPY_XMM_AND_BSWAP p1 p2 p3 + MOVDQ \p2, \p1 + pshufb \p3, \p1 +.endm + +################################ + +X0 = %xmm4 +X1 = %xmm5 +X2 = %xmm6 +X3 = %xmm7 + +XTMP0 = %xmm0 +XTMP1 = %xmm1 +XTMP2 = %xmm2 +XTMP3 = %xmm3 +XTMP4 = %xmm8 +XFER = %xmm9 + +SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA +SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00 +BYTE_FLIP_MASK = %xmm12 + +NUM_BLKS = %rdx # 3rd arg +INP = %rsi # 2nd arg +CTX = %rdi # 1st arg + +SRND = %rsi # clobbers INP +c = %ecx +d = %r8d +e = %edx +TBL = %r12 +a = %eax +b = %ebx + +f = %r9d +g = %r10d +h = %r11d + +y0 = %r13d +y1 = %r14d +y2 = %r15d + + + +_INP_END_SIZE = 8 +_INP_SIZE = 8 +_XFER_SIZE = 16 +_XMM_SAVE_SIZE = 0 + +_INP_END = 0 +_INP = _INP_END + _INP_END_SIZE +_XFER = _INP + _INP_SIZE +_XMM_SAVE = _XFER + _XFER_SIZE +STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE + +# rotate_Xs +# Rotate values of symbols X0...X3 +.macro rotate_Xs +X_ = X0 +X0 = X1 +X1 = X2 +X2 = X3 +X3 = X_ +.endm + +# ROTATE_ARGS +# Rotate values of symbols a...h +.macro ROTATE_ARGS +TMP_ = h +h = g +g = f +f = e +e = d +d = c +c = b +b = a +a = TMP_ +.endm + +.macro FOUR_ROUNDS_AND_SCHED + ## compute s0 four at a time and s1 two at a time + ## compute W[-16] + W[-7] 4 at a time + movdqa X3, XTMP0 + mov e, y0 # y0 = e + ror $(25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + palignr $4, X2, XTMP0 # XTMP0 = W[-7] + ror $(22-13), y1 # y1 = a >> (22-13) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + movdqa X1, XTMP1 + xor a, y1 # y1 = a ^ (a >> (22-13) + 
xor g, y2 # y2 = f^g + paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16] + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + ## compute s0 + palignr $4, X0, XTMP1 # XTMP1 = W[-15] + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor g, y2 # y2 = CH = ((f^g)&e)^g + movdqa XTMP1, XTMP2 # XTMP2 = W[-15] + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y0, y2 # y2 = S1 + CH + add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH + movdqa XTMP1, XTMP3 # XTMP3 = W[-15] + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + pslld $(32-7), XTMP1 # + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + psrld $7, XTMP2 # + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + # + ROTATE_ARGS # + movdqa XTMP3, XTMP2 # XTMP2 = W[-15] + mov e, y0 # y0 = e + mov a, y1 # y1 = a + movdqa XTMP3, XTMP4 # XTMP4 = W[-15] + ror $(25-11), y0 # y0 = e >> (25-11) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + ror $(22-13), y1 # y1 = a >> (22-13) + pslld $(32-18), XTMP3 # + xor a, y1 # y1 = a ^ (a >> (22-13) + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor g, y2 # y2 = f^g + psrld $18, XTMP2 # + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + pxor XTMP3, XTMP1 + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor g, y2 # y2 = CH = ((f^g)&e)^g + psrld $3, XTMP4 # XTMP4 = W[-15] >> 3 + add y0, y2 # y2 = S1 + CH + add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + pxor XTMP4, XTMP1 # XTMP1 = s0 + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + ## compute low s1 + pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + + ROTATE_ARGS + movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA} + mov e, y0 # y0 = e + mov a, y1 # y1 = a + ror $(25-11), y0 # y0 = e >> (25-11) + movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA} + xor e, y0 # y0 = e ^ (e >> (25-11)) + ror $(22-13), y1 # y1 = a >> (22-13) + mov f, y2 # y2 = f + xor a, y1 # y1 = a ^ (a >> (22-13) + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} + xor g, y2 # y2 = f^g + psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and e, y2 # y2 = (f^g)&e + psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor g, y2 # y2 = CH = ((f^g)&e)^g + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + pxor XTMP3, XTMP2 + add y0, y2 # y2 = S1 + CH + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA} + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 
= a + pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA} + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + ## compute high s1 + pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {BBAA} + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + # + ROTATE_ARGS # + movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC} + mov e, y0 # y0 = e + ror $(25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + movdqa XTMP2, X0 # X0 = W[-2] {DDCC} + ror $(22-13), y1 # y1 = a >> (22-13) + xor e, y0 # y0 = e ^ (e >> (25-11)) + mov f, y2 # y2 = f + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} + xor a, y1 # y1 = a ^ (a >> (22-13) + xor g, y2 # y2 = f^g + psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25 + and e, y2 # y2 = (f^g)&e + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + psrld $10, X0 # X0 = W[-2] >> 10 {DDCC} + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22 + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>2 + xor g, y2 # y2 = CH = ((f^g)&e)^g + pxor XTMP3, XTMP2 # + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>2 + add y0, y2 # y2 = S1 + CH + add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH + pxor XTMP2, X0 # X0 = s1 {xDxC} + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + pshufb SHUF_DC00, X0 # X0 = s1 {DC00} + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]} + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + + ROTATE_ARGS + rotate_Xs +.endm + +## input is [rsp + _XFER + %1 * 4] +.macro DO_ROUND round + mov e, y0 # y0 = e + ror $(25-11), y0 # y0 = e >> (25-11) + mov a, y1 # y1 = a + xor e, y0 # y0 = e ^ (e >> (25-11)) + ror $(22-13), y1 # y1 = a >> (22-13) + mov f, y2 # y2 = f + xor a, y1 # y1 = a ^ (a >> (22-13) + ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor g, y2 # y2 = f^g + xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) + and e, y2 # y2 = (f^g)&e + xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor g, y2 # y2 = CH = ((f^g)&e)^g + add y0, y2 # y2 = S1 + CH + ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + offset = \round * 4 + _XFER + add offset(%rsp), y2 # y2 = k + w + S1 + CH + mov a, y0 # y0 = a + add y2, h # h = h + S1 + CH + k + w + mov a, y2 # y2 = a + or c, y0 # y0 = a|c + add h, d # d = d + h + S1 + CH + k + w + and c, y2 # y2 = a&c + and b, y0 # y0 = (a|c)&b + add y1, h # h = h + S1 + CH + k + w + S0 + or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) + add y0, h # h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS +.endm + +######################################################################## +## void sha256_transform_ssse3(struct sha256_block_state *state, +## const u8 *data, size_t nblocks); +######################################################################## +.text +SYM_FUNC_START(sha256_transform_ssse3) + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbp + mov %rsp, %rbp + + subq $STACK_SIZE, %rsp + and $~15, %rsp + + shl $6, NUM_BLKS # convert to bytes + add INP, NUM_BLKS + mov NUM_BLKS, _INP_END(%rsp) # pointer to end of 
data + + ## load initial digest + mov 4*0(CTX), a + mov 4*1(CTX), b + mov 4*2(CTX), c + mov 4*3(CTX), d + mov 4*4(CTX), e + mov 4*5(CTX), f + mov 4*6(CTX), g + mov 4*7(CTX), h + + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + movdqa _SHUF_00BA(%rip), SHUF_00BA + movdqa _SHUF_DC00(%rip), SHUF_DC00 + +.Lloop0: + lea K256(%rip), TBL + + ## byte swap first 16 dwords + COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK + + mov INP, _INP(%rsp) + + ## schedule 48 input dwords, by doing 3 rounds of 16 each + mov $3, SRND +.align 16 +.Lloop1: + movdqa (TBL), XFER + paddd X0, XFER + movdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + movdqa 1*16(TBL), XFER + paddd X0, XFER + movdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + movdqa 2*16(TBL), XFER + paddd X0, XFER + movdqa XFER, _XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + movdqa 3*16(TBL), XFER + paddd X0, XFER + movdqa XFER, _XFER(%rsp) + add $4*16, TBL + FOUR_ROUNDS_AND_SCHED + + sub $1, SRND + jne .Lloop1 + + mov $2, SRND +.Lloop2: + paddd (TBL), X0 + movdqa X0, _XFER(%rsp) + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + paddd 1*16(TBL), X1 + movdqa X1, _XFER(%rsp) + add $2*16, TBL + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + movdqa X2, X0 + movdqa X3, X1 + + sub $1, SRND + jne .Lloop2 + + addm (4*0)(CTX),a + addm (4*1)(CTX),b + addm (4*2)(CTX),c + addm (4*3)(CTX),d + addm (4*4)(CTX),e + addm (4*5)(CTX),f + addm (4*6)(CTX),g + addm (4*7)(CTX),h + + mov _INP(%rsp), INP + add $64, INP + cmp _INP_END(%rsp), INP + jne .Lloop0 + + mov %rbp, %rsp + popq %rbp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + + RET +SYM_FUNC_END(sha256_transform_ssse3) + +.section .rodata.cst256.K256, "aM", @progbits, 256 +.align 64 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 +.align 16 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 + +.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16 +.align 16 +# shuffle xBxA -> 00BA +_SHUF_00BA: + .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 + +.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16 +.align 16 +# shuffle xDxC -> DC00 +_SHUF_DC00: + .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/lib/crypto/x86/sha256.h b/lib/crypto/x86/sha256.h new file mode 100644 index 000000000000..669bc06538b6 --- /dev/null +++ b/lib/crypto/x86/sha256.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-256 optimized for x86_64 + * + * Copyright 2025 Google LLC + */ +#include <asm/fpu/api.h> +#include <crypto/internal/simd.h> +#include <linux/static_call.h> + 
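+/*
+ * Implementation dispatch: the static call below starts out pointing at
+ * sha256_blocks_generic() and is retargeted once at boot by
+ * sha256_mod_init_arch() to the fastest variant the CPU supports (SHA-NI,
+ * AVX2+BMI2, AVX, or SSSE3).  Each wrapper defined by DEFINE_X86_SHA256_FN
+ * brackets the assembly routine with kernel_fpu_begin()/kernel_fpu_end() and
+ * falls back to the generic code when kernel-mode SIMD is not usable.
+ */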
+DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic); + +#define DEFINE_X86_SHA256_FN(c_fn, asm_fn) \ + asmlinkage void asm_fn(struct sha256_block_state *state, \ + const u8 *data, size_t nblocks); \ + static void c_fn(struct sha256_block_state *state, const u8 *data, \ + size_t nblocks) \ + { \ + if (likely(crypto_simd_usable())) { \ + kernel_fpu_begin(); \ + asm_fn(state, data, nblocks); \ + kernel_fpu_end(); \ + } else { \ + sha256_blocks_generic(state, data, nblocks); \ + } \ + } + +DEFINE_X86_SHA256_FN(sha256_blocks_ssse3, sha256_transform_ssse3); +DEFINE_X86_SHA256_FN(sha256_blocks_avx, sha256_transform_avx); +DEFINE_X86_SHA256_FN(sha256_blocks_avx2, sha256_transform_rorx); +DEFINE_X86_SHA256_FN(sha256_blocks_ni, sha256_ni_transform); + +static void sha256_blocks(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + static_call(sha256_blocks_x86)(state, data, nblocks); +} + +#define sha256_mod_init_arch sha256_mod_init_arch +static inline void sha256_mod_init_arch(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) { + static_call_update(sha256_blocks_x86, sha256_blocks_ni); + } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + NULL) && + boot_cpu_has(X86_FEATURE_AVX)) { + if (boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_BMI2)) + static_call_update(sha256_blocks_x86, + sha256_blocks_avx2); + else + static_call_update(sha256_blocks_x86, + sha256_blocks_avx); + } else if (boot_cpu_has(X86_FEATURE_SSSE3)) { + static_call_update(sha256_blocks_x86, sha256_blocks_ssse3); + } +} diff --git a/lib/crypto/x86/sha512-avx-asm.S b/lib/crypto/x86/sha512-avx-asm.S new file mode 100644 index 000000000000..7732aa8fd850 --- /dev/null +++ b/lib/crypto/x86/sha512-avx-asm.S @@ -0,0 +1,420 @@ +######################################################################## +# Implement fast SHA-512 with AVX instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford <james.guilford@intel.com> +# Kirk Yap <kirk.s.yap@intel.com> +# David Cote <david.m.cote@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-512 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## + +#include <linux/linkage.h> + +.text + +# Virtual Registers +# ARG1 +digest = %rdi +# ARG2 +msg = %rsi +# ARG3 +msglen = %rdx +T1 = %rcx +T2 = %r8 +a_64 = %r9 +b_64 = %r10 +c_64 = %r11 +d_64 = %r12 +e_64 = %r13 +f_64 = %r14 +g_64 = %r15 +h_64 = %rbx +tmp0 = %rax + +# Local variables (stack frame) + +# Message Schedule +W_SIZE = 80*8 +# W[t] + K[t] | W[t+1] + K[t+1] +WK_SIZE = 2*8 + +frame_W = 0 +frame_WK = frame_W + W_SIZE +frame_size = frame_WK + WK_SIZE + +# Useful QWORD "arrays" for simpler memory references +# MSG, DIGEST, K_t, W_t are arrays +# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even + +# Input message (arg1) +#define MSG(i) 8*i(msg) + +# Output Digest (arg2) +#define DIGEST(i) 8*i(digest) + +# SHA Constants (static mem) +#define K_t(i) 8*i+K512(%rip) + +# Message Schedule (stack frame) +#define W_t(i) 8*i+frame_W(%rsp) + +# W[t]+K[t] (stack frame) +#define WK_2(i) 8*((i%2))+frame_WK(%rsp) + +.macro RotateState + # Rotate symbols a..h right + TMP = h_64 + h_64 = g_64 + g_64 = f_64 + f_64 = e_64 + e_64 = d_64 + d_64 = c_64 + c_64 = b_64 + b_64 = a_64 + a_64 = TMP +.endm + +.macro RORQ p1 p2 + # shld is faster than ror on Sandybridge + shld $(64-\p2), \p1, \p1 +.endm + +.macro SHA512_Round rnd + # Compute Round %%t + mov f_64, T1 # T1 = f + mov e_64, tmp0 # tmp = e + xor g_64, T1 # T1 = f ^ g + RORQ tmp0, 23 # 41 # tmp = e ror 23 + and e_64, T1 # T1 = (f ^ g) & e + xor e_64, tmp0 # tmp = (e ror 23) ^ e + xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g) + idx = \rnd + add WK_2(idx), T1 # W[t] + K[t] from message scheduler + RORQ tmp0, 4 # 18 # tmp = ((e ror 23) ^ e) ror 4 + xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e + mov a_64, T2 # T2 = a + add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h + RORQ tmp0, 14 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) + add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e) + mov a_64, tmp0 # tmp = a + xor c_64, T2 # T2 = a ^ c + and c_64, tmp0 # tmp = a & c + and b_64, T2 # T2 = (a ^ c) & b + xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) + mov a_64, tmp0 # tmp = a + RORQ tmp0, 5 # 39 # tmp = a ror 5 + xor a_64, tmp0 # tmp = (a ror 5) ^ a + add T1, d_64 # e(next_state) = d + T1 + RORQ tmp0, 6 # 34 # tmp = ((a ror 5) ^ a) ror 6 + xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a + lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c) + RORQ tmp0, 28 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) + add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a) + RotateState +.endm + +.macro SHA512_2Sched_2Round_avx rnd + # Compute rounds t-2 and t-1 + # Compute message schedule QWORDS t and t+1 + + # Two rounds are computed based on the values for K[t-2]+W[t-2] and + # K[t-1]+W[t-1] which were previously stored at WK_2 by the message + # scheduler. + # The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)]. + # They are then added to their respective SHA512 constants at + # [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)] + # For brievity, the comments following vectored instructions only refer to + # the first of a pair of QWORDS. + # Eg. 
XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} + # The computation of the message schedule and the rounds are tightly + # stitched to take advantage of instruction-level parallelism. + + idx = \rnd - 2 + vmovdqa W_t(idx), %xmm4 # XMM4 = W[t-2] + idx = \rnd - 15 + vmovdqu W_t(idx), %xmm5 # XMM5 = W[t-15] + mov f_64, T1 + vpsrlq $61, %xmm4, %xmm0 # XMM0 = W[t-2]>>61 + mov e_64, tmp0 + vpsrlq $1, %xmm5, %xmm6 # XMM6 = W[t-15]>>1 + xor g_64, T1 + RORQ tmp0, 23 # 41 + vpsrlq $19, %xmm4, %xmm1 # XMM1 = W[t-2]>>19 + and e_64, T1 + xor e_64, tmp0 + vpxor %xmm1, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 + xor g_64, T1 + idx = \rnd + add WK_2(idx), T1# + vpsrlq $8, %xmm5, %xmm7 # XMM7 = W[t-15]>>8 + RORQ tmp0, 4 # 18 + vpsrlq $6, %xmm4, %xmm2 # XMM2 = W[t-2]>>6 + xor e_64, tmp0 + mov a_64, T2 + add h_64, T1 + vpxor %xmm7, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 + RORQ tmp0, 14 # 14 + add tmp0, T1 + vpsrlq $7, %xmm5, %xmm8 # XMM8 = W[t-15]>>7 + mov a_64, tmp0 + xor c_64, T2 + vpsllq $(64-61), %xmm4, %xmm3 # XMM3 = W[t-2]<<3 + and c_64, tmp0 + and b_64, T2 + vpxor %xmm3, %xmm2, %xmm2 # XMM2 = W[t-2]>>6 ^ W[t-2]<<3 + xor tmp0, T2 + mov a_64, tmp0 + vpsllq $(64-1), %xmm5, %xmm9 # XMM9 = W[t-15]<<63 + RORQ tmp0, 5 # 39 + vpxor %xmm9, %xmm8, %xmm8 # XMM8 = W[t-15]>>7 ^ W[t-15]<<63 + xor a_64, tmp0 + add T1, d_64 + RORQ tmp0, 6 # 34 + xor a_64, tmp0 + vpxor %xmm8, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ + # W[t-15]>>7 ^ W[t-15]<<63 + lea (T1, T2), h_64 + RORQ tmp0, 28 # 28 + vpsllq $(64-19), %xmm4, %xmm4 # XMM4 = W[t-2]<<25 + add tmp0, h_64 + RotateState + vpxor %xmm4, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ + # W[t-2]<<25 + mov f_64, T1 + vpxor %xmm2, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + mov e_64, tmp0 + xor g_64, T1 + idx = \rnd - 16 + vpaddq W_t(idx), %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + idx = \rnd - 7 + vmovdqu W_t(idx), %xmm1 # XMM1 = W[t-7] + RORQ tmp0, 23 # 41 + and e_64, T1 + xor e_64, tmp0 + xor g_64, T1 + vpsllq $(64-8), %xmm5, %xmm5 # XMM5 = W[t-15]<<56 + idx = \rnd + 1 + add WK_2(idx), T1 + vpxor %xmm5, %xmm6, %xmm6 # XMM6 = s0(W[t-15]) + RORQ tmp0, 4 # 18 + vpaddq %xmm6, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) + xor e_64, tmp0 + vpaddq %xmm1, %xmm0, %xmm0 # XMM0 = W[t] = s1(W[t-2]) + W[t-7] + + # s0(W[t-15]) + W[t-16] + mov a_64, T2 + add h_64, T1 + RORQ tmp0, 14 # 14 + add tmp0, T1 + idx = \rnd + vmovdqa %xmm0, W_t(idx) # Store W[t] + vpaddq K_t(idx), %xmm0, %xmm0 # Compute W[t]+K[t] + vmovdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds + mov a_64, tmp0 + xor c_64, T2 + and c_64, tmp0 + and b_64, T2 + xor tmp0, T2 + mov a_64, tmp0 + RORQ tmp0, 5 # 39 + xor a_64, tmp0 + add T1, d_64 + RORQ tmp0, 6 # 34 + xor a_64, tmp0 + lea (T1, T2), h_64 + RORQ tmp0, 28 # 28 + add tmp0, h_64 + RotateState +.endm + +######################################################################## +# void sha512_transform_avx(struct sha512_block_state *state, +# const u8 *data, size_t nblocks); +# Purpose: Updates the SHA512 digest stored at "state" with the message +# stored in "data". +# The size of the message pointed to by "data" must be an integer multiple +# of SHA512 message blocks. +# "nblocks" is the message length in SHA512 blocks. Must be >= 1. 
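+# The 80 rounds are computed two per iteration, with the message scheduler
+# running one iteration ahead of the round computation; the stack frame holds
+# the full 80-qword message schedule W[] plus a two-qword W[t]+K[t] staging
+# area (WK_2) consumed by the round macros.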
+######################################################################## +SYM_FUNC_START(sha512_transform_avx) + + # Save GPRs + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + # Allocate Stack Space + push %rbp + mov %rsp, %rbp + sub $frame_size, %rsp + and $~(0x20 - 1), %rsp + +.Lupdateblock: + + # Load state variables + mov DIGEST(0), a_64 + mov DIGEST(1), b_64 + mov DIGEST(2), c_64 + mov DIGEST(3), d_64 + mov DIGEST(4), e_64 + mov DIGEST(5), f_64 + mov DIGEST(6), g_64 + mov DIGEST(7), h_64 + + t = 0 + .rept 80/2 + 1 + # (80 rounds) / (2 rounds/iteration) + (1 iteration) + # +1 iteration because the scheduler leads hashing by 1 iteration + .if t < 2 + # BSWAP 2 QWORDS + vmovdqa XMM_QWORD_BSWAP(%rip), %xmm1 + vmovdqu MSG(t), %xmm0 + vpshufb %xmm1, %xmm0, %xmm0 # BSWAP + vmovdqa %xmm0, W_t(t) # Store Scheduled Pair + vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t] + vmovdqa %xmm0, WK_2(t) # Store into WK for rounds + .elseif t < 16 + # BSWAP 2 QWORDS# Compute 2 Rounds + vmovdqu MSG(t), %xmm0 + vpshufb %xmm1, %xmm0, %xmm0 # BSWAP + SHA512_Round t-2 # Round t-2 + vmovdqa %xmm0, W_t(t) # Store Scheduled Pair + vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t] + SHA512_Round t-1 # Round t-1 + vmovdqa %xmm0, WK_2(t)# Store W[t]+K[t] into WK + .elseif t < 79 + # Schedule 2 QWORDS# Compute 2 Rounds + SHA512_2Sched_2Round_avx t + .else + # Compute 2 Rounds + SHA512_Round t-2 + SHA512_Round t-1 + .endif + t = t+2 + .endr + + # Update digest + add a_64, DIGEST(0) + add b_64, DIGEST(1) + add c_64, DIGEST(2) + add d_64, DIGEST(3) + add e_64, DIGEST(4) + add f_64, DIGEST(5) + add g_64, DIGEST(6) + add h_64, DIGEST(7) + + # Advance to next message block + add $16*8, msg + dec msglen + jnz .Lupdateblock + + # Restore Stack Pointer + mov %rbp, %rsp + pop %rbp + + # Restore GPRs + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + + RET +SYM_FUNC_END(sha512_transform_avx) + +######################################################################## +### Binary Data + +.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16 +.align 16 +# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. +XMM_QWORD_BSWAP: + .octa 0x08090a0b0c0d0e0f0001020304050607 + +# Mergeable 640-byte rodata section. This allows linker to merge the table +# with other, exactly the same 640-byte fragment of another rodata section +# (if such section exists). 
+.section .rodata.cst640.K512, "aM", @progbits, 640 +.align 64 +# K[t] used in SHA512 hashing +K512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 diff --git a/lib/crypto/x86/sha512-avx2-asm.S b/lib/crypto/x86/sha512-avx2-asm.S new file mode 100644 index 000000000000..22bdbfd899d0 --- /dev/null +++ b/lib/crypto/x86/sha512-avx2-asm.S @@ -0,0 +1,748 @@ +######################################################################## +# Implement fast SHA-512 with AVX2 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford <james.guilford@intel.com> +# Kirk Yap <kirk.s.yap@intel.com> +# David Cote <david.m.cote@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-512 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## +# This code schedules 1 blocks at a time, with 4 lanes per block +######################################################################## + +#include <linux/linkage.h> + +.text + +# Virtual Registers +Y_0 = %ymm4 +Y_1 = %ymm5 +Y_2 = %ymm6 +Y_3 = %ymm7 + +YTMP0 = %ymm0 +YTMP1 = %ymm1 +YTMP2 = %ymm2 +YTMP3 = %ymm3 +YTMP4 = %ymm8 +XFER = YTMP0 + +BYTE_FLIP_MASK = %ymm9 + +# 1st arg is %rdi, which is saved to the stack and accessed later via %r12 +CTX1 = %rdi +CTX2 = %r12 +# 2nd arg +INP = %rsi +# 3rd arg +NUM_BLKS = %rdx + +c = %rcx +d = %r8 +e = %rdx +y3 = %rsi + +TBL = %rdi # clobbers CTX1 + +a = %rax +b = %rbx + +f = %r9 +g = %r10 +h = %r11 +old_h = %r11 + +T1 = %r12 # clobbers CTX2 +y0 = %r13 +y1 = %r14 +y2 = %r15 + +# Local variables (stack frame) +XFER_SIZE = 4*8 +SRND_SIZE = 1*8 +INP_SIZE = 1*8 +INPEND_SIZE = 1*8 +CTX_SIZE = 1*8 + +frame_XFER = 0 +frame_SRND = frame_XFER + XFER_SIZE +frame_INP = frame_SRND + SRND_SIZE +frame_INPEND = frame_INP + INP_SIZE +frame_CTX = frame_INPEND + INPEND_SIZE +frame_size = frame_CTX + CTX_SIZE + +## assume buffers not aligned +#define VMOVDQ vmovdqu + +# addm [mem], reg +# Add reg to mem using reg-mem add and store +.macro addm p1 p2 + add \p1, \p2 + mov \p2, \p1 +.endm + + +# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask +# Load ymm with mem and byte swap each dword +.macro COPY_YMM_AND_BSWAP p1 p2 p3 + VMOVDQ \p2, \p1 + vpshufb \p3, \p1, \p1 +.endm +# rotate_Ys +# Rotate values of symbols Y0...Y3 +.macro rotate_Ys + Y_ = Y_0 + Y_0 = Y_1 + Y_1 = Y_2 + Y_2 = Y_3 + Y_3 = Y_ +.endm + +# RotateState +.macro RotateState + # Rotate symbols a..h right + old_h = h + TMP_ = h + h = g + g = f + f = e + e = d + d = c + c = b + b = a + a = TMP_ +.endm + +# macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL +# YDST = {YSRC1, YSRC2} >> RVAL*8 +.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL + vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI} + vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8 +.endm + +.macro FOUR_ROUNDS_AND_SCHED +################################### RND N + 0 ######################################### + + # Extract w[t-7] + MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7] + # Calculate w[t-16] + w[t-7] + vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16] + # Extract w[t-15] + MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15] + + # Calculate sigma0 + + # Calculate w[t-15] ror 1 + vpsrlq $1, YTMP1, YTMP2 + vpsllq $(64-1), YTMP1, YTMP3 + vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 + # Calculate w[t-15] shr 7 + vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7 + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + add frame_XFER(%rsp),h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + mov f, y2 # y2 = f # CH + rorx $34, a, T1 # T1 = a >> 34 # S0B + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + xor g, y2 # y2 = f^g # CH + rorx $14, e, y1 # y1 = (e >> 
14) # S1 + + and e, y2 # y2 = (f^g)&e # CH + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $39, a, y1 # y1 = a >> 39 # S0A + add h, d # d = k + w + h + d # -- + + and b, y3 # y3 = (a|c)&b # MAJA + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + + add y0, y2 # y2 = S1 + CH # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + +################################### RND N + 1 ######################################### + + # Calculate w[t-15] ror 8 + vpsrlq $8, YTMP1, YTMP2 + vpsllq $(64-8), YTMP1, YTMP1 + vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8 + # XOR the three components + vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 + vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0 + + + # Add three components, w[t-16], w[t-7] and sigma0 + vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 + # Move to appropriate lanes for calculating w[16] and w[17] + vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA} + # Move to appropriate lanes for calculating w[18] and w[19] + vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00} + + # Calculate w[16] and w[17] in both 128 bit lanes + + # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes + vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA} + vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA} + + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + add 1*8+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + mov f, y2 # y2 = f # CH + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + xor g, y2 # y2 = f^g # CH + + + rorx $14, e, y1 # y1 = (e >> 14) # S1 + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $39, a, y1 # y1 = a >> 39 # S0A + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + w + h + d # -- + + and b, y3 # y3 = (a|c)&b # MAJA + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + + rorx $28, a, T1 # T1 = (a >> 28) # S0 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + + +################################### RND N + 2 ######################################### + + vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA} + vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} + vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA} + vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ + # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} + + # Add sigma1 to the other compunents to get w[16] and w[17] + vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]} + + 
# Calculate sigma1 for w[18] and w[19] for upper 128 bit lane + vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--} + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + add 2*8+frame_XFER(%rsp), h # h = k + w + h # -- + + rorx $18, e, y1 # y1 = e >> 18 # S1B + or c, y3 # y3 = a|c # MAJA + mov f, y2 # y2 = f # CH + xor g, y2 # y2 = f^g # CH + + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + and e, y2 # y2 = (f^g)&e # CH + + rorx $14, e, y1 # y1 = (e >> 14) # S1 + add h, d # d = k + w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $39, a, y1 # y1 = a >> 39 # S0A + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + +################################### RND N + 3 ######################################### + + vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--} + vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} + vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--} + vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ + # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} + + # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] + # to newly calculated sigma1 to get w[18] and w[19] + vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --} + + # Form w[19, w[18], w17], w[16] + vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]} + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + add 3*8+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + mov f, y2 # y2 = f # CH + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + xor g, y2 # y2 = f^g # CH + + + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + rorx $39, a, y1 # y1 = a >> 39 # S0A + add y0, y2 # y2 = S1 + CH # -- + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + rorx $28, a, T1 # T1 = (a >> 28) # S0 + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + + add y1, h # h = k + w + h + S0 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + + rotate_Ys +.endm + +.macro DO_4ROUNDS + +################################### RND N + 0 ######################################### + + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = 
(f^g)&e # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + RotateState + +################################### RND N + 1 ######################################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add 8*1+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + RotateState + +################################### RND N + 2 ######################################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add 8*2+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + RotateState + +################################### RND N + 3 ######################################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 
+ S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add 8*3+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + +.endm + +######################################################################## +# void sha512_transform_rorx(struct sha512_block_state *state, +# const u8 *data, size_t nblocks); +# Purpose: Updates the SHA512 digest stored at "state" with the message +# stored in "data". +# The size of the message pointed to by "data" must be an integer multiple +# of SHA512 message blocks. +# "nblocks" is the message length in SHA512 blocks. Must be >= 1. +######################################################################## +SYM_FUNC_START(sha512_transform_rorx) + + # Save GPRs + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + # Allocate Stack Space + push %rbp + mov %rsp, %rbp + sub $frame_size, %rsp + and $~(0x20 - 1), %rsp + + shl $7, NUM_BLKS # convert to bytes + add INP, NUM_BLKS # pointer to end of data + mov NUM_BLKS, frame_INPEND(%rsp) + + ## load initial digest + mov 8*0(CTX1), a + mov 8*1(CTX1), b + mov 8*2(CTX1), c + mov 8*3(CTX1), d + mov 8*4(CTX1), e + mov 8*5(CTX1), f + mov 8*6(CTX1), g + mov 8*7(CTX1), h + + # save %rdi (CTX) before it gets clobbered + mov %rdi, frame_CTX(%rsp) + + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + +.Lloop0: + lea K512(%rip), TBL + + ## byte swap first 16 dwords + COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK + + mov INP, frame_INP(%rsp) + + ## schedule 64 input dwords, by doing 12 rounds of 4 each + movq $4, frame_SRND(%rsp) + +.align 16 +.Lloop1: + vpaddq (TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddq 1*32(TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddq 2*32(TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddq 3*32(TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + add $(4*32), TBL + FOUR_ROUNDS_AND_SCHED + + subq $1, frame_SRND(%rsp) + jne .Lloop1 + + movq $2, frame_SRND(%rsp) +.Lloop2: + vpaddq (TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + DO_4ROUNDS + vpaddq 1*32(TBL), Y_1, XFER + vmovdqa XFER, frame_XFER(%rsp) + add $(2*32), TBL + DO_4ROUNDS + + vmovdqa Y_2, Y_0 + vmovdqa Y_3, Y_1 + + subq $1, frame_SRND(%rsp) + jne .Lloop2 + + mov frame_CTX(%rsp), CTX2 + addm 8*0(CTX2), a + addm 8*1(CTX2), b + addm 8*2(CTX2), c + addm 8*3(CTX2), d + addm 8*4(CTX2), e + addm 8*5(CTX2), f + addm 8*6(CTX2), g + addm 8*7(CTX2), h + + mov frame_INP(%rsp), INP + add $128, INP + cmp frame_INPEND(%rsp), INP + jne .Lloop0 + + # Restore Stack Pointer + mov %rbp, %rsp + pop %rbp + + # Restore GPRs + pop %r15 + pop %r14 + pop %r13 + pop %r12 + 
pop %rbx + + vzeroupper + RET +SYM_FUNC_END(sha512_transform_rorx) + +######################################################################## +### Binary Data + + +# Mergeable 640-byte rodata section. This allows linker to merge the table +# with other, exactly the same 640-byte fragment of another rodata section +# (if such section exists). +.section .rodata.cst640.K512, "aM", @progbits, 640 +.align 64 +# K[t] used in SHA512 hashing +K512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 +.align 32 +# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x08090a0b0c0d0e0f0001020304050607 + .octa 0x18191a1b1c1d1e1f1011121314151617 + +.section .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32 +.align 32 +MASK_YMM_LO: + .octa 0x00000000000000000000000000000000 + .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF diff --git a/lib/crypto/x86/sha512-ssse3-asm.S b/lib/crypto/x86/sha512-ssse3-asm.S new file mode 100644 index 000000000000..4cae7445b2a8 --- /dev/null +++ b/lib/crypto/x86/sha512-ssse3-asm.S @@ -0,0 +1,419 @@ +######################################################################## +# Implement fast SHA-512 with SSSE3 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford <james.guilford@intel.com> +# Kirk Yap <kirk.s.yap@intel.com> +# David Cote <david.m.cote@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. 
You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-512 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## + +#include <linux/linkage.h> + +.text + +# Virtual Registers +# ARG1 +digest = %rdi +# ARG2 +msg = %rsi +# ARG3 +msglen = %rdx +T1 = %rcx +T2 = %r8 +a_64 = %r9 +b_64 = %r10 +c_64 = %r11 +d_64 = %r12 +e_64 = %r13 +f_64 = %r14 +g_64 = %r15 +h_64 = %rbx +tmp0 = %rax + +# Local variables (stack frame) + +W_SIZE = 80*8 +WK_SIZE = 2*8 + +frame_W = 0 +frame_WK = frame_W + W_SIZE +frame_size = frame_WK + WK_SIZE + +# Useful QWORD "arrays" for simpler memory references +# MSG, DIGEST, K_t, W_t are arrays +# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even + +# Input message (arg1) +#define MSG(i) 8*i(msg) + +# Output Digest (arg2) +#define DIGEST(i) 8*i(digest) + +# SHA Constants (static mem) +#define K_t(i) 8*i+K512(%rip) + +# Message Schedule (stack frame) +#define W_t(i) 8*i+frame_W(%rsp) + +# W[t]+K[t] (stack frame) +#define WK_2(i) 8*((i%2))+frame_WK(%rsp) + +.macro RotateState + # Rotate symbols a..h right + TMP = h_64 + h_64 = g_64 + g_64 = f_64 + f_64 = e_64 + e_64 = d_64 + d_64 = c_64 + c_64 = b_64 + b_64 = a_64 + a_64 = TMP +.endm + +.macro SHA512_Round rnd + + # Compute Round %%t + mov f_64, T1 # T1 = f + mov e_64, tmp0 # tmp = e + xor g_64, T1 # T1 = f ^ g + ror $23, tmp0 # 41 # tmp = e ror 23 + and e_64, T1 # T1 = (f ^ g) & e + xor e_64, tmp0 # tmp = (e ror 23) ^ e + xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g) + idx = \rnd + add WK_2(idx), T1 # W[t] + K[t] from message scheduler + ror $4, tmp0 # 18 # tmp = ((e ror 23) ^ e) ror 4 + xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e + mov a_64, T2 # T2 = a + add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h + ror $14, tmp0 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) + add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e) + mov a_64, tmp0 # tmp = a + xor c_64, T2 # T2 = a ^ c + and c_64, tmp0 # tmp = a & c + and b_64, T2 # T2 = (a ^ c) & b + xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) + mov a_64, tmp0 # tmp = a + ror $5, tmp0 # 39 # 
tmp = a ror 5 + xor a_64, tmp0 # tmp = (a ror 5) ^ a + add T1, d_64 # e(next_state) = d + T1 + ror $6, tmp0 # 34 # tmp = ((a ror 5) ^ a) ror 6 + xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a + lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c) + ror $28, tmp0 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) + add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a) + RotateState +.endm + +.macro SHA512_2Sched_2Round_sse rnd + + # Compute rounds t-2 and t-1 + # Compute message schedule QWORDS t and t+1 + + # Two rounds are computed based on the values for K[t-2]+W[t-2] and + # K[t-1]+W[t-1] which were previously stored at WK_2 by the message + # scheduler. + # The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. + # They are then added to their respective SHA512 constants at + # [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] + # For brievity, the comments following vectored instructions only refer to + # the first of a pair of QWORDS. + # Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} + # The computation of the message schedule and the rounds are tightly + # stitched to take advantage of instruction-level parallelism. + # For clarity, integer instructions (for the rounds calculation) are indented + # by one tab. Vectored instructions (for the message scheduler) are indented + # by two tabs. + + mov f_64, T1 + idx = \rnd -2 + movdqa W_t(idx), %xmm2 # XMM2 = W[t-2] + xor g_64, T1 + and e_64, T1 + movdqa %xmm2, %xmm0 # XMM0 = W[t-2] + xor g_64, T1 + idx = \rnd + add WK_2(idx), T1 + idx = \rnd - 15 + movdqu W_t(idx), %xmm5 # XMM5 = W[t-15] + mov e_64, tmp0 + ror $23, tmp0 # 41 + movdqa %xmm5, %xmm3 # XMM3 = W[t-15] + xor e_64, tmp0 + ror $4, tmp0 # 18 + psrlq $61-19, %xmm0 # XMM0 = W[t-2] >> 42 + xor e_64, tmp0 + ror $14, tmp0 # 14 + psrlq $(8-7), %xmm3 # XMM3 = W[t-15] >> 1 + add tmp0, T1 + add h_64, T1 + pxor %xmm2, %xmm0 # XMM0 = (W[t-2] >> 42) ^ W[t-2] + mov a_64, T2 + xor c_64, T2 + pxor %xmm5, %xmm3 # XMM3 = (W[t-15] >> 1) ^ W[t-15] + and b_64, T2 + mov a_64, tmp0 + psrlq $(19-6), %xmm0 # XMM0 = ((W[t-2]>>42)^W[t-2])>>13 + and c_64, tmp0 + xor tmp0, T2 + psrlq $(7-1), %xmm3 # XMM3 = ((W[t-15]>>1)^W[t-15])>>6 + mov a_64, tmp0 + ror $5, tmp0 # 39 + pxor %xmm2, %xmm0 # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] + xor a_64, tmp0 + ror $6, tmp0 # 34 + pxor %xmm5, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] + xor a_64, tmp0 + ror $28, tmp0 # 28 + psrlq $6, %xmm0 # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 + add tmp0, T2 + add T1, d_64 + psrlq $1, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 + lea (T1, T2), h_64 + RotateState + movdqa %xmm2, %xmm1 # XMM1 = W[t-2] + mov f_64, T1 + xor g_64, T1 + movdqa %xmm5, %xmm4 # XMM4 = W[t-15] + and e_64, T1 + xor g_64, T1 + psllq $(64-19)-(64-61) , %xmm1 # XMM1 = W[t-2] << 42 + idx = \rnd + 1 + add WK_2(idx), T1 + mov e_64, tmp0 + psllq $(64-1)-(64-8), %xmm4 # XMM4 = W[t-15] << 7 + ror $23, tmp0 # 41 + xor e_64, tmp0 + pxor %xmm2, %xmm1 # XMM1 = (W[t-2] << 42)^W[t-2] + ror $4, tmp0 # 18 + xor e_64, tmp0 + pxor %xmm5, %xmm4 # XMM4 = (W[t-15]<<7)^W[t-15] + ror $14, tmp0 # 14 + add tmp0, T1 + psllq $(64-61), %xmm1 # XMM1 = ((W[t-2] << 42)^W[t-2])<<3 + add h_64, T1 + mov a_64, T2 + psllq $(64-8), %xmm4 # XMM4 = ((W[t-15]<<7)^W[t-15])<<56 + xor c_64, T2 + and b_64, T2 + pxor %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + mov a_64, tmp0 + and c_64, tmp0 + idx = \rnd - 7 + movdqu W_t(idx), %xmm1 # XMM1 = W[t-7] + xor tmp0, T2 + pxor %xmm4, %xmm3 # XMM3 = s0(W[t-15]) + mov a_64, tmp0 + paddq %xmm3, %xmm0 # XMM0 = 
s1(W[t-2]) + s0(W[t-15]) + ror $5, tmp0 # 39 + idx =\rnd-16 + paddq W_t(idx), %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] + xor a_64, tmp0 + paddq %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] + ror $6, tmp0 # 34 + movdqa %xmm0, W_t(\rnd) # Store scheduled qwords + xor a_64, tmp0 + paddq K_t(\rnd), %xmm0 # Compute W[t]+K[t] + ror $28, tmp0 # 28 + idx = \rnd + movdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds + add tmp0, T2 + add T1, d_64 + lea (T1, T2), h_64 + RotateState +.endm + +######################################################################## +# void sha512_transform_ssse3(struct sha512_block_state *state, +# const u8 *data, size_t nblocks); +# Purpose: Updates the SHA512 digest stored at "state" with the message +# stored in "data". +# The size of the message pointed to by "data" must be an integer multiple +# of SHA512 message blocks. +# "nblocks" is the message length in SHA512 blocks. Must be >= 1. +######################################################################## +SYM_FUNC_START(sha512_transform_ssse3) + + # Save GPRs + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + # Allocate Stack Space + push %rbp + mov %rsp, %rbp + sub $frame_size, %rsp + and $~(0x20 - 1), %rsp + +.Lupdateblock: + +# Load state variables + mov DIGEST(0), a_64 + mov DIGEST(1), b_64 + mov DIGEST(2), c_64 + mov DIGEST(3), d_64 + mov DIGEST(4), e_64 + mov DIGEST(5), f_64 + mov DIGEST(6), g_64 + mov DIGEST(7), h_64 + + t = 0 + .rept 80/2 + 1 + # (80 rounds) / (2 rounds/iteration) + (1 iteration) + # +1 iteration because the scheduler leads hashing by 1 iteration + .if t < 2 + # BSWAP 2 QWORDS + movdqa XMM_QWORD_BSWAP(%rip), %xmm1 + movdqu MSG(t), %xmm0 + pshufb %xmm1, %xmm0 # BSWAP + movdqa %xmm0, W_t(t) # Store Scheduled Pair + paddq K_t(t), %xmm0 # Compute W[t]+K[t] + movdqa %xmm0, WK_2(t) # Store into WK for rounds + .elseif t < 16 + # BSWAP 2 QWORDS# Compute 2 Rounds + movdqu MSG(t), %xmm0 + pshufb %xmm1, %xmm0 # BSWAP + SHA512_Round t-2 # Round t-2 + movdqa %xmm0, W_t(t) # Store Scheduled Pair + paddq K_t(t), %xmm0 # Compute W[t]+K[t] + SHA512_Round t-1 # Round t-1 + movdqa %xmm0, WK_2(t) # Store W[t]+K[t] into WK + .elseif t < 79 + # Schedule 2 QWORDS# Compute 2 Rounds + SHA512_2Sched_2Round_sse t + .else + # Compute 2 Rounds + SHA512_Round t-2 + SHA512_Round t-1 + .endif + t = t+2 + .endr + + # Update digest + add a_64, DIGEST(0) + add b_64, DIGEST(1) + add c_64, DIGEST(2) + add d_64, DIGEST(3) + add e_64, DIGEST(4) + add f_64, DIGEST(5) + add g_64, DIGEST(6) + add h_64, DIGEST(7) + + # Advance to next message block + add $16*8, msg + dec msglen + jnz .Lupdateblock + + # Restore Stack Pointer + mov %rbp, %rsp + pop %rbp + + # Restore GPRs + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + + RET +SYM_FUNC_END(sha512_transform_ssse3) + +######################################################################## +### Binary Data + +.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16 +.align 16 +# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. +XMM_QWORD_BSWAP: + .octa 0x08090a0b0c0d0e0f0001020304050607 + +# Mergeable 640-byte rodata section. This allows linker to merge the table +# with other, exactly the same 640-byte fragment of another rodata section +# (if such section exists). 
+.section .rodata.cst640.K512, "aM", @progbits, 640 +.align 64 +# K[t] used in SHA512 hashing +K512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 diff --git a/lib/crypto/x86/sha512.h b/lib/crypto/x86/sha512.h new file mode 100644 index 000000000000..c13503d9d57d --- /dev/null +++ b/lib/crypto/x86/sha512.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * x86-optimized SHA-512 block function + * + * Copyright 2025 Google LLC + */ + +#include <asm/fpu/api.h> +#include <crypto/internal/simd.h> +#include <linux/static_call.h> + +DEFINE_STATIC_CALL(sha512_blocks_x86, sha512_blocks_generic); + +#define DEFINE_X86_SHA512_FN(c_fn, asm_fn) \ + asmlinkage void asm_fn(struct sha512_block_state *state, \ + const u8 *data, size_t nblocks); \ + static void c_fn(struct sha512_block_state *state, const u8 *data, \ + size_t nblocks) \ + { \ + if (likely(crypto_simd_usable())) { \ + kernel_fpu_begin(); \ + asm_fn(state, data, nblocks); \ + kernel_fpu_end(); \ + } else { \ + sha512_blocks_generic(state, data, nblocks); \ + } \ + } + +DEFINE_X86_SHA512_FN(sha512_blocks_ssse3, sha512_transform_ssse3); +DEFINE_X86_SHA512_FN(sha512_blocks_avx, sha512_transform_avx); +DEFINE_X86_SHA512_FN(sha512_blocks_avx2, sha512_transform_rorx); + +static void sha512_blocks(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + static_call(sha512_blocks_x86)(state, data, nblocks); +} + +#define sha512_mod_init_arch sha512_mod_init_arch +static inline void sha512_mod_init_arch(void) +{ + if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) && + boot_cpu_has(X86_FEATURE_AVX)) { + if (boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_BMI2)) + static_call_update(sha512_blocks_x86, + sha512_blocks_avx2); + 
else + static_call_update(sha512_blocks_x86, + sha512_blocks_avx); + } else if (boot_cpu_has(X86_FEATURE_SSSE3)) { + static_call_update(sha512_blocks_x86, sha512_blocks_ssse3); + } +}
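
To make the inline annotations in the assembly above easier to follow: S0/S1, CH and MAJ are the SHA-512 big-sigma, choice and majority functions, and s0/s1 are the small-sigma functions used by the message schedule (FIPS 180-4). The C below is a minimal, illustrative sketch of the scalar round that the SSSE3, AVX and AVX2 (rorx) transforms all compute; it is not the kernel's sha512_blocks_generic(), and the helper names here (ror64 as defined locally, sha512_round_sketch) exist only for this sketch.

/*
 * Illustrative-only sketch of one SHA-512 round plus the message-schedule
 * recurrence, written in the same ((f^g)&e)^g and ((a|c)&b)|(a&c) forms that
 * the assembly comments use for CH and MAJ.
 */
#include <stdint.h>

static inline uint64_t ror64(uint64_t x, unsigned int n)	/* 1 <= n <= 63 */
{
	return (x >> n) | (x << (64 - n));
}

#define Ch(e, f, g)	((((f) ^ (g)) & (e)) ^ (g))		/* "CH"  */
#define Maj(a, b, c)	((((a) | (c)) & (b)) | ((a) & (c)))	/* "MAJ" */
#define Sigma0(a)	(ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39))	/* "S0" */
#define Sigma1(e)	(ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41))	/* "S1" */
#define sigma0(x)	(ror64(x, 1) ^ ror64(x, 8) ^ ((x) >> 7))	/* "s0" */
#define sigma1(x)	(ror64(x, 19) ^ ror64(x, 61) ^ ((x) >> 6))	/* "s1" */

/*
 * st[] holds the working variables a..h (indices 0..7).  For t < 16 the
 * caller has already loaded W[t] with the big-endian message words (what the
 * pshufb/vpshufb byte-flip masks do in the assembly).
 */
static void sha512_round_sketch(uint64_t st[8], uint64_t W[80],
				const uint64_t K[80], int t)
{
	uint64_t T1, T2;

	if (t >= 16)	/* W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */
		W[t] = sigma1(W[t - 2]) + W[t - 7] +
		       sigma0(W[t - 15]) + W[t - 16];

	T1 = st[7] + Sigma1(st[4]) + Ch(st[4], st[5], st[6]) + K[t] + W[t];
	T2 = Sigma0(st[0]) + Maj(st[0], st[1], st[2]);

	/* "RotateState": h<-g, g<-f, f<-e, e<-d+T1, d<-c, c<-b, b<-a, a<-T1+T2 */
	st[7] = st[6]; st[6] = st[5]; st[5] = st[4];
	st[4] = st[3] + T1;
	st[3] = st[2]; st[2] = st[1]; st[1] = st[0];
	st[0] = T1 + T2;
}

The vectorized transforms interleave this integer round with the schedule computation (two or four words at a time in XMM/YMM registers) purely for instruction-level parallelism; the values produced per block are identical.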
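
The glue header also shows the dispatch policy: sha512_mod_init_arch() upgrades the static_call from the generic default in priority order (AVX2 with BMI2 for rorx, then AVX, then SSSE3), and DEFINE_X86_SHA512_FN() falls back to sha512_blocks_generic() whenever crypto_simd_usable() is false, so the block functions stay safe to call in contexts where kernel-mode FPU use is not allowed. The fragment below is a hedged sketch of how a module would consume this through the library interface, assuming the one-shot sha512() helper and SHA512_DIGEST_SIZE from <crypto/sha2.h>; the module and message are invented for illustration.

/*
 * Hypothetical consumer of the SHA-512 library interface; illustrative only.
 * Assumes the one-shot sha512() helper from <crypto/sha2.h> and a build with
 * CRYPTO_LIB_SHA512 selected.
 */
#include <crypto/sha2.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/types.h>

static int __init sha512_usage_sketch_init(void)
{
	static const u8 msg[] = "abc";		/* made-up test message */
	u8 digest[SHA512_DIGEST_SIZE];

	/*
	 * On x86 this reaches sha512_blocks() above, which jumps through the
	 * static_call selected at init: rorx/AVX2, AVX, SSSE3, or generic.
	 */
	sha512(msg, sizeof(msg) - 1, digest);

	print_hex_dump_bytes("sha512: ", DUMP_PREFIX_NONE, digest,
			     sizeof(digest));
	return 0;
}
module_init(sha512_usage_sketch_init);

MODULE_DESCRIPTION("SHA-512 library usage sketch (illustrative)");
MODULE_LICENSE("GPL");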